#!/bin/tcsh -f
#(ie run the cshell on this but don't read the .cshrc)
echo version = 1.42 of medquery 2011 Oct 05
# 2011 Oct 05, 1.42: remove underscore (_) from key names!
# 2010 Sep 04, 1.41: update documentation
# 2010 Aug 31, 1.40: update address
# 2009 Apr 22, 1.39: if error
# 2008 Aug 15, 1.38: clean up use of files.
# 2008 Jul 31, 1.37: more unmatched quote hunt, -q involved ... got it!
# 2008 Jul 30, 1.36: unmatched quote hunt
# 2008 Jun 25, 1.35: bibkey produced
# 2008 Jun 10, 1.34: -q
# 2008 Jun 07, 1.33: allow PubmedCentral numbers like PMC178597
# 2008 May 28, 1.32: wget refused by pubmed! Message:
#   1: id: 16790843 Error occurred: Error 111 (Connection refused)
#   change wget call
# 2008 Apr 16, 1.31: document 'pure' better
# 2007 Nov 14, 1.30: better failure message; pmid gives result!
# 2007 Nov 14, 1.29: documentation of what medquery looks for
# 2007 Jun 11, 1.28: documentation
# 2007 Jun 11, 1.27: cleanup
# 2007 Jun 11, 1.26: require argument for file name
# 2006 Jul 19, 1.25: fails - PubMed changed again? "ERROR 500: Server Error"
# 2006 Jun 6, 1.24: Bus error crash
# 2006 Mar 5, 1.23: crash?
# 2005 Aug 18, 1.22: upgrade medlinebibp
# 2005 May 11, 1.21: save the PAGE now (as query.fcgi.html or query.fcgi)
# 2004 May 15, 1.20: pubmedgrab 12086598 crashes - [] problem
# 2004 May 13, 1.19: handle 'PMID: 13918161 PubMed - OLDMEDLINE for Pre1966'
# 2004 Apr 22, 1.18: eutils now functional
# 2004 Apr 7, 1.17: use E-utilities to get entry more cleanly (failed)
# 2004 Mar 18, 1.16: handle (markup token missing from this copy) to get PMID properly
# 2003 Jul 15, 1.15: now working with "E-Utilities"
# 2003 Jul 15, 1.14: PubMed format changed to "E-Utilities" - broke this script!
# 2002 May 4, 1.13: handle bibquery being empty when medlinebib fails
# 2001 May 24, 1.12: pubmed format changed! This fixes it
# 2001 Mar 29, 1.11: make medquery handle html if the person uses that to save.
# 2001 Mar 29, 1.10: rename query0 query2
# 2000 Jan 24, 1.05: use pmid preferentially
# 1999 Nov 22, 1.03: medquery now uses query.fcgi from the new pubmed
# origin 1999 Sep 5 from mq
# Quiet-mode guard: when the only argument is -q, leave at once instead of
# printing the full help text.  The help text itself explains that this is
# meant for wrapper scripts whose file-name argument turned out to be empty.
if (${#argv} == 1) then
  if ("$argv[1]" == "-q") then
    echo "Quitting by '-q' to prevent giving entire help list."
    exit
  endif
endif
# ---------------------------------------------------------------------------
# Argument handling.
#   no arguments : print the help text and exit.
#   otherwise    : the first argument names a saved PubMed page.  If no such
#                  file exists and the argument consists only of digits (a
#                  PMID) or of digits plus the letters PMC (a PubMed Central
#                  id), a one-line stand-in page is written under /tmp and
#                  used instead.
# On the way out of this section, pubmedpage holds the name of the page file.
# All exits here use a plain exit (status unchanged) because the script is
# meant to end quietly when driven by atchange.
# ---------------------------------------------------------------------------
if ($#argv == 0) then
  echo 'usage: medquery [pubmed page OR pubmed ID OR pubmed central ID]'
  echo 'Convert a saved PubMed reference page into BibTeX format'
  echo
  echo 'The medquery script accepts a single argument, the name of the file'
  echo 'containing a PubMed web page.'
  echo '(Note: The only item that medquery cares about on the page is the'
  echo 'PubMed id. It looks for the string "PMID: " at the beginning'
  echo 'of a line, followed by the pubmed ID number.'
  echo 'It uses only that number for further processing.'
  echo 'You can make a file containing "PMID: [number]" alone.)'
  echo
  echo 'If you give a PubMed Id as the argument, then the program'
  echo 'constructs the necessary files and will give you the results.'
  echo
  echo 'In the current directory several files are created:'
  echo ' - query (the original saved PubMed page is moved here)'
  echo ' - The bibformat file will contain the last entry.'
  echo ' - The bibkey file will contain the key for the last entry.'
  echo ' - The bib file will contain all entries.'
  echo ' - The medlinebibp file controls the medlinebib program.'
  echo 'Note that the bib file is always appended to,'
  echo 'so results from previous runs will be included.'
  echo 'Spaces in the key name are repaced in medlinebib by dashes.'
  echo
  echo 'PubMed is a database of biology-related references at'
  # echo 'http://www.ncbi.nlm.nih.gov/PubMed/medline.html' # gone: 2007jun11
  echo 'http://www.ncbi.nlm.nih.gov/sites/entrez?db=pubmed'
  echo
  echo 'Using your browser, save the entire web page that contains a'
  echo 'single reference of interest and medquery will extract the'
  echo 'PubMed ID (PMID) from that. Then medquery obtains the medline'
  echo 'format file (using wget) and converts it automatically to'
  echo 'BibTeX format.'
  echo
  echo 'The key of the entry is generated from the first author,'
  echo 'the last author and the year.'
  echo 'You may need to edit the entry to make sure that italics'
  echo 'and special symbols are correct.'
  echo
  echo '* Information about LaTeX (a typesetting language) and BibTeX'
  echo '(a database language for references in papers) is at:'
  echo 'http://www.ccrnp.ncifcrf.gov/~toms/latex.html'
  echo
  echo '* Medquery uses the program medlinebib:'
  echo 'http://www.ccrnp.ncifcrf.gov/~toms/delila/medlinebib.html'
  echo
  echo '* Medquery uses wget:'
  echo 'http://www.ccrnp.ncifcrf.gov/~toms/wget.html'
  echo
  echo '* You can use medquery with atchange:'
  echo 'http://www.ccrnp.ncifcrf.gov/~toms/atchange.html'
  echo "To use it, put an 'automate' in your home directory containing"
  echo 'three lines:'
  echo ''
  echo 'query'
  echo ' clear'
  echo ' medquery query -q'
  echo ''
  echo 'This assumes that the file saved from PubMed is called "query"'
  echo 'Start the automation by typing'
  echo ''
  echo ' atchange automate'
  echo ''
  echo 'Then find your reference in PubMed and simply save it to your'
  echo 'home directory in the name "query".'
  echo
  echo 'PubMed changes their format once in a while so this script'
  echo 'needs to be updated frequenty.'
  echo
  echo 'If the argument is "-q", medquery quits to prevent'
  echo 'this error message from being shown when an argument'
  echo 'is empty in a script.'
  echo
  echo 'Dr. Thomas D. Schneider'
  echo 'National Institutes of Health'
  echo 'National Cancer Institute'
  echo 'Gene Regulation and Chromosome Biology Laboratory'
  echo 'Molecular Information Theory Group'
  echo 'Frederick, Maryland 21702-1201'
  echo 'schneidt@mail.nih.gov'
  echo 'permanent email: toms@alum.mit.edu (use only if first address fails)'
  echo 'http://alum.mit.edu/www/toms (permanent)'
  exit
else
  set pubmedpage = "$1"
  if ("$pubmedpage" == '') then
    echo "empty argument for medquery"
    exit
  endif
  if !(-f "$pubmedpage") then
    echo "there is no file named $pubmedpage"
    # is it a number?
    # The argument is quoted and filename expansion is switched off while
    # the digits are stripped, so an argument containing blanks or wildcard
    # characters is tested as plain text instead of being split, expanded,
    # or aborting the script with a No match error.
    set noglob
    set pure = `echo "$pubmedpage" | tr -d '0123456789'`
    unset noglob
    echo "Remove digits from argument, is it a pure number? Result: '$pure'"
    # if ("$pure" == "") then
    if (("$pure" == "")||("$pure" == "PMC")) then
      # example of Pubmedcentral number:
      # PMC178597
      # NOTE(review): the stand-in page has a fixed per-user name in /tmp,
      # so two runs by the same user share it.
      set tmp = "/tmp/`whoami`.medquery"
      if ("$pure" == "PMC") then
        echo "PMC id found, so creating 'fake' pubmed page"
        echo "PMCID: $pubmedpage" > $tmp
      else
        echo "Pure number found, so creating 'fake' pubmed page"
        echo "PMID: $pubmedpage" > $tmp
      endif
      set pubmedpage = "$tmp"
    else
      echo "This is not a pure number, can't be a PMID."
      exit
    endif
  else
    echo "Using $1 as the pubmed page"
  endif
endif
# ******************************************************************************
# 2007 Jun 11: NOTE that the name has apparently changed again, to
# entrez.html
# 2008 Aug 15: they abandoned that and now give [PMID].html
# ******************************************************************************
# 2003 July 15
#
# http://www.nlm.nih.gov/pubs/techbull/ma03/ma03_technote.html#eutil
# PubMed® to Complete Transition to E-Utilities
# and Manually Constructed URLs
#
# April 03, 2003 [posted]
#
# In July 2002, NCBI announced the availability of new
# programming for the Entrez Utilities (E-Utilities) and informed
# utility users that they should convert URLs to the new format by
# the end of 2002.
#
# NCBI will phase out the old utilities completely in June 2003. This
# may affect customers of some products such as EndNote®,
# ProCite®, and Reference Manager®. Please contact user support
# for your respective product if you have questions. Questions
# concerning the use of E-Utilities can be sent to:
# eutilities@ncbi.nlm.nih.gov.
#
# If you have manually created links to PubMed that contain the
# string: /htbin-post/, these should be changed to follow the
# specifications provided on the page, Linking to PubMed and other
# Entrez Databases. These changes must be in place prior to June
# 2003.
#
# Entrez Utilities (E-Utilities)
# http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
#
# Linking to PubMed and other Entrez Databases.
# http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html
# ******************************************************************************
# $query is the name of the file used by medlinebib:
set query = query
# Per-user scratch files in /tmp:
#   query1 receives the raw wget download,
#   query2 is where the query file is moved once it has been used.
#set query1 = /tmp/`whoami`-query1.html
#set query2 = /tmp/`whoami`-query2.html
set query1 = /tmp/`whoami`-1.medquery
set query2 = /tmp/`whoami`-2.medquery
# Move the page into place as the query file.
# The names are quoted so that a page name containing blanks still works,
# and the move is skipped when the page already is the query file (the
# documented call is: medquery query -q), which otherwise makes mv complain
# that source and destination are the same file on every run.
if ("$pubmedpage" != "$query") then
  if (-f "$pubmedpage") then
    mv "$pubmedpage" "$query"
  endif
endif
if (-f $query) then
# Make sure the parameter file that controls medlinebib exists in the
# current directory.  An existing medlinebibp is left untouched.
# The here-document body and its EOF terminator must stay in column one.
if (! -f medlinebibp) then
  echo 'creating new medlinebibp file'
  cat > medlinebibp << EOF
1.74 version of medlinebibp that this parameter file is designed for.
n 'd' = debug
n 'e' = do everything
f 'f' = use final author, otherwise second author
d 'd' = double dash page numbers: 1--5, otherwise single dash.
70 The title line size, in characters.
EOF
endif
# ---------------------------------------------------------------------------
# Find the PubMed id (pmid) and, for text pages, the old UI in the query file.
# Sets: pmid and ui (either may be empty).
# ---------------------------------------------------------------------------
# detect html form
# The page counts as html when at least one line carries an html tag.
# An empty grep pattern matches every line and would class every file as
# html, so the tag itself is searched for.  Counting with -c also keeps
# arbitrary page text out of a shell word list, where wildcard characters
# in the page could abort the script.
# NOTE(review): the tag searched for here is an assumption; the pattern in
# the copy this was reviewed from was empty - confirm against a saved page.
set htmllines = `grep -i -c "<html" $query`
if ("$htmllines" == 0) then
  echo the file is not html
  # the tr changes control-M characters to returns
  # in case the mac format was used
  # cat $query
  # extract the id line:
  # 2001 May 24: bug version (^M stands for a literal control-M):
  # set line = `cat "$query" | tr "^M" "\n" | tr ";" "," | grep "PMID:"`
  # remove brackets they just introduced!!
  # set line = `cat "$query" | tr "^M" "\n" | tr -d '[]' | tr ";" "," | grep "PMID:"`
  # normal line:
  # PMID: 3357886 PubMed - indexed for MEDLINE
  # ancient line:
  # PMID: 13918161 PubMed - OLDMEDLINE for Pre1966
  #echo "*******************************************************************"
  #cat "$query"
  #echo "*******************************************************************"
  #exit
  # set line = "`grep '^PMID: ' $query`"
  # 2008 Jun 07 account for PMCID;
  set line = "`egrep '(^PMID|^PMCID): ' $query`"
  echo The PMID or PMCID containing line is:
  echo "$line"
  # clean the line:
  # set pmid0 = `echo $line | tr "," "\n" | grep "PMID: " | sed -e "s/PMID: //"`
  # remove the new junk they put in just before 2001 May 24
  # set pmid = `echo $pmid0 | sed -e "s/PubMed - indexed for MEDLINE//"`
  # that is not enough, they change the message,
  # eg PMID: 11358999 [PubMed - in process]
  # echo "PMID line is '$pmid0'"
  # set pmid = `echo $pmid0 | tr -d "A-Z"`
  # naw... try again:
  # from man tr:
  # When the -c option is specified with -d, all characters except those
  # specified by string1 will be deleted. The contents of string2 will be
  # ignored, unless the -s option is also specified.
  # so... delete everything on the line EXCEPT digits:
  # set pmid = `echo $line | tr -cd "[:digit:]"`
  # that fails on the ancient line that has numbers!!
  # The id is the second blank-separated field of the line:
  set pmid = `echo "$line" | tr " " '\012' | head -2| tail -1`
  echo "PMID is '$pmid'"
  # find the UI if it is on the line:
  # $line is quoted so that bracketed status text such as
  # [PubMed - in process] is not taken as a filename pattern.
  set ui = `echo "$line" | tr "," "\n" | grep "UI: " | sed -e "s/UI: //"`
  echo " UI is '$ui'"
else
  echo the file is html
  # Break the page at angle and square brackets, then take the first
  # piece that mentions PMID.
  set pmid = \
  `cat $query | tr '<>[]' "\n\n\n\n" | grep PMID | sed -e "s/PMID: //"|head -1`
  echo PMID is '"'$pmid'"'
  # No UI for HTML:
  set ui = ""
endif
# see what we found and act accordingly: give preference to PMID
# Sets: uid    - the id sent to the server
#       dbkind - pubmed or pmc, the database the id belongs to
if ("$pmid" == "") then
  if ("$ui" == "") then
    echo "PMID and UI are missing"
    echo "HALT"
    exit
  else
    # No PMID, so fall back to the UI and look it up in pubmed.
    # dbkind has to be set on this path as well; it is printed and tested
    # below, and leaving it unset aborts the script with an undefined
    # variable error.
    # NOTE(review): whether the server still accepts old UI numbers is
    # not verified here.
    set uid = "$ui"
    set dbkind = pubmed
  endif
else
  # set uid = "$pmid"
  # An id that contains PMC is a PubMed Central id; the server wants the
  # number without that prefix.  The pattern match works on the quoted
  # value, so no wildcard expansion or subprocess is involved.
  if ("$pmid" =~ *PMC*) then
    set dbkind = pmc
    set uid = `echo "$pmid" | sed "s/PMC//"`
  else
    set dbkind = pubmed
    set uid = "$pmid"
  endif
endif
echo
echo ID used is: $uid
echo kind of database is: $dbkind
# ***************************************************************************
# given the $uid, grab the entry
# ***************************************************************************
# NEW FUNCTIONAL METHOD: using eutils
# 2004 Apr 22
# Retmode & rettype are outlined in the EFetch documentation at:
# http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html#Retrieval Mode
#
# If you add retmode=text, you should get what you want.
# http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=14602927&rettype=medline&retmode=text
#
# Please forward any questions about e-utilities to:
# eutilities@ncbi.nlm.nih.gov.
#
# Sincerely, N. Ruiz National Library of Medicine
# 2008 Jun 7
# help page for efetch at NCBI:
# http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html
#
# Obtain the medline format entry for the reference using retmode=text.
# The request address is the same for both databases apart from the db
# field, so it is assembled once and reused below.
set efetch = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
set fetchurl = "${efetch}?db=${dbkind}&id=${uid}&rettype=medline&retmode=text"
if ($dbkind == 'pubmed') then
  # The download goes to the scratch file $query1.
  wget -O $query1 "$fetchurl"
else
  # Any other database: try the pmc download, show the command and what
  # came back, then stop, because this path is reported as not working.
  echo ====================
  if ($dbkind == 'pmc') then
    wget -O $query1 "$fetchurl"
    echo wget -O $query1 "$fetchurl"
  endif
  echo ====================
  cat $query1
  # NOTE(review): the command named a is not defined anywhere in this
  # file; presumably a local program - confirm.
  a $query1 > /dev/null
  echo ====================
  echo not funtional as of 2008 jun 8, email sent
  exit
endif
# 2008 May 28
# error!!
# for call:
# medquery 16790843
# 1: id: 16790843 Error occurred: Error 111 (Connection refused)
# try setting agent:
# wget --user-agent=seamonkey -O $query1 "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=${uid}&rettype=medline&retmode=text"
# that failed.
# 2004 Mar 18: Keep the html so that one keeps the PMID:
# this is now handled by medlinebib
#
# Copy the download into the query file that medlinebib reads, show it,
# and stop if nothing came back.
cat $query1 > $query
echo ----============================= medline format:
cat $query
echo ----=============================
set nbytes = `wc -c < $query`
if ($nbytes == 0) then
  echo "$query file is empty\!"
  echo "for PMID ${uid}"
  exit
endif
# Run medlinebib, then check that it left a usable bibformat file in the
# current directory: the file must exist and hold at least 5 bytes.
echo begin running ----- medlinebib
medlinebib
echo done running ----- medlinebib
if (! -f bibformat) then
  echo 'The medlinebib program failed to produce a bibformat file!'
  exit
endif
# Byte count of bibformat, with any padding blanks from wc removed.
set filesize = `wc -c < bibformat | tr -d " "`
if ($filesize == 0) then
  echo 'The medlinebib program failed: the bibformat file is empty!'
  exit
endif
if ($filesize < 5) then
  echo 'The medlinebib program failed:'
  echo "filesize is $filesize"
  echo 'The bibformat file is NEARLY empty!'
  exit
endif
# Add the new entry to the cumulative bib file.
cat bibformat >> bib
echo "THE BIBLIOGRAPHY is IN FILE ~/bibformat"
echo "CONCATENATED BIBLIOGRAPHIES ARE IN FILE ~/bib"
# The key is whatever follows the last open brace on the second line of
# bibformat (the first line when there is only one), with commas removed.
head -2 bibformat |\
  tail -1 |\
  tr '{' '\n' |\
  tr -d ',' |\
  tail -1 > bibkey
echo "The key for this entry is in ~/bibkey"
echo "---- bibformat file contains: ----"
cat bibformat
echo "----------------------------------"
echo " "
# remove query file so that it is not in the way for the next file
mv query $query2
echo "The query file used by medlinebib was moved to $query2"
else
# No query file is present.  This is reported as a normal finish rather
# than as an error because the script is designed to be used with atchange:
# when the query file is moved away, atchange calls medquery again and
# that run should just end gracefully.
echo "Medquery is DONE - there is no query file."
echo ""
endif
echo
exit
********************************************************************************
********************************************************************************
********************************************************************************
********************************************************************************
OLD MATERIAL FOR THE wget:
# 2003 July 15: functional again!
wget -O $query1 "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=PubMed&uid=$uid&dopt=Medline"
#
# original htbin-post method NOW OBSOLETE:
# wget -O $query1 "http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?db=m&form=6&uid=$uid&Dopt=l&html=no&title=no"
#
# tests:
# wget -O zzz.html "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=12177305&dopt=Medline"
# (gave web page - html)
# wget -O yyy.html "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&uid=12177305&dopt=Medline"
#
# works:
# wget -O uuu.html "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=PubMed&uid=12177305&dopt=Medline"
#
# 2001 Mar 29: For some reason they now have html stuff
# surrounding the medline, despite html=no!!
# so clear that out:
# grep -v "Entrez Reports" $query1 |\
# grep -v -- '----------------' |\
# grep -v -- '<' |\
# grep -v -- '>' |\
# grep -v -- '^$' |\
# cat > query
# OLD FUNCTIONAL METHOD: using entrez
# wget -O $query1 "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=PubMed&uid=$uid&dopt=Medline"
# NON FUNCTIONAL ATTEMPTS:
# You really should not be using web query to retrieve PubMed
# citations in text format, but should be using E-Utilities. Your
# query in EFetch would be:
#
# http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=14602927&rettype=medline
#
# This will provide you with a clean text file. For more information
# about E-Utilities, please go to:
# http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
#
# Sincerely,
# N. Ruiz
# National Library of Medicine
#
# 2004 Apr 7: Using E-utils:
#wget -O $query1 "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=${uid}&rettype=medline"