#!/bin/tcsh -f
#(ie run the tshell on this but don't read the .cshrc or .tcshrc)
#
# yvpg - guess the Year, Volume and Page (YVP) of a literature reference
#        from free text and hand them to the external 'yvp' program.
#
# usage: yvpg [string] [-]
#   no arguments : print the usage, then (Mac OS X only) examine the
#                  cut/paste buffer obtained with pbpaste
#   string       : examine the given text (all arguments joined)
#   testN        : examine a built-in example (N = 1 to 6)
#   -            : as first or second argument, also pass the longest
#                  alphabetic token of the text to yvp
#
# How it works: the text is split into tokens; every token containing
# digits is classified as year (within ($baseyear, current year]),
# volume (first remaining number) or page (next remaining number that is
# not in parenthesis). If the cut/paste buffer holds a line starting with
# '@' it is treated as a bibtex entry and parsed by field name instead.
#
# External programs used: yvp, wgetas (only located here), pbpaste and
# say (Mac OS X), plus tr, sed, grep, egrep, wc, head, tail.
#
# NOTE: several sed patterns and test strings below contain non-ASCII
# "dash" byte sequences on purpose. Do not re-encode or "clean up" them.

echo version = 1.30 of yvpg 2011 Mar 28

# 2011 Mar 28, 1.30: fail for 'Genome Res. 2004 14: 1975-1986' - all 3 are years!
# 2011 Mar 14, 1.29: duplicated "if" statement caused bug - fixed
# 2011 Feb 17, 1.28: had year, page, didn't do volume??
#                    35 (2007) 5995-6003.
# 2011 Feb 17, 1.28: Wu.Barker2002 PMID not found - too long?
# 2011 Feb 17, 1.27: pronounce bibtex better using "byb tech"!
# 2010 Nov 25, 1.26: less delays when reporting missing parts
# 2010 Nov 25, 1.25: delay before closing to allow browser to catch up?
#                    no - just build mechanisms for finding other scripts
# 2010 Nov 15, 1.24: "^@" means bibtex entry - parse accordingly
# 2010 Oct 13, 1.23: non-years inside parenthesis are the issue - ignore
# 2010 Oct 12, 1.22: report failure - screen delay and verbally
# 2010 Oct 04, 1.21: no ',' in long token mode
# 2010 Oct 01, 1.20: finish test 6
# 2010 Oct 01, 1.19: J.C. Angulo and J.S. Dehesa, J. Chem. Phys. 97 (1992) 6485.
#                    = test6
# 2010 Sep 10, 1.18: another kind of dash
# 2010 Sep 10, 1.17: 22(22) should be a volume - drop the issue number
# 2010 Sep 10, 1.16: clean up and report more to output; use first biggest token
# 2010 Sep 10, 1.15: add the biggest token to the search!
# 2010 Sep 10, 1.14: another example
# 2010 Apr 28, 1.13: colon should become a space
# 2010 Apr 28, 1.12: Handle new dash on Sun computer.
# 2010 Apr 28, 1.11: another dash –
# 2010 Apr 23, 1.10: tmp files should be in /tmp!
# 2010 Apr 19, 1.09: page dash from Biochemistry 2009 (failed)
# 2010 Feb 26, 1.08: crash
# 2010 Feb 03, 1.07: another example dash
# 2010 Feb 03, 1.06: tokenize everything except dashes but for each word!
# 2010 Feb 03, 1.05: tokenize everything except dashes!
# 2010 Feb 02, 1.04: handle all non-ascii forms of dash
# 2010 Jan 30, 1.03: fix if statement crash
# 2010 Jan 28, 1.02: year overrides volume only if range ok?? ... unsure
# 2010 Jan 28, 1.01: activate
# 2010 Jan 28, 1.00: origin, test

# seconds to delay for good results
set delaygood = 0
# seconds to delay for problems
set delaybad = 10
# say $delaygood $delaybad

# buffer for testing yvp call:
# 2006, 45, 6570

# Scratch files: tmp1 = raw text (test fixture or bibtex entry),
# tmp2 = one token after dash normalisation, tmp3 = cleaned full string.
set tmp1 = /tmp/`whoami`-1.yvpg
set tmp2 = /tmp/`whoami`-2.yvpg
set tmp3 = /tmp/`whoami`-3.yvpg

# 1 = pass the biggest alphabetic token to yvp as well
set dotoken = '0'
# 1 = the text came from the cut/paste buffer (pbpaste); only then is the
# buffer inspected for a bibtex entry further below
set frombuffer = 0

# ---------------------------------------------------------------------------
# Obtain the text to examine in $string
# ---------------------------------------------------------------------------
if ($#argv == 0) then
  echo 'usage: yvpg [string] [-]'
  echo 'Guess the reference of a paper from some text and call yvp.'
  echo 'Use: highlight a reference in a paper, put it in the cut/paste buffer'
  echo '(use command-c for example), then call this script.'
  echo 'Yvpg will guess what the year, volume and page (YVP) are'
  echo 'from the text. It will add on the first longest alphabetic token'
  echo 'in the buffer to reduce alternative hits.'
  echo 'If the first or second argument is a dash ("-") then'
  echo 'the longest token is used (see test2).'
  echo
  echo 'If no string is given, the cut/paste buffer is used (Mac OSX only),'
  echo 'If the string is "testN" (N=1 to 6) then a test example is run.'
  echo
  if (`uname` == "Darwin") then
    set string = "`pbpaste`"
    set frombuffer = 1
    echo 'using cut/paste buffer'
  else
    echo 'This is not a Mac and no string is given so halt.'
    say 'This is not a Mac and no string is given so halt.'
    sleep $delaybad
    exit
  endif
else
  # A dash as the first or the second argument turns on the token mode.
  if ("$1" == '-') then
    set dotoken = '1'
  endif
  if ("$#argv" == '2') then
    if ("$2" == '-') then
      set dotoken = '1'
    endif
  endif

  # Built-in examples. Each fixture is kept on a single line so that the
  # later  set string = "`cat $tmp1`"  yields exactly one word.
  if ("$1" == 'test1') then
    cat > $tmp1 << EOF
PUBLICATION E.J. Enemark and L. Joshua-Tor, Mechanism of DNA translocation in a replicative hexameric helicase, Nature 442 270-275 (2006).
EOF
  endif

  if ("$1" == 'test2') then
    cat > $tmp1 << EOF
Beecher, H. K. (1955) "The powerful placebo" Journal of the American Medical Association 159 pp.1602-1606 [Original article, most cited one, claiming a widespread placebo effect]
EOF
  endif

  if ("$1" == 'test3') then
    cat > $tmp1 << EOF
sequences (RefSeq): a curated non-redundant sequence database of genomes, transcripts and proteins. Nucleic Acids Res 2007;35:D61–5.
EOF
  endif

  if ("$1" == 'test4') then
    cat > $tmp1 << EOF
Marr,A.G. (1991) Microbiol. Rev., 55, 316-333.
EOF
  endif

  if ("$1" == 'test5') then
    cat > $tmp1 << EOF
Thompson, J.D., Higgins, D.G. and Gibson, T.J. CLUSTAL W: improving the sensitivity of progressive multiple sequence alignment through sequence weighting, position-specific gap penalties and weight matrix choice (1994) Nucleic Acids Res., 22(22), 4673-4680.
EOF
  endif

  if ("$1" == 'test6') then
    cat > $tmp1 << EOF
J.C. Angulo and J.S. Dehesa, J. Chem. Phys. 97 (1992) 6485.
EOF
  endif

  if ("$1" == '-') then
    # token control: just use buffer
    set string = "`pbpaste`"
    set frombuffer = 1
  else
    # any first argument containing "test" selects the fixture file
    if (`echo "$1" | grep test|wc -l` > 0) then
      #cat /tmp1/hold | pbcopy
      set string = "`cat $tmp1`"
    else
      # use all arguments as the string
      set string = "$argv"
    endif
  endif
endif

# avoid Illegal byte problem for tr on Macs:
setenv LC_ALL en_US.ISO8859-1

# A number is accepted as a year when  baseyear < number <= todayyear.
set baseyear = "1900"
set todayyear = `date +%Y`
echo "The current year is $todayyear"

# ---------------------------------------------------------------------------
# Look for yvp program
# ---------------------------------------------------------------------------
# debug locating yvp
if (1) then
  which yvp
  echo
  echo "env ---------------------"
  env
  echo
  echo "pwd ---------------------"
  pwd
  echo
  echo " path -------------------"
  echo "$path"
  echo
  echo "whoami ------------------"
  whoami
  echo
endif

if (`which yvp| grep '/'|wc -l` > 0) then
  echo 'found the yvp program'
  set yvp = yvp
  # say 'found the yvp program' &
else
  # Ok, it's being called as a 'button' on the desktop:
  # fall back to a copy in ~/script
  set foundyvp = 0
  if (-d ~/script) then
    if (-f ~/script/yvp) then
      set yvp = `ls ~/script/yvp`
      set foundyvp = 1
      # say found yvp
      #sleep $delaybad
    else
      set foundyvp = 0
    endif
  else
    set foundyvp = 0
  endif
  if !($foundyvp) then
    echo 'Could not find the yvp program\!'
    say 'Could not find the yvp program\!'
    echo 'yvp should be in your script directory at'
    say 'yvp should be in your script directory at'
    echo "~/script/yvp"
    say "~/script/yvp"
    sleep $delaybad
    exit
  endif
endif

# ---------------------------------------------------------------------------
# this goes into yvp:
# Look for wgetas program
# (only its presence is verified here; this script never calls it itself)
# ---------------------------------------------------------------------------
# debug locating wgetas
if (1) then
  which wgetas
  echo
  echo "env ---------------------"
  env
  echo
  echo "pwd ---------------------"
  pwd
  echo
  echo " path -------------------"
  echo "$path"
  echo
  echo "whoami ------------------"
  whoami
  echo
endif

if (`which wgetas| grep '/'|wc -l` > 0) then
  echo 'found the wgetas program'
  set wgetas = wgetas
  # say 'found the wgetas program' &
else
  # Ok, it's being called as a 'button' on the desktop:
  # fall back to a copy in ~/script
  set foundwgetas = 0
  if (-d ~/script) then
    if (-f ~/script/wgetas) then
      set wgetas = `ls ~/script/wgetas`
      set foundwgetas = 1
      # say found wgetas
      #sleep $delaybad
    else
      set foundwgetas = 0
    endif
  else
    set foundwgetas = 0
  endif
  if !($foundwgetas) then
    echo 'Could not find the wgetas program\!'
    say 'Could not find the wgetas program\!'
    echo 'wgetas should be in your script directory at'
    say 'wgetas should be in your script directory at'
    echo "~/script/wgetas"
    say "~/script/wgetas"
    sleep $delaybad
    exit
  endif
endif

echo
echo "--- Captured string to examine: -----"
echo "$string"

# Running token counter and the results handed to yvp.
set count = 0
set year = ''
set volume = ''
set page = ''
# if $y is set, no other variable can override it
# (y, v, p mirror year, volume, page and serve as "has been assigned" flags)
set y = ''
set v = ''
set p = ''

# *************************************************************************
# Special case: if the file contains @ on the start of a line,
# assume it is a bibtex entry and parse out the yvp from that.
# The cut/paste buffer is only consulted when it is the source of the text;
# a string or testN argument must not be overridden by whatever happens to
# be in the buffer (and pbpaste does not exist on non-Mac systems).
set isbibtex = 0
if ($frombuffer) then
  set isbibtex = `pbpaste|grep '^@'|wc -l`
endif
if ($isbibtex > 0) then
  pbpaste > $tmp1
  # 2011 Feb 17: byb tech: finally a good pronunciation for 'bibtex'!
  say byb tech entry &
  echo bibtex entry
  # Pull the digits out of the "year = ", "volume = " and "pages = " lines;
  # for pages only the part before the first '-' is used.
  set year = `grep 'year = ' $tmp1|tr -dc "[:digit:]"`
  set volume = `grep 'volume = ' $tmp1|tr -dc "[:digit:]"`
  set page = `grep 'pages = ' $tmp1|tr '-' '\n'|head -1|tr -dc "[:digit:]"`
  #echo -=----- $tmp1
  #cat $tmp1
  #echo -=-----
  #set authors = `cat $tmp1|tr -d '\n'|tr ',' '\n'|grep "author ="`
  #echo $authors
  # find longest word
  if (1) then
    set biggesttokensize = 0
    # start empty so that $key is defined even when no token has letters
    set biggesttoken = ''
    set count = 0
    foreach token (`cat "$tmp1"|tr -c '[:alnum:]()' ' '`)
      @ count = $count + 1
      # zzz
      echo -n "token ${count} IS '${token}'"
      # track token size - of alphabetic strings
      set thistokensize = `echo "$token"|tr -dc '[:alpha:]'|wc -c`
      if ($thistokensize > $biggesttokensize) then
        set biggesttokensize = $thistokensize
        set biggesttoken = "$token"
        echo ", the biggest so far ("${biggesttokensize}" characters)"
      else
        echo
      endif
    end
    set key = "$biggesttoken"
  else
    # use the pmid to nail the entry:
    set key = `cat $tmp1|tr -d '\n'|tr ',' '\n'|grep "pmid ="|tr -dc '[:digit:]'`
    # that failed for Wu.Barker2002!
  endif
  echo "$yvp $year $volume $page $key"
  $yvp $year $volume $page $key
  sleep $delaygood
  exit
endif
# *************************************************************************

echo
set zap = ";:."
echo "---- Remove some characters (${zap}) from the string: ---"
# echo "---- modified string ----------------"
# set string = `echo "$string"|tr ";:." " "|tr -dc '[:print:]'`
# don't do that! It zaps the dashes which are "non printable"!
# set string = `echo "$string"|tr "$zap" " "`
# 2010 Oct 1: That fails - loses spaces
#set newstring = "`echo $string|tr ';:.' ' '> $tmp3`"
#echo "$newstring"
# Doing the above causes loss of the spaces!! Put it in a file.
# Force parenthesis to be separate tokens!!!
echo "$string"|tr ';:.' ' '|sed 's/)/) /g'|sed 's/(/ (/g'> $tmp3
set string = "`cat $tmp3`"
echo "$string"
echo "-------------------------------------"

# Track size of tokens
set biggesttokensize = 0
# the biggest token found so far
set biggesttoken = ''

# ---------------------------------------------------------------------------
# Main loop: classify each token as year, volume or page
# ---------------------------------------------------------------------------
# 2010 oct 04: : remove non al-numeric characters from the string:
# foreach token (`echo "$string"|tr -c '[:alnum:]' ' '`)
# 2010 oct 13: : remove non al-numeric characters from the string, keep ():
foreach ptoken (`echo "$string"|tr -c '[:alnum:]()' ' '`)
  echo "$ptoken"
  @ count = $count + 1
  echo -n "token ${count} IS '${ptoken}'"
  # for string ${string}"
  #echo "$ptoken" | egrep "\(|\)"
  #exit

  # Remember whether the token was in parenthesis, then strip them:
  # $token is the token without '(' and ')'.
  if (`echo "$ptoken" | egrep '\(|\)' | wc -l` > 0) then
    set token = `echo "$ptoken"|tr -d '()'`
    set notparenthesis = 0
    echo -n " PARENTHESIS"
  else
    set token = "$ptoken"
    set notparenthesis = 1
  endif

  # track token size - of alphabetic strings
  # set thistokensize = `wc -c "$token"` # picks up page numbers!
  set thistokensize = `echo "$token"|tr -dc '[:alpha:]'|wc -c`
  if ($thistokensize > $biggesttokensize) then
    set biggesttokensize = $thistokensize
    set biggesttoken = "$token"
    echo ", the biggest so far ("${biggesttokensize}" characters)"
  else
    echo
  endif

  # Dash of Street.Barrick2008.pdf reference: Greene Pace 1974
  # direct cut/paste into vim gives:
  #
  # Greene, R. F., Jr., and Pace, C. N. (1974). Urea and guanidine
  # hydrochloride denaturation of ribonuclease, lysozyme,
  # alpha-chymotrypsin, and beta-lactoglobulin. J. Biol. Chem. 249,
  # 5388–5393.
  # This is not converted to a dash!
  # pbpaste >/tmp/t
  # then :r /tmp/t
  # gives:
  # Greene, R. F., Jr., and Pace, C. N. (1974). Urea and guanidine
  # hydrochloride denaturation of ribonuclease, lysozyme,
  # alpha-chymotrypsin, and beta-lactoglobulin. J. Biol. Chem. 249,
  # 5388?5393.

  # Convert all forms of dash one may find into pure ascii
  # See the u2a script.
  # Then blank out every ASCII letter, and finally
  # remove leading space as in D615.
  # The result ($word) is what is left of the token: digits and dashes.
  echo "$token" |\
    sed "s/–/-/g" |\
    sed 's/?/-/g' |\
    sed "s/–/-/g" |\
    sed "s/\\u2013/-/g" |\
    sed 's/Ð/-/g' |\
    tr "abcdefghijklmnopqrstuvwxyz" " " |\
    tr "ABCDEFGHIJKLMNOPQRSTUVWXYZ" " " |\
    sed "s/^ //" |\
    cat > $tmp2
  set word = "`cat $tmp2`"

  # set year and page computations for just this word
  set ynow = ''
  set pnow = ''

  # Look for pure numbers, remove periods too.
  set digits = `echo "$word" | tr -dc "[:digit:]"|tr -d "."`
  if ("$digits" != '') then
    # echo "digits '$digits'"
    echo " Token ${count}: $word -> $digits"
    if (("$digits" > "$baseyear")&&("$digits" <= "$todayyear")) then
      # it could be a year
      set couldbeyear = 1
      echo " given the value ${digits}, it could be a year"
    else
      # it probably is not a year
      set couldbeyear = 0
      echo " given the value ${digits}, it probably is not a year"
    endif

    # if the string has parenthesis, it is almost certainly a year.
    # NOTE(review): $word is derived from $token, from which '(' and ')'
    # were already deleted above, so this test cannot match as written;
    # parenthesised years are picked up by the "could be a year" branch
    # below instead. Left unchanged - confirm the intent before testing
    # $ptoken here, since that would let a later year override $y.
    if (`echo "$word" | egrep '\(|\)' | wc -l` > 0) then
      echo " has parenthesis"
      # if (("$digits" > "1900")&&("$digits" <= "$todayyear")) then
      if ($couldbeyear) then
        echo " is greater than $baseyear and less or equal to $todayyear"
        echo " $digits is the year."
        set year = "$digits"
        set y = "$year"
        set ynow = "$year"
      endif
    endif

    # if the string has a dash, it is probably pages.
    # NOTE(review): the tokenizer of the foreach turns every character
    # outside [:alnum:]() into a space, so an ASCII '-' does not reach this
    # point; presumably only the non-ASCII dashes converted by the sed chain
    # above can - confirm.
    if (`echo "$word" | egrep "-" | wc -l` > 0) then
      echo " has dash"
      set fp = `echo "$word" | tr '-' '\n' | head -1`
      set sp = `echo "$word" | tr '-' '\n' | tail -1`
      # Parts could be 'pp.1602' - so purify them to be just digits
      set firstpart = `echo "$fp" | tr -dc '[:digit:]'`
      set secondpart = `echo "$sp" | tr -dc '[:digit:]'`
      echo " first part is $firstpart"
      echo " second part is $secondpart"
      echo -n " first part = $firstpart"
      if ("$firstpart" < "$secondpart") then
        echo " is less than second part = $secondpart"
      else
        echo " is greater than second part = $secondpart"
        echo " ... use it anyway\!"
      endif
      echo " $firstpart is the page."
      set page = "$firstpart"
      set p = "$page"
      set pnow = "$page"
    endif

    # If it's not a year or page, assign it to be the year or volume
    if (("$ynow" == '')&&("$pnow" == '')) then
      # Wait! Woah! It might be it was a year
      # Don't assign if year already assigned!
      if (($couldbeyear)&&("$y" == '')) then
        echo " could be a year, so assign to be year"
        set year = "$digits"
        echo " $year is the year."
        set y = "$year"
      else
        if ("$volume" == '') then
          echo " not year and volume is empty, so assign to be volume"
          # the form 22(22) implies that we should drop the issue
          # see test5
          if (`echo "$token"|grep '('|wc -c` > 0) then
            # token has a '(' in it - assume material before that
            # is the actual volume and after is the issue
            set volume = `echo "$token"|tr '(' '\n'|head -1`
            # safety test
            if ("$volume" == '') then
              set volume = "$digits"
            endif
          else
            set volume = "$digits"
          endif
          echo " $volume is the volume."
          # safety test
          if ("$volume" == '') then
            echo 'program error'
            say 'program error'
            sleep $delaybad
            exit
          endif
          set v = "$volume"
        else
          # Numbers in parenthesis are never taken as the page.
          if (("$page" == '')&&($notparenthesis)) then
            # echo "====== $word"
            # retokanize ANYTHING not a digit, to break away
            # first part of token. The form
            #   set digits = "`echo "$word"| tr -c '[:digit:]' '\n'|head -1`"
            # is no good: double quote used 4 times!
            set digits = `echo "$word"| tr -c '[:digit:]' '\n'|head -1`
            echo " year and volume assigned already, so assign to be page"
            set page = "$digits"
            echo " $page is the page."
            set p = "$page"
          endif
        endif
      endif
    endif
    echo ---
  endif
end

# ---------------------------------------------------------------------------
# Call yvp when all three parts were found, otherwise report what is missing
# ---------------------------------------------------------------------------
if (("$y" != '')&&("$v" != '')&& ("$p" != '')) then
  if !($dotoken) then
    # zap the hard-earned token
    set biggesttoken = ''
  endif
  echo "The call by yvpg to yvp is:"
  echo "$yvp $year $volume $page $biggesttoken"
  $yvp $year $volume $page $biggesttoken
  echo "---"
  echo "The call by yvpg to yvp was:"
  echo "$yvp $year $volume $page $biggesttoken"
else
  echo "no yvp because:"
  set gavemessage = 0
  if ("$y" == '') then
    echo "no year found"
    say "no year found"
    set gavemessage = 1
  endif
  if ("$v" == '') then
    echo "no volume found"
    say "no volume found"
    set gavemessage = 1
  endif
  if ("$p" == '') then
    echo "no page found"
    say "no page found"
    set gavemessage = 1
  endif
  #if (`` == 'Darwin') then
  # echo 'not found'
  # say 'not found'
  # set gavemessage = 1
  #endif
  if ($gavemessage) then
    echo "yvpg ok"
  else
    echo "no message, vypg FAILED"
    say "no message, vypg FAILED"
  endif
  sleep $delaybad
endif

# say exiting
sleep $delaygood
echo done yvpg
exit

********************************************************************************
*** END OF PROGRAM *************************************************************
********************************************************************************
********************************************************************************
********************************************************************************
********************************************************************************
Junk below this point (never read by the shell: the script exits above)

example string:
from: http://www.nsls.bnl.gov/newsroom/science/2006/12-Joshua-Tor.htm

PUBLICATION
E.J. Enemark and L. Joshua-Tor, “Mechanism of DNA translocation in a replicative hexameric helicase,” Nature 442 270-275 (2006).

********************************************************************************

# echo -n 'Give user name: '
# # read user string (only works with one line, no good for this code)
# set user = $<

Unicode dash example:

Ofria, C., Huang, W., Torng, E., 2008.
On the gradual evolution of complexity and the sudden emergence of complex features. Artif. Life 14, 255–263. ******************************************************************************** 2010 Feb 03: tokenize everything except dashes because of: sequences (RefSeq): a curated non-redundant sequence database of genomes, transcripts and proteins. Nucleic Acids Res 2007;35:D61–5. Street T O, Courtemanche N and Barrick D 2008 Protein folding and stability using denaturants Methods Cell. Biol. 84 295–325 – some old code: # foreach token (`echo $string|sed "s/\\u2013/-/g"`) # foreach token (`echo "$string"|tr ";:." " "`) # foreach token (`echo "$string"|tr ";:." " "|tr -dc '[:print:]'`) # ^ more tokanization # # Convert all forms of dash one may find into pure ascii # # See the u2a script # echo "$token" |\ # sed "s/–/-/g" |\ # sed "s/–/-/g" |\ # sed "s/\\u2013/-/g" |\ # sed 's/Ð/-/g' |\ # cat > $tmp2 # set word = "`cat $tmp2`" # # echo $word is the word # set word = "$token" # page dash from Biochemistry pdf 2009: \ # sed 's/β€šΓ„Γ/-/g' |\ # sed 's/–/-/g' |\ # pbpaste and direct transfer: \ # sed 's/Π/-/g' |\ # that failed too. \ # Mol Microb 56 1481 2005 dash of first reference \ # CANNOT WRITE THE FILE FROM VIM on Mac OSX! Sun is ok. \ # CONVERSION ERROR 327L \ # turns out to be – which is already handled \ # sed 's//-/g' |\ #yyy \ # Tokanize everything else other than dash. \ # This works on Mac OSX but not on Sun :-( \ # tr -c "[:digit:]-" " " |\ #tr -c "[:digit:]-" " " |\ #tr -c "[:digit:]-" "#" |\ # do this for the Sun OS: \ # tr ";:" " " |\ # the above is zapping the dash!!! # set word = "$token" # # Convert all forms of dash one may find into pure ascii # # See the u2a script # echo "$string" |\ # sed "s/–/-/g" |\ # sed "s/–/-/g" |\ # sed "s/\\u2013/-/g" |\ # sed 's/Ð/-/g' |\ # # Tokanize everything else other than dash. 
\ # # This works on Mac OSX but not on Sun :-( \ # tr -c "[:digit:]-" " " |\ # # do this for the Sun OS: \ # tr ";:" " " |\ # tr "abcdefghijklmnopqrstuvwxyz" " " |\ # tr "ABCDEFGHIJKLMNOPQRSTUVWXYZ" " " |\ # # tr -d "." |\ # # \ # cat > $tmp2 # set string = "`cat $tmp2`" # tr -d "." |\ # \ # foreach token (`echo "$string"|tr -d "'\n"`) # 2010 oct 01: : remove non printable characters from the string: # foreach token (`echo "$string"|tr -cd '[:print:]'`) That failed on D. Sellis and Y. Almirantis, Gene 447, 18 (2009). it kept the comma in 'Almirantis,'!