program sortbibtex(fin, sortbibtexp, fout, output); (* sortbibtex: sort a bibtex database Dr. Thomas D. Schneider National Institutes of Health National Cancer Institute Center for Cancer Research Nanobiology Program Molecular Information Theory Group Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu (use only if first address fails) http://www.ccrnp.ncifcrf.gov/~toms/ *) label 1; (* end of program *) const (* begin module version *) version = 2.29; (* of sortbibtex.p 2007 Jun 14 2007 Jun 14, 2.29: remove extra blanks between entries 2007 Jun 14, 2.28: remove duplicates 2005 May 11, 2.27: Make compatable with the Gnu Pascal Compiler (GPC) Mostly this is that GPC cannot read into packed arrays, so I switched to unpacked arrays. Hopefully there is enough memory to do the sort! 1997 Apr 15, 2.14: upgrade to sort on year forwards or backwards origin 1990 May 2 *) updateversion = 2.25; (* defines lowest acceptable current parameter file *) (* end module version *) (* begin module describe.sortbibtex *) (* name sortbibtex: sort a bibtex database synopsis sortbibtex(fin: in, sortbibtexp: in, fout: out, output: out) files fin: a bibtex database Each entry is identified by the '@' symbol; all other lines are ignored. This means that if you have blanks on lines between entries, the entries will be fused together. To avoid this, pass the database through the rembla program first. sortbibtexp: parameters to control the program. The file must contain the following parameters, one per line: The version number of the program. This allows the user to be warned if an old parameter file is used. sortcontrol: the first character on the line controls the sorting. k(ey): sort on key y(ear): sort on year r(everse year): sort on year in reverse order numbered: if the first character is 'n', number the entries in increasing year order. removeduplicates: if the first character is 'r', remove duplicate entries. If two entries are not the same data will be lost. fout: bibtex database sorted by the key output: messages to the user, including errors in the structure of the database and duplicate entries. description Sort a BibTeX database by the citation keys. If you want to collect unverified or incomplete references in a raw database as BiBTeX format, you can replace the "@" of an entry with a "#". Sortbibtex will drop these entries, giving a functional database. examples example of sortbibtexp: 2.14 version of sortbibtex that this parameter file is designed for. y sortcontrol: k(ey), y(ear), r(everse year) n numbered: n means number documentation see also rembla.p author Thomas Dana Schneider bugs Entries are defined by blank lines. Use rembla to make sure that there are no extra spaces on the ends of lines. technical notes *) (* end module describe.sortbibtex *) (* const *) mapmax = 200000; (* largest number of entries that can be handled by the program. It determines the size of the map array. *) linewidth = 80; (* maximum width of lines in the file (one should never exceed 80 characters. This is for safety of transportation of files on tape and over the net. It also avoids confusion with wrapped lines. *) type position = 0..mapmax; (* somewhere on the map. note: position 0 is not used, but it allows the quicksort to function properly *) lineptr = ^entryline; {GPC cannot read into packed arrays entryline = packed record (* a line of an entry *) } entryline = record (* a line of an entry *) string: array[1..linewidth] of char; (* a character string *) stringlength: integer; (* length of the string *) next: lineptr; (* pointer to the next line *) end; entryptr = ^entry; (* pointer to an entry *) {GPC cannot read into packed arrays entry = packed record (* entry of a bibtex database *) } entry = record (* entry of a bibtex database *) key: array[1..linewidth] of char; (* the citation key to sort on *) yearkey: integer; (* the year taken from the key (for sorting) *) yearentry: integer; (* the year taken from the entry (for sorting) *) line: lineptr; (* the lines of the entry *) number: integer; (* the entry number *) end; var fin, sortbibtexp, fout: text; (* files used by this program *) (* the entire set of entries read in *) {GPC cannot read into packed arrays map: packed array[1..mapmax] of entryptr; } map: array[1..mapmax] of entryptr; sortcontrol: char; (* k(ey), y(ear), r(everse year) *) (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 'prgmod 4.05 89 Aug 28 tds'; *) procedure writekey(var f: text; e: entryptr); (* write the key of the entry e *) var k: integer; (* position in a key *) begin k := 1; while e^.key[k] <> ' ' do begin write(f,e^.key[k]); k := k + 1; end; end; function lessthan(alow, blow: position): boolean; (* Is the entry at alow less than (before, alphabetically or by year) the entry at blow? *) var a,b: char; (* characters in the two keys *) done: boolean; (* are we done yet? *) k: integer; (* position in a key *) sortit: char; (* intermediate sort control *) begin if sortcontrol = 'k' then sortit := sortcontrol else begin (* if the years are the same, or one is 0 (no year) then sort on the key *) if (map[alow]^.yearentry = map[blow]^.yearentry) or (map[alow]^.yearentry = 0) or (map[blow]^.yearentry = 0) then sortit := 'k' else sortit := sortcontrol; if (map[alow]^.yearentry = 0) and (map[blow]^.yearentry <> 0) then sortit := 'a'; if (map[blow]^.yearentry = 0) and (map[alow]^.yearentry <> 0) then sortit := 'b'; end; { writeln('sortcontrol=',sortcontrol); writeln('sortit=',sortit); } { writeln(output,'-----------'); write(output,'sortit=',sortit); if alow = blow then write(output,' EQUAL') else write(output,' -----'); write(output,' alow=',alow:2); write(output,' blow=',blow:2); write(output,' "'); writekey(output,map[alow]); write(output,' " vs "'); writekey(output,map[blow]); writeln(output,'"'); } case sortit of 'a': begin lessthan := true end; 'b': begin lessthan := false end; 'y': begin if map[alow]^.yearentry < map[blow]^.yearentry then lessthan := true else lessthan := false; end; 'r': begin if map[alow]^.yearentry < map[blow]^.yearentry then lessthan := false else lessthan := true; end; 'k': begin done := false; k := 0; { writeln(output,'lessthan:'); write(output,'"'); writekey(output,map[alow]); write(output,'" vs "'); writekey(output,map[blow]); writeln(output,'"'); } while not done do begin k := succ(k); a := map[alow]^.key[k]; b := map[blow]^.key[k]; { writeln(output,'a=',a,' b=',b); } if (a = ' ') and (b = ' ') then begin { writeln(output,'identical entry keys: '); writekey(output,map[alow]); } done := true; lessthan := false end; if not done then if (a = ' ') or (b = ' ') then begin done := true; if a = ' ' then lessthan := true else lessthan := false; end else begin if ord(a) < ord(b) then begin { writekey(output,map[alow]); write(output,' < '); writekey(output,map[blow]); writeln(output); } lessthan := true; done := true; end; if ord(a) > ord(b) then begin { writekey(output,map[alow]); write(output,' > '); writekey(output,map[blow]); writeln(output); } lessthan := false; done := true; end end end; end; end; end; procedure swap(a, b: position); (* switch positions a and b *) var hold: entryptr; begin hold:=map[a]; map[a]:=map[b]; map[b]:=hold end; (* begin module quicksort *) procedure quicksort(left, right: position); (* quick sort a list between positions left and right, into ascending order. a position is simply a scalar of the form 0..max. the array to be sorted is dimensioned 1..max. (the difference in the ranges is important to the correct operation of the sort...) two external routines are used: function lessthan(a, b: position): boolean is a generalized test for value-at-a < value-at-b. procedure swap(a, b: position) switches the items at positions a and b. since these routines are external, the procedure is general. this procedure taken from the book 'algorithms + data structures = programs' by niklaus wirth, prentice-hall, inc., englewood cliffs, n.j.(1976), pp. 76-82 *) var lower, upper: position; (* the positions looked at currently *) center: position; (* the rough center of the region being sorted *) begin lower := left; center := (left + right) div 2; upper := right; repeat while lessthan(lower, center) do lower := succ(lower); while lessthan(center, upper) do upper := pred(upper); if lower <= upper then begin (* keep track of the center through the map: *) if lower = center then center:=upper else if upper = center then center:=lower; swap(lower, upper); lower := succ(lower); upper := pred(upper) end until lower > upper; if left < upper then quicksort(left, upper); if lower < right then quicksort(lower, right) end; (* end module quicksort version = 'prgmod 4.05 89 Aug 28 tds'; *) procedure readline(var f: text; var l: lineptr; linenumber: integer); (* read a bibtex line from f into l. The line number is linenumber *) var c: char; (* a character read in. GPC cannot read directly into the string *) begin new(l); with l^ do begin stringlength := 0; while (not eoln(f)) and (stringlength < linewidth) do begin stringlength := stringlength + 1; (* read(f,l^.string[stringlength]); *) read(f,c); l^.string[stringlength] := c; if (stringlength = linewidth) and not eoln(f) then begin writeln(output,'line ',linenumber:1,' is longer than ', linewidth:1,' characters. Make it two lines.'); end; end; readln(f); next := nil; end; end; procedure readentry(var f: text; var linenumber: integer; var e: entryptr); (* read a bibtex entry from f into e, keep track of the current line number in linenumber. If the end of file is found, e is nil. *) var c: char; (* a character in the key of the entry *) l: lineptr; (* pointer to a line of text *) i: integer; (* index to a line of an entry *) numberlength: integer; (* the length of the number *) numbers: set of '0'..'9'; p: integer; (* position on a line *) pkey: integer; (* position on a line that the key starts *) begin numbers:=['0','1','2','3','4','5','6','7','8','9']; new(e); with e^ do begin (* read the entry in *) (* locate the start of the entry *) while not eof(f) and (f^<>'@') do begin readln(f); linenumber := linenumber + 1; end; if not eof(f) then begin (* read the first line in *) readline(f,line,linenumber); linenumber := linenumber + 1; (* find the citation key on the line *) p := 1; while line^.string[p] <> '{' do begin if p = linewidth then begin writeln(output, 'line ',linenumber:1,' is missing the "{"'); halt end; p := p + 1; end; p := p + 1; pkey := p; (* read in the key *) yearkey := 0; numberlength := 0; while line^.string[p] <> ',' do begin if p = linewidth then begin writeln(output, 'line ',linenumber:1,' is missing the ","'); halt end; c := line^.string[p]; key[p-pkey+1] := c; (* pull out year as a number *) if c in numbers then begin yearkey := 10*yearkey + ord(c)-ord('0'); numberlength := succ(numberlength) end else begin (* if the yearkey is not THE LAST 4 characters, kill it: *) yearkey := 0; end; p := p + 1; end; if numberlength < 4 then yearkey := 0; (* finish the key with a blank character *) key[p-pkey+1] := ' '; { write(output,'line ',linenumber:1,' "'); writekey(output,e); writeln(output,'"'); } (* read the rest of the entry *) l := line; while (not eof(f)) and (not eoln(f)) do begin linenumber := linenumber + 1; readline(f,l^.next,linenumber); if not eof(f) then l := l^.next; end; (* redo year from rest of entry *) l := line; yearentry := 0; (* look for 'year' *) while l <> nil do with l^ do begin i := 1; (* skip blanks *) while (i < stringlength) and (string[i] = ' ') do i := i+1; if string[i] = 'y' then if string[i+1] = 'e' then if string[i+2] = 'a' then if string[i+3] = 'r' then begin (* skip blanks *) i := i + 4; while (i < stringlength) and (string[i] = ' ') do i := i+1; if string[i] = '=' then begin i := i + 1; while (i < stringlength) and (string[i] = ' ') do i := i+1; if string[i] = '"' then i := i + 1; numberlength := 0; while (string[i] in numbers) and (numberlength <= 4) do begin c := string[i]; yearentry := 10*yearentry + ord(c)-ord('0'); numberlength := succ(numberlength); i := i+1; end; if numberlength <> 4 then begin writeln(output,'bad year found in:'); writekey(output,e); writeln(output); end; end; end; l := l^.next end; { writeln(output,yearentry:1, ' ',yearkey:1); } if (yearentry <> yearkey) and (yearkey <> 0) then begin writeln(output,'year in entry not equal to year in key in:'); writekey(output,e); writeln(output); end; end else begin (* there was junk at the end of the file, clear out this entry it is meaningless *) dispose(e); e := nil end end; end; procedure writeentry(var f: text; e: entryptr); (* write the entry e to f *) var l: lineptr; (* pointer to a line of text *) p: integer; (* position on a line *) begin l := e^.line; while l <> nil do with l^ do begin for p := 1 to stringlength do write(f,string[p]); writeln(f); l := l^.next end end; procedure showentries(var afile: text; entries: integer); (* show all entries *) var e: position; (* index to the entries *) begin for e := 1 to entries do begin with map[e]^ do begin if yearentry <> 0 then write(afile, yearentry:4,' ') else write(afile, ' ':5); end; writekey(afile, map[e]); writeln(afile); end; end; (* begin module sortbibtex.themain *) procedure themain(var fin, fout: text); (* the main procedure of the program *) var linenumber: integer; (* line number in the file *) entries: integer; (* counter of the entries read in *) numbered: char; (* if 'n' number the entries *) e: position; (* index to the entries *) parameterversion: real; (* parameter version number *) removeduplicates: char; (* if the first character is 'r', remove duplicate entries. If two entries are not the same data will be lost. *) procedure giveentry; (* give the entry with or without blanks between *) begin writeentry(fout,map[e]); if e <> entries then writeln(fout) (* space between entries *) end; begin writeln(output,'sortbibtex ',version:4:2); reset(sortbibtexp); readln(sortbibtexp, parameterversion); if parameterversion < updateversion then begin writeln(output, 'You have an old parameter file!'); halt end; readln(sortbibtexp, sortcontrol); if not (sortcontrol in ['k','y','r']) then begin writeln(output,'sortcontrol must be one of k, y, r'); halt; end; readln(sortbibtexp, numbered); readln(sortbibtexp, removeduplicates); reset(fin); rewrite(fout); linenumber := 0; entries := 0; while not eof(fin) do begin entries := entries + 1; readentry(fin,linenumber,map[entries]); if map[entries] = nil then entries := entries - 1 end; writeln(output,entries:1,' entries read'); quicksort(1,entries); showentries(output,entries); { for e := 2 to entries do begin if not(lessthan(e, e-1)) and not(lessthan(e-1, e)) then begin (* must be identical! *) write(output,'duplicate entry: '); writekey(output,map[e]); writeln(output); end end; } (* put the entire set of entries into fout *) for e := 1 to entries do begin if numbered = 'n' then begin if sortcontrol = 'r' then writeln(fout,'% ',(entries-e+1):1) else writeln(fout,'% ',e:1); end; if e > 1 then begin if removeduplicates = 'r' then begin if not(lessthan(e, e-1)) and not(lessthan(e-1, e)) then begin (* must be identical! *) write(output,'duplicate entry: '); writekey(output,map[e]); writeln(output); end else giveentry end else giveentry end else giveentry; { else writeentry(fout,map[e]); end else writeentry(fout,map[e]); end else writeentry(fout,map[e]); if e <> entries then writeln(fout) (* space between entries *) } end; end; (* end module sortbibtex.themain *) begin themain(fin, fout); 1: end.