#!/usr/local/bin/perl

# Pete Martin. 	March 2 2000. Changes to generate refer records that can be
#			imported into EndNote.
#			- Do not join the month to the year information
#              	- Do not wrap at 72 characters. This breaks URLs
#			- Add fields for Notes, Month and ISBN 
#			
#
$version = '0.4'; # achilles@ira.uka.de (95/05/16)
#                   - new, correct macro substitution mechanism
#version = '0.3'; # achilles@ira.uka.de (94/11/25)
#                   - added conversion of url bibtex fields to the 
#                     undefined but widely used %U refer fields
#version = '0.2'; # achilles@ira.uka.de (94/09/15)
#                   - fixed bug in line wrapping
#                   - fixed bug in entry parsing
#version = '0.1'; # achilles@ira.uka.de (94/08/16)
#                   - First version number. 
#                   - Fixed a bug in expand_field and in the parsing of the 
#                     String entries
#
# Alf-Christian Achilles (achilles@ira.uka.de)
# 93/10/25
#
# Convert bibtex to refer. 
# Run as filter or with filenames of BibTeX files on command line.
# Refer output on stdout.
#
# usage: bibtex2refer [-s] [-strings FILE] ...  FILE ...
#
# the files given with the -strings option are only scanned for abbreviation 
# definitions to be used when converting the bibtex files
#
# option -s means silence: no warnings or error messages are displayed
#
# All bibtex files must have been filtered through bibclean 
# (available at ftp://ftp.math.utah.edu/pub/tex/bib/) without 
# errors, including the files specified with the -strings option !!!
#
# Handles @string abbreviations and concatenations ('#').
# Ignores entries with crossreferences !
#
# Does *not* convert LaTeX formatting commands to roff!
#
# A list of unknown fields and unknown abbreviations will be printed on stderr.
# 
# very slow !
#

# the following string is inserted for the join '#' operator
$join_space = ' ';

%refer = ();
$refer{'author'} = "A";
$refer{'address'} = "C";
$refer{'year'} = "D";
$refer{'editor'} = "E";
$refer{'publisher'} = "I";
$refer{'journal'} = "J";
$refer{'series'} = "S";
$refer{'keywords'} = "K";
$refer{'pages'} = "P";
$refer{'title'} = "T";
$refer{'number'} = "N";
$refer{'volume'} = "V";
$refer{'booktitle'} = "B";
$refer{'type'} = "R";
$refer{'annote'} = "O";
$refer{'organization'} = "I";
$refer{'organisation'} = "I";
$refer{'abstract'} = "X";
$refer{'url'} = "U";
$refer{'notes'} = "Z";
$refer{'month'} = "8";
$refer{'isbn'}  = "@";

# This string is used to order the refer entries 
# (label, author, corporate author, title, ...)
$order = "LAQTJBRSEVNDPICKUXO8Z@";

%abbrev = ('jan', '"January"',
	   'feb', '"February"',
	   'mar', '"March"',
	   'apr', '"April"',
	   'may', '"May"',
	   'jun', '"June"',
	   'jul', '"July"',
	   'aug', '"August"',
	   'sep', '"September"',
	   'oct', '"October"',
	   'nov', '"November"',
	   'dec', '"December"',
	   'acmcs', '"ACM Computing Surveys"',
	   'acta', '"Acta Informatica"',
	   'cacm', '"Communications of the ACM"',
	   'compj', '"Computer Journal"',
	   'ibmjrd', '"IBM Journal of Research and Development"',
	   'ibmsj', '"IBM Systems Journal"',
	   'ieeese', '"IEEE Transactions on Software Engineering"',
	   'ieeetc', '"IEEE Transactions on Computers"',
	   'ieeetcad', '"IEEE Transactions on Computer-Aided Design of Integrated Circuits"',
	   'ipl', '"Information Processing Letters"',
	   'jacm', '"Journal of the ACM"',
	   'jcss', '"Journal of Computer and System Sciences"',
	   'scp', '"Science of Computer Programming"',
	   'sicomp', '"SIAM Journal on Computing"',
	   'tocs', '"ACM Transactions on Computer Systems"',
	   'tods', '"ACM Transactions on Database Systems"',
	   'tog', '"ACM Transactions on Graphics"',
	   'toms', '"ACM Transactions on Mathematical Software"',
	   'toois', '"ACM Transactions on Office Information Systems"',
	   'toplas', '"ACM Transactions on Programming Languages and Systems"',
	   'tcs', '"Theoretical Computer Science"'
	   );

@string_files = ();
$silent = 0;

while ($_ = $ARGV[0], /^-/) {
    shift;

    if (!$ARGV[0]) {
	die "Missing argument for option $_\n";
    }

    if (/^-s$/) {
	$silent = 1;
 	next;	
    }

    if (/^-strings$/) {
	if (-f $ARGV[0] && -r $ARGV[0]) {
	    push(@string_files, ($ARGV[0]));
	} else {
	    die "Abbreviation file \"$ARGV[0]\" is not a readable file, stopped at ";
	}
	shift;
	next;
    }
    
    die "Unknown option: $_\n";
}

$*=1;
$/="\n@";

foreach $file (@string_files) {
    if (!open(STRINGS,$file)) {
	die "Can not open $file\n";
    }
    while (<STRINGS>) {

	# handle @string entries
	if (/^String\{(\S+) *= *(\S(.|\n +)*)\}\n\n/) {
	    if ($abbrev{$1}) {
		print STDERR "Multiple definition of abbreviation $abb\n"  if (!silent);
	    }
	    # store abbreviation in associative array
	    $abbrev{$1} = $2;
	    # remove redundant whitespace and all newlines
	    while ($abbrev{$1} =~ s/\n|(\s\s+)/ /g) {};
	    # macros are fully expanded when read
	    &expand_value($abbrev{$1}) 
	}
    }	
    close(STRINGS);
}

%ignored_fields = ();
%ignored_abbrevs = ();

while (<>) {
    # parse the bibtex entry
    /^\w+/;
    $type = $&;
    $type =~ tr/A-Z/a-z/;

    # handle @string entries
    if ($type eq 'string') {
	if (/^String\{(\S+) *= *(\S(.|\n +)*)\}\n\n/) {
	    if ($abbrev{$1}) {
		print STDERR "Multiple definition of abbreviation $abb\n"  if (!silent);
	    }
	    # store abbreviation in associative array
	    $abbrev{$1} = $2;
	    # remove redundant whitespace and all newlines
	    while ($abbrev{$1} =~ s/\n|(\s\s+)/ /g) {};
	    # macros are fully expanded when read
	    &expand_value($abbrev{$1}) 
	}
	next;
    }
	
    # get fields of the entry
    @pfields = split(/^  (\S+) =\s+\"/);

    # if there is only one field then ignore the entry. This handles gross
    # parsing errors and especially the case at the beginnning of the input,
    # where the first parsed entry actually is the data from the beginning
    # of the input to the first entry.
    next if (scalar(@pfields) <= 1);

    # get rid of the first field which is junk
    @pfields = @pfields[1..$#pfields];

    # clean the last field of the entry ending
    $pfields[$#pfields] =~ s/,?\s*\n\}(.|\n)+@?$/,\n/;

    # convert all field names to lower case
    for ($count=0;$count <= $#pfields; $count += 2) {
	$pfields[$count] =~ tr/A-Z/a-z/;
    }

    # convert the fields array into an associative array (the keys are
    # the field names and the values are the field contents)
    %fields = @pfields;
    # process the contents of each field
    foreach (keys(%fields)) {
	# remove all newlines and redundant whitespace
	while ($fields{$_} =~ s/\n|(\s\s+)/ /g) {};
	# remove the trailing comma
	$fields{$_} =~ s/,\s*$// ;
	# remove double quotes and/or expand abbreviations and/or
 	# concatenate strings ('#')
        &expand_value($fields{$_});
	# remove double quotes
	$fields{$_} =~ s/^"|"$//g;
	# remove all whitespace at the start and the end
	$fields{$_} =~ s/(^\s+)|(\s+$)//;
    }

    # cannot handle entries with crossreferences yet.
    if ($fields{'crossref'}) {
	if ($ignored_fields{'crossref'}) {
	    $ignored_fields{'crossref'}++;
	} else {
	    $ignored_fields{'crossref'} = 1;
	}	
	next;
    }	
    	       
    # convert fields
    # Pete Martin: Removed the mangling of month/year

    # commentaries
    # Many fields for comment, notes, etc. are commonly used in
    # bibtex. Combine them all into the annote field. When combining,
    # we introduce newline again. This newlines indicate a new
    # paragraph when a field is parsed for refer printing. 
    &map_field('howpublished','annote',"\n");
    if ($field{'chapter'}) {
        $fields{'chapter'} = "Chapter: " . $fields{'chapter'};
    }
    &map_field('chapter','annote',"\n");
    &map_field('note','annote',"\n");
    &map_field('comment','annote',"\n");

    # organization: combine school and institution into 
    &map_field('school','organization',",");
    &map_field('institution','organization',",");


    # address: combine address and location
    &map_field('location','address',",");

    # typos
    &map_field('keyword','keywords',",");
    &map_field('kwd','keywords',",");
    &map_field('kwds','keywords',",");

    if ($type eq 'mastersthesis') {
	$fields{'type'} = "M.S. Thesis " . $fields{'type'};
    }
    if ($type eq 'phdthesis') {
	$fields{'type'} = "Ph.D. Thesis " . $fields{'type'};
    }
    if (($type eq 'techreport')) {
	if (!$fields{'type'}) {
	    $fields{'type'} = "Technical Report";	
	}
	if ($fields{'number'}) {
	    $fields{'type'} .= ' ' . $fields{'number'};
	}
	$fields{'number'} = "";
    }

    # Pete Martin. Add publisher_address to publisher
    $fields{'publisher'} .= ' ' . $fields{'publisher_address'};



    # sort fields according to the order string
    @keys = sort {index($order,$refer{$a}) <=> 
		      index($order,$refer{$b})}
	       keys(%fields);

    # process each key for printing in refer format
    foreach (@keys) {
	if ($fields{$_}) {
	    if ($_ eq 'author') {
		# authors need special treatment to create one refer field
		# per author
		foreach (split(/ and /,$fields{'author'})) {		    
		    print '%' . $refer{'author'} . " $_\n";
		}
		next;
	    } elsif ($refer{$_}) {
		    # print the field in refer format while
		    # maintaining paragraphs indicated by newlines
		    $prefix = '%' . $refer{$_} . ' ';
		    foreach $field (split("\n",$fields{$_})) {
			$field =~ s/\n$//;
			$field =~ s/(^\s+)|(\s+$)//;
			# if the whole field is surrounded by matching braces,
			# then remove them
			# this is an expensive operation !
			#$field =~ s/^{([^{}]*({[^{}]*}[^{}]*)*)}$/\1/;
			# Pete Martin: Simpler filter to get rid of braces
			$field =~ s/\{//g;
			$field =~ s/\}//g;
			while (length($field) > 172) {
			    if (($wrap = rindex($field," ",172)) < 0) {
		 	        $wrap = index($field," ",172);
                            }	
			    print $prefix;
			    print substr($field,0,$wrap);
			    print "\n" if ($field !~ /\n$/);
			    $field = substr($field,$wrap+1);
			    $prefix = '   ';
			}
			print $prefix . $field;
			print "\n" if ($field !~ /\n$/);
		    }
	    } else {
		# unknown bibtex field
		if ($ignored_fields{$_}) {
		    $ignored_fields{$_}++;
		} else {
		    $ignored_fields{$_} = 1;
		}
	    }
	}
    }
    # extra newline after refer entry
    print "\n";
}
foreach (keys(%ignored_fields)) {
    print STDERR "Unmapped field: $_ (" . $ignored_fields{$_} . " occurrences)\n" if (!silent);
}
foreach (keys(%ignored_abbrevs)) {
    print STDERR "Undefined abbreviation: $_ (" . $ignored_abbrevs{$_} . " occurrences)\n" if (!silent);
}

sub map_field {
    # args: field to be mapped, field onto which to map, separator(optional)
    # map if field valid
    if ($fields{$_[0]}) {
	# append separator if given and necessary
	if ($fields{$_[1]} && (scalar(@_) > 2)) {
	    $fields{$_[1]} .= $_[2];
	}
	# append mapped field
	$fields{$_[1]} .= $fields{$_[0]};
	# invalidate the mapped field
	$fields{$_[0]} = "";
    }
}	
    

sub expand_value {
    # introduce a fake empty string at the beginning to get things started
    $_[0] = '"" # ' . $_[0];

    # substitute macros that follow a text
    while ($_[0] =~ s/(" # )([^\" \t\n#,]+)( # |$)/$1 . &abbrev($2) . $3/eg) {};

    # get rid of the dummy empty string
    $_[0] =~ s/^\"\" # //;

    # unexpand the strings with the magic cookies
    $_[0] =~ s/\"\000([^\000#, \t]+)\000\"/$1/g;

    # remove join operator between string values
    $_[0] =~ s/\" # \"/$join_space/og;

}

sub abbrev {
    if (defined($abbrev{$_[0]})) {
print "Found abbreviation for ", $_[0], " which is ", $abbrev{$_[0]};

	return $abbrev{$_[0]};
    } else {
	if ($ignored_abbrevs{$_[0]}) {
	    $ignored_abbrevs{$_[0]}++;
	} else {		
	    $ignored_abbrevs{$_[0]} = 1;
	}
	return "\"\000$_[0]\000\"";
    }
}

# for require
1;
