## package RefDB::Pubmed;
## Pubmed MEDLINE module

## markus@mhoenicka.de 2002-12-19
## $Id: Pubmed.pm,v 1.3 2003/04/30 21:38:17 mhoenicka Exp $

##   This program is free software; you can redistribute it and/or modify
##   it under the terms of the GNU General Public License as published by
##   the Free Software Foundation; either version 2 of the License, or
##   (at your option) any later version.
##   
##   This program is distributed in the hope that it will be useful,
##   but WITHOUT ANY WARRANTY; without even the implied warranty of
##   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##   GNU General Public License for more details.
   
##   You should have received a copy of the GNU General Public License
##   along with this program; if not, write to the Free Software
##   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

## Package main documentation

=head1 NAME

RefDB::Pubmed - Perl extension for converting Pubmed bibliographic data to RIS

=head1 SYNOPSIS

  use RefDB::Pubmed;
  my $infile = "-";

  my $pm = new RefDB::Pubmed;

  $pm->in($infile);

  while ((my $set = $pm->next_pubmed_set())) {
      $set->parse_pmset();
      $set->convert_pmset();
      $set->dump_pmset_as_ris();
  }


=head1 DESCRIPTION

RefDB::Pubmed allows to convert Pubmed/Medline bibliographic data to the RIS format understood by RefDB and most other bibliographic software. Data can be provided as a string or they can be read from a file/stream.

=head1 FEEDBACK

Send bug reports, questions, and comments to the refdb-users mailing list at:

refdb-users@lists.sourceforge.net

For list information and archives, please visit:

http://lists.sourceforge.net/lists/listinfo/refdb-users


=head1 AUTHOR

Markus Hoenicka, markus@mhoenicka.de

=head1 SEE ALSO

This module is part of the RefDB package, a reference manager and bibliography tool for markup languages. Please visit http://refdb.sourceforge.net for further information.

=cut

# Preloaded methods go here.

use Text::Iconv;

=head1 PMSet package

This helper package defines methods to deal with a Pubmed dataset

=cut

######################################################################
######################################################################
## defines a class to deal with a Pubmed dataset
package PMset;

=head2 new

Title   : new

Usage   : $set = new PMset();

Function: Creates a new PMset object

=cut

######################################################################
## new(): constructor for a new Pubmed dataset
######################################################################
sub new() {
    my $class = shift;
    my $self = {};

    ## the raw data
    $self->{raw} = undef;

    ## this hash receives the parsed data. The keys will be all tags
    ## that occurred in the input data, the values will be lists of
    ## the strings that were tagged with the corresponding tag
    %{$self->{data}} = ();

    ## this list receives the converted RIS data except the type
    $self->{ris} = undef;

    ## the type
    $self->{type} = "TY  - JOUR";

    ## whether ("t") or not ("f") to print unmapped tags
    $self->{print_unmapped} = "f";

    ## the iconv character encoding converter goes here
    $self->{converter} = undef;

    bless $self, $class;
    return $self;
}

=head2 parse_pmset

Title   : parse_pmset

Usage   : $set->parse_pmset();

Function: parses the raw tagged Pubmed data

=cut

######################################################################
## parse_pmset(): parses the raw Pubmed data and adds data to a hash
######################################################################
sub parse_pmset() {
    my $self = shift;

    my $poolstring = undef;
    my $tag = undef;
    my $data = undef;
    my $prevtag = undef;

    ## loop over each line of the dataset. The logic is a bit weird as we
    ## have to check for continued lines (without a tag) which should be
    ## appended to the most recent line with a tag. Therefore we pool data
    ## until the next tag line starts. Then we add the previous tag line
    ## to the hash. This means we have to add the last line after the 
    ## foreach loop is done.
    foreach my $string (@{$self->{raw}}) {
	if ($string =~ /^(.{4}- )/) {
	    $tag = $1;
	}
	else {
	    $tag = undef;
	}

	if ($string =~ /^.{4}- (.*)/) {
	    $data = $1;
	}
	else {
	    $data = $string;
	}

	# remove leading and trailing whitespace
	$data =~ s%^\s*(.*)\s*$%$1%;

	if (defined($tag) && defined($prevtag)) {
	    push (@{${$self->{data}}{"$prevtag"}}, $poolstring);
	    $prevtag = $tag;
	    $poolstring = $data;
	}
	elsif (defined($tag)) {
	    $prevtag = $tag;
	    $poolstring = $data;
	}
	elsif (defined($prevtag)) {
	    $poolstring = $poolstring . " " . $data;
	}
    }
    
    if (defined($prevtag) && defined($poolstring)) {
	push (@{${$self->{data}}{"$prevtag"}}, $poolstring);
    }
}

=head2 convert_pmset

Title   : convert_pmset()

Usage   : $set->convert_pmset()

Function: Converts the parsed data to RIS data

=cut

######################################################################
## convert_pmset(): convert the Pubmed data stored in a hash to the
##                  RIS format
######################################################################
sub convert_pmset() {
    my $self = shift;
    
    while ((my $key, my $value) = each %{$self->{data}}) {
	foreach my $string (@{$value}) {
#	    print "$key$string\n";
	    $self->_convert_tag($key, $string);
	}
    }
}

=head2 dump_pmset_as_ris

Title   : dump_pmset_as_ris

Usage   : $set->dump_pmset_as_ris()

Function: Dumps the data as a valid RIS set

=cut

######################################################################
## dump_pmset_as_ris():dumps the converted RIS data
######################################################################
sub dump_pmset_as_ris() {
    my $self = shift;

    ## start with a newline and the type tag
    print "\n$self->{type}\n";

    foreach my $string (@{$self->{ris}}) {
	print $self->{converter}->convert("$string\n");
    }

    ## end with the end ref tag
    print "ER  - \n";
}

=head2 dump_pmset_as_pm

Title   : dump_pmset_as_pm

Usage   : $set->dump_pmset_as_pm()

Function: dumps the parsed data as a Pubmed set. The result differs from the input in that each tag with its associated data is always in a single line whereas the input data may contain continued lines w/o a tag. You can use this function to normalize the Pubmed tagged data.

=cut

######################################################################
##dump_pmset_as_pm(): dumps the data from the hash. This essentially
##                    creates an equivalent of the input data with the
##                    exception that all data of a tag are on a single
##                    line (input data may contain continuation lines
##                    w/o tag)
######################################################################
sub dump_pmset_as_pm() {
    my $self = shift;
    
    while ((my $key, my $value) = each %{$self->{data}}) {
	foreach my $string (@{$value}) {
	    print $self->{converter}->convert("$key$string\n");
	}
    }
}

=head2 _convert_tag

Title   : _convert_tag

Usage   : $self->_convert_tag($key, $string);

Function: Converts a Pubmed tag line to RIS and adds the result to a list

Argument: string containing the tag, something like 'KW  - '
          string containing the data associated with the tag

=cut

######################################################################
## _convert_tag(): converts a Pubmed tag line to a RIS tag line
## Arguments: string containing the tag, something like 'KW  - '
##            string containing the data associated with the tag
######################################################################
sub _convert_tag($$) {
    my ($self, $tag, $data) = @_;

    ## this hash helps to convert month names to numbers
    my %monthnames = (
		  "Jan" => "01",
		  "Feb" => "02",
		  "Mar" => "03",
		  "Apr" => "04",
		  "May" => "05",
		  "Jun" => "06",
		  "Jul" => "07",
		  "Aug" => "08",
		  "Sep" => "09",
		  "Oct" => "10",
		  "Nov" => "11",
		  "Dec" => "12");

    ## depending on the Pubmed tag, create appropriate RIS tags and push
    ## the resulting string(s) on a list

    if ($tag eq "PG  - ") { ## pages
	if ($data =~ /(.*)-(.*)/) {
	    push (@{$self->{ris}}, "SP  - $1");
	    push (@{$self->{ris}}, "EP  - $2");
	}
	else {
	    push (@{$self->{ris}}, "SP  - $data");
	}
    }
    elsif ($tag eq "VI  - ") { ## volume
	push (@{$self->{ris}}, "VL  - $data");
    }
    elsif ($tag eq "IP  - ") { ## issue
	push (@{$self->{ris}}, "IS  - $data");
    }
    elsif ($tag eq "DP  - ") { ## publication date
	my $year = $data;
	my $month = $data;
	my $day = $data;

	## publication date is something like 2002 May 1, with day
	## and probably month being optional
	$year =~ s%^(\d{4}).*%$1%;
	$month =~ s%^\d{4}\s*(\w*).*%$1%;
	$day =~ s%^\d{4}\s*\w*\s*(.*)%$1%;

	if (defined($year) && defined($month) && defined($day)) {
	    my $nummonth = $monthnames{$month};
	    if (length($day) == 1) {
		push (@{$self->{ris}}, "PY  - $year/$nummonth/0$day/");
	    }
	    else {
		push (@{$self->{ris}}, "PY  - $year/$nummonth/$day/");
	    }
	}
	elsif (defined($year) && defined($month)) {
	    my $nummonth = $monthnames{$month};
	    push (@{$self->{ris}}, "PY  - $year/$nummonth//");
	}
	elsif (defined($year)) {
	    push (@{$self->{ris}}, "PY  - $year///");
	}
    }
    elsif ($tag eq "TI  - ") { ## article title
	push (@{$self->{ris}}, "TI  - $data");
    }
    elsif ($tag eq "AB  - ") { ## abstract
	push (@{$self->{ris}}, "N2  - $data");
    }
    elsif ($tag eq "AD  - ") { ## address
	push (@{$self->{ris}}, "AD  - $data");
    }
    elsif ($tag eq "AU  - ") { ## author
	my $risauthor = $data;

	if ($risauthor =~ m%^\w*\s+[A-Z]{2}%) {
	    $risauthor =~ s%^(\w*)\s+([A-Z]{1})([A-Z]{1})%$1,$2.$3.%;
	}
	elsif ($risauthor =~ m%^\w*\s+[A-Z]{1}%) {
	    $risauthor =~ s%^(\w*)\s+([A-Z]{1})%$1,$2.%;
	}

	push (@{$self->{ris}}, "AU  - $risauthor");
    }
    elsif ($tag eq "PT  - ") { ## publication type
	## we're dealing only with journal articles so we use the 
	## default set in the constructor. The pubtype string
	## is added as a keyword
	push (@{$self->{ris}}, "KW  - $data");
    }
    elsif ($tag eq "TA  - ") { ## journal name, abbreviated
	push (@{$self->{ris}}, "JO  - $data");
    }
    elsif ($tag eq "RN  - ") { ## chemical substance
	push (@{$self->{ris}}, "KW  - $data");
    }
    elsif ($tag eq "MH  - ") { ## MeSH
	$self->_split_mesh($data);
    }
    elsif ($tag eq "AID - ") { ## article ID
	if ($data =~ /\[doi\]/) {
	    my $doi = $data;
	    $doi =~ s/(.*) \[doi\]/$1/;
	    push (@{$self->{ris}}, "M3  - $doi");
	}
    }
    else { ## unknown or unused tag
	if ($self->{print_unmapped} eq "t") {
	    ## if the user chose to do so, print the unknown or unused
	    ## tag along with the real data, preceeded by '<unmapped>'.
	    ## Users can inspect the result, apply changes if necessary,
	    ## and then strip the data with:
	    ## grep -v '^<unmapped>' < infile > outfile
	    push (@{$self->{ris}}, "<unmapped>$tag$data");
	}
    }
}

=head2 _split_mesh

Title   : _split_mesh

Usage   : $self->_split_mesh($string);

Function: splits a Pubmed MH line into one or more RIS KW lines

Argument: string containing the data associated with the MH tag

=cut

######################################################################
## _split_mesh(): splits a Pubmed MH line into one or more RIS KW lines
## Arguments: string containing the data associated with the MH tag
######################################################################
sub _split_mesh($) {
    my ($self, $data) = @_;

    ## Pubmed MeSH entries may contain qualifiers separated by slashes
    my @tokens = split m%/%, $data;

    my $keyword = shift(@tokens);

    if ($keyword eq $data) {
	## no qualifier
	push (@{$self->{ris}}, "KW  - $keyword");
    }
    else {
	## at least one qualifier. We want a separate KW line
	## for each qualifier
	foreach $qual (@tokens) {
	    ## strip leading '*' from qualifier
	    $qual =~ s/^\*//;
	    push (@{$self->{ris}}, "KW  - $keyword [$qual]");
	}
    }
}

=head2 set_print_unmapped

Title   : set_print_unmapped

Usage   : $pm->set_print_unmapped(1)

Function: switch on or off printing of unmapped Pubmed tags

Argument: 0 (zero) to switch off or non-zero to switch on

=cut

######################################################################
## set_print_unmapped(): switch on or off printing of unmapped Pubmed tags
## Returns: the previous setting
## Argument: "f" to switch off or "t" to switch on
######################################################################
sub set_print_unmapped($) {
    my ($self, $data) = @_;

    my $prev = $self->{print_unmapped};

    $self->{print_unmapped} = $data;

    return $prev;
}

=head2 set_converter

Title   : set_converter

Usage   : $pm->set_converter("from_enc", "to_enc")

Function: creates the iconv character encoding converter to be used

Arguments: from_enc, to_enc: the encoding of the source data and of the output data, respectively

=cut

######################################################################
## set_converter(): creates the iconv character encoding converter to be used
## Returns: the converter
## Argument: from_enc, to_enc
######################################################################
sub set_converter($$) {
    my ($self, $from_enc, $to_enc) = @_;

    $self->{converter} = Text::Iconv->new($from_enc, $to_enc);
}

=head2 add_raw_line

Title   : add_raw_line

Usage   : $set->add_raw_line($_)

Function: adds a raw Pubmed line to the internal list

Argument: string containing a full Pubmed line

=cut

######################################################################
## add_raw_line(): adds a raw Pubmed line to the internal list
## Argument: string containing a full Pubmed line
######################################################################
sub add_raw_line($) {
    my ($self, $data) = @_;

    push (@{$self->{raw}}, $data);
}

=head1 Pubmed package

This package defines functions to deal with collections of Pubmed datasets

=cut

######################################################################
######################################################################
## define a class to deal with Pubmed MEDLINE input data
package RefDB::Pubmed;

use strict;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);

$VERSION = "1.2";

=head2 new

Title   : new

Usage   : $pm = new RefDB::Pubmed();

Function: Creates a new Pubmed object

=cut

######################################################################
## new(): creates a new Pubmed element
######################################################################
sub new() {
    my $class = shift;
    my $self = {};

    ## the filehandle for the input stream
    $self->{fh} = undef;

    ## whether (!0) or not (0) to print unmapped tags
    $self->{print_unmapped} = "f";

    ## array with input data lines. Use for parsing strings
    $self->{lines} = undef;

    ## save the first line of the data as it is used for a sanity
    ## check in "in()"
    $self->{firstline} = undef;

    ## iconv support
    $self->{from_enc} = "";
    $self->{to_enc} = "";

    bless $self, $class;
    return $self;
}

=head2 in

Title   : in

Usage   : $pm->in($filename)

Function: Opens a file or input stream for reading

Returns : A filehandle or undef if the stream could not be opened

Argument: The path of a file or the name of an input stream

=cut

######################################################################
## in(): opens a file for input
## Argument: string containing filename or '-' for stdin
## Returns: filehandle if successful, undef if failed
######################################################################
sub in($) {
    my $self = shift;
    my $filename = shift;

    $self->{fh} = eval { local *FH; open(FH, $filename) or die; *FH{IO}};

    if ($@) {
	$self->{fh} = undef;
	print STDERR "Could not open input stream\n";
	return undef;
    }

    my $fh = $self->{fh};

    ## skip leading empty lines and save the first data line
    while (<$fh>) {
	if ($_ ne "\n") {
	    $self->{firstline} = $_;
	    last;
	}
    }

    ## this is the idiot's validity test for Pubmed data. Each dataset
    ## that I've seen so far starts with a unique identifier
    if ($self->{firstline} =~ m/^UI/ 
	|| $self->{firstline} =~ m/^PMID/) {
	return $fh;
    }
    else {
	return undef;
    }
}

=head2 string

Title   : string

Usage   : $pm->string($string)

Function: Accepts an input string for parsing

Returns : 1 if Pubmed data, 0 if not
Argument: A string containing input data

=cut

######################################################################
## string(): Accepts an input string for parsing
## Argument: A string containing input data
######################################################################
sub string($) {
    my ($self, $string) = @_;

    @{$self->{lines}} = split /\n/, $string;

    ## skip leading empty lines and save the first data line
    while (my $line = shift @{$self->{lines}}) {
	if ($line ne "\r"&& $line ne "") {
	    $line =~ s/\r//;
	    $self->{firstline} = $line;
	    last;
	}
    }

    ## this is the idiot's validity test for Pubmed data. Each dataset
    ## that I've seen so far starts with the unique identifier
    if ($self->{firstline} =~ m/^UI/) {
	unshift(@{$self->{lines}}, $self->{firstline});
	return 1;
    }
    else {
	return 0;
    }
    
}

=head2 next_pubmed_set

Title   : next_pubmed_set

Usage   : $pm->next_pubmed_set()

Function: Reads the next Pubmed dataset

Returns : A PMset containing the raw Pubmed tagged data, or undef if no data available

=cut

######################################################################
## next_pubmed_set(): reads the next Pubmed set from the input stream
## Returns: a new PMset instance containing the raw data of the next
##          Pubmed dataset or undef if no more data are available
######################################################################
sub next_pubmed_set() {
    my $self = shift;

    my $fh = $self->{fh};

    my $set = new PMset;

    $set->set_print_unmapped($self->{print_unmapped});
    $set->set_converter($self->{from_enc}, $self->{to_enc});

    my $havedata;

    if (defined($self->{lines})) { ## read from saved string
	
	## skip leading empty lines and save the first data line
	while (defined(my $line = shift @{$self->{lines}})) {
	    if ($line ne "\r" && $line ne "") {
		## strip trailing cr
		$line =~ s/\r//;

		## add back the newline which was killed by split
		$set->add_raw_line("$line\n");
		$havedata = 1;
	    }
	    else {
		if ($havedata) {
		    last;
		}
	    }
	}
    }
    else { ## read from file handle
	## feed back saved first line
	if (defined($self->{firstline})) {
	    $set->add_raw_line($self->{firstline});
	    $self->{firstline} = undef;
	}

	return undef if eof($fh);

    
	## skip empty lines at the beginning
	while (<$fh>) {
	    if ("$_" ne "\n") {
		$set->add_raw_line($_);
		$havedata = 1;
	    }
	    else {
		if ($havedata) {
		    last;
		}
	    }
	}
    }

    if ($havedata) {
	return $set;
    }
    else {
	return undef;
    }
}

=head2 set_print_unmapped

Title   : set_print_unmapped

Usage   : $pm->set_print_unmapped(1)

Function: switch on or off printing of unmapped Pubmed tags

Argument: 0 (zero) to switch off or non-zero to switch on

=cut

######################################################################
## set_print_unmapped(): switch on or off printing of unmapped Pubmed tags
## Returns: the previous setting
## Argument: "f" to switch off or "t" to switch on
######################################################################
sub set_print_unmapped($) {
    my ($self, $data) = @_;

    my $prev = $self->{print_unmapped};

    $self->{print_unmapped} = $data;

    return $prev;
}

=head2 set_encodings

Title   : set_encodings

Usage   : $pm->set_encodings("from_enc", "to_enc")

Function: set encodings of input and output data

Argument: from_enc, to_enc according to "man iconv_open"

=cut

######################################################################
## set_encodings(): set encodings of input and output data
## Arguments: from_enc, to_enc
######################################################################
sub set_encodings($$) {
    my ($self, $from_enc, $to_enc) = @_;

    $self->{from_enc} = $from_enc;
    $self->{to_enc} = $to_enc;
}

1; ## make use happy

__END__
