# Copyright (C) 2001,2004 Stichting LogReport Foundation logreport@logreport.org

# This file is part of Lire.

# Lire is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program (see COPYING); if not, check with
# http://www.gnu.org/copyleft/gpl.html or write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.

# Author:
#   Francis J. Lacoste <flacoste@logreport.org>

package Lire::DlfAnalysers::ReferrerCategoriser;

use strict;

use Lire::Utils qw/parse_url/;

use base qw/Lire::DlfCategoriser/;

# Constructor.  Returns a new object backed by an empty hash and blessed
# into the class the method was invoked on.  Takes no other arguments.
sub new {
    my ( $class ) = @_;

    my $self = {};
    return bless $self, $class;
}

# Returns the name of this analyser, the constant string 'www-referrer'.
sub name {
    return 'www-referrer';
}

# Returns the human-readable title of this analyser as a constant string.
sub title {
    return 'Referrer DlfAnalyser';
}

# Returns the description of this analyser: a single <para> element of
# marked-up text naming the field that is read (referer) and the fields that
# are derived from it (referring_site, search_engine, keywords).
sub description {
    my $markup = q{<para>This categoriser analyses the <structfield>referer</structfield> field to extract information about the <structfield>referring_site</structfield>, the <structfield>search_engine</structfield> and <structfield>keywords</structfield> when the request seems to come from a search engine.</para>};

    return $markup;
}

# Returns the name of the source schema, the constant string 'www'.
sub src_schema {
    return 'www';
}

# Returns the name of the destination schema, the constant string
# 'www-search'.
sub dst_schema {
    return 'www-search';
}

# Initialisation hook.  It is called with the object and a configuration
# value ( $self, $config ); this categoriser needs neither, so the method
# does nothing and returns nothing (empty list / undef).
sub initialise {
    return;
}

# Search engine recognition table.  Each row is
#
#   [ host substring, query parameter holding the keywords, engine label ]
#
# categorise() walks the rows in order and stops at the first row whose host
# substring occurs in the lower-cased referrer host and whose parameter is
# found in the query string.  Order is therefore important: 'yahoo.com' comes
# before 'google' so that google.yahoo.com is reported as Yahoo! and not as
# Google.  A host substring may be listed more than once when the engine
# uses several parameter names (see 'google').
#
# Rows that could never be selected because an earlier row already covers
# the same host and parameter (a second 'about.com' row, and 'www.google'
# with 'q') are intentionally not listed.
#
# NOTE(review): goto.com carries the same "Look Smart" label as
# looksmart.com.  This may be a copy/paste slip -- confirm before changing,
# since the label ends up in the generated reports.
my @Engine2Keywords =
  (
   [ 'yahoo.com',         'p',          'Yahoo!'            ],
   [ 'altavista.com',     'q',          'AltaVista'         ],
   [ 'google',            'q',          'Google'            ],
   [ 'google',            'query',      'Google'            ],
   [ 'aol.com',           'query',      'AOL NetFind'       ],
   [ 'eureka.com',        'q',          'Eureka'            ],
   [ 'lycos.com',         'query',      'Lycos'             ],
   [ 'hotbot.com',        'MT',         'HotBot'            ],
   [ 'msn.com',           'MT',         'Microsoft Network' ],
   [ 'infoseek.com',      'qt',         'InfoSeek'          ],
   [ 'webcrawler',        'searchText', 'WebCrawler'        ],
   [ 'excite',            'search',     'Excite'            ],
   [ 'netscape.com',      'search',     'Netscape'          ],
   [ 'mamma.com',         'query',      'Mamma'             ],
   [ 'alltheweb.com',     'query',      'All The Web'       ],
   [ 'northernlight.com', 'qr',         'Northern Light'    ],
   [ 'askjeeves.com',     'ask',        'Ask Jeeves'        ],
   [ 'looksmart.com',     'key',        'Look Smart'        ],
   [ 'goto.com',          'key',        'Look Smart'        ],
   [ 'overture.com',      'Keywords',   'Overture'          ],
   [ 'about.com',         'terms',      'About.COM'         ],
   [ 'metacrawler.com',   'general',    'Meta Crawler'      ],
   [ 'iwon.com',          'searchfor',  'iWon'              ],
  );

# Derive the referrer fields of one DLF record.
#
# Parameters:
#   $self - the categoriser object (not used by this method).
#   $dlf  - hash reference holding the record.  Its 'referer' field is read;
#           'referring_site', 'search_engine' and 'keywords' may be set.
#
# Returns nothing; the record is modified in place.
#
# Behaviour:
#   - When 'referer' is undefined the record is left untouched.
#   - When the referer contains "bookmarks" (in any case), 'referring_site'
#     is set to 'Bookmarks' and nothing else is done.
#   - Otherwise 'referring_site' is set to the host of the parsed referer.
#     When that host matches a row of @Engine2Keywords and the row's
#     parameter is found in the query string, 'search_engine' is set to the
#     row's label and 'keywords' to the decoded parameter value, unless
#     that value is empty.
sub categorise {
    my ( $self, $dlf ) = @_;

    my $referer = $dlf->{'referer'};
    return unless defined $referer;

    if ( $referer =~ /bookmarks/i ) {
        $dlf->{'referring_site'} = 'Bookmarks';
        return;
    }

    # A referer that parse_url() rejects is silently ignored.  The undef
    # check guards against parse_url() returning nothing without dying
    # (whether it can do so is not visible from here).
    my $parsed_url = eval { parse_url( $referer ) };
    return if $@ || !defined $parsed_url;

    $dlf->{'referring_site'} = $parsed_url->{'host'};
    return unless defined $parsed_url->{'host'}
      && defined $parsed_url->{'query'};

    # The table holds lower-case host substrings; lower-case the host once
    # instead of on every iteration.
    my $host  = lc $parsed_url->{'host'};
    my $query = $parsed_url->{'query'};

    foreach my $spec ( @Engine2Keywords ) {
        my ( $host_match, $param, $engine ) = @$spec;
        next if index( $host, $host_match ) == -1;

        # Prefer a parameter whose name is exactly $param, i.e. one that
        # starts the query or follows a separator.  Without this, 'q'
        # would pick up the (often empty) 'as_q=' and miss a real 'q='
        # later in the same query.
        my ( $keywords ) =
          $query =~ /(?:^|[;&])\Q$param\E=(.*?)(?:[;&]|$)/;

        # Fall back to the historical loose match so that requests such as
        #   http://www.google.com/search?as_q=&num=10&as_epq=Trippin+Smurfs
        # are still attributed to the engine even though no exact
        # parameter is present.
        ( $keywords ) = $query =~ /\Q$param\E=(.*?)(?:[;&]|$)/
          unless defined $keywords;
        next unless defined $keywords;

        # Undo URL encoding: runs of '+' become a single space, %XX
        # escapes become the corresponding character.
        $keywords =~ tr/+/ /s;
        $keywords =~ s/%([0-9a-fA-F]{2})/chr(hex $1)/eg;

        $dlf->{'search_engine'} = $engine;

        # Empty search requests set the engine only.  Test the length and
        # not the truth value, so that a search for "0" is kept.
        $dlf->{'keywords'} = $keywords
          if length $keywords;
        return;
    }

    return;
}

# keep perl happy
1;
