#===========================================================================

package Sitescooper::ForkHTTPClient;

use strict;

require Exporter;
use Carp;
use Socket;
use FileHandle;
use IO::Handle;
use IO::Select;

use HTTP::Response;
use HTTP::Headers;

use Sitescooper::HTTPClient;
use Sitescooper::ForkedPasswordAsker;

use vars qw(
	$STATE_BUSY $STATE_READY $DBG_PROTOCOL
	@ISA @EXPORT $VERSION
	);

# $DBG_PROTOCOL	= 1;

@ISA = qw(Sitescooper::HTTPClient);
@EXPORT= qw();
$VERSION = "0.1";
sub Version { $VERSION; }

$STATE_READY    = 1;
$STATE_BUSY     = 2;

sub new {
  my $class = shift; $class = ref($class) || $class;
  my $scoop = shift;
  croak "scoop not defd" unless defined ($scoop);
  my $num_procs = shift;

  my $self = {
    'scoop'		=> $scoop,
    'num'		=> $num_procs,
    'loaders'		=> [ ],
  };

  $self->{DBG_PROT} = $DBG_PROTOCOL;

  bless ($self, $class);
  $self;
}

sub init {
  my ($self) = @_;

  my $i;
  for ($i = 0; $i < $self->{num}; $i++)
  {
    # set up the pipes using socketpair(). Phew, for a moment
    # I thought I'd have to hack IPC::Open3 ;)
    #
    my $fh = new FileHandle;
    socketpair($fh, FORKHTTP_PARENT, AF_UNIX,
    					SOCK_STREAM, PF_UNSPEC)
			        or  die "socketpair: $!";

    my $pid = fork;
    if ($pid == 0) {
      FORKHTTP_PARENT->autoflush(1);
      close $fh;
      $self->run();

    } else {
      die "cannot fork: $!" unless defined $pid;
      $self->{scoop}->dbg ("started HTTP handling process $pid");
      close FORKHTTP_PARENT;

      my $ldr = { };
      $ldr->{pid} = $pid;

      $fh->autoflush(1);
      $ldr->{fh} = $fh;

      $ldr->{state} = $STATE_READY;
      ${$self->{loaders}}[$i] = $ldr;
      $self->test_loader ($fh);
    }
  }
}

sub DESTROY {
  my $self = shift;

  if ($#{$self->{loaders}} < 0) {
    return;	# a subprocess

  } else {
    my $i;
    for ($i = 0; $i < $self->{num}; $i++) {
      my $ldr = ${$self->{loaders}}[$i];
      my $fh = $ldr->{fh};
      print $fh "QUIT\n";
      my $resp = <$fh>;
    }
  }
}

sub get_max_active_requests {
  my ($self) = @_;
  $self->{num};
}

sub can_preload {
  my ($self) = @_;
  1;
}

sub test_loader {
  my ($self, $fh) = @_;

  my $str = "TEST\n";
  print $fh $str;
  $self->{DBG_PROT} and $self->{scoop}->dbg (">".$str);

  $_ = <$fh>;
  $self->{DBG_PROT} and $self->{scoop}->dbg ("<".$_);
  /^200 test ok/ or die ("bad response from httpclient to TEST: $_");
}

# ===========================================================================
# HTTPClient client (sitescooper process) code:

sub start_get {
  my ($self, $ref, $url, $lastmod, $is_dynamic) = @_;

  my $i;
  for ($i = 0; $i < $self->{num}; $i++)
  {
    next unless (${$self->{loaders}}[$i]->{state} ==
    			$STATE_READY);
    my $ldr = ${$self->{loaders}}[$i];
    $self->start_httpclient_with_loader ($ldr, $ref, $url, $lastmod, $is_dynamic);

    my $state = new Sitescooper::HTTPRequestState();
    $state->{loader} = $ldr;
    return $state;
  }
  undef;
}

sub start_httpclient_with_loader {
  my ($self, $ldr, $ref, $url, $lastmod, $is_dynamic) = @_;

  my $outfh = $ldr->{fh};
  my $infh = $ldr->{fh};
  $lastmod ||= "";

  my $str = "GET <referrer>$ref</referrer> <url>$url</url> ".
  		"<lastmod>$lastmod</lastmod> <isdyn>$is_dynamic</isdyn>\n";
  syswrite $outfh, $str, length $str;
  $self->{DBG_PROT} and $self->{scoop}->dbg (">".$str);

  $_ = getline ($infh);
  $self->{DBG_PROT} and $self->{scoop}->dbg ("<".$_);

  /^300 / or die ("bad response from httpclient after GET: $_");
  $ldr->{state} = $STATE_BUSY;
  1;
}

sub get_waiting_fh {
  my ($self, $state) = @_;
  my $ldr = $state->{loader};

  # if it's not in the BUSY state, something's gone wrong
  if ($ldr->{state} != $STATE_BUSY) {
    croak "loader not in STATE_BUSY";
  }
  $ldr->{fh};
}

sub ready_to_finish {
  my ($self, $state) = @_;
  my $ldr = $state->{loader};

  # if it's not in the BUSY state, something's gone wrong
  if ($ldr->{state} != $STATE_BUSY) {
    croak "loader not in STATE_BUSY";
  }

  my $sel = new IO::Select();
  $sel->add ($ldr->{fh});
  my @ready;
  if ((@ready = $sel->can_read (0.0)) && $#ready >= 0) { return 1; }
  0;
}

sub finish_get {
  my ($self, $state) = @_;

  my $ldr = $state->{loader};
  if ($ldr->{state} != $STATE_BUSY) {
    croak "loader not in STATE_BUSY";
  }
  my $resp = $self->finish_httpclient_for_loader ($ldr);
  $ldr->{state} = $STATE_READY;
  $resp;
}

sub finish_httpclient_for_loader {
  my ($self, $ldr) = @_;
  local ($_);
  my $infh = $ldr->{fh};
  my $outfh = $ldr->{fh};

  $_ = getline ($infh);
  $self->{DBG_PROT} and $self->{scoop}->dbg ("<".$_);
  if (s/^301 Need-Credentials://) {
    my $realm = undef;
    my $uri = undef;
    my $proxy = undef;

    s,<realm>(.*)</realm>,,g and $realm = $1;
    s,<uri>(.*)</uri>,,g and $uri = $1;
    s,<proxy>(.*)</proxy>,,g and $proxy = $1;

    if (!defined $realm || !defined $uri || !defined $proxy) {
      die ("bad response from httpclient in Need-Creds: $_");
    }

    my ($user, $pass) = 
    	$self->{scoop}->{useragent}->get_basic_credentials($realm, $uri, $proxy);
    my $str = "Provide-Credentials <user>$user</user> <pass>$pass</pass>\n";
    syswrite $outfh, $str, length $str;
    $_ = getline ($infh);
    $self->{DBG_PROT} and $self->{scoop}->dbg ("<".$_);
    /^302 / or die ("bad response from httpclient after Prov-Creds: $_");
    $ldr->{state} = $STATE_BUSY;
    return;
    # back to the select() call...

  } elsif (s/^201 //) {
    $_ = getline ($infh);
    $self->{DBG_PROT} and $self->{scoop}->dbg ("<".$_);
    /^Preload-length: (\d+)/ or
    	die ("bad response from httpclient, no Preload-length header: $_");
    my $len = $1+0;
    my $got = 0;
    my $respdata;
    while ($got < $len) {
      $got += sysread ($infh, $respdata, $len, $got);
    }
    $_ = getline ($infh);
    $self->{DBG_PROT} and $self->{scoop}->dbg ("<".$_);
    /^200 / or die ("bad response from httpclient, no 200: $_");
    return $self->data_to_response ($respdata);

  } else {
    die ("bad response from httpclient: $_");
  }
}

sub data_to_response {
  my ($self, $data) = @_;
  local ($_);

  my ($rc, $msg, %hdrs);

  $data =~ s/^(\d+) (.*?)\n//s;
  $rc = $1; $msg = $2;

  %hdrs = ();
  my $lasthdrname = "";
  while ($data =~ s/^(.*?)\n//s) {
    $_ = $1;
    /^\r?$/ and last;
    s/\r?$//;
    if (/^\s/) {
      $hdrs{$lasthdrname} .= $_; next;
    }
    /^(\S+): (.*)/ or die "bad HTTP hdr \"$_\"\n";

    $hdrs{$1} = $2; $lasthdrname = $1;
  }

  if (!defined $rc || !defined $msg || !defined $data)
  {
    die ("bad http message from httpclient: $data");
  }

  if ($rc == 504 && $msg == 'HTTPClient timed out') {
    return undef;		# timed out!
  }

  my $header = new HTTP::Headers;
  $header->header (%hdrs);
  my $resp = new HTTP::Response ($rc, $msg, $header, $data);
  $resp;
}

# ===========================================================================
# HTTPClient server (subprocess) code:

sub run {
  my ($self) = @_;
  local ($_);
  my $sofar = 0;

  my $asker = new Sitescooper::ForkedPasswordAsker();
  $self->{scoop}->{useragent}->set_password_asker ($asker);

  # we have to use sysread here, as we can't use <FORKHTTP_PARENT>
  # to read; buffering will screw up the IPC. TODO: write an efficient
  # fgets()-style read interface here :(
  while (defined ($_ = getline (\*FORKHTTP_PARENT))) {
    $self->{DBG_PROT} and $self->{scoop}->dbg ("handler$$> $_");

    if (/^TEST$/) {
      my $str = "200 test ok\n";
      syswrite FORKHTTP_PARENT, $str, length $str;

    } elsif (s/^GET //) {
      my ($referrer, $url, $lastmod, $isdyn);

      s,<referrer>(.*)</referrer>,,g and $referrer = $1;
      s,<url>(.*)</url>,,g and $url = $1;
      s,<lastmod>(.*)</lastmod>,,g and $lastmod = $1;
      s,<isdyn>(.*)</isdyn>,,g and $isdyn = $1;
      if (defined $lastmod && $lastmod eq '') { $lastmod = undef; }
      
      if (!defined $url) {
	$self->reply_with_err (401, "missing parameter",
		"A parameter was missing from GET request");
	next;
      }
      my $req = Sitescooper::LWPHTTPClient::make_http_request
					  ($referrer, $url, $lastmod, $isdyn);

      # use syswrite so the write is definitely not buffered; otherwise
      # if the URL is a file etc. this write will get mingled with the
      # 201 got message.
      my $str = "300 starting\n";
      syswrite FORKHTTP_PARENT, $str, length $str;

      my $timeout = 10;
      my $resp = Sitescooper::LWPHTTPClient::invoke_http_request
      						($req, $timeout);

      if (!defined $resp) {
	$self->timed_out ();
      }

      $self->reply_with_got ($resp->status_line . "\n".
		  $resp->headers_as_string . "\n". $resp->content);

    } elsif (s/^SETCRED //) {
      my $realm = undef;
      my $user = undef;
      my $pass = undef;

      s,<realm>(.*)</realm>,,g and $realm = $1;
      s,<user>(.*)</user>,,g and $user = $1;
      s,<pass>(.*)</pass>,,g and $pass = $1;

      if (!defined $realm || !defined $user || !defined $pass) {
	$self->reply_with_err (401, "missing parameter",
		"A parameter was missing from SETCRED request");
	next;
      }
      $self->{scoop}->{useragent}->set_credential ($realm, $user, $pass);

    } elsif (s/^CLRCRED //) {
      my $realm = undef;

      s,<realm>(.*)</realm>,,g and $realm = $1;
      
      if (!defined $realm) {
	$self->reply_with_err (401, "missing parameter",
		"A parameter was missing from CLRCRED request");
	next;
      }
      $self->{scoop}->{useragent}->clear_credential ($realm);

    } elsif (s/^QUIT//) {
      my $str = "200 bye then\n";
      syswrite FORKHTTP_PARENT, $str, length $str;
      exit;

    } else {
      warn "HTTPClient: got funny line: $_"; next;
    }
  }
  die "HTTP handler $$: sitescooper process closed connection\n";
}

sub timed_out {
  my ($self) = @_;

  $self->reply_with_got ("504 HTTPClient timed out\n\n".
	"<html><head><title>504 HTTPClient timed out</title></head>".
	"<body><h1>504 HTTPClient timed out</h1></body></html>\n");
}

sub reply_with_got {
  my ($self, $repl) = @_;

  my $str = "201 got\n".
  	"Preload-length: ".(length $repl)."\n";
  syswrite FORKHTTP_PARENT, $str, length $str;
  syswrite FORKHTTP_PARENT, $repl, length $repl;
  $str = "200 ok\n";
  syswrite FORKHTTP_PARENT, $str, length $str;
  $self->{DBG_PROT} and $self->{scoop}->dbg ("handler$$>".$str.$repl."200 ok\n");
}

sub reply_with_err {
  my ($self, $code, $err, $repl) = @_;

  my $str = "201 $err\n".
  	"Preload-length: ".(length $repl)."\n";
  syswrite (FORKHTTP_PARENT, $str, length($str));
  syswrite (FORKHTTP_PARENT, $repl, length($repl));
  $str = "200 ok\n";
  syswrite (FORKHTTP_PARENT, $str, length($str));
  $self->{DBG_PROT} and $self->{scoop}->dbg ("handler$$>".$str.$repl."200 ok\n");
}

sub getline {
  my ($fh) = @_;
  local ($_);
  my $sofar = 0;

  while (sysread ($fh, $_, 1, $sofar) > 0) {
    if (substr ($_, $sofar, 1) ne "\n") {
      $sofar++;
      next;
    }

    $sofar = 0;
    return $_;
  }

  undef;
}

1;

=pod

=head1 PROTOCOL

The ForkHTTPClient protocol used between the forked httpclient (p)
and the main sitescooper process (s) looks like this:

s: GET <referrer>http://foo</referrer> <url>http://url</url> <lastmod>93458345</lastmod> <isdyn>0</isdyn>
p: 300 starting
... time passes...
p: 201 got
p: Preload-length: [len]
p: [http response including status line and headers]
p: 200 ok

Repeat ad infinitum until (s) closes the connection. The <lastmod>...</lastmod>
parameter is optional.

Also supported:

s: SETCRED <realm>realm</realm> <user>user</user> <pass>pass</pass>
p: 200 credentials set ok
s: CLRCRED <realm>realm</realm>
p: 200 credentials cleared ok

to set credentials before a request is started, and

s: GET <referrer>http://foo</referrer> <url>http://url</url> <lastmod>93458345</lastmod> <isdyn>0</isdyn>
p: 300 starting
... time passes...
p: 301 Need-Credentials: <realm>realm</realm> <uri>uri</uri> <proxy>proxy</proxy>
s: Provide-Credentials <user>user</user> <pass>pass</pass>
p: 302 ok
... more time passes...
p: 201 got
p: Preload-length: [len]
p: [http response including status line and headers]
p: 200 ok

if credentials are called for while a request is in progress.

P can return the usual set of error codes (400-499, 500-599) if
an error condition occurs.

=cut
