#===========================================================================
# 

package Sitescooper::Robot;

require Exporter;
use Carp;
use File::Basename;
use HTTP::Status;

use Sitescooper::StripTablesFilter;
use Sitescooper::LinksURLProcessor;
use Sitescooper::StoryURLProcessor;
use Sitescooper::ImageURLProcessor;

@ISA = qw(Exporter);
@EXPORT= qw();
$VERSION = "0.1";
# Accessor for this module's version string ($VERSION).
sub Version {
  return $VERSION;
}

use vars qw{
  @ISA @EXPORT $VERSION $SLASH
};

use strict;

# Constructor.
#
#   $scoop - owning main object; its {cachefactory} and {cf} entries are
#            copied into the new robot.
#   $url   - URL of the site this robot will scoop.
#   $scf   - per-site configuration object.
#
# Croaks if any of the three arguments is undefined.  Also refreshes the
# package-level $SLASH from $Sitescooper::Main::SLASH.
sub new {
  my ($proto, $scoop, $url, $scf) = @_;
  my $class = ref($proto) || $proto;

  $SLASH = $Sitescooper::Main::SLASH;

  defined ($scoop) or croak "scoop not defd";
  defined ($url)   or croak "url not defd";
  defined ($scf)   or croak "scf not defd";

  my %state = (
    # collaborators
    'scoop'		=> $scoop,
    'cachefactory'	=> $scoop->{cachefactory},
    'cf'		=> $scoop->{cf},
    'url'		=> $url,

    # output naming; filled in later
    'output_style'	=> $Sitescooper::Main::OUT_HTML,
    'syncfile'		=> undef,
    'pdbtitle'		=> undef,
    'sitename'		=> undef,
    'site'		=> undef,
    'convert_now'	=> 0,
    'sitekey'		=> undef,

    'warn_about_ext_links' => 0,	# turned on where necessary

    # download bookkeeping
    'already_tagged_for_downloading'  => { },
    'already_tried_download'          => { },

    # URL -> temporary file mapping and its counter
    'page_to_tmpfile'		=> { },
    'page_to_tmpfile_acc'	=> 0,

    # redirects, in both directions
    'redirections'		=> { },
    'redirections_backwards'	=> { },

    # accumulated output
    'output_file'		=> { },
    'output_links_snarfed'	=> { },
    'images_snarfed'		=> [ ],

    # URL -> anchor mapping and its counter
    'anchors_assigned'		=> { },
    'internal_href_to_external_url'	=> { },
    'anchors_assigned_acc'	=> 0,

    # bookmark data
    'bookmark_urls'		=> [ ],
    'url_title'			=> { },
  );
  $state{scf} = $scf;

  my $self = bless ({ %state }, $class);
  $self->clear_page_tmpfiles();
  return $self;
}

# Block later conversion of this site's output by removing the
# 'syncfile' entry.  Croaks if no syncfile is currently set.
sub dont_convert {
  my ($self) = @_;
  croak "cannot block conversion without syncfile!\n"
      unless defined $self->{syncfile};
  delete $self->{syncfile};
}

# Scoop the whole site this robot was created for.
#
# Steps, in order:
#   1. default the site name to the URL, fetch the per-site cache and
#      recreate the temporary output directory ($self->{outtmp});
#   2. run the site's optional 'eval_code' parameter via string eval;
#   3. reset the output accumulators and work out the file-size limit
#      and the minimum page count;
#   4. start downloads for the main URL plus the extra top-level URLs,
#      the per-level extra link URLs, and the extra story URLs;
#   5. drain the handler queue, then call finish_successful_scoop() or
#      finish_unsuccessful_scoop() depending on how many stories were found.
#
# Reads $self->{outtmp}, {outdir} and {outidxfile}, which set_title()
# assigns -- presumably the caller has set the title first; verify.
sub scoop_site {
  local ($_);
  my $self = shift;
  my $url = $self->{url};

  if (!defined $self->{sitename}) { $self->{sitename} = $url; }

  # the greedy match keeps everything before the LAST colon of
  # site_defined_at; that text is used as the site file name in the
  # SITE START / SITE END messages.
  $_ = $self->{scf}->{site_defined_at}; /^(.*):/; my $site_file_name = $1;
  $self->{scoop}->verbose ("SITE START: now scooping site \"$site_file_name\".");

  my $time = gmtime;
  $self->dbg ("tmp dir: $self->{outtmp}, output dir: $self->{outdir}, now: $time");
  $self->{cache} = $self->{cachefactory}->get_per_site_cache
  					($self, $self->{sitename});

  # start from an empty temporary directory.
  # NOTE(review): File::Path is called by fully-qualified name but is not
  # loaded in this file -- presumably loaded elsewhere; confirm.
  (-d $self->{outtmp}) && File::Path::rmtree ($self->{outtmp});
  mkdir ($self->{outtmp}, 0755) || die "cannot mkdir $self->{outtmp}\n";

  # This apparently is needed on MacOS. Sounds unlikely, but there it
  # is...
  if ($self->{scoop}->MyOS() eq 'Mac') {
    my $parentdir = dirname($self->{outdir});
    if (!-d $parentdir) { mkdir ($parentdir, 0755); }
  }

  $self->clear_page_tmpfiles();

  # evaluate perl code for this site.  Because this is a string eval, the
  # evaluated code can see (and set) the lexicals in scope here, including
  # $skip_site, $self and $url.
  my $proc = $self->{scf}->get_story_param ('eval_code', $url);
  if (defined $proc) {
    my $skip_site = 0;
    if (!eval $proc."; 1;") {
      $self->sitewarn ("EvaluatePerl failed: $@");
    } else {
      if ($skip_site) {
	$self->dbg ("skip_site set, skipping this site.");
	# NOTE(review): this 'next' is not inside any loop in this sub, so it
	# can only act on a loop in a caller (or fail if there is none) --
	# confirm this is intended rather than a plain return.
	next;
      }
    }
  }

  # reset the output accumulators; 'MAIN' and 'ALL_STORIES' are the two
  # buffers the write_*_story methods append to.
  $self->{output_file} = { };
  $self->{output_file}->{'MAIN'} = '';
  $self->{output_file}->{'ALL_STORIES'} = '';

  $self->{output_story_urls} = [ ];
  $self->{stories_found} = 0;

  $self->{file_size} = 0;
  $self->{hit_file_size_limit} = 0;

  # work out which file size limit to use: the smaller of the global
  # (cf filesizelimit) and per-site (scf sizelimit) values when both are
  # defined, whichever one is defined otherwise, else cf defaultfilesizelimit.
  {
    my $lim;
    my $cflim = $self->{cf}->{filesizelimit};
    my $scflim = $self->{scf}->{sizelimit};

    if (defined $cflim) { $lim = $cflim; }
    if (defined $scflim && (!defined $lim || $scflim < $lim)) { $lim = $scflim; }
    if (!defined $lim) { $lim = $self->{cf}->{defaultfilesizelimit}; }
    $self->{file_size_limit_in_use} = $lim;
  }

  # work out what the min number of pages will be: levels+2 unless the
  # site sets min_pages explicitly.
  # NOTE(review): the trailing comment below does not match the levels+2
  # arithmetic -- confirm which is intended.
  {
    $self->dbg ("minpages: levels=".$self->{scf}->{levels});
    my $min = $self->{scf}->{levels}+2;		# 1-level = 1, 2-levels = 2, etc.
    if (defined $self->{scf}->{min_pages}) {
      $min = $self->{scf}->{min_pages};
    }
    $self->{min_pages_in_use} = $min;
  }

  my $upindex = $self->{current_story_index} = 0;
  $self->{all_front_pages} = { };

  Sitescooper::Main::set_got_intr_behaviour ('setflag');
  my $u;

  # if we're really running the internal unit tests, catch that here.
  if ($url eq 'file://__SITESCOOPER_INTERNAL_TESTS__/') {
    $self->dbg ("url is $url, running tests");
    eval '
      require Sitescooper::UnitTests;
      Sitescooper::UnitTests::run_top_level ($self);
    1; ' or die $@;
  }

  # the main URL plus any extra top-level URLs, minus those already
  # tagged for downloading.
  my @urls = $self->grep_unseen_urls ($url,
		 split (' ', $self->{scf}->{top_level_extra_urls}));

  foreach $u (@urls) {
    # if we were interrupted, clear the flag and go on
    if ($Sitescooper::Main::got_intr_flag) { Sitescooper::Main::set_got_intr_behaviour ('setflag'); }
    $self->add_page_tmpfile ($self->{outtmp}.$SLASH.$self->{outidxfile}, $u);
    $self->{all_front_pages}->{$u} = 1;

    # a negative level count means the URL is fetched as a story page
    # rather than as a page of links.
    if ($self->{scf}->{levels} >= 0) {
      $self->download_front_page (undef, $u, $self->{scf}->{levels}, $upindex);
    } else {
      $self->download_story_page (undef, $u, 1, $upindex);
    }

    if ($self->writing_html && $u ne $url) {
      # add a link to the front page so readers can find this one.
      # NOTE(review): the string below is single-quoted, so \" is emitted
      # as a literal backslash followed by a quote -- confirm intended.
      ${$self->{output_file}}{'MAIN'} .= '
      <a href=\"'.$u.'\">(Additional Front Page URL)</a><br>
      ';
    }
  }

  # Now go through any additional URLs at the different levels, starting
  # at the highest level and working down.
  my $lev;
  for ($lev = $self->{scf}->{levels}; $lev >= 0; $lev--) {
    next unless (defined $self->{scf}->{links_extra_urls}->{$lev});
    my @urls = $self->grep_unseen_urls
		 (split (' ', $self->{scf}->{links_extra_urls}->{$lev}));
    foreach $u (@urls) {
      # if we were interrupted, clear the flag and go on
      if ($Sitescooper::Main::got_intr_flag) { Sitescooper::Main::set_got_intr_behaviour ('setflag'); }
      $self->add_page_tmpfile
      		($self->{outtmp}.$SLASH.$self->{outidxfile}, $u);
      $self->{all_front_pages}->{$u} = 1;

      $self->download_front_page (undef, $u, $lev, $upindex);

      if ($self->writing_html) {
	# add a link to the front page so readers can find this one.
	# NOTE(review): same single-quoted \" issue as above.
	${$self->{output_file}}{'MAIN'} .= '
	<a href=\"'.$u.'\">(Additional URL)</a><br>
	';
      }
    }
  }

  # and finally the extra story URLs (these are not filtered through
  # grep_unseen_urls).
  foreach $u (split (' ', $self->{scf}->{story_extra_urls})) {
    # if we were interrupted, clear the flag and go on
    if ($Sitescooper::Main::got_intr_flag) { Sitescooper::Main::set_got_intr_behaviour ('setflag'); }
    $self->add_page_tmpfile
    		($self->{outtmp}.$SLASH.$self->{outidxfile}, $u);
    $self->{all_front_pages}->{$u} = 1;

    $self->download_story_page (undef, $u, 1, $upindex);

    if ($self->writing_html) {
      # add a link to the front page so readers can find this one.
      # NOTE(review): same single-quoted \" issue as above.
      ${$self->{output_file}}{'MAIN'} .= '
      <a href=\"'.$u.'\">(Additional URL)</a><br>
      ';
    }
  }

  # allow the retrieval to finish up.
  1 while $self->handler_run_queue();
  Sitescooper::Main::set_got_intr_behaviour ('exit');

  # kld: addition for image-only sites
  if (defined $self->{scf}->{image_only_site} &&
	$self->{scf}->{image_only_site} == 1 &&
	$self->{stories_found} == 0)
  {
    $self->dbg ("$self->{pdbtitle}: forcing conversion for image-only site");
    $self->{stories_found} = 1;
  }

  $self->dbg ("stories found: ".$self->{stories_found}." minimum: ".
		$self->{min_pages_in_use});

  # success needs at least one story AND at least the minimum page count.
  if ($self->{stories_found} > 0 &&
    		$self->{stories_found} >= $self->{min_pages_in_use})
  {
    $self->finish_successful_scoop ();
  } else {
    $self->finish_unsuccessful_scoop ();
  }

  $self->{scoop}->verbose ("SITE END: done scooping site \"$site_file_name\".");
  delete $self->{cache};
  delete $self->{output_file}->{'MAIN'};
}

# ---------------------------------------------------------------------------

# Finalise a scoop that found enough stories.
#
# Merges the single-page story buffer into the main buffer, resolves the
# __SITESCOOPER_STORY_n navigation placeholders, rewrites redirected and
# un-scooped links, writes every output buffer to disk, then either dumps
# the result to STDOUT (cf dump_output) or moves the temporary directory
# into place and optionally converts it.  Finally commits the site cache.
#
# Failures to read back the index file or to rename the temporary
# directory are reported with warn() instead of being silently ignored.
sub finish_successful_scoop {
  my ($self) = @_;

  # File::Path is only ever referenced by its fully-qualified name in this
  # file, so make sure it is actually loaded before rmtree is called.
  require File::Path;

  my $url = $self->{url};
  $self->{scoop}->verbose ("$self->{pdbtitle}: $self->{stories_found} ".
	  "stories downloaded (".
	  sprintf ("%3.1f", $self->{file_size}/1024).
	  " K uncompressed).");

  # Used to order stories in single-page output mode. Compress into one
  # page now. This is unused in multipage mode.
  $self->{output_file}->{'MAIN'} .= $self->{output_file}->{'ALL_STORIES'};
  delete $self->{output_file}->{'ALL_STORIES'};

  my $ind = $self->{current_story_index};
  foreach my $ofkey (keys %{$self->{output_file}}) {
    # convert sitescooper navigation links: [<<][^][>>]
    my $story = $self->{output_file}->{$ofkey};

    # trim off the first and last ones anyway: index -1 (before the first
    # story) and index $ind (after the last) have no target.
    $story =~ s/\[<a href=\"__SITESCOOPER_STORY_(-1|${ind})\">.*?<\/a>\]//g;

    # and run through the rest, replacing each placeholder with the href
    # recorded for that story index.
    for (my $i = 0; $i < $ind; $i++) {
      next unless (defined ${$self->{output_story_urls}}[$i]);
      $story =~
      s/\"__SITESCOOPER_STORY_${i}\"/\"${$self->{output_story_urls}}[$i]\"/g;
    }

    # rewrite links that turned out to redirect to another URL
    $self->rewrite_redirected_links (\$story);

    # remove stray links
    $self->{output_file}->{$ofkey} = $self->remove_external_links ($story);
  }

  my $output_filename = $self->{outtmp}.$SLASH.$self->{outidxfile};

  $self->write_main_output_file ($output_filename);
  foreach my $ofkey (keys %{$self->{output_file}}) {
    next if ($ofkey eq 'MAIN');
    $self->write_sub_output_file ($ofkey);
  }

  if ($self->{cf}->{dump_output}) {
    # ensure we do not try to convert it later
    $self->dont_convert();

    if (!$self->{cf}->{fileperpage}) {
      # print the index page itself. Doesn't help with
      # images though, but that's -dump for you.
      # Three-argument open with a lexical handle: the file name is never
      # parsed for mode characters and the global $_ is left alone.
      if (open (my $in, '<', $output_filename)) {
	while (defined (my $line = <$in>)) { print STDOUT $line; }
	close $in;
      } else {
	warn "Failed to read $output_filename: $!\n";
      }
      File::Path::rmtree ($self->{outtmp});

    } else {
      # print just the path to the index page
      File::Path::rmtree ($self->{outdir});
      rename ($self->{outtmp}, $self->{outdir})
	  or warn "Failed to rename $self->{outtmp} to $self->{outdir}: $!\n";
      $output_filename = $self->{outdir}.$SLASH.$self->{outidxfile};
      print STDOUT $output_filename."\n";
    }

  } else {
    File::Path::rmtree ($self->{outdir});
    rename ($self->{outtmp}, $self->{outdir})
	or warn "Failed to rename $self->{outtmp} to $self->{outdir}: $!\n";
    if ($self->{convert_now}) {
      $self->{scoop}->convert_output($self, $self->{scf}, $url);
    }
  }

  $self->dbg ("output dir: ".$self->{outdir});

  if (!$self->writing_images) {
    $self->dbg ("output index: ".
	      $self->{outdir}.$SLASH.$self->{outidxfile});
  }

  $self->{cache}->commit();
}

# ---------------------------------------------------------------------------

# Finalise a scoop that did not produce enough stories: report why,
# block conversion and remove the temporary output directory.
#
# Two cases are distinguished in the verbose output:
#   - some stories were found, but fewer than min_pages_in_use;
#   - no stories were found at all.
sub finish_unsuccessful_scoop {
  my ($self) = @_;

  # File::Path is only referenced by fully-qualified name in this file;
  # make sure it is loaded before rmtree is called.
  require File::Path;

  close OUTFILE;
  # "not enough" means found < minimum.  (The comparison used to be the
  # other way round, which can never hold for a scoop that failed the
  # minimum-pages check, so this branch was unreachable.)
  if ($self->{stories_found} > 0 &&
	      $self->{stories_found} < $self->{min_pages_in_use})
  {
    $self->{scoop}->verbose ("$self->{pdbtitle}: not enough stories, ignoring.");
    $self->dbg ("reason: pages found=".$self->{stories_found}." < min=".$self->{min_pages_in_use});
  } else {
    $self->{scoop}->verbose ("$self->{pdbtitle}: no new stories, ignoring.");
  }
  $self->dbg ("(Not setting already-seen age cache since no links were followed)");
  $self->dont_convert();
  File::Path::rmtree ($self->{outtmp});
}

# ---------------------------------------------------------------------------

# Write the main (index) output file.
#
#   $output_filename - path of the file to create.
#
# The 'MAIN' output buffer is wrapped in the main-page template (the text
# template when writing DOC or text output, the HTML one otherwise).  In
# DOC mode a bookmark marker line is appended.  Dies if the file cannot be
# created; warns if closing it fails.
sub write_main_output_file {
  my ($self, $output_filename) = @_;

  # three-argument open with a lexical handle: the file name is never
  # parsed for mode characters and no package-global handle is shared.
  open (my $out, '>', $output_filename)
		  or die "Failed to create $output_filename: $!\n";

  my $tmpl = $self->{scoop}->{html_main_page};
  if ($self->writing_doc || $self->writing_text) {
    $tmpl = $self->{scoop}->{text_main_page};
  }

  print {$out} $self->rewrite_template ($tmpl, {
	'__MAIN_BODY__' => $self->{output_file}->{'MAIN'}
      });

  if ($self->writing_doc) {
    print {$out} "<".$self->{cf}->{bookmark_char}.">\n";
  }

  # buffered write errors surface at close time
  close $out or warn "Failed to write to $output_filename: $!";
}

# Write one non-main output buffer to disk.
#
#   $filename - both the key into $self->{output_file} and the path of
#               the file to create.
#
# In DOC mode a bookmark marker line is appended.  Dies if the file cannot
# be created; warns if closing it fails.
sub write_sub_output_file {
  my ($self, $filename) = @_;

  # three-argument open with a lexical handle: the file name is never
  # parsed for mode characters and no package-global handle is shared.
  open (my $out, '>', $filename)
		  or die "Failed to create $filename: $!\n";
  print {$out} $self->{output_file}->{$filename};

  if ($self->writing_doc) {
    print {$out} "<".$self->{cf}->{bookmark_char}.">\n";
  }

  # buffered write errors surface at close time
  close $out or warn "Failed to write to $filename: $!";
}

# ---------------------------------------------------------------------------

# Fill in a page template.
#
#   $str  - template text containing __PLACEHOLDER__ markers.
#   $vars - hash ref mapping placeholder text to its replacement; undefined
#           values are replaced by the empty string.
#
# After the caller-supplied placeholders, the site-wide ones are filled
# in: __SITE_TITLE__, __SITE_NAME__, __SITESCOOPER_HOME_URL__,
# __BOOKMARK_CHAR__, __ISILO_BOOKMARK_LINKS__ and __SITE_RIGHTS__.
# Returns the filled-in text.
sub rewrite_template {
  my $self = shift;
  my $str = shift;
  my $vars = shift;

  while (my ($k, $v) = each %$vars) {
    next unless defined $k;
    # only undef becomes empty; a legitimate "0" value must survive
    $v = '' unless defined $v;
    # placeholders are literal text, so quote them before matching
    $str =~ s/\Q${k}\E/${v}/gs;
  }

  my $home = $self->{scoop}->{home_url};
  my $bmark = $self->{cf}->{bookmark_char};

  $str =~ s/__SITE_TITLE__/$self->{pdbtitle}/gs;
  $str =~ s/__SITE_NAME__/$self->{sitename}/gs;
  $str =~ s/__SITESCOOPER_HOME_URL__/${home}/gs;
  $str =~ s/__BOOKMARK_CHAR__/${bmark}/gs;
  # evaluated replacement: the bookmark list is only generated when the
  # placeholder actually occurs in the template
  $str =~ s/__ISILO_BOOKMARK_LINKS__/ $self->gen_isilo_bookmark_links(); /ges;

  my $rights = '';
  if (defined $self->{scf}->{rights}) { $rights = $self->{scf}->{rights}; }
  $str =~ s/__SITE_RIGHTS__/${rights}/gs;

  $str;
}

# Build the iSilo bookmark block for the current document.
#
# Returns '' when cf include_isilo_bookmarks is off.  Otherwise returns an
# <X-ISILO TYPE="DOCUMENT"> element holding one <A HREF> per entry of
# $self->{bookmark_urls}, using the title recorded in $self->{url_title}.
sub gen_isilo_bookmark_links {
  my ($self) = @_;

  return '' unless $self->{cf}->{include_isilo_bookmarks};

  my @pieces = ("<X-ISILO TYPE=\"DOCUMENT\">");
  foreach my $bm_url (@{$self->{bookmark_urls}}) {
    my $title = $self->{url_title}{$bm_url};

    # This should result in something like:
    # <A HREF="toc.htm#Table Of Contents">
    # <A HREF="page1.htm#First Page">
    # <A HREF="#Second Page">
    # (see http://www.isilo.com/support/html.htm#bookmarks )
    #
    # In file-per-page mode the anchor is prefixed with the page's own
    # href; in single-page mode the prefix is empty.
    my $page = $self->{cf}->{fileperpage}
		? $self->href_to_multipage_href($bm_url)
		: '';
    push (@pieces, "<A HREF=\"".$page."#".$title."\">\n");
  }

  return join ('', @pieces, "</X-ISILO>\n");
}

# ---------------------------------------------------------------------------

# Start downloading a page of links.
#
#   $refer   - referring URL (may be undef)
#   $url     - URL to fetch
#   $level   - link level handed to the links processor
#   $upindex - story index handed to the links processor
#
# Creates a Sitescooper::LinksURLProcessor and hands it to handler_start().
sub download_front_page {
  my ($self, $refer, $url, $level, $upindex) = @_;

  my $links_handler = Sitescooper::LinksURLProcessor->new (
	$self->{scoop}, $self, $self->{scf},
	$refer, $url, $level, $upindex);
  $self->handler_start ($links_handler);
}

# ---------------------------------------------------------------------------

# Start downloading a story page.
#
#   $refer           - referring URL (may be undef)
#   $url             - URL to fetch
#   $is_dynamic_html - flag handed to the story processor
#   $upindex         - story index handed to the story processor
#
# Creates a Sitescooper::StoryURLProcessor and hands it to handler_start().
sub download_story_page {
  my ($self, $refer, $url, $is_dynamic_html, $upindex) = @_;

  my $story_handler = Sitescooper::StoryURLProcessor->new (
	$self->{scoop}, $self, $self->{scf},
	$refer, $url, $is_dynamic_html, $upindex);
  $self->handler_start ($story_handler);
}

# ---------------------------------------------------------------------------

# Start downloading an image and return the markup that replaces it.
#
#   $refer - referring URL
#   $url   - image URL
#   $tag   - remaining attribute text of the original <img> tag
#
# Returns an <img> tag pointing at the handler's relative path.  When cf
# turn_big_imgs_to_hrefs is set and the tag declares a height or width
# above 400, a text link to the image is returned instead.
sub download_image {
  my ($self, $refer, $url, $tag) = @_;

  my $img_handler = Sitescooper::ImageURLProcessor->new (
	$self->{scoop}, $self, $self->{scf}, $refer, $url, $tag);
  $self->handler_start ($img_handler);

  if ($self->{cf}->{turn_big_imgs_to_hrefs}) {
    # pull the declared dimensions out of the tag; 0 when not declared
    my ($height, $width) = (0, 0);
    if ($tag =~ / height\s*=\s*[\"\']?(\d+)/i) { $height = $1+0; }
    if ($tag =~ / width\s*=\s*[\"\']?(\d+)/i)  { $width = $1+0; }

    if ($height > 400 || $width > 400) {
      $self->{scoop}->dbg ("turning big image into a link: $url");
      # add an extra </a> to block any existing <a href> scope.
      return "</a><a href=\"".$img_handler->{relative}."\">".
      		"[tap to display image]</a>";
    }
  }

  return "<img src=\"".$img_handler->{relative}."\" $tag>";
}

# ---------------------------------------------------------------------------

# Queue an existing URL handler again by passing it to handler_start().
sub re_request_page {
  my $self = shift;
  my $handler = shift;
  return $self->handler_start ($handler);
}

# ---------------------------------------------------------------------------

# Start a URL handler, first waiting until the shared HTTP client queue
# holds fewer entries than the client's maximum number of active requests.
#
#   $handler - URL processor object; it is added to the queue only when
#              its start_get() returns true.
sub handler_start {
  my ($self, $handler) = @_;

  my @keys = $self->{scoop}->get_httpclient_queue_keys();
  while (scalar (@keys) >=
		$self->{scoop}->{httpclient}->get_max_active_requests())
  {
    $self->dbg ("queue needs to empty before url-handler can start: ".
    	$handler->to_string());
    $self->dbg ($self->{scoop}->httpclient_queue_to_string());

    # spin our wheels until a queue entry leaves the queue: keep running
    # the queue while it reports "still busy, nothing finished" (0).
    my $runqstatus;
    do {
      $runqstatus = $self->handler_run_queue_once();
    } while (defined ($runqstatus) && $runqstatus == 0);

    @keys = $self->{scoop}->get_httpclient_queue_keys();
  }

  if ($handler->start_get()) {
    $self->dbg ("adding url-handler to queue: ".$handler->to_string());
    $self->{scoop}->add_httpclient ($handler);
  }
  $self->dbg ($self->{scoop}->httpclient_queue_to_string());
}

# ---------------------------------------------------------------------------

# Start a preload fetch of a front page.
#
#   $url - URL to preload.
#
# Creates a Sitescooper::PreloadURLProcessor (with no referrer) and hands
# it to handler_start().
#
# NOTE(review): this file does not load Sitescooper::PreloadURLProcessor
# itself -- presumably it is loaded elsewhere before this runs; confirm.
sub preload_front_page {
  my ($self, $url) = @_;

  # Explicit Class->new call.  The indirect-object form ("new Class (...)")
  # is parsed by heuristics; because this package defines its own new()
  # and the class is not loaded here, it could be compiled as a plain
  # function call instead of a constructor call.
  my $handler = Sitescooper::PreloadURLProcessor->new ($self->{scoop},
  	$self, $self->{scf}, undef, $url);
  $self->handler_start ($handler);
}

# Give the handler queue a single pass (see handler_run_queue_once).
sub finish_preload {
  my $self = shift;
  return $self->handler_run_queue_once();
}

# ---------------------------------------------------------------------------
# run the handler queue. Keep trying until all handlers have finished
# retrieving pages.
#
# Keep running the handler queue until it is completely empty or no longer
# holds any handler belonging to this robot.  Always returns undef.
sub handler_run_queue {
  my ($self) = @_;
  my $scoop = $self->{scoop};
  my $passes = 0;

  $self->dbg ("waiting for queue to empty: ".
  			$scoop->httpclient_queue_to_string());

  PASS: while (1) {
    # from the second pass on we're going in circles; sleep a little so
    # we don't eat 100% cpu while we're doing this.
    select (undef, undef, undef, 0.25) if ($passes++ > 0);

    unless (defined ($self->handler_run_queue_once())) {
      $self->dbg ("queue now completely empty");
      last PASS;
    }

    # look for a queued client that is ours; if there is one, go round again
    my @keys = $scoop->get_httpclient_queue_keys();
    foreach my $key (sort @keys) {
      my $client = $scoop->{httpclient_queue}->{$key};
      next unless defined $client;
      next unless ($client->{robot} == $self);

      $self->dbg ("still one of our httpclients in queue: ".
  			$scoop->httpclient_queue_to_string());
      next PASS;
    }

    $self->dbg ("queue now empty of our httpclients: ".
  			$scoop->httpclient_queue_to_string());
    last PASS;
  }

  undef;
}

# ---------------------------------------------------------------------------

# Return 1 if one of this robot's queued HTTP clients is already loading
# $url, 0 otherwise.
sub url_is_in_queue {
  my ($self, $url) = @_;
  my $scoop = $self->{scoop};

  foreach my $key ($scoop->get_httpclient_queue_keys()) {
    my $client = $scoop->{httpclient_queue}->{$key};
    next unless defined $client;
    next unless ($client->{robot} == $self && $client->{url} eq $url);

    $self->dbg ("URL already loading: ".
		      $scoop->httpclient_queue_to_string());
    return 1;
  }
  return 0;
}

# ---------------------------------------------------------------------------

# Make one pass over the HTTP client queue.
#
# Returns undef when the queue holds no handlers, or when
# get_ready_handlers() hands back a single undefined entry; otherwise the
# number of handlers that finished (and were removed) during this pass.
sub handler_run_queue_once {
  my ($self) = @_;

  # first of all, get the keys and sort them, so we (to a reasonable
  # degree) get the pages in order.
  my @keys = $self->{scoop}->get_httpclient_queue_keys();
  my @hdlrs = grep { defined $_ }
	      map { $self->{scoop}->{httpclient_queue}->{$_} }
	      sort @keys;
  return undef unless (@hdlrs);			# nothing left to get

  @hdlrs = Sitescooper::URLProcessor::get_ready_handlers (@hdlrs);
  if (@hdlrs == 1 && !defined $hdlrs[0]) { return undef; }	# none ready - yet

  my $finished = 0;
  foreach my $hdlr (@hdlrs) {
    next if ($hdlr->run);

    # a false return from run() means this handler is done
    $self->dbg ("url-handler done: ".$hdlr->to_string());
    $self->{scoop}->remove_httpclient ($hdlr);
    $finished++;
  }
  return $finished;
}

# ---------------------------------------------------------------------------

# Decide what to emit for one <a href> element found in the output.
#
#   $link     - the href value
#   $text     - the link text
#   $ahref    - markup before the href value
#   $posthref - markup between the href value and the link text
#
# Outside HTML mode only the text is returned.  The link is kept intact
# when link rewriting is disabled or when its target (or the part before
# its anchor) was scooped; otherwise it is handed to the main object's
# delink_unscooped_external_link().
sub remove_an_ext_link {
  # copy all arguments at once: the caller passes regex capture variables
  my ($self, $link, $text, $ahref, $posthref) = @_;

  return $text unless ($self->writing_html);

  my $keep = 0;
  if ($self->{cf}->{nolinkrewrite}) {
    $keep = 1;
  }
  elsif (defined ($self->{output_links_snarfed}->{$link})) {
    $keep = 1;
  }
  elsif (!$self->{cf}->{fileperpage} && $link =~ /^(.*?)__HASH__/
	&& defined ($self->{output_links_snarfed}->{$1}))
  {
    $keep = 1;
  }
  elsif ($self->{cf}->{fileperpage} && $link =~ /^(.*?)#/
	&& defined ($self->{output_links_snarfed}->{$1}))
  {
    $keep = 1;
  }

  if ($keep) {
    return $ahref.$link.$posthref.$text."</a>";
  }

  $self->dbg ("Removing non-snarfed link: $link (\"$text\")");

  # convert our "internal" links to proper URLs before passing to
  # delink_unscooped_external_link().
  my $is_internal = $self->{cf}->{fileperpage}
			? ($link =~ /^[^\/]+$/)
			: ($link =~ /^\#\d+$/);
  if ($is_internal) {
    $link = $self->{internal_href_to_external_url}->{$link};
  }
  return $self->{scoop}->delink_unscooped_external_link ($link, $text);
}

# ---------------------------------------------------------------------------

# Run every <a href="..."> and <a href='...'> element of the given text
# through remove_an_ext_link(), then turn HREF_EXTERNAL markers back into
# plain href attributes.  Returns the rewritten text.
sub remove_external_links {
  my $self = shift;
  my $html = $_[0];

  $self->dbg2 ("all links found: ".join(' ',
  			sort keys %{$self->{output_links_snarfed}}));

  # jm: imposed limit of 200 chars on link text to stop runaway links
  # (double-quoted hrefs first, then single-quoted ones)
  $html =~ s/(<a\s+[^>]*href\s*=\s*\")([^\"]+)(\"[^>]*?>)(.{0,200}?)<\/a>/
	  $self->remove_an_ext_link ($2, $4, $1, $3);
      /gies;
  $html =~ s/(<a\s+[^>]*href\s*=\s*\')([^\']+)(\'[^>]*?>)(.{0,200}?)<\/a>/
	  $self->remove_an_ext_link ($2, $4, $1, $3);
      /gies;

  # fix REAL external links so they're now active and valid
  $html =~ s/HREF_EXTERNAL/href/gs;

  return $html;
}

# ---------------------------------------------------------------------------

# Rewrite hrefs that point at a redirected URL so they point at the
# redirect target instead.
#
#   $text - reference to the text to modify in place.
#
# Each entry of $self->{redirections} (old URL => new URL) is mapped to
# the internal href form for the current output mode before matching.
sub rewrite_redirected_links {
  my ($self, $text) = @_;

  foreach my $from_url (keys %{$self->{redirections}}) {
    my $to_url = $self->{redirections}{$from_url};
    $self->{scoop}->dbg ("fixing redirection: $from_url => $to_url");

    my ($oldhref, $newhref);
    if ($self->{cf}->{fileperpage}) {
      $oldhref = $self->href_to_multipage_href ($from_url);
      $newhref = $self->href_to_multipage_href ($to_url);
    } else {
      $oldhref = "#".$self->href_to_singlepage_href ($from_url);
      $newhref = "#".$self->href_to_singlepage_href ($to_url);
    }

    # the old href is literal text, not a pattern: quote it so that "."
    # and any other metacharacters in it only match themselves.
    $$text =~ s/href=\s*[\"\']\Q${oldhref}\E[\"\']/href=\"${newhref}\"/g;
  }
}

# ---------------------------------------------------------------------------
 
# Format one downloaded page as a story and append it to the output.
#
#   $is_front - not used in this sub
#   $url      - page URL, possibly with an anchor
#   $page     - page text
#   $headline - headline, or undef
#   $upindex  - story index used for the "up" navigation link
#
# The page is wrapped in the story template, written through
# write_multipage_story() or write_1page_story(), recorded as scooped and
# counted against the file-size and story limits.
sub write_as_story {
  my ($self, $is_front, $url, $page, $headline, $upindex) = @_;

  my $fullurl = $url;
  $url = Sitescooper::Util::URLWithoutAnchor ($url);

  # provide a shorter version of the URL for palm-sized screens. Split
  # at both / and & signs to ensure CGI urls are clear too.
  my $shorturl = $url;
  if (length ($shorturl) > 35) {
    $shorturl =~ s/^.*[\/\&](.{20,35})$/...\/$1/;
  }

  # single-page HTML output gets a named anchor per story
  my ($one_page_anchor, $one_page_anchor_tag) = ('', '');
  if ($self->writing_html && !$self->{cf}->{fileperpage}) {
    my $ank = $self->href_to_singlepage_href ($url);
    $one_page_anchor_tag = "<a name=\"$ank\">";
    $one_page_anchor = '#'.$ank;
  }

  $self->set_url_title ($url, $headline) if (defined $headline);

  my $text_like = ($self->writing_text || $self->writing_doc);

  # for text output, split at newlines appropriately.
  if ($text_like) {
    my $wrap = $self->writing_text;
    my @lines = split (/\n/, $page);
    $page = '';
    foreach my $line (@lines) {
      if ($wrap) {
	# wrap each line after 70 columns
	while ($line =~ s/^(.{70}\S*)\s+//) {
	  $page .= $1."\n";
	}
      }
      $page .= $line."\n";
    }
  }

  my $tmpl = $text_like ? $self->{scoop}->{text_story}
			: $self->{scoop}->{html_story};

  # iSilo bookmark target: an anchor named after the headline
  my $hdlineank = '';
  if ($self->{cf}->{include_isilo_bookmarks} && defined $headline) {
    (my $ank_name = $headline) =~ s/\"/\'/g;
    $hdlineank = "<a name=\"$ank_name\">";
  }

  my $story_idx = $self->{current_story_index};
  my %tmpl_vars = (
      '__STORY_TEXT__' => $page,
      '__STORY_ANCHOR__' => $one_page_anchor_tag,
      '__STORY_URL__' => $url,
      '__SHORT_URL__' => $shorturl,
      '__STORY_PREV_LINK__' => '__SITESCOOPER_STORY_'.($story_idx-1),
      '__STORY_UP_LINK__' => '__SITESCOOPER_STORY_'.$upindex,
      '__STORY_NEXT_LINK__' => '__SITESCOOPER_STORY_'.($story_idx+1),
      '__HEADLINE__' => $headline,
      '__ISILO_HEADLINE_ANCHOR__' => $hdlineank,
  );
  my $outtext = $self->rewrite_template ($tmpl, \%tmpl_vars);

  if ($self->{cf}->{fileperpage}) {
    $self->write_multipage_story ($fullurl, \$outtext, $headline, $url);
  } else {
    $self->write_1page_story ($fullurl, \$outtext, $headline, $url, $one_page_anchor);
  }

  # bookkeeping: story counter, scooped links, accumulated size
  $self->{current_story_index}++;
  $self->add_snarfed_link ($url);
  $self->add_snarfed_link ($fullurl) if ($fullurl ne $url);
  $self->up_file_size ($url, length($outtext), "story");
  $self->{stories_found}++;

  # the story limit reuses the file-size-limit flag to stop the site
  if ($self->{cf}->{storylimit} > 0 &&
	  $self->{stories_found} >= $self->{cf}->{storylimit})
  {
    $self->{scoop}->verbose ("over story limit, stopping this site.");
    $self->{hit_file_size_limit} = 1;
  }
}

# ---------------------------------------------------------------------------

# File-per-page mode: store one formatted story.
#
#   $fullurl  - not used in this sub
#   $outtext  - reference to the formatted story text
#   $headline - headline passed on to the sub-page template
#   $url      - story URL without anchor
#
# A front page (or a page some front page redirected to) is appended to
# the 'MAIN' buffer; anything else goes into its own output buffer, keyed
# by the page's temporary file name.
sub write_multipage_story {
  my ($self, $fullurl, $outtext, $headline, $url) = @_;

  my $fname = $self->href_to_multipage_filename ($url);
  my $relative = $self->href_to_multipage_href ($url);

  # record the href, and its anchor-less form, as scooped
  $self->add_snarfed_link ($relative);
  if (my ($sans_anchor) = $relative =~ /^(.*?)\#/) {
    $self->add_snarfed_link ($sans_anchor);
  }

  # this URL plus every URL that redirected to it
  my @these_urls =
	($url, split (' ', $self->{redirections_backwards}{$url} || ''));

  if (grep { defined $self->{all_front_pages}->{$_} } @these_urls) {
    # this is a front page, just append it to the main index file
    $self->{output_file}->{'MAIN'} .= $$outtext;
    $self->{output_story_urls}->[$self->{current_story_index}] = $relative;
    return;
  }

  if ($self->writing_html) {
    my $tmpl = ($self->writing_doc || $self->writing_text)
		? $self->{scoop}->{text_sub_page}
		: $self->{scoop}->{html_sub_page};

    $self->{output_file}->{$fname} = $self->rewrite_template ($tmpl, {
	  '__SUB_BODY__' => $$outtext,
	  '__HEADLINE__' => $headline,
	});
    $self->{output_story_urls}->[$self->{current_story_index}] = $relative;

  } else {
    $self->{output_file}->{$fname} .= $$outtext;
  }
}

# Single-page mode: store one formatted story.
#
#   $fullurl         - not used in this sub
#   $outtext         - reference to the formatted story text
#   $headline        - not used in this sub
#   $url             - story URL without anchor
#   $one_page_anchor - internal href ('#...') assigned to this story
#
# A front page (or a page some front page redirected to) is appended to
# the 'MAIN' buffer, anything else to 'ALL_STORIES'.
sub write_1page_story {
  my ($self, $fullurl, $outtext, $headline, $url, $one_page_anchor) = @_;

  # this is single-page output mode -- output everything to the main
  # file.
  my $relative = $one_page_anchor;

  # this URL plus every URL that redirected to it
  my @these_urls =
	($url, split (' ', $self->{redirections_backwards}{$url} || ''));

  # record the href, and its anchor-less form, as scooped
  $self->add_snarfed_link ($relative);
  if (my ($sans_anchor) = $relative =~ /^(.*?)__HASH__/) {
    $self->add_snarfed_link ($sans_anchor);
  }

  $self->{output_story_urls}->[$self->{current_story_index}] = $relative;

  if (grep { defined $self->{all_front_pages}->{$_} } @these_urls) {
    # this is a front page, just append it to the main index file
    $self->{output_file}->{'MAIN'} .= $$outtext;
    return;
  }

  $self->{output_file}->{'ALL_STORIES'} .= $$outtext;
}

# ---------------------------------------------------------------------------

# Remember an image file name; does nothing unless the output style is
# "images".
sub add_image {
  my ($self, $fname) = @_;

  $self->writing_images or return;

  push (@{$self->{images_snarfed}}, $fname);
}

# ---------------------------------------------------------------------------

# Add $siz bytes to the running output size and raise the
# hit_file_size_limit flag once the total, in K, reaches the limit in use.
#
#   $url     - not used in this sub
#   $siz     - number of bytes just written
#   $typetxt - label used in the debug message
sub up_file_size {
  my ($self, $url, $siz, $typetxt) = @_;

  $self->{file_size} += $siz;
  my $kbytes = $self->{file_size}/1024;
  my $limit = $self->{file_size_limit_in_use};

  $self->dbg ("$typetxt written, ".$kbytes." K, limit ".$limit." K");

  if ($kbytes >= $limit) {
    $self->{hit_file_size_limit} = 1;
  }
}

# ---------------------------------------------------------------------------

# Record a URL as scooped, together with every URL that redirected to it
# and the internal href that corresponds to each of them.
#
#   $mainurl - the URL (or internal href) to record.
sub add_snarfed_link {
  my ($self, $mainurl) = @_;

  # call-site details are only needed for the level-2 debug trace
  my $tracing = ($self->{cf}->{debug} > 1);
  my ($class, $file, $line) = $tracing ? caller() : ();

  my @redirected_from =
	split (' ', $self->{redirections_backwards}{$mainurl} || '');

  foreach my $url ($mainurl, @redirected_from) {
    $self->dbg2 ("Tracking snarfed link: $url (at $class:$line)")
	if ($tracing);
    $self->{output_links_snarfed}->{$url} = 1;

    # the same link in the form used inside the generated output
    my $internal = $self->{cf}->{fileperpage}
	? $self->href_to_multipage_href ($url)
	: '#'.$self->href_to_singlepage_href ($url);

    $self->dbg2 ("Tracking snarfed link: $internal (at $class:$line)")
	if ($tracing);
    $self->{output_links_snarfed}->{$internal} = 1;
  }
}

# ---------------------------------------------------------------------------

# Forget all URL-to-temporary-file assignments and restart the file name
# counter at 0.
sub clear_page_tmpfiles {
  my ($self) = @_;
  $self->{'page_to_tmpfile'} = { };
  return $self->{'page_to_tmpfile_acc'} = 0;
}

# Assign a temporary output file to a URL.
#
#   $fname - full path of the temporary file
#   $url   - URL stored in that file
#
# Records $url => $fname in page_to_tmpfile, and the file's last path
# component => $url in internal_href_to_external_url.  Returns $fname.
sub add_page_tmpfile {
  my ($self, $fname, $url) = @_;

  $self->{page_to_tmpfile}->{$url} = $fname;

  # Last path component, after '/', '\' or ':'.  The match result is
  # checked and any character is accepted in the name, so a stale $1 from
  # an earlier match can never be used as the key.  A name with no
  # separator at all is used whole.
  my $leaf = ($fname =~ /([^\\\/\:]+)$/) ? $1 : $fname;
  $self->{internal_href_to_external_url}->{$leaf} = $url;

  if ($self->{cf}->{debug} > 1) {
    my ($class, $file, $line) = caller();
    $self->dbg2 ("assigned page file: $url = $leaf: (at $class:$line)");
  }
  return $fname;
}

# Assign an anchor to a URL (single-page output).
#
#   $ank - anchor name, without the leading '#'
#   $url - URL the anchor stands for
#
# Records $url => $ank in anchors_assigned and '#'.$ank => $url in
# internal_href_to_external_url.  Returns $ank.
sub add_page_anchor {
  my ($self, $ank, $url) = @_;

  $self->{anchors_assigned}->{$url} = $ank;
  $self->{internal_href_to_external_url}->{'#'.$ank} = $url;

  return $ank unless ($self->{cf}->{debug} > 1);

  my ($class, $file, $line) = caller();
  $self->dbg2 ("assigned anchor: $url = #$ank (at $class:$line)");
  return $ank;
}

# ---------------------------------------------------------------------------
# note: we cannot just fold these two into one function, as single-page
# scoops containing images will call *both* of them, one for internal
# html-to-html links, one for <img> tags. doh.

# Return the temporary file assigned to a URL (its anchor is ignored),
# assigning a new numbered one on first use.
#
#   $url  - URL to look up
#   $type - file name extension for a newly assigned file; '.html' when
#           false
sub href_to_multipage_filename {
  my ($self, $url, $type) = @_;

  my $page_url = Sitescooper::Util::URLWithoutAnchor ($url);

  my $known = $self->{page_to_tmpfile}->{$page_url};
  return $known if (defined $known);

  # new-style: just use numeric filenames. saves space.
  my $seq = $self->{page_to_tmpfile_acc}++;
  my $fullname = $self->{outtmp}.$SLASH.$seq.($type || '.html');

  return $self->add_page_tmpfile ($fullname, $page_url);
}

# Return the anchor assigned to a URL (its own anchor part is ignored),
# assigning a new numbered one on first use.  $type is accepted but not
# used.
sub href_to_singlepage_filename {
  my ($self, $url, $type) = @_;

  my $page_url = Sitescooper::Util::URLWithoutAnchor ($url);

  my $known = $self->{anchors_assigned}->{$page_url};
  return $known if (defined $known);

  # new-style: just use numeric anchors. saves space.
  my $seq = $self->{anchors_assigned_acc}++;
  return $self->add_page_anchor ($seq, $page_url);
}

# ---------------------------------------------------------------------------

# Return the relative href used for a URL in file-per-page output: the
# last path component of its temporary file, followed by the URL's anchor
# as returned by get_href_anchor().
#
#   $url  - URL to convert
#   $type - optional extension, passed on to href_to_multipage_filename()
sub href_to_multipage_href {
  my ($self, $url, $type) = @_;

  my $anchor = $self->get_href_anchor ($url);
  $url = Sitescooper::Util::URLWithoutAnchor ($url);

  #if ($self->{cf}->{debug} > 1) {
    #my ($class, $file, $line) = caller();
    #$self->dbg2 ("may assign page file: $url (at $class:$line)");
  #}

  my $fname = $self->href_to_multipage_filename ($url, $type);

  # Last path component, after '/', '\' or ':'.  The match result is
  # checked and any character is accepted in the name, so a stale $1 from
  # an earlier match can never leak into the href.  A name with no
  # separator at all is used whole.
  my $leaf = ($fname =~ /([^\\\/\:]+)$/) ? $1 : $fname;
  $leaf.$anchor;
}

# Return the anchor name used for a URL in single-page output: the
# anchor assigned to the page, followed by the URL's own anchor as
# returned by get_href_anchor().  No leading '#' is added here.
sub href_to_singlepage_href {
  my ($self, $url) = @_;

  my $suffix = $self->get_href_anchor ($url);
  my $page_url = Sitescooper::Util::URLWithoutAnchor ($url);

  #if ($self->{cf}->{debug} > 1) {
    #my ($class, $file, $line) = caller();
    #$self->dbg2 ("may assign anchor: $page_url (at $class:$line)");
  #}

  return $self->href_to_singlepage_filename ($page_url).$suffix;
}

# ---------------------------------------------------------------------------

# Return the anchor part of a URL in the form used inside the output.
#
# In file-per-page mode the anchor is returned as-is.  In single-page
# mode spaces become '_20', every character other than letters, digits,
# '-', '_' and '#' becomes '_', and '#' becomes '__HASH__'.  An empty
# anchor is returned unchanged.
sub get_href_anchor {
  my ($self, $url) = @_;

  my $ank = Sitescooper::Util::URLAnchor ($url);
  return $ank if ($ank eq '');
  return $ank if ($self->{cf}->{fileperpage});

  $ank =~ s/ /_20/g;
  $ank =~ s/[^-_A-Za-z0-9\#]/_/g;
  $ank =~ s/\#/__HASH__/g;
  return $ank;
}

# ---------------------------------------------------------------------------

# Record that $oldurl redirected to $newurl.  The forward map holds one
# target per old URL; the backward map accumulates a space-separated list
# of old URLs per target.
sub got_redirected {
  my $self = shift;
  my ($from, $to) = @_;

  $self->{redirections}->{$from} = $to;
  $self->{redirections_backwards}->{$to} .= " ".$from;
}

# Remember a bookmark title for a URL.  Titles that are undefined or all
# whitespace are ignored.  Whitespace runs are collapsed to one space,
# double quotes become single quotes, and one leading and one trailing
# space are trimmed.  The URL is also appended to bookmark_urls.
sub set_url_title {
  my ($self, $url, $title) = @_;
  if (defined $title && $title =~ /\S/) {
    foreach my $cleanup (
	sub { $_[0] =~ s/\s+/ /g },
	sub { $_[0] =~ s/"/'/g },
	sub { $_[0] =~ s/^ // },
	sub { $_[0] =~ s/ $// })
    {
      $cleanup->($title);
    }
    $self->{url_title}{$url} = $title;
    push (@{$self->{bookmark_urls}}, $url);
    $self->{scoop}->dbg ("bookmark title for $url: \"$title\".");
  }
}

# Return the links that have not been tagged for downloading before, in
# their original order, and tag them.
#
#   $absurl - when defined, each link is first made absolute relative to
#             this URL
#   @links  - candidate links
sub grep_unseen_urls_impl {
  my ($self, $absurl, @links) = @_;
  my @fresh;

  foreach my $link (@links) {
    my $candidate = $link;
    if (defined $absurl) {
      $candidate = Sitescooper::Util::AbsoluteURL ($absurl, $candidate);
    }
    next if (defined $self->{already_tagged_for_downloading}{$candidate});

    $self->{already_tagged_for_downloading}{$candidate} = 1;
    push (@fresh, $candidate);
  }

  return @fresh;
}

# Filter out links already tagged for downloading; the links are used
# exactly as given (no absolutifying).
sub grep_unseen_urls {
  my $self = shift;
  my @links = @_;
  return $self->grep_unseen_urls_impl (undef, @links);
}

# Make each link absolute relative to $url, then filter out those already
# tagged for downloading.
sub absolutify_and_grep_unseen_urls {
  my $self = shift;
  my ($url, @links) = @_;
  return $self->grep_unseen_urls_impl ($url, @links);
}

# ---------------------------------------------------------------------------

# Turn a site name into a file-system-safe base name.
#
#   $sitename - human-readable site name
#
# The name is normalised (whitespace collapsed, double quotes turned into
# single quotes, one leading/trailing space trimmed) and substituted for
# "Site" in cf filename_template.  Runs of unsafe characters become '_',
# and leading/trailing spaces and underscores are trimmed.  On Mac the
# result is cut to 26 characters.  Returns the base name.
sub str_to_title {
  my ($self, $sitename) = @_;

  $sitename =~ s/\s+/ /g;
  $sitename =~ s/"/'/g;
  $sitename =~ s/^ //;
  $sitename =~ s/ $//;

  my $filedesc = $self->{cf}->{filename_template};
  # backwards compat: drop the legacy "Section" placeholder from the
  # TEMPLATE before the site name goes in, so the word "Section" inside a
  # real site name is left alone.
  $filedesc =~ s/Section//g;
  $filedesc =~ s/Site/${sitename}/g;

  # trim out dangerous chars (modified to allow most 8-bit stuff, thx to
  # <avatar /at/ deva.net>.
  $filedesc =~ s/[\s\;\*\$\%\!\&\<\>\(\)\|\?\'\"\`\]\[\{\}\:\\\/\000-\037]+/_/g;

  $filedesc =~ s/^[ _]+//g; $filedesc =~ s/[ _]+$//g;

  if (Sitescooper::Main::MyOS() eq 'Mac') {
    # try to limit the filename to 32 characters
    $filedesc =~ s/^(.{26}).*$/$1/g;
  }

  return $filedesc;
}

# ---------------------------------------------------------------------------

# Set the site title and derive every output name from it.
#
#   $title - human-readable site title
#
# Sets outdir, outtmp, outidxfile, sitename, pdbtitle and syncfile on the
# robot and clears the need_title flag.  Returns the file base name
# produced by str_to_title().
sub set_title {
  my ($self, $title) = @_;

  my $filedesc = $self->str_to_title ($title);
  my $outdir = $self->{cf}->{outdir}.$SLASH.$filedesc;
  my $outtmp = $self->{cf}->{outdir}.$SLASH.$filedesc.'.tmp';

  # index file name depends on the output style
  my $outidxfile;
  if ($self->writing_html()) {
    $outidxfile = $filedesc.'.html';
  } elsif ($self->writing_images()) {
    $outidxfile = $filedesc.'_NN.tmp';
  } else {
    $outidxfile = $filedesc.'.txt';
  }

  my $pdbtitle = $self->{cf}->{pdb_title};
  # backwards compat: drop the legacy "Section" placeholder from the
  # TEMPLATE before the title goes in, so the word "Section" inside a real
  # site title is left alone.
  $pdbtitle =~ s/Section//g;
  $pdbtitle =~ s/Site/${title}/g;
  $pdbtitle =~ s/^[ _]+//g; $pdbtitle =~ s/[ _]+$//g;
  $pdbtitle =~ s/^(.{0,32}).*$/$1/;   # limit to 32 chars for iSilo's sake

  $self->{outdir} = $outdir;
  $self->{outtmp} = $outtmp;
  $self->{outidxfile} = $outidxfile;
  $self->{sitename} = $title;
  $self->{pdbtitle} = $pdbtitle;

  if ($self->{cf}->{dumppdb}) {
    $self->{syncfile} = $outtmp;           # reuse it!
  } else {
    $self->{syncfile} =
      $self->{cf}->{pdbdir}.$SLASH.$filedesc.'.'.$self->{cf}->{outputextn};
  }

  delete $self->{need_title};

  $self->dbg ("site title set to: \"$title\", file base: $filedesc");
  $self->dbg ("tmp dir: $self->{outtmp}, output dir: $self->{outdir}");
  $filedesc;
}

# Re-title the site; when that changes the temporary directory name, the
# existing temporary directory is renamed to match.
sub change_title {
  my ($self, $title) = @_;

  my $previous_tmp = $self->{outtmp};
  $self->set_title ($title);

  rename ($previous_tmp, $self->{outtmp})
      if ($previous_tmp ne $self->{outtmp});
}

# ---------------------------------------------------------------------------

# Emit a warning through the main object, attributed to the place where
# this site was defined (scf site_defined_at).
sub sitewarn {
  my ($self, @msg) = @_;
  return $self->{scoop}->sitewarn_file_line
		($self->{scf}->{site_defined_at}, @msg);
}

# True when the configured output style is OUT_DOC.
sub writing_doc {
  my $self = shift;
  return $self->{cf}->{output_style} == $Sitescooper::Main::OUT_DOC;
}

# True when the configured output style is OUT_HTML.
sub writing_html {
  my $self = shift;
  return $self->{cf}->{output_style} == $Sitescooper::Main::OUT_HTML;
}

# True when the configured output style is OUT_IMAGES.
sub writing_images {
  my $self = shift;
  return $self->{cf}->{output_style} == $Sitescooper::Main::OUT_IMAGES;
}

# True when the configured output style is OUT_TEXT.
sub writing_text {
  my $self = shift;
  return $self->{cf}->{output_style} == $Sitescooper::Main::OUT_TEXT;
}

# ---------------------------------------------------------------------------

# Forward a debug message to the main object's dbg().
sub dbg {
  my ($self, @msg) = @_;
  return $self->{scoop}->dbg (@msg);
}

# Forward a debug message to the main object's dbg(), but only when the
# configured debug level is above 1.
sub dbg2 {
  my ($self, @msg) = @_;
  $self->{scoop}->dbg (@msg) if ($self->{cf}->{debug} > 1);
}

1;
