#===========================================================================
# 

package Sitescooper::DirCacheFactory;

use Carp;
use File::Find;

use Sitescooper::Main;
use Sitescooper::CacheFactory;
use Sitescooper::PerSiteDirCache;
use Sitescooper::CacheObject;

# This class inherits from Sitescooper::CacheFactory.  $SLASH is a
# package-wide variable assigned in new() from $Sitescooper::Main::SLASH
# and used below to join directory and file names.
@ISA = qw(Sitescooper::CacheFactory);
use vars qw{ @ISA $SLASH };
use strict;

# ---------------------------------------------------------------------------

# Constructor.  May be invoked on the class name or on an existing
# instance.
#
#   $main - the main object; it is handed to the parent constructor and
#           its {cf} entry is stored as $self->{cf}.
#
# Also (re)sets the package-wide $SLASH from $Sitescooper::Main::SLASH.
# Returns the new object, re-blessed into the invoking class.
sub new {
  my ($proto, $main) = @_;
  my $class = ref($proto) || $proto;

  my $self = bless ($class->SUPER::new($main), $class);

  $SLASH = $Sitescooper::Main::SLASH;

  # the cache directories stay undefined until open_cache() is called
  @{$self}{qw(cf cachedir newcachedir)} = ($main->{cf}, undef, undef);

  # $self->{seen_this_time}		= [ ];
  # $self->{last_modtime}		= { };
  # $self->{oldest_modtime_at_host}	= { };

  # from-path => to-path pairs, renamed by close_cache()
  $self->{caches_to_rename} = { };

  return $self;
}

# ---------------------------------------------------------------------------

# Prepare the on-disk cache layout, creating directories as needed.
#
# Reads from $self->{cf}:
#   tmpdir      - base directory for "cache", "new_cache" and (unless a
#                 shared cache is configured) "page_cache_dir"
#   sharedcache - optional; if defined, "page_cache_dir" lives under it
#   nowrite     - if true, new cache data goes to a separate "new_cache"
#                 directory and no expiry is run
#
# Sets $self->{alreadyseen}, {cachedir}, {newcachedir}, {pagecachedir}
# and, with a shared cache, {sharedcachedir}.
#
# Dies if a required directory cannot be created.
sub open_cache {
  my ($self) = @_;

  my $cf = $self->{cf};
  my $tmpdir = $cf->{tmpdir};

  # The global already-seen file is obsolete; drop any leftover copy.
  # This is best-effort, so a failed unlink is deliberately ignored.
  $self->{alreadyseen} = $tmpdir.$SLASH."already_seen.txt";
  if (-f $self->{alreadyseen}) {
    unlink ($self->{alreadyseen});
  }

  # $self->{newalreadyseen} = $self->{alreadyseen};

  $self->{cachedir} = $tmpdir.$SLASH."cache";
  _ensure_cache_dir ($self->{cachedir});
  $self->{newcachedir} = $self->{cachedir};

  if (defined $cf->{sharedcache}) {
    $self->{sharedcachedir} = $cf->{sharedcache};
    _ensure_cache_dir ($self->{sharedcachedir});
    $self->{pagecachedir} = $cf->{sharedcache}.$SLASH."page_cache_dir";

  } else {
    $self->{pagecachedir} = $tmpdir.$SLASH."page_cache_dir";
  }
  _ensure_cache_dir ($self->{pagecachedir});

  if ($cf->{nowrite}) {
    $self->{newcachedir} = $tmpdir.$SLASH."new_cache";
    _ensure_cache_dir ($self->{newcachedir});
    # $self->{cf}->{newalreadyseen} =
		 # $tmpdir.$SLASH."new_already_seen.txt";

  } else {
    $self->expire_cache();

    # Expiry removes empty directories below the shared cache dir, which
    # can include a page cache dir that was only just created above.
    # Make sure it still exists when we return.
    _ensure_cache_dir ($self->{pagecachedir});
  }
}

# Create directory $dir (mode 0777, as modified by the umask) unless it
# already exists.  Dies, reporting the OS error, if it cannot be created.
sub _ensure_cache_dir {
  my ($dir) = @_;

  return if (-d $dir);
  return if (mkdir ($dir, 0777));

  # Another process may have created the directory between the -d test
  # and the mkdir; that is fine as long as the directory exists now.
  my $err = "$!";
  return if (-d $dir);
  die "failed to mkdir '$dir': $err\n";
}

# ---------------------------------------------------------------------------

# Move every pending cache file into place.
#
# $self->{caches_to_rename} maps a source path to its destination path.
# Each pair is logged through dbg() and then renamed; a failed rename is
# reported with warn (including the OS error) and does not stop the
# remaining renames.
sub close_cache {
  my ($self) = @_;

  my $pending = $self->{caches_to_rename};

  # keys() always starts from the beginning of the hash, whereas each()
  # would resume from wherever a previous, unfinished iteration stopped.
  foreach my $from (keys %{$pending}) {
    my $to = $pending->{$from};
    $self->dbg ("Saving new cache file: $to");
    rename ($from, $to) or warn ("rename $from -> $to failed: $!\n");
  }
}

# ---------------------------------------------------------------------------

# Build the per-site cache object for one site.
#
#   $robot    - passed through unchanged to the per-site cache constructor
#   $sitename - passed through unchanged to the per-site cache constructor
#
# Returns a new Sitescooper::PerSiteDirCache constructed with
# ($self->{main}, $self, $robot, $sitename).
sub get_per_site_cache {
  my ($self, $robot, $sitename) = @_;

  # Direct method-call syntax; the indirect-object form ("new Class (...)")
  # is ambiguous to the parser.
  return Sitescooper::PerSiteDirCache->new ($self->{main},
					$self, $robot, $sitename);
}

# ---------------------------------------------------------------------------

# sub open_global_already_seen {
#   my $self = shift;
# 
#   if (!open (IN, "<".$self->{alreadyseen})) {
#     $self->verbose ("Cannot read $self->{alreadyseen}, ".
# 		      "creating a new one");
#     return;
#   }
# 
#   while (<IN>) {
#     /^(\S+) lastmod=(\d+)$/;
#     next if (!defined $2);
#     my $url = $1;
#     my $mod = $2;
#     $self->{last_modtime}->{$url} = $mod+0;
# 
#     if ($url =~ m,http://(\S+?)/,) {
#       my $urlhost = $1;
#       if (defined($self->{oldest_modtime_at_host}->{$urlhost})
# 		? $self->{oldest_modtime_at_host}->{$urlhost} > $mod : 1)
#       {
# 	$self->{oldest_modtime_at_host}->{$urlhost} = $mod;
#       }
#     }
#   }
#   close IN;
# }
# 
# # ---------------------------------------------------------------------------
# 
# sub commit_global_already_seen {
#   my $self = shift;
# 
#   my $towrite = '';
#   my $now = time;
#   my $twomonthsago = $now - (24*60*60*30*2);
#   my $twomegs = (1024*1024*2);
#   my $mod;
#   my $urlhost;
# 
#   # keep the already-seen list small by cutting out old entries.  We
#   # define "old entries" as (a) older than 2 months and (b) older than
#   # the oldest link we saw in today's scooping run.
#   #
#   if ($self->{cf}->{refresh} || (-s $self->{alreadyseen}) > $twomegs) {
#     $self->dbg ("trying to cut old entries from already-seen URL cache");
# 
#     foreach $_ (keys %{$self->{last_modtime}}) {
#       m,http://(\S+?)/,; $urlhost = $1; next unless defined ($urlhost);
# 
#       my $mod = $self->{last_modtime}->{$_};
#       if (defined $mod && defined $self->{oldest_modtime_at_host}->{$urlhost})
#       {
# 	if ($twomonthsago > $mod &&
# 		      $self->{oldest_modtime_at_host}->{$urlhost} > $mod)
# 	{
# 	  $self->dbg ("stripping old entry: $_ lastmod=$mod (".
# 	  				$self->{main}->time2datestr($mod).")");
# 	  next;
# 	}
#       }
#       $towrite .= $_." lastmod=".(defined $mod ? $mod : $now)."\n";
#     }
# 
#     if (open (OUT, ">".$self->{newalreadyseen})) {
#       print OUT $towrite;     # do it as one big atomic write, for safety
#       close OUT or warn "Cannot rewrite $self->{newalreadyseen}\n";
#     } else {
#       warn "Cannot rewrite $self->{newalreadyseen}\n";
#     }
# 
#   } else {
#     # it's small enough -- so we can just append to it.
#     $self->dbg ("appending already-seen URLs to $self->{newalreadyseen}");
# 
#     foreach $_ (@{$self->{seen_this_time}}) {
#       $towrite .= $_." lastmod=".(defined $self->{last_modtime}->{$_}
# 		  ? $self->{last_modtime}->{$_} : $now)."\n";
#     }
# 
#     if (open (OUT, ">>".$self->{newalreadyseen})) {
#       print OUT $towrite;     # do it as one big atomic write, for safety
#       close OUT or warn "Cannot append to $self->{newalreadyseen}\n";
#     } else {
#       warn "Cannot append to $self->{newalreadyseen}\n";
#     }
#   }
# }

# ---------------------------------------------------------------------------

# File::Find callback used by expire_cache().  Called with $_ set to the
# current entry; reads the age limit (in days) from
# $TmpGlobal::expiry_days.  Failures of unlink/rmdir are ignored.
sub expire_cache_do
{
  my @stbuf = stat ($_);
  return unless @stbuf;		# entry vanished or is unreadable

  # clean up files
  unlink ($_) if (-f _ && -M _ > $TmpGlobal::expiry_days);

  # clean up old dirs: a link count of 2 or less means no subdirectories;
  # rmdir itself refuses to remove a directory that still has entries.
  rmdir ($_) if (-d _ && $stbuf[3] <= 2);
}

# Remove cache files older than $self->{cf}->{expiry_days} days, and
# empty directories, from $self->{cachedir} and (if set)
# $self->{sharedcachedir}.
#
# The sweep is skipped when the "last_clean.stamp" file in the cache dir
# is younger than half the expiry period; otherwise the stamp is
# rewritten and the sweep runs.
#
# NOTE(review): every empty directory below the swept roots is removed,
# including ones created moments earlier; callers that need such a
# directory must (re)create it after calling this.
sub expire_cache {
  my $self = shift;

  my $expiry_days = $self->{cf}->{expiry_days};
  my $stamp = $self->{cachedir}.$SLASH."last_clean.stamp";

  my $stamp_age = -M $stamp;	# undef if there is no stamp yet
  return if (defined $stamp_age && $stamp_age <= $expiry_days/2.0);

  $self->verbose ("Expiring cache files...");

  # Touch the stamp.  If that fails the sweep would silently be repeated
  # on every run, so say so.
  if (open (my $touch, '>', $stamp)) {
    close ($touch);
  } else {
    warn ("cannot write stamp file '$stamp': $!\n");
  }

  # The callback takes no arguments, so the limit travels in a global;
  # local() restores it even if the directory walk dies.
  local $TmpGlobal::expiry_days = $expiry_days;

  # Depth-first, so a directory is examined only after its contents have
  # been expired; directories emptied by this pass go in the same pass.
  finddepth (\&expire_cache_do, $self->{cachedir});
  if (defined $self->{sharedcachedir}) {
    finddepth (\&expire_cache_do, $self->{sharedcachedir});
  }
}

# ---------------------------------------------------------------------------

# Forward a debug message (all remaining arguments) to the main object's
# dbg() method and hand back whatever it returns.
sub dbg {
  my $main = shift->{main};
  return $main->dbg(@_);
}

# Forward a progress message (all remaining arguments) to the main
# object's verbose() method and hand back whatever it returns.
sub verbose {
  my $main = shift->{main};
  return $main->verbose(@_);
}

1;
