(* Scheduled downloading *)
open Printf
open Unix
open Www
open Document
open Url
open Feed
open Retrieve

let debug = ref false 

(* Handling of data downloaded by this scheduler *)
module type Data =
  sig
   type t

   val load: handle -> document_id list -> string -> t
        (* [load dh referers file] *)
   val cache_access : Url.t -> document_id -> t
        (* [cache_access url referer] *)       	     
   val error : Url.t -> (document_id * (t -> unit)) list -> unit
        (* [error url conts] *)
   val error_msg : (Www.request * string) -> unit
       (* Retrieval produces Invalid_url *)
  end
   

module type S =
  sig
    type shared_data
    val add_request : 
      	Www.request -> document_id -> (shared_data -> unit) -> unit
        (* [add_request delayed wwwr ref_did cont] *)
    val stop : document_id -> unit
        (* [stop ref_did] *)

    (* Delayed queues for this scheduler *)
    type delayed
    val new_delayed : unit -> delayed
    val add_delayed : 
      	delayed -> Www.request -> document_id -> (shared_data -> unit) -> unit
    val flush_delayed : delayed -> unit
    val flush_one : delayed -> Url.t -> unit
    val is_empty : delayed -> bool
    val maxactive : int ref
  end   


module Make(J: Data) = 
  struct
    type shared_data = J.t

    let maxactive = ref 3

    (* A job is: a list of referers, with the continuations *)
    type job = {
	mutable stop : unit -> unit;
	mutable conts : (document_id * (shared_data -> unit)) list
      }

    (* The list of active requests *)
    let active = ref 0
    and actives = (Hashtbl.create 11 : (Url.t, job) Hashtbl.t)

    type queue = (request * document_id * (shared_data -> unit)) Queue.t

    (* scheduled is mutable because we need to remove items from the queue *)
    let scheduled = ref (Queue.create ())

    let rec next_request () =
      if !active < !maxactive then
        try
          let j = Queue.take !scheduled in
            process_request j;
            next_request()
        with
            Queue.Empty -> ()

    and process_request (wr, did, cont) =
      try (* if we are in the cache of shared objects, apply continuation *)
        cont (J.cache_access wr.www_url did)
      with
        Not_found ->
          (* find out if we are in the active jobs *)
        let url = wr.www_url in
         try
           let oldjob = Hashtbl.find actives url in
	     (* then add a new continuation *)
             oldjob.conts <- (did, cont) :: oldjob.conts
         with
           Not_found -> begin (* start a new job *)
             if !debug then
               Log.f (sprintf "Starting job for %s" (Url.string_of url));
             let job = {
                stop = (fun () ->
			  Hashtbl.remove actives url;
			  decr active);
                conts =  [did, cont]
                } in
           (* Add to set of active *)
             incr active;
             Hashtbl.add actives url job;
           (* We are going to run the retrieval process *)
           (* Continuations for the retrieval *)
             let handle_data dh =
	       (* job.stop is also used when stop is called *)
	       (* it must remove the url from actives       *)
	       job.stop <- 
		   (fun () -> 
		       dclose true dh;
		       Hashtbl.remove actives url;
		       decr active);
               try
                 (* open the temporary file *)
                 let file = Msys.mktemp "data" in
                 let oc = open_out file 
                 and buffer = String.create 2048 in
                 dh.document_feed.feed_schedule
		     (fun _ ->
		       try
			let n = dh.document_feed.feed_read buffer 0 2048 in
			  if n <> 0 then output oc buffer 0 n
			  else begin (* end of document *)
			    dclose true dh;
			    close_out oc;
			    let referers = List.map fst job.conts in
			    begin try 
      	       	       	     let data = J.load dh referers file in
			       List.iter (fun (referer,cont) -> 
			                   try Printexc.print cont data with
      	       	       	       	       	    _ -> flush Pervasives.stderr)
      	       	       	       	 job.conts
			    with (* load failed *)
			      e -> 
			        Printf.eprintf "Load error %s\n" 
      	       	       	       	   (Stringexc.f e);
				flush Pervasives.stderr;
      	       	       	       	J.error url job.conts
			    end;
			    (* we should remove from active only after 
			       loading. If loading is interactive, there 
			       could be a moment during which the document 
			       is not marked as loaded, but not active either.
			       But then dh has to be closed otherwise the
			       callback will we called indefinitely *)
			     Hashtbl.remove actives url;
			     decr active;
			     if !debug then begin
			       Printf.eprintf "Finished job for %s\n" 
			                      (Url.string_of url);
			       flush Pervasives.stderr
			       end;
			     next_request()
			    end
		       with
			  Unix_error(code,s,s') -> 
			     Printf.eprintf "Unix error (%s) in scheduler \
			                       %s %s\n"
				(error_message code) s s';
			     flush Pervasives.stderr;
			     close_out oc; error url job
      	       	       	| Sys_error s ->
			     Printf.eprintf "IO error (%s) in scheduler\n" s;
			     flush Pervasives.stderr;
      	       	       	     close_out oc; error url job
      	       	       	| e -> Printf.eprintf "Bug in scheduler %s\n"
                                     (Stringexc.f e);
      	       	       	       flush Pervasives.stderr;
      	       	       	       close_out oc; error url job)
               with (* error creating tmp file *)
		 Sys_error s -> 
		    Printf.eprintf "Can't create temporary file (%s)\n" s;
      	       	    flush Pervasives.stderr;
      	       	    error url job
	       | e -> Printf.eprintf "Bug in scheduler %s\n"
			    (Stringexc.f e);
		      flush Pervasives.stderr;
		      error url job

           (* Data has moved. The best way to do this properly is to 
              reschedule the job conts as new requests *)
	     and retry_data hlink =
	       try
		let newr = Www.make hlink in
		  newr.www_error <- wr.www_error;
		  newr.www_logging <- wr.www_logging;
		 List.iter (fun (did,cont) ->
			     Queue.add (newr, did, cont) !scheduled)
			 job.conts;
		job.stop();
		next_request()           
	       with
		_ -> error url job
              in
	   (* Okay, go for the retrieval now *)
	      try match Retrieve.f wr retry_data
		    {document_process = handle_data;
		     document_finish = (fun f -> if f then error url job)}
                  with
		     Retrieve.Started _ -> ()
		   | Retrieve.InUse ->
		     (* somebody else has started a request bypassing the
		        scheduler, dammit. Our only hope is that he's going
			to set the cache properly, so we can reschedule 
			ourself and try later *)
		         List.iter (fun (did,cont) ->
			       Queue.add (wr, did, cont) !scheduled)
			      job.conts;
			 job.stop();
			 next_request()           
	      with Invalid_request(w,msg) -> (* retrieve failed *)
	        J.error_msg (w,msg); error url job
	    end 


    (* error during data downloading *)
    and error url job =
      job.stop();
      J.error url job.conts;
      if !debug then begin
	Printf.eprintf "Retrieval of %s failed\n" (Url.string_of url);
	flush Pervasives.stderr
	end;
      next_request()

    let add_request wr did cont =
      Queue.add (wr, did, cont) !scheduled;
      next_request()

    let stop did =
    (* remove pending requests by did *)
      let q = Queue.create () in
	Queue.iter (function
		      (wr, didr, cont) when did = didr -> ()
		    | r -> Queue.add r q) 
		    !scheduled;
      scheduled := q;
     (* stop active jobs requested by did *)
       let rem = ref [] in
       Hashtbl.iter 
	 (fun url job ->
	   try 
	     job.conts <- Mlist.except_assoc did job.conts;
	     if job.conts = [] then rem := job :: !rem
	   with
	     Not_found -> ())
	 actives;
     (* each stop closes the cnx properly and remove the job from actives *)
       List.iter (fun job -> job.stop()) !rem;
       next_request()

     (* Delayed queues *)
     type delayed =
	 (request * document_id * (J.t -> unit)) list ref

     let new_delayed () = ref []

     let is_empty l = List.length !l = 0

     (* Actually, if the document is already in the cache, then process
        the continuation *)
     let add_delayed l wr did cont =
	try cont (J.cache_access wr.www_url did)
	with Not_found -> 
	   l := (wr,did,cont) :: !l

     let flush_delayed l =
       List.iter (fun q -> Queue.add q !scheduled) !l;
       l := [];
       next_request()

     let flush_one l url =
	let rec f = function
	  [] -> []
        | (wr,did,cont) as q :: rest when wr.www_url = url ->
	      Queue.add q !scheduled;
	      f rest (* we may have queued several times the same URL *)
        | q::rest ->
	     q::f rest
        in
         l := f !l;
	 next_request()
   end
