/* $Id: recurse.c,v 1.1.1.1 2001/04/23 14:26:40 ossi Exp $ *
 *
 * puf 0.9  Copyright (C) 2000,2001 by Oswald Buddenhagen <puf@ossi.cjb.net>
 * based on puf 0.1.x (C) 1999,2000 by Anders Gavare <gavare@hotmail.com>
 *
 * You may modify and distribute this code under the terms of the GPL.
 * There is NO WARRANTY of any kind. See COPYING for details.
 *
 * recurse.c - scan files/buffers for references to other urls
 *
 */

#include "puf.h"


/*
 *  Resolve a reference found in a document and queue it as a new url.
 *
 *  ref, len  - the reference text; it is NOT NUL-terminated, only the
 *              bytes ref[0] .. ref[len - 1] belong to it
 *  referer   - the url in whose content the reference was found
 *
 *  1.  Hand the reference to parse_add_url(); unless that returns -1
 *      the reference has been dealt with there (it had a protocol,
 *      or there was some error).
 *  2.  Otherwise expand it into a path on the referer's host: relative
 *      to the referer's directory, or to / if it starts with a slash.
 *      "." components are dropped, ".." strips the last directory.
 *  3.  Unless find_url() finds the result or same_dir() rejects it,
 *      allocate a url_t that inherits the referer's settings and pass
 *      it to add_url().
 *
 *  Nothing is added if len is 0, referer is null, the path would not
 *  fit into SHORTSTR bytes, ".." would climb above /, or the reference
 *  contains a '?' while inhibit_cgiget >= 0.
 */
void add_reference(char *ref, int len, url_t *referer)
{
    char buf[SHORTSTR];
    url_t *u;
    int totlen, pathlen, hash, p, t, sl;
    proxy_t *proxy;

    if (!len || !referer)
	return;

    checken("add_reference (top)");

    dbg(URL, ("add_reference '%.*s' by 'http://%s/%s'\n", len, ref,
	referer->host->name, referer->local_part));

    proxy = xtr_proxy(referer);
    if (parse_add_url(ref, len, 0, proxy, referer->strictproxy,
		      referer, 0, 0, referer->recurse_type) != -1)
	return;		/*  with protocol, maybe some error  */

    if (ref[0] == '/') {
	/*  absolute path: start from an empty path, skip the slash  */
	pathlen = 0;
	p = 1;
    } else {
	/*  relative path: start from the referer's directory  */
	pathlen = referer->path_len;
	if (pathlen < 0 || pathlen > SHORTSTR) {
	    /*  never copy more than buf can hold  */
	    prx(ERR, "relative ref '%.*s' in http://%s/%s too long\n", 
		len, ref, referer->host->name, referer->local_part);
	    return;
	}
	memcpy(buf, referer->local_part, pathlen);
	p = 0;
    }

    /*
     *  totlen is the number of bytes assembled in buf so far, pathlen
     *  the length of its directory part (up to and including the last
     *  slash). sl is the component separator; it is set to -1 once a
     *  '?' was seen, so that the remainder is taken as one component.
     */
    for (totlen = pathlen, sl = '/'; p < len; p++) {
	/*  Here we are positioned at the first character of a name:  */

	/*  Find the next slash or end of string. The byte is compared
	    as unsigned char so that 0xFF can never equal the -1
	    sentinel on platforms where char is signed.  */
	for (t = p; p < len && (unsigned char)ref[p] != sl; p++)
	    if (ref[p] == '?') {
		if (inhibit_cgiget >= 0) {
		    prx(NFO, "not adding relative ?-ref '%.*s'\n", len, ref);
		    return;
		} else
		    sl = -1;
	    }

	if (p - t > 0) {	/*  only add if there really was a name  */
	    if (p - t == 2 && ref[t] == '.' && ref[t + 1] == '.') {
		if (!totlen) {
		    prx(WRN, "relative ref '%.*s' in http://%s/%s points below /\n", 
		        len, ref, referer->host->name, referer->local_part);
		    return;
		}
		/*  drop the trailing slash, then everything back to
		    (but not including) the previous slash  */
		for (; --totlen > 0 && buf[totlen - 1] != '/'; );
		pathlen = totlen;
	    } else if (p - t != 1 || ref[t] != '.') {
		/*  room for the name plus a possible slash  */
		if (totlen + (p - t) + 1 > SHORTSTR) {
		    prx(ERR, "relative ref '%.*s' in http://%s/%s too long\n", 
			len, ref, referer->host->name, referer->local_part);
		    return;
		}
		memcpy(buf + totlen, ref + t, p - t);
		totlen += p - t;
		/*  ref[len] is not part of the reference, so only look
		    at ref[p] while p is still inside it  */
		if (p < len && ref[p] == '/') {
		    buf[totlen++] = '/';
		    pathlen = totlen;
		}
	    }
	}
    }
    dbg(URL, ("'%.*s' => / '%.*s' '%.*s'\n", len, ref, pathlen, buf,
	totlen - pathlen, buf + pathlen));

    if (find_url(buf, totlen, referer->host->info, &hash))
	return;

    if (!same_dir(buf, totlen, referer))
	return;

    /*  the url_t is followed by the path, its NUL and, if there is a
	proxy, a pointer to it  */
    if (!(u = mmalloc(sizeof(*u) + totlen + 1 + (proxy ? sizeof (proxy_t *) : 0))))
	return;

    u->local_part[totlen] = '\0';
    memcpy(u->local_part, buf, totlen);

    /*  NOTE(review): presumably read back by xtr_proxy() - confirm  */
    if (proxy)
	*(proxy_t **)(u->local_part + totlen + 1) = proxy;

#ifdef USE_MAGIC
    u->len = (u->local_part - (char *)&(u->len)) + totlen + 1 + (proxy ? sizeof (proxy_t *) : 0);
#endif
    u->url_hash = hash;
    u->referer = referer;
    u->host = referer->host;
    u->port = referer->port;
    u->http_auth = referer->http_auth;
    u->path_len = pathlen;
    u->disp_pathoff = referer->disp_pathoff;
    u->recurse_type = referer->recurse_type;
    u->is_top_dir = 0;
    u->havedisp = 0;
    u->haveproxy = proxy ? 1 : 0;
    u->relocs = 0;

    checken("add_reference (pre-end)");

    add_url(u);
}


/*
 * Find the first occurrence of rule in buff and return a pointer to
 * the byte right after the match, or 0 if there is none.
 *
 * buff  - the bytes to search
 * blen  - number of positions in buff at which a match may START;
 *         the rest of a match may extend beyond buff[blen - 1], so
 *         the caller has to keep enough readable bytes behind it
 * rule  - NUL-terminated string to look for; an empty rule never
 *         matches
 *
 * This is case-sensitive!!!
 */
char *matchen(char *buff, int blen, char *rule)
{
    char first = rule[0];
    int start, i;

    /*  an empty rule has nothing to match; without this check the
	loop below would look at rule[1], past the terminator  */
    if (!first)
	return 0;

    for (start = 0; start < blen; start++) {
	if (buff[start] != first)
	    continue;

	/*  first character fits, compare the rest; the buffer is only
	    read while there are rule characters left to compare  */
	for (i = 1; rule[i] && buff[start + i] == rule[i]; i++)
	    ;
	if (!rule[i])
	    return buff + start + i;
    }

    return 0;
}

/*
 *  Scan a buffer for references and pass each one to add_reference().
 *
 *  u        - url the buffer content belongs to; used as referer
 *  buf, len - the data to scan
 *  notlast  - non-zero if more data follows this buffer; then the last
 *             OVERLAPLEN bytes are not searched for keywords, else
 *             the last 15 bytes
 *
 *  The keywords "href=", "src=" and "background=" are looked up
 *  case-insensitively; "href=" is skipped when u->recurse_type is
 *  IMAGE_RECURSIVE. The value behind a keyword may be enclosed in
 *  \"...\", in "..." or be unquoted (then it also ends at '>'); in
 *  all cases it ends at '#' and at any character that is not greater
 *  than ' '.
 *
 *  Returns the number of leading bytes that were searched for
 *  keywords, or 0 if the buffer is too short to be searched at all.
 *
 *  NOTE(review): len must not exceed MAXBUFSIZE + OVERLAPLEN, the size
 *  of the local lower-case copy; this is not checked here.
 */
int recurse_buff(url_t *u, char *buf, int len, int notlast)
{
    char lbuf[MAXBUFSIZE + OVERLAPLEN], *moff;
    static char *stris[] = { "href=", "src=", "background=" };
    int p, p2, max = len - (notlast ? OVERLAPLEN : 15);
    unsigned cs;

    if (max <= 0)
	return 0;

    checken("recurse_buff (top)");

    /*  Lower-case copy for the keyword search. tolower() is only
	defined for unsigned char values and EOF, so bytes >= 0x80
	must not be passed as negative numbers.  */
    for (p = 0; p < len; p++)
	lbuf[p] = tolower((unsigned char)buf[p]);

    for (cs = u->recurse_type == IMAGE_RECURSIVE ? 1 : 0;
	 cs < sizeof(stris) / sizeof(stris[0]); cs++) {
	/*  keywords are searched in lbuf, the values are taken from
	    buf to keep their original case  */
	for (p = 0; (moff = matchen(lbuf + p, max - p, stris[cs]));) {
	    p = moff - lbuf;

	    /*  Find the end of the href-string  */
	    /*  NOTE(review): where char is signed, bytes >= 0x80 fail
		the "> ' '" tests below and end the string, too  */
	    if (buf[p] == '\\' && buf[p + 1] == '"') {
		/*  value enclosed in \"...\"  */
		p += 2;
		p2 = p;
		while (p + 2 < len && buf[p] != '#' && buf[p] > ' '
		       && (buf[p] != '\\' || buf[p + 1] != '"'))
		    p++;
	    } else if (buf[p] == '"') {
		/*  value enclosed in "..."  */
		p++;
		p2 = p;
		while (p + 1 < len && buf[p] != '#' && buf[p] > ' '
		       && buf[p] != '"')
		    p++;
	    } else {
		/*  unquoted value  */
		p2 = p;
		while (p + 1 < len && buf[p] != '#' && buf[p] > ' '
		       && buf[p] != '>')
		    p++;
	    }

	    if (p - p2 > 0)
		add_reference(buf + p2, p - p2, u);
	}
    }

    checken("recurse_buff (end)");

    return max;
}


/*
 *  Scan the data readable from an open file descriptor for references.
 *
 *  u     - url the data belongs to; passed on to recurse_buff()
 *  fi    - descriptor to read from, until end of file
 *  bupo  - if non-null, the tail of the data that recurse_buff() did
 *          not search is copied in front of *bupo, and *bupo is moved
 *          back by the size of that tail
 *  lepo  - only used if bupo is non-null: *lepo is increased by the
 *          size of that tail
 *
 *  The data is processed in chunks of MAXBUFSIZE bytes; the last
 *  OVERLAPLEN bytes of a full chunk are carried over to the start of
 *  the next one. A read error is reported and then treated like end
 *  of file.
 */
void recurse_pfile(url_t *u, int fi, char **bupo, int *lepo)
{
    char buf[MAXBUFSIZE];
    int len, off, got;

    /*  Scan file for href's and src's:  */
    len = 0;
    for (;;) {
	/*  read() may return less than was asked for without the file
	    being at its end, so go on until the buffer is full or
	    nothing more comes  */
	while (len < MAXBUFSIZE) {
	    got = read(fi, buf + len, MAXBUFSIZE - len);
	    if (got > 0) {
		len += got;
		continue;
	    }
	    if (got < 0 && errno == EINTR)
		continue;	/*  interrupted, just try again  */
	    if (got < 0) {
		prx(ERR, "read error while scanning for links: %s\n",
		    strerror(errno));
	    }
	    break;
	}
	if (len < MAXBUFSIZE)
	    break;		/*  last (partial or empty) chunk  */

	recurse_buff(u, buf, len, 1);
	/*  NOTE(review): memcpy is fine only while OVERLAPLEN is at
	    most half of MAXBUFSIZE - confirm  */
	memcpy(buf, buf + MAXBUFSIZE - OVERLAPLEN, OVERLAPLEN);
	len = OVERLAPLEN;
    }

    off = recurse_buff(u, buf, len, !!bupo);
    if (bupo) {
	/*  hand the part that was not searched back to the caller  */
	len -= off;
	*bupo -= len;
	*lepo += len;
	memcpy(*bupo, buf + off, len);
    }

    checken("recurse_pfile (end)");
}


/*
 *  Scan a whole file for references.
 *
 *  u     - url the file belongs to; passed on to recurse_pfile()
 *  name  - name of the file, opened read-only through mmfopen()
 *
 *  If the file cannot be opened, an error including strerror(errno)
 *  is printed and nothing is scanned.
 */
void recurse_file(url_t *u, char *name)
{
    int fd;

    fd = mmfopen(name, O_RDONLY, &fd);
    if (fd < 0) {
	prx(ERR, "cannot scan %s for links: %s\n", name, strerror(errno));
	return;
    }

    /*  no tail hand-back wanted: the file is scanned to its end  */
    recurse_pfile(u, fd, 0, 0);
    close(fd);
}

