///////
   //    HtmlParser.cc
   //    HtmlParser Class definitions
   //
   //    Class for parsing HTML documents
   //
   //    Copyright (c) 1999-2002 Comune di Prato - Prato - Italy
   //    Author: Gabriele Bartolini - Prato - Italy <angusgb@users.sourceforge.net>
   //
   //    For copyright details, see the file COPYING in your distribution
   //    or the GNU General Public License version 2 or later 
   //    <http://www.gnu.org/copyleft/gpl.html>
   //
   //    $Id: HtmlParser.cc,v 1.27 2002/02/06 11:04:34 angusgb Exp $
   //
   //    G.Bartolini
   //    started: 30.03.2000
///////

#include "HtmlParser.h"
#include "HtSGMLCodec.h"
#include <ctype.h>   // for isspace()
#include "Configuration.h" // for META attributes parsing

   // Static variables initialization
   // Debug level shared by all HtmlParser objects (static member). The
   // methods in this file compare it against fixed thresholds
   // (debug > 0 ... debug > 5) to decide how much diagnostic output to print.
   // operator() calls SetDebugLevel() with the Scheduler's level on every run
   // (presumably updating this value -- SetDebugLevel is defined elsewhere).
      int HtmlParser::debug = 0;

// This defines the maximum number of characters allowed in an HTML tag
// between the starting '<' and the closing '>'. operator() does not parse
// longer candidates as tags: they are assumed to come from a stray '<'
// (a missing '&lt;' entity) rather than from a real tag.
#define MAX_TAG_SIZE 4096


// Default constructor
//
// Builds a parser that is not yet bound to any Scheduler or base URL.
// Besides the two pointers, every piece of parsing state used by the other
// methods of this file is given a defined value, so that nothing is left
// indeterminate before the first call to operator() (which sets the same
// values again at the start of each document).
HtmlParser::HtmlParser()
: CurrentScheduler(0), BaseUrl(0)
{
   // Same defaults that operator() establishes before parsing a document
   in_script = false;
   in_title = false;
   ignore = false;
   memo = true;

   // No document buffer is attached yet
   position = 0;
   ppos = 0;
   ptext = 0;

   // No tag has been met yet
   TagPosition = 0;
}

// Destructor
//
// Releases the base URL only when it is a distinct object from the
// Scheduler's current URL (the case produced by the BASE tag handling in
// FindLink); otherwise there is nothing to release here.
HtmlParser::~HtmlParser()
{
   // No base URL was ever assigned: nothing to do
   if (!BaseUrl)
      return;

   // The base URL is just the Scheduler's current URL: leave it alone
   if (BaseUrl == CurrentScheduler->CurrentUrl)
      return;

   // Base Url different from CurrentUrl. So delete it.
   delete BaseUrl;
}


// Operator overloading () -> makes this class a function object.
// This is used by the Scheduler object in order to parse a
// document (previously retrieved).
//
// The contents of the Scheduler's current response are scanned from the
// beginning to the end:
//  - "<!-- ... -->" comments and other "<! ... >" declarations are skipped;
//  - every other "<...>" sequence that looks like a tag is copied into the
//    member buffer 'text' and handed over to ParseTag();
//  - characters met while 'in_title' is set are collected as the document
//    title, which is eventually stored into the Scheduler's current URL.
//
// Parameters:
//    scheduler   the Scheduler owning the document, the current URL,
//                the configuration and the database
//
// Returns:
//    HtmlParser_OK on success, or the failure code reported by ParseTag()
//    (HtmlParser_StatementFailed, HtmlParser_AttributeFailed,
//    HtmlParser_LinkFailed) as soon as a database insert fails.

HtmlParser::HtmlParser_Codes HtmlParser::operator() (Scheduler &scheduler)
{

   // Initialization
   CurrentScheduler = &scheduler;
   in_script = false;
   in_title = false;
   ignore = false;
   memo = true;


   // HTML Title of the document
   String Title = 0;
   Title.trunc();

   // Set debug Level
   SetDebugLevel(CurrentScheduler->GetDebugLevel());

   // Contents of the document, held in a local String: the parsing below
   // writes string terminators into this buffer
   String Contents = CurrentScheduler->CurrentResponse->GetContents();

   // position is set to the beginning of the retrieved document contents
   position = (unsigned char *) Contents.get();

   // Initialize the tag position index
   TagPosition = 0;

   // Assign the base URL used for resolving relative paths
   BaseUrl = CurrentScheduler->CurrentUrl;

   // Let's start parsing the HTML document, from the beginning
   while (*position)
   {

      // Let's check for a comment or a possible DTD declaration

      if (strncmp((char *)position, "<!", 2) == 0)
      {
         position += 2;
         if (strncmp((char *)position, "--", 2) == 0)
         {
            // Yes ... it is a comment - Go to its end
            do    // Loop until we find a '>' preceded by 2 '-' at least
            {
               ppos = (unsigned char *) strstr((char *)position, "--");

               if (!ppos)
               {
                  // Unterminated comment: ignore the rest of the document
                  *position = '\0';
                  break;
               }
               else
               {

                  position = ppos + 2;

                  // Skip extra dashes after a badly formed comment
                  while (*position == '-') ++position;

                  // Skip whitespace
                  while (isspace(*position)) ++position;
               }
            } while (*position && *position != '>');

            if (*position == '>') ++position; // End of comment
         }
         else
         {
            // It's not a comment declaration but could be a DTD declaration
            ppos = (unsigned char *) strchr((char *)position, '>');
            if (ppos) position = ppos + 1; // Found the end. Let's skip the char
            else *position = '\0'; // Not found. Ignore the rest.
         }

         continue;

      }

      if (*position == '<')
      {
         ++position; // skip the initial '<'

         // Now ... something strange may appear. Let's think of
         // a malformed HTML document, in which the writer puts
         // a '<' symbol instead of a '&lt;' sgml entity.
         // Let's try to catch it, even if it is very difficult;

         // Do we have a valid character after the '<'?
         while (isspace(*position))
            ++position;

         // Maybe it wasn't a valid tag
         // If we are here we may assume we have a valid character,
         // after '<', so an alpha char, or a '/' for closing tags.
         // But we can also have something like:
         // <B.%2  -- Don't ask me why, but somebody got it!!!

         // Another check to perform is if we find a not alphabetic
         // character before a space or a closing tag.

         bool not_good = false;
         for (ppos = position; !not_good && *ppos && !isspace(*ppos)
            && *ppos != '>'; ++ppos)
         {
            if (!isalpha(*ppos) && *ppos != '/')
               not_good = true;
         }

         // We found a not valid character before a space! Skip this tag.
         if (not_good) continue;

         // Start of a tag. Let's search for the closing '>'
         // But we can also have it after the previous loop
         if (*ppos && *ppos != '>')
            ppos = (unsigned char *) strchr((char *)position, '>');

         if (ppos)
         {

            // Another trick to catch a malformed tag declaration
            // that is to say a missing '&lt;', let's check if
            // the tag size is bigger than a fixed size (MAX_TAG_SIZE)

            if ((int) (ppos - position) > MAX_TAG_SIZE)
               continue;

            // Found. Let's copy it, by skipping '<' and '>'
            ptext = text;

            // copy the characters from the source to the destination
            while (position < ppos)
               *ptext++ = *position++;

            *ptext = '\0';   // close the string

            // Skip the closing '>'. When the document ends inside the tag
            // name (e.g. a trailing "<br"), ppos stopped on the string
            // terminator instead: it must not be skipped, otherwise the
            // outer loop would go on reading past the end of the buffer.
            if (*position)
               ++position;

            ++TagPosition;

            // Let's parse the tag by using the member attribute 'text'
            // and then check the status of the parser
            switch (ParseTag())
            {
               case HtmlParser_EmptyTag:
                  if (debug > 1)
                     cout << "Warning! Empty tag: "
                        << htmlstatement << " - " << text << endl;
                  break;

               case HtmlParser_TagNotStored:
                  if (debug > 3)
                     cout << "Tag not stored: "
                        << htmlstatement << " - " << text << endl;
                  break;

               case HtmlParser_MalformedTag:
                  if (debug > 0)
                     cout << "Warning! Malformed tag: "
                        << htmlstatement << " - " << text << endl;
                  break;

               case HtmlParser_StatementFailed:
                  if (debug > 0)
                     cout << "Error! Insert of HTML statement failed: "
                        << htmlstatement << " - " << text << endl;
                  return HtmlParser_StatementFailed;

               case HtmlParser_AttributeFailed:
                  if (debug > 0)
                     cout << "Error! Insert of HTML attribute failed: "
                        << htmlattribute << " - " << text << endl;
                  return HtmlParser_AttributeFailed;

               case HtmlParser_LinkFailed:
                  if (debug > 0)
                     cout << "Error! Insert of this link failed: "
                        << link << " - " << text << endl;
                  // Report the failure with its own code (it used to be
                  // reported as HtmlParser_AttributeFailed)
                  return HtmlParser_LinkFailed;

               case HtmlParser_OK:  // Do nothing
               default:  // Do nothing
                  break;
            }

         }
         else
         {
            while (*position) ++position;  // reach the end (no more tags)
         }
      }
      else
      {
         // Plain text: it is only kept when it belongs to the title
         if (in_title) Title.append(*position);
         ++position;
      }

   }

   CurrentScheduler->CurrentUrl->SetTitle(encodeSGML(Title));

   return HtmlParser_OK;

}


// Parses the tag previously copied into the member buffer 'text'
// (the characters between '<' and '>').
//
// The tag name is isolated and stored into 'htmlstatement'; then every
// attribute (with its optional content, quoted or not) is isolated in turn,
// stored into 'htmlattribute' and checked for links through FindLink().
// The buffer is modified in place: string terminators are written over the
// separators in order to split it into tokens.
//
// Statement and attributes are inserted into the database according to the
// "store_only_links" configuration option: when it is set, only attributes
// holding a link (and the tag they belong to) are stored.
//
// Returns:
//    HtmlParser_EmptyTag        the buffer holds no tag name
//    HtmlParser_TagNotStored    the tag must not be stored (CheckTag()
//                               refused it, or we are inside a SCRIPT)
//    HtmlParser_MalformedTag    an unexpected character was found where an
//                               attribute name was expected
//    HtmlParser_StatementFailed / HtmlParser_AttributeFailed /
//    HtmlParser_LinkFailed      a database insert failed
//    HtmlParser_OK              otherwise
HtmlParser::HtmlParser_Codes HtmlParser::ParseTag ()
{

   bool has_attributes = false;
   bool tag_with_link = false;   // the statement has already been inserted
   bool malformed_tag = false;

   // Temporary pointer ('register' is no longer a valid storage class
   // specifier since C++17, hence it is not used)
   unsigned char *ptmp;

   // Statement
   unsigned char *Statement = text;

   // Skip initial spaces
   while (*Statement && isspace(*Statement))
      ++Statement;

   if (!*Statement)
      return HtmlParser_EmptyTag;   // Empty

   // Reset htmlstatement variable
   htmlstatement.Reset();

   // Set the IDUrl for the HtmlStatement object
   htmlstatement.SetIDUrl(CurrentScheduler->CurrentSchedule.GetIDSchedule());

   // Set the whole statement (this must happen before the buffer is split)
   htmlstatement.SetStatement((char *)Statement);

   // Set the tag position
   htmlstatement.SetTagPosition(TagPosition);

   ptmp = Statement;   // Stores the beginning of the tag

   while (*Statement && !isspace(*Statement))
      ++Statement;

   if (ptmp == Statement) // No tag !!!
      return HtmlParser_EmptyTag;

   if (*Statement)
   {
      // Check for a tag with attributes
      *Statement = '\0';   // close the tag name

      if (debug > 5)
         cout << "Tag found: " << ptmp << endl;

      // go on
      ++Statement;

      // Skip everything but alphabetic chars after the tag
      while (*Statement && !isalpha(*Statement))
         ++Statement;

      if (*Statement)
         has_attributes = true; // The current tag has attributes

   }

   htmlstatement.SetTag((char *)ptmp);

   // We got the TAG info we need

   if (! CheckTag((char *)ptmp))
      memo = false;    // Not store it
   else memo = true;

   if (ignore)
   {
      if (!in_script)
      {

         // We just found a closing </SCRIPT> tag
         ignore = false;
         memo = true;
      }
      else memo = false;
   }
   else
   {
      if (in_script)    // We found a <SCRIPT> tag. We ignore the following tags
         ignore = true;
   }

   // We don't have to store it
   if (!memo) return HtmlParser_TagNotStored;

   // The option does not change while a tag is parsed: read it only once
   // instead of once per attribute
   const bool store_everything =
      ! CurrentScheduler->Config->Boolean("store_only_links");

   if (has_attributes)
   {
      // Let's look for attributes
      // Starting point: Statement now points to the first attribute

      unsigned int AttrPosition = 0;

      while (*Statement)   // Until we reach the end look for attributes
      {
         ptmp = Statement;

         // Look for an attribute definition
         // Goes on until we reach:
         // 1) the end or a whitespace not followed by '=' (empty attribute)
         // 2) a '=': the attribute has a content which may contain
         //    SGML entities too

         while (*Statement && !isspace(*Statement) && *Statement != '=')
            ++Statement;

         while (*Statement && isspace(*Statement))
            *Statement++ = '\0'; // Close the attribute string

         if (ptmp == Statement) // No attribute !!!
         {
            // Hey guys, if statement is not empty, this may
            // represent a malformed tag. Let's show it!
            if (*Statement)
               malformed_tag = true;

            // Give up with the rest of the tag (this ends the loop)
            *Statement = '\0';
            continue;
         }

         // Reset htmlattribute variable
         htmlattribute.Reset();

         // Set the IDUrl for the HtmlAttribute object
         htmlattribute.SetIDUrl(htmlstatement.GetIDUrl());

         // Set the tag position
         htmlattribute.SetTagPosition(TagPosition);

         // Set the attribute position
         htmlattribute.SetAttrPosition(++AttrPosition);

         bool has_content = false;
         bool attribute_with_link = false;

         if (*Statement && *Statement == '=')
         {
            has_content = true;  // Attribute has a content
            *Statement++ = '\0';
         }

         htmlattribute.SetAttribute((char *)ptmp);

         if (has_content)
         {
            // The content can be written inside quotation marks or not.
            // If yes we search for the matching one, else for the first space.

            while (*Statement && (isspace(*Statement) || *Statement == '='))
               ++Statement;   // Skip spaces after '=' or multiple '='

            if (*Statement)
            {

               // Not empty content
               if (*Statement == '"' || *Statement == '\'')
               {

                  char qm = *Statement;  // Store the quotation mark
                  ++Statement;           // Skip quotation mark (' or ")

                  ptmp = Statement;

                  // Look for a closing quotation mark
                  Statement = (unsigned char *) strchr ((char *)ptmp, qm);

                  if (Statement)
                  {
                     // Found.
                     *Statement = '\0';
                     ++Statement;
                  }
                  else
                  {
                     // Not found the closing quotation mark
                     // Everything is content
                     Statement = ptmp;
                     while (*Statement) ++Statement; // reach the end
                  }

                  // Set content
                  htmlattribute.SetContent((char *)ptmp);

               }
               else
               {
                  // Content outside a quotation mark
                  ptmp = Statement;

                  // Content is considered until a whitespace or the end
                  // is reached.

                  while (*Statement && !isspace(*Statement))
                     ++Statement;

                  if (*Statement)
                  {
                     *Statement = '\0';
                     ++Statement;
                  }

                  htmlattribute.SetContent((char *)ptmp);

               }

            }

            // We got a HTML attribute with a content.
            // Let's find a Link

            switch (FindLink())
            {
               case HtmlParser_LinkFailed:   // insert of the link failed
                  return HtmlParser_LinkFailed;

               case HtmlParser_NormalLink:   // it has a link
               case HtmlParser_DirectLink:   // ditto
               case HtmlParser_Anchor: // we must store it
                  attribute_with_link = true;   // the attribute contains a link
                  break;

               case HtmlParser_NoLink: // No Link. Do nothing
               default:
                  break;
            }

         }

         // The attribute is stored:
         // - if we set the option "store_only_links" to false
         // - If it contains a link, anyway

         if (attribute_with_link || store_everything)
         {

            // The tag also has to be inserted (only once per tag)
            if (!tag_with_link)
            {
               // Database Insertion of the HtmlStatement object

               // Check if it fails
               if (!CurrentScheduler->GetDB()->Insert(htmlstatement))
                  return HtmlParser_StatementFailed; // Failed

               tag_with_link = true;

            }

            // Database Insertion of the HtmlAttribute object
            if (!CurrentScheduler->GetDB()->Insert(htmlattribute))
               return HtmlParser_AttributeFailed; // Failed

         }

         while (*Statement && isspace(*Statement))
            ++Statement;   // goes on ...

      }
   }
   else
   {
      // Tag with No attributes

      if (store_everything)
      {
         // The tag also has to be inserted
         if (!CurrentScheduler->GetDB()->Insert(htmlstatement))
            return HtmlParser_StatementFailed; // Failed

      }

   }

   if (malformed_tag)
      return HtmlParser_MalformedTag;

   return HtmlParser_OK;

}



// This method works out whether the current attribute ('htmlattribute') of
// the current tag ('htmlstatement') contains a link. If yes it provides
// its storing: the destination URL is resolved against the base URL, added
// to the Scheduler and a 'link' record is inserted into the database.
//
// Special cases:
//  - <A name=...> is an anchor: its content is converted through
//    encodeSGML() and it is reported as HtmlParser_Anchor, but no link
//    record is written;
//  - <META http-equiv="refresh" content="...url=..."> is a normal link
//    towards the URL found after "url=";
//  - <BASE href=...> replaces the base URL used for relative URIs and is
//    not a link itself.
//
// Returns:
//    HtmlParser_NoLink       the attribute holds no link
//    HtmlParser_NormalLink   a normal link has been stored
//    HtmlParser_DirectLink   a direct link (EMBED, OBJECT, IMG,
//                            'background') has been stored
//    HtmlParser_Anchor       the attribute is an anchor definition
//    HtmlParser_LinkFailed   the insert of the link record failed

HtmlParser::HtmlParser_Codes HtmlParser::FindLink ()
{

   String Tag = htmlstatement.GetTag();
   String Attribute = htmlattribute.GetAttribute();
   int is_a_link = 0; // Values: 0 - No Link ; 1 - Normal Link ; 2 - Direct Link
                            //   -1 : Anchor (no link)

   String Content(htmlattribute.GetContent());

   ///////
      //    'A' tag
   ///////

   if (! Tag.nocase_compare("A"))
   {
      if (! Attribute.nocase_compare("href")) // A href
         is_a_link = 1;
      else if (! Attribute.nocase_compare("name")) // A name
      {
         // It's an anchor. Let's convert its SGML entities
         htmlattribute.SetContent(encodeSGML(htmlattribute.GetContent()));
         // And let's store it always ... even if it's not a link
         is_a_link = -1;   // Special case - not to be stored in the link table
      }
   }

   ///////
      //    'META' tag
   ///////

   else if (! Tag.nocase_compare("META"))
   {
      if (! Attribute.nocase_compare("content")) // Here it's the info
      {
         // The whole statement is parsed again as a list of name=value
         // pairs, in order to look at the 'http-equiv' attribute
         Configuration attrs;

         attrs.NameValueSeparators("=");
         attrs.Add(htmlstatement.GetStatement());

         if (!attrs["http-equiv"].empty())
         {
            if (! mystrcasecmp(attrs["http-equiv"], "refresh"))
            {

               // Work on a copy: it gets truncated after the URL
               String tmp (htmlattribute.GetContent());
               char *q = (char *) mystrcasestr((char *)tmp, "url=");

               if (q)
               {
                  // Found a Meta 'refresh' directive

                  if (debug > 4)
                     cout << " META refresh found. " << endl;

                  q += 3; // skipping "URL"

                  // And any junk space between 'URL' and '=' and after
                  while (*q && ((*q == '=') || isspace(*q))) q++;

                  // The URL ends at the first ';', '"' or whitespace
                  char *qq = q;
                  while (*qq && (*qq != ';') && (*qq != '"') &&
                     !isspace(*qq)) ++qq;

                  *qq = 0;

                  is_a_link = 1;

                  Content = q;

               }
            }
         }
      }
   }

   ///////
      //    'FRAME' tag
   ///////
   else if (! Tag.nocase_compare("FRAME"))
   {
      if (! Attribute.nocase_compare("src")) // FRAME src
         is_a_link = 1;
   }

   ///////
      //    'EMBED' tag
   ///////
   else if (! Tag.nocase_compare("EMBED"))
   {
      if (! Attribute.nocase_compare("src")) // EMBED src
         is_a_link = 2; // Direct Link
   }

   ///////
      //    'OBJECT' tag
   ///////
   else if (! Tag.nocase_compare("OBJECT"))
   {
      if (! Attribute.nocase_compare("src")) // OBJECT src
         is_a_link = 2; // Direct Link
      else if (! Attribute.nocase_compare("data")) // OBJECT data
         is_a_link = 2; // Direct Link
   }

   ///////
      //    'IMG' tag
   ///////
   else if (! Tag.nocase_compare("IMG"))
   {
      if (! Attribute.nocase_compare("src") // IMG src
         || ! Attribute.nocase_compare("lowsrc")) // IMG lowsrc
         is_a_link = 2; // Direct Link
   }

   ///////
      //    'AREA' tag
   ///////
   else if (! Tag.nocase_compare("AREA"))
   {
      if (! Attribute.nocase_compare("href")) // AREA href
         is_a_link = 1;
   }

   ///////
      //    'LINK' tag
   ///////
   else if (! Tag.nocase_compare("LINK"))
   {
      if (! Attribute.nocase_compare("href")) // LINK href
         is_a_link = 1;
   }

   ///////
      //    'BASE' tag (Ugly command!)  ;-)
   ///////
   else if (! Tag.nocase_compare("BASE"))
   {
      if (! Attribute.nocase_compare("href")) // BASE href
      {
         // Let's define a new BASE Url, used for resolving
         // relative URIs. I don't know who can use this, but HTML 4.0
         // enables it.

         if (BaseUrl != CurrentScheduler->CurrentUrl)
            delete BaseUrl; // Base Url different from CurrentUrl. So delete it.

         BaseUrl = new _Url (encodeSGML(Content),
            *(CurrentScheduler->CurrentUrl));

         if (BaseUrl)
         {
            if (debug > 0)
               cout << " New Base Url for relative URIs: "
                  << BaseUrl->get() << endl;
         }
         else BaseUrl = CurrentScheduler->CurrentUrl;

      }
   }

   ///////
      //    Let's store any other 'href' attribute
   ///////
   else if (! Attribute.nocase_compare("href"))
      is_a_link = 1;

   ///////
      //    Let's store any other 'src' attribute
   ///////
   else if (! Attribute.nocase_compare("src"))
      is_a_link = 1;

   ///////
      //    Let's store any 'background' attribute (BODY, TABLE, etc ...)
   ///////
   else if (! Attribute.nocase_compare("background"))
         is_a_link = 2; // Direct Link

   // Let's store the links

   if (is_a_link > 0)
   {

      // Destination URL, resolved against the base URL. It is an automatic
      // object, so that it is released on every way out of this block,
      // the early return on a failed insert included.
      _Url DestUrl (encodeSGML(Content), *BaseUrl);

      unsigned int IDUrlDest = 0; // Valid referenced Url

      CurrentScheduler->AddUrl(DestUrl.get(), IDUrlDest);

      if (debug > 3)
         cout << htmlattribute.GetContent() << " -> "
            << DestUrl.get() << endl;

      link.Reset();     // reset the previous link object

      // Set the source Url ID
      link.SetIDUrlSrc(CurrentScheduler->CurrentUrl->GetID());

      // Set the dest Url ID
      link.SetIDUrlDest(IDUrlDest);

      // Set the tag position
      link.SetTagPosition(htmlstatement.GetTagPosition());

      // Set the attribute position
      link.SetAttrPosition(htmlattribute.GetAttrPosition());

      // Set the anchor field, if a '#' is present in the
      // HTML attribute's content
      // (named so that it does not hide the 'position' member)
      int hash_position;  // position of '#' inside a URL

      if ((hash_position = htmlattribute.GetContent().lastIndexOf('#')) != -1)
      {
         // There's an anchor
         link.SetAnchor(encodeSGML(
            htmlattribute.GetContent().sub(hash_position + 1)));
      }

      // Set the Link Type
      switch (is_a_link)
      {
         case 1:
            link.SetLinkType("Normal");
            break;
         case 2:
            link.SetLinkType("Direct");
            break;
      }

      // Let's check whether it regards a 'file://' call
      // which is certainly broken, or an e-mail address

      if (CurrentScheduler->CurrentLinkSchedule.GetStatus()
         == SchedulerEntry::Url_FileProtocol)
      {
         // Hey, there's a 'file://' call, it's an error!

         link.SetLinkResult("Broken");
         if (debug > 2)
            cout << " 'file:/' link, error!" << endl;
      }
      else if (CurrentScheduler->CurrentLinkSchedule.GetStatus()
         == SchedulerEntry::Url_Malformed)
      {
         // Hey, there's a malformed URL, it's an error!

         link.SetLinkResult("Broken");
         if (debug > 2)
            cout << " link to a malformed URL, error!" << endl;
      }
      else if (CurrentScheduler->CurrentLinkSchedule.GetStatus()
         == SchedulerEntry::Url_EMail)
      {
         // There's an e-mail address!
         link.SetLinkResult("EMail");
         if (debug > 2)
            cout << " e-mail address!" << endl;
      }
      else if (CurrentScheduler->CurrentLinkSchedule.GetStatus()
         == SchedulerEntry::Url_Javascript)
      {
         // There's a Javascript inserted through the pseudo-protocol
         // that is to say 'javascript:'
         link.SetLinkResult("Javascript");
         if (debug > 2)
            cout << " link to Javascript URL "
               << "(through the 'javascript:' pseudo-protocol)!" << endl;
      }

      // Write the link object
      if (!CurrentScheduler->GetDB()->Insert(link))
         return HtmlParser_LinkFailed;

   }

   switch (is_a_link)
   {
      case 0:
         return HtmlParser_NoLink;
      case 1:
         return HtmlParser_NormalLink;
      case 2:
         return HtmlParser_DirectLink;
      case -1:
         return HtmlParser_Anchor;
   }

   // We should not get up to here, anyway this avoids warning messages
   return HtmlParser_NoLink;

}


// Thin wrapper around the shared HtSGMLCodec object: hands 'str' over to
// its encode() method and returns the resulting string.
// The conversion itself is entirely defined by HtSGMLCodec.
const String HtmlParser::encodeSGML(const String &str)
{
   return (*HtSGMLCodec::instance()).encode(str);
}

// Thin wrapper around the shared HtSGMLCodec object: hands 'str' over to
// its decode() method and returns the resulting string.
// The conversion itself is entirely defined by HtSGMLCodec.
const String HtmlParser::decodeSGML(const String &str)
{
   return (*HtSGMLCodec::instance()).decode(str);
}


// Keeps track of the SCRIPT and TITLE sections of the document and tells
// whether a tag has to be stored.
//
// Parameters:
//    tag   name of the tag, with a leading '/' for a closing tag
//
// Side effects:
//    'in_script' is set by SCRIPT and cleared by /SCRIPT;
//    'in_title' is set by TITLE and cleared by /TITLE
//    (names are compared regardless of the case).
//
// Returns:
//    always 1 (store the tag). A filter restricted to a set of tags
//    (A, IMG, META, OBJECT, EMBED, BODY, FRAME, AREA, LINK, BASE), returning
//    0 for any other tag, used to be sketched here but is not enabled.
int HtmlParser::CheckTag(char *tag)
{
   if (debug > 5)
      cout << "Checking tag: " << tag << endl;

   // Split the tag into "is it a closing tag?" and its bare name
   const bool is_closing = (tag && *tag == '/');
   char *bare_name = is_closing ? tag + 1 : tag;

   // An opening tag raises the matching flag, a closing one lowers it
   if (mystrcasecmp(bare_name, "SCRIPT") == 0)
   {
      in_script = !is_closing;
   }
   else if (mystrcasecmp(bare_name, "TITLE") == 0)
   {
      in_title = !is_closing;
   }

   // Every tag is currently accepted
   return 1;
}
