1 files changed, 1002 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc b/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc
new file mode 100644
index 00000000..56e1d00f
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc
@@ -0,0 +1,1002 @@
+//
+// HTML.cc
+//
+// HTML: Class to parse HTML documents and return useful information 
+//       to the Retriever
+//
+// Part of the ht://Dig package   <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: HTML.cc,v 1.76 2004/06/09 17:35:34 grdetil Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "htdig.h"
+#include "HTML.h"
+#include "HtSGMLCodec.h"
+#include "HtConfiguration.h"
+#include "StringMatch.h"
+#include "StringList.h"
+#include "QuotedStringList.h"
+#include "URL.h"
+#include "WordType.h"
+
+#include <ctype.h>
+
+#include "defaults.h"
+
+// Flags for noindex & nofollow, indicating who turned indexing off/on...
+#define TAGnoindex		0x0001
+#define TAGstyle		0x0002
+#define TAGscript		0x0004
+#define TAGmeta_htdig_noindex	0x0008
+#define TAGmeta_robots		0x0010
+
+static StringMatch	tags;
+static StringMatch	nobreaktags;
+static StringMatch	spacebeforetags;
+static StringMatch	spaceaftertags;
+static StringMatch	metadatetags;
+static StringMatch	descriptionMatch;
+static StringMatch	keywordsMatch;
+//static int		keywordsCount;
+//static int		max_keywords;
+
+
+//*****************************************************************************
+// ADDSPACE(flag): unless flag is set, append one ' ' to title, description and
+//	head (each under the same guards parse() applies to text), then set flag,
+//	so whitespace runs collapse to one space.  Expands to a single statement.
+#define ADDSPACE(in_space)	\
+    do { if (!(in_space))						\
+    {									\
+	if (in_title && !noindex)					\
+	{								\
+	    title << ' ';						\
+	}								\
+	if (in_ref && description.length() < max_description_length)	\
+	{								\
+	    description << ' ';						\
+	}								\
+	if (head.length() < max_head_length && !noindex && !in_title)	\
+	{								\
+	    head << ' ';						\
+	}								\
+	in_space = 1;							\
+    } } while (0)
+
+
+//*****************************************************************************
+// HTML::HTML()
+//   Set up the shared tag matchers and the noindex_start/noindex_end lists.
+HTML::HTML() :
+	    skip_start (HtConfiguration::config()->Find("noindex_start")," \t"),
+	    skip_end   (HtConfiguration::config()->Find("noindex_end"),  " \t")
+{
+	HtConfiguration *config= HtConfiguration::config();
+    //
+    // Initialize the patterns that we will try to match.
+    // The position of each name in "tags" is the case number used in do_tag().
+    //
+    tags.IgnoreCase();
+    tags.Pattern("title|/title|a|/a|h1|h2|h3|h4|h5|h6|/h1|/h2|/h3|/h4|/h5|/h6|noindex|/noindex|img|li|meta|frame|area|base|embed|object|link|style|/style|script|/script");
+
+    // These tags don't cause a word break.  They may also be in "tags" above,
+    // except for the "a" tag, which must be handled as a special case.
+    // Note that <sup> & <sub> should cause a word break.
+    nobreaktags.IgnoreCase();
+    nobreaktags.Pattern("font|/font|em|/em|strong|/strong|i|/i|b|/b|u|/u|tt|/tt|abbr|/abbr|code|/code|q|/q|samp|/samp|kbd|/kbd|var|/var|dfn|/dfn|cite|/cite|blink|/blink|big|/big|small|/small|s|/s");
+
+    // These tags, which may also be in "tags" above, cause word breaks and
+    // therefore cause space to be inserted before (or after) do_tag() is done.
+    spacebeforetags.IgnoreCase();
+    spacebeforetags.Pattern("title|h1|h2|h3|h4|h5|h6|address|blockquote|noindex|img|li|th|td|dt|dd|p|br|hr|center|spacer");
+    spaceaftertags.IgnoreCase();
+    spaceaftertags.Pattern("/title|/h1|/h2|/h3|/h4|/h5|/h6|/address|/blockquote");
+
+    // These are the name values of meta tags that carry date information.
+    metadatetags.IgnoreCase();
+    metadatetags.Pattern("date|dc.date|dc.date.created|dc.date.modified");
+
+    // These are the name values of meta tags that carry descriptions.
+    StringList descrNames(config->Find("description_meta_tag_names"), " \t");
+    descriptionMatch.IgnoreCase();
+    descriptionMatch.Pattern(descrNames.Join('|'));
+
+    // These are the name values of meta tags that carry keywords.
+    StringList keywordNames(config->Find("keywords_meta_tag_names"), " \t");
+    keywordsMatch.IgnoreCase();
+    keywordsMatch.Pattern(keywordNames.Join('|'));
+//    (now in Parser)
+//    max_keywords = config->Value("max_keywords", -1);
+//    if (max_keywords < 0)
+//	max_keywords = (int) ((unsigned int) ~1 >> 1);
+
+    // skip_start/end mark sections of text to be ignored by ht://Dig
+    // Make sure there are equal numbers of each, and warn of deprecated
+    // syntax.
+    if (skip_start.Count() > 1 || skip_end.Count() > 1)
+    {
+	if (skip_start.Count() != 0 && skip_end.Count() != 0)
+	{
+	    // check for old-style start/end which allowed unquoted spaces
+	    // (Check noindex_start/end for exactly one "<" or/followed-by
+	    // exactly one ">", and no leading quotes.)
+	    // Each second-occurrence search is guarded by its own first match.
+	    String noindex_end (config->Find ("noindex_end"));
+	    char *first_left = strchr (noindex_end.get(), '<');
+	    char *secnd_left = first_left ? strchr(first_left+1,'<') : (char*)0;
+	    char *first_right= strchr (noindex_end.get(), '>');
+	    char *secnd_right= first_right? strchr(first_right+1,'>'): (char*)0;
+	    String noindex_start (config->Find ("noindex_start"));
+	    char *first_lft = strchr (noindex_start.get(), '<');
+	    char *secnd_lft = first_lft ? strchr (first_lft +1,'<') : (char*)0;
+	    char *first_rght= strchr (noindex_start.get(), '>');
+	    char *secnd_rght= first_rght? strchr (first_rght+1,'>') : (char*)0;
+
+	    if (((first_right && !secnd_right && first_right < first_left) ||
+		 (first_left  && !secnd_left  && !first_right) ||
+		 (first_rght && !secnd_rght && first_rght < first_lft) ||
+		 (first_lft  && !secnd_lft  && !first_rght)) &&
+		noindex_end[0] != '\"' && noindex_start[0] != '\"')
+	    {
+		cout << "\nWarning: To allow multiple  noindex_start/end  patterns, patterns containing\nspaces should now be in quotation marks.  (If the entries are indended to be\nmultiple patterns, this warning can be suppressed by placing the first pattern\nin quotes.)\n\n";
+		// Should we treat the patterns as if they had been quoted
+		// (as we assume was intended)?
+	    }
+	}
+    }
+
+    // check each start has an end
+    if (skip_start.Count() < skip_end.Count())
+    {
+	cout << "Warning:  " << skip_end.Count()
+	     << "  noindex_end  patterns, but only  " << skip_start.Count()
+	     << "  noindex_start  patterns.\n";
+    } else
+    {
+	// Pad skip_end by repeating its last entry (or a default marker).
+	while (skip_start.Count () > skip_end.Count())
+	{
+	    int missing = skip_end.Count() - 1;
+	    skip_end.Add ((missing >= 0) ? skip_end [missing]
+					 : "<!--/htdig_noindex-->");
+	    cout << "Warning: Copying " << skip_end [missing+1]
+		 << " as  noindex_end  match for " << skip_start [missing+1]
+		 << endl;
+	}
+    }
+    // Start with empty strings, no pending link and all state flags cleared.
+    word = 0;
+    href = 0;
+    title = 0;
+    description = 0;
+    head = 0;
+    meta_dsc = 0;
+    tag = 0;
+    in_title = 0;
+    in_ref = 0;
+    in_heading = 0;
+    base = 0;
+    noindex = 0;
+    nofollow = 0;
+//    minimumWordLength = config->Value("minimum_word_length", 3);
+}
+
+
+//*****************************************************************************
+// HTML::~HTML()
+//   Does nothing.  NOTE(review): href (new'ed in do_tag) is not freed here.
+HTML::~HTML()
+{
+}
+
+
+//*****************************************************************************
+// void HTML::parse(Retriever &retriever, URL &baseURL)
+//   Parse the HTML document using the Retriever object for all the callbacks.
+//   The HTML document contents are contained in the contents String.
+//   Two passes: filter/decode into a scratch buffer, then scan tags and words.
+void
+HTML::parse(Retriever &retriever, URL &baseURL)
+{
+    if (contents == 0 || contents->length() == 0)
+	return;
+
+    base = &baseURL;
+    
+    //
+    // We have some variables which will contain the various items we
+    // are looking for
+    //
+    int			wordindex = 1;
+    int			in_space;
+    int			in_punct;
+    String		scratch, textified;
+    unsigned char	*q, *start;
+    unsigned char	*position = (unsigned char *) contents->get();
+    unsigned char       *text = (unsigned char *)new char[contents->length()+1];
+    unsigned char       *ptext = text;
+    // text receives the filtered copy of contents; ptext is its write cursor.
+    keywordsCount = 0;
+    title = 0;
+    head = 0;
+    meta_dsc = 0;
+    noindex = 0;
+    nofollow = 0;
+    in_heading = 0;
+    in_title = 0;
+    in_ref = 0;
+    in_space = 0;
+    in_punct = 0;
+    // Pass 1: build text from contents (drop skipped parts, decode entities).
+    while (*position)
+    {
+
+      //
+      // Filter out section marked to be ignored for indexing.
+      // This can contain any HTML.
+      // On finding a  noindex_start,  skip to first occurrence of matching
+      // noindex_end.  Any  noindex_start  within will be ignored.
+      //
+      int i;
+      for (i = 0; i < skip_start.Count(); i++)
+      {
+        if (mystrncasecmp((char *)position, skip_start[i],
+				  ((String*)skip_start.Nth(i))->length()) == 0)
+	  break;		// break from this loop for "continue" below...
+      }
+      if (i < skip_start.Count())	// found a match;
+	{
+	  q = (unsigned char*)mystrcasestr((char *)position, skip_end[i]);
+	  if (!q)
+	    *position = '\0';       // Rest of document will be skipped...
+	  else
+	    position = q + ((String*)skip_end.Nth(i))->length();
+	  continue;
+	}
+      // end of  noindex_start/end  code
+
+
+      if (strncmp((char *)position, "<!", 2) == 0)
+	{
+	  //
+	  // Possible comment declaration (but could be DTD declaration!)
+	  // A comment can contain other '<' and '>':
+	  // we have to ignore complete comment declarations
+	  // but of course also DTD declarations.
+	  //
+	  position += 2;	// Get past declaration start
+	  if (strncmp((char *)position, "--", 2) == 0)
+	    {
+	      // Found start of comment - now find the end
+	      position += 2;
+	      do
+		{
+		  q = (unsigned char*)strstr((char *)position, "--");
+		  if (!q)
+		    {
+		      *position = '\0';
+		      break;	// Rest of document seems to be a comment...
+		    }
+		  else
+		    {
+		      position = q + 2;
+		      // Skip extra dashes after a badly formed comment
+		      while (*position == '-')
+			  position++;
+		      // Skip whitespace after an individual comment
+		      while (isspace(*position))
+			  position++;
+		    }
+		  // if comment declaration hasn't ended, skip another comment
+		}
+	      while (*position && *position != '>');
+	      if (*position == '>')
+		{
+		  position++;	// End of comment declaration
+		}
+	    }
+	  else
+	    {
+	      // Not a comment declaration after all
+	      // but possibly DTD: get to the end
+	      q = (unsigned char*)strchr((char *)position, '>');
+	      if (q)
+		{
+		  position = q + 1;
+		  // End of (whatever) declaration
+		}
+	      else
+		{
+		  *position = '\0'; // Rest of document is DTD?
+		}
+	    }
+	  continue;
+	}
+
+	if (*position == '<')
+	{
+	    //
+	    // Start of a tag.  Since tags cannot be nested, we can simply
+	    // search for the closing '>' and copy the tag through unchanged.
+	    //
+	    q = (unsigned char*)strchr((char *)position, '>');
+	    if (q)
+	      { // copy tag
+		while (position <= q)
+		  *ptext++ = *position++;
+	      }
+	    else
+	      { // copy rest of text, as tag does not end
+		while (*position)
+		  *ptext++ = *position++;
+	      }
+	}
+	else if (*position == '&')
+	  {
+	    q = (unsigned char*)strchr((char *)position, ';');
+	    if (q && q <= position+10)
+	      {	// got ending, looks like valid SGML entity
+		scratch = 0;
+		scratch.append((char*)position, q+1 - position);
+		textified = HtSGMLCodec::instance()->encode(scratch);
+		if (textified[0] != '&' || textified.length() == 1)
+		  {	// it was decoded, copy it
+		    position = (unsigned char *)textified.get();
+		    while (*position)
+		      {
+			if (*position == '<')
+			  { // got a decoded &lt;, make a fake tag for it
+			    // to avoid confusing it with real tag start
+			    *ptext++ = '<';
+			    *ptext++ = '~';
+			    *ptext++ = '>';
+			    position++;
+			  }
+			else
+			    *ptext++ = *position++;
+		      }
+		    position = q+1;	// resume in contents after the ';'
+		  }
+		else	// it wasn't decoded, copy '&', and rest will follow
+		    *ptext++ = *position++;
+	      }
+	    else	// not SGML entity, copy bare '&'
+		*ptext++ = *position++;
+	  }
+        else
+        {
+           *ptext++ = *position++;
+        }
+      }
+      *ptext++ = '\0';
+
+      position = text;
+      start = position;
+      // Pass 2: walk the filtered text, dispatching tags and collecting words.
+      while (*position)
+      {
+	if (*position == '<' && (position[1] != '~' || position[2] != '>'))
+	  {
+	    //
+	    // Start of a tag.  Since tags cannot be nested, we can simply
+	    // search for the closing '>'
+	    //
+	    q = (unsigned char*)strchr((char *)position, '>');
+	    if (!q)
+	      break; // Syntax error in the doc.  Tag never ends.
+	    position++;
+	    if (noindex & TAGscript)
+	    {	// Special handling in case '<' is part of JavaScript code
+		while (isspace(*position))
+		    position++;
+		if (mystrncasecmp((char *)position, "/script", 7) != 0)
+		    continue;
+	    }
+	    tag = 0;
+	    tag.append((char*)position, q - position);
+	    while (isspace(*position))
+		position++;
+	    if (!in_space && spacebeforetags.CompareWord((char *)position)
+		|| !in_space && !in_punct && *position != '/')
+	    {
+		// These opening tags cause a space to be inserted
+		// before anything they insert.
+		// Tags processed here (i.e. not in nobreaktags), like <a ...>
+		// tag, are a special case: they don't actually add space in
+		// formatted text, but because in our processing it causes
+		// a word break, we avoid word concatenation in "head" string.
+		ADDSPACE(in_space);
+		in_punct = 0;
+	    }
+	    do_tag(retriever, tag);
+	    if (!in_space && spaceaftertags.CompareWord((char *)position))
+	    {
+		// These closing tags cause a space to be inserted
+		// after anything they insert.
+		ADDSPACE(in_space);
+		in_punct = 0;
+	    }
+	    position = q+1;
+	  }
+	else if (*position > 0 && HtIsStrictWordChar(*position))
+	{
+	    //
+	    // Start of a word.  Try to find the whole thing
+	    //
+	    word = 0;
+	    in_space = 0;
+	    in_punct = 0;
+	    while (*position && HtIsWordChar(*position))
+	    {
+		word << (char)*position;
+		// handle case where '<' is in extra_word_characters...
+		if (strncmp((char *)position, "<~>", 3) == 0)
+		    position += 2;	// skip over fake tag for decoded '<'
+		position++;
+		if (*position == '<')
+		{
+		    q = position+1;
+		    while (isspace(*q))
+			q++;
+		    // Does this tag cause a word break?
+		    if (nobreaktags.CompareWord((char *)q))
+		    {
+			// These tags just change character formatting and
+			// don't break words.
+			q = (unsigned char*)strchr((char *)position, '>');
+			if (q)
+			{
+			    position++;
+			    tag = 0;
+			    tag.append((char*)position, q - position);
+			    do_tag(retriever, tag);
+			    position = q+1;
+			}
+		    }
+		}
+	    }
+
+	    if (in_title && !noindex)
+	    {
+		title << word;
+	    }
+
+	    if (in_ref)
+	    {
+		if (description.length() < max_description_length)
+		{
+		    description << word;
+		}
+		else
+		{
+		    description << " ...";
+		    if (!nofollow)
+		      retriever.got_href(*href, (char*)description);
+		    in_ref = 0;
+		    description = 0;
+		}
+	    }
+
+	    if (head.length() < max_head_length && !noindex && !in_title)
+	    {
+		//
+		// Capitalize H1 and H2 blocks (in_heading 2 and 3, see do_tag)
+	        //
+	        if (in_heading > 1 && in_heading < 4)
+	         {
+	           word.uppercase();
+	         }
+
+		//
+		// Append the word to the head (excerpt)
+		//
+		  head << word;
+	    }
+
+	    if (word.length() >= (int)minimum_word_length && !noindex)
+	    {
+	      retriever.got_word((char*)word, wordindex++, in_heading);
+	    }
+	}
+	else
+	{
+	    //
+	    // Characters that are not part of a word
+	    //
+	    if (isspace(*position))
+	    {
+		ADDSPACE(in_space);
+		in_punct = 0;
+	    }
+	    else
+	    {
+		//
+		// Not whitespace
+		//
+		if (head.length() < max_head_length && !noindex && !in_title)
+		{
+		    // We don't want to add random chars to the
+		    // excerpt if we're in the title.
+		    head << *position;
+		}
+		if (in_ref && description.length() < max_description_length)
+		{
+		    description << *position;
+		}
+		if (in_title && !noindex)
+		{
+		    title << *position;
+		}
+		in_space = 0;
+		in_punct = 1;
+		// handle normal case where decoded '<' is punctuation...
+		if (strncmp((char *)position, "<~>", 3) == 0)
+		    position += 2;	// skip over fake tag for decoded '<'
+	    }
+	    position++;
+	}
+    }
+    retriever.got_head((char*)head);
+
+    delete [] text;
+}
+
+
+//*****************************************************************************
+// void HTML::do_tag(Retriever &retriever, String &tag)
+//   Handle one tag (the text between '<' and '>'): update state, call retriever.
+void
+HTML::do_tag(Retriever &retriever, String &tag)
+{
+	HtConfiguration* config= HtConfiguration::config();
+    int			wordindex = 1;
+    char		*position = tag.get();
+    int			which, length;
+    static int		ignore_alt_text = config->Boolean("ignore_alt_text", 0);
+
+    while (isspace((unsigned char)*position))
+	position++;
+
+    which = -1;
+    if (tags.CompareWord(position, which, length) < 0)
+	return; // Nothing matched.
+
+    // Use the configuration code to match attributes as key-value pairs
+    HtConfiguration	attrs;
+    attrs.NameValueSeparators("=");
+    attrs.Add(position);
+
+    if (debug > 3)
+	cout << "Tag: <" << tag << ">, matched " << which << endl;
+    
+    switch (which)
+    {
+	case 0:		// "title"
+	    if (title.length())
+	    {
+		if (debug)
+		    cout << "More than one <title> tag in document!"
+			 << " (possible search engine spamming)" << endl;
+		break;
+	    }
+	    in_title = 1;
+	    in_heading = 1;
+	    break;
+			
+	case 1:		// "/title"
+	    if (!in_title)
+		break;
+	    in_title = 0;
+	    in_heading = 0;
+	    retriever.got_title((char*)title);
+	    break;
+			
+	case 2:		// "a"
+	{
+	  if (!attrs["href"].empty())
+	    {
+	      //
+	      // a href seen
+	      //
+	      if (in_ref)
+		{
+		  if (debug > 1)
+		    cout << "Terminating previous <a href=...> tag,"
+			 << " which didn't have a closing </a> tag."
+			 << endl;
+		  if (!nofollow)
+		      retriever.got_href(*href, (char*)description);
+		  in_ref = 0;
+		}
+	      if (href)
+		delete href;
+	      href = new URL(transSGML(attrs["href"]), *base);
+	      in_ref = 1;
+	      description = 0;
+	      break;
+	    }
+	  
+	  if (!attrs["title"].empty() && !attrs["href"].empty())
+	    {
+	      // a title seen for href
+	      // NOTE(review): unreachable -- a non-empty href already took the
+	      // branch above and hit its break.
+	      retriever.got_href(*href, transSGML(attrs["title"]));
+	    }
+
+	  if (!attrs["name"].empty())
+	    {
+	      //
+	      // a name seen
+	      //
+	      retriever.got_anchor(transSGML(attrs["name"]));
+	    }
+	  break;
+	}
+				   
+	case 3:		// "/a"
+	    if (in_ref)
+	    {
+	      if (!nofollow)
+		retriever.got_href(*href, (char*)description);
+	      in_ref = 0;
+	    }
+	    break;
+
+	case 4:		// "h1"
+	    in_heading = 2;
+	    break;
+
+	case 5:		// "h2"
+	    in_heading = 3;
+	    break;
+
+	case 6:		// "h3"
+	    in_heading = 4;
+	    break;
+
+	case 7:		// "h4"
+	    in_heading = 5;
+	    break;
+
+	case 8:		// "h5"
+	    in_heading = 6;
+	    break;
+
+	case 9:		// "h6"
+	    in_heading = 7;
+	    break;
+
+	case 10:	// "/h1"
+	case 11:	// "/h2"
+	case 12:	// "/h3"
+	case 13:	// "/h4"
+	case 14:	// "/h5"
+	case 15:	// "/h6"
+	    in_heading = 0;
+	    break;
+
+	case 16:	// "noindex"
+	    noindex |= TAGnoindex;
+	    nofollow |= TAGnoindex;
+	    if (!attrs["follow"].empty())
+		nofollow &= ~TAGnoindex;
+	    break;
+
+	case 27:	// "style"
+	    noindex |= TAGstyle;
+	    nofollow |= TAGstyle;
+	    break;
+
+        case 29:        // "script"
+	    noindex |= TAGscript;
+	    nofollow |= TAGscript;
+	    break;
+
+	case 17:	// "/noindex"
+	    noindex &= ~TAGnoindex;
+	    nofollow &= ~TAGnoindex;
+	    break;
+
+	case 28:	// "/style"
+	    noindex &= ~TAGstyle;
+	    nofollow &= ~TAGstyle;
+	    break;
+
+        case 30:	// "/script"
+	    noindex &= ~TAGscript;
+	    nofollow &= ~TAGscript;
+	    break;
+
+	case 19:	// "li"
+	    if (!noindex && !in_title && head.length() < max_head_length)
+		head << "* ";
+	    break;
+
+	case 20:	// "meta"
+	{
+	    //
+	    // First test for old-style meta tags (these break any
+	    // reasonable DTD...)
+	    //
+	    if (!attrs["htdig-noindex"].empty())
+	      {
+		retriever.got_noindex();
+		noindex |= TAGmeta_htdig_noindex;
+		nofollow |= TAGmeta_htdig_noindex;
+	      }
+	    if (!attrs["htdig-index"].empty())
+	      {
+		noindex &= ~TAGmeta_htdig_noindex;
+		nofollow &= ~TAGmeta_htdig_noindex;
+	      }
+	    if (!attrs["htdig-email"].empty())
+	      retriever.got_meta_email(transSGML(attrs["htdig-email"]));
+
+	    if (!attrs["htdig-notification-date"].empty())
+	      retriever.got_meta_notification(transSGML(attrs["htdig-notification-date"]));
+
+	    if (!attrs["htdig-email-subject"].empty())
+	      retriever.got_meta_subject(transSGML(attrs["htdig-email-subject"]));
+
+	    if (!attrs["htdig-keywords"].empty() || !attrs["keywords"].empty())
+	    {
+		//
+		// Keywords are added as being at the very top of the
+		// document and have a weight factor of
+		// keywords-factor which is assigned to slot 9 in the
+		// factor table.
+		// htdig-keywords wins when both attributes are present.
+		const String keywords = attrs["htdig-keywords"].empty() ?
+		  attrs["keywords"] :
+		  attrs["htdig-keywords"];
+		if (!noindex)
+		  {
+		    String tmp = transSGML(keywords);
+		    addKeywordString (retriever, tmp, wordindex);
+		  }
+	    }
+	
+	    if (!attrs["http-equiv"].empty())
+	      {
+
+		// <META HTTP-EQUIV=REFRESH case
+		if (mystrcasecmp(attrs["http-equiv"], "refresh") == 0
+		    && !attrs["content"].empty())
+		  {
+		    String content = attrs["content"];
+		    char *q = (char*)mystrcasestr((char*)content, "url");
+		    if (q && *q)
+		      {
+			q += 3; // skipping "URL"
+			while (*q && ((*q == '=') || isspace((unsigned char)*q))) q++;
+			char *qq = q;
+			while (*qq && (*qq != ';') && (*qq != '"') &&
+			       !isspace((unsigned char)*qq))qq++;
+			*qq = 0;	// cut the local copy at the end of the URL
+			if (href)
+			  delete href;
+			href = new URL(transSGML(q), *base);
+			// I don't know why anyone would do this, but hey...
+			if (!nofollow)
+			  retriever.got_href(*href, "");
+		      }
+		  }
+	      }
+
+	    //
+	    // Now check for <meta name=...  content=...> tags that
+	    // fly with any reasonable DTD out there
+	    //
+
+	    if (!attrs["name"].empty() && !attrs["content"].empty())
+	    {
+		const String cache = attrs["name"];
+
+		  // First of all, check for META description
+
+		  if (descriptionMatch.CompareWord(cache) 
+			 && !attrs["content"].empty())
+		  {
+		    //
+		    // We need to do two things. First grab the description
+		    // and clean it up
+		    //
+		    meta_dsc = transSGML(attrs["content"]);
+		    meta_dsc.replace('\n', ' ');
+		    meta_dsc.replace('\r', ' ');
+		    meta_dsc.replace('\t', ' ');
+		    if (meta_dsc.length() > max_meta_description_length)
+		     meta_dsc = meta_dsc.sub(0, max_meta_description_length).get();
+		   if (debug > 1)
+		     cout << "META Description: " << attrs["content"] << endl;
+		   retriever.got_meta_dsc((char*)meta_dsc);
+
+
+		   //
+		   // Now add the words to the word list
+		   // Slot 10 is the current slot for this
+		   //
+		   if (!noindex)
+		     {
+		       String tmp = transSGML(attrs["content"]);
+		       addString (retriever, tmp, wordindex, 10);
+		     }
+		}
+
+		if (keywordsMatch.CompareWord(cache) && !noindex)
+		{
+		    String tmp = transSGML(attrs["content"]);
+		    addKeywordString (retriever, tmp, wordindex);
+		}
+		else if (mystrcasecmp(cache, "author") == 0)
+		{
+		    String author = transSGML(attrs["content"]);
+		    retriever.got_author(author.get());
+		    if (!noindex)
+			addString (retriever, author, wordindex, 11);
+		}
+		else if (mystrcasecmp(cache, "htdig-email") == 0)
+		{
+		    retriever.got_meta_email(transSGML(attrs["content"]));
+		}
+		else if (metadatetags.CompareWord(cache, which, length) && 
+			 (cache.get())[length] == '\0' && config->Boolean("use_doc_date",0))
+		  {
+		    retriever.got_time(transSGML(attrs["content"]));
+		  }
+		else if (mystrcasecmp(cache, "htdig-notification-date") == 0)
+		{
+		    retriever.got_meta_notification(transSGML(attrs["content"]));
+		}
+		else if (mystrcasecmp(cache, "htdig-email-subject") == 0)
+		{
+		    retriever.got_meta_subject(transSGML(attrs["content"]));
+		}
+		else if (mystrcasecmp(cache, "htdig-noindex") == 0)
+		  {
+		    retriever.got_noindex();
+		    noindex |= TAGmeta_htdig_noindex;
+		    nofollow |= TAGmeta_htdig_noindex;
+		  }
+		else if (mystrcasecmp(cache, "robots") == 0
+			 && !attrs["content"].empty())
+		  {
+		    String   content_cache = attrs["content"];
+		    content_cache.lowercase();
+		    if (content_cache.indexOf("noindex") != -1)
+		      {
+			noindex |= TAGmeta_robots;
+			retriever.got_noindex();
+		      }
+		    if (content_cache.indexOf("nofollow") != -1)
+			nofollow |= TAGmeta_robots;
+		    if (content_cache.indexOf("none") != -1)
+		      {
+			noindex |= TAGmeta_robots;
+			nofollow |= TAGmeta_robots;
+			retriever.got_noindex();
+		      }
+		  }
+	    }
+	    else if (mystrcasecmp(attrs["name"], "htdig-noindex") == 0)
+	    {
+	        retriever.got_noindex();
+		noindex |= TAGmeta_htdig_noindex;
+		nofollow |= TAGmeta_htdig_noindex;
+	    }
+	    break;
+	}
+
+	case 21:	// frame
+        case 24:	// embed
+	{
+	  if (!attrs["src"].empty())
+	    {
+	      //
+	      // src seen
+	      //
+	      if (!nofollow)
+		{
+		  if (href)
+		    delete href;
+		  href = new URL(transSGML(attrs["src"]), *base);
+		  // Frames have the same hopcount as the parent.
+		  retriever.got_href(*href, transSGML(attrs["title"]), 0);
+		  in_ref = 0;
+		}
+	    }
+	  break;
+	}
+	  
+        case 25:	// object
+	{
+	  if (!attrs["data"].empty())
+	    {
+	      //
+	      // data seen
+	      //
+	      if (!nofollow)
+		{
+		  if (href)
+		    delete href;
+		  href = new URL(transSGML(attrs["data"]), *base);
+		  // Assume objects have the same hopcount as the parent.
+		  retriever.got_href(*href, transSGML(attrs["title"]), 0);
+		  in_ref = 0;
+		}
+	    }
+	  break;
+	}
+	  
+	case 22:	// area
+        case 26:	// link
+	{
+	  if (!attrs["href"].empty())
+	    {
+	      // href seen
+	      if (!nofollow)
+		{
+		  if (href)
+		    delete href;
+		  href = new URL(transSGML(attrs["href"]), *base);
+		  // area & link are like anchor tags -- one hopcount!
+		  retriever.got_href(*href, transSGML(attrs["title"]), 1);
+		  in_ref = 0;
+		}
+	    }
+	  break;
+	}
+	  
+	case 23:	// base
+	{
+	  if (!attrs["href"].empty())
+	    {
+	      URL tempBase(transSGML(attrs["href"]));
+	      *base = tempBase;
+	    }
+	  break;
+	}
+	
+	case 18: // img
+	  {
+	    if (!ignore_alt_text && !attrs["alt"].empty())
+	      {
+		String tmp = transSGML(attrs["alt"]);
+		if (!noindex && in_title)
+		    title << tmp << " ";
+		if (in_ref && description.length() < max_description_length)
+		    description << tmp << " ";
+		if (!noindex && !in_title && head.length() < max_head_length)
+		    head << tmp << " ";
+		if (!noindex)
+		    addString (retriever, tmp, wordindex, 8);	// slot for  img_alt
+	      }
+	    if (!attrs["src"].empty())
+	      {
+		retriever.got_image(transSGML(attrs["src"]));
+	      }
+	    break;
+	  }
+
+	default:
+	  return;	// Nothing...
+    }
+}
+
+
+//*****************************************************************************
+// const String HTML::transSGML(const String& str)
+//   Return str as processed by the shared HtSGMLCodec instance's encode().
+const String
+HTML::transSGML(const String& str)
+{
+    return HtSGMLCodec::instance()->encode(str);
+}