author     Slávek Banko <slavek.banko@axis.cz>    2021-11-05 13:28:23 +0100
committer  Slávek Banko <slavek.banko@axis.cz>    2021-11-05 13:28:23 +0100
commit     8c787c3591c1c885b91a54128835b400858c5cca (patch)
tree       eca1b776912a305c4d45b3964038278a2fae1ead /debian/htdig/htdig-3.2.0b6/htdig
parent     fe188b907cdf30dfdfe0eba9412e7f8749fec158 (diff)
DEB htdig: Added to repository.
Signed-off-by: Slávek Banko <slavek.banko@axis.cz>
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig')
24 files changed, 7440 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/.cvsignore b/debian/htdig/htdig-3.2.0b6/htdig/.cvsignore new file mode 100644 index 00000000..4de01869 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/.cvsignore @@ -0,0 +1,8 @@ +Makefile +*.lo +*.la +.purify +.pure +.deps +.libs +htdig diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Document.cc b/debian/htdig/htdig-3.2.0b6/htdig/Document.cc new file mode 100644 index 00000000..87272686 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Document.cc @@ -0,0 +1,784 @@ +// +// Document.cc +// +// Document: This class holds everything there is to know about a document. +// The actual contents of the document may or may not be present at +// all times for memory conservation reasons. +// The document can be told to retrieve its contents. This is done +// with the Retrieve call. In case the retrieval causes a +// redirect, the link is followed, but this process is done +// only once (to prevent loops.) If the redirect didn't +// work, Document_not_found is returned. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Document.cc,v 1.71 2004/05/28 13:15:14 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <signal.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <ctype.h> + +#include "Document.h" +#include "StringList.h" +#include "htdig.h" +#include "HTML.h" +#include "Plaintext.h" +#include "ExternalParser.h" +#include "lib.h" + +#include "Transport.h" +#include "HtHTTP.h" + +#ifdef HAVE_SSL_H +#include "HtHTTPSecure.h" +#endif + +#include "HtHTTPBasic.h" +#include "ExternalTransport.h" + +#include "defaults.h" + +#if 1 +typedef void (*SIGNAL_HANDLER) (...); +#else +typedef SIG_PF SIGNAL_HANDLER; +#endif + +//***************************************************************************** +// Document::Document(char *u) +// Initialize with the given url as the location for this document. +// If the max_size is given, use that for size, otherwise use the +// config value. +// +Document::Document(char *u, int max_size) +{ + url = 0; + proxy = 0; + referer = 0; + contents = 0; + transportConnect = 0; + HTTPConnect = 0; + HTTPSConnect = 0; + FileConnect = 0; + FTPConnect = 0; + NNTPConnect = 0; + externalConnect = 0; + HtConfiguration* config= HtConfiguration::config(); + + // We probably need to move assignment of max_doc_size, according + // to a server or url configuration value. The same is valid for + // max_retries. 
+ + if (max_size > 0) + max_doc_size = max_size; + else + max_doc_size = config->Value("max_doc_size"); + + if (config->Value("max_retries") > 0) + num_retries = config->Value("max_retries"); + else num_retries = 2; + + // Initialize some static variables of Transport + + Transport::SetDebugLevel(debug); + + // Initialize some static variables of Transport + // and the User Agent for every HtHTTP objects + + HtHTTP::SetParsingController(ExternalParser::canParse); + + // Set the default parser content-type string + Transport::SetDefaultParserContentType ("text/"); + + contents.allocate(max_doc_size + 100); + contentType = ""; + contentLength = -1; + if (u) + { + Url(u); + } +} + + +//***************************************************************************** +// Document::~Document() +// +Document::~Document() +{ + // We delete only the derived class objects + if (HTTPConnect) + delete HTTPConnect; + if (HTTPSConnect) + delete HTTPSConnect; + if (FileConnect) + delete FileConnect; + if (FTPConnect) + delete FTPConnect; + if (NNTPConnect) + delete NNTPConnect; + if (externalConnect) + delete externalConnect; + + if (url) + delete url; + if (proxy) + delete proxy; + if (referer) + delete referer; + +#if MEM_DEBUG + char *p = new char; + cout << "==== Document deleted: " << this << " new at " << + ((void *) p) << endl; + delete p; +#endif +} + + +//***************************************************************************** +// void Document::Reset() +// Restore the Document object to an initial state. +// We will not reset the authorization information since it can be reused. +// +void +Document::Reset() +{ + contentType = 0; + contentLength = -1; + if (url) + delete url; + url = 0; + if (referer) + delete referer; + + referer = 0; + + proxy=0; + authorization=0; + proxy_authorization=0; + contents = 0; + document_length = 0; + redirected_to = 0; + +} + + +//***************************************************************************** +// void Document::Url(const String &u) +// Set the URL for this document +// +void +Document::Url(const String &u) +{ + HtConfiguration* config= HtConfiguration::config(); + if (url) + delete url; + url = new URL(u); + + // Re-initialise the proxy + if (proxy) + delete proxy; + proxy = 0; + + // Get the proxy information for this URL + const String proxyURL = config->Find(url,"http_proxy"); + + // If http_proxy is not empty we set the proxy for the current URL + if (proxyURL.length()) + { + proxy = new URL(proxyURL); + proxy->normalize(); + // set the proxy authorization information + setProxyUsernamePassword(config->Find(url,"http_proxy_authorization")); + } + + // Set the authorization information + setUsernamePassword(config->Find(url,"authorization")); + +} + + +//***************************************************************************** +// void Document::Referer(const String &u) +// Set the Referring URL for this document +// +void +Document::Referer(const String &u) +{ + if (referer) + delete referer; + referer = new URL(u); +} + + +//***************************************************************************** +// int Document::UseProxy() +// Returns 1 if the given url is to be retrieved from the proxy server, +// or 0 if it's not. +// +int +Document::UseProxy() +{ + HtConfiguration* config= HtConfiguration::config(); + static HtRegex *excludeProxy = 0; + + // + // Initialize excludeProxy list if this is the first time. 
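The check below combines two configuration attributes: http_proxy (looked up per-URL in Document::Url() above) names the proxy server, while http_proxy_exclude lists URL patterns that are always fetched directly. A hedged configuration sketch — the host names are invented, and pattern handling follows htdig's HtRegex matching:

    http_proxy:         http://proxy.example.com:3128/
    http_proxy_exclude: http://intranet.example.com/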
+ // + if (!excludeProxy) + { + excludeProxy = new HtRegex(); + StringList l(config->Find("http_proxy_exclude"), " \t"); + excludeProxy->setEscaped(l, config->Boolean("case_sensitive")); + l.Release(); + } + + if ((proxy) && (excludeProxy->match(url->get(), 0, 0) == 0)) + return true; // if the exclude pattern is empty, use the proxy + return false; +} + + +//***************************************************************************** +// DocStatus Document::Retrieve(HtDateTime date) +// Attempt to retrieve the document pointed to by our internal URL +// +Transport::DocStatus +Document::Retrieve(Server *server, HtDateTime date) +{ + // Right now we just handle http:// service + // Soon this will include file:// + // as well as an ExternalTransport system + // eventually maybe ftp:// and a few others + + Transport::DocStatus status; + Transport_Response *response = 0; + HtDateTime *ptrdatetime = 0; + int useproxy = UseProxy(); + int NumRetries; + + transportConnect = 0; + + if (ExternalTransport::canHandle(url->service())) + { + if (externalConnect) + { + delete externalConnect; + } + externalConnect = new ExternalTransport(url->service()); + transportConnect = externalConnect; + } +#ifdef HAVE_SSL_H + else if (mystrncasecmp(url->service(), "https", 5) == 0) + { + if (!HTTPSConnect) + { + if (debug>4) + cout << "Creating an HtHTTPSecure object" << endl; + + HTTPSConnect = new HtHTTPSecure(); + + if (!HTTPSConnect) + return Transport::Document_other_error; + } + + if (HTTPSConnect) + { + // Here we must set only thing for a HTTP request + + HTTPSConnect->SetRequestURL(*url); + + // Set the user agent which can vary per server + HTTPSConnect->SetRequestUserAgent(server->UserAgent()); + + // Set the accept language which can vary per server + HTTPSConnect->SetAcceptLanguage(server->AcceptLanguage()); + + // Set the referer + if (referer) + HTTPSConnect->SetRefererURL(*referer); + + // Let's disable the cookies if we decided that in the config file + if (server->DisableCookies()) + HTTPSConnect->DisableCookies(); + else HTTPSConnect->AllowCookies(); + + // We may issue a config paramater to enable/disable them + if (server->IsPersistentConnectionAllowed()) + { + // Persistent connections allowed + HTTPSConnect->AllowPersistentConnection(); + } + else HTTPSConnect->DisablePersistentConnection(); + + // Head before Get option control + if (server->HeadBeforeGet()) + HTTPSConnect->EnableHeadBeforeGet(); + else + HTTPSConnect->DisableHeadBeforeGet(); + + // http->SetRequestMethod(HtHTTP::Method_GET); + if (debug > 2) + { + cout << "Making HTTPS request on " << url->get(); + + if (useproxy) + cout << " via proxy (" << proxy->host() << ":" << proxy->port() << ")"; + + cout << endl; + } + } + + HTTPSConnect->SetProxy(useproxy); + transportConnect = HTTPSConnect; + } +#endif + else if (mystrncasecmp(url->service(), "http", 4) == 0) + { + if (!HTTPConnect) + { + if (debug>4) + cout << "Creating an HtHTTPBasic object" << endl; + + HTTPConnect = new HtHTTPBasic(); + + if (!HTTPConnect) + return Transport::Document_other_error; + } + + if (HTTPConnect) + { + // Here we must set only thing for a HTTP request + + HTTPConnect->SetRequestURL(*url); + + // Set the user agent which can vary per server + HTTPConnect->SetRequestUserAgent(server->UserAgent()); + + // Set the accept language which can vary per server + HTTPConnect->SetAcceptLanguage(server->AcceptLanguage()); + + // Set the referer + if (referer) + HTTPConnect->SetRefererURL(*referer); + + // Let's disable the cookies if we decided that in the 
config file + if (server->DisableCookies()) + HTTPConnect->DisableCookies(); + else HTTPConnect->AllowCookies(); + + // We may issue a config paramater to enable/disable them + if (server->IsPersistentConnectionAllowed()) + { + // Persistent connections allowed + HTTPConnect->AllowPersistentConnection(); + } + else HTTPConnect->DisablePersistentConnection(); + + // Head before Get option control + if (server->HeadBeforeGet()) + HTTPConnect->EnableHeadBeforeGet(); + else + HTTPConnect->DisableHeadBeforeGet(); + + // http->SetRequestMethod(HtHTTP::Method_GET); + if (debug > 2) + { + cout << "Making HTTP request on " << url->get(); + + if (useproxy) + cout << " via proxy (" << proxy->host() << ":" << proxy->port() << ")"; + + cout << endl; + } + } + + HTTPConnect->SetProxy(useproxy); + transportConnect = HTTPConnect; + } + else if (mystrncasecmp(url->service(), "file", 4) == 0) + { + if (!FileConnect) + { + if (debug>4) + cout << "Creating an HtFile object" << endl; + + FileConnect = new HtFile(); + + if (!FileConnect) + return Transport::Document_other_error; + } + + if (FileConnect) + { + // Here we must set only thing for a file request + + FileConnect->SetRequestURL(*url); + + // Set the referer + if (referer) + FileConnect->SetRefererURL(*referer); + + if (debug > 2) + cout << "Making 'file' request on " << url->get() << endl; + } + + transportConnect = FileConnect; + } + else if (mystrncasecmp(url->service(), "ftp", 3) == 0) + { + // the following FTP handling is modeled very closely on + // the prior 'file'-protocol handling, so beware of bugs + + if (!FTPConnect) + { + if (debug>4) + cout << "Creating an HtFTP object" << endl; + + FTPConnect = new HtFTP(); + + if (!FTPConnect) + return Transport::Document_other_error; + } + if (FTPConnect) + { + // Here we must set only thing for a FTP request + + FTPConnect->SetRequestURL(*url); + //////////////////////////////////////////////////// + /// + /// stuff may be missing here or in need of change + /// + /////////////////////////////////////////////////// + + // Set the referer + if (referer) + FTPConnect->SetRefererURL(*referer); + + if (debug > 2) + cout << "Making 'ftp' request on " << url->get() << endl; + } + + transportConnect = FTPConnect; + } // end of else if (mystrncasecmp(url->service(), "ftp", 3) == 0) + + else if (mystrncasecmp(url->service(), "news", 4) == 0) + { + if (!NNTPConnect) + { + if (debug>4) + cout << "Creating an HtNNTP object" << endl; + + NNTPConnect = new HtNNTP(); + + if (!NNTPConnect) + return Transport::Document_other_error; + } + + if (NNTPConnect) + { + // Here we got an Usenet document request + + NNTPConnect->SetRequestURL(*url); + + if (debug > 2) + cout << "Making 'NNTP' request on " << url->get() << endl; + } + + transportConnect = NNTPConnect; + } + else + { + if (debug) + { + cout << '"' << url->service() << + "\" not a recognized transport service. Ignoring\n"; + } + + return Transport::Document_not_recognized_service; + } + + // Is a transport object pointer available? + + if (transportConnect) + { + // Set all the appropriate parameters + if (useproxy) + { + transportConnect->SetConnection(proxy); + if (proxy_authorization.length()) + transportConnect->SetProxyCredentials(proxy_authorization); + } + else + transportConnect->SetConnection(url); + + // OK. Let's set the connection time out + transportConnect->SetTimeOut(server->TimeOut()); + + // Let's set number of retries for a failed connection attempt + transportConnect->SetRetry(server->TcpMaxRetries()); + + // ... 
And the wait time after a failure + transportConnect->SetWaitTime(server->TcpWaitTime()); + + // OK. Let's set the maximum size of a document to be retrieved + transportConnect->SetRequestMaxDocumentSize(max_doc_size); + + // Let's set the credentials + transportConnect->SetCredentials(authorization); + + // Let's set the modification time (in order not to retrieve a + // document we already have) + transportConnect->SetRequestModificationTime(date); + + // Make the request + // Here is the main operation ... Let's make the request !!! + // We now perform a loop until we want to retry the request + + NumRetries = 0; + + do + { + status = transportConnect->Request(); + + if (NumRetries++) + if(debug>0) + cout << "."; + + } while (ShouldWeRetry(status) && NumRetries < num_retries); + + + // Let's get out the info we need + response = transportConnect->GetResponse(); + + if (response) + { + // We got the response + + contents = response->GetContents(); + contentType = response->GetContentType(); + contentLength = response->GetContentLength(); + ptrdatetime = response->GetModificationTime(); + document_length = response->GetDocumentLength(); + + // This test is ugly! Can whoever put it here explain why it's + // needed? Why would GetLocation() ever return a non-empty string + // from a Transport subclass that's not supposed to redirect? + if (transportConnect == HTTPConnect || transportConnect == HTTPSConnect || transportConnect == externalConnect) + redirected_to = ((HtHTTP_Response *)response)->GetLocation(); + + if (ptrdatetime) + { + // We got the modification date/time + modtime = *ptrdatetime; + } + + // How to manage it when there's no modification date/time? + + if (debug > 5) + { + cout << "Contents:\n" << contents << endl; + cout << "Content Type: " << contentType << endl; + cout << "Content Length: " << contentLength << endl; + cout << "Modification Time: " << modtime.GetISO8601() << endl; + } + } + + return status; + + } + else + return Transport::Document_not_found; +} + +//***************************************************************************** +// DocStatus Document::RetrieveLocal(HtDateTime date, StringList *filenames) +// Attempt to retrieve the document pointed to by our internal URL +// using a list of potential local filenames given. Returns Document_ok, +// Document_not_changed or Document_not_local (in which case the +// retriever tries it again using the standard retrieve method). +// +Transport::DocStatus +Document::RetrieveLocal(HtDateTime date, StringList *filenames) +{ + HtConfiguration* config= HtConfiguration::config(); + struct stat stat_buf; + String *filename; + + filenames->Start_Get(); + + // Loop through list of potential filenames until the list is exhausted + // or a suitable file is found to exist as a regular file. 
+ while ((filename = (String *)filenames->Get_Next()) && + ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode))) + if (debug > 1) + cout << " tried local file " << *filename << endl; + + if (!filename) + return Transport::Document_not_local; + + if (debug > 1) + cout << " found existing file " << *filename << endl; + + modtime = stat_buf.st_mtime; + if (modtime <= date) + return Transport::Document_not_changed; + + char *ext = strrchr((char*)*filename, '.'); + if (ext == NULL) + return Transport::Document_not_local; + const String *type = HtFile::Ext2Mime (ext + 1); + + static Dictionary *bad_local_ext = 0; + if (!bad_local_ext) + { + // A list of bad extensions, separated by spaces or tabs + bad_local_ext = new Dictionary; + String t = config->Find("bad_local_extensions"); + String lowerp; + char *p = strtok(t, " \t"); + while (p) + { + // Extensions are case insensitive + lowerp = p; + lowerp.lowercase(); + bad_local_ext->Add(lowerp, 0); + p = strtok(0, " \t"); + } + } + if (type == NULL || bad_local_ext->Exists(ext)) + { + if (debug > 1 && type != NULL) + cout << "\nBad local extension: " << *filename << endl; + return Transport::Document_not_local; + } + else + contentType = *type; + + // Open it + FILE *f = fopen((char*)*filename, "r"); + if (f == NULL) + return Transport::Document_not_local; + + // + // Read in the document itself + // + max_doc_size = config->Value(url,"max_doc_size"); + contents = 0; + char docBuffer[8192]; + int bytesRead; + + while ((bytesRead = fread(docBuffer, 1, sizeof(docBuffer), f)) > 0) + { + if (debug > 2) + cout << "Read " << bytesRead << " from document\n"; + if (contents.length() + bytesRead > max_doc_size) + bytesRead = max_doc_size - contents.length(); + contents.append(docBuffer, bytesRead); + if (contents.length() >= max_doc_size) + break; + } + fclose(f); + document_length = contents.length(); + contentLength = stat_buf.st_size; + + if (debug > 2) + cout << "Read a total of " << document_length << " bytes\n"; + + if (document_length < contentLength) + document_length = contentLength; + return Transport::Document_ok; +} + + +//***************************************************************************** +// Parsable *Document::getParsable() +// Given the content-type of a document, returns a document parser. +// This will first look through the list of user supplied parsers and +// then at our (limited) builtin list of parsers. The user supplied +// parsers are external programs that will be used. +// +Parsable * +Document::getParsable() +{ + static HTML *html = 0; + static Plaintext *plaintext = 0; + static ExternalParser *externalParser = 0; + + Parsable *parsable = 0; + + if (ExternalParser::canParse(contentType)) + { + if (externalParser) + { + delete externalParser; + } + externalParser = new ExternalParser(contentType); + parsable = externalParser; + } + else if (mystrncasecmp((char*)contentType, "text/html", 9) == 0) + { + if (!html) + html = new HTML(); + parsable = html; + } + else if (mystrncasecmp((char*)contentType, "text/plain", 10) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + } + else if (mystrncasecmp((char *)contentType, "text/css", 8) == 0) + { + return NULL; + } + else if (mystrncasecmp((char *)contentType, "text/", 5) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + if (debug > 1) + { + cout << '"' << contentType << + "\" not a recognized type. 
Assuming text/plain\n"; + } + } + else + { + if (debug > 1) + { + cout << '"' << contentType << + "\" not a recognized type. Ignoring\n"; + } + return NULL; + } + + parsable->setContents(contents.get(), contents.length()); + return parsable; +} + + +int Document::ShouldWeRetry(Transport::DocStatus DocumentStatus) +{ + + if (DocumentStatus == Transport::Document_connection_down) + return 1; + + if (DocumentStatus == Transport::Document_no_connection) + return 1; + + if (DocumentStatus == Transport::Document_no_header) + return 1; + + return 0; +} diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Document.h b/debian/htdig/htdig-3.2.0b6/htdig/Document.h new file mode 100644 index 00000000..215897c4 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Document.h @@ -0,0 +1,138 @@ +// +// Document.h +// +// Document: This class holds everything there is to know about a document. +// The actual contents of the document may or may not be present at +// all times for memory conservation reasons. +// The document can be told to retrieve its contents. This is done +// with the Retrieve call. In case the retrieval causes a +// redirect, the link is followed, but this process is done +// only once (to prevent loops.) If the redirect didn't +// work, Document_not_found is returned. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Document.h,v 1.19 2004/05/28 13:15:14 lha Exp $ +// +// +#ifndef _Document_h_ +#define _Document_h_ + +#include "Parsable.h" +#include "Object.h" +#include "URL.h" +#include "htString.h" +#include "StringList.h" +#include "Transport.h" +#include "HtHTTP.h" +#include "HtFile.h" +#include "HtFTP.h" +#include "HtNNTP.h" +#include "ExternalTransport.h" +#include "Server.h" + + +class Connection; + + +class Document : public Object +{ +public: + // + // Construction/Destruction + // + Document(char *url = 0, int max_size = 0); + ~Document(); + + // + // Interface to the document. + // + void Reset(); + int Length() {return document_length;} + int ContentLength() {return contentLength;} + int StoredLength() {return contents.length();} + char *Contents() {return contents;} + void Contents(char *s) {contents = s; document_length = contents.length();} + char *ContentType() {return contentType.get();} + + // + // In case the retrieval process went through a redirect process, + // the new url can be gotten using the following call + // + char *Redirected() {return redirected_to;} + URL *Url() {return url;} + void Url(const String &url); + void Referer(const String &url); + time_t ModTime() {return modtime.GetTime_t();} + + Transport::DocStatus Retrieve(Server *server, HtDateTime date); + Transport::DocStatus RetrieveLocal(HtDateTime date, StringList *filenames); + + // + // Return an appropriate parsable object for the document type. 
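A minimal usage sketch of the retrieval interface declared in this header: point a Document at a URL, retrieve it through a Server, then hand the contents to whichever parser getParsable() selects. The server, retriever and modification-date objects are assumed to be set up elsewhere (the htdig Retriever does this); the URLs are illustrative.

    Document doc;
    doc.Url("http://www.htdig.org/");                 // set the target URL
    doc.Referer("http://www.htdig.org/index.html");   // optional referring page
    Transport::DocStatus status = doc.Retrieve(server, last_modification);
    if (status == Transport::Document_ok)
    {
        Parsable *parsable = doc.getParsable();       // HTML, plain text or external
        if (parsable)
            parsable->parse(retriever, *doc.Url());   // feed words/links to the Retriever
    }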
+ // + Parsable *getParsable(); + + // + // Set the username and password to be used in any requests + // + void setUsernamePassword(const String& credentials) + { authorization = credentials;} + + void setProxyUsernamePassword(const String& credentials) + { proxy_authorization = credentials;} + + HtHTTP *GetHTTPHandler() const { return HTTPConnect; } + +private: + enum + { + Header_ok, + Header_not_found, + Header_not_changed, + Header_redirect, + Header_not_text, + Header_not_authorized + }; + + URL *url; + URL *proxy; + URL *referer; + String contents; + String redirected_to; + String contentType; + String authorization; + String proxy_authorization; + int contentLength; + int document_length; + HtDateTime modtime; + int max_doc_size; + int num_retries; + + int UseProxy(); + + Transport *transportConnect; + HtHTTP *HTTPConnect; + HtHTTP *HTTPSConnect; + HtFile *FileConnect; + HtFTP *FTPConnect; + HtNNTP *NNTPConnect; + ExternalTransport *externalConnect; + + + /////// + // Tell us if we should retry to retrieve an URL depending on + // the first returned document status + /////// + + int ShouldWeRetry(Transport::DocStatus DocumentStatus); + +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc new file mode 100644 index 00000000..d967ba0b --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc @@ -0,0 +1,614 @@ +// +// ExternalParser.cc +// +// ExternalParser: Implementation of ExternalParser +// Allows external programs to parse unknown document formats. +// The parser is expected to return the document in a +// specific format. The format is documented +// in http://www.htdig.org/attrs.html#external_parser +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ExternalParser.cc,v 1.29 2004/05/28 13:15:14 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "ExternalParser.h" +#include "HTML.h" +#include "Plaintext.h" +#include "htdig.h" +#include "htString.h" +#include "QuotedStringList.h" +#include "URL.h" +#include "Dictionary.h" +#include "good_strtok.h" + +#include <ctype.h> +#include <stdio.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +#include <stdlib.h> +#ifdef HAVE_WAIT_H +#include <wait.h> +#elif HAVE_SYS_WAIT_H +#include <sys/wait.h> +#endif + +#ifdef _MSC_VER /* _WIN32 */ +#include <process.h> +#endif + + +#include "defaults.h" + +static Dictionary *parsers = 0; +static Dictionary *toTypes = 0; +extern String configFile; + +//***************************************************************************** +// ExternalParser::ExternalParser(char *contentType) +// +ExternalParser::ExternalParser(char *contentType) +{ + String mime; + int sep; + + if (canParse(contentType)) + { + String mime = contentType; + mime.lowercase(); + sep = mime.indexOf(';'); + if (sep != -1) + mime = mime.sub(0, sep).get(); + + currentParser = ((String *)parsers->Find(mime))->get(); + } + ExternalParser::contentType = contentType; +} + + +//***************************************************************************** +// ExternalParser::~ExternalParser() +// +ExternalParser::~ExternalParser() +{ +} + + +//***************************************************************************** +// int 
ExternalParser::readLine(FILE *in, String &line) +// +int +ExternalParser::readLine(FILE *in, String &line) +{ + char buffer[2048]; + int length; + + line = 0; // read(in, buffer, sizeof(buffer) + while (fgets(buffer, sizeof(buffer), in)) + { + length = strlen(buffer); + if (buffer[length - 1] == '\n') + { + // + // A full line has been read. Return it. + // + line << buffer; + line.chop('\n'); + return 1; + } + else + { + // + // Only a partial line was read. Append it to the line + // and read some more. + // + line << buffer; + } + } + return line.length() > 0; +} + + +//***************************************************************************** +// int ExternalParser::canParse(char *contentType) +// +int +ExternalParser::canParse(char *contentType) +{ + HtConfiguration* config= HtConfiguration::config(); + int sep; + + if (!parsers) + { + parsers = new Dictionary(); + toTypes = new Dictionary(); + + QuotedStringList qsl(config->Find("external_parsers"), " \t"); + String from, to; + int i; + + for (i = 0; qsl[i]; i += 2) + { + from = qsl[i]; + to = ""; + sep = from.indexOf("->"); + if (sep != -1) + { + to = from.sub(sep+2).get(); + from = from.sub(0, sep).get(); + } + from.lowercase(); + sep = from.indexOf(';'); + if (sep != -1) + from = from.sub(0, sep).get(); + + parsers->Add(from, new String(qsl[i + 1])); + toTypes->Add(from, new String(to)); + } + } + + String mime = contentType; + mime.lowercase(); + sep = mime.indexOf(';'); + if (sep != -1) + mime = mime.sub(0, sep).get(); + return parsers->Exists(mime); +} + +//***************************************************************************** +// void ExternalParser::parse(Retriever &retriever, URL &base) +// +void +ExternalParser::parse(Retriever &retriever, URL &base) +{ +// NEAL - ENABLE/REWRITE THIS ASAP FOR WIN32 +#ifndef _MSC_VER /* _WIN32 */ + HtConfiguration* config= HtConfiguration::config(); + if (contents == 0 || contents->length() == 0 || + currentParser.length() == 0) + { + return; + } + + // + // Write the contents to a temporary file. + // + String path = getenv("TMPDIR"); + int fd; + if (path.length() == 0) + path = "/tmp"; +#ifndef HAVE_MKSTEMP + path << "/htdext." << getpid(); // This is unfortunately predictable + +#ifdef O_BINARY + fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL|O_BINARY); +#else + fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL); +#endif +#else + path << "/htdex.XXXXXX"; + fd = mkstemp((char*)path); + // can we force binary mode somehow under Cygwin, if it has mkstemp? 
+#endif + if (fd < 0) + { + if (debug) + cout << "External parser error: Can't create temp file " + << (char *)path << endl; + return; + } + + write(fd, contents->get(), contents->length()); + close(fd); + +// unsigned int minimum_word_length = config->Value("minimum_word_length", 3); + String line; + char *token1, *token2, *token3; + int loc = 0, hd = 0; + URL url; + String mime = contentType; + mime.lowercase(); + int sep = mime.indexOf(';'); + if (sep != -1) + mime = mime.sub(0, sep).get(); + String convertToType = ((String *)toTypes->Find(mime))->get(); + int get_hdr = (convertToType.nocase_compare("user-defined") == 0); + int get_file = (convertToType.length() != 0); + String newcontent; + + StringList cpargs(currentParser); + char **parsargs = new char * [cpargs.Count() + 5]; + int argi; + for (argi = 0; argi < cpargs.Count(); argi++) + parsargs[argi] = (char *)cpargs[argi]; + parsargs[argi++] = path.get(); + parsargs[argi++] = contentType.get(); + parsargs[argi++] = (char *)base.get().get(); + parsargs[argi++] = configFile.get(); + parsargs[argi++] = 0; + + int stdout_pipe[2]; + int fork_result = -1; + int fork_try; + + if (pipe(stdout_pipe) == -1) + { + if (debug) + cout << "External parser error: Can't create pipe!" << endl; + unlink((char*)path); + delete [] parsargs; + return; + } + + for (fork_try = 4; --fork_try >= 0;) + { + fork_result = fork(); // Fork so we can execute in the child process + if (fork_result != -1) + break; + if (fork_try) + sleep(3); + } + if (fork_result == -1) + { + if (debug) + cout << "Fork Failure in ExternalParser" << endl; + unlink((char*)path); + delete [] parsargs; + return; + } + + if (fork_result == 0) // Child process + { + close(STDOUT_FILENO); // Close handle STDOUT to replace with pipe + dup(stdout_pipe[1]); + close(stdout_pipe[0]); + close(stdout_pipe[1]); + close(STDIN_FILENO); // Close STDIN to replace with file + open((char*)path, O_RDONLY); + + // Call External Parser + execv(parsargs[0], parsargs); + + exit(EXIT_FAILURE); + } + + // Parent Process + delete [] parsargs; + close(stdout_pipe[1]); // Close STDOUT for writing +#ifdef O_BINARY + FILE *input = fdopen(stdout_pipe[0], "rb"); +#else + FILE *input = fdopen(stdout_pipe[0], "r"); +#endif + if (input == NULL) + { + if (debug) + cout << "Fdopen Failure in ExternalParser" << endl; + unlink((char*)path); + return; + } + + while ((!get_file || get_hdr) && readLine(input, line)) + { + if (get_hdr) + { + line.chop('\r'); + if (line.length() == 0) + get_hdr = false; + else if (mystrncasecmp((char*)line, "content-type:", 13) == 0) + { + token1 = line.get() + 13; + while (*token1 && isspace(*token1)) + token1++; + token1 = strtok(token1, "\n\t"); + convertToType = token1; + } + continue; + } +#ifdef O_BINARY + line.chop('\r'); +#endif + token1 = strtok(line, "\t"); + if (token1 == NULL) + token1 = ""; + token2 = NULL; + token3 = NULL; + switch (*token1) + { + case 'w': // word + token1 = strtok(0, "\t"); + if (token1 != NULL) + token2 = strtok(0, "\t"); + if (token2 != NULL) + token3 = strtok(0, "\t"); + if (token1 != NULL && token2 != NULL && token3 != NULL && + (loc = atoi(token2)) >= 0 && + (hd = atoi(token3)) >= 0 && hd < 12) + retriever.got_word(token1, loc, hd); + else + cerr<< "External parser error: expected word in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 'u': // href + token1 = strtok(0, "\t"); + if (token1 != NULL) + token2 = strtok(0, "\t"); + if (token1 != NULL && token2 != NULL) + { + url.parse(token1); + url.hopcount(base.hopcount() + 1); + 
retriever.got_href(url, token2); + } + else + cerr<< "External parser error: expected URL in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 't': // title + token1 = strtok(0, "\t"); + if (token1 != NULL) + retriever.got_title(token1); + else + cerr<< "External parser error: expected title in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 'h': // head + token1 = strtok(0, "\t"); + if (token1 != NULL) + retriever.got_head(token1); + else + cerr<< "External parser error: expected text in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 'a': // anchor + token1 = strtok(0, "\t"); + if (token1 != NULL) + retriever.got_anchor(token1); + else + cerr<< "External parser error: expected anchor in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 'i': // image url + token1 = strtok(0, "\t"); + if (token1 != NULL) + retriever.got_image(token1); + else + cerr<< "External parser error: expected image URL in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 'm': // meta + { + // Using good_strtok means we can accept empty + // fields. + char *httpEquiv = good_strtok(token1+2, '\t'); + char *name = good_strtok(0, '\t'); + char *content = good_strtok(0, '\t'); + + if (httpEquiv != NULL && name != NULL && content != NULL) + { + // It would be preferable if we could share + // this part with HTML.cc, but it has other + // chores too, and I do not see a point where to + // split it up to get a common shared function + // (or class). This should not stop anybody from + // finding a better solution. + // For now, there is duplicated code. + static StringMatch *keywordsMatch = 0; + if (!keywordsMatch) + { + StringList kn(config->Find("keywords_meta_tag_names"), " \t"); + keywordsMatch = new StringMatch(); + keywordsMatch->IgnoreCase(); + keywordsMatch->Pattern(kn.Join('|')); + } + static StringMatch *descriptionMatch = 0; + if (!descriptionMatch) + { + StringList dn(config->Find("description_meta_tag_names"), " \t"); + descriptionMatch = new StringMatch(); + descriptionMatch->IgnoreCase(); + descriptionMatch->Pattern(dn.Join('|')); + } + static StringMatch *metadatetags = 0; + if (!metadatetags) + { + metadatetags = new StringMatch(); + metadatetags->IgnoreCase(); + metadatetags->Pattern("date|dc.date|dc.date.created|dc.data.modified"); + } + + // <URL:http://www.w3.org/MarkUp/html-spec/html-spec_5.html#SEC5.2.5> + // says that the "name" attribute defaults to + // the http-equiv attribute if empty. + if (*name == '\0') + name = httpEquiv; + + if (*httpEquiv != '\0') + { + // <META HTTP-EQUIV=REFRESH case + if (mystrcasecmp(httpEquiv, "refresh") == 0 + && *content != '\0') + { + char *q = (char*)mystrcasestr(content, "url"); + if (q && *q) + { + q += 3; // skiping "URL" + while (*q && ((*q == '=') || isspace(*q))) q++; + char *qq = q; + while (*qq && (*qq != ';') && (*qq != '"') && + !isspace(*qq))qq++; + *qq = 0; + URL href(q, base); + // I don't know why anyone would do this, but hey... + retriever.got_href(href, ""); + } + } + } + + // + // Now check for <meta name=... content=...> tags that + // fly with any reasonable DTD out there + // + if (*name != '\0' && *content != '\0') + { + if (keywordsMatch->CompareWord(name)) + { + int wordindex = 1; + addKeywordString (retriever, content, wordindex); +// // can this be merged with Parser::addKeywordString ? 
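The switch above defines the external parser protocol: the parser writes tab-separated records to its stdout, one per line, with the first field selecting the record type. A minimal sketch of a conforming parser — every field value here is invented for illustration:

    #include <cstdio>

    int main()
    {
        // w <TAB> word <TAB> location <TAB> heading level (0 <= level < 12)
        printf("w\thtdig\t0\t0\n");
        // u <TAB> URL <TAB> anchor text
        printf("u\thttp://www.htdig.org/\tht://Dig home\n");
        // t <TAB> document title
        printf("t\tSample document\n");
        // h <TAB> text for the document head (used for excerpts)
        printf("h\tA short excerpt of the document.\n");
        // m <TAB> http-equiv <TAB> name <TAB> content (empty fields are allowed)
        printf("m\t\tkeywords\tsearch indexing\n");
        return 0;
    }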
+// char *w = strtok(content, " ,\t\r"); +// while (w) +// { +// if (strlen(w) >= minimum_word_length) +// retriever.got_word(w, 1, 9); +// w = strtok(0, " ,\t\r"); +// } + } + if (metadatetags->CompareWord(name) && + config->Boolean("use_doc_date", 0)) + { + retriever.got_time(content); + } + else if (mystrcasecmp(name, "author") == 0) + { + int wordindex = 1; + retriever.got_author(content); + addString (retriever, content, wordindex, 11); + } + else if (mystrcasecmp(name, "htdig-email") == 0) + { + retriever.got_meta_email(content); + } + else if (mystrcasecmp(name, "htdig-notification-date") == 0) + { + retriever.got_meta_notification(content); + } + else if (mystrcasecmp(name, "htdig-email-subject") == 0) + { + retriever.got_meta_subject(content); + } + else if (descriptionMatch->CompareWord(name) + && strlen(content) != 0) + { + // + // We need to do two things. First grab the description + // + String meta_dsc = content; + + if (meta_dsc.length() > max_meta_description_length) + meta_dsc = meta_dsc.sub(0, max_meta_description_length).get(); + if (debug > 1) + cout << "META Description: " << content << endl; + retriever.got_meta_dsc((char*)meta_dsc); + + // + // Now add the words to the word list + // (slot 10 is the new slot for this) + // + int wordindex = 1; + addString (retriever, content, wordindex, 10); +// // can this be merged with Parser::addString ? +// char *w = strtok(content, " \t\r"); +// while (w) +// { +// if (strlen(w) >= minimum_word_length) +// retriever.got_word(w, 1, 10); +// w = strtok(0, " \t\r"); +// } + } + } + } + else + cerr<< "External parser error: expected metadata in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + } + + default: + cerr<< "External parser error: unknown field in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + } + } // while(readLine) + if (get_file) + { + if (!canParse(convertToType) && + mystrncasecmp((char*)convertToType, "text/", 5) != 0) + { + if (mystrcasecmp((char*)convertToType, "user-defined") == 0) + cerr << "External parser error: no Content-Type given\n"; + else + cerr << "External parser error: can't parse Content-Type \"" + << convertToType << "\"\n"; + cerr << " URL: " << base.get() << "\n"; + } + else + { + char buffer[2048]; + int length; + int nbytes = config->Value("max_doc_size"); + while (nbytes > 0 && + (length = fread(buffer, 1, sizeof(buffer), input)) > 0) + { + nbytes -= length; + if (nbytes < 0) + length += nbytes; + newcontent.append(buffer, length); + } + } + } + fclose(input); + // close(stdout_pipe[0]); // This is closed for us by the fclose() + int rpid, status; + while ((rpid = wait(&status)) != fork_result && rpid != -1) + ; + unlink((char*)path); + + if (newcontent.length() > 0) + { + static HTML *html = 0; + static Plaintext *plaintext = 0; + Parsable *parsable = 0; + + contentType = convertToType; + if (canParse(contentType)) + { + currentParser = ((String *)parsers->Find(contentType))->get(); + parsable = this; + } + else if (mystrncasecmp((char*)contentType, "text/html", 9) == 0) + { + if (!html) + html = new HTML(); + parsable = html; + } + else if (mystrncasecmp((char*)contentType, "text/plain", 10) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + } + else + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + if (debug) + cout << "External parser error: \"" << contentType << + "\" not a recognized type. 
Assuming text/plain\n"; + } + parsable->setContents(newcontent.get(), newcontent.length()); + parsable->parse(retriever, base); + } +#endif //ifndef _MSC_VER /* _WIN32 */ +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.h b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.h new file mode 100644 index 00000000..4c7579a1 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.h @@ -0,0 +1,58 @@ +// +// ExternalParser.h +// +// ExternalParser: Allows external programs to parse unknown document formats. +// The parser is expected to return the document in a +// specific format. The format is documented +// in http://www.htdig.org/attrs.html#external_parser +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ExternalParser.h,v 1.8 2004/05/28 13:15:14 lha Exp $ +// + +#ifndef _ExternalParser_h_ +#define _ExternalParser_h_ + +#include "Parsable.h" +#include "htString.h" + +#include <stdio.h> + +class URL; + + +class ExternalParser : public Parsable +{ +public: + // + // Construction/Destruction + // + ExternalParser(char *contentType); + virtual ~ExternalParser(); + + // + // Main parser interface. + // + virtual void parse(Retriever &retriever, URL &); + + // + // Check if the given contentType has an external parser associated + // with it + // + static int canParse(char *contentType); + +private: + String currentParser; + String contentType; + + int readLine(FILE *, String &); +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.cc b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.cc new file mode 100644 index 00000000..c418e62c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.cc @@ -0,0 +1,376 @@ +// +// ExternalTransport.cc +// +// ExternalTransport: Allows external programs to retrieve given URLs with +// unknown protocols. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ExternalTransport.cc,v 1.9 2004/05/28 13:15:14 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "ExternalTransport.h" +#include "htdig.h" +#include "QuotedStringList.h" +#include "URL.h" +#include "Dictionary.h" +#include "good_strtok.h" + +#include <ctype.h> +#include <stdio.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +#include <stdlib.h> +#ifdef HAVE_WAIT_H +#include <wait.h> +#elif HAVE_SYS_WAIT_H +#include <sys/wait.h> +#endif + +#include "defaults.h" + +static Dictionary *handlers = 0; +static Dictionary *toTypes = 0; +extern String configFile; + +//***************************************************************************** +// ExternalTransport::ExternalTransport(char *protocol) +// +ExternalTransport::ExternalTransport(const String &protocol) +{ + if (canHandle(protocol)) + { + _Handler = ((String *)handlers->Find(protocol))->get(); + } + ExternalTransport::_Protocol = protocol; + _Response = new ExternalTransport_Response; +} + + +//***************************************************************************** +// ExternalTransport::~ExternalTransport() +// +ExternalTransport::~ExternalTransport() +{ + if (_Response) + { + delete _Response; + } +} + + +//***************************************************************************** +// int ExternalTransport::canHandle(const String &protocol) +// +int +ExternalTransport::canHandle(const String &protocol) +{ + HtConfiguration* config= HtConfiguration::config(); + if (!handlers) + { + handlers = new Dictionary(); + toTypes = new Dictionary(); + + QuotedStringList qsl(config->Find("external_protocols"), " \t"); + String from, to; + int i; + int sep; + + for (i = 0; qsl[i]; i += 2) + { + from = qsl[i]; + to = ""; + sep = from.indexOf("->"); + if (sep != -1) + { + to = from.sub(sep+2).get(); + from = from.sub(0, sep).get(); + } + + // Recognise service specified as "https://" rather than "https" + sep = from.indexOf(":"); + if (sep != -1) + from = from.sub(0, sep).get(); + + handlers->Add(from, new String(qsl[i + 1])); + toTypes->Add(from, new String(to)); + } + } + return handlers->Exists(protocol); +} + + +//***************************************************************************** +// void ExternalTransport::SetConnection(URL *u) +// +void ExternalTransport::SetConnection (URL *u) +{ + // Grab the actual URL to pass to the handler + _URL = *u; + + // OK, now call the parent method to make sure everything else is set up. 
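canHandle() above fills the handler table from the external_protocols configuration attribute: whitespace-separated pairs of a protocol and a handler program, where the protocol may be written with or without a trailing "://" and may carry an optional "->type" suffix. A hedged sketch of such an entry — the protocol name and handler path are invented:

    external_protocols: myproto:// /usr/local/bin/handle-myproto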
+ Transport::SetConnection (u->host(), u->port()); +} + + +//***************************************************************************** +// DocStatus ExternalTransport::Request() +// +Transport::DocStatus ExternalTransport::Request() +{ +// NEAL - ENABLE/REWRITE THIS ASAP FOR WIN32 +#ifndef _MSC_VER /* _WIN32 */ + // + // Start the external handler, passing the protocol, URL and config file + // as command arguments + // + StringList hargs(_Handler); + char **handlargs = new char * [hargs.Count() + 5]; + int argi; + for (argi = 0; argi < hargs.Count(); argi++) + handlargs[argi] = (char *)hargs[argi]; + handlargs[argi++] = _Protocol.get(); + handlargs[argi++] = (char *)_URL.get().get(); + handlargs[argi++] = configFile.get(); + handlargs[argi++] = 0; + + int stdout_pipe[2]; + int fork_result = -1; + int fork_try; + + if (pipe(stdout_pipe) == -1) + { + if (debug) + cerr << "External transport error: Can't create pipe!" << endl; + delete [] handlargs; + return GetDocumentStatus(_Response); + } + + for (fork_try = 4; --fork_try >= 0;) + { + fork_result = fork(); // Fork so we can execute in the child process + if (fork_result != -1) + break; + if (fork_try) + sleep(3); + } + if (fork_result == -1) + { + if (debug) + cerr << "Fork Failure in ExternalTransport" << endl; + delete [] handlargs; + return GetDocumentStatus(_Response); + } + + if (fork_result == 0) // Child process + { + close(STDOUT_FILENO); // Close handle STDOUT to replace with pipe + dup(stdout_pipe[1]); + close(stdout_pipe[0]); + close(stdout_pipe[1]); + // not really necessary, and may pose Cygwin incompatibility... + //close(STDIN_FILENO); // Close STDIN to replace with null dev. + //open("/dev/null", O_RDONLY); + + // Call External Transport Handler + execv(handlargs[0], handlargs); + + exit(EXIT_FAILURE); + } + + // Parent Process + delete [] handlargs; + close(stdout_pipe[1]); // Close STDOUT for writing + FILE *input = fdopen(stdout_pipe[0], "r"); + if (input == NULL) + { + if (debug) + cerr << "Fdopen Failure in ExternalTransport" << endl; + return GetDocumentStatus(_Response); + } + + // Set up a response for this request + _Response->Reset(); + // We just accessed the document + _Response->_access_time = new HtDateTime(); + _Response->_access_time->SettoNow(); + + + // OK, now parse the stuff we got back from the handler... + String line; + char *token1; + int in_header = 1; + + while (in_header && readLine(input, line)) + { + line.chop('\r'); + if (line.length() > 0 && debug > 2) + cout << "Header line: " << line << endl; + token1 = strtok(line, "\t"); + if (token1 == NULL) + { + token1 = ""; + in_header = 0; + break; + } + + switch (*token1) + { + case 's': // status code + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_status_code = atoi(token1); + else + cerr<< "External transport error: expected status code in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + case 'r': // status reason + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_reason_phrase = token1; + else + cerr<< "External transport error: expected status reason in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + case 'm': // modification time + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_modification_time= NewDate(token1); // Hopefully we can grok it... 
+ else + cerr<< "External transport error: expected modification time in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + case 't': // Content-Type + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_content_type = token1; + else + cerr<< "External transport error: expected content-type in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + case 'l': // Content-Length + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_content_length = atoi(token1); + else + cerr<< "External transport error: expected content-length in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + case 'u': // redirect target + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_location = token1; + else + cerr<< "External transport error: expected URL in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + default: + cerr<< "External transport error: unknown field in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + } + } + + // OK, now we read in the rest of the document as contents... + _Response->_contents = 0; + char docBuffer[8192]; + int bytesRead; + + while ((bytesRead = fread(docBuffer, 1, sizeof(docBuffer), input)) > 0) + { + if (debug > 2) + cout << "Read " << bytesRead << " from document\n"; + if (_Response->_contents.length() + bytesRead > _max_document_size) + bytesRead = _max_document_size - _Response->_contents.length(); + _Response->_contents.append(docBuffer, bytesRead); + if (_Response->_contents.length() >= _max_document_size) + break; + } + _Response->_document_length = _Response->_contents.length(); + fclose(input); + // close(stdout_pipe[0]); // This is closed for us by the fclose() + + int rpid, status; + while ((rpid = wait(&status)) != fork_result && rpid != -1) + ; + +#endif + + return GetDocumentStatus(_Response); +} + + +//***************************************************************************** +// private +// DocStatus ExternalTransport::GetDocumentStatus(ExternalTransport_Response *r) +// +Transport::DocStatus ExternalTransport::GetDocumentStatus(ExternalTransport_Response *r) +{ + // The default is 'not found' if we can't figure it out... + DocStatus returnStatus = Document_not_found; + int statuscode = r->GetStatusCode(); + + if (statuscode == 200) + { + returnStatus = Document_ok; // OK + // Is it parsable? + } + + else if (statuscode > 200 && statuscode < 300) + returnStatus = Document_ok; // Successful 2xx + else if (statuscode == 304) + returnStatus = Document_not_changed; // Not modified + else if (statuscode > 300 && statuscode < 400) + returnStatus = Document_redirect; // Redirection 3xx + else if (statuscode == 401) + returnStatus = Document_not_authorized; // Unauthorized + + return returnStatus; +} + + +//***************************************************************************** +// private +// int ExternalTransport::readLine(FILE *in, String &line) +// +int +ExternalTransport::readLine(FILE *in, String &line) +{ + char buffer[2048]; + int length; + + line = 0; + while (fgets(buffer, sizeof(buffer), in)) + { + length = strlen(buffer); + if (buffer[length - 1] == '\n') + { + // + // A full line has been read. Return it. + // + line << buffer; + line.chop('\n'); + return 1; + } + else + { + // + // Only a partial line was read. Append it to the line + // and read some more. 
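The header records parsed in Request() above form the reply protocol for an external transport handler: tab-separated field/value lines on stdout, then a blank line, then the raw document body ('m' modification-time and 'u' redirect records are also understood). A minimal sketch of a handler emitting a successful reply, with illustrative values:

    #include <cstdio>

    int main()
    {
        printf("s\t200\n");        // status code
        printf("r\tOK\n");         // reason phrase
        printf("t\ttext/html\n");  // content-type
        printf("l\t40\n");         // content-length
        printf("\n");              // blank line ends the header
        printf("<html><body>Hello from a handler</body></html>\n");
        return 0;
    }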
+ // + line << buffer; + } + } + return line.length() > 0; +} + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.h b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.h new file mode 100644 index 00000000..4c946a96 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.h @@ -0,0 +1,88 @@ +// +// ExternalTransport.h +// +// ExternalTransport: Allows external programs to retrieve given URLs with +// unknown protocols. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ExternalTransport.h,v 1.5 2004/05/28 13:15:14 lha Exp $ +// + +#ifndef _ExternalTransport_h_ +#define _ExternalTransport_h_ + +#include "Transport.h" +#include "htString.h" + +#include <stdio.h> + +// First we must declare a derived Transport_Response class +// This requires declaring the main class in advance +class ExternalTransport; +class ExternalTransport_Response : public Transport_Response +{ + friend class ExternalTransport; + + // Nothing else... We just want it so we can access the protected fields +}; + +// Right, now we get on with the show... +class ExternalTransport : public Transport +{ +public: + // + // Construction/Destruction + // + ExternalTransport(const String &protocol); + virtual ~ExternalTransport(); + + + // + // Check if the given protocol has a handler + // + static int canHandle(const String &protocol); + + // Setting connections is obviously a bit different than the base class + // from a URL pointer + void SetConnection (URL *u); + + // from a URL object + void SetConnection (URL &u) + { SetConnection (&u); } + + // Make the request + DocStatus Request(); + + // Get the response or the status + Transport_Response *GetResponse() { return _Response; } + DocStatus GetDocumentStatus() { return GetDocumentStatus(_Response); } + + +private: + // The command to handle the current protocol + String _Handler; + // And the current protocol + String _Protocol; + + // The URL to Request() + URL _URL; + + // The result of the Request() + ExternalTransport_Response *_Response; + + + + // Private helper to read in the result from the handler + int readLine(FILE *, String &); + // Work out the DocStatus from the HTTP-style status codes + DocStatus GetDocumentStatus(ExternalTransport_Response *r); +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc b/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc new file mode 100644 index 00000000..56e1d00f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc @@ -0,0 +1,1002 @@ +// +// HTML.cc +// +// HTML: Class to parse HTML documents and return useful information +// to the Retriever +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HTML.cc,v 1.76 2004/06/09 17:35:34 grdetil Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "htdig.h" +#include "HTML.h" +#include "HtSGMLCodec.h" +#include "HtConfiguration.h" +#include "StringMatch.h" +#include "StringList.h" +#include "QuotedStringList.h" +#include "URL.h" +#include "WordType.h" + +#include <ctype.h> + +#include "defaults.h" + 
+// Flags for noindex & nofollow, indicating who turned indexing off/on... +#define TAGnoindex 0x0001 +#define TAGstyle 0x0002 +#define TAGscript 0x0004 +#define TAGmeta_htdig_noindex 0x0008 +#define TAGmeta_robots 0x0010 + +static StringMatch tags; +static StringMatch nobreaktags; +static StringMatch spacebeforetags; +static StringMatch spaceaftertags; +static StringMatch metadatetags; +static StringMatch descriptionMatch; +static StringMatch keywordsMatch; +//static int keywordsCount; +//static int max_keywords; + + +//***************************************************************************** +// ADDSPACE() macro, to insert space where needed in various strings +// Reduces all multiple whitespace to a single space + +#define ADDSPACE(in_space) \ + if (!in_space) \ + { \ + if (in_title && !noindex) \ + { \ + title << ' '; \ + } \ + if (in_ref && description.length() < max_description_length) \ + { \ + description << ' '; \ + } \ + if (head.length() < max_head_length && !noindex && !in_title) \ + { \ + head << ' '; \ + } \ + in_space = 1; \ + } + + +//***************************************************************************** +// HTML::HTML() +// +HTML::HTML() : + skip_start (HtConfiguration::config()->Find("noindex_start")," \t"), + skip_end (HtConfiguration::config()->Find("noindex_end"), " \t") +{ + HtConfiguration *config= HtConfiguration::config(); + // + // Initialize the patterns that we will try to match. + // The tags Match object is used to match tag commands while + // + tags.IgnoreCase(); + tags.Pattern("title|/title|a|/a|h1|h2|h3|h4|h5|h6|/h1|/h2|/h3|/h4|/h5|/h6|noindex|/noindex|img|li|meta|frame|area|base|embed|object|link|style|/style|script|/script"); + + // These tags don't cause a word break. They may also be in "tags" above, + // except for the "a" tag, which must be handled as a special case. + // Note that <sup> & <sub> should cause a word break. + nobreaktags.IgnoreCase(); + nobreaktags.Pattern("font|/font|em|/em|strong|/strong|i|/i|b|/b|u|/u|tt|/tt|abbr|/abbr|code|/code|q|/q|samp|/samp|kbd|/kbd|var|/var|dfn|/dfn|cite|/cite|blink|/blink|big|/big|small|/small|s|/s"); + + // These tags, which may also be in "tags" above, cause word breaks and + // therefore cause space to be inserted before (or after) do_tag() is done. + spacebeforetags.IgnoreCase(); + spacebeforetags.Pattern("title|h1|h2|h3|h4|h5|h6|address|blockquote|noindex|img|li|th|td|dt|dd|p|br|hr|center|spacer"); + spaceaftertags.IgnoreCase(); + spaceaftertags.Pattern("/title|/h1|/h2|/h3|/h4|/h5|/h6|/address|/blockquote"); + + // These are the name values of meta tags that carry date information. + metadatetags.IgnoreCase(); + metadatetags.Pattern("date|dc.date|dc.date.created|dc.date.modified"); + + // These are the name values of meta tags that carry descriptions. + StringList descrNames(config->Find("description_meta_tag_names"), " \t"); + descriptionMatch.IgnoreCase(); + descriptionMatch.Pattern(descrNames.Join('|')); + + // These are the name values of meta tags that carry keywords. + StringList keywordNames(config->Find("keywords_meta_tag_names"), " \t"); + keywordsMatch.IgnoreCase(); + keywordsMatch.Pattern(keywordNames.Join('|')); +// (now in Parser) +// max_keywords = config->Value("max_keywords", -1); +// if (max_keywords < 0) +// max_keywords = (int) ((unsigned int) ~1 >> 1); + + // skip_start/end mark sections of text to be ignored by ht://Dig + // Make sure there are equal numbers of each, and warn of deprecated + // syntax. 
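skip_start and skip_end above hold the noindex_start/noindex_end pattern lists; in parse(), everything between a start marker and its matching end marker is dropped from indexing. A sketch using the fallback end marker seen below, with the conventional matching start marker assumed:

    <p>This paragraph is indexed.</p>
    <!--htdig_noindex-->
    <p>Navigation and other boilerplate, skipped by the indexer.</p>
    <!--/htdig_noindex-->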
+ if (skip_start.Count() > 1 || skip_end.Count() > 1)
+ {
+ if (skip_start.Count() != 0 && skip_end.Count() != 0)
+ {
+ // check for old-style start/end which allowed unquoted spaces
+ // (Check noindex_start/end for exactly one "<" or followed by
+ // exactly one ">", and no leading quotes.)
+ // Can someone think of a better (or simpler) check??
+ String noindex_end (config->Find ("noindex_end"));
+ char *first_left = strchr (noindex_end.get(), '<');
+ char *secnd_left = first_left ? strchr(first_left+1,'<') : (char*)0;
+ char *first_right= strchr (noindex_end.get(), '>');
+ char *secnd_right= first_right? strchr(first_right+1,'>'): (char*)0;
+ String noindex_start (config->Find ("noindex_start"));
+ char *first_lft = strchr (noindex_start.get(), '<');
+ char *secnd_lft = first_lft ? strchr (first_lft +1,'<') : (char*)0;
+ char *first_rght= strchr (noindex_start.get(), '>');
+ char *secnd_rght= first_rght? strchr (first_rght+1,'>') : (char*)0;
+
+ if (((first_right && !secnd_right && first_right < first_left) ||
+ (first_left && !secnd_left && !first_right) ||
+ (first_rght && !secnd_rght && first_rght < first_lft) ||
+ (first_lft && !secnd_lft && !first_rght)) &&
+ noindex_end[0] != '\"' && noindex_start[0] != '\"')
+ {
+ cout << "\nWarning: To allow multiple noindex_start/end patterns, patterns containing\nspaces should now be in quotation marks. (If the entries are intended to be\nmultiple patterns, this warning can be suppressed by placing the first pattern\nin quotes.)\n\n";
+ // Should we treat the patterns as if they had been quoted
+ // (as we assume was intended)?
+ }
+ }
+ }
+
+ // check each start has an end
+ if (skip_start.Count() < skip_end.Count())
+ {
+ cout << "Warning: " << skip_end.Count()
+ << " noindex_end patterns, but only " << skip_start.Count()
+ << " noindex_start patterns.\n";
+ } else
+ {
+ while (skip_start.Count () > skip_end.Count())
+ {
+ int missing = skip_end.Count() - 1;
+ skip_end.Add ((missing >= 0) ? skip_end [missing]
+ : "<!--/htdig_noindex-->");
+ cout << "Warning: Copying " << skip_end [missing+1]
+ << " as noindex_end match for " << skip_start [missing+1]
+ << endl;
+ }
+ }
+
+ word = 0;
+ href = 0;
+ title = 0;
+ description = 0;
+ head = 0;
+ meta_dsc = 0;
+ tag = 0;
+ in_title = 0;
+ in_ref = 0;
+ in_heading = 0;
+ base = 0;
+ noindex = 0;
+ nofollow = 0;
+// minimumWordLength = config->Value("minimum_word_length", 3);
+}
+
+
+//*****************************************************************************
+// HTML::~HTML()
+//
+HTML::~HTML()
+{
+}
+
+
+//*****************************************************************************
+// void HTML::parse(Retriever &retriever, URL &baseURL)
+// Parse the HTML document using the Retriever object for all the callbacks.
+// The HTML document contents are contained in the contents String.
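+// The work happens in two passes over the buffer: the first pass copies
+// the document into a scratch buffer while decoding SGML entities,
+// turning any decoded '<' into the fake tag "<~>" so it cannot be
+// mistaken for real markup; the second pass walks that buffer, handing
+// tags to do_tag() and words to the Retriever callbacks.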
+// +void +HTML::parse(Retriever &retriever, URL &baseURL) +{ + if (contents == 0 || contents->length() == 0) + return; + + base = &baseURL; + + // + // We have some variables which will contain the various items we + // are looking for + // + int wordindex = 1; + int in_space; + int in_punct; + String scratch, textified; + unsigned char *q, *start; + unsigned char *position = (unsigned char *) contents->get(); + unsigned char *text = (unsigned char *)new char[contents->length()+1]; + unsigned char *ptext = text; + + keywordsCount = 0; + title = 0; + head = 0; + meta_dsc = 0; + noindex = 0; + nofollow = 0; + in_heading = 0; + in_title = 0; + in_ref = 0; + in_space = 0; + in_punct = 0; + + while (*position) + { + + // + // Filter out section marked to be ignored for indexing. + // This can contain any HTML. + // On finding a noindex_start, skip to first occurrence of matching + // noindex_end. Any noindex_start within will be ignored. + // + int i; + for (i = 0; i < skip_start.Count(); i++) + { + if (mystrncasecmp((char *)position, skip_start[i], + ((String*)skip_start.Nth(i))->length()) == 0) + break; // break from this loop for "continue" below... + } + if (i < skip_start.Count()) // found a match; + { + q = (unsigned char*)mystrcasestr((char *)position, skip_end[i]); + if (!q) + *position = '\0'; // Rest of document will be skipped... + else + position = q + ((String*)skip_end.Nth(i))->length(); + continue; + } + // end of noindex_start/end code + + + if (strncmp((char *)position, "<!", 2) == 0) + { + // + // Possible comment declaration (but could be DTD declaration!) + // A comment can contain other '<' and '>': + // we have to ignore complete comment declarations + // but of course also DTD declarations. + // + position += 2; // Get past declaration start + if (strncmp((char *)position, "--", 2) == 0) + { + // Found start of comment - now find the end + position += 2; + do + { + q = (unsigned char*)strstr((char *)position, "--"); + if (!q) + { + *position = '\0'; + break; // Rest of document seems to be a comment... + } + else + { + position = q + 2; + // Skip extra dashes after a badly formed comment + while (*position == '-') + position++; + // Skip whitespace after an individual comment + while (isspace(*position)) + position++; + } + // if comment declaration hasn't ended, skip another comment + } + while (*position && *position != '>'); + if (*position == '>') + { + position++; // End of comment declaration + } + } + else + { + // Not a comment declaration after all + // but possibly DTD: get to the end + q = (unsigned char*)strchr((char *)position, '>'); + if (q) + { + position = q + 1; + // End of (whatever) declaration + } + else + { + *position = '\0'; // Rest of document is DTD? + } + } + continue; + } + + if (*position == '<') + { + // + // Start of a tag. 
Since tags cannot be nested, we can simply + // search for the closing '>' + // + q = (unsigned char*)strchr((char *)position, '>'); + if (q) + { // copy tag + while (position <= q) + *ptext++ = *position++; + } + else + { // copy rest of text, as tag does not end + while (*position) + *ptext++ = *position++; + } + } + else if (*position == '&') + { + q = (unsigned char*)strchr((char *)position, ';'); + if (q && q <= position+10) + { // got ending, looks like valid SGML entity + scratch = 0; + scratch.append((char*)position, q+1 - position); + textified = HtSGMLCodec::instance()->encode(scratch); + if (textified[0] != '&' || textified.length() == 1) + { // it was decoded, copy it + position = (unsigned char *)textified.get(); + while (*position) + { + if (*position == '<') + { // got a decoded <, make a fake tag for it + // to avoid confusing it with real tag start + *ptext++ = '<'; + *ptext++ = '~'; + *ptext++ = '>'; + position++; + } + else + *ptext++ = *position++; + } + position = q+1; + } + else // it wasn't decoded, copy '&', and rest will follow + *ptext++ = *position++; + } + else // not SGML entity, copy bare '&' + *ptext++ = *position++; + } + else + { + *ptext++ = *position++; + } + } + *ptext++ = '\0'; + + position = text; + start = position; + + while (*position) + { + if (*position == '<' && (position[1] != '~' || position[2] != '>')) + { + // + // Start of a tag. Since tags cannot be nested, we can simply + // search for the closing '>' + // + q = (unsigned char*)strchr((char *)position, '>'); + if (!q) + break; // Syntax error in the doc. Tag never ends. + position++; + if (noindex & TAGscript) + { // Special handling in case '<' is part of JavaScript code + while (isspace(*position)) + position++; + if (mystrncasecmp((char *)position, "/script", 7) != 0) + continue; + } + tag = 0; + tag.append((char*)position, q - position); + while (isspace(*position)) + position++; + if (!in_space && spacebeforetags.CompareWord((char *)position) + || !in_space && !in_punct && *position != '/') + { + // These opening tags cause a space to be inserted + // before anything they insert. + // Tags processed here (i.e. not in nobreaktags), like <a ...> + // tag, are a special case: they don't actually add space in + // formatted text, but because in our processing it causes + // a word break, we avoid word concatenation in "head" string. + ADDSPACE(in_space); + in_punct = 0; + } + do_tag(retriever, tag); + if (!in_space && spaceaftertags.CompareWord((char *)position)) + { + // These closing tags cause a space to be inserted + // after anything they insert. + ADDSPACE(in_space); + in_punct = 0; + } + position = q+1; + } + else if (*position > 0 && HtIsStrictWordChar(*position)) + { + // + // Start of a word. Try to find the whole thing + // + word = 0; + in_space = 0; + in_punct = 0; + while (*position && HtIsWordChar(*position)) + { + word << (char)*position; + // handle case where '<' is in extra_word_characters... + if (strncmp((char *)position, "<~>", 3) == 0) + position += 2; // skip over fake tag for decoded '<' + position++; + if (*position == '<') + { + q = position+1; + while (isspace(*q)) + q++; + // Does this tag cause a word break? + if (nobreaktags.CompareWord((char *)q)) + { + // These tags just change character formatting and + // don't break words. 
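+ // For example, in "data<b>base</b>" the <b> and </b>
+ // are consumed right here, so the scan continues and
+ // the single word "database" is what gets indexed.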
+ q = (unsigned char*)strchr((char *)position, '>'); + if (q) + { + position++; + tag = 0; + tag.append((char*)position, q - position); + do_tag(retriever, tag); + position = q+1; + } + } + } + } + + if (in_title && !noindex) + { + title << word; + } + + if (in_ref) + { + if (description.length() < max_description_length) + { + description << word; + } + else + { + description << " ..."; + if (!nofollow) + retriever.got_href(*href, (char*)description); + in_ref = 0; + description = 0; + } + } + + if (head.length() < max_head_length && !noindex && !in_title) + { + // + // Capitalize H1 and H2 blocks + // + if (in_heading > 1 && in_heading < 4) + { + word.uppercase(); + } + + // + // Append the word to the head (excerpt) + // + head << word; + } + + if (word.length() >= (int)minimum_word_length && !noindex) + { + retriever.got_word((char*)word, wordindex++, in_heading); + } + } + else + { + // + // Characters that are not part of a word + // + if (isspace(*position)) + { + ADDSPACE(in_space); + in_punct = 0; + } + else + { + // + // Not whitespace + // + if (head.length() < max_head_length && !noindex && !in_title) + { + // We don't want to add random chars to the + // excerpt if we're in the title. + head << *position; + } + if (in_ref && description.length() < max_description_length) + { + description << *position; + } + if (in_title && !noindex) + { + title << *position; + } + in_space = 0; + in_punct = 1; + // handle normal case where decoded '<' is punctuation... + if (strncmp((char *)position, "<~>", 3) == 0) + position += 2; // skip over fake tag for decoded '<' + } + position++; + } + } + retriever.got_head((char*)head); + + delete [] text; +} + + +//***************************************************************************** +// void HTML::do_tag(Retriever &retriever, String &tag) +// +void +HTML::do_tag(Retriever &retriever, String &tag) +{ + HtConfiguration* config= HtConfiguration::config(); + int wordindex = 1; + char *position = tag.get(); + int which, length; + static int ignore_alt_text = config->Boolean("ignore_alt_text", 0); + + while (isspace(*position)) + position++; + + which = -1; + if (tags.CompareWord(position, which, length) < 0) + return; // Nothing matched. + + // Use the configuration code to match attributes as key-value pairs + HtConfiguration attrs; + attrs.NameValueSeparators("="); + attrs.Add(position); + + if (debug > 3) + cout << "Tag: <" << tag << ">, matched " << which << endl; + + switch (which) + { + case 0: // "title" + if (title.length()) + { + if (debug) + cout << "More than one <title> tag in document!" + << " (possible search engine spamming)" << endl; + break; + } + in_title = 1; + in_heading = 1; + break; + + case 1: // "/title" + if (!in_title) + break; + in_title = 0; + in_heading = 0; + retriever.got_title((char*)title); + break; + + case 2: // "a" + { + if (!attrs["href"].empty()) + { + // + // a href seen + // + if (in_ref) + { + if (debug > 1) + cout << "Terminating previous <a href=...> tag," + << " which didn't have a closing </a> tag." 
+ << endl;
+ if (!nofollow)
+ retriever.got_href(*href, (char*)description);
+ in_ref = 0;
+ }
+ if (href)
+ delete href;
+ href = new URL(transSGML(attrs["href"]), *base);
+ in_ref = 1;
+ description = 0;
+ break;
+ }
+
+ if (!attrs["title"].empty() && !attrs["href"].empty())
+ {
+ //
+ // a title seen for href
+ //
+ retriever.got_href(*href, transSGML(attrs["title"]));
+ }
+
+ if (!attrs["name"].empty())
+ {
+ //
+ // a name seen
+ //
+ retriever.got_anchor(transSGML(attrs["name"]));
+ }
+ break;
+ }
+
+ case 3: // "/a"
+ if (in_ref)
+ {
+ if (!nofollow)
+ retriever.got_href(*href, (char*)description);
+ in_ref = 0;
+ }
+ break;
+
+ case 4: // "h1"
+ in_heading = 2;
+ break;
+
+ case 5: // "h2"
+ in_heading = 3;
+ break;
+
+ case 6: // "h3"
+ in_heading = 4;
+ break;
+
+ case 7: // "h4"
+ in_heading = 5;
+ break;
+
+ case 8: // "h5"
+ in_heading = 6;
+ break;
+
+ case 9: // "h6"
+ in_heading = 7;
+ break;
+
+ case 10: // "/h1"
+ case 11: // "/h2"
+ case 12: // "/h3"
+ case 13: // "/h4"
+ case 14: // "/h5"
+ case 15: // "/h6"
+ in_heading = 0;
+ break;
+
+ case 16: // "noindex"
+ noindex |= TAGnoindex;
+ nofollow |= TAGnoindex;
+ if (!attrs["follow"].empty())
+ nofollow &= ~TAGnoindex;
+ break;
+
+ case 27: // "style"
+ noindex |= TAGstyle;
+ nofollow |= TAGstyle;
+ break;
+
+ case 29: // "script"
+ noindex |= TAGscript;
+ nofollow |= TAGscript;
+ break;
+
+ case 17: // "/noindex"
+ noindex &= ~TAGnoindex;
+ nofollow &= ~TAGnoindex;
+ break;
+
+ case 28: // "/style"
+ noindex &= ~TAGstyle;
+ nofollow &= ~TAGstyle;
+ break;
+
+ case 30: // "/script"
+ noindex &= ~TAGscript;
+ nofollow &= ~TAGscript;
+ break;
+
+ case 19: // "li"
+ if (!noindex && !in_title && head.length() < max_head_length)
+ head << "* ";
+ break;
+
+ case 20: // "meta"
+ {
+ //
+ // First test for old-style meta tags (these break any
+ // reasonable DTD...)
+ //
+ if (!attrs["htdig-noindex"].empty())
+ {
+ retriever.got_noindex();
+ noindex |= TAGmeta_htdig_noindex;
+ nofollow |= TAGmeta_htdig_noindex;
+ }
+ if (!attrs["htdig-index"].empty())
+ {
+ noindex &= ~TAGmeta_htdig_noindex;
+ nofollow &= ~TAGmeta_htdig_noindex;
+ }
+ if (!attrs["htdig-email"].empty())
+ retriever.got_meta_email(transSGML(attrs["htdig-email"]));
+
+ if (!attrs["htdig-notification-date"].empty())
+ retriever.got_meta_notification(transSGML(attrs["htdig-notification-date"]));
+
+ if (!attrs["htdig-email-subject"].empty())
+ retriever.got_meta_subject(transSGML(attrs["htdig-email-subject"]));
+
+ if (!attrs["htdig-keywords"].empty() || !attrs["keywords"].empty())
+ {
+ //
+ // Keywords are added as being at the very top of the
+ // document and have a weight factor of
+ // keywords-factor which is assigned to slot 9 in the
+ // factor table.
+ //
+ const String keywords = attrs["htdig-keywords"].empty() ?
+ attrs["keywords"] :
+ attrs["htdig-keywords"];
+ if (!noindex)
+ {
+ String tmp = transSGML(keywords);
+ addKeywordString (retriever, tmp, wordindex);
+ }
+ }
+
+ if (!attrs["http-equiv"].empty())
+ {
+
+ // <META HTTP-EQUIV=REFRESH case
+ if (mystrcasecmp(attrs["http-equiv"], "refresh") == 0
+ && !attrs["content"].empty())
+ {
+ String content = attrs["content"];
+ char *q = (char*)mystrcasestr((char*)content, "url");
+ if (q && *q)
+ {
+ q += 3; // skipping "URL"
+ while (*q && ((*q == '=') || isspace(*q))) q++;
+ char *qq = q;
+ while (*qq && (*qq != ';') && (*qq != '"') &&
+ !isspace(*qq)) qq++;
+ *qq = 0;
+ if (href)
+ delete href;
+ href = new URL(transSGML(q), *base);
+ // I don't know why anyone would do this, but hey...
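+ // (A hypothetical example of the tag handled here:
+ // <meta http-equiv="refresh" content="0; url=next.html">
+ // the code above isolates "next.html" from the content
+ // value so it can be followed like an ordinary link.)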
+ if (!nofollow) + retriever.got_href(*href, ""); + } + } + } + + // + // Now check for <meta name=... content=...> tags that + // fly with any reasonable DTD out there + // + + if (!attrs["name"].empty() && !attrs["content"].empty()) + { + const String cache = attrs["name"]; + + // First of all, check for META description + + if (descriptionMatch.CompareWord(cache) + && !attrs["content"].empty()) + { + // + // We need to do two things. First grab the description + // and clean it up + // + meta_dsc = transSGML(attrs["content"]); + meta_dsc.replace('\n', ' '); + meta_dsc.replace('\r', ' '); + meta_dsc.replace('\t', ' '); + if (meta_dsc.length() > max_meta_description_length) + meta_dsc = meta_dsc.sub(0, max_meta_description_length).get(); + if (debug > 1) + cout << "META Description: " << attrs["content"] << endl; + retriever.got_meta_dsc((char*)meta_dsc); + + + // + // Now add the words to the word list + // Slot 10 is the current slot for this + // + if (!noindex) + { + String tmp = transSGML(attrs["content"]); + addString (retriever, tmp, wordindex, 10); + } + } + + if (keywordsMatch.CompareWord(cache) && !noindex) + { + String tmp = transSGML(attrs["content"]); + addKeywordString (retriever, tmp, wordindex); + } + else if (mystrcasecmp(cache, "author") == 0) + { + String author = transSGML(attrs["content"]); + retriever.got_author(author.get()); + if (!noindex) + addString (retriever, author, wordindex, 11); + } + else if (mystrcasecmp(cache, "htdig-email") == 0) + { + retriever.got_meta_email(transSGML(attrs["content"])); + } + else if (metadatetags.CompareWord(cache, which, length) && + (cache.get())[length] == '\0' && config->Boolean("use_doc_date",0)) + { + retriever.got_time(transSGML(attrs["content"])); + } + else if (mystrcasecmp(cache, "htdig-notification-date") == 0) + { + retriever.got_meta_notification(transSGML(attrs["content"])); + } + else if (mystrcasecmp(cache, "htdig-email-subject") == 0) + { + retriever.got_meta_subject(transSGML(attrs["content"])); + } + else if (mystrcasecmp(cache, "htdig-noindex") == 0) + { + retriever.got_noindex(); + noindex |= TAGmeta_htdig_noindex; + nofollow |= TAGmeta_htdig_noindex; + } + else if (mystrcasecmp(cache, "robots") == 0 + && !attrs["content"].empty()) + { + String content_cache = attrs["content"]; + content_cache.lowercase(); + if (content_cache.indexOf("noindex") != -1) + { + noindex |= TAGmeta_robots; + retriever.got_noindex(); + } + if (content_cache.indexOf("nofollow") != -1) + nofollow |= TAGmeta_robots; + if (content_cache.indexOf("none") != -1) + { + noindex |= TAGmeta_robots; + nofollow |= TAGmeta_robots; + retriever.got_noindex(); + } + } + } + else if (mystrcasecmp(attrs["name"], "htdig-noindex") == 0) + { + retriever.got_noindex(); + noindex |= TAGmeta_htdig_noindex; + nofollow |= TAGmeta_htdig_noindex; + } + break; + } + + case 21: // frame + case 24: // embed + { + if (!attrs["src"].empty()) + { + // + // src seen + // + if (!nofollow) + { + if (href) + delete href; + href = new URL(transSGML(attrs["src"]), *base); + // Frames have the same hopcount as the parent. + retriever.got_href(*href, transSGML(attrs["title"]), 0); + in_ref = 0; + } + } + break; + } + + case 25: // object + { + if (!attrs["data"].empty()) + { + // + // data seen + // + if (!nofollow) + { + if (href) + delete href; + href = new URL(transSGML(attrs["data"]), *base); + // Assume objects have the same hopcount as the parent. 
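+ // (e.g. a hypothetical <object data="menu.html">: the data
+ // attribute is followed like a frame src, and the 0 passed to
+ // got_href() keeps the parent's hop count.)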
+ retriever.got_href(*href, transSGML(attrs["title"]), 0); + in_ref = 0; + } + } + break; + } + + case 22: // area + case 26: // link + { + if (!attrs["href"].empty()) + { + // href seen + if (!nofollow) + { + if (href) + delete href; + href = new URL(transSGML(attrs["href"]), *base); + // area & link are like anchor tags -- one hopcount! + retriever.got_href(*href, transSGML(attrs["title"]), 1); + in_ref = 0; + } + } + break; + } + + case 23: // base + { + if (!attrs["href"].empty()) + { + URL tempBase(transSGML(attrs["href"])); + *base = tempBase; + } + break; + } + + case 18: // img + { + if (!ignore_alt_text && !attrs["alt"].empty()) + { + String tmp = transSGML(attrs["alt"]); + if (!noindex && in_title) + title << tmp << " "; + if (in_ref && description.length() < max_description_length) + description << tmp << " "; + if (!noindex && !in_title && head.length() < max_head_length) + head << tmp << " "; + if (!noindex) + addString (retriever, tmp, wordindex, 8); // slot for img_alt + } + if (!attrs["src"].empty()) + { + retriever.got_image(transSGML(attrs["src"])); + } + break; + } + + default: + return; // Nothing... + } +} + + +//***************************************************************************** +// const String HTML::transSGML(const String& str) +// +const String +HTML::transSGML(const String& str) +{ + return HtSGMLCodec::instance()->encode(str); +} diff --git a/debian/htdig/htdig-3.2.0b6/htdig/HTML.h b/debian/htdig/htdig-3.2.0b6/htdig/HTML.h new file mode 100644 index 00000000..867381ed --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/HTML.h @@ -0,0 +1,69 @@ +// +// HTML.h +// +// HTML: Class to parse HTML documents and return useful information +// to the Retriever +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HTML.h,v 1.14 2004/05/28 13:15:15 lha Exp $ +// +#ifndef _HTML_h_ +#define _HTML_h_ + +#include "Parsable.h" +#include "QuotedStringList.h" + +class Retriever; +class URL; + + +class HTML : public Parsable +{ +public: + // + // Construction/Destruction + // + HTML(); + virtual ~HTML(); + + // + // Main parser interface. 
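+ // parse() consumes the buffer set via Parsable::setContents() and
+ // reports words, links, titles and meta data back through the
+ // Retriever callbacks (got_word, got_href, got_title, ...).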
+ // + virtual void parse(Retriever &retriever, URL &baseURL); + +private: + // + // Our state variables + // + String word; + URL *href; + String title; + String description; + String head; + String meta_dsc; + String tag; + int in_title; + int in_ref; + int in_heading; + int noindex; + int nofollow; +// unsigned int minimumWordLength; + URL *base; + QuotedStringList skip_start; + QuotedStringList skip_end; + + // + // Helper functions + // + void do_tag(Retriever &, String &); + const String transSGML(const String& str); +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Makefile.am b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.am new file mode 100644 index 00000000..1e8368b4 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.am @@ -0,0 +1,16 @@ + +include $(top_srcdir)/Makefile.config + +bin_PROGRAMS = htdig + +htdig_SOURCES = Document.cc HTML.cc \ + Parsable.cc Plaintext.cc \ + Retriever.cc Server.cc ExternalTransport.cc \ + URLRef.cc htdig.cc ExternalParser.cc + +noinst_HEADERS = Document.h ExternalParser.h HTML.h \ + Parsable.h Plaintext.h Retriever.h Server.h URLRef.h htdig.h \ + ExternalTransport.h +htdig_DEPENDENCIES = $(HTLIBS) +htdig_LDFLAGS = $(PROFILING) ${extra_ldflags} +htdig_LDADD = $(HTLIBS) diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Makefile.in b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.in new file mode 100644 index 00000000..52d9a862 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.in @@ -0,0 +1,487 @@ +# Makefile.in generated by automake 1.7.9 from Makefile.am. +# @configure_input@ + +# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003 +# Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# +# To compile with profiling do the following: +# +# make CFLAGS=-g CXXFLAGS=-g PROFILING=-p all +# + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +top_builddir = .. 
+ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +INSTALL = @INSTALL@ +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +host_triplet = @host@ +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMDEP_FALSE = @AMDEP_FALSE@ +AMDEP_TRUE = @AMDEP_TRUE@ +AMTAR = @AMTAR@ +APACHE = @APACHE@ +APACHE_MODULES = @APACHE_MODULES@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CGIBIN_DIR = @CGIBIN_DIR@ +COMMON_DIR = @COMMON_DIR@ +CONFIG_DIR = @CONFIG_DIR@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DATABASE_DIR = @DATABASE_DIR@ +DEFAULT_CONFIG_FILE = @DEFAULT_CONFIG_FILE@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO = @ECHO@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +F77 = @F77@ +FFLAGS = @FFLAGS@ +FIND = @FIND@ +GUNZIP = @GUNZIP@ +HAVE_SSL = @HAVE_SSL@ +HTDIG_MAJOR_VERSION = @HTDIG_MAJOR_VERSION@ +HTDIG_MICRO_VERSION = @HTDIG_MICRO_VERSION@ +HTDIG_MINOR_VERSION = @HTDIG_MINOR_VERSION@ +IMAGE_DIR = @IMAGE_DIR@ +IMAGE_URL_PREFIX = @IMAGE_URL_PREFIX@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@ +MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@ +MAKEINFO = @MAKEINFO@ +MV = @MV@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PERL = @PERL@ +RANLIB = @RANLIB@ +RRDTOOL = @RRDTOOL@ +SEARCH_DIR = @SEARCH_DIR@ +SEARCH_FORM = @SEARCH_FORM@ +SED = @SED@ +SENDMAIL = @SENDMAIL@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +TAR = @TAR@ +TESTS_FALSE = @TESTS_FALSE@ +TESTS_TRUE = @TESTS_TRUE@ +TIME = @TIME@ +TIMEV = @TIMEV@ +USER = @USER@ +VERSION = @VERSION@ +YACC = @YACC@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_F77 = @ac_ct_F77@ +ac_ct_RANLIB = @ac_ct_RANLIB@ +ac_ct_STRIP = @ac_ct_STRIP@ +am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ +am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ +am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ +am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +datadir = @datadir@ +exec_prefix = @exec_prefix@ +extra_ldflags = @extra_ldflags@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +oldincludedir = @oldincludedir@ +prefix = 
@prefix@ +program_transform_name = @program_transform_name@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +subdirs = @subdirs@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ + +AUTOMAKE_OPTIONS = foreign no-dependencies + +INCLUDES = -DDEFAULT_CONFIG_FILE=\"$(DEFAULT_CONFIG_FILE)\" \ + -I$(top_srcdir)/include -I$(top_srcdir)/htlib \ + -I$(top_srcdir)/htnet -I$(top_srcdir)/htcommon \ + -I$(top_srcdir)/htword \ + -I$(top_srcdir)/db -I$(top_builddir)/db \ + $(LOCAL_DEFINES) $(PROFILING) + + +HTLIBS = $(top_builddir)/htnet/libhtnet.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/htlib/libht.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/db/libhtdb.la \ + $(top_builddir)/htlib/libht.la + + +bin_PROGRAMS = htdig + +htdig_SOURCES = Document.cc HTML.cc \ + Parsable.cc Plaintext.cc \ + Retriever.cc Server.cc ExternalTransport.cc \ + URLRef.cc htdig.cc ExternalParser.cc + + +noinst_HEADERS = Document.h ExternalParser.h HTML.h \ + Parsable.h Plaintext.h Retriever.h Server.h URLRef.h htdig.h \ + ExternalTransport.h + +htdig_DEPENDENCIES = $(HTLIBS) +htdig_LDFLAGS = $(PROFILING) ${extra_ldflags} +htdig_LDADD = $(HTLIBS) +subdir = htdig +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_HEADER = $(top_builddir)/include/config.h +CONFIG_CLEAN_FILES = +bin_PROGRAMS = htdig$(EXEEXT) +PROGRAMS = $(bin_PROGRAMS) + +am_htdig_OBJECTS = Document.$(OBJEXT) HTML.$(OBJEXT) Parsable.$(OBJEXT) \ + Plaintext.$(OBJEXT) Retriever.$(OBJEXT) Server.$(OBJEXT) \ + ExternalTransport.$(OBJEXT) URLRef.$(OBJEXT) htdig.$(OBJEXT) \ + ExternalParser.$(OBJEXT) +htdig_OBJECTS = $(am_htdig_OBJECTS) + +DEFAULT_INCLUDES = -I. 
-I$(srcdir) -I$(top_builddir)/include +depcomp = +am__depfiles_maybe = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +DIST_SOURCES = $(htdig_SOURCES) +HEADERS = $(noinst_HEADERS) + +DIST_COMMON = $(noinst_HEADERS) $(srcdir)/Makefile.in \ + $(top_srcdir)/Makefile.config Makefile.am +SOURCES = $(htdig_SOURCES) + +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .lo .o .obj +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/Makefile.config $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && \ + $(AUTOMAKE) --foreign htdig/Makefile +Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe) +binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(bindir) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + p1=`echo $$p|sed 's/$(EXEEXT)$$//'`; \ + if test -f $$p \ + || test -f $$p1 \ + ; then \ + f=`echo "$$p1" | sed 's,^.*/,,;$(transform);s/$$/$(EXEEXT)/'`; \ + echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) --mode=install $(binPROGRAMS_INSTALL) $$p $(DESTDIR)$(bindir)/$$f"; \ + $(INSTALL_PROGRAM_ENV) $(LIBTOOL) --mode=install $(binPROGRAMS_INSTALL) $$p $(DESTDIR)$(bindir)/$$f || exit 1; \ + else :; fi; \ + done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + f=`echo "$$p" | sed 's,^.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/'`; \ + echo " rm -f $(DESTDIR)$(bindir)/$$f"; \ + rm -f $(DESTDIR)$(bindir)/$$f; \ + done + +clean-binPROGRAMS: + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + f=`echo $$p|sed 's/$(EXEEXT)$$//'`; \ + echo " rm -f $$p $$f"; \ + rm -f $$p $$f ; \ + done +htdig$(EXEEXT): $(htdig_OBJECTS) $(htdig_DEPENDENCIES) + @rm -f htdig$(EXEEXT) + $(CXXLINK) $(htdig_LDFLAGS) $(htdig_OBJECTS) $(htdig_LDADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) core *.core + +distclean-compile: + -rm -f *.tab.c + +.cc.o: + $(CXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +.cc.obj: + $(CXXCOMPILE) -c -o $@ `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` + +.cc.lo: + $(LTCXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +distclean-libtool: + -rm -f libtool +uninstall-info-am: + +ETAGS = etags +ETAGSFLAGS = + +CTAGS = ctags +CTAGSFLAGS = + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -fID $$unique + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$tags$$unique" \ + || $(ETAGS) $(ETAGSFLAGS) 
$(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$tags $$unique + +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(CTAGS_ARGS)$$tags$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$tags $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && cd $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) $$here + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) + +top_distdir = .. +distdir = $(top_distdir)/$(PACKAGE)-$(VERSION) + +distdir: $(DISTFILES) + $(mkinstalldirs) $(distdir)/.. + @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ + list='$(DISTFILES)'; for file in $$list; do \ + case $$file in \ + $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ + $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ + esac; \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test "$$dir" != "$$file" && test "$$dir" != "."; then \ + dir="/$$dir"; \ + $(mkinstalldirs) "$(distdir)$$dir"; \ + else \ + dir=''; \ + fi; \ + if test -d $$d/$$file; then \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(PROGRAMS) $(HEADERS) + +installdirs: + $(mkinstalldirs) $(DESTDIR)$(bindir) +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-binPROGRAMS clean-generic clean-libtool mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-libtool distclean-tags + +dvi: dvi-am + +dvi-am: + +info: info-am + +info-am: + +install-data-am: + +install-exec-am: install-binPROGRAMS + +install-info: install-info-am + +install-man: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-binPROGRAMS uninstall-info-am + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \ + clean-generic clean-libtool ctags distclean distclean-compile \ + distclean-generic distclean-libtool distclean-tags distdir dvi \ + dvi-am info info-am install install-am install-binPROGRAMS \ + install-data install-data-am install-exec install-exec-am \ + install-info install-info-am install-man install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags uninstall uninstall-am uninstall-binPROGRAMS \ + uninstall-info-am + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Makefile.win32 b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.win32 new file mode 100644 index 00000000..49839a7c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.win32 @@ -0,0 +1,29 @@ +# +# Makefile - makefile for rightnow +# + +APP_NAME = Right Now Web CGI +RNT_PRODUCT = rnw + +TARGET = $(BINDIR)/htdig$(EXESFX) + +include ../Makedefs.win32 + +# ----------------------------------------------------------------------------- +# add new executable members to this list + + +CXXSRC = Document.cc HTML.cc Parsable.cc Plaintext.cc Retriever.cc \ + Server.cc ExternalTransport.cc URLRef.cc htdig.cc ExternalParser.cc + +CPPFLAGS += -I. -I../include -I../htlib -I../htcommon -I../htword -I../db -I../htnet + +LDLIBS = ../lib/$(ARCH)/libhtnet.lib ../lib/$(ARCH)/libcommon.lib ../lib/$(ARCH)/libhtword.lib ../lib/$(ARCH)/libht.lib ../lib/$(ARCH)/libcommon.lib ../lib/$(ARCH)/libhtword.lib ../lib/$(ARCH)/libht.lib ../lib/$(ARCH)/libhtdb.lib +OTHERLIBS = ws2_32.lib L:/win32/lib/zlib114/zlib.lib + +DEPLIBS += $(LDLIBS) + +$(TARGET): $(OBJDIRDEP) $(BINDIRDEP) $(OBJS) $(DEPLIBS) + $(EXELD) $(LDFLAGS) $(OBJS) $(LDLIBS) $(OTHERLIBS) + +include ../Makerules.win32 diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Parsable.cc b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.cc new file mode 100644 index 00000000..049362a8 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.cc @@ -0,0 +1,96 @@ +// +// Parsable.cc +// +// Parsable: Base class for file parsers (HTML, PDF, ExternalParser ...) 
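+// Subclasses implement parse(); this base class supplies the shared
+// word-adding helpers (addString, addKeywordString) and reads the
+// various length limits from the configuration in its constructor.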
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Parsable.cc,v 1.9 2004/05/28 13:15:15 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "Parsable.h" +#include "htdig.h" +#include "defaults.h" + + +//***************************************************************************** +// Parsable::Parsable() +// +Parsable::Parsable() +{ + HtConfiguration* config= HtConfiguration::config(); + contents = 0; + max_head_length = config->Value("max_head_length", 0); + max_description_length = config->Value("max_description_length", 50); + max_meta_description_length = config->Value("max_meta_description_length", 0); + + max_keywords = config->Value("max_keywords", -1); + if (max_keywords < 0) + max_keywords = (int) ((unsigned int) ~1 >> 1); + minimum_word_length = config->Value("minimum_word_length", 3); +} + + +//***************************************************************************** +// Parsable::~Parsable() +// +Parsable::~Parsable() +{ + delete contents; +} + + +//***************************************************************************** +// void Parsable::setContents(char *data, int length) +// This will set the contents of the parsable object. +// +void +Parsable::setContents(char *data, int length) +{ + delete contents; + contents = new String(data, length); +} + +//***************************************************************************** +// void Parsable::addString(char *s, int& wordindex, int slot) +// Add all words in string s in "heading level" slot, incrementing wordindex +// along the way. String s is corrupted. +// +void +Parsable::addString(Retriever& retriever, char *s, int& wordindex, int slot) +{ + char *w = HtWordToken(s); + while (w) + { + if (strlen(w) >= minimum_word_length) + retriever.got_word(w, wordindex++, slot); // slot for img_alt + w = HtWordToken(0); + } + w = '\0'; +} + +//***************************************************************************** +// void Parsable::addKeywordString(char *s, int& wordindex) +// Add all words in string s as keywords, incrementing wordindex +// along the way. String s is corrupted. +// +void +Parsable::addKeywordString(Retriever& retriever, char *s, int& wordindex) +{ + char *w = HtWordToken(s); + while (w) + { + if (strlen(w) >= minimum_word_length && ++keywordsCount <= max_keywords) + retriever.got_word(w, wordindex++, 9); + w = HtWordToken(0); + } + w = '\0'; +} diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Parsable.h b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.h new file mode 100644 index 00000000..7149fe7c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.h @@ -0,0 +1,57 @@ +// +// Parsable.h +// +// Parsable: Base class for file parsers (HTML, PDF, ExternalParser ...) 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Parsable.h,v 1.10 2004/05/28 13:15:15 lha Exp $ +// + +#ifndef _Parsable_h_ +#define _Parsable_h_ + +#include "htString.h" +#include "Retriever.h" + +class URL; + + +class Parsable +{ +public: + // + // Construction/Destruction + // + Parsable(); + virtual ~Parsable(); + + // + // Main parser interface. + // + virtual void parse(Retriever &retriever, URL &) = 0; + + // + // The rest of the members are used by the Document to provide us + // the data that we contain. + // + virtual void setContents(char *data, int length); + void addString(Retriever& retriever, char *s, int& wordindex, int slot); + void addKeywordString(Retriever& retriever, char *s, int& wordindex); + +protected: + String *contents; + int max_head_length; + int max_description_length; + int max_meta_description_length; + int max_keywords, keywordsCount; + unsigned int minimum_word_length; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc new file mode 100644 index 00000000..e7006fb1 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc @@ -0,0 +1,116 @@ +// +// Plaintext.cc +// +// Plaintext: Parses plaintext files. Not much to do, really. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Plaintext.cc,v 1.20 2004/05/28 13:15:15 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "Plaintext.h" +#include "htdig.h" +#include "htString.h" +#include "WordType.h" + +#include <ctype.h> +#include "defaults.h" + + +//***************************************************************************** +// Plaintext::Plaintext() +// +Plaintext::Plaintext() +{ +} + + +//***************************************************************************** +// Plaintext::~Plaintext() +// +Plaintext::~Plaintext() +{ +} + + +//***************************************************************************** +// void Plaintext::parse(Retriever &retriever, URL &) +// +void +Plaintext::parse(Retriever &retriever, URL &) +{ + if (contents == 0 || contents->length() == 0) + return; + + HtConfiguration* config= HtConfiguration::config(); + unsigned char *position = (unsigned char *) contents->get(); + static int minimumWordLength = config->Value("minimum_word_length", 3); + int wordIndex = 1; + int in_space = 0; + String word; + String head; + + while (*position) + { + word = 0; + + if (HtIsStrictWordChar(*position)) + { + // + // Start of a word. 
Try to find the whole thing
+ //
+ in_space = 0;
+ while (*position && HtIsWordChar(*position))
+ {
+ word << *position;
+ position++;
+ }
+
+ if (head.length() < max_head_length)
+ {
+ head << word;
+ }
+
+ if (word.length() >= minimumWordLength)
+ {
+ retriever.got_word((char*)word, wordIndex++, 0);
+ }
+ }
+
+ if (head.length() < max_head_length)
+ {
+ //
+ // Characters that are not part of a word
+ //
+ if (*position && isspace(*position))
+ {
+ //
+ // Reduce all multiple whitespace to a single space
+ //
+ if (!in_space)
+ {
+ head << ' ';
+ }
+ in_space = 1;
+ }
+ else
+ {
+ head << *position;
+ in_space = 0;
+ }
+ }
+ if (*position)
+ position++;
+ }
+ retriever.got_head((char*)head);
+}
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.h b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.h
new file mode 100644
index 00000000..a6275c41
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.h
@@ -0,0 +1,41 @@
+//
+// Plaintext.h
+//
+// Plaintext: Parses plaintext files. Not much to do, really.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Plaintext.h,v 1.6 2004/05/28 13:15:15 lha Exp $
+//
+#ifndef _Plaintext_h_
+#define _Plaintext_h_
+
+#include "Parsable.h"
+
+class URL;
+
+
+class Plaintext : public Parsable
+{
+public:
+ //
+ // Construction/Destruction
+ //
+ Plaintext();
+ virtual ~Plaintext();
+
+ //
+ // Main parser interface.
+ //
+ virtual void parse(Retriever &retriever, URL &);
+
+private:
+};
+
+#endif
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc
new file mode 100644
index 00000000..13243571
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc
@@ -0,0 +1,2013 @@
+//
+// Retriever.cc
+//
+// Retriever: Crawls from a list of URLs and calls the appropriate parsers.
+// The parser notifies the Retriever object that it got something
+// (got_* functions) and the Retriever object feeds the databases
+// and statistics accordingly.
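+// The dig itself is driven by Start(), which cycles over the per-host
+// Server queues until every queue is empty or a signal is caught.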
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Retriever.cc,v 1.94 2004/05/28 13:15:15 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#ifdef _MSC_VER /* _WIN32 */ +# include <sys/types.h> +# include <winsock2.h> +#endif + + +#include "Retriever.h" +#include "htdig.h" +#include "HtWordList.h" +#include "WordRecord.h" +#include "URLRef.h" +#include "Server.h" +#include "Parsable.h" +#include "Document.h" +#include "StringList.h" +#include "WordType.h" +#include "Transport.h" +#include "HtHTTP.h" // For HTTP statistics +#include "md5.h" +#include "defaults.h" + +#ifndef _MSC_VER /* _WIN32 */ +#include <pwd.h> +#endif + +#include <signal.h> +#include <stdio.h> + + +static int noSignal; + +// no_store_phrases: +// If true, only store first occurrence of each word in a document +static bool no_store_phrases; + +//***************************************************************************** +// Retriever::Retriever() +// +Retriever::Retriever(RetrieverLog flags): +words(*(HtConfiguration::config())), +words_to_add (100, 0.75) +{ + HtConfiguration *config = HtConfiguration::config(); + FILE *urls_parsed; + + currenthopcount = 0; + max_hop_count = config->Value("max_hop_count", 999999); + + no_store_phrases = !config->Boolean("store_phrases"); + + // + // Initialize the flags for the various HTML factors + // + + // text_factor + factor[0] = FLAG_TEXT; + // title_factor + factor[1] = FLAG_TITLE; + // heading factor (now generic) + factor[2] = FLAG_HEADING; + factor[3] = FLAG_HEADING; + factor[4] = FLAG_HEADING; + factor[5] = FLAG_HEADING; + factor[6] = FLAG_HEADING; + factor[7] = FLAG_HEADING; + // img alt text + //factor[8] = FLAG_KEYWORDS; + factor[8] = FLAG_TEXT; // treat alt text as plain text, until it has + // its own FLAG and factor. 
+ // keywords factor
+ factor[9] = FLAG_KEYWORDS;
+ // META description factor
+ factor[10] = FLAG_DESCRIPTION;
+ factor[11] = FLAG_AUTHOR;
+
+ doc = new Document();
+ minimumWordLength = config->Value("minimum_word_length", 3);
+
+ log = flags;
+ // if in restart mode
+ if (Retriever_noLog != log)
+ {
+ String filelog = config->Find("url_log");
+ char buffer[1024];
+ int l;
+
+ urls_parsed = fopen((char *) filelog, "r");
+ if (urls_parsed != 0)
+ {
+ // read all urls discovered but not fetched before
+ while (fgets(buffer, sizeof(buffer), urls_parsed))
+ {
+ l = strlen(buffer);
+ buffer[l - 1] = 0;
+ Initial(buffer, 2);
+ }
+ fclose(urls_parsed);
+ }
+ unlink((char *) filelog);
+ }
+
+ check_unique_md5 = config->Boolean("check_unique_md5", 0);
+ check_unique_date = config->Boolean("check_unique_date", 0);
+
+ d_md5 = 0;
+ if (check_unique_md5)
+ {
+ d_md5 = Database::getDatabaseInstance(DB_HASH);
+
+ if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK)
+ {
+ cerr << "DocumentDB::Open: " << config->Find("md5_db") << " " << strerror(errno) << "\n";
+ }
+ }
+
+}
+
+
+//*****************************************************************************
+// Retriever::~Retriever()
+//
+Retriever::~Retriever()
+{
+ if (d_md5)
+ d_md5->Close();
+ delete doc;
+}
+
+
+//*****************************************************************************
+// void Retriever::setUsernamePassword(const char *credentials)
+//
+void Retriever::setUsernamePassword(const char *credentials)
+{
+ doc->setUsernamePassword(credentials);
+}
+
+
+//*****************************************************************************
+// void Retriever::Initial(const String &list, int from)
+// Add the URL(s) in the given list to the list of URLs to visit.
+// Since URLs are stored on a per-server basis, we first need to find
+// the correct server to add the URL's path to.
+//
+// from == 0 urls in db.docs and no db.log
+// from == 1 urls in start_url; add url only if not already in the list
+// from == 2 add url from db.log
+// from == 3 urls in db.docs and there was a db.log
+//
+void Retriever::Initial(const String & list, int from)
+{
+ //
+ // Split the list of urls up into individual urls.
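+ // Each token becomes a URL; its signature (roughly host:port) keys
+ // the 'servers' hash, so a Server object, and with it a robots.txt
+ // lookup, is created only for the first URL seen on a given host.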
+ //
+ StringList tokens(list, " \t");
+ String sig;
+ String url;
+ Server *server;
+
+ for (int i = 0; i < tokens.Count(); i++)
+ {
+ URL u(tokens[i]);
+ url = u.get(); // get before u.signature() resolves aliases
+ server = (Server *) servers[u.signature()];
+ if (debug > 2)
+ cout << "\t" << from << ":" << (int) log << ":" << url;
+ if (!server)
+ {
+ String robotsURL = u.signature();
+ robotsURL << "robots.txt";
+ StringList *localRobotsFile = GetLocal(robotsURL);
+
+ server = new Server(u, localRobotsFile);
+ servers.Add(u.signature(), server);
+ delete localRobotsFile;
+ }
+
+ if (from && visited.Exists(url))
+ {
+ if (debug > 2)
+ cout << " skipped" << endl;
+ continue;
+ }
+ else if (IsValidURL(url) != 1)
+ {
+ if (debug > 2)
+ cout << endl;
+ continue;
+ }
+
+ if (Retriever_noLog == log || from != 3)
+ {
+ if (debug > 2)
+ cout << " pushed";
+ server->push(u.get(), 0, 0, IsLocalURL(url.get()));
+ }
+ if (debug > 2)
+ cout << endl;
+ visited.Add(url, 0);
+ }
+}
+
+
+//*****************************************************************************
+// void Retriever::Initial(List &list, int from)
+//
+void Retriever::Initial(List & list, int from)
+{
+ list.Start_Get();
+ String *str;
+
+ // from == 0 is an optimisation for pushing urls in update mode,
+ // assuming that
+ // 1) there are many more urls in docdb
+ // 2) they're pushed first
+ // 3) there's no duplicate url in docdb
+ // then they don't need to be checked against already pushed urls
+ // But 2) can be false with -l option
+ //
+ // FIXME: it's nasty; what has to be tested is:
+ // we have urls to push from db.docs, but do we already have them in
+ // db.log? This relies on a side effect of 'visited', on urls in
+ // db.docs only being pushed via this method, and on db.log being
+ // pushed first, db.docs second, start_urls third!
+ //
+ if (!from && visited.Count())
+ {
+ from = 3;
+ }
+ while ((str = (String *) list.Get_Next()))
+ {
+ Initial(str->get(), from);
+ }
+}
+
+//*****************************************************************************
+//
+static void sigexit(int)
+{
+ noSignal = 0; // don't exit here... just set the flag.
+}
+
+static void sigpipe(int)
+{
+}
+
+//*****************************************************************************
+// static void sig_handlers
+// initialise signal handlers
+//
+static void sig_handlers(void)
+{
+#ifndef _MSC_VER /* _WIN32 */
+ //POSIX SIGNALS
+ struct sigaction action;
+
+ /* SIGINT, SIGQUIT, SIGTERM, SIGHUP */
+ action.sa_handler = sigexit;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ if (sigaction(SIGINT, &action, NULL) != 0)
+ reportError("Cannot install SIGINT handler\n");
+ if (sigaction(SIGQUIT, &action, NULL) != 0)
+ reportError("Cannot install SIGQUIT handler\n");
+ if (sigaction(SIGTERM, &action, NULL) != 0)
+ reportError("Cannot install SIGTERM handler\n");
+ if (sigaction(SIGHUP, &action, NULL) != 0)
+ reportError("Cannot install SIGHUP handler\n");
+#else
+ //ANSI C signal handling - Limited to supported Windows signals.
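+ // (Windows has no SIGQUIT or SIGHUP, so only SIGINT and SIGTERM
+ // are hooked here.)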
+ signal(SIGINT, sigexit);
+ signal(SIGTERM, sigexit);
+#endif //_MSC_VER /* _WIN32 */
+}
+
+
+
+static void sig_phandler(void)
+{
+#ifndef _MSC_VER /* _WIN32 */
+ struct sigaction action;
+
+ sigemptyset(&action.sa_mask);
+ action.sa_handler = sigpipe;
+ action.sa_flags = SA_RESTART;
+ if (sigaction(SIGPIPE, &action, NULL) != 0)
+ reportError("Cannot install SIGPIPE handler\n");
+#endif //_MSC_VER /* _WIN32 */
+}
+
+
+//*****************************************************************************
+// static void win32_check_messages
+// Check WIN32 messages!
+//
+#ifdef _MSC_VER /* _WIN32 */
+static void win32_check_messages(void)
+{
+// NEAL - NEEDS FINISHING/TESTING
+#if 0
+ MSG msg = {0, 0, 0, 0};
+ int cDown = 0;
+ int controlDown = 0;
+
+ if( GetMessage(&msg, 0, 0, 0) )
+ {
+
+ switch(msg.message)
+ {
+ case WM_KEYDOWN:
+ {
+ if(LOWORD(msg.message)== 17)
+ controlDown = 1;
+ else if(LOWORD(msg.message) == 67)
+ {
+ cDown = 1;
+ }
+ }
+ break;
+ case WM_KEYUP:
+ {
+ if(LOWORD(msg.message) == 17)
+ controlDown = 0;
+ else if(LOWORD(msg.message) == 67)
+ cDown = 0;
+ }
+ break;
+ }
+ }
+
+ DispatchMessage(&msg);
+#endif
+}
+#endif //_MSC_VER /* _WIN32 */
+
+
+//*****************************************************************************
+// void Retriever::Start()
+// This is the main loop of the retriever. We will go through the
+// list of paths stored for each server. While parsing the
+// retrieved documents, new paths will be added to the servers. We
+// return if no more paths need to be retrieved.
+//
+void Retriever::Start()
+{
+ //
+ // Main digger loop. The todo list should initially have the start
+ // URL and all the URLs which were seen in a previous dig. The
+ // loop will continue as long as there are more URLs to visit.
+ //
+ int more = 1;
+ Server *server;
+ URLRef *ref;
+
+ HtConfiguration *config = HtConfiguration::config();
+
+ //
+ // Always install the signal handlers. The delay bothers me,
+ // but a bad db is worse.
+ //
+ if (Retriever_noLog != log)
+ {
+ sig_handlers();
+ }
+ sig_phandler();
+ noSignal = 1;
+
+
+///////
+ // Main loop. We keep on retrieving until a signal is received
+ // or all the servers' queues are empty.
+///////
+
+#ifdef _MSC_VER /* _WIN32 */
+ win32_check_messages();
+#endif
+
+ while (more && noSignal)
+ {
+ more = 0;
+
+ //
+ // Go through all the current servers in sequence.
+ // If they support persistent connections, we keep on popping
+ // from the same server queue until it's empty or we reach a maximum
+ // number of consecutive requests ("max_connection_requests").
+ // Or the loop may continue indefinitely,
+ // if we set "max_connection_requests" to -1.
+ // If the server doesn't support persistent connections, we take
+ // only one URL from it, then we skip to the next server.
+ //
+ // Since 15.05.02: even when persistent connections are activated
+ // we should wait for a 'server_wait_time' number of seconds
+ // after the 'max_connection_requests' value has been reached.
+ //
+
+ // Let's position at the beginning
+ servers.Start_Get();
+
+ int count;
+
+ // Maximum number of repeated requests with the same
+ // TCP connection (so on the same Server:Port).
+
+ int max_connection_requests;
+
+#ifdef _MSC_VER /* _WIN32 */
+ win32_check_messages();
+#endif
+
+ while ((server = (Server *) servers.Get_NextElement()) && noSignal)
+ {
+ if (debug > 1)
+ cout << "pick: " << server->host() << ", # servers = " << servers.Count() << endl;
+
+ // We already know if a server supports HTTP pers.
connections, + // because we asked it for the robots.txt file (constructor of + // the class). + + // If the Server doesn't support persistent connections + // we turn it down to 1. + + if (server->IsPersistentConnectionAllowed()) + { + + // Let's check for a '0' value (out of range) + // If set, we change it to 1. + + if (config->Value("server", server->host(), "max_connection_requests") == 0) + max_connection_requests = 1; + else + max_connection_requests = + config->Value("server", server->host(), "max_connection_requests"); + + if (debug > 2) + { + + cout << "> " << server->host() << " supports HTTP persistent connections"; + + if (max_connection_requests == -1) + cout << " (" << "infinite" << ")" << endl; + else + cout << " (" << max_connection_requests << ")" << endl; + + } + + } + else + { + + // No HTTP persistent connections. So we request only 1 document. + + max_connection_requests = 1; + + if (debug > 2) + cout << "> " << server->host() << " with a traditional HTTP connection" << endl; + + } + + + count = 0; + +#ifdef _MSC_VER /* _WIN32 */ + win32_check_messages(); +#endif + + + while (((max_connection_requests == -1) || + (count < max_connection_requests)) && (ref = server->pop()) && noSignal) + { + count++; + + // + // We have a URL to index, now. We need to register the + // fact that we are not done yet by setting the 'more' + // variable. So, we have to restart scanning the queue. + // + + more = 1; + + // + // Deal with the actual URL. + // We'll check with the server to see if we need to sleep() + // before parsing it. + // + + parse_url(*ref); + delete ref; + + // We reached the maximum number of connections (either with + // or without persistent connections) and we must pause and + // respect the 'net ethic'. + if ((max_connection_requests - count) == 0) + server->delay(); // This will pause if needed + // and reset the time + +#ifdef _MSC_VER /* _WIN32 */ + win32_check_messages(); +#endif + + } + +#ifdef _MSC_VER /* _WIN32 */ + win32_check_messages(); +#endif + + } + } + +#ifdef _MSC_VER /* _WIN32 */ + win32_check_messages(); +#endif + + + // if we exited on signal + if (Retriever_noLog != log && !noSignal) + { + FILE *urls_parsed; + String filelog = config->Find("url_log"); + // save url seen but not fetched + urls_parsed = fopen((char *) filelog, "w"); + if (0 == urls_parsed) + { + reportError(form("Unable to create URL log file '%s'", filelog.get())); + } + else + { + servers.Start_Get(); + while ((server = (Server *) servers.Get_NextElement())) + { + while (NULL != (ref = server->pop())) + { + fprintf(urls_parsed, "%s\n", (const char *) ref->GetURL().get()); + delete ref; + } + } + fclose(urls_parsed); + } + } + words.Close(); +} + + +//***************************************************************************** +// void Retriever::parse_url(URLRef &urlRef) +// +void Retriever::parse_url(URLRef & urlRef) +{ + HtConfiguration *config = HtConfiguration::config(); + URL url; + DocumentRef *ref; + int old_document; + time_t date; + static int index = 0; + static int local_urls_only = config->Boolean("local_urls_only"); + static int mark_dead_servers = config->Boolean("ignore_dead_servers"); + Server *server; + + url.parse(urlRef.GetURL().get()); + + currenthopcount = urlRef.GetHopCount(); + + ref = docs[url.get()]; // It might be nice to have just an Exists() here + if (ref) + { + // + // We already have an entry for this document in our database. + // This means we can get the document ID and last modification + // time from there. 
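+ // (Editor's aside: the branch below is a classic look-up-or-create
+ // step over the document database. A sketch of the same idea with
+ // standard containers, using a hypothetical DocRecord in place of
+ // htdig's DocumentRef; not part of the original sources:)
+#if 0
+#include <map>
+#include <string>
+
+struct DocRecord { int id; int backlinks; };
+
+static std::map<std::string, DocRecord> registry;
+static int next_id = 1;
+
+// Return the existing record for `url`, or create one with a fresh ID.
+static DocRecord& lookup_or_create(const std::string& url)
+{
+    std::map<std::string, DocRecord>::iterator it = registry.find(url);
+    if (it != registry.end())
+    {
+        it->second.backlinks++;      // seen again: one more incoming link
+        return it->second;
+    }
+    DocRecord rec = { next_id++, 1 }; // first sighting: new ID, one backlink
+    return registry.insert(std::make_pair(url, rec)).first->second;
+}
+#endif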
+ // + current_id = ref->DocID(); + date = ref->DocTime(); + if (ref->DocAccessed()) + old_document = 1; + else // we haven't retrieved it yet, so we only have the first link + old_document = 0; + ref->DocBackLinks(ref->DocBackLinks() + 1); // we had a new link + ref->DocAccessed(time(0)); + ref->DocState(Reference_normal); + currenthopcount = ref->DocHopCount(); + } + else + { + // + // Never seen this document before. We need to create an + // entry for it. This implies that it gets a new document ID. + // + date = 0; + current_id = docs.NextDocID(); + ref = new DocumentRef; + ref->DocID(current_id); + ref->DocURL(url.get()); + ref->DocState(Reference_normal); + ref->DocAccessed(time(0)); + ref->DocHopCount(currenthopcount); + ref->DocBackLinks(1); // We had to have a link to get here! + old_document = 0; + } + + word_context.DocID(ref->DocID()); + + if (debug > 0) + { + // + // Display progress + // + cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << url.get() << ": "; + cout.flush(); + } + + // Reset the document to clean out any old data + doc->Reset(); + doc->Url(url.get()); + doc->Referer(urlRef.GetReferer().get()); + + base = doc->Url(); + + // Retrieve document, first trying local file access if possible. + Transport::DocStatus status; + server = (Server *) servers[url.signature()]; + StringList *local_filenames = GetLocal(url.get()); + if (local_filenames) + { + if (debug > 1) + cout << "Trying local files" << endl; + status = doc->RetrieveLocal(date, local_filenames); + if (status == Transport::Document_not_local) + { + if (debug > 1) + cout << "Local retrieval failed, trying HTTP" << endl; + if (server && !server->IsDead() && !local_urls_only) + status = doc->Retrieve(server, date); + else + status = Transport::Document_no_host; + } + delete local_filenames; + } + else if (server && !server->IsDead() && !local_urls_only) + status = doc->Retrieve(server, date); + else + status = Transport::Document_no_host; + + current_ref = ref; + + // + // Determine what to do by looking at the status code returned by + // the Document retrieval process. + // + + String shash; + String sx; + char bhash[16]; + time_t ddate; + + switch (status) + { + + case Transport::Document_ok: + trackWords = 1; + + if (check_unique_md5) + { + if (doc->StoredLength() > 0) + { + if (check_unique_date) + { + ddate = doc->ModTime(); + if (ddate < time(NULL) - 10) + { // Unknown date was set to current time + md5(bhash, doc->Contents(), doc->StoredLength(), &ddate, debug); + } + else + { + md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug); + } + } + else + md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug); + + shash.append(bhash, MD5_LENGTH); + d_md5->Get(shash, sx); + + if (!sx.empty()) + { + if (debug > 1) + { + cout << " Detected duplicate by md5 hash" << endl; + } + words.Skip(); + break; // Duplicate - don't index + } + else + { + d_md5->Put(shash, "x"); + } + + } + } + + if (old_document) + { + if (doc->ModTime() == ref->DocTime()) + { + words.Skip(); + if (debug) + cout << " retrieved but not changed" << endl; + words.Skip(); + break; + } + // + // Since we already had a record of this document and + // we were able to retrieve it, it must have changed + // since the last time we scanned it. This means that + // we need to assign a new document ID to it and mark + // the old one as obsolete. 
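+ // (Editor's aside on the duplicate detection a few lines up: the
+ // crawler keys a small database by a digest of the page body and skips
+ // any body whose digest is already stored. An illustrative sketch of
+ // the idea only, with std::hash standing in for the real MD5 digest;
+ // std::hash is not cryptographic and collides far more readily, so it
+ // merely demonstrates the shape of the check:)
+#if 0
+#include <functional>
+#include <set>
+#include <string>
+
+static std::set<size_t> seen_digests;
+
+// True if an identical body has been indexed before.
+static bool is_duplicate_body(const std::string& body)
+{
+    size_t digest = std::hash<std::string>()(body);
+    return !seen_digests.insert(digest).second; // insert fails => seen
+}
+#endif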
+ // + words.Skip(); + int backlinks = ref->DocBackLinks(); + ref->DocState(Reference_obsolete); + docs.Add(*ref); + delete ref; + + current_id = docs.NextDocID(); + word_context.DocID(current_id); + ref = new DocumentRef; + ref->DocID(current_id); + ref->DocURL(url.get()); + ref->DocState(Reference_normal); + ref->DocAccessed(time(0)); + ref->DocHopCount(currenthopcount); + ref->DocBackLinks(backlinks); + if (debug) + cout << " (changed) "; + } + RetrievedDocument(*doc, url.get(), ref); + // Hey! If this document is marked noindex, don't even bother + // adding new words. Mark this as gone and get rid of it! + if (ref->DocState() == Reference_noindex) + { + if (debug > 1) + cout << " ( " << ref->DocURL() << " ignored)"; + words.Skip(); + } + else + words.Flush(); + if (debug) + cout << " size = " << doc->Length() << endl; + + if (urls_seen) + { + fprintf(urls_seen, "%s|%d|%s|%d|%d|1\n", + (const char *) url.get(), doc->Length(), doc->ContentType(), + (int) doc->ModTime(), currenthopcount); + } + break; + + case Transport::Document_not_changed: + if (debug) + cout << " not changed" << endl; + words.Skip(); + break; + + case Transport::Document_not_found: + ref->DocState(Reference_not_found); + if (debug) + cout << " not found" << endl; + recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_not_found); + words.Skip(); + break; + + case Transport::Document_no_host: + ref->DocState(Reference_not_found); + if (debug) + cout << " host not found" << endl; + recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_host); + words.Skip(); + + // Mark the server as being down + if (server && mark_dead_servers) + server->IsDead(1); + break; + + case Transport::Document_no_port: + ref->DocState(Reference_not_found); + if (debug) + cout << " host not found (port)" << endl; + recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_port); + words.Skip(); + + // Mark the server as being down + if (server && mark_dead_servers) + server->IsDead(1); + break; + + case Transport::Document_not_parsable: + ref->DocState(Reference_noindex); + if (debug) + cout << " not Parsable" << endl; + words.Skip(); + break; + + case Transport::Document_redirect: + if (debug) + cout << " redirect" << endl; + ref->DocState(Reference_obsolete); + words.Skip(); + got_redirect(doc->Redirected(), ref, (urlRef.GetReferer()).get()); + break; + + case Transport::Document_not_authorized: + ref->DocState(Reference_not_found); + if (debug) + cout << " not authorized" << endl; + words.Skip(); + break; + + case Transport::Document_not_local: + ref->DocState(Reference_not_found); + if (debug) + cout << " not local" << endl; + words.Skip(); + break; + + case Transport::Document_no_header: + ref->DocState(Reference_not_found); + if (debug) + cout << " no header" << endl; + words.Skip(); + break; + + case Transport::Document_connection_down: + ref->DocState(Reference_not_found); + if (debug) + cout << " connection down" << endl; + words.Skip(); + break; + + case Transport::Document_no_connection: + ref->DocState(Reference_not_found); + if (debug) + cout << " no connection" << endl; + words.Skip(); + break; + + case Transport::Document_not_recognized_service: + ref->DocState(Reference_not_found); + if (debug) + cout << " service not recognized" << endl; + + // Mark the server as being down + if (server && mark_dead_servers) + server->IsDead(1); + words.Skip(); + break; + + case Transport::Document_other_error: + ref->DocState(Reference_not_found); + if (debug) + cout << " other error" 
<< endl; + words.Skip(); + break; + } + docs.Add(*ref); + delete ref; +} + + +//***************************************************************************** +// void Retriever::RetrievedDocument(Document &doc, const String &url, DocumentRef *ref) +// We found a document that needs to be parsed. Since we don't know the +// document type, we'll let the Document itself return an appropriate +// Parsable object which we can call upon to parse the document contents. +// +void Retriever::RetrievedDocument(Document & doc, const String & url, DocumentRef * ref) +{ + n_links = 0; + current_ref = ref; + current_title = 0; + word_context.Anchor(0); + current_time = 0; + current_head = 0; + current_meta_dsc = 0; + + // + // Create a parser object and let it have a go at the document. + // We will pass ourselves as a callback object for all the got_*() + // routines. + // This will generate the Parsable object as a specific parser + // + Parsable *parsable = doc.getParsable(); + if (parsable) + parsable->parse(*this, *base); + else + { // If we didn't get a parser, then we should get rid of this! + ref->DocState(Reference_noindex); + return; + } + + // If just storing the first occurrence of each word in a document, + // we must now flush the words we saw in that document + if (no_store_phrases) + { + DictionaryCursor cursor; + char *key; + HtWordReference wordRef; + for (words_to_add.Start_Get (cursor); + (key = words_to_add.Get_Next(cursor)); ) + { + word_entry *entry = (word_entry*) (words_to_add [key]); + + wordRef.Location(entry->location); + wordRef.Flags(entry->flags); + wordRef.Word(key); + words.Replace(WordReference::Merge(wordRef, entry->context)); + // How do I clean up properly? + delete entry; + } + words_to_add.Release (); + } + + // + // We don't need to dispose of the parsable object since it will + // automatically be reused. + // + + // + // Update the document reference + // + ref->DocHead((char *) current_head); + ref->DocMetaDsc((char *) current_meta_dsc); + if (current_time == 0) + ref->DocTime(doc.ModTime()); + else + ref->DocTime(current_time); + ref->DocTitle((char *) current_title); + ref->DocSize(doc.Length()); + ref->DocAccessed(time(0)); + ref->DocLinks(n_links); +} + + +//***************************************************************************** +// int Retriever::Need2Get(const String &u) +// Return TRUE if we need to retrieve the given url. This will +// check the list of urls we have already visited. +// +int Retriever::Need2Get(const String & u) +{ + static String url; + url = u; + + return !visited.Exists(url); +} + + + +//***************************************************************************** +// int Retriever::IsValidURL(const String &u) +// Return TRUE if we need to retrieve the given url. We will check +// for limits here. 
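+// (Editor's aside: the extension screening in this routine lowercases
+// each configured extension and the URL's final ".xyz" suffix before
+// comparing, so bad_extensions/valid_extensions are effectively
+// case-insensitive. A compact sketch of that one check, assuming the
+// list stores extensions with their leading dot, e.g. ".gif"; names are
+// hypothetical:)
+#if 0
+#include <algorithm>
+#include <cctype>
+#include <set>
+#include <string>
+
+static std::string lower(std::string s)
+{
+    std::transform(s.begin(), s.end(), s.begin(),
+                   [](unsigned char c) { return std::tolower(c); });
+    return s;
+}
+
+// Reject when the final path component's extension is on the bad list.
+static bool has_bad_extension(const std::string& path,
+                              const std::set<std::string>& bad)
+{
+    std::string::size_type dot = path.rfind('.');
+    if (dot == std::string::npos || path.find('/', dot) != std::string::npos)
+        return false;               // no dot in the final component
+    return bad.count(lower(path.substr(dot))) != 0;
+}
+#endif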
+// +int Retriever::IsValidURL(const String & u) +{ + HtConfiguration *config = HtConfiguration::config(); + Dictionary invalids; + Dictionary valids; + URL aUrl(u); + StringList tmpList; + + // A list of bad extensions, separated by spaces or tabs + String t = config->Find(&aUrl, "bad_extensions"); + String lowerp; + char *p = strtok(t, " \t"); + while (p) + { + // Extensions are case insensitive + lowerp = p; + lowerp.lowercase(); + invalids.Add(lowerp, 0); + p = strtok(0, " \t"); + } + + // + // Valid extensions are performed similarly + // + // A list of valid extensions, separated by spaces or tabs + + t = config->Find(&aUrl, "valid_extensions"); + p = strtok(t, " \t"); + while (p) + { + // Extensions are case insensitive + lowerp = p; + lowerp.lowercase(); + valids.Add(lowerp, 0); + p = strtok(0, " \t"); + } + + static String url; + url = u; + + // + // If the URL contains any of the patterns in the exclude list, + // mark it as invalid + // + String exclude_urls = config->Find(&aUrl, "exclude_urls"); + static String *prevexcludes = 0; + static HtRegexList *excludes = 0; + if (!excludes || !prevexcludes || prevexcludes->compare(exclude_urls) != 0) + { + if (!excludes) + excludes = new HtRegexList; + if (prevexcludes) + delete prevexcludes; + prevexcludes = new String(exclude_urls); + tmpList.Create(exclude_urls, " \t"); + excludes->setEscaped(tmpList, config->Boolean("case_sensitive")); + tmpList.Destroy(); + } + if (excludes->match(url, 0, 0) != 0) + { + if (debug > 2) + cout << endl << " Rejected: item in exclude list "; + return (HTDIG_ERROR_TESTURL_EXCLUDE); + } + + // + // If the URL has a query string and it is in the bad query list + // mark it as invalid + // + String bad_querystr = config->Find(&aUrl, "bad_querystr"); + static String *prevbadquerystr = 0; + static HtRegexList *badquerystr = 0; + if (!badquerystr || !prevbadquerystr || prevbadquerystr->compare(bad_querystr) != 0) + { + if (!badquerystr) + badquerystr = new HtRegexList; + if (prevbadquerystr) + delete prevbadquerystr; + prevbadquerystr = new String(bad_querystr); + tmpList.Create(bad_querystr, " \t"); + badquerystr->setEscaped(tmpList, config->Boolean("case_sensitive")); + tmpList.Destroy(); + } + char *ext = strrchr((char *) url, '?'); + if (ext && badquerystr->match(ext, 0, 0) != 0) + { + if (debug > 2) + cout << endl << " Rejected: item in bad query list "; + return (HTDIG_ERROR_TESTURL_BADQUERY); + } + + // + // See if the file extension is in the list of invalid ones + // + String urlpath = url.get(); + int parm = urlpath.indexOf('?'); // chop off URL parameter + if (parm >= 0) + urlpath.chop(urlpath.length() - parm); + ext = strrchr((char *) urlpath.get(), '.'); + String lowerext; + if (ext && strchr(ext, '/')) // Ignore a dot if it's not in the + ext = NULL; // final component of the path. + if (ext) + { + lowerext.set(ext); + lowerext.lowercase(); + if (invalids.Exists(lowerext)) + { + if (debug > 2) + cout << endl << " Rejected: Extension is invalid!"; + return (HTDIG_ERROR_TESTURL_EXTENSION); + } + } + // + // Or NOT in the list of valid ones + // + if (ext && valids.Count() > 0 && !valids.Exists(lowerext)) + { + if (debug > 2) + cout << endl << " Rejected: Extension is not valid!"; + return (HTDIG_ERROR_TESTURL_EXTENSION2); + } + + // + // If none of the limits is met, we disallow the URL + // + if (limits.match(url, 1, 0) == 0) + { + if (debug > 1) + cout << endl << " Rejected: URL not in the limits! 
"; + return (HTDIG_ERROR_TESTURL_LIMITS); + } + // + // Likewise if not in list of normalized urls + // + // Warning! + // should be last in checks because of aUrl normalization + // + // signature() implicitly normalizes the URL. Be efficient... + Server *server = (Server *) servers[aUrl.signature()]; +// aUrl.normalize(); + if (limitsn.match(aUrl.get(), 1, 0) == 0) + { + if (debug > 2) + cout << endl << " Rejected: not in \"limit_normalized\" list!"; + return (HTDIG_ERROR_TESTURL_LIMITSNORM); + } + + // + // After that gauntlet, check to see if the server allows it + // (robots.txt) + // + if (server && server->IsDisallowed(url) != 0) + { + if (debug > 2) + cout << endl << " Rejected: forbidden by server robots.txt!"; + return (HTDIG_ERROR_TESTURL_ROBOT_FORBID); + } + + return (1); +} + + +//***************************************************************************** +// StringList* Retriever::GetLocal(const String &url) +// Returns a list of strings containing the (possible) local filenames +// of the given url, or 0 if it's definitely not local. +// THE CALLER MUST FREE THE STRINGLIST AFTER USE! +// Returned strings are not hex encoded. +// +StringList *Retriever::GetLocal(const String & strurl) +{ + HtConfiguration *config = HtConfiguration::config(); + static StringList *prefixes = 0; + String url = strurl; + + static StringList *paths = 0; + StringList *defaultdocs = 0; + URL aUrl(url); + url = aUrl.get(); // make sure we look at a parsed URL + + // + // Initialize prefix/path list if this is the first time. + // The list is given in format "prefix1=path1 prefix2=path2 ..." + // + if (!prefixes) + { + prefixes = new StringList(); + paths = new StringList(); + + String t = config->Find("local_urls"); + char *p = strtok(t, " \t"); + while (p) + { + char *path = strchr(p, '='); + if (!path) + { + p = strtok(0, " \t"); + continue; + } + *path++ = '\0'; + String *pre = new String(p); + decodeURL(*pre); + prefixes->Add(pre); + String *pat = new String(path); + decodeURL(*pat); + paths->Add(pat); + p = strtok(0, " \t"); + } + } + if (!config->Find(&aUrl, "local_default_doc").empty()) + { + defaultdocs = new StringList(); + String t = config->Find(&aUrl, "local_default_doc"); + char *p = strtok(t, " \t"); + while (p) + { + String *def = new String(p); + decodeURL(*def); + defaultdocs->Add(def); + p = strtok(0, " \t"); + } + if (defaultdocs->Count() == 0) + { + delete defaultdocs; + defaultdocs = 0; + } + } + + // Begin by hex-decoding URL... + String hexurl = url; + decodeURL(hexurl); + url = hexurl.get(); + + // Check first for local user... + if (strchr(url.get(), '~')) + { + StringList *local = GetLocalUser(url, defaultdocs); + if (local) + { + if (defaultdocs) + delete defaultdocs; + return local; + } + } + + // This shouldn't happen, but check anyway... 
+ if (strstr(url.get(), "..")) + return 0; + + String *prefix, *path; + String *defaultdoc; + StringList *local_names = new StringList(); + prefixes->Start_Get(); + paths->Start_Get(); + while ((prefix = (String *) prefixes->Get_Next())) + { + path = (String *) paths->Get_Next(); + if (mystrncasecmp((char *) *prefix, (char *) url, prefix->length()) == 0) + { + int l = strlen(url.get()) - prefix->length() + path->length() + 4; + String *local = new String(*path, l); + *local += &url[prefix->length()]; + if (local->last() == '/' && defaultdocs) + { + defaultdocs->Start_Get(); + while ((defaultdoc = (String *) defaultdocs->Get_Next())) + { + String *localdefault = + new String(*local, local->length() + defaultdoc->length() + 1); + localdefault->append(*defaultdoc); + local_names->Add(localdefault); + } + delete local; + } + else + local_names->Add(local); + } + } + if (local_names->Count() > 0) + { + if (defaultdocs) + delete defaultdocs; + return local_names; + } + + if (defaultdocs) + delete defaultdocs; + delete local_names; + return 0; +} + + +//***************************************************************************** +// StringList* Retriever::GetLocalUser(const String &url, StringList *defaultdocs) +// If the URL has ~user part, return a list of strings containing the +// (possible) local filenames of the given url, or 0 if it's +// definitely not local. +// THE CALLER MUST FREE THE STRINGLIST AFTER USE! +// +StringList *Retriever::GetLocalUser(const String & url, StringList * defaultdocs) +{ +// NOTE: Native Windows does not have this contruct for the user Web files +#ifndef _MSC_VER /* _WIN32 */ + HtConfiguration *config = HtConfiguration::config(); + static StringList *prefixes = 0, *paths = 0, *dirs = 0; + static Dictionary home_cache; + URL aUrl(url); + + // + // Initialize prefix/path list if this is the first time. + // The list is given in format "prefix1=path1,dir1 ..." + // If path is zero-length, user's home directory is looked up. + // + if (!prefixes) + { + prefixes = new StringList(); + paths = new StringList(); + dirs = new StringList(); + String t = config->Find("local_user_urls"); + char *p = strtok(t, " \t"); + while (p) + { + char *path = strchr(p, '='); + if (!path) + { + p = strtok(0, " \t"); + continue; + } + *path++ = '\0'; + char *dir = strchr(path, ','); + if (!dir) + { + p = strtok(0, " \t"); + continue; + } + *dir++ = '\0'; + String *pre = new String(p); + decodeURL(*pre); + prefixes->Add(pre); + String *pat = new String(path); + decodeURL(*pat); + paths->Add(pat); + String *ptd = new String(dir); + decodeURL(*ptd); + dirs->Add(ptd); + p = strtok(0, " \t"); + } + } + + // Can we do anything about this? 
+ if (!strchr(url, '~') || !prefixes->Count() || strstr(url, "..")) + return 0; + + // Split the URL to components + String tmp = url; + char *name = strchr((char *) tmp, '~'); + *name++ = '\0'; + char *rest = strchr(name, '/'); + if (!rest || (rest - name <= 1) || (rest - name > 32)) + return 0; + *rest++ = '\0'; + + // Look it up in the prefix/path/dir table + prefixes->Start_Get(); + paths->Start_Get(); + dirs->Start_Get(); + String *prefix, *path, *dir; + String *defaultdoc; + StringList *local_names = new StringList(); + while ((prefix = (String *) prefixes->Get_Next())) + { + path = (String *) paths->Get_Next(); + dir = (String *) dirs->Get_Next(); + if (mystrcasecmp((char *) *prefix, (char *) tmp) != 0) + continue; + + String *local = new String; + // No path, look up home directory + if (path->length() == 0) + { + String *home = (String *) home_cache[name]; + if (!home) + { + struct passwd *passwd = getpwnam(name); + if (passwd) + { + home = new String(passwd->pw_dir); + home_cache.Add(name, home); + } + } + if (home) + *local += *home; + else + continue; + } + else + { + *local += *path; + *local += name; + } + *local += *dir; + *local += rest; + if (local->last() == '/' && defaultdocs) + { + defaultdocs->Start_Get(); + while ((defaultdoc = (String *) defaultdocs->Get_Next())) + { + String *localdefault = new String(*local, local->length() + defaultdoc->length() + 1); + localdefault->append(*defaultdoc); + local_names->Add(localdefault); + } + delete local; + } + else + local_names->Add(local); + } + + if (local_names->Count() > 0) + return local_names; + + delete local_names; +#endif //_MSC_VER /* _WIN32 */ + + return 0; +} + + +//***************************************************************************** +// int Retriever::IsLocalURL(const String &url) +// Returns 1 if the given url has a (possible) local filename +// or 0 if it's definitely not local. +// +int Retriever::IsLocalURL(const String & url) +{ + int ret; + + StringList *local_filename = GetLocal(url); + ret = (local_filename != 0); + if (local_filename) + delete local_filename; + + return ret; +} + + +//***************************************************************************** +// void Retriever::got_word(char *word, int location, int heading) +// The location is normalized to be in the range 0 - 1000. +// +void Retriever::got_word(const char *word, int location, int heading) +{ + if (debug > 3) + cout << "word: " << word << '@' << location << endl; + if (heading >= (int) (sizeof(factor) / sizeof(factor[0])) || heading < 0) + heading = 0; // Assume it's just normal text + if (trackWords && strlen(word) >= (unsigned int) minimumWordLength) + { + String w = word; + HtWordReference wordRef; + + if (no_store_phrases) + { + // Add new word, or mark existing word as also being at + // this heading level + word_entry *entry; + if ((entry = (word_entry*)words_to_add.Find (w)) == NULL) + { + words_to_add.Add(w, new word_entry (location, factor[heading], word_context)); + } else + { + entry->flags |= factor[heading]; + } + } else + { + wordRef.Location(location); + wordRef.Flags(factor[heading]); + wordRef.Word(w); + words.Replace(WordReference::Merge(wordRef, word_context)); + } + + // Check for compound words... 
+ String parts = word; + int added; + int nparts = 1; + do + { + added = 0; + char *start = parts.get(); + char *punctp = 0, *nextp = 0, *p; + char punct; + int n; + while (*start) + { + p = start; + for (n = 0; n < nparts; n++) + { + while (HtIsStrictWordChar((unsigned char) *p)) + p++; + punctp = p; + if (!*punctp && n + 1 < nparts) + break; + while (*p && !HtIsStrictWordChar((unsigned char) *p)) + p++; + if (n == 0) + nextp = p; + } + if (n < nparts) + break; + punct = *punctp; + *punctp = '\0'; + if (*start && (*p || start > parts.get())) + { + w = start; + HtStripPunctuation(w); + if (w.length() >= minimumWordLength) + { + if (no_store_phrases) + { + // Add new word, or mark existing word as also being at + // this heading level + word_entry *entry; + if ((entry = (word_entry*)words_to_add.Find (w)) == NULL) + { + words_to_add.Add(w, new word_entry (location, factor[heading], word_context)); + } else + { + entry->flags |= factor[heading]; + } + } else + { + wordRef.Word(w); + words.Replace(WordReference::Merge(wordRef, word_context)); + } + if (debug > 3) + cout << "word part: " << start << '@' << location << endl; + } + added++; + } + start = nextp; + *punctp = punct; + } + nparts++; + } + while (added > 2); + } +} + + +//***************************************************************************** +// void Retriever::got_title(const char *title) +// +void Retriever::got_title(const char *title) +{ + if (debug > 1) + cout << "\ntitle: " << title << endl; + current_title = title; +} + + +//***************************************************************************** +// void Retriever::got_author(const char *e) +// +void Retriever::got_author(const char *author) +{ + if (debug > 1) + cout << "\nauthor: " << author << endl; + current_ref->DocAuthor(author); +} + + +//***************************************************************************** +// void Retriever::got_time(const char *time) +// +void Retriever::got_time(const char *time) +{ + HtDateTime new_time(current_time); + + if (debug > 1) + cout << "\ntime: " << time << endl; + + // + // As defined by the Dublin Core, this should be YYYY-MM-DD + // In the future, we'll need to deal with the scheme portion + // in case someone picks a different format. + // + new_time.SetFTime(time, "%Y-%m-%d"); + current_time = new_time.GetTime_t(); + + // If we can't convert it, current_time stays the same and we get + // the default--the date returned by the server... +} + +//***************************************************************************** +// void Retriever::got_anchor(const char *anchor) +// +void Retriever::got_anchor(const char *anchor) +{ + if (debug > 2) + cout << "anchor: " << anchor << endl; + current_ref->AddAnchor(anchor); + word_context.Anchor(word_context.Anchor() + 1); +} + + +//***************************************************************************** +// void Retriever::got_image(const char *src) +// +void Retriever::got_image(const char *src) +{ + URL url(src, *base); + const char *image = (const char *) url.get(); + + if (debug > 2) + cout << "image: " << image << endl; + + if (images_seen) + fprintf(images_seen, "%s\n", image); +} + + +//***************************************************************************** +// +void Retriever::got_href(URL & url, const char *description, int hops) +{ + DocumentRef *ref = 0; + Server *server = 0; + int valid_url_code = 0; + + // Rewrite the URL (if need be) before we do anything to it. 
+ url.rewrite(); + + if (debug > 2) + cout << "href: " << url.get() << " (" << description << ')' << endl; + + n_links++; + + if (urls_seen) + fprintf(urls_seen, "%s\n", (const char *) url.get()); + + // + // Check if this URL falls within the valid range of URLs. + // + valid_url_code = IsValidURL(url.get()); + if (valid_url_code > 0) + { + // + // It is valid. Normalize it (resolve cnames for the server) + // and check again... + // + if (debug > 2) + { + cout << "resolving '" << url.get() << "'\n"; + cout.flush(); + } + + url.normalize(); + + // If it is a backlink from the current document, + // just update that field. Writing to the database + // is meaningless, as it will be overwritten. + // Adding it as a new document may even be harmful, as + // that will be a duplicate. This can happen if the + // current document is never referenced before, as in a + // start_url. + + if (strcmp(url.get(), current_ref->DocURL()) == 0) + { + current_ref->DocBackLinks(current_ref->DocBackLinks() + 1); + current_ref->AddDescription(description, words); + } + else + { + + // + // First add it to the document database + // + ref = docs[url.get()]; + // if ref exists we have to call AddDescription even + // if max_hop_count is reached + if (!ref && currenthopcount + hops > max_hop_count) + return; + + if (!ref) + { + // + // Didn't see this one, yet. Create a new reference + // for it with a unique document ID + // + ref = new DocumentRef; + ref->DocID(docs.NextDocID()); + ref->DocHopCount(currenthopcount + hops); + ref->DocURL(url.get()); + } + ref->DocBackLinks(ref->DocBackLinks() + 1); // This one! + ref->AddDescription(description, words); + + // + // If the dig is restricting by hop count, perform the check here + // too + if (currenthopcount + hops > max_hop_count) + { + delete ref; + return; + } + + if (ref->DocHopCount() > currenthopcount + hops) + ref->DocHopCount(currenthopcount + hops); + + docs.Add(*ref); + + // + // Now put it in the list of URLs to still visit. + // + if (Need2Get(url.get())) + { + if (debug > 1) + cout << "\n pushing " << url.get() << endl; + server = (Server *) servers[url.signature()]; + if (!server) + { + // + // Hadn't seen this server, yet. Register it + // + String robotsURL = url.signature(); + robotsURL << "robots.txt"; + StringList *localRobotsFile = GetLocal(robotsURL.get()); + + server = new Server(url, localRobotsFile); + servers.Add(url.signature(), server); + delete localRobotsFile; + } + // + // Let's just be sure we're not pushing an empty URL + // + if (strlen(url.get())) + server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get())); + + String temp = url.get(); + visited.Add(temp, 0); + if (debug) + cout << '+'; + } + else if (debug) + cout << '*'; + delete ref; + } + } + else + { + // + // Not a valid URL + // + if (debug > 1) + cout << "\nurl rejected: (level 1)" << url.get() << endl; + if (debug == 1) + cout << '-'; + + if (urls_seen) + { + fprintf(urls_seen, "%s|||||%d\n", (const char *) url.get(), valid_url_code); + } + + } + if (debug) + cout.flush(); +} + + +//***************************************************************************** +// void Retriever::got_redirect(const char *new_url, DocumentRef *old_ref) +// +void Retriever::got_redirect(const char *new_url, DocumentRef * old_ref, const char *referer) +{ + // First we must piece together the new URL, which may be relative + URL parent(old_ref->DocURL()); + URL url(new_url, parent); + + // Rewrite the URL (if need be) before we do anything to it. 
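+ // (Editor's aside on got_href() above: it is the classic breadth-first
+ // frontier step: skip URLs already in `visited`, refuse anything past
+ // max_hop_count, otherwise queue the URL on its server and mark it
+ // visited. Bare-bones sketch with standard containers, names
+ // hypothetical:)
+#if 0
+#include <queue>
+#include <set>
+#include <string>
+#include <utility>
+
+struct Frontier
+{
+    std::set<std::string> visited;
+    std::queue<std::pair<std::string, int> > todo; // url, hopcount
+    int max_hops;
+
+    void push(const std::string& url, int hops)
+    {
+        if (hops > max_hops || !visited.insert(url).second)
+            return;                   // too deep, or seen before
+        todo.push(std::make_pair(url, hops));
+    }
+};
+#endif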
+ url.rewrite(); + + if (debug > 2) + cout << "redirect: " << url.get() << endl; + + n_links++; + + if (urls_seen) + fprintf(urls_seen, "%s\n", (const char *) url.get()); + + // + // Check if this URL falls within the valid range of URLs. + // + if (IsValidURL(url.get()) > 0) + { + // + // It is valid. Normalize it (resolve cnames for the server) + // and check again... + // + if (debug > 2) + { + cout << "resolving '" << url.get() << "'\n"; + cout.flush(); + } + + url.normalize(); + // + // First add it to the document database + // + DocumentRef *ref = docs[url.get()]; + if (!ref) + { + // + // Didn't see this one, yet. Create a new reference + // for it with a unique document ID + // + ref = new DocumentRef; + ref->DocID(docs.NextDocID()); + ref->DocHopCount(currenthopcount); + } + ref->DocURL(url.get()); + + // + // Copy the descriptions of the old DocRef to this one + // + List *d = old_ref->Descriptions(); + if (d) + { + d->Start_Get(); + String *str; + while ((str = (String *) d->Get_Next())) + { + ref->AddDescription(str->get(), words); + } + } + if (ref->DocHopCount() > old_ref->DocHopCount()) + ref->DocHopCount(old_ref->DocHopCount()); + + // Copy the number of backlinks + ref->DocBackLinks(old_ref->DocBackLinks()); + + docs.Add(*ref); + + // + // Now put it in the list of URLs to still visit. + // + if (Need2Get(url.get())) + { + if (debug > 1) + cout << " pushing " << url.get() << endl; + Server *server = (Server *) servers[url.signature()]; + if (!server) + { + // + // Hadn't seen this server, yet. Register it + // + String robotsURL = url.signature(); + robotsURL << "robots.txt"; + StringList *localRobotsFile = GetLocal(robotsURL.get()); + + server = new Server(url, localRobotsFile); + servers.Add(url.signature(), server); + delete localRobotsFile; + } + if (!referer || strlen(referer) == 0) + server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get()), 0); + else + server->push(url.get(), ref->DocHopCount(), referer, IsLocalURL(url.get()), 0); + + String temp = url.get(); + visited.Add(temp, 0); + } + + delete ref; + } +} + + +//***************************************************************************** +// void Retriever::got_head(const char *head) +// +void Retriever::got_head(const char *head) +{ + if (debug > 4) + cout << "head: " << head << endl; + current_head = head; +} + +//***************************************************************************** +// void Retriever::got_meta_dsc(const char *md) +// +void Retriever::got_meta_dsc(const char *md) +{ + if (debug > 4) + cout << "meta description: " << md << endl; + current_meta_dsc = md; +} + + +//***************************************************************************** +// void Retriever::got_meta_email(const char *e) +// +void Retriever::got_meta_email(const char *e) +{ + if (debug > 1) + cout << "\nmeta email: " << e << endl; + current_ref->DocEmail(e); +} + + +//***************************************************************************** +// void Retriever::got_meta_notification(const char *e) +// +void Retriever::got_meta_notification(const char *e) +{ + if (debug > 1) + cout << "\nmeta notification date: " << e << endl; + current_ref->DocNotification(e); +} + + +//***************************************************************************** +// void Retriever::got_meta_subject(const char *e) +// +void Retriever::got_meta_subject(const char *e) +{ + if (debug > 1) + cout << "\nmeta subect: " << e << endl; + current_ref->DocSubject(e); +} + + 
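+//*****************************************************************************
+// (Editor's aside: got_time() earlier in this file accepts Dublin Core
+// dates and parses them with the fixed pattern "%Y-%m-%d"; on failure the
+// document keeps the server-supplied date. A standalone sketch of that
+// conversion using POSIX strptime, interpreting the date in local time;
+// not part of the original sources:)
+#if 0
+#include <ctime>
+#include <time.h>   // strptime (POSIX)
+
+// Returns (time_t)-1 when `s` is not a YYYY-MM-DD date.
+static time_t parse_dc_date(const char *s)
+{
+    struct tm tm = {};
+    if (!strptime(s, "%Y-%m-%d", &tm))
+        return (time_t) -1;
+    return mktime(&tm);
+}
+#endif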
+//***************************************************************************** +// void Retriever::got_noindex() +// +void Retriever::got_noindex() +{ + if (debug > 1) + cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl; + current_ref->DocState(Reference_noindex); +} + + +//***************************************************************************** +// +void Retriever::recordNotFound(const String & url, const String & referer, int reason) +{ + char *message = ""; + + switch (reason) + { + case Transport::Document_not_found: + message = "Not found"; + break; + + case Transport::Document_no_host: + message = "Unknown host or unable to contact server"; + break; + + case Transport::Document_no_port: + message = "Unknown host or unable to contact server (port)"; + break; + + default: + break; + + } + + notFound << message << ": " << url << " Ref: " << referer << '\n'; +} + +//***************************************************************************** +// void Retriever::ReportStatistics(char *name) +// +void Retriever::ReportStatistics(const String & name) +{ + HtConfiguration *config = HtConfiguration::config(); + cout << name << ": Run complete\n"; + cout << name << ": " << servers.Count() << " server"; + if (servers.Count() > 1) + cout << "s"; + cout << " seen:\n"; + + Server *server; + String buffer; + StringList results; + String newname = name; + + newname << ": "; + + servers.Start_Get(); + while ((server = (Server *) servers.Get_NextElement())) + { + buffer = 0; + server->reportStatistics(buffer, newname); + results.Add(buffer); + } + results.Sort(); + + for (int i = 0; i < results.Count(); i++) + { + cout << results[i] << "\n"; + } + + if (notFound.length() > 0) + { + cout << "\n" << name << ": Errors to take note of:\n"; + cout << notFound; + } + + cout << endl; + + // Report HTTP connections stats + cout << "HTTP statistics" << endl; + cout << "===============" << endl; + + if (config->Boolean("persistent_connections")) + { + cout << " Persistent connections : Yes" << endl; + + if (config->Boolean("head_before_get")) + cout << " HEAD call before GET : Yes" << endl; + else + cout << " HEAD call before GET : No" << endl; + } + else + { + cout << " Persistent connections : No" << endl; + } + + HtHTTP::ShowStatistics(cout) << endl; + +} diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h new file mode 100644 index 00000000..b2fff24d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h @@ -0,0 +1,183 @@ +// +// Retriever.h +// +// Retriever: Crawl from a list of URLs and calls appropriate parsers. The +// parser notifies the Retriever object that it got something +// (got_* functions) and the Retriever object feed the databases +// and statistics accordingly. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Retriever.h,v 1.28 2004/05/28 13:15:15 lha Exp $ +// + +#ifndef _Retriever_h_ +#define _Retriever_h_ + +#include "DocumentRef.h" +#include "Dictionary.h" +#include "Queue.h" +#include "HtWordReference.h" +#include "List.h" +#include "StringList.h" +#include "DocumentDB.h" + +#define HTDIG_ERROR_TESTURL_EXCLUDE -109 +#define HTDIG_ERROR_TESTURL_BADQUERY -110 +#define HTDIG_ERROR_TESTURL_EXTENSION -111 +#define HTDIG_ERROR_TESTURL_EXTENSION2 -112 +#define HTDIG_ERROR_TESTURL_LIMITS -113 +#define HTDIG_ERROR_TESTURL_LIMITSNORM -114 +#define HTDIG_ERROR_TESTURL_SRCH_RESTRICT -115 +#define HTDIG_ERROR_TESTURL_SRCH_EXCLUDE -116 +#define HTDIG_ERROR_TESTURL_REWRITE_EMPTY -117 +#define HTDIG_ERROR_TESTURL_ROBOT_FORBID -118 + + +class URL; +class Document; +class URLRef; +class HtWordList; + +enum RetrieverLog { + Retriever_noLog, + Retriever_logUrl, + Retriever_Restart +}; + +struct word_entry : public Object +{ + word_entry (int loc, int fl, HtWordReference& ref) : + location (loc), flags (fl), context (ref) + {}; + int location; + int flags; + HtWordReference context; +}; + +class Retriever +{ +public: + // + // Construction/Destruction + // + Retriever(RetrieverLog flags = Retriever_noLog); + virtual ~Retriever(); + + // + // Getting it all started + // + void Initial(const String& url, int checked = 0); + void Initial(List &list , int checked = 0); + void Start(); + + // + // Report statistics about the parser + // + void ReportStatistics(const String& name); + + // + // These are the callbacks that we need to write code for + // + void got_word(const char *word, int location, int heading); + void got_href(URL &url, const char *description, int hops = 1); + void got_title(const char *title); + void got_author(const char *author); + void got_time(const char *time); + void got_head(const char *head); + void got_meta_dsc(const char *md); + void got_anchor(const char *anchor); + void got_image(const char *src); + void got_meta_email(const char *); + void got_meta_notification(const char *); + void got_meta_subject(const char *); + void got_noindex(); + + // + // Allow for the indexing of protected sites by using a + // username/password + // + void setUsernamePassword(const char *credentials); + + // + // Routines for dealing with local filesystem access + // + StringList * GetLocal(const String &strurl); + StringList * GetLocalUser(const String &url, StringList *defaultdocs); + int IsLocalURL(const String &url); + +private: + // + // A hash to keep track of what we've seen + // + Dictionary visited; + + URL *base; + String current_title; + String current_head; + String current_meta_dsc; + time_t current_time; + int current_id; + DocumentRef *current_ref; + int current_anchor_number; + int trackWords; + int n_links; + String credentials; + HtWordReference word_context; + HtWordList words; + + Dictionary words_to_add; + + int check_unique_md5; + int check_unique_date; + + + RetrieverLog log; + // + // These are weights for the words. The index is the heading level. + // + long int factor[12]; + int currenthopcount; + + // + // Some semi-constants... + // + int max_hop_count; + + // + // The list of server-specific information objects is indexed by + // ip address and port number. 
The list contains Server objects. + // + Dictionary servers; + + // + // For efficiency reasons, we will only use one document object which + // we reuse. + // + Document *doc; + + Database *d_md5; + + String notFound; + + // Some useful constants + int minimumWordLength; + + // + // Helper routines + // + int Need2Get(const String &url); + int IsValidURL(const String &url); + void RetrievedDocument(Document &, const String &url, DocumentRef *ref); + void parse_url(URLRef &urlRef); + void got_redirect(const char *, DocumentRef *, const char * = 0); + void recordNotFound(const String &url, const String &referer, int reason); +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Server.cc b/debian/htdig/htdig-3.2.0b6/htdig/Server.cc new file mode 100644 index 00000000..3afdebd3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Server.cc @@ -0,0 +1,435 @@ +// +// Server.cc +// +// Server: A class to keep track of server specific information. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Server.cc,v 1.29 2004/05/28 13:15:16 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "htdig.h" +#include "Server.h" +#include "good_strtok.h" +#include "htString.h" +#include "URL.h" +#include "Document.h" +#include "URLRef.h" +#include "Transport.h" +#include "HtHTTP.h" // for checking persistent connections +#include "StringList.h" + +#include <ctype.h> +#include "defaults.h" + + +//***************************************************************************** +// Server::Server(URL u, StringList *local_robots_files) +// u is the base URL for this server +// +Server::Server(URL u, StringList *local_robots_files) +: + _host(u.host()), + _port(u.port()), + _bad_server(0), + _documents(0), + _accept_language(0) +{ + HtConfiguration* config= HtConfiguration::config(); + if (debug) + cout << endl << "New server: " << _host << ", " << _port << endl; + + // We take it from the configuration + _persistent_connections = config->Boolean("server", _host.get(),"persistent_connections"); + _head_before_get = config->Boolean("server", _host.get(),"head_before_get"); + + _max_documents = config->Value("server",_host.get(),"server_max_docs"); + _connection_space = config->Value("server",_host.get(),"server_wait_time"); + _user_agent = config->Find("server", _host.get(), "user_agent"); + _disable_cookies = config->Boolean("server", _host.get(), "disable_cookies"); + + // Accept-Language directive + StringList _accept_language_list(config->Find("server", _host.get(), + "accept_language"), " \t"); + + _accept_language.trunc(); // maybe not needed + + for (int i = 0; i < _accept_language_list.Count(); i++) + { + if (i>0) + _accept_language << ","; // for multiple choices + + _accept_language << _accept_language_list[i]; + } + + // Timeout setting + _timeout = config->Value("server",_host.get(),"timeout"); + + // Number of consecutive attempts to establish a TCP connection + _tcp_max_retries = config->Value("server",_host.get(),"tcp_max_retries"); + + // Seconds to wait after a timeout occurs + _tcp_wait_time = config->Value("server",_host.get(),"tcp_wait_time"); + + + if (debug > 1) + { + cout << " - Persistent connections: " << + (_persistent_connections?"enabled":"disabled") << endl; + + cout << " - HEAD 
before GET: " << + (_head_before_get?"enabled":"disabled") << endl; + + cout << " - Timeout: " << _timeout << endl; + cout << " - Connection space: " << _connection_space << endl; + cout << " - Max Documents: " << _max_documents << endl; + cout << " - TCP retries: " << _tcp_max_retries << endl; + cout << " - TCP wait time: " << _tcp_wait_time << endl; + cout << " - Accept-Language: " << _accept_language << endl; + + } + + _last_connection.SettoNow(); // For getting robots.txt + + if (strcmp(u.service(),"http") == 0 || strcmp(u.service(),"https") == 0) + { + // + // Attempt to get a robots.txt file from the specified server + // + String url; + url.trunc(); + + if (debug>1) + cout << "Trying to retrieve robots.txt file" << endl; + url << u.signature() << "robots.txt"; + + static int local_urls_only = config->Boolean("local_urls_only"); + time_t timeZero = 0; // Right now we want to get this every time + Document doc(url, 0); + Transport::DocStatus status; + if (local_robots_files) + { + if (debug > 1) + cout << "Trying local files" << endl; + status = doc.RetrieveLocal(timeZero, local_robots_files); + if (status == Transport::Document_not_local) + { + if (local_urls_only) + status = Transport::Document_not_found; + else + { + if (debug > 1) + cout << "Local retrieval failed, trying HTTP" << endl; + status = doc.Retrieve(this, timeZero); + } + } + } + else if (!local_urls_only) + { + status = doc.Retrieve(this, timeZero); + + // Let's check if persistent connections are both + // allowed by the configuration and possible after + // having requested the robots.txt file. + + HtHTTP * http; + if (IsPersistentConnectionAllowed() && + ( http = doc.GetHTTPHandler())) + { + if (! http->isPersistentConnectionPossible()) + _persistent_connections=0; // not possible. Let's disable + // them on this server. + } + + } + else + status = Transport::Document_not_found; + + switch (status) + { + case Transport::Document_ok: + // + // Found a robots.txt file. Go parse it. + // + robotstxt(doc); + break; + + case Transport::Document_not_found: + case Transport::Document_not_parsable: + case Transport::Document_redirect: + case Transport::Document_not_authorized: + // + // These cases are for when there is no robots.txt file. + // We will just go on happily without restrictions + // + break; + + case Transport::Document_no_host: + default: + // + // In all other cases the server could not be reached. + // We will remember this fact so that no more attempts to + // contact this server will be made. 
+ //
+ _bad_server = 1;
+ break;
+ } // end switch
+ } // end if (http || https)
+}
+
+// Copy constructor
+Server::Server(const Server& rhs)
+:_host(rhs._host),
+_port(rhs._port),
+_bad_server(rhs._bad_server),
+_connection_space(rhs._connection_space),
+_last_connection(rhs._last_connection),
+_paths(rhs._paths),
+_disallow(rhs._disallow),
+_documents(rhs._documents),
+_max_documents(rhs._max_documents),
+_persistent_connections(rhs._persistent_connections),
+_head_before_get(rhs._head_before_get),
+_disable_cookies(rhs._disable_cookies),
+_timeout(rhs._timeout),
+_tcp_wait_time(rhs._tcp_wait_time),
+_tcp_max_retries(rhs._tcp_max_retries),
+_user_agent(rhs._user_agent),
+_accept_language(rhs._accept_language)
+{
+}
+
+
+//*****************************************************************************
+// Server::~Server()
+//
+Server::~Server()
+{
+}
+
+
+//*****************************************************************************
+// void Server::robotstxt(Document &doc)
+// This will parse the robots.txt file which is contained in the document.
+//
+void Server::robotstxt(Document &doc)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ String contents = doc.Contents();
+ int length;
+ int pay_attention = 0;
+ String pattern;
+ String myname = config->Find("server", _host.get(), "robotstxt_name");
+ int seen_myname = 0;
+ char *name, *rest;
+
+ if (debug > 1)
+     cout << "Parsing robots.txt file using myname = " << myname << "\n";
+
+ //
+ // Go through the lines in the file and determine if we need to
+ // pay attention to them
+ //
+ for (char *line = strtok(contents, "\r\n"); line; line = strtok(0, "\r\n"))
+ {
+     if (debug > 2)
+         cout << "Robots.txt line: " << line << endl;
+
+     //
+     // Strip comments
+     //
+     if (strchr(line, '#'))
+     {
+         *(strchr(line, '#')) = '\0';
+     }
+
+     name = good_strtok(line, ':');
+     if (!name)
+         continue;
+     while (name && isspace(*name)) name++;
+     rest = good_strtok(NULL, '\r');
+     if (!rest)
+         rest = "";
+
+     while (rest && isspace(*rest))
+         rest++;
+
+     length = strlen(rest);
+     if (length > 0)
+     {
+         while (length > 0 && isspace(rest[length - 1]))
+             length--;
+         rest[length] = '\0';
+     }
+
+     if (mystrcasecmp(name, "user-agent") == 0)
+     {
+         if (debug > 1)
+             cout << "Found 'user-agent' line: " << rest << endl;
+
+         if (*rest == '*' && !seen_myname)
+         {
+             //
+             // This matches all search engines...
+             //
+             pay_attention = 1;
+         }
+         else if (mystrncasecmp(rest, (char*)myname, myname.length()) == 0)
+         {
+             //
+             // This is for us! This will override any previous patterns
+             // that may have been set.
+             //
+             if (!seen_myname)   // only take first section with our name
+             {
+                 seen_myname = 1;
+                 pay_attention = 1;
+                 pattern = 0;    // ignore previous User-agent: *
+             }
+             else
+                 pay_attention = 0;
+         }
+         else
+         {
+             //
+             // This doesn't concern us
+             //
+             pay_attention = 0;
+         }
+     }
+     else if (pay_attention && mystrcasecmp(name, "disallow") == 0)
+     {
+         if (debug > 1)
+             cout << "Found 'disallow' line: " << rest << endl;
+
+         //
+         // Add this path to our list to ignore
+         //
+         if (*rest)
+         {
+             if (pattern.length())
+                 pattern << '|';
+             while (*rest)
+             {
+                 if (strchr("^.[$()|*+?{\\", *rest))
+                     pattern << '\\';
+                 pattern << *rest++;
+             }
+         }
+     }
+     //
+     // Ignore anything else (comments)
+     //
+ }
+
+ //
+ // Compile the pattern (if any...)
+ //
+ if (debug > 1)
+     cout << "Pattern: " << pattern << endl;
+
+ // Empty "disallow" allows all, so don't make entry which matches all.
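+ // (Editor's aside: the Disallow paths gathered above are folded below
+ // into one regular expression of the form "^[^:]*://[^/]*(a|b|c)", with
+ // regex metacharacters backslash-escaped. Sketch of just that
+ // escaping/joining step, added for illustration:)
+#if 0
+#include <string>
+#include <vector>
+
+static std::string paths_to_regex(const std::vector<std::string>& paths)
+{
+    const std::string meta = "^.[$()|*+?{\\";
+    std::string alt;
+    for (size_t i = 0; i < paths.size(); i++)
+    {
+        if (!alt.empty())
+            alt += '|';                  // alternation between paths
+        for (size_t j = 0; j < paths[i].size(); j++)
+        {
+            if (meta.find(paths[i][j]) != std::string::npos)
+                alt += '\\';             // escape regex metacharacters
+            alt += paths[i][j];
+        }
+    }
+    return "^[^:]*://[^/]*(" + alt + ")";
+}
+#endif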
+ if (!pattern.empty()) + { + String fullpatt = "^[^:]*://[^/]*("; + fullpatt << pattern << ')'; + _disallow.set(fullpatt, config->Boolean("case_sensitive")); + } +} + + +//***************************************************************************** +// void Server::push(String &path, int hopcount, char *referer, int local, int newDoc) +// +void Server::push(const String &path, int hopcount, const String &referer, + int local, int newDoc) +{ + if (_bad_server && !local) + return; + + if (IsDisallowed(path) != 0) + { + if (debug > 2) + cout << endl << " Rejected: forbidden by server robots.txt!"; + + return; + } + + // We use -1 as no limit, but we also don't want + // to forbid redirects from old places + if (_max_documents != -1 && newDoc && + _documents >= _max_documents) + { + if (debug>2) // Hey! we only want to get max_docs + cout << "Limit of " << _max_documents << " reached for " << _host << endl; + + return; + } + + URLRef *ref = new URLRef(); + ref->SetURL(path); + ref->SetHopCount(hopcount); + ref->SetReferer(referer); + _paths.Add(ref); + + if (newDoc) + _documents++; + +// cout << "***** pushing '" << path << "' with '" << referer << "'\n"; +} + + +//***************************************************************************** +// URLRef *Server::pop() +// +URLRef *Server::pop() +{ + URLRef *ref = (URLRef *) _paths.Remove(); + + if (!ref) + return 0; + + return ref; +} + + +//***************************************************************************** +// void Server::delay() +// +// Keeps track of how long it's been since we've seen this server +// and call sleep if necessary +// +void Server::delay() +{ + HtDateTime now; + + int time_taken = HtDateTime::GetDiff(now, _last_connection); // arg1-arg2 > 0 + + if (time_taken < _connection_space) + sleep(_connection_space - time_taken); + + now.SettoNow(); + _last_connection = now; // Reset the clock for the next delay! + + return; +} + + +//***************************************************************************** +// void Server::reportStatistics(String &out, char *name) +// +void Server::reportStatistics(String &out, char *name) +{ + out << name << " " << _host << ":" << _port; + out << " " << _documents << " document"; + if (_documents != 1) + out << "s"; +} diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Server.h b/debian/htdig/htdig-3.2.0b6/htdig/Server.h new file mode 100644 index 00000000..ca6a4f04 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Server.h @@ -0,0 +1,142 @@ +// +// Server.h +// +// Server: A class to keep track of server specific information. 
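+// (Editor's aside: the matching Server.cc above paces requests in
+// Server::delay() by sleeping away whatever remains of the configured
+// inter-connection gap since the last request. Equivalent standalone
+// sketch; the 10-second gap is an assumed placeholder, not htdig's
+// default:)
+#if 0
+#include <ctime>
+#include <unistd.h>   // sleep (POSIX)
+
+static time_t last_connection = 0;
+static int connection_space = 10; // seconds between requests (assumed)
+
+static void polite_delay()
+{
+    time_t elapsed = time(0) - last_connection;
+    if (elapsed < connection_space)
+        sleep(connection_space - elapsed); // wait out the remainder
+    last_connection = time(0);             // restart the clock
+}
+#endif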
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Server.h,v 1.13 2004/05/28 13:15:16 lha Exp $ +// + +#ifndef _Server_h_ +#define _Server_h_ + +#include "Object.h" +#include "htString.h" +#include "StringList.h" +#include "Stack.h" +#include "HtHeap.h" +#include "HtRegex.h" +#include "StringMatch.h" +#include "URLRef.h" +#include "HtDateTime.h" + + +class Document; + +class Server : public Object +{ +public: + // + // Construction/Destruction + // + Server(URL u, StringList *local_robots_files = NULL); + Server(const Server& rhs); + ~Server(); + + // + // This needs to be called with a document containing the + // robots.txt file for this server + // + void robotstxt(Document &doc); + + // + // Provide some way of getting at the host and port for this server + // + int port() const {return _port;} + const String &host() const {return _host;} + + // + // Provide some way of getting at the status of this server + // + int IsDead() {return _bad_server;} + void IsDead(int flag) {_bad_server = flag;} + + // + // Add a path to the queue for this server. + // This will check to see if the server is up if the URL is not local + // if it's down, it simply will not be added + // + void push(const String &path, int hopcount, const String &referer, + int local = 0, int newDoc = 1); + + // + // Return the next URL from the queue for this server. + // + URLRef *pop(); + + // + // Delays the server if necessary. If the time between requests + // is long enough, the request can occur immediately. + // + void delay(); + + // + // Produce statistics for this server. + // + void reportStatistics(String &out, char *name); + + // + // Methods for managing persistent connections + // + void AllowPersistentConnection() { _persistent_connections = true; } + void AvoidPersistentConnection() { _persistent_connections = false; } + bool IsPersistentConnectionAllowed () const + { return _persistent_connections; } + + // Methods for getting info regarding server configuration + bool HeadBeforeGet() const { return _head_before_get; } + unsigned int TimeOut() const { return _timeout; } + unsigned int TcpWaitTime() const { return _tcp_wait_time; } + unsigned int TcpMaxRetries() const { return _tcp_max_retries; } + unsigned int MaxDocuments() const { return _max_documents; } + const String &UserAgent() const { return _user_agent; } + const String &AcceptLanguage() const { return _accept_language; } + bool DisableCookies() const { return _disable_cookies; } + + // + // Return the URLs to be excluded from this server + // (for inclusion in the exclude_urls attribute) + // + int IsDisallowed(String url) { return _disallow.match(url, 0, 0); } + +private: + String _host; + int _port; + int _bad_server; // TRUE if we shouldn't use this one + int _connection_space; // Seconds between connections + HtDateTime _last_connection; // Time of last connection to this server + HtHeap _paths; + HtRegex _disallow; // This pattern will be used to test paths + int _documents; // Number of documents visited + + int _max_documents; // Maximum number of documents from this server + + bool _persistent_connections; // Are pcs allowed + + bool _head_before_get; // HEAD call before a GET? + + bool _disable_cookies; // Should we send cookies? 
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/URLRef.cc b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.cc
new file mode 100644
index 00000000..6cc8bc43
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.cc
@@ -0,0 +1,47 @@
+//
+// URLRef.cc
+//
+// URLRef: A definition of a URL/Referer pair with associated hopcount
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: URLRef.cc,v 1.9 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "URLRef.h"
+
+
+//*****************************************************************************
+// URLRef::URLRef()
+//
+URLRef::URLRef()
+{
+  hopcount = 0;
+}
+
+
+//*****************************************************************************
+// URLRef::~URLRef()
+//
+URLRef::~URLRef()
+{
+}
+
+
+//*****************************************************************************
+//
+int URLRef::compare(const URLRef& to) const
+{
+  return hopcount - to.hopcount;
+}
+
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/URLRef.h b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.h
new file mode 100644
index 00000000..dfc251ec
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.h
@@ -0,0 +1,50 @@
+//
+// URLRef.h
+//
+// URLRef: A definition of a URL/Referer pair with associated hopcount
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: URLRef.h,v 1.9 2004/05/28 13:15:16 lha Exp $
+//
+//
+#ifndef _URLRef_h_
+#define _URLRef_h_
+
+#include "Object.h"
+#include "htString.h"
+#include "URL.h"
+
+class URLRef : public Object
+{
+public:
+  //
+  // Construction/Destruction
+  //
+  URLRef();
+  ~URLRef();
+
+  const URL &GetURL() const {return url;}
+  int GetHopCount() const {return hopcount;}
+  const URL &GetReferer() const {return referer;}
+
+  void SetURL(const URL &u) {url = u;}
+  void SetHopCount(int h) {hopcount = h;}
+  void SetReferer(const URL &ref) {referer = ref;}
+
+  int compare(const Object& to) const { return compare((const URLRef&) to); }
+  int compare(const URLRef& to) const;
+
+private:
+  URL url;
+  URL referer;
+  int hopcount;
+};
+
+#endif
+
+
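URLRef::compare() orders references by hopcount alone, and Server keeps its pending URLRefs in an HtHeap, so Server::pop() hands back the shallowest queued URL first (assuming the heap pops the smallest compare() value, which is what a min-heap of URLRefs gives). The same frontier behaviour sketched with std::priority_queue (Ref and DeeperLast are illustrative names, not htdig's):

    // Sketch: a hopcount-ordered crawl frontier, mirroring
    // HtHeap + URLRef::compare().
    #include <iostream>
    #include <queue>
    #include <string>
    #include <vector>

    struct Ref
    {
        std::string url;
        std::string referer;
        int hopcount;
    };

    // Mirror URLRef::compare(): order by hopcount only.  Reversed here
    // because std::priority_queue pops the *largest* element by default.
    struct DeeperLast
    {
        bool operator()(const Ref &a, const Ref &b) const
        {
            return a.hopcount > b.hopcount;
        }
    };

    int main()
    {
        std::priority_queue<Ref, std::vector<Ref>, DeeperLast> frontier;
        frontier.push({"http://example.com/a/b", "http://example.com/a", 2});
        frontier.push({"http://example.com/",    "",                    0});
        frontier.push({"http://example.com/a",   "http://example.com/", 1});

        // Pops in hopcount order 0, 1, 2: shallow pages are fetched first.
        while (!frontier.empty())
        {
            std::cout << frontier.top().hopcount << " "
                      << frontier.top().url << "\n";
            frontier.pop();
        }
    }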
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc b/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc
new file mode 100644
index 00000000..ba1d842a
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc
@@ -0,0 +1,536 @@
+//
+// htdig.cc
+//
+// htdig: Indexes the web sites specified in the config file
+// generating several databases to be used by htmerge
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: htdig.cc,v 1.42 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "Document.h"
+#include "Retriever.h"
+#include "StringList.h"
+#include "htdig.h"
+#include "defaults.h"
+#include "HtURLCodec.h"
+#include "WordContext.h"
+#include "HtDateTime.h"
+#include "HtURLRewriter.h"
+
+////////////////////////////
+// For cookie jar
+////////////////////////////
+#include "HtCookieJar.h"
+#include "HtCookieMemJar.h"
+#include "HtCookieInFileJar.h"
+#include "HtHTTP.h"
+////////////////////////////
+
+// If we have this, we probably want it.
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#elif HAVE_GETOPT_LOCAL
+#include <getopt_local.h>
+#endif
+
+#ifdef HAVE_STD
+#include <iostream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <iostream.h>
+#endif /* HAVE_STD */
+
+//
+// Global variables
+//
+int debug = 0;
+int report_statistics = 0;
+DocumentDB docs;
+HtRegexList limits;
+HtRegexList limitsn;
+FILE *urls_seen = NULL;
+FILE *images_seen = NULL;
+String configFile = DEFAULT_CONFIG_FILE;
+String minimalFile = 0;
+HtDateTime StartTime;
+HtDateTime EndTime;
+
+void usage();
+void reportError(char *msg);
+
+
+//
+// Start of the program.
+//
+int main(int ac, char **av)
+{
+  int c;
+  extern char *optarg;
+  String credentials;
+  int initial = 0;
+  int alt_work_area = 0;
+  int create_text_database = 0;
+  char *max_hops = 0;
+
+  // Cookie jar dynamic creation.
+  HtCookieJar* _cookie_jar = new HtCookieMemJar();  // new cookie jar
+  if (_cookie_jar)
+    HtHTTP::SetCookieJar(_cookie_jar);
+
+//extern int yydebug;
+//yydebug=1;
+
+  //
+  // Parse command line arguments
+  //
+  while ((c = getopt(ac, av, "lsm:c:vith:u:a")) != -1)
+  {
+    unsigned int pos;
+    switch (c)
+    {
+      case 'c':
+        configFile = optarg;
+        break;
+      case 'v':
+        debug++;
+        break;
+      case 'i':
+        initial++;
+        break;
+      case 't':
+        create_text_database++;
+        break;
+      case 'h':
+        max_hops = optarg;
+        break;
+      case 's':
+        report_statistics++;
+        break;
+      case 'u':
+        credentials = optarg;
+        // Overwrite the password in argv so it doesn't linger in
+        // the process list (e.g. ps output).
+        for (pos = 0; pos < strlen(optarg); pos++)
+          optarg[pos] = '*';
+        break;
+      case 'a':
+        alt_work_area++;
+        break;
+      case 'm':
+        minimalFile = optarg;
+        max_hops = "0";
+        break;
+      case '?':
+        usage();
+      default:
+        break;
+    }
+  }
+
+  // Shows Start Time
+  if (debug > 0)
+    cout << "ht://dig Start Time: " << StartTime.GetAscTime() << endl;
+
+  //
+  // First set all the defaults and then read the specified config
+  // file to override the defaults.
+  //
+  HtConfiguration* const config = HtConfiguration::config();
+  config->Defaults(&defaults[0]);
+  if (access((char*)configFile, R_OK) < 0)
+  {
+    reportError(form("Unable to find configuration file '%s'",
+                     configFile.get()));
+  }
+  config->Read(configFile);
+
+  // Warn user if any obsolete options are found in the config file.
+  // For efficiency, check all fields here.  If different config
+  // files are used for searching, obsolete options may remain.
+  char *deprecatedOptions [] = {
+    "heading_factor_1", "heading_factor_2", "heading_factor_3",
+    "heading_factor_4", "heading_factor_5", "heading_factor_6",
+    "modification_time_is_now", "pdf_parser", "translate_amp",
+    "translate_lt_gt", "translate_quot", "uncoded_db_compatible",
+    ""    // empty terminator
+  };
+  char **option;
+  for (option = deprecatedOptions; **option; option++)
+  {
+    if (!config->Find(*option).empty())
+      cout << "Warning: Configuration option " << *option <<
+              " is no longer supported\n";
+  }
+
+  if (config->Find("locale").empty() && debug > 0)
+    cout << "Warning: unknown locale!\n";
+
+  if (max_hops)
+  {
+    config->Add("max_hop_count", max_hops);
+  }
+
+  // Set up credentials for this run
+  if (credentials.length())
+    config->Add("authorization", credentials);
+
+  //
+  // Check url_part_aliases and common_url_parts for
+  // errors.
+  String url_part_errors = HtURLCodec::instance()->ErrMsg();
+
+  if (url_part_errors.length() != 0)
+    reportError(form("Invalid url_part_aliases or common_url_parts: %s",
+                     url_part_errors.get()));
+
+  //
+  // Check url_rewrite_rules for errors.
+  String url_rewrite_rules = HtURLRewriter::instance()->ErrMsg();
+
+  if (url_rewrite_rules.length() != 0)
+    reportError(form("Invalid url_rewrite_rules: %s",
+                     url_rewrite_rules.get()));
+
+  //
+  // If indicated, change the database file names to have the .work
+  // extension
+  //
+  if (alt_work_area != 0)
+  {
+    String configValue = config->Find("doc_db");
+
+    if (configValue.length() != 0)
+    {
+      configValue << ".work";
+      config->Add("doc_db", configValue);
+    }
+
+    configValue = config->Find("word_db");
+    if (configValue.length() != 0)
+    {
+      configValue << ".work";
+      config->Add("word_db", configValue);
+    }
+
+    configValue = config->Find("doc_index");
+    if (configValue.length() != 0)
+    {
+      configValue << ".work";
+      config->Add("doc_index", configValue);
+    }
+
+    configValue = config->Find("doc_excerpt");
+    if (configValue.length() != 0)
+    {
+      configValue << ".work";
+      config->Add("doc_excerpt", configValue);
+    }
+
+    configValue = config->Find("md5_db");
+    if (configValue.length() != 0)
+    {
+      configValue << ".work";
+      config->Add("md5_db", configValue);
+    }
+  }
+
+  // Import the cookies file
+  const String CookiesInputFile = config->Find("cookies_input_file");
+  if (CookiesInputFile.length())
+  {
+    if (debug > 0)
+      cout << "Importing Cookies input file "
+           << CookiesInputFile << endl;
+    int result;
+    HtCookieJar::SetDebugLevel(debug);  // Set the debug level
+    HtCookieInFileJar* cookie_file = new HtCookieInFileJar(CookiesInputFile, result);
+    if (cookie_file)
+    {
+      if (!result)
+      {
+        if (debug > 0)
+          cookie_file->ShowSummary();
+        delete _cookie_jar;                        // Delete previous cookie jar
+        _cookie_jar = (HtCookieJar*) cookie_file;  // set the imported one
+        HtHTTP::SetCookieJar(_cookie_jar);         // and set the new HTTP jar
+      }
+      else if (debug > 0)
+        cout << "Warning: Import failed! (" << CookiesInputFile << ")" << endl;
+    }
+    else
+      reportError(form("Unable to load cookies file '%s' in memory",
+                       CookiesInputFile.get()));
+  }
+
+  //
+  // If needed, we will create a list of every URL we come across.
+  //
+  if (config->Boolean("create_url_list"))
+  {
+    const String filename = config->Find("url_list");
+    urls_seen = fopen(filename, initial ? "w" : "a");
+    if (urls_seen == 0)
+    {
+      reportError(form("Unable to create URL file '%s'",
+                       filename.get()));
+    }
+  }
+
+  //
+  // If needed, we will create a list of every image we come across.
+  //
+  if (config->Boolean("create_image_list"))
+  {
+    const String filename = config->Find("image_list");
+    images_seen = fopen(filename, initial ? "w" : "a");
+    if (images_seen == 0)
+    {
+      reportError(form("Unable to create images file '%s'",
+                       filename.get()));
+    }
+  }
+
+  //
+  // Set up the limits list
+  //
+  StringList l(config->Find("limit_urls_to"), " \t");
+  limits.setEscaped(l, config->Boolean("case_sensitive"));
+  l.Destroy();
+
+  l.Create(config->Find("limit_normalized"), " \t");
+  limitsn.setEscaped(l, config->Boolean("case_sensitive"));
+  l.Destroy();
+
+  //
+  // Open the document database
+  //
+  const String filename = config->Find("doc_db");
+  if (initial)
+    unlink(filename);
+
+  const String index_filename = config->Find("doc_index");
+  if (initial)
+    unlink(index_filename);
+
+  const String head_filename = config->Find("doc_excerpt");
+  if (initial)
+    unlink(head_filename);
+
+  if (docs.Open(filename, index_filename, head_filename) < 0)
+  {
+    reportError(form("Unable to open/create document database '%s'",
+                     filename.get()));
+  }
+
+  const String word_filename = config->Find("word_db");
+  if (initial)
+  {
+    unlink(word_filename);
+    unlink((word_filename + "_weakcmpr").get());
+
+    // Remove "duplicate detection" database
+    unlink(config->Find("md5_db"));
+
+    // using -i, also ignore seen-but-not-processed URLs from last pass
+    unlink(config->Find("url_log"));
+  }
+
+  // Initialize htword
+  WordContext::Initialize(*config);
+
+  //
+  // Create the Retriever object which we will use to parse all the
+  // HTML files.
+  // In case this is just an update dig, we will add all existing URLs.
+  //
+  Retriever retriever(Retriever_logUrl);
+  if (minimalFile.length() == 0)
+  {
+    List *list = docs.URLs();
+    retriever.Initial(*list);
+    delete list;
+
+    // Add start_url to the initial list of the retriever.
+    // Don't check a URL twice!
+    // Beware: order is important.  If this bugs you, you could change
+    // the previous line retriever.Initial(*list, 0) to Initial(*list, 1).
+    retriever.Initial(config->Find("start_url"), 1);
+  }
+
+  // Handle list of URLs given in a file (stdin, if "-") specified as
+  // argument to -m or as an optional trailing argument.
+  if (optind < ac)
+  {
+    if (debug)
+      if (minimalFile.length() != 0)
+        cout << "Warning: argument " << av[optind]
+             << " overrides -m " << minimalFile << endl;
+    minimalFile = av[optind];
+  }
+  if (strcmp(minimalFile.get(), "-") == 0)
+  {
+    String str;
+    // Why not combine this with the code below, with input = stdin ?
+    while (!cin.eof())
+    {
+      cin >> str;
+      str.chop("\r\n");  // (Why "\r\n" here and "\r\n\t " below?)
+      if (str.length() > 0)
+        retriever.Initial(str, 1);
+    }
+  }
+  else if (minimalFile.length() != 0)
+  {
+    FILE *input = fopen(minimalFile.get(), "r");
+    char buffer[1000];
+
+    if (input)
+    {
+      while (fgets(buffer, sizeof(buffer), input))
+      {
+        String str(buffer);
+        str.chop("\r\n\t ");
+        if (str.length() > 0)
+          retriever.Initial(str, 1);
+      }
+      fclose(input);
+    }
+    else
+    {
+      cerr << "Could not open argument '" << minimalFile
+           << "' of flag -m\n";
+      exit (1);
+    }
+  }
+
+  //
+  // Go do it!
+  //
+  retriever.Start();
+
+  //
+  // All done with parsing.
+  //
+
+  //
+  // If the user so wants, create a text version of the document database.
+  //
+  if (create_text_database)
+  {
+    const String doc_list = config->Find("doc_list");
+    if (initial)
+      unlink(doc_list);
+    docs.DumpDB(doc_list);
+    const String word_dump = config->Find("word_dump");
+    if (initial)
+      unlink(word_dump);
+    HtWordList words(*config);
+    if (words.Open(config->Find("word_db"), O_RDONLY) == OK)
+    {
+      words.Dump(word_dump);
+    }
+  }
+
+  //
+  // Cleanup
+  //
+  if (urls_seen)
+    fclose(urls_seen);
+  if (images_seen)
+    fclose(images_seen);
+
+  //
+  // If needed, report some statistics
+  //
+  if (report_statistics)
+  {
+    retriever.ReportStatistics("htdig");
+  }
+
+  // Shows End Time
+  if (debug > 0)
+  {
+    EndTime.SettoNow();
+    cout << "ht://dig End Time: " << EndTime.GetAscTime() << endl;
+  }
+
+  if (_cookie_jar)
+    delete _cookie_jar;
+}
+
+
+//
+// Display usage information for the htdig program
+//
+void usage()
+{
+  cout << "usage: htdig [-v][-i][-c configfile][-t][-h hopcount][-s]\n";
+  cout << "             [-u username:password][-a][-m minimalfile]\n";
+  cout << "This program is part of ht://Dig " << VERSION << "\n\n";
+  cout << "Options:\n";
+
+  cout << "\t-v\tVerbose mode.  This increases the verbosity of the\n";
+  cout << "\t\tprogram.  Using more than 2 is probably only useful\n";
+  cout << "\t\tfor debugging purposes.  The default verbose mode\n";
+  cout << "\t\tgives a nice progress report while digging.\n\n";
+
+  cout << "\t-i\tInitial.  Do not use any old databases.  This is\n";
+  cout << "\t\taccomplished by first erasing the databases.\n\n";
+
+  cout << "\t-c configfile\n";
+  cout << "\t\tUse the specified configuration file instead of the\n";
+  cout << "\t\tdefault.\n\n";
+
+  cout << "\t-t\tCreate an ASCII version of the document database.\n";
+  cout << "\t\tThis database is easy to parse with other programs so\n";
+  cout << "\t\tthat information can be extracted from it.\n\n";
+
+  cout << "\t-h hopcount\n";
+  cout << "\t\tLimit the stored documents to those which are at\n";
+  cout << "\t\tmost hopcount links away from the start URL.\n\n";
+
+  cout << "\t-s\tReport statistics after completion.\n\n";
+
+  cout << "\t-u username:password\n";
+  cout << "\t\tTells htdig to send the supplied username and\n";
+  cout << "\t\tpassword with each HTTP request.  The credentials\n";
+  cout << "\t\twill be encoded using the 'Basic' authentication scheme.\n";
+  cout << "\t\tThere *HAS* to be a colon (:) between the username\n";
+  cout << "\t\tand password.\n\n";
+
+  cout << "\t-a\tUse alternate work files.\n";
+  cout << "\t\tTells htdig to append .work to database files, causing\n";
+  cout << "\t\ta second copy of the database to be built.  This allows\n";
+  cout << "\t\tthe original files to be used by htsearch during the\n";
+  cout << "\t\tindexing run.\n\n";
+
+  cout << "\t-m minimalfile (or just a file name at end of arguments)\n";
+  cout << "\t\tTells htdig to read URLs from the supplied file and index\n";
+  cout << "\t\tthem in place of (or in addition to) the existing URLs in\n";
+  cout << "\t\tthe database and the start_url.  With -m, only the\n";
+  cout << "\t\tURLs specified are added to the database.  A file name of\n";
+  cout << "\t\t'-' indicates the standard input.\n\n";
+
+  exit(0);
+}
+
+
+//
+// Report an error and die
+//
+void reportError(char *msg)
+{
+  cout << "htdig: " << msg << "\n\n";
+  exit(1);
+}
+
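The -m handling in main() above reads one URL per line, from a named file or from stdin when the argument is "-", and chops trailing "\r\n\t " before queueing each entry. The same loop in plain iostreams (feedUrl is a hypothetical stand-in for retriever.Initial(str, 1)):

    // Sketch of the -m URL-list loop: read URLs line by line from a
    // file or stdin, strip trailing whitespace, skip empty lines.
    #include <fstream>
    #include <iostream>
    #include <string>

    void feedUrl(const std::string &url)        // stand-in for the retriever
    {
        std::cout << "queued: " << url << "\n";
    }

    void readUrlList(std::istream &in)
    {
        std::string line;
        while (std::getline(in, line))
        {
            // Equivalent of str.chop("\r\n\t "): drop trailing whitespace.
            line.erase(line.find_last_not_of("\r\n\t ") + 1);
            if (!line.empty())
                feedUrl(line);
        }
    }

    int main(int argc, char **argv)
    {
        if (argc > 1 && std::string(argv[1]) != "-")
        {
            std::ifstream file(argv[1]);
            if (!file)
            {
                std::cerr << "Could not open " << argv[1] << "\n";
                return 1;
            }
            readUrlList(file);
        }
        else
            readUrlList(std::cin);              // "-" (or nothing) means stdin
    }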
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/htdig.h b/debian/htdig/htdig-3.2.0b6/htdig/htdig.h
new file mode 100644
index 00000000..5eb5b9bb
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/htdig.h
@@ -0,0 +1,55 @@
+//
+// htdig.h
+//
+// htdig: Indexes the web sites specified in the config file
+// generating several databases to be used by htmerge
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: htdig.h,v 1.16 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifndef _htdig_h_
+#define _htdig_h_
+
+#include "HtConfiguration.h"
+#include "List.h"
+#include "DocumentDB.h"
+#include "StringMatch.h"
+#include "htconfig.h"
+#include "HtRegexList.h"
+#include <stdlib.h>
+
+#ifndef _MSC_VER /* _WIN32 */
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+#include <stdio.h>
+
+extern int debug;
+extern DocumentDB docs;
+extern HtRegexList limits;
+extern HtRegexList limitsn;
+extern HtRegexList excludes;
+extern HtRegexList badquerystr;
+extern FILE *urls_seen;
+extern FILE *images_seen;
+
+extern void reportError(char *msg);
+
+#endif
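One detail of htdig.cc worth isolating is the -u case in its getopt loop: after taking a private copy of "username:password", it overwrites optarg with '*' characters so the credentials stop being visible in the process list (argv is what ps and /proc/&lt;pid&gt;/cmdline display on most Unix systems). The idiom on its own, without getopt (illustrative only, not htdig code):

    // Sketch: scrub a password passed via argv, as htdig's -u handling
    // does with optarg[pos] = '*'.
    #include <cstring>
    #include <iostream>
    #include <string>

    int main(int argc, char **argv)
    {
        std::string credentials;
        for (int i = 1; i + 1 < argc; i++)
        {
            if (std::strcmp(argv[i], "-u") == 0)
            {
                credentials = argv[i + 1];       // keep a private copy...
                std::memset(argv[i + 1], '*',    // ...then mask the original
                            std::strlen(argv[i + 1]));
            }
        }
        // Only the private copy still holds the real value here.
        std::cout << "got " << credentials.size() << " credential bytes\n";
    }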