author     Slávek Banko <slavek.banko@axis.cz>   2021-11-05 13:28:23 +0100
committer  Slávek Banko <slavek.banko@axis.cz>   2021-11-05 13:28:23 +0100
commit     8c787c3591c1c885b91a54128835b400858c5cca (patch)
tree       eca1b776912a305c4d45b3964038278a2fae1ead /debian/htdig/htdig-3.2.0b6/htdig
parent     fe188b907cdf30dfdfe0eba9412e7f8749fec158 (diff)
download   extra-dependencies-8c787c35.tar.gz
           extra-dependencies-8c787c35.zip
DEB htdig: Added to repository.
Signed-off-by: Slávek Banko <slavek.banko@axis.cz>
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig')
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/.cvsignore              8
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Document.cc           784
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Document.h            138
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc     614
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.h       58
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.cc  376
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.h    88
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/HTML.cc              1002
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/HTML.h                 69
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Makefile.am            16
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Makefile.in           487
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Makefile.win32         29
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Parsable.cc            96
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Parsable.h             57
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc          116
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Plaintext.h            41
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc         2013
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Retriever.h           183
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Server.cc             435
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Server.h              142
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/URLRef.cc              47
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/URLRef.h               50
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/htdig.cc              536
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/htdig.h                55
24 files changed, 7440 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/.cvsignore b/debian/htdig/htdig-3.2.0b6/htdig/.cvsignore
new file mode 100644
index 00000000..4de01869
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/.cvsignore
@@ -0,0 +1,8 @@
+Makefile
+*.lo
+*.la
+.purify
+.pure
+.deps
+.libs
+htdig
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Document.cc b/debian/htdig/htdig-3.2.0b6/htdig/Document.cc
new file mode 100644
index 00000000..87272686
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Document.cc
@@ -0,0 +1,784 @@
+//
+// Document.cc
+//
+// Document: This class holds everything there is to know about a document.
+// The actual contents of the document may or may not be present at
+// all times for memory conservation reasons.
+// The document can be told to retrieve its contents. This is done
+// with the Retrieve call. In case the retrieval causes a
+// redirect, the link is followed, but this process is done
+// only once (to prevent loops.) If the redirect didn't
+// work, Document_not_found is returned.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Document.cc,v 1.71 2004/05/28 13:15:14 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <ctype.h>
+
+#include "Document.h"
+#include "StringList.h"
+#include "htdig.h"
+#include "HTML.h"
+#include "Plaintext.h"
+#include "ExternalParser.h"
+#include "lib.h"
+
+#include "Transport.h"
+#include "HtHTTP.h"
+
+#ifdef HAVE_SSL_H
+#include "HtHTTPSecure.h"
+#endif
+
+#include "HtHTTPBasic.h"
+#include "ExternalTransport.h"
+
+#include "defaults.h"
+
+#if 1
+typedef void (*SIGNAL_HANDLER) (...);
+#else
+typedef SIG_PF SIGNAL_HANDLER;
+#endif
+
+//*****************************************************************************
+// Document::Document(char *u)
+// Initialize with the given url as the location for this document.
+// If the max_size is given, use that for size, otherwise use the
+// config value.
+//
+Document::Document(char *u, int max_size)
+{
+ url = 0;
+ proxy = 0;
+ referer = 0;
+ contents = 0;
+ transportConnect = 0;
+ HTTPConnect = 0;
+ HTTPSConnect = 0;
+ FileConnect = 0;
+ FTPConnect = 0;
+ NNTPConnect = 0;
+ externalConnect = 0;
+ HtConfiguration* config= HtConfiguration::config();
+
+   // We should probably make the assignment of max_doc_size depend on a
+   // per-server or per-URL configuration value. The same goes for
+   // max_retries.
+
+ if (max_size > 0)
+ max_doc_size = max_size;
+ else
+ max_doc_size = config->Value("max_doc_size");
+
+ if (config->Value("max_retries") > 0)
+ num_retries = config->Value("max_retries");
+ else num_retries = 2;
+
+ // Initialize some static variables of Transport
+
+ Transport::SetDebugLevel(debug);
+
+ // Initialize some static variables of Transport
+ // and the User Agent for every HtHTTP objects
+
+ HtHTTP::SetParsingController(ExternalParser::canParse);
+
+ // Set the default parser content-type string
+ Transport::SetDefaultParserContentType ("text/");
+
+ contents.allocate(max_doc_size + 100);
+ contentType = "";
+ contentLength = -1;
+ if (u)
+ {
+ Url(u);
+ }
+}
+
+
+//*****************************************************************************
+// Document::~Document()
+//
+Document::~Document()
+{
+ // We delete only the derived class objects
+ if (HTTPConnect)
+ delete HTTPConnect;
+ if (HTTPSConnect)
+ delete HTTPSConnect;
+ if (FileConnect)
+ delete FileConnect;
+ if (FTPConnect)
+ delete FTPConnect;
+ if (NNTPConnect)
+ delete NNTPConnect;
+ if (externalConnect)
+ delete externalConnect;
+
+ if (url)
+ delete url;
+ if (proxy)
+ delete proxy;
+ if (referer)
+ delete referer;
+
+#if MEM_DEBUG
+ char *p = new char;
+ cout << "==== Document deleted: " << this << " new at " <<
+ ((void *) p) << endl;
+ delete p;
+#endif
+}
+
+
+//*****************************************************************************
+// void Document::Reset()
+// Restore the Document object to an initial state.
+// We will not reset the authorization information since it can be reused.
+//
+void
+Document::Reset()
+{
+ contentType = 0;
+ contentLength = -1;
+ if (url)
+ delete url;
+ url = 0;
+ if (referer)
+ delete referer;
+
+ referer = 0;
+
+ proxy=0;
+ authorization=0;
+ proxy_authorization=0;
+ contents = 0;
+ document_length = 0;
+ redirected_to = 0;
+
+}
+
+
+//*****************************************************************************
+// void Document::Url(const String &u)
+// Set the URL for this document
+//
+void
+Document::Url(const String &u)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ if (url)
+ delete url;
+ url = new URL(u);
+
+ // Re-initialise the proxy
+ if (proxy)
+ delete proxy;
+ proxy = 0;
+
+ // Get the proxy information for this URL
+ const String proxyURL = config->Find(url,"http_proxy");
+
+ // If http_proxy is not empty we set the proxy for the current URL
+ if (proxyURL.length())
+ {
+ proxy = new URL(proxyURL);
+ proxy->normalize();
+ // set the proxy authorization information
+ setProxyUsernamePassword(config->Find(url,"http_proxy_authorization"));
+ }
+
+ // Set the authorization information
+ setUsernamePassword(config->Find(url,"authorization"));
+
+}
+
+
+//*****************************************************************************
+// void Document::Referer(const String &u)
+// Set the Referring URL for this document
+//
+void
+Document::Referer(const String &u)
+{
+ if (referer)
+ delete referer;
+ referer = new URL(u);
+}
+
+
+//*****************************************************************************
+// int Document::UseProxy()
+// Returns 1 if the given url is to be retrieved from the proxy server,
+// or 0 if it's not.
+//
+int
+Document::UseProxy()
+{
+ HtConfiguration* config= HtConfiguration::config();
+ static HtRegex *excludeProxy = 0;
+
+ //
+ // Initialize excludeProxy list if this is the first time.
+ //
+ if (!excludeProxy)
+ {
+ excludeProxy = new HtRegex();
+ StringList l(config->Find("http_proxy_exclude"), " \t");
+ excludeProxy->setEscaped(l, config->Boolean("case_sensitive"));
+ l.Release();
+ }
+
+ if ((proxy) && (excludeProxy->match(url->get(), 0, 0) == 0))
+ return true; // if the exclude pattern is empty, use the proxy
+ return false;
+}
+
+
+//*****************************************************************************
+// DocStatus Document::Retrieve(HtDateTime date)
+// Attempt to retrieve the document pointed to by our internal URL
+//
+Transport::DocStatus
+Document::Retrieve(Server *server, HtDateTime date)
+{
+ // Right now we just handle http:// service
+ // Soon this will include file://
+ // as well as an ExternalTransport system
+ // eventually maybe ftp:// and a few others
+
+ Transport::DocStatus status;
+ Transport_Response *response = 0;
+ HtDateTime *ptrdatetime = 0;
+ int useproxy = UseProxy();
+ int NumRetries;
+
+ transportConnect = 0;
+
+ if (ExternalTransport::canHandle(url->service()))
+ {
+ if (externalConnect)
+ {
+ delete externalConnect;
+ }
+ externalConnect = new ExternalTransport(url->service());
+ transportConnect = externalConnect;
+ }
+#ifdef HAVE_SSL_H
+ else if (mystrncasecmp(url->service(), "https", 5) == 0)
+ {
+ if (!HTTPSConnect)
+ {
+ if (debug>4)
+ cout << "Creating an HtHTTPSecure object" << endl;
+
+ HTTPSConnect = new HtHTTPSecure();
+
+ if (!HTTPSConnect)
+ return Transport::Document_other_error;
+ }
+
+ if (HTTPSConnect)
+ {
+      // Here we only need to set things specific to an HTTPS request
+
+ HTTPSConnect->SetRequestURL(*url);
+
+ // Set the user agent which can vary per server
+ HTTPSConnect->SetRequestUserAgent(server->UserAgent());
+
+ // Set the accept language which can vary per server
+ HTTPSConnect->SetAcceptLanguage(server->AcceptLanguage());
+
+ // Set the referer
+ if (referer)
+ HTTPSConnect->SetRefererURL(*referer);
+
+ // Let's disable the cookies if we decided that in the config file
+ if (server->DisableCookies())
+ HTTPSConnect->DisableCookies();
+ else HTTPSConnect->AllowCookies();
+
+      // We may issue a config parameter to enable/disable them
+ if (server->IsPersistentConnectionAllowed())
+ {
+ // Persistent connections allowed
+ HTTPSConnect->AllowPersistentConnection();
+ }
+ else HTTPSConnect->DisablePersistentConnection();
+
+ // Head before Get option control
+ if (server->HeadBeforeGet())
+ HTTPSConnect->EnableHeadBeforeGet();
+ else
+ HTTPSConnect->DisableHeadBeforeGet();
+
+ // http->SetRequestMethod(HtHTTP::Method_GET);
+ if (debug > 2)
+ {
+ cout << "Making HTTPS request on " << url->get();
+
+ if (useproxy)
+ cout << " via proxy (" << proxy->host() << ":" << proxy->port() << ")";
+
+ cout << endl;
+ }
+ }
+
+ HTTPSConnect->SetProxy(useproxy);
+ transportConnect = HTTPSConnect;
+ }
+#endif
+ else if (mystrncasecmp(url->service(), "http", 4) == 0)
+ {
+ if (!HTTPConnect)
+ {
+ if (debug>4)
+ cout << "Creating an HtHTTPBasic object" << endl;
+
+ HTTPConnect = new HtHTTPBasic();
+
+ if (!HTTPConnect)
+ return Transport::Document_other_error;
+ }
+
+ if (HTTPConnect)
+ {
+      // Here we only need to set things specific to an HTTP request
+
+ HTTPConnect->SetRequestURL(*url);
+
+ // Set the user agent which can vary per server
+ HTTPConnect->SetRequestUserAgent(server->UserAgent());
+
+ // Set the accept language which can vary per server
+ HTTPConnect->SetAcceptLanguage(server->AcceptLanguage());
+
+ // Set the referer
+ if (referer)
+ HTTPConnect->SetRefererURL(*referer);
+
+ // Let's disable the cookies if we decided that in the config file
+ if (server->DisableCookies())
+ HTTPConnect->DisableCookies();
+ else HTTPConnect->AllowCookies();
+
+      // We may issue a config parameter to enable/disable them
+ if (server->IsPersistentConnectionAllowed())
+ {
+ // Persistent connections allowed
+ HTTPConnect->AllowPersistentConnection();
+ }
+ else HTTPConnect->DisablePersistentConnection();
+
+ // Head before Get option control
+ if (server->HeadBeforeGet())
+ HTTPConnect->EnableHeadBeforeGet();
+ else
+ HTTPConnect->DisableHeadBeforeGet();
+
+ // http->SetRequestMethod(HtHTTP::Method_GET);
+ if (debug > 2)
+ {
+ cout << "Making HTTP request on " << url->get();
+
+ if (useproxy)
+ cout << " via proxy (" << proxy->host() << ":" << proxy->port() << ")";
+
+ cout << endl;
+ }
+ }
+
+ HTTPConnect->SetProxy(useproxy);
+ transportConnect = HTTPConnect;
+ }
+ else if (mystrncasecmp(url->service(), "file", 4) == 0)
+ {
+ if (!FileConnect)
+ {
+ if (debug>4)
+ cout << "Creating an HtFile object" << endl;
+
+ FileConnect = new HtFile();
+
+ if (!FileConnect)
+ return Transport::Document_other_error;
+ }
+
+ if (FileConnect)
+ {
+      // Here we only need to set things specific to a file request
+
+ FileConnect->SetRequestURL(*url);
+
+ // Set the referer
+ if (referer)
+ FileConnect->SetRefererURL(*referer);
+
+ if (debug > 2)
+ cout << "Making 'file' request on " << url->get() << endl;
+ }
+
+ transportConnect = FileConnect;
+ }
+ else if (mystrncasecmp(url->service(), "ftp", 3) == 0)
+ {
+ // the following FTP handling is modeled very closely on
+ // the prior 'file'-protocol handling, so beware of bugs
+
+ if (!FTPConnect)
+ {
+ if (debug>4)
+ cout << "Creating an HtFTP object" << endl;
+
+ FTPConnect = new HtFTP();
+
+ if (!FTPConnect)
+ return Transport::Document_other_error;
+ }
+ if (FTPConnect)
+ {
+      // Here we only need to set things specific to an FTP request
+
+ FTPConnect->SetRequestURL(*url);
+ ////////////////////////////////////////////////////
+ ///
+ /// stuff may be missing here or in need of change
+ ///
+ ///////////////////////////////////////////////////
+
+ // Set the referer
+ if (referer)
+ FTPConnect->SetRefererURL(*referer);
+
+ if (debug > 2)
+ cout << "Making 'ftp' request on " << url->get() << endl;
+ }
+
+ transportConnect = FTPConnect;
+ } // end of else if (mystrncasecmp(url->service(), "ftp", 3) == 0)
+
+ else if (mystrncasecmp(url->service(), "news", 4) == 0)
+ {
+ if (!NNTPConnect)
+ {
+ if (debug>4)
+ cout << "Creating an HtNNTP object" << endl;
+
+ NNTPConnect = new HtNNTP();
+
+ if (!NNTPConnect)
+ return Transport::Document_other_error;
+ }
+
+ if (NNTPConnect)
+ {
+      // Here we got a Usenet document request
+
+ NNTPConnect->SetRequestURL(*url);
+
+ if (debug > 2)
+ cout << "Making 'NNTP' request on " << url->get() << endl;
+ }
+
+ transportConnect = NNTPConnect;
+ }
+ else
+ {
+ if (debug)
+ {
+ cout << '"' << url->service() <<
+ "\" not a recognized transport service. Ignoring\n";
+ }
+
+ return Transport::Document_not_recognized_service;
+ }
+
+ // Is a transport object pointer available?
+
+ if (transportConnect)
+ {
+ // Set all the appropriate parameters
+ if (useproxy)
+ {
+ transportConnect->SetConnection(proxy);
+ if (proxy_authorization.length())
+ transportConnect->SetProxyCredentials(proxy_authorization);
+ }
+ else
+ transportConnect->SetConnection(url);
+
+ // OK. Let's set the connection time out
+ transportConnect->SetTimeOut(server->TimeOut());
+
+ // Let's set number of retries for a failed connection attempt
+ transportConnect->SetRetry(server->TcpMaxRetries());
+
+ // ... And the wait time after a failure
+ transportConnect->SetWaitTime(server->TcpWaitTime());
+
+ // OK. Let's set the maximum size of a document to be retrieved
+ transportConnect->SetRequestMaxDocumentSize(max_doc_size);
+
+ // Let's set the credentials
+ transportConnect->SetCredentials(authorization);
+
+ // Let's set the modification time (in order not to retrieve a
+ // document we already have)
+ transportConnect->SetRequestModificationTime(date);
+
+ // Make the request
+ // Here is the main operation ... Let's make the request !!!
+ // We now perform a loop until we want to retry the request
+
+ NumRetries = 0;
+
+ do
+ {
+ status = transportConnect->Request();
+
+ if (NumRetries++)
+ if(debug>0)
+ cout << ".";
+
+ } while (ShouldWeRetry(status) && NumRetries < num_retries);
+
+
+ // Let's get out the info we need
+ response = transportConnect->GetResponse();
+
+ if (response)
+ {
+ // We got the response
+
+ contents = response->GetContents();
+ contentType = response->GetContentType();
+ contentLength = response->GetContentLength();
+ ptrdatetime = response->GetModificationTime();
+ document_length = response->GetDocumentLength();
+
+ // This test is ugly! Can whoever put it here explain why it's
+ // needed? Why would GetLocation() ever return a non-empty string
+ // from a Transport subclass that's not supposed to redirect?
+ if (transportConnect == HTTPConnect || transportConnect == HTTPSConnect || transportConnect == externalConnect)
+ redirected_to = ((HtHTTP_Response *)response)->GetLocation();
+
+ if (ptrdatetime)
+ {
+ // We got the modification date/time
+ modtime = *ptrdatetime;
+ }
+
+ // How to manage it when there's no modification date/time?
+
+ if (debug > 5)
+ {
+ cout << "Contents:\n" << contents << endl;
+ cout << "Content Type: " << contentType << endl;
+ cout << "Content Length: " << contentLength << endl;
+ cout << "Modification Time: " << modtime.GetISO8601() << endl;
+ }
+ }
+
+ return status;
+
+ }
+ else
+ return Transport::Document_not_found;
+}
+
+//*****************************************************************************
+// DocStatus Document::RetrieveLocal(HtDateTime date, StringList *filenames)
+// Attempt to retrieve the document pointed to by our internal URL
+// using a list of potential local filenames given. Returns Document_ok,
+// Document_not_changed or Document_not_local (in which case the
+// retriever tries it again using the standard retrieve method).
+//
+Transport::DocStatus
+Document::RetrieveLocal(HtDateTime date, StringList *filenames)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ struct stat stat_buf;
+ String *filename;
+
+ filenames->Start_Get();
+
+ // Loop through list of potential filenames until the list is exhausted
+ // or a suitable file is found to exist as a regular file.
+ while ((filename = (String *)filenames->Get_Next()) &&
+ ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode)))
+ if (debug > 1)
+ cout << " tried local file " << *filename << endl;
+
+ if (!filename)
+ return Transport::Document_not_local;
+
+ if (debug > 1)
+ cout << " found existing file " << *filename << endl;
+
+ modtime = stat_buf.st_mtime;
+ if (modtime <= date)
+ return Transport::Document_not_changed;
+
+ char *ext = strrchr((char*)*filename, '.');
+ if (ext == NULL)
+ return Transport::Document_not_local;
+ const String *type = HtFile::Ext2Mime (ext + 1);
+
+ static Dictionary *bad_local_ext = 0;
+ if (!bad_local_ext)
+ {
+ // A list of bad extensions, separated by spaces or tabs
+ bad_local_ext = new Dictionary;
+ String t = config->Find("bad_local_extensions");
+ String lowerp;
+ char *p = strtok(t, " \t");
+ while (p)
+ {
+ // Extensions are case insensitive
+ lowerp = p;
+ lowerp.lowercase();
+ bad_local_ext->Add(lowerp, 0);
+ p = strtok(0, " \t");
+ }
+ }
+ if (type == NULL || bad_local_ext->Exists(ext))
+ {
+ if (debug > 1 && type != NULL)
+ cout << "\nBad local extension: " << *filename << endl;
+ return Transport::Document_not_local;
+ }
+ else
+ contentType = *type;
+
+ // Open it
+ FILE *f = fopen((char*)*filename, "r");
+ if (f == NULL)
+ return Transport::Document_not_local;
+
+ //
+ // Read in the document itself
+ //
+ max_doc_size = config->Value(url,"max_doc_size");
+ contents = 0;
+ char docBuffer[8192];
+ int bytesRead;
+
+ while ((bytesRead = fread(docBuffer, 1, sizeof(docBuffer), f)) > 0)
+ {
+ if (debug > 2)
+ cout << "Read " << bytesRead << " from document\n";
+ if (contents.length() + bytesRead > max_doc_size)
+ bytesRead = max_doc_size - contents.length();
+ contents.append(docBuffer, bytesRead);
+ if (contents.length() >= max_doc_size)
+ break;
+ }
+ fclose(f);
+ document_length = contents.length();
+ contentLength = stat_buf.st_size;
+
+ if (debug > 2)
+ cout << "Read a total of " << document_length << " bytes\n";
+
+ if (document_length < contentLength)
+ document_length = contentLength;
+ return Transport::Document_ok;
+}
+
+
+//*****************************************************************************
+// Parsable *Document::getParsable()
+// Given the content-type of a document, returns a document parser.
+// This will first look through the list of user supplied parsers and
+// then at our (limited) builtin list of parsers. The user supplied
+// parsers are external programs that will be used.
+//
+Parsable *
+Document::getParsable()
+{
+ static HTML *html = 0;
+ static Plaintext *plaintext = 0;
+ static ExternalParser *externalParser = 0;
+
+ Parsable *parsable = 0;
+
+ if (ExternalParser::canParse(contentType))
+ {
+ if (externalParser)
+ {
+ delete externalParser;
+ }
+ externalParser = new ExternalParser(contentType);
+ parsable = externalParser;
+ }
+ else if (mystrncasecmp((char*)contentType, "text/html", 9) == 0)
+ {
+ if (!html)
+ html = new HTML();
+ parsable = html;
+ }
+ else if (mystrncasecmp((char*)contentType, "text/plain", 10) == 0)
+ {
+ if (!plaintext)
+ plaintext = new Plaintext();
+ parsable = plaintext;
+ }
+ else if (mystrncasecmp((char *)contentType, "text/css", 8) == 0)
+ {
+ return NULL;
+ }
+ else if (mystrncasecmp((char *)contentType, "text/", 5) == 0)
+ {
+ if (!plaintext)
+ plaintext = new Plaintext();
+ parsable = plaintext;
+ if (debug > 1)
+ {
+ cout << '"' << contentType <<
+ "\" not a recognized type. Assuming text/plain\n";
+ }
+ }
+ else
+ {
+ if (debug > 1)
+ {
+ cout << '"' << contentType <<
+ "\" not a recognized type. Ignoring\n";
+ }
+ return NULL;
+ }
+
+ parsable->setContents(contents.get(), contents.length());
+ return parsable;
+}
+
+
+int Document::ShouldWeRetry(Transport::DocStatus DocumentStatus)
+{
+
+ if (DocumentStatus == Transport::Document_connection_down)
+ return 1;
+
+ if (DocumentStatus == Transport::Document_no_connection)
+ return 1;
+
+ if (DocumentStatus == Transport::Document_no_header)
+ return 1;
+
+ return 0;
+}
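Document.cc above drives most of its behaviour from configuration attributes: max_doc_size and max_retries for the retrieval buffer and retry loop, http_proxy, http_proxy_exclude and http_proxy_authorization for proxying, authorization for the request credentials, and bad_local_extensions for RetrieveLocal(). As a rough sketch only, a configuration fragment exercising those attributes could look like the lines below; the host names, credentials and extension list are invented examples, not defaults shipped with this package.

    # Hypothetical htdig.conf fragment -- illustrative values only
    max_doc_size:              100000
    max_retries:               2
    http_proxy:                http://proxy.example.com:3128/
    http_proxy_authorization:  proxyuser:proxypass
    http_proxy_exclude:        http://localhost/ http://intranet.example.com/
    authorization:             user:password
    bad_local_extensions:      .php .shtml .cgi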
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Document.h b/debian/htdig/htdig-3.2.0b6/htdig/Document.h
new file mode 100644
index 00000000..215897c4
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Document.h
@@ -0,0 +1,138 @@
+//
+// Document.h
+//
+// Document: This class holds everything there is to know about a document.
+// The actual contents of the document may or may not be present at
+// all times for memory conservation reasons.
+// The document can be told to retrieve its contents. This is done
+// with the Retrieve call. In case the retrieval causes a
+// redirect, the link is followed, but this process is done
+// only once (to prevent loops.) If the redirect didn't
+// work, Document_not_found is returned.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Document.h,v 1.19 2004/05/28 13:15:14 lha Exp $
+//
+//
+#ifndef _Document_h_
+#define _Document_h_
+
+#include "Parsable.h"
+#include "Object.h"
+#include "URL.h"
+#include "htString.h"
+#include "StringList.h"
+#include "Transport.h"
+#include "HtHTTP.h"
+#include "HtFile.h"
+#include "HtFTP.h"
+#include "HtNNTP.h"
+#include "ExternalTransport.h"
+#include "Server.h"
+
+
+class Connection;
+
+
+class Document : public Object
+{
+public:
+ //
+ // Construction/Destruction
+ //
+ Document(char *url = 0, int max_size = 0);
+ ~Document();
+
+ //
+ // Interface to the document.
+ //
+ void Reset();
+ int Length() {return document_length;}
+ int ContentLength() {return contentLength;}
+ int StoredLength() {return contents.length();}
+ char *Contents() {return contents;}
+ void Contents(char *s) {contents = s; document_length = contents.length();}
+ char *ContentType() {return contentType.get();}
+
+ //
+ // In case the retrieval process went through a redirect process,
+ // the new url can be gotten using the following call
+ //
+ char *Redirected() {return redirected_to;}
+ URL *Url() {return url;}
+ void Url(const String &url);
+ void Referer(const String &url);
+ time_t ModTime() {return modtime.GetTime_t();}
+
+ Transport::DocStatus Retrieve(Server *server, HtDateTime date);
+ Transport::DocStatus RetrieveLocal(HtDateTime date, StringList *filenames);
+
+ //
+ // Return an appropriate parsable object for the document type.
+ //
+ Parsable *getParsable();
+
+ //
+ // Set the username and password to be used in any requests
+ //
+ void setUsernamePassword(const String& credentials)
+ { authorization = credentials;}
+
+ void setProxyUsernamePassword(const String& credentials)
+ { proxy_authorization = credentials;}
+
+ HtHTTP *GetHTTPHandler() const { return HTTPConnect; }
+
+private:
+ enum
+ {
+ Header_ok,
+ Header_not_found,
+ Header_not_changed,
+ Header_redirect,
+ Header_not_text,
+ Header_not_authorized
+ };
+
+ URL *url;
+ URL *proxy;
+ URL *referer;
+ String contents;
+ String redirected_to;
+ String contentType;
+ String authorization;
+ String proxy_authorization;
+ int contentLength;
+ int document_length;
+ HtDateTime modtime;
+ int max_doc_size;
+ int num_retries;
+
+ int UseProxy();
+
+ Transport *transportConnect;
+ HtHTTP *HTTPConnect;
+ HtHTTP *HTTPSConnect;
+ HtFile *FileConnect;
+ HtFTP *FTPConnect;
+ HtNNTP *NNTPConnect;
+ ExternalTransport *externalConnect;
+
+
+ ///////
+ // Tell us if we should retry to retrieve an URL depending on
+ // the first returned document status
+ ///////
+
+ int ShouldWeRetry(Transport::DocStatus DocumentStatus);
+
+};
+
+#endif
+
+
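Document.h above is the whole public surface the retriever needs: set a URL (and optionally a referer), call Retrieve() with the Server that owns the connection settings, then hand the contents to the parser returned by getParsable(). The helper below is only an illustrative sketch of that sequence; the function name is invented, it assumes an already-initialised HtConfiguration, Server and Retriever, and htdig itself performs the equivalent steps inside Retriever.cc (added later in this commit).

    // Hypothetical usage sketch -- not part of the committed sources.
    #include "Document.h"
    #include "Retriever.h"

    static int index_one_url(Server &server, Retriever &retriever, const String &u)
    {
        Document doc;                    // contents buffer sized from max_doc_size
        doc.Url(u);                      // also picks up proxy and authorization info
        HtDateTime last_seen;            // date used for the If-Modified-Since check
        Transport::DocStatus status = doc.Retrieve(&server, last_seen);
        if (status != Transport::Document_ok)
            return -1;                   // not found, not changed, redirect, error, ...
        Parsable *parser = doc.getParsable();   // HTML, Plaintext or an external parser
        if (parser)
            parser->parse(retriever, *doc.Url());
        return 0;
    }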
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc
new file mode 100644
index 00000000..d967ba0b
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc
@@ -0,0 +1,614 @@
+//
+// ExternalParser.cc
+//
+// ExternalParser: Implementation of ExternalParser
+// Allows external programs to parse unknown document formats.
+// The parser is expected to return the document in a
+// specific format. The format is documented
+// in http://www.htdig.org/attrs.html#external_parser
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: ExternalParser.cc,v 1.29 2004/05/28 13:15:14 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "ExternalParser.h"
+#include "HTML.h"
+#include "Plaintext.h"
+#include "htdig.h"
+#include "htString.h"
+#include "QuotedStringList.h"
+#include "URL.h"
+#include "Dictionary.h"
+#include "good_strtok.h"
+
+#include <ctype.h>
+#include <stdio.h>
+
+#ifndef _MSC_VER /* _WIN32 */
+#include <unistd.h>
+#endif
+
+#include <stdlib.h>
+#ifdef HAVE_WAIT_H
+#include <wait.h>
+#elif HAVE_SYS_WAIT_H
+#include <sys/wait.h>
+#endif
+
+#ifdef _MSC_VER /* _WIN32 */
+#include <process.h>
+#endif
+
+
+#include "defaults.h"
+
+static Dictionary *parsers = 0;
+static Dictionary *toTypes = 0;
+extern String configFile;
+
+//*****************************************************************************
+// ExternalParser::ExternalParser(char *contentType)
+//
+ExternalParser::ExternalParser(char *contentType)
+{
+ String mime;
+ int sep;
+
+ if (canParse(contentType))
+ {
+ String mime = contentType;
+ mime.lowercase();
+ sep = mime.indexOf(';');
+ if (sep != -1)
+ mime = mime.sub(0, sep).get();
+
+ currentParser = ((String *)parsers->Find(mime))->get();
+ }
+ ExternalParser::contentType = contentType;
+}
+
+
+//*****************************************************************************
+// ExternalParser::~ExternalParser()
+//
+ExternalParser::~ExternalParser()
+{
+}
+
+
+//*****************************************************************************
+// int ExternalParser::readLine(FILE *in, String &line)
+//
+int
+ExternalParser::readLine(FILE *in, String &line)
+{
+ char buffer[2048];
+ int length;
+
+ line = 0; // read(in, buffer, sizeof(buffer)
+ while (fgets(buffer, sizeof(buffer), in))
+ {
+ length = strlen(buffer);
+ if (buffer[length - 1] == '\n')
+ {
+ //
+ // A full line has been read. Return it.
+ //
+ line << buffer;
+ line.chop('\n');
+ return 1;
+ }
+ else
+ {
+ //
+ // Only a partial line was read. Append it to the line
+ // and read some more.
+ //
+ line << buffer;
+ }
+ }
+ return line.length() > 0;
+}
+
+
+//*****************************************************************************
+// int ExternalParser::canParse(char *contentType)
+//
+int
+ExternalParser::canParse(char *contentType)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ int sep;
+
+ if (!parsers)
+ {
+ parsers = new Dictionary();
+ toTypes = new Dictionary();
+
+ QuotedStringList qsl(config->Find("external_parsers"), " \t");
+ String from, to;
+ int i;
+
+ for (i = 0; qsl[i]; i += 2)
+ {
+ from = qsl[i];
+ to = "";
+ sep = from.indexOf("->");
+ if (sep != -1)
+ {
+ to = from.sub(sep+2).get();
+ from = from.sub(0, sep).get();
+ }
+ from.lowercase();
+ sep = from.indexOf(';');
+ if (sep != -1)
+ from = from.sub(0, sep).get();
+
+ parsers->Add(from, new String(qsl[i + 1]));
+ toTypes->Add(from, new String(to));
+ }
+ }
+
+ String mime = contentType;
+ mime.lowercase();
+ sep = mime.indexOf(';');
+ if (sep != -1)
+ mime = mime.sub(0, sep).get();
+ return parsers->Exists(mime);
+}
+
+//*****************************************************************************
+// void ExternalParser::parse(Retriever &retriever, URL &base)
+//
+void
+ExternalParser::parse(Retriever &retriever, URL &base)
+{
+// NEAL - ENABLE/REWRITE THIS ASAP FOR WIN32
+#ifndef _MSC_VER /* _WIN32 */
+ HtConfiguration* config= HtConfiguration::config();
+ if (contents == 0 || contents->length() == 0 ||
+ currentParser.length() == 0)
+ {
+ return;
+ }
+
+ //
+ // Write the contents to a temporary file.
+ //
+ String path = getenv("TMPDIR");
+ int fd;
+ if (path.length() == 0)
+ path = "/tmp";
+#ifndef HAVE_MKSTEMP
+ path << "/htdext." << getpid(); // This is unfortunately predictable
+
+#ifdef O_BINARY
+ fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL|O_BINARY);
+#else
+ fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL);
+#endif
+#else
+ path << "/htdex.XXXXXX";
+ fd = mkstemp((char*)path);
+ // can we force binary mode somehow under Cygwin, if it has mkstemp?
+#endif
+ if (fd < 0)
+ {
+ if (debug)
+ cout << "External parser error: Can't create temp file "
+ << (char *)path << endl;
+ return;
+ }
+
+ write(fd, contents->get(), contents->length());
+ close(fd);
+
+// unsigned int minimum_word_length = config->Value("minimum_word_length", 3);
+ String line;
+ char *token1, *token2, *token3;
+ int loc = 0, hd = 0;
+ URL url;
+ String mime = contentType;
+ mime.lowercase();
+ int sep = mime.indexOf(';');
+ if (sep != -1)
+ mime = mime.sub(0, sep).get();
+ String convertToType = ((String *)toTypes->Find(mime))->get();
+ int get_hdr = (convertToType.nocase_compare("user-defined") == 0);
+ int get_file = (convertToType.length() != 0);
+ String newcontent;
+
+ StringList cpargs(currentParser);
+ char **parsargs = new char * [cpargs.Count() + 5];
+ int argi;
+ for (argi = 0; argi < cpargs.Count(); argi++)
+ parsargs[argi] = (char *)cpargs[argi];
+ parsargs[argi++] = path.get();
+ parsargs[argi++] = contentType.get();
+ parsargs[argi++] = (char *)base.get().get();
+ parsargs[argi++] = configFile.get();
+ parsargs[argi++] = 0;
+
+ int stdout_pipe[2];
+ int fork_result = -1;
+ int fork_try;
+
+ if (pipe(stdout_pipe) == -1)
+ {
+ if (debug)
+ cout << "External parser error: Can't create pipe!" << endl;
+ unlink((char*)path);
+ delete [] parsargs;
+ return;
+ }
+
+ for (fork_try = 4; --fork_try >= 0;)
+ {
+ fork_result = fork(); // Fork so we can execute in the child process
+ if (fork_result != -1)
+ break;
+ if (fork_try)
+ sleep(3);
+ }
+ if (fork_result == -1)
+ {
+ if (debug)
+ cout << "Fork Failure in ExternalParser" << endl;
+ unlink((char*)path);
+ delete [] parsargs;
+ return;
+ }
+
+ if (fork_result == 0) // Child process
+ {
+ close(STDOUT_FILENO); // Close handle STDOUT to replace with pipe
+ dup(stdout_pipe[1]);
+ close(stdout_pipe[0]);
+ close(stdout_pipe[1]);
+ close(STDIN_FILENO); // Close STDIN to replace with file
+ open((char*)path, O_RDONLY);
+
+ // Call External Parser
+ execv(parsargs[0], parsargs);
+
+ exit(EXIT_FAILURE);
+ }
+
+ // Parent Process
+ delete [] parsargs;
+ close(stdout_pipe[1]); // Close STDOUT for writing
+#ifdef O_BINARY
+ FILE *input = fdopen(stdout_pipe[0], "rb");
+#else
+ FILE *input = fdopen(stdout_pipe[0], "r");
+#endif
+ if (input == NULL)
+ {
+ if (debug)
+ cout << "Fdopen Failure in ExternalParser" << endl;
+ unlink((char*)path);
+ return;
+ }
+
+ while ((!get_file || get_hdr) && readLine(input, line))
+ {
+ if (get_hdr)
+ {
+ line.chop('\r');
+ if (line.length() == 0)
+ get_hdr = false;
+ else if (mystrncasecmp((char*)line, "content-type:", 13) == 0)
+ {
+ token1 = line.get() + 13;
+ while (*token1 && isspace(*token1))
+ token1++;
+ token1 = strtok(token1, "\n\t");
+ convertToType = token1;
+ }
+ continue;
+ }
+#ifdef O_BINARY
+ line.chop('\r');
+#endif
+ token1 = strtok(line, "\t");
+ if (token1 == NULL)
+ token1 = "";
+ token2 = NULL;
+ token3 = NULL;
+ switch (*token1)
+ {
+ case 'w': // word
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ token2 = strtok(0, "\t");
+ if (token2 != NULL)
+ token3 = strtok(0, "\t");
+ if (token1 != NULL && token2 != NULL && token3 != NULL &&
+ (loc = atoi(token2)) >= 0 &&
+ (hd = atoi(token3)) >= 0 && hd < 12)
+ retriever.got_word(token1, loc, hd);
+ else
+ cerr<< "External parser error: expected word in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 'u': // href
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ token2 = strtok(0, "\t");
+ if (token1 != NULL && token2 != NULL)
+ {
+ url.parse(token1);
+ url.hopcount(base.hopcount() + 1);
+ retriever.got_href(url, token2);
+ }
+ else
+ cerr<< "External parser error: expected URL in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 't': // title
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ retriever.got_title(token1);
+ else
+ cerr<< "External parser error: expected title in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 'h': // head
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ retriever.got_head(token1);
+ else
+ cerr<< "External parser error: expected text in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 'a': // anchor
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ retriever.got_anchor(token1);
+ else
+ cerr<< "External parser error: expected anchor in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 'i': // image url
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ retriever.got_image(token1);
+ else
+ cerr<< "External parser error: expected image URL in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 'm': // meta
+ {
+ // Using good_strtok means we can accept empty
+ // fields.
+ char *httpEquiv = good_strtok(token1+2, '\t');
+ char *name = good_strtok(0, '\t');
+ char *content = good_strtok(0, '\t');
+
+ if (httpEquiv != NULL && name != NULL && content != NULL)
+ {
+ // It would be preferable if we could share
+ // this part with HTML.cc, but it has other
+ // chores too, and I do not see a point where to
+ // split it up to get a common shared function
+ // (or class). This should not stop anybody from
+ // finding a better solution.
+ // For now, there is duplicated code.
+ static StringMatch *keywordsMatch = 0;
+ if (!keywordsMatch)
+ {
+ StringList kn(config->Find("keywords_meta_tag_names"), " \t");
+ keywordsMatch = new StringMatch();
+ keywordsMatch->IgnoreCase();
+ keywordsMatch->Pattern(kn.Join('|'));
+ }
+ static StringMatch *descriptionMatch = 0;
+ if (!descriptionMatch)
+ {
+ StringList dn(config->Find("description_meta_tag_names"), " \t");
+ descriptionMatch = new StringMatch();
+ descriptionMatch->IgnoreCase();
+ descriptionMatch->Pattern(dn.Join('|'));
+ }
+ static StringMatch *metadatetags = 0;
+ if (!metadatetags)
+ {
+ metadatetags = new StringMatch();
+ metadatetags->IgnoreCase();
+            metadatetags->Pattern("date|dc.date|dc.date.created|dc.date.modified");
+ }
+
+ // <URL:http://www.w3.org/MarkUp/html-spec/html-spec_5.html#SEC5.2.5>
+ // says that the "name" attribute defaults to
+ // the http-equiv attribute if empty.
+ if (*name == '\0')
+ name = httpEquiv;
+
+ if (*httpEquiv != '\0')
+ {
+ // <META HTTP-EQUIV=REFRESH case
+ if (mystrcasecmp(httpEquiv, "refresh") == 0
+ && *content != '\0')
+ {
+ char *q = (char*)mystrcasestr(content, "url");
+ if (q && *q)
+ {
+              q += 3; // skipping "URL"
+ while (*q && ((*q == '=') || isspace(*q))) q++;
+ char *qq = q;
+ while (*qq && (*qq != ';') && (*qq != '"') &&
+ !isspace(*qq))qq++;
+ *qq = 0;
+ URL href(q, base);
+ // I don't know why anyone would do this, but hey...
+ retriever.got_href(href, "");
+ }
+ }
+ }
+
+ //
+ // Now check for <meta name=... content=...> tags that
+ // fly with any reasonable DTD out there
+ //
+ if (*name != '\0' && *content != '\0')
+ {
+ if (keywordsMatch->CompareWord(name))
+ {
+ int wordindex = 1;
+ addKeywordString (retriever, content, wordindex);
+// // can this be merged with Parser::addKeywordString ?
+// char *w = strtok(content, " ,\t\r");
+// while (w)
+// {
+// if (strlen(w) >= minimum_word_length)
+// retriever.got_word(w, 1, 9);
+// w = strtok(0, " ,\t\r");
+// }
+ }
+ if (metadatetags->CompareWord(name) &&
+ config->Boolean("use_doc_date", 0))
+ {
+ retriever.got_time(content);
+ }
+ else if (mystrcasecmp(name, "author") == 0)
+ {
+ int wordindex = 1;
+ retriever.got_author(content);
+ addString (retriever, content, wordindex, 11);
+ }
+ else if (mystrcasecmp(name, "htdig-email") == 0)
+ {
+ retriever.got_meta_email(content);
+ }
+ else if (mystrcasecmp(name, "htdig-notification-date") == 0)
+ {
+ retriever.got_meta_notification(content);
+ }
+ else if (mystrcasecmp(name, "htdig-email-subject") == 0)
+ {
+ retriever.got_meta_subject(content);
+ }
+ else if (descriptionMatch->CompareWord(name)
+ && strlen(content) != 0)
+ {
+ //
+ // We need to do two things. First grab the description
+ //
+ String meta_dsc = content;
+
+ if (meta_dsc.length() > max_meta_description_length)
+ meta_dsc = meta_dsc.sub(0, max_meta_description_length).get();
+ if (debug > 1)
+ cout << "META Description: " << content << endl;
+ retriever.got_meta_dsc((char*)meta_dsc);
+
+ //
+ // Now add the words to the word list
+ // (slot 10 is the new slot for this)
+ //
+ int wordindex = 1;
+ addString (retriever, content, wordindex, 10);
+// // can this be merged with Parser::addString ?
+// char *w = strtok(content, " \t\r");
+// while (w)
+// {
+// if (strlen(w) >= minimum_word_length)
+// retriever.got_word(w, 1, 10);
+// w = strtok(0, " \t\r");
+// }
+ }
+ }
+ }
+ else
+ cerr<< "External parser error: expected metadata in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+ }
+
+ default:
+ cerr<< "External parser error: unknown field in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+ }
+ } // while(readLine)
+ if (get_file)
+ {
+ if (!canParse(convertToType) &&
+ mystrncasecmp((char*)convertToType, "text/", 5) != 0)
+ {
+ if (mystrcasecmp((char*)convertToType, "user-defined") == 0)
+ cerr << "External parser error: no Content-Type given\n";
+ else
+ cerr << "External parser error: can't parse Content-Type \""
+ << convertToType << "\"\n";
+ cerr << " URL: " << base.get() << "\n";
+ }
+ else
+ {
+ char buffer[2048];
+ int length;
+ int nbytes = config->Value("max_doc_size");
+ while (nbytes > 0 &&
+ (length = fread(buffer, 1, sizeof(buffer), input)) > 0)
+ {
+ nbytes -= length;
+ if (nbytes < 0)
+ length += nbytes;
+ newcontent.append(buffer, length);
+ }
+ }
+ }
+ fclose(input);
+ // close(stdout_pipe[0]); // This is closed for us by the fclose()
+ int rpid, status;
+ while ((rpid = wait(&status)) != fork_result && rpid != -1)
+ ;
+ unlink((char*)path);
+
+ if (newcontent.length() > 0)
+ {
+ static HTML *html = 0;
+ static Plaintext *plaintext = 0;
+ Parsable *parsable = 0;
+
+ contentType = convertToType;
+ if (canParse(contentType))
+ {
+ currentParser = ((String *)parsers->Find(contentType))->get();
+ parsable = this;
+ }
+ else if (mystrncasecmp((char*)contentType, "text/html", 9) == 0)
+ {
+ if (!html)
+ html = new HTML();
+ parsable = html;
+ }
+ else if (mystrncasecmp((char*)contentType, "text/plain", 10) == 0)
+ {
+ if (!plaintext)
+ plaintext = new Plaintext();
+ parsable = plaintext;
+ }
+ else
+ {
+ if (!plaintext)
+ plaintext = new Plaintext();
+ parsable = plaintext;
+ if (debug)
+ cout << "External parser error: \"" << contentType <<
+ "\" not a recognized type. Assuming text/plain\n";
+ }
+ parsable->setContents(newcontent.get(), newcontent.length());
+ parsable->parse(retriever, base);
+ }
+#endif //ifndef _MSC_VER /* _WIN32 */
+}
+
+
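The record stream consumed by ExternalParser::parse() above is easiest to see from the producing side. The sketch below is a hypothetical minimal external parser (the name parse_foo, the text/x-foo type and the install path are invented); it would be registered with an attribute such as "external_parsers: text/x-foo /usr/local/bin/parse_foo", and htdig invokes it with the temporary copy of the document, its content-type, its URL and the configuration file as arguments.

    // parse_foo.cc -- hypothetical external parser, not part of this commit.
    // Invoked as:  parse_foo <tmpfile> <content-type> <URL> <config-file>
    // Writes one TAB-separated record per line to stdout, matching the switch
    // statement in ExternalParser::parse():
    //   t <title>                    document title
    //   h <text>                     text for the head/excerpt
    //   w <word> <position> <flag>   an indexable word; flag must be 0-11
    //   u <URL> <description>        an outgoing link
    #include <cstdio>

    int main(int argc, char **argv)
    {
        if (argc < 5)
            return 1;                // htdig always appends the four arguments
        std::printf("t\tExample title\n");
        std::printf("h\tText that should appear in the excerpt\n");
        std::printf("w\texample\t0\t0\n");
        std::printf("u\thttp://www.htdig.org/\tht://Dig home page\n");
        return 0;
    }

Records starting with a (anchor), i (image URL) and m (http-equiv, name and content of a meta tag) are accepted as well. When the external_parsers entry maps the type with "->" to another type, parse() instead treats the program as a converter: it reads the converted document from the pipe (preceded by a Content-Type header line in the user-defined case) and hands it to the matching internal parser.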
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.h b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.h
new file mode 100644
index 00000000..4c7579a1
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.h
@@ -0,0 +1,58 @@
+//
+// ExternalParser.h
+//
+// ExternalParser: Allows external programs to parse unknown document formats.
+// The parser is expected to return the document in a
+// specific format. The format is documented
+// in http://www.htdig.org/attrs.html#external_parser
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: ExternalParser.h,v 1.8 2004/05/28 13:15:14 lha Exp $
+//
+
+#ifndef _ExternalParser_h_
+#define _ExternalParser_h_
+
+#include "Parsable.h"
+#include "htString.h"
+
+#include <stdio.h>
+
+class URL;
+
+
+class ExternalParser : public Parsable
+{
+public:
+ //
+ // Construction/Destruction
+ //
+ ExternalParser(char *contentType);
+ virtual ~ExternalParser();
+
+ //
+ // Main parser interface.
+ //
+ virtual void parse(Retriever &retriever, URL &);
+
+ //
+ // Check if the given contentType has an external parser associated
+ // with it
+ //
+ static int canParse(char *contentType);
+
+private:
+ String currentParser;
+ String contentType;
+
+ int readLine(FILE *, String &);
+};
+
+#endif
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.cc b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.cc
new file mode 100644
index 00000000..c418e62c
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.cc
@@ -0,0 +1,376 @@
+//
+// ExternalTransport.cc
+//
+// ExternalTransport: Allows external programs to retrieve given URLs with
+// unknown protocols.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: ExternalTransport.cc,v 1.9 2004/05/28 13:15:14 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "ExternalTransport.h"
+#include "htdig.h"
+#include "QuotedStringList.h"
+#include "URL.h"
+#include "Dictionary.h"
+#include "good_strtok.h"
+
+#include <ctype.h>
+#include <stdio.h>
+
+#ifndef _MSC_VER /* _WIN32 */
+#include <unistd.h>
+#endif
+
+#include <stdlib.h>
+#ifdef HAVE_WAIT_H
+#include <wait.h>
+#elif HAVE_SYS_WAIT_H
+#include <sys/wait.h>
+#endif
+
+#include "defaults.h"
+
+static Dictionary *handlers = 0;
+static Dictionary *toTypes = 0;
+extern String configFile;
+
+//*****************************************************************************
+// ExternalTransport::ExternalTransport(char *protocol)
+//
+ExternalTransport::ExternalTransport(const String &protocol)
+{
+ if (canHandle(protocol))
+ {
+ _Handler = ((String *)handlers->Find(protocol))->get();
+ }
+ ExternalTransport::_Protocol = protocol;
+ _Response = new ExternalTransport_Response;
+}
+
+
+//*****************************************************************************
+// ExternalTransport::~ExternalTransport()
+//
+ExternalTransport::~ExternalTransport()
+{
+ if (_Response)
+ {
+ delete _Response;
+ }
+}
+
+
+//*****************************************************************************
+// int ExternalTransport::canHandle(const String &protocol)
+//
+int
+ExternalTransport::canHandle(const String &protocol)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ if (!handlers)
+ {
+ handlers = new Dictionary();
+ toTypes = new Dictionary();
+
+ QuotedStringList qsl(config->Find("external_protocols"), " \t");
+ String from, to;
+ int i;
+ int sep;
+
+ for (i = 0; qsl[i]; i += 2)
+ {
+ from = qsl[i];
+ to = "";
+ sep = from.indexOf("->");
+ if (sep != -1)
+ {
+ to = from.sub(sep+2).get();
+ from = from.sub(0, sep).get();
+ }
+
+ // Recognise service specified as "https://" rather than "https"
+ sep = from.indexOf(":");
+ if (sep != -1)
+ from = from.sub(0, sep).get();
+
+ handlers->Add(from, new String(qsl[i + 1]));
+ toTypes->Add(from, new String(to));
+ }
+ }
+ return handlers->Exists(protocol);
+}
+
+
+//*****************************************************************************
+// void ExternalTransport::SetConnection(URL *u)
+//
+void ExternalTransport::SetConnection (URL *u)
+{
+ // Grab the actual URL to pass to the handler
+ _URL = *u;
+
+ // OK, now call the parent method to make sure everything else is set up.
+ Transport::SetConnection (u->host(), u->port());
+}
+
+
+//*****************************************************************************
+// DocStatus ExternalTransport::Request()
+//
+Transport::DocStatus ExternalTransport::Request()
+{
+// NEAL - ENABLE/REWRITE THIS ASAP FOR WIN32
+#ifndef _MSC_VER /* _WIN32 */
+ //
+ // Start the external handler, passing the protocol, URL and config file
+ // as command arguments
+ //
+ StringList hargs(_Handler);
+ char **handlargs = new char * [hargs.Count() + 5];
+ int argi;
+ for (argi = 0; argi < hargs.Count(); argi++)
+ handlargs[argi] = (char *)hargs[argi];
+ handlargs[argi++] = _Protocol.get();
+ handlargs[argi++] = (char *)_URL.get().get();
+ handlargs[argi++] = configFile.get();
+ handlargs[argi++] = 0;
+
+ int stdout_pipe[2];
+ int fork_result = -1;
+ int fork_try;
+
+ if (pipe(stdout_pipe) == -1)
+ {
+ if (debug)
+ cerr << "External transport error: Can't create pipe!" << endl;
+ delete [] handlargs;
+ return GetDocumentStatus(_Response);
+ }
+
+ for (fork_try = 4; --fork_try >= 0;)
+ {
+ fork_result = fork(); // Fork so we can execute in the child process
+ if (fork_result != -1)
+ break;
+ if (fork_try)
+ sleep(3);
+ }
+ if (fork_result == -1)
+ {
+ if (debug)
+ cerr << "Fork Failure in ExternalTransport" << endl;
+ delete [] handlargs;
+ return GetDocumentStatus(_Response);
+ }
+
+ if (fork_result == 0) // Child process
+ {
+ close(STDOUT_FILENO); // Close handle STDOUT to replace with pipe
+ dup(stdout_pipe[1]);
+ close(stdout_pipe[0]);
+ close(stdout_pipe[1]);
+ // not really necessary, and may pose Cygwin incompatibility...
+ //close(STDIN_FILENO); // Close STDIN to replace with null dev.
+ //open("/dev/null", O_RDONLY);
+
+ // Call External Transport Handler
+ execv(handlargs[0], handlargs);
+
+ exit(EXIT_FAILURE);
+ }
+
+ // Parent Process
+ delete [] handlargs;
+ close(stdout_pipe[1]); // Close STDOUT for writing
+ FILE *input = fdopen(stdout_pipe[0], "r");
+ if (input == NULL)
+ {
+ if (debug)
+ cerr << "Fdopen Failure in ExternalTransport" << endl;
+ return GetDocumentStatus(_Response);
+ }
+
+ // Set up a response for this request
+ _Response->Reset();
+ // We just accessed the document
+ _Response->_access_time = new HtDateTime();
+ _Response->_access_time->SettoNow();
+
+
+ // OK, now parse the stuff we got back from the handler...
+ String line;
+ char *token1;
+ int in_header = 1;
+
+ while (in_header && readLine(input, line))
+ {
+ line.chop('\r');
+ if (line.length() > 0 && debug > 2)
+ cout << "Header line: " << line << endl;
+ token1 = strtok(line, "\t");
+ if (token1 == NULL)
+ {
+ token1 = "";
+ in_header = 0;
+ break;
+ }
+
+ switch (*token1)
+ {
+ case 's': // status code
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ _Response->_status_code = atoi(token1);
+ else
+ cerr<< "External transport error: expected status code in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
+ break;
+
+ case 'r': // status reason
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ _Response->_reason_phrase = token1;
+ else
+ cerr<< "External transport error: expected status reason in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
+ break;
+
+ case 'm': // modification time
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ _Response->_modification_time= NewDate(token1); // Hopefully we can grok it...
+ else
+ cerr<< "External transport error: expected modification time in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
+ break;
+
+ case 't': // Content-Type
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ _Response->_content_type = token1;
+ else
+ cerr<< "External transport error: expected content-type in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
+ break;
+
+ case 'l': // Content-Length
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ _Response->_content_length = atoi(token1);
+ else
+ cerr<< "External transport error: expected content-length in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
+ break;
+
+ case 'u': // redirect target
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ _Response->_location = token1;
+ else
+ cerr<< "External transport error: expected URL in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
+ break;
+
+ default:
+ cerr<< "External transport error: unknown field in line "<<line<<"\n" << " URL: " << _URL.get() << "\n";
+ break;
+ }
+ }
+
+ // OK, now we read in the rest of the document as contents...
+ _Response->_contents = 0;
+ char docBuffer[8192];
+ int bytesRead;
+
+ while ((bytesRead = fread(docBuffer, 1, sizeof(docBuffer), input)) > 0)
+ {
+ if (debug > 2)
+ cout << "Read " << bytesRead << " from document\n";
+ if (_Response->_contents.length() + bytesRead > _max_document_size)
+ bytesRead = _max_document_size - _Response->_contents.length();
+ _Response->_contents.append(docBuffer, bytesRead);
+ if (_Response->_contents.length() >= _max_document_size)
+ break;
+ }
+ _Response->_document_length = _Response->_contents.length();
+ fclose(input);
+ // close(stdout_pipe[0]); // This is closed for us by the fclose()
+
+ int rpid, status;
+ while ((rpid = wait(&status)) != fork_result && rpid != -1)
+ ;
+
+#endif
+
+ return GetDocumentStatus(_Response);
+}
+
+
+//*****************************************************************************
+// private
+// DocStatus ExternalTransport::GetDocumentStatus(ExternalTransport_Response *r)
+//
+Transport::DocStatus ExternalTransport::GetDocumentStatus(ExternalTransport_Response *r)
+{
+ // The default is 'not found' if we can't figure it out...
+ DocStatus returnStatus = Document_not_found;
+ int statuscode = r->GetStatusCode();
+
+ if (statuscode == 200)
+ {
+ returnStatus = Document_ok; // OK
+ // Is it parsable?
+ }
+
+ else if (statuscode > 200 && statuscode < 300)
+ returnStatus = Document_ok; // Successful 2xx
+ else if (statuscode == 304)
+ returnStatus = Document_not_changed; // Not modified
+ else if (statuscode > 300 && statuscode < 400)
+ returnStatus = Document_redirect; // Redirection 3xx
+ else if (statuscode == 401)
+ returnStatus = Document_not_authorized; // Unauthorized
+
+ return returnStatus;
+}
+
+
+//*****************************************************************************
+// private
+// int ExternalTransport::readLine(FILE *in, String &line)
+//
+int
+ExternalTransport::readLine(FILE *in, String &line)
+{
+ char buffer[2048];
+ int length;
+
+ line = 0;
+ while (fgets(buffer, sizeof(buffer), in))
+ {
+ length = strlen(buffer);
+ if (buffer[length - 1] == '\n')
+ {
+ //
+ // A full line has been read. Return it.
+ //
+ line << buffer;
+ line.chop('\n');
+ return 1;
+ }
+ else
+ {
+ //
+ // Only a partial line was read. Append it to the line
+ // and read some more.
+ //
+ line << buffer;
+ }
+ }
+ return line.length() > 0;
+}
+
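ExternalTransport follows the same pattern for whole protocols rather than document formats. A handler registered with an attribute such as "external_protocols: myproto /usr/local/bin/handle_myproto" (the scheme and path are invented here) is started with the protocol name, the URL and the configuration file as arguments, and Request() above parses its standard output: tab-separated header records, an empty line, then the raw document. A hypothetical minimal handler:

    // handle_myproto.cc -- hypothetical transport handler, not part of this commit.
    // Invoked as:  handle_myproto <protocol> <URL> <config-file>
    // Header records understood by ExternalTransport::Request():
    //   s <status code>     r <reason phrase>      t <content-type>
    //   l <content-length>  m <modification time>  u <redirect URL>
    #include <cstdio>
    #include <cstring>

    int main(int argc, char **argv)
    {
        if (argc < 4)
            return 1;
        const char *body = "Hello from a custom protocol handler.\n";
        std::printf("s\t200\n");
        std::printf("r\tOK\n");
        std::printf("t\ttext/plain\n");
        std::printf("l\t%u\n", (unsigned)std::strlen(body));
        std::printf("\n");                // the empty line ends the header block
        std::fputs(body, stdout);         // everything that follows is the document
        return 0;
    }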
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.h b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.h
new file mode 100644
index 00000000..4c946a96
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.h
@@ -0,0 +1,88 @@
+//
+// ExternalTransport.h
+//
+// ExternalTransport: Allows external programs to retrieve given URLs with
+// unknown protocols.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: ExternalTransport.h,v 1.5 2004/05/28 13:15:14 lha Exp $
+//
+
+#ifndef _ExternalTransport_h_
+#define _ExternalTransport_h_
+
+#include "Transport.h"
+#include "htString.h"
+
+#include <stdio.h>
+
+// First we must declare a derived Transport_Response class
+// This requires declaring the main class in advance
+class ExternalTransport;
+class ExternalTransport_Response : public Transport_Response
+{
+ friend class ExternalTransport;
+
+ // Nothing else... We just want it so we can access the protected fields
+};
+
+// Right, now we get on with the show...
+class ExternalTransport : public Transport
+{
+public:
+ //
+ // Construction/Destruction
+ //
+ ExternalTransport(const String &protocol);
+ virtual ~ExternalTransport();
+
+
+ //
+ // Check if the given protocol has a handler
+ //
+ static int canHandle(const String &protocol);
+
+ // Setting connections is obviously a bit different than the base class
+ // from a URL pointer
+ void SetConnection (URL *u);
+
+ // from a URL object
+ void SetConnection (URL &u)
+ { SetConnection (&u); }
+
+ // Make the request
+ DocStatus Request();
+
+ // Get the response or the status
+ Transport_Response *GetResponse() { return _Response; }
+ DocStatus GetDocumentStatus() { return GetDocumentStatus(_Response); }
+
+
+private:
+ // The command to handle the current protocol
+ String _Handler;
+ // And the current protocol
+ String _Protocol;
+
+ // The URL to Request()
+ URL _URL;
+
+ // The result of the Request()
+ ExternalTransport_Response *_Response;
+
+
+
+ // Private helper to read in the result from the handler
+ int readLine(FILE *, String &);
+ // Work out the DocStatus from the HTTP-style status codes
+ DocStatus GetDocumentStatus(ExternalTransport_Response *r);
+};
+
+#endif
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc b/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc
new file mode 100644
index 00000000..56e1d00f
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc
@@ -0,0 +1,1002 @@
+//
+// HTML.cc
+//
+// HTML: Class to parse HTML documents and return useful information
+// to the Retriever
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: HTML.cc,v 1.76 2004/06/09 17:35:34 grdetil Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "htdig.h"
+#include "HTML.h"
+#include "HtSGMLCodec.h"
+#include "HtConfiguration.h"
+#include "StringMatch.h"
+#include "StringList.h"
+#include "QuotedStringList.h"
+#include "URL.h"
+#include "WordType.h"
+
+#include <ctype.h>
+
+#include "defaults.h"
+
+// Flags for noindex & nofollow, indicating who turned indexing off/on...
+#define TAGnoindex 0x0001
+#define TAGstyle 0x0002
+#define TAGscript 0x0004
+#define TAGmeta_htdig_noindex 0x0008
+#define TAGmeta_robots 0x0010
+
+static StringMatch tags;
+static StringMatch nobreaktags;
+static StringMatch spacebeforetags;
+static StringMatch spaceaftertags;
+static StringMatch metadatetags;
+static StringMatch descriptionMatch;
+static StringMatch keywordsMatch;
+//static int keywordsCount;
+//static int max_keywords;
+
+
+//*****************************************************************************
+// ADDSPACE() macro, to insert space where needed in various strings
+// Reduces all multiple whitespace to a single space
+
+#define ADDSPACE(in_space) \
+ if (!in_space) \
+ { \
+ if (in_title && !noindex) \
+ { \
+ title << ' '; \
+ } \
+ if (in_ref && description.length() < max_description_length) \
+ { \
+ description << ' '; \
+ } \
+ if (head.length() < max_head_length && !noindex && !in_title) \
+ { \
+ head << ' '; \
+ } \
+ in_space = 1; \
+ }
+
+
+//*****************************************************************************
+// HTML::HTML()
+//
+HTML::HTML() :
+ skip_start (HtConfiguration::config()->Find("noindex_start")," \t"),
+ skip_end (HtConfiguration::config()->Find("noindex_end"), " \t")
+{
+ HtConfiguration *config= HtConfiguration::config();
+ //
+ // Initialize the patterns that we will try to match.
+	// The tags Match object is used to match the tag names that do_tag() handles.
+ //
+ tags.IgnoreCase();
+ tags.Pattern("title|/title|a|/a|h1|h2|h3|h4|h5|h6|/h1|/h2|/h3|/h4|/h5|/h6|noindex|/noindex|img|li|meta|frame|area|base|embed|object|link|style|/style|script|/script");
+
+ // These tags don't cause a word break. They may also be in "tags" above,
+ // except for the "a" tag, which must be handled as a special case.
+ // Note that <sup> & <sub> should cause a word break.
+ nobreaktags.IgnoreCase();
+ nobreaktags.Pattern("font|/font|em|/em|strong|/strong|i|/i|b|/b|u|/u|tt|/tt|abbr|/abbr|code|/code|q|/q|samp|/samp|kbd|/kbd|var|/var|dfn|/dfn|cite|/cite|blink|/blink|big|/big|small|/small|s|/s");
+
+ // These tags, which may also be in "tags" above, cause word breaks and
+ // therefore cause space to be inserted before (or after) do_tag() is done.
+ spacebeforetags.IgnoreCase();
+ spacebeforetags.Pattern("title|h1|h2|h3|h4|h5|h6|address|blockquote|noindex|img|li|th|td|dt|dd|p|br|hr|center|spacer");
+ spaceaftertags.IgnoreCase();
+ spaceaftertags.Pattern("/title|/h1|/h2|/h3|/h4|/h5|/h6|/address|/blockquote");
+
+ // These are the name values of meta tags that carry date information.
+ metadatetags.IgnoreCase();
+ metadatetags.Pattern("date|dc.date|dc.date.created|dc.date.modified");
+
+ // These are the name values of meta tags that carry descriptions.
+ StringList descrNames(config->Find("description_meta_tag_names"), " \t");
+ descriptionMatch.IgnoreCase();
+ descriptionMatch.Pattern(descrNames.Join('|'));
+
+ // These are the name values of meta tags that carry keywords.
+ StringList keywordNames(config->Find("keywords_meta_tag_names"), " \t");
+ keywordsMatch.IgnoreCase();
+ keywordsMatch.Pattern(keywordNames.Join('|'));
+// (now in Parser)
+// max_keywords = config->Value("max_keywords", -1);
+// if (max_keywords < 0)
+// max_keywords = (int) ((unsigned int) ~1 >> 1);
+
+ // skip_start/end mark sections of text to be ignored by ht://Dig
+ // Make sure there are equal numbers of each, and warn of deprecated
+ // syntax.
+ if (skip_start.Count() > 1 || skip_end.Count() > 1)
+ {
+ if (skip_start.Count() != 0 && skip_end.Count() != 0)
+ {
+ // check for old-style start/end which allowed unquoted spaces
+ // (Check noindex_start/end for exactly one "<" or/followed-by
+ // exactly one ">", and no leading quotes.)
+ // Can someone think of a better (or simpler) check??
+ String noindex_end (config->Find ("noindex_end"));
+ char *first_left = strchr (noindex_end.get(), '<');
+ char *secnd_left = first_left ? strchr(first_left+1,'<') : (char*)0;
+ char *first_right= strchr (noindex_end.get(), '>');
+ char *secnd_right= first_right? strchr(first_right+1,'>'): (char*)0;
+ String noindex_start (config->Find ("noindex_start"));
+ char *first_lft = strchr (noindex_start.get(), '<');
+      char *secnd_lft = first_lft  ? strchr (first_lft +1,'<')  : (char*)0;
+ char *first_rght= strchr (noindex_start.get(), '>');
+      char *secnd_rght= first_rght ? strchr (first_rght+1,'>') : (char*)0;
+
+ if (((first_right && !secnd_right && first_right < first_left) ||
+ (first_left && !secnd_left && !first_right) ||
+ (first_rght && !secnd_rght && first_rght < first_lft) ||
+ (first_lft && !secnd_lft && !first_rght)) &&
+ noindex_end[0] != '\"' && noindex_start[0] != '\"')
+ {
+	 cout << "\nWarning: To allow multiple noindex_start/end patterns, patterns containing\nspaces should now be in quotation marks. (If the entries are intended to be\nmultiple patterns, this warning can be suppressed by placing the first pattern\nin quotes.)\n\n";
+ // Should we treat the patterns as if they had been quoted
+ // (as we assume was intended)?
+ }
+ }
+ }
+
+ // check each start has an end
+ if (skip_start.Count() < skip_end.Count())
+ {
+ cout << "Warning: " << skip_end.Count()
+ << " noindex_end patterns, but only " << skip_start.Count()
+ << " noindex_start patterns.\n";
+ } else
+ {
+ while (skip_start.Count () > skip_end.Count())
+ {
+ int missing = skip_end.Count() - 1;
+ skip_end.Add ((missing >= 0) ? skip_end [missing]
+ : "<!--/htdig_noindex-->");
+ cout << "Warning: Copying " << skip_end [missing+1]
+ << " as noindex_end match for " << skip_start [missing+1]
+ << endl;
+ }
+ }
+
+ word = 0;
+ href = 0;
+ title = 0;
+ description = 0;
+ head = 0;
+ meta_dsc = 0;
+ tag = 0;
+ in_title = 0;
+ in_ref = 0;
+ in_heading = 0;
+ base = 0;
+ noindex = 0;
+ nofollow = 0;
+// minimumWordLength = config->Value("minimum_word_length", 3);
+}
+
+
+//*****************************************************************************
+// HTML::~HTML()
+//
+HTML::~HTML()
+{
+}
+
+
+//*****************************************************************************
+// void HTML::parse(Retriever &retriever, URL &baseURL)
+// Parse the HTML document using the Retriever object for all the callbacks.
+// The HTML document contents are contained in the contents String.
+//
+void
+HTML::parse(Retriever &retriever, URL &baseURL)
+{
+ if (contents == 0 || contents->length() == 0)
+ return;
+
+ base = &baseURL;
+
+ //
+ // We have some variables which will contain the various items we
+ // are looking for
+ //
+ int wordindex = 1;
+ int in_space;
+ int in_punct;
+ String scratch, textified;
+ unsigned char *q, *start;
+ unsigned char *position = (unsigned char *) contents->get();
+ unsigned char *text = (unsigned char *)new char[contents->length()+1];
+ unsigned char *ptext = text;
+
+ keywordsCount = 0;
+ title = 0;
+ head = 0;
+ meta_dsc = 0;
+ noindex = 0;
+ nofollow = 0;
+ in_heading = 0;
+ in_title = 0;
+ in_ref = 0;
+ in_space = 0;
+ in_punct = 0;
+
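+   //
+   // First pass: copy the document into the "text" buffer, dropping
+   // noindex_start/end sections, comment and DTD declarations, and decoding
+   // SGML entities.  A decoded '<' is written out as the fake tag "<~>" so
+   // that the second pass below cannot mistake it for a real tag start.
+   //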
+ while (*position)
+ {
+
+ //
+ // Filter out section marked to be ignored for indexing.
+ // This can contain any HTML.
+ // On finding a noindex_start, skip to first occurrence of matching
+ // noindex_end. Any noindex_start within will be ignored.
+ //
+ int i;
+ for (i = 0; i < skip_start.Count(); i++)
+ {
+ if (mystrncasecmp((char *)position, skip_start[i],
+ ((String*)skip_start.Nth(i))->length()) == 0)
+ break; // break from this loop for "continue" below...
+ }
+ if (i < skip_start.Count()) // found a match;
+ {
+ q = (unsigned char*)mystrcasestr((char *)position, skip_end[i]);
+ if (!q)
+ *position = '\0'; // Rest of document will be skipped...
+ else
+ position = q + ((String*)skip_end.Nth(i))->length();
+ continue;
+ }
+ // end of noindex_start/end code
+
+
+ if (strncmp((char *)position, "<!", 2) == 0)
+ {
+ //
+ // Possible comment declaration (but could be DTD declaration!)
+ // A comment can contain other '<' and '>':
+ // we have to ignore complete comment declarations
+ // but of course also DTD declarations.
+ //
+ position += 2; // Get past declaration start
+ if (strncmp((char *)position, "--", 2) == 0)
+ {
+ // Found start of comment - now find the end
+ position += 2;
+ do
+ {
+ q = (unsigned char*)strstr((char *)position, "--");
+ if (!q)
+ {
+ *position = '\0';
+ break; // Rest of document seems to be a comment...
+ }
+ else
+ {
+ position = q + 2;
+ // Skip extra dashes after a badly formed comment
+ while (*position == '-')
+ position++;
+ // Skip whitespace after an individual comment
+ while (isspace(*position))
+ position++;
+ }
+ // if comment declaration hasn't ended, skip another comment
+ }
+ while (*position && *position != '>');
+ if (*position == '>')
+ {
+ position++; // End of comment declaration
+ }
+ }
+ else
+ {
+ // Not a comment declaration after all
+ // but possibly DTD: get to the end
+ q = (unsigned char*)strchr((char *)position, '>');
+ if (q)
+ {
+ position = q + 1;
+ // End of (whatever) declaration
+ }
+ else
+ {
+ *position = '\0'; // Rest of document is DTD?
+ }
+ }
+ continue;
+ }
+
+ if (*position == '<')
+ {
+ //
+ // Start of a tag. Since tags cannot be nested, we can simply
+ // search for the closing '>'
+ //
+ q = (unsigned char*)strchr((char *)position, '>');
+ if (q)
+ { // copy tag
+ while (position <= q)
+ *ptext++ = *position++;
+ }
+ else
+ { // copy rest of text, as tag does not end
+ while (*position)
+ *ptext++ = *position++;
+ }
+ }
+ else if (*position == '&')
+ {
+ q = (unsigned char*)strchr((char *)position, ';');
+ if (q && q <= position+10)
+ { // got ending, looks like valid SGML entity
+ scratch = 0;
+ scratch.append((char*)position, q+1 - position);
+ textified = HtSGMLCodec::instance()->encode(scratch);
+ if (textified[0] != '&' || textified.length() == 1)
+ { // it was decoded, copy it
+ position = (unsigned char *)textified.get();
+ while (*position)
+ {
+ if (*position == '<')
+ { // got a decoded &lt;, make a fake tag for it
+ // to avoid confusing it with real tag start
+ *ptext++ = '<';
+ *ptext++ = '~';
+ *ptext++ = '>';
+ position++;
+ }
+ else
+ *ptext++ = *position++;
+ }
+ position = q+1;
+ }
+ else // it wasn't decoded, copy '&', and rest will follow
+ *ptext++ = *position++;
+ }
+ else // not SGML entity, copy bare '&'
+ *ptext++ = *position++;
+ }
+ else
+ {
+ *ptext++ = *position++;
+ }
+ }
+ *ptext++ = '\0';
+
+ position = text;
+ start = position;
+
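+   //
+   // Second pass: walk the cleaned-up buffer, handing complete tags to
+   // do_tag() and feeding words and punctuation into the title, the link
+   // description, the "head" excerpt and the word list.
+   //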
+ while (*position)
+ {
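+      // A real tag starts here; "<~>" is the placeholder written by the
+      // first pass for a decoded '&lt;' and is treated as ordinary text.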
+ if (*position == '<' && (position[1] != '~' || position[2] != '>'))
+ {
+ //
+ // Start of a tag. Since tags cannot be nested, we can simply
+ // search for the closing '>'
+ //
+ q = (unsigned char*)strchr((char *)position, '>');
+ if (!q)
+ break; // Syntax error in the doc. Tag never ends.
+ position++;
+ if (noindex & TAGscript)
+ { // Special handling in case '<' is part of JavaScript code
+ while (isspace(*position))
+ position++;
+ if (mystrncasecmp((char *)position, "/script", 7) != 0)
+ continue;
+ }
+ tag = 0;
+ tag.append((char*)position, q - position);
+ while (isspace(*position))
+ position++;
+	 if ((!in_space && spacebeforetags.CompareWord((char *)position))
+	     || (!in_space && !in_punct && *position != '/'))
+ {
+ // These opening tags cause a space to be inserted
+ // before anything they insert.
+ // Tags processed here (i.e. not in nobreaktags), like <a ...>
+ // tag, are a special case: they don't actually add space in
+ // formatted text, but because in our processing it causes
+ // a word break, we avoid word concatenation in "head" string.
+ ADDSPACE(in_space);
+ in_punct = 0;
+ }
+ do_tag(retriever, tag);
+ if (!in_space && spaceaftertags.CompareWord((char *)position))
+ {
+ // These closing tags cause a space to be inserted
+ // after anything they insert.
+ ADDSPACE(in_space);
+ in_punct = 0;
+ }
+ position = q+1;
+ }
+ else if (*position > 0 && HtIsStrictWordChar(*position))
+ {
+ //
+ // Start of a word. Try to find the whole thing
+ //
+ word = 0;
+ in_space = 0;
+ in_punct = 0;
+ while (*position && HtIsWordChar(*position))
+ {
+ word << (char)*position;
+ // handle case where '<' is in extra_word_characters...
+ if (strncmp((char *)position, "<~>", 3) == 0)
+ position += 2; // skip over fake tag for decoded '<'
+ position++;
+ if (*position == '<')
+ {
+ q = position+1;
+ while (isspace(*q))
+ q++;
+ // Does this tag cause a word break?
+ if (nobreaktags.CompareWord((char *)q))
+ {
+ // These tags just change character formatting and
+ // don't break words.
+ q = (unsigned char*)strchr((char *)position, '>');
+ if (q)
+ {
+ position++;
+ tag = 0;
+ tag.append((char*)position, q - position);
+ do_tag(retriever, tag);
+ position = q+1;
+ }
+ }
+ }
+ }
+
+ if (in_title && !noindex)
+ {
+ title << word;
+ }
+
+ if (in_ref)
+ {
+ if (description.length() < max_description_length)
+ {
+ description << word;
+ }
+ else
+ {
+ description << " ...";
+ if (!nofollow)
+ retriever.got_href(*href, (char*)description);
+ in_ref = 0;
+ description = 0;
+ }
+ }
+
+ if (head.length() < max_head_length && !noindex && !in_title)
+ {
+ //
+ // Capitalize H1 and H2 blocks
+ //
+ if (in_heading > 1 && in_heading < 4)
+ {
+ word.uppercase();
+ }
+
+ //
+ // Append the word to the head (excerpt)
+ //
+ head << word;
+ }
+
+ if (word.length() >= (int)minimum_word_length && !noindex)
+ {
+ retriever.got_word((char*)word, wordindex++, in_heading);
+ }
+ }
+ else
+ {
+ //
+ // Characters that are not part of a word
+ //
+ if (isspace(*position))
+ {
+ ADDSPACE(in_space);
+ in_punct = 0;
+ }
+ else
+ {
+ //
+ // Not whitespace
+ //
+ if (head.length() < max_head_length && !noindex && !in_title)
+ {
+ // We don't want to add random chars to the
+ // excerpt if we're in the title.
+ head << *position;
+ }
+ if (in_ref && description.length() < max_description_length)
+ {
+ description << *position;
+ }
+ if (in_title && !noindex)
+ {
+ title << *position;
+ }
+ in_space = 0;
+ in_punct = 1;
+ // handle normal case where decoded '<' is punctuation...
+ if (strncmp((char *)position, "<~>", 3) == 0)
+ position += 2; // skip over fake tag for decoded '<'
+ }
+ position++;
+ }
+ }
+ retriever.got_head((char*)head);
+
+ delete [] text;
+}
+
+
+//*****************************************************************************
+// void HTML::do_tag(Retriever &retriever, String &tag)
+//
+void
+HTML::do_tag(Retriever &retriever, String &tag)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ int wordindex = 1;
+ char *position = tag.get();
+ int which, length;
+ static int ignore_alt_text = config->Boolean("ignore_alt_text", 0);
+
+ while (isspace(*position))
+ position++;
+
+ which = -1;
+ if (tags.CompareWord(position, which, length) < 0)
+ return; // Nothing matched.
+
+ // Use the configuration code to match attributes as key-value pairs
+ HtConfiguration attrs;
+ attrs.NameValueSeparators("=");
+ attrs.Add(position);
+
+ if (debug > 3)
+ cout << "Tag: <" << tag << ">, matched " << which << endl;
+
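+   // "which" is the index of the matched tag name within the pattern list
+   // given to tags.Pattern() in the HTML constructor, so the case numbers
+   // below follow that ordering.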
+ switch (which)
+ {
+ case 0: // "title"
+ if (title.length())
+ {
+ if (debug)
+ cout << "More than one <title> tag in document!"
+ << " (possible search engine spamming)" << endl;
+ break;
+ }
+ in_title = 1;
+ in_heading = 1;
+ break;
+
+ case 1: // "/title"
+ if (!in_title)
+ break;
+ in_title = 0;
+ in_heading = 0;
+ retriever.got_title((char*)title);
+ break;
+
+ case 2: // "a"
+ {
+ if (!attrs["href"].empty())
+ {
+ //
+ // a href seen
+ //
+ if (in_ref)
+ {
+ if (debug > 1)
+ cout << "Terminating previous <a href=...> tag,"
+ << " which didn't have a closing </a> tag."
+ << endl;
+ if (!nofollow)
+ retriever.got_href(*href, (char*)description);
+ in_ref = 0;
+ }
+ if (href)
+ delete href;
+ href = new URL(transSGML(attrs["href"]), *base);
+ in_ref = 1;
+ description = 0;
+ break;
+ }
+
+ if (!attrs["title"].empty() && !attrs["href"].empty())
+ {
+ //
+ // a title seen for href
+ //
+ retriever.got_href(*href, transSGML(attrs["title"]));
+ }
+
+ if (!attrs["name"].empty())
+ {
+ //
+ // a name seen
+ //
+ retriever.got_anchor(transSGML(attrs["name"]));
+ }
+ break;
+ }
+
+ case 3: // "/a"
+ if (in_ref)
+ {
+ if (!nofollow)
+ retriever.got_href(*href, (char*)description);
+ in_ref = 0;
+ }
+ break;
+
+ case 4: // "h1"
+ in_heading = 2;
+ break;
+
+ case 5: // "h2"
+ in_heading = 3;
+ break;
+
+ case 6: // "h3"
+ in_heading = 4;
+ break;
+
+ case 7: // "h4"
+ in_heading = 5;
+ break;
+
+ case 8: // "h5"
+ in_heading = 6;
+ break;
+
+ case 9: // "h6"
+ in_heading = 7;
+ break;
+
+ case 10: // "/h1"
+ case 11: // "/h2"
+ case 12: // "/h3"
+ case 13: // "/h4"
+ case 14: // "/h5"
+ case 15: // "/h6"
+ in_heading = 0;
+ break;
+
+ case 16: // "noindex"
+ noindex |= TAGnoindex;
+ nofollow |= TAGnoindex;
+ if (!attrs["follow"].empty())
+ nofollow &= ~TAGnoindex;
+ break;
+
+ case 27: // "style"
+ noindex |= TAGstyle;
+ nofollow |= TAGstyle;
+ break;
+
+ case 29: // "script"
+ noindex |= TAGscript;
+ nofollow |= TAGscript;
+ break;
+
+ case 17: // "/noindex"
+ noindex &= ~TAGnoindex;
+ nofollow &= ~TAGnoindex;
+ break;
+
+ case 28: // "/style"
+ noindex &= ~TAGstyle;
+ nofollow &= ~TAGstyle;
+ break;
+
+ case 30: // "/script"
+ noindex &= ~TAGscript;
+ nofollow &= ~TAGscript;
+ break;
+
+ case 19: // "li"
+ if (!noindex && !in_title && head.length() < max_head_length)
+ head << "* ";
+ break;
+
+ case 20: // "meta"
+ {
+ //
+ // First test for old-style meta tags (these break any
+ // reasonable DTD...)
+ //
+ if (!attrs["htdig-noindex"].empty())
+ {
+ retriever.got_noindex();
+ noindex |= TAGmeta_htdig_noindex;
+ nofollow |= TAGmeta_htdig_noindex;
+ }
+ if (!attrs["htdig-index"].empty())
+ {
+ noindex &= ~TAGmeta_htdig_noindex;
+ nofollow &= ~TAGmeta_htdig_noindex;
+ }
+ if (!attrs["htdig-email"].empty())
+ retriever.got_meta_email(transSGML(attrs["htdig-email"]));
+
+ if (!attrs["htdig-notification-date"].empty())
+ retriever.got_meta_notification(transSGML(attrs["htdig-notification-date"]));
+
+ if (!attrs["htdig-email-subject"].empty())
+ retriever.got_meta_subject(transSGML(attrs["htdig-email-subject"]));
+
+ if (!attrs["htdig-keywords"].empty() || !attrs["keywords"].empty())
+ {
+ //
+ // Keywords are added as being at the very top of the
+ // document and have a weight factor of
+ // keywords-factor which is assigned to slot 9 in the
+ // factor table.
+ //
+	 const String keywords = !attrs["htdig-keywords"].empty() ?
+ attrs["htdig-keywords"] :
+ attrs["keywords"];
+ if (!noindex)
+ {
+ String tmp = transSGML(keywords);
+ addKeywordString (retriever, tmp, wordindex);
+ }
+ }
+
+ if (!attrs["http-equiv"].empty())
+ {
+
+ // <META HTTP-EQUIV=REFRESH case
+ if (mystrcasecmp(attrs["http-equiv"], "refresh") == 0
+ && !attrs["content"].empty())
+ {
+ String content = attrs["content"];
+ char *q = (char*)mystrcasestr((char*)content, "url");
+ if (q && *q)
+ {
+	     q += 3; // skipping "URL"
+ while (*q && ((*q == '=') || isspace(*q))) q++;
+ char *qq = q;
+ while (*qq && (*qq != ';') && (*qq != '"') &&
+ !isspace(*qq))qq++;
+ *qq = 0;
+ if (href)
+ delete href;
+ href = new URL(transSGML(q), *base);
+ // I don't know why anyone would do this, but hey...
+ if (!nofollow)
+ retriever.got_href(*href, "");
+ }
+ }
+ }
+
+ //
+ // Now check for <meta name=... content=...> tags that
+ // fly with any reasonable DTD out there
+ //
+
+ if (!attrs["name"].empty() && !attrs["content"].empty())
+ {
+ const String cache = attrs["name"];
+
+ // First of all, check for META description
+
+ if (descriptionMatch.CompareWord(cache)
+ && !attrs["content"].empty())
+ {
+ //
+ // We need to do two things. First grab the description
+ // and clean it up
+ //
+ meta_dsc = transSGML(attrs["content"]);
+ meta_dsc.replace('\n', ' ');
+ meta_dsc.replace('\r', ' ');
+ meta_dsc.replace('\t', ' ');
+ if (meta_dsc.length() > max_meta_description_length)
+ meta_dsc = meta_dsc.sub(0, max_meta_description_length).get();
+ if (debug > 1)
+ cout << "META Description: " << attrs["content"] << endl;
+ retriever.got_meta_dsc((char*)meta_dsc);
+
+
+ //
+ // Now add the words to the word list
+ // Slot 10 is the current slot for this
+ //
+ if (!noindex)
+ {
+ String tmp = transSGML(attrs["content"]);
+ addString (retriever, tmp, wordindex, 10);
+ }
+ }
+
+ if (keywordsMatch.CompareWord(cache) && !noindex)
+ {
+ String tmp = transSGML(attrs["content"]);
+ addKeywordString (retriever, tmp, wordindex);
+ }
+ else if (mystrcasecmp(cache, "author") == 0)
+ {
+ String author = transSGML(attrs["content"]);
+ retriever.got_author(author.get());
+ if (!noindex)
+ addString (retriever, author, wordindex, 11);
+ }
+ else if (mystrcasecmp(cache, "htdig-email") == 0)
+ {
+ retriever.got_meta_email(transSGML(attrs["content"]));
+ }
+ else if (metadatetags.CompareWord(cache, which, length) &&
+ (cache.get())[length] == '\0' && config->Boolean("use_doc_date",0))
+ {
+ retriever.got_time(transSGML(attrs["content"]));
+ }
+ else if (mystrcasecmp(cache, "htdig-notification-date") == 0)
+ {
+ retriever.got_meta_notification(transSGML(attrs["content"]));
+ }
+ else if (mystrcasecmp(cache, "htdig-email-subject") == 0)
+ {
+ retriever.got_meta_subject(transSGML(attrs["content"]));
+ }
+ else if (mystrcasecmp(cache, "htdig-noindex") == 0)
+ {
+ retriever.got_noindex();
+ noindex |= TAGmeta_htdig_noindex;
+ nofollow |= TAGmeta_htdig_noindex;
+ }
+ else if (mystrcasecmp(cache, "robots") == 0
+ && !attrs["content"].empty())
+ {
+ String content_cache = attrs["content"];
+ content_cache.lowercase();
+ if (content_cache.indexOf("noindex") != -1)
+ {
+ noindex |= TAGmeta_robots;
+ retriever.got_noindex();
+ }
+ if (content_cache.indexOf("nofollow") != -1)
+ nofollow |= TAGmeta_robots;
+ if (content_cache.indexOf("none") != -1)
+ {
+ noindex |= TAGmeta_robots;
+ nofollow |= TAGmeta_robots;
+ retriever.got_noindex();
+ }
+ }
+ }
+ else if (mystrcasecmp(attrs["name"], "htdig-noindex") == 0)
+ {
+ retriever.got_noindex();
+ noindex |= TAGmeta_htdig_noindex;
+ nofollow |= TAGmeta_htdig_noindex;
+ }
+ break;
+ }
+
+ case 21: // frame
+ case 24: // embed
+ {
+ if (!attrs["src"].empty())
+ {
+ //
+ // src seen
+ //
+ if (!nofollow)
+ {
+ if (href)
+ delete href;
+ href = new URL(transSGML(attrs["src"]), *base);
+ // Frames have the same hopcount as the parent.
+ retriever.got_href(*href, transSGML(attrs["title"]), 0);
+ in_ref = 0;
+ }
+ }
+ break;
+ }
+
+ case 25: // object
+ {
+ if (!attrs["data"].empty())
+ {
+ //
+ // data seen
+ //
+ if (!nofollow)
+ {
+ if (href)
+ delete href;
+ href = new URL(transSGML(attrs["data"]), *base);
+ // Assume objects have the same hopcount as the parent.
+ retriever.got_href(*href, transSGML(attrs["title"]), 0);
+ in_ref = 0;
+ }
+ }
+ break;
+ }
+
+ case 22: // area
+ case 26: // link
+ {
+ if (!attrs["href"].empty())
+ {
+ // href seen
+ if (!nofollow)
+ {
+ if (href)
+ delete href;
+ href = new URL(transSGML(attrs["href"]), *base);
+ // area & link are like anchor tags -- one hopcount!
+ retriever.got_href(*href, transSGML(attrs["title"]), 1);
+ in_ref = 0;
+ }
+ }
+ break;
+ }
+
+ case 23: // base
+ {
+ if (!attrs["href"].empty())
+ {
+ URL tempBase(transSGML(attrs["href"]));
+ *base = tempBase;
+ }
+ break;
+ }
+
+ case 18: // img
+ {
+ if (!ignore_alt_text && !attrs["alt"].empty())
+ {
+ String tmp = transSGML(attrs["alt"]);
+ if (!noindex && in_title)
+ title << tmp << " ";
+ if (in_ref && description.length() < max_description_length)
+ description << tmp << " ";
+ if (!noindex && !in_title && head.length() < max_head_length)
+ head << tmp << " ";
+ if (!noindex)
+ addString (retriever, tmp, wordindex, 8); // slot for img_alt
+ }
+ if (!attrs["src"].empty())
+ {
+ retriever.got_image(transSGML(attrs["src"]));
+ }
+ break;
+ }
+
+ default:
+ return; // Nothing...
+ }
+}
+
+
+//*****************************************************************************
+// const String HTML::transSGML(const String& str)
+//
+const String
+HTML::transSGML(const String& str)
+{
+ return HtSGMLCodec::instance()->encode(str);
+}
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/HTML.h b/debian/htdig/htdig-3.2.0b6/htdig/HTML.h
new file mode 100644
index 00000000..867381ed
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/HTML.h
@@ -0,0 +1,69 @@
+//
+// HTML.h
+//
+// HTML: Class to parse HTML documents and return useful information
+// to the Retriever
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: HTML.h,v 1.14 2004/05/28 13:15:15 lha Exp $
+//
+#ifndef _HTML_h_
+#define _HTML_h_
+
+#include "Parsable.h"
+#include "QuotedStringList.h"
+
+class Retriever;
+class URL;
+
+
+class HTML : public Parsable
+{
+public:
+ //
+ // Construction/Destruction
+ //
+ HTML();
+ virtual ~HTML();
+
+ //
+ // Main parser interface.
+ //
+ virtual void parse(Retriever &retriever, URL &baseURL);
+
+private:
+ //
+ // Our state variables
+ //
+ String word;
+ URL *href;
+ String title;
+ String description;
+ String head;
+ String meta_dsc;
+ String tag;
+ int in_title;
+ int in_ref;
+ int in_heading;
+ int noindex;
+ int nofollow;
+// unsigned int minimumWordLength;
+ URL *base;
+ QuotedStringList skip_start;
+ QuotedStringList skip_end;
+
+ //
+ // Helper functions
+ //
+ void do_tag(Retriever &, String &);
+ const String transSGML(const String& str);
+};
+
+#endif
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Makefile.am b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.am
new file mode 100644
index 00000000..1e8368b4
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.am
@@ -0,0 +1,16 @@
+
+include $(top_srcdir)/Makefile.config
+
+bin_PROGRAMS = htdig
+
+htdig_SOURCES = Document.cc HTML.cc \
+ Parsable.cc Plaintext.cc \
+ Retriever.cc Server.cc ExternalTransport.cc \
+ URLRef.cc htdig.cc ExternalParser.cc
+
+noinst_HEADERS = Document.h ExternalParser.h HTML.h \
+ Parsable.h Plaintext.h Retriever.h Server.h URLRef.h htdig.h \
+ ExternalTransport.h
+htdig_DEPENDENCIES = $(HTLIBS)
+htdig_LDFLAGS = $(PROFILING) ${extra_ldflags}
+htdig_LDADD = $(HTLIBS)
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Makefile.in b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.in
new file mode 100644
index 00000000..52d9a862
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.in
@@ -0,0 +1,487 @@
+# Makefile.in generated by automake 1.7.9 from Makefile.am.
+# @configure_input@
+
+# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003
+# Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+#
+# To compile with profiling do the following:
+#
+# make CFLAGS=-g CXXFLAGS=-g PROFILING=-p all
+#
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+top_builddir = ..
+
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+INSTALL = @INSTALL@
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+host_triplet = @host@
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+AMDEP_FALSE = @AMDEP_FALSE@
+AMDEP_TRUE = @AMDEP_TRUE@
+AMTAR = @AMTAR@
+APACHE = @APACHE@
+APACHE_MODULES = @APACHE_MODULES@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CGIBIN_DIR = @CGIBIN_DIR@
+COMMON_DIR = @COMMON_DIR@
+CONFIG_DIR = @CONFIG_DIR@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DATABASE_DIR = @DATABASE_DIR@
+DEFAULT_CONFIG_FILE = @DEFAULT_CONFIG_FILE@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+ECHO = @ECHO@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FIND = @FIND@
+GUNZIP = @GUNZIP@
+HAVE_SSL = @HAVE_SSL@
+HTDIG_MAJOR_VERSION = @HTDIG_MAJOR_VERSION@
+HTDIG_MICRO_VERSION = @HTDIG_MICRO_VERSION@
+HTDIG_MINOR_VERSION = @HTDIG_MINOR_VERSION@
+IMAGE_DIR = @IMAGE_DIR@
+IMAGE_URL_PREFIX = @IMAGE_URL_PREFIX@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LDFLAGS = @LDFLAGS@
+LEX = @LEX@
+LEXLIB = @LEXLIB@
+LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@
+MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@
+MAKEINFO = @MAKEINFO@
+MV = @MV@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PERL = @PERL@
+RANLIB = @RANLIB@
+RRDTOOL = @RRDTOOL@
+SEARCH_DIR = @SEARCH_DIR@
+SEARCH_FORM = @SEARCH_FORM@
+SED = @SED@
+SENDMAIL = @SENDMAIL@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+TAR = @TAR@
+TESTS_FALSE = @TESTS_FALSE@
+TESTS_TRUE = @TESTS_TRUE@
+TIME = @TIME@
+TIMEV = @TIMEV@
+USER = @USER@
+VERSION = @VERSION@
+YACC = @YACC@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+ac_ct_F77 = @ac_ct_F77@
+ac_ct_RANLIB = @ac_ct_RANLIB@
+ac_ct_STRIP = @ac_ct_STRIP@
+am__fastdepCC_FALSE = @am__fastdepCC_FALSE@
+am__fastdepCC_TRUE = @am__fastdepCC_TRUE@
+am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@
+am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+datadir = @datadir@
+exec_prefix = @exec_prefix@
+extra_ldflags = @extra_ldflags@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+oldincludedir = @oldincludedir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+subdirs = @subdirs@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+
+AUTOMAKE_OPTIONS = foreign no-dependencies
+
+INCLUDES = -DDEFAULT_CONFIG_FILE=\"$(DEFAULT_CONFIG_FILE)\" \
+ -I$(top_srcdir)/include -I$(top_srcdir)/htlib \
+ -I$(top_srcdir)/htnet -I$(top_srcdir)/htcommon \
+ -I$(top_srcdir)/htword \
+ -I$(top_srcdir)/db -I$(top_builddir)/db \
+ $(LOCAL_DEFINES) $(PROFILING)
+
+
+HTLIBS = $(top_builddir)/htnet/libhtnet.la \
+ $(top_builddir)/htcommon/libcommon.la \
+ $(top_builddir)/htword/libhtword.la \
+ $(top_builddir)/htlib/libht.la \
+ $(top_builddir)/htcommon/libcommon.la \
+ $(top_builddir)/htword/libhtword.la \
+ $(top_builddir)/db/libhtdb.la \
+ $(top_builddir)/htlib/libht.la
+
+
+bin_PROGRAMS = htdig
+
+htdig_SOURCES = Document.cc HTML.cc \
+ Parsable.cc Plaintext.cc \
+ Retriever.cc Server.cc ExternalTransport.cc \
+ URLRef.cc htdig.cc ExternalParser.cc
+
+
+noinst_HEADERS = Document.h ExternalParser.h HTML.h \
+ Parsable.h Plaintext.h Retriever.h Server.h URLRef.h htdig.h \
+ ExternalTransport.h
+
+htdig_DEPENDENCIES = $(HTLIBS)
+htdig_LDFLAGS = $(PROFILING) ${extra_ldflags}
+htdig_LDADD = $(HTLIBS)
+subdir = htdig
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
+CONFIG_HEADER = $(top_builddir)/include/config.h
+CONFIG_CLEAN_FILES =
+bin_PROGRAMS = htdig$(EXEEXT)
+PROGRAMS = $(bin_PROGRAMS)
+
+am_htdig_OBJECTS = Document.$(OBJEXT) HTML.$(OBJEXT) Parsable.$(OBJEXT) \
+ Plaintext.$(OBJEXT) Retriever.$(OBJEXT) Server.$(OBJEXT) \
+ ExternalTransport.$(OBJEXT) URLRef.$(OBJEXT) htdig.$(OBJEXT) \
+ ExternalParser.$(OBJEXT)
+htdig_OBJECTS = $(am_htdig_OBJECTS)
+
+DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir)/include
+depcomp =
+am__depfiles_maybe =
+CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
+LTCXXCOMPILE = $(LIBTOOL) --mode=compile $(CXX) $(DEFS) \
+ $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+ $(AM_CXXFLAGS) $(CXXFLAGS)
+CXXLD = $(CXX)
+CXXLINK = $(LIBTOOL) --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(AM_LDFLAGS) $(LDFLAGS) -o $@
+DIST_SOURCES = $(htdig_SOURCES)
+HEADERS = $(noinst_HEADERS)
+
+DIST_COMMON = $(noinst_HEADERS) $(srcdir)/Makefile.in \
+ $(top_srcdir)/Makefile.config Makefile.am
+SOURCES = $(htdig_SOURCES)
+
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .cc .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/Makefile.config $(top_srcdir)/configure.in $(ACLOCAL_M4)
+ cd $(top_srcdir) && \
+ $(AUTOMAKE) --foreign htdig/Makefile
+Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in $(top_builddir)/config.status
+ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)
+binPROGRAMS_INSTALL = $(INSTALL_PROGRAM)
+install-binPROGRAMS: $(bin_PROGRAMS)
+ @$(NORMAL_INSTALL)
+ $(mkinstalldirs) $(DESTDIR)$(bindir)
+ @list='$(bin_PROGRAMS)'; for p in $$list; do \
+ p1=`echo $$p|sed 's/$(EXEEXT)$$//'`; \
+ if test -f $$p \
+ || test -f $$p1 \
+ ; then \
+ f=`echo "$$p1" | sed 's,^.*/,,;$(transform);s/$$/$(EXEEXT)/'`; \
+ echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) --mode=install $(binPROGRAMS_INSTALL) $$p $(DESTDIR)$(bindir)/$$f"; \
+ $(INSTALL_PROGRAM_ENV) $(LIBTOOL) --mode=install $(binPROGRAMS_INSTALL) $$p $(DESTDIR)$(bindir)/$$f || exit 1; \
+ else :; fi; \
+ done
+
+uninstall-binPROGRAMS:
+ @$(NORMAL_UNINSTALL)
+ @list='$(bin_PROGRAMS)'; for p in $$list; do \
+ f=`echo "$$p" | sed 's,^.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/'`; \
+ echo " rm -f $(DESTDIR)$(bindir)/$$f"; \
+ rm -f $(DESTDIR)$(bindir)/$$f; \
+ done
+
+clean-binPROGRAMS:
+ @list='$(bin_PROGRAMS)'; for p in $$list; do \
+ f=`echo $$p|sed 's/$(EXEEXT)$$//'`; \
+ echo " rm -f $$p $$f"; \
+ rm -f $$p $$f ; \
+ done
+htdig$(EXEEXT): $(htdig_OBJECTS) $(htdig_DEPENDENCIES)
+ @rm -f htdig$(EXEEXT)
+ $(CXXLINK) $(htdig_LDFLAGS) $(htdig_OBJECTS) $(htdig_LDADD) $(LIBS)
+
+mostlyclean-compile:
+ -rm -f *.$(OBJEXT) core *.core
+
+distclean-compile:
+ -rm -f *.tab.c
+
+.cc.o:
+ $(CXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$<
+
+.cc.obj:
+ $(CXXCOMPILE) -c -o $@ `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi`
+
+.cc.lo:
+ $(LTCXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$<
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+distclean-libtool:
+ -rm -f libtool
+uninstall-info-am:
+
+ETAGS = etags
+ETAGSFLAGS =
+
+CTAGS = ctags
+CTAGSFLAGS =
+
+tags: TAGS
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ mkid -fID $$unique
+
+TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ tags=; \
+ here=`pwd`; \
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ test -z "$(ETAGS_ARGS)$$tags$$unique" \
+ || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ $$tags $$unique
+
+ctags: CTAGS
+CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ tags=; \
+ here=`pwd`; \
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ test -z "$(CTAGS_ARGS)$$tags$$unique" \
+ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+ $$tags $$unique
+
+GTAGS:
+ here=`$(am__cd) $(top_builddir) && pwd` \
+ && cd $(top_srcdir) \
+ && gtags -i $(GTAGS_ARGS) $$here
+
+distclean-tags:
+ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+
+top_distdir = ..
+distdir = $(top_distdir)/$(PACKAGE)-$(VERSION)
+
+distdir: $(DISTFILES)
+ $(mkinstalldirs) $(distdir)/..
+ @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \
+ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \
+ list='$(DISTFILES)'; for file in $$list; do \
+ case $$file in \
+ $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \
+ $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \
+ esac; \
+ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+ dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \
+ if test "$$dir" != "$$file" && test "$$dir" != "."; then \
+ dir="/$$dir"; \
+ $(mkinstalldirs) "$(distdir)$$dir"; \
+ else \
+ dir=''; \
+ fi; \
+ if test -d $$d/$$file; then \
+ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+ cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
+ fi; \
+ cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
+ else \
+ test -f $(distdir)/$$file \
+ || cp -p $$d/$$file $(distdir)/$$file \
+ || exit 1; \
+ fi; \
+ done
+check-am: all-am
+check: check-am
+all-am: Makefile $(PROGRAMS) $(HEADERS)
+
+installdirs:
+ $(mkinstalldirs) $(DESTDIR)$(bindir)
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ `test -z '$(STRIP)' || \
+ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+ -rm -f $(CONFIG_CLEAN_FILES)
+
+maintainer-clean-generic:
+ @echo "This command is intended for maintainers to use"
+ @echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-binPROGRAMS clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+ -rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+ distclean-libtool distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-exec-am: install-binPROGRAMS
+
+install-info: install-info-am
+
+install-man:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+ -rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+ mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-binPROGRAMS uninstall-info-am
+
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \
+ clean-generic clean-libtool ctags distclean distclean-compile \
+ distclean-generic distclean-libtool distclean-tags distdir dvi \
+ dvi-am info info-am install install-am install-binPROGRAMS \
+ install-data install-data-am install-exec install-exec-am \
+ install-info install-info-am install-man install-strip \
+ installcheck installcheck-am installdirs maintainer-clean \
+ maintainer-clean-generic mostlyclean mostlyclean-compile \
+ mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+ tags uninstall uninstall-am uninstall-binPROGRAMS \
+ uninstall-info-am
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Makefile.win32 b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.win32
new file mode 100644
index 00000000..49839a7c
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.win32
@@ -0,0 +1,29 @@
+#
+# Makefile.win32 - win32 makefile for htdig
+#
+
+APP_NAME = Right Now Web CGI
+RNT_PRODUCT = rnw
+
+TARGET = $(BINDIR)/htdig$(EXESFX)
+
+include ../Makedefs.win32
+
+# -----------------------------------------------------------------------------
+# add new executable members to this list
+
+
+CXXSRC = Document.cc HTML.cc Parsable.cc Plaintext.cc Retriever.cc \
+ Server.cc ExternalTransport.cc URLRef.cc htdig.cc ExternalParser.cc
+
+CPPFLAGS += -I. -I../include -I../htlib -I../htcommon -I../htword -I../db -I../htnet
+
+LDLIBS = ../lib/$(ARCH)/libhtnet.lib ../lib/$(ARCH)/libcommon.lib ../lib/$(ARCH)/libhtword.lib ../lib/$(ARCH)/libht.lib ../lib/$(ARCH)/libcommon.lib ../lib/$(ARCH)/libhtword.lib ../lib/$(ARCH)/libht.lib ../lib/$(ARCH)/libhtdb.lib
+OTHERLIBS = ws2_32.lib L:/win32/lib/zlib114/zlib.lib
+
+DEPLIBS += $(LDLIBS)
+
+$(TARGET): $(OBJDIRDEP) $(BINDIRDEP) $(OBJS) $(DEPLIBS)
+ $(EXELD) $(LDFLAGS) $(OBJS) $(LDLIBS) $(OTHERLIBS)
+
+include ../Makerules.win32
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Parsable.cc b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.cc
new file mode 100644
index 00000000..049362a8
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.cc
@@ -0,0 +1,96 @@
+//
+// Parsable.cc
+//
+// Parsable: Base class for file parsers (HTML, PDF, ExternalParser ...)
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Parsable.cc,v 1.9 2004/05/28 13:15:15 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "Parsable.h"
+#include "htdig.h"
+#include "defaults.h"
+
+
+//*****************************************************************************
+// Parsable::Parsable()
+//
+Parsable::Parsable()
+{
+ HtConfiguration* config= HtConfiguration::config();
+ contents = 0;
+ max_head_length = config->Value("max_head_length", 0);
+ max_description_length = config->Value("max_description_length", 50);
+ max_meta_description_length = config->Value("max_meta_description_length", 0);
+
+ max_keywords = config->Value("max_keywords", -1);
+ if (max_keywords < 0)
+ max_keywords = (int) ((unsigned int) ~1 >> 1);
+ minimum_word_length = config->Value("minimum_word_length", 3);
+}
+
+
+//*****************************************************************************
+// Parsable::~Parsable()
+//
+Parsable::~Parsable()
+{
+ delete contents;
+}
+
+
+//*****************************************************************************
+// void Parsable::setContents(char *data, int length)
+// This will set the contents of the parsable object.
+//
+void
+Parsable::setContents(char *data, int length)
+{
+ delete contents;
+ contents = new String(data, length);
+}
+
+//*****************************************************************************
+// void Parsable::addString(char *s, int& wordindex, int slot)
+// Add all words in string s in "heading level" slot, incrementing wordindex
+// along the way. String s is corrupted.
+//
+void
+Parsable::addString(Retriever& retriever, char *s, int& wordindex, int slot)
+{
+ char *w = HtWordToken(s);
+ while (w)
+ {
+ if (strlen(w) >= minimum_word_length)
+	 retriever.got_word(w, wordindex++, slot);	// use the caller-supplied factor slot
+ w = HtWordToken(0);
+ }
+   w = 0;
+}
+
+//*****************************************************************************
+// void Parsable::addKeywordString(char *s, int& wordindex)
+// Add all words in string s as keywords, incrementing wordindex
+// along the way. String s is corrupted.
+//
+void
+Parsable::addKeywordString(Retriever& retriever, char *s, int& wordindex)
+{
+ char *w = HtWordToken(s);
+ while (w)
+ {
+ if (strlen(w) >= minimum_word_length && ++keywordsCount <= max_keywords)
+ retriever.got_word(w, wordindex++, 9);
+ w = HtWordToken(0);
+ }
+   w = 0;
+}
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Parsable.h b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.h
new file mode 100644
index 00000000..7149fe7c
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.h
@@ -0,0 +1,57 @@
+//
+// Parsable.h
+//
+// Parsable: Base class for file parsers (HTML, PDF, ExternalParser ...)
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Parsable.h,v 1.10 2004/05/28 13:15:15 lha Exp $
+//
+
+#ifndef _Parsable_h_
+#define _Parsable_h_
+
+#include "htString.h"
+#include "Retriever.h"
+
+class URL;
+
+
+class Parsable
+{
+public:
+ //
+ // Construction/Destruction
+ //
+ Parsable();
+ virtual ~Parsable();
+
+ //
+ // Main parser interface.
+ //
+ virtual void parse(Retriever &retriever, URL &) = 0;
+
+ //
+ // The rest of the members are used by the Document to provide us
+ // the data that we contain.
+ //
+ virtual void setContents(char *data, int length);
+ void addString(Retriever& retriever, char *s, int& wordindex, int slot);
+ void addKeywordString(Retriever& retriever, char *s, int& wordindex);
+
+protected:
+ String *contents;
+ int max_head_length;
+ int max_description_length;
+ int max_meta_description_length;
+ int max_keywords, keywordsCount;
+ unsigned int minimum_word_length;
+};
+
+#endif
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc
new file mode 100644
index 00000000..e7006fb1
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc
@@ -0,0 +1,116 @@
+//
+// Plaintext.cc
+//
+// Plaintext: Parses plaintext files. Not much to do, really.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Plaintext.cc,v 1.20 2004/05/28 13:15:15 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "Plaintext.h"
+#include "htdig.h"
+#include "htString.h"
+#include "WordType.h"
+
+#include <ctype.h>
+#include "defaults.h"
+
+
+//*****************************************************************************
+// Plaintext::Plaintext()
+//
+Plaintext::Plaintext()
+{
+}
+
+
+//*****************************************************************************
+// Plaintext::~Plaintext()
+//
+Plaintext::~Plaintext()
+{
+}
+
+
+//*****************************************************************************
+// void Plaintext::parse(Retriever &retriever, URL &)
+//
+void
+Plaintext::parse(Retriever &retriever, URL &)
+{
+ if (contents == 0 || contents->length() == 0)
+ return;
+
+ HtConfiguration* config= HtConfiguration::config();
+ unsigned char *position = (unsigned char *) contents->get();
+ static int minimumWordLength = config->Value("minimum_word_length", 3);
+ int wordIndex = 1;
+ int in_space = 0;
+ String word;
+ String head;
+
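+   //
+   // Walk the document character by character, collecting words for the
+   // word database and building the "head" excerpt as we go.
+   //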
+ while (*position)
+ {
+ word = 0;
+
+ if (HtIsStrictWordChar(*position))
+ {
+ //
+ // Start of a word. Try to find the whole thing
+ //
+ in_space = 0;
+ while (*position && HtIsWordChar(*position))
+ {
+ word << *position;
+ position++;
+ }
+
+ if (head.length() < max_head_length)
+ {
+ head << word;
+ }
+
+ if (word.length() >= minimumWordLength)
+ {
+ retriever.got_word((char*)word, wordIndex++, 0);
+ }
+ }
+
+ if (head.length() < max_head_length)
+ {
+ //
+ // Characters that are not part of a word
+ //
+ if (*position && isspace(*position))
+ {
+ //
+ // Reduce all multiple whitespace to a single space
+ //
+ if (!in_space)
+ {
+ head << ' ';
+ }
+ in_space = 1;
+ }
+ else
+ {
+ head << *position;
+ in_space = 0;
+ }
+ }
+ if (*position)
+ position++;
+ }
+ retriever.got_head((char*)head);
+}
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.h b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.h
new file mode 100644
index 00000000..a6275c41
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.h
@@ -0,0 +1,41 @@
+//
+// Plaintext.h
+//
+// Plaintext: Parses plaintext files. Not much to do, really.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Plaintext.h,v 1.6 2004/05/28 13:15:15 lha Exp $
+//
+#ifndef _Plaintext_h_
+#define _Plaintext_h_
+
+#include "Parsable.h"
+
+class URL;
+
+
+class Plaintext : public Parsable
+{
+public:
+ //
+ // Construction/Destruction
+ //
+ Plaintext();
+ virtual ~Plaintext();
+
+ //
+ // Main parser interface.
+ //
+ virtual void parse(Retriever &retriever, URL &);
+
+private:
+};
+
+#endif
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc
new file mode 100644
index 00000000..13243571
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc
@@ -0,0 +1,2013 @@
+//
+// Retriever.cc
+//
+// Retriever:	Crawls from a list of URLs and calls the appropriate parsers.
+//		The parsers notify the Retriever object that they got something
+//		(got_* functions) and the Retriever object feeds the databases
+//		and statistics accordingly.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Retriever.cc,v 1.94 2004/05/28 13:15:15 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#ifdef _MSC_VER /* _WIN32 */
+# include <sys/types.h>
+# include <winsock2.h>
+#endif
+
+
+#include "Retriever.h"
+#include "htdig.h"
+#include "HtWordList.h"
+#include "WordRecord.h"
+#include "URLRef.h"
+#include "Server.h"
+#include "Parsable.h"
+#include "Document.h"
+#include "StringList.h"
+#include "WordType.h"
+#include "Transport.h"
+#include "HtHTTP.h" // For HTTP statistics
+#include "md5.h"
+#include "defaults.h"
+
+#ifndef _MSC_VER /* _WIN32 */
+#include <pwd.h>
+#endif
+
+#include <signal.h>
+#include <stdio.h>
+
+
+static int noSignal;
+
+// no_store_phrases:
+// If true, only store first occurrence of each word in a document
+static bool no_store_phrases;
+
+//*****************************************************************************
+// Retriever::Retriever()
+//
+Retriever::Retriever(RetrieverLog flags):
+words(*(HtConfiguration::config())),
+words_to_add (100, 0.75)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ FILE *urls_parsed;
+
+ currenthopcount = 0;
+ max_hop_count = config->Value("max_hop_count", 999999);
+
+ no_store_phrases = !config->Boolean("store_phrases");
+
+ //
+ // Initialize the flags for the various HTML factors
+ //
+
+ // text_factor
+ factor[0] = FLAG_TEXT;
+ // title_factor
+ factor[1] = FLAG_TITLE;
+ // heading factor (now generic)
+ factor[2] = FLAG_HEADING;
+ factor[3] = FLAG_HEADING;
+ factor[4] = FLAG_HEADING;
+ factor[5] = FLAG_HEADING;
+ factor[6] = FLAG_HEADING;
+ factor[7] = FLAG_HEADING;
+ // img alt text
+ //factor[8] = FLAG_KEYWORDS;
+ factor[8] = FLAG_TEXT; // treat alt text as plain text, until it has
+ // its own FLAG and factor.
+ // keywords factor
+ factor[9] = FLAG_KEYWORDS;
+ // META description factor
+ factor[10] = FLAG_DESCRIPTION;
+ factor[11] = FLAG_AUTHOR;
+
+ doc = new Document();
+ minimumWordLength = config->Value("minimum_word_length", 3);
+
+ log = flags;
+ // if in restart mode
+ if (Retriever_noLog != log)
+ {
+ String filelog = config->Find("url_log");
+ char buffer[1024];
+ int l;
+
+ urls_parsed = fopen((char *) filelog, "r");
+ if (urls_parsed != 0)
+ {
+	 // read all URLs discovered but not fetched before
+ while (fgets(buffer, sizeof(buffer), urls_parsed))
+ {
+ l = strlen(buffer);
+ buffer[l - 1] = 0;
+ Initial(buffer, 2);
+ }
+ fclose(urls_parsed);
+ }
+ unlink((char *) filelog);
+ }
+
+ check_unique_md5 = config->Boolean("check_unique_md5", 0);
+ check_unique_date = config->Boolean("check_unique_date", 0);
+
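+   // When duplicate checking is enabled, open the hash database used to
+   // record document signatures (MD5 and/or dates).
+   //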
+ d_md5 = 0;
+ if (check_unique_md5)
+ {
+ d_md5 = Database::getDatabaseInstance(DB_HASH);
+
+ if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK)
+ {
+ cerr << "DocumentDB::Open: " << config->Find("md5_db") << " " << strerror(errno) << "\n";
+ }
+ }
+
+}
+
+
+//*****************************************************************************
+// Retriever::~Retriever()
+//
+Retriever::~Retriever()
+{
+ if (d_md5)
+ d_md5->Close();
+ delete doc;
+}
+
+
+//*****************************************************************************
+// void Retriever::setUsernamePassword(char *credentials)
+//
+void Retriever::setUsernamePassword(const char *credentials)
+{
+ doc->setUsernamePassword(credentials);
+}
+
+
+//*****************************************************************************
+// void Retriever::Initial(char *list, int from)
+// Add a single URL to the list of URLs to visit.
+// Since URLs are stored on a per-server basis, we first need to find
+// the correct server to add the URL's path to.
+//
+// from == 0 urls in db.docs and no db.log
+// from == 1 urls in start_url add url only if not already in the list
+// from == 2 add url from db.log
+// from == 3 urls in db.docs and there was a db.log
+//
+void Retriever::Initial(const String & list, int from)
+{
+ //
+ // Split the list of urls up into individual urls.
+ //
+ StringList tokens(list, " \t");
+ String sig;
+ String url;
+ Server *server;
+
+ for (int i = 0; i < tokens.Count(); i++)
+ {
+ URL u(tokens[i]);
+ url = u.get(); // get before u.signature() resolves aliases
+ server = (Server *) servers[u.signature()];
+ if (debug > 2)
+ cout << "\t" << from << ":" << (int) log << ":" << url;
+ if (!server)
+ {
+ String robotsURL = u.signature();
+ robotsURL << "robots.txt";
+ StringList *localRobotsFile = GetLocal(robotsURL);
+
+ server = new Server(u, localRobotsFile);
+ servers.Add(u.signature(), server);
+ delete localRobotsFile;
+ }
+
+ if (from && visited.Exists(url))
+ {
+ if (debug > 2)
+ cout << " skipped" << endl;
+ continue;
+ }
+ else if (IsValidURL(url) != 1)
+ {
+ if (debug > 2)
+ cout << endl;
+ continue;
+ }
+
+ if (Retriever_noLog == log || from != 3)
+ {
+ if (debug > 2)
+ cout << " pushed";
+ server->push(u.get(), 0, 0, IsLocalURL(url.get()));
+ }
+ if (debug > 2)
+ cout << endl;
+ visited.Add(url, 0);
+ }
+}
+
+
+//*****************************************************************************
+// void Retriever::Initial(List &list, int from)
+//
+void Retriever::Initial(List & list, int from)
+{
+ list.Start_Get();
+ String *str;
+
+   // from == 0 is an optimisation for pushing URLs in update mode,
+   // assuming that
+   // 1) there are many more URLs in docdb
+   // 2) they're pushed first
+   // 3) there are no duplicate URLs in docdb
+   // then they don't need to be checked against already pushed URLs.
+   // But 2) can be false with the -l option.
+   //
+   // FIXME: this is nasty.  What really has to be tested is:
+   // we have URLs to push from db.docs, but do we already have them in
+   // db.log?  This relies on a side effect of 'visited', on URLs in
+   // db.docs being pushed only via this method, and on db.log being pushed
+   // first, db.docs second, start_urls third!
+ //
+ if (!from && visited.Count())
+ {
+ from = 3;
+ }
+ while ((str = (String *) list.Get_Next()))
+ {
+ Initial(str->get(), from);
+ }
+}
+
+//*****************************************************************************
+//
+static void sigexit(int)
+{
+ noSignal = 0; //don't exit here.. just set the flag.
+}
+
+static void sigpipe(int)
+{
+}
+
+//*****************************************************************************
+// static void sig_handlers
+// initialise signal handlers
+//
+static void sig_handlers(void)
+{
+#ifndef _MSC_VER /* _WIN32 */
+ //POSIX SIGNALS
+ struct sigaction action;
+
+ /* SIGINT, SIGQUIT, SIGTERM */
+ action.sa_handler = sigexit;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ if (sigaction(SIGINT, &action, NULL) != 0)
+ reportError("Cannot install SIGINT handler\n");
+ if (sigaction(SIGQUIT, &action, NULL) != 0)
+ reportError("Cannot install SIGQUIT handler\n");
+ if (sigaction(SIGTERM, &action, NULL) != 0)
+ reportError("Cannot install SIGTERM handler\n");
+ if (sigaction(SIGHUP, &action, NULL) != 0)
+ reportError("Cannot install SIGHUP handler\n");
+#else
+ //ANSI C signal handling - Limited to supported Windows signals.
+ signal(SIGINT, sigexit);
+ signal(SIGTERM, sigexit);
+#endif //_MSC_VER /* _WIN32 */
+}
+
+
+
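+//*****************************************************************************
+// static void sig_phandler
+// install the SIGPIPE handler, so a broken connection doesn't kill the dig
+//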
+static void sig_phandler(void)
+{
+#ifndef _MSC_VER /* _WIN32 */
+ struct sigaction action;
+
+ sigemptyset(&action.sa_mask);
+ action.sa_handler = sigpipe;
+ action.sa_flags = SA_RESTART;
+ if (sigaction(SIGPIPE, &action, NULL) != 0)
+ reportError("Cannot install SIGPIPE handler\n");
+#endif //_MSC_VER /* _WIN32 */
+}
+
+
+//*****************************************************************************
+// static void win32_check_messages
+// Check WIN32 messages!
+//
+#ifdef _MSC_VER /* _WIN32 */
+static void win32_check_messages(void)
+{
+// NEAL - NEEDS FINISHING/TESTING
+#if 0
+ MSG msg = {0, 0, 0, 0};
+ int cDown = 0;
+ int controlDown = 0;
+
+ if( GetMessage(&msg, 0, 0, 0) )
+ {
+
+ switch(msg.message)
+ {
+ case WM_KEYDOWN:
+ {
+ if(LOWORD(msg.message)== 17)
+ controlDown = 1;
+ else if(LOWORD(msg.message) == 67)
+ {
+ cDown = 1;
+ }
+ }
+ break;
+ case WM_KEYUP:
+ {
+ if(LOWORD(msg.message) == 17)
+ controlDown = 0;
+ else if(LOWORD(msg.message) == 67)
+ cDown = 0;
+ }
+ break;
+ }
+ }
+
+ DispatchMessage(&msg);
+#endif
+}
+#endif //_MSC_VER /* _WIN32 */
+
+
+//*****************************************************************************
+// void Retriever::Start()
+// This is the main loop of the retriever. We will go through the
+// list of paths stored for each server. While parsing the
+// retrieved documents, new paths will be added to the servers. We
+// return if no more paths need to be retrieved.
+//
+void Retriever::Start()
+{
+ //
+	// Main digger loop. The todo list should initially have the start
+ // URL and all the URLs which were seen in a previous dig. The
+ // loop will continue as long as there are more URLs to visit.
+ //
+ int more = 1;
+ Server *server;
+ URLRef *ref;
+
+ HtConfiguration *config = HtConfiguration::config();
+
+	//
+	// Always set up the signal handlers. The delay bothers me,
+	// but a bad db is worse.
+	//
+ if (Retriever_noLog != log)
+ {
+ sig_handlers();
+ }
+ sig_phandler();
+ noSignal = 1;
+
+
+///////
+ // Main loop. We keep on retrieving until a signal is received
+ // or all the servers' queues are empty.
+///////
+
+#ifdef _MSC_VER /* _WIN32 */
+ win32_check_messages();
+#endif
+
+ while (more && noSignal)
+ {
+ more = 0;
+
+ //
+ // Go through all the current servers in sequence.
+ // If they support persistent connections, we keep on popping
+ // from the same server queue until it's empty or we reach a maximum
+ // number of consecutive requests ("max_connection_requests").
+		// The loop may also continue indefinitely if
+		// "max_connection_requests" is set to -1.
+		// If the server doesn't support persistent connections, we take
+		// only one URL from it, then we skip to the next server.
+ //
+ // Since 15.05.02: even when persistent connections are activated
+ // we should wait for a 'server_wait_time' number of seconds
+ // after the 'max_connection_requests' value has been reached.
+ //
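+		// e.g. (illustrative): with max_connection_requests: 100 and
+		// server_wait_time: 10, up to 100 documents are requested on one
+		// connection, then we wait roughly 10 seconds before the next batch.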
+
+ // Let's position at the beginning
+ servers.Start_Get();
+
+ int count;
+
+ // Maximum number of repeated requests with the same
+ // TCP connection (so on the same Server:Port).
+
+ int max_connection_requests;
+
+#ifdef _MSC_VER /* _WIN32 */
+ win32_check_messages();
+#endif
+
+ while ((server = (Server *) servers.Get_NextElement()) && noSignal)
+ {
+ if (debug > 1)
+ cout << "pick: " << server->host() << ", # servers = " << servers.Count() << endl;
+
+ // We already know if a server supports HTTP pers. connections,
+ // because we asked it for the robots.txt file (constructor of
+ // the class).
+
+			// If the server doesn't support persistent connections,
+			// we limit the number of consecutive requests to 1.
+
+ if (server->IsPersistentConnectionAllowed())
+ {
+
+ // Let's check for a '0' value (out of range)
+ // If set, we change it to 1.
+
+ if (config->Value("server", server->host(), "max_connection_requests") == 0)
+ max_connection_requests = 1;
+ else
+ max_connection_requests =
+ config->Value("server", server->host(), "max_connection_requests");
+
+ if (debug > 2)
+ {
+
+ cout << "> " << server->host() << " supports HTTP persistent connections";
+
+ if (max_connection_requests == -1)
+ cout << " (" << "infinite" << ")" << endl;
+ else
+ cout << " (" << max_connection_requests << ")" << endl;
+
+ }
+
+ }
+ else
+ {
+
+ // No HTTP persistent connections. So we request only 1 document.
+
+ max_connection_requests = 1;
+
+ if (debug > 2)
+ cout << "> " << server->host() << " with a traditional HTTP connection" << endl;
+
+ }
+
+
+ count = 0;
+
+#ifdef _MSC_VER /* _WIN32 */
+ win32_check_messages();
+#endif
+
+
+ while (((max_connection_requests == -1) ||
+ (count < max_connection_requests)) && (ref = server->pop()) && noSignal)
+ {
+ count++;
+
+ //
+ // We have a URL to index, now. We need to register the
+ // fact that we are not done yet by setting the 'more'
+ // variable. So, we have to restart scanning the queue.
+ //
+
+ more = 1;
+
+ //
+ // Deal with the actual URL.
+ // We'll check with the server to see if we need to sleep()
+ // before parsing it.
+ //
+
+ parse_url(*ref);
+ delete ref;
+
+ // We reached the maximum number of connections (either with
+ // or without persistent connections) and we must pause and
+ // respect the 'net ethic'.
+ if ((max_connection_requests - count) == 0)
+ server->delay(); // This will pause if needed
+ // and reset the time
+
+#ifdef _MSC_VER /* _WIN32 */
+ win32_check_messages();
+#endif
+
+ }
+
+#ifdef _MSC_VER /* _WIN32 */
+ win32_check_messages();
+#endif
+
+ }
+ }
+
+#ifdef _MSC_VER /* _WIN32 */
+ win32_check_messages();
+#endif
+
+
+ // if we exited on signal
+ if (Retriever_noLog != log && !noSignal)
+ {
+ FILE *urls_parsed;
+ String filelog = config->Find("url_log");
+ // save url seen but not fetched
+ urls_parsed = fopen((char *) filelog, "w");
+ if (0 == urls_parsed)
+ {
+ reportError(form("Unable to create URL log file '%s'", filelog.get()));
+ }
+ else
+ {
+ servers.Start_Get();
+ while ((server = (Server *) servers.Get_NextElement()))
+ {
+ while (NULL != (ref = server->pop()))
+ {
+ fprintf(urls_parsed, "%s\n", (const char *) ref->GetURL().get());
+ delete ref;
+ }
+ }
+ fclose(urls_parsed);
+ }
+ }
+ words.Close();
+}
+
+
+//*****************************************************************************
+// void Retriever::parse_url(URLRef &urlRef)
+//
+void Retriever::parse_url(URLRef & urlRef)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ URL url;
+ DocumentRef *ref;
+ int old_document;
+ time_t date;
+ static int index = 0;
+ static int local_urls_only = config->Boolean("local_urls_only");
+ static int mark_dead_servers = config->Boolean("ignore_dead_servers");
+ Server *server;
+
+ url.parse(urlRef.GetURL().get());
+
+ currenthopcount = urlRef.GetHopCount();
+
+ ref = docs[url.get()]; // It might be nice to have just an Exists() here
+ if (ref)
+ {
+ //
+ // We already have an entry for this document in our database.
+ // This means we can get the document ID and last modification
+ // time from there.
+ //
+ current_id = ref->DocID();
+ date = ref->DocTime();
+ if (ref->DocAccessed())
+ old_document = 1;
+ else // we haven't retrieved it yet, so we only have the first link
+ old_document = 0;
+ ref->DocBackLinks(ref->DocBackLinks() + 1); // we had a new link
+ ref->DocAccessed(time(0));
+ ref->DocState(Reference_normal);
+ currenthopcount = ref->DocHopCount();
+ }
+ else
+ {
+ //
+ // Never seen this document before. We need to create an
+ // entry for it. This implies that it gets a new document ID.
+ //
+ date = 0;
+ current_id = docs.NextDocID();
+ ref = new DocumentRef;
+ ref->DocID(current_id);
+ ref->DocURL(url.get());
+ ref->DocState(Reference_normal);
+ ref->DocAccessed(time(0));
+ ref->DocHopCount(currenthopcount);
+ ref->DocBackLinks(1); // We had to have a link to get here!
+ old_document = 0;
+ }
+
+ word_context.DocID(ref->DocID());
+
+ if (debug > 0)
+ {
+ //
+ // Display progress
+ //
+ cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << url.get() << ": ";
+ cout.flush();
+ }
+
+ // Reset the document to clean out any old data
+ doc->Reset();
+ doc->Url(url.get());
+ doc->Referer(urlRef.GetReferer().get());
+
+ base = doc->Url();
+
+ // Retrieve document, first trying local file access if possible.
+ Transport::DocStatus status;
+ server = (Server *) servers[url.signature()];
+ StringList *local_filenames = GetLocal(url.get());
+ if (local_filenames)
+ {
+ if (debug > 1)
+ cout << "Trying local files" << endl;
+ status = doc->RetrieveLocal(date, local_filenames);
+ if (status == Transport::Document_not_local)
+ {
+ if (debug > 1)
+ cout << "Local retrieval failed, trying HTTP" << endl;
+ if (server && !server->IsDead() && !local_urls_only)
+ status = doc->Retrieve(server, date);
+ else
+ status = Transport::Document_no_host;
+ }
+ delete local_filenames;
+ }
+ else if (server && !server->IsDead() && !local_urls_only)
+ status = doc->Retrieve(server, date);
+ else
+ status = Transport::Document_no_host;
+
+ current_ref = ref;
+
+ //
+ // Determine what to do by looking at the status code returned by
+ // the Document retrieval process.
+ //
+
+ String shash;
+ String sx;
+ char bhash[16];
+ time_t ddate;
+
+ switch (status)
+ {
+
+ case Transport::Document_ok:
+ trackWords = 1;
+
+ if (check_unique_md5)
+ {
+ if (doc->StoredLength() > 0)
+ {
+ if (check_unique_date)
+ {
+ ddate = doc->ModTime();
+ if (ddate < time(NULL) - 10)
+ { // Unknown date was set to current time
+ md5(bhash, doc->Contents(), doc->StoredLength(), &ddate, debug);
+ }
+ else
+ {
+ md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug);
+ }
+ }
+ else
+ md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug);
+
+ shash.append(bhash, MD5_LENGTH);
+ d_md5->Get(shash, sx);
+
+ if (!sx.empty())
+ {
+ if (debug > 1)
+ {
+ cout << " Detected duplicate by md5 hash" << endl;
+ }
+ words.Skip();
+ break; // Duplicate - don't index
+ }
+ else
+ {
+ d_md5->Put(shash, "x");
+ }
+
+ }
+ }
+
+ if (old_document)
+ {
+ if (doc->ModTime() == ref->DocTime())
+ {
+ words.Skip();
+ if (debug)
+ cout << " retrieved but not changed" << endl;
+ words.Skip();
+ break;
+ }
+ //
+ // Since we already had a record of this document and
+ // we were able to retrieve it, it must have changed
+ // since the last time we scanned it. This means that
+ // we need to assign a new document ID to it and mark
+ // the old one as obsolete.
+ //
+ words.Skip();
+ int backlinks = ref->DocBackLinks();
+ ref->DocState(Reference_obsolete);
+ docs.Add(*ref);
+ delete ref;
+
+ current_id = docs.NextDocID();
+ word_context.DocID(current_id);
+ ref = new DocumentRef;
+ ref->DocID(current_id);
+ ref->DocURL(url.get());
+ ref->DocState(Reference_normal);
+ ref->DocAccessed(time(0));
+ ref->DocHopCount(currenthopcount);
+ ref->DocBackLinks(backlinks);
+ if (debug)
+ cout << " (changed) ";
+ }
+ RetrievedDocument(*doc, url.get(), ref);
+ // Hey! If this document is marked noindex, don't even bother
+ // adding new words. Mark this as gone and get rid of it!
+ if (ref->DocState() == Reference_noindex)
+ {
+ if (debug > 1)
+ cout << " ( " << ref->DocURL() << " ignored)";
+ words.Skip();
+ }
+ else
+ words.Flush();
+ if (debug)
+ cout << " size = " << doc->Length() << endl;
+
+ if (urls_seen)
+ {
+ fprintf(urls_seen, "%s|%d|%s|%d|%d|1\n",
+ (const char *) url.get(), doc->Length(), doc->ContentType(),
+ (int) doc->ModTime(), currenthopcount);
+ }
+ break;
+
+ case Transport::Document_not_changed:
+ if (debug)
+ cout << " not changed" << endl;
+ words.Skip();
+ break;
+
+ case Transport::Document_not_found:
+ ref->DocState(Reference_not_found);
+ if (debug)
+ cout << " not found" << endl;
+ recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_not_found);
+ words.Skip();
+ break;
+
+ case Transport::Document_no_host:
+ ref->DocState(Reference_not_found);
+ if (debug)
+ cout << " host not found" << endl;
+ recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_host);
+ words.Skip();
+
+ // Mark the server as being down
+ if (server && mark_dead_servers)
+ server->IsDead(1);
+ break;
+
+ case Transport::Document_no_port:
+ ref->DocState(Reference_not_found);
+ if (debug)
+ cout << " host not found (port)" << endl;
+ recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_port);
+ words.Skip();
+
+ // Mark the server as being down
+ if (server && mark_dead_servers)
+ server->IsDead(1);
+ break;
+
+ case Transport::Document_not_parsable:
+ ref->DocState(Reference_noindex);
+ if (debug)
+ cout << " not Parsable" << endl;
+ words.Skip();
+ break;
+
+ case Transport::Document_redirect:
+ if (debug)
+ cout << " redirect" << endl;
+ ref->DocState(Reference_obsolete);
+ words.Skip();
+ got_redirect(doc->Redirected(), ref, (urlRef.GetReferer()).get());
+ break;
+
+ case Transport::Document_not_authorized:
+ ref->DocState(Reference_not_found);
+ if (debug)
+ cout << " not authorized" << endl;
+ words.Skip();
+ break;
+
+ case Transport::Document_not_local:
+ ref->DocState(Reference_not_found);
+ if (debug)
+ cout << " not local" << endl;
+ words.Skip();
+ break;
+
+ case Transport::Document_no_header:
+ ref->DocState(Reference_not_found);
+ if (debug)
+ cout << " no header" << endl;
+ words.Skip();
+ break;
+
+ case Transport::Document_connection_down:
+ ref->DocState(Reference_not_found);
+ if (debug)
+ cout << " connection down" << endl;
+ words.Skip();
+ break;
+
+ case Transport::Document_no_connection:
+ ref->DocState(Reference_not_found);
+ if (debug)
+ cout << " no connection" << endl;
+ words.Skip();
+ break;
+
+ case Transport::Document_not_recognized_service:
+ ref->DocState(Reference_not_found);
+ if (debug)
+ cout << " service not recognized" << endl;
+
+ // Mark the server as being down
+ if (server && mark_dead_servers)
+ server->IsDead(1);
+ words.Skip();
+ break;
+
+ case Transport::Document_other_error:
+ ref->DocState(Reference_not_found);
+ if (debug)
+ cout << " other error" << endl;
+ words.Skip();
+ break;
+ }
+ docs.Add(*ref);
+ delete ref;
+}
+
+
+//*****************************************************************************
+// void Retriever::RetrievedDocument(Document &doc, const String &url, DocumentRef *ref)
+// We found a document that needs to be parsed. Since we don't know the
+// document type, we'll let the Document itself return an appropriate
+// Parsable object which we can call upon to parse the document contents.
+//
+void Retriever::RetrievedDocument(Document & doc, const String & url, DocumentRef * ref)
+{
+ n_links = 0;
+ current_ref = ref;
+ current_title = 0;
+ word_context.Anchor(0);
+ current_time = 0;
+ current_head = 0;
+ current_meta_dsc = 0;
+
+ //
+ // Create a parser object and let it have a go at the document.
+ // We will pass ourselves as a callback object for all the got_*()
+ // routines.
+ // This will generate the Parsable object as a specific parser
+ //
+ Parsable *parsable = doc.getParsable();
+ if (parsable)
+ parsable->parse(*this, *base);
+ else
+ { // If we didn't get a parser, then we should get rid of this!
+ ref->DocState(Reference_noindex);
+ return;
+ }
+
+ // If just storing the first occurrence of each word in a document,
+ // we must now flush the words we saw in that document
+ if (no_store_phrases)
+ {
+ DictionaryCursor cursor;
+ char *key;
+ HtWordReference wordRef;
+ for (words_to_add.Start_Get (cursor);
+ (key = words_to_add.Get_Next(cursor)); )
+ {
+ word_entry *entry = (word_entry*) (words_to_add [key]);
+
+ wordRef.Location(entry->location);
+ wordRef.Flags(entry->flags);
+ wordRef.Word(key);
+ words.Replace(WordReference::Merge(wordRef, entry->context));
+ // How do I clean up properly?
+ delete entry;
+ }
+ words_to_add.Release ();
+ }
+
+ //
+ // We don't need to dispose of the parsable object since it will
+ // automatically be reused.
+ //
+
+ //
+ // Update the document reference
+ //
+ ref->DocHead((char *) current_head);
+ ref->DocMetaDsc((char *) current_meta_dsc);
+ if (current_time == 0)
+ ref->DocTime(doc.ModTime());
+ else
+ ref->DocTime(current_time);
+ ref->DocTitle((char *) current_title);
+ ref->DocSize(doc.Length());
+ ref->DocAccessed(time(0));
+ ref->DocLinks(n_links);
+}
+
+
+//*****************************************************************************
+// int Retriever::Need2Get(const String &u)
+// Return TRUE if we need to retrieve the given url. This will
+// check the list of urls we have already visited.
+//
+int Retriever::Need2Get(const String & u)
+{
+ static String url;
+ url = u;
+
+ return !visited.Exists(url);
+}
+
+
+
+//*****************************************************************************
+// int Retriever::IsValidURL(const String &u)
+// Return TRUE if we need to retrieve the given url. We will check
+// for limits here.
+//
+int Retriever::IsValidURL(const String & u)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ Dictionary invalids;
+ Dictionary valids;
+ URL aUrl(u);
+ StringList tmpList;
+
+ // A list of bad extensions, separated by spaces or tabs
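+	// An illustrative value (not necessarily the shipped default):
+	//   bad_extensions: .wav .gz .zip .mov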
+ String t = config->Find(&aUrl, "bad_extensions");
+ String lowerp;
+ char *p = strtok(t, " \t");
+ while (p)
+ {
+ // Extensions are case insensitive
+ lowerp = p;
+ lowerp.lowercase();
+ invalids.Add(lowerp, 0);
+ p = strtok(0, " \t");
+ }
+
+ //
+	// Valid extensions are handled similarly
+ //
+ // A list of valid extensions, separated by spaces or tabs
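+	// e.g. (illustrative): valid_extensions: .html .htm .txt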
+
+ t = config->Find(&aUrl, "valid_extensions");
+ p = strtok(t, " \t");
+ while (p)
+ {
+ // Extensions are case insensitive
+ lowerp = p;
+ lowerp.lowercase();
+ valids.Add(lowerp, 0);
+ p = strtok(0, " \t");
+ }
+
+ static String url;
+ url = u;
+
+ //
+ // If the URL contains any of the patterns in the exclude list,
+ // mark it as invalid
+ //
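+	// e.g. (illustrative): exclude_urls: /cgi-bin/ .cgi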
+ String exclude_urls = config->Find(&aUrl, "exclude_urls");
+ static String *prevexcludes = 0;
+ static HtRegexList *excludes = 0;
+ if (!excludes || !prevexcludes || prevexcludes->compare(exclude_urls) != 0)
+ {
+ if (!excludes)
+ excludes = new HtRegexList;
+ if (prevexcludes)
+ delete prevexcludes;
+ prevexcludes = new String(exclude_urls);
+ tmpList.Create(exclude_urls, " \t");
+ excludes->setEscaped(tmpList, config->Boolean("case_sensitive"));
+ tmpList.Destroy();
+ }
+ if (excludes->match(url, 0, 0) != 0)
+ {
+ if (debug > 2)
+ cout << endl << " Rejected: item in exclude list ";
+ return (HTDIG_ERROR_TESTURL_EXCLUDE);
+ }
+
+ //
+ // If the URL has a query string and it is in the bad query list
+ // mark it as invalid
+ //
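+	// e.g. (illustrative): bad_querystr: forum=private section=admin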
+ String bad_querystr = config->Find(&aUrl, "bad_querystr");
+ static String *prevbadquerystr = 0;
+ static HtRegexList *badquerystr = 0;
+ if (!badquerystr || !prevbadquerystr || prevbadquerystr->compare(bad_querystr) != 0)
+ {
+ if (!badquerystr)
+ badquerystr = new HtRegexList;
+ if (prevbadquerystr)
+ delete prevbadquerystr;
+ prevbadquerystr = new String(bad_querystr);
+ tmpList.Create(bad_querystr, " \t");
+ badquerystr->setEscaped(tmpList, config->Boolean("case_sensitive"));
+ tmpList.Destroy();
+ }
+ char *ext = strrchr((char *) url, '?');
+ if (ext && badquerystr->match(ext, 0, 0) != 0)
+ {
+ if (debug > 2)
+ cout << endl << " Rejected: item in bad query list ";
+ return (HTDIG_ERROR_TESTURL_BADQUERY);
+ }
+
+ //
+ // See if the file extension is in the list of invalid ones
+ //
+ String urlpath = url.get();
+ int parm = urlpath.indexOf('?'); // chop off URL parameter
+ if (parm >= 0)
+ urlpath.chop(urlpath.length() - parm);
+ ext = strrchr((char *) urlpath.get(), '.');
+ String lowerext;
+ if (ext && strchr(ext, '/')) // Ignore a dot if it's not in the
+ ext = NULL; // final component of the path.
+ if (ext)
+ {
+ lowerext.set(ext);
+ lowerext.lowercase();
+ if (invalids.Exists(lowerext))
+ {
+ if (debug > 2)
+ cout << endl << " Rejected: Extension is invalid!";
+ return (HTDIG_ERROR_TESTURL_EXTENSION);
+ }
+ }
+ //
+ // Or NOT in the list of valid ones
+ //
+ if (ext && valids.Count() > 0 && !valids.Exists(lowerext))
+ {
+ if (debug > 2)
+ cout << endl << " Rejected: Extension is not valid!";
+ return (HTDIG_ERROR_TESTURL_EXTENSION2);
+ }
+
+ //
+ // If none of the limits is met, we disallow the URL
+ //
+ if (limits.match(url, 1, 0) == 0)
+ {
+ if (debug > 1)
+ cout << endl << " Rejected: URL not in the limits! ";
+ return (HTDIG_ERROR_TESTURL_LIMITS);
+ }
+ //
+ // Likewise if not in list of normalized urls
+ //
+	// Warning!
+	// This should be the last of the checks, because of aUrl normalization.
+ //
+ // signature() implicitly normalizes the URL. Be efficient...
+ Server *server = (Server *) servers[aUrl.signature()];
+// aUrl.normalize();
+ if (limitsn.match(aUrl.get(), 1, 0) == 0)
+ {
+ if (debug > 2)
+ cout << endl << " Rejected: not in \"limit_normalized\" list!";
+ return (HTDIG_ERROR_TESTURL_LIMITSNORM);
+ }
+
+ //
+ // After that gauntlet, check to see if the server allows it
+ // (robots.txt)
+ //
+ if (server && server->IsDisallowed(url) != 0)
+ {
+ if (debug > 2)
+ cout << endl << " Rejected: forbidden by server robots.txt!";
+ return (HTDIG_ERROR_TESTURL_ROBOT_FORBID);
+ }
+
+ return (1);
+}
+
+
+//*****************************************************************************
+// StringList* Retriever::GetLocal(const String &url)
+// Returns a list of strings containing the (possible) local filenames
+// of the given url, or 0 if it's definitely not local.
+// THE CALLER MUST FREE THE STRINGLIST AFTER USE!
+// Returned strings are not hex encoded.
+//
+StringList *Retriever::GetLocal(const String & strurl)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ static StringList *prefixes = 0;
+ String url = strurl;
+
+ static StringList *paths = 0;
+ StringList *defaultdocs = 0;
+ URL aUrl(url);
+ url = aUrl.get(); // make sure we look at a parsed URL
+
+ //
+ // Initialize prefix/path list if this is the first time.
+ // The list is given in format "prefix1=path1 prefix2=path2 ..."
+ //
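+	// e.g. (hypothetical): local_urls: http://www.example.com/=/var/www/html/
+	// so http://www.example.com/docs/index.html would map to
+	// /var/www/html/docs/index.html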
+ if (!prefixes)
+ {
+ prefixes = new StringList();
+ paths = new StringList();
+
+ String t = config->Find("local_urls");
+ char *p = strtok(t, " \t");
+ while (p)
+ {
+ char *path = strchr(p, '=');
+ if (!path)
+ {
+ p = strtok(0, " \t");
+ continue;
+ }
+ *path++ = '\0';
+ String *pre = new String(p);
+ decodeURL(*pre);
+ prefixes->Add(pre);
+ String *pat = new String(path);
+ decodeURL(*pat);
+ paths->Add(pat);
+ p = strtok(0, " \t");
+ }
+ }
+ if (!config->Find(&aUrl, "local_default_doc").empty())
+ {
+ defaultdocs = new StringList();
+ String t = config->Find(&aUrl, "local_default_doc");
+ char *p = strtok(t, " \t");
+ while (p)
+ {
+ String *def = new String(p);
+ decodeURL(*def);
+ defaultdocs->Add(def);
+ p = strtok(0, " \t");
+ }
+ if (defaultdocs->Count() == 0)
+ {
+ delete defaultdocs;
+ defaultdocs = 0;
+ }
+ }
+
+ // Begin by hex-decoding URL...
+ String hexurl = url;
+ decodeURL(hexurl);
+ url = hexurl.get();
+
+ // Check first for local user...
+ if (strchr(url.get(), '~'))
+ {
+ StringList *local = GetLocalUser(url, defaultdocs);
+ if (local)
+ {
+ if (defaultdocs)
+ delete defaultdocs;
+ return local;
+ }
+ }
+
+ // This shouldn't happen, but check anyway...
+ if (strstr(url.get(), ".."))
+ return 0;
+
+ String *prefix, *path;
+ String *defaultdoc;
+ StringList *local_names = new StringList();
+ prefixes->Start_Get();
+ paths->Start_Get();
+ while ((prefix = (String *) prefixes->Get_Next()))
+ {
+ path = (String *) paths->Get_Next();
+ if (mystrncasecmp((char *) *prefix, (char *) url, prefix->length()) == 0)
+ {
+ int l = strlen(url.get()) - prefix->length() + path->length() + 4;
+ String *local = new String(*path, l);
+ *local += &url[prefix->length()];
+ if (local->last() == '/' && defaultdocs)
+ {
+ defaultdocs->Start_Get();
+ while ((defaultdoc = (String *) defaultdocs->Get_Next()))
+ {
+ String *localdefault =
+ new String(*local, local->length() + defaultdoc->length() + 1);
+ localdefault->append(*defaultdoc);
+ local_names->Add(localdefault);
+ }
+ delete local;
+ }
+ else
+ local_names->Add(local);
+ }
+ }
+ if (local_names->Count() > 0)
+ {
+ if (defaultdocs)
+ delete defaultdocs;
+ return local_names;
+ }
+
+ if (defaultdocs)
+ delete defaultdocs;
+ delete local_names;
+ return 0;
+}
+
+
+//*****************************************************************************
+// StringList* Retriever::GetLocalUser(const String &url, StringList *defaultdocs)
+// If the URL has ~user part, return a list of strings containing the
+// (possible) local filenames of the given url, or 0 if it's
+// definitely not local.
+// THE CALLER MUST FREE THE STRINGLIST AFTER USE!
+//
+StringList *Retriever::GetLocalUser(const String & url, StringList * defaultdocs)
+{
+// NOTE: Native Windows does not have this construct for the user Web files
+#ifndef _MSC_VER /* _WIN32 */
+ HtConfiguration *config = HtConfiguration::config();
+ static StringList *prefixes = 0, *paths = 0, *dirs = 0;
+ static Dictionary home_cache;
+ URL aUrl(url);
+
+ //
+ // Initialize prefix/path list if this is the first time.
+ // The list is given in format "prefix1=path1,dir1 ..."
+ // If path is zero-length, user's home directory is looked up.
+ //
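+	// e.g. (hypothetical): local_user_urls: http://www.example.com/=/home/,/public_html/
+	// so http://www.example.com/~joe/index.html would map to
+	// /home/joe/public_html/index.html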
+ if (!prefixes)
+ {
+ prefixes = new StringList();
+ paths = new StringList();
+ dirs = new StringList();
+ String t = config->Find("local_user_urls");
+ char *p = strtok(t, " \t");
+ while (p)
+ {
+ char *path = strchr(p, '=');
+ if (!path)
+ {
+ p = strtok(0, " \t");
+ continue;
+ }
+ *path++ = '\0';
+ char *dir = strchr(path, ',');
+ if (!dir)
+ {
+ p = strtok(0, " \t");
+ continue;
+ }
+ *dir++ = '\0';
+ String *pre = new String(p);
+ decodeURL(*pre);
+ prefixes->Add(pre);
+ String *pat = new String(path);
+ decodeURL(*pat);
+ paths->Add(pat);
+ String *ptd = new String(dir);
+ decodeURL(*ptd);
+ dirs->Add(ptd);
+ p = strtok(0, " \t");
+ }
+ }
+
+ // Can we do anything about this?
+ if (!strchr(url, '~') || !prefixes->Count() || strstr(url, ".."))
+ return 0;
+
+ // Split the URL to components
+ String tmp = url;
+ char *name = strchr((char *) tmp, '~');
+ *name++ = '\0';
+ char *rest = strchr(name, '/');
+ if (!rest || (rest - name <= 1) || (rest - name > 32))
+ return 0;
+ *rest++ = '\0';
+
+ // Look it up in the prefix/path/dir table
+ prefixes->Start_Get();
+ paths->Start_Get();
+ dirs->Start_Get();
+ String *prefix, *path, *dir;
+ String *defaultdoc;
+ StringList *local_names = new StringList();
+ while ((prefix = (String *) prefixes->Get_Next()))
+ {
+ path = (String *) paths->Get_Next();
+ dir = (String *) dirs->Get_Next();
+ if (mystrcasecmp((char *) *prefix, (char *) tmp) != 0)
+ continue;
+
+ String *local = new String;
+ // No path, look up home directory
+ if (path->length() == 0)
+ {
+ String *home = (String *) home_cache[name];
+ if (!home)
+ {
+ struct passwd *passwd = getpwnam(name);
+ if (passwd)
+ {
+ home = new String(passwd->pw_dir);
+ home_cache.Add(name, home);
+ }
+ }
+ if (home)
+ *local += *home;
+ else
+ continue;
+ }
+ else
+ {
+ *local += *path;
+ *local += name;
+ }
+ *local += *dir;
+ *local += rest;
+ if (local->last() == '/' && defaultdocs)
+ {
+ defaultdocs->Start_Get();
+ while ((defaultdoc = (String *) defaultdocs->Get_Next()))
+ {
+ String *localdefault = new String(*local, local->length() + defaultdoc->length() + 1);
+ localdefault->append(*defaultdoc);
+ local_names->Add(localdefault);
+ }
+ delete local;
+ }
+ else
+ local_names->Add(local);
+ }
+
+ if (local_names->Count() > 0)
+ return local_names;
+
+ delete local_names;
+#endif //_MSC_VER /* _WIN32 */
+
+ return 0;
+}
+
+
+//*****************************************************************************
+// int Retriever::IsLocalURL(const String &url)
+// Returns 1 if the given url has a (possible) local filename
+// or 0 if it's definitely not local.
+//
+int Retriever::IsLocalURL(const String & url)
+{
+ int ret;
+
+ StringList *local_filename = GetLocal(url);
+ ret = (local_filename != 0);
+ if (local_filename)
+ delete local_filename;
+
+ return ret;
+}
+
+
+//*****************************************************************************
+// void Retriever::got_word(char *word, int location, int heading)
+// The location is normalized to be in the range 0 - 1000.
+//
+void Retriever::got_word(const char *word, int location, int heading)
+{
+ if (debug > 3)
+ cout << "word: " << word << '@' << location << endl;
+ if (heading >= (int) (sizeof(factor) / sizeof(factor[0])) || heading < 0)
+ heading = 0; // Assume it's just normal text
+ if (trackWords && strlen(word) >= (unsigned int) minimumWordLength)
+ {
+ String w = word;
+ HtWordReference wordRef;
+
+ if (no_store_phrases)
+ {
+ // Add new word, or mark existing word as also being at
+ // this heading level
+ word_entry *entry;
+ if ((entry = (word_entry*)words_to_add.Find (w)) == NULL)
+ {
+ words_to_add.Add(w, new word_entry (location, factor[heading], word_context));
+ } else
+ {
+ entry->flags |= factor[heading];
+ }
+ } else
+ {
+ wordRef.Location(location);
+ wordRef.Flags(factor[heading]);
+ wordRef.Word(w);
+ words.Replace(WordReference::Merge(wordRef, word_context));
+ }
+
+ // Check for compound words...
+ String parts = word;
+ int added;
+ int nparts = 1;
+ do
+ {
+ added = 0;
+ char *start = parts.get();
+ char *punctp = 0, *nextp = 0, *p;
+ char punct;
+ int n;
+ while (*start)
+ {
+ p = start;
+ for (n = 0; n < nparts; n++)
+ {
+ while (HtIsStrictWordChar((unsigned char) *p))
+ p++;
+ punctp = p;
+ if (!*punctp && n + 1 < nparts)
+ break;
+ while (*p && !HtIsStrictWordChar((unsigned char) *p))
+ p++;
+ if (n == 0)
+ nextp = p;
+ }
+ if (n < nparts)
+ break;
+ punct = *punctp;
+ *punctp = '\0';
+ if (*start && (*p || start > parts.get()))
+ {
+ w = start;
+ HtStripPunctuation(w);
+ if (w.length() >= minimumWordLength)
+ {
+ if (no_store_phrases)
+ {
+ // Add new word, or mark existing word as also being at
+ // this heading level
+ word_entry *entry;
+ if ((entry = (word_entry*)words_to_add.Find (w)) == NULL)
+ {
+ words_to_add.Add(w, new word_entry (location, factor[heading], word_context));
+ } else
+ {
+ entry->flags |= factor[heading];
+ }
+ } else
+ {
+ wordRef.Word(w);
+ words.Replace(WordReference::Merge(wordRef, word_context));
+ }
+ if (debug > 3)
+ cout << "word part: " << start << '@' << location << endl;
+ }
+ added++;
+ }
+ start = nextp;
+ *punctp = punct;
+ }
+ nparts++;
+ }
+ while (added > 2);
+ }
+}
+
+
+//*****************************************************************************
+// void Retriever::got_title(const char *title)
+//
+void Retriever::got_title(const char *title)
+{
+ if (debug > 1)
+ cout << "\ntitle: " << title << endl;
+ current_title = title;
+}
+
+
+//*****************************************************************************
+// void Retriever::got_author(const char *e)
+//
+void Retriever::got_author(const char *author)
+{
+ if (debug > 1)
+ cout << "\nauthor: " << author << endl;
+ current_ref->DocAuthor(author);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_time(const char *time)
+//
+void Retriever::got_time(const char *time)
+{
+ HtDateTime new_time(current_time);
+
+ if (debug > 1)
+ cout << "\ntime: " << time << endl;
+
+ //
+ // As defined by the Dublin Core, this should be YYYY-MM-DD
+ // In the future, we'll need to deal with the scheme portion
+ // in case someone picks a different format.
+ //
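+	// e.g. (illustrative) a Dublin Core value of "2004-05-28" parses here.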
+ new_time.SetFTime(time, "%Y-%m-%d");
+ current_time = new_time.GetTime_t();
+
+ // If we can't convert it, current_time stays the same and we get
+ // the default--the date returned by the server...
+}
+
+//*****************************************************************************
+// void Retriever::got_anchor(const char *anchor)
+//
+void Retriever::got_anchor(const char *anchor)
+{
+ if (debug > 2)
+ cout << "anchor: " << anchor << endl;
+ current_ref->AddAnchor(anchor);
+ word_context.Anchor(word_context.Anchor() + 1);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_image(const char *src)
+//
+void Retriever::got_image(const char *src)
+{
+ URL url(src, *base);
+ const char *image = (const char *) url.get();
+
+ if (debug > 2)
+ cout << "image: " << image << endl;
+
+ if (images_seen)
+ fprintf(images_seen, "%s\n", image);
+}
+
+
+//*****************************************************************************
+//
+void Retriever::got_href(URL & url, const char *description, int hops)
+{
+ DocumentRef *ref = 0;
+ Server *server = 0;
+ int valid_url_code = 0;
+
+ // Rewrite the URL (if need be) before we do anything to it.
+ url.rewrite();
+
+ if (debug > 2)
+ cout << "href: " << url.get() << " (" << description << ')' << endl;
+
+ n_links++;
+
+ if (urls_seen)
+ fprintf(urls_seen, "%s\n", (const char *) url.get());
+
+ //
+ // Check if this URL falls within the valid range of URLs.
+ //
+ valid_url_code = IsValidURL(url.get());
+ if (valid_url_code > 0)
+ {
+ //
+ // It is valid. Normalize it (resolve cnames for the server)
+ // and check again...
+ //
+ if (debug > 2)
+ {
+ cout << "resolving '" << url.get() << "'\n";
+ cout.flush();
+ }
+
+ url.normalize();
+
+ // If it is a backlink from the current document,
+ // just update that field. Writing to the database
+ // is meaningless, as it will be overwritten.
+ // Adding it as a new document may even be harmful, as
+			// that will be a duplicate. This can happen if the
+			// current document was never referenced before, as with a
+			// start_url.
+
+ if (strcmp(url.get(), current_ref->DocURL()) == 0)
+ {
+ current_ref->DocBackLinks(current_ref->DocBackLinks() + 1);
+ current_ref->AddDescription(description, words);
+ }
+ else
+ {
+
+ //
+ // First add it to the document database
+ //
+ ref = docs[url.get()];
+ // if ref exists we have to call AddDescription even
+ // if max_hop_count is reached
+ if (!ref && currenthopcount + hops > max_hop_count)
+ return;
+
+ if (!ref)
+ {
+ //
+ // Didn't see this one, yet. Create a new reference
+ // for it with a unique document ID
+ //
+ ref = new DocumentRef;
+ ref->DocID(docs.NextDocID());
+ ref->DocHopCount(currenthopcount + hops);
+ ref->DocURL(url.get());
+ }
+ ref->DocBackLinks(ref->DocBackLinks() + 1); // This one!
+ ref->AddDescription(description, words);
+
+ //
+ // If the dig is restricting by hop count, perform the check here
+ // too
+ if (currenthopcount + hops > max_hop_count)
+ {
+ delete ref;
+ return;
+ }
+
+ if (ref->DocHopCount() > currenthopcount + hops)
+ ref->DocHopCount(currenthopcount + hops);
+
+ docs.Add(*ref);
+
+ //
+ // Now put it in the list of URLs to still visit.
+ //
+ if (Need2Get(url.get()))
+ {
+ if (debug > 1)
+ cout << "\n pushing " << url.get() << endl;
+ server = (Server *) servers[url.signature()];
+ if (!server)
+ {
+ //
+ // Hadn't seen this server, yet. Register it
+ //
+ String robotsURL = url.signature();
+ robotsURL << "robots.txt";
+ StringList *localRobotsFile = GetLocal(robotsURL.get());
+
+ server = new Server(url, localRobotsFile);
+ servers.Add(url.signature(), server);
+ delete localRobotsFile;
+ }
+ //
+ // Let's just be sure we're not pushing an empty URL
+ //
+ if (strlen(url.get()))
+ server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get()));
+
+ String temp = url.get();
+ visited.Add(temp, 0);
+ if (debug)
+ cout << '+';
+ }
+ else if (debug)
+ cout << '*';
+ delete ref;
+ }
+ }
+ else
+ {
+ //
+ // Not a valid URL
+ //
+ if (debug > 1)
+ cout << "\nurl rejected: (level 1)" << url.get() << endl;
+ if (debug == 1)
+ cout << '-';
+
+ if (urls_seen)
+ {
+ fprintf(urls_seen, "%s|||||%d\n", (const char *) url.get(), valid_url_code);
+ }
+
+ }
+ if (debug)
+ cout.flush();
+}
+
+
+//*****************************************************************************
+// void Retriever::got_redirect(const char *new_url, DocumentRef *old_ref)
+//
+void Retriever::got_redirect(const char *new_url, DocumentRef * old_ref, const char *referer)
+{
+ // First we must piece together the new URL, which may be relative
+ URL parent(old_ref->DocURL());
+ URL url(new_url, parent);
+
+ // Rewrite the URL (if need be) before we do anything to it.
+ url.rewrite();
+
+ if (debug > 2)
+ cout << "redirect: " << url.get() << endl;
+
+ n_links++;
+
+ if (urls_seen)
+ fprintf(urls_seen, "%s\n", (const char *) url.get());
+
+ //
+ // Check if this URL falls within the valid range of URLs.
+ //
+ if (IsValidURL(url.get()) > 0)
+ {
+ //
+ // It is valid. Normalize it (resolve cnames for the server)
+ // and check again...
+ //
+ if (debug > 2)
+ {
+ cout << "resolving '" << url.get() << "'\n";
+ cout.flush();
+ }
+
+ url.normalize();
+ //
+ // First add it to the document database
+ //
+ DocumentRef *ref = docs[url.get()];
+ if (!ref)
+ {
+ //
+ // Didn't see this one, yet. Create a new reference
+ // for it with a unique document ID
+ //
+ ref = new DocumentRef;
+ ref->DocID(docs.NextDocID());
+ ref->DocHopCount(currenthopcount);
+ }
+ ref->DocURL(url.get());
+
+ //
+ // Copy the descriptions of the old DocRef to this one
+ //
+ List *d = old_ref->Descriptions();
+ if (d)
+ {
+ d->Start_Get();
+ String *str;
+ while ((str = (String *) d->Get_Next()))
+ {
+ ref->AddDescription(str->get(), words);
+ }
+ }
+ if (ref->DocHopCount() > old_ref->DocHopCount())
+ ref->DocHopCount(old_ref->DocHopCount());
+
+ // Copy the number of backlinks
+ ref->DocBackLinks(old_ref->DocBackLinks());
+
+ docs.Add(*ref);
+
+ //
+ // Now put it in the list of URLs to still visit.
+ //
+ if (Need2Get(url.get()))
+ {
+ if (debug > 1)
+ cout << " pushing " << url.get() << endl;
+ Server *server = (Server *) servers[url.signature()];
+ if (!server)
+ {
+ //
+ // Hadn't seen this server, yet. Register it
+ //
+ String robotsURL = url.signature();
+ robotsURL << "robots.txt";
+ StringList *localRobotsFile = GetLocal(robotsURL.get());
+
+ server = new Server(url, localRobotsFile);
+ servers.Add(url.signature(), server);
+ delete localRobotsFile;
+ }
+ if (!referer || strlen(referer) == 0)
+ server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get()), 0);
+ else
+ server->push(url.get(), ref->DocHopCount(), referer, IsLocalURL(url.get()), 0);
+
+ String temp = url.get();
+ visited.Add(temp, 0);
+ }
+
+ delete ref;
+ }
+}
+
+
+//*****************************************************************************
+// void Retriever::got_head(const char *head)
+//
+void Retriever::got_head(const char *head)
+{
+ if (debug > 4)
+ cout << "head: " << head << endl;
+ current_head = head;
+}
+
+//*****************************************************************************
+// void Retriever::got_meta_dsc(const char *md)
+//
+void Retriever::got_meta_dsc(const char *md)
+{
+ if (debug > 4)
+ cout << "meta description: " << md << endl;
+ current_meta_dsc = md;
+}
+
+
+//*****************************************************************************
+// void Retriever::got_meta_email(const char *e)
+//
+void Retriever::got_meta_email(const char *e)
+{
+ if (debug > 1)
+ cout << "\nmeta email: " << e << endl;
+ current_ref->DocEmail(e);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_meta_notification(const char *e)
+//
+void Retriever::got_meta_notification(const char *e)
+{
+ if (debug > 1)
+ cout << "\nmeta notification date: " << e << endl;
+ current_ref->DocNotification(e);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_meta_subject(const char *e)
+//
+void Retriever::got_meta_subject(const char *e)
+{
+ if (debug > 1)
+		cout << "\nmeta subject: " << e << endl;
+ current_ref->DocSubject(e);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_noindex()
+//
+void Retriever::got_noindex()
+{
+ if (debug > 1)
+ cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl;
+ current_ref->DocState(Reference_noindex);
+}
+
+
+//*****************************************************************************
+//
+void Retriever::recordNotFound(const String & url, const String & referer, int reason)
+{
+ char *message = "";
+
+ switch (reason)
+ {
+ case Transport::Document_not_found:
+ message = "Not found";
+ break;
+
+ case Transport::Document_no_host:
+ message = "Unknown host or unable to contact server";
+ break;
+
+ case Transport::Document_no_port:
+ message = "Unknown host or unable to contact server (port)";
+ break;
+
+ default:
+ break;
+
+ }
+
+ notFound << message << ": " << url << " Ref: " << referer << '\n';
+}
+
+//*****************************************************************************
+// void Retriever::ReportStatistics(char *name)
+//
+void Retriever::ReportStatistics(const String & name)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ cout << name << ": Run complete\n";
+ cout << name << ": " << servers.Count() << " server";
+ if (servers.Count() > 1)
+ cout << "s";
+ cout << " seen:\n";
+
+ Server *server;
+ String buffer;
+ StringList results;
+ String newname = name;
+
+ newname << ": ";
+
+ servers.Start_Get();
+ while ((server = (Server *) servers.Get_NextElement()))
+ {
+ buffer = 0;
+ server->reportStatistics(buffer, newname);
+ results.Add(buffer);
+ }
+ results.Sort();
+
+ for (int i = 0; i < results.Count(); i++)
+ {
+ cout << results[i] << "\n";
+ }
+
+ if (notFound.length() > 0)
+ {
+ cout << "\n" << name << ": Errors to take note of:\n";
+ cout << notFound;
+ }
+
+ cout << endl;
+
+ // Report HTTP connections stats
+ cout << "HTTP statistics" << endl;
+ cout << "===============" << endl;
+
+ if (config->Boolean("persistent_connections"))
+ {
+ cout << " Persistent connections : Yes" << endl;
+
+ if (config->Boolean("head_before_get"))
+ cout << " HEAD call before GET : Yes" << endl;
+ else
+ cout << " HEAD call before GET : No" << endl;
+ }
+ else
+ {
+ cout << " Persistent connections : No" << endl;
+ }
+
+ HtHTTP::ShowStatistics(cout) << endl;
+
+}
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h
new file mode 100644
index 00000000..b2fff24d
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h
@@ -0,0 +1,183 @@
+//
+// Retriever.h
+//
+// Retriever:	Crawls from a list of URLs and calls the appropriate parsers.
+//		The parser notifies the Retriever object that it got something
+//		(got_* functions) and the Retriever object feeds the databases
+//		and statistics accordingly.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Retriever.h,v 1.28 2004/05/28 13:15:15 lha Exp $
+//
+
+#ifndef _Retriever_h_
+#define _Retriever_h_
+
+#include "DocumentRef.h"
+#include "Dictionary.h"
+#include "Queue.h"
+#include "HtWordReference.h"
+#include "List.h"
+#include "StringList.h"
+#include "DocumentDB.h"
+
+#define HTDIG_ERROR_TESTURL_EXCLUDE -109
+#define HTDIG_ERROR_TESTURL_BADQUERY -110
+#define HTDIG_ERROR_TESTURL_EXTENSION -111
+#define HTDIG_ERROR_TESTURL_EXTENSION2 -112
+#define HTDIG_ERROR_TESTURL_LIMITS -113
+#define HTDIG_ERROR_TESTURL_LIMITSNORM -114
+#define HTDIG_ERROR_TESTURL_SRCH_RESTRICT -115
+#define HTDIG_ERROR_TESTURL_SRCH_EXCLUDE -116
+#define HTDIG_ERROR_TESTURL_REWRITE_EMPTY -117
+#define HTDIG_ERROR_TESTURL_ROBOT_FORBID -118
+
+
+class URL;
+class Document;
+class URLRef;
+class HtWordList;
+
+enum RetrieverLog {
+ Retriever_noLog,
+ Retriever_logUrl,
+ Retriever_Restart
+};
+
+struct word_entry : public Object
+{
+ word_entry (int loc, int fl, HtWordReference& ref) :
+ location (loc), flags (fl), context (ref)
+ {};
+ int location;
+ int flags;
+ HtWordReference context;
+};
+
+class Retriever
+{
+public:
+ //
+ // Construction/Destruction
+ //
+ Retriever(RetrieverLog flags = Retriever_noLog);
+ virtual ~Retriever();
+
+ //
+ // Getting it all started
+ //
+ void Initial(const String& url, int checked = 0);
+	void Initial(List &list, int checked = 0);
+ void Start();
+
+ //
+ // Report statistics about the parser
+ //
+ void ReportStatistics(const String& name);
+
+ //
+ // These are the callbacks that we need to write code for
+ //
+ void got_word(const char *word, int location, int heading);
+ void got_href(URL &url, const char *description, int hops = 1);
+ void got_title(const char *title);
+ void got_author(const char *author);
+ void got_time(const char *time);
+ void got_head(const char *head);
+ void got_meta_dsc(const char *md);
+ void got_anchor(const char *anchor);
+ void got_image(const char *src);
+ void got_meta_email(const char *);
+ void got_meta_notification(const char *);
+ void got_meta_subject(const char *);
+ void got_noindex();
+
+ //
+ // Allow for the indexing of protected sites by using a
+ // username/password
+ //
+ void setUsernamePassword(const char *credentials);
+
+ //
+ // Routines for dealing with local filesystem access
+ //
+ StringList * GetLocal(const String &strurl);
+ StringList * GetLocalUser(const String &url, StringList *defaultdocs);
+ int IsLocalURL(const String &url);
+
+private:
+ //
+ // A hash to keep track of what we've seen
+ //
+ Dictionary visited;
+
+ URL *base;
+ String current_title;
+ String current_head;
+ String current_meta_dsc;
+ time_t current_time;
+ int current_id;
+ DocumentRef *current_ref;
+ int current_anchor_number;
+ int trackWords;
+ int n_links;
+ String credentials;
+ HtWordReference word_context;
+ HtWordList words;
+
+ Dictionary words_to_add;
+
+ int check_unique_md5;
+ int check_unique_date;
+
+
+ RetrieverLog log;
+ //
+ // These are weights for the words. The index is the heading level.
+ //
+ long int factor[12];
+ int currenthopcount;
+
+ //
+ // Some semi-constants...
+ //
+ int max_hop_count;
+
+ //
+ // The list of server-specific information objects is indexed by
+ // ip address and port number. The list contains Server objects.
+ //
+ Dictionary servers;
+
+ //
+ // For efficiency reasons, we will only use one document object which
+ // we reuse.
+ //
+ Document *doc;
+
+ Database *d_md5;
+
+ String notFound;
+
+ // Some useful constants
+ int minimumWordLength;
+
+ //
+ // Helper routines
+ //
+ int Need2Get(const String &url);
+ int IsValidURL(const String &url);
+ void RetrievedDocument(Document &, const String &url, DocumentRef *ref);
+ void parse_url(URLRef &urlRef);
+ void got_redirect(const char *, DocumentRef *, const char * = 0);
+ void recordNotFound(const String &url, const String &referer, int reason);
+};
+
+#endif
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Server.cc b/debian/htdig/htdig-3.2.0b6/htdig/Server.cc
new file mode 100644
index 00000000..3afdebd3
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Server.cc
@@ -0,0 +1,435 @@
+//
+// Server.cc
+//
+// Server: A class to keep track of server specific information.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Server.cc,v 1.29 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "htdig.h"
+#include "Server.h"
+#include "good_strtok.h"
+#include "htString.h"
+#include "URL.h"
+#include "Document.h"
+#include "URLRef.h"
+#include "Transport.h"
+#include "HtHTTP.h" // for checking persistent connections
+#include "StringList.h"
+
+#include <ctype.h>
+#include "defaults.h"
+
+
+//*****************************************************************************
+// Server::Server(URL u, StringList *local_robots_files)
+// u is the base URL for this server
+//
+Server::Server(URL u, StringList *local_robots_files)
+:
+ _host(u.host()),
+ _port(u.port()),
+ _bad_server(0),
+ _documents(0),
+ _accept_language(0)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ if (debug)
+ cout << endl << "New server: " << _host << ", " << _port << endl;
+
+ // We take it from the configuration
+ _persistent_connections = config->Boolean("server", _host.get(),"persistent_connections");
+ _head_before_get = config->Boolean("server", _host.get(),"head_before_get");
+
+ _max_documents = config->Value("server",_host.get(),"server_max_docs");
+ _connection_space = config->Value("server",_host.get(),"server_wait_time");
+ _user_agent = config->Find("server", _host.get(), "user_agent");
+ _disable_cookies = config->Boolean("server", _host.get(), "disable_cookies");
+
+ // Accept-Language directive
+ StringList _accept_language_list(config->Find("server", _host.get(),
+ "accept_language"), " \t");
+
+ _accept_language.trunc(); // maybe not needed
+
+ for (int i = 0; i < _accept_language_list.Count(); i++)
+ {
+ if (i>0)
+ _accept_language << ","; // for multiple choices
+
+ _accept_language << _accept_language_list[i];
+ }
+
+ // Timeout setting
+ _timeout = config->Value("server",_host.get(),"timeout");
+
+ // Number of consecutive attempts to establish a TCP connection
+ _tcp_max_retries = config->Value("server",_host.get(),"tcp_max_retries");
+
+ // Seconds to wait after a timeout occurs
+ _tcp_wait_time = config->Value("server",_host.get(),"tcp_wait_time");
+
+
+ if (debug > 1)
+ {
+ cout << " - Persistent connections: " <<
+ (_persistent_connections?"enabled":"disabled") << endl;
+
+ cout << " - HEAD before GET: " <<
+ (_head_before_get?"enabled":"disabled") << endl;
+
+ cout << " - Timeout: " << _timeout << endl;
+ cout << " - Connection space: " << _connection_space << endl;
+ cout << " - Max Documents: " << _max_documents << endl;
+ cout << " - TCP retries: " << _tcp_max_retries << endl;
+ cout << " - TCP wait time: " << _tcp_wait_time << endl;
+ cout << " - Accept-Language: " << _accept_language << endl;
+
+ }
+
+ _last_connection.SettoNow(); // For getting robots.txt
+
+ if (strcmp(u.service(),"http") == 0 || strcmp(u.service(),"https") == 0)
+ {
+ //
+ // Attempt to get a robots.txt file from the specified server
+ //
+ String url;
+ url.trunc();
+
+ if (debug>1)
+ cout << "Trying to retrieve robots.txt file" << endl;
+ url << u.signature() << "robots.txt";
+
+ static int local_urls_only = config->Boolean("local_urls_only");
+ time_t timeZero = 0; // Right now we want to get this every time
+ Document doc(url, 0);
+ Transport::DocStatus status;
+ if (local_robots_files)
+ {
+ if (debug > 1)
+ cout << "Trying local files" << endl;
+ status = doc.RetrieveLocal(timeZero, local_robots_files);
+ if (status == Transport::Document_not_local)
+ {
+ if (local_urls_only)
+ status = Transport::Document_not_found;
+ else
+ {
+ if (debug > 1)
+ cout << "Local retrieval failed, trying HTTP" << endl;
+ status = doc.Retrieve(this, timeZero);
+ }
+ }
+ }
+ else if (!local_urls_only)
+ {
+ status = doc.Retrieve(this, timeZero);
+
+ // Let's check if persistent connections are both
+ // allowed by the configuration and possible after
+ // having requested the robots.txt file.
+
+ HtHTTP * http;
+ if (IsPersistentConnectionAllowed() &&
+ ( http = doc.GetHTTPHandler()))
+ {
+ if (! http->isPersistentConnectionPossible())
+ _persistent_connections=0; // not possible. Let's disable
+ // them on this server.
+ }
+
+ }
+ else
+ status = Transport::Document_not_found;
+
+ switch (status)
+ {
+ case Transport::Document_ok:
+ //
+ // Found a robots.txt file. Go parse it.
+ //
+ robotstxt(doc);
+ break;
+
+ case Transport::Document_not_found:
+ case Transport::Document_not_parsable:
+ case Transport::Document_redirect:
+ case Transport::Document_not_authorized:
+ //
+ // These cases are for when there is no robots.txt file.
+ // We will just go on happily without restrictions
+ //
+ break;
+
+ case Transport::Document_no_host:
+ default:
+ //
+ // In all other cases the server could not be reached.
+ // We will remember this fact so that no more attempts to
+ // contact this server will be made.
+ //
+ _bad_server = 1;
+ break;
+ } // end switch
+ } // end if (http || https)
+}
+
+// Copy constructor
+Server::Server(const Server& rhs)
+:_host(rhs._host),
+_port(rhs._port),
+_bad_server(rhs._bad_server),
+_connection_space(rhs._connection_space),
+_last_connection(rhs._last_connection),
+_paths(rhs._paths),
+_disallow(rhs._disallow),
+_documents(rhs._documents),
+_max_documents(rhs._max_documents),
+_persistent_connections(rhs._persistent_connections),
+_head_before_get(rhs._head_before_get),
+_disable_cookies(rhs._disable_cookies),
+_timeout(rhs._timeout),
+_tcp_wait_time(rhs._tcp_wait_time),
+_tcp_max_retries(rhs._tcp_max_retries),
+_user_agent(rhs._user_agent),
+_accept_language(rhs._accept_language)
+{
+}
+
+
+//*****************************************************************************
+// Server::~Server()
+//
+Server::~Server()
+{
+}
+
+
+//*****************************************************************************
+// void Server::robotstxt(Document &doc)
+// This will parse the robots.txt file which is contained in the document.
+//
+void Server::robotstxt(Document &doc)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ String contents = doc.Contents();
+ int length;
+ int pay_attention = 0;
+ String pattern;
+ String myname = config->Find("server", _host.get(), "robotstxt_name");
+ int seen_myname = 0;
+ char *name, *rest;
+
+ if (debug > 1)
+ cout << "Parsing robots.txt file using myname = " << myname << "\n";
+
+ //
+ // Go through the lines in the file and determine if we need to
+ // pay attention to them
+ //
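+	// A minimal robots.txt looks like (illustrative):
+	//   User-agent: *
+	//   Disallow: /cgi-bin/
+	//   Disallow: /tmp/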
+ for (char *line = strtok(contents, "\r\n"); line; line = strtok(0, "\r\n"))
+ {
+ if (debug > 2)
+ cout << "Robots.txt line: " << line << endl;
+
+ //
+ // Strip comments
+ //
+ if (strchr(line, '#'))
+ {
+ *(strchr(line, '#')) = '\0';
+ }
+
+ name = good_strtok(line, ':');
+ if (!name)
+ continue;
+ while (name && isspace(*name)) name++;
+ rest = good_strtok(NULL, '\r');
+ if (!rest)
+ rest = "";
+
+ while (rest && isspace(*rest))
+ rest++;
+
+ length = strlen(rest);
+ if (length > 0)
+ {
+ while (length > 0 && isspace(rest[length - 1]))
+ length--;
+ rest[length] = '\0';
+ }
+
+ if (mystrcasecmp(name, "user-agent") == 0)
+ {
+ if (debug > 1)
+ cout << "Found 'user-agent' line: " << rest << endl;
+
+ if (*rest == '*' && !seen_myname)
+ {
+ //
+ // This matches all search engines...
+ //
+ pay_attention = 1;
+ }
+ else if (mystrncasecmp(rest, (char*)myname, myname.length()) == 0)
+ {
+ //
+ // This is for us! This will override any previous patterns
+ // that may have been set.
+ //
+ if (!seen_myname) // only take first section with our name
+ {
+ seen_myname = 1;
+ pay_attention = 1;
+ pattern = 0; // ignore previous User-agent: *
+ }
+ else
+ pay_attention = 0;
+ }
+ else
+ {
+ //
+ // This doesn't concern us
+ //
+ pay_attention = 0;
+ }
+ }
+ else if (pay_attention && mystrcasecmp(name, "disallow") == 0)
+ {
+ if (debug > 1)
+ cout << "Found 'disallow' line: " << rest << endl;
+
+ //
+ // Add this path to our list to ignore
+ //
+ if (*rest)
+ {
+ if (pattern.length())
+ pattern << '|';
+ while (*rest)
+ {
+ if (strchr("^.[$()|*+?{\\", *rest))
+ pattern << '\\';
+ pattern << *rest++;
+ }
+ }
+ }
+ //
+ // Ignore anything else (comments)
+ //
+ }
+
+ //
+ // Compile the pattern (if any...)
+ //
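+	// e.g. (illustrative): "Disallow: /cgi-bin/" and "Disallow: /tmp/"
+	// yield the pattern "/cgi-bin/|/tmp/" and the compiled regex
+	// "^[^:]*://[^/]*(/cgi-bin/|/tmp/)".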
+ if (debug > 1)
+ cout << "Pattern: " << pattern << endl;
+
+	// An empty "disallow" allows all, so don't make an entry which matches all.
+ if (!pattern.empty())
+ {
+ String fullpatt = "^[^:]*://[^/]*(";
+ fullpatt << pattern << ')';
+ _disallow.set(fullpatt, config->Boolean("case_sensitive"));
+ }
+}
+
+
+//*****************************************************************************
+// void Server::push(String &path, int hopcount, char *referer, int local, int newDoc)
+//
+void Server::push(const String &path, int hopcount, const String &referer,
+ int local, int newDoc)
+{
+ if (_bad_server && !local)
+ return;
+
+ if (IsDisallowed(path) != 0)
+ {
+ if (debug > 2)
+ cout << endl << " Rejected: forbidden by server robots.txt!";
+
+ return;
+ }
+
+ // We use -1 as no limit, but we also don't want
+ // to forbid redirects from old places
+ if (_max_documents != -1 && newDoc &&
+ _documents >= _max_documents)
+ {
+ if (debug>2) // Hey! we only want to get max_docs
+ cout << "Limit of " << _max_documents << " reached for " << _host << endl;
+
+ return;
+ }
+
+ URLRef *ref = new URLRef();
+ ref->SetURL(path);
+ ref->SetHopCount(hopcount);
+ ref->SetReferer(referer);
+ _paths.Add(ref);
+
+ if (newDoc)
+ _documents++;
+
+// cout << "***** pushing '" << path << "' with '" << referer << "'\n";
+}
+
+
+//*****************************************************************************
+// URLRef *Server::pop()
+//
+URLRef *Server::pop()
+{
+ URLRef *ref = (URLRef *) _paths.Remove();
+
+ if (!ref)
+ return 0;
+
+ return ref;
+}
+
+
+//*****************************************************************************
+// void Server::delay()
+//
+// Keeps track of how long it's been since we've seen this server
+// and call sleep if necessary
+//
+void Server::delay()
+{
+ HtDateTime now;
+
+ int time_taken = HtDateTime::GetDiff(now, _last_connection); // arg1-arg2 > 0
+
+ if (time_taken < _connection_space)
+ sleep(_connection_space - time_taken);
+
+ now.SettoNow();
+ _last_connection = now; // Reset the clock for the next delay!
+
+ return;
+}
+
+
+//*****************************************************************************
+// void Server::reportStatistics(String &out, char *name)
+//
+void Server::reportStatistics(String &out, char *name)
+{
+ out << name << " " << _host << ":" << _port;
+ out << " " << _documents << " document";
+ if (_documents != 1)
+ out << "s";
+}
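
Editor's note: the disallow-pattern construction in Server::robotstxt() above is easy to lose
in the String-appending details: each Disallow path has its regex metacharacters escaped, the
paths are OR-ed together with '|', and the result is wrapped in "^[^:]*://[^/]*( ... )" so it
matches any scheme and host followed by one of the forbidden paths. Below is a minimal
standalone sketch of the same idea using std::string and std::regex instead of htdig's String
and HtRegex classes; the function and variable names are illustrative, not htdig's API.

#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Escape regex metacharacters in a robots.txt path, as Server::robotstxt() does.
static std::string escapePath(const std::string &path)
{
    std::string out;
    for (char c : path)
    {
        if (std::string("^.[$()|*+?{\\").find(c) != std::string::npos)
            out += '\\';
        out += c;
    }
    return out;
}

int main()
{
    // Hypothetical Disallow lines collected from a robots.txt.
    std::vector<std::string> disallow = { "/cgi-bin/", "/tmp/", "/private?x=1" };

    // OR the escaped paths together, then anchor on "<scheme>://<host>".
    std::string pattern;
    for (const std::string &p : disallow)
    {
        if (!pattern.empty())
            pattern += '|';
        pattern += escapePath(p);
    }
    std::regex full("^[^:]*://[^/]*(" + pattern + ")");

    std::cout << std::boolalpha
              << std::regex_search("http://example.com/tmp/file.html", full) << '\n'   // true
              << std::regex_search("http://example.com/docs/file.html", full) << '\n'; // false
    return 0;
}
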
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Server.h b/debian/htdig/htdig-3.2.0b6/htdig/Server.h
new file mode 100644
index 00000000..ca6a4f04
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Server.h
@@ -0,0 +1,142 @@
+//
+// Server.h
+//
+// Server: A class to keep track of server specific information.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Server.h,v 1.13 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifndef _Server_h_
+#define _Server_h_
+
+#include "Object.h"
+#include "htString.h"
+#include "StringList.h"
+#include "Stack.h"
+#include "HtHeap.h"
+#include "HtRegex.h"
+#include "StringMatch.h"
+#include "URLRef.h"
+#include "HtDateTime.h"
+
+
+class Document;
+
+class Server : public Object
+{
+public:
+ //
+ // Construction/Destruction
+ //
+ Server(URL u, StringList *local_robots_files = NULL);
+ Server(const Server& rhs);
+ ~Server();
+
+ //
+ // This needs to be called with a document containing the
+ // robots.txt file for this server
+ //
+ void robotstxt(Document &doc);
+
+ //
+ // Provide some way of getting at the host and port for this server
+ //
+ int port() const {return _port;}
+ const String &host() const {return _host;}
+
+ //
+ // Provide some way of getting at the status of this server
+ //
+ int IsDead() {return _bad_server;}
+ void IsDead(int flag) {_bad_server = flag;}
+
+ //
+ // Add a path to the queue for this server.
+ // This will check to see if the server is up if the URL is not local;
+ // if it's down, the path simply will not be added
+ //
+ void push(const String &path, int hopcount, const String &referer,
+ int local = 0, int newDoc = 1);
+
+ //
+ // Return the next URL from the queue for this server.
+ //
+ URLRef *pop();
+
+ //
+ // Delays the server if necessary. If the time between requests
+ // is long enough, the request can occur immediately.
+ //
+ void delay();
+
+ //
+ // Produce statistics for this server.
+ //
+ void reportStatistics(String &out, char *name);
+
+ //
+ // Methods for managing persistent connections
+ //
+ void AllowPersistentConnection() { _persistent_connections = true; }
+ void AvoidPersistentConnection() { _persistent_connections = false; }
+ bool IsPersistentConnectionAllowed () const
+ { return _persistent_connections; }
+
+ // Methods for getting info regarding server configuration
+ bool HeadBeforeGet() const { return _head_before_get; }
+ unsigned int TimeOut() const { return _timeout; }
+ unsigned int TcpWaitTime() const { return _tcp_wait_time; }
+ unsigned int TcpMaxRetries() const { return _tcp_max_retries; }
+ unsigned int MaxDocuments() const { return _max_documents; }
+ const String &UserAgent() const { return _user_agent; }
+ const String &AcceptLanguage() const { return _accept_language; }
+ bool DisableCookies() const { return _disable_cookies; }
+
+ //
+ // Check whether a URL is disallowed by this server's robots.txt
+ // (these exclusions act like additions to the exclude_urls attribute)
+ //
+ int IsDisallowed(String url) { return _disallow.match(url, 0, 0); }
+
+private:
+ String _host;
+ int _port;
+ int _bad_server; // TRUE if we shouldn't use this one
+ int _connection_space; // Seconds between connections
+ HtDateTime _last_connection; // Time of last connection to this server
+ HtHeap _paths;
+ HtRegex _disallow; // This pattern will be used to test paths
+ int _documents; // Number of documents visited
+
+ int _max_documents; // Maximum number of documents from this server
+
+ bool _persistent_connections; // Are persistent connections allowed?
+
+ bool _head_before_get; // HEAD call before a GET?
+
+ bool _disable_cookies; // Are cookies disabled for this server?
+
+ int _timeout; // Timeout for this server
+
+ unsigned int _tcp_wait_time; // Wait time after a timeout
+ // has been raised.
+
+ unsigned int _tcp_max_retries; // Max number of retries when
+ // connection is not possible
+ // and timeout occurs
+ String _user_agent; // User agent to use for this server
+ String _accept_language; // Accept-language to be sent
+ // for the HTTP server
+
+
+};
+
+#endif
+
+
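
Editor's note: Server::delay(), declared above and defined in Server.cc, enforces the
politeness interval between requests: it measures how long it has been since the last
connection and sleeps for the remainder if the gap is too short, then resets the clock.
A minimal standalone sketch of the same behaviour with std::chrono and std::this_thread;
the class and member names below are stand-ins, not htdig's classes.

#include <chrono>
#include <thread>

class PoliteServer
{
public:
    explicit PoliteServer(std::chrono::seconds spacing)
        : _connection_space(spacing),
          _last_connection(std::chrono::steady_clock::now() - spacing) {}

    // Sleep just long enough so that consecutive requests to this server
    // are at least _connection_space apart, then reset the clock.
    void delay()
    {
        auto now = std::chrono::steady_clock::now();
        auto taken = now - _last_connection;
        if (taken < _connection_space)
            std::this_thread::sleep_for(_connection_space - taken);
        _last_connection = std::chrono::steady_clock::now();
    }

private:
    std::chrono::seconds _connection_space;                  // seconds between connections
    std::chrono::steady_clock::time_point _last_connection;  // time of last connection
};

int main()
{
    PoliteServer server(std::chrono::seconds(2));
    for (int i = 0; i < 3; ++i)
    {
        server.delay();   // first call returns immediately, later calls wait ~2s
        // ... issue the HTTP request here ...
    }
    return 0;
}
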
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/URLRef.cc b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.cc
new file mode 100644
index 00000000..6cc8bc43
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.cc
@@ -0,0 +1,47 @@
+//
+// URLRef.cc
+//
+// URLRef: A definition of a URL/Referer pair with associated hopcount
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: URLRef.cc,v 1.9 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "URLRef.h"
+
+
+//*****************************************************************************
+// URLRef::URLRef()
+//
+URLRef::URLRef()
+{
+ hopcount = 0;
+}
+
+
+//*****************************************************************************
+// URLRef::~URLRef()
+//
+URLRef::~URLRef()
+{
+}
+
+
+//*****************************************************************************
+//
+int URLRef::compare(const URLRef& to) const
+{
+ return hopcount - to.hopcount;
+}
+
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/URLRef.h b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.h
new file mode 100644
index 00000000..dfc251ec
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.h
@@ -0,0 +1,50 @@
+//
+// URLRef.h
+//
+// URLRef: A definition of a URL/Referer pair with associated hopcount
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: URLRef.h,v 1.9 2004/05/28 13:15:16 lha Exp $
+//
+//
+#ifndef _URLRef_h_
+#define _URLRef_h_
+
+#include "Object.h"
+#include "htString.h"
+#include "URL.h"
+
+class URLRef : public Object
+{
+public:
+ //
+ // Construction/Destruction
+ //
+ URLRef();
+ ~URLRef();
+
+ const URL &GetURL() const {return url;}
+ int GetHopCount() const {return hopcount;}
+ const URL &GetReferer() const {return referer;}
+
+ void SetURL(const URL &u) {url = u;}
+ void SetHopCount(int h) {hopcount = h;}
+ void SetReferer(const URL &ref) {referer = ref;}
+
+ int compare(const Object& to) const { return compare((const URLRef&) to); }
+ int compare(const URLRef& to) const;
+
+private:
+ URL url;
+ URL referer;
+ int hopcount;
+};
+
+#endif
+
+
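
Editor's note: URLRef::compare() above orders references by hopcount, which is what lets the
Server's HtHeap hand back the shallowest URL first. A small standalone illustration of the
same ordering, using std::priority_queue in place of HtHeap; the struct and names below are
simplified stand-ins, not htdig's types.

#include <iostream>
#include <queue>
#include <string>
#include <vector>

// Simplified stand-in for URLRef: just a URL string and a hopcount.
struct Ref
{
    std::string url;
    int hopcount;
};

// Order so that the smallest hopcount comes out of the queue first,
// mirroring URLRef::compare() returning (hopcount - to.hopcount).
struct ByHopCount
{
    bool operator()(const Ref &a, const Ref &b) const
    {
        return a.hopcount > b.hopcount;   // priority_queue is a max-heap, so invert
    }
};

int main()
{
    std::priority_queue<Ref, std::vector<Ref>, ByHopCount> paths;
    paths.push({"http://example.com/deep/page.html", 3});
    paths.push({"http://example.com/", 0});
    paths.push({"http://example.com/section/", 1});

    while (!paths.empty())
    {
        std::cout << paths.top().hopcount << "  " << paths.top().url << '\n';
        paths.pop();   // pops hopcount 0, then 1, then 3
    }
    return 0;
}
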
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc b/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc
new file mode 100644
index 00000000..ba1d842a
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc
@@ -0,0 +1,536 @@
+//
+// htdig.cc
+//
+// htdig: Indexes the web sites specified in the config file
+// generating several databases to be used by htmerge
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: htdig.cc,v 1.42 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "Document.h"
+#include "Retriever.h"
+#include "StringList.h"
+#include "htdig.h"
+#include "defaults.h"
+#include "HtURLCodec.h"
+#include "WordContext.h"
+#include "HtDateTime.h"
+#include "HtURLRewriter.h"
+
+////////////////////////////
+// For cookie jar
+////////////////////////////
+#include "HtCookieJar.h"
+#include "HtCookieMemJar.h"
+#include "HtCookieInFileJar.h"
+#include "HtHTTP.h"
+////////////////////////////
+
+// If we have this, we probably want it.
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#elif HAVE_GETOPT_LOCAL
+#include <getopt_local.h>
+#endif
+
+#ifdef HAVE_STD
+#include <iostream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <iostream.h>
+#endif /* HAVE_STD */
+
+//
+// Global variables
+//
+int debug = 0;
+int report_statistics = 0;
+DocumentDB docs;
+HtRegexList limits;
+HtRegexList limitsn;
+FILE *urls_seen = NULL;
+FILE *images_seen = NULL;
+String configFile = DEFAULT_CONFIG_FILE;
+String minimalFile = 0;
+HtDateTime StartTime;
+HtDateTime EndTime;
+
+void usage();
+void reportError(char *msg);
+
+
+//
+// Start of the program.
+//
+int main(int ac, char **av)
+{
+ int c;
+ extern char *optarg;
+ String credentials;
+ int initial = 0;
+ int alt_work_area = 0;
+ int create_text_database = 0;
+ char *max_hops = 0;
+
+ // Cookie jar dynamic creation.
+ HtCookieJar* _cookie_jar = new HtCookieMemJar(); // new cookie jar
+ if (_cookie_jar)
+ HtHTTP::SetCookieJar(_cookie_jar);
+
+//extern int yydebug;
+//yydebug=1;
+
+ //
+ // Parse command line arguments
+ //
+ while ((c = getopt(ac, av, "lsm:c:vith:u:a")) != -1)
+ {
+ unsigned int pos;
+ switch (c)
+ {
+ case 'c':
+ configFile = optarg;
+ break;
+ case 'v':
+ debug++;
+ break;
+ case 'i':
+ initial++;
+ break;
+ case 't':
+ create_text_database++;
+ break;
+ case 'h':
+ max_hops = optarg;
+ break;
+ case 's':
+ report_statistics++;
+ break;
+ case 'u':
+ credentials = optarg;
+ for (pos = 0; pos < strlen(optarg); pos++)
+ optarg[pos] = '*';
+ break;
+ case 'a':
+ alt_work_area++;
+ break;
+ case 'm':
+ minimalFile = optarg;
+ max_hops = "0";
+ break;
+ case '?':
+ usage();
+ default:
+ break;
+ }
+ }
+
+ // Shows Start Time
+ if (debug>0)
+ cout << "ht://dig Start Time: " << StartTime.GetAscTime() << endl;
+
+ //
+ // First set all the defaults and then read the specified config
+ // file to override the defaults.
+ //
+ HtConfiguration* const config= HtConfiguration::config();
+ config->Defaults(&defaults[0]);
+ if (access((char*)configFile, R_OK) < 0)
+ {
+ reportError(form("Unable to find configuration file '%s'",
+ configFile.get()));
+ }
+ config->Read(configFile);
+
+ // Warn the user if any obsolete options are found in the config file.
+ // For efficiency, check all the deprecated fields here. If different config
+ // files are used for searching, obsolete options in those may go unnoticed.
+ char *deprecatedOptions [] = {
+ "heading_factor_1", "heading_factor_2", "heading_factor_3",
+ "heading_factor_4", "heading_factor_5", "heading_factor_6",
+ "modification_time_is_now", "pdf_parser", "translate_amp",
+ "translate_lt_gt", "translate_quot", "uncoded_db_compatible",
+ "" // empty terminator
+ };
+ char **option;
+ for (option = deprecatedOptions; **option; option++)
+ {
+ if (!config->Find(*option).empty())
+ cout << "Warning: Configuration option " << *option <<
+ " is no longer supported\n";
+ }
+
+ if (config->Find("locale").empty() && debug > 0)
+ cout << "Warning: unknown locale!\n";
+
+ if (max_hops)
+ {
+ config->Add("max_hop_count", max_hops);
+ }
+
+ // Set up credentials for this run
+ if (credentials.length())
+ config->Add("authorization", credentials);
+
+ //
+ // Check url_part_aliases and common_url_parts for
+ // errors.
+ String url_part_errors = HtURLCodec::instance()->ErrMsg();
+
+ if (url_part_errors.length() != 0)
+ reportError(form("Invalid url_part_aliases or common_url_parts: %s",
+ url_part_errors.get()));
+
+ //
+ // Check url_rewrite_rules for errors.
+ String url_rewrite_rules = HtURLRewriter::instance()->ErrMsg();
+
+ if (url_rewrite_rules.length() != 0)
+ reportError(form("Invalid url_rewrite_rules: %s",
+ url_rewrite_rules.get()));
+
+ //
+ // If indicated, change the database file names to have the .work
+ // extension
+ //
+ if (alt_work_area != 0)
+ {
+ String configValue = config->Find("doc_db");
+
+ if (configValue.length() != 0)
+ {
+ configValue << ".work";
+ config->Add("doc_db", configValue);
+ }
+
+ configValue = config->Find("word_db");
+ if (configValue.length() != 0)
+ {
+ configValue << ".work";
+ config->Add("word_db", configValue);
+ }
+
+ configValue = config->Find("doc_index");
+ if (configValue.length() != 0)
+ {
+ configValue << ".work";
+ config->Add("doc_index", configValue);
+ }
+
+ configValue = config->Find("doc_excerpt");
+ if (configValue.length() != 0)
+ {
+ configValue << ".work";
+ config->Add("doc_excerpt", configValue);
+ }
+
+ configValue = config->Find("md5_db");
+ if (configValue.length() != 0)
+ {
+ configValue << ".work";
+ config->Add("md5_db", configValue);
+ }
+ }
+
+ // Imports the cookies file
+ const String CookiesInputFile = config->Find("cookies_input_file");
+ if (CookiesInputFile.length())
+ {
+ if (debug>0)
+ cout << "Importing Cookies input file "
+ << CookiesInputFile << endl;
+ int result;
+ HtCookieJar::SetDebugLevel(debug); // Set the debug level
+ HtCookieInFileJar* cookie_file = new HtCookieInFileJar(CookiesInputFile, result);
+ if (cookie_file)
+ {
+ if (!result)
+ {
+ if (debug>0)
+ cookie_file->ShowSummary();
+ delete _cookie_jar; // Deletes previous cookie jar
+ _cookie_jar = (HtCookieJar*) cookie_file; // set the imported one
+ HtHTTP::SetCookieJar(_cookie_jar); // and set the new HTTP jar
+ }
+ else if (debug > 0)
+ cout << "Warning: Import failed! (" << CookiesInputFile << ")" << endl;
+ }
+ else
+ reportError(form("Unable to load cookies file '%s' in memory",
+ CookiesInputFile.get()));
+ }
+
+ //
+ // If needed, we will create a list of every URL we come across.
+ //
+ if (config->Boolean("create_url_list"))
+ {
+ const String filename = config->Find("url_list");
+ urls_seen = fopen(filename, initial ? "w" : "a");
+ if (urls_seen == 0)
+ {
+ reportError(form("Unable to create URL file '%s'",
+ filename.get()));
+ }
+ }
+
+ //
+ // If needed, we will create a list of every image we come across.
+ //
+ if (config->Boolean("create_image_list"))
+ {
+ const String filename = config->Find("image_list");
+ images_seen = fopen(filename, initial ? "w" : "a");
+ if (images_seen == 0)
+ {
+ reportError(form("Unable to create images file '%s'",
+ filename.get()));
+ }
+ }
+
+ //
+ // Set up the limits list
+ //
+ StringList l(config->Find("limit_urls_to"), " \t");
+ limits.setEscaped(l, config->Boolean("case_sensitive"));
+ l.Destroy();
+
+ l.Create(config->Find("limit_normalized"), " \t");
+ limitsn.setEscaped(l, config->Boolean("case_sensitive"));
+ l.Destroy();
+
+ //
+ // Open the document database
+ //
+ const String filename = config->Find("doc_db");
+ if (initial)
+ unlink(filename);
+
+ const String index_filename = config->Find("doc_index");
+ if (initial)
+ unlink(index_filename);
+
+ const String head_filename = config->Find("doc_excerpt");
+ if (initial)
+ unlink(head_filename);
+
+ if (docs.Open(filename, index_filename, head_filename) < 0)
+ {
+ reportError(form("Unable to open/create document database '%s'",
+ filename.get()));
+ }
+
+ const String word_filename = config->Find("word_db");
+ if (initial)
+ {
+ unlink(word_filename);
+ unlink((word_filename + "_weakcmpr").get());
+
+ // Remove "duplicate detection" database
+ unlink(config->Find("md5_db"));
+
+ // using -i, also ignore seen-but-not-processed URLs from last pass
+ unlink(config->Find("url_log"));
+ }
+
+ // Initialize htword
+ WordContext::Initialize(*config);
+
+ // Create the Retriever object which we will use to parse all the
+ // HTML files.
+ // In case this is just an update dig, we seed the retriever with all
+ // URLs already in the document database.
+ //
+ Retriever retriever(Retriever_logUrl);
+ if (minimalFile.length() == 0)
+ {
+ List *list = docs.URLs();
+ retriever.Initial(*list);
+ delete list;
+
+ // Add start_url to the initial list of the retriever.
+ // Don't check a URL twice!
+ // Beware: order is important. If this bugs you, you could change the
+ // previous line retriever.Initial(*list, 0) to Initial(*list, 1)
+ retriever.Initial(config->Find("start_url"), 1);
+ }
+
+ // Handle list of URLs given in a file (stdin, if "-") specified as
+ // argument to -m or as an optional trailing argument.
+ if (optind < ac)
+ {
+ if (debug)
+ if (minimalFile.length() != 0)
+ cout << "Warning: argument " << av[optind]
+ << " overrides -m " << minimalFile << endl;
+ minimalFile = av[optind];
+ }
+ if (strcmp (minimalFile.get(), "-") == 0)
+ {
+ String str;
+ // Why not combine this with the code below, with input = stdin ?
+ while (!cin.eof())
+ {
+ cin >> str;
+ str.chop("\r\n"); // (Why "\r\n" here and "\r\n\t " below?)
+ if (str.length() > 0)
+ retriever.Initial(str, 1);
+ }
+ }
+ else if (minimalFile.length() != 0)
+ {
+ FILE *input = fopen(minimalFile.get(), "r");
+ char buffer[1000];
+
+ if (input)
+ {
+ while (fgets(buffer, sizeof(buffer), input))
+ {
+ String str(buffer);
+ str.chop("\r\n\t ");
+ if (str.length() > 0)
+ retriever.Initial(str, 1);
+ }
+ fclose(input);
+ }
+ else
+ {
+ cerr << "Could not open argument '" << minimalFile
+ << "' of flag -m\n";
+ exit (1);
+ }
+ }
+
+ //
+ // Go do it!
+ //
+ retriever.Start();
+
+ //
+ // All done with parsing.
+ //
+
+ //
+ // If the user so wants, create a text version of the document database.
+ //
+
+ if (create_text_database)
+ {
+ const String doc_list = config->Find("doc_list");
+ if (initial)
+ unlink(doc_list);
+ docs.DumpDB(doc_list);
+ const String word_dump = config->Find("word_dump");
+ if (initial)
+ unlink(word_dump);
+ HtWordList words(*config);
+ if(words.Open(config->Find("word_db"), O_RDONLY) == OK) {
+ words.Dump(word_dump);
+ }
+ }
+
+ //
+ // Cleanup
+ //
+ if (urls_seen)
+ fclose(urls_seen);
+ if (images_seen)
+ fclose(images_seen);
+
+ //
+ // If needed, report some statistics
+ //
+ if (report_statistics)
+ {
+ retriever.ReportStatistics("htdig");
+ }
+
+ // Shows End Time
+ if (debug>0)
+ {
+ EndTime.SettoNow();
+ cout << "ht://dig End Time: " << EndTime.GetAscTime() << endl;
+ }
+
+ if (_cookie_jar)
+ delete _cookie_jar;
+}
+
+
+//
+// Display usage information for the htdig program
+//
+void usage()
+{
+ cout << "usage: htdig [-v][-i][-c configfile][-t][-m minimalfile]\n";
+ cout << "This program is part of ht://Dig " << VERSION << "\n\n";
+ cout << "Options:\n";
+
+ cout << "\t-v\tVerbose mode. This increases the verbosity of the\n";
+ cout << "\t\tprogram. Using more than 2 is probably only useful\n";
+ cout << "\t\tfor debugging purposes. The default verbose mode\n";
+ cout << "\t\tgives a nice progress report while digging.\n\n";
+
+ cout << "\t-i\tInitial. Do not use any old databases. This is\n";
+ cout << "\t\taccomplished by first erasing the databases.\n\n";
+
+ cout << "\t-c configfile\n";
+ cout << "\t\tUse the specified configuration file instead of the\n";
+ cout << "\t\tdefault.\n\n";
+
+ cout << "\t-t\tCreate an ASCII version of the document database.\n";
+ cout << "\t\tThis database is easy to parse with other programs so\n";
+ cout << "\t\tthat information can be extracted from it.\n\n";
+
+ cout << "\t-h hopcount\n";
+ cout << "\t\tLimit the stored documents to those which are at\n";
+ cout << "\t\tmost hopcount links away from the start URL.\n\n";
+
+ cout << "\t-s\tReport statistics after completion.\n\n";
+
+ cout << "\t-u username:password\n";
+ cout << "\t\tTells htdig to send the supplied username and\n";
+ cout << "\t\tpassword with each HTTP request. The credentials\n";
+ cout << "\t\twill be encoded using the 'Basic' authentication scheme.\n";
+ cout << "\t\tThere *HAS* to be a colon (:) between the username\n";
+ cout << "\t\tand password.\n\n";
+
+ cout << "\t-a\tUse alternate work files.\n";
+ cout << "\t\tTells htdig to append .work to database files, causing\n";
+ cout << "\t\ta second copy of the database to be built. This allows\n";
+ cout << "\t\tthe original files to be used by htsearch during the\n";
+ cout << "\t\tindexing run.\n\n";
+
+ cout << "\t-m minimalfile (or just a file name at end of arguments)\n";
+ cout << "\t\tTells htdig to read URLs from the supplied file and index\n";
+ cout << "\t\tthem in place of (or in addition to) the existing URLs in\n";
+ cout << "\t\tthe database and the start_url. With the -m, only the\n";
+ cout << "\t\tURLs specified are added to the database. A file name of\n";
+ cout << "\t\t'-' indicates the standard input.\n\n";
+
+
+
+ exit(0);
+}
+
+//
+// Report an error and die
+//
+void reportError(char *msg)
+{
+ cout << "htdig: " << msg << "\n\n";
+ exit(1);
+}
+
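
Editor's note: one detail in main() above that is easy to miss is the handling of -u. After
copying the argument, the loop overwrites optarg with '*' characters so that the
username:password pair no longer appears in the process's argument list (for example in ps
output). A minimal standalone sketch of that pattern with POSIX getopt; the option letter and
variable names are illustrative.

#include <cstring>
#include <iostream>
#include <string>
#include <unistd.h>   // getopt, optarg

int main(int argc, char **argv)
{
    std::string credentials;
    int c;

    while ((c = getopt(argc, argv, "u:")) != -1)
    {
        switch (c)
        {
        case 'u':
            credentials = optarg;                          // keep a private copy
            for (size_t i = 0; i < std::strlen(optarg); i++)
                optarg[i] = '*';                           // scrub argv so ps shows only asterisks
            break;
        default:
            break;
        }
    }

    if (!credentials.empty())
        std::cout << "got credentials of length " << credentials.size() << '\n';
    return 0;
}
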
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/htdig.h b/debian/htdig/htdig-3.2.0b6/htdig/htdig.h
new file mode 100644
index 00000000..5eb5b9bb
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/htdig.h
@@ -0,0 +1,55 @@
+//
+// htdig.h
+//
+// htdig: Indexes the web sites specified in the config file
+// generating several databases to be used by htmerge
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: htdig.h,v 1.16 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifndef _htdig_h_
+#define _htdig_h_
+
+#include "HtConfiguration.h"
+#include "List.h"
+#include "DocumentDB.h"
+#include "StringMatch.h"
+#include "htconfig.h"
+#include "HtRegexList.h"
+#include <stdlib.h>
+
+#ifndef _MSC_VER /* _WIN32 */
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+#include <stdio.h>
+
+extern int debug;
+extern DocumentDB docs;
+extern HtRegexList limits;
+extern HtRegexList limitsn;
+extern HtRegexList excludes;
+extern HtRegexList badquerystr;
+extern FILE *urls_seen;
+extern FILE *images_seen;
+
+extern void reportError(char *msg);
+
+#endif
+
+