Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc')
-rw-r--r--    debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc    2013
1 file changed, 2013 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc
new file mode 100644
index 00000000..13243571
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc
@@ -0,0 +1,2013 @@
+//
+// Retriever.cc
+//
+// Retriever: Crawls from a list of URLs and calls the appropriate parsers.
+//            The parsers notify the Retriever object that they got something
+//            (got_* functions) and the Retriever object feeds the databases
+//            and statistics accordingly.
+//
+// Part of the ht://Dig package   <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Retriever.cc,v 1.94 2004/05/28 13:15:15 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#ifdef _MSC_VER /* _WIN32 */
+# include <sys/types.h>
+# include <winsock2.h>
+#endif
+
+#include "Retriever.h"
+#include "htdig.h"
+#include "HtWordList.h"
+#include "WordRecord.h"
+#include "URLRef.h"
+#include "Server.h"
+#include "Parsable.h"
+#include "Document.h"
+#include "StringList.h"
+#include "WordType.h"
+#include "Transport.h"
+#include "HtHTTP.h"    // For HTTP statistics
+#include "md5.h"
+#include "defaults.h"
+
+#ifndef _MSC_VER /* _WIN32 */
+#include <pwd.h>
+#endif
+
+#include <signal.h>
+#include <stdio.h>
+
+static int noSignal;
+
+// no_store_phrases:
+// If true, only store the first occurrence of each word in a document.
+static bool no_store_phrases;
+
+//*****************************************************************************
+// Retriever::Retriever()
+//
+Retriever::Retriever(RetrieverLog flags):
+    words(*(HtConfiguration::config())),
+    words_to_add(100, 0.75)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    FILE *urls_parsed;
+
+    currenthopcount = 0;
+    max_hop_count = config->Value("max_hop_count", 999999);
+
+    no_store_phrases = !config->Boolean("store_phrases");
+
+    //
+    // Initialize the flags for the various HTML factors
+    //
+
+    // text_factor
+    factor[0] = FLAG_TEXT;
+    // title_factor
+    factor[1] = FLAG_TITLE;
+    // heading factor (now generic)
+    factor[2] = FLAG_HEADING;
+    factor[3] = FLAG_HEADING;
+    factor[4] = FLAG_HEADING;
+    factor[5] = FLAG_HEADING;
+    factor[6] = FLAG_HEADING;
+    factor[7] = FLAG_HEADING;
+    // img alt text
+    //factor[8] = FLAG_KEYWORDS;
+    factor[8] = FLAG_TEXT;    // treat alt text as plain text, until it has
+                              // its own FLAG and factor.
+    // keywords factor
+    factor[9] = FLAG_KEYWORDS;
+    // META description factor
+    factor[10] = FLAG_DESCRIPTION;
+    factor[11] = FLAG_AUTHOR;
+
+    doc = new Document();
+    minimumWordLength = config->Value("minimum_word_length", 3);
+
+    log = flags;
+    // if in restart mode
+    if (Retriever_noLog != log)
+    {
+        String filelog = config->Find("url_log");
+        char buffer[1024];
+        int l;
+
+        urls_parsed = fopen((char *) filelog, "r");
+        if (urls_parsed != 0)
+        {
+            // read all urls discovered but not fetched before
+            while (fgets(buffer, sizeof(buffer), urls_parsed))
+            {
+                l = strlen(buffer);
+                if (l > 0 && buffer[l - 1] == '\n')    // strip only a real newline
+                    buffer[l - 1] = 0;
+                Initial(buffer, 2);
+            }
+            fclose(urls_parsed);
+        }
+        unlink((char *) filelog);
+    }
+
+    check_unique_md5 = config->Boolean("check_unique_md5", 0);
+    check_unique_date = config->Boolean("check_unique_date", 0);
+
+    d_md5 = 0;
+    if (check_unique_md5)
+    {
+        d_md5 = Database::getDatabaseInstance(DB_HASH);
+
+        if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK)
+        {
+            cerr << "DocumentDB::Open: " << config->Find("md5_db") << " " << strerror(errno) << "\n";
+        }
+    }
+}
+
+
+//*****************************************************************************
+// Retriever::~Retriever()
+//
+Retriever::~Retriever()
+{
+    if (d_md5)
+        d_md5->Close();
+    delete doc;
+}
+
+
+//*****************************************************************************
+// void Retriever::setUsernamePassword(const char *credentials)
+//
+void Retriever::setUsernamePassword(const char *credentials)
+{
+    doc->setUsernamePassword(credentials);
+}
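
The url_log file read above is the other half of the dump written by Start() when a dig is interrupted: one discovered-but-unfetched URL per line. A minimal standalone sketch of that lifecycle (file name and queue handling hypothetical, not htdig's classes):

#include <cstdio>
#include <cstring>

// Toy sketch of the url_log lifecycle: the interrupted run dumps the
// still-queued URLs, and the next run replays them line by line.
void dump_pending(const char *filelog, const char *const *urls, int n)
{
    FILE *f = fopen(filelog, "w");
    if (!f)
        return;
    for (int i = 0; i < n; i++)
        fprintf(f, "%s\n", urls[i]);    // same one-URL-per-line format
    fclose(f);
}

void replay_pending(const char *filelog)
{
    char buffer[1024];
    FILE *f = fopen(filelog, "r");
    if (!f)
        return;
    while (fgets(buffer, sizeof(buffer), f))
    {
        size_t l = strlen(buffer);
        if (l > 0 && buffer[l - 1] == '\n')
            buffer[l - 1] = '\0';
        // the real code calls Initial(buffer, 2) here to re-queue the URL
        printf("re-queue: %s\n", buffer);
    }
    fclose(f);
}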

+//*****************************************************************************
+// void Retriever::Initial(const String &list, int from)
+//   Add a single URL to the list of URLs to visit.
+//   Since URLs are stored on a per-server basis, we first need to find
+//   the correct server to add the URL's path to.
+//
+//   from == 0  urls in db.docs and no db.log
+//   from == 1  urls in start_url; add url only if not already in the list
+//   from == 2  add url from db.log
+//   from == 3  urls in db.docs and there was a db.log
+//
+void Retriever::Initial(const String & list, int from)
+{
+    //
+    // Split the list of urls up into individual urls.
+    //
+    StringList tokens(list, " \t");
+    String sig;
+    String url;
+    Server *server;
+
+    for (int i = 0; i < tokens.Count(); i++)
+    {
+        URL u(tokens[i]);
+        url = u.get();    // get before u.signature() resolves aliases
+        server = (Server *) servers[u.signature()];
+        if (debug > 2)
+            cout << "\t" << from << ":" << (int) log << ":" << url;
+        if (!server)
+        {
+            String robotsURL = u.signature();
+            robotsURL << "robots.txt";
+            StringList *localRobotsFile = GetLocal(robotsURL);
+
+            server = new Server(u, localRobotsFile);
+            servers.Add(u.signature(), server);
+            delete localRobotsFile;
+        }
+
+        if (from && visited.Exists(url))
+        {
+            if (debug > 2)
+                cout << " skipped" << endl;
+            continue;
+        }
+        else if (IsValidURL(url) != 1)
+        {
+            if (debug > 2)
+                cout << endl;
+            continue;
+        }
+
+        if (Retriever_noLog == log || from != 3)
+        {
+            if (debug > 2)
+                cout << " pushed";
+            server->push(u.get(), 0, 0, IsLocalURL(url.get()));
+        }
+        if (debug > 2)
+            cout << endl;
+        visited.Add(url, 0);
+    }
+}
+
+
+//*****************************************************************************
+// void Retriever::Initial(List &list, int from)
+//
+void Retriever::Initial(List & list, int from)
+{
+    list.Start_Get();
+    String *str;
+
+    // from == 0 is an optimisation for pushing urls in update mode,
+    // assuming that
+    // 1) there are many more urls in docdb
+    // 2) they're pushed first
+    // 3) there's no duplicate url in docdb
+    // then they don't need to be checked against already pushed urls.
+    // But 2) can be false with the -l option.
+    //
+    // FIXME: it's nasty; what has to be tested is: we have urls to push
+    // from db.docs, but do we already have them in db.log? For this it
+    // relies on a side effect of 'visited', on urls in db.docs being
+    // pushed only via this method, and on db.log being pushed first,
+    // db.docs second, start_urls third!
+    //
+    if (!from && visited.Count())
+    {
+        from = 3;
+    }
+    while ((str = (String *) list.Get_Next()))
+    {
+        Initial(str->get(), from);
+    }
+}
+
+//*****************************************************************************
+//
+static void sigexit(int)
+{
+    noSignal = 0;    // don't exit here... just set the flag.
+}
+
+static void sigpipe(int)
+{
+}
+
+//*****************************************************************************
+// static void sig_handlers
+//   initialise signal handlers
+//
+static void sig_handlers(void)
+{
+#ifndef _MSC_VER /* _WIN32 */
+    //POSIX SIGNALS
+    struct sigaction action;
+
+    /* SIGINT, SIGQUIT, SIGTERM */
+    action.sa_handler = sigexit;
+    sigemptyset(&action.sa_mask);
+    action.sa_flags = 0;
+    if (sigaction(SIGINT, &action, NULL) != 0)
+        reportError("Cannot install SIGINT handler\n");
+    if (sigaction(SIGQUIT, &action, NULL) != 0)
+        reportError("Cannot install SIGQUIT handler\n");
+    if (sigaction(SIGTERM, &action, NULL) != 0)
+        reportError("Cannot install SIGTERM handler\n");
+    if (sigaction(SIGHUP, &action, NULL) != 0)
+        reportError("Cannot install SIGHUP handler\n");
+#else
+    //ANSI C signal handling - Limited to supported Windows signals.
+    signal(SIGINT, sigexit);
+    signal(SIGTERM, sigexit);
+#endif //_MSC_VER /* _WIN32 */
+}
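
The pattern above is the classic "handler only sets a flag" design: the handler does no I/O and no cleanup, and the main loop polls the flag so the databases can be flushed cleanly. A self-contained POSIX sketch of the same pattern (htdig uses a plain int for noSignal; volatile sig_atomic_t is the portable choice):

#include <signal.h>
#include <stdio.h>

// Minimal flag-based signal handling: the handler only flips a flag,
// the work loop checks it and shuts down in an orderly way.
static volatile sig_atomic_t keep_running = 1;

static void on_signal(int) { keep_running = 0; }

int main()
{
    struct sigaction action;
    action.sa_handler = on_signal;
    sigemptyset(&action.sa_mask);
    action.sa_flags = 0;
    sigaction(SIGINT, &action, NULL);

    while (keep_running)
    {
        // fetch and parse one URL per iteration...
    }
    puts("caught signal, flushing state before exit");
    return 0;
}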
+
+
+static void sig_phandler(void)
+{
+#ifndef _MSC_VER /* _WIN32 */
+    struct sigaction action;
+
+    sigemptyset(&action.sa_mask);
+    action.sa_handler = sigpipe;
+    action.sa_flags = SA_RESTART;
+    if (sigaction(SIGPIPE, &action, NULL) != 0)
+        reportError("Cannot install SIGPIPE handler\n");
+#endif //_MSC_VER /* _WIN32 */
+}
+
+
+//*****************************************************************************
+// static void win32_check_messages
+//   Check WIN32 messages!
+//
+#ifdef _MSC_VER /* _WIN32 */
+static void win32_check_messages(void)
+{
+// NEAL - NEEDS FINISHING/TESTING
+#if 0
+    MSG msg = {0, 0, 0, 0};
+    int cDown = 0;
+    int controlDown = 0;
+
+    if (GetMessage(&msg, 0, 0, 0))
+    {
+        switch (msg.message)
+        {
+        case WM_KEYDOWN:
+            {
+                if (LOWORD(msg.message) == 17)
+                    controlDown = 1;
+                else if (LOWORD(msg.message) == 67)
+                {
+                    cDown = 1;
+                }
+            }
+            break;
+        case WM_KEYUP:
+            {
+                if (LOWORD(msg.message) == 17)
+                    controlDown = 0;
+                else if (LOWORD(msg.message) == 67)
+                    cDown = 0;
+            }
+            break;
+        }
+    }
+
+    DispatchMessage(&msg);
+#endif
+}
+#endif //_MSC_VER /* _WIN32 */
+
+
+//*****************************************************************************
+// void Retriever::Start()
+//   This is the main loop of the retriever. We will go through the
+//   list of paths stored for each server. While parsing the
+//   retrieved documents, new paths will be added to the servers. We
+//   return if no more paths need to be retrieved.
+//
+void Retriever::Start()
+{
+    //
+    // Main digger loop. The todo list should initially have the start
+    // URL and all the URLs which were seen in a previous dig. The
+    // loop will continue as long as there are more URLs to visit.
+    //
+    int more = 1;
+    Server *server;
+    URLRef *ref;
+
+    HtConfiguration *config = HtConfiguration::config();
+
+    //
+    // Always install the signal handlers. The delay bothers me,
+    // but a bad db is worse.
+    //
+    if (Retriever_noLog != log)
+    {
+        sig_handlers();
+    }
+    sig_phandler();
+    noSignal = 1;
+
+///////
+    // Main loop. We keep on retrieving until a signal is received
+    // or all the servers' queues are empty.
+///////
+
+#ifdef _MSC_VER /* _WIN32 */
+    win32_check_messages();
+#endif
+
+    while (more && noSignal)
+    {
+        more = 0;
+
+        //
+        // Go through all the current servers in sequence.
+        // If they support persistent connections, we keep on popping
+        // from the same server queue until it's empty or we reach a maximum
+        // number of consecutive requests ("max_connection_requests").
+        // Or the loop may continue indefinitely, if we set
+        // "max_connection_requests" to -1.
+        // If the server doesn't support persistent connections, we take
+        // only one URL from it, then we skip to the next server.
+        //
+        // Since 15.05.02: even when persistent connections are activated
+        // we should wait for a 'server_wait_time' number of seconds
+        // after the 'max_connection_requests' value has been reached.
+        //
+
+        // Let's position at the beginning
+        servers.Start_Get();
+
+        int count;
+
+        // Maximum number of repeated requests with the same
+        // TCP connection (so on the same Server:Port).
+        int max_connection_requests;
+
+#ifdef _MSC_VER /* _WIN32 */
+        win32_check_messages();
+#endif
+
+        while ((server = (Server *) servers.Get_NextElement()) && noSignal)
+        {
+            if (debug > 1)
+                cout << "pick: " << server->host() << ", # servers = " << servers.Count() << endl;
+
+            // We already know if a server supports HTTP persistent
+            // connections, because we asked it for the robots.txt file
+            // (constructor of the class).
+
+            // If the Server doesn't support persistent connections
+            // we turn it down to 1.
+            if (server->IsPersistentConnectionAllowed())
+            {
+                // Let's check for a '0' value (out of range)
+                // If set, we change it to 1.
+                if (config->Value("server", server->host(), "max_connection_requests") == 0)
+                    max_connection_requests = 1;
+                else
+                    max_connection_requests =
+                        config->Value("server", server->host(), "max_connection_requests");
+
+                if (debug > 2)
+                {
+                    cout << "> " << server->host() << " supports HTTP persistent connections";
+
+                    if (max_connection_requests == -1)
+                        cout << " (" << "infinite" << ")" << endl;
+                    else
+                        cout << " (" << max_connection_requests << ")" << endl;
+                }
+            }
+            else
+            {
+                // No HTTP persistent connections. So we request only 1 document.
+                max_connection_requests = 1;
+
+                if (debug > 2)
+                    cout << "> " << server->host() << " with a traditional HTTP connection" << endl;
+            }
+
+            count = 0;
+
+#ifdef _MSC_VER /* _WIN32 */
+            win32_check_messages();
+#endif
+
+            while (((max_connection_requests == -1) ||
+                    (count < max_connection_requests)) && (ref = server->pop()) && noSignal)
+            {
+                count++;
+
+                //
+                // We have a URL to index, now. We need to register the
+                // fact that we are not done yet by setting the 'more'
+                // variable. So, we have to restart scanning the queue.
+                //
+                more = 1;
+
+                //
+                // Deal with the actual URL.
+                // We'll check with the server to see if we need to sleep()
+                // before parsing it.
+                //
+                parse_url(*ref);
+                delete ref;
+
+                // We reached the maximum number of connections (either with
+                // or without persistent connections) and we must pause and
+                // respect the 'net ethic'.
+                if ((max_connection_requests - count) == 0)
+                    server->delay();    // This will pause if needed
+                                        // and reset the time
+
+#ifdef _MSC_VER /* _WIN32 */
+                win32_check_messages();
+#endif
+            }
+
+#ifdef _MSC_VER /* _WIN32 */
+            win32_check_messages();
+#endif
+        }
+    }
+
+#ifdef _MSC_VER /* _WIN32 */
+    win32_check_messages();
+#endif
+
+    // if we exited on a signal
+    if (Retriever_noLog != log && !noSignal)
+    {
+        FILE *urls_parsed;
+        String filelog = config->Find("url_log");
+        // save urls seen but not fetched
+        urls_parsed = fopen((char *) filelog, "w");
+        if (0 == urls_parsed)
+        {
+            reportError(form("Unable to create URL log file '%s'", filelog.get()));
+        }
+        else
+        {
+            servers.Start_Get();
+            while ((server = (Server *) servers.Get_NextElement()))
+            {
+                while (NULL != (ref = server->pop()))
+                {
+                    fprintf(urls_parsed, "%s\n", (const char *) ref->GetURL().get());
+                    delete ref;
+                }
+            }
+            fclose(urls_parsed);
+        }
+    }
+    words.Close();
+}
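
The scheduling in Start() is easier to see stripped of the transport details: round-robin over servers, popping at most max_connection_requests URLs per server per pass, rescanning until every queue is empty. A toy model with hypothetical types and no networking:

#include <deque>
#include <iostream>
#include <map>
#include <string>

// Toy model of Start()'s scheduling loop (types are hypothetical).
struct ToyServer { std::deque<std::string> queue; };

void crawl_pass(std::map<std::string, ToyServer> &servers, int max_requests)
{
    bool more = true;
    while (more)
    {
        more = false;
        for (auto &kv : servers)
        {
            ToyServer &s = kv.second;
            int count = 0;
            while ((max_requests == -1 || count < max_requests) && !s.queue.empty())
            {
                ++count;
                more = true;    // something was popped: rescan all queues later
                std::cout << kv.first << " -> " << s.queue.front() << '\n';
                s.queue.pop_front();
            }
            // a real crawler pauses here (server->delay()) when the
            // per-connection request cap has been reached
        }
    }
}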
+
+
+//*****************************************************************************
+// void Retriever::parse_url(URLRef &urlRef)
+//
+void Retriever::parse_url(URLRef & urlRef)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    URL url;
+    DocumentRef *ref;
+    int old_document;
+    time_t date;
+    static int index = 0;
+    static int local_urls_only = config->Boolean("local_urls_only");
+    static int mark_dead_servers = config->Boolean("ignore_dead_servers");
+    Server *server;
+
+    url.parse(urlRef.GetURL().get());
+
+    currenthopcount = urlRef.GetHopCount();
+
+    ref = docs[url.get()];    // It might be nice to have just an Exists() here
+    if (ref)
+    {
+        //
+        // We already have an entry for this document in our database.
+        // This means we can get the document ID and last modification
+        // time from there.
+        //
+        current_id = ref->DocID();
+        date = ref->DocTime();
+        if (ref->DocAccessed())
+            old_document = 1;
+        else    // we haven't retrieved it yet, so we only have the first link
+            old_document = 0;
+        ref->DocBackLinks(ref->DocBackLinks() + 1);    // we had a new link
+        ref->DocAccessed(time(0));
+        ref->DocState(Reference_normal);
+        currenthopcount = ref->DocHopCount();
+    }
+    else
+    {
+        //
+        // Never seen this document before. We need to create an
+        // entry for it. This implies that it gets a new document ID.
+        //
+        date = 0;
+        current_id = docs.NextDocID();
+        ref = new DocumentRef;
+        ref->DocID(current_id);
+        ref->DocURL(url.get());
+        ref->DocState(Reference_normal);
+        ref->DocAccessed(time(0));
+        ref->DocHopCount(currenthopcount);
+        ref->DocBackLinks(1);    // We had to have a link to get here!
+        old_document = 0;
+    }
+
+    word_context.DocID(ref->DocID());
+
+    if (debug > 0)
+    {
+        //
+        // Display progress
+        //
+        cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << url.get() << ": ";
+        cout.flush();
+    }
+
+    // Reset the document to clean out any old data
+    doc->Reset();
+    doc->Url(url.get());
+    doc->Referer(urlRef.GetReferer().get());
+
+    base = doc->Url();
+
+    // Retrieve document, first trying local file access if possible.
+    Transport::DocStatus status;
+    server = (Server *) servers[url.signature()];
+    StringList *local_filenames = GetLocal(url.get());
+    if (local_filenames)
+    {
+        if (debug > 1)
+            cout << "Trying local files" << endl;
+        status = doc->RetrieveLocal(date, local_filenames);
+        if (status == Transport::Document_not_local)
+        {
+            if (debug > 1)
+                cout << "Local retrieval failed, trying HTTP" << endl;
+            if (server && !server->IsDead() && !local_urls_only)
+                status = doc->Retrieve(server, date);
+            else
+                status = Transport::Document_no_host;
+        }
+        delete local_filenames;
+    }
+    else if (server && !server->IsDead() && !local_urls_only)
+        status = doc->Retrieve(server, date);
+    else
+        status = Transport::Document_no_host;
+
+    current_ref = ref;
+
+    //
+    // Determine what to do by looking at the status code returned by
+    // the Document retrieval process.
+    //
+
+    String shash;
+    String sx;
+    char bhash[16];
+    time_t ddate;
+
+    switch (status)
+    {
+    case Transport::Document_ok:
+        trackWords = 1;
+
+        if (check_unique_md5)
+        {
+            if (doc->StoredLength() > 0)
+            {
+                if (check_unique_date)
+                {
+                    ddate = doc->ModTime();
+                    if (ddate < time(NULL) - 10)
+                    {    // Unknown date was set to current time
+                        md5(bhash, doc->Contents(), doc->StoredLength(), &ddate, debug);
+                    }
+                    else
+                    {
+                        md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug);
+                    }
+                }
+                else
+                    md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug);
+
+                shash.append(bhash, MD5_LENGTH);
+                d_md5->Get(shash, sx);
+
+                if (!sx.empty())
+                {
+                    if (debug > 1)
+                    {
+                        cout << " Detected duplicate by md5 hash" << endl;
+                    }
+                    words.Skip();
+                    break;    // Duplicate - don't index
+                }
+                else
+                {
+                    d_md5->Put(shash, "x");
+                }
+            }
+        }
+
+        if (old_document)
+        {
+            if (doc->ModTime() == ref->DocTime())
+            {
+                words.Skip();
+                if (debug)
+                    cout << " retrieved but not changed" << endl;
+                break;
+            }
+            //
+            // Since we already had a record of this document and
+            // we were able to retrieve it, it must have changed
+            // since the last time we scanned it. This means that
+            // we need to assign a new document ID to it and mark
+            // the old one as obsolete.
+            //
+            words.Skip();
+            int backlinks = ref->DocBackLinks();
+            ref->DocState(Reference_obsolete);
+            docs.Add(*ref);
+            delete ref;
+
+            current_id = docs.NextDocID();
+            word_context.DocID(current_id);
+            ref = new DocumentRef;
+            ref->DocID(current_id);
+            ref->DocURL(url.get());
+            ref->DocState(Reference_normal);
+            ref->DocAccessed(time(0));
+            ref->DocHopCount(currenthopcount);
+            ref->DocBackLinks(backlinks);
+            if (debug)
+                cout << " (changed) ";
+        }
+        RetrievedDocument(*doc, url.get(), ref);
+        // Hey! If this document is marked noindex, don't even bother
+        // adding new words. Mark this as gone and get rid of it!
+        if (ref->DocState() == Reference_noindex)
+        {
+            if (debug > 1)
+                cout << " ( " << ref->DocURL() << " ignored)";
+            words.Skip();
+        }
+        else
+            words.Flush();
+        if (debug)
+            cout << " size = " << doc->Length() << endl;
+
+        if (urls_seen)
+        {
+            fprintf(urls_seen, "%s|%d|%s|%d|%d|1\n",
+                    (const char *) url.get(), doc->Length(), doc->ContentType(),
+                    (int) doc->ModTime(), currenthopcount);
+        }
+        break;
+
+    case Transport::Document_not_changed:
+        if (debug)
+            cout << " not changed" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_not_found:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " not found" << endl;
+        recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_not_found);
+        words.Skip();
+        break;
+
+    case Transport::Document_no_host:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " host not found" << endl;
+        recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_host);
+        words.Skip();
+
+        // Mark the server as being down
+        if (server && mark_dead_servers)
+            server->IsDead(1);
+        break;
+
+    case Transport::Document_no_port:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " host not found (port)" << endl;
+        recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_port);
+        words.Skip();
+
+        // Mark the server as being down
+        if (server && mark_dead_servers)
+            server->IsDead(1);
+        break;
+
+    case Transport::Document_not_parsable:
+        ref->DocState(Reference_noindex);
+        if (debug)
+            cout << " not parsable" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_redirect:
+        if (debug)
+            cout << " redirect" << endl;
+        ref->DocState(Reference_obsolete);
+        words.Skip();
+        got_redirect(doc->Redirected(), ref, (urlRef.GetReferer()).get());
+        break;
+
+    case Transport::Document_not_authorized:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " not authorized" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_not_local:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " not local" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_no_header:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " no header" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_connection_down:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " connection down" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_no_connection:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " no connection" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_not_recognized_service:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " service not recognized" << endl;
+
+        // Mark the server as being down
+        if (server && mark_dead_servers)
+            server->IsDead(1);
+        words.Skip();
+        break;
+
+    case Transport::Document_other_error:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " other error" << endl;
+        words.Skip();
+        break;
+    }
+    docs.Add(*ref);
+    delete ref;
+}
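
The check_unique_md5 branch above skips a page whose body hashes to a digest already seen at another URL. Reduced to its control flow, with std::set standing in for the md5_db Berkeley DB file and std::hash standing in for htdig's md5() helper, the idea is:

#include <functional>
#include <set>
#include <string>

// Sketch of the duplicate test in the Document_ok branch: hash the body,
// skip the document if that hash was already recorded. std::set and
// std::hash are stand-ins; this only illustrates the control flow.
static std::set<size_t> seen_hashes;

bool is_duplicate(const std::string &body)
{
    size_t h = std::hash<std::string>()(body);
    if (seen_hashes.count(h))
        return true;          // same contents seen at another URL: skip it
    seen_hashes.insert(h);    // first sighting: record it and index normally
    return false;
}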
+
+
+//*****************************************************************************
+// void Retriever::RetrievedDocument(Document &doc, const String &url, DocumentRef *ref)
+//   We found a document that needs to be parsed. Since we don't know the
+//   document type, we'll let the Document itself return an appropriate
+//   Parsable object which we can call upon to parse the document contents.
+//
+void Retriever::RetrievedDocument(Document & doc, const String & url, DocumentRef * ref)
+{
+    n_links = 0;
+    current_ref = ref;
+    current_title = 0;
+    word_context.Anchor(0);
+    current_time = 0;
+    current_head = 0;
+    current_meta_dsc = 0;
+
+    //
+    // Create a parser object and let it have a go at the document.
+    // We will pass ourselves as a callback object for all the got_*()
+    // routines.
+    // This will generate the Parsable object as a specific parser
+    //
+    Parsable *parsable = doc.getParsable();
+    if (parsable)
+        parsable->parse(*this, *base);
+    else
+    {    // If we didn't get a parser, then we should get rid of this!
+        ref->DocState(Reference_noindex);
+        return;
+    }
+
+    // If just storing the first occurrence of each word in a document,
+    // we must now flush the words we saw in that document
+    if (no_store_phrases)
+    {
+        DictionaryCursor cursor;
+        char *key;
+        HtWordReference wordRef;
+        for (words_to_add.Start_Get(cursor);
+             (key = words_to_add.Get_Next(cursor)); )
+        {
+            word_entry *entry = (word_entry *) (words_to_add[key]);
+
+            wordRef.Location(entry->location);
+            wordRef.Flags(entry->flags);
+            wordRef.Word(key);
+            words.Replace(WordReference::Merge(wordRef, entry->context));
+            // How do I clean up properly?
+            delete entry;
+        }
+        words_to_add.Release();
+    }
+
+    //
+    // We don't need to dispose of the parsable object since it will
+    // automatically be reused.
+    //
+
+    //
+    // Update the document reference
+    //
+    ref->DocHead((char *) current_head);
+    ref->DocMetaDsc((char *) current_meta_dsc);
+    if (current_time == 0)
+        ref->DocTime(doc.ModTime());
+    else
+        ref->DocTime(current_time);
+    ref->DocTitle((char *) current_title);
+    ref->DocSize(doc.Length());
+    ref->DocAccessed(time(0));
+    ref->DocLinks(n_links);
+}
+
+
+//*****************************************************************************
+// int Retriever::Need2Get(const String &u)
+//   Return TRUE if we need to retrieve the given url. This will
+//   check the list of urls we have already visited.
+//
+int Retriever::Need2Get(const String & u)
+{
+    static String url;
+    url = u;
+
+    return !visited.Exists(url);
+}
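
The no_store_phrases path above accumulates one entry per distinct word while parsing, OR-ing in the flag of every context the word appears in, then flushes once per document. A simplified sketch of that bookkeeping with std::map in place of htdig's Dictionary (types are illustrative only):

#include <iostream>
#include <map>
#include <string>

// Sketch of the accumulate-then-flush pattern used by words_to_add.
struct Entry { int location; unsigned flags; };

void add_word(std::map<std::string, Entry> &words_to_add,
              const std::string &w, int location, unsigned flag)
{
    auto it = words_to_add.find(w);
    if (it == words_to_add.end())
        words_to_add[w] = Entry{location, flag};    // first occurrence wins
    else
        it->second.flags |= flag;                   // just merge the flag
}

void flush_words(std::map<std::string, Entry> &words_to_add)
{
    for (const auto &kv : words_to_add)
        std::cout << kv.first << " @" << kv.second.location
                  << " flags=" << kv.second.flags << '\n';
    words_to_add.clear();    // the equivalent of words_to_add.Release()
}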
+
+
+//*****************************************************************************
+// int Retriever::IsValidURL(const String &u)
+//   Return 1 if the given url passes the configured limits, or a
+//   negative error code if it has to be rejected.
+//
+int Retriever::IsValidURL(const String & u)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    Dictionary invalids;
+    Dictionary valids;
+    URL aUrl(u);
+    StringList tmpList;
+
+    // A list of bad extensions, separated by spaces or tabs
+    String t = config->Find(&aUrl, "bad_extensions");
+    String lowerp;
+    char *p = strtok(t, " \t");
+    while (p)
+    {
+        // Extensions are case insensitive
+        lowerp = p;
+        lowerp.lowercase();
+        invalids.Add(lowerp, 0);
+        p = strtok(0, " \t");
+    }
+
+    //
+    // Valid extensions are handled similarly
+    //
+    // A list of valid extensions, separated by spaces or tabs
+    t = config->Find(&aUrl, "valid_extensions");
+    p = strtok(t, " \t");
+    while (p)
+    {
+        // Extensions are case insensitive
+        lowerp = p;
+        lowerp.lowercase();
+        valids.Add(lowerp, 0);
+        p = strtok(0, " \t");
+    }
+
+    static String url;
+    url = u;
+
+    //
+    // If the URL contains any of the patterns in the exclude list,
+    // mark it as invalid
+    //
+    String exclude_urls = config->Find(&aUrl, "exclude_urls");
+    static String *prevexcludes = 0;
+    static HtRegexList *excludes = 0;
+    if (!excludes || !prevexcludes || prevexcludes->compare(exclude_urls) != 0)
+    {
+        if (!excludes)
+            excludes = new HtRegexList;
+        if (prevexcludes)
+            delete prevexcludes;
+        prevexcludes = new String(exclude_urls);
+        tmpList.Create(exclude_urls, " \t");
+        excludes->setEscaped(tmpList, config->Boolean("case_sensitive"));
+        tmpList.Destroy();
+    }
+    if (excludes->match(url, 0, 0) != 0)
+    {
+        if (debug > 2)
+            cout << endl << "   Rejected: item in exclude list ";
+        return (HTDIG_ERROR_TESTURL_EXCLUDE);
+    }
+
+    //
+    // If the URL has a query string and it is in the bad query list
+    // mark it as invalid
+    //
+    String bad_querystr = config->Find(&aUrl, "bad_querystr");
+    static String *prevbadquerystr = 0;
+    static HtRegexList *badquerystr = 0;
+    if (!badquerystr || !prevbadquerystr || prevbadquerystr->compare(bad_querystr) != 0)
+    {
+        if (!badquerystr)
+            badquerystr = new HtRegexList;
+        if (prevbadquerystr)
+            delete prevbadquerystr;
+        prevbadquerystr = new String(bad_querystr);
+        tmpList.Create(bad_querystr, " \t");
+        badquerystr->setEscaped(tmpList, config->Boolean("case_sensitive"));
+        tmpList.Destroy();
+    }
+    char *ext = strrchr((char *) url, '?');
+    if (ext && badquerystr->match(ext, 0, 0) != 0)
+    {
+        if (debug > 2)
+            cout << endl << "   Rejected: item in bad query list ";
+        return (HTDIG_ERROR_TESTURL_BADQUERY);
+    }
+
+    //
+    // See if the file extension is in the list of invalid ones
+    //
+    String urlpath = url.get();
+    int parm = urlpath.indexOf('?');    // chop off URL parameter
+    if (parm >= 0)
+        urlpath.chop(urlpath.length() - parm);
+    ext = strrchr((char *) urlpath.get(), '.');
+    String lowerext;
+    if (ext && strchr(ext, '/'))    // Ignore a dot if it's not in the
+        ext = NULL;                 // final component of the path.
+    if (ext)
+    {
+        lowerext.set(ext);
+        lowerext.lowercase();
+        if (invalids.Exists(lowerext))
+        {
+            if (debug > 2)
+                cout << endl << "   Rejected: Extension is invalid!";
+            return (HTDIG_ERROR_TESTURL_EXTENSION);
+        }
+    }
+    //
+    // Or NOT in the list of valid ones
+    //
+    if (ext && valids.Count() > 0 && !valids.Exists(lowerext))
+    {
+        if (debug > 2)
+            cout << endl << "   Rejected: Extension is not valid!";
+        return (HTDIG_ERROR_TESTURL_EXTENSION2);
+    }
+
+    //
+    // If none of the limits is met, we disallow the URL
+    //
+    if (limits.match(url, 1, 0) == 0)
+    {
+        if (debug > 1)
+            cout << endl << "   Rejected: URL not in the limits! ";
+        return (HTDIG_ERROR_TESTURL_LIMITS);
+    }
"; + return (HTDIG_ERROR_TESTURL_LIMITS); + } + // + // Likewise if not in list of normalized urls + // + // Warning! + // should be last in checks because of aUrl normalization + // + // signature() implicitly normalizes the URL. Be efficient... + Server *server = (Server *) servers[aUrl.signature()]; +// aUrl.normalize(); + if (limitsn.match(aUrl.get(), 1, 0) == 0) + { + if (debug > 2) + cout << endl << " Rejected: not in \"limit_normalized\" list!"; + return (HTDIG_ERROR_TESTURL_LIMITSNORM); + } + + // + // After that gauntlet, check to see if the server allows it + // (robots.txt) + // + if (server && server->IsDisallowed(url) != 0) + { + if (debug > 2) + cout << endl << " Rejected: forbidden by server robots.txt!"; + return (HTDIG_ERROR_TESTURL_ROBOT_FORBID); + } + + return (1); +} + + +//***************************************************************************** +// StringList* Retriever::GetLocal(const String &url) +// Returns a list of strings containing the (possible) local filenames +// of the given url, or 0 if it's definitely not local. +// THE CALLER MUST FREE THE STRINGLIST AFTER USE! +// Returned strings are not hex encoded. +// +StringList *Retriever::GetLocal(const String & strurl) +{ + HtConfiguration *config = HtConfiguration::config(); + static StringList *prefixes = 0; + String url = strurl; + + static StringList *paths = 0; + StringList *defaultdocs = 0; + URL aUrl(url); + url = aUrl.get(); // make sure we look at a parsed URL + + // + // Initialize prefix/path list if this is the first time. + // The list is given in format "prefix1=path1 prefix2=path2 ..." + // + if (!prefixes) + { + prefixes = new StringList(); + paths = new StringList(); + + String t = config->Find("local_urls"); + char *p = strtok(t, " \t"); + while (p) + { + char *path = strchr(p, '='); + if (!path) + { + p = strtok(0, " \t"); + continue; + } + *path++ = '\0'; + String *pre = new String(p); + decodeURL(*pre); + prefixes->Add(pre); + String *pat = new String(path); + decodeURL(*pat); + paths->Add(pat); + p = strtok(0, " \t"); + } + } + if (!config->Find(&aUrl, "local_default_doc").empty()) + { + defaultdocs = new StringList(); + String t = config->Find(&aUrl, "local_default_doc"); + char *p = strtok(t, " \t"); + while (p) + { + String *def = new String(p); + decodeURL(*def); + defaultdocs->Add(def); + p = strtok(0, " \t"); + } + if (defaultdocs->Count() == 0) + { + delete defaultdocs; + defaultdocs = 0; + } + } + + // Begin by hex-decoding URL... + String hexurl = url; + decodeURL(hexurl); + url = hexurl.get(); + + // Check first for local user... + if (strchr(url.get(), '~')) + { + StringList *local = GetLocalUser(url, defaultdocs); + if (local) + { + if (defaultdocs) + delete defaultdocs; + return local; + } + } + + // This shouldn't happen, but check anyway... 
+
+
+//*****************************************************************************
+// StringList* Retriever::GetLocal(const String &url)
+//   Returns a list of strings containing the (possible) local filenames
+//   of the given url, or 0 if it's definitely not local.
+//   THE CALLER MUST FREE THE STRINGLIST AFTER USE!
+//   Returned strings are not hex encoded.
+//
+StringList *Retriever::GetLocal(const String & strurl)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    static StringList *prefixes = 0;
+    String url = strurl;
+
+    static StringList *paths = 0;
+    StringList *defaultdocs = 0;
+    URL aUrl(url);
+    url = aUrl.get();    // make sure we look at a parsed URL
+
+    //
+    // Initialize prefix/path list if this is the first time.
+    // The list is given in format "prefix1=path1 prefix2=path2 ..."
+    //
+    if (!prefixes)
+    {
+        prefixes = new StringList();
+        paths = new StringList();
+
+        String t = config->Find("local_urls");
+        char *p = strtok(t, " \t");
+        while (p)
+        {
+            char *path = strchr(p, '=');
+            if (!path)
+            {
+                p = strtok(0, " \t");
+                continue;
+            }
+            *path++ = '\0';
+            String *pre = new String(p);
+            decodeURL(*pre);
+            prefixes->Add(pre);
+            String *pat = new String(path);
+            decodeURL(*pat);
+            paths->Add(pat);
+            p = strtok(0, " \t");
+        }
+    }
+    if (!config->Find(&aUrl, "local_default_doc").empty())
+    {
+        defaultdocs = new StringList();
+        String t = config->Find(&aUrl, "local_default_doc");
+        char *p = strtok(t, " \t");
+        while (p)
+        {
+            String *def = new String(p);
+            decodeURL(*def);
+            defaultdocs->Add(def);
+            p = strtok(0, " \t");
+        }
+        if (defaultdocs->Count() == 0)
+        {
+            delete defaultdocs;
+            defaultdocs = 0;
+        }
+    }
+
+    // Begin by hex-decoding URL...
+    String hexurl = url;
+    decodeURL(hexurl);
+    url = hexurl.get();
+
+    // Check first for local user...
+    if (strchr(url.get(), '~'))
+    {
+        StringList *local = GetLocalUser(url, defaultdocs);
+        if (local)
+        {
+            if (defaultdocs)
+                delete defaultdocs;
+            return local;
+        }
+    }
+
+    // This shouldn't happen, but check anyway...
+    if (strstr(url.get(), ".."))
+    {
+        if (defaultdocs)    // don't leak the list on this early return
+            delete defaultdocs;
+        return 0;
+    }
+
+    String *prefix, *path;
+    String *defaultdoc;
+    StringList *local_names = new StringList();
+    prefixes->Start_Get();
+    paths->Start_Get();
+    while ((prefix = (String *) prefixes->Get_Next()))
+    {
+        path = (String *) paths->Get_Next();
+        if (mystrncasecmp((char *) *prefix, (char *) url, prefix->length()) == 0)
+        {
+            int l = strlen(url.get()) - prefix->length() + path->length() + 4;
+            String *local = new String(*path, l);
+            *local += &url[prefix->length()];
+            if (local->last() == '/' && defaultdocs)
+            {
+                defaultdocs->Start_Get();
+                while ((defaultdoc = (String *) defaultdocs->Get_Next()))
+                {
+                    String *localdefault =
+                        new String(*local, local->length() + defaultdoc->length() + 1);
+                    localdefault->append(*defaultdoc);
+                    local_names->Add(localdefault);
+                }
+                delete local;
+            }
+            else
+                local_names->Add(local);
+        }
+    }
+    if (local_names->Count() > 0)
+    {
+        if (defaultdocs)
+            delete defaultdocs;
+        return local_names;
+    }
+
+    if (defaultdocs)
+        delete defaultdocs;
+    delete local_names;
+    return 0;
+}
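
To make the prefix substitution above concrete: with local_urls set to, say, http://www.example.com/=/var/www/html/ (a hypothetical value), a matching URL is turned into a filesystem path by swapping the prefix for the path:

#include <string>

// Hypothetical local_urls entry: URL prefix -> filesystem path.
// GetLocal() above performs this substitution for every configured pair.
std::string to_local(const std::string &url)
{
    const std::string prefix = "http://www.example.com/";    // assumed config
    const std::string path = "/var/www/html/";
    if (url.compare(0, prefix.size(), prefix) == 0)
        return path + url.substr(prefix.size());
    return std::string();    // empty: not a local URL
}
// to_local("http://www.example.com/docs/a.html") == "/var/www/html/docs/a.html"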
+
+
+//*****************************************************************************
+// StringList* Retriever::GetLocalUser(const String &url, StringList *defaultdocs)
+//   If the URL has a ~user part, return a list of strings containing the
+//   (possible) local filenames of the given url, or 0 if it's
+//   definitely not local.
+//   THE CALLER MUST FREE THE STRINGLIST AFTER USE!
+//
+StringList *Retriever::GetLocalUser(const String & url, StringList * defaultdocs)
+{
+// NOTE: Native Windows does not have this construct for the user Web files
+#ifndef _MSC_VER /* _WIN32 */
+    HtConfiguration *config = HtConfiguration::config();
+    static StringList *prefixes = 0, *paths = 0, *dirs = 0;
+    static Dictionary home_cache;
+    URL aUrl(url);
+
+    //
+    // Initialize prefix/path list if this is the first time.
+    // The list is given in format "prefix1=path1,dir1 ..."
+    // If path is zero-length, the user's home directory is looked up.
+    //
+    if (!prefixes)
+    {
+        prefixes = new StringList();
+        paths = new StringList();
+        dirs = new StringList();
+        String t = config->Find("local_user_urls");
+        char *p = strtok(t, " \t");
+        while (p)
+        {
+            char *path = strchr(p, '=');
+            if (!path)
+            {
+                p = strtok(0, " \t");
+                continue;
+            }
+            *path++ = '\0';
+            char *dir = strchr(path, ',');
+            if (!dir)
+            {
+                p = strtok(0, " \t");
+                continue;
+            }
+            *dir++ = '\0';
+            String *pre = new String(p);
+            decodeURL(*pre);
+            prefixes->Add(pre);
+            String *pat = new String(path);
+            decodeURL(*pat);
+            paths->Add(pat);
+            String *ptd = new String(dir);
+            decodeURL(*ptd);
+            dirs->Add(ptd);
+            p = strtok(0, " \t");
+        }
+    }
+
+    // Can we do anything about this?
+    if (!strchr(url, '~') || !prefixes->Count() || strstr(url, ".."))
+        return 0;
+
+    // Split the URL to components
+    String tmp = url;
+    char *name = strchr((char *) tmp, '~');
+    *name++ = '\0';
+    char *rest = strchr(name, '/');
+    if (!rest || (rest - name <= 1) || (rest - name > 32))
+        return 0;
+    *rest++ = '\0';
+
+    // Look it up in the prefix/path/dir table
+    prefixes->Start_Get();
+    paths->Start_Get();
+    dirs->Start_Get();
+    String *prefix, *path, *dir;
+    String *defaultdoc;
+    StringList *local_names = new StringList();
+    while ((prefix = (String *) prefixes->Get_Next()))
+    {
+        path = (String *) paths->Get_Next();
+        dir = (String *) dirs->Get_Next();
+        if (mystrcasecmp((char *) *prefix, (char *) tmp) != 0)
+            continue;
+
+        String *local = new String;
+        // No path, look up home directory
+        if (path->length() == 0)
+        {
+            String *home = (String *) home_cache[name];
+            if (!home)
+            {
+                struct passwd *passwd = getpwnam(name);
+                if (passwd)
+                {
+                    home = new String(passwd->pw_dir);
+                    home_cache.Add(name, home);
+                }
+            }
+            if (home)
+                *local += *home;
+            else
+                continue;
+        }
+        else
+        {
+            *local += *path;
+            *local += name;
+        }
+        *local += *dir;
+        *local += rest;
+        if (local->last() == '/' && defaultdocs)
+        {
+            defaultdocs->Start_Get();
+            while ((defaultdoc = (String *) defaultdocs->Get_Next()))
+            {
+                String *localdefault = new String(*local, local->length() + defaultdoc->length() + 1);
+                localdefault->append(*defaultdoc);
+                local_names->Add(localdefault);
+            }
+            delete local;
+        }
+        else
+            local_names->Add(local);
+    }
+
+    if (local_names->Count() > 0)
+        return local_names;
+
+    delete local_names;
+#endif //_MSC_VER /* _WIN32 */
+
+    return 0;
+}
+
+
+//*****************************************************************************
+// int Retriever::IsLocalURL(const String &url)
+//   Returns 1 if the given url has a (possible) local filename
+//   or 0 if it's definitely not local.
+//
+int Retriever::IsLocalURL(const String & url)
+{
+    int ret;
+
+    StringList *local_filename = GetLocal(url);
+    ret = (local_filename != 0);
+    if (local_filename)
+        delete local_filename;
+
+    return ret;
+}
+
+
+//*****************************************************************************
+// void Retriever::got_word(const char *word, int location, int heading)
+//   The location is normalized to be in the range 0 - 1000.
+//
+void Retriever::got_word(const char *word, int location, int heading)
+{
+    if (debug > 3)
+        cout << "word: " << word << '@' << location << endl;
+    if (heading >= (int) (sizeof(factor) / sizeof(factor[0])) || heading < 0)
+        heading = 0;    // Assume it's just normal text
+    if (trackWords && strlen(word) >= (unsigned int) minimumWordLength)
+    {
+        String w = word;
+        HtWordReference wordRef;
+
+        if (no_store_phrases)
+        {
+            // Add new word, or mark existing word as also being at
+            // this heading level
+            word_entry *entry;
+            if ((entry = (word_entry *) words_to_add.Find(w)) == NULL)
+            {
+                words_to_add.Add(w, new word_entry(location, factor[heading], word_context));
+            }
+            else
+            {
+                entry->flags |= factor[heading];
+            }
+        }
+        else
+        {
+            wordRef.Location(location);
+            wordRef.Flags(factor[heading]);
+            wordRef.Word(w);
+            words.Replace(WordReference::Merge(wordRef, word_context));
+        }
+
+        // Check for compound words...
+        String parts = word;
+        int added;
+        int nparts = 1;
+        do
+        {
+            added = 0;
+            char *start = parts.get();
+            char *punctp = 0, *nextp = 0, *p;
+            char punct;
+            int n;
+            while (*start)
+            {
+                p = start;
+                for (n = 0; n < nparts; n++)
+                {
+                    while (HtIsStrictWordChar((unsigned char) *p))
+                        p++;
+                    punctp = p;
+                    if (!*punctp && n + 1 < nparts)
+                        break;
+                    while (*p && !HtIsStrictWordChar((unsigned char) *p))
+                        p++;
+                    if (n == 0)
+                        nextp = p;
+                }
+                if (n < nparts)
+                    break;
+                punct = *punctp;
+                *punctp = '\0';
+                if (*start && (*p || start > parts.get()))
+                {
+                    w = start;
+                    HtStripPunctuation(w);
+                    if (w.length() >= minimumWordLength)
+                    {
+                        if (no_store_phrases)
+                        {
+                            // Add new word, or mark existing word as also being at
+                            // this heading level
+                            word_entry *entry;
+                            if ((entry = (word_entry *) words_to_add.Find(w)) == NULL)
+                            {
+                                words_to_add.Add(w, new word_entry(location, factor[heading], word_context));
+                            }
+                            else
+                            {
+                                entry->flags |= factor[heading];
+                            }
+                        }
+                        else
+                        {
+                            wordRef.Word(w);
+                            words.Replace(WordReference::Merge(wordRef, word_context));
+                        }
+                        if (debug > 3)
+                            cout << "word part: " << start << '@' << location << endl;
+                    }
+                    added++;
+                }
+                start = nextp;
+                *punctp = punct;
+            }
+            nparts++;
+        }
+        while (added > 2);
+    }
+}
+
+
+//*****************************************************************************
+// void Retriever::got_title(const char *title)
+//
+void Retriever::got_title(const char *title)
+{
+    if (debug > 1)
+        cout << "\ntitle: " << title << endl;
+    current_title = title;
+}
+
+
+//*****************************************************************************
+// void Retriever::got_author(const char *author)
+//
+void Retriever::got_author(const char *author)
+{
+    if (debug > 1)
+        cout << "\nauthor: " << author << endl;
+    current_ref->DocAuthor(author);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_time(const char *time)
+//
+void Retriever::got_time(const char *time)
+{
+    HtDateTime new_time(current_time);
+
+    if (debug > 1)
+        cout << "\ntime: " << time << endl;
+
+    //
+    // As defined by the Dublin Core, this should be YYYY-MM-DD
+    // In the future, we'll need to deal with the scheme portion
+    // in case someone picks a different format.
+    //
+    new_time.SetFTime(time, "%Y-%m-%d");
+    current_time = new_time.GetTime_t();
+
+    // If we can't convert it, current_time stays the same and we get
+    // the default--the date returned by the server...
+}
+
+//*****************************************************************************
+// void Retriever::got_anchor(const char *anchor)
+//
+void Retriever::got_anchor(const char *anchor)
+{
+    if (debug > 2)
+        cout << "anchor: " << anchor << endl;
+    current_ref->AddAnchor(anchor);
+    word_context.Anchor(word_context.Anchor() + 1);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_image(const char *src)
+//
+void Retriever::got_image(const char *src)
+{
+    URL url(src, *base);
+    const char *image = (const char *) url.get();
+
+    if (debug > 2)
+        cout << "image: " << image << endl;
+
+    if (images_seen)
+        fprintf(images_seen, "%s\n", image);
+}
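
The compound-word loop in got_word() is dense; its effect is easier to see on an example. For a token like "set-up" it also indexes "set" and "up" at the same location, and the later passes (nparts > 1) re-join adjacent parts so "set-up" is additionally indexed as "setup" once HtStripPunctuation removes the hyphen. A simplified sketch that does only the single-part split, using standard types:

#include <cctype>
#include <string>
#include <vector>

// Simplified view of the compound-word split in got_word(): a token such
// as "readme.txt" or "set-up" is also indexed as its alphanumeric parts.
// The real loop additionally emits runs of adjacent parts.
std::vector<std::string> word_parts(const std::string &token)
{
    std::vector<std::string> parts;
    std::string cur;
    for (unsigned char c : token)
    {
        if (std::isalnum(c))
            cur += (char) c;
        else if (!cur.empty())
        {
            parts.push_back(cur);
            cur.clear();
        }
    }
    if (!cur.empty())
        parts.push_back(cur);
    return parts;    // "set-up" -> {"set", "up"}
}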
+
+
+//*****************************************************************************
+// void Retriever::got_href(URL &url, const char *description, int hops)
+//
+void Retriever::got_href(URL & url, const char *description, int hops)
+{
+    DocumentRef *ref = 0;
+    Server *server = 0;
+    int valid_url_code = 0;
+
+    // Rewrite the URL (if need be) before we do anything to it.
+    url.rewrite();
+
+    if (debug > 2)
+        cout << "href: " << url.get() << " (" << description << ')' << endl;
+
+    n_links++;
+
+    if (urls_seen)
+        fprintf(urls_seen, "%s\n", (const char *) url.get());
+
+    //
+    // Check if this URL falls within the valid range of URLs.
+    //
+    valid_url_code = IsValidURL(url.get());
+    if (valid_url_code > 0)
+    {
+        //
+        // It is valid. Normalize it (resolve cnames for the server)
+        // and check again...
+        //
+        if (debug > 2)
+        {
+            cout << "resolving '" << url.get() << "'\n";
+            cout.flush();
+        }
+
+        url.normalize();
+
+        // If it is a backlink from the current document,
+        // just update that field. Writing to the database
+        // is meaningless, as it will be overwritten.
+        // Adding it as a new document may even be harmful, as
+        // that will be a duplicate. This can happen if the
+        // current document is never referenced before, as in a
+        // start_url.
+
+        if (strcmp(url.get(), current_ref->DocURL()) == 0)
+        {
+            current_ref->DocBackLinks(current_ref->DocBackLinks() + 1);
+            current_ref->AddDescription(description, words);
+        }
+        else
+        {
+            //
+            // First add it to the document database
+            //
+            ref = docs[url.get()];
+            // if ref exists we have to call AddDescription even
+            // if max_hop_count is reached
+            if (!ref && currenthopcount + hops > max_hop_count)
+                return;
+
+            if (!ref)
+            {
+                //
+                // Didn't see this one, yet. Create a new reference
+                // for it with a unique document ID
+                //
+                ref = new DocumentRef;
+                ref->DocID(docs.NextDocID());
+                ref->DocHopCount(currenthopcount + hops);
+                ref->DocURL(url.get());
+            }
+            ref->DocBackLinks(ref->DocBackLinks() + 1);    // This one!
+            ref->AddDescription(description, words);
+
+            //
+            // If the dig is restricting by hop count, perform the check here
+            // too
+            if (currenthopcount + hops > max_hop_count)
+            {
+                delete ref;
+                return;
+            }
+
+            if (ref->DocHopCount() > currenthopcount + hops)
+                ref->DocHopCount(currenthopcount + hops);
+
+            docs.Add(*ref);
+
+            //
+            // Now put it in the list of URLs to still visit.
+            //
+            if (Need2Get(url.get()))
+            {
+                if (debug > 1)
+                    cout << "\n   pushing " << url.get() << endl;
+                server = (Server *) servers[url.signature()];
+                if (!server)
+                {
+                    //
+                    // Hadn't seen this server, yet. Register it
+                    //
+                    String robotsURL = url.signature();
+                    robotsURL << "robots.txt";
+                    StringList *localRobotsFile = GetLocal(robotsURL.get());
+
+                    server = new Server(url, localRobotsFile);
+                    servers.Add(url.signature(), server);
+                    delete localRobotsFile;
+                }
+                //
+                // Let's just be sure we're not pushing an empty URL
+                //
+                if (strlen(url.get()))
+                    server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get()));
+
+                String temp = url.get();
+                visited.Add(temp, 0);
+                if (debug)
+                    cout << '+';
+            }
+            else if (debug)
+                cout << '*';
+            delete ref;
+        }
+    }
+    else
+    {
+        //
+        // Not a valid URL
+        //
+        if (debug > 1)
+            cout << "\nurl rejected: (level 1)" << url.get() << endl;
+        if (debug == 1)
+            cout << '-';
+
+        if (urls_seen)
+        {
+            fprintf(urls_seen, "%s|||||%d\n", (const char *) url.get(), valid_url_code);
+        }
+    }
+    if (debug)
+        cout.flush();
+}
+
+
+//*****************************************************************************
+// void Retriever::got_redirect(const char *new_url, DocumentRef *old_ref,
+//                              const char *referer)
+//
+void Retriever::got_redirect(const char *new_url, DocumentRef * old_ref, const char *referer)
+{
+    // First we must piece together the new URL, which may be relative
+    URL parent(old_ref->DocURL());
+    URL url(new_url, parent);
+
+    // Rewrite the URL (if need be) before we do anything to it.
+    url.rewrite();
+
+    if (debug > 2)
+        cout << "redirect: " << url.get() << endl;
+
+    n_links++;
+
+    if (urls_seen)
+        fprintf(urls_seen, "%s\n", (const char *) url.get());
+
+    //
+    // Check if this URL falls within the valid range of URLs.
+    //
+    if (IsValidURL(url.get()) > 0)
+    {
+        //
+        // It is valid. Normalize it (resolve cnames for the server)
+        // and check again...
+        //
+        if (debug > 2)
+        {
+            cout << "resolving '" << url.get() << "'\n";
+            cout.flush();
+        }
+
+        url.normalize();
+        //
+        // First add it to the document database
+        //
+        DocumentRef *ref = docs[url.get()];
+        if (!ref)
+        {
+            //
+            // Didn't see this one, yet. Create a new reference
+            // for it with a unique document ID
+            //
+            ref = new DocumentRef;
+            ref->DocID(docs.NextDocID());
+            ref->DocHopCount(currenthopcount);
+        }
+        ref->DocURL(url.get());
+
+        //
+        // Copy the descriptions of the old DocRef to this one
+        //
+        List *d = old_ref->Descriptions();
+        if (d)
+        {
+            d->Start_Get();
+            String *str;
+            while ((str = (String *) d->Get_Next()))
+            {
+                ref->AddDescription(str->get(), words);
+            }
+        }
+        if (ref->DocHopCount() > old_ref->DocHopCount())
+            ref->DocHopCount(old_ref->DocHopCount());
+
+        // Copy the number of backlinks
+        ref->DocBackLinks(old_ref->DocBackLinks());
+
+        docs.Add(*ref);
+
+        //
+        // Now put it in the list of URLs to still visit.
+        //
+        if (Need2Get(url.get()))
+        {
+            if (debug > 1)
+                cout << "   pushing " << url.get() << endl;
+            Server *server = (Server *) servers[url.signature()];
+            if (!server)
+            {
+                //
+                // Hadn't seen this server, yet. Register it
+                //
+                String robotsURL = url.signature();
+                robotsURL << "robots.txt";
+                StringList *localRobotsFile = GetLocal(robotsURL.get());
+
+                server = new Server(url, localRobotsFile);
+                servers.Add(url.signature(), server);
+                delete localRobotsFile;
+            }
+            if (!referer || strlen(referer) == 0)
+                server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get()), 0);
+            else
+                server->push(url.get(), ref->DocHopCount(), referer, IsLocalURL(url.get()), 0);
+
+            String temp = url.get();
+            visited.Add(temp, 0);
+        }
+
+        delete ref;
+    }
+}
+
+
+//*****************************************************************************
+// void Retriever::got_head(const char *head)
+//
+void Retriever::got_head(const char *head)
+{
+    if (debug > 4)
+        cout << "head: " << head << endl;
+    current_head = head;
+}
+
+//*****************************************************************************
+// void Retriever::got_meta_dsc(const char *md)
+//
+void Retriever::got_meta_dsc(const char *md)
+{
+    if (debug > 4)
+        cout << "meta description: " << md << endl;
+    current_meta_dsc = md;
+}
+
+
+//*****************************************************************************
+// void Retriever::got_meta_email(const char *e)
+//
+void Retriever::got_meta_email(const char *e)
+{
+    if (debug > 1)
+        cout << "\nmeta email: " << e << endl;
+    current_ref->DocEmail(e);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_meta_notification(const char *e)
+//
+void Retriever::got_meta_notification(const char *e)
+{
+    if (debug > 1)
+        cout << "\nmeta notification date: " << e << endl;
+    current_ref->DocNotification(e);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_meta_subject(const char *e)
+//
+void Retriever::got_meta_subject(const char *e)
+{
+    if (debug > 1)
+        cout << "\nmeta subject: " << e << endl;
+    current_ref->DocSubject(e);
+}
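
got_redirect() above carries state from the old reference to the new one: the link descriptions, the smaller hop count, and the backlink count. Reduced to a plain struct (a hypothetical stand-in; DocumentRef carries much more in htdig), the transfer looks like:

#include <string>
#include <vector>

// Plain-struct sketch of the state transfer performed by got_redirect().
struct ToyRef
{
    std::string url;
    std::vector<std::string> descriptions;
    int hop_count;
    int back_links;
};

ToyRef follow_redirect(const ToyRef &old_ref, const std::string &new_url)
{
    ToyRef ref;
    ref.url = new_url;
    ref.descriptions = old_ref.descriptions;    // copy the descriptions
    ref.hop_count = old_ref.hop_count;          // target inherits the hop count
    ref.back_links = old_ref.back_links;        // and the backlink count
    return ref;
}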
+
+
+//*****************************************************************************
+// void Retriever::got_noindex()
+//
+void Retriever::got_noindex()
+{
+    if (debug > 1)
+        cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl;
+    current_ref->DocState(Reference_noindex);
+}
+
+
+//*****************************************************************************
+//
+void Retriever::recordNotFound(const String & url, const String & referer, int reason)
+{
+    const char *message = "";
+
+    switch (reason)
+    {
+    case Transport::Document_not_found:
+        message = "Not found";
+        break;
+
+    case Transport::Document_no_host:
+        message = "Unknown host or unable to contact server";
+        break;
+
+    case Transport::Document_no_port:
+        message = "Unknown host or unable to contact server (port)";
+        break;
+
+    default:
+        break;
+    }
+
+    notFound << message << ": " << url << " Ref: " << referer << '\n';
+}
+
+//*****************************************************************************
+// void Retriever::ReportStatistics(const String &name)
+//
+void Retriever::ReportStatistics(const String & name)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    cout << name << ": Run complete\n";
+    cout << name << ": " << servers.Count() << " server";
+    if (servers.Count() > 1)
+        cout << "s";
+    cout << " seen:\n";
+
+    Server *server;
+    String buffer;
+    StringList results;
+    String newname = name;
+
+    newname << ": ";
+
+    servers.Start_Get();
+    while ((server = (Server *) servers.Get_NextElement()))
+    {
+        buffer = 0;
+        server->reportStatistics(buffer, newname);
+        results.Add(buffer);
+    }
+    results.Sort();
+
+    for (int i = 0; i < results.Count(); i++)
+    {
+        cout << results[i] << "\n";
+    }
+
+    if (notFound.length() > 0)
+    {
+        cout << "\n" << name << ": Errors to take note of:\n";
+        cout << notFound;
+    }
+
+    cout << endl;
+
+    // Report HTTP connection stats
+    cout << "HTTP statistics" << endl;
+    cout << "===============" << endl;
+
+    if (config->Boolean("persistent_connections"))
+    {
+        cout << " Persistent connections    : Yes" << endl;
+
+        if (config->Boolean("head_before_get"))
+            cout << " HEAD call before GET      : Yes" << endl;
+        else
+            cout << " HEAD call before GET      : No" << endl;
+    }
+    else
+    {
+        cout << " Persistent connections    : No" << endl;
+    }
+
+    HtHTTP::ShowStatistics(cout) << endl;
+}