Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig/Server.cc')
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Server.cc  435
1 file changed, 435 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Server.cc b/debian/htdig/htdig-3.2.0b6/htdig/Server.cc
new file mode 100644
index 00000000..3afdebd3
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Server.cc
@@ -0,0 +1,435 @@

//
// Server.cc
//
// Server: A class to keep track of server specific information.
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: Server.cc,v 1.29 2004/05/28 13:15:16 lha Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "htdig.h"
#include "Server.h"
#include "good_strtok.h"
#include "htString.h"
#include "URL.h"
#include "Document.h"
#include "URLRef.h"
#include "Transport.h"
#include "HtHTTP.h"            // for checking persistent connections
#include "StringList.h"

#include <ctype.h>
#include "defaults.h"


//*****************************************************************************
// Server::Server(URL u, StringList *local_robots_files)
//  u is the base URL for this server
//
Server::Server(URL u, StringList *local_robots_files)
:
    _host(u.host()),
    _port(u.port()),
    _bad_server(0),
    _documents(0),
    _accept_language(0)
{
    HtConfiguration* config = HtConfiguration::config();
    if (debug)
        cout << endl << "New server: " << _host << ", " << _port << endl;

    // We take it from the configuration
    _persistent_connections = config->Boolean("server", _host.get(), "persistent_connections");
    _head_before_get = config->Boolean("server", _host.get(), "head_before_get");

    _max_documents = config->Value("server", _host.get(), "server_max_docs");
    _connection_space = config->Value("server", _host.get(), "server_wait_time");
    _user_agent = config->Find("server", _host.get(), "user_agent");
    _disable_cookies = config->Boolean("server", _host.get(), "disable_cookies");

    // Accept-Language directive
    StringList _accept_language_list(config->Find("server", _host.get(),
                                                  "accept_language"), " \t");

    _accept_language.trunc();   // maybe not needed

    for (int i = 0; i < _accept_language_list.Count(); i++)
    {
        if (i > 0)
            _accept_language << ",";    // for multiple choices

        _accept_language << _accept_language_list[i];
    }

    // Timeout setting
    _timeout = config->Value("server", _host.get(), "timeout");

    // Number of consecutive attempts to establish a TCP connection
    _tcp_max_retries = config->Value("server", _host.get(), "tcp_max_retries");

    // Seconds to wait after a timeout occurs
    _tcp_wait_time = config->Value("server", _host.get(), "tcp_wait_time");

    if (debug > 1)
    {
        cout << " - Persistent connections: " <<
            (_persistent_connections ? "enabled" : "disabled") << endl;

        cout << " - HEAD before GET: " <<
            (_head_before_get ? "enabled" : "disabled") << endl;

        cout << " - Timeout: " << _timeout << endl;
        cout << " - Connection space: " << _connection_space << endl;
        cout << " - Max Documents: " << _max_documents << endl;
        cout << " - TCP retries: " << _tcp_max_retries << endl;
        cout << " - TCP wait time: " << _tcp_wait_time << endl;
        cout << " - Accept-Language: " << _accept_language << endl;
    }

    _last_connection.SettoNow();    // For getting robots.txt

    if (strcmp(u.service(), "http") == 0 || strcmp(u.service(), "https") == 0)
    {
        //
        // Attempt to get a robots.txt file from the specified server
        //
        String url;
        url.trunc();

        if (debug > 1)
            cout << "Trying to retrieve robots.txt file" << endl;
        url << u.signature() << "robots.txt";

        static int local_urls_only = config->Boolean("local_urls_only");
        time_t timeZero = 0;    // Right now we want to get this every time
        Document doc(url, 0);
        Transport::DocStatus status;
        if (local_robots_files)
        {
            if (debug > 1)
                cout << "Trying local files" << endl;
            status = doc.RetrieveLocal(timeZero, local_robots_files);
            if (status == Transport::Document_not_local)
            {
                if (local_urls_only)
                    status = Transport::Document_not_found;
                else
                {
                    if (debug > 1)
                        cout << "Local retrieval failed, trying HTTP" << endl;
                    status = doc.Retrieve(this, timeZero);
                }
            }
        }
        else if (!local_urls_only)
        {
            status = doc.Retrieve(this, timeZero);

            // Let's check if persistent connections are both
            // allowed by the configuration and possible after
            // having requested the robots.txt file.

            HtHTTP *http;
            if (IsPersistentConnectionAllowed() &&
                (http = doc.GetHTTPHandler()))
            {
                if (!http->isPersistentConnectionPossible())
                    _persistent_connections = 0;    // not possible. Let's disable
                                                    // them on this server.
            }
        }
        else
            status = Transport::Document_not_found;

        switch (status)
        {
        case Transport::Document_ok:
            //
            // Found a robots.txt file. Go parse it.
            //
            robotstxt(doc);
            break;

        case Transport::Document_not_found:
        case Transport::Document_not_parsable:
        case Transport::Document_redirect:
        case Transport::Document_not_authorized:
            //
            // These cases are for when there is no robots.txt file.
            // We will just go on happily without restrictions.
            //
            break;

        case Transport::Document_no_host:
        default:
            //
            // In all other cases the server could not be reached.
            // We will remember this fact so that no more attempts to
            // contact this server will be made.
            //
            _bad_server = 1;
            break;
        }   // end switch
    }   // end if (http || https)
}
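The Accept-Language loop in the constructor turns a whitespace-separated configuration value into the single comma-separated header value sent with each request. Below is a minimal standalone sketch of the same joining logic, using only the standard library in place of htdig's StringList; the function name and the sample value are illustrative, not part of the source.

#include <iostream>
#include <sstream>
#include <string>

// Join whitespace-separated language tags into one
// comma-separated Accept-Language header value.
std::string join_accept_language(const std::string& configured)
{
    std::istringstream in(configured);
    std::string tag, header;
    while (in >> tag)             // split on whitespace
    {
        if (!header.empty())
            header += ",";        // comma between multiple choices
        header += tag;
    }
    return header;
}

int main()
{
    // "en it es" is a made-up config value for illustration.
    std::cout << join_accept_language("en it es") << std::endl;  // en,it,es
}
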
// Copy constructor
Server::Server(const Server& rhs)
:
    _host(rhs._host),
    _port(rhs._port),
    _bad_server(rhs._bad_server),
    _connection_space(rhs._connection_space),
    _last_connection(rhs._last_connection),
    _paths(rhs._paths),
    _disallow(rhs._disallow),
    _documents(rhs._documents),
    _max_documents(rhs._max_documents),
    _persistent_connections(rhs._persistent_connections),
    _head_before_get(rhs._head_before_get),
    _disable_cookies(rhs._disable_cookies),
    _timeout(rhs._timeout),
    _tcp_wait_time(rhs._tcp_wait_time),
    _tcp_max_retries(rhs._tcp_max_retries),
    _user_agent(rhs._user_agent),
    _accept_language(rhs._accept_language)
{
}


//*****************************************************************************
// Server::~Server()
//
Server::~Server()
{
}


//*****************************************************************************
// void Server::robotstxt(Document &doc)
//   This will parse the robots.txt file which is contained in the document.
//
void Server::robotstxt(Document &doc)
{
    HtConfiguration* config = HtConfiguration::config();
    String contents = doc.Contents();
    int length;
    int pay_attention = 0;
    String pattern;
    String myname = config->Find("server", _host.get(), "robotstxt_name");
    int seen_myname = 0;
    char *name, *rest;

    if (debug > 1)
        cout << "Parsing robots.txt file using myname = " << myname << "\n";

    //
    // Go through the lines in the file and determine if we need to
    // pay attention to them
    //
    for (char *line = strtok(contents, "\r\n"); line; line = strtok(0, "\r\n"))
    {
        if (debug > 2)
            cout << "Robots.txt line: " << line << endl;

        //
        // Strip comments
        //
        if (strchr(line, '#'))
        {
            *(strchr(line, '#')) = '\0';
        }

        name = good_strtok(line, ':');
        if (!name)
            continue;
        while (name && isspace(*name))
            name++;
        rest = good_strtok(NULL, '\r');
        if (!rest)
            rest = "";

        while (rest && isspace(*rest))
            rest++;

        length = strlen(rest);
        if (length > 0)
        {
            while (length > 0 && isspace(rest[length - 1]))
                length--;
            rest[length] = '\0';
        }

        if (mystrcasecmp(name, "user-agent") == 0)
        {
            if (debug > 1)
                cout << "Found 'user-agent' line: " << rest << endl;

            if (*rest == '*' && !seen_myname)
            {
                //
                // This matches all search engines...
                //
                pay_attention = 1;
            }
            else if (mystrncasecmp(rest, (char*)myname, myname.length()) == 0)
            {
                //
                // This is for us! This will override any previous patterns
                // that may have been set.
                //
                if (!seen_myname)   // only take first section with our name
                {
                    seen_myname = 1;
                    pay_attention = 1;
                    pattern = 0;    // ignore previous User-agent: *
                }
                else
                    pay_attention = 0;
            }
            else
            {
                //
                // This doesn't concern us
                //
                pay_attention = 0;
            }
        }
        else if (pay_attention && mystrcasecmp(name, "disallow") == 0)
        {
            if (debug > 1)
                cout << "Found 'disallow' line: " << rest << endl;

            //
            // Add this path to our list to ignore
            //
            if (*rest)
            {
                if (pattern.length())
                    pattern << '|';
                while (*rest)
                {
                    if (strchr("^.[$()|*+?{\\", *rest))
                        pattern << '\\';
                    pattern << *rest++;
                }
            }
        }
        //
        // Ignore anything else (comments)
        //
    }

    //
    // Compile the pattern (if any...)
    //
    if (debug > 1)
        cout << "Pattern: " << pattern << endl;

    // Empty "disallow" allows all, so don't make entry which matches all.
    if (!pattern.empty())
    {
        String fullpatt = "^[^:]*://[^/]*(";
        fullpatt << pattern << ')';
        _disallow.set(fullpatt, config->Boolean("case_sensitive"));
    }
}
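robotstxt() does not store the Disallow paths individually: it escapes regex metacharacters, joins the paths with '|', and anchors the alternation behind a scheme/host prefix, so one compiled expression can reject full URLs. The following sketch shows that construction with two hypothetical Disallow lines, using std::regex in place of htdig's own matcher; everything here is illustrative, not part of the file above.

#include <cstring>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main()
{
    // Two hypothetical Disallow paths, as if read from robots.txt.
    std::vector<std::string> disallow = { "/cgi-bin/", "/tmp/" };

    // Escape regex metacharacters and join the paths with '|',
    // mirroring how robotstxt() accumulates 'pattern'.
    std::string pattern;
    for (const std::string &path : disallow)
    {
        if (!pattern.empty())
            pattern += '|';
        for (char c : path)
        {
            if (std::strchr("^.[$()|*+?{\\", c))
                pattern += '\\';
            pattern += c;
        }
    }

    // Anchor behind a scheme/host prefix, as done for 'fullpatt'.
    std::string fullpatt = "^[^:]*://[^/]*(" + pattern + ")";
    std::cout << fullpatt << std::endl;   // ^[^:]*://[^/]*(/cgi-bin/|/tmp/)

    std::regex disallowed(fullpatt);
    std::string url = "http://example.com/cgi-bin/test";
    std::cout << std::boolalpha
              << std::regex_search(url, disallowed) << std::endl;  // true
}
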
//*****************************************************************************
// void Server::push(const String &path, int hopcount, const String &referer,
//                   int local, int newDoc)
//
void Server::push(const String &path, int hopcount, const String &referer,
                  int local, int newDoc)
{
    if (_bad_server && !local)
        return;

    if (IsDisallowed(path) != 0)
    {
        if (debug > 2)
            cout << endl << "   Rejected: forbidden by server robots.txt!";

        return;
    }

    // We use -1 as no limit, but we also don't want
    // to forbid redirects from old places
    if (_max_documents != -1 && newDoc &&
        _documents >= _max_documents)   // Hey! we only want to get max_docs
    {
        if (debug > 2)
            cout << "Limit of " << _max_documents << " reached for " << _host << endl;

        return;
    }

    URLRef *ref = new URLRef();
    ref->SetURL(path);
    ref->SetHopCount(hopcount);
    ref->SetReferer(referer);
    _paths.Add(ref);

    if (newDoc)
        _documents++;

//  cout << "***** pushing '" << path << "' with '" << referer << "'\n";
}


//*****************************************************************************
// URLRef *Server::pop()
//
URLRef *Server::pop()
{
    URLRef *ref = (URLRef *) _paths.Remove();

    if (!ref)
        return 0;

    return ref;
}
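push() and pop() maintain the per-server crawl frontier: URLs rejected by the robots.txt pattern or by server_max_docs never enter the queue, and -1 means no limit. The sketch below models just that accounting; the actual ordering of _paths depends on the container declared in Server.h, which is not part of this diff, so a plain deque stands in and all names are illustrative.

#include <deque>
#include <iostream>
#include <string>

// A stand-in for the per-server frontier bookkeeping in
// push()/pop(): honour a document cap, with -1 meaning "no limit".
struct Frontier
{
    std::deque<std::string> paths;
    int documents = 0;
    int max_documents = -1;   // -1: unlimited

    bool push(const std::string& url, bool new_doc)
    {
        if (max_documents != -1 && new_doc && documents >= max_documents)
            return false;     // limit reached; drop the URL
        paths.push_back(url);
        if (new_doc)
            ++documents;
        return true;
    }

    bool pop(std::string& url)
    {
        if (paths.empty())
            return false;
        url = paths.front();
        paths.pop_front();
        return true;
    }
};

int main()
{
    Frontier f;
    f.max_documents = 1;
    std::cout << f.push("http://example.com/a.html", true) << std::endl;  // 1
    std::cout << f.push("http://example.com/b.html", true) << std::endl;  // 0 (limit)
}
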
//*****************************************************************************
// void Server::delay()
//
// Keeps track of how long it's been since we've seen this server
// and calls sleep if necessary
//
void Server::delay()
{
    HtDateTime now;

    int time_taken = HtDateTime::GetDiff(now, _last_connection);   // arg1 - arg2 > 0

    if (time_taken < _connection_space)
        sleep(_connection_space - time_taken);

    now.SettoNow();
    _last_connection = now;     // Reset the clock for the next delay!

    return;
}


//*****************************************************************************
// void Server::reportStatistics(String &out, char *name)
//
void Server::reportStatistics(String &out, char *name)
{
    out << name << " " << _host << ":" << _port;
    out << " " << _documents << " document";
    if (_documents != 1)
        out << "s";
}
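delay() is the politeness throttle: if fewer than server_wait_time seconds have elapsed since the previous request to this host, the crawler sleeps for the remainder before reconnecting. Here is a self-contained sketch of the same idea, with std::chrono standing in for HtDateTime; the names are illustrative only.

#include <chrono>
#include <thread>

using clock_type = std::chrono::steady_clock;

// Sleep so that at least 'connection_space' seconds separate
// consecutive requests to the same server, then reset the clock.
void delay(clock_type::time_point& last_connection, int connection_space)
{
    auto now = clock_type::now();
    auto taken = std::chrono::duration_cast<std::chrono::seconds>(
                     now - last_connection).count();
    if (taken < connection_space)
        std::this_thread::sleep_for(
            std::chrono::seconds(connection_space - taken));
    last_connection = clock_type::now();  // reset for the next call
}

int main()
{
    clock_type::time_point last = clock_type::now() - std::chrono::seconds(10);
    delay(last, 4);   // 10s already elapsed: returns immediately
    delay(last, 4);   // called again at once: sleeps ~4s
}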