Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig/Server.cc')
-rw-r--r--  debian/htdig/htdig-3.2.0b6/htdig/Server.cc  435
1 file changed, 435 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Server.cc b/debian/htdig/htdig-3.2.0b6/htdig/Server.cc
new file mode 100644
index 00000000..3afdebd3
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Server.cc
@@ -0,0 +1,435 @@

//
// Server.cc
//
// Server: A class to keep track of server specific information.
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: Server.cc,v 1.29 2004/05/28 13:15:16 lha Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "htdig.h"
#include "Server.h"
#include "good_strtok.h"
#include "htString.h"
#include "URL.h"
#include "Document.h"
#include "URLRef.h"
#include "Transport.h"
#include "HtHTTP.h"            // for checking persistent connections
#include "StringList.h"

#include <ctype.h>
#include "defaults.h"


//*****************************************************************************
// Server::Server(URL u, StringList *local_robots_files)
//  u is the base URL for this server
//
Server::Server(URL u, StringList *local_robots_files)
:
    _host(u.host()),
    _port(u.port()),
    _bad_server(0),
    _documents(0),
    _accept_language(0)
{
    HtConfiguration* config = HtConfiguration::config();
    if (debug)
        cout << endl << "New server: " << _host << ", " << _port << endl;

    // We take it from the configuration
    _persistent_connections = config->Boolean("server", _host.get(), "persistent_connections");
    _head_before_get = config->Boolean("server", _host.get(), "head_before_get");

    _max_documents = config->Value("server", _host.get(), "server_max_docs");
    _connection_space = config->Value("server", _host.get(), "server_wait_time");
    _user_agent = config->Find("server", _host.get(), "user_agent");
    _disable_cookies = config->Boolean("server", _host.get(), "disable_cookies");

    // Accept-Language directive
    StringList _accept_language_list(config->Find("server", _host.get(),
                                                  "accept_language"), " \t");

    _accept_language.trunc();   // maybe not needed

    for (int i = 0; i < _accept_language_list.Count(); i++)
    {
        if (i > 0)
            _accept_language << ",";    // for multiple choices

        _accept_language << _accept_language_list[i];
    }

    // Timeout setting
    _timeout = config->Value("server", _host.get(), "timeout");

    // Number of consecutive attempts to establish a TCP connection
    _tcp_max_retries = config->Value("server", _host.get(), "tcp_max_retries");

    // Seconds to wait after a timeout occurs
    _tcp_wait_time = config->Value("server", _host.get(), "tcp_wait_time");

    if (debug > 1)
    {
        cout << " - Persistent connections: " <<
            (_persistent_connections ? "enabled" : "disabled") << endl;

        cout << " - HEAD before GET: " <<
            (_head_before_get ? "enabled" : "disabled") << endl;

        cout << " - Timeout: " << _timeout << endl;
        cout << " - Connection space: " << _connection_space << endl;
        cout << " - Max Documents: " << _max_documents << endl;
        cout << " - TCP retries: " << _tcp_max_retries << endl;
        cout << " - TCP wait time: " << _tcp_wait_time << endl;
        cout << " - Accept-Language: " << _accept_language << endl;
    }

    _last_connection.SettoNow();    // For getting robots.txt

    if (strcmp(u.service(), "http") == 0 || strcmp(u.service(), "https") == 0)
    {
        //
        // Attempt to get a robots.txt file from the specified server
        //
        String url;
        url.trunc();

        if (debug > 1)
            cout << "Trying to retrieve robots.txt file" << endl;
        url << u.signature() << "robots.txt";

        static int local_urls_only = config->Boolean("local_urls_only");
        time_t timeZero = 0;    // Right now we want to get this every time
        Document doc(url, 0);
        Transport::DocStatus status;
        if (local_robots_files)
        {
            if (debug > 1)
                cout << "Trying local files" << endl;
            status = doc.RetrieveLocal(timeZero, local_robots_files);
            if (status == Transport::Document_not_local)
            {
                if (local_urls_only)
                    status = Transport::Document_not_found;
                else
                {
                    if (debug > 1)
                        cout << "Local retrieval failed, trying HTTP" << endl;
                    status = doc.Retrieve(this, timeZero);
                }
            }
        }
        else if (!local_urls_only)
        {
            status = doc.Retrieve(this, timeZero);

            // Let's check if persistent connections are both
            // allowed by the configuration and possible after
            // having requested the robots.txt file.

            HtHTTP *http;
            if (IsPersistentConnectionAllowed() &&
                (http = doc.GetHTTPHandler()))
            {
                if (!http->isPersistentConnectionPossible())
                    _persistent_connections = 0;    // not possible. Let's disable
                                                    // them on this server.
            }
        }
        else
            status = Transport::Document_not_found;

        switch (status)
        {
        case Transport::Document_ok:
            //
            // Found a robots.txt file. Go parse it.
            //
            robotstxt(doc);
            break;

        case Transport::Document_not_found:
        case Transport::Document_not_parsable:
        case Transport::Document_redirect:
        case Transport::Document_not_authorized:
            //
            // These cases are for when there is no robots.txt file.
            // We will just go on happily without restrictions.
            //
            break;

        case Transport::Document_no_host:
        default:
            //
            // In all other cases the server could not be reached.
            // We will remember this fact so that no more attempts to
            // contact this server will be made.
            //
            _bad_server = 1;
            break;
        }   // end switch
    }   // end if (http || https)
}
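The Accept-Language loop in the constructor turns a whitespace-separated configuration value into the single comma-separated header value sent with each request. Below is a minimal standalone sketch of the same joining logic, using only the standard library in place of htdig's StringList; the function name and the sample value are illustrative, not part of the source.

#include <iostream>
#include <sstream>
#include <string>

// Join whitespace-separated language tags into one
// comma-separated Accept-Language header value.
std::string join_accept_language(const std::string& configured)
{
    std::istringstream in(configured);
    std::string tag, header;
    while (in >> tag)             // split on whitespace
    {
        if (!header.empty())
            header += ",";        // comma between multiple choices
        header += tag;
    }
    return header;
}

int main()
{
    // "en it es" is a made-up config value for illustration.
    std::cout << join_accept_language("en it es") << std::endl;  // en,it,es
}
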
// Copy constructor
Server::Server(const Server& rhs)
:
    _host(rhs._host),
    _port(rhs._port),
    _bad_server(rhs._bad_server),
    _connection_space(rhs._connection_space),
    _last_connection(rhs._last_connection),
    _paths(rhs._paths),
    _disallow(rhs._disallow),
    _documents(rhs._documents),
    _max_documents(rhs._max_documents),
    _persistent_connections(rhs._persistent_connections),
    _head_before_get(rhs._head_before_get),
    _disable_cookies(rhs._disable_cookies),
    _timeout(rhs._timeout),
    _tcp_wait_time(rhs._tcp_wait_time),
    _tcp_max_retries(rhs._tcp_max_retries),
    _user_agent(rhs._user_agent),
    _accept_language(rhs._accept_language)
{
}


//*****************************************************************************
// Server::~Server()
//
Server::~Server()
{
}


//*****************************************************************************
// void Server::robotstxt(Document &doc)
//   This will parse the robots.txt file which is contained in the document.
//
void Server::robotstxt(Document &doc)
{
    HtConfiguration* config = HtConfiguration::config();
    String contents = doc.Contents();
    int length;
    int pay_attention = 0;
    String pattern;
    String myname = config->Find("server", _host.get(), "robotstxt_name");
    int seen_myname = 0;
    char *name, *rest;

    if (debug > 1)
        cout << "Parsing robots.txt file using myname = " << myname << "\n";

    //
    // Go through the lines in the file and determine if we need to
    // pay attention to them
    //
    for (char *line = strtok(contents, "\r\n"); line; line = strtok(0, "\r\n"))
    {
        if (debug > 2)
            cout << "Robots.txt line: " << line << endl;

        //
        // Strip comments
        //
        if (strchr(line, '#'))
        {
            *(strchr(line, '#')) = '\0';
        }

        name = good_strtok(line, ':');
        if (!name)
            continue;
        while (name && isspace(*name))
            name++;
        rest = good_strtok(NULL, '\r');
        if (!rest)
            rest = "";

        while (rest && isspace(*rest))
            rest++;

        length = strlen(rest);
        if (length > 0)
        {
            while (length > 0 && isspace(rest[length - 1]))
                length--;
            rest[length] = '\0';
        }

        if (mystrcasecmp(name, "user-agent") == 0)
        {
            if (debug > 1)
                cout << "Found 'user-agent' line: " << rest << endl;

            if (*rest == '*' && !seen_myname)
            {
                //
                // This matches all search engines...
                //
                pay_attention = 1;
            }
            else if (mystrncasecmp(rest, (char*)myname, myname.length()) == 0)
            {
                //
                // This is for us! This will override any previous patterns
                // that may have been set.
                //
                if (!seen_myname)   // only take first section with our name
                {
                    seen_myname = 1;
                    pay_attention = 1;
                    pattern = 0;    // ignore previous User-agent: *
                }
                else
                    pay_attention = 0;
            }
            else
            {
                //
                // This doesn't concern us
                //
                pay_attention = 0;
            }
        }
        else if (pay_attention && mystrcasecmp(name, "disallow") == 0)
        {
            if (debug > 1)
                cout << "Found 'disallow' line: " << rest << endl;

            //
            // Add this path to our list to ignore
            //
            if (*rest)
            {
                if (pattern.length())
                    pattern << '|';
                while (*rest)
                {
                    if (strchr("^.[$()|*+?{\\", *rest))
                        pattern << '\\';
                    pattern << *rest++;
                }
            }
        }
        //
        // Ignore anything else (comments)
        //
    }

    //
    // Compile the pattern (if any...)
    //
    if (debug > 1)
        cout << "Pattern: " << pattern << endl;

    // Empty "disallow" allows all, so don't make entry which matches all.
    if (!pattern.empty())
    {
        String fullpatt = "^[^:]*://[^/]*(";
        fullpatt << pattern << ')';
        _disallow.set(fullpatt, config->Boolean("case_sensitive"));
    }
}
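robotstxt() does not store the Disallow paths individually: it escapes regex metacharacters, joins the paths with '|', and anchors the alternation behind a scheme/host prefix, so one compiled expression can reject full URLs. The following sketch shows that construction with two hypothetical Disallow lines, using std::regex in place of htdig's own matcher; everything here is illustrative, not part of the file above.

#include <cstring>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main()
{
    // Two hypothetical Disallow paths, as if read from robots.txt.
    std::vector<std::string> disallow = { "/cgi-bin/", "/tmp/" };

    // Escape regex metacharacters and join the paths with '|',
    // mirroring how robotstxt() accumulates 'pattern'.
    std::string pattern;
    for (const std::string &path : disallow)
    {
        if (!pattern.empty())
            pattern += '|';
        for (char c : path)
        {
            if (std::strchr("^.[$()|*+?{\\", c))
                pattern += '\\';
            pattern += c;
        }
    }

    // Anchor behind a scheme/host prefix, as done for 'fullpatt'.
    std::string fullpatt = "^[^:]*://[^/]*(" + pattern + ")";
    std::cout << fullpatt << std::endl;   // ^[^:]*://[^/]*(/cgi-bin/|/tmp/)

    std::regex disallowed(fullpatt);
    std::string url = "http://example.com/cgi-bin/test";
    std::cout << std::boolalpha
              << std::regex_search(url, disallowed) << std::endl;  // true
}
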
//*****************************************************************************
// void Server::push(const String &path, int hopcount, const String &referer,
//                   int local, int newDoc)
//
void Server::push(const String &path, int hopcount, const String &referer,
                  int local, int newDoc)
{
    if (_bad_server && !local)
        return;

    if (IsDisallowed(path) != 0)
    {
        if (debug > 2)
            cout << endl << "   Rejected: forbidden by server robots.txt!";

        return;
    }

    // We use -1 as no limit, but we also don't want
    // to forbid redirects from old places
    if (_max_documents != -1 && newDoc &&
        _documents >= _max_documents)   // Hey! we only want to get max_docs
    {
        if (debug > 2)
            cout << "Limit of " << _max_documents << " reached for " << _host << endl;

        return;
    }

    URLRef *ref = new URLRef();
    ref->SetURL(path);
    ref->SetHopCount(hopcount);
    ref->SetReferer(referer);
    _paths.Add(ref);

    if (newDoc)
        _documents++;

//  cout << "***** pushing '" << path << "' with '" << referer << "'\n";
}


//*****************************************************************************
// URLRef *Server::pop()
//
URLRef *Server::pop()
{
    URLRef *ref = (URLRef *) _paths.Remove();

    if (!ref)
        return 0;

    return ref;
}
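push() and pop() maintain the per-server crawl frontier: URLs rejected by the robots.txt pattern or by server_max_docs never enter the queue, and -1 means no limit. The sketch below models just that accounting; the actual ordering of _paths depends on the container declared in Server.h, which is not part of this diff, so a plain deque stands in and all names are illustrative.

#include <deque>
#include <iostream>
#include <string>

// A stand-in for the per-server frontier bookkeeping in
// push()/pop(): honour a document cap, with -1 meaning "no limit".
struct Frontier
{
    std::deque<std::string> paths;
    int documents = 0;
    int max_documents = -1;   // -1: unlimited

    bool push(const std::string& url, bool new_doc)
    {
        if (max_documents != -1 && new_doc && documents >= max_documents)
            return false;     // limit reached; drop the URL
        paths.push_back(url);
        if (new_doc)
            ++documents;
        return true;
    }

    bool pop(std::string& url)
    {
        if (paths.empty())
            return false;
        url = paths.front();
        paths.pop_front();
        return true;
    }
};

int main()
{
    Frontier f;
    f.max_documents = 1;
    std::cout << f.push("http://example.com/a.html", true) << std::endl;  // 1
    std::cout << f.push("http://example.com/b.html", true) << std::endl;  // 0 (limit)
}
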
//*****************************************************************************
// void Server::delay()
//
// Keeps track of how long it's been since we've seen this server
// and calls sleep if necessary
//
void Server::delay()
{
    HtDateTime now;

    int time_taken = HtDateTime::GetDiff(now, _last_connection);   // arg1 - arg2 > 0

    if (time_taken < _connection_space)
        sleep(_connection_space - time_taken);

    now.SettoNow();
    _last_connection = now;     // Reset the clock for the next delay!

    return;
}


//*****************************************************************************
// void Server::reportStatistics(String &out, char *name)
//
void Server::reportStatistics(String &out, char *name)
{
    out << name << " " << _host << ":" << _port;
    out << " " << _documents << " document";
    if (_documents != 1)
        out << "s";
}
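delay() is the politeness throttle: if fewer than server_wait_time seconds have elapsed since the previous request to this host, the crawler sleeps for the remainder before reconnecting. Here is a self-contained sketch of the same idea, with std::chrono standing in for HtDateTime; the names are illustrative only.

#include <chrono>
#include <thread>

using clock_type = std::chrono::steady_clock;

// Sleep so that at least 'connection_space' seconds separate
// consecutive requests to the same server, then reset the clock.
void delay(clock_type::time_point& last_connection, int connection_space)
{
    auto now = clock_type::now();
    auto taken = std::chrono::duration_cast<std::chrono::seconds>(
                     now - last_connection).count();
    if (taken < connection_space)
        std::this_thread::sleep_for(
            std::chrono::seconds(connection_space - taken));
    last_connection = clock_type::now();  // reset for the next call
}

int main()
{
    clock_type::time_point last = clock_type::now() - std::chrono::seconds(10);
    delay(last, 4);   // 10s already elapsed: returns immediately
    delay(last, 4);   // called again at once: sleeps ~4s
}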