Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc')
-rw-r--r--    debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc    2013
1 file changed, 2013 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc
new file mode 100644
index 00000000..13243571
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc
@@ -0,0 +1,2013 @@
+//
+// Retriever.cc
+//
+// Retriever: Crawls from a list of URLs and calls the appropriate parsers.
+//            The parsers notify the Retriever object that they got something
+//            (got_* functions) and the Retriever object feeds the databases
+//            and statistics accordingly.
+//
+// Part of the ht://Dig package   <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Retriever.cc,v 1.94 2004/05/28 13:15:15 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#ifdef _MSC_VER /* _WIN32 */
+# include <sys/types.h>
+# include <winsock2.h>
+#endif
+
+#include "Retriever.h"
+#include "htdig.h"
+#include "HtWordList.h"
+#include "WordRecord.h"
+#include "URLRef.h"
+#include "Server.h"
+#include "Parsable.h"
+#include "Document.h"
+#include "StringList.h"
+#include "WordType.h"
+#include "Transport.h"
+#include "HtHTTP.h"    // For HTTP statistics
+#include "md5.h"
+#include "defaults.h"
+
+#ifndef _MSC_VER /* _WIN32 */
+#include <pwd.h>
+#endif
+
+#include <signal.h>
+#include <stdio.h>
+
+static int noSignal;
+
+// no_store_phrases:
+// If true, only store the first occurrence of each word in a document.
+static bool no_store_phrases;
+
+//*****************************************************************************
+// Retriever::Retriever()
+//
+Retriever::Retriever(RetrieverLog flags):
+    words(*(HtConfiguration::config())),
+    words_to_add(100, 0.75)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    FILE *urls_parsed;
+
+    currenthopcount = 0;
+    max_hop_count = config->Value("max_hop_count", 999999);
+
+    no_store_phrases = !config->Boolean("store_phrases");
+
+    //
+    // Initialize the flags for the various HTML factors
+    //
+
+    // text_factor
+    factor[0] = FLAG_TEXT;
+    // title_factor
+    factor[1] = FLAG_TITLE;
+    // heading factor (now generic)
+    factor[2] = FLAG_HEADING;
+    factor[3] = FLAG_HEADING;
+    factor[4] = FLAG_HEADING;
+    factor[5] = FLAG_HEADING;
+    factor[6] = FLAG_HEADING;
+    factor[7] = FLAG_HEADING;
+    // img alt text
+    //factor[8] = FLAG_KEYWORDS;
+    factor[8] = FLAG_TEXT;    // treat alt text as plain text, until it has
+                              // its own FLAG and factor.
+    // keywords factor
+    factor[9] = FLAG_KEYWORDS;
+    // META description factor
+    factor[10] = FLAG_DESCRIPTION;
+    factor[11] = FLAG_AUTHOR;
+
+    doc = new Document();
+    minimumWordLength = config->Value("minimum_word_length", 3);
+
+    log = flags;
+    // if in restart mode
+    if (Retriever_noLog != log)
+    {
+        String filelog = config->Find("url_log");
+        char buffer[1024];
+        int l;
+
+        urls_parsed = fopen((char *) filelog, "r");
+        if (urls_parsed != 0)
+        {
+            // read all urls discovered but not fetched before
+            while (fgets(buffer, sizeof(buffer), urls_parsed))
+            {
+                l = strlen(buffer);
+                if (l > 0 && buffer[l - 1] == '\n')    // strip only a real newline
+                    buffer[l - 1] = 0;
+                Initial(buffer, 2);
+            }
+            fclose(urls_parsed);
+        }
+        unlink((char *) filelog);
+    }
+
+    check_unique_md5 = config->Boolean("check_unique_md5", 0);
+    check_unique_date = config->Boolean("check_unique_date", 0);
+
+    d_md5 = 0;
+    if (check_unique_md5)
+    {
+        d_md5 = Database::getDatabaseInstance(DB_HASH);
+
+        if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK)
+        {
+            cerr << "DocumentDB::Open: " << config->Find("md5_db") << " " << strerror(errno) << "\n";
+        }
+    }
+}
+
+
+//*****************************************************************************
+// Retriever::~Retriever()
+//
+Retriever::~Retriever()
+{
+    if (d_md5)
+        d_md5->Close();
+    delete doc;
+}
+
+
+//*****************************************************************************
+// void Retriever::setUsernamePassword(const char *credentials)
+//
+void Retriever::setUsernamePassword(const char *credentials)
+{
+    doc->setUsernamePassword(credentials);
+}
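
The url_log file read above is the other half of the dump written by Start() when a dig is interrupted: one discovered-but-unfetched URL per line. A minimal standalone sketch of that lifecycle (file name and queue handling hypothetical, not htdig's classes):

#include <cstdio>
#include <cstring>

// Toy sketch of the url_log lifecycle: the interrupted run dumps the
// still-queued URLs, and the next run replays them line by line.
void dump_pending(const char *filelog, const char *const *urls, int n)
{
    FILE *f = fopen(filelog, "w");
    if (!f)
        return;
    for (int i = 0; i < n; i++)
        fprintf(f, "%s\n", urls[i]);    // same one-URL-per-line format
    fclose(f);
}

void replay_pending(const char *filelog)
{
    char buffer[1024];
    FILE *f = fopen(filelog, "r");
    if (!f)
        return;
    while (fgets(buffer, sizeof(buffer), f))
    {
        size_t l = strlen(buffer);
        if (l > 0 && buffer[l - 1] == '\n')
            buffer[l - 1] = '\0';
        // the real code calls Initial(buffer, 2) here to re-queue the URL
        printf("re-queue: %s\n", buffer);
    }
    fclose(f);
}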

+//*****************************************************************************
+// void Retriever::Initial(const String &list, int from)
+//   Add a single URL to the list of URLs to visit.
+//   Since URLs are stored on a per-server basis, we first need to find
+//   the correct server to add the URL's path to.
+//
+//   from == 0  urls in db.docs and no db.log
+//   from == 1  urls in start_url; add url only if not already in the list
+//   from == 2  add url from db.log
+//   from == 3  urls in db.docs and there was a db.log
+//
+void Retriever::Initial(const String & list, int from)
+{
+    //
+    // Split the list of urls up into individual urls.
+    //
+    StringList tokens(list, " \t");
+    String sig;
+    String url;
+    Server *server;
+
+    for (int i = 0; i < tokens.Count(); i++)
+    {
+        URL u(tokens[i]);
+        url = u.get();    // get before u.signature() resolves aliases
+        server = (Server *) servers[u.signature()];
+        if (debug > 2)
+            cout << "\t" << from << ":" << (int) log << ":" << url;
+        if (!server)
+        {
+            String robotsURL = u.signature();
+            robotsURL << "robots.txt";
+            StringList *localRobotsFile = GetLocal(robotsURL);
+
+            server = new Server(u, localRobotsFile);
+            servers.Add(u.signature(), server);
+            delete localRobotsFile;
+        }
+
+        if (from && visited.Exists(url))
+        {
+            if (debug > 2)
+                cout << " skipped" << endl;
+            continue;
+        }
+        else if (IsValidURL(url) != 1)
+        {
+            if (debug > 2)
+                cout << endl;
+            continue;
+        }
+
+        if (Retriever_noLog == log || from != 3)
+        {
+            if (debug > 2)
+                cout << " pushed";
+            server->push(u.get(), 0, 0, IsLocalURL(url.get()));
+        }
+        if (debug > 2)
+            cout << endl;
+        visited.Add(url, 0);
+    }
+}
+
+
+//*****************************************************************************
+// void Retriever::Initial(List &list, int from)
+//
+void Retriever::Initial(List & list, int from)
+{
+    list.Start_Get();
+    String *str;
+
+    // from == 0 is an optimisation for pushing urls in update mode,
+    // assuming that
+    // 1) there are many more urls in docdb
+    // 2) they're pushed first
+    // 3) there's no duplicate url in docdb
+    // then they don't need to be checked against already pushed urls.
+    // But 2) can be false with the -l option.
+    //
+    // FIXME: it's nasty; what has to be tested is: we have urls to push
+    // from db.docs, but do we already have them in db.log? For this it
+    // relies on a side effect of 'visited', on urls in db.docs being
+    // pushed only via this method, and on db.log being pushed first,
+    // db.docs second, start_urls third!
+    //
+    if (!from && visited.Count())
+    {
+        from = 3;
+    }
+    while ((str = (String *) list.Get_Next()))
+    {
+        Initial(str->get(), from);
+    }
+}
+
+//*****************************************************************************
+//
+static void sigexit(int)
+{
+    noSignal = 0;    // don't exit here... just set the flag.
+}
+
+static void sigpipe(int)
+{
+}
+
+//*****************************************************************************
+// static void sig_handlers
+//   initialise signal handlers
+//
+static void sig_handlers(void)
+{
+#ifndef _MSC_VER /* _WIN32 */
+    //POSIX SIGNALS
+    struct sigaction action;
+
+    /* SIGINT, SIGQUIT, SIGTERM */
+    action.sa_handler = sigexit;
+    sigemptyset(&action.sa_mask);
+    action.sa_flags = 0;
+    if (sigaction(SIGINT, &action, NULL) != 0)
+        reportError("Cannot install SIGINT handler\n");
+    if (sigaction(SIGQUIT, &action, NULL) != 0)
+        reportError("Cannot install SIGQUIT handler\n");
+    if (sigaction(SIGTERM, &action, NULL) != 0)
+        reportError("Cannot install SIGTERM handler\n");
+    if (sigaction(SIGHUP, &action, NULL) != 0)
+        reportError("Cannot install SIGHUP handler\n");
+#else
+    //ANSI C signal handling - Limited to supported Windows signals.
+    signal(SIGINT, sigexit);
+    signal(SIGTERM, sigexit);
+#endif //_MSC_VER /* _WIN32 */
+}
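
The pattern above is the classic "handler only sets a flag" design: the handler does no I/O and no cleanup, and the main loop polls the flag so the databases can be flushed cleanly. A self-contained POSIX sketch of the same pattern (htdig uses a plain int for noSignal; volatile sig_atomic_t is the portable choice):

#include <signal.h>
#include <stdio.h>

// Minimal flag-based signal handling: the handler only flips a flag,
// the work loop checks it and shuts down in an orderly way.
static volatile sig_atomic_t keep_running = 1;

static void on_signal(int) { keep_running = 0; }

int main()
{
    struct sigaction action;
    action.sa_handler = on_signal;
    sigemptyset(&action.sa_mask);
    action.sa_flags = 0;
    sigaction(SIGINT, &action, NULL);

    while (keep_running)
    {
        // fetch and parse one URL per iteration...
    }
    puts("caught signal, flushing state before exit");
    return 0;
}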
+
+
+static void sig_phandler(void)
+{
+#ifndef _MSC_VER /* _WIN32 */
+    struct sigaction action;
+
+    sigemptyset(&action.sa_mask);
+    action.sa_handler = sigpipe;
+    action.sa_flags = SA_RESTART;
+    if (sigaction(SIGPIPE, &action, NULL) != 0)
+        reportError("Cannot install SIGPIPE handler\n");
+#endif //_MSC_VER /* _WIN32 */
+}
+
+
+//*****************************************************************************
+// static void win32_check_messages
+//   Check WIN32 messages!
+//
+#ifdef _MSC_VER /* _WIN32 */
+static void win32_check_messages(void)
+{
+// NEAL - NEEDS FINISHING/TESTING
+#if 0
+    MSG msg = {0, 0, 0, 0};
+    int cDown = 0;
+    int controlDown = 0;
+
+    if (GetMessage(&msg, 0, 0, 0))
+    {
+        switch (msg.message)
+        {
+        case WM_KEYDOWN:
+            {
+                if (LOWORD(msg.message) == 17)
+                    controlDown = 1;
+                else if (LOWORD(msg.message) == 67)
+                {
+                    cDown = 1;
+                }
+            }
+            break;
+        case WM_KEYUP:
+            {
+                if (LOWORD(msg.message) == 17)
+                    controlDown = 0;
+                else if (LOWORD(msg.message) == 67)
+                    cDown = 0;
+            }
+            break;
+        }
+    }
+
+    DispatchMessage(&msg);
+#endif
+}
+#endif //_MSC_VER /* _WIN32 */
+
+
+//*****************************************************************************
+// void Retriever::Start()
+//   This is the main loop of the retriever. We will go through the
+//   list of paths stored for each server. While parsing the
+//   retrieved documents, new paths will be added to the servers. We
+//   return if no more paths need to be retrieved.
+//
+void Retriever::Start()
+{
+    //
+    // Main digger loop. The todo list should initially have the start
+    // URL and all the URLs which were seen in a previous dig. The
+    // loop will continue as long as there are more URLs to visit.
+    //
+    int more = 1;
+    Server *server;
+    URLRef *ref;
+
+    HtConfiguration *config = HtConfiguration::config();
+
+    //
+    // Always install the signal handlers. The delay bothers me,
+    // but a bad db is worse.
+    //
+    if (Retriever_noLog != log)
+    {
+        sig_handlers();
+    }
+    sig_phandler();
+    noSignal = 1;
+
+///////
+    // Main loop. We keep on retrieving until a signal is received
+    // or all the servers' queues are empty.
+///////
+
+#ifdef _MSC_VER /* _WIN32 */
+    win32_check_messages();
+#endif
+
+    while (more && noSignal)
+    {
+        more = 0;
+
+        //
+        // Go through all the current servers in sequence.
+        // If they support persistent connections, we keep on popping
+        // from the same server queue until it's empty or we reach a maximum
+        // number of consecutive requests ("max_connection_requests").
+        // Or the loop may continue indefinitely, if we set
+        // "max_connection_requests" to -1.
+        // If the server doesn't support persistent connections, we take
+        // only one URL from it, then we skip to the next server.
+        //
+        // Since 15.05.02: even when persistent connections are activated
+        // we should wait for a 'server_wait_time' number of seconds
+        // after the 'max_connection_requests' value has been reached.
+        //
+
+        // Let's position at the beginning
+        servers.Start_Get();
+
+        int count;
+
+        // Maximum number of repeated requests with the same
+        // TCP connection (so on the same Server:Port).
+        int max_connection_requests;
+
+#ifdef _MSC_VER /* _WIN32 */
+        win32_check_messages();
+#endif
+
+        while ((server = (Server *) servers.Get_NextElement()) && noSignal)
+        {
+            if (debug > 1)
+                cout << "pick: " << server->host() << ", # servers = " << servers.Count() << endl;
+
+            // We already know if a server supports HTTP persistent
+            // connections, because we asked it for the robots.txt file
+            // (constructor of the class).
+
+            // If the Server doesn't support persistent connections
+            // we turn it down to 1.
+            if (server->IsPersistentConnectionAllowed())
+            {
+                // Let's check for a '0' value (out of range)
+                // If set, we change it to 1.
+                if (config->Value("server", server->host(), "max_connection_requests") == 0)
+                    max_connection_requests = 1;
+                else
+                    max_connection_requests =
+                        config->Value("server", server->host(), "max_connection_requests");
+
+                if (debug > 2)
+                {
+                    cout << "> " << server->host() << " supports HTTP persistent connections";
+
+                    if (max_connection_requests == -1)
+                        cout << " (" << "infinite" << ")" << endl;
+                    else
+                        cout << " (" << max_connection_requests << ")" << endl;
+                }
+            }
+            else
+            {
+                // No HTTP persistent connections. So we request only 1 document.
+                max_connection_requests = 1;
+
+                if (debug > 2)
+                    cout << "> " << server->host() << " with a traditional HTTP connection" << endl;
+            }
+
+            count = 0;
+
+#ifdef _MSC_VER /* _WIN32 */
+            win32_check_messages();
+#endif
+
+            while (((max_connection_requests == -1) ||
+                    (count < max_connection_requests)) && (ref = server->pop()) && noSignal)
+            {
+                count++;
+
+                //
+                // We have a URL to index, now. We need to register the
+                // fact that we are not done yet by setting the 'more'
+                // variable. So, we have to restart scanning the queue.
+                //
+                more = 1;
+
+                //
+                // Deal with the actual URL.
+                // We'll check with the server to see if we need to sleep()
+                // before parsing it.
+                //
+                parse_url(*ref);
+                delete ref;
+
+                // We reached the maximum number of connections (either with
+                // or without persistent connections) and we must pause and
+                // respect the 'net ethic'.
+                if ((max_connection_requests - count) == 0)
+                    server->delay();    // This will pause if needed
+                                        // and reset the time
+
+#ifdef _MSC_VER /* _WIN32 */
+                win32_check_messages();
+#endif
+            }
+
+#ifdef _MSC_VER /* _WIN32 */
+            win32_check_messages();
+#endif
+        }
+    }
+
+#ifdef _MSC_VER /* _WIN32 */
+    win32_check_messages();
+#endif
+
+    // if we exited on a signal
+    if (Retriever_noLog != log && !noSignal)
+    {
+        FILE *urls_parsed;
+        String filelog = config->Find("url_log");
+        // save urls seen but not fetched
+        urls_parsed = fopen((char *) filelog, "w");
+        if (0 == urls_parsed)
+        {
+            reportError(form("Unable to create URL log file '%s'", filelog.get()));
+        }
+        else
+        {
+            servers.Start_Get();
+            while ((server = (Server *) servers.Get_NextElement()))
+            {
+                while (NULL != (ref = server->pop()))
+                {
+                    fprintf(urls_parsed, "%s\n", (const char *) ref->GetURL().get());
+                    delete ref;
+                }
+            }
+            fclose(urls_parsed);
+        }
+    }
+    words.Close();
+}
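
The scheduling in Start() is easier to see stripped of the transport details: round-robin over servers, popping at most max_connection_requests URLs per server per pass, rescanning until every queue is empty. A toy model with hypothetical types and no networking:

#include <deque>
#include <iostream>
#include <map>
#include <string>

// Toy model of Start()'s scheduling loop (types are hypothetical).
struct ToyServer { std::deque<std::string> queue; };

void crawl_pass(std::map<std::string, ToyServer> &servers, int max_requests)
{
    bool more = true;
    while (more)
    {
        more = false;
        for (auto &kv : servers)
        {
            ToyServer &s = kv.second;
            int count = 0;
            while ((max_requests == -1 || count < max_requests) && !s.queue.empty())
            {
                ++count;
                more = true;    // something was popped: rescan all queues later
                std::cout << kv.first << " -> " << s.queue.front() << '\n';
                s.queue.pop_front();
            }
            // a real crawler pauses here (server->delay()) when the
            // per-connection request cap has been reached
        }
    }
}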
+
+
+//*****************************************************************************
+// void Retriever::parse_url(URLRef &urlRef)
+//
+void Retriever::parse_url(URLRef & urlRef)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    URL url;
+    DocumentRef *ref;
+    int old_document;
+    time_t date;
+    static int index = 0;
+    static int local_urls_only = config->Boolean("local_urls_only");
+    static int mark_dead_servers = config->Boolean("ignore_dead_servers");
+    Server *server;
+
+    url.parse(urlRef.GetURL().get());
+
+    currenthopcount = urlRef.GetHopCount();
+
+    ref = docs[url.get()];    // It might be nice to have just an Exists() here
+    if (ref)
+    {
+        //
+        // We already have an entry for this document in our database.
+        // This means we can get the document ID and last modification
+        // time from there.
+        //
+        current_id = ref->DocID();
+        date = ref->DocTime();
+        if (ref->DocAccessed())
+            old_document = 1;
+        else    // we haven't retrieved it yet, so we only have the first link
+            old_document = 0;
+        ref->DocBackLinks(ref->DocBackLinks() + 1);    // we had a new link
+        ref->DocAccessed(time(0));
+        ref->DocState(Reference_normal);
+        currenthopcount = ref->DocHopCount();
+    }
+    else
+    {
+        //
+        // Never seen this document before. We need to create an
+        // entry for it. This implies that it gets a new document ID.
+        //
+        date = 0;
+        current_id = docs.NextDocID();
+        ref = new DocumentRef;
+        ref->DocID(current_id);
+        ref->DocURL(url.get());
+        ref->DocState(Reference_normal);
+        ref->DocAccessed(time(0));
+        ref->DocHopCount(currenthopcount);
+        ref->DocBackLinks(1);    // We had to have a link to get here!
+        old_document = 0;
+    }
+
+    word_context.DocID(ref->DocID());
+
+    if (debug > 0)
+    {
+        //
+        // Display progress
+        //
+        cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << url.get() << ": ";
+        cout.flush();
+    }
+
+    // Reset the document to clean out any old data
+    doc->Reset();
+    doc->Url(url.get());
+    doc->Referer(urlRef.GetReferer().get());
+
+    base = doc->Url();
+
+    // Retrieve document, first trying local file access if possible.
+    Transport::DocStatus status;
+    server = (Server *) servers[url.signature()];
+    StringList *local_filenames = GetLocal(url.get());
+    if (local_filenames)
+    {
+        if (debug > 1)
+            cout << "Trying local files" << endl;
+        status = doc->RetrieveLocal(date, local_filenames);
+        if (status == Transport::Document_not_local)
+        {
+            if (debug > 1)
+                cout << "Local retrieval failed, trying HTTP" << endl;
+            if (server && !server->IsDead() && !local_urls_only)
+                status = doc->Retrieve(server, date);
+            else
+                status = Transport::Document_no_host;
+        }
+        delete local_filenames;
+    }
+    else if (server && !server->IsDead() && !local_urls_only)
+        status = doc->Retrieve(server, date);
+    else
+        status = Transport::Document_no_host;
+
+    current_ref = ref;
+
+    //
+    // Determine what to do by looking at the status code returned by
+    // the Document retrieval process.
+    //
+
+    String shash;
+    String sx;
+    char bhash[16];
+    time_t ddate;
+
+    switch (status)
+    {
+    case Transport::Document_ok:
+        trackWords = 1;
+
+        if (check_unique_md5)
+        {
+            if (doc->StoredLength() > 0)
+            {
+                if (check_unique_date)
+                {
+                    ddate = doc->ModTime();
+                    if (ddate < time(NULL) - 10)
+                    {    // Unknown date was set to current time
+                        md5(bhash, doc->Contents(), doc->StoredLength(), &ddate, debug);
+                    }
+                    else
+                    {
+                        md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug);
+                    }
+                }
+                else
+                    md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug);
+
+                shash.append(bhash, MD5_LENGTH);
+                d_md5->Get(shash, sx);
+
+                if (!sx.empty())
+                {
+                    if (debug > 1)
+                    {
+                        cout << " Detected duplicate by md5 hash" << endl;
+                    }
+                    words.Skip();
+                    break;    // Duplicate - don't index
+                }
+                else
+                {
+                    d_md5->Put(shash, "x");
+                }
+            }
+        }
+
+        if (old_document)
+        {
+            if (doc->ModTime() == ref->DocTime())
+            {
+                words.Skip();
+                if (debug)
+                    cout << " retrieved but not changed" << endl;
+                break;
+            }
+            //
+            // Since we already had a record of this document and
+            // we were able to retrieve it, it must have changed
+            // since the last time we scanned it. This means that
+            // we need to assign a new document ID to it and mark
+            // the old one as obsolete.
+            //
+            words.Skip();
+            int backlinks = ref->DocBackLinks();
+            ref->DocState(Reference_obsolete);
+            docs.Add(*ref);
+            delete ref;
+
+            current_id = docs.NextDocID();
+            word_context.DocID(current_id);
+            ref = new DocumentRef;
+            ref->DocID(current_id);
+            ref->DocURL(url.get());
+            ref->DocState(Reference_normal);
+            ref->DocAccessed(time(0));
+            ref->DocHopCount(currenthopcount);
+            ref->DocBackLinks(backlinks);
+            if (debug)
+                cout << " (changed) ";
+        }
+        RetrievedDocument(*doc, url.get(), ref);
+        // Hey! If this document is marked noindex, don't even bother
+        // adding new words. Mark this as gone and get rid of it!
+        if (ref->DocState() == Reference_noindex)
+        {
+            if (debug > 1)
+                cout << " ( " << ref->DocURL() << " ignored)";
+            words.Skip();
+        }
+        else
+            words.Flush();
+        if (debug)
+            cout << " size = " << doc->Length() << endl;
+
+        if (urls_seen)
+        {
+            fprintf(urls_seen, "%s|%d|%s|%d|%d|1\n",
+                    (const char *) url.get(), doc->Length(), doc->ContentType(),
+                    (int) doc->ModTime(), currenthopcount);
+        }
+        break;
+
+    case Transport::Document_not_changed:
+        if (debug)
+            cout << " not changed" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_not_found:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " not found" << endl;
+        recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_not_found);
+        words.Skip();
+        break;
+
+    case Transport::Document_no_host:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " host not found" << endl;
+        recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_host);
+        words.Skip();
+
+        // Mark the server as being down
+        if (server && mark_dead_servers)
+            server->IsDead(1);
+        break;
+
+    case Transport::Document_no_port:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " host not found (port)" << endl;
+        recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_port);
+        words.Skip();
+
+        // Mark the server as being down
+        if (server && mark_dead_servers)
+            server->IsDead(1);
+        break;
+
+    case Transport::Document_not_parsable:
+        ref->DocState(Reference_noindex);
+        if (debug)
+            cout << " not parsable" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_redirect:
+        if (debug)
+            cout << " redirect" << endl;
+        ref->DocState(Reference_obsolete);
+        words.Skip();
+        got_redirect(doc->Redirected(), ref, (urlRef.GetReferer()).get());
+        break;
+
+    case Transport::Document_not_authorized:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " not authorized" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_not_local:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " not local" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_no_header:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " no header" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_connection_down:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " connection down" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_no_connection:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " no connection" << endl;
+        words.Skip();
+        break;
+
+    case Transport::Document_not_recognized_service:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " service not recognized" << endl;
+
+        // Mark the server as being down
+        if (server && mark_dead_servers)
+            server->IsDead(1);
+        words.Skip();
+        break;
+
+    case Transport::Document_other_error:
+        ref->DocState(Reference_not_found);
+        if (debug)
+            cout << " other error" << endl;
+        words.Skip();
+        break;
+    }
+    docs.Add(*ref);
+    delete ref;
+}
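
The check_unique_md5 branch above skips a page whose body hashes to a digest already seen at another URL. Reduced to its control flow, with std::set standing in for the md5_db Berkeley DB file and std::hash standing in for htdig's md5() helper, the idea is:

#include <functional>
#include <set>
#include <string>

// Sketch of the duplicate test in the Document_ok branch: hash the body,
// skip the document if that hash was already recorded. std::set and
// std::hash are stand-ins; this only illustrates the control flow.
static std::set<size_t> seen_hashes;

bool is_duplicate(const std::string &body)
{
    size_t h = std::hash<std::string>()(body);
    if (seen_hashes.count(h))
        return true;          // same contents seen at another URL: skip it
    seen_hashes.insert(h);    // first sighting: record it and index normally
    return false;
}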
+
+
+//*****************************************************************************
+// void Retriever::RetrievedDocument(Document &doc, const String &url, DocumentRef *ref)
+//   We found a document that needs to be parsed. Since we don't know the
+//   document type, we'll let the Document itself return an appropriate
+//   Parsable object which we can call upon to parse the document contents.
+//
+void Retriever::RetrievedDocument(Document & doc, const String & url, DocumentRef * ref)
+{
+    n_links = 0;
+    current_ref = ref;
+    current_title = 0;
+    word_context.Anchor(0);
+    current_time = 0;
+    current_head = 0;
+    current_meta_dsc = 0;
+
+    //
+    // Create a parser object and let it have a go at the document.
+    // We will pass ourselves as a callback object for all the got_*()
+    // routines.
+    // This will generate the Parsable object as a specific parser
+    //
+    Parsable *parsable = doc.getParsable();
+    if (parsable)
+        parsable->parse(*this, *base);
+    else
+    {    // If we didn't get a parser, then we should get rid of this!
+        ref->DocState(Reference_noindex);
+        return;
+    }
+
+    // If just storing the first occurrence of each word in a document,
+    // we must now flush the words we saw in that document
+    if (no_store_phrases)
+    {
+        DictionaryCursor cursor;
+        char *key;
+        HtWordReference wordRef;
+        for (words_to_add.Start_Get(cursor);
+             (key = words_to_add.Get_Next(cursor)); )
+        {
+            word_entry *entry = (word_entry *) (words_to_add[key]);
+
+            wordRef.Location(entry->location);
+            wordRef.Flags(entry->flags);
+            wordRef.Word(key);
+            words.Replace(WordReference::Merge(wordRef, entry->context));
+            // How do I clean up properly?
+            delete entry;
+        }
+        words_to_add.Release();
+    }
+
+    //
+    // We don't need to dispose of the parsable object since it will
+    // automatically be reused.
+    //
+
+    //
+    // Update the document reference
+    //
+    ref->DocHead((char *) current_head);
+    ref->DocMetaDsc((char *) current_meta_dsc);
+    if (current_time == 0)
+        ref->DocTime(doc.ModTime());
+    else
+        ref->DocTime(current_time);
+    ref->DocTitle((char *) current_title);
+    ref->DocSize(doc.Length());
+    ref->DocAccessed(time(0));
+    ref->DocLinks(n_links);
+}
+
+
+//*****************************************************************************
+// int Retriever::Need2Get(const String &u)
+//   Return TRUE if we need to retrieve the given url. This will
+//   check the list of urls we have already visited.
+//
+int Retriever::Need2Get(const String & u)
+{
+    static String url;
+    url = u;
+
+    return !visited.Exists(url);
+}
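
The no_store_phrases path above accumulates one entry per distinct word while parsing, OR-ing in the flag of every context the word appears in, then flushes once per document. A simplified sketch of that bookkeeping with std::map in place of htdig's Dictionary (types are illustrative only):

#include <iostream>
#include <map>
#include <string>

// Sketch of the accumulate-then-flush pattern used by words_to_add.
struct Entry { int location; unsigned flags; };

void add_word(std::map<std::string, Entry> &words_to_add,
              const std::string &w, int location, unsigned flag)
{
    auto it = words_to_add.find(w);
    if (it == words_to_add.end())
        words_to_add[w] = Entry{location, flag};    // first occurrence wins
    else
        it->second.flags |= flag;                   // just merge the flag
}

void flush_words(std::map<std::string, Entry> &words_to_add)
{
    for (const auto &kv : words_to_add)
        std::cout << kv.first << " @" << kv.second.location
                  << " flags=" << kv.second.flags << '\n';
    words_to_add.clear();    // the equivalent of words_to_add.Release()
}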
+
+
+//*****************************************************************************
+// int Retriever::IsValidURL(const String &u)
+//   Return 1 if the given url passes the configured limits, or a
+//   negative error code if it has to be rejected.
+//
+int Retriever::IsValidURL(const String & u)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    Dictionary invalids;
+    Dictionary valids;
+    URL aUrl(u);
+    StringList tmpList;
+
+    // A list of bad extensions, separated by spaces or tabs
+    String t = config->Find(&aUrl, "bad_extensions");
+    String lowerp;
+    char *p = strtok(t, " \t");
+    while (p)
+    {
+        // Extensions are case insensitive
+        lowerp = p;
+        lowerp.lowercase();
+        invalids.Add(lowerp, 0);
+        p = strtok(0, " \t");
+    }
+
+    //
+    // Valid extensions are handled similarly
+    //
+    // A list of valid extensions, separated by spaces or tabs
+    t = config->Find(&aUrl, "valid_extensions");
+    p = strtok(t, " \t");
+    while (p)
+    {
+        // Extensions are case insensitive
+        lowerp = p;
+        lowerp.lowercase();
+        valids.Add(lowerp, 0);
+        p = strtok(0, " \t");
+    }
+
+    static String url;
+    url = u;
+
+    //
+    // If the URL contains any of the patterns in the exclude list,
+    // mark it as invalid
+    //
+    String exclude_urls = config->Find(&aUrl, "exclude_urls");
+    static String *prevexcludes = 0;
+    static HtRegexList *excludes = 0;
+    if (!excludes || !prevexcludes || prevexcludes->compare(exclude_urls) != 0)
+    {
+        if (!excludes)
+            excludes = new HtRegexList;
+        if (prevexcludes)
+            delete prevexcludes;
+        prevexcludes = new String(exclude_urls);
+        tmpList.Create(exclude_urls, " \t");
+        excludes->setEscaped(tmpList, config->Boolean("case_sensitive"));
+        tmpList.Destroy();
+    }
+    if (excludes->match(url, 0, 0) != 0)
+    {
+        if (debug > 2)
+            cout << endl << "   Rejected: item in exclude list ";
+        return (HTDIG_ERROR_TESTURL_EXCLUDE);
+    }
+
+    //
+    // If the URL has a query string and it is in the bad query list
+    // mark it as invalid
+    //
+    String bad_querystr = config->Find(&aUrl, "bad_querystr");
+    static String *prevbadquerystr = 0;
+    static HtRegexList *badquerystr = 0;
+    if (!badquerystr || !prevbadquerystr || prevbadquerystr->compare(bad_querystr) != 0)
+    {
+        if (!badquerystr)
+            badquerystr = new HtRegexList;
+        if (prevbadquerystr)
+            delete prevbadquerystr;
+        prevbadquerystr = new String(bad_querystr);
+        tmpList.Create(bad_querystr, " \t");
+        badquerystr->setEscaped(tmpList, config->Boolean("case_sensitive"));
+        tmpList.Destroy();
+    }
+    char *ext = strrchr((char *) url, '?');
+    if (ext && badquerystr->match(ext, 0, 0) != 0)
+    {
+        if (debug > 2)
+            cout << endl << "   Rejected: item in bad query list ";
+        return (HTDIG_ERROR_TESTURL_BADQUERY);
+    }
+
+    //
+    // See if the file extension is in the list of invalid ones
+    //
+    String urlpath = url.get();
+    int parm = urlpath.indexOf('?');    // chop off URL parameter
+    if (parm >= 0)
+        urlpath.chop(urlpath.length() - parm);
+    ext = strrchr((char *) urlpath.get(), '.');
+    String lowerext;
+    if (ext && strchr(ext, '/'))    // Ignore a dot if it's not in the
+        ext = NULL;                 // final component of the path.
+    if (ext)
+    {
+        lowerext.set(ext);
+        lowerext.lowercase();
+        if (invalids.Exists(lowerext))
+        {
+            if (debug > 2)
+                cout << endl << "   Rejected: Extension is invalid!";
+            return (HTDIG_ERROR_TESTURL_EXTENSION);
+        }
+    }
+    //
+    // Or NOT in the list of valid ones
+    //
+    if (ext && valids.Count() > 0 && !valids.Exists(lowerext))
+    {
+        if (debug > 2)
+            cout << endl << "   Rejected: Extension is not valid!";
+        return (HTDIG_ERROR_TESTURL_EXTENSION2);
+    }
+
+    //
+    // If none of the limits is met, we disallow the URL
+    //
+    if (limits.match(url, 1, 0) == 0)
+    {
+        if (debug > 1)
+            cout << endl << "   Rejected: URL not in the limits! ";
+        return (HTDIG_ERROR_TESTURL_LIMITS);
+    }
"; + return (HTDIG_ERROR_TESTURL_LIMITS); + } + // + // Likewise if not in list of normalized urls + // + // Warning! + // should be last in checks because of aUrl normalization + // + // signature() implicitly normalizes the URL. Be efficient... + Server *server = (Server *) servers[aUrl.signature()]; +// aUrl.normalize(); + if (limitsn.match(aUrl.get(), 1, 0) == 0) + { + if (debug > 2) + cout << endl << " Rejected: not in \"limit_normalized\" list!"; + return (HTDIG_ERROR_TESTURL_LIMITSNORM); + } + + // + // After that gauntlet, check to see if the server allows it + // (robots.txt) + // + if (server && server->IsDisallowed(url) != 0) + { + if (debug > 2) + cout << endl << " Rejected: forbidden by server robots.txt!"; + return (HTDIG_ERROR_TESTURL_ROBOT_FORBID); + } + + return (1); +} + + +//***************************************************************************** +// StringList* Retriever::GetLocal(const String &url) +// Returns a list of strings containing the (possible) local filenames +// of the given url, or 0 if it's definitely not local. +// THE CALLER MUST FREE THE STRINGLIST AFTER USE! +// Returned strings are not hex encoded. +// +StringList *Retriever::GetLocal(const String & strurl) +{ + HtConfiguration *config = HtConfiguration::config(); + static StringList *prefixes = 0; + String url = strurl; + + static StringList *paths = 0; + StringList *defaultdocs = 0; + URL aUrl(url); + url = aUrl.get(); // make sure we look at a parsed URL + + // + // Initialize prefix/path list if this is the first time. + // The list is given in format "prefix1=path1 prefix2=path2 ..." + // + if (!prefixes) + { + prefixes = new StringList(); + paths = new StringList(); + + String t = config->Find("local_urls"); + char *p = strtok(t, " \t"); + while (p) + { + char *path = strchr(p, '='); + if (!path) + { + p = strtok(0, " \t"); + continue; + } + *path++ = '\0'; + String *pre = new String(p); + decodeURL(*pre); + prefixes->Add(pre); + String *pat = new String(path); + decodeURL(*pat); + paths->Add(pat); + p = strtok(0, " \t"); + } + } + if (!config->Find(&aUrl, "local_default_doc").empty()) + { + defaultdocs = new StringList(); + String t = config->Find(&aUrl, "local_default_doc"); + char *p = strtok(t, " \t"); + while (p) + { + String *def = new String(p); + decodeURL(*def); + defaultdocs->Add(def); + p = strtok(0, " \t"); + } + if (defaultdocs->Count() == 0) + { + delete defaultdocs; + defaultdocs = 0; + } + } + + // Begin by hex-decoding URL... + String hexurl = url; + decodeURL(hexurl); + url = hexurl.get(); + + // Check first for local user... + if (strchr(url.get(), '~')) + { + StringList *local = GetLocalUser(url, defaultdocs); + if (local) + { + if (defaultdocs) + delete defaultdocs; + return local; + } + } + + // This shouldn't happen, but check anyway... 
+
+
+//*****************************************************************************
+// StringList* Retriever::GetLocal(const String &url)
+//   Returns a list of strings containing the (possible) local filenames
+//   of the given url, or 0 if it's definitely not local.
+//   THE CALLER MUST FREE THE STRINGLIST AFTER USE!
+//   Returned strings are not hex encoded.
+//
+StringList *Retriever::GetLocal(const String & strurl)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    static StringList *prefixes = 0;
+    String url = strurl;
+
+    static StringList *paths = 0;
+    StringList *defaultdocs = 0;
+    URL aUrl(url);
+    url = aUrl.get();    // make sure we look at a parsed URL
+
+    //
+    // Initialize prefix/path list if this is the first time.
+    // The list is given in format "prefix1=path1 prefix2=path2 ..."
+    //
+    if (!prefixes)
+    {
+        prefixes = new StringList();
+        paths = new StringList();
+
+        String t = config->Find("local_urls");
+        char *p = strtok(t, " \t");
+        while (p)
+        {
+            char *path = strchr(p, '=');
+            if (!path)
+            {
+                p = strtok(0, " \t");
+                continue;
+            }
+            *path++ = '\0';
+            String *pre = new String(p);
+            decodeURL(*pre);
+            prefixes->Add(pre);
+            String *pat = new String(path);
+            decodeURL(*pat);
+            paths->Add(pat);
+            p = strtok(0, " \t");
+        }
+    }
+    if (!config->Find(&aUrl, "local_default_doc").empty())
+    {
+        defaultdocs = new StringList();
+        String t = config->Find(&aUrl, "local_default_doc");
+        char *p = strtok(t, " \t");
+        while (p)
+        {
+            String *def = new String(p);
+            decodeURL(*def);
+            defaultdocs->Add(def);
+            p = strtok(0, " \t");
+        }
+        if (defaultdocs->Count() == 0)
+        {
+            delete defaultdocs;
+            defaultdocs = 0;
+        }
+    }
+
+    // Begin by hex-decoding URL...
+    String hexurl = url;
+    decodeURL(hexurl);
+    url = hexurl.get();
+
+    // Check first for local user...
+    if (strchr(url.get(), '~'))
+    {
+        StringList *local = GetLocalUser(url, defaultdocs);
+        if (local)
+        {
+            if (defaultdocs)
+                delete defaultdocs;
+            return local;
+        }
+    }
+
+    // This shouldn't happen, but check anyway...
+    if (strstr(url.get(), ".."))
+    {
+        if (defaultdocs)    // don't leak the list on this early return
+            delete defaultdocs;
+        return 0;
+    }
+
+    String *prefix, *path;
+    String *defaultdoc;
+    StringList *local_names = new StringList();
+    prefixes->Start_Get();
+    paths->Start_Get();
+    while ((prefix = (String *) prefixes->Get_Next()))
+    {
+        path = (String *) paths->Get_Next();
+        if (mystrncasecmp((char *) *prefix, (char *) url, prefix->length()) == 0)
+        {
+            int l = strlen(url.get()) - prefix->length() + path->length() + 4;
+            String *local = new String(*path, l);
+            *local += &url[prefix->length()];
+            if (local->last() == '/' && defaultdocs)
+            {
+                defaultdocs->Start_Get();
+                while ((defaultdoc = (String *) defaultdocs->Get_Next()))
+                {
+                    String *localdefault =
+                        new String(*local, local->length() + defaultdoc->length() + 1);
+                    localdefault->append(*defaultdoc);
+                    local_names->Add(localdefault);
+                }
+                delete local;
+            }
+            else
+                local_names->Add(local);
+        }
+    }
+    if (local_names->Count() > 0)
+    {
+        if (defaultdocs)
+            delete defaultdocs;
+        return local_names;
+    }
+
+    if (defaultdocs)
+        delete defaultdocs;
+    delete local_names;
+    return 0;
+}
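
To make the prefix substitution above concrete: with local_urls set to, say, http://www.example.com/=/var/www/html/ (a hypothetical value), a matching URL is turned into a filesystem path by swapping the prefix for the path:

#include <string>

// Hypothetical local_urls entry: URL prefix -> filesystem path.
// GetLocal() above performs this substitution for every configured pair.
std::string to_local(const std::string &url)
{
    const std::string prefix = "http://www.example.com/";    // assumed config
    const std::string path = "/var/www/html/";
    if (url.compare(0, prefix.size(), prefix) == 0)
        return path + url.substr(prefix.size());
    return std::string();    // empty: not a local URL
}
// to_local("http://www.example.com/docs/a.html") == "/var/www/html/docs/a.html"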
+
+
+//*****************************************************************************
+// StringList* Retriever::GetLocalUser(const String &url, StringList *defaultdocs)
+//   If the URL has a ~user part, return a list of strings containing the
+//   (possible) local filenames of the given url, or 0 if it's
+//   definitely not local.
+//   THE CALLER MUST FREE THE STRINGLIST AFTER USE!
+//
+StringList *Retriever::GetLocalUser(const String & url, StringList * defaultdocs)
+{
+// NOTE: Native Windows does not have this construct for the user Web files
+#ifndef _MSC_VER /* _WIN32 */
+    HtConfiguration *config = HtConfiguration::config();
+    static StringList *prefixes = 0, *paths = 0, *dirs = 0;
+    static Dictionary home_cache;
+    URL aUrl(url);
+
+    //
+    // Initialize prefix/path list if this is the first time.
+    // The list is given in format "prefix1=path1,dir1 ..."
+    // If path is zero-length, the user's home directory is looked up.
+    //
+    if (!prefixes)
+    {
+        prefixes = new StringList();
+        paths = new StringList();
+        dirs = new StringList();
+        String t = config->Find("local_user_urls");
+        char *p = strtok(t, " \t");
+        while (p)
+        {
+            char *path = strchr(p, '=');
+            if (!path)
+            {
+                p = strtok(0, " \t");
+                continue;
+            }
+            *path++ = '\0';
+            char *dir = strchr(path, ',');
+            if (!dir)
+            {
+                p = strtok(0, " \t");
+                continue;
+            }
+            *dir++ = '\0';
+            String *pre = new String(p);
+            decodeURL(*pre);
+            prefixes->Add(pre);
+            String *pat = new String(path);
+            decodeURL(*pat);
+            paths->Add(pat);
+            String *ptd = new String(dir);
+            decodeURL(*ptd);
+            dirs->Add(ptd);
+            p = strtok(0, " \t");
+        }
+    }
+
+    // Can we do anything about this?
+    if (!strchr(url, '~') || !prefixes->Count() || strstr(url, ".."))
+        return 0;
+
+    // Split the URL to components
+    String tmp = url;
+    char *name = strchr((char *) tmp, '~');
+    *name++ = '\0';
+    char *rest = strchr(name, '/');
+    if (!rest || (rest - name <= 1) || (rest - name > 32))
+        return 0;
+    *rest++ = '\0';
+
+    // Look it up in the prefix/path/dir table
+    prefixes->Start_Get();
+    paths->Start_Get();
+    dirs->Start_Get();
+    String *prefix, *path, *dir;
+    String *defaultdoc;
+    StringList *local_names = new StringList();
+    while ((prefix = (String *) prefixes->Get_Next()))
+    {
+        path = (String *) paths->Get_Next();
+        dir = (String *) dirs->Get_Next();
+        if (mystrcasecmp((char *) *prefix, (char *) tmp) != 0)
+            continue;
+
+        String *local = new String;
+        // No path, look up home directory
+        if (path->length() == 0)
+        {
+            String *home = (String *) home_cache[name];
+            if (!home)
+            {
+                struct passwd *passwd = getpwnam(name);
+                if (passwd)
+                {
+                    home = new String(passwd->pw_dir);
+                    home_cache.Add(name, home);
+                }
+            }
+            if (home)
+                *local += *home;
+            else
+                continue;
+        }
+        else
+        {
+            *local += *path;
+            *local += name;
+        }
+        *local += *dir;
+        *local += rest;
+        if (local->last() == '/' && defaultdocs)
+        {
+            defaultdocs->Start_Get();
+            while ((defaultdoc = (String *) defaultdocs->Get_Next()))
+            {
+                String *localdefault = new String(*local, local->length() + defaultdoc->length() + 1);
+                localdefault->append(*defaultdoc);
+                local_names->Add(localdefault);
+            }
+            delete local;
+        }
+        else
+            local_names->Add(local);
+    }
+
+    if (local_names->Count() > 0)
+        return local_names;
+
+    delete local_names;
+#endif //_MSC_VER /* _WIN32 */
+
+    return 0;
+}
+
+
+//*****************************************************************************
+// int Retriever::IsLocalURL(const String &url)
+//   Returns 1 if the given url has a (possible) local filename
+//   or 0 if it's definitely not local.
+//
+int Retriever::IsLocalURL(const String & url)
+{
+    int ret;
+
+    StringList *local_filename = GetLocal(url);
+    ret = (local_filename != 0);
+    if (local_filename)
+        delete local_filename;
+
+    return ret;
+}
+
+
+//*****************************************************************************
+// void Retriever::got_word(const char *word, int location, int heading)
+//   The location is normalized to be in the range 0 - 1000.
+//
+void Retriever::got_word(const char *word, int location, int heading)
+{
+    if (debug > 3)
+        cout << "word: " << word << '@' << location << endl;
+    if (heading >= (int) (sizeof(factor) / sizeof(factor[0])) || heading < 0)
+        heading = 0;    // Assume it's just normal text
+    if (trackWords && strlen(word) >= (unsigned int) minimumWordLength)
+    {
+        String w = word;
+        HtWordReference wordRef;
+
+        if (no_store_phrases)
+        {
+            // Add new word, or mark existing word as also being at
+            // this heading level
+            word_entry *entry;
+            if ((entry = (word_entry *) words_to_add.Find(w)) == NULL)
+            {
+                words_to_add.Add(w, new word_entry(location, factor[heading], word_context));
+            }
+            else
+            {
+                entry->flags |= factor[heading];
+            }
+        }
+        else
+        {
+            wordRef.Location(location);
+            wordRef.Flags(factor[heading]);
+            wordRef.Word(w);
+            words.Replace(WordReference::Merge(wordRef, word_context));
+        }
+
+        // Check for compound words...
+        String parts = word;
+        int added;
+        int nparts = 1;
+        do
+        {
+            added = 0;
+            char *start = parts.get();
+            char *punctp = 0, *nextp = 0, *p;
+            char punct;
+            int n;
+            while (*start)
+            {
+                p = start;
+                for (n = 0; n < nparts; n++)
+                {
+                    while (HtIsStrictWordChar((unsigned char) *p))
+                        p++;
+                    punctp = p;
+                    if (!*punctp && n + 1 < nparts)
+                        break;
+                    while (*p && !HtIsStrictWordChar((unsigned char) *p))
+                        p++;
+                    if (n == 0)
+                        nextp = p;
+                }
+                if (n < nparts)
+                    break;
+                punct = *punctp;
+                *punctp = '\0';
+                if (*start && (*p || start > parts.get()))
+                {
+                    w = start;
+                    HtStripPunctuation(w);
+                    if (w.length() >= minimumWordLength)
+                    {
+                        if (no_store_phrases)
+                        {
+                            // Add new word, or mark existing word as also being at
+                            // this heading level
+                            word_entry *entry;
+                            if ((entry = (word_entry *) words_to_add.Find(w)) == NULL)
+                            {
+                                words_to_add.Add(w, new word_entry(location, factor[heading], word_context));
+                            }
+                            else
+                            {
+                                entry->flags |= factor[heading];
+                            }
+                        }
+                        else
+                        {
+                            wordRef.Word(w);
+                            words.Replace(WordReference::Merge(wordRef, word_context));
+                        }
+                        if (debug > 3)
+                            cout << "word part: " << start << '@' << location << endl;
+                    }
+                    added++;
+                }
+                start = nextp;
+                *punctp = punct;
+            }
+            nparts++;
+        }
+        while (added > 2);
+    }
+}
+
+
+//*****************************************************************************
+// void Retriever::got_title(const char *title)
+//
+void Retriever::got_title(const char *title)
+{
+    if (debug > 1)
+        cout << "\ntitle: " << title << endl;
+    current_title = title;
+}
+
+
+//*****************************************************************************
+// void Retriever::got_author(const char *author)
+//
+void Retriever::got_author(const char *author)
+{
+    if (debug > 1)
+        cout << "\nauthor: " << author << endl;
+    current_ref->DocAuthor(author);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_time(const char *time)
+//
+void Retriever::got_time(const char *time)
+{
+    HtDateTime new_time(current_time);
+
+    if (debug > 1)
+        cout << "\ntime: " << time << endl;
+
+    //
+    // As defined by the Dublin Core, this should be YYYY-MM-DD
+    // In the future, we'll need to deal with the scheme portion
+    // in case someone picks a different format.
+    //
+    new_time.SetFTime(time, "%Y-%m-%d");
+    current_time = new_time.GetTime_t();
+
+    // If we can't convert it, current_time stays the same and we get
+    // the default--the date returned by the server...
+}
+
+//*****************************************************************************
+// void Retriever::got_anchor(const char *anchor)
+//
+void Retriever::got_anchor(const char *anchor)
+{
+    if (debug > 2)
+        cout << "anchor: " << anchor << endl;
+    current_ref->AddAnchor(anchor);
+    word_context.Anchor(word_context.Anchor() + 1);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_image(const char *src)
+//
+void Retriever::got_image(const char *src)
+{
+    URL url(src, *base);
+    const char *image = (const char *) url.get();
+
+    if (debug > 2)
+        cout << "image: " << image << endl;
+
+    if (images_seen)
+        fprintf(images_seen, "%s\n", image);
+}
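
The compound-word loop in got_word() is dense; its effect is easier to see on an example. For a token like "set-up" it also indexes "set" and "up" at the same location, and the later passes (nparts > 1) re-join adjacent parts so "set-up" is additionally indexed as "setup" once HtStripPunctuation removes the hyphen. A simplified sketch that does only the single-part split, using standard types:

#include <cctype>
#include <string>
#include <vector>

// Simplified view of the compound-word split in got_word(): a token such
// as "readme.txt" or "set-up" is also indexed as its alphanumeric parts.
// The real loop additionally emits runs of adjacent parts.
std::vector<std::string> word_parts(const std::string &token)
{
    std::vector<std::string> parts;
    std::string cur;
    for (unsigned char c : token)
    {
        if (std::isalnum(c))
            cur += (char) c;
        else if (!cur.empty())
        {
            parts.push_back(cur);
            cur.clear();
        }
    }
    if (!cur.empty())
        parts.push_back(cur);
    return parts;    // "set-up" -> {"set", "up"}
}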
+
+
+//*****************************************************************************
+// void Retriever::got_href(URL &url, const char *description, int hops)
+//
+void Retriever::got_href(URL & url, const char *description, int hops)
+{
+    DocumentRef *ref = 0;
+    Server *server = 0;
+    int valid_url_code = 0;
+
+    // Rewrite the URL (if need be) before we do anything to it.
+    url.rewrite();
+
+    if (debug > 2)
+        cout << "href: " << url.get() << " (" << description << ')' << endl;
+
+    n_links++;
+
+    if (urls_seen)
+        fprintf(urls_seen, "%s\n", (const char *) url.get());
+
+    //
+    // Check if this URL falls within the valid range of URLs.
+    //
+    valid_url_code = IsValidURL(url.get());
+    if (valid_url_code > 0)
+    {
+        //
+        // It is valid. Normalize it (resolve cnames for the server)
+        // and check again...
+        //
+        if (debug > 2)
+        {
+            cout << "resolving '" << url.get() << "'\n";
+            cout.flush();
+        }
+
+        url.normalize();
+
+        // If it is a backlink from the current document,
+        // just update that field. Writing to the database
+        // is meaningless, as it will be overwritten.
+        // Adding it as a new document may even be harmful, as
+        // that will be a duplicate. This can happen if the
+        // current document is never referenced before, as in a
+        // start_url.
+
+        if (strcmp(url.get(), current_ref->DocURL()) == 0)
+        {
+            current_ref->DocBackLinks(current_ref->DocBackLinks() + 1);
+            current_ref->AddDescription(description, words);
+        }
+        else
+        {
+            //
+            // First add it to the document database
+            //
+            ref = docs[url.get()];
+            // if ref exists we have to call AddDescription even
+            // if max_hop_count is reached
+            if (!ref && currenthopcount + hops > max_hop_count)
+                return;
+
+            if (!ref)
+            {
+                //
+                // Didn't see this one, yet. Create a new reference
+                // for it with a unique document ID
+                //
+                ref = new DocumentRef;
+                ref->DocID(docs.NextDocID());
+                ref->DocHopCount(currenthopcount + hops);
+                ref->DocURL(url.get());
+            }
+            ref->DocBackLinks(ref->DocBackLinks() + 1);    // This one!
+            ref->AddDescription(description, words);
+
+            //
+            // If the dig is restricting by hop count, perform the check here
+            // too
+            if (currenthopcount + hops > max_hop_count)
+            {
+                delete ref;
+                return;
+            }
+
+            if (ref->DocHopCount() > currenthopcount + hops)
+                ref->DocHopCount(currenthopcount + hops);
+
+            docs.Add(*ref);
+
+            //
+            // Now put it in the list of URLs to still visit.
+            //
+            if (Need2Get(url.get()))
+            {
+                if (debug > 1)
+                    cout << "\n   pushing " << url.get() << endl;
+                server = (Server *) servers[url.signature()];
+                if (!server)
+                {
+                    //
+                    // Hadn't seen this server, yet. Register it
+                    //
+                    String robotsURL = url.signature();
+                    robotsURL << "robots.txt";
+                    StringList *localRobotsFile = GetLocal(robotsURL.get());
+
+                    server = new Server(url, localRobotsFile);
+                    servers.Add(url.signature(), server);
+                    delete localRobotsFile;
+                }
+                //
+                // Let's just be sure we're not pushing an empty URL
+                //
+                if (strlen(url.get()))
+                    server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get()));
+
+                String temp = url.get();
+                visited.Add(temp, 0);
+                if (debug)
+                    cout << '+';
+            }
+            else if (debug)
+                cout << '*';
+            delete ref;
+        }
+    }
+    else
+    {
+        //
+        // Not a valid URL
+        //
+        if (debug > 1)
+            cout << "\nurl rejected: (level 1)" << url.get() << endl;
+        if (debug == 1)
+            cout << '-';
+
+        if (urls_seen)
+        {
+            fprintf(urls_seen, "%s|||||%d\n", (const char *) url.get(), valid_url_code);
+        }
+    }
+    if (debug)
+        cout.flush();
+}
+
+
+//*****************************************************************************
+// void Retriever::got_redirect(const char *new_url, DocumentRef *old_ref,
+//                              const char *referer)
+//
+void Retriever::got_redirect(const char *new_url, DocumentRef * old_ref, const char *referer)
+{
+    // First we must piece together the new URL, which may be relative
+    URL parent(old_ref->DocURL());
+    URL url(new_url, parent);
+
+    // Rewrite the URL (if need be) before we do anything to it.
+    url.rewrite();
+
+    if (debug > 2)
+        cout << "redirect: " << url.get() << endl;
+
+    n_links++;
+
+    if (urls_seen)
+        fprintf(urls_seen, "%s\n", (const char *) url.get());
+
+    //
+    // Check if this URL falls within the valid range of URLs.
+    //
+    if (IsValidURL(url.get()) > 0)
+    {
+        //
+        // It is valid. Normalize it (resolve cnames for the server)
+        // and check again...
+        //
+        if (debug > 2)
+        {
+            cout << "resolving '" << url.get() << "'\n";
+            cout.flush();
+        }
+
+        url.normalize();
+        //
+        // First add it to the document database
+        //
+        DocumentRef *ref = docs[url.get()];
+        if (!ref)
+        {
+            //
+            // Didn't see this one, yet. Create a new reference
+            // for it with a unique document ID
+            //
+            ref = new DocumentRef;
+            ref->DocID(docs.NextDocID());
+            ref->DocHopCount(currenthopcount);
+        }
+        ref->DocURL(url.get());
+
+        //
+        // Copy the descriptions of the old DocRef to this one
+        //
+        List *d = old_ref->Descriptions();
+        if (d)
+        {
+            d->Start_Get();
+            String *str;
+            while ((str = (String *) d->Get_Next()))
+            {
+                ref->AddDescription(str->get(), words);
+            }
+        }
+        if (ref->DocHopCount() > old_ref->DocHopCount())
+            ref->DocHopCount(old_ref->DocHopCount());
+
+        // Copy the number of backlinks
+        ref->DocBackLinks(old_ref->DocBackLinks());
+
+        docs.Add(*ref);
+
+        //
+        // Now put it in the list of URLs to still visit.
+        //
+        if (Need2Get(url.get()))
+        {
+            if (debug > 1)
+                cout << "   pushing " << url.get() << endl;
+            Server *server = (Server *) servers[url.signature()];
+            if (!server)
+            {
+                //
+                // Hadn't seen this server, yet. Register it
+                //
+                String robotsURL = url.signature();
+                robotsURL << "robots.txt";
+                StringList *localRobotsFile = GetLocal(robotsURL.get());
+
+                server = new Server(url, localRobotsFile);
+                servers.Add(url.signature(), server);
+                delete localRobotsFile;
+            }
+            if (!referer || strlen(referer) == 0)
+                server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get()), 0);
+            else
+                server->push(url.get(), ref->DocHopCount(), referer, IsLocalURL(url.get()), 0);
+
+            String temp = url.get();
+            visited.Add(temp, 0);
+        }
+
+        delete ref;
+    }
+}
+
+
+//*****************************************************************************
+// void Retriever::got_head(const char *head)
+//
+void Retriever::got_head(const char *head)
+{
+    if (debug > 4)
+        cout << "head: " << head << endl;
+    current_head = head;
+}
+
+//*****************************************************************************
+// void Retriever::got_meta_dsc(const char *md)
+//
+void Retriever::got_meta_dsc(const char *md)
+{
+    if (debug > 4)
+        cout << "meta description: " << md << endl;
+    current_meta_dsc = md;
+}
+
+
+//*****************************************************************************
+// void Retriever::got_meta_email(const char *e)
+//
+void Retriever::got_meta_email(const char *e)
+{
+    if (debug > 1)
+        cout << "\nmeta email: " << e << endl;
+    current_ref->DocEmail(e);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_meta_notification(const char *e)
+//
+void Retriever::got_meta_notification(const char *e)
+{
+    if (debug > 1)
+        cout << "\nmeta notification date: " << e << endl;
+    current_ref->DocNotification(e);
+}
+
+
+//*****************************************************************************
+// void Retriever::got_meta_subject(const char *e)
+//
+void Retriever::got_meta_subject(const char *e)
+{
+    if (debug > 1)
+        cout << "\nmeta subject: " << e << endl;
+    current_ref->DocSubject(e);
+}
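
got_redirect() above carries state from the old reference to the new one: the link descriptions, the smaller hop count, and the backlink count. Reduced to a plain struct (a hypothetical stand-in; DocumentRef carries much more in htdig), the transfer looks like:

#include <string>
#include <vector>

// Plain-struct sketch of the state transfer performed by got_redirect().
struct ToyRef
{
    std::string url;
    std::vector<std::string> descriptions;
    int hop_count;
    int back_links;
};

ToyRef follow_redirect(const ToyRef &old_ref, const std::string &new_url)
{
    ToyRef ref;
    ref.url = new_url;
    ref.descriptions = old_ref.descriptions;    // copy the descriptions
    ref.hop_count = old_ref.hop_count;          // target inherits the hop count
    ref.back_links = old_ref.back_links;        // and the backlink count
    return ref;
}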
+
+
+//*****************************************************************************
+// void Retriever::got_noindex()
+//
+void Retriever::got_noindex()
+{
+    if (debug > 1)
+        cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl;
+    current_ref->DocState(Reference_noindex);
+}
+
+
+//*****************************************************************************
+//
+void Retriever::recordNotFound(const String & url, const String & referer, int reason)
+{
+    const char *message = "";
+
+    switch (reason)
+    {
+    case Transport::Document_not_found:
+        message = "Not found";
+        break;
+
+    case Transport::Document_no_host:
+        message = "Unknown host or unable to contact server";
+        break;
+
+    case Transport::Document_no_port:
+        message = "Unknown host or unable to contact server (port)";
+        break;
+
+    default:
+        break;
+    }
+
+    notFound << message << ": " << url << " Ref: " << referer << '\n';
+}
+
+//*****************************************************************************
+// void Retriever::ReportStatistics(const String &name)
+//
+void Retriever::ReportStatistics(const String & name)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    cout << name << ": Run complete\n";
+    cout << name << ": " << servers.Count() << " server";
+    if (servers.Count() > 1)
+        cout << "s";
+    cout << " seen:\n";
+
+    Server *server;
+    String buffer;
+    StringList results;
+    String newname = name;
+
+    newname << ": ";
+
+    servers.Start_Get();
+    while ((server = (Server *) servers.Get_NextElement()))
+    {
+        buffer = 0;
+        server->reportStatistics(buffer, newname);
+        results.Add(buffer);
+    }
+    results.Sort();
+
+    for (int i = 0; i < results.Count(); i++)
+    {
+        cout << results[i] << "\n";
+    }
+
+    if (notFound.length() > 0)
+    {
+        cout << "\n" << name << ": Errors to take note of:\n";
+        cout << notFound;
+    }
+
+    cout << endl;
+
+    // Report HTTP connection stats
+    cout << "HTTP statistics" << endl;
+    cout << "===============" << endl;
+
+    if (config->Boolean("persistent_connections"))
+    {
+        cout << " Persistent connections    : Yes" << endl;
+
+        if (config->Boolean("head_before_get"))
+            cout << " HEAD call before GET      : Yes" << endl;
+        else
+            cout << " HEAD call before GET      : No" << endl;
+    }
+    else
+    {
+        cout << " Persistent connections    : No" << endl;
+    }
+
+    HtHTTP::ShowStatistics(cout) << endl;
+}