author     Slávek Banko <slavek.banko@axis.cz>    2021-11-05 13:28:23 +0100
committer  Slávek Banko <slavek.banko@axis.cz>    2021-11-05 13:28:23 +0100
commit     8c787c3591c1c885b91a54128835b400858c5cca (patch)
tree       eca1b776912a305c4d45b3964038278a2fae1ead /debian/htdig/htdig-3.2.0b6/htdig
parent     fe188b907cdf30dfdfe0eba9412e7f8749fec158 (diff)
DEB htdig: Added to repository.
Signed-off-by: Slávek Banko <slavek.banko@axis.cz>
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig')
24 files changed, 7440 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/.cvsignore b/debian/htdig/htdig-3.2.0b6/htdig/.cvsignore new file mode 100644 index 00000000..4de01869 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/.cvsignore @@ -0,0 +1,8 @@ +Makefile +*.lo +*.la +.purify +.pure +.deps +.libs +htdig diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Document.cc b/debian/htdig/htdig-3.2.0b6/htdig/Document.cc new file mode 100644 index 00000000..87272686 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Document.cc @@ -0,0 +1,784 @@ +// +// Document.cc +// +// Document: This class holds everything there is to know about a document. +// The actual contents of the document may or may not be present at +// all times for memory conservation reasons. +// The document can be told to retrieve its contents. This is done +// with the Retrieve call. In case the retrieval causes a +// redirect, the link is followed, but this process is done +// only once (to prevent loops.) If the redirect didn't +// work, Document_not_found is returned. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Document.cc,v 1.71 2004/05/28 13:15:14 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <signal.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <ctype.h> + +#include "Document.h" +#include "StringList.h" +#include "htdig.h" +#include "HTML.h" +#include "Plaintext.h" +#include "ExternalParser.h" +#include "lib.h" + +#include "Transport.h" +#include "HtHTTP.h" + +#ifdef HAVE_SSL_H +#include "HtHTTPSecure.h" +#endif + +#include "HtHTTPBasic.h" +#include "ExternalTransport.h" + +#include "defaults.h" + +#if 1 +typedef void (*SIGNAL_HANDLER) (...); +#else +typedef SIG_PF SIGNAL_HANDLER; +#endif + +//***************************************************************************** +// Document::Document(char *u) +// Initialize with the given url as the location for this document. +// If the max_size is given, use that for size, otherwise use the +// config value. +// +Document::Document(char *u, int max_size) +{ + url = 0; + proxy = 0; + referer = 0; + contents = 0; + transportConnect = 0; + HTTPConnect = 0; + HTTPSConnect = 0; + FileConnect = 0; + FTPConnect = 0; + NNTPConnect = 0; + externalConnect = 0; + HtConfiguration* config= HtConfiguration::config(); + + // We probably need to move assignment of max_doc_size, according + // to a server or url configuration value. The same is valid for + // max_retries. 
+ + if (max_size > 0) + max_doc_size = max_size; + else + max_doc_size = config->Value("max_doc_size"); + + if (config->Value("max_retries") > 0) + num_retries = config->Value("max_retries"); + else num_retries = 2; + + // Initialize some static variables of Transport + + Transport::SetDebugLevel(debug); + + // Initialize some static variables of Transport + // and the User Agent for every HtHTTP objects + + HtHTTP::SetParsingController(ExternalParser::canParse); + + // Set the default parser content-type string + Transport::SetDefaultParserContentType ("text/"); + + contents.allocate(max_doc_size + 100); + contentType = ""; + contentLength = -1; + if (u) + { + Url(u); + } +} + + +//***************************************************************************** +// Document::~Document() +// +Document::~Document() +{ + // We delete only the derived class objects + if (HTTPConnect) + delete HTTPConnect; + if (HTTPSConnect) + delete HTTPSConnect; + if (FileConnect) + delete FileConnect; + if (FTPConnect) + delete FTPConnect; + if (NNTPConnect) + delete NNTPConnect; + if (externalConnect) + delete externalConnect; + + if (url) + delete url; + if (proxy) + delete proxy; + if (referer) + delete referer; + +#if MEM_DEBUG + char *p = new char; + cout << "==== Document deleted: " << this << " new at " << + ((void *) p) << endl; + delete p; +#endif +} + + +//***************************************************************************** +// void Document::Reset() +// Restore the Document object to an initial state. +// We will not reset the authorization information since it can be reused. +// +void +Document::Reset() +{ + contentType = 0; + contentLength = -1; + if (url) + delete url; + url = 0; + if (referer) + delete referer; + + referer = 0; + + proxy=0; + authorization=0; + proxy_authorization=0; + contents = 0; + document_length = 0; + redirected_to = 0; + +} + + +//***************************************************************************** +// void Document::Url(const String &u) +// Set the URL for this document +// +void +Document::Url(const String &u) +{ + HtConfiguration* config= HtConfiguration::config(); + if (url) + delete url; + url = new URL(u); + + // Re-initialise the proxy + if (proxy) + delete proxy; + proxy = 0; + + // Get the proxy information for this URL + const String proxyURL = config->Find(url,"http_proxy"); + + // If http_proxy is not empty we set the proxy for the current URL + if (proxyURL.length()) + { + proxy = new URL(proxyURL); + proxy->normalize(); + // set the proxy authorization information + setProxyUsernamePassword(config->Find(url,"http_proxy_authorization")); + } + + // Set the authorization information + setUsernamePassword(config->Find(url,"authorization")); + +} + + +//***************************************************************************** +// void Document::Referer(const String &u) +// Set the Referring URL for this document +// +void +Document::Referer(const String &u) +{ + if (referer) + delete referer; + referer = new URL(u); +} + + +//***************************************************************************** +// int Document::UseProxy() +// Returns 1 if the given url is to be retrieved from the proxy server, +// or 0 if it's not. +// +int +Document::UseProxy() +{ + HtConfiguration* config= HtConfiguration::config(); + static HtRegex *excludeProxy = 0; + + // + // Initialize excludeProxy list if this is the first time. 
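The check below combines two configuration attributes: http_proxy (looked up per-URL in Document::Url() above) names the proxy server, while http_proxy_exclude lists URL patterns that are always fetched directly. A hedged configuration sketch — the host names are invented, and pattern handling follows htdig's HtRegex matching:

    http_proxy:         http://proxy.example.com:3128/
    http_proxy_exclude: http://intranet.example.com/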
+ // + if (!excludeProxy) + { + excludeProxy = new HtRegex(); + StringList l(config->Find("http_proxy_exclude"), " \t"); + excludeProxy->setEscaped(l, config->Boolean("case_sensitive")); + l.Release(); + } + + if ((proxy) && (excludeProxy->match(url->get(), 0, 0) == 0)) + return true; // if the exclude pattern is empty, use the proxy + return false; +} + + +//***************************************************************************** +// DocStatus Document::Retrieve(HtDateTime date) +// Attempt to retrieve the document pointed to by our internal URL +// +Transport::DocStatus +Document::Retrieve(Server *server, HtDateTime date) +{ + // Right now we just handle http:// service + // Soon this will include file:// + // as well as an ExternalTransport system + // eventually maybe ftp:// and a few others + + Transport::DocStatus status; + Transport_Response *response = 0; + HtDateTime *ptrdatetime = 0; + int useproxy = UseProxy(); + int NumRetries; + + transportConnect = 0; + + if (ExternalTransport::canHandle(url->service())) + { + if (externalConnect) + { + delete externalConnect; + } + externalConnect = new ExternalTransport(url->service()); + transportConnect = externalConnect; + } +#ifdef HAVE_SSL_H + else if (mystrncasecmp(url->service(), "https", 5) == 0) + { + if (!HTTPSConnect) + { + if (debug>4) + cout << "Creating an HtHTTPSecure object" << endl; + + HTTPSConnect = new HtHTTPSecure(); + + if (!HTTPSConnect) + return Transport::Document_other_error; + } + + if (HTTPSConnect) + { + // Here we must set only thing for a HTTP request + + HTTPSConnect->SetRequestURL(*url); + + // Set the user agent which can vary per server + HTTPSConnect->SetRequestUserAgent(server->UserAgent()); + + // Set the accept language which can vary per server + HTTPSConnect->SetAcceptLanguage(server->AcceptLanguage()); + + // Set the referer + if (referer) + HTTPSConnect->SetRefererURL(*referer); + + // Let's disable the cookies if we decided that in the config file + if (server->DisableCookies()) + HTTPSConnect->DisableCookies(); + else HTTPSConnect->AllowCookies(); + + // We may issue a config paramater to enable/disable them + if (server->IsPersistentConnectionAllowed()) + { + // Persistent connections allowed + HTTPSConnect->AllowPersistentConnection(); + } + else HTTPSConnect->DisablePersistentConnection(); + + // Head before Get option control + if (server->HeadBeforeGet()) + HTTPSConnect->EnableHeadBeforeGet(); + else + HTTPSConnect->DisableHeadBeforeGet(); + + // http->SetRequestMethod(HtHTTP::Method_GET); + if (debug > 2) + { + cout << "Making HTTPS request on " << url->get(); + + if (useproxy) + cout << " via proxy (" << proxy->host() << ":" << proxy->port() << ")"; + + cout << endl; + } + } + + HTTPSConnect->SetProxy(useproxy); + transportConnect = HTTPSConnect; + } +#endif + else if (mystrncasecmp(url->service(), "http", 4) == 0) + { + if (!HTTPConnect) + { + if (debug>4) + cout << "Creating an HtHTTPBasic object" << endl; + + HTTPConnect = new HtHTTPBasic(); + + if (!HTTPConnect) + return Transport::Document_other_error; + } + + if (HTTPConnect) + { + // Here we must set only thing for a HTTP request + + HTTPConnect->SetRequestURL(*url); + + // Set the user agent which can vary per server + HTTPConnect->SetRequestUserAgent(server->UserAgent()); + + // Set the accept language which can vary per server + HTTPConnect->SetAcceptLanguage(server->AcceptLanguage()); + + // Set the referer + if (referer) + HTTPConnect->SetRefererURL(*referer); + + // Let's disable the cookies if we decided that in the 
config file + if (server->DisableCookies()) + HTTPConnect->DisableCookies(); + else HTTPConnect->AllowCookies(); + + // We may issue a config paramater to enable/disable them + if (server->IsPersistentConnectionAllowed()) + { + // Persistent connections allowed + HTTPConnect->AllowPersistentConnection(); + } + else HTTPConnect->DisablePersistentConnection(); + + // Head before Get option control + if (server->HeadBeforeGet()) + HTTPConnect->EnableHeadBeforeGet(); + else + HTTPConnect->DisableHeadBeforeGet(); + + // http->SetRequestMethod(HtHTTP::Method_GET); + if (debug > 2) + { + cout << "Making HTTP request on " << url->get(); + + if (useproxy) + cout << " via proxy (" << proxy->host() << ":" << proxy->port() << ")"; + + cout << endl; + } + } + + HTTPConnect->SetProxy(useproxy); + transportConnect = HTTPConnect; + } + else if (mystrncasecmp(url->service(), "file", 4) == 0) + { + if (!FileConnect) + { + if (debug>4) + cout << "Creating an HtFile object" << endl; + + FileConnect = new HtFile(); + + if (!FileConnect) + return Transport::Document_other_error; + } + + if (FileConnect) + { + // Here we must set only thing for a file request + + FileConnect->SetRequestURL(*url); + + // Set the referer + if (referer) + FileConnect->SetRefererURL(*referer); + + if (debug > 2) + cout << "Making 'file' request on " << url->get() << endl; + } + + transportConnect = FileConnect; + } + else if (mystrncasecmp(url->service(), "ftp", 3) == 0) + { + // the following FTP handling is modeled very closely on + // the prior 'file'-protocol handling, so beware of bugs + + if (!FTPConnect) + { + if (debug>4) + cout << "Creating an HtFTP object" << endl; + + FTPConnect = new HtFTP(); + + if (!FTPConnect) + return Transport::Document_other_error; + } + if (FTPConnect) + { + // Here we must set only thing for a FTP request + + FTPConnect->SetRequestURL(*url); + //////////////////////////////////////////////////// + /// + /// stuff may be missing here or in need of change + /// + /////////////////////////////////////////////////// + + // Set the referer + if (referer) + FTPConnect->SetRefererURL(*referer); + + if (debug > 2) + cout << "Making 'ftp' request on " << url->get() << endl; + } + + transportConnect = FTPConnect; + } // end of else if (mystrncasecmp(url->service(), "ftp", 3) == 0) + + else if (mystrncasecmp(url->service(), "news", 4) == 0) + { + if (!NNTPConnect) + { + if (debug>4) + cout << "Creating an HtNNTP object" << endl; + + NNTPConnect = new HtNNTP(); + + if (!NNTPConnect) + return Transport::Document_other_error; + } + + if (NNTPConnect) + { + // Here we got an Usenet document request + + NNTPConnect->SetRequestURL(*url); + + if (debug > 2) + cout << "Making 'NNTP' request on " << url->get() << endl; + } + + transportConnect = NNTPConnect; + } + else + { + if (debug) + { + cout << '"' << url->service() << + "\" not a recognized transport service. Ignoring\n"; + } + + return Transport::Document_not_recognized_service; + } + + // Is a transport object pointer available? + + if (transportConnect) + { + // Set all the appropriate parameters + if (useproxy) + { + transportConnect->SetConnection(proxy); + if (proxy_authorization.length()) + transportConnect->SetProxyCredentials(proxy_authorization); + } + else + transportConnect->SetConnection(url); + + // OK. Let's set the connection time out + transportConnect->SetTimeOut(server->TimeOut()); + + // Let's set number of retries for a failed connection attempt + transportConnect->SetRetry(server->TcpMaxRetries()); + + // ... 
And the wait time after a failure + transportConnect->SetWaitTime(server->TcpWaitTime()); + + // OK. Let's set the maximum size of a document to be retrieved + transportConnect->SetRequestMaxDocumentSize(max_doc_size); + + // Let's set the credentials + transportConnect->SetCredentials(authorization); + + // Let's set the modification time (in order not to retrieve a + // document we already have) + transportConnect->SetRequestModificationTime(date); + + // Make the request + // Here is the main operation ... Let's make the request !!! + // We now perform a loop until we want to retry the request + + NumRetries = 0; + + do + { + status = transportConnect->Request(); + + if (NumRetries++) + if(debug>0) + cout << "."; + + } while (ShouldWeRetry(status) && NumRetries < num_retries); + + + // Let's get out the info we need + response = transportConnect->GetResponse(); + + if (response) + { + // We got the response + + contents = response->GetContents(); + contentType = response->GetContentType(); + contentLength = response->GetContentLength(); + ptrdatetime = response->GetModificationTime(); + document_length = response->GetDocumentLength(); + + // This test is ugly! Can whoever put it here explain why it's + // needed? Why would GetLocation() ever return a non-empty string + // from a Transport subclass that's not supposed to redirect? + if (transportConnect == HTTPConnect || transportConnect == HTTPSConnect || transportConnect == externalConnect) + redirected_to = ((HtHTTP_Response *)response)->GetLocation(); + + if (ptrdatetime) + { + // We got the modification date/time + modtime = *ptrdatetime; + } + + // How to manage it when there's no modification date/time? + + if (debug > 5) + { + cout << "Contents:\n" << contents << endl; + cout << "Content Type: " << contentType << endl; + cout << "Content Length: " << contentLength << endl; + cout << "Modification Time: " << modtime.GetISO8601() << endl; + } + } + + return status; + + } + else + return Transport::Document_not_found; +} + +//***************************************************************************** +// DocStatus Document::RetrieveLocal(HtDateTime date, StringList *filenames) +// Attempt to retrieve the document pointed to by our internal URL +// using a list of potential local filenames given. Returns Document_ok, +// Document_not_changed or Document_not_local (in which case the +// retriever tries it again using the standard retrieve method). +// +Transport::DocStatus +Document::RetrieveLocal(HtDateTime date, StringList *filenames) +{ + HtConfiguration* config= HtConfiguration::config(); + struct stat stat_buf; + String *filename; + + filenames->Start_Get(); + + // Loop through list of potential filenames until the list is exhausted + // or a suitable file is found to exist as a regular file. 
+ while ((filename = (String *)filenames->Get_Next()) && + ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode))) + if (debug > 1) + cout << " tried local file " << *filename << endl; + + if (!filename) + return Transport::Document_not_local; + + if (debug > 1) + cout << " found existing file " << *filename << endl; + + modtime = stat_buf.st_mtime; + if (modtime <= date) + return Transport::Document_not_changed; + + char *ext = strrchr((char*)*filename, '.'); + if (ext == NULL) + return Transport::Document_not_local; + const String *type = HtFile::Ext2Mime (ext + 1); + + static Dictionary *bad_local_ext = 0; + if (!bad_local_ext) + { + // A list of bad extensions, separated by spaces or tabs + bad_local_ext = new Dictionary; + String t = config->Find("bad_local_extensions"); + String lowerp; + char *p = strtok(t, " \t"); + while (p) + { + // Extensions are case insensitive + lowerp = p; + lowerp.lowercase(); + bad_local_ext->Add(lowerp, 0); + p = strtok(0, " \t"); + } + } + if (type == NULL || bad_local_ext->Exists(ext)) + { + if (debug > 1 && type != NULL) + cout << "\nBad local extension: " << *filename << endl; + return Transport::Document_not_local; + } + else + contentType = *type; + + // Open it + FILE *f = fopen((char*)*filename, "r"); + if (f == NULL) + return Transport::Document_not_local; + + // + // Read in the document itself + // + max_doc_size = config->Value(url,"max_doc_size"); + contents = 0; + char docBuffer[8192]; + int bytesRead; + + while ((bytesRead = fread(docBuffer, 1, sizeof(docBuffer), f)) > 0) + { + if (debug > 2) + cout << "Read " << bytesRead << " from document\n"; + if (contents.length() + bytesRead > max_doc_size) + bytesRead = max_doc_size - contents.length(); + contents.append(docBuffer, bytesRead); + if (contents.length() >= max_doc_size) + break; + } + fclose(f); + document_length = contents.length(); + contentLength = stat_buf.st_size; + + if (debug > 2) + cout << "Read a total of " << document_length << " bytes\n"; + + if (document_length < contentLength) + document_length = contentLength; + return Transport::Document_ok; +} + + +//***************************************************************************** +// Parsable *Document::getParsable() +// Given the content-type of a document, returns a document parser. +// This will first look through the list of user supplied parsers and +// then at our (limited) builtin list of parsers. The user supplied +// parsers are external programs that will be used. +// +Parsable * +Document::getParsable() +{ + static HTML *html = 0; + static Plaintext *plaintext = 0; + static ExternalParser *externalParser = 0; + + Parsable *parsable = 0; + + if (ExternalParser::canParse(contentType)) + { + if (externalParser) + { + delete externalParser; + } + externalParser = new ExternalParser(contentType); + parsable = externalParser; + } + else if (mystrncasecmp((char*)contentType, "text/html", 9) == 0) + { + if (!html) + html = new HTML(); + parsable = html; + } + else if (mystrncasecmp((char*)contentType, "text/plain", 10) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + } + else if (mystrncasecmp((char *)contentType, "text/css", 8) == 0) + { + return NULL; + } + else if (mystrncasecmp((char *)contentType, "text/", 5) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + if (debug > 1) + { + cout << '"' << contentType << + "\" not a recognized type. 
Assuming text/plain\n"; + } + } + else + { + if (debug > 1) + { + cout << '"' << contentType << + "\" not a recognized type. Ignoring\n"; + } + return NULL; + } + + parsable->setContents(contents.get(), contents.length()); + return parsable; +} + + +int Document::ShouldWeRetry(Transport::DocStatus DocumentStatus) +{ + + if (DocumentStatus == Transport::Document_connection_down) + return 1; + + if (DocumentStatus == Transport::Document_no_connection) + return 1; + + if (DocumentStatus == Transport::Document_no_header) + return 1; + + return 0; +} diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Document.h b/debian/htdig/htdig-3.2.0b6/htdig/Document.h new file mode 100644 index 00000000..215897c4 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Document.h @@ -0,0 +1,138 @@ +// +// Document.h +// +// Document: This class holds everything there is to know about a document. +// The actual contents of the document may or may not be present at +// all times for memory conservation reasons. +// The document can be told to retrieve its contents. This is done +// with the Retrieve call. In case the retrieval causes a +// redirect, the link is followed, but this process is done +// only once (to prevent loops.) If the redirect didn't +// work, Document_not_found is returned. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Document.h,v 1.19 2004/05/28 13:15:14 lha Exp $ +// +// +#ifndef _Document_h_ +#define _Document_h_ + +#include "Parsable.h" +#include "Object.h" +#include "URL.h" +#include "htString.h" +#include "StringList.h" +#include "Transport.h" +#include "HtHTTP.h" +#include "HtFile.h" +#include "HtFTP.h" +#include "HtNNTP.h" +#include "ExternalTransport.h" +#include "Server.h" + + +class Connection; + + +class Document : public Object +{ +public: + // + // Construction/Destruction + // + Document(char *url = 0, int max_size = 0); + ~Document(); + + // + // Interface to the document. + // + void Reset(); + int Length() {return document_length;} + int ContentLength() {return contentLength;} + int StoredLength() {return contents.length();} + char *Contents() {return contents;} + void Contents(char *s) {contents = s; document_length = contents.length();} + char *ContentType() {return contentType.get();} + + // + // In case the retrieval process went through a redirect process, + // the new url can be gotten using the following call + // + char *Redirected() {return redirected_to;} + URL *Url() {return url;} + void Url(const String &url); + void Referer(const String &url); + time_t ModTime() {return modtime.GetTime_t();} + + Transport::DocStatus Retrieve(Server *server, HtDateTime date); + Transport::DocStatus RetrieveLocal(HtDateTime date, StringList *filenames); + + // + // Return an appropriate parsable object for the document type. 
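A minimal usage sketch of the retrieval interface declared in this header: point a Document at a URL, retrieve it through a Server, then hand the contents to whichever parser getParsable() selects. The server, retriever and modification-date objects are assumed to be set up elsewhere (the htdig Retriever does this); the URLs are illustrative.

    Document doc;
    doc.Url("http://www.htdig.org/");                 // set the target URL
    doc.Referer("http://www.htdig.org/index.html");   // optional referring page
    Transport::DocStatus status = doc.Retrieve(server, last_modification);
    if (status == Transport::Document_ok)
    {
        Parsable *parsable = doc.getParsable();       // HTML, plain text or external
        if (parsable)
            parsable->parse(retriever, *doc.Url());   // feed words/links to the Retriever
    }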
+ // + Parsable *getParsable(); + + // + // Set the username and password to be used in any requests + // + void setUsernamePassword(const String& credentials) + { authorization = credentials;} + + void setProxyUsernamePassword(const String& credentials) + { proxy_authorization = credentials;} + + HtHTTP *GetHTTPHandler() const { return HTTPConnect; } + +private: + enum + { + Header_ok, + Header_not_found, + Header_not_changed, + Header_redirect, + Header_not_text, + Header_not_authorized + }; + + URL *url; + URL *proxy; + URL *referer; + String contents; + String redirected_to; + String contentType; + String authorization; + String proxy_authorization; + int contentLength; + int document_length; + HtDateTime modtime; + int max_doc_size; + int num_retries; + + int UseProxy(); + + Transport *transportConnect; + HtHTTP *HTTPConnect; + HtHTTP *HTTPSConnect; + HtFile *FileConnect; + HtFTP *FTPConnect; + HtNNTP *NNTPConnect; + ExternalTransport *externalConnect; + + + /////// + // Tell us if we should retry to retrieve an URL depending on + // the first returned document status + /////// + + int ShouldWeRetry(Transport::DocStatus DocumentStatus); + +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc new file mode 100644 index 00000000..d967ba0b --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc @@ -0,0 +1,614 @@ +// +// ExternalParser.cc +// +// ExternalParser: Implementation of ExternalParser +// Allows external programs to parse unknown document formats. +// The parser is expected to return the document in a +// specific format. The format is documented +// in http://www.htdig.org/attrs.html#external_parser +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ExternalParser.cc,v 1.29 2004/05/28 13:15:14 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "ExternalParser.h" +#include "HTML.h" +#include "Plaintext.h" +#include "htdig.h" +#include "htString.h" +#include "QuotedStringList.h" +#include "URL.h" +#include "Dictionary.h" +#include "good_strtok.h" + +#include <ctype.h> +#include <stdio.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +#include <stdlib.h> +#ifdef HAVE_WAIT_H +#include <wait.h> +#elif HAVE_SYS_WAIT_H +#include <sys/wait.h> +#endif + +#ifdef _MSC_VER /* _WIN32 */ +#include <process.h> +#endif + + +#include "defaults.h" + +static Dictionary *parsers = 0; +static Dictionary *toTypes = 0; +extern String configFile; + +//***************************************************************************** +// ExternalParser::ExternalParser(char *contentType) +// +ExternalParser::ExternalParser(char *contentType) +{ + String mime; + int sep; + + if (canParse(contentType)) + { + String mime = contentType; + mime.lowercase(); + sep = mime.indexOf(';'); + if (sep != -1) + mime = mime.sub(0, sep).get(); + + currentParser = ((String *)parsers->Find(mime))->get(); + } + ExternalParser::contentType = contentType; +} + + +//***************************************************************************** +// ExternalParser::~ExternalParser() +// +ExternalParser::~ExternalParser() +{ +} + + +//***************************************************************************** +// int 
ExternalParser::readLine(FILE *in, String &line) +// +int +ExternalParser::readLine(FILE *in, String &line) +{ + char buffer[2048]; + int length; + + line = 0; // read(in, buffer, sizeof(buffer) + while (fgets(buffer, sizeof(buffer), in)) + { + length = strlen(buffer); + if (buffer[length - 1] == '\n') + { + // + // A full line has been read. Return it. + // + line << buffer; + line.chop('\n'); + return 1; + } + else + { + // + // Only a partial line was read. Append it to the line + // and read some more. + // + line << buffer; + } + } + return line.length() > 0; +} + + +//***************************************************************************** +// int ExternalParser::canParse(char *contentType) +// +int +ExternalParser::canParse(char *contentType) +{ + HtConfiguration* config= HtConfiguration::config(); + int sep; + + if (!parsers) + { + parsers = new Dictionary(); + toTypes = new Dictionary(); + + QuotedStringList qsl(config->Find("external_parsers"), " \t"); + String from, to; + int i; + + for (i = 0; qsl[i]; i += 2) + { + from = qsl[i]; + to = ""; + sep = from.indexOf("->"); + if (sep != -1) + { + to = from.sub(sep+2).get(); + from = from.sub(0, sep).get(); + } + from.lowercase(); + sep = from.indexOf(';'); + if (sep != -1) + from = from.sub(0, sep).get(); + + parsers->Add(from, new String(qsl[i + 1])); + toTypes->Add(from, new String(to)); + } + } + + String mime = contentType; + mime.lowercase(); + sep = mime.indexOf(';'); + if (sep != -1) + mime = mime.sub(0, sep).get(); + return parsers->Exists(mime); +} + +//***************************************************************************** +// void ExternalParser::parse(Retriever &retriever, URL &base) +// +void +ExternalParser::parse(Retriever &retriever, URL &base) +{ +// NEAL - ENABLE/REWRITE THIS ASAP FOR WIN32 +#ifndef _MSC_VER /* _WIN32 */ + HtConfiguration* config= HtConfiguration::config(); + if (contents == 0 || contents->length() == 0 || + currentParser.length() == 0) + { + return; + } + + // + // Write the contents to a temporary file. + // + String path = getenv("TMPDIR"); + int fd; + if (path.length() == 0) + path = "/tmp"; +#ifndef HAVE_MKSTEMP + path << "/htdext." << getpid(); // This is unfortunately predictable + +#ifdef O_BINARY + fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL|O_BINARY); +#else + fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL); +#endif +#else + path << "/htdex.XXXXXX"; + fd = mkstemp((char*)path); + // can we force binary mode somehow under Cygwin, if it has mkstemp? 
+#endif + if (fd < 0) + { + if (debug) + cout << "External parser error: Can't create temp file " + << (char *)path << endl; + return; + } + + write(fd, contents->get(), contents->length()); + close(fd); + +// unsigned int minimum_word_length = config->Value("minimum_word_length", 3); + String line; + char *token1, *token2, *token3; + int loc = 0, hd = 0; + URL url; + String mime = contentType; + mime.lowercase(); + int sep = mime.indexOf(';'); + if (sep != -1) + mime = mime.sub(0, sep).get(); + String convertToType = ((String *)toTypes->Find(mime))->get(); + int get_hdr = (convertToType.nocase_compare("user-defined") == 0); + int get_file = (convertToType.length() != 0); + String newcontent; + + StringList cpargs(currentParser); + char **parsargs = new char * [cpargs.Count() + 5]; + int argi; + for (argi = 0; argi < cpargs.Count(); argi++) + parsargs[argi] = (char *)cpargs[argi]; + parsargs[argi++] = path.get(); + parsargs[argi++] = contentType.get(); + parsargs[argi++] = (char *)base.get().get(); + parsargs[argi++] = configFile.get(); + parsargs[argi++] = 0; + + int stdout_pipe[2]; + int fork_result = -1; + int fork_try; + + if (pipe(stdout_pipe) == -1) + { + if (debug) + cout << "External parser error: Can't create pipe!" << endl; + unlink((char*)path); + delete [] parsargs; + return; + } + + for (fork_try = 4; --fork_try >= 0;) + { + fork_result = fork(); // Fork so we can execute in the child process + if (fork_result != -1) + break; + if (fork_try) + sleep(3); + } + if (fork_result == -1) + { + if (debug) + cout << "Fork Failure in ExternalParser" << endl; + unlink((char*)path); + delete [] parsargs; + return; + } + + if (fork_result == 0) // Child process + { + close(STDOUT_FILENO); // Close handle STDOUT to replace with pipe + dup(stdout_pipe[1]); + close(stdout_pipe[0]); + close(stdout_pipe[1]); + close(STDIN_FILENO); // Close STDIN to replace with file + open((char*)path, O_RDONLY); + + // Call External Parser + execv(parsargs[0], parsargs); + + exit(EXIT_FAILURE); + } + + // Parent Process + delete [] parsargs; + close(stdout_pipe[1]); // Close STDOUT for writing +#ifdef O_BINARY + FILE *input = fdopen(stdout_pipe[0], "rb"); +#else + FILE *input = fdopen(stdout_pipe[0], "r"); +#endif + if (input == NULL) + { + if (debug) + cout << "Fdopen Failure in ExternalParser" << endl; + unlink((char*)path); + return; + } + + while ((!get_file || get_hdr) && readLine(input, line)) + { + if (get_hdr) + { + line.chop('\r'); + if (line.length() == 0) + get_hdr = false; + else if (mystrncasecmp((char*)line, "content-type:", 13) == 0) + { + token1 = line.get() + 13; + while (*token1 && isspace(*token1)) + token1++; + token1 = strtok(token1, "\n\t"); + convertToType = token1; + } + continue; + } +#ifdef O_BINARY + line.chop('\r'); +#endif + token1 = strtok(line, "\t"); + if (token1 == NULL) + token1 = ""; + token2 = NULL; + token3 = NULL; + switch (*token1) + { + case 'w': // word + token1 = strtok(0, "\t"); + if (token1 != NULL) + token2 = strtok(0, "\t"); + if (token2 != NULL) + token3 = strtok(0, "\t"); + if (token1 != NULL && token2 != NULL && token3 != NULL && + (loc = atoi(token2)) >= 0 && + (hd = atoi(token3)) >= 0 && hd < 12) + retriever.got_word(token1, loc, hd); + else + cerr<< "External parser error: expected word in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 'u': // href + token1 = strtok(0, "\t"); + if (token1 != NULL) + token2 = strtok(0, "\t"); + if (token1 != NULL && token2 != NULL) + { + url.parse(token1); + url.hopcount(base.hopcount() + 1); + 
retriever.got_href(url, token2); + } + else + cerr<< "External parser error: expected URL in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 't': // title + token1 = strtok(0, "\t"); + if (token1 != NULL) + retriever.got_title(token1); + else + cerr<< "External parser error: expected title in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 'h': // head + token1 = strtok(0, "\t"); + if (token1 != NULL) + retriever.got_head(token1); + else + cerr<< "External parser error: expected text in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 'a': // anchor + token1 = strtok(0, "\t"); + if (token1 != NULL) + retriever.got_anchor(token1); + else + cerr<< "External parser error: expected anchor in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 'i': // image url + token1 = strtok(0, "\t"); + if (token1 != NULL) + retriever.got_image(token1); + else + cerr<< "External parser error: expected image URL in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + + case 'm': // meta + { + // Using good_strtok means we can accept empty + // fields. + char *httpEquiv = good_strtok(token1+2, '\t'); + char *name = good_strtok(0, '\t'); + char *content = good_strtok(0, '\t'); + + if (httpEquiv != NULL && name != NULL && content != NULL) + { + // It would be preferable if we could share + // this part with HTML.cc, but it has other + // chores too, and I do not see a point where to + // split it up to get a common shared function + // (or class). This should not stop anybody from + // finding a better solution. + // For now, there is duplicated code. + static StringMatch *keywordsMatch = 0; + if (!keywordsMatch) + { + StringList kn(config->Find("keywords_meta_tag_names"), " \t"); + keywordsMatch = new StringMatch(); + keywordsMatch->IgnoreCase(); + keywordsMatch->Pattern(kn.Join('|')); + } + static StringMatch *descriptionMatch = 0; + if (!descriptionMatch) + { + StringList dn(config->Find("description_meta_tag_names"), " \t"); + descriptionMatch = new StringMatch(); + descriptionMatch->IgnoreCase(); + descriptionMatch->Pattern(dn.Join('|')); + } + static StringMatch *metadatetags = 0; + if (!metadatetags) + { + metadatetags = new StringMatch(); + metadatetags->IgnoreCase(); + metadatetags->Pattern("date|dc.date|dc.date.created|dc.data.modified"); + } + + // <URL:http://www.w3.org/MarkUp/html-spec/html-spec_5.html#SEC5.2.5> + // says that the "name" attribute defaults to + // the http-equiv attribute if empty. + if (*name == '\0') + name = httpEquiv; + + if (*httpEquiv != '\0') + { + // <META HTTP-EQUIV=REFRESH case + if (mystrcasecmp(httpEquiv, "refresh") == 0 + && *content != '\0') + { + char *q = (char*)mystrcasestr(content, "url"); + if (q && *q) + { + q += 3; // skiping "URL" + while (*q && ((*q == '=') || isspace(*q))) q++; + char *qq = q; + while (*qq && (*qq != ';') && (*qq != '"') && + !isspace(*qq))qq++; + *qq = 0; + URL href(q, base); + // I don't know why anyone would do this, but hey... + retriever.got_href(href, ""); + } + } + } + + // + // Now check for <meta name=... content=...> tags that + // fly with any reasonable DTD out there + // + if (*name != '\0' && *content != '\0') + { + if (keywordsMatch->CompareWord(name)) + { + int wordindex = 1; + addKeywordString (retriever, content, wordindex); +// // can this be merged with Parser::addKeywordString ? 
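The switch above defines the external parser protocol: the parser writes tab-separated records to its stdout, one per line, with the first field selecting the record type. A minimal sketch of a conforming parser — every field value here is invented for illustration:

    #include <cstdio>

    int main()
    {
        // w <TAB> word <TAB> location <TAB> heading level (0 <= level < 12)
        printf("w\thtdig\t0\t0\n");
        // u <TAB> URL <TAB> anchor text
        printf("u\thttp://www.htdig.org/\tht://Dig home\n");
        // t <TAB> document title
        printf("t\tSample document\n");
        // h <TAB> text for the document head (used for excerpts)
        printf("h\tA short excerpt of the document.\n");
        // m <TAB> http-equiv <TAB> name <TAB> content (empty fields are allowed)
        printf("m\t\tkeywords\tsearch indexing\n");
        return 0;
    }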
+// char *w = strtok(content, " ,\t\r"); +// while (w) +// { +// if (strlen(w) >= minimum_word_length) +// retriever.got_word(w, 1, 9); +// w = strtok(0, " ,\t\r"); +// } + } + if (metadatetags->CompareWord(name) && + config->Boolean("use_doc_date", 0)) + { + retriever.got_time(content); + } + else if (mystrcasecmp(name, "author") == 0) + { + int wordindex = 1; + retriever.got_author(content); + addString (retriever, content, wordindex, 11); + } + else if (mystrcasecmp(name, "htdig-email") == 0) + { + retriever.got_meta_email(content); + } + else if (mystrcasecmp(name, "htdig-notification-date") == 0) + { + retriever.got_meta_notification(content); + } + else if (mystrcasecmp(name, "htdig-email-subject") == 0) + { + retriever.got_meta_subject(content); + } + else if (descriptionMatch->CompareWord(name) + && strlen(content) != 0) + { + // + // We need to do two things. First grab the description + // + String meta_dsc = content; + + if (meta_dsc.length() > max_meta_description_length) + meta_dsc = meta_dsc.sub(0, max_meta_description_length).get(); + if (debug > 1) + cout << "META Description: " << content << endl; + retriever.got_meta_dsc((char*)meta_dsc); + + // + // Now add the words to the word list + // (slot 10 is the new slot for this) + // + int wordindex = 1; + addString (retriever, content, wordindex, 10); +// // can this be merged with Parser::addString ? +// char *w = strtok(content, " \t\r"); +// while (w) +// { +// if (strlen(w) >= minimum_word_length) +// retriever.got_word(w, 1, 10); +// w = strtok(0, " \t\r"); +// } + } + } + } + else + cerr<< "External parser error: expected metadata in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + } + + default: + cerr<< "External parser error: unknown field in line "<<line<<"\n" << " URL: " << base.get() << "\n"; + break; + } + } // while(readLine) + if (get_file) + { + if (!canParse(convertToType) && + mystrncasecmp((char*)convertToType, "text/", 5) != 0) + { + if (mystrcasecmp((char*)convertToType, "user-defined") == 0) + cerr << "External parser error: no Content-Type given\n"; + else + cerr << "External parser error: can't parse Content-Type \"" + << convertToType << "\"\n"; + cerr << " URL: " << base.get() << "\n"; + } + else + { + char buffer[2048]; + int length; + int nbytes = config->Value("max_doc_size"); + while (nbytes > 0 && + (length = fread(buffer, 1, sizeof(buffer), input)) > 0) + { + nbytes -= length; + if (nbytes < 0) + length += nbytes; + newcontent.append(buffer, length); + } + } + } + fclose(input); + // close(stdout_pipe[0]); // This is closed for us by the fclose() + int rpid, status; + while ((rpid = wait(&status)) != fork_result && rpid != -1) + ; + unlink((char*)path); + + if (newcontent.length() > 0) + { + static HTML *html = 0; + static Plaintext *plaintext = 0; + Parsable *parsable = 0; + + contentType = convertToType; + if (canParse(contentType)) + { + currentParser = ((String *)parsers->Find(contentType))->get(); + parsable = this; + } + else if (mystrncasecmp((char*)contentType, "text/html", 9) == 0) + { + if (!html) + html = new HTML(); + parsable = html; + } + else if (mystrncasecmp((char*)contentType, "text/plain", 10) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + } + else + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + if (debug) + cout << "External parser error: \"" << contentType << + "\" not a recognized type. 
Assuming text/plain\n"; + } + parsable->setContents(newcontent.get(), newcontent.length()); + parsable->parse(retriever, base); + } +#endif //ifndef _MSC_VER /* _WIN32 */ +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.h b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.h new file mode 100644 index 00000000..4c7579a1 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.h @@ -0,0 +1,58 @@ +// +// ExternalParser.h +// +// ExternalParser: Allows external programs to parse unknown document formats. +// The parser is expected to return the document in a +// specific format. The format is documented +// in http://www.htdig.org/attrs.html#external_parser +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ExternalParser.h,v 1.8 2004/05/28 13:15:14 lha Exp $ +// + +#ifndef _ExternalParser_h_ +#define _ExternalParser_h_ + +#include "Parsable.h" +#include "htString.h" + +#include <stdio.h> + +class URL; + + +class ExternalParser : public Parsable +{ +public: + // + // Construction/Destruction + // + ExternalParser(char *contentType); + virtual ~ExternalParser(); + + // + // Main parser interface. + // + virtual void parse(Retriever &retriever, URL &); + + // + // Check if the given contentType has an external parser associated + // with it + // + static int canParse(char *contentType); + +private: + String currentParser; + String contentType; + + int readLine(FILE *, String &); +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.cc b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.cc new file mode 100644 index 00000000..c418e62c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.cc @@ -0,0 +1,376 @@ +// +// ExternalTransport.cc +// +// ExternalTransport: Allows external programs to retrieve given URLs with +// unknown protocols. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ExternalTransport.cc,v 1.9 2004/05/28 13:15:14 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "ExternalTransport.h" +#include "htdig.h" +#include "QuotedStringList.h" +#include "URL.h" +#include "Dictionary.h" +#include "good_strtok.h" + +#include <ctype.h> +#include <stdio.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +#include <stdlib.h> +#ifdef HAVE_WAIT_H +#include <wait.h> +#elif HAVE_SYS_WAIT_H +#include <sys/wait.h> +#endif + +#include "defaults.h" + +static Dictionary *handlers = 0; +static Dictionary *toTypes = 0; +extern String configFile; + +//***************************************************************************** +// ExternalTransport::ExternalTransport(char *protocol) +// +ExternalTransport::ExternalTransport(const String &protocol) +{ + if (canHandle(protocol)) + { + _Handler = ((String *)handlers->Find(protocol))->get(); + } + ExternalTransport::_Protocol = protocol; + _Response = new ExternalTransport_Response; +} + + +//***************************************************************************** +// ExternalTransport::~ExternalTransport() +// +ExternalTransport::~ExternalTransport() +{ + if (_Response) + { + delete _Response; + } +} + + +//***************************************************************************** +// int ExternalTransport::canHandle(const String &protocol) +// +int +ExternalTransport::canHandle(const String &protocol) +{ + HtConfiguration* config= HtConfiguration::config(); + if (!handlers) + { + handlers = new Dictionary(); + toTypes = new Dictionary(); + + QuotedStringList qsl(config->Find("external_protocols"), " \t"); + String from, to; + int i; + int sep; + + for (i = 0; qsl[i]; i += 2) + { + from = qsl[i]; + to = ""; + sep = from.indexOf("->"); + if (sep != -1) + { + to = from.sub(sep+2).get(); + from = from.sub(0, sep).get(); + } + + // Recognise service specified as "https://" rather than "https" + sep = from.indexOf(":"); + if (sep != -1) + from = from.sub(0, sep).get(); + + handlers->Add(from, new String(qsl[i + 1])); + toTypes->Add(from, new String(to)); + } + } + return handlers->Exists(protocol); +} + + +//***************************************************************************** +// void ExternalTransport::SetConnection(URL *u) +// +void ExternalTransport::SetConnection (URL *u) +{ + // Grab the actual URL to pass to the handler + _URL = *u; + + // OK, now call the parent method to make sure everything else is set up. 
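canHandle() above fills the handler table from the external_protocols configuration attribute: whitespace-separated pairs of a protocol and a handler program, where the protocol may be written with or without a trailing "://" and may carry an optional "->type" suffix. A hedged sketch of such an entry — the protocol name and handler path are invented:

    external_protocols: myproto:// /usr/local/bin/handle-myproto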
+ Transport::SetConnection (u->host(), u->port()); +} + + +//***************************************************************************** +// DocStatus ExternalTransport::Request() +// +Transport::DocStatus ExternalTransport::Request() +{ +// NEAL - ENABLE/REWRITE THIS ASAP FOR WIN32 +#ifndef _MSC_VER /* _WIN32 */ + // + // Start the external handler, passing the protocol, URL and config file + // as command arguments + // + StringList hargs(_Handler); + char **handlargs = new char * [hargs.Count() + 5]; + int argi; + for (argi = 0; argi < hargs.Count(); argi++) + handlargs[argi] = (char *)hargs[argi]; + handlargs[argi++] = _Protocol.get(); + handlargs[argi++] = (char *)_URL.get().get(); + handlargs[argi++] = configFile.get(); + handlargs[argi++] = 0; + + int stdout_pipe[2]; + int fork_result = -1; + int fork_try; + + if (pipe(stdout_pipe) == -1) + { + if (debug) + cerr << "External transport error: Can't create pipe!" << endl; + delete [] handlargs; + return GetDocumentStatus(_Response); + } + + for (fork_try = 4; --fork_try >= 0;) + { + fork_result = fork(); // Fork so we can execute in the child process + if (fork_result != -1) + break; + if (fork_try) + sleep(3); + } + if (fork_result == -1) + { + if (debug) + cerr << "Fork Failure in ExternalTransport" << endl; + delete [] handlargs; + return GetDocumentStatus(_Response); + } + + if (fork_result == 0) // Child process + { + close(STDOUT_FILENO); // Close handle STDOUT to replace with pipe + dup(stdout_pipe[1]); + close(stdout_pipe[0]); + close(stdout_pipe[1]); + // not really necessary, and may pose Cygwin incompatibility... + //close(STDIN_FILENO); // Close STDIN to replace with null dev. + //open("/dev/null", O_RDONLY); + + // Call External Transport Handler + execv(handlargs[0], handlargs); + + exit(EXIT_FAILURE); + } + + // Parent Process + delete [] handlargs; + close(stdout_pipe[1]); // Close STDOUT for writing + FILE *input = fdopen(stdout_pipe[0], "r"); + if (input == NULL) + { + if (debug) + cerr << "Fdopen Failure in ExternalTransport" << endl; + return GetDocumentStatus(_Response); + } + + // Set up a response for this request + _Response->Reset(); + // We just accessed the document + _Response->_access_time = new HtDateTime(); + _Response->_access_time->SettoNow(); + + + // OK, now parse the stuff we got back from the handler... + String line; + char *token1; + int in_header = 1; + + while (in_header && readLine(input, line)) + { + line.chop('\r'); + if (line.length() > 0 && debug > 2) + cout << "Header line: " << line << endl; + token1 = strtok(line, "\t"); + if (token1 == NULL) + { + token1 = ""; + in_header = 0; + break; + } + + switch (*token1) + { + case 's': // status code + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_status_code = atoi(token1); + else + cerr<< "External transport error: expected status code in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + case 'r': // status reason + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_reason_phrase = token1; + else + cerr<< "External transport error: expected status reason in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + case 'm': // modification time + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_modification_time= NewDate(token1); // Hopefully we can grok it... 
+ else + cerr<< "External transport error: expected modification time in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + case 't': // Content-Type + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_content_type = token1; + else + cerr<< "External transport error: expected content-type in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + case 'l': // Content-Length + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_content_length = atoi(token1); + else + cerr<< "External transport error: expected content-length in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + case 'u': // redirect target + token1 = strtok(0, "\t"); + if (token1 != NULL) + _Response->_location = token1; + else + cerr<< "External transport error: expected URL in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + + default: + cerr<< "External transport error: unknown field in line "<<line<<"\n" << " URL: " << _URL.get() << "\n"; + break; + } + } + + // OK, now we read in the rest of the document as contents... + _Response->_contents = 0; + char docBuffer[8192]; + int bytesRead; + + while ((bytesRead = fread(docBuffer, 1, sizeof(docBuffer), input)) > 0) + { + if (debug > 2) + cout << "Read " << bytesRead << " from document\n"; + if (_Response->_contents.length() + bytesRead > _max_document_size) + bytesRead = _max_document_size - _Response->_contents.length(); + _Response->_contents.append(docBuffer, bytesRead); + if (_Response->_contents.length() >= _max_document_size) + break; + } + _Response->_document_length = _Response->_contents.length(); + fclose(input); + // close(stdout_pipe[0]); // This is closed for us by the fclose() + + int rpid, status; + while ((rpid = wait(&status)) != fork_result && rpid != -1) + ; + +#endif + + return GetDocumentStatus(_Response); +} + + +//***************************************************************************** +// private +// DocStatus ExternalTransport::GetDocumentStatus(ExternalTransport_Response *r) +// +Transport::DocStatus ExternalTransport::GetDocumentStatus(ExternalTransport_Response *r) +{ + // The default is 'not found' if we can't figure it out... + DocStatus returnStatus = Document_not_found; + int statuscode = r->GetStatusCode(); + + if (statuscode == 200) + { + returnStatus = Document_ok; // OK + // Is it parsable? + } + + else if (statuscode > 200 && statuscode < 300) + returnStatus = Document_ok; // Successful 2xx + else if (statuscode == 304) + returnStatus = Document_not_changed; // Not modified + else if (statuscode > 300 && statuscode < 400) + returnStatus = Document_redirect; // Redirection 3xx + else if (statuscode == 401) + returnStatus = Document_not_authorized; // Unauthorized + + return returnStatus; +} + + +//***************************************************************************** +// private +// int ExternalTransport::readLine(FILE *in, String &line) +// +int +ExternalTransport::readLine(FILE *in, String &line) +{ + char buffer[2048]; + int length; + + line = 0; + while (fgets(buffer, sizeof(buffer), in)) + { + length = strlen(buffer); + if (buffer[length - 1] == '\n') + { + // + // A full line has been read. Return it. + // + line << buffer; + line.chop('\n'); + return 1; + } + else + { + // + // Only a partial line was read. Append it to the line + // and read some more. 
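The header records parsed in Request() above form the reply protocol for an external transport handler: tab-separated field/value lines on stdout, then a blank line, then the raw document body ('m' modification-time and 'u' redirect records are also understood). A minimal sketch of a handler emitting a successful reply, with illustrative values:

    #include <cstdio>

    int main()
    {
        printf("s\t200\n");        // status code
        printf("r\tOK\n");         // reason phrase
        printf("t\ttext/html\n");  // content-type
        printf("l\t40\n");         // content-length
        printf("\n");              // blank line ends the header
        printf("<html><body>Hello from a handler</body></html>\n");
        return 0;
    }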
+ // + line << buffer; + } + } + return line.length() > 0; +} + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.h b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.h new file mode 100644 index 00000000..4c946a96 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalTransport.h @@ -0,0 +1,88 @@ +// +// ExternalTransport.h +// +// ExternalTransport: Allows external programs to retrieve given URLs with +// unknown protocols. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ExternalTransport.h,v 1.5 2004/05/28 13:15:14 lha Exp $ +// + +#ifndef _ExternalTransport_h_ +#define _ExternalTransport_h_ + +#include "Transport.h" +#include "htString.h" + +#include <stdio.h> + +// First we must declare a derived Transport_Response class +// This requires declaring the main class in advance +class ExternalTransport; +class ExternalTransport_Response : public Transport_Response +{ + friend class ExternalTransport; + + // Nothing else... We just want it so we can access the protected fields +}; + +// Right, now we get on with the show... +class ExternalTransport : public Transport +{ +public: + // + // Construction/Destruction + // + ExternalTransport(const String &protocol); + virtual ~ExternalTransport(); + + + // + // Check if the given protocol has a handler + // + static int canHandle(const String &protocol); + + // Setting connections is obviously a bit different than the base class + // from a URL pointer + void SetConnection (URL *u); + + // from a URL object + void SetConnection (URL &u) + { SetConnection (&u); } + + // Make the request + DocStatus Request(); + + // Get the response or the status + Transport_Response *GetResponse() { return _Response; } + DocStatus GetDocumentStatus() { return GetDocumentStatus(_Response); } + + +private: + // The command to handle the current protocol + String _Handler; + // And the current protocol + String _Protocol; + + // The URL to Request() + URL _URL; + + // The result of the Request() + ExternalTransport_Response *_Response; + + + + // Private helper to read in the result from the handler + int readLine(FILE *, String &); + // Work out the DocStatus from the HTTP-style status codes + DocStatus GetDocumentStatus(ExternalTransport_Response *r); +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc b/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc new file mode 100644 index 00000000..56e1d00f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc @@ -0,0 +1,1002 @@ +// +// HTML.cc +// +// HTML: Class to parse HTML documents and return useful information +// to the Retriever +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HTML.cc,v 1.76 2004/06/09 17:35:34 grdetil Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "htdig.h" +#include "HTML.h" +#include "HtSGMLCodec.h" +#include "HtConfiguration.h" +#include "StringMatch.h" +#include "StringList.h" +#include "QuotedStringList.h" +#include "URL.h" +#include "WordType.h" + +#include <ctype.h> + +#include "defaults.h" + 
+// Flags for noindex & nofollow, indicating who turned indexing off/on... +#define TAGnoindex 0x0001 +#define TAGstyle 0x0002 +#define TAGscript 0x0004 +#define TAGmeta_htdig_noindex 0x0008 +#define TAGmeta_robots 0x0010 + +static StringMatch tags; +static StringMatch nobreaktags; +static StringMatch spacebeforetags; +static StringMatch spaceaftertags; +static StringMatch metadatetags; +static StringMatch descriptionMatch; +static StringMatch keywordsMatch; +//static int keywordsCount; +//static int max_keywords; + + +//***************************************************************************** +// ADDSPACE() macro, to insert space where needed in various strings +// Reduces all multiple whitespace to a single space + +#define ADDSPACE(in_space) \ + if (!in_space) \ + { \ + if (in_title && !noindex) \ + { \ + title << ' '; \ + } \ + if (in_ref && description.length() < max_description_length) \ + { \ + description << ' '; \ + } \ + if (head.length() < max_head_length && !noindex && !in_title) \ + { \ + head << ' '; \ + } \ + in_space = 1; \ + } + + +//***************************************************************************** +// HTML::HTML() +// +HTML::HTML() : + skip_start (HtConfiguration::config()->Find("noindex_start")," \t"), + skip_end (HtConfiguration::config()->Find("noindex_end"), " \t") +{ + HtConfiguration *config= HtConfiguration::config(); + // + // Initialize the patterns that we will try to match. + // The tags Match object is used to match tag commands while + // + tags.IgnoreCase(); + tags.Pattern("title|/title|a|/a|h1|h2|h3|h4|h5|h6|/h1|/h2|/h3|/h4|/h5|/h6|noindex|/noindex|img|li|meta|frame|area|base|embed|object|link|style|/style|script|/script"); + + // These tags don't cause a word break. They may also be in "tags" above, + // except for the "a" tag, which must be handled as a special case. + // Note that <sup> & <sub> should cause a word break. + nobreaktags.IgnoreCase(); + nobreaktags.Pattern("font|/font|em|/em|strong|/strong|i|/i|b|/b|u|/u|tt|/tt|abbr|/abbr|code|/code|q|/q|samp|/samp|kbd|/kbd|var|/var|dfn|/dfn|cite|/cite|blink|/blink|big|/big|small|/small|s|/s"); + + // These tags, which may also be in "tags" above, cause word breaks and + // therefore cause space to be inserted before (or after) do_tag() is done. + spacebeforetags.IgnoreCase(); + spacebeforetags.Pattern("title|h1|h2|h3|h4|h5|h6|address|blockquote|noindex|img|li|th|td|dt|dd|p|br|hr|center|spacer"); + spaceaftertags.IgnoreCase(); + spaceaftertags.Pattern("/title|/h1|/h2|/h3|/h4|/h5|/h6|/address|/blockquote"); + + // These are the name values of meta tags that carry date information. + metadatetags.IgnoreCase(); + metadatetags.Pattern("date|dc.date|dc.date.created|dc.date.modified"); + + // These are the name values of meta tags that carry descriptions. + StringList descrNames(config->Find("description_meta_tag_names"), " \t"); + descriptionMatch.IgnoreCase(); + descriptionMatch.Pattern(descrNames.Join('|')); + + // These are the name values of meta tags that carry keywords. + StringList keywordNames(config->Find("keywords_meta_tag_names"), " \t"); + keywordsMatch.IgnoreCase(); + keywordsMatch.Pattern(keywordNames.Join('|')); +// (now in Parser) +// max_keywords = config->Value("max_keywords", -1); +// if (max_keywords < 0) +// max_keywords = (int) ((unsigned int) ~1 >> 1); + + // skip_start/end mark sections of text to be ignored by ht://Dig + // Make sure there are equal numbers of each, and warn of deprecated + // syntax. 
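skip_start and skip_end above hold the noindex_start/noindex_end pattern lists; in parse(), everything between a start marker and its matching end marker is dropped from indexing. A sketch using the fallback end marker seen below, with the conventional matching start marker assumed:

    <p>This paragraph is indexed.</p>
    <!--htdig_noindex-->
    <p>Navigation and other boilerplate, skipped by the indexer.</p>
    <!--/htdig_noindex-->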
+ if (skip_start.Count() > 1 || skip_end.Count() > 1)
+ {
+ if (skip_start.Count() != 0 && skip_end.Count() != 0)
+ {
+ // check for old-style start/end which allowed unquoted spaces
+ // (Check noindex_start/end for exactly one "<" or followed by
+ // exactly one ">", and no leading quotes.)
+ // Can someone think of a better (or simpler) check??
+ String noindex_end (config->Find ("noindex_end"));
+ char *first_left = strchr (noindex_end.get(), '<');
+ char *secnd_left = first_left ? strchr(first_left+1,'<') : (char*)0;
+ char *first_right= strchr (noindex_end.get(), '>');
+ char *secnd_right= first_right? strchr(first_right+1,'>'): (char*)0;
+ String noindex_start (config->Find ("noindex_start"));
+ char *first_lft = strchr (noindex_start.get(), '<');
+ char *secnd_lft = first_lft ? strchr (first_lft +1,'<') : (char*)0;
+ char *first_rght= strchr (noindex_start.get(), '>');
+ char *secnd_rght= first_rght? strchr (first_rght+1,'>') : (char*)0;
+
+ if (((first_right && !secnd_right && first_right < first_left) ||
+ (first_left && !secnd_left && !first_right) ||
+ (first_rght && !secnd_rght && first_rght < first_lft) ||
+ (first_lft && !secnd_lft && !first_rght)) &&
+ noindex_end[0] != '\"' && noindex_start[0] != '\"')
+ {
+ cout << "\nWarning: To allow multiple noindex_start/end patterns, patterns containing\nspaces should now be in quotation marks. (If the entries are intended to be\nmultiple patterns, this warning can be suppressed by placing the first pattern\nin quotes.)\n\n";
+ // Should we treat the patterns as if they had been quoted
+ // (as we assume was intended)?
+ }
+ }
+ }
+
+ // check each start has an end
+ if (skip_start.Count() < skip_end.Count())
+ {
+ cout << "Warning: " << skip_end.Count()
+ << " noindex_end patterns, but only " << skip_start.Count()
+ << " noindex_start patterns.\n";
+ } else
+ {
+ while (skip_start.Count () > skip_end.Count())
+ {
+ int missing = skip_end.Count() - 1;
+ skip_end.Add ((missing >= 0) ? skip_end [missing]
+ : "<!--/htdig_noindex-->");
+ cout << "Warning: Copying " << skip_end [missing+1]
+ << " as noindex_end match for " << skip_start [missing+1]
+ << endl;
+ }
+ }
+
+ word = 0;
+ href = 0;
+ title = 0;
+ description = 0;
+ head = 0;
+ meta_dsc = 0;
+ tag = 0;
+ in_title = 0;
+ in_ref = 0;
+ in_heading = 0;
+ base = 0;
+ noindex = 0;
+ nofollow = 0;
+// minimumWordLength = config->Value("minimum_word_length", 3);
+}
+
+
+//*****************************************************************************
+// HTML::~HTML()
+//
+HTML::~HTML()
+{
+}
+
+
+//*****************************************************************************
+// void HTML::parse(Retriever &retriever, URL &baseURL)
+// Parse the HTML document using the Retriever object for all the callbacks.
+// The HTML document contents are contained in the contents String.
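+// The work happens in two passes over the buffer: the first pass copies
+// the document into a scratch buffer while decoding SGML entities,
+// turning any decoded '<' into the fake tag "<~>" so it cannot be
+// mistaken for real markup; the second pass walks that buffer, handing
+// tags to do_tag() and words to the Retriever callbacks.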
+// +void +HTML::parse(Retriever &retriever, URL &baseURL) +{ + if (contents == 0 || contents->length() == 0) + return; + + base = &baseURL; + + // + // We have some variables which will contain the various items we + // are looking for + // + int wordindex = 1; + int in_space; + int in_punct; + String scratch, textified; + unsigned char *q, *start; + unsigned char *position = (unsigned char *) contents->get(); + unsigned char *text = (unsigned char *)new char[contents->length()+1]; + unsigned char *ptext = text; + + keywordsCount = 0; + title = 0; + head = 0; + meta_dsc = 0; + noindex = 0; + nofollow = 0; + in_heading = 0; + in_title = 0; + in_ref = 0; + in_space = 0; + in_punct = 0; + + while (*position) + { + + // + // Filter out section marked to be ignored for indexing. + // This can contain any HTML. + // On finding a noindex_start, skip to first occurrence of matching + // noindex_end. Any noindex_start within will be ignored. + // + int i; + for (i = 0; i < skip_start.Count(); i++) + { + if (mystrncasecmp((char *)position, skip_start[i], + ((String*)skip_start.Nth(i))->length()) == 0) + break; // break from this loop for "continue" below... + } + if (i < skip_start.Count()) // found a match; + { + q = (unsigned char*)mystrcasestr((char *)position, skip_end[i]); + if (!q) + *position = '\0'; // Rest of document will be skipped... + else + position = q + ((String*)skip_end.Nth(i))->length(); + continue; + } + // end of noindex_start/end code + + + if (strncmp((char *)position, "<!", 2) == 0) + { + // + // Possible comment declaration (but could be DTD declaration!) + // A comment can contain other '<' and '>': + // we have to ignore complete comment declarations + // but of course also DTD declarations. + // + position += 2; // Get past declaration start + if (strncmp((char *)position, "--", 2) == 0) + { + // Found start of comment - now find the end + position += 2; + do + { + q = (unsigned char*)strstr((char *)position, "--"); + if (!q) + { + *position = '\0'; + break; // Rest of document seems to be a comment... + } + else + { + position = q + 2; + // Skip extra dashes after a badly formed comment + while (*position == '-') + position++; + // Skip whitespace after an individual comment + while (isspace(*position)) + position++; + } + // if comment declaration hasn't ended, skip another comment + } + while (*position && *position != '>'); + if (*position == '>') + { + position++; // End of comment declaration + } + } + else + { + // Not a comment declaration after all + // but possibly DTD: get to the end + q = (unsigned char*)strchr((char *)position, '>'); + if (q) + { + position = q + 1; + // End of (whatever) declaration + } + else + { + *position = '\0'; // Rest of document is DTD? + } + } + continue; + } + + if (*position == '<') + { + // + // Start of a tag. 
Since tags cannot be nested, we can simply + // search for the closing '>' + // + q = (unsigned char*)strchr((char *)position, '>'); + if (q) + { // copy tag + while (position <= q) + *ptext++ = *position++; + } + else + { // copy rest of text, as tag does not end + while (*position) + *ptext++ = *position++; + } + } + else if (*position == '&') + { + q = (unsigned char*)strchr((char *)position, ';'); + if (q && q <= position+10) + { // got ending, looks like valid SGML entity + scratch = 0; + scratch.append((char*)position, q+1 - position); + textified = HtSGMLCodec::instance()->encode(scratch); + if (textified[0] != '&' || textified.length() == 1) + { // it was decoded, copy it + position = (unsigned char *)textified.get(); + while (*position) + { + if (*position == '<') + { // got a decoded <, make a fake tag for it + // to avoid confusing it with real tag start + *ptext++ = '<'; + *ptext++ = '~'; + *ptext++ = '>'; + position++; + } + else + *ptext++ = *position++; + } + position = q+1; + } + else // it wasn't decoded, copy '&', and rest will follow + *ptext++ = *position++; + } + else // not SGML entity, copy bare '&' + *ptext++ = *position++; + } + else + { + *ptext++ = *position++; + } + } + *ptext++ = '\0'; + + position = text; + start = position; + + while (*position) + { + if (*position == '<' && (position[1] != '~' || position[2] != '>')) + { + // + // Start of a tag. Since tags cannot be nested, we can simply + // search for the closing '>' + // + q = (unsigned char*)strchr((char *)position, '>'); + if (!q) + break; // Syntax error in the doc. Tag never ends. + position++; + if (noindex & TAGscript) + { // Special handling in case '<' is part of JavaScript code + while (isspace(*position)) + position++; + if (mystrncasecmp((char *)position, "/script", 7) != 0) + continue; + } + tag = 0; + tag.append((char*)position, q - position); + while (isspace(*position)) + position++; + if (!in_space && spacebeforetags.CompareWord((char *)position) + || !in_space && !in_punct && *position != '/') + { + // These opening tags cause a space to be inserted + // before anything they insert. + // Tags processed here (i.e. not in nobreaktags), like <a ...> + // tag, are a special case: they don't actually add space in + // formatted text, but because in our processing it causes + // a word break, we avoid word concatenation in "head" string. + ADDSPACE(in_space); + in_punct = 0; + } + do_tag(retriever, tag); + if (!in_space && spaceaftertags.CompareWord((char *)position)) + { + // These closing tags cause a space to be inserted + // after anything they insert. + ADDSPACE(in_space); + in_punct = 0; + } + position = q+1; + } + else if (*position > 0 && HtIsStrictWordChar(*position)) + { + // + // Start of a word. Try to find the whole thing + // + word = 0; + in_space = 0; + in_punct = 0; + while (*position && HtIsWordChar(*position)) + { + word << (char)*position; + // handle case where '<' is in extra_word_characters... + if (strncmp((char *)position, "<~>", 3) == 0) + position += 2; // skip over fake tag for decoded '<' + position++; + if (*position == '<') + { + q = position+1; + while (isspace(*q)) + q++; + // Does this tag cause a word break? + if (nobreaktags.CompareWord((char *)q)) + { + // These tags just change character formatting and + // don't break words. 
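+ // For example, in "data<b>base</b>" the <b> and </b>
+ // are consumed right here, so the scan continues and
+ // the single word "database" is what gets indexed.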
+ q = (unsigned char*)strchr((char *)position, '>'); + if (q) + { + position++; + tag = 0; + tag.append((char*)position, q - position); + do_tag(retriever, tag); + position = q+1; + } + } + } + } + + if (in_title && !noindex) + { + title << word; + } + + if (in_ref) + { + if (description.length() < max_description_length) + { + description << word; + } + else + { + description << " ..."; + if (!nofollow) + retriever.got_href(*href, (char*)description); + in_ref = 0; + description = 0; + } + } + + if (head.length() < max_head_length && !noindex && !in_title) + { + // + // Capitalize H1 and H2 blocks + // + if (in_heading > 1 && in_heading < 4) + { + word.uppercase(); + } + + // + // Append the word to the head (excerpt) + // + head << word; + } + + if (word.length() >= (int)minimum_word_length && !noindex) + { + retriever.got_word((char*)word, wordindex++, in_heading); + } + } + else + { + // + // Characters that are not part of a word + // + if (isspace(*position)) + { + ADDSPACE(in_space); + in_punct = 0; + } + else + { + // + // Not whitespace + // + if (head.length() < max_head_length && !noindex && !in_title) + { + // We don't want to add random chars to the + // excerpt if we're in the title. + head << *position; + } + if (in_ref && description.length() < max_description_length) + { + description << *position; + } + if (in_title && !noindex) + { + title << *position; + } + in_space = 0; + in_punct = 1; + // handle normal case where decoded '<' is punctuation... + if (strncmp((char *)position, "<~>", 3) == 0) + position += 2; // skip over fake tag for decoded '<' + } + position++; + } + } + retriever.got_head((char*)head); + + delete [] text; +} + + +//***************************************************************************** +// void HTML::do_tag(Retriever &retriever, String &tag) +// +void +HTML::do_tag(Retriever &retriever, String &tag) +{ + HtConfiguration* config= HtConfiguration::config(); + int wordindex = 1; + char *position = tag.get(); + int which, length; + static int ignore_alt_text = config->Boolean("ignore_alt_text", 0); + + while (isspace(*position)) + position++; + + which = -1; + if (tags.CompareWord(position, which, length) < 0) + return; // Nothing matched. + + // Use the configuration code to match attributes as key-value pairs + HtConfiguration attrs; + attrs.NameValueSeparators("="); + attrs.Add(position); + + if (debug > 3) + cout << "Tag: <" << tag << ">, matched " << which << endl; + + switch (which) + { + case 0: // "title" + if (title.length()) + { + if (debug) + cout << "More than one <title> tag in document!" + << " (possible search engine spamming)" << endl; + break; + } + in_title = 1; + in_heading = 1; + break; + + case 1: // "/title" + if (!in_title) + break; + in_title = 0; + in_heading = 0; + retriever.got_title((char*)title); + break; + + case 2: // "a" + { + if (!attrs["href"].empty()) + { + // + // a href seen + // + if (in_ref) + { + if (debug > 1) + cout << "Terminating previous <a href=...> tag," + << " which didn't have a closing </a> tag." 
+ << endl;
+ if (!nofollow)
+ retriever.got_href(*href, (char*)description);
+ in_ref = 0;
+ }
+ if (href)
+ delete href;
+ href = new URL(transSGML(attrs["href"]), *base);
+ in_ref = 1;
+ description = 0;
+ break;
+ }
+
+ if (!attrs["title"].empty() && !attrs["href"].empty())
+ {
+ //
+ // a title seen for href
+ //
+ retriever.got_href(*href, transSGML(attrs["title"]));
+ }
+
+ if (!attrs["name"].empty())
+ {
+ //
+ // a name seen
+ //
+ retriever.got_anchor(transSGML(attrs["name"]));
+ }
+ break;
+ }
+
+ case 3: // "/a"
+ if (in_ref)
+ {
+ if (!nofollow)
+ retriever.got_href(*href, (char*)description);
+ in_ref = 0;
+ }
+ break;
+
+ case 4: // "h1"
+ in_heading = 2;
+ break;
+
+ case 5: // "h2"
+ in_heading = 3;
+ break;
+
+ case 6: // "h3"
+ in_heading = 4;
+ break;
+
+ case 7: // "h4"
+ in_heading = 5;
+ break;
+
+ case 8: // "h5"
+ in_heading = 6;
+ break;
+
+ case 9: // "h6"
+ in_heading = 7;
+ break;
+
+ case 10: // "/h1"
+ case 11: // "/h2"
+ case 12: // "/h3"
+ case 13: // "/h4"
+ case 14: // "/h5"
+ case 15: // "/h6"
+ in_heading = 0;
+ break;
+
+ case 16: // "noindex"
+ noindex |= TAGnoindex;
+ nofollow |= TAGnoindex;
+ if (!attrs["follow"].empty())
+ nofollow &= ~TAGnoindex;
+ break;
+
+ case 27: // "style"
+ noindex |= TAGstyle;
+ nofollow |= TAGstyle;
+ break;
+
+ case 29: // "script"
+ noindex |= TAGscript;
+ nofollow |= TAGscript;
+ break;
+
+ case 17: // "/noindex"
+ noindex &= ~TAGnoindex;
+ nofollow &= ~TAGnoindex;
+ break;
+
+ case 28: // "/style"
+ noindex &= ~TAGstyle;
+ nofollow &= ~TAGstyle;
+ break;
+
+ case 30: // "/script"
+ noindex &= ~TAGscript;
+ nofollow &= ~TAGscript;
+ break;
+
+ case 19: // "li"
+ if (!noindex && !in_title && head.length() < max_head_length)
+ head << "* ";
+ break;
+
+ case 20: // "meta"
+ {
+ //
+ // First test for old-style meta tags (these break any
+ // reasonable DTD...)
+ //
+ if (!attrs["htdig-noindex"].empty())
+ {
+ retriever.got_noindex();
+ noindex |= TAGmeta_htdig_noindex;
+ nofollow |= TAGmeta_htdig_noindex;
+ }
+ if (!attrs["htdig-index"].empty())
+ {
+ noindex &= ~TAGmeta_htdig_noindex;
+ nofollow &= ~TAGmeta_htdig_noindex;
+ }
+ if (!attrs["htdig-email"].empty())
+ retriever.got_meta_email(transSGML(attrs["htdig-email"]));
+
+ if (!attrs["htdig-notification-date"].empty())
+ retriever.got_meta_notification(transSGML(attrs["htdig-notification-date"]));
+
+ if (!attrs["htdig-email-subject"].empty())
+ retriever.got_meta_subject(transSGML(attrs["htdig-email-subject"]));
+
+ if (!attrs["htdig-keywords"].empty() || !attrs["keywords"].empty())
+ {
+ //
+ // Keywords are added as being at the very top of the
+ // document and have a weight factor of
+ // keywords-factor which is assigned to slot 9 in the
+ // factor table.
+ //
+ const String keywords = attrs["htdig-keywords"].empty() ?
+ attrs["keywords"] :
+ attrs["htdig-keywords"];
+ if (!noindex)
+ {
+ String tmp = transSGML(keywords);
+ addKeywordString (retriever, tmp, wordindex);
+ }
+ }
+
+ if (!attrs["http-equiv"].empty())
+ {
+
+ // <META HTTP-EQUIV=REFRESH case
+ if (mystrcasecmp(attrs["http-equiv"], "refresh") == 0
+ && !attrs["content"].empty())
+ {
+ String content = attrs["content"];
+ char *q = (char*)mystrcasestr((char*)content, "url");
+ if (q && *q)
+ {
+ q += 3; // skipping "URL"
+ while (*q && ((*q == '=') || isspace(*q))) q++;
+ char *qq = q;
+ while (*qq && (*qq != ';') && (*qq != '"') &&
+ !isspace(*qq)) qq++;
+ *qq = 0;
+ if (href)
+ delete href;
+ href = new URL(transSGML(q), *base);
+ // I don't know why anyone would do this, but hey...
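+ // (A hypothetical example of the tag handled here:
+ // <meta http-equiv="refresh" content="0; url=next.html">
+ // the code above isolates "next.html" from the content
+ // value so it can be followed like an ordinary link.)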
+ if (!nofollow) + retriever.got_href(*href, ""); + } + } + } + + // + // Now check for <meta name=... content=...> tags that + // fly with any reasonable DTD out there + // + + if (!attrs["name"].empty() && !attrs["content"].empty()) + { + const String cache = attrs["name"]; + + // First of all, check for META description + + if (descriptionMatch.CompareWord(cache) + && !attrs["content"].empty()) + { + // + // We need to do two things. First grab the description + // and clean it up + // + meta_dsc = transSGML(attrs["content"]); + meta_dsc.replace('\n', ' '); + meta_dsc.replace('\r', ' '); + meta_dsc.replace('\t', ' '); + if (meta_dsc.length() > max_meta_description_length) + meta_dsc = meta_dsc.sub(0, max_meta_description_length).get(); + if (debug > 1) + cout << "META Description: " << attrs["content"] << endl; + retriever.got_meta_dsc((char*)meta_dsc); + + + // + // Now add the words to the word list + // Slot 10 is the current slot for this + // + if (!noindex) + { + String tmp = transSGML(attrs["content"]); + addString (retriever, tmp, wordindex, 10); + } + } + + if (keywordsMatch.CompareWord(cache) && !noindex) + { + String tmp = transSGML(attrs["content"]); + addKeywordString (retriever, tmp, wordindex); + } + else if (mystrcasecmp(cache, "author") == 0) + { + String author = transSGML(attrs["content"]); + retriever.got_author(author.get()); + if (!noindex) + addString (retriever, author, wordindex, 11); + } + else if (mystrcasecmp(cache, "htdig-email") == 0) + { + retriever.got_meta_email(transSGML(attrs["content"])); + } + else if (metadatetags.CompareWord(cache, which, length) && + (cache.get())[length] == '\0' && config->Boolean("use_doc_date",0)) + { + retriever.got_time(transSGML(attrs["content"])); + } + else if (mystrcasecmp(cache, "htdig-notification-date") == 0) + { + retriever.got_meta_notification(transSGML(attrs["content"])); + } + else if (mystrcasecmp(cache, "htdig-email-subject") == 0) + { + retriever.got_meta_subject(transSGML(attrs["content"])); + } + else if (mystrcasecmp(cache, "htdig-noindex") == 0) + { + retriever.got_noindex(); + noindex |= TAGmeta_htdig_noindex; + nofollow |= TAGmeta_htdig_noindex; + } + else if (mystrcasecmp(cache, "robots") == 0 + && !attrs["content"].empty()) + { + String content_cache = attrs["content"]; + content_cache.lowercase(); + if (content_cache.indexOf("noindex") != -1) + { + noindex |= TAGmeta_robots; + retriever.got_noindex(); + } + if (content_cache.indexOf("nofollow") != -1) + nofollow |= TAGmeta_robots; + if (content_cache.indexOf("none") != -1) + { + noindex |= TAGmeta_robots; + nofollow |= TAGmeta_robots; + retriever.got_noindex(); + } + } + } + else if (mystrcasecmp(attrs["name"], "htdig-noindex") == 0) + { + retriever.got_noindex(); + noindex |= TAGmeta_htdig_noindex; + nofollow |= TAGmeta_htdig_noindex; + } + break; + } + + case 21: // frame + case 24: // embed + { + if (!attrs["src"].empty()) + { + // + // src seen + // + if (!nofollow) + { + if (href) + delete href; + href = new URL(transSGML(attrs["src"]), *base); + // Frames have the same hopcount as the parent. + retriever.got_href(*href, transSGML(attrs["title"]), 0); + in_ref = 0; + } + } + break; + } + + case 25: // object + { + if (!attrs["data"].empty()) + { + // + // data seen + // + if (!nofollow) + { + if (href) + delete href; + href = new URL(transSGML(attrs["data"]), *base); + // Assume objects have the same hopcount as the parent. 
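+ // (e.g. a hypothetical <object data="menu.html">: the data
+ // attribute is followed like a frame src, and the 0 passed to
+ // got_href() keeps the parent's hop count.)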
+ retriever.got_href(*href, transSGML(attrs["title"]), 0); + in_ref = 0; + } + } + break; + } + + case 22: // area + case 26: // link + { + if (!attrs["href"].empty()) + { + // href seen + if (!nofollow) + { + if (href) + delete href; + href = new URL(transSGML(attrs["href"]), *base); + // area & link are like anchor tags -- one hopcount! + retriever.got_href(*href, transSGML(attrs["title"]), 1); + in_ref = 0; + } + } + break; + } + + case 23: // base + { + if (!attrs["href"].empty()) + { + URL tempBase(transSGML(attrs["href"])); + *base = tempBase; + } + break; + } + + case 18: // img + { + if (!ignore_alt_text && !attrs["alt"].empty()) + { + String tmp = transSGML(attrs["alt"]); + if (!noindex && in_title) + title << tmp << " "; + if (in_ref && description.length() < max_description_length) + description << tmp << " "; + if (!noindex && !in_title && head.length() < max_head_length) + head << tmp << " "; + if (!noindex) + addString (retriever, tmp, wordindex, 8); // slot for img_alt + } + if (!attrs["src"].empty()) + { + retriever.got_image(transSGML(attrs["src"])); + } + break; + } + + default: + return; // Nothing... + } +} + + +//***************************************************************************** +// const String HTML::transSGML(const String& str) +// +const String +HTML::transSGML(const String& str) +{ + return HtSGMLCodec::instance()->encode(str); +} diff --git a/debian/htdig/htdig-3.2.0b6/htdig/HTML.h b/debian/htdig/htdig-3.2.0b6/htdig/HTML.h new file mode 100644 index 00000000..867381ed --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/HTML.h @@ -0,0 +1,69 @@ +// +// HTML.h +// +// HTML: Class to parse HTML documents and return useful information +// to the Retriever +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HTML.h,v 1.14 2004/05/28 13:15:15 lha Exp $ +// +#ifndef _HTML_h_ +#define _HTML_h_ + +#include "Parsable.h" +#include "QuotedStringList.h" + +class Retriever; +class URL; + + +class HTML : public Parsable +{ +public: + // + // Construction/Destruction + // + HTML(); + virtual ~HTML(); + + // + // Main parser interface. 
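+ // parse() consumes the buffer set via Parsable::setContents() and
+ // reports words, links, titles and meta data back through the
+ // Retriever callbacks (got_word, got_href, got_title, ...).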
+ // + virtual void parse(Retriever &retriever, URL &baseURL); + +private: + // + // Our state variables + // + String word; + URL *href; + String title; + String description; + String head; + String meta_dsc; + String tag; + int in_title; + int in_ref; + int in_heading; + int noindex; + int nofollow; +// unsigned int minimumWordLength; + URL *base; + QuotedStringList skip_start; + QuotedStringList skip_end; + + // + // Helper functions + // + void do_tag(Retriever &, String &); + const String transSGML(const String& str); +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Makefile.am b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.am new file mode 100644 index 00000000..1e8368b4 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.am @@ -0,0 +1,16 @@ + +include $(top_srcdir)/Makefile.config + +bin_PROGRAMS = htdig + +htdig_SOURCES = Document.cc HTML.cc \ + Parsable.cc Plaintext.cc \ + Retriever.cc Server.cc ExternalTransport.cc \ + URLRef.cc htdig.cc ExternalParser.cc + +noinst_HEADERS = Document.h ExternalParser.h HTML.h \ + Parsable.h Plaintext.h Retriever.h Server.h URLRef.h htdig.h \ + ExternalTransport.h +htdig_DEPENDENCIES = $(HTLIBS) +htdig_LDFLAGS = $(PROFILING) ${extra_ldflags} +htdig_LDADD = $(HTLIBS) diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Makefile.in b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.in new file mode 100644 index 00000000..52d9a862 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.in @@ -0,0 +1,487 @@ +# Makefile.in generated by automake 1.7.9 from Makefile.am. +# @configure_input@ + +# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003 +# Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# +# To compile with profiling do the following: +# +# make CFLAGS=-g CXXFLAGS=-g PROFILING=-p all +# + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +top_builddir = .. 
+ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +INSTALL = @INSTALL@ +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +host_triplet = @host@ +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMDEP_FALSE = @AMDEP_FALSE@ +AMDEP_TRUE = @AMDEP_TRUE@ +AMTAR = @AMTAR@ +APACHE = @APACHE@ +APACHE_MODULES = @APACHE_MODULES@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CGIBIN_DIR = @CGIBIN_DIR@ +COMMON_DIR = @COMMON_DIR@ +CONFIG_DIR = @CONFIG_DIR@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DATABASE_DIR = @DATABASE_DIR@ +DEFAULT_CONFIG_FILE = @DEFAULT_CONFIG_FILE@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO = @ECHO@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +F77 = @F77@ +FFLAGS = @FFLAGS@ +FIND = @FIND@ +GUNZIP = @GUNZIP@ +HAVE_SSL = @HAVE_SSL@ +HTDIG_MAJOR_VERSION = @HTDIG_MAJOR_VERSION@ +HTDIG_MICRO_VERSION = @HTDIG_MICRO_VERSION@ +HTDIG_MINOR_VERSION = @HTDIG_MINOR_VERSION@ +IMAGE_DIR = @IMAGE_DIR@ +IMAGE_URL_PREFIX = @IMAGE_URL_PREFIX@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@ +MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@ +MAKEINFO = @MAKEINFO@ +MV = @MV@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PERL = @PERL@ +RANLIB = @RANLIB@ +RRDTOOL = @RRDTOOL@ +SEARCH_DIR = @SEARCH_DIR@ +SEARCH_FORM = @SEARCH_FORM@ +SED = @SED@ +SENDMAIL = @SENDMAIL@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +TAR = @TAR@ +TESTS_FALSE = @TESTS_FALSE@ +TESTS_TRUE = @TESTS_TRUE@ +TIME = @TIME@ +TIMEV = @TIMEV@ +USER = @USER@ +VERSION = @VERSION@ +YACC = @YACC@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_F77 = @ac_ct_F77@ +ac_ct_RANLIB = @ac_ct_RANLIB@ +ac_ct_STRIP = @ac_ct_STRIP@ +am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ +am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ +am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ +am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +datadir = @datadir@ +exec_prefix = @exec_prefix@ +extra_ldflags = @extra_ldflags@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +oldincludedir = @oldincludedir@ +prefix = 
@prefix@ +program_transform_name = @program_transform_name@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +subdirs = @subdirs@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ + +AUTOMAKE_OPTIONS = foreign no-dependencies + +INCLUDES = -DDEFAULT_CONFIG_FILE=\"$(DEFAULT_CONFIG_FILE)\" \ + -I$(top_srcdir)/include -I$(top_srcdir)/htlib \ + -I$(top_srcdir)/htnet -I$(top_srcdir)/htcommon \ + -I$(top_srcdir)/htword \ + -I$(top_srcdir)/db -I$(top_builddir)/db \ + $(LOCAL_DEFINES) $(PROFILING) + + +HTLIBS = $(top_builddir)/htnet/libhtnet.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/htlib/libht.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/db/libhtdb.la \ + $(top_builddir)/htlib/libht.la + + +bin_PROGRAMS = htdig + +htdig_SOURCES = Document.cc HTML.cc \ + Parsable.cc Plaintext.cc \ + Retriever.cc Server.cc ExternalTransport.cc \ + URLRef.cc htdig.cc ExternalParser.cc + + +noinst_HEADERS = Document.h ExternalParser.h HTML.h \ + Parsable.h Plaintext.h Retriever.h Server.h URLRef.h htdig.h \ + ExternalTransport.h + +htdig_DEPENDENCIES = $(HTLIBS) +htdig_LDFLAGS = $(PROFILING) ${extra_ldflags} +htdig_LDADD = $(HTLIBS) +subdir = htdig +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_HEADER = $(top_builddir)/include/config.h +CONFIG_CLEAN_FILES = +bin_PROGRAMS = htdig$(EXEEXT) +PROGRAMS = $(bin_PROGRAMS) + +am_htdig_OBJECTS = Document.$(OBJEXT) HTML.$(OBJEXT) Parsable.$(OBJEXT) \ + Plaintext.$(OBJEXT) Retriever.$(OBJEXT) Server.$(OBJEXT) \ + ExternalTransport.$(OBJEXT) URLRef.$(OBJEXT) htdig.$(OBJEXT) \ + ExternalParser.$(OBJEXT) +htdig_OBJECTS = $(am_htdig_OBJECTS) + +DEFAULT_INCLUDES = -I. 
-I$(srcdir) -I$(top_builddir)/include +depcomp = +am__depfiles_maybe = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +DIST_SOURCES = $(htdig_SOURCES) +HEADERS = $(noinst_HEADERS) + +DIST_COMMON = $(noinst_HEADERS) $(srcdir)/Makefile.in \ + $(top_srcdir)/Makefile.config Makefile.am +SOURCES = $(htdig_SOURCES) + +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .lo .o .obj +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/Makefile.config $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && \ + $(AUTOMAKE) --foreign htdig/Makefile +Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe) +binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(bindir) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + p1=`echo $$p|sed 's/$(EXEEXT)$$//'`; \ + if test -f $$p \ + || test -f $$p1 \ + ; then \ + f=`echo "$$p1" | sed 's,^.*/,,;$(transform);s/$$/$(EXEEXT)/'`; \ + echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) --mode=install $(binPROGRAMS_INSTALL) $$p $(DESTDIR)$(bindir)/$$f"; \ + $(INSTALL_PROGRAM_ENV) $(LIBTOOL) --mode=install $(binPROGRAMS_INSTALL) $$p $(DESTDIR)$(bindir)/$$f || exit 1; \ + else :; fi; \ + done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + f=`echo "$$p" | sed 's,^.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/'`; \ + echo " rm -f $(DESTDIR)$(bindir)/$$f"; \ + rm -f $(DESTDIR)$(bindir)/$$f; \ + done + +clean-binPROGRAMS: + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + f=`echo $$p|sed 's/$(EXEEXT)$$//'`; \ + echo " rm -f $$p $$f"; \ + rm -f $$p $$f ; \ + done +htdig$(EXEEXT): $(htdig_OBJECTS) $(htdig_DEPENDENCIES) + @rm -f htdig$(EXEEXT) + $(CXXLINK) $(htdig_LDFLAGS) $(htdig_OBJECTS) $(htdig_LDADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) core *.core + +distclean-compile: + -rm -f *.tab.c + +.cc.o: + $(CXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +.cc.obj: + $(CXXCOMPILE) -c -o $@ `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` + +.cc.lo: + $(LTCXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +distclean-libtool: + -rm -f libtool +uninstall-info-am: + +ETAGS = etags +ETAGSFLAGS = + +CTAGS = ctags +CTAGSFLAGS = + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -fID $$unique + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$tags$$unique" \ + || $(ETAGS) $(ETAGSFLAGS) 
$(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$tags $$unique + +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(CTAGS_ARGS)$$tags$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$tags $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && cd $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) $$here + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) + +top_distdir = .. +distdir = $(top_distdir)/$(PACKAGE)-$(VERSION) + +distdir: $(DISTFILES) + $(mkinstalldirs) $(distdir)/.. + @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ + list='$(DISTFILES)'; for file in $$list; do \ + case $$file in \ + $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ + $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ + esac; \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test "$$dir" != "$$file" && test "$$dir" != "."; then \ + dir="/$$dir"; \ + $(mkinstalldirs) "$(distdir)$$dir"; \ + else \ + dir=''; \ + fi; \ + if test -d $$d/$$file; then \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(PROGRAMS) $(HEADERS) + +installdirs: + $(mkinstalldirs) $(DESTDIR)$(bindir) +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-binPROGRAMS clean-generic clean-libtool mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-libtool distclean-tags + +dvi: dvi-am + +dvi-am: + +info: info-am + +info-am: + +install-data-am: + +install-exec-am: install-binPROGRAMS + +install-info: install-info-am + +install-man: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-binPROGRAMS uninstall-info-am + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \ + clean-generic clean-libtool ctags distclean distclean-compile \ + distclean-generic distclean-libtool distclean-tags distdir dvi \ + dvi-am info info-am install install-am install-binPROGRAMS \ + install-data install-data-am install-exec install-exec-am \ + install-info install-info-am install-man install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags uninstall uninstall-am uninstall-binPROGRAMS \ + uninstall-info-am + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Makefile.win32 b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.win32 new file mode 100644 index 00000000..49839a7c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Makefile.win32 @@ -0,0 +1,29 @@ +# +# Makefile - makefile for rightnow +# + +APP_NAME = Right Now Web CGI +RNT_PRODUCT = rnw + +TARGET = $(BINDIR)/htdig$(EXESFX) + +include ../Makedefs.win32 + +# ----------------------------------------------------------------------------- +# add new executable members to this list + + +CXXSRC = Document.cc HTML.cc Parsable.cc Plaintext.cc Retriever.cc \ + Server.cc ExternalTransport.cc URLRef.cc htdig.cc ExternalParser.cc + +CPPFLAGS += -I. -I../include -I../htlib -I../htcommon -I../htword -I../db -I../htnet + +LDLIBS = ../lib/$(ARCH)/libhtnet.lib ../lib/$(ARCH)/libcommon.lib ../lib/$(ARCH)/libhtword.lib ../lib/$(ARCH)/libht.lib ../lib/$(ARCH)/libcommon.lib ../lib/$(ARCH)/libhtword.lib ../lib/$(ARCH)/libht.lib ../lib/$(ARCH)/libhtdb.lib +OTHERLIBS = ws2_32.lib L:/win32/lib/zlib114/zlib.lib + +DEPLIBS += $(LDLIBS) + +$(TARGET): $(OBJDIRDEP) $(BINDIRDEP) $(OBJS) $(DEPLIBS) + $(EXELD) $(LDFLAGS) $(OBJS) $(LDLIBS) $(OTHERLIBS) + +include ../Makerules.win32 diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Parsable.cc b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.cc new file mode 100644 index 00000000..049362a8 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.cc @@ -0,0 +1,96 @@ +// +// Parsable.cc +// +// Parsable: Base class for file parsers (HTML, PDF, ExternalParser ...) 
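+// Subclasses implement parse(); this base class supplies the shared
+// word-adding helpers (addString, addKeywordString) and reads the
+// various length limits from the configuration in its constructor.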
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Parsable.cc,v 1.9 2004/05/28 13:15:15 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "Parsable.h" +#include "htdig.h" +#include "defaults.h" + + +//***************************************************************************** +// Parsable::Parsable() +// +Parsable::Parsable() +{ + HtConfiguration* config= HtConfiguration::config(); + contents = 0; + max_head_length = config->Value("max_head_length", 0); + max_description_length = config->Value("max_description_length", 50); + max_meta_description_length = config->Value("max_meta_description_length", 0); + + max_keywords = config->Value("max_keywords", -1); + if (max_keywords < 0) + max_keywords = (int) ((unsigned int) ~1 >> 1); + minimum_word_length = config->Value("minimum_word_length", 3); +} + + +//***************************************************************************** +// Parsable::~Parsable() +// +Parsable::~Parsable() +{ + delete contents; +} + + +//***************************************************************************** +// void Parsable::setContents(char *data, int length) +// This will set the contents of the parsable object. +// +void +Parsable::setContents(char *data, int length) +{ + delete contents; + contents = new String(data, length); +} + +//***************************************************************************** +// void Parsable::addString(char *s, int& wordindex, int slot) +// Add all words in string s in "heading level" slot, incrementing wordindex +// along the way. String s is corrupted. +// +void +Parsable::addString(Retriever& retriever, char *s, int& wordindex, int slot) +{ + char *w = HtWordToken(s); + while (w) + { + if (strlen(w) >= minimum_word_length) + retriever.got_word(w, wordindex++, slot); // slot for img_alt + w = HtWordToken(0); + } + w = '\0'; +} + +//***************************************************************************** +// void Parsable::addKeywordString(char *s, int& wordindex) +// Add all words in string s as keywords, incrementing wordindex +// along the way. String s is corrupted. +// +void +Parsable::addKeywordString(Retriever& retriever, char *s, int& wordindex) +{ + char *w = HtWordToken(s); + while (w) + { + if (strlen(w) >= minimum_word_length && ++keywordsCount <= max_keywords) + retriever.got_word(w, wordindex++, 9); + w = HtWordToken(0); + } + w = '\0'; +} diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Parsable.h b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.h new file mode 100644 index 00000000..7149fe7c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Parsable.h @@ -0,0 +1,57 @@ +// +// Parsable.h +// +// Parsable: Base class for file parsers (HTML, PDF, ExternalParser ...) 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Parsable.h,v 1.10 2004/05/28 13:15:15 lha Exp $ +// + +#ifndef _Parsable_h_ +#define _Parsable_h_ + +#include "htString.h" +#include "Retriever.h" + +class URL; + + +class Parsable +{ +public: + // + // Construction/Destruction + // + Parsable(); + virtual ~Parsable(); + + // + // Main parser interface. + // + virtual void parse(Retriever &retriever, URL &) = 0; + + // + // The rest of the members are used by the Document to provide us + // the data that we contain. + // + virtual void setContents(char *data, int length); + void addString(Retriever& retriever, char *s, int& wordindex, int slot); + void addKeywordString(Retriever& retriever, char *s, int& wordindex); + +protected: + String *contents; + int max_head_length; + int max_description_length; + int max_meta_description_length; + int max_keywords, keywordsCount; + unsigned int minimum_word_length; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc new file mode 100644 index 00000000..e7006fb1 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc @@ -0,0 +1,116 @@ +// +// Plaintext.cc +// +// Plaintext: Parses plaintext files. Not much to do, really. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Plaintext.cc,v 1.20 2004/05/28 13:15:15 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "Plaintext.h" +#include "htdig.h" +#include "htString.h" +#include "WordType.h" + +#include <ctype.h> +#include "defaults.h" + + +//***************************************************************************** +// Plaintext::Plaintext() +// +Plaintext::Plaintext() +{ +} + + +//***************************************************************************** +// Plaintext::~Plaintext() +// +Plaintext::~Plaintext() +{ +} + + +//***************************************************************************** +// void Plaintext::parse(Retriever &retriever, URL &) +// +void +Plaintext::parse(Retriever &retriever, URL &) +{ + if (contents == 0 || contents->length() == 0) + return; + + HtConfiguration* config= HtConfiguration::config(); + unsigned char *position = (unsigned char *) contents->get(); + static int minimumWordLength = config->Value("minimum_word_length", 3); + int wordIndex = 1; + int in_space = 0; + String word; + String head; + + while (*position) + { + word = 0; + + if (HtIsStrictWordChar(*position)) + { + // + // Start of a word. 
Try to find the whole thing
+ //
+ in_space = 0;
+ while (*position && HtIsWordChar(*position))
+ {
+ word << *position;
+ position++;
+ }
+
+ if (head.length() < max_head_length)
+ {
+ head << word;
+ }
+
+ if (word.length() >= minimumWordLength)
+ {
+ retriever.got_word((char*)word, wordIndex++, 0);
+ }
+ }
+
+ if (head.length() < max_head_length)
+ {
+ //
+ // Characters that are not part of a word
+ //
+ if (*position && isspace(*position))
+ {
+ //
+ // Reduce all multiple whitespace to a single space
+ //
+ if (!in_space)
+ {
+ head << ' ';
+ }
+ in_space = 1;
+ }
+ else
+ {
+ head << *position;
+ in_space = 0;
+ }
+ }
+ if (*position)
+ position++;
+ }
+ retriever.got_head((char*)head);
+}
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.h b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.h
new file mode 100644
index 00000000..a6275c41
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.h
@@ -0,0 +1,41 @@
+//
+// Plaintext.h
+//
+// Plaintext: Parses plaintext files. Not much to do, really.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Plaintext.h,v 1.6 2004/05/28 13:15:15 lha Exp $
+//
+#ifndef _Plaintext_h_
+#define _Plaintext_h_
+
+#include "Parsable.h"
+
+class URL;
+
+
+class Plaintext : public Parsable
+{
+public:
+ //
+ // Construction/Destruction
+ //
+ Plaintext();
+ virtual ~Plaintext();
+
+ //
+ // Main parser interface.
+ //
+ virtual void parse(Retriever &retriever, URL &);
+
+private:
+};
+
+#endif
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc
new file mode 100644
index 00000000..13243571
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.cc
@@ -0,0 +1,2013 @@
+//
+// Retriever.cc
+//
+// Retriever: Crawls from a list of URLs and calls the appropriate parsers.
+// The parser notifies the Retriever object that it got something
+// (got_* functions) and the Retriever object feeds the databases
+// and statistics accordingly.
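+// The dig itself is driven by Start(), which cycles over the per-host
+// Server queues until every queue is empty or a signal is caught.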
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Retriever.cc,v 1.94 2004/05/28 13:15:15 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#ifdef _MSC_VER /* _WIN32 */ +# include <sys/types.h> +# include <winsock2.h> +#endif + + +#include "Retriever.h" +#include "htdig.h" +#include "HtWordList.h" +#include "WordRecord.h" +#include "URLRef.h" +#include "Server.h" +#include "Parsable.h" +#include "Document.h" +#include "StringList.h" +#include "WordType.h" +#include "Transport.h" +#include "HtHTTP.h" // For HTTP statistics +#include "md5.h" +#include "defaults.h" + +#ifndef _MSC_VER /* _WIN32 */ +#include <pwd.h> +#endif + +#include <signal.h> +#include <stdio.h> + + +static int noSignal; + +// no_store_phrases: +// If true, only store first occurrence of each word in a document +static bool no_store_phrases; + +//***************************************************************************** +// Retriever::Retriever() +// +Retriever::Retriever(RetrieverLog flags): +words(*(HtConfiguration::config())), +words_to_add (100, 0.75) +{ + HtConfiguration *config = HtConfiguration::config(); + FILE *urls_parsed; + + currenthopcount = 0; + max_hop_count = config->Value("max_hop_count", 999999); + + no_store_phrases = !config->Boolean("store_phrases"); + + // + // Initialize the flags for the various HTML factors + // + + // text_factor + factor[0] = FLAG_TEXT; + // title_factor + factor[1] = FLAG_TITLE; + // heading factor (now generic) + factor[2] = FLAG_HEADING; + factor[3] = FLAG_HEADING; + factor[4] = FLAG_HEADING; + factor[5] = FLAG_HEADING; + factor[6] = FLAG_HEADING; + factor[7] = FLAG_HEADING; + // img alt text + //factor[8] = FLAG_KEYWORDS; + factor[8] = FLAG_TEXT; // treat alt text as plain text, until it has + // its own FLAG and factor. 
+ // keywords factor
+ factor[9] = FLAG_KEYWORDS;
+ // META description factor
+ factor[10] = FLAG_DESCRIPTION;
+ factor[11] = FLAG_AUTHOR;
+
+ doc = new Document();
+ minimumWordLength = config->Value("minimum_word_length", 3);
+
+ log = flags;
+ // if in restart mode
+ if (Retriever_noLog != log)
+ {
+ String filelog = config->Find("url_log");
+ char buffer[1024];
+ int l;
+
+ urls_parsed = fopen((char *) filelog, "r");
+ if (urls_parsed != 0)
+ {
+ // read all urls discovered but not fetched before
+ while (fgets(buffer, sizeof(buffer), urls_parsed))
+ {
+ l = strlen(buffer);
+ buffer[l - 1] = 0;
+ Initial(buffer, 2);
+ }
+ fclose(urls_parsed);
+ }
+ unlink((char *) filelog);
+ }
+
+ check_unique_md5 = config->Boolean("check_unique_md5", 0);
+ check_unique_date = config->Boolean("check_unique_date", 0);
+
+ d_md5 = 0;
+ if (check_unique_md5)
+ {
+ d_md5 = Database::getDatabaseInstance(DB_HASH);
+
+ if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK)
+ {
+ cerr << "DocumentDB::Open: " << config->Find("md5_db") << " " << strerror(errno) << "\n";
+ }
+ }
+
+}
+
+
+//*****************************************************************************
+// Retriever::~Retriever()
+//
+Retriever::~Retriever()
+{
+ if (d_md5)
+ d_md5->Close();
+ delete doc;
+}
+
+
+//*****************************************************************************
+// void Retriever::setUsernamePassword(const char *credentials)
+//
+void Retriever::setUsernamePassword(const char *credentials)
+{
+ doc->setUsernamePassword(credentials);
+}
+
+
+//*****************************************************************************
+// void Retriever::Initial(const String &list, int from)
+// Add the URL(s) in the given list to the list of URLs to visit.
+// Since URLs are stored on a per-server basis, we first need to find
+// the correct server to add the URL's path to.
+//
+// from == 0 urls in db.docs and no db.log
+// from == 1 urls in start_url; add url only if not already in the list
+// from == 2 add url from db.log
+// from == 3 urls in db.docs and there was a db.log
+//
+void Retriever::Initial(const String & list, int from)
+{
+ //
+ // Split the list of urls up into individual urls.
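+ // Each token becomes a URL; its signature (roughly host:port) keys
+ // the 'servers' hash, so a Server object, and with it a robots.txt
+ // lookup, is created only for the first URL seen on a given host.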
+ //
+ StringList tokens(list, " \t");
+ String sig;
+ String url;
+ Server *server;
+
+ for (int i = 0; i < tokens.Count(); i++)
+ {
+ URL u(tokens[i]);
+ url = u.get(); // get before u.signature() resolves aliases
+ server = (Server *) servers[u.signature()];
+ if (debug > 2)
+ cout << "\t" << from << ":" << (int) log << ":" << url;
+ if (!server)
+ {
+ String robotsURL = u.signature();
+ robotsURL << "robots.txt";
+ StringList *localRobotsFile = GetLocal(robotsURL);
+
+ server = new Server(u, localRobotsFile);
+ servers.Add(u.signature(), server);
+ delete localRobotsFile;
+ }
+
+ if (from && visited.Exists(url))
+ {
+ if (debug > 2)
+ cout << " skipped" << endl;
+ continue;
+ }
+ else if (IsValidURL(url) != 1)
+ {
+ if (debug > 2)
+ cout << endl;
+ continue;
+ }
+
+ if (Retriever_noLog == log || from != 3)
+ {
+ if (debug > 2)
+ cout << " pushed";
+ server->push(u.get(), 0, 0, IsLocalURL(url.get()));
+ }
+ if (debug > 2)
+ cout << endl;
+ visited.Add(url, 0);
+ }
+}
+
+
+//*****************************************************************************
+// void Retriever::Initial(List &list, int from)
+//
+void Retriever::Initial(List & list, int from)
+{
+ list.Start_Get();
+ String *str;
+
+ // from == 0 is an optimisation for pushing urls in update mode,
+ // assuming that
+ // 1) there are many more urls in docdb
+ // 2) they're pushed first
+ // 3) there's no duplicate url in docdb
+ // then they don't need to be checked against already pushed urls
+ // But 2) can be false with -l option
+ //
+ // FIXME: it's nasty; what has to be tested is:
+ // we have urls to push from db.docs, but do we already have them in
+ // db.log? This relies on a side effect of 'visited', on urls in
+ // db.docs only being pushed via this method, and on db.log being
+ // pushed first, db.docs second, start_urls third!
+ //
+ if (!from && visited.Count())
+ {
+ from = 3;
+ }
+ while ((str = (String *) list.Get_Next()))
+ {
+ Initial(str->get(), from);
+ }
+}
+
+//*****************************************************************************
+//
+static void sigexit(int)
+{
+ noSignal = 0; // don't exit here... just set the flag.
+}
+
+static void sigpipe(int)
+{
+}
+
+//*****************************************************************************
+// static void sig_handlers
+// initialise signal handlers
+//
+static void sig_handlers(void)
+{
+#ifndef _MSC_VER /* _WIN32 */
+ //POSIX SIGNALS
+ struct sigaction action;
+
+ /* SIGINT, SIGQUIT, SIGTERM, SIGHUP */
+ action.sa_handler = sigexit;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ if (sigaction(SIGINT, &action, NULL) != 0)
+ reportError("Cannot install SIGINT handler\n");
+ if (sigaction(SIGQUIT, &action, NULL) != 0)
+ reportError("Cannot install SIGQUIT handler\n");
+ if (sigaction(SIGTERM, &action, NULL) != 0)
+ reportError("Cannot install SIGTERM handler\n");
+ if (sigaction(SIGHUP, &action, NULL) != 0)
+ reportError("Cannot install SIGHUP handler\n");
+#else
+ //ANSI C signal handling - Limited to supported Windows signals.
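+ // (Windows has no SIGQUIT or SIGHUP, so only SIGINT and SIGTERM
+ // are hooked here.)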
+ signal(SIGINT, sigexit);
+ signal(SIGTERM, sigexit);
+#endif //_MSC_VER /* _WIN32 */
+}
+
+
+
+static void sig_phandler(void)
+{
+#ifndef _MSC_VER /* _WIN32 */
+ struct sigaction action;
+
+ sigemptyset(&action.sa_mask);
+ action.sa_handler = sigpipe;
+ action.sa_flags = SA_RESTART;
+ if (sigaction(SIGPIPE, &action, NULL) != 0)
+ reportError("Cannot install SIGPIPE handler\n");
+#endif //_MSC_VER /* _WIN32 */
+}
+
+
+//*****************************************************************************
+// static void win32_check_messages
+// Check WIN32 messages!
+//
+#ifdef _MSC_VER /* _WIN32 */
+static void win32_check_messages(void)
+{
+// NEAL - NEEDS FINISHING/TESTING
+#if 0
+ MSG msg = {0, 0, 0, 0};
+ int cDown = 0;
+ int controlDown = 0;
+
+ if( GetMessage(&msg, 0, 0, 0) )
+ {
+
+ switch(msg.message)
+ {
+ case WM_KEYDOWN:
+ {
+ if(LOWORD(msg.message)== 17)
+ controlDown = 1;
+ else if(LOWORD(msg.message) == 67)
+ {
+ cDown = 1;
+ }
+ }
+ break;
+ case WM_KEYUP:
+ {
+ if(LOWORD(msg.message) == 17)
+ controlDown = 0;
+ else if(LOWORD(msg.message) == 67)
+ cDown = 0;
+ }
+ break;
+ }
+ }
+
+ DispatchMessage(&msg);
+#endif
+}
+#endif //_MSC_VER /* _WIN32 */
+
+
+//*****************************************************************************
+// void Retriever::Start()
+// This is the main loop of the retriever. We will go through the
+// list of paths stored for each server. While parsing the
+// retrieved documents, new paths will be added to the servers. We
+// return if no more paths need to be retrieved.
+//
+void Retriever::Start()
+{
+ //
+ // Main digger loop. The todo list should initially have the start
+ // URL and all the URLs which were seen in a previous dig. The
+ // loop will continue as long as there are more URLs to visit.
+ //
+ int more = 1;
+ Server *server;
+ URLRef *ref;
+
+ HtConfiguration *config = HtConfiguration::config();
+
+ //
+ // Always install the signal handlers. The delay bothers me,
+ // but a bad db is worse.
+ //
+ if (Retriever_noLog != log)
+ {
+ sig_handlers();
+ }
+ sig_phandler();
+ noSignal = 1;
+
+
+///////
+ // Main loop. We keep on retrieving until a signal is received
+ // or all the servers' queues are empty.
+///////
+
+#ifdef _MSC_VER /* _WIN32 */
+ win32_check_messages();
+#endif
+
+ while (more && noSignal)
+ {
+ more = 0;
+
+ //
+ // Go through all the current servers in sequence.
+ // If they support persistent connections, we keep on popping
+ // from the same server queue until it's empty or we reach a maximum
+ // number of consecutive requests ("max_connection_requests").
+ // Or the loop may continue indefinitely,
+ // if we set "max_connection_requests" to -1.
+ // If the server doesn't support persistent connections, we take
+ // only one URL from it, then we skip to the next server.
+ //
+ // Since 15.05.02: even when persistent connections are activated
+ // we should wait for a 'server_wait_time' number of seconds
+ // after the 'max_connection_requests' value has been reached.
+ //
+
+ // Let's position at the beginning
+ servers.Start_Get();
+
+ int count;
+
+ // Maximum number of repeated requests with the same
+ // TCP connection (so on the same Server:Port).
+
+ int max_connection_requests;
+
+#ifdef _MSC_VER /* _WIN32 */
+ win32_check_messages();
+#endif
+
+ while ((server = (Server *) servers.Get_NextElement()) && noSignal)
+ {
+ if (debug > 1)
+ cout << "pick: " << server->host() << ", # servers = " << servers.Count() << endl;
+
+ // We already know if a server supports HTTP pers.
connections, + // because we asked it for the robots.txt file (constructor of + // the class). + + // If the Server doesn't support persistent connections + // we turn it down to 1. + + if (server->IsPersistentConnectionAllowed()) + { + + // Let's check for a '0' value (out of range) + // If set, we change it to 1. + + if (config->Value("server", server->host(), "max_connection_requests") == 0) + max_connection_requests = 1; + else + max_connection_requests = + config->Value("server", server->host(), "max_connection_requests"); + + if (debug > 2) + { + + cout << "> " << server->host() << " supports HTTP persistent connections"; + + if (max_connection_requests == -1) + cout << " (" << "infinite" << ")" << endl; + else + cout << " (" << max_connection_requests << ")" << endl; + + } + + } + else + { + + // No HTTP persistent connections. So we request only 1 document. + + max_connection_requests = 1; + + if (debug > 2) + cout << "> " << server->host() << " with a traditional HTTP connection" << endl; + + } + + + count = 0; + +#ifdef _MSC_VER /* _WIN32 */ + win32_check_messages(); +#endif + + + while (((max_connection_requests == -1) || + (count < max_connection_requests)) && (ref = server->pop()) && noSignal) + { + count++; + + // + // We have a URL to index, now. We need to register the + // fact that we are not done yet by setting the 'more' + // variable. So, we have to restart scanning the queue. + // + + more = 1; + + // + // Deal with the actual URL. + // We'll check with the server to see if we need to sleep() + // before parsing it. + // + + parse_url(*ref); + delete ref; + + // We reached the maximum number of connections (either with + // or without persistent connections) and we must pause and + // respect the 'net ethic'. + if ((max_connection_requests - count) == 0) + server->delay(); // This will pause if needed + // and reset the time + +#ifdef _MSC_VER /* _WIN32 */ + win32_check_messages(); +#endif + + } + +#ifdef _MSC_VER /* _WIN32 */ + win32_check_messages(); +#endif + + } + } + +#ifdef _MSC_VER /* _WIN32 */ + win32_check_messages(); +#endif + + + // if we exited on signal + if (Retriever_noLog != log && !noSignal) + { + FILE *urls_parsed; + String filelog = config->Find("url_log"); + // save url seen but not fetched + urls_parsed = fopen((char *) filelog, "w"); + if (0 == urls_parsed) + { + reportError(form("Unable to create URL log file '%s'", filelog.get())); + } + else + { + servers.Start_Get(); + while ((server = (Server *) servers.Get_NextElement())) + { + while (NULL != (ref = server->pop())) + { + fprintf(urls_parsed, "%s\n", (const char *) ref->GetURL().get()); + delete ref; + } + } + fclose(urls_parsed); + } + } + words.Close(); +} + + +//***************************************************************************** +// void Retriever::parse_url(URLRef &urlRef) +// +void Retriever::parse_url(URLRef & urlRef) +{ + HtConfiguration *config = HtConfiguration::config(); + URL url; + DocumentRef *ref; + int old_document; + time_t date; + static int index = 0; + static int local_urls_only = config->Boolean("local_urls_only"); + static int mark_dead_servers = config->Boolean("ignore_dead_servers"); + Server *server; + + url.parse(urlRef.GetURL().get()); + + currenthopcount = urlRef.GetHopCount(); + + ref = docs[url.get()]; // It might be nice to have just an Exists() here + if (ref) + { + // + // We already have an entry for this document in our database. + // This means we can get the document ID and last modification + // time from there. 
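+ // (Editor's aside: the branch below is a classic look-up-or-create
+ // step over the document database. A sketch of the same idea with
+ // standard containers, using a hypothetical DocRecord in place of
+ // htdig's DocumentRef; not part of the original sources:)
+#if 0
+#include <map>
+#include <string>
+
+struct DocRecord { int id; int backlinks; };
+
+static std::map<std::string, DocRecord> registry;
+static int next_id = 1;
+
+// Return the existing record for `url`, or create one with a fresh ID.
+static DocRecord& lookup_or_create(const std::string& url)
+{
+    std::map<std::string, DocRecord>::iterator it = registry.find(url);
+    if (it != registry.end())
+    {
+        it->second.backlinks++;      // seen again: one more incoming link
+        return it->second;
+    }
+    DocRecord rec = { next_id++, 1 }; // first sighting: new ID, one backlink
+    return registry.insert(std::make_pair(url, rec)).first->second;
+}
+#endif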
+ // + current_id = ref->DocID(); + date = ref->DocTime(); + if (ref->DocAccessed()) + old_document = 1; + else // we haven't retrieved it yet, so we only have the first link + old_document = 0; + ref->DocBackLinks(ref->DocBackLinks() + 1); // we had a new link + ref->DocAccessed(time(0)); + ref->DocState(Reference_normal); + currenthopcount = ref->DocHopCount(); + } + else + { + // + // Never seen this document before. We need to create an + // entry for it. This implies that it gets a new document ID. + // + date = 0; + current_id = docs.NextDocID(); + ref = new DocumentRef; + ref->DocID(current_id); + ref->DocURL(url.get()); + ref->DocState(Reference_normal); + ref->DocAccessed(time(0)); + ref->DocHopCount(currenthopcount); + ref->DocBackLinks(1); // We had to have a link to get here! + old_document = 0; + } + + word_context.DocID(ref->DocID()); + + if (debug > 0) + { + // + // Display progress + // + cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << url.get() << ": "; + cout.flush(); + } + + // Reset the document to clean out any old data + doc->Reset(); + doc->Url(url.get()); + doc->Referer(urlRef.GetReferer().get()); + + base = doc->Url(); + + // Retrieve document, first trying local file access if possible. + Transport::DocStatus status; + server = (Server *) servers[url.signature()]; + StringList *local_filenames = GetLocal(url.get()); + if (local_filenames) + { + if (debug > 1) + cout << "Trying local files" << endl; + status = doc->RetrieveLocal(date, local_filenames); + if (status == Transport::Document_not_local) + { + if (debug > 1) + cout << "Local retrieval failed, trying HTTP" << endl; + if (server && !server->IsDead() && !local_urls_only) + status = doc->Retrieve(server, date); + else + status = Transport::Document_no_host; + } + delete local_filenames; + } + else if (server && !server->IsDead() && !local_urls_only) + status = doc->Retrieve(server, date); + else + status = Transport::Document_no_host; + + current_ref = ref; + + // + // Determine what to do by looking at the status code returned by + // the Document retrieval process. + // + + String shash; + String sx; + char bhash[16]; + time_t ddate; + + switch (status) + { + + case Transport::Document_ok: + trackWords = 1; + + if (check_unique_md5) + { + if (doc->StoredLength() > 0) + { + if (check_unique_date) + { + ddate = doc->ModTime(); + if (ddate < time(NULL) - 10) + { // Unknown date was set to current time + md5(bhash, doc->Contents(), doc->StoredLength(), &ddate, debug); + } + else + { + md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug); + } + } + else + md5(bhash, doc->Contents(), doc->StoredLength(), 0, debug); + + shash.append(bhash, MD5_LENGTH); + d_md5->Get(shash, sx); + + if (!sx.empty()) + { + if (debug > 1) + { + cout << " Detected duplicate by md5 hash" << endl; + } + words.Skip(); + break; // Duplicate - don't index + } + else + { + d_md5->Put(shash, "x"); + } + + } + } + + if (old_document) + { + if (doc->ModTime() == ref->DocTime()) + { + words.Skip(); + if (debug) + cout << " retrieved but not changed" << endl; + words.Skip(); + break; + } + // + // Since we already had a record of this document and + // we were able to retrieve it, it must have changed + // since the last time we scanned it. This means that + // we need to assign a new document ID to it and mark + // the old one as obsolete. 
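+ // (Editor's aside on the duplicate detection a few lines up: the
+ // crawler keys a small database by a digest of the page body and skips
+ // any body whose digest is already stored. An illustrative sketch of
+ // the idea only, with std::hash standing in for the real MD5 digest;
+ // std::hash is not cryptographic and collides far more readily, so it
+ // merely demonstrates the shape of the check:)
+#if 0
+#include <functional>
+#include <set>
+#include <string>
+
+static std::set<size_t> seen_digests;
+
+// True if an identical body has been indexed before.
+static bool is_duplicate_body(const std::string& body)
+{
+    size_t digest = std::hash<std::string>()(body);
+    return !seen_digests.insert(digest).second; // insert fails => seen
+}
+#endif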
+ // + words.Skip(); + int backlinks = ref->DocBackLinks(); + ref->DocState(Reference_obsolete); + docs.Add(*ref); + delete ref; + + current_id = docs.NextDocID(); + word_context.DocID(current_id); + ref = new DocumentRef; + ref->DocID(current_id); + ref->DocURL(url.get()); + ref->DocState(Reference_normal); + ref->DocAccessed(time(0)); + ref->DocHopCount(currenthopcount); + ref->DocBackLinks(backlinks); + if (debug) + cout << " (changed) "; + } + RetrievedDocument(*doc, url.get(), ref); + // Hey! If this document is marked noindex, don't even bother + // adding new words. Mark this as gone and get rid of it! + if (ref->DocState() == Reference_noindex) + { + if (debug > 1) + cout << " ( " << ref->DocURL() << " ignored)"; + words.Skip(); + } + else + words.Flush(); + if (debug) + cout << " size = " << doc->Length() << endl; + + if (urls_seen) + { + fprintf(urls_seen, "%s|%d|%s|%d|%d|1\n", + (const char *) url.get(), doc->Length(), doc->ContentType(), + (int) doc->ModTime(), currenthopcount); + } + break; + + case Transport::Document_not_changed: + if (debug) + cout << " not changed" << endl; + words.Skip(); + break; + + case Transport::Document_not_found: + ref->DocState(Reference_not_found); + if (debug) + cout << " not found" << endl; + recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_not_found); + words.Skip(); + break; + + case Transport::Document_no_host: + ref->DocState(Reference_not_found); + if (debug) + cout << " host not found" << endl; + recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_host); + words.Skip(); + + // Mark the server as being down + if (server && mark_dead_servers) + server->IsDead(1); + break; + + case Transport::Document_no_port: + ref->DocState(Reference_not_found); + if (debug) + cout << " host not found (port)" << endl; + recordNotFound(url.get(), urlRef.GetReferer().get(), Transport::Document_no_port); + words.Skip(); + + // Mark the server as being down + if (server && mark_dead_servers) + server->IsDead(1); + break; + + case Transport::Document_not_parsable: + ref->DocState(Reference_noindex); + if (debug) + cout << " not Parsable" << endl; + words.Skip(); + break; + + case Transport::Document_redirect: + if (debug) + cout << " redirect" << endl; + ref->DocState(Reference_obsolete); + words.Skip(); + got_redirect(doc->Redirected(), ref, (urlRef.GetReferer()).get()); + break; + + case Transport::Document_not_authorized: + ref->DocState(Reference_not_found); + if (debug) + cout << " not authorized" << endl; + words.Skip(); + break; + + case Transport::Document_not_local: + ref->DocState(Reference_not_found); + if (debug) + cout << " not local" << endl; + words.Skip(); + break; + + case Transport::Document_no_header: + ref->DocState(Reference_not_found); + if (debug) + cout << " no header" << endl; + words.Skip(); + break; + + case Transport::Document_connection_down: + ref->DocState(Reference_not_found); + if (debug) + cout << " connection down" << endl; + words.Skip(); + break; + + case Transport::Document_no_connection: + ref->DocState(Reference_not_found); + if (debug) + cout << " no connection" << endl; + words.Skip(); + break; + + case Transport::Document_not_recognized_service: + ref->DocState(Reference_not_found); + if (debug) + cout << " service not recognized" << endl; + + // Mark the server as being down + if (server && mark_dead_servers) + server->IsDead(1); + words.Skip(); + break; + + case Transport::Document_other_error: + ref->DocState(Reference_not_found); + if (debug) + cout << " other error" 
<< endl; + words.Skip(); + break; + } + docs.Add(*ref); + delete ref; +} + + +//***************************************************************************** +// void Retriever::RetrievedDocument(Document &doc, const String &url, DocumentRef *ref) +// We found a document that needs to be parsed. Since we don't know the +// document type, we'll let the Document itself return an appropriate +// Parsable object which we can call upon to parse the document contents. +// +void Retriever::RetrievedDocument(Document & doc, const String & url, DocumentRef * ref) +{ + n_links = 0; + current_ref = ref; + current_title = 0; + word_context.Anchor(0); + current_time = 0; + current_head = 0; + current_meta_dsc = 0; + + // + // Create a parser object and let it have a go at the document. + // We will pass ourselves as a callback object for all the got_*() + // routines. + // This will generate the Parsable object as a specific parser + // + Parsable *parsable = doc.getParsable(); + if (parsable) + parsable->parse(*this, *base); + else + { // If we didn't get a parser, then we should get rid of this! + ref->DocState(Reference_noindex); + return; + } + + // If just storing the first occurrence of each word in a document, + // we must now flush the words we saw in that document + if (no_store_phrases) + { + DictionaryCursor cursor; + char *key; + HtWordReference wordRef; + for (words_to_add.Start_Get (cursor); + (key = words_to_add.Get_Next(cursor)); ) + { + word_entry *entry = (word_entry*) (words_to_add [key]); + + wordRef.Location(entry->location); + wordRef.Flags(entry->flags); + wordRef.Word(key); + words.Replace(WordReference::Merge(wordRef, entry->context)); + // How do I clean up properly? + delete entry; + } + words_to_add.Release (); + } + + // + // We don't need to dispose of the parsable object since it will + // automatically be reused. + // + + // + // Update the document reference + // + ref->DocHead((char *) current_head); + ref->DocMetaDsc((char *) current_meta_dsc); + if (current_time == 0) + ref->DocTime(doc.ModTime()); + else + ref->DocTime(current_time); + ref->DocTitle((char *) current_title); + ref->DocSize(doc.Length()); + ref->DocAccessed(time(0)); + ref->DocLinks(n_links); +} + + +//***************************************************************************** +// int Retriever::Need2Get(const String &u) +// Return TRUE if we need to retrieve the given url. This will +// check the list of urls we have already visited. +// +int Retriever::Need2Get(const String & u) +{ + static String url; + url = u; + + return !visited.Exists(url); +} + + + +//***************************************************************************** +// int Retriever::IsValidURL(const String &u) +// Return TRUE if we need to retrieve the given url. We will check +// for limits here. 
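+// (Editor's aside: the extension screening in this routine lowercases
+// each configured extension and the URL's final ".xyz" suffix before
+// comparing, so bad_extensions/valid_extensions are effectively
+// case-insensitive. A compact sketch of that one check, assuming the
+// list stores extensions with their leading dot, e.g. ".gif"; names are
+// hypothetical:)
+#if 0
+#include <algorithm>
+#include <cctype>
+#include <set>
+#include <string>
+
+static std::string lower(std::string s)
+{
+    std::transform(s.begin(), s.end(), s.begin(),
+                   [](unsigned char c) { return std::tolower(c); });
+    return s;
+}
+
+// Reject when the final path component's extension is on the bad list.
+static bool has_bad_extension(const std::string& path,
+                              const std::set<std::string>& bad)
+{
+    std::string::size_type dot = path.rfind('.');
+    if (dot == std::string::npos || path.find('/', dot) != std::string::npos)
+        return false;               // no dot in the final component
+    return bad.count(lower(path.substr(dot))) != 0;
+}
+#endif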
+// +int Retriever::IsValidURL(const String & u) +{ + HtConfiguration *config = HtConfiguration::config(); + Dictionary invalids; + Dictionary valids; + URL aUrl(u); + StringList tmpList; + + // A list of bad extensions, separated by spaces or tabs + String t = config->Find(&aUrl, "bad_extensions"); + String lowerp; + char *p = strtok(t, " \t"); + while (p) + { + // Extensions are case insensitive + lowerp = p; + lowerp.lowercase(); + invalids.Add(lowerp, 0); + p = strtok(0, " \t"); + } + + // + // Valid extensions are performed similarly + // + // A list of valid extensions, separated by spaces or tabs + + t = config->Find(&aUrl, "valid_extensions"); + p = strtok(t, " \t"); + while (p) + { + // Extensions are case insensitive + lowerp = p; + lowerp.lowercase(); + valids.Add(lowerp, 0); + p = strtok(0, " \t"); + } + + static String url; + url = u; + + // + // If the URL contains any of the patterns in the exclude list, + // mark it as invalid + // + String exclude_urls = config->Find(&aUrl, "exclude_urls"); + static String *prevexcludes = 0; + static HtRegexList *excludes = 0; + if (!excludes || !prevexcludes || prevexcludes->compare(exclude_urls) != 0) + { + if (!excludes) + excludes = new HtRegexList; + if (prevexcludes) + delete prevexcludes; + prevexcludes = new String(exclude_urls); + tmpList.Create(exclude_urls, " \t"); + excludes->setEscaped(tmpList, config->Boolean("case_sensitive")); + tmpList.Destroy(); + } + if (excludes->match(url, 0, 0) != 0) + { + if (debug > 2) + cout << endl << " Rejected: item in exclude list "; + return (HTDIG_ERROR_TESTURL_EXCLUDE); + } + + // + // If the URL has a query string and it is in the bad query list + // mark it as invalid + // + String bad_querystr = config->Find(&aUrl, "bad_querystr"); + static String *prevbadquerystr = 0; + static HtRegexList *badquerystr = 0; + if (!badquerystr || !prevbadquerystr || prevbadquerystr->compare(bad_querystr) != 0) + { + if (!badquerystr) + badquerystr = new HtRegexList; + if (prevbadquerystr) + delete prevbadquerystr; + prevbadquerystr = new String(bad_querystr); + tmpList.Create(bad_querystr, " \t"); + badquerystr->setEscaped(tmpList, config->Boolean("case_sensitive")); + tmpList.Destroy(); + } + char *ext = strrchr((char *) url, '?'); + if (ext && badquerystr->match(ext, 0, 0) != 0) + { + if (debug > 2) + cout << endl << " Rejected: item in bad query list "; + return (HTDIG_ERROR_TESTURL_BADQUERY); + } + + // + // See if the file extension is in the list of invalid ones + // + String urlpath = url.get(); + int parm = urlpath.indexOf('?'); // chop off URL parameter + if (parm >= 0) + urlpath.chop(urlpath.length() - parm); + ext = strrchr((char *) urlpath.get(), '.'); + String lowerext; + if (ext && strchr(ext, '/')) // Ignore a dot if it's not in the + ext = NULL; // final component of the path. + if (ext) + { + lowerext.set(ext); + lowerext.lowercase(); + if (invalids.Exists(lowerext)) + { + if (debug > 2) + cout << endl << " Rejected: Extension is invalid!"; + return (HTDIG_ERROR_TESTURL_EXTENSION); + } + } + // + // Or NOT in the list of valid ones + // + if (ext && valids.Count() > 0 && !valids.Exists(lowerext)) + { + if (debug > 2) + cout << endl << " Rejected: Extension is not valid!"; + return (HTDIG_ERROR_TESTURL_EXTENSION2); + } + + // + // If none of the limits is met, we disallow the URL + // + if (limits.match(url, 1, 0) == 0) + { + if (debug > 1) + cout << endl << " Rejected: URL not in the limits! 
"; + return (HTDIG_ERROR_TESTURL_LIMITS); + } + // + // Likewise if not in list of normalized urls + // + // Warning! + // should be last in checks because of aUrl normalization + // + // signature() implicitly normalizes the URL. Be efficient... + Server *server = (Server *) servers[aUrl.signature()]; +// aUrl.normalize(); + if (limitsn.match(aUrl.get(), 1, 0) == 0) + { + if (debug > 2) + cout << endl << " Rejected: not in \"limit_normalized\" list!"; + return (HTDIG_ERROR_TESTURL_LIMITSNORM); + } + + // + // After that gauntlet, check to see if the server allows it + // (robots.txt) + // + if (server && server->IsDisallowed(url) != 0) + { + if (debug > 2) + cout << endl << " Rejected: forbidden by server robots.txt!"; + return (HTDIG_ERROR_TESTURL_ROBOT_FORBID); + } + + return (1); +} + + +//***************************************************************************** +// StringList* Retriever::GetLocal(const String &url) +// Returns a list of strings containing the (possible) local filenames +// of the given url, or 0 if it's definitely not local. +// THE CALLER MUST FREE THE STRINGLIST AFTER USE! +// Returned strings are not hex encoded. +// +StringList *Retriever::GetLocal(const String & strurl) +{ + HtConfiguration *config = HtConfiguration::config(); + static StringList *prefixes = 0; + String url = strurl; + + static StringList *paths = 0; + StringList *defaultdocs = 0; + URL aUrl(url); + url = aUrl.get(); // make sure we look at a parsed URL + + // + // Initialize prefix/path list if this is the first time. + // The list is given in format "prefix1=path1 prefix2=path2 ..." + // + if (!prefixes) + { + prefixes = new StringList(); + paths = new StringList(); + + String t = config->Find("local_urls"); + char *p = strtok(t, " \t"); + while (p) + { + char *path = strchr(p, '='); + if (!path) + { + p = strtok(0, " \t"); + continue; + } + *path++ = '\0'; + String *pre = new String(p); + decodeURL(*pre); + prefixes->Add(pre); + String *pat = new String(path); + decodeURL(*pat); + paths->Add(pat); + p = strtok(0, " \t"); + } + } + if (!config->Find(&aUrl, "local_default_doc").empty()) + { + defaultdocs = new StringList(); + String t = config->Find(&aUrl, "local_default_doc"); + char *p = strtok(t, " \t"); + while (p) + { + String *def = new String(p); + decodeURL(*def); + defaultdocs->Add(def); + p = strtok(0, " \t"); + } + if (defaultdocs->Count() == 0) + { + delete defaultdocs; + defaultdocs = 0; + } + } + + // Begin by hex-decoding URL... + String hexurl = url; + decodeURL(hexurl); + url = hexurl.get(); + + // Check first for local user... + if (strchr(url.get(), '~')) + { + StringList *local = GetLocalUser(url, defaultdocs); + if (local) + { + if (defaultdocs) + delete defaultdocs; + return local; + } + } + + // This shouldn't happen, but check anyway... 
+ if (strstr(url.get(), "..")) + return 0; + + String *prefix, *path; + String *defaultdoc; + StringList *local_names = new StringList(); + prefixes->Start_Get(); + paths->Start_Get(); + while ((prefix = (String *) prefixes->Get_Next())) + { + path = (String *) paths->Get_Next(); + if (mystrncasecmp((char *) *prefix, (char *) url, prefix->length()) == 0) + { + int l = strlen(url.get()) - prefix->length() + path->length() + 4; + String *local = new String(*path, l); + *local += &url[prefix->length()]; + if (local->last() == '/' && defaultdocs) + { + defaultdocs->Start_Get(); + while ((defaultdoc = (String *) defaultdocs->Get_Next())) + { + String *localdefault = + new String(*local, local->length() + defaultdoc->length() + 1); + localdefault->append(*defaultdoc); + local_names->Add(localdefault); + } + delete local; + } + else + local_names->Add(local); + } + } + if (local_names->Count() > 0) + { + if (defaultdocs) + delete defaultdocs; + return local_names; + } + + if (defaultdocs) + delete defaultdocs; + delete local_names; + return 0; +} + + +//***************************************************************************** +// StringList* Retriever::GetLocalUser(const String &url, StringList *defaultdocs) +// If the URL has ~user part, return a list of strings containing the +// (possible) local filenames of the given url, or 0 if it's +// definitely not local. +// THE CALLER MUST FREE THE STRINGLIST AFTER USE! +// +StringList *Retriever::GetLocalUser(const String & url, StringList * defaultdocs) +{ +// NOTE: Native Windows does not have this contruct for the user Web files +#ifndef _MSC_VER /* _WIN32 */ + HtConfiguration *config = HtConfiguration::config(); + static StringList *prefixes = 0, *paths = 0, *dirs = 0; + static Dictionary home_cache; + URL aUrl(url); + + // + // Initialize prefix/path list if this is the first time. + // The list is given in format "prefix1=path1,dir1 ..." + // If path is zero-length, user's home directory is looked up. + // + if (!prefixes) + { + prefixes = new StringList(); + paths = new StringList(); + dirs = new StringList(); + String t = config->Find("local_user_urls"); + char *p = strtok(t, " \t"); + while (p) + { + char *path = strchr(p, '='); + if (!path) + { + p = strtok(0, " \t"); + continue; + } + *path++ = '\0'; + char *dir = strchr(path, ','); + if (!dir) + { + p = strtok(0, " \t"); + continue; + } + *dir++ = '\0'; + String *pre = new String(p); + decodeURL(*pre); + prefixes->Add(pre); + String *pat = new String(path); + decodeURL(*pat); + paths->Add(pat); + String *ptd = new String(dir); + decodeURL(*ptd); + dirs->Add(ptd); + p = strtok(0, " \t"); + } + } + + // Can we do anything about this? 
+ if (!strchr(url, '~') || !prefixes->Count() || strstr(url, "..")) + return 0; + + // Split the URL to components + String tmp = url; + char *name = strchr((char *) tmp, '~'); + *name++ = '\0'; + char *rest = strchr(name, '/'); + if (!rest || (rest - name <= 1) || (rest - name > 32)) + return 0; + *rest++ = '\0'; + + // Look it up in the prefix/path/dir table + prefixes->Start_Get(); + paths->Start_Get(); + dirs->Start_Get(); + String *prefix, *path, *dir; + String *defaultdoc; + StringList *local_names = new StringList(); + while ((prefix = (String *) prefixes->Get_Next())) + { + path = (String *) paths->Get_Next(); + dir = (String *) dirs->Get_Next(); + if (mystrcasecmp((char *) *prefix, (char *) tmp) != 0) + continue; + + String *local = new String; + // No path, look up home directory + if (path->length() == 0) + { + String *home = (String *) home_cache[name]; + if (!home) + { + struct passwd *passwd = getpwnam(name); + if (passwd) + { + home = new String(passwd->pw_dir); + home_cache.Add(name, home); + } + } + if (home) + *local += *home; + else + continue; + } + else + { + *local += *path; + *local += name; + } + *local += *dir; + *local += rest; + if (local->last() == '/' && defaultdocs) + { + defaultdocs->Start_Get(); + while ((defaultdoc = (String *) defaultdocs->Get_Next())) + { + String *localdefault = new String(*local, local->length() + defaultdoc->length() + 1); + localdefault->append(*defaultdoc); + local_names->Add(localdefault); + } + delete local; + } + else + local_names->Add(local); + } + + if (local_names->Count() > 0) + return local_names; + + delete local_names; +#endif //_MSC_VER /* _WIN32 */ + + return 0; +} + + +//***************************************************************************** +// int Retriever::IsLocalURL(const String &url) +// Returns 1 if the given url has a (possible) local filename +// or 0 if it's definitely not local. +// +int Retriever::IsLocalURL(const String & url) +{ + int ret; + + StringList *local_filename = GetLocal(url); + ret = (local_filename != 0); + if (local_filename) + delete local_filename; + + return ret; +} + + +//***************************************************************************** +// void Retriever::got_word(char *word, int location, int heading) +// The location is normalized to be in the range 0 - 1000. +// +void Retriever::got_word(const char *word, int location, int heading) +{ + if (debug > 3) + cout << "word: " << word << '@' << location << endl; + if (heading >= (int) (sizeof(factor) / sizeof(factor[0])) || heading < 0) + heading = 0; // Assume it's just normal text + if (trackWords && strlen(word) >= (unsigned int) minimumWordLength) + { + String w = word; + HtWordReference wordRef; + + if (no_store_phrases) + { + // Add new word, or mark existing word as also being at + // this heading level + word_entry *entry; + if ((entry = (word_entry*)words_to_add.Find (w)) == NULL) + { + words_to_add.Add(w, new word_entry (location, factor[heading], word_context)); + } else + { + entry->flags |= factor[heading]; + } + } else + { + wordRef.Location(location); + wordRef.Flags(factor[heading]); + wordRef.Word(w); + words.Replace(WordReference::Merge(wordRef, word_context)); + } + + // Check for compound words... 
+ String parts = word; + int added; + int nparts = 1; + do + { + added = 0; + char *start = parts.get(); + char *punctp = 0, *nextp = 0, *p; + char punct; + int n; + while (*start) + { + p = start; + for (n = 0; n < nparts; n++) + { + while (HtIsStrictWordChar((unsigned char) *p)) + p++; + punctp = p; + if (!*punctp && n + 1 < nparts) + break; + while (*p && !HtIsStrictWordChar((unsigned char) *p)) + p++; + if (n == 0) + nextp = p; + } + if (n < nparts) + break; + punct = *punctp; + *punctp = '\0'; + if (*start && (*p || start > parts.get())) + { + w = start; + HtStripPunctuation(w); + if (w.length() >= minimumWordLength) + { + if (no_store_phrases) + { + // Add new word, or mark existing word as also being at + // this heading level + word_entry *entry; + if ((entry = (word_entry*)words_to_add.Find (w)) == NULL) + { + words_to_add.Add(w, new word_entry (location, factor[heading], word_context)); + } else + { + entry->flags |= factor[heading]; + } + } else + { + wordRef.Word(w); + words.Replace(WordReference::Merge(wordRef, word_context)); + } + if (debug > 3) + cout << "word part: " << start << '@' << location << endl; + } + added++; + } + start = nextp; + *punctp = punct; + } + nparts++; + } + while (added > 2); + } +} + + +//***************************************************************************** +// void Retriever::got_title(const char *title) +// +void Retriever::got_title(const char *title) +{ + if (debug > 1) + cout << "\ntitle: " << title << endl; + current_title = title; +} + + +//***************************************************************************** +// void Retriever::got_author(const char *e) +// +void Retriever::got_author(const char *author) +{ + if (debug > 1) + cout << "\nauthor: " << author << endl; + current_ref->DocAuthor(author); +} + + +//***************************************************************************** +// void Retriever::got_time(const char *time) +// +void Retriever::got_time(const char *time) +{ + HtDateTime new_time(current_time); + + if (debug > 1) + cout << "\ntime: " << time << endl; + + // + // As defined by the Dublin Core, this should be YYYY-MM-DD + // In the future, we'll need to deal with the scheme portion + // in case someone picks a different format. + // + new_time.SetFTime(time, "%Y-%m-%d"); + current_time = new_time.GetTime_t(); + + // If we can't convert it, current_time stays the same and we get + // the default--the date returned by the server... +} + +//***************************************************************************** +// void Retriever::got_anchor(const char *anchor) +// +void Retriever::got_anchor(const char *anchor) +{ + if (debug > 2) + cout << "anchor: " << anchor << endl; + current_ref->AddAnchor(anchor); + word_context.Anchor(word_context.Anchor() + 1); +} + + +//***************************************************************************** +// void Retriever::got_image(const char *src) +// +void Retriever::got_image(const char *src) +{ + URL url(src, *base); + const char *image = (const char *) url.get(); + + if (debug > 2) + cout << "image: " << image << endl; + + if (images_seen) + fprintf(images_seen, "%s\n", image); +} + + +//***************************************************************************** +// +void Retriever::got_href(URL & url, const char *description, int hops) +{ + DocumentRef *ref = 0; + Server *server = 0; + int valid_url_code = 0; + + // Rewrite the URL (if need be) before we do anything to it. 
+ url.rewrite(); + + if (debug > 2) + cout << "href: " << url.get() << " (" << description << ')' << endl; + + n_links++; + + if (urls_seen) + fprintf(urls_seen, "%s\n", (const char *) url.get()); + + // + // Check if this URL falls within the valid range of URLs. + // + valid_url_code = IsValidURL(url.get()); + if (valid_url_code > 0) + { + // + // It is valid. Normalize it (resolve cnames for the server) + // and check again... + // + if (debug > 2) + { + cout << "resolving '" << url.get() << "'\n"; + cout.flush(); + } + + url.normalize(); + + // If it is a backlink from the current document, + // just update that field. Writing to the database + // is meaningless, as it will be overwritten. + // Adding it as a new document may even be harmful, as + // that will be a duplicate. This can happen if the + // current document is never referenced before, as in a + // start_url. + + if (strcmp(url.get(), current_ref->DocURL()) == 0) + { + current_ref->DocBackLinks(current_ref->DocBackLinks() + 1); + current_ref->AddDescription(description, words); + } + else + { + + // + // First add it to the document database + // + ref = docs[url.get()]; + // if ref exists we have to call AddDescription even + // if max_hop_count is reached + if (!ref && currenthopcount + hops > max_hop_count) + return; + + if (!ref) + { + // + // Didn't see this one, yet. Create a new reference + // for it with a unique document ID + // + ref = new DocumentRef; + ref->DocID(docs.NextDocID()); + ref->DocHopCount(currenthopcount + hops); + ref->DocURL(url.get()); + } + ref->DocBackLinks(ref->DocBackLinks() + 1); // This one! + ref->AddDescription(description, words); + + // + // If the dig is restricting by hop count, perform the check here + // too + if (currenthopcount + hops > max_hop_count) + { + delete ref; + return; + } + + if (ref->DocHopCount() > currenthopcount + hops) + ref->DocHopCount(currenthopcount + hops); + + docs.Add(*ref); + + // + // Now put it in the list of URLs to still visit. + // + if (Need2Get(url.get())) + { + if (debug > 1) + cout << "\n pushing " << url.get() << endl; + server = (Server *) servers[url.signature()]; + if (!server) + { + // + // Hadn't seen this server, yet. Register it + // + String robotsURL = url.signature(); + robotsURL << "robots.txt"; + StringList *localRobotsFile = GetLocal(robotsURL.get()); + + server = new Server(url, localRobotsFile); + servers.Add(url.signature(), server); + delete localRobotsFile; + } + // + // Let's just be sure we're not pushing an empty URL + // + if (strlen(url.get())) + server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get())); + + String temp = url.get(); + visited.Add(temp, 0); + if (debug) + cout << '+'; + } + else if (debug) + cout << '*'; + delete ref; + } + } + else + { + // + // Not a valid URL + // + if (debug > 1) + cout << "\nurl rejected: (level 1)" << url.get() << endl; + if (debug == 1) + cout << '-'; + + if (urls_seen) + { + fprintf(urls_seen, "%s|||||%d\n", (const char *) url.get(), valid_url_code); + } + + } + if (debug) + cout.flush(); +} + + +//***************************************************************************** +// void Retriever::got_redirect(const char *new_url, DocumentRef *old_ref) +// +void Retriever::got_redirect(const char *new_url, DocumentRef * old_ref, const char *referer) +{ + // First we must piece together the new URL, which may be relative + URL parent(old_ref->DocURL()); + URL url(new_url, parent); + + // Rewrite the URL (if need be) before we do anything to it. 
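+ // (Editor's aside on got_href() above: it is the classic breadth-first
+ // frontier step: skip URLs already in `visited`, refuse anything past
+ // max_hop_count, otherwise queue the URL on its server and mark it
+ // visited. Bare-bones sketch with standard containers, names
+ // hypothetical:)
+#if 0
+#include <queue>
+#include <set>
+#include <string>
+#include <utility>
+
+struct Frontier
+{
+    std::set<std::string> visited;
+    std::queue<std::pair<std::string, int> > todo; // url, hopcount
+    int max_hops;
+
+    void push(const std::string& url, int hops)
+    {
+        if (hops > max_hops || !visited.insert(url).second)
+            return;                   // too deep, or seen before
+        todo.push(std::make_pair(url, hops));
+    }
+};
+#endif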
+ url.rewrite(); + + if (debug > 2) + cout << "redirect: " << url.get() << endl; + + n_links++; + + if (urls_seen) + fprintf(urls_seen, "%s\n", (const char *) url.get()); + + // + // Check if this URL falls within the valid range of URLs. + // + if (IsValidURL(url.get()) > 0) + { + // + // It is valid. Normalize it (resolve cnames for the server) + // and check again... + // + if (debug > 2) + { + cout << "resolving '" << url.get() << "'\n"; + cout.flush(); + } + + url.normalize(); + // + // First add it to the document database + // + DocumentRef *ref = docs[url.get()]; + if (!ref) + { + // + // Didn't see this one, yet. Create a new reference + // for it with a unique document ID + // + ref = new DocumentRef; + ref->DocID(docs.NextDocID()); + ref->DocHopCount(currenthopcount); + } + ref->DocURL(url.get()); + + // + // Copy the descriptions of the old DocRef to this one + // + List *d = old_ref->Descriptions(); + if (d) + { + d->Start_Get(); + String *str; + while ((str = (String *) d->Get_Next())) + { + ref->AddDescription(str->get(), words); + } + } + if (ref->DocHopCount() > old_ref->DocHopCount()) + ref->DocHopCount(old_ref->DocHopCount()); + + // Copy the number of backlinks + ref->DocBackLinks(old_ref->DocBackLinks()); + + docs.Add(*ref); + + // + // Now put it in the list of URLs to still visit. + // + if (Need2Get(url.get())) + { + if (debug > 1) + cout << " pushing " << url.get() << endl; + Server *server = (Server *) servers[url.signature()]; + if (!server) + { + // + // Hadn't seen this server, yet. Register it + // + String robotsURL = url.signature(); + robotsURL << "robots.txt"; + StringList *localRobotsFile = GetLocal(robotsURL.get()); + + server = new Server(url, localRobotsFile); + servers.Add(url.signature(), server); + delete localRobotsFile; + } + if (!referer || strlen(referer) == 0) + server->push(url.get(), ref->DocHopCount(), base->get(), IsLocalURL(url.get()), 0); + else + server->push(url.get(), ref->DocHopCount(), referer, IsLocalURL(url.get()), 0); + + String temp = url.get(); + visited.Add(temp, 0); + } + + delete ref; + } +} + + +//***************************************************************************** +// void Retriever::got_head(const char *head) +// +void Retriever::got_head(const char *head) +{ + if (debug > 4) + cout << "head: " << head << endl; + current_head = head; +} + +//***************************************************************************** +// void Retriever::got_meta_dsc(const char *md) +// +void Retriever::got_meta_dsc(const char *md) +{ + if (debug > 4) + cout << "meta description: " << md << endl; + current_meta_dsc = md; +} + + +//***************************************************************************** +// void Retriever::got_meta_email(const char *e) +// +void Retriever::got_meta_email(const char *e) +{ + if (debug > 1) + cout << "\nmeta email: " << e << endl; + current_ref->DocEmail(e); +} + + +//***************************************************************************** +// void Retriever::got_meta_notification(const char *e) +// +void Retriever::got_meta_notification(const char *e) +{ + if (debug > 1) + cout << "\nmeta notification date: " << e << endl; + current_ref->DocNotification(e); +} + + +//***************************************************************************** +// void Retriever::got_meta_subject(const char *e) +// +void Retriever::got_meta_subject(const char *e) +{ + if (debug > 1) + cout << "\nmeta subect: " << e << endl; + current_ref->DocSubject(e); +} + + 
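+//*****************************************************************************
+// (Editor's aside: got_time() earlier in this file accepts Dublin Core
+// dates and parses them with the fixed pattern "%Y-%m-%d"; on failure the
+// document keeps the server-supplied date. A standalone sketch of that
+// conversion using POSIX strptime, interpreting the date in local time;
+// not part of the original sources:)
+#if 0
+#include <ctime>
+#include <time.h>   // strptime (POSIX)
+
+// Returns (time_t)-1 when `s` is not a YYYY-MM-DD date.
+static time_t parse_dc_date(const char *s)
+{
+    struct tm tm = {};
+    if (!strptime(s, "%Y-%m-%d", &tm))
+        return (time_t) -1;
+    return mktime(&tm);
+}
+#endif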
+//***************************************************************************** +// void Retriever::got_noindex() +// +void Retriever::got_noindex() +{ + if (debug > 1) + cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl; + current_ref->DocState(Reference_noindex); +} + + +//***************************************************************************** +// +void Retriever::recordNotFound(const String & url, const String & referer, int reason) +{ + char *message = ""; + + switch (reason) + { + case Transport::Document_not_found: + message = "Not found"; + break; + + case Transport::Document_no_host: + message = "Unknown host or unable to contact server"; + break; + + case Transport::Document_no_port: + message = "Unknown host or unable to contact server (port)"; + break; + + default: + break; + + } + + notFound << message << ": " << url << " Ref: " << referer << '\n'; +} + +//***************************************************************************** +// void Retriever::ReportStatistics(char *name) +// +void Retriever::ReportStatistics(const String & name) +{ + HtConfiguration *config = HtConfiguration::config(); + cout << name << ": Run complete\n"; + cout << name << ": " << servers.Count() << " server"; + if (servers.Count() > 1) + cout << "s"; + cout << " seen:\n"; + + Server *server; + String buffer; + StringList results; + String newname = name; + + newname << ": "; + + servers.Start_Get(); + while ((server = (Server *) servers.Get_NextElement())) + { + buffer = 0; + server->reportStatistics(buffer, newname); + results.Add(buffer); + } + results.Sort(); + + for (int i = 0; i < results.Count(); i++) + { + cout << results[i] << "\n"; + } + + if (notFound.length() > 0) + { + cout << "\n" << name << ": Errors to take note of:\n"; + cout << notFound; + } + + cout << endl; + + // Report HTTP connections stats + cout << "HTTP statistics" << endl; + cout << "===============" << endl; + + if (config->Boolean("persistent_connections")) + { + cout << " Persistent connections : Yes" << endl; + + if (config->Boolean("head_before_get")) + cout << " HEAD call before GET : Yes" << endl; + else + cout << " HEAD call before GET : No" << endl; + } + else + { + cout << " Persistent connections : No" << endl; + } + + HtHTTP::ShowStatistics(cout) << endl; + +} diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h new file mode 100644 index 00000000..b2fff24d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h @@ -0,0 +1,183 @@ +// +// Retriever.h +// +// Retriever: Crawl from a list of URLs and calls appropriate parsers. The +// parser notifies the Retriever object that it got something +// (got_* functions) and the Retriever object feed the databases +// and statistics accordingly. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Retriever.h,v 1.28 2004/05/28 13:15:15 lha Exp $ +// + +#ifndef _Retriever_h_ +#define _Retriever_h_ + +#include "DocumentRef.h" +#include "Dictionary.h" +#include "Queue.h" +#include "HtWordReference.h" +#include "List.h" +#include "StringList.h" +#include "DocumentDB.h" + +#define HTDIG_ERROR_TESTURL_EXCLUDE -109 +#define HTDIG_ERROR_TESTURL_BADQUERY -110 +#define HTDIG_ERROR_TESTURL_EXTENSION -111 +#define HTDIG_ERROR_TESTURL_EXTENSION2 -112 +#define HTDIG_ERROR_TESTURL_LIMITS -113 +#define HTDIG_ERROR_TESTURL_LIMITSNORM -114 +#define HTDIG_ERROR_TESTURL_SRCH_RESTRICT -115 +#define HTDIG_ERROR_TESTURL_SRCH_EXCLUDE -116 +#define HTDIG_ERROR_TESTURL_REWRITE_EMPTY -117 +#define HTDIG_ERROR_TESTURL_ROBOT_FORBID -118 + + +class URL; +class Document; +class URLRef; +class HtWordList; + +enum RetrieverLog { + Retriever_noLog, + Retriever_logUrl, + Retriever_Restart +}; + +struct word_entry : public Object +{ + word_entry (int loc, int fl, HtWordReference& ref) : + location (loc), flags (fl), context (ref) + {}; + int location; + int flags; + HtWordReference context; +}; + +class Retriever +{ +public: + // + // Construction/Destruction + // + Retriever(RetrieverLog flags = Retriever_noLog); + virtual ~Retriever(); + + // + // Getting it all started + // + void Initial(const String& url, int checked = 0); + void Initial(List &list , int checked = 0); + void Start(); + + // + // Report statistics about the parser + // + void ReportStatistics(const String& name); + + // + // These are the callbacks that we need to write code for + // + void got_word(const char *word, int location, int heading); + void got_href(URL &url, const char *description, int hops = 1); + void got_title(const char *title); + void got_author(const char *author); + void got_time(const char *time); + void got_head(const char *head); + void got_meta_dsc(const char *md); + void got_anchor(const char *anchor); + void got_image(const char *src); + void got_meta_email(const char *); + void got_meta_notification(const char *); + void got_meta_subject(const char *); + void got_noindex(); + + // + // Allow for the indexing of protected sites by using a + // username/password + // + void setUsernamePassword(const char *credentials); + + // + // Routines for dealing with local filesystem access + // + StringList * GetLocal(const String &strurl); + StringList * GetLocalUser(const String &url, StringList *defaultdocs); + int IsLocalURL(const String &url); + +private: + // + // A hash to keep track of what we've seen + // + Dictionary visited; + + URL *base; + String current_title; + String current_head; + String current_meta_dsc; + time_t current_time; + int current_id; + DocumentRef *current_ref; + int current_anchor_number; + int trackWords; + int n_links; + String credentials; + HtWordReference word_context; + HtWordList words; + + Dictionary words_to_add; + + int check_unique_md5; + int check_unique_date; + + + RetrieverLog log; + // + // These are weights for the words. The index is the heading level. + // + long int factor[12]; + int currenthopcount; + + // + // Some semi-constants... + // + int max_hop_count; + + // + // The list of server-specific information objects is indexed by + // ip address and port number. 
The list contains Server objects. + // + Dictionary servers; + + // + // For efficiency reasons, we will only use one document object which + // we reuse. + // + Document *doc; + + Database *d_md5; + + String notFound; + + // Some useful constants + int minimumWordLength; + + // + // Helper routines + // + int Need2Get(const String &url); + int IsValidURL(const String &url); + void RetrievedDocument(Document &, const String &url, DocumentRef *ref); + void parse_url(URLRef &urlRef); + void got_redirect(const char *, DocumentRef *, const char * = 0); + void recordNotFound(const String &url, const String &referer, int reason); +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Server.cc b/debian/htdig/htdig-3.2.0b6/htdig/Server.cc new file mode 100644 index 00000000..3afdebd3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Server.cc @@ -0,0 +1,435 @@ +// +// Server.cc +// +// Server: A class to keep track of server specific information. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Server.cc,v 1.29 2004/05/28 13:15:16 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "htdig.h" +#include "Server.h" +#include "good_strtok.h" +#include "htString.h" +#include "URL.h" +#include "Document.h" +#include "URLRef.h" +#include "Transport.h" +#include "HtHTTP.h" // for checking persistent connections +#include "StringList.h" + +#include <ctype.h> +#include "defaults.h" + + +//***************************************************************************** +// Server::Server(URL u, StringList *local_robots_files) +// u is the base URL for this server +// +Server::Server(URL u, StringList *local_robots_files) +: + _host(u.host()), + _port(u.port()), + _bad_server(0), + _documents(0), + _accept_language(0) +{ + HtConfiguration* config= HtConfiguration::config(); + if (debug) + cout << endl << "New server: " << _host << ", " << _port << endl; + + // We take it from the configuration + _persistent_connections = config->Boolean("server", _host.get(),"persistent_connections"); + _head_before_get = config->Boolean("server", _host.get(),"head_before_get"); + + _max_documents = config->Value("server",_host.get(),"server_max_docs"); + _connection_space = config->Value("server",_host.get(),"server_wait_time"); + _user_agent = config->Find("server", _host.get(), "user_agent"); + _disable_cookies = config->Boolean("server", _host.get(), "disable_cookies"); + + // Accept-Language directive + StringList _accept_language_list(config->Find("server", _host.get(), + "accept_language"), " \t"); + + _accept_language.trunc(); // maybe not needed + + for (int i = 0; i < _accept_language_list.Count(); i++) + { + if (i>0) + _accept_language << ","; // for multiple choices + + _accept_language << _accept_language_list[i]; + } + + // Timeout setting + _timeout = config->Value("server",_host.get(),"timeout"); + + // Number of consecutive attempts to establish a TCP connection + _tcp_max_retries = config->Value("server",_host.get(),"tcp_max_retries"); + + // Seconds to wait after a timeout occurs + _tcp_wait_time = config->Value("server",_host.get(),"tcp_wait_time"); + + + if (debug > 1) + { + cout << " - Persistent connections: " << + (_persistent_connections?"enabled":"disabled") << endl; + + cout << " - HEAD 
before GET: " << + (_head_before_get?"enabled":"disabled") << endl; + + cout << " - Timeout: " << _timeout << endl; + cout << " - Connection space: " << _connection_space << endl; + cout << " - Max Documents: " << _max_documents << endl; + cout << " - TCP retries: " << _tcp_max_retries << endl; + cout << " - TCP wait time: " << _tcp_wait_time << endl; + cout << " - Accept-Language: " << _accept_language << endl; + + } + + _last_connection.SettoNow(); // For getting robots.txt + + if (strcmp(u.service(),"http") == 0 || strcmp(u.service(),"https") == 0) + { + // + // Attempt to get a robots.txt file from the specified server + // + String url; + url.trunc(); + + if (debug>1) + cout << "Trying to retrieve robots.txt file" << endl; + url << u.signature() << "robots.txt"; + + static int local_urls_only = config->Boolean("local_urls_only"); + time_t timeZero = 0; // Right now we want to get this every time + Document doc(url, 0); + Transport::DocStatus status; + if (local_robots_files) + { + if (debug > 1) + cout << "Trying local files" << endl; + status = doc.RetrieveLocal(timeZero, local_robots_files); + if (status == Transport::Document_not_local) + { + if (local_urls_only) + status = Transport::Document_not_found; + else + { + if (debug > 1) + cout << "Local retrieval failed, trying HTTP" << endl; + status = doc.Retrieve(this, timeZero); + } + } + } + else if (!local_urls_only) + { + status = doc.Retrieve(this, timeZero); + + // Let's check if persistent connections are both + // allowed by the configuration and possible after + // having requested the robots.txt file. + + HtHTTP * http; + if (IsPersistentConnectionAllowed() && + ( http = doc.GetHTTPHandler())) + { + if (! http->isPersistentConnectionPossible()) + _persistent_connections=0; // not possible. Let's disable + // them on this server. + } + + } + else + status = Transport::Document_not_found; + + switch (status) + { + case Transport::Document_ok: + // + // Found a robots.txt file. Go parse it. + // + robotstxt(doc); + break; + + case Transport::Document_not_found: + case Transport::Document_not_parsable: + case Transport::Document_redirect: + case Transport::Document_not_authorized: + // + // These cases are for when there is no robots.txt file. + // We will just go on happily without restrictions + // + break; + + case Transport::Document_no_host: + default: + // + // In all other cases the server could not be reached. + // We will remember this fact so that no more attempts to + // contact this server will be made. 
+ //
+ _bad_server = 1;
+ break;
+ } // end switch
+ } // end if (http || https)
+}
+
+// Copy constructor
+Server::Server(const Server& rhs)
+:_host(rhs._host),
+_port(rhs._port),
+_bad_server(rhs._bad_server),
+_connection_space(rhs._connection_space),
+_last_connection(rhs._last_connection),
+_paths(rhs._paths),
+_disallow(rhs._disallow),
+_documents(rhs._documents),
+_max_documents(rhs._max_documents),
+_persistent_connections(rhs._persistent_connections),
+_head_before_get(rhs._head_before_get),
+_disable_cookies(rhs._disable_cookies),
+_timeout(rhs._timeout),
+_tcp_wait_time(rhs._tcp_wait_time),
+_tcp_max_retries(rhs._tcp_max_retries),
+_user_agent(rhs._user_agent),
+_accept_language(rhs._accept_language)
+{
+}
+
+
+//*****************************************************************************
+// Server::~Server()
+//
+Server::~Server()
+{
+}
+
+
+//*****************************************************************************
+// void Server::robotstxt(Document &doc)
+// This will parse the robots.txt file which is contained in the document.
+//
+void Server::robotstxt(Document &doc)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ String contents = doc.Contents();
+ int length;
+ int pay_attention = 0;
+ String pattern;
+ String myname = config->Find("server", _host.get(), "robotstxt_name");
+ int seen_myname = 0;
+ char *name, *rest;
+
+ if (debug > 1)
+     cout << "Parsing robots.txt file using myname = " << myname << "\n";
+
+ //
+ // Go through the lines in the file and determine if we need to
+ // pay attention to them
+ //
+ for (char *line = strtok(contents, "\r\n"); line; line = strtok(0, "\r\n"))
+ {
+     if (debug > 2)
+         cout << "Robots.txt line: " << line << endl;
+
+     //
+     // Strip comments
+     //
+     if (strchr(line, '#'))
+     {
+         *(strchr(line, '#')) = '\0';
+     }
+
+     name = good_strtok(line, ':');
+     if (!name)
+         continue;
+     while (name && isspace(*name)) name++;
+     rest = good_strtok(NULL, '\r');
+     if (!rest)
+         rest = "";
+
+     while (rest && isspace(*rest))
+         rest++;
+
+     length = strlen(rest);
+     if (length > 0)
+     {
+         while (length > 0 && isspace(rest[length - 1]))
+             length--;
+         rest[length] = '\0';
+     }
+
+     if (mystrcasecmp(name, "user-agent") == 0)
+     {
+         if (debug > 1)
+             cout << "Found 'user-agent' line: " << rest << endl;
+
+         if (*rest == '*' && !seen_myname)
+         {
+             //
+             // This matches all search engines...
+             //
+             pay_attention = 1;
+         }
+         else if (mystrncasecmp(rest, (char*)myname, myname.length()) == 0)
+         {
+             //
+             // This is for us! This will override any previous patterns
+             // that may have been set.
+             //
+             if (!seen_myname)   // only take first section with our name
+             {
+                 seen_myname = 1;
+                 pay_attention = 1;
+                 pattern = 0;    // ignore previous User-agent: *
+             }
+             else
+                 pay_attention = 0;
+         }
+         else
+         {
+             //
+             // This doesn't concern us
+             //
+             pay_attention = 0;
+         }
+     }
+     else if (pay_attention && mystrcasecmp(name, "disallow") == 0)
+     {
+         if (debug > 1)
+             cout << "Found 'disallow' line: " << rest << endl;
+
+         //
+         // Add this path to our list to ignore
+         //
+         if (*rest)
+         {
+             if (pattern.length())
+                 pattern << '|';
+             while (*rest)
+             {
+                 if (strchr("^.[$()|*+?{\\", *rest))
+                     pattern << '\\';
+                 pattern << *rest++;
+             }
+         }
+     }
+     //
+     // Ignore anything else (comments)
+     //
+ }
+
+ //
+ // Compile the pattern (if any...)
+ //
+ if (debug > 1)
+     cout << "Pattern: " << pattern << endl;
+
+ // Empty "disallow" allows all, so don't make entry which matches all.
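+ // (Editor's aside: the Disallow paths gathered above are folded below
+ // into one regular expression of the form "^[^:]*://[^/]*(a|b|c)", with
+ // regex metacharacters backslash-escaped. Sketch of just that
+ // escaping/joining step, added for illustration:)
+#if 0
+#include <string>
+#include <vector>
+
+static std::string paths_to_regex(const std::vector<std::string>& paths)
+{
+    const std::string meta = "^.[$()|*+?{\\";
+    std::string alt;
+    for (size_t i = 0; i < paths.size(); i++)
+    {
+        if (!alt.empty())
+            alt += '|';                  // alternation between paths
+        for (size_t j = 0; j < paths[i].size(); j++)
+        {
+            if (meta.find(paths[i][j]) != std::string::npos)
+                alt += '\\';             // escape regex metacharacters
+            alt += paths[i][j];
+        }
+    }
+    return "^[^:]*://[^/]*(" + alt + ")";
+}
+#endif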
+ if (!pattern.empty()) + { + String fullpatt = "^[^:]*://[^/]*("; + fullpatt << pattern << ')'; + _disallow.set(fullpatt, config->Boolean("case_sensitive")); + } +} + + +//***************************************************************************** +// void Server::push(String &path, int hopcount, char *referer, int local, int newDoc) +// +void Server::push(const String &path, int hopcount, const String &referer, + int local, int newDoc) +{ + if (_bad_server && !local) + return; + + if (IsDisallowed(path) != 0) + { + if (debug > 2) + cout << endl << " Rejected: forbidden by server robots.txt!"; + + return; + } + + // We use -1 as no limit, but we also don't want + // to forbid redirects from old places + if (_max_documents != -1 && newDoc && + _documents >= _max_documents) + { + if (debug>2) // Hey! we only want to get max_docs + cout << "Limit of " << _max_documents << " reached for " << _host << endl; + + return; + } + + URLRef *ref = new URLRef(); + ref->SetURL(path); + ref->SetHopCount(hopcount); + ref->SetReferer(referer); + _paths.Add(ref); + + if (newDoc) + _documents++; + +// cout << "***** pushing '" << path << "' with '" << referer << "'\n"; +} + + +//***************************************************************************** +// URLRef *Server::pop() +// +URLRef *Server::pop() +{ + URLRef *ref = (URLRef *) _paths.Remove(); + + if (!ref) + return 0; + + return ref; +} + + +//***************************************************************************** +// void Server::delay() +// +// Keeps track of how long it's been since we've seen this server +// and call sleep if necessary +// +void Server::delay() +{ + HtDateTime now; + + int time_taken = HtDateTime::GetDiff(now, _last_connection); // arg1-arg2 > 0 + + if (time_taken < _connection_space) + sleep(_connection_space - time_taken); + + now.SettoNow(); + _last_connection = now; // Reset the clock for the next delay! + + return; +} + + +//***************************************************************************** +// void Server::reportStatistics(String &out, char *name) +// +void Server::reportStatistics(String &out, char *name) +{ + out << name << " " << _host << ":" << _port; + out << " " << _documents << " document"; + if (_documents != 1) + out << "s"; +} diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Server.h b/debian/htdig/htdig-3.2.0b6/htdig/Server.h new file mode 100644 index 00000000..ca6a4f04 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Server.h @@ -0,0 +1,142 @@ +// +// Server.h +// +// Server: A class to keep track of server specific information. 
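+// (Editor's aside: the matching Server.cc above paces requests in
+// Server::delay() by sleeping away whatever remains of the configured
+// inter-connection gap since the last request. Equivalent standalone
+// sketch; the 10-second gap is an assumed placeholder, not htdig's
+// default:)
+#if 0
+#include <ctime>
+#include <unistd.h>   // sleep (POSIX)
+
+static time_t last_connection = 0;
+static int connection_space = 10; // seconds between requests (assumed)
+
+static void polite_delay()
+{
+    time_t elapsed = time(0) - last_connection;
+    if (elapsed < connection_space)
+        sleep(connection_space - elapsed); // wait out the remainder
+    last_connection = time(0);             // restart the clock
+}
+#endif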
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Server.h,v 1.13 2004/05/28 13:15:16 lha Exp $ +// + +#ifndef _Server_h_ +#define _Server_h_ + +#include "Object.h" +#include "htString.h" +#include "StringList.h" +#include "Stack.h" +#include "HtHeap.h" +#include "HtRegex.h" +#include "StringMatch.h" +#include "URLRef.h" +#include "HtDateTime.h" + + +class Document; + +class Server : public Object +{ +public: + // + // Construction/Destruction + // + Server(URL u, StringList *local_robots_files = NULL); + Server(const Server& rhs); + ~Server(); + + // + // This needs to be called with a document containing the + // robots.txt file for this server + // + void robotstxt(Document &doc); + + // + // Provide some way of getting at the host and port for this server + // + int port() const {return _port;} + const String &host() const {return _host;} + + // + // Provide some way of getting at the status of this server + // + int IsDead() {return _bad_server;} + void IsDead(int flag) {_bad_server = flag;} + + // + // Add a path to the queue for this server. + // This will check to see if the server is up if the URL is not local + // if it's down, it simply will not be added + // + void push(const String &path, int hopcount, const String &referer, + int local = 0, int newDoc = 1); + + // + // Return the next URL from the queue for this server. + // + URLRef *pop(); + + // + // Delays the server if necessary. If the time between requests + // is long enough, the request can occur immediately. + // + void delay(); + + // + // Produce statistics for this server. + // + void reportStatistics(String &out, char *name); + + // + // Methods for managing persistent connections + // + void AllowPersistentConnection() { _persistent_connections = true; } + void AvoidPersistentConnection() { _persistent_connections = false; } + bool IsPersistentConnectionAllowed () const + { return _persistent_connections; } + + // Methods for getting info regarding server configuration + bool HeadBeforeGet() const { return _head_before_get; } + unsigned int TimeOut() const { return _timeout; } + unsigned int TcpWaitTime() const { return _tcp_wait_time; } + unsigned int TcpMaxRetries() const { return _tcp_max_retries; } + unsigned int MaxDocuments() const { return _max_documents; } + const String &UserAgent() const { return _user_agent; } + const String &AcceptLanguage() const { return _accept_language; } + bool DisableCookies() const { return _disable_cookies; } + + // + // Return the URLs to be excluded from this server + // (for inclusion in the exclude_urls attribute) + // + int IsDisallowed(String url) { return _disallow.match(url, 0, 0); } + +private: + String _host; + int _port; + int _bad_server; // TRUE if we shouldn't use this one + int _connection_space; // Seconds between connections + HtDateTime _last_connection; // Time of last connection to this server + HtHeap _paths; + HtRegex _disallow; // This pattern will be used to test paths + int _documents; // Number of documents visited + + int _max_documents; // Maximum number of documents from this server + + bool _persistent_connections; // Are pcs allowed + + bool _head_before_get; // HEAD call before a GET? + + bool _disable_cookies; // Should we send cookies? 
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/URLRef.cc b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.cc
new file mode 100644
index 00000000..6cc8bc43
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.cc
@@ -0,0 +1,47 @@
+//
+// URLRef.cc
+//
+// URLRef: A definition of a URL/Referer pair with associated hopcount
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: URLRef.cc,v 1.9 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "URLRef.h"
+
+
+//*****************************************************************************
+// URLRef::URLRef()
+//
+URLRef::URLRef()
+{
+  hopcount = 0;
+}
+
+
+//*****************************************************************************
+// URLRef::~URLRef()
+//
+URLRef::~URLRef()
+{
+}
+
+
+//*****************************************************************************
+//
+int URLRef::compare(const URLRef& to) const
+{
+  return hopcount - to.hopcount;
+}
+
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/URLRef.h b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.h
new file mode 100644
index 00000000..dfc251ec
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/URLRef.h
@@ -0,0 +1,50 @@
+//
+// URLRef.h
+//
+// URLRef: A definition of a URL/Referer pair with associated hopcount
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: URLRef.h,v 1.9 2004/05/28 13:15:16 lha Exp $
+//
+//
+#ifndef _URLRef_h_
+#define _URLRef_h_
+
+#include "Object.h"
+#include "htString.h"
+#include "URL.h"
+
+class URLRef : public Object
+{
+public:
+  //
+  // Construction/Destruction
+  //
+  URLRef();
+  ~URLRef();
+
+  const URL &GetURL() const {return url;}
+  int GetHopCount() const {return hopcount;}
+  const URL &GetReferer() const {return referer;}
+
+  void SetURL(const URL &u) {url = u;}
+  void SetHopCount(int h) {hopcount = h;}
+  void SetReferer(const URL &ref) {referer = ref;}
+
+  int compare(const Object& to) const { return compare((const URLRef&) to); }
+  int compare(const URLRef& to) const;
+
+private:
+  URL url;
+  URL referer;
+  int hopcount;
+};
+
+#endif
+
+
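URLRef::compare() orders references by hopcount alone, and Server keeps its pending URLRefs in an HtHeap, so Server::pop() hands back the shallowest queued URL first (assuming the heap pops the smallest compare() value, which is what a min-heap of URLRefs gives). The same frontier behaviour sketched with std::priority_queue (Ref and DeeperLast are illustrative names, not htdig's):

    // Sketch: a hopcount-ordered crawl frontier, mirroring
    // HtHeap + URLRef::compare().
    #include <iostream>
    #include <queue>
    #include <string>
    #include <vector>

    struct Ref
    {
        std::string url;
        std::string referer;
        int hopcount;
    };

    // Mirror URLRef::compare(): order by hopcount only.  Reversed here
    // because std::priority_queue pops the *largest* element by default.
    struct DeeperLast
    {
        bool operator()(const Ref &a, const Ref &b) const
        {
            return a.hopcount > b.hopcount;
        }
    };

    int main()
    {
        std::priority_queue<Ref, std::vector<Ref>, DeeperLast> frontier;
        frontier.push({"http://example.com/a/b", "http://example.com/a", 2});
        frontier.push({"http://example.com/",    "",                    0});
        frontier.push({"http://example.com/a",   "http://example.com/", 1});

        // Pops in hopcount order 0, 1, 2: shallow pages are fetched first.
        while (!frontier.empty())
        {
            std::cout << frontier.top().hopcount << " "
                      << frontier.top().url << "\n";
            frontier.pop();
        }
    }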
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc b/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc
new file mode 100644
index 00000000..ba1d842a
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc
@@ -0,0 +1,536 @@
+//
+// htdig.cc
+//
+// htdig: Indexes the web sites specified in the config file
+// generating several databases to be used by htmerge
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: htdig.cc,v 1.42 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "Document.h"
+#include "Retriever.h"
+#include "StringList.h"
+#include "htdig.h"
+#include "defaults.h"
+#include "HtURLCodec.h"
+#include "WordContext.h"
+#include "HtDateTime.h"
+#include "HtURLRewriter.h"
+
+////////////////////////////
+// For cookie jar
+////////////////////////////
+#include "HtCookieJar.h"
+#include "HtCookieMemJar.h"
+#include "HtCookieInFileJar.h"
+#include "HtHTTP.h"
+////////////////////////////
+
+// If we have this, we probably want it.
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#elif HAVE_GETOPT_LOCAL
+#include <getopt_local.h>
+#endif
+
+#ifdef HAVE_STD
+#include <iostream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <iostream.h>
+#endif /* HAVE_STD */
+
+//
+// Global variables
+//
+int debug = 0;
+int report_statistics = 0;
+DocumentDB docs;
+HtRegexList limits;
+HtRegexList limitsn;
+FILE *urls_seen = NULL;
+FILE *images_seen = NULL;
+String configFile = DEFAULT_CONFIG_FILE;
+String minimalFile = 0;
+HtDateTime StartTime;
+HtDateTime EndTime;
+
+void usage();
+void reportError(char *msg);
+
+
+//
+// Start of the program.
+//
+int main(int ac, char **av)
+{
+  int c;
+  extern char *optarg;
+  String credentials;
+  int initial = 0;
+  int alt_work_area = 0;
+  int create_text_database = 0;
+  char *max_hops = 0;
+
+  // Cookie jar dynamic creation.
+  HtCookieJar* _cookie_jar = new HtCookieMemJar();  // new cookie jar
+  if (_cookie_jar)
+    HtHTTP::SetCookieJar(_cookie_jar);
+
+//extern int yydebug;
+//yydebug=1;
+
+  //
+  // Parse command line arguments
+  //
+  while ((c = getopt(ac, av, "lsm:c:vith:u:a")) != -1)
+  {
+    unsigned int pos;
+    switch (c)
+    {
+      case 'c':
+        configFile = optarg;
+        break;
+      case 'v':
+        debug++;
+        break;
+      case 'i':
+        initial++;
+        break;
+      case 't':
+        create_text_database++;
+        break;
+      case 'h':
+        max_hops = optarg;
+        break;
+      case 's':
+        report_statistics++;
+        break;
+      case 'u':
+        credentials = optarg;
+        // Overwrite the password in argv so it doesn't linger in
+        // the process list (e.g. ps output).
+        for (pos = 0; pos < strlen(optarg); pos++)
+          optarg[pos] = '*';
+        break;
+      case 'a':
+        alt_work_area++;
+        break;
+      case 'm':
+        minimalFile = optarg;
+        max_hops = "0";
+        break;
+      case '?':
+        usage();
+      default:
+        break;
+    }
+  }
+
+  // Shows Start Time
+  if (debug > 0)
+    cout << "ht://dig Start Time: " << StartTime.GetAscTime() << endl;
+
+  //
+  // First set all the defaults and then read the specified config
+  // file to override the defaults.
+  //
+  HtConfiguration* const config = HtConfiguration::config();
+  config->Defaults(&defaults[0]);
+  if (access((char*)configFile, R_OK) < 0)
+  {
+    reportError(form("Unable to find configuration file '%s'",
+                     configFile.get()));
+  }
+  config->Read(configFile);
+
+  // Warn user if any obsolete options are found in the config file.
+  // For efficiency, check all fields here.  If different config
+  // files are used for searching, obsolete options may remain.
+  char *deprecatedOptions [] = {
+    "heading_factor_1", "heading_factor_2", "heading_factor_3",
+    "heading_factor_4", "heading_factor_5", "heading_factor_6",
+    "modification_time_is_now", "pdf_parser", "translate_amp",
+    "translate_lt_gt", "translate_quot", "uncoded_db_compatible",
+    ""    // empty terminator
+  };
+  char **option;
+  for (option = deprecatedOptions; **option; option++)
+  {
+    if (!config->Find(*option).empty())
+      cout << "Warning: Configuration option " << *option <<
+              " is no longer supported\n";
+  }
+
+  if (config->Find("locale").empty() && debug > 0)
+    cout << "Warning: unknown locale!\n";
+
+  if (max_hops)
+  {
+    config->Add("max_hop_count", max_hops);
+  }
+
+  // Set up credentials for this run
+  if (credentials.length())
+    config->Add("authorization", credentials);
+
+  //
+  // Check url_part_aliases and common_url_parts for
+  // errors.
+  String url_part_errors = HtURLCodec::instance()->ErrMsg();
+
+  if (url_part_errors.length() != 0)
+    reportError(form("Invalid url_part_aliases or common_url_parts: %s",
+                     url_part_errors.get()));
+
+  //
+  // Check url_rewrite_rules for errors.
+  String url_rewrite_rules = HtURLRewriter::instance()->ErrMsg();
+
+  if (url_rewrite_rules.length() != 0)
+    reportError(form("Invalid url_rewrite_rules: %s",
+                     url_rewrite_rules.get()));
+
+  //
+  // If indicated, change the database file names to have the .work
+  // extension
+  //
+  if (alt_work_area != 0)
+  {
+    String configValue = config->Find("doc_db");
+
+    if (configValue.length() != 0)
+    {
+      configValue << ".work";
+      config->Add("doc_db", configValue);
+    }
+
+    configValue = config->Find("word_db");
+    if (configValue.length() != 0)
+    {
+      configValue << ".work";
+      config->Add("word_db", configValue);
+    }
+
+    configValue = config->Find("doc_index");
+    if (configValue.length() != 0)
+    {
+      configValue << ".work";
+      config->Add("doc_index", configValue);
+    }
+
+    configValue = config->Find("doc_excerpt");
+    if (configValue.length() != 0)
+    {
+      configValue << ".work";
+      config->Add("doc_excerpt", configValue);
+    }
+
+    configValue = config->Find("md5_db");
+    if (configValue.length() != 0)
+    {
+      configValue << ".work";
+      config->Add("md5_db", configValue);
+    }
+  }
+
+  // Import the cookies file
+  const String CookiesInputFile = config->Find("cookies_input_file");
+  if (CookiesInputFile.length())
+  {
+    if (debug > 0)
+      cout << "Importing Cookies input file "
+           << CookiesInputFile << endl;
+    int result;
+    HtCookieJar::SetDebugLevel(debug);  // Set the debug level
+    HtCookieInFileJar* cookie_file = new HtCookieInFileJar(CookiesInputFile, result);
+    if (cookie_file)
+    {
+      if (!result)
+      {
+        if (debug > 0)
+          cookie_file->ShowSummary();
+        delete _cookie_jar;                        // Delete previous cookie jar
+        _cookie_jar = (HtCookieJar*) cookie_file;  // set the imported one
+        HtHTTP::SetCookieJar(_cookie_jar);         // and set the new HTTP jar
+      }
+      else if (debug > 0)
+        cout << "Warning: Import failed! (" << CookiesInputFile << ")" << endl;
+    }
+    else
+      reportError(form("Unable to load cookies file '%s' in memory",
+                       CookiesInputFile.get()));
+  }
+
+  //
+  // If needed, we will create a list of every URL we come across.
+  //
+  if (config->Boolean("create_url_list"))
+  {
+    const String filename = config->Find("url_list");
+    urls_seen = fopen(filename, initial ? "w" : "a");
+    if (urls_seen == 0)
+    {
+      reportError(form("Unable to create URL file '%s'",
+                       filename.get()));
+    }
+  }
+
+  //
+  // If needed, we will create a list of every image we come across.
+  //
+  if (config->Boolean("create_image_list"))
+  {
+    const String filename = config->Find("image_list");
+    images_seen = fopen(filename, initial ? "w" : "a");
+    if (images_seen == 0)
+    {
+      reportError(form("Unable to create images file '%s'",
+                       filename.get()));
+    }
+  }
+
+  //
+  // Set up the limits list
+  //
+  StringList l(config->Find("limit_urls_to"), " \t");
+  limits.setEscaped(l, config->Boolean("case_sensitive"));
+  l.Destroy();
+
+  l.Create(config->Find("limit_normalized"), " \t");
+  limitsn.setEscaped(l, config->Boolean("case_sensitive"));
+  l.Destroy();
+
+  //
+  // Open the document database
+  //
+  const String filename = config->Find("doc_db");
+  if (initial)
+    unlink(filename);
+
+  const String index_filename = config->Find("doc_index");
+  if (initial)
+    unlink(index_filename);
+
+  const String head_filename = config->Find("doc_excerpt");
+  if (initial)
+    unlink(head_filename);
+
+  if (docs.Open(filename, index_filename, head_filename) < 0)
+  {
+    reportError(form("Unable to open/create document database '%s'",
+                     filename.get()));
+  }
+
+  const String word_filename = config->Find("word_db");
+  if (initial)
+  {
+    unlink(word_filename);
+    unlink((word_filename + "_weakcmpr").get());
+
+    // Remove "duplicate detection" database
+    unlink(config->Find("md5_db"));
+
+    // using -i, also ignore seen-but-not-processed URLs from last pass
+    unlink(config->Find("url_log"));
+  }
+
+  // Initialize htword
+  WordContext::Initialize(*config);
+
+  //
+  // Create the Retriever object which we will use to parse all the
+  // HTML files.
+  // In case this is just an update dig, we will add all existing URLs.
+  //
+  Retriever retriever(Retriever_logUrl);
+  if (minimalFile.length() == 0)
+  {
+    List *list = docs.URLs();
+    retriever.Initial(*list);
+    delete list;
+
+    // Add start_url to the initial list of the retriever.
+    // Don't check a URL twice!
+    // Beware: order is important.  If this bugs you, you could change
+    // the previous line retriever.Initial(*list, 0) to Initial(*list, 1).
+    retriever.Initial(config->Find("start_url"), 1);
+  }
+
+  // Handle list of URLs given in a file (stdin, if "-") specified as
+  // argument to -m or as an optional trailing argument.
+  if (optind < ac)
+  {
+    if (debug)
+      if (minimalFile.length() != 0)
+        cout << "Warning: argument " << av[optind]
+             << " overrides -m " << minimalFile << endl;
+    minimalFile = av[optind];
+  }
+  if (strcmp(minimalFile.get(), "-") == 0)
+  {
+    String str;
+    // Why not combine this with the code below, with input = stdin ?
+    while (!cin.eof())
+    {
+      cin >> str;
+      str.chop("\r\n");  // (Why "\r\n" here and "\r\n\t " below?)
+      if (str.length() > 0)
+        retriever.Initial(str, 1);
+    }
+  }
+  else if (minimalFile.length() != 0)
+  {
+    FILE *input = fopen(minimalFile.get(), "r");
+    char buffer[1000];
+
+    if (input)
+    {
+      while (fgets(buffer, sizeof(buffer), input))
+      {
+        String str(buffer);
+        str.chop("\r\n\t ");
+        if (str.length() > 0)
+          retriever.Initial(str, 1);
+      }
+      fclose(input);
+    }
+    else
+    {
+      cerr << "Could not open argument '" << minimalFile
+           << "' of flag -m\n";
+      exit (1);
+    }
+  }
+
+  //
+  // Go do it!
+  //
+  retriever.Start();
+
+  //
+  // All done with parsing.
+  //
+
+  //
+  // If the user so wants, create a text version of the document database.
+  //
+  if (create_text_database)
+  {
+    const String doc_list = config->Find("doc_list");
+    if (initial)
+      unlink(doc_list);
+    docs.DumpDB(doc_list);
+    const String word_dump = config->Find("word_dump");
+    if (initial)
+      unlink(word_dump);
+    HtWordList words(*config);
+    if (words.Open(config->Find("word_db"), O_RDONLY) == OK)
+    {
+      words.Dump(word_dump);
+    }
+  }
+
+  //
+  // Cleanup
+  //
+  if (urls_seen)
+    fclose(urls_seen);
+  if (images_seen)
+    fclose(images_seen);
+
+  //
+  // If needed, report some statistics
+  //
+  if (report_statistics)
+  {
+    retriever.ReportStatistics("htdig");
+  }
+
+  // Shows End Time
+  if (debug > 0)
+  {
+    EndTime.SettoNow();
+    cout << "ht://dig End Time: " << EndTime.GetAscTime() << endl;
+  }
+
+  if (_cookie_jar)
+    delete _cookie_jar;
+}
+
+
+//
+// Display usage information for the htdig program
+//
+void usage()
+{
+  cout << "usage: htdig [-v][-i][-c configfile][-t][-h hopcount][-s]\n";
+  cout << "             [-u username:password][-a][-m minimalfile]\n";
+  cout << "This program is part of ht://Dig " << VERSION << "\n\n";
+  cout << "Options:\n";
+
+  cout << "\t-v\tVerbose mode.  This increases the verbosity of the\n";
+  cout << "\t\tprogram.  Using more than 2 is probably only useful\n";
+  cout << "\t\tfor debugging purposes.  The default verbose mode\n";
+  cout << "\t\tgives a nice progress report while digging.\n\n";
+
+  cout << "\t-i\tInitial.  Do not use any old databases.  This is\n";
+  cout << "\t\taccomplished by first erasing the databases.\n\n";
+
+  cout << "\t-c configfile\n";
+  cout << "\t\tUse the specified configuration file instead of the\n";
+  cout << "\t\tdefault.\n\n";
+
+  cout << "\t-t\tCreate an ASCII version of the document database.\n";
+  cout << "\t\tThis database is easy to parse with other programs so\n";
+  cout << "\t\tthat information can be extracted from it.\n\n";
+
+  cout << "\t-h hopcount\n";
+  cout << "\t\tLimit the stored documents to those which are at\n";
+  cout << "\t\tmost hopcount links away from the start URL.\n\n";
+
+  cout << "\t-s\tReport statistics after completion.\n\n";
+
+  cout << "\t-u username:password\n";
+  cout << "\t\tTells htdig to send the supplied username and\n";
+  cout << "\t\tpassword with each HTTP request.  The credentials\n";
+  cout << "\t\twill be encoded using the 'Basic' authentication scheme.\n";
+  cout << "\t\tThere *HAS* to be a colon (:) between the username\n";
+  cout << "\t\tand password.\n\n";
+
+  cout << "\t-a\tUse alternate work files.\n";
+  cout << "\t\tTells htdig to append .work to database files, causing\n";
+  cout << "\t\ta second copy of the database to be built.  This allows\n";
+  cout << "\t\tthe original files to be used by htsearch during the\n";
+  cout << "\t\tindexing run.\n\n";
+
+  cout << "\t-m minimalfile (or just a file name at end of arguments)\n";
+  cout << "\t\tTells htdig to read URLs from the supplied file and index\n";
+  cout << "\t\tthem in place of (or in addition to) the existing URLs in\n";
+  cout << "\t\tthe database and the start_url.  With -m, only the\n";
+  cout << "\t\tURLs specified are added to the database.  A file name of\n";
+  cout << "\t\t'-' indicates the standard input.\n\n";
+
+  exit(0);
+}
+
+
+//
+// Report an error and die
+//
+void reportError(char *msg)
+{
+  cout << "htdig: " << msg << "\n\n";
+  exit(1);
+}
+
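The -m handling in main() above reads one URL per line, from a named file or from stdin when the argument is "-", and chops trailing "\r\n\t " before queueing each entry. The same loop in plain iostreams (feedUrl is a hypothetical stand-in for retriever.Initial(str, 1)):

    // Sketch of the -m URL-list loop: read URLs line by line from a
    // file or stdin, strip trailing whitespace, skip empty lines.
    #include <fstream>
    #include <iostream>
    #include <string>

    void feedUrl(const std::string &url)        // stand-in for the retriever
    {
        std::cout << "queued: " << url << "\n";
    }

    void readUrlList(std::istream &in)
    {
        std::string line;
        while (std::getline(in, line))
        {
            // Equivalent of str.chop("\r\n\t "): drop trailing whitespace.
            line.erase(line.find_last_not_of("\r\n\t ") + 1);
            if (!line.empty())
                feedUrl(line);
        }
    }

    int main(int argc, char **argv)
    {
        if (argc > 1 && std::string(argv[1]) != "-")
        {
            std::ifstream file(argv[1]);
            if (!file)
            {
                std::cerr << "Could not open " << argv[1] << "\n";
                return 1;
            }
            readUrlList(file);
        }
        else
            readUrlList(std::cin);              // "-" (or nothing) means stdin
    }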
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/htdig.h b/debian/htdig/htdig-3.2.0b6/htdig/htdig.h
new file mode 100644
index 00000000..5eb5b9bb
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/htdig.h
@@ -0,0 +1,55 @@
+//
+// htdig.h
+//
+// htdig: Indexes the web sites specified in the config file
+// generating several databases to be used by htmerge
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: htdig.h,v 1.16 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifndef _htdig_h_
+#define _htdig_h_
+
+#include "HtConfiguration.h"
+#include "List.h"
+#include "DocumentDB.h"
+#include "StringMatch.h"
+#include "htconfig.h"
+#include "HtRegexList.h"
+#include <stdlib.h>
+
+#ifndef _MSC_VER /* _WIN32 */
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+#include <stdio.h>
+
+extern int debug;
+extern DocumentDB docs;
+extern HtRegexList limits;
+extern HtRegexList limitsn;
+extern HtRegexList excludes;
+extern HtRegexList badquerystr;
+extern FILE *urls_seen;
+extern FILE *images_seen;
+
+extern void reportError(char *msg);
+
+#endif
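One detail of htdig.cc worth isolating is the -u case in its getopt loop: after taking a private copy of "username:password", it overwrites optarg with '*' characters so the credentials stop being visible in the process list (argv is what ps and /proc/&lt;pid&gt;/cmdline display on most Unix systems). The idiom on its own, without getopt (illustrative only, not htdig code):

    // Sketch: scrub a password passed via argv, as htdig's -u handling
    // does with optarg[pos] = '*'.
    #include <cstring>
    #include <iostream>
    #include <string>

    int main(int argc, char **argv)
    {
        std::string credentials;
        for (int i = 1; i + 1 < argc; i++)
        {
            if (std::strcmp(argv[i], "-u") == 0)
            {
                credentials = argv[i + 1];       // keep a private copy...
                std::memset(argv[i + 1], '*',    // ...then mask the original
                            std::strlen(argv[i + 1]));
            }
        }
        // Only the private copy still holds the real value here.
        std::cout << "got " << credentials.size() << " credential bytes\n";
    }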