diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig/Document.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htdig/Document.cc | 784 |
1 files changed, 784 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Document.cc b/debian/htdig/htdig-3.2.0b6/htdig/Document.cc new file mode 100644 index 00000000..87272686 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Document.cc @@ -0,0 +1,784 @@ +// +// Document.cc +// +// Document: This class holds everything there is to know about a document. +// The actual contents of the document may or may not be present at +// all times for memory conservation reasons. +// The document can be told to retrieve its contents. This is done +// with the Retrieve call. In case the retrieval causes a +// redirect, the link is followed, but this process is done +// only once (to prevent loops.) If the redirect didn't +// work, Document_not_found is returned. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Document.cc,v 1.71 2004/05/28 13:15:14 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <signal.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <ctype.h> + +#include "Document.h" +#include "StringList.h" +#include "htdig.h" +#include "HTML.h" +#include "Plaintext.h" +#include "ExternalParser.h" +#include "lib.h" + +#include "Transport.h" +#include "HtHTTP.h" + +#ifdef HAVE_SSL_H +#include "HtHTTPSecure.h" +#endif + +#include "HtHTTPBasic.h" +#include "ExternalTransport.h" + +#include "defaults.h" + +#if 1 +typedef void (*SIGNAL_HANDLER) (...); +#else +typedef SIG_PF SIGNAL_HANDLER; +#endif + +//***************************************************************************** +// Document::Document(char *u) +// Initialize with the given url as the location for this document. +// If the max_size is given, use that for size, otherwise use the +// config value. +// +Document::Document(char *u, int max_size) +{ + url = 0; + proxy = 0; + referer = 0; + contents = 0; + transportConnect = 0; + HTTPConnect = 0; + HTTPSConnect = 0; + FileConnect = 0; + FTPConnect = 0; + NNTPConnect = 0; + externalConnect = 0; + HtConfiguration* config= HtConfiguration::config(); + + // We probably need to move assignment of max_doc_size, according + // to a server or url configuration value. The same is valid for + // max_retries. + + if (max_size > 0) + max_doc_size = max_size; + else + max_doc_size = config->Value("max_doc_size"); + + if (config->Value("max_retries") > 0) + num_retries = config->Value("max_retries"); + else num_retries = 2; + + // Initialize some static variables of Transport + + Transport::SetDebugLevel(debug); + + // Initialize some static variables of Transport + // and the User Agent for every HtHTTP objects + + HtHTTP::SetParsingController(ExternalParser::canParse); + + // Set the default parser content-type string + Transport::SetDefaultParserContentType ("text/"); + + contents.allocate(max_doc_size + 100); + contentType = ""; + contentLength = -1; + if (u) + { + Url(u); + } +} + + +//***************************************************************************** +// Document::~Document() +// +Document::~Document() +{ + // We delete only the derived class objects + if (HTTPConnect) + delete HTTPConnect; + if (HTTPSConnect) + delete HTTPSConnect; + if (FileConnect) + delete FileConnect; + if (FTPConnect) + delete FTPConnect; + if (NNTPConnect) + delete NNTPConnect; + if (externalConnect) + delete externalConnect; + + if (url) + delete url; + if (proxy) + delete proxy; + if (referer) + delete referer; + +#if MEM_DEBUG + char *p = new char; + cout << "==== Document deleted: " << this << " new at " << + ((void *) p) << endl; + delete p; +#endif +} + + +//***************************************************************************** +// void Document::Reset() +// Restore the Document object to an initial state. +// We will not reset the authorization information since it can be reused. +// +void +Document::Reset() +{ + contentType = 0; + contentLength = -1; + if (url) + delete url; + url = 0; + if (referer) + delete referer; + + referer = 0; + + proxy=0; + authorization=0; + proxy_authorization=0; + contents = 0; + document_length = 0; + redirected_to = 0; + +} + + +//***************************************************************************** +// void Document::Url(const String &u) +// Set the URL for this document +// +void +Document::Url(const String &u) +{ + HtConfiguration* config= HtConfiguration::config(); + if (url) + delete url; + url = new URL(u); + + // Re-initialise the proxy + if (proxy) + delete proxy; + proxy = 0; + + // Get the proxy information for this URL + const String proxyURL = config->Find(url,"http_proxy"); + + // If http_proxy is not empty we set the proxy for the current URL + if (proxyURL.length()) + { + proxy = new URL(proxyURL); + proxy->normalize(); + // set the proxy authorization information + setProxyUsernamePassword(config->Find(url,"http_proxy_authorization")); + } + + // Set the authorization information + setUsernamePassword(config->Find(url,"authorization")); + +} + + +//***************************************************************************** +// void Document::Referer(const String &u) +// Set the Referring URL for this document +// +void +Document::Referer(const String &u) +{ + if (referer) + delete referer; + referer = new URL(u); +} + + +//***************************************************************************** +// int Document::UseProxy() +// Returns 1 if the given url is to be retrieved from the proxy server, +// or 0 if it's not. +// +int +Document::UseProxy() +{ + HtConfiguration* config= HtConfiguration::config(); + static HtRegex *excludeProxy = 0; + + // + // Initialize excludeProxy list if this is the first time. + // + if (!excludeProxy) + { + excludeProxy = new HtRegex(); + StringList l(config->Find("http_proxy_exclude"), " \t"); + excludeProxy->setEscaped(l, config->Boolean("case_sensitive")); + l.Release(); + } + + if ((proxy) && (excludeProxy->match(url->get(), 0, 0) == 0)) + return true; // if the exclude pattern is empty, use the proxy + return false; +} + + +//***************************************************************************** +// DocStatus Document::Retrieve(HtDateTime date) +// Attempt to retrieve the document pointed to by our internal URL +// +Transport::DocStatus +Document::Retrieve(Server *server, HtDateTime date) +{ + // Right now we just handle http:// service + // Soon this will include file:// + // as well as an ExternalTransport system + // eventually maybe ftp:// and a few others + + Transport::DocStatus status; + Transport_Response *response = 0; + HtDateTime *ptrdatetime = 0; + int useproxy = UseProxy(); + int NumRetries; + + transportConnect = 0; + + if (ExternalTransport::canHandle(url->service())) + { + if (externalConnect) + { + delete externalConnect; + } + externalConnect = new ExternalTransport(url->service()); + transportConnect = externalConnect; + } +#ifdef HAVE_SSL_H + else if (mystrncasecmp(url->service(), "https", 5) == 0) + { + if (!HTTPSConnect) + { + if (debug>4) + cout << "Creating an HtHTTPSecure object" << endl; + + HTTPSConnect = new HtHTTPSecure(); + + if (!HTTPSConnect) + return Transport::Document_other_error; + } + + if (HTTPSConnect) + { + // Here we must set only thing for a HTTP request + + HTTPSConnect->SetRequestURL(*url); + + // Set the user agent which can vary per server + HTTPSConnect->SetRequestUserAgent(server->UserAgent()); + + // Set the accept language which can vary per server + HTTPSConnect->SetAcceptLanguage(server->AcceptLanguage()); + + // Set the referer + if (referer) + HTTPSConnect->SetRefererURL(*referer); + + // Let's disable the cookies if we decided that in the config file + if (server->DisableCookies()) + HTTPSConnect->DisableCookies(); + else HTTPSConnect->AllowCookies(); + + // We may issue a config paramater to enable/disable them + if (server->IsPersistentConnectionAllowed()) + { + // Persistent connections allowed + HTTPSConnect->AllowPersistentConnection(); + } + else HTTPSConnect->DisablePersistentConnection(); + + // Head before Get option control + if (server->HeadBeforeGet()) + HTTPSConnect->EnableHeadBeforeGet(); + else + HTTPSConnect->DisableHeadBeforeGet(); + + // http->SetRequestMethod(HtHTTP::Method_GET); + if (debug > 2) + { + cout << "Making HTTPS request on " << url->get(); + + if (useproxy) + cout << " via proxy (" << proxy->host() << ":" << proxy->port() << ")"; + + cout << endl; + } + } + + HTTPSConnect->SetProxy(useproxy); + transportConnect = HTTPSConnect; + } +#endif + else if (mystrncasecmp(url->service(), "http", 4) == 0) + { + if (!HTTPConnect) + { + if (debug>4) + cout << "Creating an HtHTTPBasic object" << endl; + + HTTPConnect = new HtHTTPBasic(); + + if (!HTTPConnect) + return Transport::Document_other_error; + } + + if (HTTPConnect) + { + // Here we must set only thing for a HTTP request + + HTTPConnect->SetRequestURL(*url); + + // Set the user agent which can vary per server + HTTPConnect->SetRequestUserAgent(server->UserAgent()); + + // Set the accept language which can vary per server + HTTPConnect->SetAcceptLanguage(server->AcceptLanguage()); + + // Set the referer + if (referer) + HTTPConnect->SetRefererURL(*referer); + + // Let's disable the cookies if we decided that in the config file + if (server->DisableCookies()) + HTTPConnect->DisableCookies(); + else HTTPConnect->AllowCookies(); + + // We may issue a config paramater to enable/disable them + if (server->IsPersistentConnectionAllowed()) + { + // Persistent connections allowed + HTTPConnect->AllowPersistentConnection(); + } + else HTTPConnect->DisablePersistentConnection(); + + // Head before Get option control + if (server->HeadBeforeGet()) + HTTPConnect->EnableHeadBeforeGet(); + else + HTTPConnect->DisableHeadBeforeGet(); + + // http->SetRequestMethod(HtHTTP::Method_GET); + if (debug > 2) + { + cout << "Making HTTP request on " << url->get(); + + if (useproxy) + cout << " via proxy (" << proxy->host() << ":" << proxy->port() << ")"; + + cout << endl; + } + } + + HTTPConnect->SetProxy(useproxy); + transportConnect = HTTPConnect; + } + else if (mystrncasecmp(url->service(), "file", 4) == 0) + { + if (!FileConnect) + { + if (debug>4) + cout << "Creating an HtFile object" << endl; + + FileConnect = new HtFile(); + + if (!FileConnect) + return Transport::Document_other_error; + } + + if (FileConnect) + { + // Here we must set only thing for a file request + + FileConnect->SetRequestURL(*url); + + // Set the referer + if (referer) + FileConnect->SetRefererURL(*referer); + + if (debug > 2) + cout << "Making 'file' request on " << url->get() << endl; + } + + transportConnect = FileConnect; + } + else if (mystrncasecmp(url->service(), "ftp", 3) == 0) + { + // the following FTP handling is modeled very closely on + // the prior 'file'-protocol handling, so beware of bugs + + if (!FTPConnect) + { + if (debug>4) + cout << "Creating an HtFTP object" << endl; + + FTPConnect = new HtFTP(); + + if (!FTPConnect) + return Transport::Document_other_error; + } + if (FTPConnect) + { + // Here we must set only thing for a FTP request + + FTPConnect->SetRequestURL(*url); + //////////////////////////////////////////////////// + /// + /// stuff may be missing here or in need of change + /// + /////////////////////////////////////////////////// + + // Set the referer + if (referer) + FTPConnect->SetRefererURL(*referer); + + if (debug > 2) + cout << "Making 'ftp' request on " << url->get() << endl; + } + + transportConnect = FTPConnect; + } // end of else if (mystrncasecmp(url->service(), "ftp", 3) == 0) + + else if (mystrncasecmp(url->service(), "news", 4) == 0) + { + if (!NNTPConnect) + { + if (debug>4) + cout << "Creating an HtNNTP object" << endl; + + NNTPConnect = new HtNNTP(); + + if (!NNTPConnect) + return Transport::Document_other_error; + } + + if (NNTPConnect) + { + // Here we got an Usenet document request + + NNTPConnect->SetRequestURL(*url); + + if (debug > 2) + cout << "Making 'NNTP' request on " << url->get() << endl; + } + + transportConnect = NNTPConnect; + } + else + { + if (debug) + { + cout << '"' << url->service() << + "\" not a recognized transport service. Ignoring\n"; + } + + return Transport::Document_not_recognized_service; + } + + // Is a transport object pointer available? + + if (transportConnect) + { + // Set all the appropriate parameters + if (useproxy) + { + transportConnect->SetConnection(proxy); + if (proxy_authorization.length()) + transportConnect->SetProxyCredentials(proxy_authorization); + } + else + transportConnect->SetConnection(url); + + // OK. Let's set the connection time out + transportConnect->SetTimeOut(server->TimeOut()); + + // Let's set number of retries for a failed connection attempt + transportConnect->SetRetry(server->TcpMaxRetries()); + + // ... And the wait time after a failure + transportConnect->SetWaitTime(server->TcpWaitTime()); + + // OK. Let's set the maximum size of a document to be retrieved + transportConnect->SetRequestMaxDocumentSize(max_doc_size); + + // Let's set the credentials + transportConnect->SetCredentials(authorization); + + // Let's set the modification time (in order not to retrieve a + // document we already have) + transportConnect->SetRequestModificationTime(date); + + // Make the request + // Here is the main operation ... Let's make the request !!! + // We now perform a loop until we want to retry the request + + NumRetries = 0; + + do + { + status = transportConnect->Request(); + + if (NumRetries++) + if(debug>0) + cout << "."; + + } while (ShouldWeRetry(status) && NumRetries < num_retries); + + + // Let's get out the info we need + response = transportConnect->GetResponse(); + + if (response) + { + // We got the response + + contents = response->GetContents(); + contentType = response->GetContentType(); + contentLength = response->GetContentLength(); + ptrdatetime = response->GetModificationTime(); + document_length = response->GetDocumentLength(); + + // This test is ugly! Can whoever put it here explain why it's + // needed? Why would GetLocation() ever return a non-empty string + // from a Transport subclass that's not supposed to redirect? + if (transportConnect == HTTPConnect || transportConnect == HTTPSConnect || transportConnect == externalConnect) + redirected_to = ((HtHTTP_Response *)response)->GetLocation(); + + if (ptrdatetime) + { + // We got the modification date/time + modtime = *ptrdatetime; + } + + // How to manage it when there's no modification date/time? + + if (debug > 5) + { + cout << "Contents:\n" << contents << endl; + cout << "Content Type: " << contentType << endl; + cout << "Content Length: " << contentLength << endl; + cout << "Modification Time: " << modtime.GetISO8601() << endl; + } + } + + return status; + + } + else + return Transport::Document_not_found; +} + +//***************************************************************************** +// DocStatus Document::RetrieveLocal(HtDateTime date, StringList *filenames) +// Attempt to retrieve the document pointed to by our internal URL +// using a list of potential local filenames given. Returns Document_ok, +// Document_not_changed or Document_not_local (in which case the +// retriever tries it again using the standard retrieve method). +// +Transport::DocStatus +Document::RetrieveLocal(HtDateTime date, StringList *filenames) +{ + HtConfiguration* config= HtConfiguration::config(); + struct stat stat_buf; + String *filename; + + filenames->Start_Get(); + + // Loop through list of potential filenames until the list is exhausted + // or a suitable file is found to exist as a regular file. + while ((filename = (String *)filenames->Get_Next()) && + ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode))) + if (debug > 1) + cout << " tried local file " << *filename << endl; + + if (!filename) + return Transport::Document_not_local; + + if (debug > 1) + cout << " found existing file " << *filename << endl; + + modtime = stat_buf.st_mtime; + if (modtime <= date) + return Transport::Document_not_changed; + + char *ext = strrchr((char*)*filename, '.'); + if (ext == NULL) + return Transport::Document_not_local; + const String *type = HtFile::Ext2Mime (ext + 1); + + static Dictionary *bad_local_ext = 0; + if (!bad_local_ext) + { + // A list of bad extensions, separated by spaces or tabs + bad_local_ext = new Dictionary; + String t = config->Find("bad_local_extensions"); + String lowerp; + char *p = strtok(t, " \t"); + while (p) + { + // Extensions are case insensitive + lowerp = p; + lowerp.lowercase(); + bad_local_ext->Add(lowerp, 0); + p = strtok(0, " \t"); + } + } + if (type == NULL || bad_local_ext->Exists(ext)) + { + if (debug > 1 && type != NULL) + cout << "\nBad local extension: " << *filename << endl; + return Transport::Document_not_local; + } + else + contentType = *type; + + // Open it + FILE *f = fopen((char*)*filename, "r"); + if (f == NULL) + return Transport::Document_not_local; + + // + // Read in the document itself + // + max_doc_size = config->Value(url,"max_doc_size"); + contents = 0; + char docBuffer[8192]; + int bytesRead; + + while ((bytesRead = fread(docBuffer, 1, sizeof(docBuffer), f)) > 0) + { + if (debug > 2) + cout << "Read " << bytesRead << " from document\n"; + if (contents.length() + bytesRead > max_doc_size) + bytesRead = max_doc_size - contents.length(); + contents.append(docBuffer, bytesRead); + if (contents.length() >= max_doc_size) + break; + } + fclose(f); + document_length = contents.length(); + contentLength = stat_buf.st_size; + + if (debug > 2) + cout << "Read a total of " << document_length << " bytes\n"; + + if (document_length < contentLength) + document_length = contentLength; + return Transport::Document_ok; +} + + +//***************************************************************************** +// Parsable *Document::getParsable() +// Given the content-type of a document, returns a document parser. +// This will first look through the list of user supplied parsers and +// then at our (limited) builtin list of parsers. The user supplied +// parsers are external programs that will be used. +// +Parsable * +Document::getParsable() +{ + static HTML *html = 0; + static Plaintext *plaintext = 0; + static ExternalParser *externalParser = 0; + + Parsable *parsable = 0; + + if (ExternalParser::canParse(contentType)) + { + if (externalParser) + { + delete externalParser; + } + externalParser = new ExternalParser(contentType); + parsable = externalParser; + } + else if (mystrncasecmp((char*)contentType, "text/html", 9) == 0) + { + if (!html) + html = new HTML(); + parsable = html; + } + else if (mystrncasecmp((char*)contentType, "text/plain", 10) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + } + else if (mystrncasecmp((char *)contentType, "text/css", 8) == 0) + { + return NULL; + } + else if (mystrncasecmp((char *)contentType, "text/", 5) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + if (debug > 1) + { + cout << '"' << contentType << + "\" not a recognized type. Assuming text/plain\n"; + } + } + else + { + if (debug > 1) + { + cout << '"' << contentType << + "\" not a recognized type. Ignoring\n"; + } + return NULL; + } + + parsable->setContents(contents.get(), contents.length()); + return parsable; +} + + +int Document::ShouldWeRetry(Transport::DocStatus DocumentStatus) +{ + + if (DocumentStatus == Transport::Document_connection_down) + return 1; + + if (DocumentStatus == Transport::Document_no_connection) + return 1; + + if (DocumentStatus == Transport::Document_no_header) + return 1; + + return 0; +} |