summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig/htdig.cc')
-rw-r--r--debian/htdig/htdig-3.2.0b6/htdig/htdig.cc536
1 files changed, 536 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc b/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc
new file mode 100644
index 00000000..ba1d842a
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/htdig.cc
@@ -0,0 +1,536 @@
+//
+// htdig.cc
+//
+// htdig: Indexes the web sites specified in the config file
+// generating several databases to be used by htmerge
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: htdig.cc,v 1.42 2004/05/28 13:15:16 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "Document.h"
+#include "Retriever.h"
+#include "StringList.h"
+#include "htdig.h"
+#include "defaults.h"
+#include "HtURLCodec.h"
+#include "WordContext.h"
+#include "HtDateTime.h"
+#include "HtURLRewriter.h"
+
+////////////////////////////
+// For cookie jar
+////////////////////////////
+#include "HtCookieJar.h"
+#include "HtCookieMemJar.h"
+#include "HtCookieInFileJar.h"
+#include "HtHTTP.h"
+////////////////////////////
+
+// If we have this, we probably want it.
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#elif HAVE_GETOPT_LOCAL
+#include <getopt_local.h>
+#endif
+
+#ifdef HAVE_STD
+#include <iostream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <iostream.h>
+#endif /* HAVE_STD */
+
+//
+// Global variables
+//
+int debug = 0;
+int report_statistics = 0;
+DocumentDB docs;
+HtRegexList limits;
+HtRegexList limitsn;
+FILE *urls_seen = NULL;
+FILE *images_seen = NULL;
+String configFile = DEFAULT_CONFIG_FILE;
+String minimalFile = 0;
+HtDateTime StartTime;
+HtDateTime EndTime;
+
+void usage();
+void reportError(char *msg);
+
+
+//
+// Start of the program.
+//
+int main(int ac, char **av)
+{
+ int c;
+ extern char *optarg;
+ String credentials;
+ int initial = 0;
+ int alt_work_area = 0;
+ int create_text_database = 0;
+ char *max_hops = 0;
+
+ // Cookie jar dynamic creation.
+ HtCookieJar* _cookie_jar = new HtCookieMemJar(); // new cookie jar
+ if (_cookie_jar)
+ HtHTTP::SetCookieJar(_cookie_jar);
+
+//extern int yydebug;
+//yydebug=1;
+
+ //
+ // Parse command line arguments
+ //
+ while ((c = getopt(ac, av, "lsm:c:vith:u:a")) != -1)
+ {
+ unsigned int pos;
+ switch (c)
+ {
+ case 'c':
+ configFile = optarg;
+ break;
+ case 'v':
+ debug++;
+ break;
+ case 'i':
+ initial++;
+ break;
+ case 't':
+ create_text_database++;
+ break;
+ case 'h':
+ max_hops = optarg;
+ break;
+ case 's':
+ report_statistics++;
+ break;
+ case 'u':
+ credentials = optarg;
+ for (pos = 0; pos < strlen(optarg); pos++)
+ optarg[pos] = '*';
+ break;
+ case 'a':
+ alt_work_area++;
+ break;
+ case 'm':
+ minimalFile = optarg;
+ max_hops = "0";
+ break;
+ case '?':
+ usage();
+ default:
+ break;
+ }
+ }
+
+ // Shows Start Time
+ if (debug>0)
+ cout << "ht://dig Start Time: " << StartTime.GetAscTime() << endl;
+
+ //
+ // First set all the defaults and then read the specified config
+ // file to override the defaults.
+ //
+ HtConfiguration* const config= HtConfiguration::config();
+ config->Defaults(&defaults[0]);
+ if (access((char*)configFile, R_OK) < 0)
+ {
+ reportError(form("Unable to find configuration file '%s'",
+ configFile.get()));
+ }
+ config->Read(configFile);
+
+ // Warn user if any obsolete options are found in config file
+ // For efficiency, check all fields here. If different config
+ // files are used for searching, obsolete options may remain
+ char *deprecatedOptions [] = {
+ "heading_factor_1", "heading_factor_2", "heading_factor_3",
+ "heading_factor_4", "heading_factor_5", "heading_factor_6",
+ "modification_time_is_now", "pdf_parser", "translate_amp",
+ "translate_lt_gt", "translate_quot", "uncoded_db_compatible",
+ "" // empty terminator
+ };
+ char **option;
+ for (option = deprecatedOptions; **option; option++)
+ {
+ if (!config->Find(*option).empty())
+ cout << "Warning: Configuration option " << *option <<
+ " is no longer supported\n";
+ }
+
+ if (config->Find("locale").empty() && debug > 0)
+ cout << "Warning: unknown locale!\n";
+
+ if (max_hops)
+ {
+ config->Add("max_hop_count", max_hops);
+ }
+
+ // Set up credentials for this run
+ if (credentials.length())
+ config->Add("authorization", credentials);
+
+ //
+ // Check url_part_aliases and common_url_parts for
+ // errors.
+ String url_part_errors = HtURLCodec::instance()->ErrMsg();
+
+ if (url_part_errors.length() != 0)
+ reportError(form("Invalid url_part_aliases or common_url_parts: %s",
+ url_part_errors.get()));
+
+ //
+ // Check url_rewrite_rules for errors.
+ String url_rewrite_rules = HtURLRewriter::instance()->ErrMsg();
+
+ if (url_rewrite_rules.length() != 0)
+ reportError(form("Invalid url_rewrite_rules: %s",
+ url_rewrite_rules.get()));
+
+ //
+ // If indicated, change the database file names to have the .work
+ // extension
+ //
+ if (alt_work_area != 0)
+ {
+ String configValue = config->Find("doc_db");
+
+ if (configValue.length() != 0)
+ {
+ configValue << ".work";
+ config->Add("doc_db", configValue);
+ }
+
+ configValue = config->Find("word_db");
+ if (configValue.length() != 0)
+ {
+ configValue << ".work";
+ config->Add("word_db", configValue);
+ }
+
+ configValue = config->Find("doc_index");
+ if (configValue.length() != 0)
+ {
+ configValue << ".work";
+ config->Add("doc_index", configValue);
+ }
+
+ configValue = config->Find("doc_excerpt");
+ if (configValue.length() != 0)
+ {
+ configValue << ".work";
+ config->Add("doc_excerpt", configValue);
+ }
+
+ configValue = config->Find("md5_db");
+ if (configValue.length() != 0)
+ {
+ configValue << ".work";
+ config->Add("md5_db", configValue);
+ }
+ }
+
+ // Imports the cookies file
+ const String CookiesInputFile = config->Find("cookies_input_file");
+ if (CookiesInputFile.length())
+ {
+ if (debug>0)
+ cout << "Importing Cookies input file "
+ << CookiesInputFile << endl;
+ int result;
+ HtCookieJar::SetDebugLevel(debug); // Set the debug level
+ HtCookieInFileJar* cookie_file = new HtCookieInFileJar(CookiesInputFile, result);
+ if (cookie_file)
+ {
+ if (!result)
+ {
+ if (debug>0)
+ cookie_file->ShowSummary();
+ delete _cookie_jar; // Deletes previous cookie jar
+ _cookie_jar = (HtCookieJar*) cookie_file; // set the imported one
+ HtHTTP::SetCookieJar(_cookie_jar); // and set the new HTTP jar
+ }
+ else if (debug > 0)
+ cout << "Warning: Import failed! (" << CookiesInputFile << ")" << endl;
+ }
+ else
+ reportError(form("Unable to load cookies file '%s' in memory",
+ CookiesInputFile.get()));
+ }
+
+ //
+ // If needed, we will create a list of every URL we come across.
+ //
+ if (config->Boolean("create_url_list"))
+ {
+ const String filename = config->Find("url_list");
+ urls_seen = fopen(filename, initial ? "w" : "a");
+ if (urls_seen == 0)
+ {
+ reportError(form("Unable to create URL file '%s'",
+ filename.get()));
+ }
+ }
+
+ //
+ // If needed, we will create a list of every image we come across.
+ //
+ if (config->Boolean("create_image_list"))
+ {
+ const String filename = config->Find("image_list");
+ images_seen = fopen(filename, initial ? "w" : "a");
+ if (images_seen == 0)
+ {
+ reportError(form("Unable to create images file '%s'",
+ filename.get()));
+ }
+ }
+
+ //
+ // Set up the limits list
+ //
+ StringList l(config->Find("limit_urls_to"), " \t");
+ limits.setEscaped(l, config->Boolean("case_sensitive"));
+ l.Destroy();
+
+ l.Create(config->Find("limit_normalized"), " \t");
+ limitsn.setEscaped(l, config->Boolean("case_sensitive"));
+ l.Destroy();
+
+ //
+ // Open the document database
+ //
+ const String filename = config->Find("doc_db");
+ if (initial)
+ unlink(filename);
+
+ const String index_filename = config->Find("doc_index");
+ if (initial)
+ unlink(index_filename);
+
+ const String head_filename = config->Find("doc_excerpt");
+ if (initial)
+ unlink(head_filename);
+
+ if (docs.Open(filename, index_filename, head_filename) < 0)
+ {
+ reportError(form("Unable to open/create document database '%s'",
+ filename.get()));
+ }
+
+ const String word_filename = config->Find("word_db");
+ if (initial)
+ {
+ unlink(word_filename);
+ unlink((word_filename + "_weakcmpr").get());
+
+ // Remove "duplicate detection" database
+ unlink(config->Find("md5_db"));
+
+ // using -i, also ignore seen-but-not-processed URLs from last pass
+ unlink(config->Find("url_log"));
+ }
+
+ // Initialize htword
+ WordContext::Initialize(*config);
+
+ // Create the Retriever object which we will use to parse all the
+ // HTML files.
+ // In case this is just an update dig, we will add all existing
+ // URLs?
+ //
+ Retriever retriever(Retriever_logUrl);
+ if (minimalFile.length() == 0)
+ {
+ List *list = docs.URLs();
+ retriever.Initial(*list);
+ delete list;
+
+ // Add start_url to the initial list of the retriever.
+ // Don't check a URL twice!
+ // Beware order is important, if this bugs you could change
+ // previous line retriever.Initial(*list, 0) to Initial(*list,1)
+ retriever.Initial(config->Find("start_url"), 1);
+ }
+
+ // Handle list of URLs given in a file (stdin, if "-") specified as
+ // argument to -m or as an optional trailing argument.
+ if (optind < ac)
+ {
+ if (debug)
+ if (minimalFile.length() != 0)
+ cout << "Warning: argument " << av[optind]
+ << " overrides -m " << minimalFile << endl;
+ minimalFile = av[optind];
+ }
+ if (strcmp (minimalFile.get(), "-") == 0)
+ {
+ String str;
+ // Why not combine this with the code below, with input = stdin ?
+ while (!cin.eof())
+ {
+ cin >> str;
+ str.chop("\r\n"); // (Why "\r\n" here and "\r\n\t " below?)
+ if (str.length() > 0)
+ retriever.Initial(str, 1);
+ }
+ }
+ else if (minimalFile.length() != 0)
+ {
+ FILE *input = fopen(minimalFile.get(), "r");
+ char buffer[1000];
+
+ if (input)
+ {
+ while (fgets(buffer, sizeof(buffer), input))
+ {
+ String str(buffer);
+ str.chop("\r\n\t ");
+ if (str.length() > 0)
+ retriever.Initial(str, 1);
+ }
+ fclose(input);
+ }
+ else
+ {
+ cerr << "Could not open argument '" << minimalFile
+ << "' of flag -m\n";
+ exit (1);
+ }
+ }
+
+ //
+ // Go do it!
+ //
+ retriever.Start();
+
+ //
+ // All done with parsing.
+ //
+
+ //
+ // If the user so wants, create a text version of the document database.
+ //
+
+ if (create_text_database)
+ {
+ const String doc_list = config->Find("doc_list");
+ if (initial)
+ unlink(doc_list);
+ docs.DumpDB(doc_list);
+ const String word_dump = config->Find("word_dump");
+ if (initial)
+ unlink(word_dump);
+ HtWordList words(*config);
+ if(words.Open(config->Find("word_db"), O_RDONLY) == OK) {
+ words.Dump(word_dump);
+ }
+ }
+
+ //
+ // Cleanup
+ //
+ if (urls_seen)
+ fclose(urls_seen);
+ if (images_seen)
+ fclose(images_seen);
+
+ //
+ // If needed, report some statistics
+ //
+ if (report_statistics)
+ {
+ retriever.ReportStatistics("htdig");
+ }
+
+ // Shows End Time
+ if (debug>0)
+ {
+ EndTime.SettoNow();
+ cout << "ht://dig End Time: " << EndTime.GetAscTime() << endl;
+ }
+
+ if (_cookie_jar)
+ delete _cookie_jar;
+}
+
+
+//
+// Display usage information for the htdig program
+//
+void usage()
+{
+ cout << "usage: htdig [-v][-i][-c configfile][-t][-m minimalfile]\n";
+ cout << "This program is part of ht://Dig " << VERSION << "\n\n";
+ cout << "Options:\n";
+
+ cout << "\t-v\tVerbose mode. This increases the verbosity of the\n";
+ cout << "\t\tprogram. Using more than 2 is probably only useful\n";
+ cout << "\t\tfor debugging purposes. The default verbose mode\n";
+ cout << "\t\tgives a nice progress report while digging.\n\n";
+
+ cout << "\t-i\tInitial. Do not use any old databases. This is\n";
+ cout << "\t\taccomplished by first erasing the databases.\n\n";
+
+ cout << "\t-c configfile\n";
+ cout << "\t\tUse the specified configuration file instead of the\n";
+ cout << "\t\tdefault.\n\n";
+
+ cout << "\t-t\tCreate an ASCII version of the document database.\n";
+ cout << "\t\tThis database is easy to parse with other programs so\n";
+ cout << "\t\tthat information can be extracted from it.\n\n";
+
+ cout << "\t-h hopcount\n";
+ cout << "\t\tLimit the stored documents to those which are at\n";
+ cout << "\t\tmost hopcount links away from the start URL.\n\n";
+
+ cout << "\t-s\tReport statistics after completion.\n\n";
+
+ cout << "\t-u username:password\n";
+ cout << "\t\tTells htdig to send the supplied username and\n";
+ cout << "\t\tpassword with each HTTP request. The credentials\n";
+ cout << "\t\twill be encoded using the 'Basic' authentication scheme.\n";
+ cout << "\t\tThere *HAS* to be a colon (:) between the username\n";
+ cout << "\t\tand password.\n\n";
+
+ cout << "\t-a\tUse alternate work files.\n";
+ cout << "\t\tTells htdig to append .work to database files, causing\n";
+ cout << "\t\ta second copy of the database to be built. This allows\n";
+ cout << "\t\tthe original files to be used by htsearch during the\n";
+ cout << "\t\tindexing run.\n\n";
+
+ cout << "\t-m minimalfile (or just a file name at end of arguments)\n";
+ cout << "\t\tTells htdig to read URLs from the supplied file and index\n";
+ cout << "\t\tthem in place of (or in addition to) the existing URLs in\n";
+ cout << "\t\tthe database and the start_url. With the -m, only the\n";
+ cout << "\t\tURLs specified are added to the database. A file name of\n";
+ cout << "\t\t'-' indicates the standard input.\n\n";
+
+
+
+ exit(0);
+}
+
+//
+// Report an error and die
+//
+void reportError(char *msg)
+{
+ cout << "htdig: " << msg << "\n\n";
+ exit(1);
+}
+