diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/httools/htmerge.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/httools/htmerge.cc | 403 |
1 files changed, 403 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/httools/htmerge.cc b/debian/htdig/htdig-3.2.0b6/httools/htmerge.cc new file mode 100644 index 00000000..d25267fb --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/httools/htmerge.cc @@ -0,0 +1,403 @@ +// +// htmerge.cc +// +// htmerge: Merges two databases and/or updates databases to remove +// old documents and ensures the databases are consistent. +// Calls db.cc, docs.cc, and/or words.cc as necessary +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: htmerge.cc,v 1.7 2004/05/28 13:15:25 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "WordContext.h" +#include "good_strtok.h" +#include "defaults.h" +#include "DocumentDB.h" +#include "HtURLCodec.h" +#include "HtWordList.h" +#include "HtWordReference.h" +#include "htString.h" + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <stdio.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +#include <stdlib.h> +#include <ctype.h> +#include <string.h> + +// If we have this, we probably want it. +#ifdef HAVE_GETOPT_H +#include <getopt.h> +#elif HAVE_GETOPT_LOCAL +#include <getopt_local.h> +#endif + + +// +// This hash is used to keep track of all the document IDs which have to be +// discarded. +// This is generated from the doc database and is used to prune words +// from the word db +// +Dictionary discard_list; + + +// This config is used for merging multiple databses +HtConfiguration merge_config; + +int verbose = 0; +int stats = 0; + +// Component procedures +void mergeDB(); +void usage(); +void reportError(char *msg); + +//***************************************************************************** +// int main(int ac, char **av) +// +int main(int ac, char **av) +{ + int alt_work_area = 0; + String configfile = DEFAULT_CONFIG_FILE; + String merge_configfile = 0; + int c; + extern char *optarg; + + while ((c = getopt(ac, av, "svm:c:dwa")) != -1) + { + switch (c) + { + case 'd': + break; + case 'w': + break; + case 'c': + configfile = optarg; + break; + case 'm': + merge_configfile = optarg; + break; + case 'v': + verbose++; + break; + case 's': + break; + case 'a': + alt_work_area++; + break; + case '?': + usage(); + break; + } + } + + HtConfiguration* config= HtConfiguration::config(); + config->Defaults(&defaults[0]); + + if (access((char*)configfile, R_OK) < 0) + { + reportError(form("Unable to find configuration file '%s'", + configfile.get())); + } + + config->Read(configfile); + + // + // Check url_part_aliases and common_url_parts for + // errors. + String url_part_errors = HtURLCodec::instance()->ErrMsg(); + + if (url_part_errors.length() != 0) + reportError(form("Invalid url_part_aliases or common_url_parts: %s", + url_part_errors.get())); + + if (merge_configfile.length()) + { + merge_config.Defaults(&defaults[0]); + if (access((char*)merge_configfile, R_OK) < 0) + { + reportError(form("Unable to find configuration file '%s'", + merge_configfile.get())); + } + merge_config.Read(merge_configfile); + } + + if (alt_work_area != 0) + { + String configValue; + + configValue = config->Find("word_db"); + if (configValue.length() != 0) + { + configValue << ".work"; + config->Add("word_db", configValue); + } + + configValue = config->Find("doc_db"); + if (configValue.length() != 0) + { + configValue << ".work"; + config->Add("doc_db", configValue); + } + + configValue = config->Find("doc_index"); + if (configValue.length() != 0) + { + configValue << ".work"; + config->Add("doc_index", configValue); + } + + configValue = config->Find("doc_excerpt"); + if (configValue.length() != 0) + { + configValue << ".work"; + config->Add("doc_excerpt", configValue); + } + } + + WordContext::Initialize(*config); + + if (merge_configfile.length()) + { + // Merge the databases specified in merge_configfile into the current + // databases. Do this first then update the other databases as usual + // Note: We don't have to specify anything, it's all in the config vars + + mergeDB(); + } + + return 0; +} + +//***************************************************************************** +// void mergeDB() +// +void +mergeDB() +{ + HtConfiguration* config= HtConfiguration::config(); + DocumentDB merge_db, db; + List *urls; + Dictionary merge_dup_ids, db_dup_ids; // Lists of DocIds to ignore + int docIDOffset; + + const String doc_index = config->Find("doc_index"); + if (access(doc_index, R_OK) < 0) + { + reportError(form("Unable to open document index '%s'", (const char*)doc_index)); + } + const String doc_excerpt = config->Find("doc_excerpt"); + if (access(doc_excerpt, R_OK) < 0) + { + reportError(form("Unable to open document excerpts '%s'", (const char*)doc_excerpt)); + } + const String doc_db = config->Find("doc_db"); + if (db.Open(doc_db, doc_index, doc_excerpt) < 0) + { + reportError(form("Unable to open/create document database '%s'", + (const char*)doc_db)); + } + + + const String merge_doc_index = merge_config["doc_index"]; + if (access(merge_doc_index, R_OK) < 0) + { + reportError(form("Unable to open document index '%s'", (const char*)merge_doc_index)); + } + const String merge_doc_excerpt = merge_config["doc_excerpt"]; + if (access(merge_doc_excerpt, R_OK) < 0) + { + reportError(form("Unable to open document excerpts '%s'", (const char*)merge_doc_excerpt)); + } + const String merge_doc_db = merge_config["doc_db"]; + if (merge_db.Open(merge_doc_db, merge_doc_index, merge_doc_excerpt) < 0) + { + reportError(form("Unable to open document database '%s'", + (const char*)merge_doc_db)); + } + + // Start the merging by going through all the URLs that are in + // the database to be merged + + urls = merge_db.URLs(); + // This ensures that every document added from merge_db has a unique ID + // in the new database + docIDOffset = db.NextDocID(); + + urls->Start_Get(); + String *url; + String id; + while ((url = (String *) urls->Get_Next())) + { + DocumentRef *ref = merge_db[url->get()]; + DocumentRef *old_ref = db[url->get()]; + if (!ref) + continue; + + if (old_ref) + { + // Oh well, we knew this would happen. Let's get the duplicate + // And we'll only use the most recent date. + + if ( old_ref->DocTime() >= ref->DocTime() ) + { + // Cool, the ref we're merging is too old, just ignore it + char str[20]; + sprintf(str, "%d", ref->DocID()); + merge_dup_ids.Add(str, 0); + + if (verbose > 1) + { + cout << "htmerge: Duplicate, URL: " << url << " ignoring merging copy \n"; + cout.flush(); + } + } + else + { + // The ref we're merging is newer, delete the old one and add + char str[20]; + sprintf(str, "%d", old_ref->DocID()); + db_dup_ids.Add(str, 0); + db.Delete(old_ref->DocID()); + ref->DocID(ref->DocID() + docIDOffset); + db.Add(*ref); + if (verbose > 1) + { + cout << "htmerge: Duplicate, URL: "; + cout << url->get() << " ignoring destination copy \n"; + cout.flush(); + } + } + } + else + { + // It's a new URL, just add it, making sure to load the excerpt + merge_db.ReadExcerpt(*ref); + ref->DocID(ref->DocID() + docIDOffset); + db.Add(*ref); + if (verbose > 1) + { + cout << "htmerge: Merged URL: " << url->get() << " \n"; + cout.flush(); + } + } + delete ref; + delete old_ref; + } + delete urls; + + // As reported by Roman Dimov, we must update db.NextDocID() + // because of all the added records... + db.IncNextDocID( merge_db.NextDocID() ); + merge_db.Close(); + db.Close(); + + // OK, after merging the doc DBs, we do the same for the words + HtWordList mergeWordDB(*config), wordDB(*config); + List *words; + String docIDKey; + + if (wordDB.Open(config->Find("word_db"), O_RDWR) < 0) + { + reportError(form("Unable to open/create document database '%s'", + (const char*)config->Find("word_db"))); + } + + if (mergeWordDB.Open(merge_config["word_db"], O_RDONLY) < 0) + { + reportError(form("Unable to open document database '%s'", + (const char *)merge_config["word_db"])); + } + + // Start the merging by going through all the URLs that are in + // the database to be merged + + words = mergeWordDB.WordRefs(); + + words->Start_Get(); + HtWordReference *word; + while ((word = (HtWordReference *) words->Get_Next())) + { + docIDKey = word->DocID(); + if (merge_dup_ids.Exists(docIDKey)) + continue; + + word->DocID(word->DocID() + docIDOffset); + wordDB.Override(*word); + } + delete words; + + words = wordDB.WordRefs(); + words->Start_Get(); + while ((word = (HtWordReference *) words->Get_Next())) + { + docIDKey = word->DocID(); + if (db_dup_ids.Exists(docIDKey)) + wordDB.Delete(*word); + } + delete words; + + // Cleanup--just close the two word databases + mergeWordDB.Close(); + wordDB.Close(); +} + + +//***************************************************************************** +// void usage() +// Display program usage information +// +void usage() +{ + cout << "usage: htmerge [-v][-c configfile][-m merge_configfile]\n"; + cout << "This program is part of ht://Dig " << VERSION << "\n\n"; + cout << "Options:\n"; + cout << "\t-v\tVerbose mode. This increases the verbosity of the\n"; + cout << "\t\tprogram. Using more than 2 is probably only useful\n"; + cout << "\t\tfor debugging purposes. The default verbose mode\n"; + cout << "\t\tgives a progress on what it is doing and where it is.\n\n"; + cout << "\t-m merge_configfile\n"; + cout << "\t\tMerge the databases specified into the databases specified\n"; + cout << "\t\tby -c or the default.\n\n"; + cout << "\t-c configfile\n"; + cout << "\t\tUse the specified configuration file instead on the\n"; + cout << "\t\tdefault.\n\n"; + cout << "\t-a\tUse alternate work files.\n"; + cout << "\t\tTells htmerge to append .work to database files causing\n"; + cout << "\t\ta second copy of the database to be built. This allows\n"; + cout << "\t\toriginal files to be used by htsearch during the indexing\n"; + cout << "\t\trun.\n\n"; + exit(0); +} + + +//***************************************************************************** +// Report an error and die +// +void reportError(char *msg) +{ + cout << "htmerge: " << msg << "\n\n"; + exit(1); +} |