diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/libhtdig')
17 files changed, 7124 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc new file mode 100644 index 00000000..d6862550 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc @@ -0,0 +1,316 @@ +// +// BasicDocument.cc +// +// 2/6/2002 created for libhtdig to simplify & mimic Document.cc +// +// Neal Richter nealr@rightnow.com +// +// +// BasicDocument: This class holds everything there is to know about a document. +// The actual contents of the document may or may not be present at +// all times for memory conservation reasons. +// +// This is a basic extensable container for plain text holding documents. +// +// Uses any Parser with parse method handling this class. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: BasicDocument.cc,v 1.3 2004/05/28 13:15:28 lha Exp $ +// +//-------------------------------------------------------------------- + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <signal.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <ctype.h> + +#include "BasicDocument.h" +#include "TextCollector.h" +#include "StringList.h" +#include "htdig.h" +#include "Plaintext.h" +#include "HTML.h" +#include "ExternalParser.h" +#include "lib.h" + +#include "defaults.h" + +#if 1 +typedef void (*SIGNAL_HANDLER) (...); +#else +typedef SIG_PF SIGNAL_HANDLER; +#endif + +//***************************************************************************** +// BasicDocument::BasicDocument(char *loc) +// Initialize with the given loc-parameter as the location for this document. +// If the max_size is given, use that for size, otherwise use the +// config value. 
+// +BasicDocument::BasicDocument(char *loc, int suggested_size) +{ + int temp_size = 0; + + id = 0; + location = 0; + title = 0; + metacontent = 0; + contents = 0; + document_length = 0; + + + HtConfiguration *config = HtConfiguration::config(); + + //We probably need to move assignment of max_doc_size, according + //to a configuration value. + + if (suggested_size > 0) + temp_size = suggested_size; + else + temp_size = config->Value("max_doc_size"); + + contents.allocate(temp_size + 100); + + contentType = ""; + + if (loc) + { + Location(loc); + } +} + + +//***************************************************************************** +// BasicDocument::~BasicDocument() +// +BasicDocument::~BasicDocument() +{ + // We delete only the derived class objects + +#if MEM_DEBUG + char *p = new char; + cout << "==== BasicDocument deleted: " << this << " new at " << ((void *) p) << endl; + delete p; +#endif +} + + +//***************************************************************************** +// void BasicDocument::Reset() +// Restore the BasicDocument object to an initial state. +// +void +BasicDocument::Reset() +{ + + id = 0; + location = 0; + title = 0; + metacontent = 0; + contents = 0; + + contentType = 0; + document_length = 0; + +} + +//***************************************************************************** +// void BasicDocument::Length() +// Return/Calc length of BasicDocument... icummulative size of the Strings +// +int +BasicDocument::Length() +{ + if (document_length < 0) + { + document_length = 0; + document_length += location.length(); + document_length += title.length(); + document_length += metacontent.length(); + document_length += contents.length(); + document_length += id.length(); + } + + return (document_length); +} + + +//***************************************************************************** +// Parsable *BasicDocument::getParsable() +// Given the content-type of a document, returns a document parser. 
+// This will first look through the list of user supplied parsers and +// then at our (limited) builtin list of parsers. The user supplied +// parsers are external programs that will be used. + +Parsable * +BasicDocument::getParsable() +{ + static HTML *html = 0; + static Plaintext *plaintext = 0; + static ExternalParser *externalParser = 0; + + Parsable *parsable = 0; + + if (ExternalParser::canParse(contentType)) + { + if (externalParser) + { + delete externalParser; + } + externalParser = new ExternalParser(contentType); + parsable = externalParser; + } + else if (mystrncasecmp((char *) contentType, "text/html", 9) == 0) + { + if (!html) + html = new HTML(); + parsable = html; + } + else if (mystrncasecmp((char *) contentType, "text/plain", 10) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + } + else if (mystrncasecmp((char *) contentType, "text/css", 8) == 0) + { + return NULL; + } + else if (mystrncasecmp((char *) contentType, "text/", 5) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + if (debug > 1) + { + cout << '"' << contentType << "\" not a recognized type. Assuming text/plain\n"; + } + } + else + { + if (debug > 1) + { + cout << '"' << contentType << "\" not a recognized type. 
Ignoring\n"; + } + return NULL; + } + + parsable->setContents(contents.get(), contents.length()); + return parsable; +} + +//***************************************************************************** +// +// Test for self parseaable +// +int +BasicDocument::SelfParseable() +{ + + if (mystrncasecmp((char *) contentType, "text/vnd.customdocument", 10) == 0) + { + return (TRUE); + } + else + return (FALSE); + +} + + +//***************************************************************************** +// Parsable *BasicDocument::internalParser() +int +BasicDocument::internalParser(TextCollector & textcollector) +{ + HtConfiguration* config= HtConfiguration::config(); + char *position = NULL; + static int minimumWordLength = config->Value("minimum_word_length", 3); + int wordIndex = 1; + String word; + int letter_count = 0; + + //First Process Title + textcollector.got_title((char *) title); + + //Next Process Contents + position = contents; + + while (*position) + { + word = 0; + + if (HtIsStrictWordChar(*position)) + { + // + // Start of a word. Try to find the whole thing + // + //TODO NEAL RICHTER Imposed a 50-letter word length limit here + // + while (*position && HtIsWordChar(*position) && (letter_count < 50)) + { + word << *position; + position++; + letter_count++; + } + + letter_count = 0; + if (word.length() >= minimumWordLength) + { + textcollector.got_word((char *) word, wordIndex++, 0); + } + } + + if (*position) + position++; + + }//end while + + textcollector.got_head((char*) contents); + + //Third, Process MetaContent + position = metacontent; + textcollector.got_meta_dsc(metacontent); + + + //max_meta_description_length??? + + while (*position) + { + word = 0; + + if (HtIsStrictWordChar(*position)) + { + // + // Start of a word. 
Try to find the whole thing + // + while (*position && HtIsWordChar(*position) && (letter_count < 50)) + { + word << *position; + position++; + letter_count++; + } + + letter_count = 0; + + if (word.length() >= minimumWordLength) + { + textcollector.got_word((char *) word, wordIndex++, 9); + } + } + + if (*position) + position++; + + }//end while + + return(1); +} diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.h b/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.h new file mode 100644 index 00000000..9d4a2a73 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.h @@ -0,0 +1,119 @@ +//-------------------------------------------------------------------- +// +// BasicDocument.h +// +// 2/6/2002 created for libhtdig to simplify & mimic Document.cc +// +// Neal Richter nealr@rightnow.com +// +// +// BasicDocument: This class holds everything there is to know about a document. +// The actual contents of the document may or may not be present at +// all times for memory conservation reasons. +// +// This is a basic extensable container for plain text holding documents. +// +// Uses any Parser with parse method handling this class. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: BasicDocument.h,v 1.4 2004/05/28 13:15:28 lha Exp $ +// +//-------------------------------------------------------------------- + + + + +#ifndef _BasicDocument_h_ +#define _BasicDocument_h_ + +#include "htString.h" +#include "Parsable.h" +#include "Object.h" +#include "StringList.h" +#include "HtDateTime.h" + + +class TextCollector; + + +class BasicDocument:public Object +{ + public: + // + // Construction/Destruction + // + BasicDocument(char *location = 0, int max_size = 0); + ~BasicDocument(); + + // + // Interface to the document. + // + void Reset(); + int Length(); + + //int StoredLength() {return contents.length();} + + char *Title() {return title;} + void Title(char *t) {title = t; document_length = -1;} + void Title(const String & t) {title = t; document_length = -1;} + int TitleLength() {return title.length();} + + char *MetaContent() {return metacontent;} + void MetaContent(char *m) {metacontent = m; document_length = -1;} + void MetaContent(const String & m) {metacontent = m; document_length = -1;} + int MetaContentLength() {return metacontent.length();} + + char *Contents() {return contents;} + void Contents(char *s) {contents = s; document_length = -1;} + void Contents(const String & s) {contents = s; document_length = -1;} + int ContentsLength() {return contents.length();} + + char *Location() {return location;} + void Location(char *l) {location = l; document_length = -1;} + void Location(const String & l) {location = l; document_length = -1;} + int LocationLength() {return location.length();} + + char *DocumentID() {return id;} + void DocumentID(char *ida) {id = ida; document_length = -1;} + void DocumentID(const String & ida) {id = ida; 
document_length = -1;} + int DocumentIDLength() {return id.length();} + + char *ContentType() {return contentType;} + void ContentType(char *ct) {contentType = ct;} + void ContentType(const String & ct) {contentType = ct;} + + time_t ModTime() {return modtime.GetTime_t();} + void ModTime(time_t t) {modtime = t;} + + // + // Return an appropriate parsable object for the document type. + // + Parsable *getParsable(); + + int internalParser(TextCollector & textcollector); + int SelfParseable(); + + private: + + String id; + String location; + String title; + String metacontent; + String contents; + + String contentType; + + HtDateTime modtime; + + int document_length; + + //int max_doc_size; + +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile b/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile new file mode 100644 index 00000000..01f78ec4 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile @@ -0,0 +1,182 @@ +#libhtdig.so makefile for Unix systems + +LIBHTDIG_BUILD_VER = 3.2.1 + + +#Berkeley DB Specific defines +BDB_INC_DIRS = -I../db -I/usr/local/include +BDB_DEFS = -DHAVE_CONFIG_H + +INC_DIRS= -I. -I../htdig -I../include -I../db -I../htlib -I../htnet -I../htcommon -I../htword -I../htfuzzy -I../htsearch -I/usr/local/include + +#HTDIG_INC_DIR= -I../htdig + +#define your own defaults here! 
+DEFAULT_CONFIG_FILE=\"/etc/htdig/htdig.conf\" +DEFAULT_DB_PATH=\"/etc/htdig/\" +BIN_DIR=\"/usr/local/bin\" +COMMON_DIR=\"/usr/local/share/htdig\" +CONFIG_DIR=\"/etc/htdig\" +DATABASE_DIR=\"/var/lib/htdig\" +IMAGE_URL_PREFIX=\"/htdig\" + +DEFS= -DHAVE_CONFIG_H -DDEFAULT_CONFIG_FILE=$(DEFAULT_CONFIG_FILE) -DBIN_DIR=$(BIN_DIR) -DCOMMON_DIR=$(COMMON_DIR) +DEFS+= -DCONFIG_DIR=$(CONFIG_DIR) -DDATABASE_DIR=$(DATABASE_DIR) -DIMAGE_URL_PREFIX=$(IMAGE_URL_PREFIX) + +#LEX define based on your system +LEX = flex +AM_LFLAGS = -L +LEX_OUTPUT_ROOT = lex.yy + +#YACC define based on your system +YACC = bison -y +AM_YFLAGS = -l -d +YACC_OUTPUT_ROOT = y.tab + + + + +ifdef INSURE +CC=insure gcc +CXX=insure g++ +else +CC=gcc +CXX=g++ +endif + + +#OPTIMZ=-O2 +OPTIMZ=-O0 +#OPTS=$(OPTIMZ) -Wall -fno-rtti -fno-exceptions -Werror +OPTS=$(OPTIMZ) -Wall -fno-rtti -fno-exceptions -fPIC +DBG= -g $(PROFILING) + +ifdef FULLDEBUG +DBG+= -DDEBUG -DDEBUG_CMPR $(INC_DIRS) +endif + +ifdef DEBUG +DBG+= -DDEBUG +endif + +ifdef EXPKEY +OPTS += -DEXPKEY +endif + +LIB_DIRS= + +#use '-G' with Solaris +LFLAGS = -lc -lstdc++ +#LFLAGS = -lc -lstdc++ -G + + +LIBZ = -lz +#use this if you've built an -fPIC version of libz.a to link into the libhtdig.so +#LIBZ = -lz-fpic + +#berkeley db c files +BDB_C_OBJS += bt_compare.o bt_conv.o bt_curadj.o bt_cursor.o bt_delete.o bt_method.o bt_open.o bt_put.o bt_rec.o bt_reclaim.o bt_recno.o bt_rsearch.o bt_search.o bt_split.o bt_stat.o bt_upgrade.o btree_auto.o crdel_auto.o crdel_rec.o db.o db_am.o db_auto.o db_byteorder.o db_conv.o db_dispatch.o db_dup.o db_err.o db_getlong.o db_iface.o db_join.o db_log2.o db_meta.o db_method.o db_overflow.o db_pr.o db_rec.o db_reclaim.o db_ret.o db_salloc.o db_shash.o db_upgrade.o env_method.o env_open.o env_recover.o env_region.o hash.o hash_auto.o hash_conv.o hash_dup.o hash_func.o hash_meta.o hash_method.o hash_page.o hash_rec.o hash_reclaim.o hash_stat.o hash_upgrade.o lock.o lock_conflict.o lock_deadlock.o lock_region.o lock_util.o 
log.o log_archive.o log_auto.o log_compare.o log_findckp.o log_get.o log_method.o log_put.o log_rec.o log_register.o mp_alloc.o mp_bh.o mp_cmpr.o mp_fget.o mp_fopen.o mp_fput.o mp_fset.o mp_method.o mp_region.o mp_register.o mp_stat.o mp_sync.o mp_trickle.o mut_fcntl.o mut_pthread.o mut_tas.o mutex.o os_abs.o os_alloc.o os_dir.o os_errno.o os_fid.o os_finit.o os_fsync.o os_handle.o os_map.o os_method.o os_oflags.o os_open.o os_region.o os_rename.o os_root.o os_rpath.o os_rw.o os_seek.o os_sleep.o os_spin.o os_stat.o os_tmpdir.o os_unlink.o qam.o qam_auto.o qam_conv.o qam_method.o qam_open.o qam_rec.o qam_stat.o txn.o txn_auto.o txn_rec.o txn_region.o xa.o xa_db.o xa_map.o + +#htlib c files $(REGEX) +HTLIB_C_OBJS += getcwd.o mhash_md5.o regex.o vsnprintf.o memcmp.o mktime.o snprintf.o memcpy.o myqsort.o strerror.o memmove.o raise.o timegm.o + +#htlib c++ files +HTLIB_CXX_OBJS += Configuration.o Database.o Dictionary.o DB2_db.o IntObject.o List.o Object.o ParsedString.o Queue.o QuotedStringList.o Stack.o String.o StringList.o StringMatch.o String_fmt.o good_strtok.o strcasecmp.o strptime.o HtCodec.o HtWordCodec.o HtVector.o HtHeap.o HtPack.o HtDateTime.o HtRegex.o HtRegexList.o HtRegexReplace.o HtRegexReplaceList.o HtVectorGeneric.o HtMaxMin.o HtWordType.o md5.o + +#htword c++ files +HTWORD_CXX_OBJS += WordBitCompress.o WordContext.o WordCursor.o WordDB.o WordDBCompress.o WordDBInfo.o WordDBPage.o WordKey.o WordKeyInfo.o WordList.o WordMonitor.o WordRecord.o WordRecordInfo.o WordReference.o WordStat.o WordType.o + +#htcommon c++ files +HTCOMMON_CXX_OBJS += DocumentDB.o DocumentRef.o HtWordReference.o HtWordList.o defaults.o HtURLCodec.o URL.o URLTrans.o HtZlibCodec.o cgi.o HtSGMLCodec.o HtConfiguration.o HtURLRewriter.o + +#htnet c++ files +HTNET_CXX_OBJS += Connection.o Transport.o HtHTTP.o HtFile.o HtNNTP.o HtCookie.o HtCookieJar.o HtCookieMemJar.o HtHTTPBasic.o HtHTTPSecure.o SSLConnection.o HtFTP.o HtCookieInFileJar.o + +#htdig c++ files +HTDIG_CXX_OBJS += 
Document.o ExternalTransport.o Parsable.o Retriever.o URLRef.o ExternalParser.o HTML.o Plaintext.o Server.o + +#htfuzzy c++ files +HTFUZZY_CXX_OBJS += Accents.o EndingsDB.o Fuzzy.o Prefix.o Soundex.o Substring.o Synonym.o Endings.o Exact.o Metaphone.o Regexp.o Speling.o SuffixEntry.o filecopy.o + +#HTFUZZY_C_OBJS += filecopy.o + +#htsearch c++ files +HTSEARCH_CXX_OBJS += Collection.o DocMatch.o ResultList.o SplitMatches.o TemplateList.o Display.o HtURLSeedScore.o ResultMatch.o Template.o WeightWord.o parser.o + +#libhtdig c++ files +LIBHTDIG_CXX_OBJS += ResultFetch.o BasicDocument.o TextCollector.o libhtdig_htdig.o libhtdig_htmerge.o libhtdig_htfuzzy.o libhtdig_log.o libhtdig_htsearch.o + +#htcommon lex & yacc targets +LIBHTDIG_CXX_OBJS += conf_lexer.o conf_parser.o + +#libhtdig c files +#LIBHTDIG_C_OBJS += filecopy.o + +LXX_TARGETS += conf_lexer.cc + +YXX_TARGETS += conf_parser.cc + +OBJS += $(BDB_C_OBJS) $(HTLIB_C_OBJS) $(HTLIB_CXX_OBJS) $(HTWORD_CXX_OBJS) +OBJS += $(HTCOMMON_CXX_OBJS) $(HTNET_CXX_OBJS) $(HTDIG_CXX_OBJS) +OBJS += $(HTFUZZY_CXX_OBJS) $(HTFUZZY_C_OBJS) $(HTSEARCH_CXX_OBJS) $(LIBHTDIG_CXX_OBJS) $(LIBHTDIG_C_OBJS) + + +libhtdig-3.2.0.so: $(OBJS) $(LXX_TARGETS) $(YXX_TARGETS) + $(CC) -shared $(LIB_DIRS) $(OTHER_OBJS) $(OBJS) -L/usr/local/lib $(LIBZ) $(LFLAGS) -Xlinker -h -Xlinker libhtdig.so.$(LIBHTDIG_BUILD_VER) -o libhtdig.so.$(LIBHTDIG_BUILD_VER) + + +libhtdig.a: $(OBJS) $(LXX_TARGETS) $(YXX_TARGETS) + ar cru libhtdig.a $(OTHER_OBJS) $(OBJS) + ranlib libhtdig.a + + +$(BDB_C_OBJS): %.o: ../db/%.c + $(CC) $(BDB_INC_DIRS) $(BDB_DEFS) $(OPTS) $(DBG) -c $< -o $@ + +$(HTLIB_C_OBJS): %.o: ../htlib/%.c + $(CC) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@ + +$(HTLIB_CXX_OBJS): %.o: ../htlib/%.cc + $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@ + +$(HTWORD_CXX_OBJS): %.o: ../htword/%.cc + $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@ + +$(HTCOMMON_CXX_OBJS): %.o: ../htcommon/%.cc + $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@ + 
+$(HTNET_CXX_OBJS): %.o: ../htnet/%.cc + $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@ + +$(HTDIG_CXX_OBJS): %.o: ../htdig/%.cc + $(CXX) $(HTDIG_INC_DIR) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@ + +$(HTFUZZY_C_OBJS): %.o: ../htfuzzy/%.c + $(CC) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@ + +$(HTFUZZY_CXX_OBJS): %.o: ../htfuzzy/%.cc + $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@ + +$(HTSEARCH_CXX_OBJS): %.o: ../htsearch/%.cc + $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@ + +$(LIBHTDIG_CXX_OBJS): %.o: %.cc + $(CXX) $(HTDIG_INC_DIR) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@ + +$(LIBHTDIG_C_OBJS): %.o: %.c + $(CC) $(HTDIG_INC_DIR) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@ + +$(LXX_TARGETS): %.cc: ../htcommon/%.lxx + $(LEX) $(AM_LFLAGS) $< && mv $(LEX_OUTPUT_ROOT).c $@ + #$(LEX) $(AM_LFLAGS) $(LFLAGS) -o$@ $< + +$(YXX_TARGETS): %.cc: ../htcommon/%.yxx + $(YACC) $(AM_YFLAGS) $< && mv $(YACC_OUTPUT_ROOT).c $@ + if test -f y.tab.h; then if cmp -s y.tab.h conf_parser.h; then rm -f y.tab.h; else mv y.tab.h conf_parser.h; fi; else :; fi + + +clean: + rm -f *.o *~ *.bak *.lo *.a* *.so* core $(LXX_TARGETS) $(YXX_TARGETS) + + + diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile.win32 b/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile.win32 new file mode 100644 index 00000000..da1dfb62 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile.win32 @@ -0,0 +1,173 @@ +# +# Makefile - makefile for libhtdig +# + +PRODUCT = htdig + +TARGET = $(LIBDIR)/libhtdig$(DLLSFX) + +include ../Makedefs.win32 + +ARCH = win32 +#MV = move +MV = mv + + +#define your own defaults here! 
+DEFAULT_CONFIG_FILE=\"/etc/htdig/htdig.conf\" +DEFAULT_DB_PATH=\"/etc/htdig/\" +BIN_DIR=\"/usr/local/bin\" +COMMON_DIR=\"/usr/local/share/htdig\" +CONFIG_DIR=\"/etc/htdig\" +DATABASE_DIR=\"/var/lib/htdig\" +IMAGE_URL_PREFIX=\"/htdig\" + +DEFS= -DHAVE_CONFIG_H -DDEFAULT_CONFIG_FILE=$(DEFAULT_CONFIG_FILE) -DBIN_DIR=$(BIN_DIR) -DCOMMON_DIR=$(COMMON_DIR) +DEFS+= -DCONFIG_DIR=$(CONFIG_DIR) -DDATABASE_DIR=$(DATABASE_DIR) -DIMAGE_URL_PREFIX=$(IMAGE_URL_PREFIX) + +#LEX define based on your system +LEX = flex +AM_LFLAGS = -L +LEX_OUTPUT_ROOT = lex.yy + +#YACC define based on your system +YACC = bison -y +AM_YFLAGS = -l -d +YACC_OUTPUT_ROOT = y.tab + + +# ----------------------------------------------------------------------------- +# add new library members to this list + +#berkeley db c files +BDB_C_OBJS += bt_compare.obj bt_conv.obj bt_curadj.obj bt_cursor.obj bt_delete.obj bt_method.obj bt_open.obj bt_put.obj bt_rec.obj bt_reclaim.obj bt_recno.obj bt_rsearch.obj bt_search.obj bt_split.obj bt_stat.obj bt_upgrade.obj btree_auto.obj crdel_auto.obj crdel_rec.obj db.obj db_am.obj db_auto.obj db_byteorder.obj db_conv.obj db_dispatch.obj db_dup.obj db_err.obj db_getlong.obj db_iface.obj db_join.obj db_log2.obj db_meta.obj db_method.obj db_overflow.obj db_pr.obj db_rec.obj db_reclaim.obj db_ret.obj db_salloc.obj db_shash.obj db_upgrade.obj env_method.obj env_open.obj env_recover.obj env_region.obj hash.obj hash_auto.obj hash_conv.obj hash_dup.obj hash_func.obj hash_meta.obj hash_method.obj hash_page.obj hash_rec.obj hash_reclaim.obj hash_stat.obj hash_upgrade.obj lock.obj lock_conflict.obj lock_deadlock.obj lock_region.obj lock_util.obj log.obj log_archive.obj log_auto.obj log_compare.obj log_findckp.obj log_get.obj log_method.obj log_put.obj log_rec.obj log_register.obj mp_alloc.obj mp_bh.obj mp_cmpr.obj mp_fget.obj mp_fopen.obj mp_fput.obj mp_fset.obj mp_method.obj mp_region.obj mp_register.obj mp_stat.obj mp_sync.obj mp_trickle.obj mut_fcntl.obj mut_pthread.obj mut_tas.obj 
mutex.obj os_abs.obj os_alloc.obj os_dir.obj os_errno.obj os_fid.obj os_finit.obj os_fsync.obj os_handle.obj os_map.obj os_method.obj os_oflags.obj os_open.obj os_region.obj os_rename.obj os_root.obj os_rpath.obj os_rw.obj os_seek.obj os_sleep.obj os_spin.obj os_stat.obj os_tmpdir.obj os_unlink.obj qam.obj qam_auto.obj qam_conv.obj qam_method.obj qam_open.obj qam_rec.obj qam_stat.obj txn.obj txn_auto.obj txn_rec.obj txn_region.obj xa.obj xa_db.obj xa_map.obj + +ifdef WINDIR +BDB_C_OBJS += dirent_local.obj +endif + +#htlib c files $(REGEX) +HTLIB_C_OBJS += getcwd.obj mhash_md5.obj regex.obj vsnprintf.obj memcmp.obj mktime.obj snprintf.obj memcpy.obj myqsort.obj strerror.obj memmove.obj raise.obj timegm.obj + +#htlib c++ files +HTLIB_CXX_OBJS += Configuration.obj Database.obj Dictionary.obj DB2_db.obj IntObject.obj List.obj Object.obj ParsedString.obj Queue.obj QuotedStringList.obj Stack.obj String.obj StringList.obj StringMatch.obj String_fmt.obj good_strtok.obj strcasecmp.obj strptime.obj HtCodec.obj HtWordCodec.obj HtVector.obj HtHeap.obj HtPack.obj HtDateTime.obj HtRegex.obj HtRegexList.obj HtRegexReplace.obj HtRegexReplaceList.obj HtVectorGeneric.obj HtMaxMin.obj HtWordType.obj md5.obj filecopy.obj + +#htword c++ files +HTWORD_CXX_OBJS += WordBitCompress.obj WordContext.obj WordCursor.obj WordDB.obj WordDBCompress.obj WordDBInfo.obj WordDBPage.obj WordKey.obj WordKeyInfo.obj WordList.obj WordMonitor.obj WordRecord.obj WordRecordInfo.obj WordReference.obj WordStat.obj WordType.obj + +#htcommon c++ files +HTCOMMON_CXX_OBJS += DocumentDB.obj DocumentRef.obj HtWordReference.obj HtWordList.obj defaults.obj HtURLCodec.obj URL.obj URLTrans.obj HtZlibCodec.obj cgi.obj HtSGMLCodec.obj HtConfiguration.obj HtURLRewriter.obj + +#htnet c++ files +HTNET_CXX_OBJS += Connection.obj Transport.obj HtHTTP.obj HtFile.obj HtNNTP.obj HtCookie.obj HtCookieJar.obj HtCookieMemJar.obj HtHTTPBasic.obj HtHTTPSecure.obj SSLConnection.obj HtFTP.obj HtCookieInFileJar.obj + +#htdig c++ files 
+HTDIG_CXX_OBJS += Document.obj ExternalTransport.obj Parsable.obj Retriever.obj URLRef.obj ExternalParser.obj HTML.obj Plaintext.obj Server.obj + +#htfuzzy c++ files +HTFUZZY_CXX_OBJS += Accents.obj EndingsDB.obj Fuzzy.obj Prefix.obj Soundex.obj Substring.obj Synonym.obj Endings.obj Exact.obj Metaphone.obj Regexp.obj Speling.obj SuffixEntry.obj + +#HTFUZZY_C_OBJS += filecopy.o + +#htsearch c++ files +HTSEARCH_CXX_OBJS += Collection.obj DocMatch.obj ResultList.obj SplitMatches.obj TemplateList.obj Display.obj HtURLSeedScore.obj ResultMatch.obj Template.obj WeightWord.obj parser.obj + +#libhtdig c++ files +LIBHTDIG_CXX_OBJS += ResultFetch.obj BasicDocument.obj TextCollector.obj libhtdig_htdig.obj libhtdig_htmerge.obj libhtdig_htfuzzy.obj libhtdig_log.obj libhtdig_htsearch.obj + +#htcommon lex & yacc targets +LIBHTDIG_CXX_OBJS += conf_lexer.obj conf_parser.obj + +#libhtdig c files +#LIBHTDIG_C_OBJS += filecopy.o + +LXX_TARGETS += conf_lexer.cc + +YXX_TARGETS += conf_parser.cc + +OBJS += $(BDB_C_OBJS) $(HTLIB_C_OBJS) $(HTLIB_CXX_OBJS) $(HTWORD_CXX_OBJS) +OBJS += $(HTCOMMON_CXX_OBJS) $(HTNET_CXX_OBJS) $(HTDIG_CXX_OBJS) +OBJS += $(HTFUZZY_CXX_OBJS) $(HTFUZZY_C_OBJS) $(HTSEARCH_CXX_OBJS) $(LIBHTDIG_CXX_OBJS) $(LIBHTDIG_C_OBJS) + +OTHERLIBS = L:/win32/lib/zlib114/zlib.lib ws2_32.lib + +# ----------------------------------------------------------------------------- + +CMNDLLS = + +CPPFLAGS += -DHAVE_CONFIG_H -I. 
-I../include -I../htlib -I../htcommon -I../htword \ + -I../db -I../htnet -I../htsearch -I../htdig -I../htfuzzy + +CFLAGS += $(CPPFLAGS) + +#ifeq ($(ARCH),win32) +CFLAGS += -DDYNAMIC_LIBUTIL +CPPFLAGS += -DDYNAMIC_LIBUTIL -DYY_NEVER_INTERACTIVE +#endif + +ifeq ($(ARCH),linux) +LDFLAGS += -Xlinker -Bsymbolic +endif + +# ----------------------------------------------------------------------------- + + +#win32/%.obj: %.cc %.c +# $(CC) $(CPPFLAGS) -c $< -o $@ + + + #$(CC) $(CPPFLAGS) $(OPTS) $(DBG) -c $< /Fo$@ + +$(BDB_C_OBJS): %.obj: ../db/%.c + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TC -c $< + +$(HTLIB_C_OBJS): %.obj: ../htlib/%.c + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TC -c $< + +$(HTLIB_CXX_OBJS): %.obj: ../htlib/%.cc + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $< + +$(HTWORD_CXX_OBJS): %.obj: ../htword/%.cc + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $< + +$(HTCOMMON_CXX_OBJS): %.obj: ../htcommon/%.cc + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $< + +$(HTNET_CXX_OBJS): %.obj: ../htnet/%.cc + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $< + +$(HTDIG_CXX_OBJS): %.obj: ../htdig/%.cc + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $< + +$(HTFUZZY_C_OBJS): %.obj: ../htfuzzy/%.c + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TC -c $< + +$(HTFUZZY_CXX_OBJS): %.obj: ../htfuzzy/%.cc + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $< + +$(HTSEARCH_CXX_OBJS): %.obj: ../htsearch/%.cc + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $< + +$(LIBHTDIG_CXX_OBJS): %.obj: %.cc + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $< + +$(LIBHTDIG_C_OBJS): %.obj: %.c + $(CC) $(CFLAGS) $(OPTS) $(DBG) /TC -c $< + +$(LXX_TARGETS): %.cc: ../htcommon/%.lxx + $(LEX) $(AM_LFLAGS) $< && cat $(LEX_OUTPUT_ROOT).c | sed -e 's/#include <unistd.h>//g' > $@ + #$(LEX) $(AM_LFLAGS) $(LFLAGS) -o$@ $< + +$(YXX_TARGETS): %.cc: ../htcommon/%.yxx + $(YACC) $(AM_YFLAGS) $< && $(MV) $(YACC_OUTPUT_ROOT).c $@ + if test -f y.tab.h; then if cmp -s y.tab.h conf_parser.h; then rm -f y.tab.h; else mv y.tab.h conf_parser.h; fi; else :; fi + + + +$(TARGET): $(OBJDIRDEP) $(LIBDIRDEP) $(OBJS) + 
$(RM) $(basename $(TARGET))* + $(DLLLD) $(LDFLAGS) $(OBJS) $(LDLIBS) $(DBLIBS) $(OTHERLIBS) + $(DLL_SYMLINK_CMD) + +include ../Makerules.win32 + diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/README b/debian/htdig/htdig-3.2.0b6/libhtdig/README new file mode 100644 index 00000000..99591a2c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/README @@ -0,0 +1,46 @@ +Neal Richter +10/6/2002 + +How to use this: + +-------------------------- +Building htdig: + +unzip the tarball + +cd inside of the tarball root directory + +./configure' with any parameters needed + +--------------------------- +Building libhtdig: + +cd libhtdig + +make + +-------------------------- +Building libhtdigphp: + +cd libhtdigphp + +./configure +make +./relink + +Note: The relink script uses the PHP wrapper objects as well as all the object +in libhtdig to create a one-piece sharded library + +--------------------------- + +note that the libhtdig_xxxx.cc take the place of the various utilities 'main' +functions. Please look through them and compare to see if changes need to be +synced in. + +--------------------------- + +This should make you a libhtdig.so.XXXX Copy the latest libhtdig_api.h to a +place that you might need it. + +link against libhtdig.so.XXXXX + diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.cc new file mode 100644 index 00000000..e36be04b --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.cc @@ -0,0 +1,1735 @@ +//-------------------------------------------------------------------- +// +// ResultFetch.cc +// +// 2/6/2002 created for libhtdig +// +// Neal Richter nealr@rightnow.com +// +// ResultFetch: Takes results of search and fills in the HTML templates +// +// FOR USE IN LIBHTDIG... does NOT stream to stdout!! 
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: ResultFetch.cc,v 1.5 2004/05/28 13:15:28 lha Exp $
//
//--------------------------------------------------------------------


#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "htsearch.h"
#include "ResultFetch.h"
#include "ResultMatch.h"
#include "WeightWord.h"
#include "StringMatch.h"
#include "StringList.h"
#include "QuotedStringList.h"
#include "URL.h"
#include "HtSGMLCodec.h"
#include "HtURLCodec.h"
#include "WordType.h"
#include "Collection.h"
#include "HtURLSeedScore.h"
#include "SplitMatches.h"
#include "HtConfiguration.h"

#ifdef HAVE_STD
#include <fstream>
#ifdef HAVE_NAMESPACES
using namespace std;
#endif
#else
#include <fstream.h>
#endif /* HAVE_STD */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#ifndef _WIN32
#include <syslog.h>
#endif

#include <locale.h>
#include <math.h>
#include <float.h>

// Some pre-C99 platforms only define MAXFLOAT; use it as a stand-in
// for DBL_MAX so the min/max score sentinels below always compile.
#if !defined(DBL_MAX) && defined(MAXFLOAT)
# define DBL_MAX MAXFLOAT
#endif

//*****************************************************************************
//
// Constructor.
//
// collections: dictionary of the Collection objects (databases) the caller
//              selected; ownership stays with the caller.
// templist:    currently unused (see the commented-out assignment below) —
//              kept in the signature for API compatibility.
//
// Initializes score bounds to sentinels (-DBL_MAX / DBL_MAX so the first
// real score always updates them), loads star images and URL-dependent
// templates, then builds the result-template list from "template_map",
// falling back to the builtin map and finally flagging templateError if
// no usable template can be found.
//
ResultFetch::ResultFetch(Dictionary *collections, const StringList& templist)
//ResultFetch::ResultFetch(Dictionary * collections)
{

    HtConfiguration *config = HtConfiguration::config();
    selected_collections = collections;
    //collectionList = &templist;
    limitTo = 0;
    excludeFrom = 0;
    // needExcerpt = 0;
    templateError = 0;

    maxStars = config->Value("max_stars");
    maxScore = -DBL_MAX;
    minScore = DBL_MAX;
    setupImages();
    setupTemplates();

    if (!templates.createFromString(config->Find("template_map")))
    {
        // Error in createFromString.
        // Let's try the default template_map

        config->Add("template_map", "Long builtin-long builtin-long Short builtin-short builtin-short");
        if (!templates.createFromString(config->Find("template_map")))
        {
            // Unrecoverable Error
            // (No idea why this would happen)
            templateError = 1;
        }
    }

    currentTemplate = templates.get(config->Find("template_name"));
    if (!currentTemplate)
    {
        //
        // Must have been some error. Resort to the builtin-long (slot 0)
        //
        currentTemplate = (Template *) templates.templates[0];
    }
    if (!currentTemplate)
    {
        //
        // Another error!? Time to bail out...
        //
        templateError = 1;
    }
    // if (mystrcasestr(currentTemplate->getMatchTemplate(), "excerpt"))
    // needExcerpt = 1;
}

//*****************************************************************************
// Destructor — nothing to release here; the document DB close is handled
// elsewhere (see the commented-out call kept from the htsearch original).
//
ResultFetch::~ResultFetch()
{
    // docDB.Close();
}

//*****************************************************************************
//
// fetch: libhtdig replacement for htsearch's Display::display().
//
// Builds the match list for the selected collections and returns it to the
// caller (caller takes ownership of the returned List). Returns NULL when
// the configured sort method is invalid or when there are no matches.
// Unlike the original htsearch code, nothing is printed — all the HTML
// output paths are kept below as comments for reference.
//
//void
List *
ResultFetch::fetch()
{

    int pageNumber = 1;

    HtConfiguration *config = HtConfiguration::config();
    int good_sort = 0;
    good_sort = ResultMatch::setSortType(config->Find("sort"));
    if (!good_sort)
    {
        // Must temporarily stash the message in a String, since
        // displaySyntaxError will overwrite the static temp used in form.

        String s(form("No such sort method: `%s'", (const char *) config->Find("sort")));

        displaySyntaxError(s);
        //return;
        return(NULL);
    }

    List *matches = buildMatchList();
    //int currentMatch = 0;
    //int numberDisplayed = 0;
    ResultMatch *match = 0;
    int number = 0;
    number = config->Value("matches_per_page");
    if (number <= 0)
        number = 10;            // sane default when matches_per_page is unset/bogus
    //int startAt = (pageNumber - 1) * number;

    if (config->Boolean("logging"))
    {
        logSearch(pageNumber, matches);
    }

    setVariables(pageNumber, matches);

    //
    // The first match is guaranteed to have the highest score of
    // all the matches. We use this to compute the number of stars
    // to display for all the other matches.
    //
    match = (ResultMatch *) (*matches)[0];
    if (!match)
    {
        //
        // No matches.
        //
        delete matches;
        if (config->Boolean("nph"))
        {
            //cout << "HTTP/1.0 200 OK\r\n";
        }
        //cout << "Content-type: text/html\r\n\r\n";

        //displayNomatch();
        //return;
        return(NULL);
    }
    // maxScore = match->getScore(); // now done in buildMatchList()

    if (config->Boolean("nph"))
    {
        //cout << "HTTP/1.0 200 OK\r\n";
    }
    //cout << "Content-type: text/html\r\n\r\n";

    // Split an optional wrapper file into header/footer around the
    // "$HTSEARCH_RESULTS" (or "$(...)"/"${...}") separator. The separator
    // byte is overwritten with '\0' in-place inside the wrapper String's
    // buffer so header/footer are two NUL-terminated halves of one buffer.
    String wrap_file = config->Find("search_results_wrapper");
    String *wrapper = 0;
    char *header = 0, *footer = 0;
    if (wrap_file.length())
    {
        wrapper = readFile(wrap_file.get());
        if (wrapper && wrapper->length())
        {
            char wrap_sepr[] = "HTSEARCH_RESULTS";
            char *h = wrapper->get();
            char *p = strstr(h, wrap_sepr);
            if (p)
            {
                if (p > h && p[-1] == '$')
                {
                    footer = p + strlen(wrap_sepr);
                    header = h;
                    p[-1] = '\0';
                }
                else if (p > h + 1 && p[-2] == '$' &&
                         (p[-1] == '(' || p[-1] == '{') &&
                         (p[strlen(wrap_sepr)] == ')' || p[strlen(wrap_sepr)] == '}'))
                {
                    footer = p + strlen(wrap_sepr) + 1;
                    header = h;
                    p[-2] = '\0';
                }
            }
        }
    }
    if (header)
    {
        //expandVariables(header);
    }
    else
    {
        //displayHeader();
    }


    //neal
    // NOTE(review): wrapper is leaked here when search_results_wrapper is
    // set — the delete lives in the commented-out tail below. Confirm
    // whether callers rely on that buffer staying alive.
    return(matches);

    /*

       //
       // Display the window of matches requested.
       //
       if (!currentTemplate->getStartTemplate().empty())
       {
       expandVariables(currentTemplate->getStartTemplate());
       }

       matches->Start_Get();
       while ((match = (ResultMatch *) matches->Get_Next()) && numberDisplayed < number)
       {
       if (currentMatch >= startAt)
       {
       // DocumentRef *ref = docDB[match->getID()];
       Collection *collection = match->getCollection();
       DocumentRef *ref = collection->getDocumentRef(match->getID());
       if (!ref || ref->DocState() != Reference_normal)
       continue;  // The document isn't present or shouldn't be displayed
       ref->DocAnchor(match->getAnchor());
       ref->DocScore(match->getScore());
       fetchMatch(match, ref, currentMatch + 1);
       numberDisplayed++;
       delete ref;
       }
       currentMatch++;
       }

       if (!currentTemplate->getEndTemplate().empty())
       {
       expandVariables(currentTemplate->getEndTemplate());
       }
       if (footer)
       {
       //expandVariables(footer);
       }
       else
       {
       //displayFooter();
       }

       if (wrapper)
       delete wrapper;
       delete matches;

     */
}

//*****************************************************************************
// Return true if the specified URL should be counted towards the results.
// A URL is included when it matches the "restrict" patterns (limitTo),
// if any, and does not match the "exclude" patterns (excludeFrom).
int
ResultFetch::includeURL(const String & url)
{

    if (limitTo && limitTo->match(url, 1, 0) == 0)
        return 0;
    else
    {

        if (excludeFrom && excludeFrom->match(url, 0, 0) != 0)
            return 0;
        else
            return 1;
    }
}

//*****************************************************************************
//
// fetchMatch: populate the template-variable dictionary `vars` for one
// result (URL, TITLE, SCORE, EXCERPT, stars, dates, ...) and return a
// pointer to it. `vars` owns the String values added here; `ref` and
// `match` remain owned by the caller. `current` is the 1-based rank of
// this match in the result set.
//
//void
Dictionary *
ResultFetch::fetchMatch(ResultMatch * match, DocumentRef * ref, int current)
{
    HtConfiguration *config = HtConfiguration::config();
    String *str = 0;

    char *url = NULL;

    // NOTE: form() returns a static buffer; the String copy below must be
    // made before form() is called again.
    url = form("%s", ref->DocURL());

    vars.Add("URL", new String(url));

    int iA = ref->DocAnchor();

    String *anchor = 0;
    int fanchor = 0;
    if (iA > 0)                 // if an anchor was found
    {
        List *anchors = ref->DocAnchors();
        if (anchors->Count() >= iA)
        {
            anchor = new String();
            fanchor = 1;
            *anchor << "#" << ((String *) (*anchors)[iA - 1])->get();
            vars.Add("ANCHOR", anchor);
        }
    }

    //
    // no condition for determining excerpt any more:
    // we need it anyway to see if an anchor is relevant
    //
    int first = -1;
    String urlanchor(url);
    if (anchor)
        urlanchor << anchor;
    vars.Add("EXCERPT", excerpt(match, ref, urlanchor, fanchor, first));
    //
    // anchor only relevant if an excerpt was found, i.e.,
    // the search expression matches the body of the document
    // instead of only META keywords.
    //
    if (first < 0)
    {
        vars.Remove("ANCHOR");
    }

    vars.Add("METADESCRIPTION", new String(ref->DocMetaDsc()));
    vars.Add("SCORE", new String(form("%f", ref->DocScore())));
    vars.Add("CURRENT", new String(form("%d", current)));
    // TITLE: fall back to the configured no_title_text, or to the file
    // name portion of the URL when no_title_text is the literal "filename".
    char *title = ref->DocTitle();
    if (!title || !*title)
    {
        if (strcmp(config->Find("no_title_text"), "filename") == 0)
        {
            // use actual file name
            title = strrchr(url, '/');
            if (title)
            {
                title++;        // Skip slash
                str = new String(form("[%s]", title));
            }
            else
                // URL without '/' ??
                str = new String("[No title]");
        }
        else
            // use configure 'no title' text
            str = new String(config->Find("no_title_text"));
    }
    else
        str = new String(title);
    vars.Add("TITLE", str);
    vars.Add("STARSRIGHT", generateStars(ref, 1));
    vars.Add("STARSLEFT", generateStars(ref, 0));
    vars.Add("SIZE", new String(form("%d", ref->DocSize())));
    vars.Add("SIZEK", new String(form("%d", (ref->DocSize() + 1023) / 1024)));

    // PERCENT: linear rescale of this doc's score into 1..100 relative to
    // the min/max scores seen in buildMatchList(); clamped to at least 1.
    if (maxScore != 0 && maxScore != minScore)
    {
        int percent = (int) ((ref->DocScore() - minScore) * 100 / (maxScore - minScore));
        if (percent <= 0)
            percent = 1;
        vars.Add("PERCENT", new String(form("%d", percent)));
    }
    else
        vars.Add("PERCENT", new String("100"));

    // MODIFIED: format the document's timestamp with date_format (or an
    // ISO-8601 / locale default), optionally under the configured locale.
    {
        str = new String();
        char buffer[100];
        time_t t = ref->DocTime();
        if (t)
        {
            struct tm *tm = localtime(&t);
            String datefmt = config->Find("date_format");
            const String locale = config->Find("locale");
            if (datefmt.empty())
            {
                if (config->Boolean("iso_8601"))
                    datefmt = "%Y-%m-%d %H:%M:%S %Z";
                else
                    datefmt = "%x";
            }
            if (!locale.empty())
            {
                setlocale(LC_TIME, locale);
            }
            strftime(buffer, sizeof(buffer), (char *) datefmt, tm);
            *str << buffer;
        }
        vars.Add("MODIFIED", str);
    }

    vars.Add("HOPCOUNT", new String(form("%d", ref->DocHopCount())));
    vars.Add("DOCID", new String(form("%d", ref->DocID())));
    vars.Add("BACKLINKS", new String(form("%d", ref->DocBackLinks())));

    // DESCRIPTIONS: all link descriptions joined with <br>;
    // DESCRIPTION: just the first one (empty String when none).
    {
        str = new String();
        List *list = ref->Descriptions();
        int n = list->Count();
        for (int i = 0; i < n; i++)
        {
            *str << ((String *) (*list)[i])->get() << "<br>\n";
        }
        vars.Add("DESCRIPTIONS", str);
        String *description = new String();
        if (list->Count())
            *description << ((String *) (*list)[0]);
        vars.Add("DESCRIPTION", description);
    }

    // Check whether a URL-specific result template applies (template_patterns);
    // the actual template expansion is disabled below for libhtdig.
    int index = 0;
    int length = 0;
    int status = -1;
    if (URLtemplate.hasPattern())
        status = URLtemplate.FindFirst(ref->DocURL(), index, length);

/*
   if (status >= 0 && index >= 0)
   displayParsedFile(((String *) URLtemplateList[index])->get());
   else
   expandVariables(currentTemplate->getMatchTemplate());



   int vars_count = vars.Count();
   vars.Start_Get();

   String key;
   String * value;

   for(int i = 0; i < vars_count; i++)
   {
   key = vars.Get_Next();
   value = (String *) vars[key];

   cout << key.get() << "[" << value->get() << "]" << endl;
   cout.flush();
   }

*/

    return(&vars);

}

//*****************************************************************************
//
// setVariables: fill `vars` with the result-set-level template variables
// (match counts, paging links, the format/method/sort <select> widgets,
// collection checkboxes, and any config values allowed via allow_in_form).
// `matches` may be NULL (treated as zero matches, e.g. for error pages).
//
void
ResultFetch::setVariables(int pageNumber, List * matches)
{
    HtConfiguration *config = HtConfiguration::config();
    String tmp;
    int i;
    int nMatches = 0;

    if (matches)
        nMatches = matches->Count();

    int matchesPerPage = config->Value("matches_per_page");
    if (matchesPerPage <= 0)
        matchesPerPage = 10;
    // Ceiling division: pages needed to show all matches.
    int nPages = (nMatches + matchesPerPage - 1) / matchesPerPage;

    if (nPages > config->Value("maximum_pages", 10))
        nPages = config->Value("maximum_pages", 10);
    if (nPages < 1)
        nPages = 1;             // We always have at least one page...
    vars.Add("MATCHES_PER_PAGE", new String(config->Find("matches_per_page")));
    vars.Add("MAX_STARS", new String(config->Find("max_stars")));
    vars.Add("CONFIG", new String(config->Find("config")));
    vars.Add("VERSION", new String(config->Find("version")));
    vars.Add("RESTRICT", new String(config->Find("restrict")));
    vars.Add("EXCLUDE", new String(config->Find("exclude")));
    vars.Add("KEYWORDS", new String(config->Find("keywords")));
    if (mystrcasecmp(config->Find("match_method"), "and") == 0)
        vars.Add("MATCH_MESSAGE", new String("all"));
    else if (mystrcasecmp(config->Find("match_method"), "or") == 0)
        vars.Add("MATCH_MESSAGE", new String("some"));
    vars.Add("MATCHES", new String(form("%d", nMatches)));
    vars.Add("PLURAL_MATCHES",
             new String((nMatches == 1) ? (char *) "" : (const char *) config->Find("plural_suffix")));
    vars.Add("PAGE", new String(form("%d", pageNumber)));
    vars.Add("PAGES", new String(form("%d", nPages)));
    vars.Add("FIRSTDISPLAYED", new String(form("%d", (pageNumber - 1) * matchesPerPage + 1)));
    if (nPages > 1)
        vars.Add("PAGEHEADER", new String(config->Find("page_list_header")));
    else
        vars.Add("PAGEHEADER", new String(config->Find("no_page_list_header")));

    i = pageNumber * matchesPerPage;
    if (i > nMatches)
        i = nMatches;
    vars.Add("LASTDISPLAYED", new String(form("%d", i)));

    if (config->Find("script_name").length() != 0)
    {
        vars.Add("CGI", new String(config->Find("script_name")));
    }
    else
    {
        vars.Add("CGI", new String(getenv("SCRIPT_NAME")));
    }
    vars.Add("STARTYEAR", new String(config->Find("startyear")));
    vars.Add("STARTMONTH", new String(config->Find("startmonth")));
    vars.Add("STARTDAY", new String(config->Find("startday")));
    vars.Add("ENDYEAR", new String(config->Find("endyear")));
    vars.Add("ENDMONTH", new String(config->Find("endmonth")));
    vars.Add("ENDDAY", new String(config->Find("endday")));

    String *str;
    //char *format = input->get("format");
    // libhtdig has no CGI input object, so the format is hard-wired.
    char *format = "builtin-long";
    String *in;

    vars.Add("SELECTED_FORMAT", new String(format));

    // FORMAT: an HTML <select> listing all known result templates.
    str = new String();
    *str << "<select name=\"format\">\n";
    for (i = 0; i < templates.displayNames.Count(); i++)
    {
        in = (String *) templates.internalNames[i];
        *str << "<option value=\"" << in->get() << '"';
        if (format && mystrcasecmp(in->get(), format) == 0)
        {
            *str << " selected";
        }
        *str << '>' << ((String *) templates.displayNames[i])->get() << '\n';
    }
    *str << "</select>\n";
    vars.Add("FORMAT", str);

    // METHOD: <select> built from (value, label) pairs in method_names.
    str = new String();
    QuotedStringList ml(config->Find("method_names"), " \t\r\n");
    *str << "<select name=\"method\">\n";
    for (i = 0; i < ml.Count(); i += 2)
    {
        *str << "<option value=\"" << ml[i] << '"';
        if (mystrcasecmp(ml[i], config->Find("match_method")) == 0)
            *str << " selected";
        *str << '>' << ml[i + 1] << '\n';
    }
    *str << "</select>\n";
    vars.Add("METHOD", str);

    vars.Add("SELECTED_METHOD", new String(config->Find("match_method")));

    ////////////////// Multiple database support //////////////////////
    // Emit collection table. Ensure that previously selected collections
    // are "checked".
    // Collections are specified in the config file with the
    // "collection_names" attribute. An example of the corresponding snippet
    // in the config file is as follows:
    //
    // collection_names: htdig_docs htdig_bugs
    //
    // htdig_bugs and htdig_docs are the two collections (databases) and
    // their corresponding config files are: $CONFIG_DIR/htdig_bugs.conf and
    // $CONFIG_DIR/htdig_docs.conf respectively.
    //
    QuotedStringList clist(config->Find("collection_names"), " \t\r\n");
    for (i = 0; i < clist.Count(); i++)
    {
        String config_name = clist[i];

        for (int j = 0; j < collectionList.Count(); j++)
        {
            if (strcmp(config_name.get(), collectionList[j]) == 0)
            {
                str = new String();
                *str << "checked";
                String collection_id = "COLLECTION_";
                collection_id << config_name;
                vars.Add(collection_id, str);
                break;
            }
        }
    }

    ////////////////// Multiple database support //////////////////////

    // SORT: <select> from (value, label) pairs in sort_names. The extra
    // datetime.Compare() clauses treat "date"/"time" (optionally with a
    // "rev"-style 3-char prefix) as equivalent sort names.
    str = new String();
    QuotedStringList sl(config->Find("sort_names"), " \t\r\n");
    const String st = config->Find("sort");
    StringMatch datetime;
    datetime.IgnoreCase();
    datetime.Pattern("date|time");
    *str << "<select name=\"sort\">\n";
    for (i = 0; i < sl.Count(); i += 2)
    {
        *str << "<option value=\"" << sl[i] << '"';
        if (mystrcasecmp(sl[i], st) == 0 ||
            datetime.Compare(sl[i]) && datetime.Compare(st) ||
            mystrncasecmp(sl[i], st, 3) == 0 &&
            datetime.Compare(sl[i] + 3) && datetime.Compare(st.get() + 3))
            *str << " selected";
        *str << '>' << sl[i + 1] << '\n';
    }
    *str << "</select>\n";
    vars.Add("SORT", str);
    vars.Add("SELECTED_SORT", new String(st));

    //
    // If a paged output is required, set the appropriate variables
    //
    if (nPages > 1)
    {
        if (pageNumber > 1)
        {
            str = new String("<a href=\"");
            tmp = 0;
            createURL(tmp, pageNumber - 1);
            *str << tmp << "\">" << config->Find("prev_page_text") << "</a>";
        }
        else
        {
            str = new String(config->Find("no_prev_page_text"));
        }
        vars.Add("PREVPAGE", str);

        if (pageNumber < nPages)
        {
            str = new String("<a href=\"");
            tmp = 0;
            createURL(tmp, pageNumber + 1);
            *str << tmp << "\">" << config->Find("next_page_text") << "</a>";
        }
        else
        {
            str = new String(config->Find("no_next_page_text"));
        }
        vars.Add("NEXTPAGE", str);

        // PAGELIST: one link (or plain text for the current page) per page,
        // with configurable separators cycled between entries.
        str = new String();
        char *p;
        QuotedStringList pnt(config->Find("page_number_text"), " \t\r\n");
        QuotedStringList npnt(config->Find("no_page_number_text"), " \t\r\n");
        QuotedStringList sep(config->Find("page_number_separator"), " \t\r\n");
        if (nPages > config->Value("maximum_page_buttons", 10))
            nPages = config->Value("maximum_page_buttons", 10);
        for (i = 1; i <= nPages; i++)
        {
            if (i == pageNumber)
            {
                p = npnt[i - 1];
                if (!p)
                    p = form("%d", i);
                *str << p;
            }
            else
            {
                p = pnt[i - 1];
                if (!p)
                    p = form("%d", i);
                *str << "<a href=\"";
                tmp = 0;
                createURL(tmp, i);
                *str << tmp << "\">" << p << "</a>";
            }
            if (i != nPages && sep.Count() > 0)
                *str << sep[(i - 1) % sep.Count()];
            else if (i != nPages && sep.Count() <= 0)
                *str << " ";
        }
        vars.Add("PAGELIST", str);
    }
    // Export every non-empty config attribute named in allow_in_form as an
    // uppercased template variable.
    StringList form_vars(config->Find("allow_in_form"), " \t\r\n");
    String *key;
    for (i = 0; i < form_vars.Count(); i++)
    {
        if (!config->Find(form_vars[i]).empty())
        {
            key = new String(form_vars[i]);
            key->uppercase();
            vars.Add(key->get(), new String(config->Find(form_vars[i])));
        }
    }
}

//*****************************************************************************
//
// createURL: append a self-referencing htsearch URL for page `pageNumber`
// to `url`. In libhtdig only the script name and the selected collections
// are emitted; the CGI-input-derived parameters are kept below as comments.
//
void
ResultFetch::createURL(String & url, int pageNumber)
{
    HtConfiguration *config = HtConfiguration::config();
    String s;
    int i;
//#define encodeInput(name) (s = input->get(name), encodeURL(s), s.get())

    if (!config->Find("script_name").empty())
    {
        url << config->Find("script_name");
    }
    else
    {
        url << getenv("SCRIPT_NAME");
    }

    url << '?';

    //if (input->exists("restrict"))
    // url << "restrict=" << encodeInput("restrict") << '&';
    //if (input->exists("exclude"))
    // url << "exclude=" << encodeInput("exclude") << '&';
    // Not needed: The next loop below handles this output
    //if (input->exists("config"))
    // url << "config=" << encodeInput("config") << '&';

    // Put out all specified collections. If none selected, resort to
    // default behaviour.
    char *config_name = collectionList[0];
    String config_encoded;
    if (config_name && config_name[0] == '\0')
        config_name = NULL;

    if (config_name)
    {
        for (i = 0; i < collectionList.Count(); i++)
        {
            config_name = collectionList[i];
            config_encoded = config_name;
            encodeURL(config_encoded);
            url << "config=" << config_encoded << '&';
        }
    }
/*
   if (input->exists("method"))
   url << "method=" << encodeInput("method") << '&';
   if (input->exists("format"))
   url << "format=" << encodeInput("format") << '&';
   if (input->exists("sort"))
   url << "sort=" << encodeInput("sort") << '&';
   if (input->exists("matchesperpage"))
   url << "matchesperpage=" << encodeInput("matchesperpage") << '&';
   if (input->exists("keywords"))
   url << "keywords=" << encodeInput("keywords") << '&';
   if (input->exists("words"))
   url << "words=" << encodeInput("words") << '&';
   if (input->exists("startyear"))
   url << "startyear=" << encodeInput("startyear") << '&';
   if (input->exists("startmonth"))
   url << "startmonth=" << encodeInput("startmonth") << '&';
   if (input->exists("startday"))
   url << "startday=" << encodeInput("startday") << '&';
   if (input->exists("endyear"))
   url << "endyear=" << encodeInput("endyear") << '&';
   if (input->exists("endmonth"))
   url << "endmonth=" << encodeInput("endmonth") << '&';
   if (input->exists("endday"))
   url << "endday=" << encodeInput("endday") << '&';
   StringList form_vars(config->Find("allow_in_form"), " \t\r\n");
   for (i = 0; i < form_vars.Count(); i++)
   {
   if (input->exists(form_vars[i]))
   {
   s = form_vars[i];
   encodeURL(s);  // shouldn't be needed, but just in case
   url << s << '=';
   url << encodeInput(form_vars[i]) << '&';
   }
   }
   url << "page=" << pageNumber;

*/

}

//*****************************************************************************
// Emit the configured search-results header file through the template engine.
void
ResultFetch::displayHeader()
{
    HtConfiguration *config = HtConfiguration::config();
    displayParsedFile(config->Find("search_results_header"));
}

//*****************************************************************************
// Emit the configured search-results footer file through the template engine.
void
ResultFetch::displayFooter()
{
    HtConfiguration *config = HtConfiguration::config();
    displayParsedFile(config->Find("search_results_footer"));
}

//*****************************************************************************
// Emit the configured "nothing found" page through the template engine.
void
ResultFetch::displayNomatch()
{
    HtConfiguration *config = HtConfiguration::config();
    displayParsedFile(config->Find("nothing_found_file"));
}

//*****************************************************************************
// Report a bad query/sort specification: set the template variables for an
// empty result set, expose the message as $(SYNTAXERROR), and emit the
// configured syntax_error_file.
void
ResultFetch::displaySyntaxError(const String & message)
{
    HtConfiguration *config = HtConfiguration::config();
    if (config->Boolean("nph"))
    {
        // cout << "HTTP/1.0 200 OK\r\n";
    }
    //cout << "Content-type: text/html\r\n\r\n";

    setVariables(0, 0);
    vars.Add("SYNTAXERROR", new String(message));
    displayParsedFile(config->Find("syntax_error_file"));
}

//*****************************************************************************
// Stream `filename` through expandVariables() line by line, substituting
// $(VAR)-style template variables. Missing files are only reported when
// debugging is on.
void
ResultFetch::displayParsedFile(const String & filename)
{
    FILE *fl = fopen(filename, "r");
    char buffer[1000];

    while (fl && fgets(buffer, sizeof(buffer), fl))
    {
        expandVariables(buffer);
    }
    if (fl)
        fclose(fl);
    else if (debug)
        cerr << "displayParsedFile: Can't open " << filename << endl;
}
//*****************************************************************************
// If the result templates need to depend on the URL of the match, we need
// an efficient way to determine which template file to use. To do this, we
// will build a StringMatch object with all the URL patterns and also
// a List parallel to that pattern that contains the actual template file
// names to use for each URL.
//
void
ResultFetch::setupTemplates()
{
    HtConfiguration *config = HtConfiguration::config();
    // Note: strtok() mutates this local copy of the attribute in place.
    String templatePatterns = config->Find("template_patterns");
    if (!templatePatterns.empty())
    {
        //
        // The templatePatterns string will have pairs of values. The first
        // value of a pair will be a pattern, the second value will be a
        // result template file name.
        //
        char *token = strtok(templatePatterns, " \t\r\n");
        String pattern;
        while (token)
        {
            //
            // First token is a pattern...
            //
            pattern << token << '|';

            //
            // Second token is an URL
            //
            // A trailing pattern with no file name still adds a
            // String(NULL) placeholder so the lists stay parallel.
            token = strtok(0, " \t\r\n");
            URLtemplateList.Add(new String(token));
            if (token)
                token = strtok(0, " \t\r\n");
        }
        pattern.chop(1);        // drop the trailing '|'
        URLtemplate.Pattern(pattern);
    }
}

//*****************************************************************************
// If the star images need to depend on the URL of the match, we need
// an efficient way to determine which image to use. To do this, we
// will build a StringMatch object with all the URL patterns and also
// a List parallel to that pattern that contains the actual images to
// use for each URL.
//
void
ResultFetch::setupImages()
{
    HtConfiguration *config = HtConfiguration::config();
    // Note: strtok() mutates this local copy of the attribute in place.
    String starPatterns = config->Find("star_patterns");
    if (!starPatterns.empty())
    {
        //
        // The starPatterns string will have pairs of values. The first
        // value of a pair will be a pattern, the second value will be an
        // URL to an image.
        //
        char *token = strtok(starPatterns, " \t\r\n");
        String pattern;
        while (token)
        {
            //
            // First token is a pattern...
            //
            pattern << token << '|';

            //
            // Second token is an URL
            //
            token = strtok(0, " \t\r\n");
            URLimageList.Add(new String(token));
            if (token)
                token = strtok(0, " \t\r\n");
        }
        pattern.chop(1);        // drop the trailing '|'
        URLimage.Pattern(pattern);
    }
}

//*****************************************************************************
// Build the HTML star rating for one document and return it (caller owns
// the returned String). The star count is derived from the document's score
// scaled between minScore and maxScore into 1..maxStars; `right` selects
// whether the blank padding images go before (right-aligned) or after
// (left-aligned) the stars. Also publishes the count as $(NSTARS).
String *
ResultFetch::generateStars(DocumentRef * ref, int right)
{
    int i;
    String *result = new String();
    HtConfiguration *config = HtConfiguration::config();
    if (!config->Boolean("use_star_image", 1))
        return result;

    String image = config->Find("star_image");
    const String blank = config->Find("star_blank");
    double score;

    if (maxScore != 0 && maxScore != minScore)
    {
        score = (ref->DocScore() - minScore) / (maxScore - minScore);
        if (debug)
            cerr << "generateStars: doc, min, max " << ref->
                DocScore() << ", " << minScore << ", " << maxScore << endl;
    }
    else
    {
        // Degenerate range (single match or all-equal scores): give full
        // stars and remember this score as the new maximum.
        maxScore = ref->DocScore();
        score = 1;
    }
    int nStars = int (score * (maxStars - 1) + 0.5) + 1;

    vars.Add("NSTARS", new String(form("%.d", nStars)));
    if (debug)
        cerr << "generateStars: nStars " << nStars << " of " << maxStars << endl;

    if (right)
    {
        for (i = 0; i < maxStars - nStars; i++)
        {
            *result << "<img src=\"" << blank << "\" alt=\" \">";
        }
    }

    // Use a URL-specific star image (star_patterns) when one matches.
    int match = 0;
    int length = 0;
    int status;

    if (URLimage.hasPattern())
        status = URLimage.FindFirst(ref->DocURL(), match, length);
    else
        status = -1;

    if (status >= 0 && match >= 0)
    {
        image = ((String *) URLimageList[match])->get();
    }

    for (i = 0; i < nStars; i++)
    {
        *result << "<img src=\"" << image << "\" alt=\"*\">";
    }

    if (!right)
    {
        for (i = 0; i < maxStars - nStars; i++)
        {
            *result << "<img src=\"" << blank << "\" alt=\" \">";
        }
    }

    return result;
}
//*****************************************************************************
// Slurp the whole of `filename` into a freshly allocated String (caller
// owns it). Returns an empty String when the file cannot be opened; the
// failure is only reported when debugging is on.
String *
ResultFetch::readFile(const String & filename)
{
    FILE *fl;
    String *s = new String();
    char line[1024];

    fl = fopen(filename, "r");
    while (fl && fgets(line, sizeof(line), fl))
    {
        *s << line;
    }
    if (fl)
        fclose(fl);
    else if (debug)
        cerr << "readFile: Can't open " << filename << endl;
    return s;
}

//*****************************************************************************
// Template expansion: copy `str_arg` to cout, substituting template
// variables. Recognized forms are $NAME, $(NAME), ${NAME}, optionally with
// encoding prefixes ('%' URL-encode, '&' SGML-encode, '=' URL-decode), and
// backslash escapes a literal character. Implemented as a small character
// state machine; complete variable names are handed to outputVariable().
void
ResultFetch::expandVariables(const String & str_arg)
{
    const char *str = str_arg;
    enum
    {
        StStart, StLiteral, StVarStart, StVarClose, StVarPlain, StGotVar
    }
    state = StStart;
    String var = "";

    while (str && *str)
    {
        switch (state)
        {
        case StStart:          // plain text: emit, or start escape/variable
            if (*str == '\\')
                state = StLiteral;
            else if (*str == '$')
                state = StVarStart;
            else
            {
                cout << *str;
                cout.flush();
            }
            break;
        case StLiteral:        // character after '\': emit verbatim
            cout << *str;
            cout.flush();
            state = StStart;
            break;
        case StVarStart:       // just after '$': encoding prefix or name start
            if (*str == '%' || *str == '=')
                var << *str;    // code for URL-encoded/decoded variable
            else if (*str == '&')
            {
                var << *str;    // code for SGML-encoded variable
                // NOTE(review): the literal below looks truncated — the
                // length-5 compare suggests it was "&amp;" before an
                // SGML-escaping pass mangled this copy; confirm against
                // the upstream htsearch Display.cc source.
                if (mystrncasecmp("&", str, 5) == 0)
                    str += 4;
            }
            else if (*str == '(' || *str == '{')
                state = StVarClose;
            else if (isalnum(*str) || *str == '_' || *str == '-')
            {
                var << *str;
                state = StVarPlain;
            }
            else
                state = StStart;
            break;
        case StVarClose:       // inside $(...) or ${...}: collect until close
            if (*str == ')' || *str == '}')
                state = StGotVar;
            else if (isalnum(*str) || *str == '_' || *str == '-')
                var << *str;
            else
                state = StStart;  // malformed: silently abandon the variable
            break;
        case StVarPlain:       // bare $NAME: collect until non-name char
            if (isalnum(*str) || *str == '_' || *str == '-')
                var << *str;
            else
            {
                state = StGotVar;
                continue;       // reprocess this char in StGotVar/StStart
            }
            break;
        case StGotVar:
            //
            // We have a complete variable in var. Look it up and
            // see if we can find a good replacement for it.
            //
            outputVariable(var);
            var = "";
            state = StStart;
            continue;           // current char not consumed yet
        }
        str++;
    }
    if (state == StGotVar || state == StVarPlain)
    {
        //
        // The end of string was reached, but we are still trying to
        // put a variable together. Since we now have a complete
        // variable, we will look up the value for it.
        //
        outputVariable(var);
    }
}

//*****************************************************************************
// Resolve one template variable name (possibly prefixed with '%', '&' or
// '=' encoding codes) against the `vars` dictionary, falling back to the
// process environment, apply the requested encodings innermost-first, and
// write the result to cout. Unknown variables expand to nothing.
void
ResultFetch::outputVariable(const String & var)
{
    String *temp;
    String value = "";
    const char *ev, *name;

    // We have a complete variable name in var. Look it up and
    // see if we can find a good replacement for it, either in our
    // vars dictionary or in the environment variables.
    name = var;
    while (*name == '&' || *name == '%' || *name == '=')
        name++;
    temp = (String *) vars[name];
    if (temp)
        value = *temp;
    else
    {
        ev = getenv(name);
        if (ev)
            value = ev;
    }
    // Walk the prefix codes back-to-front, applying each transformation.
    while (--name >= var.get() && value.length())
    {
        if (*name == '%')
            encodeURL(value);
        else if (*name == '&')
            value = HtSGMLCodec::instance()->decode(value);
        else                    // (*name == '=')
            decodeURL(value);
    }
    cout << value;
    cout.flush();
}

//*****************************************************************************
// buildMatchList: walk every selected collection's raw search results,
// filter by restrict/exclude patterns and the optional start/end date
// range, compute each document's final score (word score adjusted by
// date_factor, backlink_factor, url_seed_score, then log-damped), track
// the global min/max scores, sort each sub-area by relevance, and return
// the concatenated List of ResultMatch objects (caller owns it).
List *
ResultFetch::buildMatchList()
{
    HtConfiguration *config = HtConfiguration::config();
    char *cpid;
    String url;
    ResultMatch *thisMatch;
    SplitMatches matches(*config);
    double backlink_factor = config->Double("backlink_factor");
    double date_factor = config->Double("date_factor");
    double backlink_score = 0;
    double date_score = 0;
    double base_score = 0;


    // Additions made here by Mike Grommet ...

    tm startdate;               // structure to hold the startdate specified by the user
    tm enddate;                 // structure to hold the enddate specified by the user
    time_t now = time((time_t *) 0);  // fill in all fields for mktime
    tm *lt = localtime(&now);   // - Gilles's fix
    startdate = *lt;
    enddate = *lt;

    // NOTE(review): the shift below operates on an int-width 1, so for a
    // 64-bit time_t this does not actually produce the maximum value;
    // confirm intended behavior on LP64 platforms.
    time_t eternity = ~(1 << (sizeof(time_t) * 8 - 1));  // will be the largest value holdable by a time_t
    tm *endoftime;              // the time_t eternity will be converted into a tm, held by this variable

    time_t timet_startdate;
    time_t timet_enddate;
    int monthdays[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };

    // boolean to test to see if we need to build date information or not
    int dategiven = ((config->Value("startmonth")) ||
                     (config->Value("startday")) ||
                     (config->Value("startyear")) ||
                     (config->Value("endmonth")) || (config->Value("endday")) || (config->Value("endyear")));

    // find the end of time
    endoftime = gmtime(&eternity);

    if (dategiven)              // user specified some sort of date information
    {
        // set up the startdate structure
        // see man mktime for details on the tm structure
        startdate.tm_sec = 0;
        startdate.tm_min = 0;
        startdate.tm_hour = 0;
        startdate.tm_yday = 0;
        startdate.tm_wday = 0;

        // The concept here is that if a user did not specify a part of a date,
        // then we will make assumtions...
        // For instance, suppose the user specified Feb, 1999 as the start
        // range, we take steps to make sure that the search range date starts
        // at Feb 1, 1999,
        // along these same lines: (these are in MM-DD-YYYY format)
        // Startdates: Date Becomes
        // 01-01 01-01-1970
        // 01-1970 01-01-1970
        // 04-1970 04-01-1970
        // 1970 01-01-1970
        // These things seem to work fine for start dates, as all months have
        // the same first day however the ending date can't work this way.

        if (config->Value("startmonth"))  // form input specified a start month
        {
            startdate.tm_mon = config->Value("startmonth") - 1;
            // tm months are zero based. They are passed in as 1 based
        }
        else
            startdate.tm_mon = 0;  // otherwise, no start month, default to 0

        if (config->Value("startday"))  // form input specified a start day
        {
            startdate.tm_mday = config->Value("startday");
            // tm days are 1 based, they are passed in as 1 based
        }
        else
            startdate.tm_mday = 1;  // otherwise, no start day, default to 1

        // year is handled a little differently... the tm_year structure
        // wants the tm_year in a format of year - 1900.
        // since we are going to convert these dates to a time_t,
        // a time_t value of zero, the earliest possible date
        // occurs Jan 1, 1970. If we allow dates < 1970, then we
        // could get negative time_t values right???
        // (barring minor timezone offsets west of GMT, where Epoch is 12-31-69)

        if (config->Value("startyear"))  // form input specified a start year
        {
            startdate.tm_year = config->Value("startyear") - 1900;
            if (startdate.tm_year < 69 - 1900)  // correct for 2-digit years 00-68
                startdate.tm_year += 2000;  // - Gilles's fix
            if (startdate.tm_year < 0)  // correct for 2-digit years 69-99
                startdate.tm_year += 1900;
        }
        else
            startdate.tm_year = 1970 - 1900;
        // otherwise, no start day, specify start at 1970

        // set up the enddate structure
        enddate.tm_sec = 59;    // allow up to last second of end day
        enddate.tm_min = 59;    // - Gilles's fix
        enddate.tm_hour = 23;
        enddate.tm_yday = 0;
        enddate.tm_wday = 0;

        if (config->Value("endmonth"))  // form input specified an end month
        {
            enddate.tm_mon = config->Value("endmonth") - 1;
            // tm months are zero based. They are passed in as 1 based
        }
        else
            enddate.tm_mon = 11;  // otherwise, no end month, default to 11

        if (config->Value("endyear"))  // form input specified a end year
        {
            enddate.tm_year = config->Value("endyear") - 1900;
            if (enddate.tm_year < 69 - 1900)  // correct for 2-digit years 00-68
                enddate.tm_year += 2000;  // - Gilles's fix
            if (enddate.tm_year < 0)  // correct for 2-digit years 69-99
                enddate.tm_year += 1900;
        }
        else
            enddate.tm_year = endoftime->tm_year;
        // otherwise, no end year, specify end at the end of time allowable

        // Months have different number of days, and this makes things more
        // complicated than the startdate range.
        // Following the example above, here is what we want to happen:
        // Enddates: Date Becomes
        // 04-31 04-31-endoftime->tm_year
        // 05-1999 05-31-1999, may has 31 days... we want to search until the end of may so...
        // 1999 12-31-1999, search until the end of the year

        if (config->Value("endday"))  // form input specified an end day
        {
            enddate.tm_mday = config->Value("endday");
            // tm days are 1 based, they are passed in as 1 based
        }
        else
        {
            // otherwise, no end day, default to the end of the month
            enddate.tm_mday = monthdays[enddate.tm_mon];
            if (enddate.tm_mon == 1)  // February, so check for leap year
                if (((enddate.tm_year + 1900) % 4 == 0 &&
                     (enddate.tm_year + 1900) % 100 != 0) || (enddate.tm_year + 1900) % 400 == 0)
                    enddate.tm_mday += 1;  // Feb. 29 - Gilles's fix
        }

        // Convert the tm values into time_t values.
        // Web servers specify modification times in GMT, but htsearch
        // displays these modification times in the server's local time zone.
        // For consistency, we would prefer to select based on this same
        // local time zone. - Gilles's fix

        timet_startdate = mktime(&startdate);
        timet_enddate = mktime(&enddate);

        // I'm not quite sure what behavior I want to happen if
        // someone reverses the start and end dates, and one of them is invalid.
        // for now, if there is a completely invalid date on the start or end
        // date, I will force the start date to time_t 0, and the end date to
        // the maximum that can be handled by a time_t.

        if (timet_startdate < 0)
            timet_startdate = 0;
        if (timet_enddate < 0)
            timet_enddate = eternity;

        // what if the user did something really goofy like choose an end date
        // that's before the start date

        if (timet_enddate < timet_startdate)  // if so, then swap them so they are in order
        {
            time_t timet_temp = timet_enddate;
            timet_enddate = timet_startdate;
            timet_startdate = timet_temp;
        }
    }
    else                        // no date was specifed, so plug in some defaults
    {
        timet_startdate = 0;
        timet_enddate = eternity;
    }

    // ... MG


    URLSeedScore adjustments(*config);

    // If we knew where to pass it, this would be a good place to pass
    // on errors from adjustments.ErrMsg().

// Deal with all collections
//
    selected_collections->Start_Get();
    Collection *collection = NULL;
    while ((collection = (Collection *) selected_collections->Get_NextElement()))
    {
        ResultList *results = collection->getResultList();
        if (results == NULL)
            continue;

        results->Start_Get();
        while ((cpid = results->Get_Next()))
        {
            int id = atoi(cpid);

            // DocumentRef *thisRef = docDB[id];

            // The DocMatch carries the word-database score and the owning
            // collection; skip entries with no collection attached.
            DocMatch *dm = results->find(cpid);
            Collection *collection = NULL;
            if (dm)
                collection = dm->collection;
            if (collection == NULL)
                continue;
            DocumentRef *thisRef = collection->getDocumentRef(id);

            //
            // If it wasn't there, then ignore it
            //
            if (thisRef == 0)
            {
                continue;
            }

            if (!includeURL(thisRef->DocURL()))
            {
                // Get rid of it to free the memory!
                delete thisRef;

                continue;
            }

            // Code added by Mike Grommet for date search ranges
            // check for valid date range. toss it out if it isn't relevant.
            if ((timet_startdate > 0 || enddate.tm_year < endoftime->tm_year) &&
                (thisRef->DocTime() < timet_startdate || thisRef->DocTime() > timet_enddate))
            {
                delete thisRef;
                continue;
            }

            thisMatch = ResultMatch::create();
            thisMatch->setID(id);
            thisMatch->setCollection(collection);

            //
            // Assign the incomplete score to this match. This score was
            // computed from the word database only, no excerpt context was
            // known at that time, or info about the document itself,
            // so this still needs to be done.
            //

            // Moved up: DocMatch *dm = results->find(cpid);
            double score = dm->score;

            // We need to scale based on date relevance and backlinks
            // Other changes to the score can happen now
            // Or be calculated by the result match in getScore()

            // This formula derived through experimentation
            // We want older docs to have smaller values and the
            // ultimate values to be a reasonable size (max about 100)

            base_score = score;
            if (date_factor != 0.0)
            {
                date_score = date_factor * ((thisRef->DocTime() * 1000.0 / (double) now) - 900);
                score += date_score;
            }

            if (backlink_factor != 0.0)
            {
                int links = thisRef->DocLinks();
                if (links == 0)
                    links = 1;  // It's a hack, but it helps...

                backlink_score = backlink_factor * (thisRef->DocBackLinks() / (double) links);
                score += backlink_score;
            }

            if (debug)
            {
                cerr << thisRef->DocURL() << "\n";
            }

            thisMatch->setTime(thisRef->DocTime());
            thisMatch->setTitle(thisRef->DocTitle());

            score = adjustments.adjust_score(score, thisRef->DocURL());

            // Get rid of it to free the memory!
            delete thisRef;

            // Log damping keeps the final scores in a reasonable range.
            score = log(1.0 + score);
            thisMatch->setScore(score);
            thisMatch->setAnchor(dm->anchor);

            //
            // Append this match to our list of matches.
            //
            matches.Add(thisMatch, url.get());

            if (debug)
            {
                cerr << " base_score " << base_score << " date_score " << date_score <<
                    " backlink_score " << backlink_score << "\n";
                cerr << " score " << score << "(" << thisMatch->
                    getScore() << "), maxScore " << maxScore << ", minScore " << minScore << endl;
            }

            // Track global score bounds for PERCENT / star scaling.
            if (maxScore < score)
            {
                if (debug)
                    cerr << "Set maxScore = score" << endl;
                maxScore = score;
            }
            if (minScore > score)
            {
                if (debug)
                    cerr << "Set minScore = score" << endl;
                minScore = score;
            }
        }
    }

    //
    // Each sub-area is then sorted by relevance level.
    //
    List *matches_part;         // Outside of loop to keep for-scope warnings away.
    for (matches_part = matches.Get_First(); matches_part != 0; matches_part = matches.Get_Next())
        sort(matches_part);

    // Then all sub-lists are concatenated and put in a new list.
    return matches.JoinedLists();
}

//*****************************************************************************
String *
ResultFetch::excerpt(ResultMatch * match, DocumentRef * ref, String urlanchor, int fanchor, int &first)
{
    HtConfiguration *config = HtConfiguration::config();
    // It is necessary to keep alive the String you .get() a char * from,
    // as long as you use the char *.
+ + //String head_string; + + char *head; + int use_meta_description = 0; + Collection *collection = match->getCollection(); + + if (config->Boolean("use_meta_description", 0) && strlen(ref->DocMetaDsc()) != 0) + { + // Set the head to point to description + head = ref->DocMetaDsc(); + use_meta_description = 1; + } + else + { + // docDB.ReadExcerpt(*ref); + collection->ReadExcerpt(*ref); + head = ref->DocHead(); // head points to the top + } + + //head_string = HtSGMLCodec::instance()->decode(head); + //head = head_string.get(); + + int which, length; + char *temp = head; + String part; + String *text = new String(""); + + StringMatch *allWordsPattern = NULL; + if (collection) + allWordsPattern = collection->getSearchWordsPattern(); + if (!allWordsPattern) + return text; + + // htsearch displays the description when: + // 1) a description has been found + // 2) the option "use_meta_description" is set to true + // If previous conditions are false and "excerpt_show_top" is set to true + // it shows the whole head. Else, it acts as default. + + if (config->Boolean("excerpt_show_top", 0) || use_meta_description || !allWordsPattern->hasPattern()) + first = 0; + else + first = allWordsPattern->FindFirstWord(head, which, length); + + if (first < 0 && config->Boolean("no_excerpt_show_top")) + first = 0; // No excerpt, but we want to show the top. + + if (first < 0) + { + // + // No excerpt available, don't show top, so display message + // + if (!config->Find("no_excerpt_text").empty()) + { + *text << config->Find("no_excerpt_text"); + } + } + else + { + int headLength = strlen(head); + int length = config->Value("excerpt_length", 50); + char *start; + char *end; + WordType type(*config); + + if (!config->Boolean("add_anchors_to_excerpt")) + // negate flag if it's on (anchor available) + fanchor = 0; + + // + // Figure out where to start the excerpt. 
Basically we go back + // half the excerpt length from the first matched word + // + start = &temp[first] - length / 2; + if (start < temp) + start = temp; + else + { + *text << config->Find("start_ellipses"); + while (*start && type.IsStrictChar(*start)) + start++; + } + + // + // Figure out the end of the excerpt. + // + end = start + length; + if (end > temp + headLength) + { + end = temp + headLength; + *text << hilight(match, start, urlanchor, fanchor); + } + else + { + while (*end && type.IsStrictChar(*end)) + end++; + *end = '\0'; + *text << hilight(match, start, urlanchor, fanchor); + *text << config->Find("end_ellipses"); + } + } + return text; +} + +//***************************************************************************** +String ResultFetch::hilight(ResultMatch * match, const String & str_arg, const String & urlanchor, int fanchor) +{ + HtConfiguration * + config = HtConfiguration::config(); + const String + start_highlight = config->Find("start_highlight"); + const String + end_highlight = config->Find("end_highlight"); + const char * + str = str_arg; + String + result; + int + pos = 0; + int + which, length; + WeightWord * + ww; + int + first = 1; + String + s; +#define SGMLencodedChars(p, l) (s = 0, s.append(p, l), HtSGMLCodec::instance()->decode(s)) + + result = 0; + Collection * + collection = match->getCollection(); + StringMatch * + allWordsPattern = NULL; + if (collection) + allWordsPattern = collection->getSearchWordsPattern(); + List * + searchWords = NULL; + if (collection) + searchWords = collection->getSearchWords(); + if (!allWordsPattern || !searchWords) + return result; + + while (allWordsPattern->hasPattern() && (pos = allWordsPattern->FindFirstWord(str, which, length)) >= 0) + { + //result.append(str, pos); + result << SGMLencodedChars(str, pos); + ww = (WeightWord *) (*searchWords)[which]; + result << start_highlight; + if (first && fanchor) + result << "<a href=\"" << urlanchor << "\">"; + //result.append(str + pos, length); + 
result << SGMLencodedChars(str + pos, length); + if (first && fanchor) + result << "</a>"; + result << end_highlight; + str += pos + length; + first = 0; + } + //result.append(str); + result << SGMLencodedChars(str, strlen(str)); + return result; +} + +//***************************************************************************** +void +ResultFetch::sort(List * matches) +{ + HtConfiguration *config = HtConfiguration::config(); + int numberOfMatches = matches->Count(); + int i; + + if (numberOfMatches <= 1) + return; + + ResultMatch **array = new ResultMatch *[numberOfMatches]; + for (i = 0; i < numberOfMatches; i++) + { + array[i] = (ResultMatch *) (*matches)[i]; + } + matches->Release(); + + qsort((char *) array, numberOfMatches, sizeof(ResultMatch *), array[0]->getSortFun()); + + const String st = config->Find("sort"); + if (!st.empty() && mystrncasecmp("rev", st, 3) == 0) + { + for (i = numberOfMatches; --i >= 0;) + matches->Add(array[i]); + } + else + { + for (i = 0; i < numberOfMatches; i++) + matches->Add(array[i]); + } + delete[]array; +} + +//***************************************************************************** +void +ResultFetch::logSearch(int page, List * matches) +{ +//Note: This is Posix and dependent on a running syslogd.. +//does not work for Win32 +//TODO: Look into using native windows system logs instead +#ifndef _WIN32 + + HtConfiguration *config = HtConfiguration::config(); + // Currently unused time_t t; + int nMatches = 0; + int level = LOG_LEVEL; + int facility = LOG_FACILITY; + char *host = getenv("REMOTE_HOST"); + char *ref = getenv("HTTP_REFERER"); + + if (host == NULL) + host = getenv("REMOTE_ADDR"); + if (host == NULL) + host = "-"; + + if (ref == NULL) + ref = "-"; + + if (matches) + nMatches = matches->Count(); + + openlog("htsearch", LOG_PID, facility); + syslog(level, "%s [%s] (%s) [%s] [%s] (%d/%s) - %d -- %s\n", + host, + input->exists("config") ? 
input->get("config") : "default", + (const char *) config->Find("match_method"), input->get("words"), logicalWords.get(), + nMatches, (const char *) config->Find("matches_per_page"), page, ref); +#endif +} diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.h b/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.h new file mode 100644 index 00000000..f1f9e92a --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.h @@ -0,0 +1,248 @@ +//-------------------------------------------------------------------- +// +// ResultFetch.h +// +// 2/6/2002 created for libhtdig +// +// Neal Richter nealr@rightnow.com +// +// +// ResultFetch: Takes results of search and fills in the HTML templates +// +// FOR USE IN LIBHTDIG... does NOT stream to stdout!! +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ResultFetch.h,v 1.4 2004/05/28 13:15:29 lha Exp $ +// +//-------------------------------------------------------------------- + +#ifndef _ResultFetch_h_ +#define _ResultFetch_h_ + +#include "Object.h" +#include "ResultList.h" +#include "ResultMatch.h" +#include "TemplateList.h" +#include "cgi.h" +#include "StringMatch.h" +#include "List.h" +#include "DocumentDB.h" +#include "Database.h" +#include "Dictionary.h" +#include "HtRegex.h" + +class ResultFetch : public Object +{ +public: + // + // Construction/Destruction + // + // Display(const String& docFile, const String& indexFile, const String& excerptFile); + + ResultFetch(Dictionary *selected_collections, const StringList& templist); + ResultFetch(Dictionary *selected_collections); + ~ResultFetch(); + + void setStartTemplate(const String& templateName); + void setMatchTemplate(const String& templateName); + void setEndTemplate(const String& 
templateName); + + // inline void setResults(ResultList *results); + // inline void setSearchWords(List *searchWords); + inline void setLimit(HtRegex *); + inline void setExclude(HtRegex *); + // inline void setAllWordsPattern(StringMatch *); + inline void setLogicalWords(char *); + inline void setOriginalWords(char *); + inline void setCGI(cgi *); + + //void fetch(int pageNumber); + //void fetchMatch(ResultMatch *match, DocumentRef *ref, int current); + List * fetch(); + Dictionary * fetchMatch(ResultMatch *match, DocumentRef *ref, int current); + void displayHeader(); + void displayFooter(); + void displayNomatch(); + void displaySyntaxError(const String &); + + int hasTemplateError() {return templateError;} + +protected: + // + // Multiple database support + // + Dictionary *selected_collections; + + // + // Search Policy + char *search_policy; + + // + // The list of search results. + // + // ResultList *results; + + // + // The database that contains documents. + // + // DocumentDB docDB; + + // List of databases to search on + StringList collectionList; + + // + // A list of words that we are searching for + // + // List *searchWords; + + // + // Pattern that all result URLs must match or exclude + // + HtRegex *limitTo; + HtRegex *excludeFrom; + + // + // Pattern of all the words + // + // StringMatch *allWordsPattern; + + // + // Variables for substitution into text are stored in a dictionary + // + Dictionary vars; + + // + // Since the creation of excerpts is somewhat time consuming, we will + // only compute them if they're actually going to be used. This is the + // flag that tells us if we will need the excerpt. + // + int needExcerpt; + + // + // Since we might have errors we cannot recover from, this tells us + // what happened. 
+ // + int templateError; + + // + // To allow the result templates to be dependant on the match URL, we need + // the following: + // + StringMatch URLtemplate; + List URLtemplateList; + + // + // To allow the star images to be dependant on the match URL, we need + // the following: + // + StringMatch URLimage; + List URLimageList; + + // + // Maximum number of stars to display + // + int maxStars; + double maxScore; + double minScore; + + // + // For display, we have different versions of the list of words. + // + String logicalWords; + String originalWords; + + // + // To be able to recreate the URL that will get to us again, we need + // the info from the HTML form that called us. + // + cgi *input; + + // + // Match output is done through templates. This is the interface to these + // templates. + // + TemplateList templates; + Template *currentTemplate; + + // + // Methods... + // + List *buildMatchList(); + void sort(List *); + + int includeURL(const String&); + String *readFile(const String&); + void expandVariables(const String&); + void outputVariable(const String&); + String *excerpt(ResultMatch *match, DocumentRef *ref, String urlanchor, + int fanchor, int &first); + String hilight(ResultMatch *match, const String& str, const String& urlanchor, int fanchor); + void setupTemplates(); + void setupImages(); + String *generateStars(DocumentRef *, int); + void displayParsedFile(const String&); + void setVariables(int, List *); + void createURL(String &, int); + void logSearch(int, List *); +}; + +//***************************************************************************** +inline void +ResultFetch::setLimit(HtRegex *limit) +{ + limitTo = limit; +} + +inline void +ResultFetch::setExclude(HtRegex *exclude) +{ + excludeFrom = exclude; +} + +#if 0 +inline void +Display::setAllWordsPattern(StringMatch *pattern) +{ + allWordsPattern = pattern; +} + +inline void +Display::setResults(ResultList *results) +{ + this->results = results; +} + +inline void 
+Display::setSearchWords(List *searchWords) +{ + this->searchWords = searchWords; +} +#endif + +inline void +ResultFetch::setLogicalWords(char *s) +{ + logicalWords = s; + vars.Add("LOGICAL_WORDS", new String(logicalWords)); +} + +inline void +ResultFetch::setOriginalWords(char *s) +{ + originalWords = s; + vars.Add("WORDS", new String(originalWords)); +} + +inline void +ResultFetch::setCGI(cgi *aCgi) +{ + input = aCgi; +} + +#endif + diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc new file mode 100644 index 00000000..3f6d5e5f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc @@ -0,0 +1,517 @@ +//-------------------------------------------------------------------- +// +// TextCollector.cc +// +// 2/6/2002 created for libhtdig +// +// Neal Richter nealr@rightnow.com +// +// TextCollector: +// General Purpose Text Document Indexer. +// Calls appropriate parsers. +// The parser notifies the TextCollector object that it got something +// (got_* functions) and the TextCollector object feed the databases +// and statistics accordingly. 
+// +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: TextCollector.cc,v 1.4 2004/05/28 13:15:29 lha Exp $ +// +//-------------------------------------------------------------------- + + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "TextCollector.h" +#include "htdig.h" +#include "HtWordList.h" +#include "WordRecord.h" +#include "URLRef.h" +#include "Server.h" +#include "Parsable.h" +#include "BasicDocument.h" +#include "StringList.h" +#include "WordType.h" +#include "md5.h" +#include "defaults.h" + +#include <signal.h> +#include <stdio.h> + +#include <sys/timeb.h> + + +//***************************************************************************** +// TextCollector::TextCollector() +// +TextCollector::TextCollector(TextCollectorLog flags): +words(*(HtConfiguration::config())) +{ + HtConfiguration *config = HtConfiguration::config(); + //FILE *urls_parsed; + + currenthopcount = 0; + + //turn on word tracking! + trackWords = 1; + + // + // Initialize the flags for the various HTML factors + // + + // text_factor + factor[0] = FLAG_TEXT; + // title_factor + factor[1] = FLAG_TITLE; + // heading factor (now generic) + factor[2] = FLAG_HEADING; + factor[3] = FLAG_HEADING; + factor[4] = FLAG_HEADING; + factor[5] = FLAG_HEADING; + factor[6] = FLAG_HEADING; + factor[7] = FLAG_HEADING; + // img alt text + //factor[8] = FLAG_KEYWORDS; + factor[8] = FLAG_TEXT; // treat alt text as plain text, until it has + // its own FLAG and factor. 
+ // keywords factor + factor[9] = FLAG_KEYWORDS; + // META description factor + factor[10] = FLAG_DESCRIPTION; + + doc = NULL; + minimumWordLength = config->Value("minimum_word_length", 3); + + + //TODO put document-index log file stuff here via logs like Retriever + + check_unique_md5 = config->Boolean("check_unique_md5", 0); + check_unique_date = config->Boolean("check_unique_date", 0); + + d_md5 = 0; + if (check_unique_md5) + { + d_md5 = Database::getDatabaseInstance(DB_HASH); + + if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK) + { + cerr << "DocumentDB::Open: " << config->Find("md5_db") << " " << strerror(errno) << "\n"; + } + } + + temp_doc_count = 0; + +} + + +//***************************************************************************** +// TextCollector::~TextCollector() +// +TextCollector::~TextCollector() +{ + if (d_md5) + d_md5->Close(); + //delete doc; + + if(temp_doc_count != 0) + { + words.Flush(); + temp_doc_count = 0; + } + + words.Flush(); + words.Close(); + +} + + +//***************************************************************************** +// void TextCollector::IndexDoc() +// +// + +int +TextCollector::IndexDoc(BasicDocument & a_basicdoc) +{ + DocumentRef *ref; + time_t date; + int old_document = 0; + static int index = 0; + + //struct timeb tb; + + //HtConfiguration *config = HtConfiguration::config(); + + doc = &a_basicdoc; + + ref = docs[doc->Location()]; // It might be nice to have just an Exists() here + if (ref) + { + // + // We already have an entry for this document in our database. + // This means we can get the document ID and last modification + // time from there. 
+ // + current_id = ref->DocID(); + date = ref->DocTime(); + if (ref->DocAccessed()) + old_document = 1; + else // we haven't retrieved it yet, so we only have the first link + old_document = 0; + ref->DocBackLinks(ref->DocBackLinks() + 1); // we had a new link + ref->DocAccessed(time(0)); + ref->DocState(Reference_normal); + currenthopcount = ref->DocHopCount(); + } + else + { + // + // Never seen this document before. We need to create an + // entry for it. This implies that it gets a new document ID. + // + + date = 0; + + current_id = docs.NextDocID(); + ref = new DocumentRef; + ref->DocID(current_id); + ref->DocURL(doc->Location()); + ref->DocState(Reference_normal); + ref->DocAccessed(time(0)); + ref->DocHopCount(0); + ref->DocBackLinks(1); // We had to have a link to get here! + old_document = 0; + } + + word_context.DocID(ref->DocID()); + + if (debug > 0) + { + // + // Display progress + // + cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << doc->Location() << + ": "; + cout.flush(); + } + + //printf("New Doc\n"); + //ftime(&tb); + //fprintf(stderr, "[1] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm); + + RetrievedDocument(ref); + + //ftime(&tb); + //fprintf(stderr, "[2] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm); + + if(temp_doc_count > 250) + { + //words.Flush(); + temp_doc_count = 0; + } + else + { + temp_doc_count++; + } + + //ftime(&tb); + //fprintf(stderr, "[3] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm); + + docs.Add(*ref); + + //ftime(&tb); + //fprintf(stderr, "[4] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm); + + delete ref; + + words.Flush(); + //words.Close(); + + if (urls_seen) + { + fprintf(urls_seen, "%s|%d|%s|%d|0|1\n", + (const char *) doc->Location(), doc->Length(), doc->ContentType(), + (int) doc->ModTime()); + } + + + return(1); +} + +int TextCollector::FlushWordDB() +{ + if(temp_doc_count != 0) + { + words.Flush(); + temp_doc_count = 0; + } + + words.Flush(); + words.Close(); + return(1); +} + 
+//***************************************************************************** +// void TextCollector::RetrievedDocument(Document &doc, const String &url, DocumentRef *ref) +// We found a document that needs to be parsed. Since we don't know the +// document type, we'll let the Document itself return an appropriate +// Parsable object which we can call upon to parse the document contents. +// +void +TextCollector::RetrievedDocument(DocumentRef * ref) +{ + n_links = 0; + current_ref = ref; + current_title = 0; + word_context.Anchor(0); + current_time = 0; + current_head = 0; + current_meta_dsc = 0; + time_t doc_time; + + //Check if the Document is self-parseable + //We will pass ourselves as a callback object for all the got_*() routines + if (doc->SelfParseable() == TRUE) + { + doc->internalParser(*this); + } + else + { + // Create a parser object and let it have a go at the document. + // We will pass ourselves as a callback object for all the got_*() + // routines. + // This will generate the Parsable object as a specific parser + /* + Parsable *parsable = doc->getParsable(); + if (parsable) + parsable->parse(*this, *base); + else + { // If we didn't get a parser, then we should get rid of this! + ref->DocState(Reference_noindex); + return; + } + */ + } + + // We don't need to dispose of the parsable object since it will + // automatically be reused. 
+ + + // + // Update the document reference + // + ref->DocTitle((char *) current_title); + ref->DocHead((char *) current_head); + ref->DocMetaDsc((char *) current_meta_dsc); + +/* if (current_time == 0) + ref->DocTime(doc->ModTime()); + else + ref->DocTime(current_time); */ + + doc_time = doc->ModTime(); + if(doc_time != 0) + ref->DocTime(doc_time); + else + ref->DocTime(time(NULL)); + + ref->DocSize(doc->Length()); + ref->DocAccessed(time(0)); + ref->DocLinks(n_links); +} + + +//***************************************************************************** +// void TextCollector::got_word(char *word, int location, int heading) +// The location is normalized to be in the range 0 - 1000. +// +void +TextCollector::got_word(const char *word, int location, int heading) +{ + if (debug > 3) + cout << "word: " << word << '@' << location << endl; + if (heading >= 11 || heading < 0) // Current limits for headings + heading = 0; // Assume it's just normal text + + if ((trackWords) && (strlen(word) >= minimumWordLength)) + { + String w = word; + HtWordReference wordRef; + + wordRef.Location(location); + wordRef.Flags(factor[heading]); + + wordRef.Word(w); + words.Replace(WordReference::Merge(wordRef, word_context)); + +#ifdef DEBUG + cout << "Adding: [" << w << "]"<< endl; //NEALR +#endif + + // Check for compound words... 
+ String parts = word; + int added; + int nparts = 1; + do + { + added = 0; + char *start = parts.get(); + char *punctp = 0, *nextp = 0, *p; + char punct; + int n; + while (*start) + { + p = start; + for (n = 0; n < nparts; n++) + { + while (HtIsStrictWordChar((unsigned char) *p)) + p++; + punctp = p; + if (!*punctp && n + 1 < nparts) + break; + while (*p && !HtIsStrictWordChar((unsigned char) *p)) + p++; + if (n == 0) + nextp = p; + } + if (n < nparts) + break; + punct = *punctp; + *punctp = '\0'; + if (*start && (*p || start > parts.get())) + { + w = start; + HtStripPunctuation(w); + if (w.length() >= minimumWordLength) + { + wordRef.Word(w); + words.Replace(WordReference::Merge(wordRef, word_context)); + if (debug > 3) + cout << "word part: " << start << '@' << location << endl; + +#ifdef DEBUG + cout << "Adding: [" << w << "]"<< endl; //NEALR +#endif + } + added++; + } + start = nextp; + *punctp = punct; + } + nparts++; + } + while (added > 2); + } +} + + +//***************************************************************************** +// void TextCollector::got_title(const char *title) +// +void +TextCollector::got_title(const char *title) +{ + if (debug > 1) + cout << "\ntitle: " << title << endl; + current_title = title; +} + +//***************************************************************************** +// void TextCollector::got_time(const char *time) +// +void +TextCollector::got_time(const char *time) +{ + HtDateTime new_time(current_time); + + if (debug > 1) + cout << "\ntime: " << time << endl; + + // + // As defined by the Dublin Core, this should be YYYY-MM-DD + // In the future, we'll need to deal with the scheme portion + // in case someone picks a different format. + // + new_time.SetFTime(time, "%Y-%m-%d"); + current_time = new_time.GetTime_t(); + + // If we can't convert it, current_time stays the same and we get + // the default--the date returned by the server... 
+} + +//***************************************************************************** +// void TextCollector::got_head(const char *head) +// +void +TextCollector::got_head(const char *head) +{ + if (debug > 4) + cout << "head: " << head << endl; + current_head = head; +} + +//***************************************************************************** +// void TextCollector::got_meta_dsc(const char *md) +// +void +TextCollector::got_meta_dsc(const char *md) +{ + if (debug > 4) + cout << "meta description: " << md << endl; + current_meta_dsc = md; +} + + +//***************************************************************************** +// void TextCollector::got_meta_email(const char *e) +// +void +TextCollector::got_meta_email(const char *e) +{ + if (debug > 1) + cout << "\nmeta email: " << e << endl; + current_ref->DocEmail(e); +} + + +//***************************************************************************** +// void TextCollector::got_meta_notification(const char *e) +// +void +TextCollector::got_meta_notification(const char *e) +{ + if (debug > 1) + cout << "\nmeta notification date: " << e << endl; + current_ref->DocNotification(e); +} + + +//***************************************************************************** +// void TextCollector::got_meta_subject(const char *e) +// +void +TextCollector::got_meta_subject(const char *e) +{ + if (debug > 1) + cout << "\nmeta subect: " << e << endl; + current_ref->DocSubject(e); +} + + +//***************************************************************************** +// void TextCollector::got_noindex() +// +void +TextCollector::got_noindex() +{ + if (debug > 1) + cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl; + current_ref->DocState(Reference_noindex); +} diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.h b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.h new file mode 100644 index 00000000..d44869a6 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.h @@ 
-0,0 +1,133 @@ +//-------------------------------------------------------------------- +// +// TextCollector.h +// +// 2/6/2002 created for libhtdig +// +// Neal Richter nealr@rightnow.com +// +// TextCollector: +// General Purpose Text Document Indexer. +// Calls appropriate parsers. +// The parser notifies the TextCollector object that it got something +// (got_* functions) and the TextCollector object feed the databases +// and statistics accordingly. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: TextCollector.h,v 1.4 2004/05/28 13:15:29 lha Exp $ +// +//-------------------------------------------------------------------- + + +#ifndef _TextCollector_h_ +#define _TextCollector_h_ + +#include "BasicDocument.h" +#include "DocumentRef.h" +#include "Dictionary.h" +#include "Queue.h" +#include "HtWordReference.h" +#include "List.h" +#include "StringList.h" +#include "DocumentDB.h" + +class Document; +class HtWordList; + +enum TextCollectorLog { + TextCollector_noLog, + TextCollector_logUrl, + TextCollector_Restart +}; + +class TextCollector +{ + public: + // + // Construction/Destruction + // + TextCollector(TextCollectorLog flags = TextCollector_noLog); + virtual ~TextCollector(); + + int IndexDoc(BasicDocument & adoc); + int FlushWordDB(); + + // + // Report statistics about the parser + // + void ReportStatistics(const String& name); + + // + // These are the callbacks that we need to write code for + // + void got_word(const char *word, int location, int heading); + void got_href(URL &url, const char *description, int hops = 1); + void got_title(const char *title); + void got_time(const char *time); + void got_head(const char *head); + void got_meta_dsc(const char *md); + void got_anchor(const char 
*anchor); + void got_image(const char *src); + void got_meta_email(const char *); + void got_meta_notification(const char *); + void got_meta_subject(const char *); + void got_noindex(); + + + private: + // + // A hash to keep track of what we've seen + // + Dictionary visited; + + URL *base; + String current_title; + String current_head; + String current_meta_dsc; + time_t current_time; + int current_id; + DocumentRef *current_ref; + int current_anchor_number; + int trackWords; + int n_links; + HtWordReference word_context; + HtWordList words; + + int check_unique_md5; + int check_unique_date; + + + TextCollectorLog log; + // + // These are weights for the words. The index is the heading level. + // + long int factor[11]; + int currenthopcount; + + // + // For efficiency reasons, we will only use one document object which + // we reuse. + // + BasicDocument *doc; + + Database *d_md5; + + // Some useful constants + int minimumWordLength; + + // + // Helper routines + // + void RetrievedDocument(DocumentRef *ref); + + int temp_doc_count; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/htsearch.h b/debian/htdig/htdig-3.2.0b6/libhtdig/htsearch.h new file mode 100644 index 00000000..4d7f9a0c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/htsearch.h @@ -0,0 +1,75 @@ +// +// htsearch.h +// +// htsearch: The main search CGI. Parses the CGI input, reads the config files +// and calls the necessary code to put together the result lists +// and the final display. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: htsearch.h,v 1.5 2004/05/28 13:15:29 lha Exp $ +// + +#ifndef _htsearch_h_ +#define _htsearch_h_ + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "List.h" +#include "StringList.h" +#include "Dictionary.h" +#include "DocumentRef.h" +#include "Database.h" +#include "good_strtok.h" +#include "DocumentDB.h" +#include "htString.h" +#include "HtConfiguration.h" +#include "ResultMatch.h" +#include "ResultList.h" +#include "HtWordReference.h" +#include "StringMatch.h" +#include "defaults.h" + +#include <stdio.h> +#include <stdlib.h> + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#ifndef _WIN32 +#include <unistd.h> +#endif + +extern int n_matches; +extern int do_and; +extern int do_short; +extern StringList fields; + +#ifndef _WIN32 +//extern StringMatch limit_to; +#endif + +extern StringMatch URLimage; +extern List URLimageList; +extern StringMatch wm; +extern Database *dbf; +extern String logicalWords; +extern String originalWords; +extern int debug; +extern StringList collectionList; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_api.h b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_api.h new file mode 100644 index 00000000..5b915e39 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_api.h @@ -0,0 +1,614 @@ +//---------------------------------------------------------------- +// +// libhtdig_api.h +// +// Header function for htdig shared library API +// +// 1/25/2002 created +// +// Neal Richter nealr@rightnow.com +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig 
Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: libhtdig_api.h,v 1.4 2004/05/28 13:15:29 lha Exp $ +// +//---------------------------------------------------------------- + +#ifndef LIBHTDIG_API_H +#define LIBHTDIG_API_H + +#include <time.h> + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + + +#define HTDIG_MAX_FILENAME_PATH_L 1024 +#define HTDIG_DOCUMENT_ID_L 32 +#define HTDIG_DOCUMENT_TITLE_L 256 +#define HTDIG_DOCUMENT_META_L 4096 +#define HTDIG_DOCUMENT_CONTENT_TYPE_L 32 +#define HTDIG_DOCUMENT_EXCERPT_L 1024 +//make sure HTDIG_DOCUMENT_EXCERPT_L is more than config 'excerpt_length' + +//default failsafe size of 'excerpt' document +//make sure it's more than config 'max_head_length' +#define HTDIG_DEFAULT_EXCERPT_SIZE 524288 + +//should be the same as the default value in HTDIG +#define HTDIG_MAX_QUERY_L 256 + + +#define HTDIG_CUSTOM_TEXT_MIME_TYPE "text/vnd.customdocument" + +//htfuzzy +#define HTDIG_ALG_ACCENTS 0x00000100 //"accents" +#define HTDIG_ALG_ACCENTS_STR "accents" + +#define HTDIG_ALG_ENDINGS 0x00001000 //"endings" +#define HTDIG_ALG_ENDINGS_STR "endings" + +#define HTDIG_ALG_METAPHONE 0x00000010 //"metaphone" +#define HTDIG_ALG_METAPHONE_STR "metaphone" + +#define HTDIG_ALG_SOUNDEX 0x00000001 //"soundex" +#define HTDIG_ALG_SOUNDEX_STR "soundex" + +#define HTDIG_ALG_SYNONYMS 0x00010000 //"synonyms" +#define HTDIG_ALG_SYNONYMS_STR "synonyms" + + +//searching +#define HTSEARCH_ALG_AND 0x00000100 //"and" +#define HTSEARCH_ALG_AND_STR "and" + +#define HTSEARCH_ALG_BOOLEAN 0x00000001 //"boolean" +#define HTSEARCH_ALG_BOOLEAN_STR "boolean" + +#define HTSEARCH_ALG_OR 0x00000010 //"or" +#define HTSEARCH_ALG_OR_STR "or" + + +#define HTSEARCH_FORMAT_LONG 0x00000001 //"long" +#define HTSEARCH_FORMAT_LONG_STR "long" + +#define HTSEARCH_FORMAT_SHORT 0x00000010 
//"short" +#define HTSEARCH_FORMAT_SHORT_STR "short" + + +#define HTSEARCH_SORT_SCORE 0x00000001 //"score" +#define HTSEARCH_SORT_SCORE_STR "score" + +#define HTSEARCH_SORT_REV_SCORE 0x00000010 //"reverse score" +#define HTSEARCH_SORT_REV_SCORE_STR "reverse score" + +#define HTSEARCH_SORT_TIME 0x00000100 //"time" +#define HTSEARCH_SORT_TIME_STR "time" + +#define HTSEARCH_SORT_REV_TIME 0x00001000 //"reverse time" +#define HTSEARCH_SORT_REV_TIME_STR "reverse time" + +#define HTSEARCH_SORT_TITLE 0x00010000 //"title" +#define HTSEARCH_SORT_TITLE_STR "title" + +#define HTSEARCH_SORT_REV_TITLE 0x00100000 //"reverse title" +#define HTSEARCH_SORT_REV_TITLE_STR "reverse title" + + + +#define HTDIG_ERROR_CONFIG_READ -101 +#define HTDIG_ERROR_URL_PART -102 +#define HTDIG_ERROR_URL_REWRITE -103 +#define HTDIG_ERROR_URL_CREATE_FILE -104 +#define HTDIG_ERROR_IMAGE_CREATE_FILE -105 +#define HTDIG_ERROR_OPEN_CREATE_DOCDB -106 +#define HTDIG_ERROR_LOGFILE_OPEN -107 +#define HTDIG_ERROR_LOGFILE_CLOSE -108 + +#define HTDIG_ERROR_TESTURL_EXCLUDE -109 +#define HTDIG_ERROR_TESTURL_BADQUERY -110 +#define HTDIG_ERROR_TESTURL_EXTENSION -111 +#define HTDIG_ERROR_TESTURL_EXTENSION2 -112 +#define HTDIG_ERROR_TESTURL_LIMITS -113 +#define HTDIG_ERROR_TESTURL_LIMITSNORM -114 +#define HTDIG_ERROR_TESTURL_SRCH_RESTRICT -115 +#define HTDIG_ERROR_TESTURL_SRCH_EXCLUDE -116 +#define HTDIG_ERROR_TESTURL_REWRITE_EMPTY -117 +#define HTDIG_ERROR_TESTURL_ROBOT_FORBID -118 + +#define HTSEARCH_ERROR_NO_MATCH -201 +#define HTSEARCH_ERROR_BAD_MATCH_INDEX -202 +#define HTSEARCH_ERROR_BAD_DOCUMENT -203 +#define HTSEARCH_ERROR_TEMPLATE_ERROR -204 +#define HTSEARCH_ERROR_LOGFILE_OPEN -205 +#define HTSEARCH_ERROR_LOGFILE_CLOSE -206 +#define HTSEARCH_ERROR_CONFIG_READ -207 +#define HTSEARCH_ERROR_URL_PART -208 +#define HTSEARCH_ERROR_WORDDB_READ -209 +#define HTSEARCH_ERROR_DOCINDEX_READ -210 +#define HTSEARCH_ERROR_DOCDB_READ -211 +#define HTSEARCH_ERROR_EXCERPTDB_READ -212 + +#define HTMERGE_ERROR_LOGFILE_OPEN 
-301 +#define HTMERGE_ERROR_LOGFILE_CLOSE -302 +#define HTMERGE_ERROR_CONFIG_READ -303 +#define HTMERGE_ERROR_URL_PART -304 +#define HTMERGE_ERROR_WORDDB_READ -305 +#define HTMERGE_ERROR_DOCINDEX_READ -306 +#define HTMERGE_ERROR_DOCDB_READ -307 +#define HTMERGE_ERROR_EXCERPTDB_READ -308 + +#define PHP_HTDIG_CONFIGFILE_PARM "configFile" +#define PHP_HTDIG_URL_PARM "URL" +#define PHP_HTDIG_LIMITTO_PARM "limit_urls_to" +#define PHP_HTDIG_LIMITN_PARM "limit_normalized" +#define PHP_HTDIG_EXCLUDEURLS_PARM "exclude_urls" +#define PHP_HTDIG_SEARCHRESTRICT_PARM "search_restrict" +#define PHP_HTDIG_SEARCHEXCLUDE_PARM "search_exclude" +#define PHP_HTDIG_MAXHOPCOUNT_PARM "max_hop_cont" +#define PHP_HTDIG_URLREWRITE_PARM "url_rewrite_rules" +#define PHP_HTDIG_BAD_QUERYSTR_PARM "bad_querystr" + +//============================================================================= +//===== HTDIG INDEXING API ==================================================== + + +/*************************************************** + * HTDIG_DOCUMENTATION for htdig_parameters_struct + * + * DEBUGGING PARAMETERS + * + * int debug + * Verbose mode. This increases the verbosity of the + * program. Using more than 2 is probably only useful + * for debugging purposes. The default verbose mode + * gives a nice progress report while digging. + * + * char logFile + * File to stream debugging & error messages to! + * + * BOOLEAN PARAMETERS + * + * int initial + * Initial. Do not use any old databases. This is + * accomplished by first erasing the databases + * + * int create_text_database + * Create an ASCII version of the document database. + * This database is easy to parse with other programs so + * that information can be extracted from it. + * + * int report_statistics + * Report statistics after completion. + * + * int alt_work_area + * Use alternate work files. + * Tells htdig to append .work to database files, causing + * a second copy of the database to be built. 
This allows + * the original files to be used by htsearch during the + * indexing run. + * + * + * STRING PARAMETERS + * + * char configFile + * configfile + * Use the specified configuration file instead of the + * default. + * + * char credentials + * username:password + * Tells htdig to send the supplied username and + * password with each HTTP request. The credentials + * will be encoded using the 'Basic' authentication scheme. + * There *HAS* to be a colon (:) between the username + * and password. + * + * + * char maxhops //9 digit limit + * hopcount + * Limit the stored documents to those which are at + * most hopcount links away from the start URL. + * + * char minimalFile + * + * char URL + * 'command-line' URLs from stdin + * fetches & indexes these URLs + * + ******************************************************************/ + +typedef struct htdig_parameters_struct { + + char configFile[HTDIG_MAX_FILENAME_PATH_L]; + char DBpath[HTDIG_MAX_FILENAME_PATH_L]; + char credentials[HTDIG_MAX_FILENAME_PATH_L]; + char max_hops[10]; //9 digit limit + char minimalFile[HTDIG_MAX_FILENAME_PATH_L]; + + //debugging & logfile + char logFile[HTDIG_MAX_FILENAME_PATH_L]; //location of log file + int debug; //0, 1 ,2, 3, 4, 5 + + //booelan values + int initial; + int create_text_database; + int report_statistics; + int alt_work_area; + int use_cookies; + + //spidering filters + char URL[HTDIG_MAX_FILENAME_PATH_L]; + char limit_urls_to[HTDIG_MAX_FILENAME_PATH_L]; + char limit_normalized[HTDIG_MAX_FILENAME_PATH_L]; + char exclude_urls[HTDIG_MAX_FILENAME_PATH_L]; + char search_restrict[HTDIG_MAX_FILENAME_PATH_L]; + char search_exclude[HTDIG_MAX_FILENAME_PATH_L]; + char url_rewrite_rules[HTDIG_MAX_FILENAME_PATH_L]; + char bad_querystr[HTDIG_MAX_FILENAME_PATH_L]; + char locale[16]; + char title_factor[16]; + char text_factor[16]; + char meta_description_factor[16]; + int max_hop_count; + + //the rewritten URL - OUTGOING after htdig_index_test_url + char 
rewritten_URL[HTDIG_MAX_FILENAME_PATH_L]; + +} htdig_parameters_struct; + +/***************************************************************** + * HTDIG_DOCUMENTATION for htdig_simple_doc_struct + * + * STRING PARAMETERS + * + * char location + * the 'URL' of the document. Can be any usefull string. + * + * char documentid + * document id of document [NOT CURRENTLY USED - IGNORED] + * + * char title + * document title + * + * char meta + * content that is indexed but won appear in an search excerpts + * + * char * contents + * pointer to a NULL TERMINATED string on information to be + * indexed. + * + * char content_type + * a MIME-like string + * custom MIME-type defined above, others are supported by + * htdig as well. + * + * + *****************************************************************/ + +typedef struct htdig_simple_doc_struct { + + char location[HTDIG_MAX_FILENAME_PATH_L]; + char documentid[HTDIG_DOCUMENT_ID_L]; + char title[HTDIG_DOCUMENT_TITLE_L]; + char meta[HTDIG_DOCUMENT_META_L]; + char *contents; //MUST ALLOCATE & FREE!!! + char content_type[HTDIG_DOCUMENT_CONTENT_TYPE_L]; //MIME-ISH string + //struct tm time_tm; // use to override index time + time_t doc_time; + +} htdig_simple_doc_struct; + + +int htdig_index_open(htdig_parameters_struct *); +int htdig_index_simple_doc(htdig_simple_doc_struct * ); +int htdig_index_urls(void); +int htdig_index_reset(void); +int htdig_index_close(void); + +int htdig_index_test_url(htdig_parameters_struct *htparms); + +int htdig_get_max_head_length(void); + + + + +//============================================================================= +//===== HTDIG MERGING API ===================================================== + +/************************************************** + * HTDIG_DOCUMENTATION for htmerge_parameters_struct + * + * DEBUGGING PARAMETERS + * + * int debug + * Verbose mode. This increases the verbosity of the + * program. Using more than 2 is probably only useful + * for debugging purposes. 
The default verbose mode + * gives a progress on what it is doing and where it is. + * + * char logFile + * File to stream debugging & error messages to! + * + * + * BOOLEAN PARAMETERS + * + * int alt_work_area + * Use alternate work files. + * Tells htmerge to append .work to database files causing + * a second copy of the database to be built. This allows + * original files to be used by htsearch during the indexing run. + * + * + * STRING PARAMETERS + * + * char configFile + * configfile + * Use the specified configuration file instead of the default. + * + * char merge_configFile + * merge_configfile + * Merge the databases specified into the databases specified + * by -c or the default. + * + * + *************************************************/ + +typedef struct htmerge_parameters_struct { + + char configFile[HTDIG_MAX_FILENAME_PATH_L]; + char merge_configFile[HTDIG_MAX_FILENAME_PATH_L]; + + //debugging & logfile + char logFile[HTDIG_MAX_FILENAME_PATH_L]; //location of log file + int debug; //0, 1 ,2, 3, 4, 5 + + //booelan values + int alt_work_area; + +} htmerge_parameters_struct; + +int htmerge_index_merge(htmerge_parameters_struct *); + + + + + +//============================================================================= +//===== HTDIG HTFUZZY API ===================================================== + + + +/************************************************** + * HTDIG_DOCUMENTATION for htfuzzy_parameters_struct + * + * DEBUGGING PARAMETERS + * + * int debug + * Verbose mode. This increases the verbosity of the + * program. Using more than 2 is probably only useful + * for debugging purposes. + * + * char logFile + * File to stream debugging & error messages to! + * + * + * PARAMETERS + * + * char configFile + * configfile + * Use the specified configuration file instead of the default. 
+ * + * int algorithms_flag + * Bitwise Flags to signal algorithms to be used + * + * soundex == HTDIG_ALG_SOUNDEX + * metaphone == HTDIG_ALG_METAPHONE + * accents == HTDIG_ALG_ACCENTS + * endings == HTDIG_ALG_ENDINGS + * synonyms == HTDIG_ALG_SYNONYMS + * + ***************************************************/ + + +typedef struct htfuzzy_parameters_struct { + + char configFile[HTDIG_MAX_FILENAME_PATH_L]; + int algorithms_flag; + + //debugging & logfile + char logFile[HTDIG_MAX_FILENAME_PATH_L]; //location of log file + int debug; //0, 1 ,2, 3, 4, 5 + + //booelan values + +} htfuzzy_parameters_struct; + + +// htfuzzy functions +int htfuzzy_index(htfuzzy_parameters_struct *); + + + + +//============================================================================== +//===== HTDIG SEARCHING API ==================================================== + +/************************************************ + * HTDIG_DOCUMENTATION for htsearch_parameters_struct + * + * DEBUGGING PARAMETERS + * + * int debug + * Verbose mode. This increases the verbosity of the; + * program. Using more than 2 is probably only useful; + * for debugging purposes. The default verbose mode; + * gives a progress on what it is doing and where it is.; + * + * char logFile + * File to stream debugging & error messages to! + * + * STRING PARAMETERS + * + * char configFile + * configfile + * Use the specified configuration file instead of the default. 
+ * + * + **************************************************/ + +typedef struct htsearch_parameters_struct { + + char configFile[HTDIG_MAX_FILENAME_PATH_L]; + char DBpath[HTDIG_MAX_FILENAME_PATH_L]; + char locale[16]; + + //debugging & logfile + char logFile[HTDIG_MAX_FILENAME_PATH_L]; //location of log file + int debug; //0, 1 ,2, 3, 4, 5 + + //filters + char search_restrict[HTDIG_MAX_FILENAME_PATH_L]; + char search_exclude[HTDIG_MAX_FILENAME_PATH_L]; + char title_factor[16]; + char text_factor[16]; + char meta_description_factor[16]; + +} htsearch_parameters_struct; + + + + +/***************************************************************** + * HTDIG_DOCUMENTATION for htsearch_query_struct + * + * STRING PARAMETERS + * + * char raw_query + * STRING of text that is the search query -- syntax is important + * + * INTEGER PARAMETERS + * + * int algorithms_flag [ALSO CALLED 'method' IN HTDIG] + * HTSEARCH_ALG_BOOLEAN + * HTSEARCH_ALG_OR + * HTSEARCH_ALG_AND + * + * int sortby_flag + * score, date, title & reversed + * HTSEARCH_SORT_SCORE + * HTSEARCH_SORT_REV_SCORE + * HTSEARCH_SORT_TIME + * HTSEARCH_SORT_REV_TIME + * HTSEARCH_SORT_TITLE + * HTSEARCH_SORT_REV_TITLE + * + * int format + * short, long (with excerpt) + * HTSEARCH_FORMAT_LONG + * HTSEARCH_FORMAT_SHORT + * + * + * + * TODO: 'Connect' these htsearch features to this API + * + * config + * Specifies the name of the configuration file. + * + * exclude + * This value is a pattern that specifies which URLs are to be excluded from + * the search results. + * + * keywords + * Used to specify a list of required words that have to be in the documents. + * + * restrict + * This value is a pattern that all URLs of the search results will have to + * match. + * + * startyear, startmonth, startday, endyear, endmonth, endday + * These values specify the allowed range of document modification dates + * allowed in the search results. 
+ * + * + * + *****************************************************************/ + +typedef struct htsearch_query_struct { + + char raw_query[HTDIG_MAX_QUERY_L]; + + int algorithms_flag; + int sortby_flag; + int format; + +} htsearch_query_struct; + + +/***************************************************************** + * HTDIG_DOCUMENTATION for htsearch_query_match_struct + * + * STRING PARAMETERS + * + * char title + * Title of document returned + * + * char URL + * URL/location-string of document returned + * + * char excerpt + * Excerpt with search words highlighted with + * <strong>searchword</strong> + * + * INTEGER PARAMETERS + * + * int score + * score in 'number of stars' + * [MAX NUMBER OF STARS DECLARED IN CONFIG FILE] + * + * int score_percent //top result is 100% + * + * time_t time [DOCUMENT TIME] + * struct tm time_tm [DOCUMENT TIME] + * int size [TOTAL DOCUMENT SIZE] + * + * + *****************************************************************/ + +typedef struct htsearch_query_match_struct { + + char title[HTDIG_DOCUMENT_TITLE_L]; + char URL[HTDIG_MAX_FILENAME_PATH_L]; + char excerpt[HTDIG_DOCUMENT_EXCERPT_L]; + int score; + int score_percent; //top result is 100% + struct tm time_tm; + int size; + +} htsearch_query_match_struct; + + +// htsearch functions + +int htsearch_open(htsearch_parameters_struct *); +int htsearch_query(htsearch_query_struct *); + +int htsearch_get_nth_match(int, htsearch_query_match_struct *); +int htsearch_close(); + +//htsearch_free(indicator) + +char * htsearch_get_error(); + + +#endif /* LIBHTDIG_API_H */ + diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc new file mode 100644 index 00000000..8a610d36 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc @@ -0,0 +1,1058 @@ +//------------------------------------------------------------- +// +// libhtdig_htdig.cc +// +// 1/25/2002 created from htdig.cc +// +// Neal Richter 
nealr@rightnow.com +// +// libhtdig_htdig.cc +// +// htdig: Indexes the web sites specified in the config file +// generating several databases to be used by htmerge +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: libhtdig_htdig.cc,v 1.5 2004/05/28 13:15:29 lha Exp $ +// +//------------------------------------------------------------- + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#ifdef HAVE_STD +#include <iostream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <iostream.h> +#endif /* HAVE_STD */ + +extern "C" { +#include "libhtdig_api.h" +} + +#include "libhtdig_log.h" + +#include "BasicDocument.h" +#include "Document.h" +#include "TextCollector.h" +#include "Retriever.h" +#include "StringList.h" +#include "htdig.h" +#include "defaults.h" +#include "HtURLCodec.h" +#include "WordContext.h" +#include "HtDateTime.h" +#include "HtURLRewriter.h" +#include "URL.h" +#include "Server.h" + +//////////////////////////// +// For cookie jar +//////////////////////////// +#include "HtCookieJar.h" +#include "HtCookieMemJar.h" +#include "HtHTTP.h" +//////////////////////////// + +// If we have this, we probably want it. 
+//#ifdef HAVE_GETOPT_H +//#include <getopt.h> +//#endif + + + +//Global Variables for Library + +int debug = 0; +HtRegexList limits; +HtRegexList limitsn; +String configFile = DEFAULT_CONFIG_FILE; +FILE *urls_seen = NULL; +FILE *images_seen = NULL; +DocumentDB docs; + + +// +// Global variables for this file +// +static int report_statistics = 0; +static String minimalFile = 0; +static HtDateTime StartTime; +static HtDateTime EndTime; + +//static char *max_hops = NULL; +static String credentials; +static HtCookieJar *_cookie_jar = NULL; +static HtConfiguration * config = NULL; +static WordContext * wc = NULL; + +static int create_text_database = 0; +static int alt_work_area = 0; +static int initial = 0; + +int htdig_index_open_flag = FALSE; + + +//new. URLs from 'command-line' +#define URL_SEPCHARS " ," +static char *myURL = NULL; + + +BasicDocument *a_basicdoc; +TextCollector *Indexer; + +BasicDocument the_basicdoc; +//TextCollector the_Indexer; + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_open(...) + * + * + * opens/creates document indexes and initializes variables + * for indexing. + * + * + * see libhtdig_api.h headerfile for definition of + * htdig_parameters_struct + * + * + * TODO Examine external function calls for error return + * codes + * + *******************************************************/ + +int htdig_index_open(htdig_parameters_struct * htdig_parms) +{ + int ret = -1; + + if(htdig_index_open_flag != FALSE) + return(FALSE); + + //load 'comand-line' parameters + + if (htdig_parms->configFile[0] != 0) + configFile = htdig_parms->configFile; + + if (htdig_parms->URL[0] != 0) + { + myURL = strdup(htdig_parms->URL); + } + + debug = htdig_parms->debug; + if(debug != 0) + { + ret = logOpen(htdig_parms->logFile); + + if(ret == FALSE) + { + reportError (form ("[HTDIG] Error opening log file [%s] . 
Error:[%d], %s\n", + htdig_parms->logFile, errno, strerror(errno)) ); + return(HTDIG_ERROR_LOGFILE_OPEN); + } + } + + initial = htdig_parms->initial; + create_text_database = htdig_parms->create_text_database; + //max_hops = strdup(htdig_parms->max_hops); + report_statistics = htdig_parms->report_statistics; + credentials = htdig_parms->credentials; + alt_work_area = htdig_parms->alt_work_area; + minimalFile = htdig_parms->minimalFile; + + + if(htdig_parms->use_cookies == TRUE) + { + // Cookie jar dynamic creation. + + _cookie_jar = new HtCookieMemJar (); // new cookie jar + if (_cookie_jar) + HtHTTP::SetCookieJar (_cookie_jar); + } + + // + // First set all the defaults and then read the specified config + // file to override the defaults. + // + + config = HtConfiguration::config (); + + config->Defaults (&defaults[0]); + if (access ((char *) configFile, R_OK) < 0) + { + reportError (form ("[HTDIG] Unable to find configuration file '%s'", + configFile.get ())); + return(HTDIG_ERROR_CONFIG_READ); + } + config->Read (configFile); + + //------- Now override config settings ------------ + + //------- override database path ------------ + if(strlen(htdig_parms->DBpath) > 0) + { + config->Add("database_dir", htdig_parms->DBpath); + } + + //------- custom filters from htdig_parms ---------- + + if(strlen(htdig_parms->locale) > 0) + { + config->Add("locale", htdig_parms->locale); + } + + if (config->Find ("locale").empty () && debug > 0) + logEntry("Warning: unknown locale!\n"); + + if (strlen(htdig_parms->max_hops) > 0) + { + config->Add ("max_hop_count", htdig_parms->max_hops); + } + + if(strlen(htdig_parms->limit_urls_to) > 0) + { + config->Add("limit_urls_to", htdig_parms->limit_urls_to); + } + + if(strlen(htdig_parms->limit_normalized) > 0) + { + config->Add("limit_normalized", htdig_parms->limit_normalized); + } + + if(strlen(htdig_parms->exclude_urls) > 0) + { + config->Add("exclude_urls", htdig_parms->exclude_urls); + } + + 
if(strlen(htdig_parms->url_rewrite_rules) > 0) + { + config->Add("url_rewrite_rules", htdig_parms->url_rewrite_rules); + } + + if(strlen(htdig_parms->bad_querystr) > 0) + { + config->Add("bad_querystr", htdig_parms->bad_querystr); + } + + if(strlen(htdig_parms->locale) > 0) + { + config->Add("locale", htdig_parms->locale); + } + + if(strlen(htdig_parms->meta_description_factor) > 0) + { + config->Add("meta_description_factor", htdig_parms->meta_description_factor); + } + + if(strlen(htdig_parms->title_factor) > 0) + { + config->Add("title_factor", htdig_parms->title_factor); + } + + if(strlen(htdig_parms->text_factor) > 0) + { + config->Add("text_factor", htdig_parms->text_factor); + } + + if(strlen(htdig_parms->URL) > 0) + { + config->Add("start_url", htdig_parms->URL); + free(myURL); + myURL=NULL; + } + + //------- end custom filters from htdig_parms ---------- + + // Set up credentials for this run + if (credentials.length ()) + config->Add ("authorization", credentials); + + // + // Check url_part_aliases and common_url_parts for + // errors. + String url_part_errors = HtURLCodec::instance ()->ErrMsg (); + + if (url_part_errors.length () != 0) + { + reportError (form("[HTDIG] Invalid url_part_aliases or common_url_parts: %s", + url_part_errors.get ())); + return(HTDIG_ERROR_URL_PART); + } + // + // Check url_rewrite_rules for errors. 
+ String url_rewrite_rules = HtURLRewriter::instance ()->ErrMsg (); + + if (url_rewrite_rules.length () != 0) + { + reportError (form ("[HTDIG] Invalid url_rewrite_rules: %s", + url_rewrite_rules.get ())); + return(HTDIG_ERROR_URL_REWRITE); + } + + // + // If indicated, change the database file names to have the .work + // extension + // + if (alt_work_area != 0) + { + String configValue = config->Find ("doc_db"); + + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("doc_db", configValue); + } + + configValue = config->Find ("word_db"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("word_db", configValue); + } + + configValue = config->Find ("doc_index"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("doc_index", configValue); + } + + configValue = config->Find ("doc_excerpt"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("doc_excerpt", configValue); + } + + configValue = config->Find ("md5_db"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("md5_db", configValue); + } + } + + // + // If needed, we will create a list of every URL we come across. + //TODO put document-index log file stuff here + + if (config->Boolean ("create_url_list")) + { + const String filename = config->Find ("url_list"); + urls_seen = fopen (filename, initial ? "w" : "a"); + if (urls_seen == 0) + { + reportError (form ("[HTDIG] Unable to create URL file '%s'", + filename.get ())); + return(HTDIG_ERROR_URL_CREATE_FILE); + } + } + + // + // If needed, we will create a list of every image we come across. + // + if (config->Boolean ("create_image_list")) + { + const String filename = config->Find ("image_list"); + images_seen = fopen (filename, initial ? 
"w" : "a"); + if (images_seen == 0) + { + reportError (form ("[HTDIG] Unable to create images file '%s'", + filename.get ())); + return(HTDIG_ERROR_IMAGE_CREATE_FILE); + } + } + + // + // Set up the limits list + // + StringList l (config->Find ("limit_urls_to"), " \t"); + limits.setEscaped (l, config->Boolean ("case_sensitive")); + l.Destroy (); + + l.Create (config->Find ("limit_normalized"), " \t"); + limitsn.setEscaped (l, config->Boolean ("case_sensitive")); + l.Destroy (); + + // + // Open the document database + // + const String filename = config->Find ("doc_db"); + if (initial) + unlink (filename); + + const String index_filename = config->Find ("doc_index"); + if (initial) + unlink (index_filename); + + const String head_filename = config->Find ("doc_excerpt"); + if (initial) + unlink (head_filename); + + if (docs.Open (filename, index_filename, head_filename) < 0) + { + reportError (form ("[HTDIG] Unable to open/create document database '%s'", + filename.get ())); + return(HTDIG_ERROR_OPEN_CREATE_DOCDB); + } + + const String word_filename = config->Find ("word_db"); + if (initial) + unlink (word_filename); + + // Initialize htword + wc = new WordContext; + wc->Initialize(*config); + + + //a_basicdoc = new BasicDocument; + Indexer = new TextCollector; + + a_basicdoc = &the_basicdoc; + a_basicdoc->Reset(); + + //Indexer = &the_Indexer; + + if ((a_basicdoc == NULL) || (Indexer == NULL)) + return(FALSE); + + + htdig_index_open_flag = TRUE; + + return(TRUE); + +} + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_simple_doc(...) 
+ * + * + * indexes a simple document supplied by parameter + * + * see libhtdig_api.h headerfile for definition of + * htdig_simple_doc_struct + * + * TODO Examine external function calls for error return + * codes + * + *******************************************************/ +int htdig_index_simple_doc(htdig_simple_doc_struct * a_simple_doc) +{ + int index_error = 0; + //int ret = 0; + + // Reset the document to clean out any old data + a_basicdoc->Reset(); + + a_basicdoc->ModTime(a_simple_doc->doc_time); + a_basicdoc->Location(a_simple_doc->location); + a_basicdoc->DocumentID(a_simple_doc->documentid); + a_basicdoc->Title(a_simple_doc->title); + a_basicdoc->MetaContent(a_simple_doc->meta); + a_basicdoc->Contents(a_simple_doc->contents); //MUST ALLOCATE & FREE!!! + a_basicdoc->ContentType(a_simple_doc->content_type); //MIME-ISH string + a_basicdoc->Length(); + + + //TODO What is this error? + index_error = Indexer->IndexDoc(*a_basicdoc); + + return(TRUE); +} + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_urls(...) + * + * Starts fetch & index of URL supplied in config file + * OR supplied in htdig_index_open parameter + * + * TODO Examine external function calls for error return + * codes + * TODO Blank/empty URL error? + *******************************************************/ +int htdig_index_urls(void) +{ + + char * temp_URL_list = NULL; + char * temp_url = NULL; + + // Create the Retriever object which we will use to parse all the + // HTML files. + // In case this is just an update dig, we will add all existing + // URLs? + // + Retriever retriever (Retriever_logUrl); + if (minimalFile.length () == 0) + { + List *list = docs.URLs (); + retriever.Initial (*list); + delete list; + + // Add start_url to the initial list of the retriever. + // Don't check a URL twice! 
+ // Beware order is important, if this bugs you could change + // previous line retriever.Initial(*list, 0) to Initial(*list,1) + retriever.Initial (config->Find ("start_url"), 1); + } + + // Handle list of URLs given on 'command-line' + if (myURL != NULL) + { + String str; + temp_URL_list = strdup(myURL); + temp_url = strtok(temp_URL_list, URL_SEPCHARS); + while (temp_url != NULL) + { + str = temp_url; + str.chop ("\r\n"); + if (str.length () > 0) + retriever.Initial (str, 1); + + temp_url = strtok(NULL, URL_SEPCHARS); + } + free(temp_URL_list); + } + else if (minimalFile.length () != 0) + { + FILE *input = fopen (minimalFile.get (), "r"); + char buffer[1000]; + + if (input) + { + while (fgets (buffer, sizeof (buffer), input)) + { + String str (buffer); + str.chop ("\r\n\t "); + if (str.length () > 0) + retriever.Initial (str, 1); + } + fclose (input); + } + } + + // + // Go do it! + // + retriever.Start (); + + // + // All done with parsing. + // + + // + // If the user so wants, create a text version of the document database. + // + + if (create_text_database) + { + const String doc_list = config->Find ("doc_list"); + if (initial) + unlink (doc_list); + docs.DumpDB (doc_list); + const String word_dump = config->Find ("word_dump"); + if (initial) + unlink (word_dump); + HtWordList words (*config); + if (words.Open (config->Find ("word_db"), O_RDONLY) == OK) + { + words.Dump (word_dump); + } + } + + // + // Cleanup + // + if (images_seen) + fclose (images_seen); + + // + // If needed, report some statistics + // + if (report_statistics) + { + retriever.ReportStatistics ("htdig"); + } + + return(TRUE); +} + + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_close(...) 
+ * + * Closes the database and destroys various objects + * + * TODO Examine external function calls for error return + * codes + * + *******************************************************/ +int htdig_index_close(void) +{ + int ret = -1; + + if(htdig_index_open_flag == TRUE) + { + //delete a_basicdoc; + //delete Indexer; + + Indexer->FlushWordDB(); + + if (_cookie_jar) + delete _cookie_jar; + + //if (max_hops != NULL) + // free(max_hops); + + if (myURL != NULL) + free(myURL); + + //call destructors here + docs.~DocumentDB(); + //config->~HtConfiguration(); + + if (debug != 0) + { + ret = logClose(); + + if (ret == FALSE) + { + reportError (form ("[HTDIG] Error closing log file . Error:[%d], %s\n", + errno, strerror(errno)) ); + return(HTDIG_ERROR_LOGFILE_CLOSE); + } + } + + /* + if(config) { + WordContext::Finish(); + } + */ + + if (wc) + delete wc; + + if (urls_seen) + fclose (urls_seen); + + htdig_index_open_flag = FALSE; + } + + return(TRUE); +} + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_reset(...) + * + * + * TODO Examine external function calls for error return + * codes + * + *******************************************************/ + +int htdig_index_reset(void) +{ + Indexer->FlushWordDB(); + a_basicdoc->Reset(); + + return(TRUE); +} + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_get_max_head_length(...) + * + * + * Returns size of maximum document storage length + * for db.excerpts [htdig.conf:max_head_length] + * + * This represents the maximum amount of the document + * That will be available for excerpting. 
+ * + * + *******************************************************/ + +int htdig_get_max_head_length() +{ + int ret = -1; + + if(config != NULL) + ret = config->Value("max_head_length"); + + return(ret); +} + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_test_url(...) + * + * + * Test a URL for filter Pass/Fail + * + * Pass = return(TRUE) + * Fail = return(XXX) [Negative Value] + * + * + * + * + * + *******************************************************/ + + +//int htdig_index_test_url(htdig_parameters_struct *htdig_parms) +int htdig_index_test_url(htdig_parameters_struct *htdig_parms) +{ + //int ret = FALSE; + String the_URL(htdig_parms->URL); + HtConfiguration* config= HtConfiguration::config(); + Dictionary invalids; + Dictionary valids; + URL aUrl(the_URL); + String rewritten_url(the_URL); + StringList tmpList; + HtRegex limitTo; + HtRegex excludeFrom; + + //initalize outgoing-parameter rewritten_URL + htdig_parms->rewritten_URL[0] = 0; + +#ifdef DEBUG + //output relevant config variables + cout << " bad_extensions = " << config->Find("bad_extensions") << endl; + cout << " valid_extensions = " << config->Find("valid_extensions") << endl; + cout << " exclude_urls = " << config->Find("exclude_urls") << endl; + cout << " bad_querystr = " << config->Find("bad_querystr") << endl; + cout << " limit_urls_to = " << config->Find("limit_urls_to") << endl; + cout << " limit_normalized = " << config->Find("limit_normalized") << endl; + cout << " restrict = " << config->Find("restrict") << endl; + cout << " exclude = " << config->Find("exclude") << endl; +#endif + + //------------ read the config file if it is given --------------- + if (htdig_parms->configFile[0] != 0) + configFile = htdig_parms->configFile; + + config = HtConfiguration::config (); + + config->Defaults (&defaults[0]); + if (access ((char *) configFile, R_OK) < 0) + { + reportError (form ("[HTDIG] Unable to find configuration file '%s'", + 
configFile.get ())); + return(HTDIG_ERROR_CONFIG_READ); + } + config->Read (configFile); + + //---------- Now override config settings ----------------- + + //------- override database path ------------ + if(strlen(htdig_parms->DBpath) > 0) + { + config->Add("database_dir", htdig_parms->DBpath); + } + + //------- custom filters from htdig_parms ---------- + + if(strlen(htdig_parms->locale) > 0) + { + config->Add("locale", htdig_parms->locale); + } + + if (config->Find ("locale").empty () && debug > 0) + logEntry("Warning: unknown locale!\n"); + + if (strlen(htdig_parms->max_hops) > 0) + { + config->Add ("max_hop_count", htdig_parms->max_hops); + } + + if(strlen(htdig_parms->limit_urls_to) > 0) + { + config->Add("limit_urls_to", htdig_parms->limit_urls_to); + } + + if(strlen(htdig_parms->limit_normalized) > 0) + { + config->Add("limit_normalized", htdig_parms->limit_normalized); + } + + if(strlen(htdig_parms->exclude_urls) > 0) + { + config->Add("exclude_urls", htdig_parms->exclude_urls); + } + + if(strlen(htdig_parms->url_rewrite_rules) > 0) + { + config->Add("url_rewrite_rules", htdig_parms->url_rewrite_rules); + } + + if(strlen(htdig_parms->bad_querystr) > 0) + { + config->Add("bad_querystr", htdig_parms->bad_querystr); + } + + if(strlen(htdig_parms->locale) > 0) + { + config->Add("locale", htdig_parms->locale); + } + + if(strlen(htdig_parms->meta_description_factor) > 0) + { + config->Add("meta_description_factor", htdig_parms->meta_description_factor); + } + + if(strlen(htdig_parms->title_factor) > 0) + { + config->Add("title_factor", htdig_parms->title_factor); + } + + if(strlen(htdig_parms->text_factor) > 0) + { + config->Add("text_factor", htdig_parms->text_factor); + } + + //------------------------------------------------------------------- + +#ifdef DEBUG + //output relevant config variables + cout << " bad_extensions = " << config->Find("bad_extensions") << endl; + cout << " valid_extensions = " << config->Find("valid_extensions") << endl; + cout << " 
exclude_urls = " << config->Find("exclude_urls") << endl; + cout << " bad_querystr = " << config->Find("bad_querystr") << endl; + cout << " limit_urls_to = " << config->Find("limit_urls_to") << endl; + cout << " limit_normalized = " << config->Find("limit_normalized") << endl; + cout << " restrict = " << config->Find("restrict") << endl; + cout << " exclude = " << config->Find("exclude") << endl; +#endif + + + //------ bad_extensions ----------------------------------------------- + //A list of bad extensions, separated by spaces or tabs + + String t = config->Find("bad_extensions"); + String lowerp; + char *p = strtok(t, " \t"); + while (p) + { + // Extensions are case insensitive + lowerp = p; + lowerp.lowercase(); + invalids.Add(lowerp, 0); + p = strtok(0, " \t"); + } + + + //------ valid_extensions ------------------------------------------------ + // Valid extensions are performed similarly + // A list of valid extensions, separated by spaces or tabs + + t = config->Find("valid_extensions"); + p = strtok(t, " \t"); + while (p) + { + // Extensions are case insensitive + lowerp = p; + lowerp.lowercase(); + valids.Add(lowerp, 0); + p = strtok(0, " \t"); + } + + //----- rewrite the URL------------------------------------------ + aUrl.rewrite(); + rewritten_url = aUrl.get(); + + if(rewritten_url.length() <= 0) + { + //Rejected: empty rewritten URL + String temp = config->Find("url_rewrite_rules"); + strcpy(htdig_parms->rewritten_URL, temp.get()); + system(form("echo \"%s\" > /tmp/neal", temp.get())); + + return(HTDIG_ERROR_TESTURL_REWRITE_EMPTY); + } + + //cout << form("TestURL: org=[%s]\n", the_URL.get()); + //cout << form(" rewritten[%s]\n", rewritten_url.get()); + + //copy the rewritten URL for outgoing parm pass + strcpy(htdig_parms->rewritten_URL, rewritten_url.get()); + + //---- exclude_urls --------------------------------------------- + // If the URL contains any of the patterns in the exclude list, + // mark it as invalid + + 
/*if(strlen(htdig_parms->exclude_urls) > 0) + tmpList.Create(htdig_parms->exclude_urls," \t"); + else*/ + tmpList.Create(config->Find("exclude_urls")," \t"); + + HtRegexList excludes; + excludes.setEscaped(tmpList, config->Boolean("case_sensitive")); + if (excludes.match(rewritten_url, 0, 0) != 0) + { + //Rejected: item in exclude list + return(HTDIG_ERROR_TESTURL_EXCLUDE); + } + + //---- bad_querystr ------------------------------------------- + // If the URL has a query string and it is in the bad query list + // mark it as invalid + + tmpList.Destroy(); + + /*if(strlen(htdig_parms->bad_querystr) > 0) + tmpList.Create(htdig_parms->bad_querystr, " \t"); + else*/ + tmpList.Create(config->Find("bad_querystr"), " \t"); + + HtRegexList badquerystr; + badquerystr.setEscaped(tmpList, config->Boolean("case_sensitive")); + char *ext = strrchr((char*)rewritten_url, '?'); + if (ext && badquerystr.match(ext, 0, 0) != 0) + { + //if (debug > 2) + // cout << endl << " Rejected: item in bad query list "; + return(HTDIG_ERROR_TESTURL_BADQUERY); + } + + //------ invalid_extensions #2 ------ + // See if the file extension is in the list of invalid ones + + ext = strrchr((char*)rewritten_url, '.'); + String lowerext; + if (ext && strchr(ext,'/')) // Ignore a dot if it's not in the + ext = NULL; // final component of the path. + if(ext) + { + lowerext.set(ext); + int parm = lowerext.indexOf('?'); // chop off URL parameter + if (parm >= 0) + lowerext.chop(lowerext.length() - parm); + lowerext.lowercase(); + if (invalids.Exists(lowerext)) + { + //Rejected: Extension is invalid! + return(HTDIG_ERROR_TESTURL_EXTENSION); + } + } + + //------ valid_extensions #2 ------ + // Or NOT in the list of valid ones + + if (ext && valids.Count() > 0 && !valids.Exists(lowerext)) + { + //Rejected: Extension is not valid! 
+ return(HTDIG_ERROR_TESTURL_EXTENSION2); + } + + //----- limit_urls_to & limit_normalized ------------------------------ + // Set up the limits list + + StringList l; + /*if(strlen(htdig_parms->limit_urls_to) > 0) + l.Create(htdig_parms->limit_urls_to, " \t"); + else*/ + l.Create(config->Find ("limit_urls_to"), " \t"); + + limits.setEscaped (l, config->Boolean ("case_sensitive")); + + l.Destroy (); + + /*if(strlen(htdig_parms->limit_normalized) > 0) + l.Create (htdig_parms->limit_normalized, " \t"); + else*/ + l.Create (config->Find ("limit_normalized"), " \t"); + + limitsn.setEscaped (l, config->Boolean ("case_sensitive")); + l.Destroy (); + + // If any of the limits are met, we allow the URL + if (limits.match(rewritten_url, 1, 0) == 0) + { + //Rejected: URL not in the limits!; + return(HTDIG_ERROR_TESTURL_LIMITS); + } + + + // or not in list of normalized urls + // Warning! should be last in checks because of aUrl normalization + aUrl.normalize(); + if (limitsn.match(rewritten_url.get(), 1, 0) == 0) + { + //Rejected: not in "limit_normalized" list! 
+ return(HTDIG_ERROR_TESTURL_LIMITSNORM); + } + + //----- restrict & exclude ---------------------------------- + //Search-Time Filters + + String temp; + + /*if(strlen(htdig_parms->search_restrict) > 0) + temp = htdig_parms->search_restrict; + else*/ + temp = config->Find("restrict"); + + if (temp.length()) + { + // Create a temporary list from either the configuration + // file or the input parameter + StringList l(temp, " \t\r\n\001|"); + limitTo.setEscaped(l); + } + + /*if(strlen(htdig_parms->search_exclude) > 0) + temp = htdig_parms->search_exclude; + else*/ + temp = config->Find("exclude"); + + if (temp.length()) + { + // Create a temporary list from either the configuration + // file or the input parameter + StringList l(temp, " \t\r\n\001|"); + excludeFrom.setEscaped(l); + } + + //Restrict Test + if (limitTo.match(rewritten_url, 1, 0) == 0) + { + //Rejected URL Not in SearchTime Restrict List + return(HTDIG_ERROR_TESTURL_SRCH_RESTRICT); + } + //Exclude Test + if (excludeFrom.match(rewritten_url, 0, 0) != 0) + { + //Rejected URL in SearchTime Exclude List + return(HTDIG_ERROR_TESTURL_SRCH_EXCLUDE); + } + + + //Success! + return TRUE; +} diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htfuzzy.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htfuzzy.cc new file mode 100644 index 00000000..f7597c8e --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htfuzzy.cc @@ -0,0 +1,265 @@ +//---------------------------------------------------------------- +// +// libhtdig_htfuzzy.cc +// +// 1/25/2002 created from htfuzzy.cc +// +// Neal Richter nealr@rightnow.com +// +// libhtdig_htfuzzy.cc +// +// htfuzzy: Create one or more ``fuzzy'' indexes into the main word database. +// These indexes can be used by htsearch to perform a search that uses +// other algorithms than exact word match. +// +// This program is meant to be run after htmerge has created the word +// database. +// +// For each fuzzy algorithm, there will be a separate database. 
Each +// database is simply a mapping from the fuzzy key to a list of words +// in the main word database. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: libhtdig_htfuzzy.cc,v 1.5 2004/05/28 13:15:29 lha Exp $ +// +//---------------------------------------------------------------- + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +extern "C" +{ +#include "libhtdig_api.h" +} + +#include "libhtdig_log.h" + + +//#include "htfuzzy.h" //NOT USED + +#include "Fuzzy.h" +#include "Accents.h" +#include "Soundex.h" +#include "Endings.h" +#include "Metaphone.h" +#include "Synonym.h" +#include "htString.h" +#include "List.h" +#include "Dictionary.h" +#include "defaults.h" +#include "HtWordList.h" +#include "WordContext.h" + +// If we have this, we probably want it. +#ifdef HAVE_GETOPT_H +#include <getopt.h> +#endif + +#include "HtConfiguration.h" +#include "HtWordList.h" + +#include <stdlib.h> + +#ifndef _WIN32 +#include <unistd.h> +#endif + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <stdio.h> + + + +extern int debug; + +static HtConfiguration * config = NULL; + + +//***************************************************************************** +// int main(int ac, char **av) +// +//int main(int ac, char **av) + +int htfuzzy_index(htfuzzy_parameters_struct * htfuzzy_parms) +{ + String configFile = DEFAULT_CONFIG_FILE; + int ret = 0; + + // + // Parse command line arguments + // + + debug = htfuzzy_parms->debug; + if (debug != 0) + { + ret = logOpen(htfuzzy_parms->logFile); + + if (ret == FALSE) + { + fprintf(stderr, "htdig: Error opening file [%s]. 
Error:[%d], %s\n", + htfuzzy_parms->logFile, errno, strerror(errno)); + } + } + + + configFile = htfuzzy_parms->configFile; + + config = HtConfiguration::config(); + + // + // Determine what algorithms to use + // + List wordAlgorithms; + List noWordAlgorithms; + + if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_SOUNDEX) + { + wordAlgorithms.Add(new Soundex(*config)); + } + else if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_METAPHONE) + { + wordAlgorithms.Add(new Metaphone(*config)); + } + else if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_ACCENTS) + { + wordAlgorithms.Add(new Accents(*config)); + } + else if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_ENDINGS) + { + noWordAlgorithms.Add(new Endings(*config)); + } + else if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_SYNONYMS) + { + noWordAlgorithms.Add(new Synonym(*config)); + } + + + if (wordAlgorithms.Count() == 0 && noWordAlgorithms.Count() == 0) + { + logEntry(form("htfuzzy: No algorithms specified\n")); + } + + // + // Find and parse the configuration file. + // + config->Defaults(&defaults[0]); + if (access((char *) configFile, R_OK) < 0) + { + reportError(form("[HTFUZZY] Unable to find configuration file '%s'", configFile.get())); + } + config->Read(configFile); + + // Initialize htword library (key description + wordtype...) + WordContext::Initialize(*config); + + Fuzzy *fuzzy; + if (wordAlgorithms.Count() > 0) + { + // + // Open the word database so that we can grab the words from it. 
+ // + HtWordList worddb(*config); + if (worddb.Open(config->Find("word_db"), O_RDONLY) == OK) + { + // + // Go through all the words in the database + // + List *words = worddb.Words(); + String *key; + Fuzzy *fuzzy = 0; + String word, fuzzyKey; + int count = 0; + + words->Start_Get(); + while ((key = (String *) words->Get_Next())) + { + word = *key; + wordAlgorithms.Start_Get(); + while ((fuzzy = (Fuzzy *) wordAlgorithms.Get_Next())) + { + fuzzy->addWord(word); + } + count++; + if ((count % 100) == 0 && debug) + { + //cout << "htfuzzy: words: " << count << '\n'; + } + } + if (debug) + { + logEntry(form("htfuzzy: total words: %d\n", count)); + logEntry(form("htfuzzy: Writing index files...\n")); + } + + // + // All the information is now in memory. + // Write all of it out to the individual databases + // + wordAlgorithms.Start_Get(); + while ((fuzzy = (Fuzzy *) wordAlgorithms.Get_Next())) + { + fuzzy->writeDB(); + } + worddb.Close(); + words->Destroy(); + delete words; + if (fuzzy) + delete fuzzy; + } + else + { + reportError(form("[htfuzzy] Unable to open word database %s", config->Find("word_db").get())); + } + } + if (noWordAlgorithms.Count() > 0) + { + noWordAlgorithms.Start_Get(); + while ((fuzzy = (Fuzzy *) noWordAlgorithms.Get_Next())) + { + if (debug) + { + logEntry(form( "htfuzzy: Selected algorithm: %s\n", fuzzy->getName())); + } + if (fuzzy->createDB(*config) == NOTOK) + { + logEntry(form("htfuzzy: Could not create database for algorithm: %s\n", fuzzy->getName())); + } + } + } + + if (debug) + { + logEntry("htfuzzy: Done.\n"); + } + + if (debug != 0) + { + ret = logClose(); + + if (ret == FALSE) + { + fprintf(stderr, "htfuzzy: Error closing file [%s]. 
Error:[%d], %s\n", + htfuzzy_parms->logFile, errno, strerror(errno)); + } + } + + + delete config; + + return 0; +} + + diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc new file mode 100644 index 00000000..988a8b61 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc @@ -0,0 +1,407 @@ +//---------------------------------------------------------------- +// +// libhtdig_htmerge.cc +// +// 1/25/2002 created from htmerge.cc +// +// Neal Richter nealr@rightnow.com +// +// libhtdig_htmerge.cc +// +// htmerge: Merges two databases and/or updates databases to remove +// old documents and ensures the databases are consistent. +// Calls db.cc, docs.cc, and/or words.cc as necessary +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: libhtdig_htmerge.cc,v 1.5 2004/05/28 13:15:29 lha Exp $ +// +//---------------------------------------------------------------- + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +extern "C" { +#include "libhtdig_api.h" +} + +#include "libhtdig_log.h" + +#include "WordContext.h" +#include "good_strtok.h" +#include "defaults.h" +#include "DocumentDB.h" +#include "HtURLCodec.h" +#include "HtWordList.h" +#include "HtWordReference.h" +#include "htString.h" + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <stdio.h> + +#ifndef _WIN32 +#include <unistd.h> +#endif + +#include <stdlib.h> +#include <ctype.h> +#include <string.h> + +// If we have this, we probably want it. 
+//#ifdef HAVE_GETOPT_H +//#include <getopt.h> +//#endif + + + + + +//Global Variables for this file + +// This hash is used to keep track of all the document IDs which have to be +// discarded. +// This is generated from the doc database and is used to prune words +// from the word db +static Dictionary discard_list; + +// This config is used for merging multiple databses +static HtConfiguration merge_config; +static HtConfiguration *config = NULL; + +static int verbose = 0; +//static int stats = 0; +static int alt_work_area = 0; + +//static String configFile = DEFAULT_CONFIG_FILE; +extern String configFile; + +static String merge_configFile = 0; + + +// Component procedures +static int mergeDB (); + +int htmerge_index_merge(htmerge_parameters_struct *htmerge_parms) +{ + int ret = -1; + int merge_ret = -1; + + //load htmerge 'command-line parameters' + configFile = htmerge_parms->configFile; + merge_configFile = htmerge_parms->merge_configFile; + verbose = htmerge_parms->debug; + if(verbose != 0) + { + ret = logOpen(htmerge_parms->logFile); + + if(ret == FALSE) + { + reportError (form ("[HTDIG] Error opening log file [%s] . Error:[%d], %s\n", + htmerge_parms->logFile, errno, strerror(errno)) ); + return(HTMERGE_ERROR_LOGFILE_OPEN); + } + } + + alt_work_area = htmerge_parms->alt_work_area; + + + + config = HtConfiguration::config (); + config->Defaults (&defaults[0]); + + if (access ((char *) configFile, R_OK) < 0) + { + reportError (form ("[HTMERGE] Unable to find configuration file '%s'", + configFile.get ())); + return(HTMERGE_ERROR_CONFIG_READ); + } + + config->Read (configFile); + + // + // Check url_part_aliases and common_url_parts for + // errors. 
+ String url_part_errors = HtURLCodec::instance ()->ErrMsg (); + + if (url_part_errors.length () != 0) + { + reportError (form("[HTMERGE] Invalid url_part_aliases or common_url_parts: %s", + url_part_errors.get ())); + return(HTMERGE_ERROR_URL_PART); + } + + if (merge_configFile.length ()) + { + merge_config.Defaults (&defaults[0]); + if (access ((char *) merge_configFile, R_OK) < 0) + { + reportError (form ("[HTMERGE] Unable to find configuration file '%s'", + merge_configFile.get ())); + return(HTMERGE_ERROR_CONFIG_READ); + } + merge_config.Read (merge_configFile); + } + + if (alt_work_area != 0) + { + String configValue; + + configValue = config->Find ("word_db"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("word_db", configValue); + } + + configValue = config->Find ("doc_db"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("doc_db", configValue); + } + + configValue = config->Find ("doc_index"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("doc_index", configValue); + } + + configValue = config->Find ("doc_excerpt"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("doc_excerpt", configValue); + } + } + + WordContext::Initialize(*config); + + if (merge_configFile.length()) + { + // Merge the databases specified in merge_configFile into the current + // databases. Do this first then update the other databases as usual + // Note: We don't have to specify anything, it's all in the config vars + + merge_ret = mergeDB(); + } + + //call destructors here + config->~HtConfiguration(); + merge_config.~HtConfiguration(); + + if (verbose != 0) + { + ret = logClose(); + + if (ret == FALSE) + { + reportError (form("[HTMERGE]: Error closing file [%s]. 
Error:[%d], %s\n", + htmerge_parms->logFile, errno, strerror(errno)) ); + return(HTMERGE_ERROR_LOGFILE_CLOSE); + } + } + + return(TRUE); +} + +//***************************************************************************** +// void mergeDB() +// +static int mergeDB () +{ + HtConfiguration *config = HtConfiguration::config (); + DocumentDB merge_db, db; + List *urls; + Dictionary merge_dup_ids, db_dup_ids; // Lists of DocIds to ignore + int docIDOffset; + + const String doc_index = config->Find ("doc_index"); + if (access (doc_index, R_OK) < 0) + { + reportError (form + ("[HTMERGE] Unable to open document index '%s'", + (const char *) doc_index)); + return(HTMERGE_ERROR_DOCINDEX_READ); + } + const String doc_excerpt = config->Find ("doc_excerpt"); + if (access (doc_excerpt, R_OK) < 0) + { + reportError (form + ("[HTMERGE] Unable to open document excerpts '%s'", + (const char *) doc_excerpt)); + return(HTMERGE_ERROR_EXCERPTDB_READ); + } + const String doc_db = config->Find ("doc_db"); + if (db.Open (doc_db, doc_index, doc_excerpt) < 0) + { + reportError (form ("[HTMERGE] Unable to open/create document database '%s'", + (const char *) doc_db)); + return(HTMERGE_ERROR_DOCDB_READ); + } + + + const String merge_doc_index = merge_config["doc_index"]; + if (access (merge_doc_index, R_OK) < 0) + { + reportError (form + ("[HTMERGE] Unable to open document index '%s'", + (const char *) merge_doc_index)); + return(HTMERGE_ERROR_DOCINDEX_READ); + } + const String merge_doc_excerpt = merge_config["doc_excerpt"]; + if (access (merge_doc_excerpt, R_OK) < 0) + { + reportError (form + ("[HTMERGE] Unable to open document excerpts '%s'", + (const char *) merge_doc_excerpt)); + return(HTMERGE_ERROR_EXCERPTDB_READ); + } + const String merge_doc_db = merge_config["doc_db"]; + if (merge_db.Open (merge_doc_db, merge_doc_index, merge_doc_excerpt) < 0) + { + reportError (form ("[HTMERGE] Unable to open document database '%s'", + (const char *) merge_doc_db)); + 
return(HTMERGE_ERROR_DOCDB_READ); + } + + // Start the merging by going through all the URLs that are in + // the database to be merged + + urls = merge_db.URLs (); + // This ensures that every document added from merge_db has a unique ID + // in the new database + docIDOffset = db.NextDocID (); + + urls->Start_Get (); + String *url; + String id; + while ((url = (String *) urls->Get_Next ())) + { + DocumentRef *ref = merge_db[url->get ()]; + DocumentRef *old_ref = db[url->get ()]; + if (!ref) + continue; + + if (old_ref) + { + // Oh well, we knew this would happen. Let's get the duplicate + // And we'll only use the most recent date. + + if (old_ref->DocTime () >= ref->DocTime ()) + { + // Cool, the ref we're merging is too old, just ignore it + char str[20]; + sprintf (str, "%d", ref->DocID ()); + merge_dup_ids.Add (str, 0); + + if (verbose > 1) + { + logEntry(form("[HTMERGE] Duplicate, URL: {%s} ignoring & merging copy\n", url)); + } + } + else + { + // The ref we're merging is newer, delete the old one and add + char str[20]; + sprintf (str, "%d", old_ref->DocID ()); + db_dup_ids.Add (str, 0); + db.Delete (old_ref->DocID ()); + ref->DocID (ref->DocID () + docIDOffset); + db.Add (*ref); + if (verbose > 1) + { + logEntry(form("[HTMERGE] Duplicate, URL: {%s} ignoring destination copy\n",url->get())); + } + } + } + else + { + // It's a new URL, just add it, making sure to load the excerpt + merge_db.ReadExcerpt (*ref); + ref->DocID (ref->DocID () + docIDOffset); + db.Add (*ref); + if (verbose > 1) + { + logEntry(form("[HTMERGE] Merged URL: {%s} \n",url->get())); + } + } + delete ref; + delete old_ref; + } + delete urls; + + // As reported by Roman Dimov, we must update db.NextDocID() + // because of all the added records... 
+ db.IncNextDocID (merge_db.NextDocID ()); + merge_db.Close (); + db.Close (); + + // OK, after merging the doc DBs, we do the same for the words + HtWordList mergeWordDB (*config), wordDB (*config); + List *words; + String docIDKey; + + if (wordDB.Open (config->Find ("word_db"), O_RDWR) < 0) + { + reportError (form ("[HTMERGE] Unable to open/create word database '%s'", + (const char *) config->Find ("word_db"))); + return(HTMERGE_ERROR_WORDDB_READ); + } + + if (mergeWordDB.Open (merge_config["word_db"], O_RDONLY) < 0) + { + reportError (form ("[HTMERGE] Unable to open word database '%s'", + (const char *) merge_config["word_db"])); + return(HTMERGE_ERROR_WORDDB_READ); + } + + // Start the merging by going through all the URLs that are in + // the database to be merged + + words = mergeWordDB.WordRefs (); + + words->Start_Get (); + HtWordReference *word; + while ((word = (HtWordReference *) words->Get_Next ())) + { + docIDKey = word->DocID (); + if (merge_dup_ids.Exists (docIDKey)) + continue; + + word->DocID (word->DocID () + docIDOffset); + wordDB.Override (*word); + } + delete words; + + words = wordDB.WordRefs (); + words->Start_Get (); + while ((word = (HtWordReference *) words->Get_Next ())) + { + docIDKey = word->DocID (); + if (db_dup_ids.Exists (docIDKey)) + wordDB.Delete (*word); + } + delete words; + + // Cleanup--just close the two word databases + mergeWordDB.Close (); + wordDB.Close (); + + return(TRUE); + +} + diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc new file mode 100644 index 00000000..472b5fc2 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc @@ -0,0 +1,1099 @@ +//---------------------------------------------------------------- +// +// libhtdig_htsearch.cc +// +// 1/25/2002 created from htsearch.cc +// +// Neal Richter nealr@rightnow.com +// +// +// htsearch: The main search CGI. 
Parses the CGI input, reads the config files +// and calls the necessary code to put together the result lists +// and the final display. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: libhtdig_htsearch.cc,v 1.4 2004/05/28 13:15:29 lha Exp $ +// +//---------------------------------------------------------------- + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +extern "C" +{ +#include "libhtdig_api.h" +} + +#include "libhtdig_log.h" + + +#include "htsearch.h" +#include "defaults.h" +#include "WeightWord.h" +#include "parser.h" +#include "ResultFetch.h" +#include "../htfuzzy/Fuzzy.h" +#include "cgi.h" +#include "WordRecord.h" +#include "HtWordList.h" +#include "StringList.h" +#include "IntObject.h" +#include "HtURLCodec.h" +#include "HtURLRewriter.h" +#include "WordContext.h" +#include "HtRegex.h" +#include "Collection.h" + +//define _XOPEN_SOURCE +//#define _GNU_SOURCE +#include <time.h> +#include <ctype.h> +#include <signal.h> + +#ifndef _WIN32 +#include <unistd.h> +#endif + + +// If we have this, we probably want it. 
+#ifdef HAVE_GETOPT_H +#include <getopt.h> +#endif + +typedef void (*SIGNAL_HANDLER) (...); + +// ResultList *htsearch(const String&, List &, Parser *); +int htsearch(Collection *, List &, Parser *); + +void setupWords(char *, List &, int, Parser *, String &); +void createLogicalWords(List &, String &, String &); +void reportError(char *); +void convertToBoolean(List & words); +void doFuzzy(WeightWord *, List &, List &); +void addRequiredWords(List &, StringList &); + +int minimum_word_length = 3; + +StringList boolean_keywords; + +Parser *parser = NULL; + +extern String configFile; +extern int debug; + +static HtConfiguration *config = NULL; +Dictionary selected_collections; // Multiple database support +Collection *collection = NULL; +String errorMsg; + +String originalWords; +String origPattern; +String logicalWords; +String logicalPattern; +StringMatch *searchWordsPattern = NULL; +StringList requiredWords; //TODO add this + +HtRegex limit_to; +HtRegex exclude_these; + +// List searchWords; +List *searchWords = NULL; + +StringList collectionList; // List of databases to search on + + +static int total_matches = 0; +static List *matches_list = 0; +static ResultFetch *resultfetch = 0; + + +//***************************************************************************** +// int main() +// +//int main(int ac, char **av) +int htsearch_open(htsearch_parameters_struct * htsearch_parms) +{ + int ret = -1; + int override_config = 0; + + String logicalWords; + String logicalPattern; + // StringMatch searchWordsPattern; + StringMatch *searchWordsPattern = NULL; + StringList requiredWords; + //int i; + //int c; + int cInd = 0; + + //load 'comand-line' parameters + + if (htsearch_parms->configFile[0] != 0) + configFile = htsearch_parms->configFile; + + debug = htsearch_parms->debug; + if (debug != 0) + { + ret = logOpen(htsearch_parms->logFile); + + if (ret == FALSE) + { + reportError(form("[HTDIG] Error opening log file [%s] . 
Error:[%d], %s\n", + htsearch_parms->logFile, errno, strerror(errno))); + return (HTSEARCH_ERROR_LOGFILE_OPEN); + } + } + + + //case 'c': + // The default is obviously to do this securely + // but if people want to shoot themselves in the foot... + // configFile = optarg; + // override_config = 1; + + // + // The total search can NEVER take more than 5 minutes. + // + //alarm(5 * 60); + + errorMsg = ""; + + config = HtConfiguration::config(); + + // Each collection is handled in an iteration. Reset the following so + // that we start with a clean slate. + // + logicalWords = 0; + origPattern = 0; + logicalPattern = 0; + searchWords = new List; + searchWordsPattern = new StringMatch; + + char *config_name = collectionList[cInd]; + if (config_name && config_name[0] == '\0') + config_name = NULL; // use default config + + // + // Setup the configuration database. First we read the compiled defaults. + // Then we override those with defaults read in from the configuration + // file, and finally we override some attributes with information we + // got from the HTML form. + // + config->Defaults(&defaults[0]); + // To allow . in filename while still being 'secure', + // e.g. 
htdig-f.q.d.n.conf + if (!override_config && config_name && (strstr(config_name, "./") == NULL)) + { + char *configDir = getenv("CONFIG_DIR"); + if (configDir) + { + configFile = configDir; + } + else + { + configFile = CONFIG_DIR; + } + if (strlen(config_name) == 0) + configFile = DEFAULT_CONFIG_FILE; + else + configFile << '/' << config_name << ".conf"; + } + if (access((char *) configFile, R_OK) < 0) + { + reportError(form("Unable to read configuration file '%s'", configFile.get())); + return (HTSEARCH_ERROR_CONFIG_READ); + } + config->Read(configFile); + + + //---------- Now override config settings ----------------- + + //------- override database path ------------ + if (strlen(htsearch_parms->DBpath) > 0) + { + config->Add("database_dir", htsearch_parms->DBpath); + } + + //------- custom filters from htsearch_parms ---------- + + //resrict,exclude,urlrewrite + + + if (strlen(htsearch_parms->meta_description_factor) > 0) + { + config->Add("meta_description_factor", htsearch_parms->meta_description_factor); + } + + if (strlen(htsearch_parms->title_factor) > 0) + { + config->Add("title_factor", htsearch_parms->title_factor); + } + + if (strlen(htsearch_parms->text_factor) > 0) + { + config->Add("text_factor", htsearch_parms->text_factor); + } + + if(strlen(htsearch_parms->locale) > 0) + { + config->Add("locale", htsearch_parms->locale); + } + + //------------------------------------------------------------------- + + + // Initialize htword library (key description + wordtype...) 
+ WordContext::Initialize(*config); + +//NON-CGI Usage libhtdig +/* + + config->Add("match_method", input["method"]); + config->Add("template_name", input["format"]); + + // minimum check for a valid int value of "matchesperpage" cgi variable + if (atoi(input["matchesperpage"]) > 0) + config->Add("matches_per_page", input["matchesperpage"]); + + pageNumber = atoi(input["page"]); + config->Add("config", input["config"]); + config->Add("restrict", input["restrict"]); + config->Add("exclude", input["exclude"]); + config->Add("keywords", input["keywords"]); + requiredWords.Create(config->Find("keywords"), " \t\r\n\001"); + config->Add("sort", input["sort"]); + + config->Add("startmonth", input["startmonth"]); + config->Add("startday", input["startday"]); + config->Add("startyear", input["startyear"]); + + config->Add("endmonth", input["endmonth"]); + config->Add("endday", input["endday"]); + config->Add("endyear", input["endyear"]); + + + StringList form_vars(config->Find("allow_in_form"), " \t\r\n"); + for (i = 0; i < form_vars.Count(); i++) + { + if (input.exists(form_vars[i])) + config->Add(form_vars[i], input[form_vars[i]]); + } + +*/ +//NON-CGI Usage libhtdig + + + minimum_word_length = config->Value("minimum_word_length", minimum_word_length); + + // + // Compile the URL limit patterns. 
+ // + + if (config->Find("restrict").length()) + { + // Create a temporary list from either the configuration + // file or the input parameter + StringList l(config->Find("restrict"), " \t\r\n\001|"); + limit_to.setEscaped(l); + String u = l.Join('|'); + config->Add("restrict", u); // re-create the config attribute + } + if (config->Find("exclude").length()) + { + // Create a temporary list from either the configuration + // file or the input parameter + StringList l(config->Find("exclude"), " \t\r\n\001|"); + exclude_these.setEscaped(l); + String u = l.Join('|'); + config->Add("exclude", u); // re-create the config attribute + } + + // + // Check url_part_aliases and common_url_parts for + // errors. + String url_part_errors = HtURLCodec::instance()->ErrMsg(); + + if (url_part_errors.length() != 0) + { + reportError(form("Invalid url_part_aliases or common_url_parts: %s", url_part_errors.get())); + return (HTSEARCH_ERROR_URL_PART); + + } + + // for htsearch, use search_rewrite_rules attribute for HtURLRewriter. 
+ config->AddParsed("url_rewrite_rules", "${search_rewrite_rules}"); + url_part_errors = HtURLRewriter::instance()->ErrMsg(); + if (url_part_errors.length() != 0) + reportError(form("Invalid url_rewrite_rules: %s", url_part_errors.get())); + + // Load boolean_keywords from configuration + // they should be placed in this order: + // 0 1 2 + // and or not + boolean_keywords.Create(config->Find("boolean_keywords"), "| \t\r\n\001"); + if (boolean_keywords.Count() != 3) + reportError("boolean_keywords attribute should have three entries"); + + + + parser = new Parser(); + + return (TRUE); +} + +//--------------------------------------------------------------------------------------- +// +// +// RETURN: Number of Documents resulted from search +// +//--------------------------------------------------------------------------------------- + +int htsearch_query(htsearch_query_struct * htseach_query) +{ + int total_match_count = 0; + + originalWords = htseach_query->raw_query; + originalWords.chop(" \t\r\n"); + + //sort + switch (htseach_query->sortby_flag) + { + case HTSEARCH_SORT_SCORE: + config->Add("sort", "score"); + break; + case HTSEARCH_SORT_REV_SCORE: + config->Add("sort", "revscore"); + break; + case HTSEARCH_SORT_TIME: + config->Add("sort", "time"); + break; + case HTSEARCH_SORT_REV_TIME: + config->Add("sort", "revtime"); + break; + case HTSEARCH_SORT_TITLE: + config->Add("sort", "title"); + break; + case HTSEARCH_SORT_REV_TITLE: + config->Add("sort", "revtitle"); + break; + } + + + switch (htseach_query->algorithms_flag) + { + case HTSEARCH_ALG_BOOLEAN: + config->Add("match_method", "boolean"); + break; + case HTSEARCH_ALG_OR: + config->Add("match_method", "or"); + break; + case HTSEARCH_ALG_AND: + config->Add("match_method", "and"); + break; + } + + //format + switch (htseach_query->algorithms_flag) + { + case HTSEARCH_FORMAT_SHORT: + config->Add("template_name", "builtin-short"); + break; + case HTSEARCH_FORMAT_LONG: + config->Add("template_name", 
"builtin-long"); + break; + } + + + origPattern = 0; + logicalWords = 0; + logicalPattern = 0; + searchWordsPattern = new StringMatch; + + // Iterate over all specified collections (databases) + //for (int cInd = 0; errorMsg.empty() && cInd < collectionList.Count(); cInd++) + //{ + + // Parse the words to search for from the argument list. + // This will produce a list of WeightWord objects. + // + setupWords(originalWords, *searchWords, + strcmp(config->Find("match_method"), "boolean") == 0, parser, origPattern); + + // + // Convert the list of WeightWord objects to a pattern string + // that we can compile. + // + createLogicalWords(*searchWords, logicalWords, logicalPattern); + + // + // Assemble the full pattern for excerpt matching and highlighting + // + origPattern += logicalPattern; + searchWordsPattern->IgnoreCase(); + searchWordsPattern->IgnorePunct(); + searchWordsPattern->Pattern(logicalPattern); // this should now be enough + //searchWordsPattern.Pattern(origPattern); + //if (debug > 2) + // cout << "Excerpt pattern: " << origPattern << "\n"; + + // + // If required keywords were given in the search form, we will + // modify the current searchWords list to include the required + // words. + // + if (requiredWords.Count() > 0) + { + addRequiredWords(*searchWords, requiredWords); + } + + // + // Perform the actual search. The function htsearch() is used for this. + // The Dictionary it returns is then passed on to the Display object to + // actually render the results in HTML. 
+ // + const String word_db = config->Find("word_db"); + if (access(word_db, R_OK) < 0) + { + reportError(form("Unable to read word database file '%s'\nDid you run htdig?", word_db.get())); + return (HTSEARCH_ERROR_WORDDB_READ); + } + // ResultList *results = htsearch((char*)word_db, searchWords, parser); + + String doc_index = config->Find("doc_index"); + if (access((char *) doc_index, R_OK) < 0) + { + reportError(form("Unable to read document index file '%s'\nDid you run htdig?", doc_index.get())); + return (HTSEARCH_ERROR_DOCINDEX_READ); + } + + const String doc_db = config->Find("doc_db"); + if (access(doc_db, R_OK) < 0) + { + reportError(form("Unable to read document database file '%s'\nDid you run htdig?", doc_db.get())); + return (HTSEARCH_ERROR_DOCDB_READ); + } + + const String doc_excerpt = config->Find("doc_excerpt"); + if (access(doc_excerpt, R_OK) < 0) + { + reportError(form("Unable to read document excerpts '%s'\nDid you run htdig?", doc_excerpt.get())); + return (HTSEARCH_ERROR_EXCERPTDB_READ); + } + + // Multiple database support + collection = new Collection((char *) configFile, + word_db.get(), doc_index.get(), doc_db.get(), doc_excerpt.get()); + + // Perform search within the collection. Each collection stores its + // own result list. 
+ total_match_count += htsearch(collection, *searchWords, parser); + collection->setSearchWords(searchWords); + collection->setSearchWordsPattern(searchWordsPattern); + selected_collections.Add(configFile, collection); + + if (parser->hadError()) + errorMsg = parser->getErrorMessage(); + + delete parser; + //} + + + total_matches = total_match_count; + + if (total_matches > 0) + { + + resultfetch = new ResultFetch(&selected_collections, collectionList); + + if (resultfetch->hasTemplateError()) + { + reportError(form("Unable to read template file '%s'\nDoes it exist?", + (const char *) config->Find("template_name"))); + + return (HTSEARCH_ERROR_TEMPLATE_ERROR); + } + resultfetch->setOriginalWords(originalWords); + resultfetch->setLimit(&limit_to); + resultfetch->setExclude(&exclude_these); + resultfetch->setLogicalWords(logicalWords); + if (!errorMsg.empty()) + resultfetch->displaySyntaxError(errorMsg); + else + { + + matches_list = resultfetch->fetch(); + + //matches_list->Start_Get(); + + } + + } //if ((total_matches > 0) && (desired_match_index == 0)) + + + return (total_match_count); +} + +//------------------ htsearch_get_nth_match (...) ------------------------------------- +// +// Parameters +// result_desired_index ZERO based results index. 
+// query_result structure to fill with result +// +// htsearch_query_match_struct: +// char title[HTDIG_DOCUMENT_TITLE_L]; +// char URL[HTDIG_MAX_FILENAME_PATH_L]; +// char excerpt[HTDIG_DOCUMENT_EXCERPT_L]; +// int score; +// int match_percent; //top result is 100% +// time_t doc_date; +// int size; +// +//--------------------------------------------------------------------------------------- + +int htsearch_get_nth_match(int desired_match_index, htsearch_query_match_struct * query_result) +{ + + ResultMatch *match = 0; + Dictionary *vars = 0; + + if (total_matches == 0) + { + return (HTSEARCH_ERROR_NO_MATCH); + } + else if (desired_match_index >= total_matches) + { + return (HTSEARCH_ERROR_BAD_MATCH_INDEX); + } + else if ((total_matches > 0) && (desired_match_index < total_matches)) + { + match = (ResultMatch *) matches_list->Nth(desired_match_index); + + // DocumentRef *ref = docDB[match->getID()]; + Collection *collection = match->getCollection(); + DocumentRef *ref = collection->getDocumentRef(match->getID()); + if (!ref || ref->DocState() != Reference_normal) + { + // The document isn't present or shouldn't be displayed + return (HTSEARCH_ERROR_BAD_DOCUMENT); + } + + ref->DocAnchor(match->getAnchor()); + ref->DocScore(match->getScore()); + vars = resultfetch->fetchMatch(match, ref, desired_match_index); + delete ref; + + String *value; + String key; + + key = "NSTARS"; + value = (String *) vars->Find(key); + //cout << key.get() << "[" << value->get() << "]" << endl; + query_result->score = atoi(value->get()); + + key = "PERCENT"; + value = (String *) vars->Find(key); + //cout << key.get() << "[" << value->get() << "]" << endl; + query_result->score_percent = atoi(value->get()); + + key = "TITLE"; + value = (String *) vars->Find(key); + //cout << key.get() << "[" << value->get() << "]" << endl; + snprintf(query_result->title, HTDIG_DOCUMENT_TITLE_L, "%s", value->get()); + + key = "EXCERPT"; + value = (String *) vars->Find(key); + //cout << key.get() << "[" << 
value->get() << "]" << endl; + snprintf(query_result->excerpt, HTDIG_DOCUMENT_EXCERPT_L, "%s", value->get()); + + key = "URL"; + value = (String *) vars->Find(key); + //cout << key.get() << "[" << value->get() << "]" << endl; + snprintf(query_result->URL, HTDIG_MAX_FILENAME_PATH_L, "%s", value->get()); + + String datefmt = config->Find("date_format"); + key = "MODIFIED"; + value = (String *) vars->Find(key); + //cout << key.get() << "[" << value->get() << "]" << endl; + mystrptime(value->get(), datefmt.get(), &(query_result->time_tm)); + //cout << "[" << asctime(&query_result->time_tm) << "]" << endl; + + key = "SIZE"; + value = (String *) vars->Find(key); + //cout << key.get() << "[" << value->get() << "]" << endl; + query_result->size = atoi(value->get()); + + + } + + return (TRUE); +} + +//--------------------------------------------------------------------------------------- +// +// +// RETURN: TRUE or FALSE +// +//--------------------------------------------------------------------------------------- + +int htsearch_close() +{ + + + // delete results; + // delete parser; + + + return (TRUE); + +} + +//***************************************************************************** +void createLogicalWords(List & searchWords, String & logicalWords, String & wm) +{ + String pattern; + int i; + int wasHidden = 0; + int inPhrase = 0; + + for (i = 0; i < searchWords.Count(); i++) + { + WeightWord *ww = (WeightWord *) searchWords[i]; + if (!ww->isHidden) + { + + if (strcmp((char *) ww->word, "&") == 0 && wasHidden == 0) + logicalWords << ' ' << boolean_keywords[AND] << ' '; + else if (strcmp((char *) ww->word, "|") == 0 && wasHidden == 0) + logicalWords << ' ' << boolean_keywords[OR] << ' '; + else if (strcmp((char *) ww->word, "!") == 0 && wasHidden == 0) + logicalWords << ' ' << boolean_keywords[NOT] << ' '; + else if (strcmp((char *) ww->word, "\"") == 0 && wasHidden == 0) + { + if (inPhrase) + logicalWords.chop(' '); + inPhrase = !inPhrase; + logicalWords << "\""; 
+ } + else if (wasHidden == 0) + { + logicalWords << ww->word; + if (inPhrase) + logicalWords << " "; + } + wasHidden = 0; + } + else + wasHidden = 1; + if (ww->weight > 0 // Ignore boolean syntax stuff + && !ww->isIgnore) // Ignore short or bad words + { + if (pattern.length() && !inPhrase) + pattern << '|'; + else if (pattern.length() && inPhrase) + pattern << ' '; + pattern << ww->word; + } + } + wm = pattern; + + if (debug) + { + cerr << "LogicalWords: " << logicalWords << endl; + cerr << "Pattern: " << pattern << endl; + } +} + +void dumpWords(List & words, char *msg = "") +{ + if (debug) + { + cerr << msg << ": '"; + for (int i = 0; i < words.Count(); i++) + { + WeightWord *ww = (WeightWord *) words[i]; + cerr << ww->word << ':' << ww->isHidden << ' '; + } + cerr << "'\n"; + } +} + +//***************************************************************************** +// void setupWords(char *allWords, List &searchWords, +// int boolean, Parser *parser, String &originalPattern) +// +void setupWords(char *allWords, List & searchWords, int boolean, Parser * parser, String & originalPattern) +{ + HtConfiguration *config = HtConfiguration::config(); + List tempWords; + int i; + + // + // Parse the words we need to search for. It should be a list of words + // with optional 'and' and 'or' between them. The list of words + // will be put in the searchWords list and at the same time in the + // String pattern separated with '|'. + // + + // + // Convert the string to a list of WeightWord objects. The special + // characters '(' and ')' will be put into their own WeightWord objects. 
+ // + unsigned char *pos = (unsigned char *) allWords; + unsigned char t; + String word; + const String prefix_suffix = config->Find("prefix_match_character"); + while (*pos) + { + while (1) + { + t = *pos++; + if (isspace(t)) + { + continue; + } + else if (t == '"') + { + tempWords.Add(new WeightWord("\"", -1.0)); + break; + } + else if (boolean && (t == '(' || t == ')')) + { + char s[2]; + s[0] = t; + s[1] = '\0'; + tempWords.Add(new WeightWord(s, -1.0)); + break; + } + else if (HtIsWordChar(t) || t == ':' || + (strchr(prefix_suffix, t) != NULL) || (t >= 161 && t <= 255)) + { + word = 0; + while (t && (HtIsWordChar(t) || + t == ':' || (strchr(prefix_suffix, t) != NULL) || (t >= 161 && t <= 255))) + { + word << (char) t; + t = *pos++; + } + + pos--; + + if (boolean && (mystrcasecmp(word.get(), "+") == 0 + || mystrcasecmp(word.get(), boolean_keywords[AND]) == 0)) + { + tempWords.Add(new WeightWord("&", -1.0)); + } + else if (boolean && mystrcasecmp(word.get(), boolean_keywords[OR]) == 0) + { + tempWords.Add(new WeightWord("|", -1.0)); + } + else if (boolean && (mystrcasecmp(word.get(), "-") == 0 + || mystrcasecmp(word.get(), boolean_keywords[NOT]) == 0)) + { + tempWords.Add(new WeightWord("!", -1.0)); + } + else + { + // Add word to excerpt matching list + originalPattern << word << "|"; + WeightWord *ww = new WeightWord(word, 1.0); + if (HtWordNormalize(word) & WORD_NORMALIZE_NOTOK) + ww->isIgnore = 1; + tempWords.Add(ww); + } + break; + } + } + } + + dumpWords(tempWords, "tempWords"); + + // + // If the user specified boolean expression operators, the whole + // expression has to be syntactically correct. If not, we need + // to report a syntax error. 
+ // + if (boolean) + { + if (!parser->checkSyntax(&tempWords)) + { + for (i = 0; i < tempWords.Count(); i++) + { + searchWords.Add(tempWords[i]); + } + tempWords.Release(); + return; +// reportError("Syntax error"); + } + } + else + { + convertToBoolean(tempWords); + } + + dumpWords(tempWords, "Boolean"); + + // + // We need to assign weights to the words according to the search_algorithm + // configuration attribute. + // For algorithms other than exact, we need to also do word lookups. + // + StringList algs(config->Find("search_algorithm"), " \t"); + List algorithms; + String name, weight; + double fweight; + Fuzzy *fuzzy = 0; + + // + // Generate the list of algorithms to use and associate the given + // weights with them. + // + for (i = 0; i < algs.Count(); i++) + { + name = strtok(algs[i], ":"); + weight = strtok(0, ":"); + if (name.length() == 0) + name = "exact"; + if (weight.length() == 0) + weight = "1"; + fweight = atof((char *) weight); + + fuzzy = Fuzzy::getFuzzyByName(name, *config); + if (fuzzy) + { + fuzzy->setWeight(fweight); + fuzzy->openIndex(); + algorithms.Add(fuzzy); + } + } + + dumpWords(searchWords, "initial"); + + // + // For each of the words, apply all the algorithms. + // + int in_phrase = 0; // If we get into a phrase, we don't want to fuzz. + for (i = 0; i < tempWords.Count(); i++) + { + WeightWord *ww = (WeightWord *) tempWords[i]; + if (ww->weight > 0 && !ww->isIgnore && !in_phrase) + { + // + // Apply all the algorithms to the word. + // + if (debug) + cerr << "Fuzzy on: " << ww->word << endl; + doFuzzy(ww, searchWords, algorithms); + delete ww; + } + else if (ww->word.length() == 1 && ww->word[0] == '"') + { + in_phrase = !in_phrase; + if (debug) + cerr << "Add: " << ww->word << endl; + searchWords.Add(ww); + } + else + { + // + // This is '(', ')', '&', or '|'. These will be automatically + // transfered to the searchWords list. 
+ // + if (debug) + cerr << "Add: " << ww->word << endl; + searchWords.Add(ww); + } + dumpWords(searchWords, "searchWords"); + } + tempWords.Release(); +} + + +//***************************************************************************** +void doFuzzy(WeightWord * ww, List & searchWords, List & algorithms) +{ + List fuzzyWords; + List weightWords; + Fuzzy *fuzzy; + WeightWord *newWw; + String *word; + + algorithms.Start_Get(); + while ((fuzzy = (Fuzzy *) algorithms.Get_Next())) + { + if (debug > 1) + cout << " " << fuzzy->getName(); + fuzzy->getWords(ww->word, fuzzyWords); + fuzzyWords.Start_Get(); + while ((word = (String *) fuzzyWords.Get_Next())) + { + if (debug > 1) + cout << " " << word->get(); + newWw = new WeightWord(word->get(), fuzzy->getWeight()); + newWw->isExact = ww->isExact; + newWw->isHidden = ww->isHidden; + weightWords.Add(newWw); + } + if (debug > 1) + cout << endl; + fuzzyWords.Destroy(); + } + + // + // We now have a list of substitute words. They need to be added + // to the searchWords. 
+ // + if (weightWords.Count()) + { + if (weightWords.Count() > 1) + searchWords.Add(new WeightWord("(", -1.0)); + for (int i = 0; i < weightWords.Count(); i++) + { + if (i > 0) + searchWords.Add(new WeightWord("|", -1.0)); + searchWords.Add(weightWords[i]); + } + if (weightWords.Count() > 1) + searchWords.Add(new WeightWord(")", -1.0)); + } + else // if no fuzzy matches, add exact word, but give it tiny weight + { + searchWords.Add(new WeightWord(word->get(), 0.000001)); + } + + + weightWords.Release(); +} + + +//***************************************************************************** +// void convertToBoolean(List &words) +// +void convertToBoolean(List & words) +{ + HtConfiguration *config = HtConfiguration::config(); + List list; + int i; + int do_and = strcmp(config->Find("match_method"), "and") == 0; + int in_phrase = 0; + + String quote = "\""; + + if (words.Count() == 0) + return; + list.Add(words[0]); + + // We might start off with a phrase match + if (((WeightWord *) words[0])->word == quote) + in_phrase = 1; + + for (i = 1; i < words.Count(); i++) + { + if (do_and && !in_phrase) + list.Add(new WeightWord("&", -1.0)); + else if (!in_phrase) + list.Add(new WeightWord("|", -1.0)); + + if (((WeightWord *) words[i])->word == quote) + in_phrase = !in_phrase; + + list.Add(words[i]); + } + words.Release(); + + for (i = 0; i < list.Count(); i++) + { + words.Add(list[i]); + } + list.Release(); +} + + +//***************************************************************************** +// Dictionary *htsearch(char *wordfile, List &searchWords, Parser *parser) +// This returns a dictionary indexed by document ID and containing a +// List of HtWordReference objects. 
+// +int htsearch(Collection * collection, List & searchWords, Parser * parser) +{ + int count = 0; + + // + // Pick the database type we are going to use + // + ResultList *matches = new ResultList; + if (searchWords.Count() > 0) + { + // parser->setDatabase(wordfile); + parser->setCollection(collection); + parser->parse(&searchWords, *matches); + } + + collection->setResultList(matches); + + count = matches->Count(); + + return (count); +} + + +//***************************************************************************** +// Modify the search words list to include the required words as well. +// This is done by putting the existing search words in parenthesis and +// appending the required words separated with "and". +void addRequiredWords(List & searchWords, StringList & requiredWords) +{ + HtConfiguration *config = HtConfiguration::config(); + static int any_keywords = config->Boolean("any_keywords", 0); + if (requiredWords.Count() == 0) + return; + if (searchWords.Count() > 0) + { + searchWords.Insert(new WeightWord("(", -1.0), 0); + searchWords.Add(new WeightWord(")", -1.0)); + searchWords.Add(new WeightWord("&", -1.0)); + } + if (requiredWords.Count() == 1) + { + searchWords.Add(new WeightWord(requiredWords[0], 1.0)); + } + else + { + searchWords.Add(new WeightWord("(", -1.0)); + searchWords.Add(new WeightWord(requiredWords[0], 1.0)); + for (int i = 1; i < requiredWords.Count(); i++) + { + if (any_keywords) + searchWords.Add(new WeightWord("|", -1.0)); + else + searchWords.Add(new WeightWord("&", -1.0)); + searchWords.Add(new WeightWord(requiredWords[i], 1.0)); + } + searchWords.Add(new WeightWord(")", -1.0)); + } +} + + +//***************************************************************************** +// Report an error. Since we don' know if we are running as a CGI or not, +// we will assume this is the first thing returned by a CGI program. 
+// +void reportError_html(char *msg) +{ + HtConfiguration *config = HtConfiguration::config(); + cout << "Content-type: text/html\r\n\r\n"; + cout << "<html><head><title>htsearch error</title></head>\n"; + cout << "<body bgcolor=\"#ffffff\">\n"; + cout << "<h1>ht://Dig error</h1>\n"; + cout << "<p>htsearch detected an error. Please report this to the\n"; + cout << "webmaster of this site by sending an e-mail to:\n"; + cout << "<a href=\"mailto:" << config->Find("maintainer") << "\">"; + cout << config->Find("maintainer") << "</a>\n"; + cout << "The error message is:</p>\n"; + cout << "<pre>\n" << msg << "\n</pre>\n</body></html>\n"; + exit(1); +} diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.cc new file mode 100644 index 00000000..db51ae3a --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.cc @@ -0,0 +1,99 @@ +//-------------------------------------------------------------------- +// +// libhtdig_log.cc +// +// 2/6/2002 created +// +// Neal Richter nealr@rightnow.com +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: libhtdig_log.cc,v 1.5 2004/05/28 13:15:29 lha Exp $ +// +//-------------------------------------------------------------------- + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "libhtdig_log.h" + +#include <stdlib.h> + +#ifndef _WIN32 +#include <unistd.h> +#endif + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <stdio.h> +#include <time.h> + + +static FILE *errorlog_fp = NULL; + +int logOpen(char *filename) +{ + if(errorlog_fp == NULL) + errorlog_fp = fopen(filename, "a+"); + + if 
(errorlog_fp == NULL) + return (TRUE); + else + return (FALSE); +} + + +void logEntry (char *msg) +{ + time_t now = time(NULL); + + if(errorlog_fp != NULL) + fprintf(errorlog_fp, "[%s] %s\n", ctime(&now), msg); + +} + + +//***************************************************************************** +// Report an error + +void reportError (char *msg) +{ + time_t now = time(NULL); + + if(errorlog_fp != NULL) + fprintf(errorlog_fp, "%s [ERROR] %s\n", ctime(&now), msg); + + fprintf(stderr, "%s [ERROR] %s\n", ctime(&now), msg); + +} + + +int logClose() +{ + int ret = -1; + + if(errorlog_fp != NULL) + { + ret = fclose(errorlog_fp); + errorlog_fp = NULL; + + if(ret == 0) + return(TRUE); + else + return(FALSE); + } + + return(TRUE); +} diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.h b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.h new file mode 100644 index 00000000..22adceca --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.h @@ -0,0 +1,38 @@ +//-------------------------------------------------------------------- +// +// libhtdig_log.h +// +// 2/6/2002 created +// +// Neal Richter nealr@rightnow.com +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: libhtdig_log.h,v 1.4 2004/05/28 13:15:29 lha Exp $ +// +//-------------------------------------------------------------------- + +#ifndef LIBHTDIG_LOG_H +#define LIBHTDIG_LOG_H + + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + + +int logOpen(char *file); +void logEntry(char *msg); +void reportError(char *msg); +int logClose(void); + +#endif /* LIBHTDIG_LOG_H */ + |