summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/libhtdig
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/libhtdig')
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc316
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.h119
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/Makefile182
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/Makefile.win32173
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/README46
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.cc1735
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.h248
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc517
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.h133
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/htsearch.h75
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_api.h614
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc1058
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htfuzzy.cc265
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc407
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc1099
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.cc99
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.h38
17 files changed, 7124 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc
new file mode 100644
index 00000000..d6862550
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc
@@ -0,0 +1,316 @@
+//
+// BasicDocument.cc
+//
+// 2/6/2002 created for libhtdig to simplify & mimic Document.cc
+//
+// Neal Richter nealr@rightnow.com
+//
+//
+// BasicDocument: This class holds everything there is to know about a document.
+// The actual contents of the document may or may not be present at
+// all times for memory conservation reasons.
+//
+// This is a basic extensible container for plain text holding documents.
+//
+// Uses any Parser with parse method handling this class.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: BasicDocument.cc,v 1.3 2004/05/28 13:15:28 lha Exp $
+//
+//--------------------------------------------------------------------
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <ctype.h>
+
+#include "BasicDocument.h"
+#include "TextCollector.h"
+#include "StringList.h"
+#include "htdig.h"
+#include "Plaintext.h"
+#include "HTML.h"
+#include "ExternalParser.h"
+#include "lib.h"
+
+#include "defaults.h"
+
+#if 1
+typedef void (*SIGNAL_HANDLER) (...);
+#else
+typedef SIG_PF SIGNAL_HANDLER;
+#endif
+
+//*****************************************************************************
+// BasicDocument::BasicDocument(char *loc)
+// Initialize with the given loc-parameter as the location for this document.
+// If the max_size is given, use that for size, otherwise use the
+// config value.
+//
+BasicDocument::BasicDocument(char *loc, int suggested_size)
+{
+ int temp_size = 0;
+
+ id = 0;
+ location = 0;
+ title = 0;
+ metacontent = 0;
+ contents = 0;
+ document_length = 0;
+
+
+ HtConfiguration *config = HtConfiguration::config();
+
+ //We probably need to move assignment of max_doc_size, according
+ //to a configuration value.
+
+ if (suggested_size > 0)
+ temp_size = suggested_size;
+ else
+ temp_size = config->Value("max_doc_size");
+
+ contents.allocate(temp_size + 100);
+
+ contentType = "";
+
+ if (loc)
+ {
+ Location(loc);
+ }
+}
+
+
+//*****************************************************************************
+// BasicDocument::~BasicDocument()
+//
+BasicDocument::~BasicDocument()
+{
+ // We delete only the derived class objects
+
+#if MEM_DEBUG
+ char *p = new char;
+ cout << "==== BasicDocument deleted: " << this << " new at " << ((void *) p) << endl;
+ delete p;
+#endif
+}
+
+
+//*****************************************************************************
+// void BasicDocument::Reset()
+// Restore the BasicDocument object to an initial state.
+//
+void
+BasicDocument::Reset()
+{
+
+ id = 0;
+ location = 0;
+ title = 0;
+ metacontent = 0;
+ contents = 0;
+
+ contentType = 0;
+ document_length = 0;
+
+}
+
+//*****************************************************************************
+// void BasicDocument::Length()
+// Return/Calc length of BasicDocument... i.e. cumulative size of the Strings
+//
+int
+BasicDocument::Length()
+{
+ if (document_length < 0)
+ {
+ document_length = 0;
+ document_length += location.length();
+ document_length += title.length();
+ document_length += metacontent.length();
+ document_length += contents.length();
+ document_length += id.length();
+ }
+
+ return (document_length);
+}
+
+
+//*****************************************************************************
+// Parsable *BasicDocument::getParsable()
+// Given the content-type of a document, returns a document parser.
+// This will first look through the list of user supplied parsers and
+// then at our (limited) builtin list of parsers. The user supplied
+// parsers are external programs that will be used.
+
+Parsable *
+BasicDocument::getParsable()
+{
+ static HTML *html = 0;
+ static Plaintext *plaintext = 0;
+ static ExternalParser *externalParser = 0;
+
+ Parsable *parsable = 0;
+
+ if (ExternalParser::canParse(contentType))
+ {
+ if (externalParser)
+ {
+ delete externalParser;
+ }
+ externalParser = new ExternalParser(contentType);
+ parsable = externalParser;
+ }
+ else if (mystrncasecmp((char *) contentType, "text/html", 9) == 0)
+ {
+ if (!html)
+ html = new HTML();
+ parsable = html;
+ }
+ else if (mystrncasecmp((char *) contentType, "text/plain", 10) == 0)
+ {
+ if (!plaintext)
+ plaintext = new Plaintext();
+ parsable = plaintext;
+ }
+ else if (mystrncasecmp((char *) contentType, "text/css", 8) == 0)
+ {
+ return NULL;
+ }
+ else if (mystrncasecmp((char *) contentType, "text/", 5) == 0)
+ {
+ if (!plaintext)
+ plaintext = new Plaintext();
+ parsable = plaintext;
+ if (debug > 1)
+ {
+ cout << '"' << contentType << "\" not a recognized type. Assuming text/plain\n";
+ }
+ }
+ else
+ {
+ if (debug > 1)
+ {
+ cout << '"' << contentType << "\" not a recognized type. Ignoring\n";
+ }
+ return NULL;
+ }
+
+ parsable->setContents(contents.get(), contents.length());
+ return parsable;
+}
+
+//*****************************************************************************
+//
+// Test for self parseable
+//
+int
+BasicDocument::SelfParseable()
+{
+
+ if (mystrncasecmp((char *) contentType, "text/vnd.customdocument", 10) == 0)
+ {
+ return (TRUE);
+ }
+ else
+ return (FALSE);
+
+}
+
+
+//*****************************************************************************
+// Parsable *BasicDocument::internalParser()
+int
+BasicDocument::internalParser(TextCollector & textcollector)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ char *position = NULL;
+ static int minimumWordLength = config->Value("minimum_word_length", 3);
+ int wordIndex = 1;
+ String word;
+ int letter_count = 0;
+
+ //First Process Title
+ textcollector.got_title((char *) title);
+
+ //Next Process Contents
+ position = contents;
+
+ while (*position)
+ {
+ word = 0;
+
+ if (HtIsStrictWordChar(*position))
+ {
+ //
+ // Start of a word. Try to find the whole thing
+ //
+ //TODO NEAL RICHTER Imposed a 50-letter word length limit here
+ //
+ while (*position && HtIsWordChar(*position) && (letter_count < 50))
+ {
+ word << *position;
+ position++;
+ letter_count++;
+ }
+
+ letter_count = 0;
+ if (word.length() >= minimumWordLength)
+ {
+ textcollector.got_word((char *) word, wordIndex++, 0);
+ }
+ }
+
+ if (*position)
+ position++;
+
+ }//end while
+
+ textcollector.got_head((char*) contents);
+
+ //Third, Process MetaContent
+ position = metacontent;
+ textcollector.got_meta_dsc(metacontent);
+
+
+ //max_meta_description_length???
+
+ while (*position)
+ {
+ word = 0;
+
+ if (HtIsStrictWordChar(*position))
+ {
+ //
+ // Start of a word. Try to find the whole thing
+ //
+ while (*position && HtIsWordChar(*position) && (letter_count < 50))
+ {
+ word << *position;
+ position++;
+ letter_count++;
+ }
+
+ letter_count = 0;
+
+ if (word.length() >= minimumWordLength)
+ {
+ textcollector.got_word((char *) word, wordIndex++, 9);
+ }
+ }
+
+ if (*position)
+ position++;
+
+ }//end while
+
+ return(1);
+}
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.h b/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.h
new file mode 100644
index 00000000..9d4a2a73
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.h
@@ -0,0 +1,119 @@
+//--------------------------------------------------------------------
+//
+// BasicDocument.h
+//
+// 2/6/2002 created for libhtdig to simplify & mimic Document.cc
+//
+// Neal Richter nealr@rightnow.com
+//
+//
+// BasicDocument: This class holds everything there is to know about a document.
+// The actual contents of the document may or may not be present at
+// all times for memory conservation reasons.
+//
+// This is a basic extensible container for plain text holding documents.
+//
+// Uses any Parser with parse method handling this class.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: BasicDocument.h,v 1.4 2004/05/28 13:15:28 lha Exp $
+//
+//--------------------------------------------------------------------
+
+
+
+
+#ifndef _BasicDocument_h_
+#define _BasicDocument_h_
+
+#include "htString.h"
+#include "Parsable.h"
+#include "Object.h"
+#include "StringList.h"
+#include "HtDateTime.h"
+
+
+class TextCollector;
+
+
+class BasicDocument:public Object
+{
+ public:
+ //
+ // Construction/Destruction
+ //
+ BasicDocument(char *location = 0, int max_size = 0);
+ ~BasicDocument();
+
+ //
+ // Interface to the document.
+ //
+ void Reset();
+ int Length();
+
+ //int StoredLength() {return contents.length();}
+
+ char *Title() {return title;}
+ void Title(char *t) {title = t; document_length = -1;}
+ void Title(const String & t) {title = t; document_length = -1;}
+ int TitleLength() {return title.length();}
+
+ char *MetaContent() {return metacontent;}
+ void MetaContent(char *m) {metacontent = m; document_length = -1;}
+ void MetaContent(const String & m) {metacontent = m; document_length = -1;}
+ int MetaContentLength() {return metacontent.length();}
+
+ char *Contents() {return contents;}
+ void Contents(char *s) {contents = s; document_length = -1;}
+ void Contents(const String & s) {contents = s; document_length = -1;}
+ int ContentsLength() {return contents.length();}
+
+ char *Location() {return location;}
+ void Location(char *l) {location = l; document_length = -1;}
+ void Location(const String & l) {location = l; document_length = -1;}
+ int LocationLength() {return location.length();}
+
+ char *DocumentID() {return id;}
+ void DocumentID(char *ida) {id = ida; document_length = -1;}
+ void DocumentID(const String & ida) {id = ida; document_length = -1;}
+ int DocumentIDLength() {return id.length();}
+
+ char *ContentType() {return contentType;}
+ void ContentType(char *ct) {contentType = ct;}
+ void ContentType(const String & ct) {contentType = ct;}
+
+ time_t ModTime() {return modtime.GetTime_t();}
+ void ModTime(time_t t) {modtime = t;}
+
+ //
+ // Return an appropriate parsable object for the document type.
+ //
+ Parsable *getParsable();
+
+ int internalParser(TextCollector & textcollector);
+ int SelfParseable();
+
+ private:
+
+ String id;
+ String location;
+ String title;
+ String metacontent;
+ String contents;
+
+ String contentType;
+
+ HtDateTime modtime;
+
+ int document_length;
+
+ //int max_doc_size;
+
+};
+
+#endif
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile b/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile
new file mode 100644
index 00000000..01f78ec4
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile
@@ -0,0 +1,182 @@
+#libhtdig.so makefile for Unix systems
+
+LIBHTDIG_BUILD_VER = 3.2.1
+
+
+#Berkeley DB Specific defines
+BDB_INC_DIRS = -I../db -I/usr/local/include
+BDB_DEFS = -DHAVE_CONFIG_H
+
+INC_DIRS= -I. -I../htdig -I../include -I../db -I../htlib -I../htnet -I../htcommon -I../htword -I../htfuzzy -I../htsearch -I/usr/local/include
+
+#HTDIG_INC_DIR= -I../htdig
+
+#define your own defaults here!
+DEFAULT_CONFIG_FILE=\"/etc/htdig/htdig.conf\"
+DEFAULT_DB_PATH=\"/etc/htdig/\"
+BIN_DIR=\"/usr/local/bin\"
+COMMON_DIR=\"/usr/local/share/htdig\"
+CONFIG_DIR=\"/etc/htdig\"
+DATABASE_DIR=\"/var/lib/htdig\"
+IMAGE_URL_PREFIX=\"/htdig\"
+
+DEFS= -DHAVE_CONFIG_H -DDEFAULT_CONFIG_FILE=$(DEFAULT_CONFIG_FILE) -DBIN_DIR=$(BIN_DIR) -DCOMMON_DIR=$(COMMON_DIR)
+DEFS+= -DCONFIG_DIR=$(CONFIG_DIR) -DDATABASE_DIR=$(DATABASE_DIR) -DIMAGE_URL_PREFIX=$(IMAGE_URL_PREFIX)
+
+#LEX define based on your system
+LEX = flex
+AM_LFLAGS = -L
+LEX_OUTPUT_ROOT = lex.yy
+
+#YACC define based on your system
+YACC = bison -y
+AM_YFLAGS = -l -d
+YACC_OUTPUT_ROOT = y.tab
+
+
+
+
+ifdef INSURE
+CC=insure gcc
+CXX=insure g++
+else
+CC=gcc
+CXX=g++
+endif
+
+
+#OPTIMZ=-O2
+OPTIMZ=-O0
+#OPTS=$(OPTIMZ) -Wall -fno-rtti -fno-exceptions -Werror
+OPTS=$(OPTIMZ) -Wall -fno-rtti -fno-exceptions -fPIC
+DBG= -g $(PROFILING)
+
+ifdef FULLDEBUG
+DBG+= -DDEBUG -DDEBUG_CMPR $(INC_DIRS)
+endif
+
+ifdef DEBUG
+DBG+= -DDEBUG
+endif
+
+ifdef EXPKEY
+OPTS += -DEXPKEY
+endif
+
+LIB_DIRS=
+
+#use '-G' with Solaris
+LFLAGS = -lc -lstdc++
+#LFLAGS = -lc -lstdc++ -G
+
+
+LIBZ = -lz
+#use this if you've built an -fPIC version of libz.a to link into the libhtdig.so
+#LIBZ = -lz-fpic
+
+#berkeley db c files
+BDB_C_OBJS += bt_compare.o bt_conv.o bt_curadj.o bt_cursor.o bt_delete.o bt_method.o bt_open.o bt_put.o bt_rec.o bt_reclaim.o bt_recno.o bt_rsearch.o bt_search.o bt_split.o bt_stat.o bt_upgrade.o btree_auto.o crdel_auto.o crdel_rec.o db.o db_am.o db_auto.o db_byteorder.o db_conv.o db_dispatch.o db_dup.o db_err.o db_getlong.o db_iface.o db_join.o db_log2.o db_meta.o db_method.o db_overflow.o db_pr.o db_rec.o db_reclaim.o db_ret.o db_salloc.o db_shash.o db_upgrade.o env_method.o env_open.o env_recover.o env_region.o hash.o hash_auto.o hash_conv.o hash_dup.o hash_func.o hash_meta.o hash_method.o hash_page.o hash_rec.o hash_reclaim.o hash_stat.o hash_upgrade.o lock.o lock_conflict.o lock_deadlock.o lock_region.o lock_util.o log.o log_archive.o log_auto.o log_compare.o log_findckp.o log_get.o log_method.o log_put.o log_rec.o log_register.o mp_alloc.o mp_bh.o mp_cmpr.o mp_fget.o mp_fopen.o mp_fput.o mp_fset.o mp_method.o mp_region.o mp_register.o mp_stat.o mp_sync.o mp_trickle.o mut_fcntl.o mut_pthread.o mut_tas.o mutex.o os_abs.o os_alloc.o os_dir.o os_errno.o os_fid.o os_finit.o os_fsync.o os_handle.o os_map.o os_method.o os_oflags.o os_open.o os_region.o os_rename.o os_root.o os_rpath.o os_rw.o os_seek.o os_sleep.o os_spin.o os_stat.o os_tmpdir.o os_unlink.o qam.o qam_auto.o qam_conv.o qam_method.o qam_open.o qam_rec.o qam_stat.o txn.o txn_auto.o txn_rec.o txn_region.o xa.o xa_db.o xa_map.o
+
+#htlib c files $(REGEX)
+HTLIB_C_OBJS += getcwd.o mhash_md5.o regex.o vsnprintf.o memcmp.o mktime.o snprintf.o memcpy.o myqsort.o strerror.o memmove.o raise.o timegm.o
+
+#htlib c++ files
+HTLIB_CXX_OBJS += Configuration.o Database.o Dictionary.o DB2_db.o IntObject.o List.o Object.o ParsedString.o Queue.o QuotedStringList.o Stack.o String.o StringList.o StringMatch.o String_fmt.o good_strtok.o strcasecmp.o strptime.o HtCodec.o HtWordCodec.o HtVector.o HtHeap.o HtPack.o HtDateTime.o HtRegex.o HtRegexList.o HtRegexReplace.o HtRegexReplaceList.o HtVectorGeneric.o HtMaxMin.o HtWordType.o md5.o
+
+#htword c++ files
+HTWORD_CXX_OBJS += WordBitCompress.o WordContext.o WordCursor.o WordDB.o WordDBCompress.o WordDBInfo.o WordDBPage.o WordKey.o WordKeyInfo.o WordList.o WordMonitor.o WordRecord.o WordRecordInfo.o WordReference.o WordStat.o WordType.o
+
+#htcommon c++ files
+HTCOMMON_CXX_OBJS += DocumentDB.o DocumentRef.o HtWordReference.o HtWordList.o defaults.o HtURLCodec.o URL.o URLTrans.o HtZlibCodec.o cgi.o HtSGMLCodec.o HtConfiguration.o HtURLRewriter.o
+
+#htnet c++ files
+HTNET_CXX_OBJS += Connection.o Transport.o HtHTTP.o HtFile.o HtNNTP.o HtCookie.o HtCookieJar.o HtCookieMemJar.o HtHTTPBasic.o HtHTTPSecure.o SSLConnection.o HtFTP.o HtCookieInFileJar.o
+
+#htdig c++ files
+HTDIG_CXX_OBJS += Document.o ExternalTransport.o Parsable.o Retriever.o URLRef.o ExternalParser.o HTML.o Plaintext.o Server.o
+
+#htfuzzy c++ files
+HTFUZZY_CXX_OBJS += Accents.o EndingsDB.o Fuzzy.o Prefix.o Soundex.o Substring.o Synonym.o Endings.o Exact.o Metaphone.o Regexp.o Speling.o SuffixEntry.o filecopy.o
+
+#HTFUZZY_C_OBJS += filecopy.o
+
+#htsearch c++ files
+HTSEARCH_CXX_OBJS += Collection.o DocMatch.o ResultList.o SplitMatches.o TemplateList.o Display.o HtURLSeedScore.o ResultMatch.o Template.o WeightWord.o parser.o
+
+#libhtdig c++ files
+LIBHTDIG_CXX_OBJS += ResultFetch.o BasicDocument.o TextCollector.o libhtdig_htdig.o libhtdig_htmerge.o libhtdig_htfuzzy.o libhtdig_log.o libhtdig_htsearch.o
+
+#htcommon lex & yacc targets
+LIBHTDIG_CXX_OBJS += conf_lexer.o conf_parser.o
+
+#libhtdig c files
+#LIBHTDIG_C_OBJS += filecopy.o
+
+LXX_TARGETS += conf_lexer.cc
+
+YXX_TARGETS += conf_parser.cc
+
+OBJS += $(BDB_C_OBJS) $(HTLIB_C_OBJS) $(HTLIB_CXX_OBJS) $(HTWORD_CXX_OBJS)
+OBJS += $(HTCOMMON_CXX_OBJS) $(HTNET_CXX_OBJS) $(HTDIG_CXX_OBJS)
+OBJS += $(HTFUZZY_CXX_OBJS) $(HTFUZZY_C_OBJS) $(HTSEARCH_CXX_OBJS) $(LIBHTDIG_CXX_OBJS) $(LIBHTDIG_C_OBJS)
+
+
+libhtdig-3.2.0.so: $(OBJS) $(LXX_TARGETS) $(YXX_TARGETS)
+ $(CC) -shared $(LIB_DIRS) $(OTHER_OBJS) $(OBJS) -L/usr/local/lib $(LIBZ) $(LFLAGS) -Xlinker -h -Xlinker libhtdig.so.$(LIBHTDIG_BUILD_VER) -o libhtdig.so.$(LIBHTDIG_BUILD_VER)
+
+
+libhtdig.a: $(OBJS) $(LXX_TARGETS) $(YXX_TARGETS)
+ ar cru libhtdig.a $(OTHER_OBJS) $(OBJS)
+ ranlib libhtdig.a
+
+
+$(BDB_C_OBJS): %.o: ../db/%.c
+ $(CC) $(BDB_INC_DIRS) $(BDB_DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(HTLIB_C_OBJS): %.o: ../htlib/%.c
+ $(CC) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(HTLIB_CXX_OBJS): %.o: ../htlib/%.cc
+ $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(HTWORD_CXX_OBJS): %.o: ../htword/%.cc
+ $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(HTCOMMON_CXX_OBJS): %.o: ../htcommon/%.cc
+ $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(HTNET_CXX_OBJS): %.o: ../htnet/%.cc
+ $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(HTDIG_CXX_OBJS): %.o: ../htdig/%.cc
+ $(CXX) $(HTDIG_INC_DIR) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(HTFUZZY_C_OBJS): %.o: ../htfuzzy/%.c
+ $(CC) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(HTFUZZY_CXX_OBJS): %.o: ../htfuzzy/%.cc
+ $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(HTSEARCH_CXX_OBJS): %.o: ../htsearch/%.cc
+ $(CXX) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(LIBHTDIG_CXX_OBJS): %.o: %.cc
+ $(CXX) $(HTDIG_INC_DIR) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(LIBHTDIG_C_OBJS): %.o: %.c
+ $(CC) $(HTDIG_INC_DIR) $(INC_DIRS) $(DEFS) $(OPTS) $(DBG) -c $< -o $@
+
+$(LXX_TARGETS): %.cc: ../htcommon/%.lxx
+ $(LEX) $(AM_LFLAGS) $< && mv $(LEX_OUTPUT_ROOT).c $@
+ #$(LEX) $(AM_LFLAGS) $(LFLAGS) -o$@ $<
+
+$(YXX_TARGETS): %.cc: ../htcommon/%.yxx
+ $(YACC) $(AM_YFLAGS) $< && mv $(YACC_OUTPUT_ROOT).c $@
+ if test -f y.tab.h; then if cmp -s y.tab.h conf_parser.h; then rm -f y.tab.h; else mv y.tab.h conf_parser.h; fi; else :; fi
+
+
+clean:
+ rm -f *.o *~ *.bak *.lo *.a* *.so* core $(LXX_TARGETS) $(YXX_TARGETS)
+
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile.win32 b/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile.win32
new file mode 100644
index 00000000..da1dfb62
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/Makefile.win32
@@ -0,0 +1,173 @@
+#
+# Makefile - makefile for libhtdig
+#
+
+PRODUCT = htdig
+
+TARGET = $(LIBDIR)/libhtdig$(DLLSFX)
+
+include ../Makedefs.win32
+
+ARCH = win32
+#MV = move
+MV = mv
+
+
+#define your own defaults here!
+DEFAULT_CONFIG_FILE=\"/etc/htdig/htdig.conf\"
+DEFAULT_DB_PATH=\"/etc/htdig/\"
+BIN_DIR=\"/usr/local/bin\"
+COMMON_DIR=\"/usr/local/share/htdig\"
+CONFIG_DIR=\"/etc/htdig\"
+DATABASE_DIR=\"/var/lib/htdig\"
+IMAGE_URL_PREFIX=\"/htdig\"
+
+DEFS= -DHAVE_CONFIG_H -DDEFAULT_CONFIG_FILE=$(DEFAULT_CONFIG_FILE) -DBIN_DIR=$(BIN_DIR) -DCOMMON_DIR=$(COMMON_DIR)
+DEFS+= -DCONFIG_DIR=$(CONFIG_DIR) -DDATABASE_DIR=$(DATABASE_DIR) -DIMAGE_URL_PREFIX=$(IMAGE_URL_PREFIX)
+
+#LEX define based on your system
+LEX = flex
+AM_LFLAGS = -L
+LEX_OUTPUT_ROOT = lex.yy
+
+#YACC define based on your system
+YACC = bison -y
+AM_YFLAGS = -l -d
+YACC_OUTPUT_ROOT = y.tab
+
+
+# -----------------------------------------------------------------------------
+# add new library members to this list
+
+#berkeley db c files
+BDB_C_OBJS += bt_compare.obj bt_conv.obj bt_curadj.obj bt_cursor.obj bt_delete.obj bt_method.obj bt_open.obj bt_put.obj bt_rec.obj bt_reclaim.obj bt_recno.obj bt_rsearch.obj bt_search.obj bt_split.obj bt_stat.obj bt_upgrade.obj btree_auto.obj crdel_auto.obj crdel_rec.obj db.obj db_am.obj db_auto.obj db_byteorder.obj db_conv.obj db_dispatch.obj db_dup.obj db_err.obj db_getlong.obj db_iface.obj db_join.obj db_log2.obj db_meta.obj db_method.obj db_overflow.obj db_pr.obj db_rec.obj db_reclaim.obj db_ret.obj db_salloc.obj db_shash.obj db_upgrade.obj env_method.obj env_open.obj env_recover.obj env_region.obj hash.obj hash_auto.obj hash_conv.obj hash_dup.obj hash_func.obj hash_meta.obj hash_method.obj hash_page.obj hash_rec.obj hash_reclaim.obj hash_stat.obj hash_upgrade.obj lock.obj lock_conflict.obj lock_deadlock.obj lock_region.obj lock_util.obj log.obj log_archive.obj log_auto.obj log_compare.obj log_findckp.obj log_get.obj log_method.obj log_put.obj log_rec.obj log_register.obj mp_alloc.obj mp_bh.obj mp_cmpr.obj mp_fget.obj mp_fopen.obj mp_fput.obj mp_fset.obj mp_method.obj mp_region.obj mp_register.obj mp_stat.obj mp_sync.obj mp_trickle.obj mut_fcntl.obj mut_pthread.obj mut_tas.obj mutex.obj os_abs.obj os_alloc.obj os_dir.obj os_errno.obj os_fid.obj os_finit.obj os_fsync.obj os_handle.obj os_map.obj os_method.obj os_oflags.obj os_open.obj os_region.obj os_rename.obj os_root.obj os_rpath.obj os_rw.obj os_seek.obj os_sleep.obj os_spin.obj os_stat.obj os_tmpdir.obj os_unlink.obj qam.obj qam_auto.obj qam_conv.obj qam_method.obj qam_open.obj qam_rec.obj qam_stat.obj txn.obj txn_auto.obj txn_rec.obj txn_region.obj xa.obj xa_db.obj xa_map.obj
+
+ifdef WINDIR
+BDB_C_OBJS += dirent_local.obj
+endif
+
+#htlib c files $(REGEX)
+HTLIB_C_OBJS += getcwd.obj mhash_md5.obj regex.obj vsnprintf.obj memcmp.obj mktime.obj snprintf.obj memcpy.obj myqsort.obj strerror.obj memmove.obj raise.obj timegm.obj
+
+#htlib c++ files
+HTLIB_CXX_OBJS += Configuration.obj Database.obj Dictionary.obj DB2_db.obj IntObject.obj List.obj Object.obj ParsedString.obj Queue.obj QuotedStringList.obj Stack.obj String.obj StringList.obj StringMatch.obj String_fmt.obj good_strtok.obj strcasecmp.obj strptime.obj HtCodec.obj HtWordCodec.obj HtVector.obj HtHeap.obj HtPack.obj HtDateTime.obj HtRegex.obj HtRegexList.obj HtRegexReplace.obj HtRegexReplaceList.obj HtVectorGeneric.obj HtMaxMin.obj HtWordType.obj md5.obj filecopy.obj
+
+#htword c++ files
+HTWORD_CXX_OBJS += WordBitCompress.obj WordContext.obj WordCursor.obj WordDB.obj WordDBCompress.obj WordDBInfo.obj WordDBPage.obj WordKey.obj WordKeyInfo.obj WordList.obj WordMonitor.obj WordRecord.obj WordRecordInfo.obj WordReference.obj WordStat.obj WordType.obj
+
+#htcommon c++ files
+HTCOMMON_CXX_OBJS += DocumentDB.obj DocumentRef.obj HtWordReference.obj HtWordList.obj defaults.obj HtURLCodec.obj URL.obj URLTrans.obj HtZlibCodec.obj cgi.obj HtSGMLCodec.obj HtConfiguration.obj HtURLRewriter.obj
+
+#htnet c++ files
+HTNET_CXX_OBJS += Connection.obj Transport.obj HtHTTP.obj HtFile.obj HtNNTP.obj HtCookie.obj HtCookieJar.obj HtCookieMemJar.obj HtHTTPBasic.obj HtHTTPSecure.obj SSLConnection.obj HtFTP.obj HtCookieInFileJar.obj
+
+#htdig c++ files
+HTDIG_CXX_OBJS += Document.obj ExternalTransport.obj Parsable.obj Retriever.obj URLRef.obj ExternalParser.obj HTML.obj Plaintext.obj Server.obj
+
+#htfuzzy c++ files
+HTFUZZY_CXX_OBJS += Accents.obj EndingsDB.obj Fuzzy.obj Prefix.obj Soundex.obj Substring.obj Synonym.obj Endings.obj Exact.obj Metaphone.obj Regexp.obj Speling.obj SuffixEntry.obj
+
+#HTFUZZY_C_OBJS += filecopy.o
+
+#htsearch c++ files
+HTSEARCH_CXX_OBJS += Collection.obj DocMatch.obj ResultList.obj SplitMatches.obj TemplateList.obj Display.obj HtURLSeedScore.obj ResultMatch.obj Template.obj WeightWord.obj parser.obj
+
+#libhtdig c++ files
+LIBHTDIG_CXX_OBJS += ResultFetch.obj BasicDocument.obj TextCollector.obj libhtdig_htdig.obj libhtdig_htmerge.obj libhtdig_htfuzzy.obj libhtdig_log.obj libhtdig_htsearch.obj
+
+#htcommon lex & yacc targets
+LIBHTDIG_CXX_OBJS += conf_lexer.obj conf_parser.obj
+
+#libhtdig c files
+#LIBHTDIG_C_OBJS += filecopy.o
+
+LXX_TARGETS += conf_lexer.cc
+
+YXX_TARGETS += conf_parser.cc
+
+OBJS += $(BDB_C_OBJS) $(HTLIB_C_OBJS) $(HTLIB_CXX_OBJS) $(HTWORD_CXX_OBJS)
+OBJS += $(HTCOMMON_CXX_OBJS) $(HTNET_CXX_OBJS) $(HTDIG_CXX_OBJS)
+OBJS += $(HTFUZZY_CXX_OBJS) $(HTFUZZY_C_OBJS) $(HTSEARCH_CXX_OBJS) $(LIBHTDIG_CXX_OBJS) $(LIBHTDIG_C_OBJS)
+
+OTHERLIBS = L:/win32/lib/zlib114/zlib.lib ws2_32.lib
+
+# -----------------------------------------------------------------------------
+
+CMNDLLS =
+
+CPPFLAGS += -DHAVE_CONFIG_H -I. -I../include -I../htlib -I../htcommon -I../htword \
+ -I../db -I../htnet -I../htsearch -I../htdig -I../htfuzzy
+
+CFLAGS += $(CPPFLAGS)
+
+#ifeq ($(ARCH),win32)
+CFLAGS += -DDYNAMIC_LIBUTIL
+CPPFLAGS += -DDYNAMIC_LIBUTIL -DYY_NEVER_INTERACTIVE
+#endif
+
+ifeq ($(ARCH),linux)
+LDFLAGS += -Xlinker -Bsymbolic
+endif
+
+# -----------------------------------------------------------------------------
+
+
+#win32/%.obj: %.cc %.c
+# $(CC) $(CPPFLAGS) -c $< -o $@
+
+
+ #$(CC) $(CPPFLAGS) $(OPTS) $(DBG) -c $< /Fo$@
+
+$(BDB_C_OBJS): %.obj: ../db/%.c
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TC -c $<
+
+$(HTLIB_C_OBJS): %.obj: ../htlib/%.c
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TC -c $<
+
+$(HTLIB_CXX_OBJS): %.obj: ../htlib/%.cc
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $<
+
+$(HTWORD_CXX_OBJS): %.obj: ../htword/%.cc
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $<
+
+$(HTCOMMON_CXX_OBJS): %.obj: ../htcommon/%.cc
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $<
+
+$(HTNET_CXX_OBJS): %.obj: ../htnet/%.cc
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $<
+
+$(HTDIG_CXX_OBJS): %.obj: ../htdig/%.cc
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $<
+
+$(HTFUZZY_C_OBJS): %.obj: ../htfuzzy/%.c
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TC -c $<
+
+$(HTFUZZY_CXX_OBJS): %.obj: ../htfuzzy/%.cc
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $<
+
+$(HTSEARCH_CXX_OBJS): %.obj: ../htsearch/%.cc
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $<
+
+$(LIBHTDIG_CXX_OBJS): %.obj: %.cc
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TP -c $<
+
+$(LIBHTDIG_C_OBJS): %.obj: %.c
+ $(CC) $(CFLAGS) $(OPTS) $(DBG) /TC -c $<
+
+$(LXX_TARGETS): %.cc: ../htcommon/%.lxx
+ $(LEX) $(AM_LFLAGS) $< && cat $(LEX_OUTPUT_ROOT).c | sed -e 's/#include <unistd.h>//g' > $@
+ #$(LEX) $(AM_LFLAGS) $(LFLAGS) -o$@ $<
+
+$(YXX_TARGETS): %.cc: ../htcommon/%.yxx
+ $(YACC) $(AM_YFLAGS) $< && $(MV) $(YACC_OUTPUT_ROOT).c $@
+ if test -f y.tab.h; then if cmp -s y.tab.h conf_parser.h; then rm -f y.tab.h; else mv y.tab.h conf_parser.h; fi; else :; fi
+
+
+
+$(TARGET): $(OBJDIRDEP) $(LIBDIRDEP) $(OBJS)
+ $(RM) $(basename $(TARGET))*
+ $(DLLLD) $(LDFLAGS) $(OBJS) $(LDLIBS) $(DBLIBS) $(OTHERLIBS)
+ $(DLL_SYMLINK_CMD)
+
+include ../Makerules.win32
+
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/README b/debian/htdig/htdig-3.2.0b6/libhtdig/README
new file mode 100644
index 00000000..99591a2c
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/README
@@ -0,0 +1,46 @@
+Neal Richter
+10/6/2002
+
+How to use this:
+
+--------------------------
+Building htdig:
+
+unzip the tarball
+
+cd inside of the tarball root directory
+
+./configure with any parameters needed
+
+---------------------------
+Building libhtdig:
+
+cd libhtdig
+
+make
+
+--------------------------
+Building libhtdigphp:
+
+cd libhtdigphp
+
+./configure
+make
+./relink
+
+Note: The relink script uses the PHP wrapper objects as well as all the objects
+in libhtdig to create a one-piece shared library
+
+---------------------------
+
+note that the libhtdig_xxxx.cc take the place of the various utilities 'main'
+functions. Please look through them and compare to see if changes need to be
+synced in.
+
+---------------------------
+
+This should make you a libhtdig.so.XXXX. Copy the latest libhtdig_api.h to a
+place that you might need it.
+
+link against libhtdig.so.XXXXX
+
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.cc
new file mode 100644
index 00000000..e36be04b
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.cc
@@ -0,0 +1,1735 @@
+//--------------------------------------------------------------------
+//
+// ResultFetch.cc
+//
+// 2/6/2002 created for libhtdig
+//
+// Neal Richter nealr@rightnow.com
+//
+// ResultFetch: Takes results of search and fills in the HTML templates
+//
+// FOR USE IN LIBHTDIG... does NOT stream to stdout!!
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: ResultFetch.cc,v 1.5 2004/05/28 13:15:28 lha Exp $
+//
+//--------------------------------------------------------------------
+
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "htsearch.h"
+#include "ResultFetch.h"
+#include "ResultMatch.h"
+#include "WeightWord.h"
+#include "StringMatch.h"
+#include "StringList.h"
+#include "QuotedStringList.h"
+#include "URL.h"
+#include "HtSGMLCodec.h"
+#include "HtURLCodec.h"
+#include "WordType.h"
+#include "Collection.h"
+#include "HtURLSeedScore.h"
+#include "SplitMatches.h"
+#include "HtConfiguration.h"
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+#ifndef _WIN32
+#include <syslog.h>
+#endif
+
+#include <locale.h>
+#include <math.h>
+#include <float.h>
+
+#if !defined(DBL_MAX) && defined(MAXFLOAT)
+# define DBL_MAX MAXFLOAT
+#endif
+
+//*****************************************************************************
+//
+//
+// Constructor: caches the selected collections, reads star/template
+// configuration, and selects the current result template.  On any
+// unrecoverable template error, templateError is set non-zero so the
+// caller can detect the failure before using the object.
+// NOTE(review): the templist parameter is currently unused — the
+// collectionList assignment below is commented out.
+//
+ResultFetch::ResultFetch(Dictionary *collections, const StringList& templist)
+//ResultFetch::ResultFetch(Dictionary * collections)
+{
+
+ HtConfiguration *config = HtConfiguration::config();
+ selected_collections = collections;
+ //collectionList = &templist;
+ limitTo = 0;
+ excludeFrom = 0;
+ // needExcerpt = 0;
+ templateError = 0;
+
+ maxStars = config->Value("max_stars");
+ // Scores start inverted (max at -DBL_MAX, min at DBL_MAX) so the
+ // first real score seen replaces both extremes.
+ maxScore = -DBL_MAX;
+ minScore = DBL_MAX;
+ setupImages();
+ setupTemplates();
+
+ if (!templates.createFromString(config->Find("template_map")))
+ {
+ // Error in createFromString.
+ // Let's try the default template_map
+
+ config->Add("template_map", "Long builtin-long builtin-long Short builtin-short builtin-short");
+ if (!templates.createFromString(config->Find("template_map")))
+ {
+ // Unrecoverable Error
+ // (No idea why this would happen)
+ templateError = 1;
+ }
+ }
+
+ currentTemplate = templates.get(config->Find("template_name"));
+ if (!currentTemplate)
+ {
+ //
+ // Must have been some error. Resort to the builtin-long (slot 0)
+ //
+ currentTemplate = (Template *) templates.templates[0];
+ }
+ if (!currentTemplate)
+ {
+ //
+ // Another error!? Time to bail out...
+ //
+ templateError = 1;
+ }
+ // if (mystrcasestr(currentTemplate->getMatchTemplate(), "excerpt"))
+ // needExcerpt = 1;
+}
+
+//*****************************************************************************
+// Destructor: nothing to release here; the document database close is
+// intentionally left commented out (handled elsewhere).
+ResultFetch::~ResultFetch()
+{
+ // docDB.Close();
+}
+
+//*****************************************************************************
+//
+//void
+//
+// fetch: validates the configured sort method, builds the match list,
+// optionally logs the search, and fills in the template variables via
+// setVariables().  Returns the match List on success, or NULL when the
+// sort method is invalid or there are no matches.  Unlike htsearch's
+// Display class this does NOT stream HTML to stdout — the library
+// caller consumes the returned list instead (see the large commented
+// block below, kept from the original Display code).
+//
+List *
+ResultFetch::fetch()
+{
+
+ int pageNumber = 1;
+
+ HtConfiguration *config = HtConfiguration::config();
+ int good_sort = 0;
+ good_sort = ResultMatch::setSortType(config->Find("sort"));
+ if (!good_sort)
+ {
+ // Must temporarily stash the message in a String, since
+ // displaySyntaxError will overwrite the static temp used in form.
+
+ String s(form("No such sort method: `%s'", (const char *) config->Find("sort")));
+
+ displaySyntaxError(s);
+ //return;
+ return(NULL);
+ }
+
+ List *matches = buildMatchList();
+ //int currentMatch = 0;
+ //int numberDisplayed = 0;
+ ResultMatch *match = 0;
+ int number = 0;
+ number = config->Value("matches_per_page");
+ if (number <= 0)
+ number = 10;
+ //int startAt = (pageNumber - 1) * number;
+
+ if (config->Boolean("logging"))
+ {
+ logSearch(pageNumber, matches);
+ }
+
+ setVariables(pageNumber, matches);
+
+ //
+ // The first match is guaranteed to have the highest score of
+ // all the matches. We use this to compute the number of stars
+ // to display for all the other matches.
+ //
+ match = (ResultMatch *) (*matches)[0];
+ if (!match)
+ {
+ //
+ // No matches.
+ //
+ delete matches;
+ if (config->Boolean("nph"))
+ {
+ //cout << "HTTP/1.0 200 OK\r\n";
+ }
+ //cout << "Content-type: text/html\r\n\r\n";
+
+ //displayNomatch();
+ //return;
+ return(NULL);
+ }
+ // maxScore = match->getScore(); // now done in buildMatchList()
+
+ if (config->Boolean("nph"))
+ {
+ //cout << "HTTP/1.0 200 OK\r\n";
+ }
+ //cout << "Content-type: text/html\r\n\r\n";
+
+ // Look for a wrapper file and split it into header/footer around the
+ // HTSEARCH_RESULTS marker (accepting $HTSEARCH_RESULTS, ${...} or
+ // $(...) spellings).
+ // NOTE(review): the wrapper String allocated below is never deleted
+ // on the early "return(matches)" path — potential leak; confirm
+ // whether intentional for the library build.
+ String wrap_file = config->Find("search_results_wrapper");
+ String *wrapper = 0;
+ char *header = 0, *footer = 0;
+ if (wrap_file.length())
+ {
+ wrapper = readFile(wrap_file.get());
+ if (wrapper && wrapper->length())
+ {
+ char wrap_sepr[] = "HTSEARCH_RESULTS";
+ char *h = wrapper->get();
+ char *p = strstr(h, wrap_sepr);
+ if (p)
+ {
+ if (p > h && p[-1] == '$')
+ {
+ footer = p + strlen(wrap_sepr);
+ header = h;
+ p[-1] = '\0';
+ }
+ else if (p > h + 1 && p[-2] == '$' &&
+ (p[-1] == '(' || p[-1] == '{') &&
+ (p[strlen(wrap_sepr)] == ')' || p[strlen(wrap_sepr)] == '}'))
+ {
+ footer = p + strlen(wrap_sepr) + 1;
+ header = h;
+ p[-2] = '\0';
+ }
+ }
+ }
+ }
+ if (header)
+ {
+ //expandVariables(header);
+ }
+ else
+ {
+ //displayHeader();
+ }
+
+
+ //neal
+ return(matches);
+
+ /*
+
+ //
+ // Display the window of matches requested.
+ //
+ if (!currentTemplate->getStartTemplate().empty())
+ {
+ expandVariables(currentTemplate->getStartTemplate());
+ }
+
+ matches->Start_Get();
+ while ((match = (ResultMatch *) matches->Get_Next()) && numberDisplayed < number)
+ {
+ if (currentMatch >= startAt)
+ {
+ // DocumentRef *ref = docDB[match->getID()];
+ Collection *collection = match->getCollection();
+ DocumentRef *ref = collection->getDocumentRef(match->getID());
+ if (!ref || ref->DocState() != Reference_normal)
+ continue; // The document isn't present or shouldn't be displayed
+ ref->DocAnchor(match->getAnchor());
+ ref->DocScore(match->getScore());
+ fetchMatch(match, ref, currentMatch + 1);
+ numberDisplayed++;
+ delete ref;
+ }
+ currentMatch++;
+ }
+
+ if (!currentTemplate->getEndTemplate().empty())
+ {
+ expandVariables(currentTemplate->getEndTemplate());
+ }
+ if (footer)
+ {
+ //expandVariables(footer);
+ }
+ else
+ {
+ //displayFooter();
+ }
+
+ if (wrapper)
+ delete wrapper;
+ delete matches;
+
+ */
+}
+
+//*****************************************************************************
+// Return true if the specified URL should be counted towards the results.
+// Return 1 if the given URL passes both the "restrict" (limitTo) and
+// "exclude" (excludeFrom) pattern filters, 0 otherwise.  A null filter
+// pointer means that filter is not in effect.
+int
+ResultFetch::includeURL(const String & url)
+{
+
+ if (limitTo && limitTo->match(url, 1, 0) == 0)
+ return 0;
+ else
+ {
+
+ if (excludeFrom && excludeFrom->match(url, 0, 0) != 0)
+ return 0;
+ else
+ return 1;
+ }
+}
+
+//*****************************************************************************
+//void
+//
+// fetchMatch: populates the per-match template variables (URL, TITLE,
+// SCORE, EXCERPT, stars, dates, descriptions, ...) for one result and
+// returns a pointer to the member dictionary `vars`.
+// NOTE(review): the returned Dictionary* aliases a member, so each call
+// overwrites the previous match's values — callers must consume it
+// before the next call.
+//
+Dictionary *
+ResultFetch::fetchMatch(ResultMatch * match, DocumentRef * ref, int current)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ String *str = 0;
+
+ char *url = NULL;
+
+ url = form("%s", ref->DocURL());
+
+ vars.Add("URL", new String(url));
+
+ int iA = ref->DocAnchor();
+
+ String *anchor = 0;
+ int fanchor = 0;
+ if (iA > 0) // if an anchor was found
+ {
+ List *anchors = ref->DocAnchors();
+ if (anchors->Count() >= iA)
+ {
+ anchor = new String();
+ fanchor = 1;
+ *anchor << "#" << ((String *) (*anchors)[iA - 1])->get();
+ vars.Add("ANCHOR", anchor);
+ }
+ }
+
+ //
+ // no condition for determining excerpt any more:
+ // we need it anyway to see if an anchor is relevant
+ //
+ int first = -1;
+ String urlanchor(url);
+ if (anchor)
+ urlanchor << anchor;
+ vars.Add("EXCERPT", excerpt(match, ref, urlanchor, fanchor, first));
+ //
+ // anchor only relevant if an excerpt was found, i.e.,
+ // the search expression matches the body of the document
+ // instead of only META keywords.
+ //
+ if (first < 0)
+ {
+ vars.Remove("ANCHOR");
+ }
+
+ vars.Add("METADESCRIPTION", new String(ref->DocMetaDsc()));
+ vars.Add("SCORE", new String(form("%f", ref->DocScore())));
+ vars.Add("CURRENT", new String(form("%d", current)));
+ // Title fallback: either the URL's trailing filename or the
+ // configured no_title_text, depending on configuration.
+ char *title = ref->DocTitle();
+ if (!title || !*title)
+ {
+ if (strcmp(config->Find("no_title_text"), "filename") == 0)
+ {
+ // use actual file name
+ title = strrchr(url, '/');
+ if (title)
+ {
+ title++; // Skip slash
+ str = new String(form("[%s]", title));
+ }
+ else
+ // URL without '/' ??
+ str = new String("[No title]");
+ }
+ else
+ // use configure 'no title' text
+ str = new String(config->Find("no_title_text"));
+ }
+ else
+ str = new String(title);
+ vars.Add("TITLE", str);
+ vars.Add("STARSRIGHT", generateStars(ref, 1));
+ vars.Add("STARSLEFT", generateStars(ref, 0));
+ vars.Add("SIZE", new String(form("%d", ref->DocSize())));
+ vars.Add("SIZEK", new String(form("%d", (ref->DocSize() + 1023) / 1024)));
+
+ // PERCENT is the score normalized into 1..100 against the min/max
+ // scores seen for this result set; clamped to at least 1.
+ if (maxScore != 0 && maxScore != minScore)
+ {
+ int percent = (int) ((ref->DocScore() - minScore) * 100 / (maxScore - minScore));
+ if (percent <= 0)
+ percent = 1;
+ vars.Add("PERCENT", new String(form("%d", percent)));
+ }
+ else
+ vars.Add("PERCENT", new String("100"));
+
+ // MODIFIED: format the document timestamp with date_format (or an
+ // ISO-8601/locale default).  NOTE(review): setlocale(LC_TIME, ...)
+ // is set here but never restored — confirm this is acceptable.
+ {
+ str = new String();
+ char buffer[100];
+ time_t t = ref->DocTime();
+ if (t)
+ {
+ struct tm *tm = localtime(&t);
+ String datefmt = config->Find("date_format");
+ const String locale = config->Find("locale");
+ if (datefmt.empty())
+ {
+ if (config->Boolean("iso_8601"))
+ datefmt = "%Y-%m-%d %H:%M:%S %Z";
+ else
+ datefmt = "%x";
+ }
+ if (!locale.empty())
+ {
+ setlocale(LC_TIME, locale);
+ }
+ strftime(buffer, sizeof(buffer), (char *) datefmt, tm);
+ *str << buffer;
+ }
+ vars.Add("MODIFIED", str);
+ }
+
+ vars.Add("HOPCOUNT", new String(form("%d", ref->DocHopCount())));
+ vars.Add("DOCID", new String(form("%d", ref->DocID())));
+ vars.Add("BACKLINKS", new String(form("%d", ref->DocBackLinks())));
+
+ // DESCRIPTIONS is every known description joined with <br>;
+ // DESCRIPTION is just the first one (empty when none).
+ {
+ str = new String();
+ List *list = ref->Descriptions();
+ int n = list->Count();
+ for (int i = 0; i < n; i++)
+ {
+ *str << ((String *) (*list)[i])->get() << "<br>\n";
+ }
+ vars.Add("DESCRIPTIONS", str);
+ String *description = new String();
+ if (list->Count())
+ *description << ((String *) (*list)[0]);
+ vars.Add("DESCRIPTION", description);
+ }
+
+ // Check whether a per-URL template pattern applies; the result is
+ // currently unused (the display code below is commented out).
+ int index = 0;
+ int length = 0;
+ int status = -1;
+ if (URLtemplate.hasPattern())
+ status = URLtemplate.FindFirst(ref->DocURL(), index, length);
+
+/*
+ if (status >= 0 && index >= 0)
+ displayParsedFile(((String *) URLtemplateList[index])->get());
+ else
+ expandVariables(currentTemplate->getMatchTemplate());
+
+
+
+ int vars_count = vars.Count();
+ vars.Start_Get();
+
+ String key;
+ String * value;
+
+ for(int i = 0; i < vars_count; i++)
+ {
+ key = vars.Get_Next();
+ value = (String *) vars[key];
+
+ cout << key.get() << "[" << value->get() << "]" << endl;
+ cout.flush();
+ }
+
+*/
+
+ return(&vars);
+
+}
+
+//*****************************************************************************
+// setVariables: fills the member dictionary `vars` with all the
+// search-level template variables (match counts, paging, form controls,
+// collection checkboxes, sort/method <select> HTML, and any extra
+// variables whitelisted by allow_in_form).  pageNumber is 1-based;
+// matches may be NULL (treated as zero matches).
+void
+ResultFetch::setVariables(int pageNumber, List * matches)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ String tmp;
+ int i;
+ int nMatches = 0;
+
+ if (matches)
+ nMatches = matches->Count();
+
+ // Derive the page count from matches_per_page, capped by
+ // maximum_pages and floored at 1.
+ int matchesPerPage = config->Value("matches_per_page");
+ if (matchesPerPage <= 0)
+ matchesPerPage = 10;
+ int nPages = (nMatches + matchesPerPage - 1) / matchesPerPage;
+
+ if (nPages > config->Value("maximum_pages", 10))
+ nPages = config->Value("maximum_pages", 10);
+ if (nPages < 1)
+ nPages = 1; // We always have at least one page...
+ vars.Add("MATCHES_PER_PAGE", new String(config->Find("matches_per_page")));
+ vars.Add("MAX_STARS", new String(config->Find("max_stars")));
+ vars.Add("CONFIG", new String(config->Find("config")));
+ vars.Add("VERSION", new String(config->Find("version")));
+ vars.Add("RESTRICT", new String(config->Find("restrict")));
+ vars.Add("EXCLUDE", new String(config->Find("exclude")));
+ vars.Add("KEYWORDS", new String(config->Find("keywords")));
+ if (mystrcasecmp(config->Find("match_method"), "and") == 0)
+ vars.Add("MATCH_MESSAGE", new String("all"));
+ else if (mystrcasecmp(config->Find("match_method"), "or") == 0)
+ vars.Add("MATCH_MESSAGE", new String("some"));
+ vars.Add("MATCHES", new String(form("%d", nMatches)));
+ vars.Add("PLURAL_MATCHES",
+ new String((nMatches == 1) ? (char *) "" : (const char *) config->Find("plural_suffix")));
+ vars.Add("PAGE", new String(form("%d", pageNumber)));
+ vars.Add("PAGES", new String(form("%d", nPages)));
+ vars.Add("FIRSTDISPLAYED", new String(form("%d", (pageNumber - 1) * matchesPerPage + 1)));
+ if (nPages > 1)
+ vars.Add("PAGEHEADER", new String(config->Find("page_list_header")));
+ else
+ vars.Add("PAGEHEADER", new String(config->Find("no_page_list_header")));
+
+ i = pageNumber * matchesPerPage;
+ if (i > nMatches)
+ i = nMatches;
+ vars.Add("LASTDISPLAYED", new String(form("%d", i)));
+
+ if (config->Find("script_name").length() != 0)
+ {
+ vars.Add("CGI", new String(config->Find("script_name")));
+ }
+ else
+ {
+ vars.Add("CGI", new String(getenv("SCRIPT_NAME")));
+ }
+ vars.Add("STARTYEAR", new String(config->Find("startyear")));
+ vars.Add("STARTMONTH", new String(config->Find("startmonth")));
+ vars.Add("STARTDAY", new String(config->Find("startday")));
+ vars.Add("ENDYEAR", new String(config->Find("endyear")));
+ vars.Add("ENDMONTH", new String(config->Find("endmonth")));
+ vars.Add("ENDDAY", new String(config->Find("endday")));
+
+ String *str;
+ // The format is hard-wired here for the library build; the CGI input
+ // lookup is kept commented out for reference.
+ //char *format = input->get("format");
+ char *format = "builtin-long";
+ String *in;
+
+ vars.Add("SELECTED_FORMAT", new String(format));
+
+ // Build the <select> of output formats from the template lists.
+ str = new String();
+ *str << "<select name=\"format\">\n";
+ for (i = 0; i < templates.displayNames.Count(); i++)
+ {
+ in = (String *) templates.internalNames[i];
+ *str << "<option value=\"" << in->get() << '"';
+ if (format && mystrcasecmp(in->get(), format) == 0)
+ {
+ *str << " selected";
+ }
+ *str << '>' << ((String *) templates.displayNames[i])->get() << '\n';
+ }
+ *str << "</select>\n";
+ vars.Add("FORMAT", str);
+
+ // Build the <select> of match methods (value/label pairs).
+ str = new String();
+ QuotedStringList ml(config->Find("method_names"), " \t\r\n");
+ *str << "<select name=\"method\">\n";
+ for (i = 0; i < ml.Count(); i += 2)
+ {
+ *str << "<option value=\"" << ml[i] << '"';
+ if (mystrcasecmp(ml[i], config->Find("match_method")) == 0)
+ *str << " selected";
+ *str << '>' << ml[i + 1] << '\n';
+ }
+ *str << "</select>\n";
+ vars.Add("METHOD", str);
+
+ vars.Add("SELECTED_METHOD", new String(config->Find("match_method")));
+
+ ////////////////// Multiple database support //////////////////////
+ // Emit collection table. Ensure that previously selected collections
+ // are "checked".
+ // Collections are specified in the config file with the
+ // "collection_names" attribute. An example of the corresponding snippet
+ // in the config file is as follows:
+ //
+ // collection_names: htdig_docs htdig_bugs
+ //
+ // htdig_bugs and htdig_docs are the two collections (databases) and
+ // their corresponding config files are: $CONFIG_DIR/htdig_bugs.conf and
+ // $CONFIG_DIR/htdig_docs.conf respectively.
+ //
+ QuotedStringList clist(config->Find("collection_names"), " \t\r\n");
+ for (i = 0; i < clist.Count(); i++)
+ {
+ String config_name = clist[i];
+
+ for (int j = 0; j < collectionList.Count(); j++)
+ {
+ if (strcmp(config_name.get(), collectionList[j]) == 0)
+ {
+ str = new String();
+ *str << "checked";
+ String collection_id = "COLLECTION_";
+ collection_id << config_name;
+ vars.Add(collection_id, str);
+ break;
+ }
+ }
+ }
+
+ ////////////////// Multiple database support //////////////////////
+
+ // Build the <select> of sort methods; date/time spellings (and their
+ // "rev" prefixed variants) are treated as equivalent when deciding
+ // which option is pre-selected.
+ str = new String();
+ QuotedStringList sl(config->Find("sort_names"), " \t\r\n");
+ const String st = config->Find("sort");
+ StringMatch datetime;
+ datetime.IgnoreCase();
+ datetime.Pattern("date|time");
+ *str << "<select name=\"sort\">\n";
+ for (i = 0; i < sl.Count(); i += 2)
+ {
+ *str << "<option value=\"" << sl[i] << '"';
+ if (mystrcasecmp(sl[i], st) == 0 ||
+ datetime.Compare(sl[i]) && datetime.Compare(st) ||
+ mystrncasecmp(sl[i], st, 3) == 0 &&
+ datetime.Compare(sl[i] + 3) && datetime.Compare(st.get() + 3))
+ *str << " selected";
+ *str << '>' << sl[i + 1] << '\n';
+ }
+ *str << "</select>\n";
+ vars.Add("SORT", str);
+ vars.Add("SELECTED_SORT", new String(st));
+
+ //
+ // If a paged output is required, set the appropriate variables
+ //
+ if (nPages > 1)
+ {
+ if (pageNumber > 1)
+ {
+ str = new String("<a href=\"");
+ tmp = 0;
+ createURL(tmp, pageNumber - 1);
+ *str << tmp << "\">" << config->Find("prev_page_text") << "</a>";
+ }
+ else
+ {
+ str = new String(config->Find("no_prev_page_text"));
+ }
+ vars.Add("PREVPAGE", str);
+
+ if (pageNumber < nPages)
+ {
+ str = new String("<a href=\"");
+ tmp = 0;
+ createURL(tmp, pageNumber + 1);
+ *str << tmp << "\">" << config->Find("next_page_text") << "</a>";
+ }
+ else
+ {
+ str = new String(config->Find("no_next_page_text"));
+ }
+ vars.Add("NEXTPAGE", str);
+
+ // PAGELIST: numbered page links, using page_number_text /
+ // no_page_number_text labels and cycling separators.
+ str = new String();
+ char *p;
+ QuotedStringList pnt(config->Find("page_number_text"), " \t\r\n");
+ QuotedStringList npnt(config->Find("no_page_number_text"), " \t\r\n");
+ QuotedStringList sep(config->Find("page_number_separator"), " \t\r\n");
+ if (nPages > config->Value("maximum_page_buttons", 10))
+ nPages = config->Value("maximum_page_buttons", 10);
+ for (i = 1; i <= nPages; i++)
+ {
+ if (i == pageNumber)
+ {
+ p = npnt[i - 1];
+ if (!p)
+ p = form("%d", i);
+ *str << p;
+ }
+ else
+ {
+ p = pnt[i - 1];
+ if (!p)
+ p = form("%d", i);
+ *str << "<a href=\"";
+ tmp = 0;
+ createURL(tmp, i);
+ *str << tmp << "\">" << p << "</a>";
+ }
+ if (i != nPages && sep.Count() > 0)
+ *str << sep[(i - 1) % sep.Count()];
+ else if (i != nPages && sep.Count() <= 0)
+ *str << " ";
+ }
+ vars.Add("PAGELIST", str);
+ }
+ // Expose any config attributes whitelisted in allow_in_form as
+ // uppercased template variables.
+ StringList form_vars(config->Find("allow_in_form"), " \t\r\n");
+ String *key;
+ for (i = 0; i < form_vars.Count(); i++)
+ {
+ if (!config->Find(form_vars[i]).empty())
+ {
+ key = new String(form_vars[i]);
+ key->uppercase();
+ vars.Add(key->get(), new String(config->Find(form_vars[i])));
+ }
+ }
+}
+
+//*****************************************************************************
+// createURL: appends a self-referencing CGI URL for the given page to
+// `url` — script name, '?', then one config= pair per selected
+// collection.  The remaining query parameters (method, sort, page,
+// etc.) are kept commented out from the original CGI code since the
+// library build has no `input` object.
+// NOTE(review): pageNumber is therefore currently unused — the
+// "page=" emission is inside the commented block.
+void
+ResultFetch::createURL(String & url, int pageNumber)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ String s;
+ int i;
+//#define encodeInput(name) (s = input->get(name), encodeURL(s), s.get())
+
+ if (!config->Find("script_name").empty())
+ {
+ url << config->Find("script_name");
+ }
+ else
+ {
+ url << getenv("SCRIPT_NAME");
+ }
+
+ url << '?';
+
+ //if (input->exists("restrict"))
+ // url << "restrict=" << encodeInput("restrict") << '&';
+ //if (input->exists("exclude"))
+ // url << "exclude=" << encodeInput("exclude") << '&';
+ // Not needed: The next loop below handles this output
+ //if (input->exists("config"))
+ // url << "config=" << encodeInput("config") << '&';
+
+ // Put out all specified collections. If none selected, resort to
+ // default behaviour.
+ char *config_name = collectionList[0];
+ String config_encoded;
+ if (config_name && config_name[0] == '\0')
+ config_name = NULL;
+
+ if (config_name)
+ {
+ for (i = 0; i < collectionList.Count(); i++)
+ {
+ config_name = collectionList[i];
+ config_encoded = config_name;
+ encodeURL(config_encoded);
+ url << "config=" << config_encoded << '&';
+ }
+ }
+/*
+ if (input->exists("method"))
+ url << "method=" << encodeInput("method") << '&';
+ if (input->exists("format"))
+ url << "format=" << encodeInput("format") << '&';
+ if (input->exists("sort"))
+ url << "sort=" << encodeInput("sort") << '&';
+ if (input->exists("matchesperpage"))
+ url << "matchesperpage=" << encodeInput("matchesperpage") << '&';
+ if (input->exists("keywords"))
+ url << "keywords=" << encodeInput("keywords") << '&';
+ if (input->exists("words"))
+ url << "words=" << encodeInput("words") << '&';
+ if (input->exists("startyear"))
+ url << "startyear=" << encodeInput("startyear") << '&';
+ if (input->exists("startmonth"))
+ url << "startmonth=" << encodeInput("startmonth") << '&';
+ if (input->exists("startday"))
+ url << "startday=" << encodeInput("startday") << '&';
+ if (input->exists("endyear"))
+ url << "endyear=" << encodeInput("endyear") << '&';
+ if (input->exists("endmonth"))
+ url << "endmonth=" << encodeInput("endmonth") << '&';
+ if (input->exists("endday"))
+ url << "endday=" << encodeInput("endday") << '&';
+ StringList form_vars(config->Find("allow_in_form"), " \t\r\n");
+ for (i = 0; i < form_vars.Count(); i++)
+ {
+ if (input->exists(form_vars[i]))
+ {
+ s = form_vars[i];
+ encodeURL(s); // shouldn't be needed, but just in case
+ url << s << '=';
+ url << encodeInput(form_vars[i]) << '&';
+ }
+ }
+ url << "page=" << pageNumber;
+
+*/
+
+}
+
+//*****************************************************************************
+// Emit the configured search_results_header file through the template
+// variable expander.
+void
+ResultFetch::displayHeader()
+{
+ HtConfiguration *config = HtConfiguration::config();
+ displayParsedFile(config->Find("search_results_header"));
+}
+
+//*****************************************************************************
+// Emit the configured search_results_footer file through the template
+// variable expander.
+void
+ResultFetch::displayFooter()
+{
+ HtConfiguration *config = HtConfiguration::config();
+ displayParsedFile(config->Find("search_results_footer"));
+}
+
+//*****************************************************************************
+// Emit the configured nothing_found_file through the template
+// variable expander.
+void
+ResultFetch::displayNomatch()
+{
+ HtConfiguration *config = HtConfiguration::config();
+ displayParsedFile(config->Find("nothing_found_file"));
+}
+
+//*****************************************************************************
+// Report a query syntax error: set up the template variables (with no
+// matches), expose the message as SYNTAXERROR, and render the
+// configured syntax_error_file.
+void
+ResultFetch::displaySyntaxError(const String & message)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ if (config->Boolean("nph"))
+ {
+ // cout << "HTTP/1.0 200 OK\r\n";
+ }
+ //cout << "Content-type: text/html\r\n\r\n";
+
+ setVariables(0, 0);
+ vars.Add("SYNTAXERROR", new String(message));
+ displayParsedFile(config->Find("syntax_error_file"));
+}
+
+//*****************************************************************************
+// Read the named file line by line and pass each line through
+// expandVariables() for $var substitution.  A missing file is only
+// reported when debugging is enabled.
+void
+ResultFetch::displayParsedFile(const String & filename)
+{
+ FILE *fl = fopen(filename, "r");
+ char buffer[1000];
+
+ while (fl && fgets(buffer, sizeof(buffer), fl))
+ {
+ expandVariables(buffer);
+ }
+ if (fl)
+ fclose(fl);
+ else if (debug)
+ cerr << "displayParsedFile: Can't open " << filename << endl;
+}
+
+//*****************************************************************************
+// If the result templates need to depend on the URL of the match, we need
+// an efficient way to determine which template file to use. To do this, we
+// will build a StringMatch object with all the URL patterns and also
+// a List parallel to that pattern that contains the actual template file
+// names to use for each URL.
+//
+// Parse template_patterns (pattern/template-file pairs) into the
+// URLtemplate matcher and the parallel URLtemplateList of file names.
+// NOTE: strtok modifies the templatePatterns buffer in place.
+void
+ResultFetch::setupTemplates()
+{
+ HtConfiguration *config = HtConfiguration::config();
+ String templatePatterns = config->Find("template_patterns");
+ if (!templatePatterns.empty())
+ {
+ //
+ // The templatePatterns string will have pairs of values. The first
+ // value of a pair will be a pattern, the second value will be a
+ // result template file name.
+ //
+ char *token = strtok(templatePatterns, " \t\r\n");
+ String pattern;
+ while (token)
+ {
+ //
+ // First token is a pattern...
+ //
+ pattern << token << '|';
+
+ //
+ // Second token is an URL
+ //
+ token = strtok(0, " \t\r\n");
+ URLtemplateList.Add(new String(token));
+ if (token)
+ token = strtok(0, " \t\r\n");
+ }
+ // Drop the trailing '|' before installing the alternation pattern.
+ pattern.chop(1);
+ URLtemplate.Pattern(pattern);
+ }
+}
+
+//*****************************************************************************
+// If the star images need to depend on the URL of the match, we need
+// an efficient way to determine which image to use. To do this, we
+// will build a StringMatch object with all the URL patterns and also
+// a List parallel to that pattern that contains the actual images to
+// use for each URL.
+//
+// Parse star_patterns (pattern/image-URL pairs) into the URLimage
+// matcher and the parallel URLimageList of image URLs.
+// NOTE: strtok modifies the starPatterns buffer in place.
+void
+ResultFetch::setupImages()
+{
+ HtConfiguration *config = HtConfiguration::config();
+ String starPatterns = config->Find("star_patterns");
+ if (!starPatterns.empty())
+ {
+ //
+ // The starPatterns string will have pairs of values. The first
+ // value of a pair will be a pattern, the second value will be an
+ // URL to an image.
+ //
+ char *token = strtok(starPatterns, " \t\r\n");
+ String pattern;
+ while (token)
+ {
+ //
+ // First token is a pattern...
+ //
+ pattern << token << '|';
+
+ //
+ // Second token is an URL
+ //
+ token = strtok(0, " \t\r\n");
+ URLimageList.Add(new String(token));
+ if (token)
+ token = strtok(0, " \t\r\n");
+ }
+ // Drop the trailing '|' before installing the alternation pattern.
+ pattern.chop(1);
+ URLimage.Pattern(pattern);
+ }
+}
+
+//*****************************************************************************
+// Build the HTML star rating for one document: nStars filled star
+// images scaled from the document's score within [minScore, maxScore],
+// padded with blank images up to maxStars.  `right` selects whether
+// the blanks go before (right-aligned) or after (left-aligned) the
+// stars.  Also publishes NSTARS as a template variable.  Returns an
+// empty String when use_star_image is off (caller owns the result).
+String *
+ResultFetch::generateStars(DocumentRef * ref, int right)
+{
+ int i;
+ String *result = new String();
+ HtConfiguration *config = HtConfiguration::config();
+ if (!config->Boolean("use_star_image", 1))
+ return result;
+
+ String image = config->Find("star_image");
+ const String blank = config->Find("star_blank");
+ double score;
+
+ if (maxScore != 0 && maxScore != minScore)
+ {
+ score = (ref->DocScore() - minScore) / (maxScore - minScore);
+ if (debug)
+ cerr << "generateStars: doc, min, max " << ref->
+ DocScore() << ", " << minScore << ", " << maxScore << endl;
+ }
+ else
+ {
+ // Degenerate range: treat this document as the maximum.
+ // NOTE(review): this mutates the member maxScore as a side effect.
+ maxScore = ref->DocScore();
+ score = 1;
+ }
+ // Map score in [0,1] onto 1..maxStars, rounding to nearest.
+ int nStars = int (score * (maxStars - 1) + 0.5) + 1;
+
+ vars.Add("NSTARS", new String(form("%.d", nStars)));
+ if (debug)
+ cerr << "generateStars: nStars " << nStars << " of " << maxStars << endl;
+
+ if (right)
+ {
+ for (i = 0; i < maxStars - nStars; i++)
+ {
+ *result << "<img src=\"" << blank << "\" alt=\" \">";
+ }
+ }
+
+ // Per-URL star image override via star_patterns, when configured.
+ int match = 0;
+ int length = 0;
+ int status;
+
+ if (URLimage.hasPattern())
+ status = URLimage.FindFirst(ref->DocURL(), match, length);
+ else
+ status = -1;
+
+ if (status >= 0 && match >= 0)
+ {
+ image = ((String *) URLimageList[match])->get();
+ }
+
+ for (i = 0; i < nStars; i++)
+ {
+ *result << "<img src=\"" << image << "\" alt=\"*\">";
+ }
+
+ if (!right)
+ {
+ for (i = 0; i < maxStars - nStars; i++)
+ {
+ *result << "<img src=\"" << blank << "\" alt=\" \">";
+ }
+ }
+
+ return result;
+}
+
+//*****************************************************************************
+// Slurp the named file into a newly allocated String (caller owns it).
+// Returns an empty String when the file cannot be opened; the failure
+// is only reported when debugging is enabled.
+String *
+ResultFetch::readFile(const String & filename)
+{
+ FILE *fl;
+ String *s = new String();
+ char line[1024];
+
+ fl = fopen(filename, "r");
+ while (fl && fgets(line, sizeof(line), fl))
+ {
+ *s << line;
+ }
+ if (fl)
+ fclose(fl);
+ else if (debug)
+ cerr << "readFile: Can't open " << filename << endl;
+ return s;
+}
+
+//*****************************************************************************
+// expandVariables: stream the input to stdout, substituting template
+// variables.  Implemented as a small state machine recognizing
+// $name, $(name), ${name}, with optional leading codes after the '$':
+// '%' (URL-encode), '=' (URL-decode), '&'/'&amp;' (SGML-decode).
+// '\' escapes the next character literally.  Substitution itself is
+// delegated to outputVariable().
+void
+ResultFetch::expandVariables(const String & str_arg)
+{
+ const char *str = str_arg;
+ enum
+ {
+ StStart, StLiteral, StVarStart, StVarClose, StVarPlain, StGotVar
+ }
+ state = StStart;
+ String var = "";
+
+ while (str && *str)
+ {
+ switch (state)
+ {
+ case StStart:
+ if (*str == '\\')
+ state = StLiteral;
+ else if (*str == '$')
+ state = StVarStart;
+ else
+ {
+ cout << *str;
+ cout.flush();
+ }
+ break;
+ case StLiteral:
+ // Character following a backslash: emit verbatim.
+ cout << *str;
+ cout.flush();
+ state = StStart;
+ break;
+ case StVarStart:
+ if (*str == '%' || *str == '=')
+ var << *str; // code for URL-encoded/decoded variable
+ else if (*str == '&')
+ {
+ var << *str; // code for SGML-encoded variable
+ if (mystrncasecmp("&amp;", str, 5) == 0)
+ str += 4;
+ }
+ else if (*str == '(' || *str == '{')
+ state = StVarClose;
+ else if (isalnum(*str) || *str == '_' || *str == '-')
+ {
+ var << *str;
+ state = StVarPlain;
+ }
+ else
+ state = StStart;
+ break;
+ case StVarClose:
+ // Inside $(...) or ${...}: accumulate until the closer.
+ if (*str == ')' || *str == '}')
+ state = StGotVar;
+ else if (isalnum(*str) || *str == '_' || *str == '-')
+ var << *str;
+ else
+ state = StStart;
+ break;
+ case StVarPlain:
+ // Bare $name: the variable ends at the first non-name char.
+ if (isalnum(*str) || *str == '_' || *str == '-')
+ var << *str;
+ else
+ {
+ state = StGotVar;
+ continue;
+ }
+ break;
+ case StGotVar:
+ //
+ // We have a complete variable in var. Look it up and
+ // see if we can find a good replacement for it.
+ //
+ outputVariable(var);
+ var = "";
+ state = StStart;
+ continue;
+ }
+ str++;
+ }
+ if (state == StGotVar || state == StVarPlain)
+ {
+ //
+ // The end of string was reached, but we are still trying to
+ // put a variable together. Since we now have a complete
+ // variable, we will look up the value for it.
+ //
+ outputVariable(var);
+ }
+}
+
+//*****************************************************************************
+// outputVariable: resolve one template variable name (possibly
+// prefixed with encoding codes '&', '%', '=') against the vars
+// dictionary first, then the process environment, apply the requested
+// encodings right-to-left, and write the result to stdout.  Unknown
+// variables produce empty output.
+void
+ResultFetch::outputVariable(const String & var)
+{
+ String *temp;
+ String value = "";
+ const char *ev, *name;
+
+ // We have a complete variable name in var. Look it up and
+ // see if we can find a good replacement for it, either in our
+ // vars dictionary or in the environment variables.
+ name = var;
+ while (*name == '&' || *name == '%' || *name == '=')
+ name++;
+ temp = (String *) vars[name];
+ if (temp)
+ value = *temp;
+ else
+ {
+ ev = getenv(name);
+ if (ev)
+ value = ev;
+ }
+ // Walk back over the skipped code characters and apply each
+ // encoding/decoding in turn (innermost code applied first).
+ while (--name >= var.get() && value.length())
+ {
+ if (*name == '%')
+ encodeURL(value);
+ else if (*name == '&')
+ value = HtSGMLCodec::instance()->decode(value);
+ else // (*name == '=')
+ decodeURL(value);
+ }
+ cout << value;
+ cout.flush();
+}
+
+//*****************************************************************************
+List *
+ResultFetch::buildMatchList()
+{
+ HtConfiguration *config = HtConfiguration::config();
+ char *cpid;
+ String url;
+ ResultMatch *thisMatch;
+ SplitMatches matches(*config);
+ double backlink_factor = config->Double("backlink_factor");
+ double date_factor = config->Double("date_factor");
+ double backlink_score = 0;
+ double date_score = 0;
+ double base_score = 0;
+
+
+ // Additions made here by Mike Grommet ...
+
+ tm startdate; // structure to hold the startdate specified by the user
+ tm enddate; // structure to hold the enddate specified by the user
+ time_t now = time((time_t *) 0); // fill in all fields for mktime
+ tm *lt = localtime(&now); // - Gilles's fix
+ startdate = *lt;
+ enddate = *lt;
+
+ time_t eternity = ~(1 << (sizeof(time_t) * 8 - 1)); // will be the largest value holdable by a time_t
+ tm *endoftime; // the time_t eternity will be converted into a tm, held by this variable
+
+ time_t timet_startdate;
+ time_t timet_enddate;
+ int monthdays[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
+
+ // boolean to test to see if we need to build date information or not
+ int dategiven = ((config->Value("startmonth")) ||
+ (config->Value("startday")) ||
+ (config->Value("startyear")) ||
+ (config->Value("endmonth")) || (config->Value("endday")) || (config->Value("endyear")));
+
+ // find the end of time
+ endoftime = gmtime(&eternity);
+
+ if (dategiven) // user specified some sort of date information
+ {
+ // set up the startdate structure
+ // see man mktime for details on the tm structure
+ startdate.tm_sec = 0;
+ startdate.tm_min = 0;
+ startdate.tm_hour = 0;
+ startdate.tm_yday = 0;
+ startdate.tm_wday = 0;
+
+ // The concept here is that if a user did not specify a part of a date,
+ // then we will make assumtions...
+ // For instance, suppose the user specified Feb, 1999 as the start
+ // range, we take steps to make sure that the search range date starts
+ // at Feb 1, 1999,
+ // along these same lines: (these are in MM-DD-YYYY format)
+ // Startdates: Date Becomes
+ // 01-01 01-01-1970
+ // 01-1970 01-01-1970
+ // 04-1970 04-01-1970
+ // 1970 01-01-1970
+ // These things seem to work fine for start dates, as all months have
+ // the same first day however the ending date can't work this way.
+
+ if (config->Value("startmonth")) // form input specified a start month
+ {
+ startdate.tm_mon = config->Value("startmonth") - 1;
+ // tm months are zero based. They are passed in as 1 based
+ }
+ else
+ startdate.tm_mon = 0; // otherwise, no start month, default to 0
+
+ if (config->Value("startday")) // form input specified a start day
+ {
+ startdate.tm_mday = config->Value("startday");
+ // tm days are 1 based, they are passed in as 1 based
+ }
+ else
+ startdate.tm_mday = 1; // otherwise, no start day, default to 1
+
+ // year is handled a little differently... the tm_year structure
+ // wants the tm_year in a format of year - 1900.
+ // since we are going to convert these dates to a time_t,
+ // a time_t value of zero, the earliest possible date
+ // occurs Jan 1, 1970. If we allow dates < 1970, then we
+ // could get negative time_t values right???
+ // (barring minor timezone offsets west of GMT, where Epoch is 12-31-69)
+
+ if (config->Value("startyear")) // form input specified a start year
+ {
+ startdate.tm_year = config->Value("startyear") - 1900;
+ if (startdate.tm_year < 69 - 1900) // correct for 2-digit years 00-68
+ startdate.tm_year += 2000; // - Gilles's fix
+ if (startdate.tm_year < 0) // correct for 2-digit years 69-99
+ startdate.tm_year += 1900;
+ }
+ else
+ startdate.tm_year = 1970 - 1900;
+ // otherwise, no start day, specify start at 1970
+
+ // set up the enddate structure
+ enddate.tm_sec = 59; // allow up to last second of end day
+ enddate.tm_min = 59; // - Gilles's fix
+ enddate.tm_hour = 23;
+ enddate.tm_yday = 0;
+ enddate.tm_wday = 0;
+
+ if (config->Value("endmonth")) // form input specified an end month
+ {
+ enddate.tm_mon = config->Value("endmonth") - 1;
+ // tm months are zero based. They are passed in as 1 based
+ }
+ else
+ enddate.tm_mon = 11; // otherwise, no end month, default to 11
+
+ if (config->Value("endyear")) // form input specified a end year
+ {
+ enddate.tm_year = config->Value("endyear") - 1900;
+ if (enddate.tm_year < 69 - 1900) // correct for 2-digit years 00-68
+ enddate.tm_year += 2000; // - Gilles's fix
+ if (enddate.tm_year < 0) // correct for 2-digit years 69-99
+ enddate.tm_year += 1900;
+ }
+ else
+ enddate.tm_year = endoftime->tm_year;
+ // otherwise, no end year, specify end at the end of time allowable
+
+ // Months have different number of days, and this makes things more
+ // complicated than the startdate range.
+ // Following the example above, here is what we want to happen:
+ // Enddates: Date Becomes
+ // 04-31 04-31-endoftime->tm_year
+ // 05-1999 05-31-1999, may has 31 days... we want to search until the end of may so...
+ // 1999 12-31-1999, search until the end of the year
+
+ if (config->Value("endday")) // form input specified an end day
+ {
+ enddate.tm_mday = config->Value("endday");
+ // tm days are 1 based, they are passed in as 1 based
+ }
+ else
+ {
+ // otherwise, no end day, default to the end of the month
+ enddate.tm_mday = monthdays[enddate.tm_mon];
+ if (enddate.tm_mon == 1) // February, so check for leap year
+ if (((enddate.tm_year + 1900) % 4 == 0 &&
+ (enddate.tm_year + 1900) % 100 != 0) || (enddate.tm_year + 1900) % 400 == 0)
+ enddate.tm_mday += 1; // Feb. 29 - Gilles's fix
+ }
+
+ // Convert the tm values into time_t values.
+ // Web servers specify modification times in GMT, but htsearch
+ // displays these modification times in the server's local time zone.
+ // For consistency, we would prefer to select based on this same
+ // local time zone. - Gilles's fix
+
+ timet_startdate = mktime(&startdate);
+ timet_enddate = mktime(&enddate);
+
+ // I'm not quite sure what behavior I want to happen if
+ // someone reverses the start and end dates, and one of them is invalid.
+ // for now, if there is a completely invalid date on the start or end
+ // date, I will force the start date to time_t 0, and the end date to
+ // the maximum that can be handled by a time_t.
+
+ if (timet_startdate < 0)
+ timet_startdate = 0;
+ if (timet_enddate < 0)
+ timet_enddate = eternity;
+
+ // what if the user did something really goofy like choose an end date
+ // that's before the start date
+
+ if (timet_enddate < timet_startdate) // if so, then swap them so they are in order
+ {
+ time_t timet_temp = timet_enddate;
+ timet_enddate = timet_startdate;
+ timet_startdate = timet_temp;
+ }
+ }
+ else // no date was specifed, so plug in some defaults
+ {
+ timet_startdate = 0;
+ timet_enddate = eternity;
+ }
+
+ // ... MG
+
+
+ URLSeedScore adjustments(*config);
+
+ // If we knew where to pass it, this would be a good place to pass
+ // on errors from adjustments.ErrMsg().
+
+// Deal with all collections
+//
+ selected_collections->Start_Get();
+ Collection *collection = NULL;
+ while ((collection = (Collection *) selected_collections->Get_NextElement()))
+ {
+ ResultList *results = collection->getResultList();
+ if (results == NULL)
+ continue;
+
+ results->Start_Get();
+ while ((cpid = results->Get_Next()))
+ {
+ int id = atoi(cpid);
+
+ // DocumentRef *thisRef = docDB[id];
+
+ DocMatch *dm = results->find(cpid);
+ Collection *collection = NULL;
+ if (dm)
+ collection = dm->collection;
+ if (collection == NULL)
+ continue;
+ DocumentRef *thisRef = collection->getDocumentRef(id);
+
+ //
+ // If it wasn't there, then ignore it
+ //
+ if (thisRef == 0)
+ {
+ continue;
+ }
+
+ if (!includeURL(thisRef->DocURL()))
+ {
+ // Get rid of it to free the memory!
+ delete thisRef;
+
+ continue;
+ }
+
+ // Code added by Mike Grommet for date search ranges
+ // check for valid date range. toss it out if it isn't relevant.
+ if ((timet_startdate > 0 || enddate.tm_year < endoftime->tm_year) &&
+ (thisRef->DocTime() < timet_startdate || thisRef->DocTime() > timet_enddate))
+ {
+ delete thisRef;
+ continue;
+ }
+
+ thisMatch = ResultMatch::create();
+ thisMatch->setID(id);
+ thisMatch->setCollection(collection);
+
+ //
+ // Assign the incomplete score to this match. This score was
+ // computed from the word database only, no excerpt context was
+ // known at that time, or info about the document itself,
+ // so this still needs to be done.
+ //
+
+ // Moved up: DocMatch *dm = results->find(cpid);
+ double score = dm->score;
+
+ // We need to scale based on date relevance and backlinks
+ // Other changes to the score can happen now
+ // Or be calculated by the result match in getScore()
+
+ // This formula derived through experimentation
+ // We want older docs to have smaller values and the
+ // ultimate values to be a reasonable size (max about 100)
+
+ base_score = score;
+ if (date_factor != 0.0)
+ {
+ date_score = date_factor * ((thisRef->DocTime() * 1000.0 / (double) now) - 900);
+ score += date_score;
+ }
+
+ if (backlink_factor != 0.0)
+ {
+ int links = thisRef->DocLinks();
+ if (links == 0)
+ links = 1; // It's a hack, but it helps...
+
+ backlink_score = backlink_factor * (thisRef->DocBackLinks() / (double) links);
+ score += backlink_score;
+ }
+
+ if (debug)
+ {
+ cerr << thisRef->DocURL() << "\n";
+ }
+
+ thisMatch->setTime(thisRef->DocTime());
+ thisMatch->setTitle(thisRef->DocTitle());
+
+ score = adjustments.adjust_score(score, thisRef->DocURL());
+
+ // Get rid of it to free the memory!
+ delete thisRef;
+
+ score = log(1.0 + score);
+ thisMatch->setScore(score);
+ thisMatch->setAnchor(dm->anchor);
+
+ //
+ // Append this match to our list of matches.
+ //
+ matches.Add(thisMatch, url.get());
+
+ if (debug)
+ {
+ cerr << " base_score " << base_score << " date_score " << date_score <<
+ " backlink_score " << backlink_score << "\n";
+ cerr << " score " << score << "(" << thisMatch->
+ getScore() << "), maxScore " << maxScore << ", minScore " << minScore << endl;
+ }
+
+ if (maxScore < score)
+ {
+ if (debug)
+ cerr << "Set maxScore = score" << endl;
+ maxScore = score;
+ }
+ if (minScore > score)
+ {
+ if (debug)
+ cerr << "Set minScore = score" << endl;
+ minScore = score;
+ }
+ }
+ }
+
+ //
+ // Each sub-area is then sorted by relevance level.
+ //
+ List *matches_part; // Outside of loop to keep for-scope warnings away.
+ for (matches_part = matches.Get_First(); matches_part != 0; matches_part = matches.Get_Next())
+ sort(matches_part);
+
+ // Then all sub-lists are concatenated and put in a new list.
+ return matches.JoinedLists();
+}
+
+//*****************************************************************************
+// Build the excerpt (context snippet) shown for one search match.
+//
+// If the "use_meta_description" option is set and the document carries a
+// META description, that text is used as-is. Otherwise the stored document
+// head is loaded from the match's collection and a window of about
+// "excerpt_length" characters around the first matched search word is
+// extracted, with the matched words highlighted via hilight().
+//
+//   match     - result match; supplies the collection and its word pattern
+//   ref       - document record (meta description / stored head text)
+//   urlanchor - URL with anchor fragment, used when add_anchors_to_excerpt is on
+//   fanchor   - non-zero when an anchor link is available for the first hit
+//   first     - out: offset of the first matched word in the head text;
+//               forced to 0 when the top of the document is shown,
+//               negative when no word matched and the top is not shown
+//
+// Returns a newly allocated String; the caller owns and must delete it.
+String *
+ResultFetch::excerpt(ResultMatch * match, DocumentRef * ref, String urlanchor, int fanchor, int &first)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    // It is necessary to keep alive the String you .get() a char * from,
+    // as long as you use the char *.
+
+    //String head_string;
+
+    char *head;
+    int use_meta_description = 0;
+    Collection *collection = match->getCollection();
+
+    if (config->Boolean("use_meta_description", 0) && strlen(ref->DocMetaDsc()) != 0)
+    {
+        // Set the head to point to description
+        head = ref->DocMetaDsc();
+        use_meta_description = 1;
+    }
+    else
+    {
+        // docDB.ReadExcerpt(*ref);
+        collection->ReadExcerpt(*ref);
+        head = ref->DocHead();  // head points to the top
+    }
+
+    //head_string = HtSGMLCodec::instance()->decode(head);
+    //head = head_string.get();
+
+    int which, length;
+    char *temp = head;
+    String part;
+    String *text = new String("");
+
+    StringMatch *allWordsPattern = NULL;
+    if (collection)
+        allWordsPattern = collection->getSearchWordsPattern();
+    if (!allWordsPattern)
+        return text;            // no pattern available: return the empty excerpt
+
+    // htsearch displays the description when:
+    // 1) a description has been found
+    // 2) the option "use_meta_description" is set to true
+    // If previous conditions are false and "excerpt_show_top" is set to true
+    // it shows the whole head. Else, it acts as default.
+
+    if (config->Boolean("excerpt_show_top", 0) || use_meta_description || !allWordsPattern->hasPattern())
+        first = 0;
+    else
+        first = allWordsPattern->FindFirstWord(head, which, length);
+
+    if (first < 0 && config->Boolean("no_excerpt_show_top"))
+        first = 0;              // No excerpt, but we want to show the top.
+
+    if (first < 0)
+    {
+        //
+        // No excerpt available, don't show top, so display message
+        //
+        if (!config->Find("no_excerpt_text").empty())
+        {
+            *text << config->Find("no_excerpt_text");
+        }
+    }
+    else
+    {
+        int headLength = strlen(head);
+        // NOTE: this shadows the outer 'length' set by FindFirstWord above.
+        int length = config->Value("excerpt_length", 50);
+        char *start;
+        char *end;
+        WordType type(*config);
+
+        if (!config->Boolean("add_anchors_to_excerpt"))
+            // negate flag if it's on (anchor available)
+            fanchor = 0;
+
+        //
+        // Figure out where to start the excerpt. Basically we go back
+        // half the excerpt length from the first matched word
+        //
+        start = &temp[first] - length / 2;
+        if (start < temp)
+            start = temp;
+        else
+        {
+            *text << config->Find("start_ellipses");
+            // skip forward past a partial word so the excerpt starts cleanly
+            while (*start && type.IsStrictChar(*start))
+                start++;
+        }
+
+        //
+        // Figure out the end of the excerpt.
+        //
+        end = start + length;
+        if (end > temp + headLength)
+        {
+            end = temp + headLength;
+            *text << hilight(match, start, urlanchor, fanchor);
+        }
+        else
+        {
+            // extend to the end of the current word, then truncate the
+            // head buffer in place before highlighting the window
+            while (*end && type.IsStrictChar(*end))
+                end++;
+            *end = '\0';
+            *text << hilight(match, start, urlanchor, fanchor);
+            *text << config->Find("end_ellipses");
+        }
+    }
+    return text;
+}
+
+//*****************************************************************************
+// Highlight every occurrence of the search words within str_arg.
+//
+// Each hit is wrapped in the configured start_highlight/end_highlight
+// markup; the very first hit is additionally wrapped in an <a href>
+// link to urlanchor when fanchor is non-zero. All emitted text pieces
+// go through the SGML codec so entities stay correctly encoded.
+// Returns the highlighted copy; returns an empty String when the
+// collection provides no word pattern or search-word list.
+String ResultFetch::hilight(ResultMatch * match, const String & str_arg, const String & urlanchor, int fanchor)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    const String start_highlight = config->Find("start_highlight");
+    const String end_highlight = config->Find("end_highlight");
+    const char *str = str_arg;
+    String result;
+    int pos = 0;
+    int which, length;
+    WeightWord *ww;
+    int first = 1;
+    String s;
+// Re-encode a raw (p, l) span through the SGML codec, using 's' as scratch.
+#define SGMLencodedChars(p, l) (s = 0, s.append(p, l), HtSGMLCodec::instance()->decode(s))
+
+    result = 0;
+    Collection *collection = match->getCollection();
+    StringMatch *allWordsPattern = NULL;
+    if (collection)
+        allWordsPattern = collection->getSearchWordsPattern();
+    List *searchWords = NULL;
+    if (collection)
+        searchWords = collection->getSearchWords();
+    if (!allWordsPattern || !searchWords)
+        return result;
+
+    while (allWordsPattern->hasPattern() && (pos = allWordsPattern->FindFirstWord(str, which, length)) >= 0)
+    {
+        //result.append(str, pos);
+        result << SGMLencodedChars(str, pos);
+        // NOTE(review): 'ww' is looked up but never used afterwards.
+        ww = (WeightWord *) (*searchWords)[which];
+        result << start_highlight;
+        if (first && fanchor)
+            result << "<a href=\"" << urlanchor << "\">";
+        //result.append(str + pos, length);
+        result << SGMLencodedChars(str + pos, length);
+        if (first && fanchor)
+            result << "</a>";
+        result << end_highlight;
+        str += pos + length;    // continue scanning after this hit
+        first = 0;
+    }
+    //result.append(str);
+    // flush whatever follows the last hit
+    result << SGMLencodedChars(str, strlen(str));
+    return result;
+}
+
+//*****************************************************************************
+// Sort a list of ResultMatch objects in place.
+//
+// The comparison function is taken from the first match (all matches in
+// one list share the same sort method); the final order is reversed when
+// the "sort" configuration attribute starts with "rev".
+void
+ResultFetch::sort(List * matches)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    int numberOfMatches = matches->Count();
+    int i;
+
+    if (numberOfMatches <= 1)
+        return;                 // nothing to sort
+
+    // copy the list into a flat array for qsort
+    ResultMatch **array = new ResultMatch *[numberOfMatches];
+    for (i = 0; i < numberOfMatches; i++)
+    {
+        array[i] = (ResultMatch *) (*matches)[i];
+    }
+    matches->Release();         // empty the list without deleting the matches
+
+    qsort((char *) array, numberOfMatches, sizeof(ResultMatch *), array[0]->getSortFun());
+
+    const String st = config->Find("sort");
+    if (!st.empty() && mystrncasecmp("rev", st, 3) == 0)
+    {
+        // reversed order requested
+        for (i = numberOfMatches; --i >= 0;)
+            matches->Add(array[i]);
+    }
+    else
+    {
+        for (i = 0; i < numberOfMatches; i++)
+            matches->Add(array[i]);
+    }
+    delete[]array;
+}
+
+//*****************************************************************************
+// Log one search (host, config, match method, words, hit count, page,
+// referrer) to syslog.
+void
+ResultFetch::logSearch(int page, List * matches)
+{
+//Note: This is Posix and dependent on a running syslogd..
+//does not work for Win32
+//TODO: Look into using native windows system logs instead
+#ifndef _WIN32
+
+    HtConfiguration *config = HtConfiguration::config();
+    // Currently unused time_t t;
+    int nMatches = 0;
+    int level = LOG_LEVEL;
+    int facility = LOG_FACILITY;
+    char *host = getenv("REMOTE_HOST");
+    char *ref = getenv("HTTP_REFERER");
+
+    // fall back to the raw address, then to "-" placeholders
+    if (host == NULL)
+        host = getenv("REMOTE_ADDR");
+    if (host == NULL)
+        host = "-";
+
+    if (ref == NULL)
+        ref = "-";
+
+    if (matches)
+        nMatches = matches->Count();
+
+    openlog("htsearch", LOG_PID, facility);
+    syslog(level, "%s [%s] (%s) [%s] [%s] (%d/%s) - %d -- %s\n",
+           host,
+           input->exists("config") ? input->get("config") : "default",
+           (const char *) config->Find("match_method"), input->get("words"), logicalWords.get(),
+           nMatches, (const char *) config->Find("matches_per_page"), page, ref);
+#endif
+}
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.h b/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.h
new file mode 100644
index 00000000..f1f9e92a
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/ResultFetch.h
@@ -0,0 +1,248 @@
+//--------------------------------------------------------------------
+//
+// ResultFetch.h
+//
+// 2/6/2002 created for libhtdig
+//
+// Neal Richter nealr@rightnow.com
+//
+//
+// ResultFetch: Takes results of search and fills in the HTML templates
+//
+// FOR USE IN LIBHTDIG... does NOT stream to stdout!!
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: ResultFetch.h,v 1.4 2004/05/28 13:15:29 lha Exp $
+//
+//--------------------------------------------------------------------
+
+#ifndef _ResultFetch_h_
+#define _ResultFetch_h_
+
+#include "Object.h"
+#include "ResultList.h"
+#include "ResultMatch.h"
+#include "TemplateList.h"
+#include "cgi.h"
+#include "StringMatch.h"
+#include "List.h"
+#include "DocumentDB.h"
+#include "Database.h"
+#include "Dictionary.h"
+#include "HtRegex.h"
+
+// ResultFetch: turns search results into filled-in result structures.
+// Library variant of htsearch's Display class; it returns data to the
+// caller instead of streaming HTML to stdout.
+class ResultFetch : public Object
+{
+public:
+    //
+    // Construction/Destruction
+    //
+    // Display(const String& docFile, const String& indexFile, const String& excerptFile);
+
+    ResultFetch(Dictionary *selected_collections, const StringList& templist);
+    ResultFetch(Dictionary *selected_collections);
+    ~ResultFetch();
+
+    void setStartTemplate(const String& templateName);
+    void setMatchTemplate(const String& templateName);
+    void setEndTemplate(const String& templateName);
+
+    // inline void setResults(ResultList *results);
+    // inline void setSearchWords(List *searchWords);
+    inline void setLimit(HtRegex *);
+    inline void setExclude(HtRegex *);
+    // inline void setAllWordsPattern(StringMatch *);
+    inline void setLogicalWords(char *);
+    inline void setOriginalWords(char *);
+    inline void setCGI(cgi *);
+
+    //void fetch(int pageNumber);
+    //void fetchMatch(ResultMatch *match, DocumentRef *ref, int current);
+    // fetch() returns the list of matches; fetchMatch() returns one
+    // match's template variables as a Dictionary.
+    List * fetch();
+    Dictionary * fetchMatch(ResultMatch *match, DocumentRef *ref, int current);
+    void displayHeader();
+    void displayFooter();
+    void displayNomatch();
+    void displaySyntaxError(const String &);
+
+    int hasTemplateError() {return templateError;}
+
+protected:
+    //
+    // Multiple database support
+    //
+    Dictionary *selected_collections;
+
+    //
+    // Search Policy
+    // (name of the policy in effect; set by the library front end)
+    char *search_policy;
+
+    //
+    // The list of search results.
+    //
+    // ResultList *results;
+
+    //
+    // The database that contains documents.
+    //
+    // DocumentDB docDB;
+
+    // List of databases to search on
+    StringList collectionList;
+
+    //
+    // A list of words that we are searching for
+    //
+    // List *searchWords;
+
+    //
+    // Pattern that all result URLs must match or exclude
+    //
+    HtRegex *limitTo;
+    HtRegex *excludeFrom;
+
+    //
+    // Pattern of all the words
+    //
+    // StringMatch *allWordsPattern;
+
+    //
+    // Variables for substitution into text are stored in a dictionary
+    //
+    Dictionary vars;
+
+    //
+    // Since the creation of excerpts is somewhat time consuming, we will
+    // only compute them if they're actually going to be used. This is the
+    // flag that tells us if we will need the excerpt.
+    //
+    int needExcerpt;
+
+    //
+    // Since we might have errors we cannot recover from, this tells us
+    // what happened.
+    //
+    int templateError;
+
+    //
+    // To allow the result templates to be dependant on the match URL, we need
+    // the following:
+    //
+    StringMatch URLtemplate;
+    List URLtemplateList;
+
+    //
+    // To allow the star images to be dependant on the match URL, we need
+    // the following:
+    //
+    StringMatch URLimage;
+    List URLimageList;
+
+    //
+    // Maximum number of stars to display
+    //
+    int maxStars;
+    // score bounds over the current result set, used for star scaling
+    double maxScore;
+    double minScore;
+
+    //
+    // For display, we have different versions of the list of words.
+    //
+    String logicalWords;
+    String originalWords;
+
+    //
+    // To be able to recreate the URL that will get to us again, we need
+    // the info from the HTML form that called us.
+    //
+    cgi *input;
+
+    //
+    // Match output is done through templates. This is the interface to these
+    // templates.
+    //
+    TemplateList templates;
+    Template *currentTemplate;
+
+    //
+    // Methods...
+    //
+    List *buildMatchList();
+    void sort(List *);
+
+    int includeURL(const String&);
+    String *readFile(const String&);
+    void expandVariables(const String&);
+    void outputVariable(const String&);
+    String *excerpt(ResultMatch *match, DocumentRef *ref, String urlanchor,
+                    int fanchor, int &first);
+    String hilight(ResultMatch *match, const String& str, const String& urlanchor, int fanchor);
+    void setupTemplates();
+    void setupImages();
+    String *generateStars(DocumentRef *, int);
+    void displayParsedFile(const String&);
+    void setVariables(int, List *);
+    void createURL(String &, int);
+    void logSearch(int, List *);
+};
+
+//*****************************************************************************
+// Restrict results to URLs matching this pattern (not owned).
+inline void
+ResultFetch::setLimit(HtRegex *limit)
+{
+    limitTo = limit;
+}
+
+// Exclude results whose URLs match this pattern (not owned).
+inline void
+ResultFetch::setExclude(HtRegex *exclude)
+{
+    excludeFrom = exclude;
+}
+
+// Disabled leftovers from the original Display class; kept for reference.
+#if 0
+inline void
+Display::setAllWordsPattern(StringMatch *pattern)
+{
+    allWordsPattern = pattern;
+}
+
+inline void
+Display::setResults(ResultList *results)
+{
+    this->results = results;
+}
+
+inline void
+Display::setSearchWords(List *searchWords)
+{
+    this->searchWords = searchWords;
+}
+#endif
+
+// Record the boolean-expanded word list and expose it as LOGICAL_WORDS.
+inline void
+ResultFetch::setLogicalWords(char *s)
+{
+    logicalWords = s;
+    vars.Add("LOGICAL_WORDS", new String(logicalWords));
+}
+
+// Record the words as typed by the user and expose them as WORDS.
+inline void
+ResultFetch::setOriginalWords(char *s)
+{
+    originalWords = s;
+    vars.Add("WORDS", new String(originalWords));
+}
+
+// Attach the CGI input object (not owned).
+inline void
+ResultFetch::setCGI(cgi *aCgi)
+{
+    input = aCgi;
+}
+
+#endif
+
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc
new file mode 100644
index 00000000..3f6d5e5f
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc
@@ -0,0 +1,517 @@
+//--------------------------------------------------------------------
+//
+// TextCollector.cc
+//
+// 2/6/2002 created for libhtdig
+//
+// Neal Richter nealr@rightnow.com
+//
+// TextCollector:
+// General Purpose Text Document Indexer.
+// Calls appropriate parsers.
+// The parser notifies the TextCollector object that it got something
+// (got_* functions) and the TextCollector object feed the databases
+// and statistics accordingly.
+//
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: TextCollector.cc,v 1.4 2004/05/28 13:15:29 lha Exp $
+//
+//--------------------------------------------------------------------
+
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "TextCollector.h"
+#include "htdig.h"
+#include "HtWordList.h"
+#include "WordRecord.h"
+#include "URLRef.h"
+#include "Server.h"
+#include "Parsable.h"
+#include "BasicDocument.h"
+#include "StringList.h"
+#include "WordType.h"
+#include "md5.h"
+#include "defaults.h"
+
+#include <signal.h>
+#include <stdio.h>
+
+#include <sys/timeb.h>
+
+
+//*****************************************************************************
+// TextCollector::TextCollector()
+//
+// Construct a TextCollector and open its word list on the global
+// configuration. Sets up the per-heading flag table used when words are
+// added, and optionally opens the MD5 database used for duplicate
+// detection when "check_unique_md5" is enabled.
+// NOTE(review): the 'flags' logging parameter is currently unused.
+TextCollector::TextCollector(TextCollectorLog flags):
+words(*(HtConfiguration::config()))
+{
+    HtConfiguration *config = HtConfiguration::config();
+    //FILE *urls_parsed;
+
+    currenthopcount = 0;
+
+    //turn on word tracking!
+    trackWords = 1;
+
+    //
+    // Initialize the flags for the various HTML factors
+    //
+
+    // text_factor
+    factor[0] = FLAG_TEXT;
+    // title_factor
+    factor[1] = FLAG_TITLE;
+    // heading factor (now generic)
+    factor[2] = FLAG_HEADING;
+    factor[3] = FLAG_HEADING;
+    factor[4] = FLAG_HEADING;
+    factor[5] = FLAG_HEADING;
+    factor[6] = FLAG_HEADING;
+    factor[7] = FLAG_HEADING;
+    // img alt text
+    //factor[8] = FLAG_KEYWORDS;
+    factor[8] = FLAG_TEXT;      // treat alt text as plain text, until it has
+                                // its own FLAG and factor.
+    // keywords factor
+    factor[9] = FLAG_KEYWORDS;
+    // META description factor
+    factor[10] = FLAG_DESCRIPTION;
+
+    doc = NULL;
+    minimumWordLength = config->Value("minimum_word_length", 3);
+
+
+    //TODO put document-index log file stuff here via logs like Retriever
+
+    check_unique_md5 = config->Boolean("check_unique_md5", 0);
+    check_unique_date = config->Boolean("check_unique_date", 0);
+
+    d_md5 = 0;
+    if (check_unique_md5)
+    {
+        d_md5 = Database::getDatabaseInstance(DB_HASH);
+
+        // failure to open is reported but not fatal; d_md5 stays open-less
+        if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK)
+        {
+            cerr << "DocumentDB::Open: " << config->Find("md5_db") << " " << strerror(errno) << "\n";
+        }
+    }
+
+    temp_doc_count = 0;
+
+}
+
+
+//*****************************************************************************
+// TextCollector::~TextCollector()
+//
+// Destructor: close the MD5 database (if open) and flush/close the word
+// list so everything pending reaches disk.
+TextCollector::~TextCollector()
+{
+    if (d_md5)
+        d_md5->Close();
+    //delete doc;
+
+    // NOTE(review): words.Flush() is called again unconditionally below,
+    // so this guarded flush is redundant; kept for safety.
+    if(temp_doc_count != 0)
+    {
+        words.Flush();
+        temp_doc_count = 0;
+    }
+
+    words.Flush();
+    words.Close();
+
+}
+
+
+//*****************************************************************************
+// void TextCollector::IndexDoc()
+//
+//
+
+// Index one document.
+//
+// Looks the document's location up in the document database: an existing
+// entry keeps its DocID and gains one back link; an unknown document is
+// assigned a fresh DocID. The document is then parsed via
+// RetrievedDocument(), the updated reference is written back to the
+// database, and the word list is flushed. Always returns 1.
+int
+TextCollector::IndexDoc(BasicDocument & a_basicdoc)
+{
+    DocumentRef *ref;
+    time_t date;
+    int old_document = 0;       // NOTE(review): set below but never read
+    static int index = 0;       // running counter used only for progress output
+
+    //struct timeb tb;
+
+    //HtConfiguration *config = HtConfiguration::config();
+
+    doc = &a_basicdoc;
+
+    ref = docs[doc->Location()];  // It might be nice to have just an Exists() here
+    if (ref)
+    {
+        //
+        // We already have an entry for this document in our database.
+        // This means we can get the document ID and last modification
+        // time from there.
+        //
+        current_id = ref->DocID();
+        date = ref->DocTime();
+        if (ref->DocAccessed())
+            old_document = 1;
+        else                    // we haven't retrieved it yet, so we only have the first link
+            old_document = 0;
+        ref->DocBackLinks(ref->DocBackLinks() + 1);  // we had a new link
+        ref->DocAccessed(time(0));
+        ref->DocState(Reference_normal);
+        currenthopcount = ref->DocHopCount();
+    }
+    else
+    {
+        //
+        // Never seen this document before. We need to create an
+        // entry for it. This implies that it gets a new document ID.
+        //
+
+        date = 0;
+
+        current_id = docs.NextDocID();
+        ref = new DocumentRef;
+        ref->DocID(current_id);
+        ref->DocURL(doc->Location());
+        ref->DocState(Reference_normal);
+        ref->DocAccessed(time(0));
+        ref->DocHopCount(0);
+        ref->DocBackLinks(1);   // We had to have a link to get here!
+        old_document = 0;
+    }
+
+    // tie subsequent word insertions to this document
+    word_context.DocID(ref->DocID());
+
+    if (debug > 0)
+    {
+        //
+        // Display progress
+        //
+        cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << doc->Location() <<
+            ": ";
+        cout.flush();
+    }
+
+    //printf("New Doc\n");
+    //ftime(&tb);
+    //fprintf(stderr, "[1] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+
+    // parse the document and fill in the reference fields
+    RetrievedDocument(ref);
+
+    //ftime(&tb);
+    //fprintf(stderr, "[2] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+
+    // batching counter; the batched flush itself is commented out and an
+    // unconditional flush happens below
+    if(temp_doc_count > 250)
+    {
+        //words.Flush();
+        temp_doc_count = 0;
+    }
+    else
+    {
+        temp_doc_count++;
+    }
+
+    //ftime(&tb);
+    //fprintf(stderr, "[3] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+
+    docs.Add(*ref);
+
+    //ftime(&tb);
+    //fprintf(stderr, "[4] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+
+    delete ref;
+
+    words.Flush();
+    //words.Close();
+
+    // optional audit trail of everything indexed
+    if (urls_seen)
+    {
+        fprintf(urls_seen, "%s|%d|%s|%d|0|1\n",
+                (const char *) doc->Location(), doc->Length(), doc->ContentType(),
+                (int) doc->ModTime());
+    }
+
+
+    return(1);
+}
+
+// Flush and close the word database. After this call no further
+// documents can be indexed through this collector. Returns 1.
+int TextCollector::FlushWordDB()
+{
+    // NOTE(review): the guarded flush is redundant with the
+    // unconditional flush below; kept for safety.
+    if(temp_doc_count != 0)
+    {
+        words.Flush();
+        temp_doc_count = 0;
+    }
+
+    words.Flush();
+    words.Close();
+    return(1);
+}
+
+//*****************************************************************************
+// void TextCollector::RetrievedDocument(Document &doc, const String &url, DocumentRef *ref)
+// We found a document that needs to be parsed. Since we don't know the
+// document type, we'll let the Document itself return an appropriate
+// Parsable object which we can call upon to parse the document contents.
+//
+// Parse the current document and update its DocumentRef.
+//
+// Resets the per-document parse state, lets the document parse itself
+// (the got_*() callbacks on this object collect title, head, meta data
+// and words), then copies the collected values into 'ref'.
+// NOTE(review): the generic Parsable dispatch is commented out, so only
+// self-parseable BasicDocuments actually get parsed here.
+void
+TextCollector::RetrievedDocument(DocumentRef * ref)
+{
+    // reset per-document state filled in by the got_*() callbacks
+    n_links = 0;
+    current_ref = ref;
+    current_title = 0;
+    word_context.Anchor(0);
+    current_time = 0;
+    current_head = 0;
+    current_meta_dsc = 0;
+    time_t doc_time;
+
+    //Check if the Document is self-parseable
+    //We will pass ourselves as a callback object for all the got_*() routines
+    if (doc->SelfParseable() == TRUE)
+    {
+        doc->internalParser(*this);
+    }
+    else
+    {
+        // Create a parser object and let it have a go at the document.
+        // We will pass ourselves as a callback object for all the got_*()
+        // routines.
+        // This will generate the Parsable object as a specific parser
+        /*
+        Parsable *parsable = doc->getParsable();
+        if (parsable)
+            parsable->parse(*this, *base);
+        else
+        { // If we didn't get a parser, then we should get rid of this!
+            ref->DocState(Reference_noindex);
+            return;
+        }
+        */
+    }
+
+    // We don't need to dispose of the parsable object since it will
+    // automatically be reused.
+
+
+    //
+    // Update the document reference
+    //
+    ref->DocTitle((char *) current_title);
+    ref->DocHead((char *) current_head);
+    ref->DocMetaDsc((char *) current_meta_dsc);
+
+/* if (current_time == 0)
+    ref->DocTime(doc->ModTime());
+  else
+    ref->DocTime(current_time); */
+
+    // prefer the document's own modification time, fall back to "now"
+    doc_time = doc->ModTime();
+    if(doc_time != 0)
+        ref->DocTime(doc_time);
+    else
+        ref->DocTime(time(NULL));
+
+    ref->DocSize(doc->Length());
+    ref->DocAccessed(time(0));
+    ref->DocLinks(n_links);
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_word(char *word, int location, int heading)
+// The location is normalized to be in the range 0 - 1000.
+//
+// Parser callback: add one word to the word database.
+//
+// 'location' is the word's normalized position (0-1000); 'heading'
+// selects the flag from the factor[] table (out-of-range values fall
+// back to plain text). Words shorter than minimum_word_length are
+// skipped. After storing the word itself, the compound-word loop below
+// also indexes sub-words obtained by splitting on punctuation, so e.g.
+// "don't" also yields "don" and "t", and longer compounds yield their
+// adjacent-part combinations.
+void
+TextCollector::got_word(const char *word, int location, int heading)
+{
+    if (debug > 3)
+        cout << "word: " << word << '@' << location << endl;
+    if (heading >= 11 || heading < 0)  // Current limits for headings
+        heading = 0;            // Assume it's just normal text
+
+    if ((trackWords) && (strlen(word) >= minimumWordLength))
+    {
+        String w = word;
+        HtWordReference wordRef;
+
+        wordRef.Location(location);
+        wordRef.Flags(factor[heading]);
+
+        wordRef.Word(w);
+        words.Replace(WordReference::Merge(wordRef, word_context));
+
+#ifdef DEBUG
+        cout << "Adding: [" << w << "]"<< endl;  //NEALR
+#endif
+
+        // Check for compound words...
+        // Outer loop grows the number of consecutive parts considered
+        // (nparts); the inner loop slides a window of that many parts
+        // across the word, temporarily NUL-terminating each window in
+        // 'parts' so it can be indexed on its own.
+        String parts = word;
+        int added;
+        int nparts = 1;
+        do
+        {
+            added = 0;
+            char *start = parts.get();
+            char *punctp = 0, *nextp = 0, *p;
+            char punct;
+            int n;
+            while (*start)
+            {
+                p = start;
+                for (n = 0; n < nparts; n++)
+                {
+                    // advance over one word part plus trailing punctuation
+                    while (HtIsStrictWordChar((unsigned char) *p))
+                        p++;
+                    punctp = p;
+                    if (!*punctp && n + 1 < nparts)
+                        break;
+                    while (*p && !HtIsStrictWordChar((unsigned char) *p))
+                        p++;
+                    if (n == 0)
+                        nextp = p;  // where the next window starts
+                }
+                if (n < nparts)
+                    break;      // ran out of parts for a full window
+                // temporarily terminate the window, index it, then restore
+                punct = *punctp;
+                *punctp = '\0';
+                if (*start && (*p || start > parts.get()))
+                {
+                    w = start;
+                    HtStripPunctuation(w);
+                    if (w.length() >= minimumWordLength)
+                    {
+                        wordRef.Word(w);
+                        words.Replace(WordReference::Merge(wordRef, word_context));
+                        if (debug > 3)
+                            cout << "word part: " << start << '@' << location << endl;
+
+#ifdef DEBUG
+                        cout << "Adding: [" << w << "]"<< endl;  //NEALR
+#endif
+                    }
+                    added++;
+                }
+                start = nextp;
+                *punctp = punct;
+            }
+            nparts++;
+        }
+        while (added > 2);      // stop when few enough windows remain
+    }
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_title(const char *title)
+//
+// Parser callback: remember the document title for RetrievedDocument().
+void
+TextCollector::got_title(const char *title)
+{
+    if (debug > 1)
+        cout << "\ntitle: " << title << endl;
+    current_title = title;
+}
+
+//*****************************************************************************
+// void TextCollector::got_time(const char *time)
+//
+// Parser callback: parse a Dublin Core style date into current_time.
+void
+TextCollector::got_time(const char *time)
+{
+    HtDateTime new_time(current_time);
+
+    if (debug > 1)
+        cout << "\ntime: " << time << endl;
+
+    //
+    // As defined by the Dublin Core, this should be YYYY-MM-DD
+    // In the future, we'll need to deal with the scheme portion
+    // in case someone picks a different format.
+    //
+    new_time.SetFTime(time, "%Y-%m-%d");
+    current_time = new_time.GetTime_t();
+
+    // If we can't convert it, current_time stays the same and we get
+    // the default--the date returned by the server...
+}
+
+//*****************************************************************************
+// void TextCollector::got_head(const char *head)
+//
+// Parser callback: remember the document head (excerpt source) text.
+void
+TextCollector::got_head(const char *head)
+{
+    if (debug > 4)
+        cout << "head: " << head << endl;
+    current_head = head;
+}
+
+//*****************************************************************************
+// void TextCollector::got_meta_dsc(const char *md)
+//
+// Parser callback: remember the META description text.
+void
+TextCollector::got_meta_dsc(const char *md)
+{
+    if (debug > 4)
+        cout << "meta description: " << md << endl;
+    current_meta_dsc = md;
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_meta_email(const char *e)
+//
+// Parser callback: store the META email address on the document record.
+void
+TextCollector::got_meta_email(const char *e)
+{
+    if (debug > 1)
+        cout << "\nmeta email: " << e << endl;
+    current_ref->DocEmail(e);
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_meta_notification(const char *e)
+//
+// Parser callback: store the META notification date on the document record.
+void
+TextCollector::got_meta_notification(const char *e)
+{
+    if (debug > 1)
+        cout << "\nmeta notification date: " << e << endl;
+    current_ref->DocNotification(e);
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_meta_subject(const char *e)
+//
+// Parser callback: store the META subject on the document record.
+void
+TextCollector::got_meta_subject(const char *e)
+{
+    if (debug > 1)
+        cout << "\nmeta subject: " << e << endl;  // fixed typo "subect"
+    current_ref->DocSubject(e);
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_noindex()
+//
+// Parser callback: mark the document as excluded from the index
+// (META robots noindex).
+void
+TextCollector::got_noindex()
+{
+    if (debug > 1)
+        cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl;
+    current_ref->DocState(Reference_noindex);
+}
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.h b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.h
new file mode 100644
index 00000000..d44869a6
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.h
@@ -0,0 +1,133 @@
+//--------------------------------------------------------------------
+//
+// TextCollector.h
+//
+// 2/6/2002 created for libhtdig
+//
+// Neal Richter nealr@rightnow.com
+//
+// TextCollector:
+// General Purpose Text Document Indexer.
+// Calls appropriate parsers.
+// The parser notifies the TextCollector object that it got something
+// (got_* functions) and the TextCollector object feed the databases
+// and statistics accordingly.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: TextCollector.h,v 1.4 2004/05/28 13:15:29 lha Exp $
+//
+//--------------------------------------------------------------------
+
+
+#ifndef _TextCollector_h_
+#define _TextCollector_h_
+
+#include "BasicDocument.h"
+#include "DocumentRef.h"
+#include "Dictionary.h"
+#include "Queue.h"
+#include "HtWordReference.h"
+#include "List.h"
+#include "StringList.h"
+#include "DocumentDB.h"
+
+class Document;
+class HtWordList;
+
+enum TextCollectorLog {
+ TextCollector_noLog,
+ TextCollector_logUrl,
+ TextCollector_Restart
+};
+
+class TextCollector
+{
+ public:
+ //
+ // Construction/Destruction
+ //
+ TextCollector(TextCollectorLog flags = TextCollector_noLog);
+ virtual ~TextCollector();
+
+ int IndexDoc(BasicDocument & adoc);
+ int FlushWordDB();
+
+ //
+ // Report statistics about the parser
+ //
+ void ReportStatistics(const String& name);
+
+ //
+ // These are the callbacks that we need to write code for
+ //
+ void got_word(const char *word, int location, int heading);
+ void got_href(URL &url, const char *description, int hops = 1);
+ void got_title(const char *title);
+ void got_time(const char *time);
+ void got_head(const char *head);
+ void got_meta_dsc(const char *md);
+ void got_anchor(const char *anchor);
+ void got_image(const char *src);
+ void got_meta_email(const char *);
+ void got_meta_notification(const char *);
+ void got_meta_subject(const char *);
+ void got_noindex();
+
+
+ private:
+ //
+ // A hash to keep track of what we've seen
+ //
+ Dictionary visited;
+
+ URL *base;
+ String current_title;
+ String current_head;
+ String current_meta_dsc;
+ time_t current_time;
+ int current_id;
+ DocumentRef *current_ref;
+ int current_anchor_number;
+ int trackWords;
+ int n_links;
+ HtWordReference word_context;
+ HtWordList words;
+
+ int check_unique_md5;
+ int check_unique_date;
+
+
+ TextCollectorLog log;
+ //
+ // These are weights for the words. The index is the heading level.
+ //
+ long int factor[11];
+ int currenthopcount;
+
+ //
+ // For efficiency reasons, we will only use one document object which
+ // we reuse.
+ //
+ BasicDocument *doc;
+
+ Database *d_md5;
+
+ // Some useful constants
+ int minimumWordLength;
+
+ //
+ // Helper routines
+ //
+ void RetrievedDocument(DocumentRef *ref);
+
+ int temp_doc_count;
+};
+
+#endif
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/htsearch.h b/debian/htdig/htdig-3.2.0b6/libhtdig/htsearch.h
new file mode 100644
index 00000000..4d7f9a0c
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/htsearch.h
@@ -0,0 +1,75 @@
+//
+// htsearch.h
+//
+// htsearch: The main search CGI. Parses the CGI input, reads the config files
+// and calls the necessary code to put together the result lists
+// and the final display.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: htsearch.h,v 1.5 2004/05/28 13:15:29 lha Exp $
+//
+
+#ifndef _htsearch_h_
+#define _htsearch_h_
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif
+
+#include "List.h"
+#include "StringList.h"
+#include "Dictionary.h"
+#include "DocumentRef.h"
+#include "Database.h"
+#include "good_strtok.h"
+#include "DocumentDB.h"
+#include "htString.h"
+#include "HtConfiguration.h"
+#include "ResultMatch.h"
+#include "ResultList.h"
+#include "HtWordReference.h"
+#include "StringMatch.h"
+#include "defaults.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+extern int n_matches;
+extern int do_and;
+extern int do_short;
+extern StringList fields;
+
+#ifndef _WIN32
+//extern StringMatch limit_to;
+#endif
+
+extern StringMatch URLimage;
+extern List URLimageList;
+extern StringMatch wm;
+extern Database *dbf;
+extern String logicalWords;
+extern String originalWords;
+extern int debug;
+extern StringList collectionList;
+
+#endif
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_api.h b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_api.h
new file mode 100644
index 00000000..5b915e39
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_api.h
@@ -0,0 +1,614 @@
+//----------------------------------------------------------------
+//
+// libhtdig_api.h
+//
+// Header function for htdig shared library API
+//
+// 1/25/2002 created
+//
+// Neal Richter nealr@rightnow.com
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: libhtdig_api.h,v 1.4 2004/05/28 13:15:29 lha Exp $
+//
+//----------------------------------------------------------------
+
+#ifndef LIBHTDIG_API_H
+#define LIBHTDIG_API_H
+
+#include <time.h>
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+
+#define HTDIG_MAX_FILENAME_PATH_L 1024
+#define HTDIG_DOCUMENT_ID_L 32
+#define HTDIG_DOCUMENT_TITLE_L 256
+#define HTDIG_DOCUMENT_META_L 4096
+#define HTDIG_DOCUMENT_CONTENT_TYPE_L 32
+#define HTDIG_DOCUMENT_EXCERPT_L 1024
+//make sure HTDIG_DOCUMENT_EXCERPT_L is more than config 'excerpt_length'
+
+//default failsafe size of 'excerpt' document
+//make sure it's more than config 'max_head_length'
+#define HTDIG_DEFAULT_EXCERPT_SIZE 524288
+
+//should be the same as the default value in HTDIG
+#define HTDIG_MAX_QUERY_L 256
+
+
+#define HTDIG_CUSTOM_TEXT_MIME_TYPE "text/vnd.customdocument"
+
+//htfuzzy
+#define HTDIG_ALG_ACCENTS 0x00000100 //"accents"
+#define HTDIG_ALG_ACCENTS_STR "accents"
+
+#define HTDIG_ALG_ENDINGS 0x00001000 //"endings"
+#define HTDIG_ALG_ENDINGS_STR "endings"
+
+#define HTDIG_ALG_METAPHONE 0x00000010 //"metaphone"
+#define HTDIG_ALG_METAPHONE_STR "metaphone"
+
+#define HTDIG_ALG_SOUNDEX 0x00000001 //"soundex"
+#define HTDIG_ALG_SOUNDEX_STR "soundex"
+
+#define HTDIG_ALG_SYNONYMS 0x00010000 //"synonyms"
+#define HTDIG_ALG_SYNONYMS_STR "synonyms"
+
+
+//searching
+#define HTSEARCH_ALG_AND 0x00000100 //"and"
+#define HTSEARCH_ALG_AND_STR "and"
+
+#define HTSEARCH_ALG_BOOLEAN 0x00000001 //"boolean"
+#define HTSEARCH_ALG_BOOLEAN_STR "boolean"
+
+#define HTSEARCH_ALG_OR 0x00000010 //"or"
+#define HTSEARCH_ALG_OR_STR "or"
+
+
+#define HTSEARCH_FORMAT_LONG 0x00000001 //"long"
+#define HTSEARCH_FORMAT_LONG_STR "long"
+
+#define HTSEARCH_FORMAT_SHORT 0x00000010 //"short"
+#define HTSEARCH_FORMAT_SHORT_STR "short"
+
+
+#define HTSEARCH_SORT_SCORE 0x00000001 //"score"
+#define HTSEARCH_SORT_SCORE_STR "score"
+
+#define HTSEARCH_SORT_REV_SCORE 0x00000010 //"reverse score"
+#define HTSEARCH_SORT_REV_SCORE_STR "reverse score"
+
+#define HTSEARCH_SORT_TIME 0x00000100 //"time"
+#define HTSEARCH_SORT_TIME_STR "time"
+
+#define HTSEARCH_SORT_REV_TIME 0x00001000 //"reverse time"
+#define HTSEARCH_SORT_REV_TIME_STR "reverse time"
+
+#define HTSEARCH_SORT_TITLE 0x00010000 //"title"
+#define HTSEARCH_SORT_TITLE_STR "title"
+
+#define HTSEARCH_SORT_REV_TITLE 0x00100000 //"reverse title"
+#define HTSEARCH_SORT_REV_TITLE_STR "reverse title"
+
+
+
+#define HTDIG_ERROR_CONFIG_READ -101
+#define HTDIG_ERROR_URL_PART -102
+#define HTDIG_ERROR_URL_REWRITE -103
+#define HTDIG_ERROR_URL_CREATE_FILE -104
+#define HTDIG_ERROR_IMAGE_CREATE_FILE -105
+#define HTDIG_ERROR_OPEN_CREATE_DOCDB -106
+#define HTDIG_ERROR_LOGFILE_OPEN -107
+#define HTDIG_ERROR_LOGFILE_CLOSE -108
+
+#define HTDIG_ERROR_TESTURL_EXCLUDE -109
+#define HTDIG_ERROR_TESTURL_BADQUERY -110
+#define HTDIG_ERROR_TESTURL_EXTENSION -111
+#define HTDIG_ERROR_TESTURL_EXTENSION2 -112
+#define HTDIG_ERROR_TESTURL_LIMITS -113
+#define HTDIG_ERROR_TESTURL_LIMITSNORM -114
+#define HTDIG_ERROR_TESTURL_SRCH_RESTRICT -115
+#define HTDIG_ERROR_TESTURL_SRCH_EXCLUDE -116
+#define HTDIG_ERROR_TESTURL_REWRITE_EMPTY -117
+#define HTDIG_ERROR_TESTURL_ROBOT_FORBID -118
+
+#define HTSEARCH_ERROR_NO_MATCH -201
+#define HTSEARCH_ERROR_BAD_MATCH_INDEX -202
+#define HTSEARCH_ERROR_BAD_DOCUMENT -203
+#define HTSEARCH_ERROR_TEMPLATE_ERROR -204
+#define HTSEARCH_ERROR_LOGFILE_OPEN -205
+#define HTSEARCH_ERROR_LOGFILE_CLOSE -206
+#define HTSEARCH_ERROR_CONFIG_READ -207
+#define HTSEARCH_ERROR_URL_PART -208
+#define HTSEARCH_ERROR_WORDDB_READ -209
+#define HTSEARCH_ERROR_DOCINDEX_READ -210
+#define HTSEARCH_ERROR_DOCDB_READ -211
+#define HTSEARCH_ERROR_EXCERPTDB_READ -212
+
+#define HTMERGE_ERROR_LOGFILE_OPEN -301
+#define HTMERGE_ERROR_LOGFILE_CLOSE -302
+#define HTMERGE_ERROR_CONFIG_READ -303
+#define HTMERGE_ERROR_URL_PART -304
+#define HTMERGE_ERROR_WORDDB_READ -305
+#define HTMERGE_ERROR_DOCINDEX_READ -306
+#define HTMERGE_ERROR_DOCDB_READ -307
+#define HTMERGE_ERROR_EXCERPTDB_READ -308
+
+#define PHP_HTDIG_CONFIGFILE_PARM "configFile"
+#define PHP_HTDIG_URL_PARM "URL"
+#define PHP_HTDIG_LIMITTO_PARM "limit_urls_to"
+#define PHP_HTDIG_LIMITN_PARM "limit_normalized"
+#define PHP_HTDIG_EXCLUDEURLS_PARM "exclude_urls"
+#define PHP_HTDIG_SEARCHRESTRICT_PARM "search_restrict"
+#define PHP_HTDIG_SEARCHEXCLUDE_PARM "search_exclude"
+#define PHP_HTDIG_MAXHOPCOUNT_PARM "max_hop_cont"
+#define PHP_HTDIG_URLREWRITE_PARM "url_rewrite_rules"
+#define PHP_HTDIG_BAD_QUERYSTR_PARM "bad_querystr"
+
+//=============================================================================
+//===== HTDIG INDEXING API ====================================================
+
+
+/***************************************************
+ * HTDIG_DOCUMENTATION for htdig_parameters_struct
+ *
+ * DEBUGGING PARAMETERS
+ *
+ * int debug
+ * Verbose mode. This increases the verbosity of the
+ * program. Using more than 2 is probably only useful
+ * for debugging purposes. The default verbose mode
+ * gives a nice progress report while digging.
+ *
+ * char logFile
+ * File to stream debugging & error messages to!
+ *
+ * BOOLEAN PARAMETERS
+ *
+ * int initial
+ * Initial. Do not use any old databases. This is
+ * accomplished by first erasing the databases
+ *
+ * int create_text_database
+ * Create an ASCII version of the document database.
+ * This database is easy to parse with other programs so
+ * that information can be extracted from it.
+ *
+ * int report_statistics
+ * Report statistics after completion.
+ *
+ * int alt_work_area
+ * Use alternate work files.
+ * Tells htdig to append .work to database files, causing
+ * a second copy of the database to be built. This allows
+ * the original files to be used by htsearch during the
+ * indexing run.
+ *
+ *
+ * STRING PARAMETERS
+ *
+ * char configFile
+ * configfile
+ * Use the specified configuration file instead of the
+ * default.
+ *
+ * char credentials
+ * username:password
+ * Tells htdig to send the supplied username and
+ * password with each HTTP request. The credentials
+ * will be encoded using the 'Basic' authentication scheme.
+ * There *HAS* to be a colon (:) between the username
+ * and password.
+ *
+ *
+ * char maxhops //9 digit limit
+ * hopcount
+ * Limit the stored documents to those which are at
+ * most hopcount links away from the start URL.
+ *
+ * char minimalFile
+ *
+ * char URL
+ * 'command-line' URLs from stdin
+ * fetches & indexes these URLs
+ *
+ ******************************************************************/
+
+typedef struct htdig_parameters_struct {
+
+ char configFile[HTDIG_MAX_FILENAME_PATH_L];
+ char DBpath[HTDIG_MAX_FILENAME_PATH_L];
+ char credentials[HTDIG_MAX_FILENAME_PATH_L];
+ char max_hops[10]; //9 digit limit
+ char minimalFile[HTDIG_MAX_FILENAME_PATH_L];
+
+ //debugging & logfile
+ char logFile[HTDIG_MAX_FILENAME_PATH_L]; //location of log file
+ int debug; //0, 1 ,2, 3, 4, 5
+
+ //boolean values
+ int initial;
+ int create_text_database;
+ int report_statistics;
+ int alt_work_area;
+ int use_cookies;
+
+ //spidering filters
+ char URL[HTDIG_MAX_FILENAME_PATH_L];
+ char limit_urls_to[HTDIG_MAX_FILENAME_PATH_L];
+ char limit_normalized[HTDIG_MAX_FILENAME_PATH_L];
+ char exclude_urls[HTDIG_MAX_FILENAME_PATH_L];
+ char search_restrict[HTDIG_MAX_FILENAME_PATH_L];
+ char search_exclude[HTDIG_MAX_FILENAME_PATH_L];
+ char url_rewrite_rules[HTDIG_MAX_FILENAME_PATH_L];
+ char bad_querystr[HTDIG_MAX_FILENAME_PATH_L];
+ char locale[16];
+ char title_factor[16];
+ char text_factor[16];
+ char meta_description_factor[16];
+ int max_hop_count;
+
+ //the rewritten URL - OUTGOING after htdig_index_test_url
+ char rewritten_URL[HTDIG_MAX_FILENAME_PATH_L];
+
+} htdig_parameters_struct;
+
+/*****************************************************************
+ * HTDIG_DOCUMENTATION for htdig_simple_doc_struct
+ *
+ * STRING PARAMETERS
+ *
+ * char location
+ * the 'URL' of the document. Can be any useful string.
+ *
+ * char documentid
+ * document id of document [NOT CURRENTLY USED - IGNORED]
+ *
+ * char title
+ * document title
+ *
+ * char meta
+ * content that is indexed but won't appear in search excerpts
+ *
+ * char * contents
+ * pointer to a NULL TERMINATED string on information to be
+ * indexed.
+ *
+ * char content_type
+ * a MIME-like string
+ * custom MIME-type defined above, others are supported by
+ * htdig as well.
+ *
+ *
+ *****************************************************************/
+
+typedef struct htdig_simple_doc_struct {
+
+ char location[HTDIG_MAX_FILENAME_PATH_L];
+ char documentid[HTDIG_DOCUMENT_ID_L];
+ char title[HTDIG_DOCUMENT_TITLE_L];
+ char meta[HTDIG_DOCUMENT_META_L];
+ char *contents; //MUST ALLOCATE & FREE!!!
+ char content_type[HTDIG_DOCUMENT_CONTENT_TYPE_L]; //MIME-ISH string
+ //struct tm time_tm; // use to override index time
+ time_t doc_time;
+
+} htdig_simple_doc_struct;
+
+
+int htdig_index_open(htdig_parameters_struct *);
+int htdig_index_simple_doc(htdig_simple_doc_struct * );
+int htdig_index_urls(void);
+int htdig_index_reset(void);
+int htdig_index_close(void);
+
+int htdig_index_test_url(htdig_parameters_struct *htparms);
+
+int htdig_get_max_head_length(void);
+
+
+
+
+//=============================================================================
+//===== HTDIG MERGING API =====================================================
+
+/**************************************************
+ * HTDIG_DOCUMENTATION for htmerge_parameters_struct
+ *
+ * DEBUGGING PARAMETERS
+ *
+ * int debug
+ * Verbose mode. This increases the verbosity of the
+ * program. Using more than 2 is probably only useful
+ * for debugging purposes. The default verbose mode
+ * gives a progress on what it is doing and where it is.
+ *
+ * char logFile
+ * File to stream debugging & error messages to!
+ *
+ *
+ * BOOLEAN PARAMETERS
+ *
+ * int alt_work_area
+ * Use alternate work files.
+ * Tells htmerge to append .work to database files causing
+ * a second copy of the database to be built. This allows
+ * original files to be used by htsearch during the indexing run.
+ *
+ *
+ * STRING PARAMETERS
+ *
+ * char configFile
+ * configfile
+ * Use the specified configuration file instead of the default.
+ *
+ * char merge_configFile
+ * merge_configfile
+ * Merge the databases specified into the databases specified
+ * by -c or the default.
+ *
+ *
+ *************************************************/
+
+typedef struct htmerge_parameters_struct {
+
+ char configFile[HTDIG_MAX_FILENAME_PATH_L];
+ char merge_configFile[HTDIG_MAX_FILENAME_PATH_L];
+
+ //debugging & logfile
+ char logFile[HTDIG_MAX_FILENAME_PATH_L]; //location of log file
+ int debug; //0, 1 ,2, 3, 4, 5
+
+ //boolean values
+ int alt_work_area;
+
+} htmerge_parameters_struct;
+
+int htmerge_index_merge(htmerge_parameters_struct *);
+
+
+
+
+
+//=============================================================================
+//===== HTDIG HTFUZZY API =====================================================
+
+
+
+/**************************************************
+ * HTDIG_DOCUMENTATION for htfuzzy_parameters_struct
+ *
+ * DEBUGGING PARAMETERS
+ *
+ * int debug
+ * Verbose mode. This increases the verbosity of the
+ * program. Using more than 2 is probably only useful
+ * for debugging purposes.
+ *
+ * char logFile
+ * File to stream debugging & error messages to!
+ *
+ *
+ * PARAMETERS
+ *
+ * char configFile
+ * configfile
+ * Use the specified configuration file instead of the default.
+ *
+ * int algorithms_flag
+ * Bitwise Flags to signal algorithms to be used
+ *
+ * soundex == HTDIG_ALG_SOUNDEX
+ * metaphone == HTDIG_ALG_METAPHONE
+ * accents == HTDIG_ALG_ACCENTS
+ * endings == HTDIG_ALG_ENDINGS
+ * synonyms == HTDIG_ALG_SYNONYMS
+ *
+ ***************************************************/
+
+
+typedef struct htfuzzy_parameters_struct {
+
+ char configFile[HTDIG_MAX_FILENAME_PATH_L];
+ int algorithms_flag;
+
+ //debugging & logfile
+ char logFile[HTDIG_MAX_FILENAME_PATH_L]; //location of log file
+ int debug; //0, 1 ,2, 3, 4, 5
+
+ //boolean values
+
+} htfuzzy_parameters_struct;
+
+
+// htfuzzy functions
+int htfuzzy_index(htfuzzy_parameters_struct *);
+
+
+
+
+//==============================================================================
+//===== HTDIG SEARCHING API ====================================================
+
+/************************************************
+ * HTDIG_DOCUMENTATION for htsearch_parameters_struct
+ *
+ * DEBUGGING PARAMETERS
+ *
+ * int debug
+ * Verbose mode. This increases the verbosity of the
+ * program. Using more than 2 is probably only useful
+ * for debugging purposes. The default verbose mode
+ * gives a progress on what it is doing and where it is.
+ *
+ * char logFile
+ * File to stream debugging & error messages to!
+ *
+ * STRING PARAMETERS
+ *
+ * char configFile
+ * configfile
+ * Use the specified configuration file instead of the default.
+ *
+ *
+ **************************************************/
+
+typedef struct htsearch_parameters_struct {
+
+ char configFile[HTDIG_MAX_FILENAME_PATH_L];
+ char DBpath[HTDIG_MAX_FILENAME_PATH_L];
+ char locale[16];
+
+ //debugging & logfile
+ char logFile[HTDIG_MAX_FILENAME_PATH_L]; //location of log file
+ int debug; //0, 1 ,2, 3, 4, 5
+
+ //filters
+ char search_restrict[HTDIG_MAX_FILENAME_PATH_L];
+ char search_exclude[HTDIG_MAX_FILENAME_PATH_L];
+ char title_factor[16];
+ char text_factor[16];
+ char meta_description_factor[16];
+
+} htsearch_parameters_struct;
+
+
+
+
+/*****************************************************************
+ * HTDIG_DOCUMENTATION for htsearch_query_struct
+ *
+ * STRING PARAMETERS
+ *
+ * char raw_query
+ * STRING of text that is the search query -- syntax is important
+ *
+ * INTEGER PARAMETERS
+ *
+ * int algorithms_flag [ALSO CALLED 'method' IN HTDIG]
+ * HTSEARCH_ALG_BOOLEAN
+ * HTSEARCH_ALG_OR
+ * HTSEARCH_ALG_AND
+ *
+ * int sortby_flag
+ * score, date, title & reversed
+ * HTSEARCH_SORT_SCORE
+ * HTSEARCH_SORT_REV_SCORE
+ * HTSEARCH_SORT_TIME
+ * HTSEARCH_SORT_REV_TIME
+ * HTSEARCH_SORT_TITLE
+ * HTSEARCH_SORT_REV_TITLE
+ *
+ * int format
+ * short, long (with excerpt)
+ * HTSEARCH_FORMAT_LONG
+ * HTSEARCH_FORMAT_SHORT
+ *
+ *
+ *
+ * TODO: 'Connect' these htsearch features to this API
+ *
+ * config
+ * Specifies the name of the configuration file.
+ *
+ * exclude
+ * This value is a pattern that specifies which URLs are to be excluded from
+ * the search results.
+ *
+ * keywords
+ * Used to specify a list of required words that have to be in the documents.
+ *
+ * restrict
+ * This value is a pattern that all URLs of the search results will have to
+ * match.
+ *
+ * startyear, startmonth, startday, endyear, endmonth, endday
+ * These values specify the allowed range of document modification dates
+ * allowed in the search results.
+ *
+ *
+ *
+ *****************************************************************/
+
+typedef struct htsearch_query_struct {
+
+ char raw_query[HTDIG_MAX_QUERY_L];
+
+ int algorithms_flag;
+ int sortby_flag;
+ int format;
+
+} htsearch_query_struct;
+
+
+/*****************************************************************
+ * HTDIG_DOCUMENTATION for htsearch_query_match_struct
+ *
+ * STRING PARAMETERS
+ *
+ * char title
+ * Title of document returned
+ *
+ * char URL
+ * URL/location-string of document returned
+ *
+ * char excerpt
+ * Excerpt with search words highlighted with
+ * <strong>searchword</strong>
+ *
+ * INTEGER PARAMETERS
+ *
+ * int score
+ * score in 'number of stars'
+ * [MAX NUMBER OF STARS DECLARED IN CONFIG FILE]
+ *
+ * int score_percent //top result is 100%
+ *
+ * time_t time [DOCUMENT TIME]
+ * struct tm time_tm [DOCUMENT TIME]
+ * int size [TOTAL DOCUMENT SIZE]
+ *
+ *
+ *****************************************************************/
+
+typedef struct htsearch_query_match_struct {
+
+ char title[HTDIG_DOCUMENT_TITLE_L];
+ char URL[HTDIG_MAX_FILENAME_PATH_L];
+ char excerpt[HTDIG_DOCUMENT_EXCERPT_L];
+ int score;
+ int score_percent; //top result is 100%
+ struct tm time_tm;
+ int size;
+
+} htsearch_query_match_struct;
+
+
+// htsearch functions
+
+int htsearch_open(htsearch_parameters_struct *);
+int htsearch_query(htsearch_query_struct *);
+
+int htsearch_get_nth_match(int, htsearch_query_match_struct *);
+int htsearch_close();
+
+//htsearch_free(indicator)
+
+char * htsearch_get_error();
+
+
+#endif /* LIBHTDIG_API_H */
+
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc
new file mode 100644
index 00000000..8a610d36
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc
@@ -0,0 +1,1058 @@
+//-------------------------------------------------------------
+//
+// libhtdig_htdig.cc
+//
+// 1/25/2002 created from htdig.cc
+//
+// Neal Richter nealr@rightnow.com
+//
+// libhtdig_htdig.cc
+//
+// htdig: Indexes the web sites specified in the config file
+// generating several databases to be used by htmerge
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: libhtdig_htdig.cc,v 1.5 2004/05/28 13:15:29 lha Exp $
+//
+//-------------------------------------------------------------
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#ifdef HAVE_STD
+#include <iostream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <iostream.h>
+#endif /* HAVE_STD */
+
+extern "C" {
+#include "libhtdig_api.h"
+}
+
+#include "libhtdig_log.h"
+
+#include "BasicDocument.h"
+#include "Document.h"
+#include "TextCollector.h"
+#include "Retriever.h"
+#include "StringList.h"
+#include "htdig.h"
+#include "defaults.h"
+#include "HtURLCodec.h"
+#include "WordContext.h"
+#include "HtDateTime.h"
+#include "HtURLRewriter.h"
+#include "URL.h"
+#include "Server.h"
+
+////////////////////////////
+// For cookie jar
+////////////////////////////
+#include "HtCookieJar.h"
+#include "HtCookieMemJar.h"
+#include "HtHTTP.h"
+////////////////////////////
+
+// If we have this, we probably want it.
+//#ifdef HAVE_GETOPT_H
+//#include <getopt.h>
+//#endif
+
+
+
+//Global Variables for Library
+
+int debug = 0;
+HtRegexList limits;
+HtRegexList limitsn;
+String configFile = DEFAULT_CONFIG_FILE;
+FILE *urls_seen = NULL;
+FILE *images_seen = NULL;
+DocumentDB docs;
+
+
+//
+// Global variables for this file
+//
+static int report_statistics = 0;
+static String minimalFile = 0;
+static HtDateTime StartTime;
+static HtDateTime EndTime;
+
+//static char *max_hops = NULL;
+static String credentials;
+static HtCookieJar *_cookie_jar = NULL;
+static HtConfiguration * config = NULL;
+static WordContext * wc = NULL;
+
+static int create_text_database = 0;
+static int alt_work_area = 0;
+static int initial = 0;
+
+int htdig_index_open_flag = FALSE;
+
+
+//new. URLs from 'command-line'
+#define URL_SEPCHARS " ,"
+static char *myURL = NULL;
+
+
+BasicDocument *a_basicdoc;
+TextCollector *Indexer;
+
+BasicDocument the_basicdoc;
+//TextCollector the_Indexer;
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_open(...)
+ *
+ *
+ * opens/creates document indexes and initializes variables
+ * for indexing.
+ *
+ *
+ * see libhtdig_api.h headerfile for definition of
+ * htdig_parameters_struct
+ *
+ *
+ * TODO Examine external function calls for error return
+ * codes
+ *
+ *******************************************************/
+
+int htdig_index_open(htdig_parameters_struct * htdig_parms)
+{
+ int ret = -1;
+
+ if(htdig_index_open_flag != FALSE)
+ return(FALSE);
+
+ //load 'command-line' parameters
+
+ if (htdig_parms->configFile[0] != 0)
+ configFile = htdig_parms->configFile;
+
+ if (htdig_parms->URL[0] != 0)
+ {
+ myURL = strdup(htdig_parms->URL);
+ }
+
+ debug = htdig_parms->debug;
+ if(debug != 0)
+ {
+ ret = logOpen(htdig_parms->logFile);
+
+ if(ret == FALSE)
+ {
+ reportError (form ("[HTDIG] Error opening log file [%s] . Error:[%d], %s\n",
+ htdig_parms->logFile, errno, strerror(errno)) );
+ return(HTDIG_ERROR_LOGFILE_OPEN);
+ }
+ }
+
+ initial = htdig_parms->initial;
+ create_text_database = htdig_parms->create_text_database;
+ //max_hops = strdup(htdig_parms->max_hops);
+ report_statistics = htdig_parms->report_statistics;
+ credentials = htdig_parms->credentials;
+ alt_work_area = htdig_parms->alt_work_area;
+ minimalFile = htdig_parms->minimalFile;
+
+
+ if(htdig_parms->use_cookies == TRUE)
+ {
+ // Cookie jar dynamic creation.
+
+ _cookie_jar = new HtCookieMemJar (); // new cookie jar
+ if (_cookie_jar)
+ HtHTTP::SetCookieJar (_cookie_jar);
+ }
+
+ //
+ // First set all the defaults and then read the specified config
+ // file to override the defaults.
+ //
+
+ config = HtConfiguration::config ();
+
+ config->Defaults (&defaults[0]);
+ if (access ((char *) configFile, R_OK) < 0)
+ {
+ reportError (form ("[HTDIG] Unable to find configuration file '%s'",
+ configFile.get ()));
+ return(HTDIG_ERROR_CONFIG_READ);
+ }
+ config->Read (configFile);
+
+ //------- Now override config settings ------------
+
+ //------- override database path ------------
+ if(strlen(htdig_parms->DBpath) > 0)
+ {
+ config->Add("database_dir", htdig_parms->DBpath);
+ }
+
+ //------- custom filters from htdig_parms ----------
+
+ if(strlen(htdig_parms->locale) > 0)
+ {
+ config->Add("locale", htdig_parms->locale);
+ }
+
+ if (config->Find ("locale").empty () && debug > 0)
+ logEntry("Warning: unknown locale!\n");
+
+ if (strlen(htdig_parms->max_hops) > 0)
+ {
+ config->Add ("max_hop_count", htdig_parms->max_hops);
+ }
+
+ if(strlen(htdig_parms->limit_urls_to) > 0)
+ {
+ config->Add("limit_urls_to", htdig_parms->limit_urls_to);
+ }
+
+ if(strlen(htdig_parms->limit_normalized) > 0)
+ {
+ config->Add("limit_normalized", htdig_parms->limit_normalized);
+ }
+
+ if(strlen(htdig_parms->exclude_urls) > 0)
+ {
+ config->Add("exclude_urls", htdig_parms->exclude_urls);
+ }
+
+ if(strlen(htdig_parms->url_rewrite_rules) > 0)
+ {
+ config->Add("url_rewrite_rules", htdig_parms->url_rewrite_rules);
+ }
+
+ if(strlen(htdig_parms->bad_querystr) > 0)
+ {
+ config->Add("bad_querystr", htdig_parms->bad_querystr);
+ }
+
+ if(strlen(htdig_parms->locale) > 0)
+ {
+ config->Add("locale", htdig_parms->locale);
+ }
+
+ if(strlen(htdig_parms->meta_description_factor) > 0)
+ {
+ config->Add("meta_description_factor", htdig_parms->meta_description_factor);
+ }
+
+ if(strlen(htdig_parms->title_factor) > 0)
+ {
+ config->Add("title_factor", htdig_parms->title_factor);
+ }
+
+ if(strlen(htdig_parms->text_factor) > 0)
+ {
+ config->Add("text_factor", htdig_parms->text_factor);
+ }
+
+ if(strlen(htdig_parms->URL) > 0)
+ {
+ config->Add("start_url", htdig_parms->URL);
+ free(myURL);
+ myURL=NULL;
+ }
+
+ //------- end custom filters from htdig_parms ----------
+
+ // Set up credentials for this run
+ if (credentials.length ())
+ config->Add ("authorization", credentials);
+
+ //
+ // Check url_part_aliases and common_url_parts for
+ // errors.
+ String url_part_errors = HtURLCodec::instance ()->ErrMsg ();
+
+ if (url_part_errors.length () != 0)
+ {
+ reportError (form("[HTDIG] Invalid url_part_aliases or common_url_parts: %s",
+ url_part_errors.get ()));
+ return(HTDIG_ERROR_URL_PART);
+ }
+ //
+ // Check url_rewrite_rules for errors.
+ String url_rewrite_rules = HtURLRewriter::instance ()->ErrMsg ();
+
+ if (url_rewrite_rules.length () != 0)
+ {
+ reportError (form ("[HTDIG] Invalid url_rewrite_rules: %s",
+ url_rewrite_rules.get ()));
+ return(HTDIG_ERROR_URL_REWRITE);
+ }
+
+ //
+ // If indicated, change the database file names to have the .work
+ // extension
+ //
+ if (alt_work_area != 0)
+ {
+ String configValue = config->Find ("doc_db");
+
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("doc_db", configValue);
+ }
+
+ configValue = config->Find ("word_db");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("word_db", configValue);
+ }
+
+ configValue = config->Find ("doc_index");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("doc_index", configValue);
+ }
+
+ configValue = config->Find ("doc_excerpt");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("doc_excerpt", configValue);
+ }
+
+ configValue = config->Find ("md5_db");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("md5_db", configValue);
+ }
+ }
+
+ //
+ // If needed, we will create a list of every URL we come across.
+ //TODO put document-index log file stuff here
+
+ if (config->Boolean ("create_url_list"))
+ {
+ const String filename = config->Find ("url_list");
+ urls_seen = fopen (filename, initial ? "w" : "a");
+ if (urls_seen == 0)
+ {
+ reportError (form ("[HTDIG] Unable to create URL file '%s'",
+ filename.get ()));
+ return(HTDIG_ERROR_URL_CREATE_FILE);
+ }
+ }
+
+ //
+ // If needed, we will create a list of every image we come across.
+ //
+ if (config->Boolean ("create_image_list"))
+ {
+ const String filename = config->Find ("image_list");
+ images_seen = fopen (filename, initial ? "w" : "a");
+ if (images_seen == 0)
+ {
+ reportError (form ("[HTDIG] Unable to create images file '%s'",
+ filename.get ()));
+ return(HTDIG_ERROR_IMAGE_CREATE_FILE);
+ }
+ }
+
+ //
+ // Set up the limits list
+ //
+ StringList l (config->Find ("limit_urls_to"), " \t");
+ limits.setEscaped (l, config->Boolean ("case_sensitive"));
+ l.Destroy ();
+
+ l.Create (config->Find ("limit_normalized"), " \t");
+ limitsn.setEscaped (l, config->Boolean ("case_sensitive"));
+ l.Destroy ();
+
+ //
+ // Open the document database
+ //
+ const String filename = config->Find ("doc_db");
+ if (initial)
+ unlink (filename);
+
+ const String index_filename = config->Find ("doc_index");
+ if (initial)
+ unlink (index_filename);
+
+ const String head_filename = config->Find ("doc_excerpt");
+ if (initial)
+ unlink (head_filename);
+
+ if (docs.Open (filename, index_filename, head_filename) < 0)
+ {
+ reportError (form ("[HTDIG] Unable to open/create document database '%s'",
+ filename.get ()));
+ return(HTDIG_ERROR_OPEN_CREATE_DOCDB);
+ }
+
+ const String word_filename = config->Find ("word_db");
+ if (initial)
+ unlink (word_filename);
+
+ // Initialize htword
+ wc = new WordContext;
+ wc->Initialize(*config);
+
+
+ //a_basicdoc = new BasicDocument;
+ Indexer = new TextCollector;
+
+ a_basicdoc = &the_basicdoc;
+ a_basicdoc->Reset();
+
+ //Indexer = &the_Indexer;
+
+ if ((a_basicdoc == NULL) || (Indexer == NULL))
+ return(FALSE);
+
+
+ htdig_index_open_flag = TRUE;
+
+ return(TRUE);
+
+}
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_simple_doc(...)
+ *
+ *
+ * indexes a simple document supplied by parameter
+ *
+ * see libhtdig_api.h headerfile for definition of
+ * htdig_simple_doc_struct
+ *
+ * TODO Examine external function calls for error return
+ * codes
+ *
+ *******************************************************/
+//
+// Index one caller-supplied document: copies the fields of the incoming
+// htdig_simple_doc_struct into the module-level BasicDocument and hands
+// it to the TextCollector indexer.  Requires a prior successful
+// htdig_index_open().  Always returns TRUE; the IndexDoc() status is
+// captured but not yet propagated (see TODO below).
+//
+int htdig_index_simple_doc(htdig_simple_doc_struct * a_simple_doc)
+{
+ int index_error = 0;
+ //int ret = 0;
+
+ // Reset the document to clean out any old data
+ a_basicdoc->Reset();
+
+ // Copy each field of the incoming struct into the working document.
+ a_basicdoc->ModTime(a_simple_doc->doc_time);
+ a_basicdoc->Location(a_simple_doc->location);
+ a_basicdoc->DocumentID(a_simple_doc->documentid);
+ a_basicdoc->Title(a_simple_doc->title);
+ a_basicdoc->MetaContent(a_simple_doc->meta);
+ a_basicdoc->Contents(a_simple_doc->contents); //MUST ALLOCATE & FREE!!!
+ a_basicdoc->ContentType(a_simple_doc->content_type); //MIME-ISH string
+ a_basicdoc->Length();
+
+
+ //TODO What is this error?
+ // NOTE(review): index_error is captured but never checked or returned,
+ // so indexing failures are silently reported as success.
+ index_error = Indexer->IndexDoc(*a_basicdoc);
+
+ return(TRUE);
+}
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_urls(...)
+ *
+ * Starts fetch & index of URL supplied in config file
+ * OR supplied in htdig_index_open parameter
+ *
+ * TODO Examine external function calls for error return
+ * codes
+ * TODO Blank/empty URL error?
+ *******************************************************/
+//
+// Crawl and index URLs.  The start set comes from, in priority order:
+// the myURL list saved at open time, else the file named by minimalFile,
+// else the existing document database plus the config's start_url.
+// Optionally dumps text versions of the databases and reports
+// statistics.  Always returns TRUE.
+//
+int htdig_index_urls(void)
+{
+
+ char * temp_URL_list = NULL;
+ char * temp_url = NULL;
+
+ // Create the Retriever object which we will use to parse all the
+ // HTML files.
+ // In case this is just an update dig, we will add all existing
+ // URLs?
+ //
+ Retriever retriever (Retriever_logUrl);
+ if (minimalFile.length () == 0)
+ {
+ // Seed the retriever with every URL already in the document DB.
+ List *list = docs.URLs ();
+ retriever.Initial (*list);
+ delete list;
+
+ // Add start_url to the initial list of the retriever.
+ // Don't check a URL twice!
+ // Beware order is important, if this bugs you could change
+ // previous line retriever.Initial(*list, 0) to Initial(*list,1)
+ retriever.Initial (config->Find ("start_url"), 1);
+ }
+
+ // Handle list of URLs given on 'command-line'
+ if (myURL != NULL)
+ {
+ String str;
+ // strtok() mutates its argument, so tokenize a private copy.
+ // NOTE: strtok() is not reentrant/thread-safe.
+ temp_URL_list = strdup(myURL);
+ temp_url = strtok(temp_URL_list, URL_SEPCHARS);
+ while (temp_url != NULL)
+ {
+ str = temp_url;
+ str.chop ("\r\n");
+ if (str.length () > 0)
+ retriever.Initial (str, 1);
+
+ temp_url = strtok(NULL, URL_SEPCHARS);
+ }
+ free(temp_URL_list);
+ }
+ else if (minimalFile.length () != 0)
+ {
+ // Read start URLs from a file, one per line; lines longer than the
+ // fgets buffer are split, not truncated cleanly.
+ FILE *input = fopen (minimalFile.get (), "r");
+ char buffer[1000];
+
+ if (input)
+ {
+ while (fgets (buffer, sizeof (buffer), input))
+ {
+ String str (buffer);
+ str.chop ("\r\n\t ");
+ if (str.length () > 0)
+ retriever.Initial (str, 1);
+ }
+ fclose (input);
+ }
+ // NOTE(review): a missing/unreadable URL file is silently ignored.
+ }
+
+ //
+ // Go do it!
+ //
+ retriever.Start ();
+
+ //
+ // All done with parsing.
+ //
+
+ //
+ // If the user so wants, create a text version of the document database.
+ //
+
+ if (create_text_database)
+ {
+ const String doc_list = config->Find ("doc_list");
+ if (initial)
+ unlink (doc_list);
+ docs.DumpDB (doc_list);
+ const String word_dump = config->Find ("word_dump");
+ if (initial)
+ unlink (word_dump);
+ HtWordList words (*config);
+ if (words.Open (config->Find ("word_db"), O_RDONLY) == OK)
+ {
+ words.Dump (word_dump);
+ }
+ }
+
+ //
+ // Cleanup
+ //
+ // images_seen is closed here; urls_seen stays open until
+ // htdig_index_close().  images_seen is not reset to NULL afterwards.
+ if (images_seen)
+ fclose (images_seen);
+
+ //
+ // If needed, report some statistics
+ //
+ if (report_statistics)
+ {
+ retriever.ReportStatistics ("htdig");
+ }
+
+ return(TRUE);
+}
+
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_close(...)
+ *
+ * Closes the database and destroys various objects
+ *
+ * TODO Examine external function calls for error return
+ * codes
+ *
+ *******************************************************/
+//
+// Close the databases opened by htdig_index_open() and release the
+// objects created there.  No-op when the index is not open.  Returns
+// TRUE on success, or HTDIG_ERROR_LOGFILE_CLOSE if the debug log cannot
+// be closed.
+//
+int htdig_index_close(void)
+{
+ int ret = -1;
+
+ if(htdig_index_open_flag == TRUE)
+ {
+ //delete a_basicdoc;
+ //delete Indexer;
+ // NOTE(review): Indexer is allocated with new in htdig_index_open()
+ // but the delete above is commented out, so it leaks here.
+
+ Indexer->FlushWordDB();
+
+ if (_cookie_jar)
+ delete _cookie_jar;
+
+ //if (max_hops != NULL)
+ // free(max_hops);
+
+ // myURL is freed but not reset to NULL; the open-flag guard is what
+ // prevents a double free on a second close.
+ if (myURL != NULL)
+ free(myURL);
+
+ //call destructors here
+ // NOTE(review): explicit destructor call on 'docs'; if 'docs' is a
+ // static-duration object it will be destroyed again at program
+ // exit, which is undefined behavior -- confirm its lifetime.
+ docs.~DocumentDB();
+ //config->~HtConfiguration();
+
+ if (debug != 0)
+ {
+ ret = logClose();
+
+ if (ret == FALSE)
+ {
+ // NOTE(review): returning here skips deleting wc, closing
+ // urls_seen and clearing the open flag.
+ reportError (form ("[HTDIG] Error closing log file . Error:[%d], %s\n",
+ errno, strerror(errno)) );
+ return(HTDIG_ERROR_LOGFILE_CLOSE);
+ }
+ }
+
+ /*
+ if(config) {
+ WordContext::Finish();
+ }
+ */
+
+ if (wc)
+ delete wc;
+
+ if (urls_seen)
+ fclose (urls_seen);
+
+ htdig_index_open_flag = FALSE;
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_reset(...)
+ *
+ *
+ * TODO Examine external function calls for error return
+ * codes
+ *
+ *******************************************************/
+
+//
+// Flush pending words to the word database and clear the working
+// document, leaving the index open for further
+// htdig_index_simple_doc() calls.  Always returns TRUE.
+//
+int htdig_index_reset(void)
+{
+ Indexer->FlushWordDB();
+ a_basicdoc->Reset();
+
+ return(TRUE);
+}
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_get_max_head_length(...)
+ *
+ *
+ * Returns size of maximum document storage length
+ * for db.excerpts [htdig.conf:max_head_length]
+ *
+ * This represents the maximum amount of the document
+ * That will be available for excerpting.
+ *
+ *
+ *******************************************************/
+
+//
+// Return the configured max_head_length -- the maximum number of bytes
+// of a document stored in db.excerpts, i.e. available for excerpting --
+// or -1 when no configuration has been loaded yet.
+//
+int htdig_get_max_head_length()
+{
+ int ret = -1;
+
+ if(config != NULL)
+ ret = config->Value("max_head_length");
+
+ return(ret);
+}
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_test_url(...)
+ *
+ *
+ * Test a URL for filter Pass/Fail
+ *
+ * Pass = return(TRUE)
+ * Fail = return(XXX) [Negative Value]
+ *
+ *
+ *
+ *
+ *
+ *******************************************************/
+
+
+//int htdig_index_test_url(htdig_parameters_struct *htdig_parms)
+//
+// Test a URL against the indexing filters: URL rewrite rules, bad/valid
+// extensions, exclude_urls, bad_querystr, limit_urls_to /
+// limit_normalized, and the search-time restrict/exclude lists.
+// On success returns TRUE and stores the rewritten URL in
+// htdig_parms->rewritten_URL; on rejection returns the negative
+// HTDIG_ERROR_TESTURL_* code of the first filter that failed.
+//
+int htdig_index_test_url(htdig_parameters_struct *htdig_parms)
+{
+ //int ret = FALSE;
+ String the_URL(htdig_parms->URL);
+ HtConfiguration* config= HtConfiguration::config();
+ Dictionary invalids;
+ Dictionary valids;
+ URL aUrl(the_URL);
+ String rewritten_url(the_URL);
+ StringList tmpList;
+ HtRegex limitTo;
+ HtRegex excludeFrom;
+
+ //initialize outgoing-parameter rewritten_URL
+ htdig_parms->rewritten_URL[0] = 0;
+
+#ifdef DEBUG
+ //output relevant config variables
+ cout << " bad_extensions = " << config->Find("bad_extensions") << endl;
+ cout << " valid_extensions = " << config->Find("valid_extensions") << endl;
+ cout << " exclude_urls = " << config->Find("exclude_urls") << endl;
+ cout << " bad_querystr = " << config->Find("bad_querystr") << endl;
+ cout << " limit_urls_to = " << config->Find("limit_urls_to") << endl;
+ cout << " limit_normalized = " << config->Find("limit_normalized") << endl;
+ cout << " restrict = " << config->Find("restrict") << endl;
+ cout << " exclude = " << config->Find("exclude") << endl;
+#endif
+
+ //------------ read the config file if it is given ---------------
+ if (htdig_parms->configFile[0] != 0)
+ configFile = htdig_parms->configFile;
+
+ config = HtConfiguration::config ();
+
+ config->Defaults (&defaults[0]);
+ if (access ((char *) configFile, R_OK) < 0)
+ {
+ reportError (form ("[HTDIG] Unable to find configuration file '%s'",
+ configFile.get ()));
+ return(HTDIG_ERROR_CONFIG_READ);
+ }
+ config->Read (configFile);
+
+ //---------- Now override config settings -----------------
+
+ //------- override database path ------------
+ if(strlen(htdig_parms->DBpath) > 0)
+ {
+ config->Add("database_dir", htdig_parms->DBpath);
+ }
+
+ //------- custom filters from htdig_parms ----------
+
+ if(strlen(htdig_parms->locale) > 0)
+ {
+ config->Add("locale", htdig_parms->locale);
+ }
+
+ if (config->Find ("locale").empty () && debug > 0)
+ logEntry("Warning: unknown locale!\n");
+
+ if (strlen(htdig_parms->max_hops) > 0)
+ {
+ config->Add ("max_hop_count", htdig_parms->max_hops);
+ }
+
+ if(strlen(htdig_parms->limit_urls_to) > 0)
+ {
+ config->Add("limit_urls_to", htdig_parms->limit_urls_to);
+ }
+
+ if(strlen(htdig_parms->limit_normalized) > 0)
+ {
+ config->Add("limit_normalized", htdig_parms->limit_normalized);
+ }
+
+ if(strlen(htdig_parms->exclude_urls) > 0)
+ {
+ config->Add("exclude_urls", htdig_parms->exclude_urls);
+ }
+
+ if(strlen(htdig_parms->url_rewrite_rules) > 0)
+ {
+ config->Add("url_rewrite_rules", htdig_parms->url_rewrite_rules);
+ }
+
+ if(strlen(htdig_parms->bad_querystr) > 0)
+ {
+ config->Add("bad_querystr", htdig_parms->bad_querystr);
+ }
+
+ // NOTE(review): duplicate override -- locale was already applied above.
+ if(strlen(htdig_parms->locale) > 0)
+ {
+ config->Add("locale", htdig_parms->locale);
+ }
+
+ if(strlen(htdig_parms->meta_description_factor) > 0)
+ {
+ config->Add("meta_description_factor", htdig_parms->meta_description_factor);
+ }
+
+ if(strlen(htdig_parms->title_factor) > 0)
+ {
+ config->Add("title_factor", htdig_parms->title_factor);
+ }
+
+ if(strlen(htdig_parms->text_factor) > 0)
+ {
+ config->Add("text_factor", htdig_parms->text_factor);
+ }
+
+ //-------------------------------------------------------------------
+
+#ifdef DEBUG
+ //output relevant config variables
+ cout << " bad_extensions = " << config->Find("bad_extensions") << endl;
+ cout << " valid_extensions = " << config->Find("valid_extensions") << endl;
+ cout << " exclude_urls = " << config->Find("exclude_urls") << endl;
+ cout << " bad_querystr = " << config->Find("bad_querystr") << endl;
+ cout << " limit_urls_to = " << config->Find("limit_urls_to") << endl;
+ cout << " limit_normalized = " << config->Find("limit_normalized") << endl;
+ cout << " restrict = " << config->Find("restrict") << endl;
+ cout << " exclude = " << config->Find("exclude") << endl;
+#endif
+
+
+ //------ bad_extensions -----------------------------------------------
+ //A list of bad extensions, separated by spaces or tabs
+
+ String t = config->Find("bad_extensions");
+ String lowerp;
+ // strtok() mutates the buffer; 't' is a private copy of the config
+ // value.  NOTE: strtok() is not reentrant/thread-safe.
+ char *p = strtok(t, " \t");
+ while (p)
+ {
+ // Extensions are case insensitive
+ lowerp = p;
+ lowerp.lowercase();
+ invalids.Add(lowerp, 0);
+ p = strtok(0, " \t");
+ }
+
+
+ //------ valid_extensions ------------------------------------------------
+ // Valid extensions are performed similarly
+ // A list of valid extensions, separated by spaces or tabs
+
+ t = config->Find("valid_extensions");
+ p = strtok(t, " \t");
+ while (p)
+ {
+ // Extensions are case insensitive
+ lowerp = p;
+ lowerp.lowercase();
+ valids.Add(lowerp, 0);
+ p = strtok(0, " \t");
+ }
+
+ //----- rewrite the URL------------------------------------------
+ aUrl.rewrite();
+ rewritten_url = aUrl.get();
+
+ if(rewritten_url.length() <= 0)
+ {
+ //Rejected: empty rewritten URL
+ // NOTE(review): assumes rewritten_URL is large enough to hold the
+ // rule string -- confirm the buffer size in libhtdig_api.h.
+ String temp = config->Find("url_rewrite_rules");
+ strcpy(htdig_parms->rewritten_URL, temp.get());
+ // NOTE(review): debugging leftover -- shells out with unescaped
+ // config text (command-injection risk) and writes to a hard-coded
+ // /tmp path; should be removed.
+ system(form("echo \"%s\" > /tmp/neal", temp.get()));
+
+ return(HTDIG_ERROR_TESTURL_REWRITE_EMPTY);
+ }
+
+ //cout << form("TestURL: org=[%s]\n", the_URL.get());
+ //cout << form(" rewritten[%s]\n", rewritten_url.get());
+
+ //copy the rewritten URL for outgoing parm pass
+ strcpy(htdig_parms->rewritten_URL, rewritten_url.get());
+
+ //---- exclude_urls ---------------------------------------------
+ // If the URL contains any of the patterns in the exclude list,
+ // mark it as invalid
+
+ /*if(strlen(htdig_parms->exclude_urls) > 0)
+ tmpList.Create(htdig_parms->exclude_urls," \t");
+ else*/
+ tmpList.Create(config->Find("exclude_urls")," \t");
+
+ HtRegexList excludes;
+ excludes.setEscaped(tmpList, config->Boolean("case_sensitive"));
+ if (excludes.match(rewritten_url, 0, 0) != 0)
+ {
+ //Rejected: item in exclude list
+ return(HTDIG_ERROR_TESTURL_EXCLUDE);
+ }
+
+ //---- bad_querystr -------------------------------------------
+ // If the URL has a query string and it is in the bad query list
+ // mark it as invalid
+
+ tmpList.Destroy();
+
+ /*if(strlen(htdig_parms->bad_querystr) > 0)
+ tmpList.Create(htdig_parms->bad_querystr, " \t");
+ else*/
+ tmpList.Create(config->Find("bad_querystr"), " \t");
+
+ HtRegexList badquerystr;
+ badquerystr.setEscaped(tmpList, config->Boolean("case_sensitive"));
+ char *ext = strrchr((char*)rewritten_url, '?');
+ if (ext && badquerystr.match(ext, 0, 0) != 0)
+ {
+ //if (debug > 2)
+ // cout << endl << " Rejected: item in bad query list ";
+ return(HTDIG_ERROR_TESTURL_BADQUERY);
+ }
+
+ //------ invalid_extensions #2 ------
+ // See if the file extension is in the list of invalid ones
+
+ ext = strrchr((char*)rewritten_url, '.');
+ String lowerext;
+ if (ext && strchr(ext,'/')) // Ignore a dot if it's not in the
+ ext = NULL; // final component of the path.
+ if(ext)
+ {
+ lowerext.set(ext);
+ int parm = lowerext.indexOf('?'); // chop off URL parameter
+ if (parm >= 0)
+ lowerext.chop(lowerext.length() - parm);
+ lowerext.lowercase();
+ if (invalids.Exists(lowerext))
+ {
+ //Rejected: Extension is invalid!
+ return(HTDIG_ERROR_TESTURL_EXTENSION);
+ }
+ }
+
+ //------ valid_extensions #2 ------
+ // Or NOT in the list of valid ones
+
+ if (ext && valids.Count() > 0 && !valids.Exists(lowerext))
+ {
+ //Rejected: Extension is not valid!
+ return(HTDIG_ERROR_TESTURL_EXTENSION2);
+ }
+
+ //----- limit_urls_to & limit_normalized ------------------------------
+ // Set up the limits list
+
+ StringList l;
+ /*if(strlen(htdig_parms->limit_urls_to) > 0)
+ l.Create(htdig_parms->limit_urls_to, " \t");
+ else*/
+ l.Create(config->Find ("limit_urls_to"), " \t");
+
+ limits.setEscaped (l, config->Boolean ("case_sensitive"));
+
+ l.Destroy ();
+
+ /*if(strlen(htdig_parms->limit_normalized) > 0)
+ l.Create (htdig_parms->limit_normalized, " \t");
+ else*/
+ l.Create (config->Find ("limit_normalized"), " \t");
+
+ limitsn.setEscaped (l, config->Boolean ("case_sensitive"));
+ l.Destroy ();
+
+ // If any of the limits are met, we allow the URL
+ if (limits.match(rewritten_url, 1, 0) == 0)
+ {
+ //Rejected: URL not in the limits!;
+ return(HTDIG_ERROR_TESTURL_LIMITS);
+ }
+
+
+ // or not in list of normalized urls
+ // Warning! should be last in checks because of aUrl normalization
+ aUrl.normalize();
+ if (limitsn.match(rewritten_url.get(), 1, 0) == 0)
+ {
+ //Rejected: not in "limit_normalized" list!
+ return(HTDIG_ERROR_TESTURL_LIMITSNORM);
+ }
+
+ //----- restrict & exclude ----------------------------------
+ //Search-Time Filters
+
+ String temp;
+
+ /*if(strlen(htdig_parms->search_restrict) > 0)
+ temp = htdig_parms->search_restrict;
+ else*/
+ temp = config->Find("restrict");
+
+ if (temp.length())
+ {
+ // Create a temporary list from either the configuration
+ // file or the input parameter
+ StringList l(temp, " \t\r\n\001|");
+ limitTo.setEscaped(l);
+ }
+
+ /*if(strlen(htdig_parms->search_exclude) > 0)
+ temp = htdig_parms->search_exclude;
+ else*/
+ temp = config->Find("exclude");
+
+ if (temp.length())
+ {
+ // Create a temporary list from either the configuration
+ // file or the input parameter
+ StringList l(temp, " \t\r\n\001|");
+ excludeFrom.setEscaped(l);
+ }
+
+ //Restrict Test
+ if (limitTo.match(rewritten_url, 1, 0) == 0)
+ {
+ //Rejected URL Not in SearchTime Restrict List
+ return(HTDIG_ERROR_TESTURL_SRCH_RESTRICT);
+ }
+ //Exclude Test
+ if (excludeFrom.match(rewritten_url, 0, 0) != 0)
+ {
+ //Rejected URL in SearchTime Exclude List
+ return(HTDIG_ERROR_TESTURL_SRCH_EXCLUDE);
+ }
+
+
+ //Success!
+ return TRUE;
+}
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htfuzzy.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htfuzzy.cc
new file mode 100644
index 00000000..f7597c8e
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htfuzzy.cc
@@ -0,0 +1,265 @@
+//----------------------------------------------------------------
+//
+// libhtdig_htfuzzy.cc
+//
+// 1/25/2002 created from htfuzzy.cc
+//
+// Neal Richter nealr@rightnow.com
+//
+// libhtdig_htfuzzy.cc
+//
+// htfuzzy: Create one or more ``fuzzy'' indexes into the main word database.
+// These indexes can be used by htsearch to perform a search that uses
+// other algorithms than exact word match.
+//
+// This program is meant to be run after htmerge has created the word
+// database.
+//
+// For each fuzzy algorithm, there will be a separate database. Each
+// database is simply a mapping from the fuzzy key to a list of words
+// in the main word database.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: libhtdig_htfuzzy.cc,v 1.5 2004/05/28 13:15:29 lha Exp $
+//
+//----------------------------------------------------------------
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+extern "C"
+{
+#include "libhtdig_api.h"
+}
+
+#include "libhtdig_log.h"
+
+
+//#include "htfuzzy.h" //NOT USED
+
+#include "Fuzzy.h"
+#include "Accents.h"
+#include "Soundex.h"
+#include "Endings.h"
+#include "Metaphone.h"
+#include "Synonym.h"
+#include "htString.h"
+#include "List.h"
+#include "Dictionary.h"
+#include "defaults.h"
+#include "HtWordList.h"
+#include "WordContext.h"
+
+// If we have this, we probably want it.
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+
+#include "HtConfiguration.h"
+#include "HtWordList.h"
+
+#include <stdlib.h>
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+#include <stdio.h>
+
+
+
+extern int debug;
+
+static HtConfiguration * config = NULL;
+
+
+//*****************************************************************************
+// int main(int ac, char **av)
+//
+//int main(int ac, char **av)
+
+//
+// Build the "fuzzy" search databases (soundex, metaphone, accents,
+// endings, synonyms) from the main word database, according to
+// htfuzzy_parms->algorithms_flag.  Always returns 0.
+//
+int htfuzzy_index(htfuzzy_parameters_struct * htfuzzy_parms)
+{
+ String configFile = DEFAULT_CONFIG_FILE;
+ int ret = 0;
+
+ //
+ // Parse command line arguments
+ //
+
+ debug = htfuzzy_parms->debug;
+ if (debug != 0)
+ {
+ ret = logOpen(htfuzzy_parms->logFile);
+
+ if (ret == FALSE)
+ {
+ // Log-open failure is reported but not fatal.
+ fprintf(stderr, "htdig: Error opening file [%s]. Error:[%d], %s\n",
+ htfuzzy_parms->logFile, errno, strerror(errno));
+ }
+ }
+
+
+ configFile = htfuzzy_parms->configFile;
+
+ config = HtConfiguration::config();
+
+ //
+ // Determine what algorithms to use
+ //
+ List wordAlgorithms;
+ List noWordAlgorithms;
+
+ // NOTE(review): algorithms_flag looks like a bitmask, but this else-if
+ // chain selects at most ONE algorithm per call -- confirm whether
+ // multiple flags should be honored together.
+ if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_SOUNDEX)
+ {
+ wordAlgorithms.Add(new Soundex(*config));
+ }
+ else if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_METAPHONE)
+ {
+ wordAlgorithms.Add(new Metaphone(*config));
+ }
+ else if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_ACCENTS)
+ {
+ wordAlgorithms.Add(new Accents(*config));
+ }
+ else if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_ENDINGS)
+ {
+ noWordAlgorithms.Add(new Endings(*config));
+ }
+ else if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_SYNONYMS)
+ {
+ noWordAlgorithms.Add(new Synonym(*config));
+ }
+
+
+ if (wordAlgorithms.Count() == 0 && noWordAlgorithms.Count() == 0)
+ {
+ // Nothing selected: logged only; execution continues and falls
+ // through to the cleanup below.
+ logEntry(form("htfuzzy: No algorithms specified\n"));
+ }
+
+ //
+ // Find and parse the configuration file.
+ //
+ config->Defaults(&defaults[0]);
+ if (access((char *) configFile, R_OK) < 0)
+ {
+ // NOTE(review): error is reported but the code still proceeds to
+ // Read() the unreadable file.
+ reportError(form("[HTFUZZY] Unable to find configuration file '%s'", configFile.get()));
+ }
+ config->Read(configFile);
+
+ // Initialize htword library (key description + wordtype...)
+ WordContext::Initialize(*config);
+
+ Fuzzy *fuzzy;
+ if (wordAlgorithms.Count() > 0)
+ {
+ //
+ // Open the word database so that we can grab the words from it.
+ //
+ HtWordList worddb(*config);
+ if (worddb.Open(config->Find("word_db"), O_RDONLY) == OK)
+ {
+ //
+ // Go through all the words in the database
+ //
+ List *words = worddb.Words();
+ String *key;
+ // NOTE(review): this inner 'fuzzy' shadows the outer declaration.
+ Fuzzy *fuzzy = 0;
+ String word, fuzzyKey;
+ int count = 0;
+
+ words->Start_Get();
+ while ((key = (String *) words->Get_Next()))
+ {
+ word = *key;
+ wordAlgorithms.Start_Get();
+ while ((fuzzy = (Fuzzy *) wordAlgorithms.Get_Next()))
+ {
+ fuzzy->addWord(word);
+ }
+ count++;
+ if ((count % 100) == 0 && debug)
+ {
+ //cout << "htfuzzy: words: " << count << '\n';
+ }
+ }
+ if (debug)
+ {
+ logEntry(form("htfuzzy: total words: %d\n", count));
+ logEntry(form("htfuzzy: Writing index files...\n"));
+ }
+
+ //
+ // All the information is now in memory.
+ // Write all of it out to the individual databases
+ //
+ wordAlgorithms.Start_Get();
+ while ((fuzzy = (Fuzzy *) wordAlgorithms.Get_Next()))
+ {
+ fuzzy->writeDB();
+ }
+ worddb.Close();
+ words->Destroy();
+ delete words;
+ // 'fuzzy' is NULL here (Get_Next() returned 0 to end the loop
+ // above), so this delete appears to be dead code.
+ if (fuzzy)
+ delete fuzzy;
+ }
+ else
+ {
+ reportError(form("[htfuzzy] Unable to open word database %s", config->Find("word_db").get()));
+ }
+ }
+ if (noWordAlgorithms.Count() > 0)
+ {
+ // These algorithms build their databases from external sources
+ // (e.g. dictionaries), not from the word DB.
+ noWordAlgorithms.Start_Get();
+ while ((fuzzy = (Fuzzy *) noWordAlgorithms.Get_Next()))
+ {
+ if (debug)
+ {
+ logEntry(form( "htfuzzy: Selected algorithm: %s\n", fuzzy->getName()));
+ }
+ if (fuzzy->createDB(*config) == NOTOK)
+ {
+ logEntry(form("htfuzzy: Could not create database for algorithm: %s\n", fuzzy->getName()));
+ }
+ }
+ }
+
+ if (debug)
+ {
+ logEntry("htfuzzy: Done.\n");
+ }
+
+ if (debug != 0)
+ {
+ ret = logClose();
+
+ if (ret == FALSE)
+ {
+ fprintf(stderr, "htfuzzy: Error closing file [%s]. Error:[%d], %s\n",
+ htfuzzy_parms->logFile, errno, strerror(errno));
+ }
+ }
+
+
+ // NOTE(review): config comes from the HtConfiguration::config()
+ // singleton; deleting it here leaves the singleton pointer dangling --
+ // confirm this is intentional.
+ delete config;
+
+ return 0;
+}
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc
new file mode 100644
index 00000000..988a8b61
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc
@@ -0,0 +1,407 @@
+//----------------------------------------------------------------
+//
+// libhtdig_htmerge.cc
+//
+// 1/25/2002 created from htmerge.cc
+//
+// Neal Richter nealr@rightnow.com
+//
+// libhtdig_htmerge.cc
+//
+// htmerge: Merges two databases and/or updates databases to remove
+// old documents and ensures the databases are consistent.
+// Calls db.cc, docs.cc, and/or words.cc as necessary
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1999-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: libhtdig_htmerge.cc,v 1.5 2004/05/28 13:15:29 lha Exp $
+//
+//----------------------------------------------------------------
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+extern "C" {
+#include "libhtdig_api.h"
+}
+
+#include "libhtdig_log.h"
+
+#include "WordContext.h"
+#include "good_strtok.h"
+#include "defaults.h"
+#include "DocumentDB.h"
+#include "HtURLCodec.h"
+#include "HtWordList.h"
+#include "HtWordReference.h"
+#include "htString.h"
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+#include <stdio.h>
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+
+// If we have this, we probably want it.
+//#ifdef HAVE_GETOPT_H
+//#include <getopt.h>
+//#endif
+
+
+
+
+
+//Global Variables for this file
+
+// This hash is used to keep track of all the document IDs which have to be
+// discarded.
+// This is generated from the doc database and is used to prune words
+// from the word db
+static Dictionary discard_list;
+
+// This config is used for merging multiple databses
+static HtConfiguration merge_config;
+static HtConfiguration *config = NULL;
+
+static int verbose = 0;
+//static int stats = 0;
+static int alt_work_area = 0;
+
+//static String configFile = DEFAULT_CONFIG_FILE;
+extern String configFile;
+
+static String merge_configFile = 0;
+
+
+// Component procedures
+static int mergeDB ();
+
+//
+// Merge the databases named in merge_configFile into the databases
+// named in configFile (both taken from htmerge_parms).  Returns TRUE on
+// success or a negative HTMERGE_ERROR_* code on failure.
+//
+int htmerge_index_merge(htmerge_parameters_struct *htmerge_parms)
+{
+ int ret = -1;
+ int merge_ret = -1;
+
+ //load htmerge 'command-line parameters'
+ configFile = htmerge_parms->configFile;
+ merge_configFile = htmerge_parms->merge_configFile;
+ verbose = htmerge_parms->debug;
+ if(verbose != 0)
+ {
+ ret = logOpen(htmerge_parms->logFile);
+
+ if(ret == FALSE)
+ {
+ reportError (form ("[HTDIG] Error opening log file [%s] . Error:[%d], %s\n",
+ htmerge_parms->logFile, errno, strerror(errno)) );
+ return(HTMERGE_ERROR_LOGFILE_OPEN);
+ }
+ }
+
+ alt_work_area = htmerge_parms->alt_work_area;
+
+
+
+ config = HtConfiguration::config ();
+ config->Defaults (&defaults[0]);
+
+ if (access ((char *) configFile, R_OK) < 0)
+ {
+ reportError (form ("[HTMERGE] Unable to find configuration file '%s'",
+ configFile.get ()));
+ return(HTMERGE_ERROR_CONFIG_READ);
+ }
+
+ config->Read (configFile);
+
+ //
+ // Check url_part_aliases and common_url_parts for
+ // errors.
+ String url_part_errors = HtURLCodec::instance ()->ErrMsg ();
+
+ if (url_part_errors.length () != 0)
+ {
+ reportError (form("[HTMERGE] Invalid url_part_aliases or common_url_parts: %s",
+ url_part_errors.get ()));
+ return(HTMERGE_ERROR_URL_PART);
+ }
+
+ if (merge_configFile.length ())
+ {
+ merge_config.Defaults (&defaults[0]);
+ if (access ((char *) merge_configFile, R_OK) < 0)
+ {
+ reportError (form ("[HTMERGE] Unable to find configuration file '%s'",
+ merge_configFile.get ()));
+ return(HTMERGE_ERROR_CONFIG_READ);
+ }
+ merge_config.Read (merge_configFile);
+ }
+
+ // When requested, operate on ".work" copies of the databases so the
+ // live databases stay usable during the merge.
+ if (alt_work_area != 0)
+ {
+ String configValue;
+
+ configValue = config->Find ("word_db");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("word_db", configValue);
+ }
+
+ configValue = config->Find ("doc_db");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("doc_db", configValue);
+ }
+
+ configValue = config->Find ("doc_index");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("doc_index", configValue);
+ }
+
+ configValue = config->Find ("doc_excerpt");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("doc_excerpt", configValue);
+ }
+ }
+
+ WordContext::Initialize(*config);
+
+ if (merge_configFile.length())
+ {
+ // Merge the databases specified in merge_configFile into the current
+ // databases. Do this first then update the other databases as usual
+ // Note: We don't have to specify anything, it's all in the config vars
+
+ merge_ret = mergeDB();
+ }
+ // NOTE(review): merge_ret is discarded, so a mergeDB() failure is
+ // still reported as TRUE to the caller.
+
+ //call destructors here
+ // NOTE(review): explicit destructor calls; both objects will be
+ // destroyed again at program exit -- confirm this is intentional.
+ config->~HtConfiguration();
+ merge_config.~HtConfiguration();
+
+ if (verbose != 0)
+ {
+ ret = logClose();
+
+ if (ret == FALSE)
+ {
+ reportError (form("[HTMERGE]: Error closing file [%s]. Error:[%d], %s\n",
+ htmerge_parms->logFile, errno, strerror(errno)) );
+ return(HTMERGE_ERROR_LOGFILE_CLOSE);
+ }
+ }
+
+ return(TRUE);
+}
+
+//*****************************************************************************
+// void mergeDB()
+//
+//
+// Merge the document and word databases named in merge_config into the
+// ones named in the current config.  Document IDs from the merged DB
+// are offset by the target DB's NextDocID() so they remain unique; when
+// the same URL exists in both databases only the most recently dug copy
+// is kept.  Returns TRUE on success or a negative HTMERGE_ERROR_* code.
+//
+static int mergeDB ()
+{
+ HtConfiguration *config = HtConfiguration::config ();
+ DocumentDB merge_db, db;
+ List *urls;
+ Dictionary merge_dup_ids, db_dup_ids; // Lists of DocIds to ignore
+ int docIDOffset;
+
+ const String doc_index = config->Find ("doc_index");
+ if (access (doc_index, R_OK) < 0)
+ {
+ reportError (form
+ ("[HTMERGE] Unable to open document index '%s'",
+ (const char *) doc_index));
+ return(HTMERGE_ERROR_DOCINDEX_READ);
+ }
+ const String doc_excerpt = config->Find ("doc_excerpt");
+ if (access (doc_excerpt, R_OK) < 0)
+ {
+ reportError (form
+ ("[HTMERGE] Unable to open document excerpts '%s'",
+ (const char *) doc_excerpt));
+ return(HTMERGE_ERROR_EXCERPTDB_READ);
+ }
+ const String doc_db = config->Find ("doc_db");
+ if (db.Open (doc_db, doc_index, doc_excerpt) < 0)
+ {
+ reportError (form ("[HTMERGE] Unable to open/create document database '%s'",
+ (const char *) doc_db));
+ return(HTMERGE_ERROR_DOCDB_READ);
+ }
+
+
+ const String merge_doc_index = merge_config["doc_index"];
+ if (access (merge_doc_index, R_OK) < 0)
+ {
+ reportError (form
+ ("[HTMERGE] Unable to open document index '%s'",
+ (const char *) merge_doc_index));
+ return(HTMERGE_ERROR_DOCINDEX_READ);
+ }
+ const String merge_doc_excerpt = merge_config["doc_excerpt"];
+ if (access (merge_doc_excerpt, R_OK) < 0)
+ {
+ reportError (form
+ ("[HTMERGE] Unable to open document excerpts '%s'",
+ (const char *) merge_doc_excerpt));
+ return(HTMERGE_ERROR_EXCERPTDB_READ);
+ }
+ const String merge_doc_db = merge_config["doc_db"];
+ if (merge_db.Open (merge_doc_db, merge_doc_index, merge_doc_excerpt) < 0)
+ {
+ reportError (form ("[HTMERGE] Unable to open document database '%s'",
+ (const char *) merge_doc_db));
+ return(HTMERGE_ERROR_DOCDB_READ);
+ }
+
+ // Start the merging by going through all the URLs that are in
+ // the database to be merged
+
+ urls = merge_db.URLs ();
+ // This ensures that every document added from merge_db has a unique ID
+ // in the new database
+ docIDOffset = db.NextDocID ();
+
+ urls->Start_Get ();
+ String *url;
+ String id;
+ while ((url = (String *) urls->Get_Next ()))
+ {
+ DocumentRef *ref = merge_db[url->get ()];
+ DocumentRef *old_ref = db[url->get ()];
+ if (!ref)
+ continue;
+
+ if (old_ref)
+ {
+ // Oh well, we knew this would happen. Let's get the duplicate
+ // And we'll only use the most recent date.
+
+ if (old_ref->DocTime () >= ref->DocTime ())
+ {
+ // Cool, the ref we're merging is too old, just ignore it
+ char str[20];
+ sprintf (str, "%d", ref->DocID ());
+ merge_dup_ids.Add (str, 0);
+
+ if (verbose > 1)
+ {
+ // BUGFIX: was passing the String* itself to the %s
+ // conversion (undefined behavior); pass the C string
+ // as the other log messages below do.
+ logEntry(form("[HTMERGE] Duplicate, URL: {%s} ignoring & merging copy\n", url->get()));
+ }
+ }
+ else
+ {
+ // The ref we're merging is newer, delete the old one and add
+ char str[20];
+ sprintf (str, "%d", old_ref->DocID ());
+ db_dup_ids.Add (str, 0);
+ db.Delete (old_ref->DocID ());
+ ref->DocID (ref->DocID () + docIDOffset);
+ db.Add (*ref);
+ if (verbose > 1)
+ {
+ logEntry(form("[HTMERGE] Duplicate, URL: {%s} ignoring destination copy\n",url->get()));
+ }
+ }
+ }
+ else
+ {
+ // It's a new URL, just add it, making sure to load the excerpt
+ merge_db.ReadExcerpt (*ref);
+ ref->DocID (ref->DocID () + docIDOffset);
+ db.Add (*ref);
+ if (verbose > 1)
+ {
+ logEntry(form("[HTMERGE] Merged URL: {%s} \n",url->get()));
+ }
+ }
+ delete ref;
+ delete old_ref;
+ }
+ delete urls;
+
+ // As reported by Roman Dimov, we must update db.NextDocID()
+ // because of all the added records...
+ db.IncNextDocID (merge_db.NextDocID ());
+ merge_db.Close ();
+ db.Close ();
+
+ // OK, after merging the doc DBs, we do the same for the words
+ HtWordList mergeWordDB (*config), wordDB (*config);
+ List *words;
+ String docIDKey;
+
+ if (wordDB.Open (config->Find ("word_db"), O_RDWR) < 0)
+ {
+ reportError (form ("[HTMERGE] Unable to open/create word database '%s'",
+ (const char *) config->Find ("word_db")));
+ return(HTMERGE_ERROR_WORDDB_READ);
+ }
+
+ if (mergeWordDB.Open (merge_config["word_db"], O_RDONLY) < 0)
+ {
+ reportError (form ("[HTMERGE] Unable to open word database '%s'",
+ (const char *) merge_config["word_db"]));
+ return(HTMERGE_ERROR_WORDDB_READ);
+ }
+
+ // Start the merging by going through all the URLs that are in
+ // the database to be merged
+
+ words = mergeWordDB.WordRefs ();
+
+ words->Start_Get ();
+ HtWordReference *word;
+ while ((word = (HtWordReference *) words->Get_Next ()))
+ {
+ // Skip words of documents that lost the duplicate contest above.
+ docIDKey = word->DocID ();
+ if (merge_dup_ids.Exists (docIDKey))
+ continue;
+
+ word->DocID (word->DocID () + docIDOffset);
+ wordDB.Override (*word);
+ }
+ delete words;
+
+ words = wordDB.WordRefs ();
+ words->Start_Get ();
+ while ((word = (HtWordReference *) words->Get_Next ()))
+ {
+ // Drop words of documents that were replaced by newer copies.
+ docIDKey = word->DocID ();
+ if (db_dup_ids.Exists (docIDKey))
+ wordDB.Delete (*word);
+ }
+ delete words;
+
+ // Cleanup--just close the two word databases
+ mergeWordDB.Close ();
+ wordDB.Close ();
+
+ return(TRUE);
+
+}
+
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc
new file mode 100644
index 00000000..472b5fc2
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc
@@ -0,0 +1,1099 @@
+//----------------------------------------------------------------
+//
+// libhtdig_htsearch.cc
+//
+// 1/25/2002 created from htsearch.cc
+//
+// Neal Richter nealr@rightnow.com
+//
+//
+// htsearch: The main search CGI. Parses the CGI input, reads the config files
+// and calls the necessary code to put together the result lists
+// and the final display.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: libhtdig_htsearch.cc,v 1.4 2004/05/28 13:15:29 lha Exp $
+//
+//----------------------------------------------------------------
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+extern "C"
+{
+#include "libhtdig_api.h"
+}
+
+#include "libhtdig_log.h"
+
+
+#include "htsearch.h"
+#include "defaults.h"
+#include "WeightWord.h"
+#include "parser.h"
+#include "ResultFetch.h"
+#include "../htfuzzy/Fuzzy.h"
+#include "cgi.h"
+#include "WordRecord.h"
+#include "HtWordList.h"
+#include "StringList.h"
+#include "IntObject.h"
+#include "HtURLCodec.h"
+#include "HtURLRewriter.h"
+#include "WordContext.h"
+#include "HtRegex.h"
+#include "Collection.h"
+
+//define _XOPEN_SOURCE
+//#define _GNU_SOURCE
+#include <time.h>
+#include <ctype.h>
+#include <signal.h>
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+
+// If we have this, we probably want it.
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+
+typedef void (*SIGNAL_HANDLER) (...);
+
+// ResultList *htsearch(const String&, List &, Parser *);
+int htsearch(Collection *, List &, Parser *);
+
+void setupWords(char *, List &, int, Parser *, String &);
+void createLogicalWords(List &, String &, String &);
+void reportError(char *);
+void convertToBoolean(List & words);
+void doFuzzy(WeightWord *, List &, List &);
+void addRequiredWords(List &, StringList &);
+
+int minimum_word_length = 3;
+
+StringList boolean_keywords;
+
+Parser *parser = NULL;
+
+extern String configFile;
+extern int debug;
+
+static HtConfiguration *config = NULL;
+Dictionary selected_collections; // Multiple database support
+Collection *collection = NULL;
+String errorMsg;
+
+String originalWords;
+String origPattern;
+String logicalWords;
+String logicalPattern;
+StringMatch *searchWordsPattern = NULL;
+StringList requiredWords; //TODO add this
+
+HtRegex limit_to;
+HtRegex exclude_these;
+
+// List searchWords;
+List *searchWords = NULL;
+
+StringList collectionList; // List of databases to search on
+
+
+static int total_matches = 0;
+static List *matches_list = 0;
+static ResultFetch *resultfetch = 0;
+
+
+//*****************************************************************************
+// int main()
+//
+//int main(int ac, char **av)
+int htsearch_open(htsearch_parameters_struct * htsearch_parms)
+{
+ int ret = -1;
+ int override_config = 0;
+
+ String logicalWords;
+ String logicalPattern;
+ // StringMatch searchWordsPattern;
+ StringMatch *searchWordsPattern = NULL;
+ StringList requiredWords;
+ //int i;
+ //int c;
+ int cInd = 0;
+
+ //load 'command-line' parameters
+
+ if (htsearch_parms->configFile[0] != 0)
+ configFile = htsearch_parms->configFile;
+
+ debug = htsearch_parms->debug;
+ if (debug != 0)
+ {
+ ret = logOpen(htsearch_parms->logFile);
+
+ if (ret == FALSE)
+ {
+ reportError(form("[HTDIG] Error opening log file [%s] . Error:[%d], %s\n",
+ htsearch_parms->logFile, errno, strerror(errno)));
+ return (HTSEARCH_ERROR_LOGFILE_OPEN);
+ }
+ }
+
+
+ //case 'c':
+ // The default is obviously to do this securely
+ // but if people want to shoot themselves in the foot...
+ // configFile = optarg;
+ // override_config = 1;
+
+ //
+ // The total search can NEVER take more than 5 minutes.
+ //
+ //alarm(5 * 60);
+
+ errorMsg = "";
+
+ config = HtConfiguration::config();
+
+ // Each collection is handled in an iteration. Reset the following so
+ // that we start with a clean slate.
+ //
+ logicalWords = 0;
+ origPattern = 0;
+ logicalPattern = 0;
+ searchWords = new List;
+ searchWordsPattern = new StringMatch;
+
+ char *config_name = collectionList[cInd];
+ if (config_name && config_name[0] == '\0')
+ config_name = NULL; // use default config
+
+ //
+ // Setup the configuration database. First we read the compiled defaults.
+ // Then we override those with defaults read in from the configuration
+ // file, and finally we override some attributes with information we
+ // got from the HTML form.
+ //
+ config->Defaults(&defaults[0]);
+ // To allow . in filename while still being 'secure',
+ // e.g. htdig-f.q.d.n.conf
+ if (!override_config && config_name && (strstr(config_name, "./") == NULL))
+ {
+ char *configDir = getenv("CONFIG_DIR");
+ if (configDir)
+ {
+ configFile = configDir;
+ }
+ else
+ {
+ configFile = CONFIG_DIR;
+ }
+ if (strlen(config_name) == 0)
+ configFile = DEFAULT_CONFIG_FILE;
+ else
+ configFile << '/' << config_name << ".conf";
+ }
+ if (access((char *) configFile, R_OK) < 0)
+ {
+ reportError(form("Unable to read configuration file '%s'", configFile.get()));
+ return (HTSEARCH_ERROR_CONFIG_READ);
+ }
+ config->Read(configFile);
+
+
+ //---------- Now override config settings -----------------
+
+ //------- override database path ------------
+ if (strlen(htsearch_parms->DBpath) > 0)
+ {
+ config->Add("database_dir", htsearch_parms->DBpath);
+ }
+
+ //------- custom filters from htsearch_parms ----------
+
+ //restrict, exclude, urlrewrite
+
+
+ if (strlen(htsearch_parms->meta_description_factor) > 0)
+ {
+ config->Add("meta_description_factor", htsearch_parms->meta_description_factor);
+ }
+
+ if (strlen(htsearch_parms->title_factor) > 0)
+ {
+ config->Add("title_factor", htsearch_parms->title_factor);
+ }
+
+ if (strlen(htsearch_parms->text_factor) > 0)
+ {
+ config->Add("text_factor", htsearch_parms->text_factor);
+ }
+
+ if(strlen(htsearch_parms->locale) > 0)
+ {
+ config->Add("locale", htsearch_parms->locale);
+ }
+
+ //-------------------------------------------------------------------
+
+
+ // Initialize htword library (key description + wordtype...)
+ WordContext::Initialize(*config);
+
+//NON-CGI Usage libhtdig
+/*
+
+ config->Add("match_method", input["method"]);
+ config->Add("template_name", input["format"]);
+
+ // minimum check for a valid int value of "matchesperpage" cgi variable
+ if (atoi(input["matchesperpage"]) > 0)
+ config->Add("matches_per_page", input["matchesperpage"]);
+
+ pageNumber = atoi(input["page"]);
+ config->Add("config", input["config"]);
+ config->Add("restrict", input["restrict"]);
+ config->Add("exclude", input["exclude"]);
+ config->Add("keywords", input["keywords"]);
+ requiredWords.Create(config->Find("keywords"), " \t\r\n\001");
+ config->Add("sort", input["sort"]);
+
+ config->Add("startmonth", input["startmonth"]);
+ config->Add("startday", input["startday"]);
+ config->Add("startyear", input["startyear"]);
+
+ config->Add("endmonth", input["endmonth"]);
+ config->Add("endday", input["endday"]);
+ config->Add("endyear", input["endyear"]);
+
+
+ StringList form_vars(config->Find("allow_in_form"), " \t\r\n");
+ for (i = 0; i < form_vars.Count(); i++)
+ {
+ if (input.exists(form_vars[i]))
+ config->Add(form_vars[i], input[form_vars[i]]);
+ }
+
+*/
+//NON-CGI Usage libhtdig
+
+
+ minimum_word_length = config->Value("minimum_word_length", minimum_word_length);
+
+ //
+ // Compile the URL limit patterns.
+ //
+
+ if (config->Find("restrict").length())
+ {
+ // Create a temporary list from either the configuration
+ // file or the input parameter
+ StringList l(config->Find("restrict"), " \t\r\n\001|");
+ limit_to.setEscaped(l);
+ String u = l.Join('|');
+ config->Add("restrict", u); // re-create the config attribute
+ }
+ if (config->Find("exclude").length())
+ {
+ // Create a temporary list from either the configuration
+ // file or the input parameter
+ StringList l(config->Find("exclude"), " \t\r\n\001|");
+ exclude_these.setEscaped(l);
+ String u = l.Join('|');
+ config->Add("exclude", u); // re-create the config attribute
+ }
+
+ //
+ // Check url_part_aliases and common_url_parts for
+ // errors.
+ String url_part_errors = HtURLCodec::instance()->ErrMsg();
+
+ if (url_part_errors.length() != 0)
+ {
+ reportError(form("Invalid url_part_aliases or common_url_parts: %s", url_part_errors.get()));
+ return (HTSEARCH_ERROR_URL_PART);
+
+ }
+
+ // for htsearch, use search_rewrite_rules attribute for HtURLRewriter.
+ config->AddParsed("url_rewrite_rules", "${search_rewrite_rules}");
+ url_part_errors = HtURLRewriter::instance()->ErrMsg();
+ if (url_part_errors.length() != 0)
+ reportError(form("Invalid url_rewrite_rules: %s", url_part_errors.get()));
+
+ // Load boolean_keywords from configuration
+ // they should be placed in this order:
+ // 0 1 2
+ // and or not
+ boolean_keywords.Create(config->Find("boolean_keywords"), "| \t\r\n\001");
+ if (boolean_keywords.Count() != 3)
+ reportError("boolean_keywords attribute should have three entries");
+
+
+
+ parser = new Parser();
+
+ return (TRUE);
+}
+
+//---------------------------------------------------------------------------------------
+//
+//
+// RETURN: Number of Documents resulted from search
+//
+//---------------------------------------------------------------------------------------
+
+int htsearch_query(htsearch_query_struct * htseach_query)
+{
+ int total_match_count = 0;
+
+ originalWords = htseach_query->raw_query;
+ originalWords.chop(" \t\r\n");
+
+ //sort
+ switch (htseach_query->sortby_flag)
+ {
+ case HTSEARCH_SORT_SCORE:
+ config->Add("sort", "score");
+ break;
+ case HTSEARCH_SORT_REV_SCORE:
+ config->Add("sort", "revscore");
+ break;
+ case HTSEARCH_SORT_TIME:
+ config->Add("sort", "time");
+ break;
+ case HTSEARCH_SORT_REV_TIME:
+ config->Add("sort", "revtime");
+ break;
+ case HTSEARCH_SORT_TITLE:
+ config->Add("sort", "title");
+ break;
+ case HTSEARCH_SORT_REV_TITLE:
+ config->Add("sort", "revtitle");
+ break;
+ }
+
+
+ switch (htseach_query->algorithms_flag)
+ {
+ case HTSEARCH_ALG_BOOLEAN:
+ config->Add("match_method", "boolean");
+ break;
+ case HTSEARCH_ALG_OR:
+ config->Add("match_method", "or");
+ break;
+ case HTSEARCH_ALG_AND:
+ config->Add("match_method", "and");
+ break;
+ }
+
+ //format
+ switch (htseach_query->algorithms_flag)
+ {
+ case HTSEARCH_FORMAT_SHORT:
+ config->Add("template_name", "builtin-short");
+ break;
+ case HTSEARCH_FORMAT_LONG:
+ config->Add("template_name", "builtin-long");
+ break;
+ }
+
+
+ origPattern = 0;
+ logicalWords = 0;
+ logicalPattern = 0;
+ searchWordsPattern = new StringMatch;
+
+ // Iterate over all specified collections (databases)
+ //for (int cInd = 0; errorMsg.empty() && cInd < collectionList.Count(); cInd++)
+ //{
+
+ // Parse the words to search for from the argument list.
+ // This will produce a list of WeightWord objects.
+ //
+ setupWords(originalWords, *searchWords,
+ strcmp(config->Find("match_method"), "boolean") == 0, parser, origPattern);
+
+ //
+ // Convert the list of WeightWord objects to a pattern string
+ // that we can compile.
+ //
+ createLogicalWords(*searchWords, logicalWords, logicalPattern);
+
+ //
+ // Assemble the full pattern for excerpt matching and highlighting
+ //
+ origPattern += logicalPattern;
+ searchWordsPattern->IgnoreCase();
+ searchWordsPattern->IgnorePunct();
+ searchWordsPattern->Pattern(logicalPattern); // this should now be enough
+ //searchWordsPattern.Pattern(origPattern);
+ //if (debug > 2)
+ // cout << "Excerpt pattern: " << origPattern << "\n";
+
+ //
+ // If required keywords were given in the search form, we will
+ // modify the current searchWords list to include the required
+ // words.
+ //
+ if (requiredWords.Count() > 0)
+ {
+ addRequiredWords(*searchWords, requiredWords);
+ }
+
+ //
+ // Perform the actual search. The function htsearch() is used for this.
+ // The Dictionary it returns is then passed on to the Display object to
+ // actually render the results in HTML.
+ //
+ const String word_db = config->Find("word_db");
+ if (access(word_db, R_OK) < 0)
+ {
+ reportError(form("Unable to read word database file '%s'\nDid you run htdig?", word_db.get()));
+ return (HTSEARCH_ERROR_WORDDB_READ);
+ }
+ // ResultList *results = htsearch((char*)word_db, searchWords, parser);
+
+ String doc_index = config->Find("doc_index");
+ if (access((char *) doc_index, R_OK) < 0)
+ {
+ reportError(form("Unable to read document index file '%s'\nDid you run htdig?", doc_index.get()));
+ return (HTSEARCH_ERROR_DOCINDEX_READ);
+ }
+
+ const String doc_db = config->Find("doc_db");
+ if (access(doc_db, R_OK) < 0)
+ {
+ reportError(form("Unable to read document database file '%s'\nDid you run htdig?", doc_db.get()));
+ return (HTSEARCH_ERROR_DOCDB_READ);
+ }
+
+ const String doc_excerpt = config->Find("doc_excerpt");
+ if (access(doc_excerpt, R_OK) < 0)
+ {
+ reportError(form("Unable to read document excerpts '%s'\nDid you run htdig?", doc_excerpt.get()));
+ return (HTSEARCH_ERROR_EXCERPTDB_READ);
+ }
+
+ // Multiple database support
+ collection = new Collection((char *) configFile,
+ word_db.get(), doc_index.get(), doc_db.get(), doc_excerpt.get());
+
+ // Perform search within the collection. Each collection stores its
+ // own result list.
+ total_match_count += htsearch(collection, *searchWords, parser);
+ collection->setSearchWords(searchWords);
+ collection->setSearchWordsPattern(searchWordsPattern);
+ selected_collections.Add(configFile, collection);
+
+ if (parser->hadError())
+ errorMsg = parser->getErrorMessage();
+
+ delete parser;
+ //}
+
+
+ total_matches = total_match_count;
+
+ if (total_matches > 0)
+ {
+
+ resultfetch = new ResultFetch(&selected_collections, collectionList);
+
+ if (resultfetch->hasTemplateError())
+ {
+ reportError(form("Unable to read template file '%s'\nDoes it exist?",
+ (const char *) config->Find("template_name")));
+
+ return (HTSEARCH_ERROR_TEMPLATE_ERROR);
+ }
+ resultfetch->setOriginalWords(originalWords);
+ resultfetch->setLimit(&limit_to);
+ resultfetch->setExclude(&exclude_these);
+ resultfetch->setLogicalWords(logicalWords);
+ if (!errorMsg.empty())
+ resultfetch->displaySyntaxError(errorMsg);
+ else
+ {
+
+ matches_list = resultfetch->fetch();
+
+ //matches_list->Start_Get();
+
+ }
+
+ } //if ((total_matches > 0) && (desired_match_index == 0))
+
+
+ return (total_match_count);
+}
+
+//------------------ htsearch_get_nth_match (...) -------------------------------------
+//
+// Parameters
+// result_desired_index ZERO based results index.
+// query_result structure to fill with result
+//
+// htsearch_query_match_struct:
+// char title[HTDIG_DOCUMENT_TITLE_L];
+// char URL[HTDIG_MAX_FILENAME_PATH_L];
+// char excerpt[HTDIG_DOCUMENT_EXCERPT_L];
+// int score;
+// int match_percent; //top result is 100%
+// time_t doc_date;
+// int size;
+//
+//---------------------------------------------------------------------------------------
+
+int htsearch_get_nth_match(int desired_match_index, htsearch_query_match_struct * query_result)
+{
+
+ ResultMatch *match = 0;
+ Dictionary *vars = 0;
+
+ if (total_matches == 0)
+ {
+ return (HTSEARCH_ERROR_NO_MATCH);
+ }
+ else if (desired_match_index >= total_matches)
+ {
+ return (HTSEARCH_ERROR_BAD_MATCH_INDEX);
+ }
+ else if ((total_matches > 0) && (desired_match_index < total_matches))
+ {
+ match = (ResultMatch *) matches_list->Nth(desired_match_index);
+
+ // DocumentRef *ref = docDB[match->getID()];
+ Collection *collection = match->getCollection();
+ DocumentRef *ref = collection->getDocumentRef(match->getID());
+ if (!ref || ref->DocState() != Reference_normal)
+ {
+ // The document isn't present or shouldn't be displayed
+ return (HTSEARCH_ERROR_BAD_DOCUMENT);
+ }
+
+ ref->DocAnchor(match->getAnchor());
+ ref->DocScore(match->getScore());
+ vars = resultfetch->fetchMatch(match, ref, desired_match_index);
+ delete ref;
+
+ String *value;
+ String key;
+
+ key = "NSTARS";
+ value = (String *) vars->Find(key);
+ //cout << key.get() << "[" << value->get() << "]" << endl;
+ query_result->score = atoi(value->get());
+
+ key = "PERCENT";
+ value = (String *) vars->Find(key);
+ //cout << key.get() << "[" << value->get() << "]" << endl;
+ query_result->score_percent = atoi(value->get());
+
+ key = "TITLE";
+ value = (String *) vars->Find(key);
+ //cout << key.get() << "[" << value->get() << "]" << endl;
+ snprintf(query_result->title, HTDIG_DOCUMENT_TITLE_L, "%s", value->get());
+
+ key = "EXCERPT";
+ value = (String *) vars->Find(key);
+ //cout << key.get() << "[" << value->get() << "]" << endl;
+ snprintf(query_result->excerpt, HTDIG_DOCUMENT_EXCERPT_L, "%s", value->get());
+
+ key = "URL";
+ value = (String *) vars->Find(key);
+ //cout << key.get() << "[" << value->get() << "]" << endl;
+ snprintf(query_result->URL, HTDIG_MAX_FILENAME_PATH_L, "%s", value->get());
+
+ String datefmt = config->Find("date_format");
+ key = "MODIFIED";
+ value = (String *) vars->Find(key);
+ //cout << key.get() << "[" << value->get() << "]" << endl;
+ mystrptime(value->get(), datefmt.get(), &(query_result->time_tm));
+ //cout << "[" << asctime(&query_result->time_tm) << "]" << endl;
+
+ key = "SIZE";
+ value = (String *) vars->Find(key);
+ //cout << key.get() << "[" << value->get() << "]" << endl;
+ query_result->size = atoi(value->get());
+
+
+ }
+
+ return (TRUE);
+}
+
+//---------------------------------------------------------------------------------------
+//
+//
+// RETURN: TRUE or FALSE
+//
+//---------------------------------------------------------------------------------------
+
+int htsearch_close()
+{
+
+
+ // delete results;
+ // delete parser;
+
+
+ return (TRUE);
+
+}
+
+//*****************************************************************************
+void createLogicalWords(List & searchWords, String & logicalWords, String & wm)
+{
+ String pattern;
+ int i;
+ int wasHidden = 0;
+ int inPhrase = 0;
+
+ for (i = 0; i < searchWords.Count(); i++)
+ {
+ WeightWord *ww = (WeightWord *) searchWords[i];
+ if (!ww->isHidden)
+ {
+
+ if (strcmp((char *) ww->word, "&") == 0 && wasHidden == 0)
+ logicalWords << ' ' << boolean_keywords[AND] << ' ';
+ else if (strcmp((char *) ww->word, "|") == 0 && wasHidden == 0)
+ logicalWords << ' ' << boolean_keywords[OR] << ' ';
+ else if (strcmp((char *) ww->word, "!") == 0 && wasHidden == 0)
+ logicalWords << ' ' << boolean_keywords[NOT] << ' ';
+ else if (strcmp((char *) ww->word, "\"") == 0 && wasHidden == 0)
+ {
+ if (inPhrase)
+ logicalWords.chop(' ');
+ inPhrase = !inPhrase;
+ logicalWords << "\"";
+ }
+ else if (wasHidden == 0)
+ {
+ logicalWords << ww->word;
+ if (inPhrase)
+ logicalWords << " ";
+ }
+ wasHidden = 0;
+ }
+ else
+ wasHidden = 1;
+ if (ww->weight > 0 // Ignore boolean syntax stuff
+ && !ww->isIgnore) // Ignore short or bad words
+ {
+ if (pattern.length() && !inPhrase)
+ pattern << '|';
+ else if (pattern.length() && inPhrase)
+ pattern << ' ';
+ pattern << ww->word;
+ }
+ }
+ wm = pattern;
+
+ if (debug)
+ {
+ cerr << "LogicalWords: " << logicalWords << endl;
+ cerr << "Pattern: " << pattern << endl;
+ }
+}
+
+void dumpWords(List & words, char *msg = "")
+{
+ if (debug)
+ {
+ cerr << msg << ": '";
+ for (int i = 0; i < words.Count(); i++)
+ {
+ WeightWord *ww = (WeightWord *) words[i];
+ cerr << ww->word << ':' << ww->isHidden << ' ';
+ }
+ cerr << "'\n";
+ }
+}
+
+//*****************************************************************************
+// void setupWords(char *allWords, List &searchWords,
+// int boolean, Parser *parser, String &originalPattern)
+//
+void setupWords(char *allWords, List & searchWords, int boolean, Parser * parser, String & originalPattern)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ List tempWords;
+ int i;
+
+ //
+ // Parse the words we need to search for. It should be a list of words
+ // with optional 'and' and 'or' between them. The list of words
+ // will be put in the searchWords list and at the same time in the
+ // String pattern separated with '|'.
+ //
+
+ //
+ // Convert the string to a list of WeightWord objects. The special
+ // characters '(' and ')' will be put into their own WeightWord objects.
+ //
+ unsigned char *pos = (unsigned char *) allWords;
+ unsigned char t;
+ String word;
+ const String prefix_suffix = config->Find("prefix_match_character");
+ while (*pos)
+ {
+ while (1)
+ {
+ t = *pos++;
+ if (isspace(t))
+ {
+ continue;
+ }
+ else if (t == '"')
+ {
+ tempWords.Add(new WeightWord("\"", -1.0));
+ break;
+ }
+ else if (boolean && (t == '(' || t == ')'))
+ {
+ char s[2];
+ s[0] = t;
+ s[1] = '\0';
+ tempWords.Add(new WeightWord(s, -1.0));
+ break;
+ }
+ else if (HtIsWordChar(t) || t == ':' ||
+ (strchr(prefix_suffix, t) != NULL) || (t >= 161 && t <= 255))
+ {
+ word = 0;
+ while (t && (HtIsWordChar(t) ||
+ t == ':' || (strchr(prefix_suffix, t) != NULL) || (t >= 161 && t <= 255)))
+ {
+ word << (char) t;
+ t = *pos++;
+ }
+
+ pos--;
+
+ if (boolean && (mystrcasecmp(word.get(), "+") == 0
+ || mystrcasecmp(word.get(), boolean_keywords[AND]) == 0))
+ {
+ tempWords.Add(new WeightWord("&", -1.0));
+ }
+ else if (boolean && mystrcasecmp(word.get(), boolean_keywords[OR]) == 0)
+ {
+ tempWords.Add(new WeightWord("|", -1.0));
+ }
+ else if (boolean && (mystrcasecmp(word.get(), "-") == 0
+ || mystrcasecmp(word.get(), boolean_keywords[NOT]) == 0))
+ {
+ tempWords.Add(new WeightWord("!", -1.0));
+ }
+ else
+ {
+ // Add word to excerpt matching list
+ originalPattern << word << "|";
+ WeightWord *ww = new WeightWord(word, 1.0);
+ if (HtWordNormalize(word) & WORD_NORMALIZE_NOTOK)
+ ww->isIgnore = 1;
+ tempWords.Add(ww);
+ }
+ break;
+ }
+ }
+ }
+
+ dumpWords(tempWords, "tempWords");
+
+ //
+ // If the user specified boolean expression operators, the whole
+ // expression has to be syntactically correct. If not, we need
+ // to report a syntax error.
+ //
+ if (boolean)
+ {
+ if (!parser->checkSyntax(&tempWords))
+ {
+ for (i = 0; i < tempWords.Count(); i++)
+ {
+ searchWords.Add(tempWords[i]);
+ }
+ tempWords.Release();
+ return;
+// reportError("Syntax error");
+ }
+ }
+ else
+ {
+ convertToBoolean(tempWords);
+ }
+
+ dumpWords(tempWords, "Boolean");
+
+ //
+ // We need to assign weights to the words according to the search_algorithm
+ // configuration attribute.
+ // For algorithms other than exact, we need to also do word lookups.
+ //
+ StringList algs(config->Find("search_algorithm"), " \t");
+ List algorithms;
+ String name, weight;
+ double fweight;
+ Fuzzy *fuzzy = 0;
+
+ //
+ // Generate the list of algorithms to use and associate the given
+ // weights with them.
+ //
+ for (i = 0; i < algs.Count(); i++)
+ {
+ name = strtok(algs[i], ":");
+ weight = strtok(0, ":");
+ if (name.length() == 0)
+ name = "exact";
+ if (weight.length() == 0)
+ weight = "1";
+ fweight = atof((char *) weight);
+
+ fuzzy = Fuzzy::getFuzzyByName(name, *config);
+ if (fuzzy)
+ {
+ fuzzy->setWeight(fweight);
+ fuzzy->openIndex();
+ algorithms.Add(fuzzy);
+ }
+ }
+
+ dumpWords(searchWords, "initial");
+
+ //
+ // For each of the words, apply all the algorithms.
+ //
+ int in_phrase = 0; // If we get into a phrase, we don't want to fuzz.
+ for (i = 0; i < tempWords.Count(); i++)
+ {
+ WeightWord *ww = (WeightWord *) tempWords[i];
+ if (ww->weight > 0 && !ww->isIgnore && !in_phrase)
+ {
+ //
+ // Apply all the algorithms to the word.
+ //
+ if (debug)
+ cerr << "Fuzzy on: " << ww->word << endl;
+ doFuzzy(ww, searchWords, algorithms);
+ delete ww;
+ }
+ else if (ww->word.length() == 1 && ww->word[0] == '"')
+ {
+ in_phrase = !in_phrase;
+ if (debug)
+ cerr << "Add: " << ww->word << endl;
+ searchWords.Add(ww);
+ }
+ else
+ {
+ //
+ // This is '(', ')', '&', or '|'. These will be automatically
+ // transfered to the searchWords list.
+ //
+ if (debug)
+ cerr << "Add: " << ww->word << endl;
+ searchWords.Add(ww);
+ }
+ dumpWords(searchWords, "searchWords");
+ }
+ tempWords.Release();
+}
+
+
+//*****************************************************************************
+void doFuzzy(WeightWord * ww, List & searchWords, List & algorithms)
+{
+ List fuzzyWords;
+ List weightWords;
+ Fuzzy *fuzzy;
+ WeightWord *newWw;
+ String *word;
+
+ algorithms.Start_Get();
+ while ((fuzzy = (Fuzzy *) algorithms.Get_Next()))
+ {
+ if (debug > 1)
+ cout << " " << fuzzy->getName();
+ fuzzy->getWords(ww->word, fuzzyWords);
+ fuzzyWords.Start_Get();
+ while ((word = (String *) fuzzyWords.Get_Next()))
+ {
+ if (debug > 1)
+ cout << " " << word->get();
+ newWw = new WeightWord(word->get(), fuzzy->getWeight());
+ newWw->isExact = ww->isExact;
+ newWw->isHidden = ww->isHidden;
+ weightWords.Add(newWw);
+ }
+ if (debug > 1)
+ cout << endl;
+ fuzzyWords.Destroy();
+ }
+
+ //
+ // We now have a list of substitute words. They need to be added
+ // to the searchWords.
+ //
+ if (weightWords.Count())
+ {
+ if (weightWords.Count() > 1)
+ searchWords.Add(new WeightWord("(", -1.0));
+ for (int i = 0; i < weightWords.Count(); i++)
+ {
+ if (i > 0)
+ searchWords.Add(new WeightWord("|", -1.0));
+ searchWords.Add(weightWords[i]);
+ }
+ if (weightWords.Count() > 1)
+ searchWords.Add(new WeightWord(")", -1.0));
+ }
+ else // if no fuzzy matches, add exact word, but give it tiny weight
+ {
+ searchWords.Add(new WeightWord(word->get(), 0.000001));
+ }
+
+
+ weightWords.Release();
+}
+
+
+//*****************************************************************************
+// void convertToBoolean(List &words)
+//
+void convertToBoolean(List & words)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ List list;
+ int i;
+ int do_and = strcmp(config->Find("match_method"), "and") == 0;
+ int in_phrase = 0;
+
+ String quote = "\"";
+
+ if (words.Count() == 0)
+ return;
+ list.Add(words[0]);
+
+ // We might start off with a phrase match
+ if (((WeightWord *) words[0])->word == quote)
+ in_phrase = 1;
+
+ for (i = 1; i < words.Count(); i++)
+ {
+ if (do_and && !in_phrase)
+ list.Add(new WeightWord("&", -1.0));
+ else if (!in_phrase)
+ list.Add(new WeightWord("|", -1.0));
+
+ if (((WeightWord *) words[i])->word == quote)
+ in_phrase = !in_phrase;
+
+ list.Add(words[i]);
+ }
+ words.Release();
+
+ for (i = 0; i < list.Count(); i++)
+ {
+ words.Add(list[i]);
+ }
+ list.Release();
+}
+
+
+//*****************************************************************************
+// Dictionary *htsearch(char *wordfile, List &searchWords, Parser *parser)
+// This returns a dictionary indexed by document ID and containing a
+// List of HtWordReference objects.
+//
+int htsearch(Collection * collection, List & searchWords, Parser * parser)
+{
+ int count = 0;
+
+ //
+ // Pick the database type we are going to use
+ //
+ ResultList *matches = new ResultList;
+ if (searchWords.Count() > 0)
+ {
+ // parser->setDatabase(wordfile);
+ parser->setCollection(collection);
+ parser->parse(&searchWords, *matches);
+ }
+
+ collection->setResultList(matches);
+
+ count = matches->Count();
+
+ return (count);
+}
+
+
+//*****************************************************************************
+// Modify the search words list to include the required words as well.
+// This is done by putting the existing search words in parenthesis and
+// appending the required words separated with "and".
+void addRequiredWords(List & searchWords, StringList & requiredWords)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ static int any_keywords = config->Boolean("any_keywords", 0);
+ if (requiredWords.Count() == 0)
+ return;
+ if (searchWords.Count() > 0)
+ {
+ searchWords.Insert(new WeightWord("(", -1.0), 0);
+ searchWords.Add(new WeightWord(")", -1.0));
+ searchWords.Add(new WeightWord("&", -1.0));
+ }
+ if (requiredWords.Count() == 1)
+ {
+ searchWords.Add(new WeightWord(requiredWords[0], 1.0));
+ }
+ else
+ {
+ searchWords.Add(new WeightWord("(", -1.0));
+ searchWords.Add(new WeightWord(requiredWords[0], 1.0));
+ for (int i = 1; i < requiredWords.Count(); i++)
+ {
+ if (any_keywords)
+ searchWords.Add(new WeightWord("|", -1.0));
+ else
+ searchWords.Add(new WeightWord("&", -1.0));
+ searchWords.Add(new WeightWord(requiredWords[i], 1.0));
+ }
+ searchWords.Add(new WeightWord(")", -1.0));
+ }
+}
+
+
+//*****************************************************************************
+// Report an error. Since we don't know if we are running as a CGI or not,
+// we will assume this is the first thing returned by a CGI program.
+//
+void reportError_html(char *msg)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ cout << "Content-type: text/html\r\n\r\n";
+ cout << "<html><head><title>htsearch error</title></head>\n";
+ cout << "<body bgcolor=\"#ffffff\">\n";
+ cout << "<h1>ht://Dig error</h1>\n";
+ cout << "<p>htsearch detected an error. Please report this to the\n";
+ cout << "webmaster of this site by sending an e-mail to:\n";
+ cout << "<a href=\"mailto:" << config->Find("maintainer") << "\">";
+ cout << config->Find("maintainer") << "</a>\n";
+ cout << "The error message is:</p>\n";
+ cout << "<pre>\n" << msg << "\n</pre>\n</body></html>\n";
+ exit(1);
+}
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.cc
new file mode 100644
index 00000000..db51ae3a
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.cc
@@ -0,0 +1,99 @@
+//--------------------------------------------------------------------
+//
+// libhtdig_log.cc
+//
+// 2/6/2002 created
+//
+// Neal Richter nealr@rightnow.com
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: libhtdig_log.cc,v 1.5 2004/05/28 13:15:29 lha Exp $
+//
+//--------------------------------------------------------------------
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif
+
+#include "libhtdig_log.h"
+
+#include <stdlib.h>
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+#include <stdio.h>
+#include <time.h>
+
+
+static FILE *errorlog_fp = NULL;
+
+int logOpen(char *filename)
+{
+ if(errorlog_fp == NULL)
+ errorlog_fp = fopen(filename, "a+");
+
+ if (errorlog_fp == NULL)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+
+void logEntry (char *msg)
+{
+ time_t now = time(NULL);
+
+ if(errorlog_fp != NULL)
+ fprintf(errorlog_fp, "[%s] %s\n", ctime(&now), msg);
+
+}
+
+
+//*****************************************************************************
+// Report an error
+
+void reportError (char *msg)
+{
+ time_t now = time(NULL);
+
+ if(errorlog_fp != NULL)
+ fprintf(errorlog_fp, "%s [ERROR] %s\n", ctime(&now), msg);
+
+ fprintf(stderr, "%s [ERROR] %s\n", ctime(&now), msg);
+
+}
+
+
+int logClose()
+{
+ int ret = -1;
+
+ if(errorlog_fp != NULL)
+ {
+ ret = fclose(errorlog_fp);
+ errorlog_fp = NULL;
+
+ if(ret == 0)
+ return(TRUE);
+ else
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.h b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.h
new file mode 100644
index 00000000..22adceca
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_log.h
@@ -0,0 +1,38 @@
+//--------------------------------------------------------------------
+//
+// libhtdig_log.h
+//
+// 2/6/2002 created
+//
+// Neal Richter nealr@rightnow.com
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: libhtdig_log.h,v 1.4 2004/05/28 13:15:29 lha Exp $
+//
+//--------------------------------------------------------------------
+
+#ifndef LIBHTDIG_LOG_H
+#define LIBHTDIG_LOG_H
+
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+
+int logOpen(char *file);
+void logEntry(char *msg);
+void reportError(char *msg);
+int logClose(void);
+
+#endif /* LIBHTDIG_LOG_H */
+