summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc')
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc1099
1 files changed, 1099 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc
new file mode 100644
index 00000000..472b5fc2
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htsearch.cc
@@ -0,0 +1,1099 @@
+//----------------------------------------------------------------
+//
+// libhtdig_htsearch.cc
+//
+// 1/25/2002 created from htsearch.cc
+//
+// Neal Richter nealr@rightnow.com
+//
+//
+// htsearch: The main search CGI. Parses the CGI input, reads the config files
+// and calls the necessary code to put together the result lists
+// and the final display.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: libhtdig_htsearch.cc,v 1.4 2004/05/28 13:15:29 lha Exp $
+//
+//----------------------------------------------------------------
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+extern "C"
+{
+#include "libhtdig_api.h"
+}
+
+#include "libhtdig_log.h"
+
+
+#include "htsearch.h"
+#include "defaults.h"
+#include "WeightWord.h"
+#include "parser.h"
+#include "ResultFetch.h"
+#include "../htfuzzy/Fuzzy.h"
+#include "cgi.h"
+#include "WordRecord.h"
+#include "HtWordList.h"
+#include "StringList.h"
+#include "IntObject.h"
+#include "HtURLCodec.h"
+#include "HtURLRewriter.h"
+#include "WordContext.h"
+#include "HtRegex.h"
+#include "Collection.h"
+
+//define _XOPEN_SOURCE
+//#define _GNU_SOURCE
+#include <time.h>
+#include <ctype.h>
+#include <signal.h>
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+
+// If we have this, we probably want it.
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+
+// Generic signal-handler function pointer type (not referenced in the
+// code visible in this file).
+typedef void (*SIGNAL_HANDLER) (...);
+
+// ResultList *htsearch(const String&, List &, Parser *);
+// Forward declarations for the helpers defined further down.
+int htsearch(Collection *, List &, Parser *);
+
+void setupWords(char *, List &, int, Parser *, String &);
+void createLogicalWords(List &, String &, String &);
+// NOTE(review): reportError() is only declared here; this file defines
+// reportError_html() below, so the definition presumably lives elsewhere.
+void reportError(char *);
+void convertToBoolean(List & words);
+void doFuzzy(WeightWord *, List &, List &);
+void addRequiredWords(List &, StringList &);
+
+// Default value; htsearch_open() overwrites it from the
+// "minimum_word_length" configuration attribute.
+int minimum_word_length = 3;
+
+// Filled by htsearch_open() from the "boolean_keywords" attribute and
+// indexed with AND / OR / NOT.
+StringList boolean_keywords;
+
+// Created by htsearch_open(), deleted at the end of htsearch_query().
+Parser *parser = NULL;
+
+// Defined in another translation unit.
+extern String configFile;
+extern int debug;
+
+// Set by htsearch_open() to HtConfiguration::config().
+static HtConfiguration *config = NULL;
+Dictionary selected_collections; // Multiple database support
+Collection *collection = NULL;
+// Parser error text captured by htsearch_query().
+String errorMsg;
+
+// Query state shared between htsearch_query() and the helpers.
+String originalWords;
+String origPattern;
+String logicalWords;
+String logicalPattern;
+StringMatch *searchWordsPattern = NULL;
+// Only read (never filled) by the code visible in this file.
+StringList requiredWords; //TODO add this
+
+// URL restrict / exclude patterns compiled by htsearch_open().
+HtRegex limit_to;
+HtRegex exclude_these;
+
+// List searchWords;
+// Allocated by htsearch_open(), filled by setupWords().
+List *searchWords = NULL;
+
+// Only read (never filled) by the code visible in this file.
+StringList collectionList; // List of databases to search on
+
+
+// Results of the most recent htsearch_query(), consumed by
+// htsearch_get_nth_match().
+static int total_matches = 0;
+static List *matches_list = 0;
+static ResultFetch *resultfetch = 0;
+
+
+//*****************************************************************************
+// int htsearch_open(htsearch_parameters_struct * htsearch_parms)
+//
+// Library counterpart of the start-up half of htsearch's main().
+// Loads the configuration (compiled defaults, then the config file, then
+// the overrides carried in htsearch_parms), initialises the word context,
+// compiles the restrict/exclude URL patterns, loads the boolean keywords
+// and creates the global Parser.
+//
+// PARAMETERS:
+//   htsearch_parms  caller-filled settings; the fields read here are
+//                   configFile, debug, logFile, DBpath,
+//                   meta_description_factor, title_factor, text_factor
+//                   and locale.
+//
+// RETURN: TRUE on success, otherwise one of HTSEARCH_ERROR_LOGFILE_OPEN,
+//         HTSEARCH_ERROR_CONFIG_READ or HTSEARCH_ERROR_URL_PART.
+//
+int htsearch_open(htsearch_parameters_struct * htsearch_parms)
+{
+    int ret = -1;
+    int override_config = 0;    // never set: the CGI '-c' switch has no equivalent here
+    int cInd = 0;               // only the first collectionList entry is consulted
+
+    // load 'command-line' parameters
+
+    if (htsearch_parms->configFile[0] != 0)
+        configFile = htsearch_parms->configFile;
+
+    debug = htsearch_parms->debug;
+    if (debug != 0)
+    {
+        ret = logOpen(htsearch_parms->logFile);
+
+        if (ret == FALSE)
+        {
+            reportError(form("[HTDIG] Error opening log file [%s] . Error:[%d], %s\n",
+                             htsearch_parms->logFile, errno, strerror(errno)));
+            return (HTSEARCH_ERROR_LOGFILE_OPEN);
+        }
+    }
+
+
+    //case 'c':
+    // The default is obviously to do this securely
+    // but if people want to shoot themselves in the foot...
+    // configFile = optarg;
+    // override_config = 1;
+
+    //
+    // The total search can NEVER take more than 5 minutes.
+    //
+    //alarm(5 * 60);
+
+    errorMsg = "";
+
+    config = HtConfiguration::config();
+
+    // Reset the shared query state so that we start with a clean slate.
+    // These are the file-scope objects: the original code declared locals
+    // of the same names here, so the reset never reached the globals and
+    // a locally allocated StringMatch was leaked on every call.
+    // searchWordsPattern itself is allocated by htsearch_query().
+    //
+    logicalWords = 0;
+    origPattern = 0;
+    logicalPattern = 0;
+    searchWords = new List;
+
+    char *config_name = collectionList[cInd];
+    if (config_name && config_name[0] == '\0')
+        config_name = NULL;     // use default config
+
+    //
+    // Setup the configuration database. First we read the compiled defaults.
+    // Then we override those with defaults read in from the configuration
+    // file, and finally we override some attributes with information we
+    // got from the caller.
+    //
+    config->Defaults(&defaults[0]);
+    // To allow . in filename while still being 'secure',
+    // e.g. htdig-f.q.d.n.conf
+    if (!override_config && config_name && (strstr(config_name, "./") == NULL))
+    {
+        char *configDir = getenv("CONFIG_DIR");
+        if (configDir)
+        {
+            configFile = configDir;
+        }
+        else
+        {
+            configFile = CONFIG_DIR;
+        }
+        // config_name is known to be non-empty here (an empty name was
+        // mapped to NULL above), so the file name is always appended.
+        configFile << '/' << config_name << ".conf";
+    }
+    if (access((char *) configFile, R_OK) < 0)
+    {
+        reportError(form("Unable to read configuration file '%s'", configFile.get()));
+        return (HTSEARCH_ERROR_CONFIG_READ);
+    }
+    config->Read(configFile);
+
+
+    //---------- Now override config settings -----------------
+
+    //------- override database path ------------
+    if (strlen(htsearch_parms->DBpath) > 0)
+    {
+        config->Add("database_dir", htsearch_parms->DBpath);
+    }
+
+    //------- custom filters from htsearch_parms ----------
+
+    //restrict,exclude,urlrewrite
+
+
+    if (strlen(htsearch_parms->meta_description_factor) > 0)
+    {
+        config->Add("meta_description_factor", htsearch_parms->meta_description_factor);
+    }
+
+    if (strlen(htsearch_parms->title_factor) > 0)
+    {
+        config->Add("title_factor", htsearch_parms->title_factor);
+    }
+
+    if (strlen(htsearch_parms->text_factor) > 0)
+    {
+        config->Add("text_factor", htsearch_parms->text_factor);
+    }
+
+    if (strlen(htsearch_parms->locale) > 0)
+    {
+        config->Add("locale", htsearch_parms->locale);
+    }
+
+    //-------------------------------------------------------------------
+
+
+    // Initialize htword library (key description + wordtype...)
+    WordContext::Initialize(*config);
+
+//NON-CGI Usage libhtdig
+/*
+
+    config->Add("match_method", input["method"]);
+    config->Add("template_name", input["format"]);
+
+    // minimum check for a valid int value of "matchesperpage" cgi variable
+    if (atoi(input["matchesperpage"]) > 0)
+        config->Add("matches_per_page", input["matchesperpage"]);
+
+    pageNumber = atoi(input["page"]);
+    config->Add("config", input["config"]);
+    config->Add("restrict", input["restrict"]);
+    config->Add("exclude", input["exclude"]);
+    config->Add("keywords", input["keywords"]);
+    requiredWords.Create(config->Find("keywords"), " \t\r\n\001");
+    config->Add("sort", input["sort"]);
+
+    config->Add("startmonth", input["startmonth"]);
+    config->Add("startday", input["startday"]);
+    config->Add("startyear", input["startyear"]);
+
+    config->Add("endmonth", input["endmonth"]);
+    config->Add("endday", input["endday"]);
+    config->Add("endyear", input["endyear"]);
+
+
+    StringList form_vars(config->Find("allow_in_form"), " \t\r\n");
+    for (i = 0; i < form_vars.Count(); i++)
+    {
+        if (input.exists(form_vars[i]))
+            config->Add(form_vars[i], input[form_vars[i]]);
+    }
+
+*/
+//NON-CGI Usage libhtdig
+
+
+    minimum_word_length = config->Value("minimum_word_length", minimum_word_length);
+
+    //
+    // Compile the URL limit patterns.
+    //
+
+    if (config->Find("restrict").length())
+    {
+        // Create a temporary list from the configuration attribute
+        StringList l(config->Find("restrict"), " \t\r\n\001|");
+        limit_to.setEscaped(l);
+        String u = l.Join('|');
+        config->Add("restrict", u);     // re-create the config attribute
+    }
+    if (config->Find("exclude").length())
+    {
+        // Create a temporary list from the configuration attribute
+        StringList l(config->Find("exclude"), " \t\r\n\001|");
+        exclude_these.setEscaped(l);
+        String u = l.Join('|');
+        config->Add("exclude", u);      // re-create the config attribute
+    }
+
+    //
+    // Check url_part_aliases and common_url_parts for
+    // errors.
+    String url_part_errors = HtURLCodec::instance()->ErrMsg();
+
+    if (url_part_errors.length() != 0)
+    {
+        reportError(form("Invalid url_part_aliases or common_url_parts: %s", url_part_errors.get()));
+        return (HTSEARCH_ERROR_URL_PART);
+
+    }
+
+    // for htsearch, use search_rewrite_rules attribute for HtURLRewriter.
+    config->AddParsed("url_rewrite_rules", "${search_rewrite_rules}");
+    url_part_errors = HtURLRewriter::instance()->ErrMsg();
+    // NOTE(review): unlike the check above, a rewrite-rule error is only
+    // reported and start-up continues -- confirm that this is intended.
+    if (url_part_errors.length() != 0)
+        reportError(form("Invalid url_rewrite_rules: %s", url_part_errors.get()));
+
+    // Load boolean_keywords from configuration
+    // they should be placed in this order:
+    // 0 1 2
+    // and or not
+    boolean_keywords.Create(config->Find("boolean_keywords"), "| \t\r\n\001");
+    if (boolean_keywords.Count() != 3)
+        reportError("boolean_keywords attribute should have three entries");
+
+
+
+    parser = new Parser();
+
+    return (TRUE);
+}
+
+//---------------------------------------------------------------------------------------
+//
+// int htsearch_query(htsearch_query_struct * htseach_query)
+//
+// Runs one search.  Copies the sort / match-method / template choices from
+// the query structure into the configuration, tokenises raw_query with
+// setupWords(), verifies that the four database files are readable, runs
+// htsearch() on a freshly created Collection and, when there are matches,
+// builds the ResultFetch object that htsearch_get_nth_match() reads from.
+//
+// RETURN: Number of documents that matched, or one of
+//         HTSEARCH_ERROR_WORDDB_READ, HTSEARCH_ERROR_DOCINDEX_READ,
+//         HTSEARCH_ERROR_DOCDB_READ, HTSEARCH_ERROR_EXCERPTDB_READ,
+//         HTSEARCH_ERROR_TEMPLATE_ERROR.
+//
+//---------------------------------------------------------------------------------------
+
+int htsearch_query(htsearch_query_struct * htseach_query)
+{
+    int total_match_count = 0;
+
+    // The parser is deleted at the end of every query.  Recreate it here so
+    // that a second query after the same htsearch_open() does not run on a
+    // dangling pointer.
+    if (parser == NULL)
+        parser = new Parser();
+
+    originalWords = htseach_query->raw_query;
+    originalWords.chop(" \t\r\n");
+
+    //sort
+    switch (htseach_query->sortby_flag)
+    {
+    case HTSEARCH_SORT_SCORE:
+        config->Add("sort", "score");
+        break;
+    case HTSEARCH_SORT_REV_SCORE:
+        config->Add("sort", "revscore");
+        break;
+    case HTSEARCH_SORT_TIME:
+        config->Add("sort", "time");
+        break;
+    case HTSEARCH_SORT_REV_TIME:
+        config->Add("sort", "revtime");
+        break;
+    case HTSEARCH_SORT_TITLE:
+        config->Add("sort", "title");
+        break;
+    case HTSEARCH_SORT_REV_TITLE:
+        config->Add("sort", "revtitle");
+        break;
+    }
+
+
+    //match method
+    switch (htseach_query->algorithms_flag)
+    {
+    case HTSEARCH_ALG_BOOLEAN:
+        config->Add("match_method", "boolean");
+        break;
+    case HTSEARCH_ALG_OR:
+        config->Add("match_method", "or");
+        break;
+    case HTSEARCH_ALG_AND:
+        config->Add("match_method", "and");
+        break;
+    }
+
+    //format
+    // NOTE(review): this switch tests algorithms_flag against the
+    // HTSEARCH_FORMAT_* constants, which looks like a copy/paste slip; the
+    // query structure presumably has a dedicated format field -- confirm
+    // against libhtdig_api.h before changing it.
+    switch (htseach_query->algorithms_flag)
+    {
+    case HTSEARCH_FORMAT_SHORT:
+        config->Add("template_name", "builtin-short");
+        break;
+    case HTSEARCH_FORMAT_LONG:
+        config->Add("template_name", "builtin-long");
+        break;
+    }
+
+
+    origPattern = 0;
+    logicalWords = 0;
+    logicalPattern = 0;
+    searchWordsPattern = new StringMatch;
+
+    // Iterate over all specified collections (databases)
+    //for (int cInd = 0; errorMsg.empty() && cInd < collectionList.Count(); cInd++)
+    //{
+
+    // Parse the words to search for from the query string.
+    // This will produce a list of WeightWord objects.
+    //
+    setupWords(originalWords, *searchWords,
+               strcmp(config->Find("match_method"), "boolean") == 0, parser, origPattern);
+
+    //
+    // Convert the list of WeightWord objects to a pattern string
+    // that we can compile.
+    //
+    createLogicalWords(*searchWords, logicalWords, logicalPattern);
+
+    //
+    // Assemble the full pattern for excerpt matching and highlighting
+    //
+    origPattern += logicalPattern;
+    searchWordsPattern->IgnoreCase();
+    searchWordsPattern->IgnorePunct();
+    searchWordsPattern->Pattern(logicalPattern);        // this should now be enough
+    //searchWordsPattern.Pattern(origPattern);
+    //if (debug > 2)
+    // cout << "Excerpt pattern: " << origPattern << "\n";
+
+    //
+    // If required keywords were given, modify the current searchWords
+    // list to include the required words.
+    //
+    if (requiredWords.Count() > 0)
+    {
+        addRequiredWords(*searchWords, requiredWords);
+    }
+
+    //
+    // Make sure every database file is readable before searching.
+    //
+    const String word_db = config->Find("word_db");
+    if (access(word_db, R_OK) < 0)
+    {
+        reportError(form("Unable to read word database file '%s'\nDid you run htdig?", word_db.get()));
+        return (HTSEARCH_ERROR_WORDDB_READ);
+    }
+    // ResultList *results = htsearch((char*)word_db, searchWords, parser);
+
+    String doc_index = config->Find("doc_index");
+    if (access((char *) doc_index, R_OK) < 0)
+    {
+        reportError(form("Unable to read document index file '%s'\nDid you run htdig?", doc_index.get()));
+        return (HTSEARCH_ERROR_DOCINDEX_READ);
+    }
+
+    const String doc_db = config->Find("doc_db");
+    if (access(doc_db, R_OK) < 0)
+    {
+        reportError(form("Unable to read document database file '%s'\nDid you run htdig?", doc_db.get()));
+        return (HTSEARCH_ERROR_DOCDB_READ);
+    }
+
+    const String doc_excerpt = config->Find("doc_excerpt");
+    if (access(doc_excerpt, R_OK) < 0)
+    {
+        reportError(form("Unable to read document excerpts '%s'\nDid you run htdig?", doc_excerpt.get()));
+        return (HTSEARCH_ERROR_EXCERPTDB_READ);
+    }
+
+    // Multiple database support
+    collection = new Collection((char *) configFile,
+                                word_db.get(), doc_index.get(), doc_db.get(), doc_excerpt.get());
+
+    // Perform search within the collection. Each collection stores its
+    // own result list.
+    total_match_count += htsearch(collection, *searchWords, parser);
+    collection->setSearchWords(searchWords);
+    collection->setSearchWordsPattern(searchWordsPattern);
+    selected_collections.Add(configFile, collection);
+
+    if (parser->hadError())
+        errorMsg = parser->getErrorMessage();
+
+    delete parser;
+    parser = NULL;              // never leave the global pointing at freed memory
+    //}
+
+
+    total_matches = total_match_count;
+
+    if (total_matches > 0)
+    {
+
+        resultfetch = new ResultFetch(&selected_collections, collectionList);
+
+        if (resultfetch->hasTemplateError())
+        {
+            reportError(form("Unable to read template file '%s'\nDoes it exist?",
+                             (const char *) config->Find("template_name")));
+
+            return (HTSEARCH_ERROR_TEMPLATE_ERROR);
+        }
+        resultfetch->setOriginalWords(originalWords);
+        resultfetch->setLimit(&limit_to);
+        resultfetch->setExclude(&exclude_these);
+        resultfetch->setLogicalWords(logicalWords);
+        if (!errorMsg.empty())
+            resultfetch->displaySyntaxError(errorMsg);
+        else
+        {
+
+            matches_list = resultfetch->fetch();
+
+            //matches_list->Start_Get();
+
+        }
+
+    }                           //if (total_matches > 0)
+
+
+    return (total_match_count);
+}
+
+//------------------ htsearch_get_nth_match (...) -------------------------------------
+//
+// Copies one result of the most recent htsearch_query() into query_result.
+//
+// Parameters
+//   desired_match_index  ZERO based results index.
+//   query_result         structure to fill with the result
+//
+// Fields of htsearch_query_match_struct written here:
+//   title          from the "TITLE" variable    (HTDIG_DOCUMENT_TITLE_L bytes max)
+//   URL            from the "URL" variable      (HTDIG_MAX_FILENAME_PATH_L bytes max)
+//   excerpt        from the "EXCERPT" variable  (HTDIG_DOCUMENT_EXCERPT_L bytes max)
+//   score          from the "NSTARS" variable
+//   score_percent  from the "PERCENT" variable
+//   time_tm        "MODIFIED" parsed with the date_format attribute
+//   size           from the "SIZE" variable
+//
+// RETURN: TRUE on success, otherwise HTSEARCH_ERROR_NO_MATCH,
+//         HTSEARCH_ERROR_BAD_MATCH_INDEX or HTSEARCH_ERROR_BAD_DOCUMENT.
+//
+//---------------------------------------------------------------------------------------
+
+int htsearch_get_nth_match(int desired_match_index, htsearch_query_match_struct * query_result)
+{
+    if (total_matches == 0)
+    {
+        return (HTSEARCH_ERROR_NO_MATCH);
+    }
+
+    // Reject negative as well as too-large indexes.  matches_list stays
+    // NULL when the query hit a syntax error, so guard against that too.
+    if (desired_match_index < 0 || desired_match_index >= total_matches || matches_list == 0)
+    {
+        return (HTSEARCH_ERROR_BAD_MATCH_INDEX);
+    }
+
+    // The fetched list may be shorter than total_matches, in which case
+    // Nth() has nothing to return for this index.
+    ResultMatch *match = (ResultMatch *) matches_list->Nth(desired_match_index);
+    if (!match)
+    {
+        return (HTSEARCH_ERROR_BAD_MATCH_INDEX);
+    }
+
+    // DocumentRef *ref = docDB[match->getID()];
+    Collection *matchCollection = match->getCollection();
+    if (!matchCollection)
+    {
+        return (HTSEARCH_ERROR_BAD_DOCUMENT);
+    }
+
+    DocumentRef *ref = matchCollection->getDocumentRef(match->getID());
+    if (!ref || ref->DocState() != Reference_normal)
+    {
+        // The document isn't present or shouldn't be displayed
+        return (HTSEARCH_ERROR_BAD_DOCUMENT);
+    }
+
+    ref->DocAnchor(match->getAnchor());
+    ref->DocScore(match->getScore());
+    // NOTE(review): ownership of the returned Dictionary is not visible
+    // from here; if fetchMatch() allocates it, it is leaked on every call.
+    Dictionary *vars = resultfetch->fetchMatch(match, ref, desired_match_index);
+    delete ref;
+
+    if (!vars)
+    {
+        return (HTSEARCH_ERROR_BAD_DOCUMENT);
+    }
+
+    // Every lookup below tolerates a missing variable: numbers fall back
+    // to 0, strings to "", and the date is left untouched.
+    String *value;
+    String key;
+
+    key = "NSTARS";
+    value = (String *) vars->Find(key);
+    query_result->score = value ? atoi(value->get()) : 0;
+
+    key = "PERCENT";
+    value = (String *) vars->Find(key);
+    query_result->score_percent = value ? atoi(value->get()) : 0;
+
+    key = "TITLE";
+    value = (String *) vars->Find(key);
+    snprintf(query_result->title, HTDIG_DOCUMENT_TITLE_L, "%s", value ? value->get() : "");
+
+    key = "EXCERPT";
+    value = (String *) vars->Find(key);
+    snprintf(query_result->excerpt, HTDIG_DOCUMENT_EXCERPT_L, "%s", value ? value->get() : "");
+
+    key = "URL";
+    value = (String *) vars->Find(key);
+    snprintf(query_result->URL, HTDIG_MAX_FILENAME_PATH_L, "%s", value ? value->get() : "");
+
+    String datefmt = config->Find("date_format");
+    key = "MODIFIED";
+    value = (String *) vars->Find(key);
+    if (value)
+        mystrptime(value->get(), datefmt.get(), &(query_result->time_tm));
+
+    key = "SIZE";
+    value = (String *) vars->Find(key);
+    query_result->size = value ? atoi(value->get()) : 0;
+
+    return (TRUE);
+}
+
+//---------------------------------------------------------------------------------------
+//
+// int htsearch_close()
+//
+// Ends a search session: releases the ResultFetch object created by
+// htsearch_query() and clears the cached match state so that a later
+// htsearch_get_nth_match() reports "no match" instead of reading stale
+// pointers.
+//
+// RETURN: TRUE (always)
+//
+//---------------------------------------------------------------------------------------
+
+int htsearch_close()
+{
+    // resultfetch is allocated with new in htsearch_query().
+    // NOTE(review): matches_list was obtained from resultfetch->fetch();
+    // who owns that list is not visible here, so it is only forgotten,
+    // not deleted.
+    delete resultfetch;
+    resultfetch = 0;
+    matches_list = 0;
+    total_matches = 0;
+
+    // delete results;
+    // delete parser;
+
+    return (TRUE);
+}
+
+//*****************************************************************************
+// void createLogicalWords(List &searchWords, String &logicalWords, String &wm)
+//
+// Makes a single pass over the WeightWord list and builds two strings:
+//   logicalWords  the query as the user should see it, with "&", "|" and
+//                 "!" spelled out through boolean_keywords and phrases
+//                 re-quoted (appended to whatever the caller passed in);
+//   wm            the words that carry weight, joined with '|' outside a
+//                 phrase and with ' ' inside one (overwritten).
+// Hidden words, and the visible word that directly follows a hidden one,
+// are left out of logicalWords.
+//
+void createLogicalWords(List & searchWords, String & logicalWords, String & wm)
+{
+    String pattern;
+    int prevHidden = 0;         // was the previous entry hidden?
+    int insidePhrase = 0;       // are we between two '"' entries?
+    const int wordCount = searchWords.Count();
+
+    for (int idx = 0; idx < wordCount; idx++)
+    {
+        WeightWord *entry = (WeightWord *) searchWords[idx];
+
+        if (entry->isHidden)
+        {
+            prevHidden = 1;
+        }
+        else
+        {
+            if (prevHidden == 0)
+            {
+                if (strcmp((char *) entry->word, "&") == 0)
+                {
+                    logicalWords << ' ' << boolean_keywords[AND] << ' ';
+                }
+                else if (strcmp((char *) entry->word, "|") == 0)
+                {
+                    logicalWords << ' ' << boolean_keywords[OR] << ' ';
+                }
+                else if (strcmp((char *) entry->word, "!") == 0)
+                {
+                    logicalWords << ' ' << boolean_keywords[NOT] << ' ';
+                }
+                else if (strcmp((char *) entry->word, "\"") == 0)
+                {
+                    // Closing quote: drop the blank left by the last phrase word.
+                    if (insidePhrase)
+                        logicalWords.chop(' ');
+                    insidePhrase = !insidePhrase;
+                    logicalWords << "\"";
+                }
+                else
+                {
+                    logicalWords << entry->word;
+                    if (insidePhrase)
+                        logicalWords << " ";
+                }
+            }
+            prevHidden = 0;
+        }
+
+        // Boolean syntax entries (weight <= 0) and ignored (short or bad)
+        // words never reach the match pattern.
+        if (entry->weight > 0 && !entry->isIgnore)
+        {
+            if (pattern.length())
+            {
+                if (insidePhrase)
+                    pattern << ' ';
+                else
+                    pattern << '|';
+            }
+            pattern << entry->word;
+        }
+    }
+
+    wm = pattern;
+
+    if (debug)
+    {
+        cerr << "LogicalWords: " << logicalWords << endl;
+        cerr << "Pattern: " << pattern << endl;
+    }
+}
+
+// Debug helper: when the global debug flag is set, prints msg followed by
+// every entry of words as "word:isHidden" on stderr.
+//
+// msg is const because every caller passes a string literal, and binding a
+// literal to a plain char * is ill-formed in C++11 and later.
+void dumpWords(List & words, const char *msg = "")
+{
+    if (debug)
+    {
+        cerr << msg << ": '";
+        for (int i = 0; i < words.Count(); i++)
+        {
+            WeightWord *ww = (WeightWord *) words[i];
+            cerr << ww->word << ':' << ww->isHidden << ' ';
+        }
+        cerr << "'\n";
+    }
+}
+
+//*****************************************************************************
+// void setupWords(char *allWords, List &searchWords,
+// int boolean, Parser *parser, String &originalPattern)
+//
+// Tokenises the query string allWords into WeightWord objects, turns the
+// token list into a boolean expression (syntax-checked when boolean is
+// non-zero, built by convertToBoolean() otherwise), expands every ordinary
+// word through the configured fuzzy algorithms and appends the result to
+// searchWords.  Each ordinary word is also appended to originalPattern,
+// followed by '|'.
+//
+void setupWords(char *allWords, List & searchWords, int boolean, Parser * parser, String & originalPattern)
+{
+ HtConfiguration *config = HtConfiguration::config();
+ List tempWords;
+ int i;
+
+ //
+ // Parse the words we need to search for. It should be a list of words
+ // with optional 'and' and 'or' between them. The list of words
+ // will be put in the searchWords list and at the same time in the
+ // String pattern separated with '|'.
+ //
+
+ //
+ // Convert the string to a list of WeightWord objects. The special
+ // characters '(' and ')' will be put into their own WeightWord objects.
+ //
+ unsigned char *pos = (unsigned char *) allWords;
+ unsigned char t;
+ String word;
+ const String prefix_suffix = config->Find("prefix_match_character");
+ // Outer loop: one token per pass.  The inner loop skips characters until
+ // it recognises the start of a token and then breaks.
+ while (*pos)
+ {
+ while (1)
+ {
+ t = *pos++;
+ if (isspace(t))
+ {
+ continue;
+ }
+ else if (t == '"')
+ {
+ tempWords.Add(new WeightWord("\"", -1.0));
+ break;
+ }
+ else if (boolean && (t == '(' || t == ')'))
+ {
+ char s[2];
+ s[0] = t;
+ s[1] = '\0';
+ tempWords.Add(new WeightWord(s, -1.0));
+ break;
+ }
+ // NOTE(review): the inner loop has no explicit end-of-string test.
+ // strchr() also matches the terminating NUL, so when t is '\0' this
+ // branch is presumably what stops the scan (with an empty word) --
+ // confirm that prefix_suffix always converts to a non-NULL string.
+ else if (HtIsWordChar(t) || t == ':' ||
+ (strchr(prefix_suffix, t) != NULL) || (t >= 161 && t <= 255))
+ {
+ word = 0;
+ while (t && (HtIsWordChar(t) ||
+ t == ':' || (strchr(prefix_suffix, t) != NULL) || (t >= 161 && t <= 255)))
+ {
+ word << (char) t;
+ t = *pos++;
+ }
+
+ // Step back onto the character that ended the word.
+ pos--;
+
+ // In boolean mode the operator keywords become the one-character
+ // tokens "&", "|" and "!".
+ if (boolean && (mystrcasecmp(word.get(), "+") == 0
+ || mystrcasecmp(word.get(), boolean_keywords[AND]) == 0))
+ {
+ tempWords.Add(new WeightWord("&", -1.0));
+ }
+ else if (boolean && mystrcasecmp(word.get(), boolean_keywords[OR]) == 0)
+ {
+ tempWords.Add(new WeightWord("|", -1.0));
+ }
+ else if (boolean && (mystrcasecmp(word.get(), "-") == 0
+ || mystrcasecmp(word.get(), boolean_keywords[NOT]) == 0))
+ {
+ tempWords.Add(new WeightWord("!", -1.0));
+ }
+ else
+ {
+ // Add word to excerpt matching list
+ originalPattern << word << "|";
+ WeightWord *ww = new WeightWord(word, 1.0);
+ if (HtWordNormalize(word) & WORD_NORMALIZE_NOTOK)
+ ww->isIgnore = 1;
+ tempWords.Add(ww);
+ }
+ break;
+ }
+ }
+ }
+
+ dumpWords(tempWords, "tempWords");
+
+ //
+ // If the user specified boolean expression operators, the whole
+ // expression has to be syntactically correct. If not, we need
+ // to report a syntax error.
+ //
+ if (boolean)
+ {
+ if (!parser->checkSyntax(&tempWords))
+ {
+ // Syntax error: hand the raw tokens over unchanged and skip the
+ // fuzzy expansion.  The caller reads the error from the parser.
+ for (i = 0; i < tempWords.Count(); i++)
+ {
+ searchWords.Add(tempWords[i]);
+ }
+ tempWords.Release();
+ return;
+// reportError("Syntax error");
+ }
+ }
+ else
+ {
+ convertToBoolean(tempWords);
+ }
+
+ dumpWords(tempWords, "Boolean");
+
+ //
+ // We need to assign weights to the words according to the search_algorithm
+ // configuration attribute.
+ // For algorithms other than exact, we need to also do word lookups.
+ //
+ StringList algs(config->Find("search_algorithm"), " \t");
+ List algorithms;
+ String name, weight;
+ double fweight;
+ Fuzzy *fuzzy = 0;
+
+ //
+ // Generate the list of algorithms to use and associate the given
+ // weights with them.
+ //
+ for (i = 0; i < algs.Count(); i++)
+ {
+ // Each entry has the form "name:weight".
+ // NOTE(review): strtok() returns NULL for a missing part; this
+ // presumably relies on String accepting a null pointer, after which
+ // the length checks below supply the defaults -- confirm.
+ name = strtok(algs[i], ":");
+ weight = strtok(0, ":");
+ if (name.length() == 0)
+ name = "exact";
+ if (weight.length() == 0)
+ weight = "1";
+ fweight = atof((char *) weight);
+
+ fuzzy = Fuzzy::getFuzzyByName(name, *config);
+ if (fuzzy)
+ {
+ fuzzy->setWeight(fweight);
+ fuzzy->openIndex();
+ algorithms.Add(fuzzy);
+ }
+ }
+
+ dumpWords(searchWords, "initial");
+
+ //
+ // For each of the words, apply all the algorithms.
+ //
+ int in_phrase = 0; // If we get into a phrase, we don't want to fuzz.
+ for (i = 0; i < tempWords.Count(); i++)
+ {
+ WeightWord *ww = (WeightWord *) tempWords[i];
+ if (ww->weight > 0 && !ww->isIgnore && !in_phrase)
+ {
+ //
+ // Apply all the algorithms to the word.
+ //
+ if (debug)
+ cerr << "Fuzzy on: " << ww->word << endl;
+ doFuzzy(ww, searchWords, algorithms);
+ // doFuzzy() added replacement entries to searchWords, so the
+ // original token is no longer needed.
+ delete ww;
+ }
+ else if (ww->word.length() == 1 && ww->word[0] == '"')
+ {
+ in_phrase = !in_phrase;
+ if (debug)
+ cerr << "Add: " << ww->word << endl;
+ searchWords.Add(ww);
+ }
+ else
+ {
+ //
+ // This is '(', ')', '&', or '|'. These will be automatically
+ // transfered to the searchWords list.
+ //
+ if (debug)
+ cerr << "Add: " << ww->word << endl;
+ searchWords.Add(ww);
+ }
+ dumpWords(searchWords, "searchWords");
+ }
+ // NOTE(review): every token has been either deleted or moved into
+ // searchWords by now; Release() presumably empties tempWords without
+ // deleting the objects -- confirm against List.
+ tempWords.Release();
+}
+
+
+//*****************************************************************************
+// void doFuzzy(WeightWord *ww, List &searchWords, List &algorithms)
+//
+// Expands one search word through every Fuzzy algorithm in algorithms and
+// appends the expansion to searchWords.  Several substitutes are added as
+// a parenthesised "|" group, a single substitute is added on its own, and
+// when no algorithm yields anything the word itself is added with a tiny
+// weight.  ww is only read; the caller keeps ownership of it.
+//
+void doFuzzy(WeightWord * ww, List & searchWords, List & algorithms)
+{
+    List fuzzyWords;
+    List weightWords;
+    Fuzzy *fuzzy;
+    WeightWord *newWw;
+    String *word;
+
+    algorithms.Start_Get();
+    while ((fuzzy = (Fuzzy *) algorithms.Get_Next()))
+    {
+        if (debug > 1)
+            cout << " " << fuzzy->getName();
+        fuzzy->getWords(ww->word, fuzzyWords);
+        fuzzyWords.Start_Get();
+        while ((word = (String *) fuzzyWords.Get_Next()))
+        {
+            if (debug > 1)
+                cout << " " << word->get();
+            // Substitutes inherit the exact/hidden flags of the source word.
+            newWw = new WeightWord(word->get(), fuzzy->getWeight());
+            newWw->isExact = ww->isExact;
+            newWw->isHidden = ww->isHidden;
+            weightWords.Add(newWw);
+        }
+        if (debug > 1)
+            cout << endl;
+        fuzzyWords.Destroy();
+    }
+
+    //
+    // We now have a list of substitute words. They need to be added
+    // to the searchWords.
+    //
+    if (weightWords.Count())
+    {
+        if (weightWords.Count() > 1)
+            searchWords.Add(new WeightWord("(", -1.0));
+        for (int i = 0; i < weightWords.Count(); i++)
+        {
+            if (i > 0)
+                searchWords.Add(new WeightWord("|", -1.0));
+            searchWords.Add(weightWords[i]);
+        }
+        if (weightWords.Count() > 1)
+            searchWords.Add(new WeightWord(")", -1.0));
+    }
+    else                        // if no fuzzy matches, add exact word, but give it tiny weight
+    {
+        // Use the caller's word here.  The loop variable `word` is NULL
+        // once the loops above have finished (or was never assigned when
+        // there are no algorithms), so dereferencing it would crash.
+        searchWords.Add(new WeightWord(ww->word.get(), 0.000001));
+    }
+
+
+    weightWords.Release();
+}
+
+
+//*****************************************************************************
+// void convertToBoolean(List &words)
+//
+// Rewrites a plain word list as a boolean expression, in place: an "&"
+// (when match_method is "and") or "|" operator is inserted in front of
+// every entry after the first, except while inside a quoted phrase.
+// An empty list is left untouched.
+//
+void convertToBoolean(List & words)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    const int useAnd = (strcmp(config->Find("match_method"), "and") == 0);
+    String quote = "\"";
+    List rebuilt;
+    int insidePhrase = 0;
+    int idx;
+
+    const int wordCount = words.Count();
+    if (wordCount == 0)
+        return;
+
+    // The first entry never gets an operator in front of it, but it may
+    // already open a phrase.
+    rebuilt.Add(words[0]);
+    if (((WeightWord *) words[0])->word == quote)
+        insidePhrase = 1;
+
+    for (idx = 1; idx < wordCount; idx++)
+    {
+        if (!insidePhrase)
+        {
+            if (useAnd)
+                rebuilt.Add(new WeightWord("&", -1.0));
+            else
+                rebuilt.Add(new WeightWord("|", -1.0));
+        }
+
+        if (((WeightWord *) words[idx])->word == quote)
+            insidePhrase = !insidePhrase;
+
+        rebuilt.Add(words[idx]);
+    }
+
+    // Swap the rebuilt sequence back into the caller's list.
+    words.Release();
+    const int rebuiltCount = rebuilt.Count();
+    for (idx = 0; idx < rebuiltCount; idx++)
+    {
+        words.Add(rebuilt[idx]);
+    }
+    rebuilt.Release();
+}
+
+
+//*****************************************************************************
+// int htsearch(Collection *collection, List &searchWords, Parser *parser)
+//
+// Runs the parsed query against one collection.  A new ResultList is
+// created, filled by the parser when there is at least one search word,
+// and attached to the collection with setResultList().
+//
+// RETURN: the number of entries in that result list.
+//
+int htsearch(Collection * collection, List & searchWords, Parser * parser)
+{
+    ResultList *results = new ResultList;
+
+    if (searchWords.Count() > 0)
+    {
+        // parser->setDatabase(wordfile);
+        parser->setCollection(collection);
+        parser->parse(&searchWords, *results);
+    }
+
+    collection->setResultList(results);
+
+    return (results->Count());
+}
+
+
+//*****************************************************************************
+// void addRequiredWords(List &searchWords, StringList &requiredWords)
+//
+// Modify the search words list to include the required words as well.
+// This is done by putting the existing search words in parentheses and
+// appending the required words after an "&".  Several required words are
+// grouped in their own parentheses and joined with "|" when the
+// any_keywords attribute is true, with "&" otherwise.
+// Nothing happens when requiredWords is empty.
+//
+void addRequiredWords(List & searchWords, StringList & requiredWords)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    // Read the attribute on every call: a function-level static would
+    // freeze the value seen by the first query, ignoring any later
+    // change of configuration.
+    const int any_keywords = config->Boolean("any_keywords", 0);
+
+    if (requiredWords.Count() == 0)
+        return;
+
+    if (searchWords.Count() > 0)
+    {
+        searchWords.Insert(new WeightWord("(", -1.0), 0);
+        searchWords.Add(new WeightWord(")", -1.0));
+        searchWords.Add(new WeightWord("&", -1.0));
+    }
+
+    if (requiredWords.Count() == 1)
+    {
+        searchWords.Add(new WeightWord(requiredWords[0], 1.0));
+    }
+    else
+    {
+        searchWords.Add(new WeightWord("(", -1.0));
+        searchWords.Add(new WeightWord(requiredWords[0], 1.0));
+        for (int i = 1; i < requiredWords.Count(); i++)
+        {
+            if (any_keywords)
+                searchWords.Add(new WeightWord("|", -1.0));
+            else
+                searchWords.Add(new WeightWord("&", -1.0));
+            searchWords.Add(new WeightWord(requiredWords[i], 1.0));
+        }
+        searchWords.Add(new WeightWord(")", -1.0));
+    }
+}
+
+
+//*****************************************************************************
+// void reportError_html(char *msg)
+//
+// CGI-style error reporter: writes a complete HTML error page (including
+// the Content-type header) to stdout, naming the configured maintainer as
+// the contact, and then terminates the process with exit status 1.
+// msg is inserted verbatim inside a <pre> element.
+//
+void reportError_html(char *msg)
+{
+    HtConfiguration *config = HtConfiguration::config();
+    const String maintainer = config->Find("maintainer");
+
+    cout << "Content-type: text/html\r\n\r\n"
+         << "<html><head><title>htsearch error</title></head>\n"
+         << "<body bgcolor=\"#ffffff\">\n"
+         << "<h1>ht://Dig error</h1>\n"
+         << "<p>htsearch detected an error. Please report this to the\n"
+         << "webmaster of this site by sending an e-mail to:\n";
+
+    cout << "<a href=\"mailto:" << maintainer << "\">"
+         << maintainer << "</a>\n";
+
+    cout << "The error message is:</p>\n"
+         << "<pre>\n" << msg << "\n</pre>\n</body></html>\n";
+
+    exit(1);
+}