summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/htsearch/parser.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htsearch/parser.cc')
-rw-r--r--debian/htdig/htdig-3.2.0b6/htsearch/parser.cc918
1 files changed, 918 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/parser.cc b/debian/htdig/htdig-3.2.0b6/htsearch/parser.cc
new file mode 100644
index 00000000..3ed1531c
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htsearch/parser.cc
@@ -0,0 +1,918 @@
+//
+// parser.cc
+//
+// parser: Parses a boolean expression tree, retrieving and scoring
+// the resulting document list
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: parser.cc,v 1.36 2004/06/11 16:50:33 grdetil Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "parser.h"
+#include "HtPack.h"
+#include "Collection.h"
+#include "Dictionary.h"
+#include "QuotedStringList.h"
+
+#define WORD 1000
+#define DONE 1001
+
+QuotedStringList boolean_syntax_errors;
+enum ErrorIndices { EXPECTED, SEARCH_WORD, AT_END, INSTEAD_OF, END_OF_EXPR, QUOTE };
+
+//*****************************************************************************
+Parser::Parser() :
+ words(*(HtConfiguration::config()))
+{
+ tokens = 0;
+ result = 0;
+ current = 0;
+ valid = 1;
+}
+
+
+//*****************************************************************************
+// int Parser::checkSyntax(List *tokenList)
+// As the name of the function implies, we will only perform a syntax check
+// on the list of tokens.
+//
+int
+Parser::checkSyntax(List *tokenList)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ void reportError(char *);
+ // Load boolean_syntax_errors from configuration
+ // they should be placed in this order:
+ // 0 1 2 3 4
+ // Expected "a search word" "at the end" "instead of" "end of expression"
+ // 5
+ // "a closing quote"
+ boolean_syntax_errors.Destroy();
+ boolean_syntax_errors.Create(config->Find("boolean_syntax_errors"), "| \t\r\n\001");
+ if (boolean_syntax_errors.Count() == 5)
+ { // for backward compatibility
+ boolean_syntax_errors.Add (new String ("a closing quote"));
+ if (debug)
+ cerr << "Parser::checkSyntax() : boolean_syntax_errors should have six entries\n";
+ } else if (boolean_syntax_errors.Count() != 6)
+ reportError("boolean_syntax_errors attribute should have six entries");
+ tokens = tokenList;
+ valid = 1;
+ fullexpr(0);
+ return valid;
+}
+
+//*****************************************************************************
+/* Called by: Parser::parse(List*, ResultList&), checkSyntax(List*) */
+/* Inputs: output -- if zero, simply check syntax */
+/* otherwise, list matching documents in head of "stack" */
+void
+Parser::fullexpr(int output)
+{
+ tokens->Start_Get();
+ lookahead = lexan();
+ expr(output);
+ if (valid && lookahead != DONE)
+ {
+ setError(boolean_syntax_errors[END_OF_EXPR]);
+ }
+}
+
+//*****************************************************************************
+int
+Parser::lexan()
+{
+ current = (WeightWord *) tokens->Get_Next();
+ if (!current)
+ return DONE;
+ else if (mystrcasecmp((char*)current->word, "&") == 0)
+ return '&';
+ else if (mystrcasecmp((char*)current->word, "|") == 0)
+ return '|';
+ else if (mystrcasecmp((char*)current->word, "!") == 0)
+ return '!';
+ else if (mystrcasecmp((char*)current->word, "(") == 0)
+ return '(';
+ else if (mystrcasecmp((char*)current->word, ")") == 0)
+ return ')';
+ else if (mystrcasecmp((char*)current->word, "\"") == 0)
+ return '"';
+ else
+ return WORD;
+}
+
+//*****************************************************************************
+// Attempt to deal with expressions in the form
+// term | term | term ...
+/* Called by: Parser::fullexpr(int), factor(int) */
+/* Inputs: output -- if zero, simply check syntax */
+void
+Parser::expr(int output)
+{
+ term(output);
+ while (1)
+ {
+ if (match('|'))
+ {
+ term(output);
+ if (output)
+ {
+ if(debug) cerr << "or--" << endl;
+ perform_or();
+ if(debug) cerr << "stack:" << stack.Size() << endl;
+ }
+ }
+ else
+ break;
+ }
+ if (valid && lookahead == WORD)
+ {
+ String expected = "'";
+ expected << boolean_keywords[AND] << "' "<< boolean_keywords[OR] <<" '"
+ << boolean_keywords[OR] << "'";
+ setError(expected.get());
+ }
+}
+
+//*****************************************************************************
+// Attempt to deal with terms in the form
+// factor & factor & factor ...
+/* Called by: Parser::expr(int) */
+/* Inputs: output -- if zero, simply check syntax */
+void
+Parser::term(int output)
+{
+
+ factor(output);
+ if(debug) cerr << "term:factor" << endl;
+ while (1)
+ {
+ if(match('&'))
+ {
+ factor(output);
+ if(output)
+ {
+ if(debug) cerr << "and--" << endl;
+ perform_and();
+ if(debug) cerr << "stack:" << stack.Size() << endl;
+ }
+ }
+ else if(match('!'))
+ {
+ factor(output);
+ if(output)
+ {
+ if(debug) cerr << "not--" << endl;
+ perform_not();
+ if(debug) cerr << "stack:" << stack.Size() << endl;
+ }
+ }
+ else
+ {
+ break;
+ }
+ }
+}
+
+//*****************************************************************************
+/* Gather and score a (possibly bracketed) boolean expression */
+/* Called by: Parser::term(int) */
+/* Inputs: output -- if zero, simply check syntax */
+void
+Parser::factor(int output)
+{
+ if(match('"'))
+ {
+ phrase(output);
+ }
+ else if (match('('))
+ {
+ expr(output);
+ if (match(')'))
+ {
+ return;
+ }
+ else
+ {
+ setError("')'");
+ }
+ }
+ else if (lookahead == WORD)
+ {
+ if (output)
+ {
+ perform_push();
+ }
+ lookahead = lexan();
+ }
+ else
+ {
+ setError(boolean_syntax_errors[SEARCH_WORD]);
+// setError("a search word, a quoted phrase, a boolean expression between ()");
+ }
+}
+
+//*****************************************************************************
+/* Gather and score a quoted phrase */
+/* Called by: Parser::factor(int) */
+/* Inputs: output -- if zero, simply check syntax */
+void
+Parser::phrase(int output)
+{
+ List *wordList = 0;
+ double weight = 1.0;
+
+ while (1)
+ {
+ if (match('"'))
+ {
+ if (output)
+ {
+ if(!wordList) wordList = new List;
+ if(debug) cerr << "scoring phrase" << endl;
+ score(wordList, weight, FLAGS_MATCH_ONE); // look in all fields
+ }
+ break;
+ }
+ else if (lookahead == WORD)
+ {
+ weight *= current->weight;
+ if (output)
+ perform_phrase(wordList);
+
+ lookahead = lexan();
+ }
+ else if (lookahead == DONE)
+ {
+ setError(boolean_syntax_errors[QUOTE]);
+ break;
+ }
+ else
+ {
+ // skip '&' '|' and '!' in the phrase
+ current->isIgnore = 1;
+ if (output)
+ perform_phrase(wordList);
+ lookahead = lexan ();
+ }
+ } // end while
+ if(wordList) delete wordList;
+}
+
+//*****************************************************************************
+int
+Parser::match(int t)
+{
+ if (lookahead == t)
+ {
+ lookahead = lexan();
+ return 1;
+ }
+ else
+ return 0;
+}
+
+//*****************************************************************************
+void
+Parser::setError(char *expected)
+{
+ if (valid)
+ {
+ valid = 0;
+ error = 0;
+ error << boolean_syntax_errors[EXPECTED] << ' ' << expected;
+ if (lookahead == DONE || !current)
+ {
+ error << ' ' << boolean_syntax_errors[AT_END];
+ }
+ else
+ {
+ error << ' ' << boolean_syntax_errors[INSTEAD_OF] << " '"
+ << current->word.get() << "'";
+ switch (lookahead)
+ {
+ case '&': error << ' ' << boolean_keywords[OR] << " '"
+ << boolean_keywords[AND] << "'";
+ break;
+ case '|': error << ' ' << boolean_keywords[OR] << " '"
+ << boolean_keywords[OR] << "'";
+ break;
+ case '!': error << ' ' << boolean_keywords[OR] << " '"
+ << boolean_keywords[NOT] << "'";
+ break;
+ }
+ }
+ if (debug) cerr << "Syntax error: " << error << endl;
+ }
+}
+
+//*****************************************************************************
+// Perform a lookup of the current word and push the result onto the stack
+//
+void
+Parser::perform_push()
+{
+ HtConfiguration* config= HtConfiguration::config();
+ static int maximum_word_length = config->Value("maximum_word_length", 12);
+ String temp = current->word.get();
+ char *p;
+
+ if(debug)
+ cerr << "perform_push @"<< stack.Size() << ": " << temp << endl;
+
+ String wildcard = config->Find("prefix_match_character");
+ if (!wildcard.get())
+ wildcard = "*";
+ if (temp == wildcard)
+ {
+ if (debug) cerr << "Wild card search\n";
+ ResultList *list = new ResultList;
+ String doc_db = config->Find("doc_db");
+ DocumentDB docdb;
+ docdb.Read(doc_db);
+ List *docs = docdb.DocIDs();
+
+ //
+ // Traverse all the known documents
+ //
+ DocumentRef *ref;
+ IntObject *id;
+ DocMatch *dm;
+ docs->Start_Get();
+ while ((id = (IntObject *) docs->Get_Next()))
+ {
+ ref = docdb[id->Value()];
+ if (debug)
+ cerr << (ref ? "Wildcard match" : "Wildcard empty") << endl;
+ if (ref)
+ {
+ dm = new DocMatch;
+ dm->score = current->weight;
+ dm->id = ref->DocID();
+ dm->orMatches = 1;
+ dm->anchor = 0;
+ list->add(dm);
+ }
+ delete ref;
+ }
+ delete docs;
+ stack.push(list);
+
+ return;
+ }
+
+ // Must be after wildcard: "*" is "isIgnore" because it is too short.
+ if (current->isIgnore)
+ {
+ if(debug) cerr << "ignore: " << temp << " @" << stack.Size() << endl;
+ //
+ // This word needs to be ignored. Make it so.
+ //
+ ResultList *list = new ResultList;
+ list->isIgnore = 1;
+ stack.push(list);
+ return;
+ }
+
+ temp.lowercase();
+ p = temp.get();
+ if (temp.length() > maximum_word_length)
+ p[maximum_word_length] = '\0';
+
+ List* result = words[p];
+ score(result, current->weight, current->flags);
+ delete result;
+}
+
+//*****************************************************************************
+// BUG: Phrases containing "bad words" can have *any* "bad word" in that
+// position. Words less than minimum_word_length ignored entirely,
+// as they are not indexed.
+void
+Parser::perform_phrase(List * &oldWords)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ static int maximum_word_length = config->Value("maximum_word_length", 12);
+ String temp = current->word.get();
+ char *p;
+ List *newWords = 0;
+ HtWordReference *oldWord, *newWord;
+
+ // how many words ignored since last checked word?
+ static int ignoredWords = 0;
+
+ // if the query is empty, no further effort is needed
+ if(oldWords && oldWords->Count() == 0)
+ {
+ if(debug) cerr << "phrase not found, skip" << endl;
+ return;
+ }
+
+ if(debug) cerr << "phrase current: " << temp << endl;
+ if (current->isIgnore)
+ {
+ //
+ // This word needs to be ignored. Make it so.
+ //
+ if (temp.length() >= config->Value ("minimum_word_length") && oldWords)
+ ignoredWords++;
+ if(debug) cerr << "ignoring: " << temp << endl;
+ return;
+ }
+
+ temp.lowercase();
+ p = temp.get();
+ if (temp.length() > maximum_word_length)
+ p[maximum_word_length] = '\0';
+
+ newWords = words[p];
+ if(debug) cerr << "new words count: " << newWords->Count() << endl;
+
+ // If we don't have a prior list of words, we want this one...
+ if (!oldWords)
+ {
+ oldWords = new List;
+ if(debug) cerr << "phrase adding first: " << temp << endl;
+ newWords->Start_Get();
+ while ((newWord = (HtWordReference *) newWords->Get_Next()))
+ {
+ oldWords->Add(newWord);
+ }
+ if(debug) cerr << "old words count: " << oldWords->Count() << endl;
+ return;
+ }
+
+ // OK, now we have a previous list in wordList and a new list
+ List *results = new List;
+
+ Dictionary newDict(5000);
+
+ String nid;
+ newWords->Start_Get();
+ while ((newWord = (HtWordReference *) newWords->Get_Next()))
+ {
+ nid = "";
+ int did = newWord->DocID();
+ nid << did;
+ nid << "-";
+ int loc = newWord->Location();
+ nid << loc;
+ if (! newDict.Exists(nid)) {
+ newDict.Add(nid, (Object *)newWord);
+ } else {
+// cerr << "perform_phrase: NewWords Duplicate: " << nid << "\n";
+// Double addition is a problem if you don't want your original objects deleted
+ }
+ }
+
+ String oid;
+ oldWords->Start_Get();
+ while ((oldWord = (HtWordReference *) oldWords->Get_Next()))
+ {
+ oid = "";
+ int did = oldWord->DocID();
+ oid << did;
+ oid << "-";
+ int loc = oldWord->Location();
+ oid << loc + ignoredWords+1;
+ if (newDict.Exists(oid))
+ {
+ newWord = (HtWordReference *)newDict.Find(oid);
+
+ HtWordReference *result = new HtWordReference(*oldWord);
+
+ result->Flags(oldWord->Flags() & newWord->Flags());
+ result->Location(newWord->Location());
+
+ results->Add(result);
+ }
+ }
+ ignoredWords = 0; // most recent word is not a non-ignored word
+
+ newDict.Release();
+
+ if(debug) cerr << "old words count: " << oldWords->Count() << endl;
+ if(debug) cerr << "results count: " << results->Count() << endl;
+ oldWords->Destroy();
+ results->Start_Get();
+ while ((newWord = (HtWordReference *) results->Get_Next()))
+ {
+ oldWords->Add(newWord);
+ }
+ if(debug) cerr << "old words count: " << oldWords->Count() << endl;
+ results->Release();
+ delete results;
+
+ newWords->Destroy();
+ delete newWords;
+
+}
+
+//*****************************************************************************
+// Allocate scores based on words in wordList.
+// Fields within which the word must appear are specified in flags
+// (see HtWordReference.h).
+void
+Parser::score(List *wordList, double weight, unsigned int flags)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ DocMatch *dm;
+ HtWordReference *wr;
+ static double text_factor = config->Double("text_factor", 1);
+ static double caps_factor = config->Double("caps_factor", 1);
+ static double title_factor = config->Double("title_factor", 1);
+ static double heading_factor = config->Double("heading_factor", 1);
+ static double keywords_factor = config->Double("keywords_factor", 1);
+ static double meta_description_factor = config->Double("meta_description_factor", 1);
+ static double author_factor = config->Double("author_factor", 1);
+ static double description_factor = config->Double("description_factor", 1);
+ double wscore;
+ int docanchor;
+ int word_count;
+
+ if (!wordList || wordList->Count() == 0)
+ {
+ // We can't score an empty list, so push a null pointer...
+ if(debug) cerr << "score: empty list, push 0 @" << stack.Size() << endl;
+
+ stack.push(0);
+ return;
+ }
+
+ ResultList *list = new ResultList;
+ if(debug) cerr << "score: push @" << stack.Size() << endl;
+ stack.push(list);
+ // We're now guaranteed to have a non-empty list
+ // We'll use the number of occurences of this word for scoring
+ word_count = wordList->Count();
+
+ wordList->Start_Get();
+ while ((wr = (HtWordReference *) wordList->Get_Next()))
+ {
+ //
+ // ******* Compute the score for the document
+ //
+
+ // If word not in one of the required fields, skip the entry.
+ // Plain text sets no flag in dbase, so treat it separately.
+ if (!(wr->Flags() & flags) && (wr->Flags() || !(flags & FLAG_PLAIN)))
+ {
+ if (debug > 2)
+ cerr << "Flags " << wr->Flags() << " lack " << flags << endl;
+ continue;
+ }
+
+ wscore = 0.0;
+ if (wr->Flags() == FLAG_TEXT) wscore += text_factor;
+ if (wr->Flags() & FLAG_CAPITAL) wscore += caps_factor;
+ if (wr->Flags() & FLAG_TITLE) wscore += title_factor;
+ if (wr->Flags() & FLAG_HEADING) wscore += heading_factor;
+ if (wr->Flags() & FLAG_KEYWORDS) wscore += keywords_factor;
+ if (wr->Flags() & FLAG_DESCRIPTION) wscore += meta_description_factor;
+ if (wr->Flags() & FLAG_AUTHOR) wscore += author_factor;
+ if (wr->Flags() & FLAG_LINK_TEXT) wscore += description_factor;
+ wscore *= weight;
+ wscore = wscore / (double)word_count;
+ docanchor = wr->Anchor();
+ dm = list->find(wr->DocID());
+ if (dm)
+ {
+ wscore += dm->score;
+ if (dm->anchor < docanchor)
+ docanchor = dm->anchor;
+ // We wish to *update* this, not add a duplicate
+ list->remove(wr->DocID());
+ }
+
+ dm = new DocMatch;
+ dm->id = wr->DocID();
+ dm->score = wscore;
+ dm->orMatches = 1; // how many "OR" terms this doc has
+ dm->anchor = docanchor;
+ list->add(dm);
+ }
+}
+
+
+//*****************************************************************************
+// The top two entries in the stack need to be ANDed together.
+//
+// a b a and b
+// 0 0 0
+// 0 1 0
+// 0 x 0
+// 1 0 0
+// 1 1 intersect(a,b)
+// 1 x a
+// x 0 0
+// x 1 b
+// x x x
+//
+void
+Parser::perform_and()
+{
+ ResultList *l1 = (ResultList *) stack.pop();
+ ResultList *l2 = (ResultList *) stack.pop();
+ int i;
+ DocMatch *dm, *dm2, *dm3;
+ HtVector *elements;
+
+ if(!(l2 && l1))
+ {
+ if(debug) cerr << "and: at least one empty operator, pushing 0 @" << stack.Size() << endl;
+ stack.push(0);
+ if(l1) delete l1;
+ if(l2) delete l2;
+ return;
+ }
+
+ //
+ // If either of the arguments is set to be ignored, we will use the
+ // other as the result.
+ // remember l2 and l1, l2 not l1
+
+ if (l1->isIgnore && l2->isIgnore)
+ {
+ if(debug) cerr << "and: ignoring all, pushing ignored list @" << stack.Size() << endl;
+ ResultList *result = new ResultList;
+ result->isIgnore = 1;
+ delete l1; delete l2;
+ stack.push(result);
+ return;
+ }
+ else if (l1->isIgnore)
+ {
+ if(debug) cerr << "and: ignoring l1, pushing l2 @" << stack.Size() << endl;
+ stack.push(l2);
+ delete l1;
+ return;
+ }
+ else if (l2->isIgnore)
+ {
+ if(debug) cerr << "and: ignoring l2, pushing l2 @" << stack.Size() << endl;
+ stack.push(l1);
+ delete l2;
+ return;
+ }
+
+ ResultList *result = new ResultList;
+ stack.push(result);
+ elements = l2->elements();
+
+ if(debug)
+ cerr << "perform and: " << elements->Count() << " " << l1->elements()->Count() << " ";
+
+ for (i = 0; i < elements->Count(); i++)
+ {
+ dm = (DocMatch *) (*elements)[i];
+ dm2 = l1->find(dm->id);
+ if (dm2)
+ {
+ //
+ // Duplicate document. Add scores and average "OR-matches" count
+ //
+ dm3 = new DocMatch;
+// "if (dm2)" means "?:" operator not needed...
+// dm3->score = dm->score + (dm2 ? dm2->score : 0);
+// dm3->orMatches = (dm->orMatches + (dm2 ? dm2->orMatches : 0))/2;
+ dm3->score = dm->score + dm2->score;
+ dm3->orMatches = (dm->orMatches + dm2->orMatches)/2;
+ dm3->id = dm->id;
+ dm3->anchor = dm->anchor;
+// if (dm2 && dm2->anchor < dm3->anchor)
+ if (dm2->anchor < dm3->anchor)
+ dm3->anchor = dm2->anchor;
+ result->add(dm3);
+ }
+ }
+ if(debug)
+ cerr << result->elements()->Count() << endl;
+
+ elements->Release();
+ delete elements;
+ delete l1;
+ delete l2;
+}
+
+// a b a not b
+// 0 0 0
+// 0 1 0
+// 0 x 0
+// 1 0 a
+// 1 1 intersect(a,not b)
+// 1 x a
+// x 0 x
+// x 1 x
+// x x x
+void
+Parser::perform_not()
+{
+ ResultList *l1 = (ResultList *) stack.pop();
+ ResultList *l2 = (ResultList *) stack.pop();
+ int i;
+ DocMatch *dm, *dm2, *dm3;
+ HtVector *elements;
+
+
+ if(!l2)
+ {
+ if(debug) cerr << "not: no positive term, pushing 0 @" << stack.Size() << endl;
+ // Should probably be interpreted as "* not l1"
+ stack.push(0);
+ if(l1) delete l1;
+ return;
+ }
+ if(!l1 || l1->isIgnore || l2->isIgnore)
+ {
+ if(debug) cerr << "not: no negative term, pushing positive @" << stack.Size() << endl;
+ stack.push(l2);
+ if(l1) delete l1;
+ return;
+ }
+
+ ResultList *result = new ResultList;
+ if(debug) cerr << "not: pushing result @" << stack.Size() << endl;
+ stack.push(result);
+ elements = l2->elements();
+
+ if(debug)
+ cerr << "perform not: " << elements->Count() << " " << l1->elements()->Count() << " ";
+
+ for (i = 0; i < elements->Count(); i++)
+ {
+ dm = (DocMatch *) (*elements)[i];
+ dm2 = l1->find(dm->id);
+ if (!dm2)
+ {
+ //
+ // Duplicate document.
+ //
+ dm3 = new DocMatch;
+ dm3->score = dm->score;
+ dm3->orMatches = dm->orMatches;
+ dm3->id = dm->id;
+ dm3->anchor = dm->anchor;
+ result->add(dm3);
+ }
+ }
+ if(debug)
+ cerr << result->elements()->Count() << endl;
+
+ elements->Release();
+ delete elements;
+ delete l1;
+ delete l2;
+}
+
+//*****************************************************************************
+// The top two entries in the stack need to be ORed together.
+//
+void
+Parser::perform_or()
+{
+ ResultList *l1 = (ResultList *) stack.pop();
+ ResultList *result = (ResultList *) stack.peek();
+ int i;
+ DocMatch *dm, *dm2;
+ HtVector *elements;
+
+ //
+ // If either of the arguments is not present, we will use the other as
+ // the results.
+ //
+ if (!l1 && result)
+ {
+ if(debug) cerr << "or: no 2nd operand" << endl;
+ return; // result in top of stack
+ }
+ else if (l1 && !result)
+ {
+ if(debug) cerr << "or: no 1st operand" << endl;
+ stack.pop();
+ stack.push(l1);
+ return;
+ }
+ else if (!l1 && !result)
+ {
+ if(debug) cerr << "or: no operands" << endl;
+ stack.pop();
+ stack.push(0); // empty result
+ return;
+ }
+
+ //
+ // If either of the arguments is set to be ignored, we will use the
+ // other as the result.
+ //
+ if (l1->isIgnore)
+ {
+ delete l1;
+ return;
+ }
+ else if (result->isIgnore)
+ {
+ result = (ResultList *) stack.pop();
+ stack.push(l1);
+ delete result;
+ return;
+ }
+
+ elements = l1->elements();
+ if(debug)
+ cerr << "perform or: " << elements->Count() << " " << result->elements()->Count() << " ";
+ for (i = 0; i < elements->Count(); i++)
+ {
+ dm = (DocMatch *) (*elements)[i];
+ dm2 = result->find(dm->id);
+ if (dm2)
+ {
+ //
+ // Update document. Add scores and add "OR-match" counts
+ //
+ dm2->score += dm->score;
+ dm2->orMatches += dm->orMatches;
+ if (dm->anchor < dm2->anchor)
+ dm2->anchor = dm->anchor;
+ }
+ else
+ {
+ dm2 = new DocMatch;
+ dm2->score = dm->score;
+ dm2->orMatches = dm->orMatches;
+ dm2->id = dm->id;
+ dm2->anchor = dm->anchor;
+ result->add(dm2);
+ }
+ }
+ if(debug)
+ cerr << result->elements()->Count() << endl;
+ elements->Release();
+ delete elements;
+ delete l1;
+}
+
+//*****************************************************************************
+// void Parser::parse(List *tokenList, ResultList &resultMatches)
+//
+void
+Parser::parse(List *tokenList, ResultList &resultMatches)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ tokens = tokenList;
+ DocumentRef *ref = NULL;
+
+ fullexpr(1);
+
+ ResultList *result = (ResultList *) stack.pop();
+ if (!result) // Ouch!
+ {
+// It seems we now end up here on a syntax error, so don't clear anything!
+// valid = 0;
+// error = 0;
+// error << "Expected to have something to parse!";
+ return;
+ }
+ HtVector *elements = result->elements();
+ DocMatch *dm;
+
+ // multimatch_factor gives extra weight to matching documents which
+ // contain more than one "OR" term. This is applied after the whole
+ // document is parsed, so multiple matches don't give exponentially
+ // increasing weights
+ double multimatch_factor = config->Double("multimatch_factor");
+
+ for (int i = 0; i < elements->Count(); i++)
+ {
+ dm = (DocMatch *) (*elements)[i];
+ ref = collection->getDocumentRef(dm->GetId());
+ if(ref && ref->DocState() == Reference_normal)
+ {
+ dm->collection = collection; // back reference
+ if (dm->orMatches > 1)
+ dm->score *= 1+multimatch_factor;
+ resultMatches.add(dm);
+ }
+ }
+ elements->Release();
+ result->Release();
+ delete elements;
+ delete result;
+}
+
+void
+Parser::setCollection(Collection *coll)
+{
+ if (coll)
+ words.Open(coll->getWordFile(), O_RDONLY);
+ collection = coll;
+}
+