diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htsearch/parser.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htsearch/parser.cc | 918 |
1 file changed, 918 insertions, 0 deletions
// NOTE(review): reconstructed from a cgit unified-diff dump (new file
// debian/htdig/htdig-3.2.0b6/htsearch/parser.cc, index 00000000..3ed1531c);
// the diff header and per-line "+" markers have been folded away so the
// C++ reads normally.
//
// parser.cc
//
// parser: Parses a boolean expression tree, retrieving and scoring
//         the resulting document list
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: parser.cc,v 1.36 2004/06/11 16:50:33 grdetil Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "parser.h"
#include "HtPack.h"
#include "Collection.h"
#include "Dictionary.h"
#include "QuotedStringList.h"

// Token codes returned by lexan() for anything that is not one of the
// single-character operators '&' '|' '!' '(' ')' '"'.
#define WORD 1000   // an ordinary search word
#define DONE 1001   // end of the token list

// Message fragments used to assemble syntax-error text; (re)loaded from the
// "boolean_syntax_errors" configuration attribute in checkSyntax().
QuotedStringList boolean_syntax_errors;
// Indices into boolean_syntax_errors, in the order the attribute lists them.
enum ErrorIndices { EXPECTED, SEARCH_WORD, AT_END, INSTEAD_OF, END_OF_EXPR, QUOTE };

//*****************************************************************************
// Construct a parser bound to the word database of the global configuration.
// All parse state is cleared; setCollection() must be called before parse()
// so the word index is actually open.
Parser::Parser() :
    words(*(HtConfiguration::config()))
{
    tokens = 0;    // token list is supplied later by checkSyntax()/parse()
    result = 0;
    current = 0;   // token most recently fetched by lexan()
    valid = 1;     // stays 1 until setError() records a syntax error
}


//*****************************************************************************
// int Parser::checkSyntax(List *tokenList)
//   As the name of the function implies, we will only perform a syntax check
//   on the list of tokens.
//
// Runs a pure syntax check (no document retrieval) over tokenList by parsing
// the whole expression with output disabled.
// Returns: 1 if the expression parsed cleanly, 0 on a syntax error (the
// message is left in `error` by setError()).
int
Parser::checkSyntax(List *tokenList)
{
    HtConfiguration* config= HtConfiguration::config();
    void reportError(char *);   // declaration of the global fatal-error reporter
    // Load boolean_syntax_errors from configuration
    // they should be placed in this order:
    //    0        1               2            3            4
    // Expected "a search word" "at the end" "instead of" "end of expression"
    //    5
    // "a closing quote"
    boolean_syntax_errors.Destroy();
    boolean_syntax_errors.Create(config->Find("boolean_syntax_errors"), "| \t\r\n\001");
    if (boolean_syntax_errors.Count() == 5)
    {   // for backward compatibility: older configs lacked the sixth entry
        boolean_syntax_errors.Add (new String ("a closing quote"));
        if (debug)
            cerr << "Parser::checkSyntax() : boolean_syntax_errors should have six entries\n";
    } else if (boolean_syntax_errors.Count() != 6)
        reportError("boolean_syntax_errors attribute should have six entries");
    tokens = tokenList;
    valid = 1;
    fullexpr(0);   // 0 = syntax check only; build no result lists
    return valid;
}

//*****************************************************************************
// Parse one complete boolean expression from `tokens`.  A trailing token
// after a valid expression is reported as a syntax error.
/* Called by: Parser::parse(List*, ResultList&), checkSyntax(List*) */
/* Inputs: output -- if zero, simply check syntax */
/*         otherwise, list matching documents in head of "stack" */
void
Parser::fullexpr(int output)
{
    tokens->Start_Get();
    lookahead = lexan();
    expr(output);
    if (valid && lookahead != DONE)
    {
        setError(boolean_syntax_errors[END_OF_EXPR]);
    }
}

//*****************************************************************************
// Lexer: fetch the next token from the list and classify it as one of the
// operator characters '&' '|' '!' '(' ')' '"', WORD for a search term, or
// DONE when the list is exhausted.  The raw token remains in `current`.
int
Parser::lexan()
{
    current = (WeightWord *) tokens->Get_Next();
    if (!current)
        return DONE;
    else if (mystrcasecmp((char*)current->word, "&") == 0)
        return '&';
    else if (mystrcasecmp((char*)current->word, "|") == 0)
        return '|';
    else if (mystrcasecmp((char*)current->word, "!") == 0)
        return '!';
    else if (mystrcasecmp((char*)current->word, "(") == 0)
        return '(';
    else if (mystrcasecmp((char*)current->word, ")") == 0)
        return ')';
    else if (mystrcasecmp((char*)current->word, "\"") == 0)
        return '"';
    else
        return WORD;
}
//*****************************************************************************
// Attempt to deal with expressions in the form
//	term | term | term ...
// Each '|' pops the top two result lists off `stack` and pushes their union.
/* Called by: Parser::fullexpr(int), factor(int) */
/* Inputs: output -- if zero, simply check syntax */
void
Parser::expr(int output)
{
    term(output);
    while (1)
    {
        if (match('|'))
        {
            term(output);
            if (output)
            {
                if(debug) cerr << "or--" << endl;
                perform_or();
                if(debug) cerr << "stack:" << stack.Size() << endl;
            }
        }
        else
            break;
    }
    // Two adjacent words with no operator between them: tell the user an
    // explicit boolean operator was expected.
    if (valid && lookahead == WORD)
    {
        String expected = "'";
        expected << boolean_keywords[AND] << "' "<< boolean_keywords[OR] <<" '"
                 << boolean_keywords[OR] << "'";
        setError(expected.get());
    }
}

//*****************************************************************************
// Attempt to deal with terms in the form
//	factor & factor & factor ...
// '&' produces an intersection, '!' a set difference, both via the stack.
/* Called by: Parser::expr(int) */
/* Inputs: output -- if zero, simply check syntax */
void
Parser::term(int output)
{

    factor(output);
    if(debug) cerr << "term:factor" << endl;
    while (1)
    {
        if(match('&'))
        {
            factor(output);
            if(output)
            {
                if(debug) cerr << "and--" << endl;
                perform_and();
                if(debug) cerr << "stack:" << stack.Size() << endl;
            }
        }
        else if(match('!'))
        {
            factor(output);
            if(output)
            {
                if(debug) cerr << "not--" << endl;
                perform_not();
                if(debug) cerr << "stack:" << stack.Size() << endl;
            }
        }
        else
        {
            break;
        }
    }
}

//*****************************************************************************
/* Gather and score a (possibly bracketed) boolean expression */
// A factor is a quoted phrase, a parenthesised sub-expression, or a single
// word (looked up and pushed by perform_push()).
/* Called by: Parser::term(int) */
/* Inputs: output -- if zero, simply check syntax */
void
Parser::factor(int output)
{
    if(match('"'))
    {
        phrase(output);
    }
    else if (match('('))
    {
        expr(output);
        if (match(')'))
        {
            return;
        }
        else
        {
            setError("')'");
        }
    }
    else if (lookahead == WORD)
    {
        if (output)
        {
            perform_push();
        }
        lookahead = lexan();
    }
    else
    {
        setError(boolean_syntax_errors[SEARCH_WORD]);
//	setError("a search word, a quoted phrase, a boolean expression between ()");
    }
}

//*****************************************************************************
/* Gather and score a quoted phrase */
// Accumulates word occurrences into wordList via perform_phrase(); the
// product of the words' weights becomes the phrase weight passed to score().
// Operators inside the quotes are marked isIgnore and skipped.
/* Called by: Parser::factor(int) */
/* Inputs: output -- if zero, simply check syntax */
void
Parser::phrase(int output)
{
    List *wordList = 0;
    double weight = 1.0;

    while (1)
    {
        if (match('"'))
        {
            // Closing quote: score whatever the phrase matched so far.
            if (output)
            {
                if(!wordList) wordList = new List;
                if(debug) cerr << "scoring phrase" << endl;
                score(wordList, weight, FLAGS_MATCH_ONE);	// look in all fields
            }
            break;
        }
        else if (lookahead == WORD)
        {
            weight *= current->weight;
            if (output)
                perform_phrase(wordList);

            lookahead = lexan();
        }
        else if (lookahead == DONE)
        {
            // Ran off the end of the query without a closing quote.
            setError(boolean_syntax_errors[QUOTE]);
            break;
        }
        else
        {
            // skip '&' '|' and '!' in the phrase
            current->isIgnore = 1;
            if (output)
                perform_phrase(wordList);
            lookahead = lexan ();
        }
    }	// end while
    if(wordList) delete wordList;
}

//*****************************************************************************
// Consume the lookahead token if it equals t.
// Returns 1 (and advances) on a match, 0 otherwise.
int
Parser::match(int t)
{
    if (lookahead == t)
    {
        lookahead = lexan();
        return 1;
    }
    else
        return 0;
}

//*****************************************************************************
// Record the first syntax error encountered: clears `valid` and composes a
// localizable message from boolean_syntax_errors / boolean_keywords.  Later
// errors are ignored (only the first one is reported).
void
Parser::setError(char *expected)
{
    if (valid)
    {
        valid = 0;
        error = 0;
        error << boolean_syntax_errors[EXPECTED] << ' ' << expected;
        if (lookahead == DONE || !current)
        {
            error << ' ' << boolean_syntax_errors[AT_END];
        }
        else
        {
            error << ' ' << boolean_syntax_errors[INSTEAD_OF] << " '"
                  << current->word.get() << "'";
            // Spell out the operator name for the offending token.
            switch (lookahead)
            {
            case '&': error << ' ' << boolean_keywords[OR] << " '"
                            << boolean_keywords[AND] << "'";
                      break;
            case '|': error << ' ' << boolean_keywords[OR] << " '"
                            << boolean_keywords[OR] << "'";
                      break;
            case '!': error << ' ' << boolean_keywords[OR] << " '"
                            << boolean_keywords[NOT] << "'";
                      break;
            }
        }
        if (debug) cerr << "Syntax error: " << error << endl;
    }
}

//*****************************************************************************
// Perform a lookup of the current word and push the result onto the stack
//
// Special cases: the bare prefix-match character (default "*") matches every
// document in the database; an isIgnore token pushes an "ignored" marker
// list so the boolean operators can treat it as a don't-care operand.
void
Parser::perform_push()
{
    HtConfiguration* config= HtConfiguration::config();
    static int maximum_word_length = config->Value("maximum_word_length", 12);
    String temp = current->word.get();
    char *p;

    if(debug)
        cerr << "perform_push @"<< stack.Size() << ": " << temp << endl;

    String wildcard = config->Find("prefix_match_character");
    if (!wildcard.get())
        wildcard = "*";
    if (temp == wildcard)
    {
        if (debug) cerr << "Wild card search\n";
        ResultList *list = new ResultList;
        String doc_db = config->Find("doc_db");
        DocumentDB docdb;
        docdb.Read(doc_db);
        List *docs = docdb.DocIDs();

        //
        // Traverse all the known documents
        //
        DocumentRef *ref;
        IntObject *id;
        DocMatch *dm;
        docs->Start_Get();
        while ((id = (IntObject *) docs->Get_Next()))
        {
            ref = docdb[id->Value()];
            if (debug)
                cerr << (ref ? "Wildcard match" : "Wildcard empty") << endl;
            if (ref)
            {
                dm = new DocMatch;
                dm->score = current->weight;
                dm->id = ref->DocID();
                dm->orMatches = 1;
                dm->anchor = 0;
                list->add(dm);
            }
            delete ref;
        }
        delete docs;
        stack.push(list);

        return;
    }

    // Must be after wildcard: "*" is "isIgnore" because it is too short.
    if (current->isIgnore)
    {
        if(debug) cerr << "ignore: " << temp << " @" << stack.Size() << endl;
        //
        // This word needs to be ignored. Make it so.
        //
        ResultList *list = new ResultList;
        list->isIgnore = 1;
        stack.push(list);
        return;
    }

    // Truncate to maximum_word_length before lookup, matching how the
    // indexer stored the word.
    temp.lowercase();
    p = temp.get();
    if (temp.length() > maximum_word_length)
        p[maximum_word_length] = '\0';

    List* result = words[p];
    score(result, current->weight, current->flags);
    delete result;
}

//*****************************************************************************
// Fold the occurrences of the current phrase word into oldWords, keeping
// only occurrences that directly follow (allowing for ignored words) an
// occurrence already in oldWords.  On the first word, oldWords is created.
// BUG: Phrases containing "bad words" can have *any* "bad word" in that
//	position. Words less than minimum_word_length ignored entirely,
//	as they are not indexed.
void
Parser::perform_phrase(List * &oldWords)
{
    HtConfiguration* config= HtConfiguration::config();
    static int maximum_word_length = config->Value("maximum_word_length", 12);
    String temp = current->word.get();
    char *p;
    List *newWords = 0;
    HtWordReference *oldWord, *newWord;

    // how many words ignored since last checked word?
    // NOTE(review): function-local static — carries state between calls and
    // is reset only on a successful join below.
    static int ignoredWords = 0;

    // if the query is empty, no further effort is needed
    if(oldWords && oldWords->Count() == 0)
    {
        if(debug) cerr << "phrase not found, skip" << endl;
        return;
    }

    if(debug) cerr << "phrase current: " << temp << endl;
    if (current->isIgnore)
    {
        //
        // This word needs to be ignored. Make it so.
        //
        // Only count it as a positional gap if it was long enough to have
        // been indexed at all.
        if (temp.length() >= config->Value ("minimum_word_length") && oldWords)
            ignoredWords++;
        if(debug) cerr << "ignoring: " << temp << endl;
        return;
    }

    temp.lowercase();
    p = temp.get();
    if (temp.length() > maximum_word_length)
        p[maximum_word_length] = '\0';

    newWords = words[p];
    if(debug) cerr << "new words count: " << newWords->Count() << endl;

    // If we don't have a prior list of words, we want this one...
    if (!oldWords)
    {
        oldWords = new List;
        if(debug) cerr << "phrase adding first: " << temp << endl;
        newWords->Start_Get();
        while ((newWord = (HtWordReference *) newWords->Get_Next()))
        {
            oldWords->Add(newWord);
        }
        if(debug) cerr << "old words count: " << oldWords->Count() << endl;
        return;
    }

    // OK, now we have a previous list in wordList and a new list
    List *results = new List;

    // Index the new occurrences by "docID-location" for O(1) adjacency tests.
    Dictionary newDict(5000);

    String nid;
    newWords->Start_Get();
    while ((newWord = (HtWordReference *) newWords->Get_Next()))
    {
        nid = "";
        int did = newWord->DocID();
        nid << did;
        nid << "-";
        int loc = newWord->Location();
        nid << loc;
        if (! newDict.Exists(nid)) {
            newDict.Add(nid, (Object *)newWord);
        } else {
//	    cerr << "perform_phrase: NewWords Duplicate: " << nid << "\n";
//	    Double addition is a problem if you don't want your original objects deleted
        }
    }

    // Keep an old occurrence iff the new word appears right after it
    // (skipping any ignored words in between).
    String oid;
    oldWords->Start_Get();
    while ((oldWord = (HtWordReference *) oldWords->Get_Next()))
    {
        oid = "";
        int did = oldWord->DocID();
        oid << did;
        oid << "-";
        int loc = oldWord->Location();
        oid << loc + ignoredWords+1;
        if (newDict.Exists(oid))
        {
            newWord = (HtWordReference *)newDict.Find(oid);

            HtWordReference *result = new HtWordReference(*oldWord);

            result->Flags(oldWord->Flags() & newWord->Flags());
            result->Location(newWord->Location());

            results->Add(result);
        }
    }
    ignoredWords = 0;	// most recent word is not a non-ignored word

    newDict.Release();

    if(debug) cerr << "old words count: " << oldWords->Count() << endl;
    if(debug) cerr << "results count: " << results->Count() << endl;
    // Replace oldWords' contents with the joined results.
    oldWords->Destroy();
    results->Start_Get();
    while ((newWord = (HtWordReference *) results->Get_Next()))
    {
        oldWords->Add(newWord);
    }
    if(debug) cerr << "old words count: " << oldWords->Count() << endl;
    results->Release();
    delete results;

    newWords->Destroy();
    delete newWords;

}
//*****************************************************************************
// Allocate scores based on words in wordList.
// Fields within which the word must appear are specified in flags
// (see HtWordReference.h).
// Builds a ResultList of DocMatch entries (one per document, duplicates
// merged) weighted by the per-field *_factor attributes, and pushes it on
// the stack; an empty wordList pushes a null pointer instead.
void
Parser::score(List *wordList, double weight, unsigned int flags)
{
    HtConfiguration* config= HtConfiguration::config();
    DocMatch *dm;
    HtWordReference *wr;
    // Per-field scoring factors are read once and cached for the process.
    static double text_factor = config->Double("text_factor", 1);
    static double caps_factor = config->Double("caps_factor", 1);
    static double title_factor = config->Double("title_factor", 1);
    static double heading_factor = config->Double("heading_factor", 1);
    static double keywords_factor = config->Double("keywords_factor", 1);
    static double meta_description_factor = config->Double("meta_description_factor", 1);
    static double author_factor = config->Double("author_factor", 1);
    static double description_factor = config->Double("description_factor", 1);
    double wscore;
    int docanchor;
    int word_count;

    if (!wordList || wordList->Count() == 0)
    {
        // We can't score an empty list, so push a null pointer...
        if(debug) cerr << "score: empty list, push 0 @" << stack.Size() << endl;

        stack.push(0);
        return;
    }

    ResultList *list = new ResultList;
    if(debug) cerr << "score: push @" << stack.Size() << endl;
    stack.push(list);
    // We're now guaranteed to have a non-empty list
    // We'll use the number of occurences of this word for scoring
    word_count = wordList->Count();

    wordList->Start_Get();
    while ((wr = (HtWordReference *) wordList->Get_Next()))
    {
        //
        // *******  Compute the score for the document
        //

        // If word not in one of the required fields, skip the entry.
        // Plain text sets no flag in dbase, so treat it separately.
        if (!(wr->Flags() & flags) && (wr->Flags() || !(flags & FLAG_PLAIN)))
        {
            if (debug > 2)
                cerr << "Flags " << wr->Flags() << " lack " << flags << endl;
            continue;
        }

        wscore = 0.0;
        if (wr->Flags() == FLAG_TEXT)		wscore += text_factor;
        if (wr->Flags() & FLAG_CAPITAL)		wscore += caps_factor;
        if (wr->Flags() & FLAG_TITLE)		wscore += title_factor;
        if (wr->Flags() & FLAG_HEADING)		wscore += heading_factor;
        if (wr->Flags() & FLAG_KEYWORDS)	wscore += keywords_factor;
        if (wr->Flags() & FLAG_DESCRIPTION)	wscore += meta_description_factor;
        if (wr->Flags() & FLAG_AUTHOR)		wscore += author_factor;
        if (wr->Flags() & FLAG_LINK_TEXT)	wscore += description_factor;
        wscore *= weight;
        wscore = wscore / (double)word_count;
        docanchor = wr->Anchor();
        dm = list->find(wr->DocID());
        if (dm)
        {
            // Same document seen before: accumulate score, keep earliest anchor.
            wscore += dm->score;
            if (dm->anchor < docanchor)
                docanchor = dm->anchor;
            // We wish to *update* this, not add a duplicate
            list->remove(wr->DocID());
        }

        dm = new DocMatch;
        dm->id = wr->DocID();
        dm->score = wscore;
        dm->orMatches = 1;	// how many "OR" terms this doc has
        dm->anchor = docanchor;
        list->add(dm);
    }
}


//*****************************************************************************
// The top two entries in the stack need to be ANDed together.
//
//	a	b	a and b
//	0	0	0
//	0	1	0
//	0	x	0
//	1	0	0
//	1	1	intersect(a,b)
//	1	x	a
//	x	0	0
//	x	1	b
//	x	x	x
// (0 = null/empty list, 1 = real results, x = "isIgnore" marker list.)
void
Parser::perform_and()
{
    ResultList *l1 = (ResultList *) stack.pop();
    ResultList *l2 = (ResultList *) stack.pop();
    int i;
    DocMatch *dm, *dm2, *dm3;
    HtVector *elements;

    if(!(l2 && l1))
    {
        if(debug) cerr << "and: at least one empty operator, pushing 0 @" << stack.Size() << endl;
        stack.push(0);
        if(l1) delete l1;
        if(l2) delete l2;
        return;
    }

    //
    // If either of the arguments is set to be ignored, we will use the
    // other as the result.
    // remember l2 and l1, l2 not l1

    if (l1->isIgnore && l2->isIgnore)
    {
        if(debug) cerr << "and: ignoring all, pushing ignored list @" << stack.Size() << endl;
        ResultList *result = new ResultList;
        result->isIgnore = 1;
        delete l1; delete l2;
        stack.push(result);
        return;
    }
    else if (l1->isIgnore)
    {
        if(debug) cerr << "and: ignoring l1, pushing l2 @" << stack.Size() << endl;
        stack.push(l2);
        delete l1;
        return;
    }
    else if (l2->isIgnore)
    {
        // NOTE(review): the debug text says "pushing l2" but l1 is what is
        // pushed here — the code is correct, the message is misleading.
        if(debug) cerr << "and: ignoring l2, pushing l2 @" << stack.Size() << endl;
        stack.push(l1);
        delete l2;
        return;
    }

    ResultList *result = new ResultList;
    stack.push(result);
    elements = l2->elements();

    if(debug)
        cerr << "perform and: " << elements->Count() << " " << l1->elements()->Count() << " ";

    // Intersect: keep only documents present in both lists.
    for (i = 0; i < elements->Count(); i++)
    {
        dm = (DocMatch *) (*elements)[i];
        dm2 = l1->find(dm->id);
        if (dm2)
        {
            //
            // Duplicate document. Add scores and average "OR-matches" count
            //
            dm3 = new DocMatch;
//	    "if (dm2)" means "?:" operator not needed...
//	    dm3->score = dm->score + (dm2 ? dm2->score : 0);
//	    dm3->orMatches = (dm->orMatches + (dm2 ? dm2->orMatches : 0))/2;
            dm3->score = dm->score + dm2->score;
            dm3->orMatches = (dm->orMatches + dm2->orMatches)/2;
            dm3->id = dm->id;
            dm3->anchor = dm->anchor;
//	    if (dm2 && dm2->anchor < dm3->anchor)
            if (dm2->anchor < dm3->anchor)
                dm3->anchor = dm2->anchor;
            result->add(dm3);
        }
    }
    if(debug)
        cerr << result->elements()->Count() << endl;

    elements->Release();
    delete elements;
    delete l1;
    delete l2;
}

//*****************************************************************************
// The top two entries in the stack need to be combined as "b NOT a":
// l1 is the negative (most recently pushed) term, l2 the positive one.
//	a	b	a not b
//	0	0	0
//	0	1	0
//	0	x	0
//	1	0	a
//	1	1	intersect(a,not b)
//	1	x	a
//	x	0	x
//	x	1	x
//	x	x	x
void
Parser::perform_not()
{
    ResultList *l1 = (ResultList *) stack.pop();
    ResultList *l2 = (ResultList *) stack.pop();
    int i;
    DocMatch *dm, *dm2, *dm3;
    HtVector *elements;


    if(!l2)
    {
        if(debug) cerr << "not: no positive term, pushing 0 @" << stack.Size() << endl;
        // Should probably be interpreted as "* not l1"
        stack.push(0);
        if(l1) delete l1;
        return;
    }
    if(!l1 || l1->isIgnore || l2->isIgnore)
    {
        if(debug) cerr << "not: no negative term, pushing positive @" << stack.Size() << endl;
        stack.push(l2);
        if(l1) delete l1;
        return;
    }

    ResultList *result = new ResultList;
    if(debug) cerr << "not: pushing result @" << stack.Size() << endl;
    stack.push(result);
    elements = l2->elements();

    if(debug)
        cerr << "perform not: " << elements->Count() << " " << l1->elements()->Count() << " ";

    for (i = 0; i < elements->Count(); i++)
    {
        dm = (DocMatch *) (*elements)[i];
        dm2 = l1->find(dm->id);
        if (!dm2)
        {
            //
            // Document appears only in the positive list — copy it through.
            //
            dm3 = new DocMatch;
            dm3->score = dm->score;
            dm3->orMatches = dm->orMatches;
            dm3->id = dm->id;
            dm3->anchor = dm->anchor;
            result->add(dm3);
        }
    }
    if(debug)
        cerr << result->elements()->Count() << endl;

    elements->Release();
    delete elements;
    delete l1;
    delete l2;
}

//*****************************************************************************
// The top two entries in the stack need to be ORed together.
// The second entry is updated in place (stack.peek()); missing or ignored
// operands degrade gracefully to the other operand.
//
void
Parser::perform_or()
{
    ResultList *l1 = (ResultList *) stack.pop();
    ResultList *result = (ResultList *) stack.peek();
    int i;
    DocMatch *dm, *dm2;
    HtVector *elements;

    //
    // If either of the arguments is not present, we will use the other as
    // the results.
    //
    if (!l1 && result)
    {
        if(debug) cerr << "or: no 2nd operand" << endl;
        return;	// result in top of stack
    }
    else if (l1 && !result)
    {
        if(debug) cerr << "or: no 1st operand" << endl;
        stack.pop();
        stack.push(l1);
        return;
    }
    else if (!l1 && !result)
    {
        if(debug) cerr << "or: no operands" << endl;
        stack.pop();
        stack.push(0);	// empty result
        return;
    }

    //
    // If either of the arguments is set to be ignored, we will use the
    // other as the result.
    //
    if (l1->isIgnore)
    {
        delete l1;
        return;
    }
    else if (result->isIgnore)
    {
        result = (ResultList *) stack.pop();
        stack.push(l1);
        delete result;
        return;
    }

    elements = l1->elements();
    if(debug)
        cerr << "perform or: " << elements->Count() << " " << result->elements()->Count() << " ";
    // Merge l1 into result: sum scores for shared docs, append the rest.
    for (i = 0; i < elements->Count(); i++)
    {
        dm = (DocMatch *) (*elements)[i];
        dm2 = result->find(dm->id);
        if (dm2)
        {
            //
            // Update document. Add scores and add "OR-match" counts
            //
            dm2->score += dm->score;
            dm2->orMatches += dm->orMatches;
            if (dm->anchor < dm2->anchor)
                dm2->anchor = dm->anchor;
        }
        else
        {
            dm2 = new DocMatch;
            dm2->score = dm->score;
            dm2->orMatches = dm->orMatches;
            dm2->id = dm->id;
            dm2->anchor = dm->anchor;
            result->add(dm2);
        }
    }
    if(debug)
        cerr << result->elements()->Count() << endl;
    elements->Release();
    delete elements;
    delete l1;
}

//*****************************************************************************
// void Parser::parse(List *tokenList, ResultList &resultMatches)
//   Evaluate the full query and copy the surviving matches (documents still
//   in the "normal" state) into resultMatches, applying multimatch_factor
//   to documents that matched more than one OR term.
void
Parser::parse(List *tokenList, ResultList &resultMatches)
{
    HtConfiguration* config= HtConfiguration::config();
    tokens = tokenList;
    DocumentRef *ref = NULL;

    fullexpr(1);	// 1 = build result lists, not just a syntax check

    ResultList *result = (ResultList *) stack.pop();
    if (!result)	// Ouch!
    {
//	It seems we now end up here on a syntax error, so don't clear anything!
//	valid = 0;
//	error = 0;
//	error << "Expected to have something to parse!";
        return;
    }
    HtVector *elements = result->elements();
    DocMatch *dm;

    // multimatch_factor gives extra weight to matching documents which
    // contain more than one "OR" term. This is applied after the whole
    // document is parsed, so multiple matches don't give exponentially
    // increasing weights
    double multimatch_factor = config->Double("multimatch_factor");

    for (int i = 0; i < elements->Count(); i++)
    {
        dm = (DocMatch *) (*elements)[i];
        ref = collection->getDocumentRef(dm->GetId());
        if(ref && ref->DocState() == Reference_normal)
        {
            dm->collection = collection;	// back reference
            if (dm->orMatches > 1)
                dm->score *= 1+multimatch_factor;
            resultMatches.add(dm);
        }
    }
    elements->Release();
    result->Release();
    delete elements;
    delete result;
}

//*****************************************************************************
// Bind this parser to a collection and open its word index read-only.
// A null collection is stored without opening anything.
void
Parser::setCollection(Collection *coll)
{
    if (coll)
        words.Open(coll->getWordFile(), O_RDONLY);
    collection = coll;
}