diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/test/search.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/test/search.cc | 3543 |
1 files changed, 3543 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/test/search.cc b/debian/htdig/htdig-3.2.0b6/test/search.cc new file mode 100644 index 00000000..4f5e9690 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/test/search.cc @@ -0,0 +1,3543 @@ +// +// search.cc +// +// search: Sample implementation of search algorithms using +// a mifluz inverted index. +// +// Each class is documented in the class definition. Before +// each method declaration a comment explains the semantic of +// the method. In the method definition comments in the code +// may contain additional information. +// +// Each virtual function is documented in the base class, not +// in the derived classes except for semantic differences. +// +// The class tree is: +// +// WordKeySemantic +// +// WordExclude +// WordExcludeMask +// WordPermute +// +// WordSearch +// +// WordMatch +// +// WordTree +// WordTreeOperand +// WordTreeOptional +// WordTreeOr +// WordTreeAnd +// WordTreeNear +// WordTreeMandatory +// WordTreeNot +// WordTreeLiteral +// +// WordParser +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: search.cc,v 1.9 2004/05/28 13:15:29 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include <htconfig.h> +#endif /* HAVE_CONFIG_H */ + +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif /* HAVE_UNISTD_H */ + +// If we have this, we probably want it. +#ifdef HAVE_GETOPT_H +#include <getopt.h> +#endif /* HAVE_GETOPT_H */ +#ifdef HAVE_MALLOC_H +#include <malloc.h> +#endif /* HAVE_MALLOC_H */ +#include <stdlib.h> + +#include <htString.h> +#include <WordList.h> +#include <WordContext.h> +#include <WordCursor.h> + +// +// Verbosity level set with -v (++) +// +static int verbose = 0; + +// ************************* Document definition implementation *********** + +#define TAG 1 +#define SERVER 2 +#define URL 3 +#define LOCATION 4 + +// *********************** WordKeySemantic implementation ******************** +// +// NAME +// +// encapsulate WordKey semantic for document and location +// +// SYNOPSIS +// +// #include <WordKeySemantic.h> +// +// #define SERVER 1 +// #define URL 2 +// #define LOCATION 3 +// +// static int document[] = { +// SERVER, +// URL +// }; +// +// WordKeySemantic semantic; +// semantic.Initialize(document, sizeof(document)/sizeof(int), LOCATION); +// +// DESCRIPTION +// +// Encapsulate the semantic of a WordKey object fields. It defines +// what a document and a location are. It implements the set of +// operation that a search needs to perform given the fact that it +// implements a search whose purpose is to retrieve a document and +// wants to implement proximity search based on a word location. +// +// +// END +// +// A document is a set of fields in a given order. +// A location is a field. +// The actual fields used to implement WordKeySemantic methods are +// set with the Initialize method. +// +class WordKeySemantic { +public: + WordKeySemantic(); + ~WordKeySemantic(); + + //- + // Set the actual field numbers that define what a document is and + // what a location is. The <b>document_arg<b> is a list of WordKey field + // positions of length <b>document_length_arg</b> that must be adjacent. + // The <b>location_arg</b> is the WordKey field position of the word + // location within a document. + // Return OK on success, NOTOK on failure. + // + int Initialize(int* document_arg, int document_length_arg, int location_arg); + + // + // These functions and only these know what a document is. + // This should really be a class containing function pointers and be + // given as argument to the search algorithm. + // + //- + // Copy the document in <b>from</b> into <b>to.</b> + // + void DocumentSet(const WordKey& from, WordKey& to); + //- + // Increment the document in <b>key</b> using the <i>SetToFollowing</i> + // method of WordKey. <b>uniq</b> is the WordKey position at which the + // increment starts. + // + void DocumentNext(WordKey& key, int uniq); + //- + // Compare the document fields defined in both <b>a</b> and <b>b</b> + // and return the difference a - b, as in strcmp. If all document + // fields in <b>a</b> or <b>b</b> are undefined return 1. + // + int DocumentCompare(const WordKey& a, const WordKey& b); + //- + // Set all document fields to 0. + // + int DocumentClear(WordKey& key); + + // + // These functions and only these know what a location is. + // This should really be a class containing function pointers and be + // given as argument to the search algorithm. + // + //- + // Copy the document and location in <b>from</b> into <b>to.</b> + // + void LocationSet(const WordKey& from, WordKey& to); + //- + // Increment the document and location in <b>key</b> + // using the <i>SetToFollowing</i> + // method of WordKey. + // + void LocationNext(WordKey& key); + //- + // Compare <b>expected</b> location to <b>actual</b> location. Compares equal + // as long as expected location is at a maximum distance of <b>proximity</b> + // of actual. If <b>actual</b> only has undefined field, return > 0. + // <b>expected</b> must always be the lowest possible bound. + // <b>actual</b> is tolerated if it is greater than <b>actual</b> but not + // greater than <b>proximity</b> if <b>proximity</b> > 0 or abs(<b>proximity</b>) * 2 if + // <b>proximity</b> < 0. + // Return the difference expected - actual. + // + int LocationCompare(const WordKey& expected, const WordKey& actual, int proximity = 0); + //- + // <b>key</b> is the expected location of a searched key. + // LocationNearLowest modifies <b>key</b> to add tolerance accroding to + // <b>proximity</b>. + // + // The idea is that <b>key</b> will be the lowest possible match for + // for the <b>proximity</b> range. If <proxmity> is positive, <b>key</b> + // is already the lowest possible match since we accept [0 proximity]. + // If <b>proximity</b> is negative, substract it since we accept + // [-proximity proximity]. + // + // For better understanding see the functions in which it is used. + // + void LocationNearLowest(WordKey& key, int proximity); + + //- + // Undefined the location field in <b>key.</b>. + // + void Location2Document(WordKey& key); + +protected: + int* document; + int document_length; + int location; +}; + +WordKeySemantic::WordKeySemantic() +{ + int nfields = WordKey::NFields(); + document = new int[nfields]; + document_length = 0; + location = -1; +} + +WordKeySemantic::~WordKeySemantic() +{ + if(document) delete [] document; +} + +int WordKeySemantic::Initialize(int* document_arg, int document_length_arg, int location_arg) +{ + memcpy((char*)document, (char*)document_arg, document_length_arg * sizeof(int)); + document_length = document_length_arg; + location = location_arg; + return OK; +} + +void WordKeySemantic::DocumentSet(const WordKey& from, WordKey& to) +{ + to.Clear(); + for(int i = 0; i < document_length; i++) + to.Set(document[i], from.Get(document[i])); +} + +int WordKeySemantic::DocumentCompare(const WordKey& a, const WordKey& b) +{ + int ret = 1; + for(int i = 0; i < document_length; i++) { + int idx = document[i]; + if((a.IsDefined(idx) && b.IsDefined(idx)) && + (ret = a.Get(idx) - b.Get(idx)) != 0) return ret; + } + return ret; +} + +int WordKeySemantic::DocumentClear(WordKey& key) +{ + for(int i = 0; i < document_length; i++) + key.Set(document[i], 0); + return 0; +} + +void WordKeySemantic::DocumentNext(WordKey& key, int uniq) +{ + if(uniq) + key.SetToFollowing(uniq); + else + key.SetToFollowing(document[document_length-1]); +} + + +void WordKeySemantic::LocationSet(const WordKey& from, WordKey& to) +{ + DocumentSet(from, to); + to.Set(location, from.Get(location)); +} + +int WordKeySemantic::LocationCompare(const WordKey& expected, const WordKey& actual, int proximity) +{ + int ret = 1; + if((ret = DocumentCompare(expected, actual)) != 0) return ret; + // + // Only compare location if defined. + // + if((expected.IsDefined(location) && actual.IsDefined(location)) && + (ret = expected.Get(location) - actual.Get(location))) { + if(proximity < 0) { + // + // -N means ok if in range [-N +N] + // + proximity *= 2; + if(ret < 0 && ret >= proximity) + ret = 0; + } else { + // + // N means ok if in range [0 +N] + // + if(ret < 0 && ret >= -proximity) + ret = 0; + } + } + return ret; +} + +void WordKeySemantic::LocationNext(WordKey& key) +{ + key.SetToFollowing(location); +} + +void WordKeySemantic::LocationNearLowest(WordKey& key, int proximity) +{ + if(proximity < 0) { + if(key.Underflow(location, proximity)) + key.Get(location) = 0; + else + key.Get(location) += proximity; + } +} + +void WordKeySemantic::Location2Document(WordKey& key) +{ + key.Undefined(location); +} + +// ************************* WordExclude implementation ******************** +// +// NAME +// +// permute bits in bit field +// +// SYNOPSIS +// +// #include <WordExclude.h> +// +// #define BITS 5 +// +// WordExclude permute; +// permute.Initialize(BITS); +// while(permute.Next() == WORD_EXCLUDE_OK) +// ... +// +// DESCRIPTION +// +// Count from 1 to the specified maximum. A variable++ loop does the same. +// The <b>WordExclude</b> class counts in a specific order. +// It first step thru all the permutations containing only 1 bit set, in +// increasing order. Then thru all the permutations containing 2 bits set, +// in increasing order. As so forth until the maximum number is reached. +// See the <b>Permute</b> method for more information. +// +// +// END + +// +// Helper that displays an unsigned int in binary/hexa/decimal +// +static inline void show_bits(unsigned int result) +{ + int i; + for(i = 0; i < 10; i++) { + fprintf(stderr, "%c", (result & (1 << i)) ? '1' : '0'); + } + fprintf(stderr, " (0x%08x - %15d)\n", result, result); +} + +// +// WordExclude methods return values +// +#define WORD_EXCLUDE_OK 1 +#define WORD_EXCLUDE_END 2 + +// +// Maximum number of bits +// +#define WORD_EXCLUDE_MAX (sizeof(unsigned int) * 8) + +// +// Convert a position <p> in a <l> bits mask into a bit offset (from 0) +// +#define WORD_EXCLUDE_POSITION2BIT(l,p) ((l) - (p) - 1) + +class WordExclude { +public: + //- + // Reset the generator and prepare it for <b>length</b> bits generation. + // The <b>length</b> cannot be greater than <i>WORD_EXCLUDE_MAX.</i> + // Returns OK if no error occurs, NOTOK otherwise. + // + virtual int Initialize(unsigned int length); + //- + // Move to next exclude mask. Returns WORD_EXCLUDE_OK if successfull, + // WORD_EXCLUDE_END if at the end of the permutations. It starts by + // calling <i>Permute</i> with one bit set, then two and up to + // <i>Maxi()</i> included. The last permutation only generates one + // possibility since all the bits are set. + // + virtual int Next(); + //- + // Exclude bit for <b>position</b> starts at most significant bit. That is + // position 0 exclude bit is most significant bit of the current mask. + // Returns true if position is excluded, false otherwise. + // + virtual inline unsigned int Excluded(int position) { return mask & (1 << WORD_EXCLUDE_POSITION2BIT(maxi, position)); } + //- + // Returns how many bits are not excluded with current mask. + // + virtual inline int NotExcludedCount() const { return maxi - bits; } + //- + // Returns how many bits are excluded with current mask. + // + virtual inline int ExcludedCount() const { return bits; } + // + // Save and restore in string + // + //- + // Write an ascii representation of the WordExclude object in <b>buffer.</b> + // Each bit is represented by the character 0 or 1. The most significant + // bit is the last character in the string. For instance + // 1000 is the string representation of a WordExclude object initialized + // with length = 4 after the first <i>Next</i> operation. + // + virtual void Get(String& buffer) const; + //- + // Initialize the object from the string representation in <b>buffer.</b> + // Returns OK on success, NOTOK on failure. + // + virtual int Set(const String& buffer); + + //- + // Generate all the permutations + // containing <i>n</i> bits in a <b>bits</b> bit word in increasing order. + // The <b>mask</b> argument is originally filled by the caller + // with the <i>n</i> least significant bits set. A call to Permute + // generates the next permutation immediately greater (numerically) + // than the one contained in <b>mask</b>. + // + // Permute returns the next permutation or 0 if it reached the + // maximum. + // + // To understand the algorithm, imagine 1 is a ball and 0 a space. + // + // When playing the game you start with a rack of <b>bits</b> slots filled + // with <i>n</i> balls all on the left side. You end the game when all + // the balls are on the right side. + // + // Sarting from the left, search for the first ball that has an empty + // space to the right. While searching remove all the balls you find. + // Place a ball in the empty space you found, at the right of the last + // ball removed. Sarting from the left, fill all empty spaces with + // the removed balls. Repeat until all balls are to the right. + // + // Here is a sample generated by repeated calls to WordExclude::Permute: + // (left most bit is least significant) + // <pre> + // mask = 1111100000 + // while(mask = WordExclude::Permute(mask, 7)) + // show_bits(mask) + // + // 1111100000 (0x0000001f - 31) + // 1111010000 (0x0000002f - 47) + // 1110110000 (0x00000037 - 55) + // 1101110000 (0x0000003b - 59) + // 1011110000 (0x0000003d - 61) + // 0111110000 (0x0000003e - 62) + // 1111001000 (0x0000004f - 79) + // 1110101000 (0x00000057 - 87) + // 1101101000 (0x0000005b - 91) + // 1011101000 (0x0000005d - 93) + // 0111101000 (0x0000005e - 94) + // 1110011000 (0x00000067 - 103) + // 1101011000 (0x0000006b - 107) + // 1011011000 (0x0000006d - 109) + // 0111011000 (0x0000006e - 110) + // 1100111000 (0x00000073 - 115) + // 1010111000 (0x00000075 - 117) + // 0110111000 (0x00000076 - 118) + // 1001111000 (0x00000079 - 121) + // 0101111000 (0x0000007a - 122) + // 0011111000 (0x0000007c - 124) + // </pre> + // A recursive implementation would be: + // <pre> + // /* Recursive */ + // void permute(unsigned int result, int bits_count, int bits_toset) + // { + // if(bits_toset <= 0 || bits_count <= 0) { + // if(bits_toset <= 0) + // do_something(result); + // } else { + // permute(result, bits_count - 1, bits_toset); + // permute(result | (1 << (bits_count - 1)), bits_count - 1, bits_toset - 1); + // } + // } + // </pre> + // Which is more elegant but not practical at all in our case. + // + inline unsigned int Permute(unsigned int mask, unsigned int bits); + + //- + // Return the current bit field value. + // + virtual inline unsigned int& Mask() { return mask; } + virtual inline unsigned int Mask() const { return mask; } + + virtual inline unsigned int& Maxi() { return maxi; } + virtual inline unsigned int Maxi() const { return maxi; } + + virtual inline unsigned int& Bits() { return bits; } + virtual inline unsigned int Bits() const { return bits; } + +private: + unsigned int mask; + unsigned int maxi; + unsigned int bits; +}; + +int WordExclude::Initialize(unsigned int length) +{ + if(length > WORD_EXCLUDE_MAX) { + fprintf(stderr, "WordExclude::Initialize: length must be < %d\n", (int)WORD_EXCLUDE_MAX); + return NOTOK; + } + + mask = 0; + bits = 0; + maxi = length; + + return OK; +} + +inline unsigned int WordExclude::Permute(unsigned int mask, unsigned int bits) +{ + unsigned int bits_cleared = 0; + unsigned int j; + for(j = 0; j < bits; j++) { + if(mask & (1 << j)) { + bits_cleared++; + mask &= ~(1 << j); + } else { + if(bits_cleared) { + bits_cleared--; + mask |= (1 << j); + break; + } + } + } + + if(j >= bits) + return 0; + + for(j = 0; j < bits_cleared; j++) + mask |= (1 << j); + + return mask; +} + +int WordExclude::Next() +{ + mask = Permute(mask, maxi); + + int ret = WORD_EXCLUDE_OK; + + if(mask == 0) { + bits++; + if(bits > maxi) + ret = WORD_EXCLUDE_END; + else { + unsigned int i; + for(i = 0; i < bits; i++) + mask |= (1 << i); + ret = WORD_EXCLUDE_OK; + } + } + + if(verbose > 2) show_bits(mask); + + return ret; +} + +void WordExclude::Get(String& buffer) const +{ + buffer.trunc(); + unsigned int i; + for(i = 0; i < maxi; i++) { + buffer << ((mask & (1 << i)) ? '1' : '0'); + } +} + +int WordExclude::Set(const String& buffer) +{ + if(Initialize(buffer.length()) == NOTOK) + return NOTOK; + unsigned int i; + for(i = 0; i < maxi; i++) { + if(buffer[i] == '1') { + mask |= (1 << i); + bits++; + } + } + return OK; +} + +// ************************* WordExcludeMask implementation ******************* +// +// NAME +// +// WordExclude specialization that ignore some bits +// +// SYNOPSIS +// +// #include <WordExcludeMask.h> +// +// #define BITS 9 +// #define IGNORE 0x0f0 +// #define IGNORE_MASK 0x050 +// +// WordExcludeMask permute; +// permute.Initialize(BITS, IGNORE, IGNORE_MASK); +// while(permute.Next() == WORD_EXCLUDE_OK) +// ... +// +// DESCRIPTION +// +// Only perform WordExclude operations on the bits that are not set in +// <i>ignore.</i> The bits of <i>ignore_mask</i> that are set in +// <i>ignore</i> are untouched. In the synopsis section, for instance, +// bits 1,2,3,4 and 9 will be permuted and the bits 5,6,7,8 will be +// left untouched. +// +// +// END +// +#define WORD_EXCLUDE_IGNORED (-1) + +class WordExcludeMask : public WordExclude { +public: + //- + // <b>ignore</b> gives the mask of bits to ignore. The actual WordExclude + // operations are made on a number of bits that is <b>length</b> - (the number + // of bits set in <b>ignore).</b> + // The <b>ignore_mask_arg</b> contains the actual values of the bits ignored by + // the <b>ignore</b> argument. + // + virtual inline int Initialize(unsigned int length, unsigned int ignore, unsigned int ignore_mask_arg) { + ignore_mask = ignore_mask_arg; + ignore_maxi = length; + unsigned int maxi = 0; + unsigned int i; + for(i = 0, ignore_bits = 0; i < length; i++) { + if(ignore & (1 << i)) { + bit2bit[i] = WORD_EXCLUDE_IGNORED; + if(ignore_mask & (1 << i)) ignore_bits++; + } else { + bit2bit[i] = maxi++; + } + } + + return WordExclude::Initialize(maxi); + } + + virtual inline unsigned int Excluded(int position) { + position = WORD_EXCLUDE_POSITION2BIT(ignore_maxi, position); + if(bit2bit[position] == WORD_EXCLUDE_IGNORED) + return ignore_mask & (1 << position); + else + return WordExclude::Mask() & (1 << bit2bit[position]); + } + + virtual inline int NotExcludedCount() const { + return ignore_maxi - ignore_bits - WordExclude::Bits(); + } + + virtual inline int ExcludedCount() const { + return ignore_bits - WordExclude::Bits(); + } + + //- + // The semantic is the same as the Get method of Wordexclude + // except that ignored bits are assigned 3 and 2 instead of 1 and 0 + // respectively. + // + virtual void Get(String& buffer) const; + //- + // The semantic is the same as the Get method of Wordexclude + // except that ignored bits are assigned 3 and 2 instead of 1 and 0 + // respectively. + // + virtual int Set(const String& buffer); + + virtual inline unsigned int Mask() const { + unsigned int ret = ignore_mask; + unsigned int i; + for(i = 0; i < ignore_maxi; i++) { + if(bit2bit[i] != WORD_EXCLUDE_IGNORED) { + if(WordExclude::Mask() & (1 << bit2bit[i])) + ret |= (1 << i); + } + } + return ret; + } + + virtual inline unsigned int Maxi() const { return ignore_maxi; } + + virtual inline unsigned int Bits() const { return ignore_bits + WordExclude::Bits(); } + +private: + unsigned int ignore_mask; + unsigned int ignore_maxi; + unsigned int ignore_bits; + int bit2bit[WORD_EXCLUDE_MAX]; +}; + +void WordExcludeMask::Get(String& buffer) const +{ + buffer.trunc(); + unsigned int i; + for(i = 0; i < ignore_maxi; i++) { + if(bit2bit[i] == WORD_EXCLUDE_IGNORED) + buffer << ((ignore_mask & (1 << i)) ? '3' : '2'); + else + buffer << ((WordExclude::Mask() & (1 << bit2bit[i])) ? '1' : '0'); + } +} + +int WordExcludeMask::Set(const String& buffer) +{ + WordExclude::Initialize(0); + + unsigned int& maxi = WordExclude::Maxi(); + unsigned int& mask = WordExclude::Mask(); + unsigned int& bits = WordExclude::Bits(); + ignore_mask = 0; + ignore_bits = 0; + ignore_maxi = buffer.length(); + + unsigned int i; + for(i = 0; i < ignore_maxi; i++) { + if(buffer[i] == '1' || buffer[i] == '0') { + if(buffer[i] == '1') { + mask |= (1 << maxi); + bits++; + } + bit2bit[i] = maxi; + maxi++; + } else if(buffer[i] == '3' || buffer[i] == '2') { + if(buffer[i] == '3') { + ignore_mask |= (1 << i); + ignore_bits++; + } + bit2bit[i] = WORD_EXCLUDE_IGNORED; + } + } + + return OK; +} + +// ************************* WordPermute implementation ******************** +// +// NAME +// +// WordExclude specialization with proximity toggle +// +// SYNOPSIS +// +// #include <WordPermute.h> +// +// #define BITS 5 +// +// WordPermute permute; +// permute.Initialize(BITS); +// while(permute.Next() == WORD_EXCLUDE_OK) +// if(permute.UseProximity()) ... +// +// DESCRIPTION +// +// Each WordExclude permutation is used twice by Next. Once with +// the proximity flag set and once with the proximity flag cleared. +// If the length of the bit field (length argument of Initialize) is +// lower or equal to 1, then the proximity flag is always false. +// +// +// END +// +// WordPermute methods return values +// +#define WORD_PERMUTE_OK WORD_EXCLUDE_OK +#define WORD_PERMUTE_END WORD_EXCLUDE_END + +// +// Use or don't use proximity flag +// +#define WORD_PERMUTE_PROXIMITY_NO 0 +#define WORD_PERMUTE_PROXIMITY_TOGGLE 1 +#define WORD_PERMUTE_PROXIMITY_ONLY 2 + +// +// Deals with word exclusion and proximity permutations for +// the implementation of the Optional retrieval model. +// +class WordPermute : public WordExcludeMask { +public: + //- + // The <b>nuse_proximity</b> may be set to the following: + // + // WORD_PERMUTE_PROXIMITY_NO so that the object behaves as + // WordExcludeMask and Proximity() always return false. + // + // WORD_PERMUTE_PROXIMITY_TOGGLE so that each permutation is issued twice: + // once with the proximity flag set (Proximity() method) and once with + // the proximity flag cleared. + // + // WORD_PERMUTE_PROXIMITY_ONLY so that the object behaves as + // WordExcludeMask and Proximity() always return true. + // + virtual inline int Initialize(unsigned int length, unsigned int ignore, unsigned int ignore_mask_arg, int nuse_proximity) { + use_proximity = nuse_proximity; + switch(use_proximity) { + case WORD_PERMUTE_PROXIMITY_NO: + proximity = 0; + break; + case WORD_PERMUTE_PROXIMITY_TOGGLE: + // + // Don't bother to try proximity search if only one word + // is involved. + // + proximity = length > 1; + break; + case WORD_PERMUTE_PROXIMITY_ONLY: + proximity = 1; + break; + default: + fprintf(stderr, "WordPermute::Initialize: unexpected use_proximity = %d\n", use_proximity); + return 0; + } + return WordExcludeMask::Initialize(length, ignore, ignore_mask_arg); + } + + //- + // Return true if the proximity flag is set, false if it is + // cleared. + // + inline int Proximity() { + switch(use_proximity) { + case WORD_PERMUTE_PROXIMITY_NO: + return 0; + break; + case WORD_PERMUTE_PROXIMITY_TOGGLE: + return proximity; + break; + case WORD_PERMUTE_PROXIMITY_ONLY: + return 1; + break; + default: + fprintf(stderr, "WordPermute::Proximity: unexpected use_proximity = %d\n", use_proximity); + return 0; + break; + } + } + + //- + // Return WORD_PERMUTE_PROXIMITY_NO, WORD_PERMUTE_PROXIMITY_TOGGLE or + // WORD_PERMUTE_PROXIMITY_ONLY. + // + inline int UseProximity() { return use_proximity; } + + //- + // Find the next permutation. If <b>WORD_PERMUTE_PROXIMITY_TOGGLE<b> was + // specified in Initialize each permutation is issued twice (see + // Proximity() to differentiate them), except when the mask + // only contains one non exluded bit (NotExcludeCount() <= 1). + // In both case the last permutation with all bits excluded + // (i.e. when NotExcludedCount() <= 0) is never returned because + // it is useless. + // + virtual int Next() { + if(Maxi() <= 0) + return WORD_PERMUTE_END; + + int ret = WORD_PERMUTE_OK; + int check_useless = 0; + if(use_proximity == WORD_PERMUTE_PROXIMITY_TOGGLE) { + // + // Move to next permutation as follows: + // exclude mask 1 + use proximity + // exclude mask 1 + don't use proximity + // exclude mask 2 + use proximity + // exclude mask 2 + don't use proximity + // and so on. + // If only one word is involved never use proximity. + // + if(proximity) { + proximity = 0; + } else { + proximity = 1; + if((ret = WordExcludeMask::Next()) == WORD_PERMUTE_OK) { + // + // Do not toggle proximity for only one non excluded word + // + if(NotExcludedCount() <= 1) + proximity = 0; + check_useless = 1; + } else if(ret == WORD_PERMUTE_END) + proximity = 0; + } + } else { + ret = WordExcludeMask::Next(); + check_useless = 1; + } + + if(check_useless && ret == WORD_PERMUTE_OK) { + // + // If no bits are ignored or all ignore_mask bits are set to + // one, the last permutation has all exclude bits set, which + // is useless. Just skip it and expect to be at the end of + // all permutations. + // + if(NotExcludedCount() <= 0) { + ret = WordExcludeMask::Next(); + if(ret != WORD_PERMUTE_END) { + fprintf(stderr, "WordPermute::Next: expected WORD_PERMUTE_END\n"); + ret = NOTOK; + } + } + } + + return ret; + } + + //- + // The semantic is the same as the Get method of Wordexclude + // but a letter T is appended to the string if the proximity + // flag is set, or F is appended to the string if the proximity + // is clear. + // + virtual inline void Get(String& buffer) const { + WordExcludeMask::Get(buffer); + if(use_proximity == WORD_PERMUTE_PROXIMITY_TOGGLE) + buffer << (proximity ? 'T' : 'F'); + } + + //- + // The semantic is the same as the Get method of Wordexclude + // but if the string end with a T the proximity flag is set + // and if the string end with a F the proximity flag is cleared. + // + virtual inline int Set(const String& buffer) { + if(buffer.length() < 1) { + fprintf(stderr, "WordPermute::Set: buffer length < 1\n"); + return NOTOK; + } + int ret = OK; + if(use_proximity == WORD_PERMUTE_PROXIMITY_TOGGLE) { + if((ret = WordExcludeMask::Set(buffer.sub(0, buffer.length() - 1))) == OK) + proximity = buffer.last() == 'T'; + } else { + ret = WordExcludeMask::Set(buffer); + } + + return ret; + } + +protected: + int use_proximity; + int proximity; +}; + +// ************************* WordTree implementation ******************** +// +// NAME +// +// Base class for query resolution nodes +// +// SYNOPSIS +// +// #include <WordTree.h> +// +// class WordTreeMethod : public WordTree { +// ... +// }; +// +// DESCRIPTION +// +// The WordTree class is derived from the WordCursor class and implement +// the basic operations and data structures needed for query resolution. +// It is the common base class of all the classes that actually implement +// a query resolution. The derived classes must be implemented to follow +// the WordCursor semantic for Walk* operations. +// +// +// END +// + +#define WORD_WALK_REDO 0x1000 +#define WORD_WALK_RESTART 0x2000 +#define WORD_WALK_NEXT 0x4000 + +// +// Return values of CursorsObeyProximity method +// +#define WORD_SEARCH_NOPROXIMITY 1 + +// +// operand values +// +#define WORD_TREE_OR 1 +#define WORD_TREE_AND 2 +#define WORD_TREE_NEAR 3 +#define WORD_TREE_OPTIONAL 4 +#define WORD_TREE_LITERAL 5 +#define WORD_TREE_MANDATORY 6 +#define WORD_TREE_NOT 7 + +#define WORD_TREE_OP_SIZE 20 + +// +// Default proximity is to search for adjacent words in order +// +#ifndef WORD_SEARCH_DEFAULT_PROXIMITY +#define WORD_SEARCH_DEFAULT_PROXIMITY 1 +#endif /* WORD_SEARCH_DEFAULT_PROXIMITY */ + +static char* operator_name[WORD_TREE_OP_SIZE] = { + "", + "or", + "and", + "near", + "optional", + "literal", + "mandatory", + "not", + 0 +}; + +class WordTree : public WordCursor { +public: + WordTree() { + proximity = 0; + uniq = 0; + } + + virtual int ContextSaveList(StringList& list) const { + return OK; + } + + virtual int ContextRestoreList(StringList& list) { + return OK; + } + + //- + // Initialize the object. <b>words</b> is used to initialize the + // WordCursor base class, <b>document, document_length</b> and + // <b>location</b> are used to initialize the WordKeySemantic data + // member. The <b>nuniq</b> is the WordKey field position used by + // the WordKeySemantic::DocumentNext function. The <b>nproximity</b> + // is the proximity factor used by the WordKeySemantic::LocationCompare + // method. + // Return OK on success, NOTOK on failure. + // + virtual int Prepare(WordList *words, int nuniq, int nproximity, int *document, int document_length, int location) { + int ret; + proximity = nproximity; + uniq = nuniq; + if((ret = key_semantic.Initialize(document, document_length, location)) != OK) + return ret; + WordKey key; + if(!scope.empty()) { + if(key.Set(scope) != OK) { + fprintf(stderr, "WordTree::Prepare: setting scope %s failed\n", (char*)scope); + return NOTOK; + } + } + key.SetWord(search); + return WordCursor::Initialize(words, key, 0, 0, HTDIG_WORDLIST_WALKER); + } + + //- + // Return a copy of the last document found. + // + WordKey GetDocument() { + WordKey found; + key_semantic.DocumentSet(GetFound().Key(), found); + return found; + } + + //- + // Store in the <i>info</i> data member textual information about + // the latest match found. + // + virtual void SetInfo() { info = GetFound().Key().GetWord(); } + + //- + // Return a copy of the <i>info</i> data member. Should be + // called after SetInfo(). + // + String GetInfo() { return info; } + + //- + // Sort WordTree data members (if any) in ascending frequency order. + // Return OK on success, NOTOK on failure. + // + virtual int AscendingFrequency() { return OK; } + + //- + // Delete WordTree data members (if any) that have a zero frequency. + // The number of data members deleted is returned in <b>stripped</b>. + // Return OK on success, NOTOK on failure. + // + virtual int StripNonExistent(unsigned int& stripped) { + stripped = 0; + return OK; + } + + // + // Input + // + //- + // Proximity factor. See WordKeySemantic::LocationCompare. + // + int proximity; + //- + // Uniq WordKey field position. See WordKeySemantic::DocumentNext. + // + int uniq; + //- + // Semantic of the WordKey object. + // + WordKeySemantic key_semantic; + //- + // Textual representation of the search scope. + // + String scope; + //- + // Original search criterion that may be different from the + // WordCursor::searchKey data member. + // + String search; + + // + // Internal state + // + //- + // Textual information about the latest match. + // + String info; +}; + +// ************************* WordTreeLiteral implementation **************** + +class WordTreeLiteral : public WordTree { +public: + //- + // Constructor. The search criterion is <b>string</b> and the + // scope is <b>nscope.</b>. + // + WordTreeLiteral(const char* string, const char* nscope = "") { + search.set((char*)string); + scope.set((char*)nscope); + } + + //- + // Returns WORD_TREE_LITERAL. + // + int IsA() const { return WORD_TREE_LITERAL; } + + virtual int WalkRewind(); + //- + // Only return a match for each distinct document. + // + virtual int WalkNext(); + virtual int Seek(const WordKey& patch); + + //- + // If scope is set the <b>bufferout</b> is filled with + // <pre> + // ( word "scope" ) + // </pre> + // otherwise the <b>bufferout</b> only contains the word. + // + virtual int Get(String& bufferout) const { + if(scope.empty()) + bufferout << search; + else + bufferout << "( " << operator_name[IsA()] << " \"" << scope << "\" " << search << " )"; + return OK; + } + +protected: + WordKey current_document; +}; + +int WordTreeLiteral::WalkRewind() +{ + current_document.Clear(); + return WordCursor::WalkRewind(); +} + +int WordTreeLiteral::WalkNext() +{ + int ret; + do { + ret = WordCursor::WalkNext(); + if(verbose > 3) fprintf(stderr, "WordTreeLiteral::WalkNext: reached %s\n", (char*)GetDocument().Get()); + } while(ret == OK && + key_semantic.DocumentCompare(current_document, GetDocument()) == 0); + + if(ret == OK) + current_document = GetDocument(); + else + current_document.Clear(); + + return ret; +} + +int WordTreeLiteral::Seek(const WordKey& position) +{ + current_document.Clear(); + return WordCursor::Seek(position); +} + +// ************************* WordTreeOperand implementation **************** +// +// NAME +// +// Base class for boolean query resolution nodes +// +// SYNOPSIS +// +// #include <WordTree.h> +// +// class WordTreeMethod : public WordTreeOperand { +// ... +// }; +// +// DESCRIPTION +// +// The WordTreeOperand class is derived from WordTree and implemet +// the basic operations and data structures needed for query resultion +// of boolean operators. It contains a list of WordTree objects (the +// operands or cursors) and redefine the basic WordCursor methods +// to operate on all of them according to the logic defined by the +// derived class. +// +// +// END +// + +// +// Helper for debugging that returns the string representation +// of the return codes. +// +static char* ret2str(int ret) +{ + if(ret == WORD_WALK_REDO) + return "REDO"; + + if(ret == WORD_WALK_RESTART) + return "RESTART"; + + if(ret == WORD_WALK_NEXT) + return "NEXT"; + + if(ret == OK) + return "OK"; + + if(ret == NOTOK) + return "NOTOK"; + + if(ret == WORD_WALK_ATEND) + return "ATEND"; + + return "???"; +} + +class WordTreeOperand : public WordTree +{ +public: + //- + // Constructor. The scope is <b>nscope</b>. + // + WordTreeOperand(const char* nscope) { + scope.set((char*)nscope); + } + //- + // Free the objects pointed by <i>cursors</i> with delete as well + // as the <i>cursors</i> array itself with delete []. + // + virtual ~WordTreeOperand(); + + virtual void Clear() { + cursors = 0; + cursors_length = 0; + WordCursor::Clear(); + } + + //- + // Recursively call Optimize on each <i>cursors</i>. + // + virtual int Optimize(); + + //- + // Change the <i>permutation</i> data member ignore mask according + // to WORD_TREE_MANDATORY and WORD_TREE_NOT nodes found in + // <i>cursors</i>. MANDATORY and NOT nodes are reduced (replaced + // by their first child cursor. For each MANDATORY and NOT nodes + // the bit (see WordExcludeMask for information) + // corresponding to their position is ignored (set in the <b>ignore</b> + // argument of the WordExcludeMask::Initialize function. For NOT + // nodes, the bit corresponding to their position is set in + // the <b>ignore_mask</b> of the WordExcludeMask::Initialize function + // (i.e. implementing a <i>not</i> operation). + // The <b>proximity</b> argument may be WORD_PERMUTE_PROXIMITY_TOGGLE or + // WORD_PERMUTE_PROXIMITY_NO. + // Returns OK on success, NOTOK on failure. + // + int OptimizeOr(int proximity); + + virtual int ContextSave(String& buffer) const { + StringList list; + int ret; + if((ret = ContextSaveList(list)) != OK) + return ret; + + buffer.trunc(); + String* element; + list.Start_Get(); + while((element = (String*)list.Get_Next())) { + buffer << (*element) << ';'; + } + // + // Trim last ; + // + buffer.chop(1); + + return OK; + } + + virtual int ContextSaveList(StringList& list) const { + // + // Apply to each cursor + // + unsigned int i; + for(i = 0; i < cursors_length; i++) + if(cursors[i]->ContextSaveList(list) == NOTOK) + return NOTOK; + return OK; + } + + virtual int ContextRestore(const String& buffer) { + if(!buffer.empty()) { + StringList list(buffer, ";"); + return ContextRestoreList(list); + } else { + return OK; + } + } + + virtual int ContextRestoreList(StringList& list) { + // + // Apply to each cursor + // + unsigned int i; + for(i = 0; i < cursors_length; i++) + if(cursors[i]->ContextRestoreList(list) == NOTOK) + return NOTOK; + return OK; + } + + //- + // Recursively call WalkInit on each <i>cursors</i>. + // + virtual int WalkInit(); + //- + // Recursively call WalkRewind on each <i>cursors</i>. + // Reset the <i>pos</i> data member with WordKeySemantic::DocumentClear. + // + virtual int WalkRewind(); + //- + // Recursively call WalkFinish on each <i>cursors</i>. + // + virtual int WalkFinish(); + //- + // Recursively call Seek on each <i>cursors</i>. + // Save the <b>patch</b> argument in the <i>pos</i> data + // member. + // + virtual int Seek(const WordKey& patch); + + //- + // The number of occurrence of a WordTreeOperand is the sum of the + // number of occurrence of each term. + // + virtual int Noccurrence(unsigned int& noccurrence) const { + noccurrence = 0; + unsigned int i; + for(i = 0; i < cursors_length; i++) { + unsigned int frequency; + if(cursors[i]->Noccurrence(frequency) != OK) + return NOTOK; + noccurrence += frequency; + } + return OK; + } + + //- + // The <b>bufferout</b> argument is filled with a lisp like representation + // of the tree starting at this node. + // + virtual int Get(String& bufferout) const { + bufferout << "( " << operator_name[IsA()] << " \"" << scope << "\" "; + unsigned int i; + for(i = 0; i < cursors_length; i++) + bufferout << cursors[i]->Get() << " "; + bufferout << " )"; + return OK; + } + + //- + // Call Prepare on each <i>cursors</i>. Set the <i>search</i> member + // with an textual representation of the tree starting at this node. + // + virtual int Prepare(WordList *words, int nuniq, int nproximity, int *document, int document_length, int location) { + int ret; + if((ret = WordTree::Prepare(words, nuniq, nproximity, document, document_length, location)) != OK) + return ret; + unsigned int i; + for(i = 0; i < cursors_length; i++) { + if((ret = cursors[i]->Prepare(words, nuniq, nproximity, document, document_length, location)) != OK) + return ret; + } + return Get(GetSearch().GetWord()); + } + + //- + // The current cursor offset (set by Seek for instance). It + // duplicates the function of the WordCursor <i>key</i> data member + // because the data type is different (WordKey instead of String). + // + WordKey pos; + //- + // Sub nodes array. + // + WordTree** cursors; + //- + // Number of valid entries in the <i>cursors</i> member. + // + unsigned int cursors_length; + //- + // Permutation generator with proximity toggle + // + WordPermute permutation; +}; + +WordTreeOperand::~WordTreeOperand() +{ + if(cursors) { + unsigned int i; + for(i = 0; i < cursors_length; i++) + delete cursors[i]; + free(cursors); + } +} + +int +WordTreeOperand::Optimize() +{ + // + // Apply to each cursor + // + unsigned int i; + for(i = 0; i < cursors_length; i++) + if(cursors[i]->Optimize() == NOTOK) + return NOTOK; + return OK; +} + +int WordTreeOperand::OptimizeOr(int proximity) +{ + unsigned int ignore = 0; + unsigned int ignore_mask = 0; + unsigned int i; + for(i = 0; i < cursors_length; i++) { + int reduce; + // + // Set ignore & ignore_mask if cursor is NOT or MANDATORY + // + switch(cursors[i]->IsA()) { + case WORD_TREE_MANDATORY: + ignore |= (1 << WORD_EXCLUDE_POSITION2BIT(cursors_length, i)); + reduce = 1; + break; + case WORD_TREE_NOT: + ignore |= (1 << WORD_EXCLUDE_POSITION2BIT(cursors_length, i)); + ignore_mask |= (1 << WORD_EXCLUDE_POSITION2BIT(cursors_length, i)); + reduce = 1; + break; + default: + reduce = 0; + break; + } + // + // Replace the NOT or MANDATORY node by its only child + // + if(reduce) { + WordTreeOperand* old = (WordTreeOperand*)cursors[i]; + cursors[i] = old->cursors[0]; + old->cursors[0] = 0; + old->cursors_length--; + if(old->cursors_length > 0) { + fprintf(stderr, "WordTreeOptional::OptimizeOr: too many cursors\n"); + return NOTOK; + } + delete old; + } + } + return permutation.Initialize(cursors_length, ignore, ignore_mask, proximity); +} + +int +WordTreeOperand::WalkInit() +{ + unsigned int i; + int ret = WORD_WALK_ATEND; + for(i = 0; i < cursors_length; i++) + if((ret = cursors[i]->WalkInit()) != OK) + return ret; + return (status = ret); +} + +int +WordTreeOperand::WalkRewind() +{ + unsigned int i; + int ret = OK; + for(i = 0; i < cursors_length; i++) + if((ret = cursors[i]->WalkRewind()) != OK) + return ret; + status = OK; + key_semantic.DocumentClear(pos); + cursor_get_flags = DB_SET_RANGE; + found.Clear(); + return ret; +} + +int +WordTreeOperand::WalkFinish() +{ + unsigned int i; + int ret = OK; + for(i = 0; i < cursors_length; i++) + if((ret = cursors[i]->WalkFinish()) != OK) + return ret; + return ret; +} + +int +WordTreeOperand::Seek(const WordKey& patch) +{ + pos.CopyFrom(patch); + cursor_get_flags = DB_SET_RANGE; + + unsigned int i; + int ret = OK; + for(i = 0; i < cursors_length; i++) + if((ret = cursors[i]->Seek(patch)) != OK && + ret != WORD_WALK_ATEND) + return ret; + status = OK; + return OK; +} + +// ************************* WordTreeOptional implementation **************** + +class WordTreeOptional : public WordTreeOperand { + public: + WordTreeOptional(const char* nscope) : WordTreeOperand(nscope) { } + + //- + // Return WORD_TREE_OPTIONAL + // + virtual int IsA() const { return WORD_TREE_OPTIONAL; } + + virtual int Optimize(); + + virtual int ContextSaveList(StringList& list) const; + + virtual int ContextRestoreList(StringList& list); + + //- + // Multipass walk of the occurrences according to the <i>permutation</i> + // data member specifications. First search for documents containing + // all occurrences near to each other. Then documents that + // contain all occurrences far appart. Then ignore the most frequent + // search criterion and search for documents that contain all the others + // near to each other. The logic goes on until there only remains the + // most frequent word. + // + virtual int WalkNext(); + //- + // Only seek the first non excluded cursor. The implementation + // of WalkNext makes it useless to seek the others. + // + virtual int Seek(const WordKey& position); + + virtual int Prepare(WordList *words, int nuniq, int nproximity, int *document, int document_length, int location) { + int ret; + if((ret = permutation.Initialize(cursors_length, 0, 0, WORD_PERMUTE_PROXIMITY_TOGGLE)) != OK) + return ret; + return WordTreeOperand::Prepare(words, nuniq, nproximity, document, document_length, location); + } + + virtual void SetInfo(); + + virtual int UseProximity() const { return WORD_PERMUTE_PROXIMITY_TOGGLE; } + + virtual int UsePermutation() const { return 1; } + + //- + // Returns true if all cursors must have a frequency > 0, false otherwise. + // + virtual int AllOrNothing() const { return 0; } + + //- + // Comparison between <b>cursor</b> and <b>constraint</b> is made + // with WordKeySemantic::LocationCompare using the <b>proximity</b> + // argument. If <b>master</b> is NULL it is set to point to <b> + // <b>cursor</b>. + // + // Return WORD_WALK_NEXT if <b>cursor</b> is at <b>constraint</b> and + // set <b>constraint</b> if <b>cursor</b> is <b>master</b>. + // + // Return WORD_WALK_REDO if <b>cursor</b> is above <b>constraint</b> and + // call cursor.WalkNext(). + // + // Return WORD_WALK_RESTART if <b>cursor</b> is below <b>constraint</b> and + // set <b>constraint</b> from <b>cursor</b> using + // WordKeySemantic::DocumentSet if <b>cursor</b> is not <b>master</b> + // otherwise also set location of <b>constraint</b> using + // WordKeySemantic::LocationSet and call WordKeySemantic::LocationNext + // on <b>constraint.</b> + // + // Return WORD_WALK_ATEND if no more match possible. + // + // Return NOTOK on failure. + // + int SearchCursorNear(WordTree& cursor, WordTree*& master, WordKey& constraint, int proximity); + //- + // Comparison between <b>cursor</b> and <b>document</b> is made + // with WordKeySemantic::DocumentCompare. + // + // Return WORD_WALK_NEXT if <b>cursor</b> is above <b>document.</b> + // + // Return WORD_WALK_REDO if <b>cursor</b> is below <b>document</b> + // and call cursor.WalkNext(). + // + // Return WORD_WALK_RESTART if <b>cursor</b> is at <b>document</b> + // and call WordKeySemantic::DocumentNext method on <b>document.</b> + // + // Return WORD_WALK_ATEND if no more match possible. + // + // Return NOTOK on failure. + // + int SearchCursorNot(WordTree& cursor, WordKey& document); + //- + // Comparison between <b>cursor</b> and <b>document</b> is made + // with WordKeySemantic::DocumentCompare. + // + // Return WORD_WALK_NEXT if <b>cursor</b> is at <b>document.</b>. + // + // Return WORD_WALK_REDO if <b>cursor</b> is below <b>document</b> + // + // Return WORD_WALK_RESTART if <b>cursor</b> is above <b>document</b> + // and call WordKeySemantic::DocumentNext method on <b>document.</b> + // + // Return WORD_WALK_ATEND if no more match possible. + // + // Return NOTOK on failure. + // + // + int SearchCursorAnd(WordTree& cursor, WordKey& document, WordExclude& permutation); + // + // We know that : + // 1) document does not contain any excluded words. + // 2) contains at least one occurrence of each non excluded word. + // The logic, although very similar to WordSearchNear::SearchOne + // is therefore simpler. We ignore all excluded cursors and + // return WORD_SEARCH_NOPROXIMITY as soon as a cursor move outside + // <document>. + // + //- + // If <b>document</b> contains words that match proximity + // requirement, return OK. Return WORD_SEARCH_NOPROXIMITY if proximity + // requirement cannot be matched for <document>. + // + int CursorsObeyProximity(WordKey& document); + + //- + // Sort the <i>cursors</i> in ascending frequency order using the + // Noccurrence method on each cursor. + // Return OK on success, NOTOK on failure. + // + virtual int AscendingFrequency(); + //- + // Delete all elements of the <i>cursors</i> array that have a + // zero frequency. The <i>cursors</i> array is shrinked and the + // <i>cursors_length</i> set accordingly. Returns the number of + // deletions in the <b>stripped</i> argument. + // Return OK on success, NOTOK on failure. + // + virtual int StripNonExistent(unsigned int& stripped); +}; + +int WordTreeOptional::Optimize() +{ + int ret; + if((ret = WordTreeOperand::Optimize()) != OK) + return ret; + + if(UseProximity() != WORD_PERMUTE_PROXIMITY_ONLY) { + if((ret = AscendingFrequency()) != OK) + return ret; + } + + unsigned int stripped; + if((ret = StripNonExistent(stripped)) != OK) + return ret; + + if(AllOrNothing() && stripped) { + // + // One word is missing and everything is lost, + // Just kill the remaining cursors. + // + unsigned int i; + for(i = 0; i < cursors_length; i++) + delete cursors[i]; + cursors_length = 0; + + return OK; + } else { + return OptimizeOr(UseProximity()); + } +} + +int WordTreeOptional::ContextSaveList(StringList& list) const +{ + int ret; + if((ret = WordTreeOperand::ContextSaveList(list)) != OK) + return ret; + + if(UsePermutation()) { + String* buffer = new String(); + permutation.Get(*buffer); + + list.Add(buffer); + } + + { + String* buffer = new String(); + if((ret = WordCursor::ContextSave(*buffer)) != OK) + return ret; + + list.Add(buffer); + } + + return OK; +} + +int WordTreeOptional::ContextRestoreList(StringList& list) +{ + int ret; + if((ret = WordTreeOperand::ContextRestoreList(list)) != OK) + return ret; + + if(UsePermutation()) { + char* buffer = list[0]; + if((ret = permutation.Set(buffer)) != OK) + return ret; + list.Remove(0); + } + + { + char* buffer = list[0]; + if(!buffer) return NOTOK; + WordKey key(buffer); + if((ret = Seek(key)) != OK) + return ret; + cursor_get_flags = DB_NEXT; + + list.Remove(0); + } + + return OK; +} + +int WordTreeOptional::WalkNext() +{ + WordKey& constraint = pos; + // + // Set constraint with all 0 + // + if(constraint.Empty()) + key_semantic.DocumentClear(constraint); + + // + // Advance cursors to next constraint, if not at the + // beginning of the search. + // + int ret = OK; + int match_ok = 0; + do { + // + // Advance cursors so that next call fetches another constraint + // + if(cursor_get_flags == DB_NEXT) + key_semantic.DocumentNext(constraint, uniq); + + if((ret = Seek(constraint)) != OK) + return ret; + + int near = permutation.Proximity(); + WordTree* first = 0; + for(unsigned int i = 0; i < cursors_length;) { + WordTree& cursor = *(cursors[i]); + near = permutation.Proximity(); + int excluded = permutation.Excluded(i); + if(verbose) fprintf(stderr, "WordTreeOptional::WalkNext: %s excluded = %s, proximity = %s\n", (char*)cursor.GetSearch().GetWord(), (excluded ? "yes" : "no"), (near ? "yes" : "no" )); + + int ret; + if(excluded) { + ret = SearchCursorNot(cursor, constraint); + if(verbose > 2) fprintf(stderr, "WordTreeOptional::WalkNext: Not -> %s\n", ret2str(ret)); + } else { + if(near) { + ret = SearchCursorNear(cursor, first, constraint, proximity); + if(verbose > 2) fprintf(stderr, "WordTreeOptional::WalkNext: Near -> %s\n", ret2str(ret)); + } else { + ret = SearchCursorAnd(cursor, constraint, permutation); + if(verbose > 2) fprintf(stderr, "WordTreeOptional::WalkNext: And -> %s\n", ret2str(ret)); + } + } + + switch(ret) { + case WORD_WALK_ATEND: + if(UsePermutation()) { + // + // The search is over with this permutation, try another one. + // + switch(permutation.Next()) { + // + // No permutations left, the end + // + case WORD_PERMUTE_END: + return (status = WORD_WALK_ATEND); + break; + + // + // Sart over with this permutation + // + case WORD_PERMUTE_OK: + if(WalkRewind() != OK) + return NOTOK; + break; + } + first = 0; + i = 0; + } else { + return (status = WORD_WALK_ATEND); + } + break; + case WORD_WALK_REDO: + break; + case WORD_WALK_RESTART: + first = 0; + i = 0; + break; + case WORD_WALK_NEXT: + i++; + break; + case NOTOK: + default: + return ret; + break; + } + } + + cursor_get_flags = DB_NEXT; + + SetInfo(); + + // + // Save possible result, i.e. first non excluded cursor + // + for(unsigned int i = 0; i < cursors_length; i++) { + WordTree& cursor = *(cursors[i]); + if(!permutation.Excluded(i)) { + found.Key().CopyFrom(cursor.GetFound().Key()); + break; + } + } + + match_ok = 1; + // + // Only bother if near and non near search are involved + // + if(UseProximity() == WORD_PERMUTE_PROXIMITY_TOGGLE) { + // + // If we reach this point in the function and + // either proximity search is active or there is + // only one word involved, the match is valid. + // Otherwise it may be excluded, see below. + // + if(!near && permutation.NotExcludedCount() > 1) { + // + // If not using proximity, a match that fits the proximity + // requirements must be skipped because it was matched by + // the previous permutation (see WordPermute). + // + switch(CursorsObeyProximity(constraint)) { + case OK: + match_ok = 0; + break; + case WORD_SEARCH_NOPROXIMITY: + match_ok = 1; + break; + default: + case NOTOK: + return NOTOK; + break; + } + } + } + } while(!match_ok && ret == OK); + + return ret; +} + +int WordTreeOptional::Seek(const WordKey& position) +{ + pos.CopyFrom(position); + cursor_get_flags = DB_SET_RANGE; + status = OK; + + unsigned int i; + for(i = 0; i < cursors_length; i++) { + if(!permutation.Excluded(i)) { + WordTree& cursor = *(cursors[i]); + return cursor.Seek(position); + } + } + + fprintf(stderr, "WordTreeOptional::Seek: failed\n"); + return NOTOK; +} + + +void WordTreeOptional::SetInfo() +{ + unsigned int i; + for(i = 0; i < cursors_length; i++) + cursors[i]->SetInfo(); + + info.trunc(); + + for(i = 0; i < cursors_length; i++) { + WordTree& cursor = *(cursors[i]); + + if(!permutation.Excluded(i)) + info << cursor.info << " "; + } + + info << (permutation.Proximity() ? "proximity" : ""); +} + +int WordTreeOptional::SearchCursorNear(WordTree& cursor, WordTree*& master, WordKey& constraint, int proximity) +{ + int is_master = master == 0 || master == &cursor; + if(master == 0) master = &cursor; + const WordKey& masterKey = master->GetFound().Key(); + + int direction = key_semantic.LocationCompare(constraint, cursor.GetFound().Key(), proximity); + if(verbose > 2) fprintf(stderr, "WordTreeOptional::SearchCursorNear: LocationCompare(\n\t%s,\n\t%s)\n\t = %d\n", (char*)(constraint.Get()), (char*)(cursor.GetFound().Key().Get()), direction); + + // + // If the cursor is in the authorized locations, consider + // next cursor + // + if(direction == 0) { + // + // master cursor makes the rules for location : its location + // is the base to calculate other words mandatory loacations. + // + if(is_master) + key_semantic.LocationSet(cursor.GetFound().Key(), constraint); + // + // Fix location constraint to accomodate proximity tolerance. + // + key_semantic.LocationNearLowest(constraint, proximity); + return WORD_WALK_NEXT; + + // + // If current location is above cursor location + // + } else if(direction > 0) { + // + // Move the cursor up to the location. + // + cursor.Seek(constraint); + if(verbose > 1) fprintf(stderr, "WordTreeOptional::SearchCursorNear: leap to %s\n", (char*)constraint.Get()); + int ret; + if((ret = cursor.WalkNext()) == OK) { + // + // Remove the location constraint for the master word + // so that it matches and then enforce location for other + // keys. + // + if(is_master) + key_semantic.Location2Document(constraint); + // + // Reconsider the situation for this cursor + // + return WORD_WALK_REDO; + } else { + return ret; + } + + // + // If current location is lower than cursor location, + // meaning that the cursor found no match for the current + // location. + // + } else if(direction < 0) { + // + // The cursor document becomes the current document. + // The master cursor is forced to catch up. + // + key_semantic.DocumentSet(cursor.GetDocument(), constraint); + // + // It is possible that this cursor document is the same + // as the master cursor document (if this cursor hit in the + // same document but a higher location). In this case we must + // increase the location of the master cursor otherwise it will + // match without moving and loop forever. + // + if(!is_master && key_semantic.DocumentCompare(masterKey, constraint) == 0) { + key_semantic.LocationSet(masterKey, constraint); + key_semantic.LocationNext(constraint); + } + // + // Since the current location changed, start over. + // + return WORD_WALK_RESTART; + } else { + fprintf(stderr, "WordTreeOptional::WordCursorNear: reached unreachable statement\n"); + return NOTOK; + } + return NOTOK; +} + +int WordTreeOptional::SearchCursorNot(WordTree& cursor, WordKey& document) +{ + int direction = key_semantic.DocumentCompare(document, cursor.GetFound().Key()); + if(verbose > 2) fprintf(stderr, "WordTreeOptional::SearchCursorNot: DocumentCompare(\n\t%s,\n\t%s)\n\t = %d\n", (char*)(document.Get()), (char*)(cursor.GetFound().Key().Get()), direction); + + // + // If the cursor is above the current document + // (being at the end of walk is being above all documents). + // + // Means that the cursor is positioned in an acceptable document + // and proceed to the next cursor. + // + if(direction < 0 || cursor.IsAtEnd()) { + return WORD_WALK_NEXT; + + // + // If the cursor is below current document + // + } else if(direction > 0) { + // + // Move the cursor up to the document + // + cursor.Seek(document); + if(verbose > 1) fprintf(stderr, "WordTreeOptional::SearchCursorNot: leap to %s\n", (char*)document.Get()); + int ret; + if((ret = cursor.WalkNext()) != OK && ret != WORD_WALK_ATEND) + return NOTOK; + // + // It is expected in this case that the cursor has moved above + // the current document and another visit in the loop will + // tell us. + // + return WORD_WALK_REDO; + + // + // If the cursor matches the current document. + // + // Means that the current document is not a possible match + // since it is pointed by this cursor. + // + } else if(direction == 0) { + // + // The cursor does not give any hint on a possible + // next document, just go to the next possible one. + // + key_semantic.DocumentNext(document, uniq); + // + // Since the current document changed, start over. + // + return WORD_WALK_RESTART; + } else { + fprintf(stderr, "WordTreeOptional::WordCursorNot: reached unreachable statement\n"); + return NOTOK; + } + return NOTOK; +} + +int WordTreeOptional::SearchCursorAnd(WordTree& cursor, WordKey& document, WordExclude& permutation) +{ + int direction = key_semantic.DocumentCompare(document, cursor.GetFound().Key()); + if(verbose > 2) fprintf(stderr, "WordTreeOptional::SearchCursorAnd: DocumentCompare(\n\t%s,\n\t%s)\n\t = %d\n", (char*)(document.Get()), (char*)(cursor.GetFound().Key().Get()), direction); + + // + // If the cursor is in the current document. + // + // Means that the cursor is positioned in an acceptable document + // and proceed to the next cursor. + // + if(direction == 0) { + return WORD_WALK_NEXT; + + // + // If the cursor is below current document + // + } else if(direction > 0) { + // + // Move the cursor up to the document + // + cursor.Seek(document); + if(verbose > 1) fprintf(stderr, "WordTreeOptional::SearchCursorAnd: leap to %s\n", (char*)document.Get()); + int ret; + if((ret = cursor.WalkNext()) == OK) + return WORD_WALK_REDO; + else + return ret; + + // + // If the cursor is above current document. + // + // Means the the current document is not a possible match + // since it will never reach it because it's already + // above it. + // + } else if(direction < 0) { + // + // The cursor document becomes the current document. + // + key_semantic.DocumentSet(cursor.GetDocument(), document); + + // + // Since the current document changed, start over. + // + return WORD_WALK_RESTART; + } else { + fprintf(stderr, "WordTreeOptional::WordCursorAnd: reached unreachable statement\n"); + return NOTOK; + } + return NOTOK; +} + +int WordTreeOptional::CursorsObeyProximity(WordKey& document) +{ + // + // Run if more than one word is involved, proximity + // is always true if there is only one word. + // + if(permutation.NotExcludedCount() <= 1) return OK; + + WordKey location; + + // + // The first non excluded cursor contains anchor location. + // + unsigned int master_index = 0; + for(unsigned int i = 0; i < cursors_length; i++) { + if(!permutation.Excluded(i)) { + master_index = i; + break; + } + } + const WordKey& masterKey = cursors[master_index]->GetFound().Key(); + key_semantic.DocumentSet(masterKey, location); + + for(unsigned int i = 0; i < cursors_length;) { + if(permutation.Excluded(i)) { + i++; + continue; + } + + WordTree& cursor = *(cursors[i]); + if(cursor.IsAtEnd()) return WORD_SEARCH_NOPROXIMITY; + // if(cursor.status & WORD_WALK_FAILED) return NOTOK; + + // + // If the cursor moved outside of the tested document, + // no proximity match is possible. + // + if(key_semantic.DocumentCompare(cursor.GetFound().Key(), document) != 0) + return WORD_SEARCH_NOPROXIMITY; + + int direction = key_semantic.LocationCompare(location, cursor.GetFound().Key(), proximity); + + // + // If the cursor is in the authorized locations, consider + // next cursor + // + if(direction == 0) { + // + // master cursor makes the rules for location : its location + // is the base to calculate other words mandatory loacations. + // + if(i == master_index) + key_semantic.LocationSet(cursor.GetFound().Key(), location); + // + // Fix location constraint to accomodate proximity tolerance. + // + key_semantic.LocationNearLowest(location, proximity); + i++; + + // + // If current location is greater than cursor location + // + } else if(direction > 0) { + // + // Move the cursor up to the location. + // + cursor.Seek(location); + if(verbose > 1) fprintf(stderr, "WordTreeOptional::CursorsObeyProximity: leap to %s\n", (char*)location.Get()); + int ret; + if((ret = cursor.WalkNext()) != OK) { + if(ret == WORD_WALK_ATEND) { + return WORD_SEARCH_NOPROXIMITY; + } else { + return NOTOK; + } + } + // + // Remove the location constraint for the master word + // so that it matches and then enforce location for other + // keys. + // + if(i == master_index) + key_semantic.Location2Document(location); + // + // Reconsider the situation for this cursor + // + + // + // If current location is lower than cursor location, + // meaning that the cursor found no match in the current + // document. + // + } else if(direction < 0) { + // + // Move to next master key, if possible. + // + if(i != master_index) { + key_semantic.LocationSet(masterKey, location); + key_semantic.LocationNext(location); + } + // + // Since the current location changed, start over. + // + i = 0; + } + } + + return OK; +} + +// +// Helper class for AscendingFrequency method +// +class WordSort { +public: + unsigned int frequency; + WordTree *cursor; +}; + +// +// Helper function for AscendingFrequency method +// +static int ascending_frequency(const void *a, const void *b) +{ + const WordSort& a_cursor = *(WordSort*)a; + const WordSort& b_cursor = *(WordSort*)b; + + return a_cursor.frequency - b_cursor.frequency; +} + +int WordTreeOptional::AscendingFrequency() +{ + // + // Reorder cursors + // + WordSort *tmp = new WordSort[cursors_length]; + + memset((char*)tmp, '\0', cursors_length * sizeof(WordSort)); + + unsigned int i; + for(i = 0; i < cursors_length; i++) { + unsigned int frequency; + if(cursors[i]->Noccurrence(frequency) != OK) { + delete [] tmp; + return NOTOK; + } + if(verbose > 2) fprintf(stderr, "WordTreeOptional::AscendingFrequency: %s occurs %d times\n", (char*)cursors[i]->GetSearch().Get(), frequency); + tmp[i].frequency = frequency; + tmp[i].cursor = cursors[i]; + } + + memset((char*)cursors, '\0', cursors_length * sizeof(WordTree*)); + + qsort((void *)tmp, cursors_length, sizeof(WordSort), &ascending_frequency); + + for(i = 0; i < cursors_length; i++) + cursors[i] = tmp[i].cursor; + + delete [] tmp; + return OK; +} + +int WordTreeOptional::StripNonExistent(unsigned int& stripped) +{ + stripped = 0; + + WordTree** tmp = new WordTree*[cursors_length]; + memset((char*)tmp, '\0', cursors_length * sizeof(WordTree*)); + + unsigned int from; + unsigned int to; + + for(to = from = 0; from < cursors_length; from++) { + unsigned int frequency; + if(cursors[from]->Noccurrence(frequency) != OK) { + delete [] tmp; + return NOTOK; + } + + if(verbose > 2) fprintf(stderr, "WordTreeOptional::StripNonExistent: %s occurs %d times\n", (char*)cursors[from]->GetSearch().Get(), frequency); + if(frequency > 0) { + tmp[to++] = cursors[from]; + } else { + delete cursors[from]; + stripped++; + } + } + + memset((char*)cursors, '\0', cursors_length * sizeof(WordTree*)); + + cursors_length = to; + unsigned int i; + for(i = 0; i < cursors_length; i++) + cursors[i] = tmp[i]; + + delete [] tmp; + + return OK; +} + +// ************************* WordTreeOr implementation ******************** + +class WordTreeOr : public WordTreeOperand { + public: + WordTreeOr(const char* nscope) : WordTreeOperand(nscope) { } + + //- + // Return WORD_TREE_OR + // + virtual int IsA() const { return WORD_TREE_OR; } + + virtual int Optimize(); + + virtual int ContextSaveList(StringList& list) const; + + virtual int ContextRestoreList(StringList& list); + + virtual void SetInfo(); + + virtual int WalkNext(); + + virtual int UsePermutation() const { return 0; } + + virtual int UseProximity() const { return WORD_PERMUTE_PROXIMITY_NO; } +}; + +int WordTreeOr::Optimize() +{ + int ret; + if((ret = WordTreeOperand::Optimize()) != OK) + return ret; + + if((ret = AscendingFrequency()) != OK) + return ret; + + unsigned int stripped; + if((ret = StripNonExistent(stripped)) != OK) + return ret; + + return OptimizeOr(WORD_PERMUTE_PROXIMITY_NO); +} + +int WordTreeOr::ContextSaveList(StringList& list) const +{ + int ret; + if((ret = WordTreeOperand::ContextSaveList(list)) != OK) + return ret; + + { + String* buffer = new String(); + permutation.Get(*buffer); + + list.Add(buffer); + } + + { + String* buffer = new String(); + if((ret = WordCursor::ContextSave(*buffer)) != OK) + return ret; + + list.Add(buffer); + } + + return OK; +} + +int WordTreeOr::ContextRestoreList(StringList& list) +{ + int ret; + if((ret = WordTreeOperand::ContextRestoreList(list)) != OK) + return ret; + + { + char* buffer = list[0]; + if((ret = permutation.Set(buffer)) != OK) + return ret; + list.Remove(0); + } + + { + char* buffer = list[0]; + if(!buffer) return NOTOK; + WordKey key(buffer); + if((ret = Seek(key)) != OK) + return ret; + cursor_get_flags = DB_NEXT; + + list.Remove(0); + } + + return OK; +} + +void WordTreeOr::SetInfo() +{ + unsigned int i; + for(i = 0; i < cursors_length; i++) + cursors[i]->SetInfo(); + + info.trunc(); + + for(i = 0; i < cursors_length; i++) { + WordTree& cursor = *(cursors[i]); + + if(!permutation.Excluded(i) && + !cursor.IsAtEnd() && + key_semantic.DocumentCompare(cursor.GetFound().Key(), GetFound().Key()) == 0) { + info << cursor.info << " "; + } + } +} + +int WordTreeOr::WalkNext() +{ + WordKey& constraint = pos; + // + // Set constraint with all 0 + // + if(constraint.Empty()) + key_semantic.DocumentClear(constraint); + + WordKey candidate; + int match_ok; + do { + int ret; + unsigned int i; + candidate.Clear(); + // + // Advance cursors so that next call fetches another constraint + // + if(cursor_get_flags == DB_NEXT) + key_semantic.DocumentNext(constraint, uniq); + + if((ret = Seek(constraint)) != OK) + return ret; + + match_ok = 1; + // + // All non excluded cursors are about to move + // at or beyond constraint. Search for the one (candidate) that + // is located at the lowest location beyond the constraint. + // + for(i = 0; i < cursors_length; i++) { + if(permutation.Excluded(i)) + continue; + WordTree& cursor = *(cursors[i]); + + switch((ret = cursor.WalkNext())) { + case WORD_WALK_ATEND: + // + // Constraint is above all matches for this cursor + // + break; + case OK: + // + // If candidate is not set or current cursor is below + // the current candidate, the curent cursor document becomes + // the candidate. + // + if(candidate.Empty() || + key_semantic.DocumentCompare(candidate, cursor.GetFound().Key()) > 0) { + key_semantic.DocumentSet(cursor.GetDocument(), candidate); + } + break; + default: + return ret; + break; + } + } + + // + // No candidate ? It's the end of the match list. + // + if(candidate.Empty()) + return WORD_WALK_ATEND; + + found.Key().CopyFrom(candidate); + + SetInfo(); + + if(permutation.ExcludedCount() > 0) { + if((ret = Seek(candidate)) != OK) + return ret; + + // + // Restart loop if candidate matches an excluded cursor. + // + for(i = 0; i < cursors_length && match_ok; i++) { + if(!permutation.Excluded(i)) + continue; + WordTree& cursor = *(cursors[i]); + + switch((ret = cursor.WalkNext())) { + case WORD_WALK_ATEND: + // + // This excluded cursor can't match the candidate, fine. + // + break; + case OK: + // + // This excluded cursor matches candidate therefore it's + // not a valid candidate. Restart search with this candidate + // as the constraint. + // + if(key_semantic.DocumentCompare(candidate, cursor.GetFound().Key()) == 0) { + constraint = candidate; + match_ok = 0; + } + break; + default: + return ret; + break; + } + + } + } + + cursor_get_flags = DB_NEXT; + + } while(!match_ok); + + constraint = candidate; + + return OK; +} + +// ************************* WordTreeAnd implementation ******************** + +class WordTreeAnd : public WordTreeOptional { + public: + WordTreeAnd(const char* nscope) : WordTreeOptional(nscope) { } + + //- + // Return WORD_TREE_AND + // + virtual int IsA() const { return WORD_TREE_AND; } + + virtual int UsePermutation() const { return 0; } + + virtual int UseProximity() const { return WORD_PERMUTE_PROXIMITY_NO; } + + virtual int AllOrNothing() const { return 1; } +}; + +// ************************* WordTreeNear implementation ******************** + +class WordTreeNear : public WordTreeOptional { + public: + WordTreeNear(const char* nscope) : WordTreeOptional(nscope) { } + + //- + // Return WORD_TREE_NEAR + // + virtual int IsA() const { return WORD_TREE_NEAR; } + + virtual int UsePermutation() const { return 0; } + + virtual int UseProximity() const { return WORD_PERMUTE_PROXIMITY_ONLY; } + + virtual int AllOrNothing() const { return 1; } +}; + +// ************************* WordTreeMandatory implementation *************** + +class WordTreeMandatory : public WordTreeOperand { + public: + WordTreeMandatory(const char* nscope) : WordTreeOperand(nscope) { } + + //- + // Return WORD_TREE_MANDATORY + // + virtual int IsA() const { return WORD_TREE_MANDATORY; } +}; + +// ************************* WordTreeNot implementation *************** + +class WordTreeNot : public WordTreeOperand { + public: + WordTreeNot(const char* nscope) : WordTreeOperand(nscope) { } + + //- + // Return WORD_TREE_NOT + // + virtual int IsA() const { return WORD_TREE_NOT; } +}; + +// ************************* WordMatch implementation ******************** + +// +// Return value of the Search method, tells us which document +// matched and why. +// +class WordMatch { +public: + + //- + // Return a textual representation of the object. + // + String Get() const; + + //- + // The document that matched + // + WordKey match; + //- + // An ascii description of why it matched. + // + String info; +}; + +String WordMatch::Get() const +{ + String tmp; + match.Get(tmp); + if(!info.empty()) + tmp << "(" << info << ")"; + return tmp; +} + +// ************************* WordSearch implementation ******************** +// +// NAME +// +// Solve a query from a WordTree syntax tree +// +// SYNOPSIS +// +// #include <WordSearch.h> +// +// WordTree* expr = get_query(); +// WordSearch search; +// search.limit_count = NUMBER_OF_RESULTS; +// WordMatch* search.Search(expr); +// ... +// +// DESCRIPTION +// +// The WordSearch class is a wrapper to query an inverted index +// using a WordTree syntax tree. +// +// END +// +class WordSearch { +public: + WordSearch(); + + //- + // Perform a search from the <b>expr</b> specifications. + // Restore the context from <i>context_in</i> on <b>expr</b>. + // Then skip (using WalkNext) <i>limit_bottom</i> entries. + // Then collect in a WordMatch array of size <i>limit_count</i> + // each match returned by WalkNext. When finished store + // the context (ContextSave) in <i>context_out</i>. + // It is the responsibility of the caller to free the WordMatch + // array. If no match are found a null pointer is returned. + // + WordMatch *Search(); + + // + // Search backend, only run the WalkNext loop but does not + // allocate/deallocate data. + // + int SearchLoop(WordTree *expr); + + // + // Return a context description string to resume the + // search at a given point. + // + const String& Context() const { return context_out; } + + // + // Input + // + unsigned int limit_bottom; + unsigned int limit_count; + String context_in; + WordTree* expr; + + // + // Output + // + WordMatch* matches; + unsigned int matches_size; + unsigned int matches_length; + String context_out; +}; + +WordSearch::WordSearch() +{ + // + // Input + // + limit_bottom = 0; + limit_count = 0; + context_in.trunc(); + expr = 0; + + // + // Output + // + matches = 0; + matches_size = 0; + matches_length = 0; + context_out.trunc(); +} + +WordMatch *WordSearch::Search() +{ + int ret = 0; + + if(verbose) fprintf(stderr, "WordSearch::Search: non optimized expression %s\n", (char*)expr->Get()); + if(expr->Optimize() != OK) + return 0; + if(verbose) fprintf(stderr, "WordSearch::Search: optimized expression %s\n", (char*)expr->Get()); + + + // + // Build space for results + // + matches_size = limit_count + 1; + matches = new WordMatch[matches_size]; + matches_length = 0; + + // + // Move to first possible position. + // + if(expr->WalkInit() != OK) + goto end; + + if(expr->ContextRestore(context_in) == NOTOK) + goto end; + ret = SearchLoop(expr); + // + // Don't bother saving the context if at end of + // search (WORD_WALK_ATEND) or error (NOTOK) + // + if(ret == OK && expr->ContextSave(context_out) == NOTOK) + goto end; + +end: + expr->WalkFinish(); + + if(ret == NOTOK || matches_length <= 0) { + delete [] matches; + matches = 0; + } + + return matches; +} + +int WordSearch::SearchLoop(WordTree *expr) +{ + int ret = OK; + unsigned int i; + // + // Skip the first <limit_bottom> documents + // + { + for(i = 0; i < limit_bottom; i++) { + if((ret = expr->WalkNext()) != OK) + return ret; + } + } + // + // Get documents up to <limit_count> or exhaustion + // + for(matches_length = 0; matches_length < limit_count; matches_length++) { + if((ret = expr->WalkNext()) != OK) { + break; + } else { + matches[matches_length].match = expr->GetDocument(); + if(expr->IsA() != WORD_TREE_LITERAL) + matches[matches_length].info = ((WordTreeOperand*)expr)->GetInfo(); + if(verbose) fprintf(stderr, "WordSearch::Search: match %s\n", (char*)matches[matches_length].match.Get()); + } + } + + if(ret == WORD_WALK_ATEND) + matches[matches_length].match.Clear(); + + return ret; +} + +// ************************* WordParser implementation ******************** +// +// NAME +// +// Textual query parser for test purpose +// +// SYNOPSIS +// +// #include <WordParser.h> +// +// WordParser parser; +// WordTree* expr = parser.Parse("( or \"scope1\" a query )"); +// ... +// delete expr; +// +// DESCRIPTION +// +// The WordParser class implement a lisp-like parser for queries +// implemented by the WordTree derived classes. The syntax is rigid +// and should not be used for human input. The generic syntax of an +// expression is +// <pre> +// ( operator "scope" operand [operand ...] ) +// </pre> +// The parenthesis must <b>always</b> be surrounded by white space otherwise +// the parser will be lost. The separator is white space and newline. +// Tabulation may be used in scope to separate key fields. +// +// As a special case a single word is strictly equivalent +// to +// <pre> +// ( literal "" word ) +// </pre> +// +// Operators can be lower case or upper case. There is almost no syntax +// checking and it's the responsibility of the caller to associate meaningfull +// operands. For instance ( near ( not foo ) bar ) is meaningless. +// +// OPERATORS +// +// <dl> +// +// <dt> optional +// <dd> WordTreeOptional +// +// <dt> or +// <dd> WordTreeOr +// +// <dt> and +// <dd> WordTreeAnd +// +// <dt> near +// <dd> WordTreeNear +// +// <dt> not,forbiden +// <dd> WordTreeNot +// +// <dt> mandatory +// <dd> WordTreeMandatory +// +// <dt> literal +// <dd> WordTreeLiteral +// +// </dl> +// +// +// END + +// +// Possible values of the info argument of ParseOperands +// +#define WORD_TREE_MANY 0x01 +#define WORD_TREE_ONE 0x02 +#define WORD_TREE_TWO 0x04 + +class WordParser { +public: + WordTree *Parse(const String& expr); + WordTree *ParseList(StringList& terms); + + WordTree *ParseExpr(StringList& terms); + WordTree *ParseUnary(StringList& terms); + WordTree *ParseConj(StringList& terms); + void ParseOperands(StringList& terms, int info, WordTreeOperand* expr); + WordTree *ParseLiteral(StringList& terms); + char *ParseScope(StringList& terms); + + void Shift(StringList& terms); + char *Term(StringList& terms); +}; + +WordTree *WordParser::Parse(const String& expr) +{ + StringList terms(expr, " \n"); + return ParseList(terms); +} + +WordTree *WordParser::ParseList(StringList& terms) +{ + WordTree *expr = ParseExpr(terms); + return expr; +} + +WordTree *WordParser::ParseExpr(StringList& terms) +{ + WordTree *expr = 0; + char* term = strdup(Term(terms)); + if(!strcmp(term, "(")) { + Shift(terms); + expr = ParseExpr(terms); + } else if(!strcmp(term, ")")) { + // + // At end of expression, return null + // + } else if(!mystrcasecmp(term, "optional") || + !mystrcasecmp(term, "or") || + !mystrcasecmp(term, "and") || + !mystrcasecmp(term, "near")) { + expr = ParseConj(terms); + } else if(!mystrcasecmp(term, "not") || + !mystrcasecmp(term, "mandatory") || + !mystrcasecmp(term, "forbiden")) { + expr = ParseUnary(terms); + } else { + expr = ParseLiteral(terms); + } + free(term); + return expr; +} + +WordTree *WordParser::ParseUnary(StringList& terms) +{ + int op = 0; + if(!mystrcasecmp(Term(terms), "mandatory")) + op = WORD_TREE_MANDATORY; + else if(!mystrcasecmp(Term(terms), "forbiden") || + !mystrcasecmp(Term(terms), "not")) + op = WORD_TREE_NOT; + + Shift(terms); + char* scope = ParseScope(terms); + WordTreeOperand *expr = 0; + switch(op) { + case WORD_TREE_MANDATORY: + expr = new WordTreeMandatory(scope); + break; + case WORD_TREE_NOT: + expr = new WordTreeNot(scope); + break; + default: + fprintf(stderr, "WordParser::ParseUnary: unexpected operator %d\n", op); + exit(1); + break; + } + free(scope); + ParseOperands(terms, WORD_TREE_ONE, expr); + return expr; +} + +WordTree *WordParser::ParseConj(StringList& terms) +{ + int op = 0; + if(!mystrcasecmp(Term(terms), "optional")) + op = WORD_TREE_OPTIONAL; + else if(!mystrcasecmp(Term(terms), "or")) + op = WORD_TREE_OR; + else if(!mystrcasecmp(Term(terms), "and")) + op = WORD_TREE_AND; + else if(!mystrcasecmp(Term(terms), "near")) + op = WORD_TREE_NEAR; + + Shift(terms); + char* scope = ParseScope(terms); + WordTreeOperand *expr = 0; + switch(op) { + case WORD_TREE_OR: + expr = new WordTreeOr(scope); + break; + case WORD_TREE_OPTIONAL: + expr = new WordTreeOptional(scope); + break; + case WORD_TREE_AND: + expr = new WordTreeAnd(scope); + break; + case WORD_TREE_NEAR: + expr = new WordTreeNear(scope); + break; + default: + fprintf(stderr, "WordParser::ParseOrAnd: unexpected operator %d\n", op); + exit(1); + break; + } + free(scope); + ParseOperands(terms, WORD_TREE_MANY, expr); + return expr; +} + +void WordParser::ParseOperands(StringList& terms, int info, WordTreeOperand* expr) +{ + unsigned int operands_length = 0; + unsigned int operands_size = 1; + WordTree **operands = (WordTree**)malloc(operands_size * sizeof(WordTree*)); + WordTree *subexpr = 0; + while((subexpr = ParseExpr(terms))) { + operands_length++; + if((info & WORD_TREE_ONE) && operands_length > 1) { + fprintf(stderr, "WordParser::ParseOperands: expected only one operands\n"); + exit(1); + } else if((info & WORD_TREE_TWO) && operands_length > 2) { + fprintf(stderr, "WordParser::ParseOperands: expected only two operands\n"); + exit(1); + } + if(operands_length > operands_size) { + operands_size = operands_length * 2; + operands = (WordTree**)realloc(operands, operands_size * sizeof(WordTree*)); + } + operands[operands_length - 1] = subexpr; + } + // + // Discard close parenthesis + // + if(strcmp(Term(terms), ")")) { + fprintf(stderr, "WordParser::ParseOperands: expected close parenthesis\n"); + exit(1); + } + Shift(terms); + + expr->cursors = operands; + expr->cursors_length = operands_length; +} + +WordTree *WordParser::ParseLiteral(StringList& terms) +{ + char* term = strdup(Term(terms)); + char* scope = 0; + if(!mystrcasecmp(term, "literal")) { + Shift(terms); + scope = ParseScope(terms); + free(term); + term = strdup(Term(terms)); + Shift(terms); + } else { + scope = strdup(""); + } + WordTreeLiteral *expr = new WordTreeLiteral(term, scope); + Shift(terms); + free(scope); + free(term); + return expr; +} + +char *WordParser::ParseScope(StringList& terms) +{ + char *scope = Term(terms); + int scope_length = strlen(scope); + + // + // Remove surrounding quotes, if any + // + if(scope_length > 0) { + if(scope[scope_length - 1] == '"') + scope[--scope_length] = '\0'; + if(scope[0] == '"') + scope++; + } + + scope = strdup(scope); + + Shift(terms); + + return scope; +} + +char *WordParser::Term(StringList& terms) +{ + char *term = terms[0]; + if(!term) { + fprintf(stderr, "WordParser::Term: unexpected end of expression\n"); + exit(1); + } + return term; +} + +void WordParser::Shift(StringList& terms) +{ + terms.Shift(LIST_REMOVE_DESTROY); +} + +// ************************* main loop implementation ******************** + +// +// Store all options from the command line +// +class params_t +{ +public: + char* dbfile; + char* find; + unsigned int bottom; + unsigned int count; + char* context; + int uniq_server; + int proximity; + int nop; + int exclude; +}; + +// +// Explain options +// +static void usage(); +// +// Torture WordExclude* classes +// +static void exclude_test(); + +int main(int ac, char **av) +{ + int c; + extern char *optarg; + params_t params; + + params.dbfile = strdup("test"); + params.find = 0; + params.bottom = 0; + params.count = 10; + params.context = 0; + params.uniq_server = 0; + params.proximity = WORD_SEARCH_DEFAULT_PROXIMITY; + params.nop = 0; + params.exclude = 0; + + while ((c = getopt(ac, av, "vB:f:b:c:C:SP:ne")) != -1) + { + switch (c) + { + case 'v': + verbose++; + break; + case 'B': + free(params.dbfile); + params.dbfile = strdup(optarg); + break; + case 'f': + params.find = strdup(optarg); + break; + case 'b': + params.bottom = (unsigned int)atoi(optarg); + break; + case 'c': + params.count = (unsigned int)atoi(optarg); + break; + case 'C': + params.context = strdup(optarg); + break; + case 'P': + params.proximity = atoi(optarg); + break; + case 'S': + params.uniq_server = SERVER; + break; + case 'n': + params.nop = 1; + break; + case 'e': + params.exclude = 1; + break; + case '?': + usage(); + break; + } + } + + if(params.exclude) { + exclude_test(); + exit(0); + } + + if(!params.find) + usage(); + + Configuration* config = WordContext::Initialize(); + if(!config) { + fprintf(stderr, "search: no config file found\n"); + exit(1); + } + + // + // Forward command line verbosity to htword library. + // + if(verbose > 1) { + String tmp; + tmp << (verbose - 1); + config->Add("wordlist_verbose", tmp); + } + + // + // Prepare the index (-B). + // + WordList words(*config); + words.Open(params.dbfile, O_RDONLY); + + // + // Try the query parser alone + // + if(params.nop) { + WordTree* expr = WordParser().Parse(params.find); + printf("%s\n", (char*)expr->Get()); + exit(0); + } + + // + // Build a syntax tree from the expression provided by user + // + WordTree* expr = WordParser().Parse(params.find); + + // + // Define the semantic of the key + // + { +#define DOCUMENT_LENGTH 3 + static int document[DOCUMENT_LENGTH] = { + TAG, + SERVER, + URL + }; + int document_length = DOCUMENT_LENGTH; + int location = LOCATION; + if(expr->Prepare(&words, params.uniq_server, params.proximity, document, document_length, location) != OK) + exit(1); + } + + WordSearch* search = new WordSearch(); + + // + // Forward query options to WordSearch object + // + search->limit_bottom = params.bottom; // -b + search->limit_count = params.count; // -c + if(params.context) // -C + search->context_in.set(params.context, strlen(params.context)); + + // + // Perform the search (-f) + // + search->expr = expr; + WordMatch* matches = search->Search(); + + // + // Display results, if any. + // + if(matches) { + int i; + for(i = 0; !matches[i].match.Empty(); i++) + printf("match: %s\n", (char*)matches[i].Get()); + const String& context = search->Context(); + if(!context.empty()) + printf("context: %s\n", (const char*)context); + delete [] matches; + } else { + printf("match: none\n"); + } + + // + // Cleanup + // + delete search; + if(params.context) free(params.context); + if(params.find) free(params.find); + if(params.dbfile) free(params.dbfile); + delete expr; + + words.Close(); + delete config; +} + +static void exclude_test() +{ + static unsigned int expected[] = { + 0x00000001, + 0x00000002, + 0x00000004, + 0x00000008, + 0x00000010, + 0x00000003, + 0x00000005, + 0x00000006, + 0x00000009, + 0x0000000a, + 0x0000000c, + 0x00000011, + 0x00000012, + 0x00000014, + 0x00000018, + 0x00000007, + 0x0000000b, + 0x0000000d, + 0x0000000e, + 0x00000013, + 0x00000015, + 0x00000016, + 0x00000019, + 0x0000001a, + 0x0000001c, + 0x0000000f, + 0x00000017, + 0x0000001b, + 0x0000001d, + 0x0000001e, + 0x0000001f + }; + + // + // WordExclude + // + if(verbose) fprintf(stderr, "exclude_test: testing WordExclude\n"); + { + WordExclude exclude; + exclude.Initialize(5); + int count = 0; + while(exclude.Next() == WORD_EXCLUDE_OK) { + if(expected[count] != exclude.Mask()) { + fprintf(stderr, "exclude_test: WordExclude iteration %d expected 0x%08x but got 0x%08x\n", count, expected[count], exclude.Mask()); + exit(1); + } + count++; + } + if(count != sizeof(expected)/sizeof(unsigned int)) { + fprintf(stderr, "exclude_test: WordExclude expected %d iterations but got %d\n", (int)(sizeof(expected)/sizeof(unsigned int)), count); + exit(1); + } + } + + // + // WordExcludeMask without ignore bits behaves exactly the same + // as WordExclude. + // + if(verbose) fprintf(stderr, "exclude_test: testing WordExcludeMask behaving like WordExclude\n"); + { + WordExcludeMask exclude; + exclude.Initialize(5, 0, 0); + int count = 0; + while(exclude.Next() == WORD_EXCLUDE_OK) { + if(expected[count] != exclude.Mask()) { + fprintf(stderr, "exclude_test: WordExcludeMask 1 iteration %d expected 0x%08x but got 0x%08x\n", count, expected[count], exclude.Mask()); + exit(1); + } + count++; + } + if(count != sizeof(expected)/sizeof(unsigned int)) { + fprintf(stderr, "exclude_test: WordExcludeMask 1 expected %d iterations but got %d\n", (int)(sizeof(expected)/sizeof(unsigned int)), count); + exit(1); + } + } + + // + // WordExcludeMask + // + if(verbose) fprintf(stderr, "exclude_test: testing WordExcludeMask\n"); + { + static unsigned int expected[] = { + 0x00000102, + 0x00000108, + 0x00000120, + 0x00000180, + 0x0000010a, + 0x00000122, + 0x00000128, + 0x00000182, + 0x00000188, + 0x000001a0, + 0x0000012a, + 0x0000018a, + 0x000001a2, + 0x000001a8, + 0x000001aa + }; + static unsigned int excluded[] = { + 1, + 0, + 0, + 0, + 1, + 1, + 0, + 1, + 0, + 0, + 1, + 1, + 1, + 0, + 1 + }; + + WordExcludeMask exclude; + unsigned int ignore = 0x155; + unsigned int ignore_mask = 0x100; + exclude.Initialize(9, ignore, ignore_mask); + if(verbose) { + fprintf(stderr, "exclude_test: ignore\n"); + show_bits(ignore); + fprintf(stderr, "exclude_test: ignore_mask\n"); + show_bits(ignore_mask); + } + if(exclude.NotExcludedCount() != 8) { + fprintf(stderr, "exclude_test: WordExcludeMask 2 expected NoExcludedCount = 8 but got %d\n", exclude.NotExcludedCount()); + exit(1); + } + int count = 0; + while(exclude.Next() == WORD_EXCLUDE_OK) { + if(expected[count] != exclude.Mask()) { + fprintf(stderr, "exclude_test: WordExcludeMask 2 iteration %d expected 0x%08x but got 0x%08x\n", count, expected[count], exclude.Mask()); + exit(1); + } + // + // Test Excluded() method on ignored bit + // Is bit 5 set ? (9 - 4) = 5 (counting from 1) + // + if(exclude.Excluded(4)) { + fprintf(stderr, "exclude_test: WordExcludeMask 2 iteration %d bit 5 was set 0x%08x\n", count, exclude.Mask()); + exit(1); + } + // + // Test Excluded() method on variable bit + // Is bit 2 set ? (9 - 2) = 7 (counting from 1) + // + if((exclude.Excluded(7) && !excluded[count]) || + (!exclude.Excluded(7) && excluded[count])) { + fprintf(stderr, "exclude_test: WordExcludeMask 2 iteration %d expected bit 2 %s but was %s in 0x%08x\n", count, (excluded[count] ? "set" : "not set"), (excluded[count] ? "not set" : "set"), expected[count]); + exit(1); + } + count++; + } + if(count != sizeof(expected)/sizeof(unsigned int)) { + fprintf(stderr, "exclude_test: WordExcludeMask 2 expected %d iterations but got %d\n", (int)(sizeof(expected)/sizeof(unsigned int)), count); + exit(1); + } + } + + { + WordExclude exclude; + String ascii("110101"); + String tmp; + exclude.Set(ascii); + exclude.Get(tmp); + if(tmp != ascii) { + fprintf(stderr, "exclude_test: WordExclude::Get/Set expected %s but got %s\n", (char*)ascii, (char*)tmp); + exit(1); + } + if(exclude.Mask() != 0x2b) { + fprintf(stderr, "exclude_test: WordExclude::Mask expected 0x2b but got 0x%02x\n", exclude.Mask()); + exit(1); + } + } + { + WordExcludeMask exclude; + String ascii("12031"); + String tmp; + exclude.Set(ascii); + exclude.Get(tmp); + if(tmp != ascii) { + fprintf(stderr, "exclude_test: WordExcludeMask::Get/Set expected %s but got %s\n", (char*)ascii, (char*)tmp); + exit(1); + } + if(exclude.Mask() != 0x19) { + fprintf(stderr, "exclude_test: WordExcludeMask::Mask expected 0x19 but got 0x%02x\n", exclude.Mask()); + exit(1); + } + } +} + +// ***************************************************************************** +// void usage() +// Display program usage information +// +static void usage() +{ + printf("usage:\tsearch -f words [options]\n"); + printf("\tsearch -e\n"); + printf("Options:\n"); + printf("\t-v\t\tIncreases the verbosity.\n"); + printf("\t-B dbfile\tUse <dbfile> as a db file name (default test).\n"); + printf("\t-f expr\t\tLisp like search expression.\n"); + printf("\t\t\tSee WordParser comments in source for more information.\n"); + printf("\t-b number\tSkip number documents before retrieving.\n"); + printf("\t-c number\tRetrieve number documents at most.\n"); + printf("\t-n\t\tOnly parse the search expression and print it.\n"); + printf("\t-P proximity\tUse with near/optional, proximity tolerance is <proximity>\n"); + printf("\t\t\tif negative order of terms is not meaningful\n"); + printf("\t\t\t(default 1).\n"); + printf("\t-C context\tResume search at <context>.\n"); + printf("\t-S\t\tReturn at most one match per server.\n"); + printf("\n"); + printf("\t-e\t\tRun tests on WordExclude and WordExcludeMask.\n"); + exit(1); +} |