diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htword/WordList.h')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htword/WordList.h | 372 |
1 files changed, 372 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordList.h b/debian/htdig/htdig-3.2.0b6/htword/WordList.h new file mode 100644 index 00000000..1aa87864 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordList.h @@ -0,0 +1,372 @@ +// +// WordList.h +// +// NAME +// +// manage and use an inverted index file. +// +// SYNOPSIS +// +// #include <mifluz.h> +// +// Configuration* config; +// WordReference wordRef; +// ... +// WordList* words = new WordList(config) +// +// delete words; +// +// DESCRIPTION +// +// WordList is the <i>mifluz</i> equivalent of a database handler. Each +// WordList object is bound to an inverted index file and implements the +// operations to create it, fill it with word occurrences and search +// for an entry matching a given criterion. +// +// CONFIGURATION +// +// wordlist_extend {true|false} (default false) +// If <b>true</b> maintain reference count of unique +// words. The <b>Noccurrence</b> method gives access to this count. +// +// wordlist_verbose <number> (default 0) +// Set the verbosity level of the WordList class. +// <br> +// 1 walk logic +// <br> +// 2 walk logic details +// <br> +// 3 walk logic lots of details +// +// wordlist_page_size <bytes> (default 8192) +// Berkeley DB page size (see Berkeley DB documentation) +// +// wordlist_cache_size <bytes> (default 500K) +// Berkeley DB cache size (see Berkeley DB documentation) +// Cache makes a huge difference in performance. It must be at least 2% +// of the expected total data size. Note that if compression is activated +// the data size is eight times larger than the actual file size. In this +// case the cache must be scaled to 2% of the data size, not 2% +// of the file size. See <b>Cache tuning</b> in the mifluz guide for +// more hints. +// +// wordlist_compress {true|false} (default false) +// Activate compression of the index. The resulting index is eight times +// smaller than the uncompressed index. +// +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordList.h,v 1.10 2004/05/28 13:15:28 lha Exp $ +// + +#ifndef _WordList_h_ +#define _WordList_h_ + +#include <fcntl.h> +#include <stdio.h> + +#ifndef SWIG +#include "Dictionary.h" +#include "List.h" +#include "htString.h" +#include "WordRecord.h" +#include "WordReference.h" +#include "WordType.h" +#include "WordDB.h" +#include "WordDBCompress.h" +#include "Configuration.h" +#include "WordCursor.h" +#endif /* SWIG */ + +class List; +class WordList; +class WordDBCursor; + +// +// Inverted index interface +// +class WordList +{ +public: + //- + // Constructor. Build inverted index handling object using + // run time configuration parameters listed in the <b>CONFIGURATION</b> + // section. + // + WordList(const Configuration& config_arg); + virtual ~WordList(); + + //- + // Insert <b>wordRef</b> in index. It is an error to insert + // the same <b>wordRef</b> twice. This requires a lookup in the index + // prior to the insertion. + // Returns OK on success, NOTOK on error. + // + int Insert(const WordReference& wordRef) { return Put(wordRef, DB_NOOVERWRITE); } + //- + // Insert <b>wordRef</b> in index. If the <i>Key()</i> part of + // the <b>wordRef</b> exists in the index, override it. + // Returns OK on success, NOTOK on error. + // + int Override(const WordReference& wordRef) { return Put(wordRef, 0); } +#ifndef SWIG + int Put(const WordReference& wordRef, int flags); +#endif /* SWIG */ + + //- + // Returns OK if <b>wordRef</b> exists in the index, NOTOK otherwise. + // + int Exists(const WordReference& wordRef) { return db.Exists(wordRef) == 0 ? OK : NOTOK; } +#ifndef SWIG + //- + // Returns OK if <b>word</b> exists in the index, NOTOK otherwise. + // + int Exists(const String& word) { return Exists(WordReference(word)); } +#endif /* SWIG */ + + // + // Delete permanently + // + //- + // Delete all entries in the index whose key matches the + // <i>Key()</i> part of <b>wordRef</b>, using the <i>Walk</i> + // method. + // Returns the number of entries successfully deleted. + // + int WalkDelete(const WordReference& wordRef); + //- + // Delete the entry in the index that exactly matches the + // <i>Key()</i> part of <b>wordRef.</b> + // Returns OK if deletion is successfull, NOTOK otherwise. + // + int Delete(const WordReference& wordRef) { + if(db.Del(wordRef) == 0) + return Unref(wordRef); + else + return NOTOK; + } +#ifdef SWIG +%name(DeleteCursor) +#endif /* SWIG */ + //- + // Delete the inverted index entry currently pointed to by the + // <b>cursor.</b> + // Returns 0 on success, Berkeley DB error code on error. This + // is mainly useful when implementing a callback function for + // a <b>WordCursor.</b> + // + int Delete(WordDBCursor& cursor) { return cursor.Del(); } + + //- + // Open inverted index <b>filename.</b> <b>mode</b> + // may be <i>O_RDONLY</i> or <i>O_RDWR.</i> If mode is + // <i>O_RDWR</i> it can be or'ed with <i>O_TRUNC</i> to reset + // the content of an existing inverted index. + // If word_only is true, entries will compare equal if the "word" part + // of the key is equal, even if the numeric fields aren't. (What are the + // numeric fields, anyway??) + // Return OK on success, NOTOK otherwise. + // + int Open(const String& filename, int mode, int word_only=false); + //- + // Close inverted index. + // + int Close(); + + // + // These returns a list of all the WordReference * matching + // the constraint. + //- + // Returns the list of word occurrences exactly matching the + // <i>Key()</i> part of <b>wordRef.</b> The <i>List</i> returned + // contains pointers to <i>WordReference</i> objects. It is + // the responsibility of the caller to free the list. See List.h + // header for usage. + // + List *Find(const WordReference& wordRef) { return (*this)[wordRef]; } + //- + // Returns the list of word occurrences exactly matching the + // <b>word.</b> The <i>List</i> returned + // contains pointers to <i>WordReference</i> objects. It is + // the responsibility of the caller to free the list. See List.h + // header for usage. + // + List *FindWord(const String& word) { return (*this)[word]; } +#ifndef SWIG + //- + // Alias to the <b>Find</b> method. + // + List *operator [] (const WordReference& wordRef); + //- + // Alias to the <b>FindWord</b> method. + // + List *operator [] (const String& word) { return (*this)[WordReference(word)]; } +#endif /* SWIG */ + //- + // Returns the list of word occurrences matching the <i>Key()</i> + // part of <b>wordRef.</b> In the <i>Key()</i>, the string + // (accessed with <i>GetWord()</i>) matches any string that begins + // with it. The <i>List</i> returned contains pointers to + // <i>WordReference</i> objects. It is the responsibility of the + // caller to free the list. + // + List *Prefix (const WordReference& prefix); +#ifndef SWIG + //- + // Returns the list of word occurrences matching the + // <b>word.</b> In the <i>Key()</i>, the string (accessed with + // <i>GetWord()</i>) matches any string that begins with it. The + // <i>List</i> returned contains pointers to <i>WordReference</i> + // objects. It is the responsibility of the caller to free the + // list. + // + List *Prefix (const String& prefix) { return this->Prefix(WordReference(prefix)); } +#endif /* SWIG */ + + // + // Iterate over the complete database. + // +#ifndef SWIG + //- + // Returns a list of all unique words contained in the inverted + // index. The <i>List</i> returned contains pointers to + // <i>String</i> objects. It is the responsibility of the caller + // to free the list. See List.h header for usage. + // + List *Words(); +#endif /* SWIG */ + //- + // Returns a list of all entries contained in the + // inverted index. The <i>List</i> returned contains pointers to + // <i>WordReference</i> objects. It is the responsibility of + // the caller to free the list. See List.h header for usage. + // + List *WordRefs(); + +#ifndef SWIG + //- + // Create a cursor that searches all the occurrences in the + // inverted index and call <b>ncallback</b> with + // <b>ncallback_data</b> for every match. + // + WordCursor *Cursor(wordlist_walk_callback_t callback, Object *callback_data) { return new WordCursor(this, callback, callback_data); } +#endif /* SWIG */ + //- + // Create a cursor that searches all the occurrences in the + // inverted index and that match <b>nsearchKey.</b> If + // <b>naction</b> is set to HTDIG_WORDLIST_WALKER calls + // <b>searchKey.callback</b> with <b>searchKey.callback_data</b> + // for every match. If <b>naction</b> is set to + // HTDIG_WORDLIST_COLLECT push each match in <b>searchKey.collectRes</b> + // data member as a <b>WordReference</b> object. It is the responsibility + // of the caller to free the <b>searchKey.collectRes</b> list. + // + WordCursor *Cursor(const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER) { return new WordCursor(this, searchKey, action); } +#ifndef SWIG + //- + // Create a cursor that searches all the occurrences in the + // inverted index and that match <b>nsearchKey</b> and calls + // <b>ncallback</b> with <b>ncallback_data</b> for every match. + // + WordCursor *Cursor(const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data) { return new WordCursor(this, searchKey, callback, callback_data); } +#endif /* SWIG */ + + // + // Update/get global word statistics statistics + // + //- + // Add one to the reference count for the string contained + // in the <i>Key().GetWord()</i> part of <b>wordRef.</b> + // Returns OK on success, NOTOK otherwise. + // + int Ref(const WordReference& wordRef); + //- + // Substract one to the reference count for the string contained + // in the <i>Key().GetWord()</i> part of <b>wordRef.</b> + // Returns OK on success, NOTOK otherwise. + // + int Unref(const WordReference& wordRef); +#ifndef SWIG + //- + // Return in <b>noccurrence</b> the number of occurrences of the + // string contained in the <i>GetWord()</i> part of <b>key.</b> + // Returns OK on success, NOTOK otherwise. + // + int Noccurrence(const WordKey& key, unsigned int& noccurrence) const; + + // + // Accessors + // + // + // Get the Berkeley DB object + // + const WordType& GetWordType() const { return wtype; } +#endif /* SWIG */ + //- + // Return the <i>Configuration</i> object used to initialize + // the <i>WordList</i> object. + // + const Configuration& GetConfiguration() const { return config; } + +#ifndef SWIG + // + // Input/Output + // + //- + // Write on file descriptor <b>f</b> an ASCII description of the + // index. Each line of the file contains a <i>WordReference</i> + // ASCII description. + // Returns 0 on success, not 0 otherwise. + // + int Write(FILE* f); + // + //- + // Read <i>WordReference</i> ASCII descriptions from <b>f</b>, + // returns the number of inserted WordReference or < 0 if an error + // occurs. Invalid descriptions are ignored as well as empty + // lines. + // + int Read(FILE* f); + +#endif /* SWIG */ + // + // Retrieve WordReferences from the database. + // Backend of WordRefs, operator[], Prefix... + // + List *Collect(const WordReference& word); +#ifndef SWIG + // + // Compressor object accessors + // + WordDBCompress *GetCompressor() { return compressor; } + void SetCompressor(WordDBCompress* compressor_arg) { compressor = compressor_arg; } + + const WordType wtype; + const Configuration& config; + + int isopen; + int isread; + + // + // If true enable extended functionalities of WordList such + // as per-word statistics. Read from wordlist_extended configuration + // parameter. + // + int extended; + + + WordDB db; + WordDBCompress *compressor; + int verbose; +#endif /* SWIG */ +}; + +#endif /* _WordList_h_ */ + |