summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/htword/WordList.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htword/WordList.cc')
-rw-r--r--debian/htdig/htdig-3.2.0b6/htword/WordList.cc436
1 files changed, 436 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordList.cc b/debian/htdig/htdig-3.2.0b6/htword/WordList.cc
new file mode 100644
index 00000000..566acb93
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htword/WordList.cc
@@ -0,0 +1,436 @@
+//
+// WordList.cc
+//
+// WordList: Interface to the word database. Previously, this wrote to
+// a temporary text file. Now it writes directly to the
+// word database.
+// NOTE: Some code previously attempted to directly read from
+// the word db. This will no longer work, so it's preferred to
+// use the access methods here.
+// Configuration parameter used:
+// wordlist_extend
+// wordlist_verbose 1 walk logic
+// wordlist_verbose 2 walk logic details
+// wordlist_verbose 3 walk logic lots of details
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1999-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: WordList.cc,v 1.13 2004/05/28 13:15:27 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "WordList.h"
+#include "WordReference.h"
+#include "WordRecord.h"
+#include "WordType.h"
+#include "WordStat.h"
+#include "Configuration.h"
+#include "htString.h"
+#include "HtPack.h"
+#include "HtTime.h"
+#include "WordDBCompress.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+
+// *****************************************************************************
+//
+// Build a word list configured from config_arg. The database itself is
+// not opened here, see Open().
+WordList::WordList(const Configuration& config_arg) :
+    wtype(config_arg),
+    config(config_arg)
+{
+    // Tunables read once from the configuration
+    verbose = config.Value("wordlist_verbose");
+    extended = config.Boolean("wordlist_extend");
+
+    // Nothing is open yet and no compressor has been allocated
+    compressor = 0;
+    isread = 0;
+    isopen = 0;
+}
+
+// *****************************************************************************
+//
+// Tear down the word list by delegating to Close().
+WordList::~WordList()
+{
+    // The status returned by Close() cannot be reported from a destructor
+    (void)Close();
+}
+
+// *****************************************************************************
+//
+// Open the word database stored in filename.
+//
+// mode uses open(2) style flags: when O_RDWR is set the database is
+// opened with DB_CREATE, otherwise with DB_RDONLY. O_TRUNC adds
+// DB_TRUNCATE and is only honoured together with O_RDWR.
+// word_only selects the comparison function, see below.
+//
+// Returns OK if db.Open succeeded, NOTOK otherwise.
+//
+int WordList::Open(const String& filename, int mode, int word_only)
+{
+    // If word_only, entries compare equal if the "word" part matches.
+    // This should only be used for querying the database, not writing it.
+    // It is needed by speling to test for the existence of words.
+    db.set_bt_compare(word_only ? word_only_db_cmp : word_db_cmp);
+
+    // Only override the page size when the configuration asks for one
+    const int page_size = config.Value("wordlist_page_size", 0);
+    if(page_size)
+        db.set_pagesize(page_size);
+
+    int usecompress = 0;
+    if(config.Boolean("wordlist_compress") == 1) {
+        usecompress = DB_COMPRESS;
+        // Deliberately not called "compressor" so that the member of the
+        // same name is not shadowed.
+        WordDBCompress* new_compressor = new WordDBCompress(
+            config.Boolean("wordlist_compress_zlib", 0), config.Value("compression_level", 0));
+
+        // new_compressor->debug = config.Value("wordlist_compress_debug");
+        SetCompressor(new_compressor);
+        db.CmprInfo(new_compressor->CmprInfo());
+    }
+
+    const int writable = (mode & O_RDWR) != 0;
+    int flags = writable ? DB_CREATE : DB_RDONLY;
+    if(mode & O_TRUNC) {
+        if(writable)
+            flags |= DB_TRUNCATE;
+        else
+            fprintf(stderr, "WordList::Open: O_TRUNC | O_RDONLY is meaningless\n");
+    }
+    flags |= usecompress;
+
+    int ret = db.Open(filename, DB_BTREE, flags, 0666) == 0 ? OK : NOTOK;
+
+    // O_RDONLY is 0 on POSIX systems, so "mode & O_RDONLY" can never be
+    // true there. Use the same O_RDWR test that selected the db flags.
+    isread = writable ? 0 : 1;
+    // NOTE(review): isopen is set even when db.Open failed, as before, so
+    // that Close() still calls db.Close() on the handle.
+    isopen = 1;
+
+    return ret;
+}
+
+// *****************************************************************************
+//
+// Close the database if it is open and release the compressor, if any.
+//
+// Returns NOTOK if db.Close() reported an error, OK otherwise. The word
+// list is marked closed and the compressor is freed in both cases, so a
+// failed close is not retried (e.g. from the destructor) and does not
+// leak the compressor.
+//
+int WordList::Close()
+{
+    int ret = OK;
+
+    if(isopen) {
+        if(db.Close() != 0) ret = NOTOK;
+        isopen = 0;
+        isread = 0;
+    }
+
+    // The compressor is released after the database has been closed
+    WordDBCompress* current = GetCompressor();
+    if(current) {
+        delete current;
+        SetCompressor(0);
+    }
+
+    return ret;
+}
+
+// ****************************************************************************
+//
+// Store arg in the database after normalizing its word.
+//
+// flags == 0 overwrites any existing entry. If flags contains
+// DB_NOOVERWRITE an existing entry is left untouched and NOTOK is
+// returned. Any other non zero flags value inserts the entry if it is
+// missing and overrides it otherwise.
+//
+// Returns OK on success, NOTOK if the word is empty, the key is not
+// fully defined, normalization rejects the word or the database
+// operation fails.
+//
+int WordList::Put(const WordReference& arg, int flags)
+{
+    if (arg.Key().GetWord().length() == 0) {
+        fprintf(stderr, "WordList::Put(%s) word is zero length\n", (char*)arg.Get());
+        return NOTOK;
+    }
+    if (!arg.Key().Filled()) {
+        fprintf(stderr, "WordList::Put(%s) key is not fully defined\n", (char*)arg.Get());
+        return NOTOK;
+    }
+
+    // Work on a copy so that the normalized word can be stored in the key
+    WordReference wordRef(arg);
+    String word = wordRef.Key().GetWord();
+    if(wtype.Normalize(word) & WORD_NORMALIZE_NOTOK)
+        return NOTOK;
+    wordRef.Key().SetWord(word);
+
+    //
+    // The two cases could be grouped in a more compact way.
+    // However, the resources consumption difference between
+    // a Put(DB_NOOVERWRITE) and Put(0) is huge (the first is 75%
+    // slower than the second). Check the db_put sources for the
+    // explanation.
+    //
+    if(flags == 0) {
+        // Always report failures as NOTOK, never as a raw db error code
+        if(db.Put(wordRef, 0) != 0)
+            return NOTOK;
+        return Ref(wordRef);
+    }
+
+    //
+    // First attempt tells us if the key exists. If it
+    // does not we just increment the reference count.
+    // Otherwise, and only if flags does not contain DB_NOOVERWRITE,
+    // we override the key/record pair.
+    //
+    const int error = db.Put(wordRef, DB_NOOVERWRITE);
+    if(error == 0)
+        return Ref(wordRef);
+    if(error == DB_KEYEXIST && !(flags & DB_NOOVERWRITE))
+        return db.Put(wordRef, 0) == 0 ? OK : NOTOK;
+
+    return NOTOK;
+}
+
+
+// *****************************************************************************
+//
+// Shorthand for Collect(wordRef).
+List *WordList::operator [] (const WordReference& wordRef)
+{
+    List* matches = Collect(wordRef);
+    return matches;
+}
+
+// *****************************************************************************
+//
+// Collect the entries matching prefix once the word suffix of its key
+// has been marked undefined. The argument itself is left untouched.
+List *WordList::Prefix (const WordReference& prefix)
+{
+    // Work on a private copy, the caller's reference is const
+    WordReference pattern(prefix);
+    pattern.Key().UndefinedWordSuffix();
+
+    List* matches = Collect(pattern);
+    return matches;
+}
+
+// *****************************************************************************
+//
+// Collect using a default constructed WordReference as search key.
+List *WordList::WordRefs()
+{
+    WordReference everything;
+    return Collect(everything);
+}
+
+// *****************************************************************************
+//
+// Walk the database with a collecting cursor built from the key of
+// wordRef and return the list of results.
+//
+// Returns 0 if the walk fails. The cursor is deleted in every case; the
+// returned list outlives it and is handed over to the caller.
+//
+List *WordList::Collect(const WordReference& wordRef)
+{
+    WordCursor *search = Cursor(wordRef.Key(), HTDIG_WORDLIST_COLLECTOR);
+    List* result = 0;
+    if(search->Walk() == OK)
+        result = search->GetResults();
+    // Free the cursor on the failure path as well, it used to leak there
+    delete search;
+    return result;
+}
+
+// *****************************************************************************
+//
+// Callback data dedicated to WalkDelete and delete_word communication
+//
+class DeleteWordData : public Object
+{
+public:
+    // Start with nothing deleted
+    DeleteWordData() : count(0) { }
+
+    // Incremented by delete_word for each record it deletes
+    int count;
+};
+
+// *****************************************************************************
+//
+//
+// Walk callback: delete the record under cursor, decrement the
+// reference count of word and bump the counter held in data (a
+// DeleteWordData). Returns OK, or NOTOK if the deletion failed.
+static int delete_word(WordList *words, WordDBCursor &cursor, const WordReference *word, Object &data)
+{
+    // Handle the failure first so that the normal path reads straight through
+    if(words->Delete(cursor) != 0) {
+        fprintf(stderr, "WordList delete_word: deleting %s failed\n", (char*)word->Get());
+        return NOTOK;
+    }
+
+    // The status of Unref is not taken into account
+    words->Unref(*word);
+
+    DeleteWordData& tally = (DeleteWordData&)data;
+    tally.count++;
+    return OK;
+}
+
+// *****************************************************************************
+//
+// Delete all records matching wordRef, return the number of
+// deleted records.
+//
+int WordList::WalkDelete(const WordReference& wordRef)
+{
+    // delete_word increments deleted.count for every record it removes
+    DeleteWordData deleted;
+
+    WordCursor *walker = Cursor(wordRef.Key(), delete_word, &deleted);
+    // The status of the walk is not reported, only the count is
+    walker->Walk();
+    delete walker;
+
+    return deleted.count;
+}
+
+// *****************************************************************************
+//
+//
+// Return a newly allocated list holding one String per run of
+// consecutive entries sharing the same word, in cursor order.
+// Returns 0 if the cursor cannot be opened or positioned.
+List *WordList::Words()
+{
+    String key;
+    String record;
+    WordDBCursor cursor;
+
+    if(cursor.Open(db.db) != 0) return 0;
+
+    //
+    // Move past the first word count record
+    //
+    const WordReference& last = WordStat::Last();
+    last.Pack(key, record);
+    if(cursor.Get(key, record, DB_SET_RANGE) != 0)
+        return 0;
+
+    List *words = new List;
+    WordReference previous;
+    int more = 1;
+    while(more) {
+        WordReference current(key, record);
+        // previous has an empty word until the first entry was recorded
+        const int first = previous.Key().GetWord().empty();
+        if(first || current.Key().GetWord() != previous.Key().GetWord()) {
+            words->Add(new String(current.Key().GetWord()));
+            previous = current;
+        }
+        more = (cursor.Get(key, record, DB_NEXT) == 0);
+    }
+
+    return words;
+}
+
+// *****************************************************************************
+//
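+// Stores the occurrence count of the word of key in the noccurrence
+// argument (0 if the word has no count record). Returns OK, or NOTOK
+// if the database lookup fails for another reason.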
+//
+int WordList::Noccurrence(const WordKey& key, unsigned int& noccurrence) const
+{
+    // Reported as is when no record exists for the word
+    noccurrence = 0;
+
+    WordStat stat(key.GetWord());
+    const int status = db.Get(stat);
+    if(status == 0) {
+        noccurrence = stat.Noccurrence();
+        return OK;
+    }
+
+    // A missing record is not an error, anything else is
+    return status == DB_NOTFOUND ? OK : NOTOK;
+}
+
+// *****************************************************************************
+//
+// Increment reference count for wordRef
+//
+int WordList::Ref(const WordReference& wordRef)
+{
+    // Counts are only maintained when wordlist_extend is set
+    if(extended) {
+        WordStat stat(wordRef.Key().GetWord());
+        const int status = db.Get(stat);
+        // A missing record is fine: the count then starts from whatever
+        // value the WordStat constructor gave it.
+        if(!(status == 0 || status == DB_NOTFOUND))
+            return NOTOK;
+
+        stat.Noccurrence()++;
+
+        if(db.Put(stat, 0) != 0)
+            return NOTOK;
+    }
+
+    return OK;
+}
+
+// *****************************************************************************
+//
+// Decrement reference count for wordRef
+//
+int WordList::Unref(const WordReference& wordRef)
+{
+    // Counts are only maintained when wordlist_extend is set
+    if(!extended) return OK;
+
+    WordStat stat(wordRef.Key().GetWord());
+    const int status = db.Get(stat);
+    if(status != 0) {
+        // Only a missing record gets a message, every failure is NOTOK
+        if(status == DB_NOTFOUND)
+            fprintf(stderr, "WordList::Unref(%s) Unref on non existing word occurrence\n", (char*)wordRef.Get());
+        return NOTOK;
+    }
+
+    if(stat.Noccurrence() == 0) {
+        fprintf(stderr, "WordList::Unref(%s) Unref on 0 occurrences word\n", (char*)wordRef.Get());
+        return NOTOK;
+    }
+
+    stat.Noccurrence()--;
+
+    // Keep the record while the word is still referenced, drop it otherwise
+    if(stat.Noccurrence() > 0)
+        return db.Put(stat, 0) == 0 ? OK : NOTOK;
+
+    return db.Del(stat) == 0 ? OK : NOTOK;
+}
+
+
+// *****************************************************************************
+//
+// streaming operators for ascii dumping and reading a list
+class FileOutData : public Object
+{
+public:
+    FileOutData(FILE* f_arg)
+    {
+        f = f_arg;
+    }
+
+    // Stream written to by the walk callback; this class never closes it
+    FILE* f;
+};
+
+// *****************************************************************************
+//
+// Walk callback: print the ascii form of word, followed by a newline,
+// on the stream held by data (a FileOutData). Always returns OK.
+static int
+wordlist_walk_callback_file_out(WordList *, WordDBCursor& , const WordReference *word, Object &data)
+{
+    FileOutData& out = (FileOutData&)data;
+    fprintf(out.f, "%s\n", (char*)word->Get());
+    return OK;
+}
+
+// *****************************************************************************
+//
+// Dump the entries to f, one per line, by walking the database with a
+// default constructed key. Always returns 0.
+int
+WordList::Write(FILE* f)
+{
+    FileOutData sink(f);
+    WordKey everything;
+
+    WordCursor *walker = Cursor(everything, wordlist_walk_callback_file_out, (Object *)&sink);
+    // The status of the walk is not reported
+    walker->Walk();
+    delete walker;
+
+    return 0;
+}
+
+// *****************************************************************************
+//
+// Read entries from f, one per line, and insert them.
+//
+// Lines longer than the internal buffer are reassembled, a line ending
+// with a \ is joined with the next one and empty lines are skipped. A
+// last line without trailing newline is handled like any other line.
+// Lines that cannot be parsed or inserted are reported on stderr and
+// ignored.
+//
+// Returns the number of entries inserted.
+//
+int
+WordList::Read(FILE* f)
+{
+    WordReference word;
+#define WORD_BUFFER_SIZE 1024
+    char buffer[WORD_BUFFER_SIZE + 1];
+    String line;
+    int line_number = 0;
+    int inserted = 0;
+    // True when the next chunk read starts a new physical line, so that
+    // long lines read in several chunks are only counted once.
+    int at_line_start = 1;
+    int at_eof = 0;
+
+    while(!at_eof) {
+        if(fgets(buffer, WORD_BUFFER_SIZE, f)) {
+            if(at_line_start) line_number++;
+
+            int buffer_length = strlen(buffer);
+            // buffer_length is 0 if the chunk starts with a NUL byte,
+            // do not look at buffer[-1] in that case.
+            int eol = buffer_length > 0 && buffer[buffer_length - 1] == '\n';
+            at_line_start = eol;
+
+            if(eol) buffer[--buffer_length] = '\0';
+
+            line.append(buffer, buffer_length);
+            //
+            // Join big lines
+            //
+            if(!eol) continue;
+            //
+            // If line ends with a \ continue
+            //
+            if(line.last() == '\\') {
+                line.chop(1);
+                continue;
+            }
+        } else {
+            //
+            // Nothing more to read: fall through once so that a last
+            // line without trailing newline is not lost.
+            //
+            at_eof = 1;
+        }
+
+        if(!line.empty()) {
+            if(word.Set(line) != OK) {
+                fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
+                fprintf(stderr, " cannot build WordReference (ignored)\n");
+            } else {
+                if(Insert(word) != OK) {
+                    fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
+                    fprintf(stderr, " insert failed (ignored)\n");
+                } else {
+                    inserted++;
+                }
+                if(verbose) fprintf(stderr, "WordList::Read: inserting %s\n", (char*)word.Get());
+            }
+
+            line.trunc();
+        }
+    }
+    return inserted;
+}