summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.h
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.h')
-rw-r--r--debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.h114
1 files changed, 114 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.h b/debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.h
new file mode 100644
index 00000000..0f5c1973
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.h
@@ -0,0 +1,114 @@
+//
+// WordDBCompress.h
+//
+// WordDBCompress: Implements specific compression scheme for
+// Berkeley DB pages containing WordReferences objects.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1999-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: WordDBCompress.h,v 1.6 2004/05/28 13:15:26 lha Exp $
+//
+
+#ifndef _WordDBCompress_h_
+#define _WordDBCompress_h_
+
+// ***********************************************
+// *************** WordDBCompress*****************
+// ***********************************************
+// Starting point for compression.
+//
+//
+// Compression: HOW IT WORKS
+//
+// ** General outline:
+//
+// BerkeleyDB pages are stored in a memory pool. When the memory pool
+// is full, least recently used pages are swapped to disk. Page
+// compression occurs at page in/out level. The
+// WordDBCompress_compress_c functions are C callbacks that are called
+// by the page compression code in BerkeleyDB. The C callbacks then
+// call the WordDBCompress compress/uncompress methods. The
+// WordDBCompress creates a WordDBPage which does the actual
+// compress/uncompress job.
+//
+// The WordDBPage compression/uncompression methods store/retrieve data
+// from a bitstream. BitStream is a simple bitstream, and Compressor is
+// a bitstream with added compression capabilities.
+//
+
+// Compression algorithm.
+//
+// Most DB pages are full of really redundant data. Mifluz's choice of using
+// one db entry per word makes the DB pages even more redundant.
+// But this choice also makes the pages have a very simple structure.
+//
+// Here is a real world example of what a page can look like:
+// (key structure: word + 4 numerical fields)
+//
+// "trois" 1 4482 1 10b
+// "trois" 1 4482 1 142
+// "trois" 1 4484 1 40
+// "trois" 1 449f 1 11e
+// "trois" 1 4545 1 11
+// "trois" 1 45d3 1 545
+// "trois" 1 45e0 1 7e5
+// "trois" 1 45e2 1 830
+// "trois" 1 45e8 1 545
+// "trois" 1 45fe 1 ec
+// "trois" 1 4616 1 395
+// "trois" 1 461a 1 1eb
+// "trois" 1 4631 1 49
+// "trois" 1 4634 1 48
+// .... etc ....
+//
+// To compress we chose to only code differences between successive entries.
+//
+// Differences in words are coded by 2 numbers and some letters:
+// - the position within the word of the first letter that changes
+// - the size of the new suffix
+// - the letters in the new suffix
+//
+// Only differences in successive numerical entries are stored.
+//
+// A flag is stored for each entry indicating which fields have changed.
+//
+// All this gives us a few numerical arrays which are themselves compressed
+// and sent to the bitstream.
+// WordDBCompress: starting point for page compression; as outlined above,
+// it creates a WordDBPage which does the actual compress/uncompress job.
+class WordDBCompress
+{
+ public:
+ WordDBCompress();
+ WordDBCompress(int, int);
+
+ int Compress(const u_int8_t* inbuff, int inbuff_length, u_int8_t** outbuffp, int* outbuff_lengthp);
+ int Uncompress(const u_int8_t* inbuff, int inbuff_length, u_int8_t* outbuff, int outbuff_length);
+
+ //
+ // Return a new DB_CMPR_INFO initialized with the characteristics of
+ // this object, suitable for use as the WordDB::CmprInfo argument.
+ //
+ DB_CMPR_INFO *CmprInfo();
+
+ private:
+ DB_CMPR_INFO *cmprInfo;
+
+ // zlib compression flags
+ int use_zlib;
+ int zlib_level;
+
+// DEBUGGING / BENCHMARKING -- meaning of each debug level:
+ int debug;
+// 0 : no debug, no check
+// 1 : TestCompress before each compression (but no debug within Compress/Uncompress)
+// 2 : use_tags (BitStream) within TestCompress -> Compress/Uncompress
+// 3 : verbose
+ int TestCompress(const u_int8_t* pagebuff, int pagebuffsize);
+};
+
+#endif