diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc | 547 |
1 files changed, 547 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc new file mode 100644 index 00000000..97900cd3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc @@ -0,0 +1,547 @@ +// +// DocumentRef.cc +// +// DocumentRef: Reference to an indexed document. Keeps track of all +// information stored on the document, either by the dig +// or temporary search information. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: DocumentRef.cc,v 1.53 2004/05/28 13:15:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "DocumentRef.h" +#include "good_strtok.h" +#include "WordRecord.h" +#include "HtConfiguration.h" +#include "HtURLCodec.h" +#include "WordType.h" +#include "HtWordReference.h" +#include <stdlib.h> +#include <ctype.h> + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +// extern HtConfiguration config; + +//***************************************************************************** +// DocumentRef::DocumentRef() +// +DocumentRef::DocumentRef() +{ + Clear(); +} + + +//***************************************************************************** +// DocumentRef::~DocumentRef() +// +DocumentRef::~DocumentRef() +{ +} + + +//***************************************************************************** +// void DocumentRef::Clear() +// +void DocumentRef::Clear() +{ + docID = 0; + docURL = 0; + docTime = 0; + docAccessed = 0; + docHead = 0; + docHeadIsSet = 0; + docMetaDsc = 0; + docTitle = 0; + descriptions.Destroy(); + docState = Reference_normal; + docSize = 0; + docLinks = 0; + docBackLinks = 0; + docAnchors.Destroy(); + docHopCount = 0; + docSig = 0; + docEmail = 0; + docNotification = 0; + docSubject = 0; + docScore = 0; + docAnchor = 0; +} + +//***************************************************************************** +// void DocumentRef::DocState(int s) +// +void DocumentRef::DocState(int s) +{ + // You can't easily do this with a cast, so we'll use a switch + switch(s) + { + case 0: + docState = Reference_normal; + break; + case 1: + docState = Reference_not_found; + break; + case 2: + docState = Reference_noindex; + break; + case 3: + docState = Reference_obsolete; + break; + } +} + + +enum +{ + DOC_ID, // 0 + DOC_TIME, // 1 + DOC_ACCESSED, // 2 + DOC_STATE, // 3 + DOC_SIZE, // 4 + DOC_LINKS, // 5 + DOC_IMAGESIZE, // 6 -- No longer used + DOC_HOPCOUNT, // 7 + DOC_URL, // 8 + DOC_HEAD, // 9 + DOC_TITLE, // 10 + DOC_DESCRIPTIONS, // 11 + DOC_ANCHORS, // 12 + DOC_EMAIL, // 13 + DOC_NOTIFICATION, // 14 + DOC_SUBJECT, // 15 + DOC_STRING, // 16 + DOC_METADSC, // 17 + DOC_BACKLINKS, // 18 + DOC_SIG // 19 +}; + +// Must be powers of two never reached by the DOC_... enums. +#define CHARSIZE_MARKER_BIT 64 +#define SHORTSIZE_MARKER_BIT 128 + +//***************************************************************************** +// void DocumentRef::Serialize(String &s) +// Convert all the data in the object to a string. +// The data is in the string is tagged with +// +void DocumentRef::Serialize(String &s) +{ + int length; + String *str; + +// +// The following macros make the serialization process a little easier +// to follow. Note that if an object to be serialized has the default +// value for this class, it it NOT serialized. This means that +// storage will be saved... +// +#define addnum(id, out, var) \ + if (var != 0) \ + { \ + if (var <= (unsigned char) ~1) \ + { \ + unsigned char _tmp = var; \ + out << (char) (id | CHARSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + else if (var <= (unsigned short int) ~1) \ + { \ + unsigned short int _tmp = var; \ + out << (char) (id | SHORTSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + else \ + { \ + out << (char) id; \ + out.append((char *) &var, sizeof(var)); \ + } \ + } + +#define addstring(id, out, str) \ + if (str.length()) \ + { \ + length = str.length(); \ + if (length <= (unsigned char) ~1) \ + { \ + unsigned char _tmp = length; \ + out << (char) (id | CHARSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + else if (length <= (unsigned short int) ~1) \ + { \ + unsigned short int _tmp = length; \ + out << (char) (id | SHORTSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + else \ + { \ + out << (char) id; \ + out.append((char *) &length, sizeof(length)); \ + } \ + out.append(str); \ + } + +// To keep compatibility with old databases, don't bother +// with long lists at all. Bloat the size for long strings with +// one char to just keep a ~1 marker since we don't know the +// endianness; we don't know where to put a endian-safe +// size-marker, and we probably rather want the full char to +// keep the length. Only strings shorter than (unsigned char) ~1 +// will be "optimized"; trying to optimize strings that fit in +// (unsigned short) does not seem to give anything substantial. +#define addlist(id, out, list) \ + if (list.Count()) \ + { \ + length = list.Count(); \ + if (length <= (unsigned short int) ~1) \ + { \ + if (length <= (unsigned char) ~1) \ + { \ + unsigned char _tmp = length; \ + out << (char) (id | CHARSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + else \ + { \ + unsigned short int _tmp = length; \ + out << (char) (id | SHORTSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + list.Start_Get(); \ + while ((str = (String *) list.Get_Next())) \ + { \ + length = str->length(); \ + if (length < (unsigned char) ~1) \ + { \ + unsigned char _tmp = length; \ + out.append((char*) &_tmp, sizeof(_tmp)); \ + } \ + else \ + { \ + unsigned char _tmp = ~1; \ + out.append((char*) &_tmp, sizeof(_tmp)); \ + out.append((char*) &length, sizeof(length)); \ + } \ + out.append(*str); \ + } \ + } \ + else \ + { \ + out << (char) id; \ + out.append((char *) &length, sizeof(length)); \ + list.Start_Get(); \ + while ((str = (String *) list.Get_Next())) \ + { \ + length = str->length(); \ + out.append((char*) &length, sizeof(length)); \ + out.append(*str); \ + } \ + } \ + } + + addnum(DOC_ID, s, docID); + addnum(DOC_TIME, s, docTime); + addnum(DOC_ACCESSED, s, docAccessed); + addnum(DOC_STATE, s, docState); + addnum(DOC_SIZE, s, docSize); + addnum(DOC_LINKS, s, docLinks); + addnum(DOC_BACKLINKS, s, docBackLinks); + addnum(DOC_HOPCOUNT, s, docHopCount); + addnum(DOC_SIG, s, docSig); + + // Use a temporary since the addstring macro will evaluate + // this multiple times. + String tmps = HtURLCodec::instance()->encode(docURL); + addstring(DOC_URL, s, tmps); + // This is done in the DocumentDB code through the excerpt database + // addstring(DOC_HEAD, s, docHead); + addstring(DOC_METADSC, s, docMetaDsc); + addstring(DOC_TITLE, s, docTitle); + + addlist(DOC_DESCRIPTIONS, s, descriptions); + addlist(DOC_ANCHORS, s, docAnchors); + + addstring(DOC_EMAIL, s, docEmail); + addstring(DOC_NOTIFICATION, s, docNotification); + addstring(DOC_SUBJECT, s, docSubject); +} + + +//***************************************************************************** +// void DocumentRef::Deserialize(String &stream) +// Extract the contents of our private variables from the given +// character string. The character string is expected to have been +// created using the Serialize member. +// +void DocumentRef::Deserialize(String &stream) +{ + Clear(); + char *s = stream.get(); + char *end = s + stream.length(); + int length; + int count; + int i; + int x; + int throwaway; // As the name sounds--used for old fields + String *str; + +// There is a problem with getting a numeric value into a +// numeric unknown type that may be an enum (the other way +// around is simply by casting (int)). +// Supposedly the enum incarnates as a simple type, so we can +// just check the size and copy the bits. +#define MEMCPY_ASSIGN(to, from, type) \ + do { \ + type _tmp = (type) (from); \ + memcpy((char *) &(to), (char *) &_tmp, sizeof(to)); \ + } while (0) + +#define NUM_ASSIGN(to, from) \ + do { \ + if (sizeof(to) == sizeof(unsigned long int)) \ + MEMCPY_ASSIGN(to, from, unsigned long int); \ + else if (sizeof(to) == sizeof(unsigned int)) \ + MEMCPY_ASSIGN(to, from, unsigned int); \ + else if (sizeof(to) == sizeof(unsigned short int)) \ + MEMCPY_ASSIGN(to, from, unsigned short int); \ + else if (sizeof(to) == sizeof(unsigned char)) \ + MEMCPY_ASSIGN(to, from, unsigned char); \ + /* else fatal error here? */ \ + } while (0) + +#define getnum(type, in, var) \ + if (type & CHARSIZE_MARKER_BIT) \ + { \ + NUM_ASSIGN(var, *(unsigned char *) in); \ + in += sizeof(unsigned char); \ + } \ + else if (type & SHORTSIZE_MARKER_BIT) \ + { \ + unsigned short int _tmp0; \ + memcpy((char *) &_tmp0, (char *) (in), sizeof(unsigned short)); \ + NUM_ASSIGN(var, _tmp0); \ + in += sizeof(unsigned short int); \ + } \ + else \ + { \ + memcpy((char *) &var, in, sizeof(var)); \ + in += sizeof(var); \ + } + +#define getstring(type, in, str) \ + getnum(type, in, length); \ + str = 0; \ + str.append(in, length); \ + in += length + +#define getlist(type, in, list) \ + getnum(type, in, count); \ + if (type & (CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT)) \ + { \ + for (i = 0; i < count; i++) \ + { \ + unsigned char _tmp = *(unsigned char *) in; \ + in += sizeof(_tmp); \ + if (_tmp < (unsigned char) ~1) \ + length = _tmp; \ + else \ + getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \ + length); \ + str = new String; \ + str->append(in, length); \ + list.Add(str); \ + in += length; \ + } \ + } \ + else \ + { \ + for (i = 0; i < count; i++) \ + { \ + getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \ + length); \ + str = new String; \ + str->append(in, length); \ + list.Add(str); \ + in += length; \ + } \ + } + + while (s < end) + { + x = (unsigned char) *s++; + switch (x & ~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT)) + { + case DOC_ID: + getnum(x, s, docID); + break; + case DOC_TIME: + getnum(x, s, docTime); + break; + case DOC_ACCESSED: + getnum(x, s, docAccessed); + break; + case DOC_STATE: + getnum(x, s, docState); + break; + case DOC_SIZE: + getnum(x, s, docSize); + break; + case DOC_IMAGESIZE: // No longer used + getnum(x, s, throwaway); + break; + case DOC_LINKS: + getnum(x, s, docLinks); + break; + case DOC_HOPCOUNT: + getnum(x, s, docHopCount); + break; + case DOC_BACKLINKS: + getnum(x, s, docBackLinks); + break; + case DOC_SIG: + getnum(x, s, docSig); + break; + case DOC_URL: + { + // Use a temporary since the addstring macro will evaluate + // this multiple times. + String tmps; + getstring(x, s, tmps); + + docURL = HtURLCodec::instance()->decode(tmps); + } + break; + case DOC_HEAD: + getstring(x, s, docHead); docHeadIsSet = 1; + break; + case DOC_METADSC: + getstring(x, s, docMetaDsc); + break; + case DOC_TITLE: + getstring(x, s, docTitle); + break; + case DOC_DESCRIPTIONS: + getlist(x, s, descriptions); + break; + case DOC_ANCHORS: + getlist(x, s, docAnchors); + break; + case DOC_EMAIL: + getstring(x, s, docEmail); + break; + case DOC_NOTIFICATION: + getstring(x, s, docNotification); + break; + case DOC_SUBJECT: + getstring(x, s, docSubject); + break; + case DOC_STRING: + // This is just a debugging string. Ignore it. + break; + default: + cerr << "BAD TAG IN SERIALIZED DATA: " << x << endl; + return; + } + } +} + + +//***************************************************************************** +// void DocumentRef::AddDescription(char *d, HtWordList &words) +// +void DocumentRef::AddDescription(const char *d, HtWordList &words) +{ + if (!d || !*d) + return; + + while (isspace(*d)) + d++; + + if (!d || !*d) + return; + + String desc = d; + desc.chop(" \t"); + + // Add the description text to the word database with proper factor + // Do this first because we may have reached the max_description limit + // This also ensures we keep the proper weight on descriptions + // that occur many times + + // Parse words. + char *p = desc; + HtConfiguration* config= HtConfiguration::config(); + static int minimum_word_length = config->Value("minimum_word_length", 3); + static int max_descriptions = config->Value("max_descriptions", 5); + + String word; + HtWordReference wordRef; + wordRef.Flags(FLAG_LINK_TEXT); + wordRef.DocID(docID); + + while (*p) + { + // Reset contents before adding chars each round. + word = 0; + + while (*p && HtIsWordChar(*p)) + word << *p++; + + HtStripPunctuation(word); + + if (word.length() >= minimum_word_length) { + // The wordlist takes care of lowercasing; just add it. + wordRef.Location((p - (char*)desc) - word.length()); + wordRef.Word(word); + words.Replace(wordRef); + } + + while (*p && !HtIsStrictWordChar(*p)) + p++; + } + + // And let's flush the words! (nice comment hu :-) + words.Flush(); + + // Now are we at the max_description limit? + if (descriptions.Count() >= max_descriptions) + return; + + descriptions.Start_Get(); + String *description; + while ((description = (String *) descriptions.Get_Next())) + { + if (mystrcasecmp(description->get(), (char*)desc) == 0) + return; + } + descriptions.Add(new String(desc)); +} + + +//***************************************************************************** +// void DocumentRef::AddAnchor(char *a) +// +void DocumentRef::AddAnchor(const char *a) +{ + if (a) + docAnchors.Add(new String(a)); +} + + |