summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc')
-rw-r--r--debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc547
1 files changed, 547 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc
new file mode 100644
index 00000000..97900cd3
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc
@@ -0,0 +1,547 @@
+//
+// DocumentRef.cc
+//
+// DocumentRef: Reference to an indexed document. Keeps track of all
+// information stored on the document, either by the dig
+// or temporary search information.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: DocumentRef.cc,v 1.53 2004/05/28 13:15:12 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "DocumentRef.h"
+#include "good_strtok.h"
+#include "WordRecord.h"
+#include "HtConfiguration.h"
+#include "HtURLCodec.h"
+#include "WordType.h"
+#include "HtWordReference.h"
+#include <stdlib.h>
+#include <ctype.h>
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+// extern HtConfiguration config;
+
+//*****************************************************************************
+// DocumentRef::DocumentRef()
+//   Builds an empty reference: the fields are put into the state
+//   established by Clear().
+//
+DocumentRef::DocumentRef()
+{
+    this->Clear();
+}
+
+
+//*****************************************************************************
+// DocumentRef::~DocumentRef()
+//   Performs no explicit work of its own.
+//
+DocumentRef::~DocumentRef()
+{
+    // Intentionally empty.
+}
+
+
+//*****************************************************************************
+// void DocumentRef::Clear()
+//   Puts the reference back into its default state: the members below
+//   are assigned 0, both lists are emptied with Destroy(), and the
+//   state becomes Reference_normal.
+//
+void DocumentRef::Clear()
+{
+    // Identity and state.
+    docID = 0;
+    docURL = 0;
+    docState = Reference_normal;
+    docSig = 0;
+
+    // Timestamps, size and link statistics.
+    docTime = 0;
+    docAccessed = 0;
+    docSize = 0;
+    docLinks = 0;
+    docBackLinks = 0;
+    docHopCount = 0;
+
+    // Textual content.
+    docHead = 0;
+    docHeadIsSet = 0;
+    docMetaDsc = 0;
+    docTitle = 0;
+
+    // Lists.
+    descriptions.Destroy();
+    docAnchors.Destroy();
+
+    // Notification data.
+    docEmail = 0;
+    docNotification = 0;
+    docSubject = 0;
+
+    // Fields that Serialize() does not write.
+    docScore = 0;
+    docAnchor = 0;
+}
+
+//*****************************************************************************
+// void DocumentRef::DocState(int s)
+//   Sets docState from its integer code:
+//     0 -> Reference_normal      1 -> Reference_not_found
+//     2 -> Reference_noindex     3 -> Reference_obsolete
+//   Any other value leaves docState as it was.
+//
+void DocumentRef::DocState(int s)
+{
+    // An explicit mapping is used instead of casting the int to the enum.
+    if (s == 0)
+        docState = Reference_normal;
+    else if (s == 1)
+        docState = Reference_not_found;
+    else if (s == 2)
+        docState = Reference_noindex;
+    else if (s == 3)
+        docState = Reference_obsolete;
+}
+
+
+enum // Field ids used as entry tags by Serialize()/Deserialize(); the numeric values appear in serialized records.
+{
+ DOC_ID, // 0 - docID (number)
+ DOC_TIME, // 1 - docTime (number)
+ DOC_ACCESSED, // 2 - docAccessed (number)
+ DOC_STATE, // 3 - docState (number)
+ DOC_SIZE, // 4 - docSize (number)
+ DOC_LINKS, // 5 - docLinks (number)
+ DOC_IMAGESIZE, // 6 -- No longer used (Deserialize reads the value and discards it)
+ DOC_HOPCOUNT, // 7 - docHopCount (number)
+ DOC_URL, // 8 - docURL (string, passed through HtURLCodec encode()/decode())
+ DOC_HEAD, // 9 - docHead (string; read by Deserialize, not written by Serialize)
+ DOC_TITLE, // 10 - docTitle (string)
+ DOC_DESCRIPTIONS, // 11 - descriptions (list of strings)
+ DOC_ANCHORS, // 12 - docAnchors (list of strings)
+ DOC_EMAIL, // 13 - docEmail (string)
+ DOC_NOTIFICATION, // 14 - docNotification (string)
+ DOC_SUBJECT, // 15 - docSubject (string)
+ DOC_STRING, // 16 - debugging string; Deserialize skips the tag
+ DOC_METADSC, // 17 - docMetaDsc (string)
+ DOC_BACKLINKS, // 18 - docBackLinks (number)
+ DOC_SIG // 19 - docSig (number)
+};
+
+// Size-marker bits OR'ed into the tag byte. Must be powers of two never reached by the DOC_... enums (currently 0..19).
+#define CHARSIZE_MARKER_BIT 64
+#define SHORTSIZE_MARKER_BIT 128
+
+//*****************************************************************************
+// void DocumentRef::Serialize(String &s)
+// Append the object's non-default fields to s as a sequence of tagged entries.
+// Each entry starts with a tag byte (a DOC_... id, possibly OR'ed with a size-marker bit); numbers are raw bytes copied from memory.
+// docHead, docHeadIsSet, docScore and docAnchor are not written by this function.
+void DocumentRef::Serialize(String &s)
+{
+ int length; // scratch: string length / list count used by the macros below
+ String *str; // scratch: current list element inside addlist
+
+// The following macros make the serialization process a little easier
+// to follow. Note that if an object to be serialized has the default
+// value for this class (0 / empty), it is NOT serialized, which saves
+// storage. addnum stores the value in one byte if it is <= (unsigned
+// char) ~1, in an unsigned short if it is <= (unsigned short int) ~1,
+// and otherwise as sizeof(var) raw bytes after an unmarked tag.
+#define addnum(id, out, var) \
+ if (var != 0) \
+ { \
+ if (var <= (unsigned char) ~1) \
+ { \
+ unsigned char _tmp = var; \
+ out << (char) (id | CHARSIZE_MARKER_BIT); \
+ out.append((char *) &_tmp, sizeof(_tmp)); \
+ } \
+ else if (var <= (unsigned short int) ~1) \
+ { \
+ unsigned short int _tmp = var; \
+ out << (char) (id | SHORTSIZE_MARKER_BIT); \
+ out.append((char *) &_tmp, sizeof(_tmp)); \
+ } \
+ else \
+ { \
+ out << (char) id; \
+ out.append((char *) &var, sizeof(var)); \
+ } \
+ }
+// addstring: the string's length is encoded like an addnum value, followed by the string's bytes.
+#define addstring(id, out, str) \
+ if (str.length()) \
+ { \
+ length = str.length(); \
+ if (length <= (unsigned char) ~1) \
+ { \
+ unsigned char _tmp = length; \
+ out << (char) (id | CHARSIZE_MARKER_BIT); \
+ out.append((char *) &_tmp, sizeof(_tmp)); \
+ } \
+ else if (length <= (unsigned short int) ~1) \
+ { \
+ unsigned short int _tmp = length; \
+ out << (char) (id | SHORTSIZE_MARKER_BIT); \
+ out.append((char *) &_tmp, sizeof(_tmp)); \
+ } \
+ else \
+ { \
+ out << (char) id; \
+ out.append((char *) &length, sizeof(length)); \
+ } \
+ out.append(str); \
+ }
+
+// addlist: lists of up to (unsigned short int) ~1 elements use the
+// compact form: a marked tag, the count in 1 or 2 bytes, then each
+// element as a 1-byte length followed by its bytes. Elements whose
+// length is not below (unsigned char) ~1 instead get the byte ~1 as an
+// escape, followed by the full int length (costing one extra byte).
+// Longer lists keep the old layout for compatibility with old
+// databases: unmarked tag, int count, and for every element an int
+// length followed by its bytes.
+#define addlist(id, out, list) \
+ if (list.Count()) \
+ { \
+ length = list.Count(); \
+ if (length <= (unsigned short int) ~1) \
+ { \
+ if (length <= (unsigned char) ~1) \
+ { \
+ unsigned char _tmp = length; \
+ out << (char) (id | CHARSIZE_MARKER_BIT); \
+ out.append((char *) &_tmp, sizeof(_tmp)); \
+ } \
+ else \
+ { \
+ unsigned short int _tmp = length; \
+ out << (char) (id | SHORTSIZE_MARKER_BIT); \
+ out.append((char *) &_tmp, sizeof(_tmp)); \
+ } \
+ list.Start_Get(); \
+ while ((str = (String *) list.Get_Next())) \
+ { \
+ length = str->length(); \
+ if (length < (unsigned char) ~1) \
+ { \
+ unsigned char _tmp = length; \
+ out.append((char*) &_tmp, sizeof(_tmp)); \
+ } \
+ else \
+ { \
+ unsigned char _tmp = ~1; \
+ out.append((char*) &_tmp, sizeof(_tmp)); \
+ out.append((char*) &length, sizeof(length)); \
+ } \
+ out.append(*str); \
+ } \
+ } \
+ else \
+ { \
+ out << (char) id; \
+ out.append((char *) &length, sizeof(length)); \
+ list.Start_Get(); \
+ while ((str = (String *) list.Get_Next())) \
+ { \
+ length = str->length(); \
+ out.append((char*) &length, sizeof(length)); \
+ out.append(*str); \
+ } \
+ } \
+ }
+ // Numeric fields.
+ addnum(DOC_ID, s, docID);
+ addnum(DOC_TIME, s, docTime);
+ addnum(DOC_ACCESSED, s, docAccessed);
+ addnum(DOC_STATE, s, docState);
+ addnum(DOC_SIZE, s, docSize);
+ addnum(DOC_LINKS, s, docLinks);
+ addnum(DOC_BACKLINKS, s, docBackLinks);
+ addnum(DOC_HOPCOUNT, s, docHopCount);
+ addnum(DOC_SIG, s, docSig);
+
+ // The URL is stored in encoded form. Use a temporary since the
+ // addstring macro evaluates its argument several times.
+ String tmps = HtURLCodec::instance()->encode(docURL);
+ addstring(DOC_URL, s, tmps);
+ // docHead is not written here: this is done in the DocumentDB code through the excerpt database
+ // addstring(DOC_HEAD, s, docHead);
+ addstring(DOC_METADSC, s, docMetaDsc);
+ addstring(DOC_TITLE, s, docTitle);
+
+ addlist(DOC_DESCRIPTIONS, s, descriptions);
+ addlist(DOC_ANCHORS, s, docAnchors);
+
+ addstring(DOC_EMAIL, s, docEmail);
+ addstring(DOC_NOTIFICATION, s, docNotification);
+ addstring(DOC_SUBJECT, s, docSubject);
+}
+
+
+//*****************************************************************************
+// void DocumentRef::Deserialize(String &stream)
+//   Rebuild this object from a record created by the Serialize member.
+//
+//   The object is Clear()ed first, so fields that are absent from the
+//   record keep their defaults.  The record is a sequence of entries,
+//   each introduced by a tag byte: with the two marker bits masked off
+//   it is a DOC_... id, and the marker bits tell how the number that
+//   follows is stored (one byte, an unsigned short, or the full width
+//   of the destination variable).  For strings that number is the byte
+//   length, for lists it is the element count.
+//
+//   Numbers are copied with memcpy straight from the record, i.e. in
+//   the representation of the machine that wrote them.
+//
+//   Parsing stops, with a message on cerr, when an unknown tag is found
+//   or when an entry asks for more bytes than are left in the record
+//   (truncated or corrupt data).  Fields decoded before that point keep
+//   the values read so far.
+//
+void DocumentRef::Deserialize(String &stream)
+{
+    Clear();
+    char *s = stream.get();
+    char *end = s + stream.length();
+    int length;
+    int count;
+    int i;
+    int x;
+    int throwaway; // As the name sounds--used for old fields
+    String *str;
+
+// Leave Deserialize() unless at least `nbytes` bytes are available
+// between `in` and `end`.  A negative `nbytes` (possible when a length
+// was read from damaged data) is rejected as well.  Every read below
+// is preceded by this check so that nothing is fetched from beyond the
+// end of the record.
+#define DOCREF_NEED(in, nbytes) \
+    do { \
+        if ((long) (nbytes) < 0 \
+            || (long) (end - (in)) < (long) (nbytes)) \
+        { \
+            cerr << "TRUNCATED SERIALIZED DATA" << endl; \
+            return; \
+        } \
+    } while (0)
+
+// There is a problem with getting a numeric value into a
+// numeric unknown type that may be an enum (the other way
+// around is simply by casting (int)).
+// Supposedly the enum incarnates as a simple type, so we can
+// just check the size and copy the bits.
+#define MEMCPY_ASSIGN(to, from, type) \
+    do { \
+        type _tmp = (type) (from); \
+        memcpy((char *) &(to), (char *) &_tmp, sizeof(to)); \
+    } while (0)
+
+// Store `from` into `to` through an unsigned integer type that has the
+// same size as `to`.
+#define NUM_ASSIGN(to, from) \
+    do { \
+        if (sizeof(to) == sizeof(unsigned long int)) \
+            MEMCPY_ASSIGN(to, from, unsigned long int); \
+        else if (sizeof(to) == sizeof(unsigned int)) \
+            MEMCPY_ASSIGN(to, from, unsigned int); \
+        else if (sizeof(to) == sizeof(unsigned short int)) \
+            MEMCPY_ASSIGN(to, from, unsigned short int); \
+        else if (sizeof(to) == sizeof(unsigned char)) \
+            MEMCPY_ASSIGN(to, from, unsigned char); \
+        /* else fatal error here? */ \
+    } while (0)
+
+// Read one number into `var` and advance `in` past it.  The marker
+// bits of `type` select the stored width: one byte, an unsigned short,
+// or sizeof(var) raw bytes when no marker bit is set.
+#define getnum(type, in, var) \
+    if (type & CHARSIZE_MARKER_BIT) \
+    { \
+        DOCREF_NEED(in, sizeof(unsigned char)); \
+        NUM_ASSIGN(var, *(unsigned char *) in); \
+        in += sizeof(unsigned char); \
+    } \
+    else if (type & SHORTSIZE_MARKER_BIT) \
+    { \
+        unsigned short int _tmp0; \
+        DOCREF_NEED(in, sizeof(unsigned short int)); \
+        memcpy((char *) &_tmp0, (char *) (in), sizeof(unsigned short)); \
+        NUM_ASSIGN(var, _tmp0); \
+        in += sizeof(unsigned short int); \
+    } \
+    else \
+    { \
+        DOCREF_NEED(in, sizeof(var)); \
+        memcpy((char *) &var, in, sizeof(var)); \
+        in += sizeof(var); \
+    }
+
+// Read a length followed by that many bytes into `str`.
+#define getstring(type, in, str) \
+    getnum(type, in, length); \
+    DOCREF_NEED(in, length); \
+    str = 0; \
+    str.append(in, length); \
+    in += length
+
+// Read an element count followed by that many strings, adding each one
+// to `list` as a newly allocated String.  When the tag carries a marker
+// bit every element starts with a one-byte length, where the byte
+// (unsigned char) ~1 announces that a full int length follows; without
+// a marker bit every element starts with an int length.
+#define getlist(type, in, list) \
+    getnum(type, in, count); \
+    if (type & (CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT)) \
+    { \
+        for (i = 0; i < count; i++) \
+        { \
+            DOCREF_NEED(in, sizeof(unsigned char)); \
+            unsigned char _tmp = *(unsigned char *) in; \
+            in += sizeof(_tmp); \
+            if (_tmp < (unsigned char) ~1) \
+                length = _tmp; \
+            else \
+                getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \
+                       length); \
+            DOCREF_NEED(in, length); \
+            str = new String; \
+            str->append(in, length); \
+            list.Add(str); \
+            in += length; \
+        } \
+    } \
+    else \
+    { \
+        for (i = 0; i < count; i++) \
+        { \
+            getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \
+                   length); \
+            DOCREF_NEED(in, length); \
+            str = new String; \
+            str->append(in, length); \
+            list.Add(str); \
+            in += length; \
+        } \
+    }
+
+    while (s < end)
+    {
+        // Tag byte: field id plus optional size-marker bits.
+        x = (unsigned char) *s++;
+        switch (x & ~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT))
+        {
+        case DOC_ID:
+            getnum(x, s, docID);
+            break;
+        case DOC_TIME:
+            getnum(x, s, docTime);
+            break;
+        case DOC_ACCESSED:
+            getnum(x, s, docAccessed);
+            break;
+        case DOC_STATE:
+            getnum(x, s, docState);
+            break;
+        case DOC_SIZE:
+            getnum(x, s, docSize);
+            break;
+        case DOC_IMAGESIZE: // No longer used
+            getnum(x, s, throwaway);
+            break;
+        case DOC_LINKS:
+            getnum(x, s, docLinks);
+            break;
+        case DOC_HOPCOUNT:
+            getnum(x, s, docHopCount);
+            break;
+        case DOC_BACKLINKS:
+            getnum(x, s, docBackLinks);
+            break;
+        case DOC_SIG:
+            getnum(x, s, docSig);
+            break;
+        case DOC_URL:
+            {
+                // The record holds the encoded URL: read it into a
+                // temporary, then decode it into docURL.
+                String tmps;
+                getstring(x, s, tmps);
+
+                docURL = HtURLCodec::instance()->decode(tmps);
+            }
+            break;
+        case DOC_HEAD:
+            getstring(x, s, docHead); docHeadIsSet = 1;
+            break;
+        case DOC_METADSC:
+            getstring(x, s, docMetaDsc);
+            break;
+        case DOC_TITLE:
+            getstring(x, s, docTitle);
+            break;
+        case DOC_DESCRIPTIONS:
+            getlist(x, s, descriptions);
+            break;
+        case DOC_ANCHORS:
+            getlist(x, s, docAnchors);
+            break;
+        case DOC_EMAIL:
+            getstring(x, s, docEmail);
+            break;
+        case DOC_NOTIFICATION:
+            getstring(x, s, docNotification);
+            break;
+        case DOC_SUBJECT:
+            getstring(x, s, docSubject);
+            break;
+        case DOC_STRING:
+            // This is just a debugging string. Ignore it.
+            // NOTE(review): only the tag byte is consumed here; if such
+            // an entry ever carries a payload, the bytes after the tag
+            // would be parsed as new tags -- confirm the entry is empty.
+            break;
+        default:
+            cerr << "BAD TAG IN SERIALIZED DATA: " << x << endl;
+            return;
+        }
+    }
+}
+
+
+//*****************************************************************************
+// void DocumentRef::AddDescription(const char *d, HtWordList &words)
+//   Register the text d as a description of this document.
+//
+//   d     - description text; NULL, empty or all-whitespace input is
+//           ignored.
+//   words - word list that receives one entry per qualifying word of d.
+//
+//   Leading whitespace is skipped and desc.chop(" \t") is applied
+//   (presumably trimming trailing blanks -- confirm against
+//   String::chop).  Every word of at least minimum_word_length
+//   characters is then put into `words`, flagged FLAG_LINK_TEXT and
+//   tagged with docID; this happens even when the text itself ends up
+//   not being stored.  The text is added to `descriptions` unless
+//   max_descriptions entries exist already or an entry for which
+//   mystrcasecmp() reports equality is present.
+//
+//   Both configuration values are held in function-local statics, so
+//   they are read on the first call only.
+//
+void DocumentRef::AddDescription(const char *d, HtWordList &words)
+{
+    if (!d || !*d)
+        return;
+
+    // The <ctype.h> functions require a value representable as unsigned
+    // char (or EOF).  Plain char may be negative for bytes >= 0x80, so
+    // convert before calling isspace().
+    while (isspace((unsigned char) *d))
+        d++;
+
+    // Nothing but whitespace.
+    if (!*d)
+        return;
+
+    String desc = d;
+    desc.chop(" \t");
+
+    // Add the description text to the word database with proper factor
+    // Do this first because we may have reached the max_description limit
+    // This also ensures we keep the proper weight on descriptions
+    // that occur many times
+
+    // Parse words.
+    char *p = desc;
+    HtConfiguration* config= HtConfiguration::config();
+    static int minimum_word_length = config->Value("minimum_word_length", 3);
+    static int max_descriptions = config->Value("max_descriptions", 5);
+
+    String word;
+    HtWordReference wordRef;
+    wordRef.Flags(FLAG_LINK_TEXT);
+    wordRef.DocID(docID);
+
+    while (*p)
+    {
+        // Reset contents before adding chars each round.
+        word = 0;
+
+        while (*p && HtIsWordChar(*p))
+            word << *p++;
+
+        HtStripPunctuation(word);
+
+        if (word.length() >= minimum_word_length) {
+            // The wordlist takes care of lowercasing; just add it.
+            // The location is the offset of p within desc minus the
+            // length of the word after HtStripPunctuation().
+            wordRef.Location((p - (char*)desc) - word.length());
+            wordRef.Word(word);
+            words.Replace(wordRef);
+        }
+
+        // Skip ahead to the start of the next word.
+        while (*p && !HtIsStrictWordChar(*p))
+            p++;
+    }
+
+    // And let's flush the words! (nice comment hu :-)
+    words.Flush();
+
+    // Now are we at the max_description limit?
+    if (descriptions.Count() >= max_descriptions)
+        return;
+
+    // Do not store the same description twice.
+    descriptions.Start_Get();
+    String *description;
+    while ((description = (String *) descriptions.Get_Next()))
+    {
+        if (mystrcasecmp(description->get(), (char*)desc) == 0)
+            return;
+    }
+    descriptions.Add(new String(desc));
+}
+
+
+//*****************************************************************************
+// void DocumentRef::AddAnchor(const char *a)
+//   Adds a newly allocated String copy of a to docAnchors.
+//   A NULL pointer is ignored.
+//
+void DocumentRef::AddAnchor(const char *a)
+{
+    if (!a)
+        return;
+
+    String *anchor = new String(a);
+    docAnchors.Add(anchor);
+}
+
+