1 files changed, 547 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc
new file mode 100644
index 00000000..97900cd3
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc
@@ -0,0 +1,547 @@
+//
+// DocumentRef.cc
+//
+// DocumentRef: Reference to an indexed document. Keeps track of all
+//              information stored on the document, whether recorded by
+//              the dig or held temporarily during a search.
+//
+// Part of the ht://Dig package   <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: DocumentRef.cc,v 1.53 2004/05/28 13:15:12 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "DocumentRef.h"
+#include "good_strtok.h"
+#include "WordRecord.h"
+#include "HtConfiguration.h"
+#include "HtURLCodec.h"
+#include "WordType.h"
+#include "HtWordReference.h"
+#include <stdlib.h>
+#include <ctype.h>
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+// extern HtConfiguration config;
+
+//*****************************************************************************
+// DocumentRef::DocumentRef()
+//   Builds an empty reference by resetting every field through Clear().
+DocumentRef::DocumentRef()
+{
+    this->Clear();
+}
+
+
+//*****************************************************************************
+// DocumentRef::~DocumentRef()
+//   Empty body: no explicit cleanup is performed here.
+DocumentRef::~DocumentRef()
+{
+}
+
+
+//*****************************************************************************
+// void DocumentRef::Clear()
+//   Reset every field: plain members are assigned 0 (docState becomes
+//   Reference_normal) and Destroy() is called on both lists.
+void DocumentRef::Clear()
+{
+  // Fields that Serialize() writes with addnum.
+  docID = 0;
+  docTime = 0;
+  docAccessed = 0;
+  docState = Reference_normal;
+  docSize = 0;
+  docLinks = 0;
+  docBackLinks = 0;
+  docHopCount = 0;
+  docSig = 0;
+  // Fields that Serialize()/Deserialize() handle as strings.
+  docURL = 0;
+  docHead = 0;
+  docMetaDsc = 0;
+  docTitle = 0;
+  docEmail = 0;
+  docNotification = 0;
+  docSubject = 0;
+  descriptions.Destroy();  docAnchors.Destroy();
+  docHeadIsSet = 0;  docScore = 0;  docAnchor = 0;  // the remaining fields
+}
+
+//*****************************************************************************
+// void DocumentRef::DocState(int s)
+//   Set docState from the integer code s:
+//     0 -> Reference_normal
+//     1 -> Reference_not_found
+//     2 -> Reference_noindex
+//     3 -> Reference_obsolete
+//   Any other value leaves docState untouched.
+//
+void DocumentRef::DocState(int s)
+{
+  // Each code is translated explicitly instead of casting the int to
+  // the enumeration type.
+  if (s == 0)
+    docState = Reference_normal;
+  else if (s == 1)
+    docState = Reference_not_found;
+  else if (s == 2)
+    docState = Reference_noindex;
+  else if (s == 3)
+    docState = Reference_obsolete;
+}
+
+
+// Tag values identifying each field in the serialized form (see Serialize).
+enum {
+    DOC_ID            = 0,
+    DOC_TIME          = 1,
+    DOC_ACCESSED      = 2,
+    DOC_STATE         = 3,
+    DOC_SIZE          = 4,
+    DOC_LINKS         = 5,
+    DOC_IMAGESIZE     = 6,    // No longer used
+    DOC_HOPCOUNT      = 7,
+    DOC_URL           = 8,
+    DOC_HEAD          = 9,
+    DOC_TITLE         = 10,
+    DOC_DESCRIPTIONS  = 11,
+    DOC_ANCHORS       = 12,
+    DOC_EMAIL         = 13,
+    DOC_NOTIFICATION  = 14,
+    DOC_SUBJECT       = 15,
+    DOC_STRING        = 16,
+    DOC_METADSC       = 17,
+    DOC_BACKLINKS     = 18,
+    DOC_SIG           = 19
+};
+
+// Size flags OR-ed into a tag byte; powers of two never reached by the DOC_... enums.
+#define CHARSIZE_MARKER_BIT 64
+#define SHORTSIZE_MARKER_BIT 128
+
+//*****************************************************************************
+// void DocumentRef::Serialize(String &s)
+//   Append all the data in the object to the string s.  Each field is
+//   written as a one-byte tag (a DOC_... value, optionally OR-ed with a
+//   size-marker bit) followed by the field's value; see the macros below.
+void DocumentRef::Serialize(String &s)
+{
+    int		length;
+    String	*str;
+
+//
+// The following macros make the serialization process a little easier
+// to follow.  A field that is zero (or an empty string/list) is NOT
+// serialized at all, which saves storage.  (unsigned char) ~1 is 254.
+// NOTE(review): a negative value also satisfies "var <= 254" and would
+// be stored truncated to one byte -- presumably never happens; confirm.
+#define addnum(id, out, var) \
+ if (var != 0)                                                        \
+ {                                                                    \
+   if (var <= (unsigned char) ~1)                                     \
+   {                                                                  \
+     unsigned char _tmp = var;                                        \
+     out << (char) (id | CHARSIZE_MARKER_BIT);                        \
+     out.append((char *) &_tmp, sizeof(_tmp));                        \
+   }                                                                  \
+   else if (var <= (unsigned short int) ~1)                           \
+   {                                                                  \
+     unsigned short int _tmp = var;                                   \
+     out << (char) (id | SHORTSIZE_MARKER_BIT);                       \
+     out.append((char *) &_tmp, sizeof(_tmp));                        \
+   }                                                                  \
+   else                                                               \
+   {                                                                  \
+     out << (char) id;                                                \
+     out.append((char *) &var, sizeof(var));                          \
+   }                                                                  \
+ }
+
+#define	addstring(id, out, str)	\
+ if (str.length())                                                    \
+ {                                                                    \
+   length = str.length();                                             \
+   if (length <= (unsigned char) ~1)                                  \
+   {                                                                  \
+     unsigned char _tmp = length;                                     \
+     out << (char) (id | CHARSIZE_MARKER_BIT);                        \
+     out.append((char *) &_tmp, sizeof(_tmp));                        \
+   }                                                                  \
+   else if (length <= (unsigned short int) ~1)                        \
+   {                                                                  \
+     unsigned short int _tmp = length;                                \
+     out << (char) (id | SHORTSIZE_MARKER_BIT);                       \
+     out.append((char *) &_tmp, sizeof(_tmp));                        \
+   }                                                                  \
+   else                                                               \
+   {                                                                  \
+     out << (char) id;                                                \
+     out.append((char *) &length, sizeof(length));                    \
+   }                                                                  \
+   out.append(str);                                                   \
+ }
+
+// To keep compatibility with old databases, lists with more than
+// (unsigned short) ~1 entries keep the plain layout: an unmarked tag,
+// an int count, and a full int length before every string.  Shorter
+// lists use a compact count, and each string is preceded by a single
+// length byte; strings of (unsigned char) ~1 bytes or more instead
+// store the marker byte ~1 followed by the full int length.  A whole
+// byte is spent on that marker because the endianness is unknown.
+// Optimizing for (unsigned short) lengths does not seem worthwhile.
+#define	addlist(id, out, list) \
+ if (list.Count())                                                    \
+ {                                                                    \
+   length = list.Count();                                             \
+   if (length <= (unsigned short int) ~1)                             \
+   {                                                                  \
+     if (length <= (unsigned char) ~1)                                \
+     {                                                                \
+       unsigned char _tmp = length;                                   \
+       out << (char) (id | CHARSIZE_MARKER_BIT);                      \
+       out.append((char *) &_tmp, sizeof(_tmp));                      \
+     }                                                                \
+     else                                                             \
+     {                                                                \
+       unsigned short int _tmp = length;                              \
+       out << (char) (id | SHORTSIZE_MARKER_BIT);                     \
+       out.append((char *) &_tmp, sizeof(_tmp));                      \
+     }                                                                \
+     list.Start_Get();                                                \
+     while ((str = (String *) list.Get_Next()))		              \
+     {                                                                \
+       length = str->length();                                        \
+       if (length < (unsigned char) ~1)                               \
+       {                                                              \
+         unsigned char _tmp = length;                                 \
+         out.append((char*) &_tmp, sizeof(_tmp));                     \
+       }                                                              \
+       else                                                           \
+       {                                                              \
+         unsigned char _tmp = ~1;                                     \
+         out.append((char*) &_tmp, sizeof(_tmp));                     \
+         out.append((char*) &length, sizeof(length));                 \
+       }                                                              \
+       out.append(*str);                                              \
+     }                                                                \
+   }                                                                  \
+   else                                                               \
+   {                                                                  \
+     out << (char) id;                                                \
+     out.append((char *) &length, sizeof(length));                    \
+     list.Start_Get();                                                \
+     while ((str = (String *) list.Get_Next()))                       \
+     {                                                                \
+       length = str->length();                                        \
+       out.append((char*) &length, sizeof(length));                   \
+       out.append(*str);                                              \
+     }                                                                \
+   }                                                                  \
+ }
+
+    addnum(DOC_ID, s, docID);
+    addnum(DOC_TIME, s, docTime);
+    addnum(DOC_ACCESSED, s, docAccessed);
+    addnum(DOC_STATE, s, docState);
+    addnum(DOC_SIZE, s, docSize);
+    addnum(DOC_LINKS, s, docLinks);
+    addnum(DOC_BACKLINKS, s, docBackLinks);
+    addnum(DOC_HOPCOUNT, s, docHopCount);
+    addnum(DOC_SIG, s, docSig);
+
+    // Encode the URL into a temporary first, since the addstring macro
+    // evaluates its string argument multiple times.
+    String tmps = HtURLCodec::instance()->encode(docURL);
+    addstring(DOC_URL, s, tmps);
+    // This is done in the DocumentDB code through the excerpt database
+    //    addstring(DOC_HEAD, s, docHead);
+    addstring(DOC_METADSC, s, docMetaDsc);
+    addstring(DOC_TITLE, s, docTitle);
+
+    addlist(DOC_DESCRIPTIONS, s, descriptions);
+    addlist(DOC_ANCHORS, s, docAnchors);
+
+    addstring(DOC_EMAIL, s, docEmail);
+    addstring(DOC_NOTIFICATION, s, docNotification);
+    addstring(DOC_SUBJECT, s, docSubject);
+}
+
+
+//*****************************************************************************
+// void DocumentRef::Deserialize(String &stream)
+//   Clear the object, then refill our private variables from the given
+//   string, expected to have been created by Serialize.  Parsing stops at
+//   the first unknown tag; lengths are not checked against the buffer end.
+//
+void DocumentRef::Deserialize(String &stream)
+{
+    Clear();
+    char	*s = stream.get();
+    char	*end = s + stream.length();
+    int		length;
+    int		count;
+    int		i;
+    int		x;
+    int		throwaway; // Receives the value of fields that are no longer used
+    String	*str;
+
+// A numeric value cannot simply be assigned to a target of unknown
+// numeric type that may be an enum (the other way around is just a
+// cast to int).  Supposedly the enum is represented as a simple
+// integer type, so NUM_ASSIGN picks an unsigned type of the same size
+// as the target and MEMCPY_ASSIGN copies the bits across.
+#define MEMCPY_ASSIGN(to, from, type) \
+ do {                                                                 \
+   type _tmp = (type) (from);                                         \
+   memcpy((char *) &(to), (char *) &_tmp, sizeof(to));                \
+ } while (0)
+
+#define NUM_ASSIGN(to, from) \
+ do {                                                                 \
+   if (sizeof(to) == sizeof(unsigned long int))                       \
+     MEMCPY_ASSIGN(to, from, unsigned long int);                      \
+   else if (sizeof(to) == sizeof(unsigned int))                       \
+     MEMCPY_ASSIGN(to, from, unsigned int);                           \
+   else if (sizeof(to) == sizeof(unsigned short int))                 \
+     MEMCPY_ASSIGN(to, from, unsigned short int);                     \
+   else if (sizeof(to) == sizeof(unsigned char))                      \
+     MEMCPY_ASSIGN(to, from, unsigned char);                          \
+   /* else fatal error here? */                                       \
+ } while (0)
+
+#define	getnum(type, in, var) \
+ if (type & CHARSIZE_MARKER_BIT)                                      \
+ {                                                                    \
+   NUM_ASSIGN(var, *(unsigned char *) in);                            \
+   in += sizeof(unsigned char);                                       \
+ }                                                                    \
+ else if (type & SHORTSIZE_MARKER_BIT)                                \
+ {                                                                    \
+   unsigned short int _tmp0;                                          \
+   memcpy((char *) &_tmp0, (char *) (in), sizeof(unsigned short));    \
+   NUM_ASSIGN(var, _tmp0);                                            \
+   in += sizeof(unsigned short int);                                  \
+ }                                                                    \
+ else                                                                 \
+ {                                                                    \
+   memcpy((char *) &var, in, sizeof(var));                            \
+   in += sizeof(var);                                                 \
+ }
+
+#define	getstring(type, in, str) \
+ getnum(type, in, length);                                            \
+ str = 0;                                                             \
+ str.append(in, length);                                              \
+ in += length
+
+#define	getlist(type, in, list) \
+ getnum(type, in, count);                                             \
+ if (type & (CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT))             \
+ {                                                                    \
+   for (i = 0; i < count; i++)                                        \
+   {                                                                  \
+     unsigned char _tmp = *(unsigned char *) in;                      \
+     in += sizeof(_tmp);                                              \
+     if (_tmp < (unsigned char) ~1)                                   \
+       length = _tmp;                                                 \
+     else                                                             \
+       getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in,      \
+              length);                                                \
+     str = new String;                                                \
+     str->append(in, length);                                         \
+     list.Add(str);                                                   \
+     in += length;                                                    \
+   }                                                                  \
+ }                                                                    \
+ else                                                                 \
+ {                                                                    \
+   for (i = 0; i < count; i++)                                        \
+   {                                                                  \
+     getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in,        \
+            length);                                                  \
+     str = new String;                                                \
+     str->append(in, length);                                         \
+     list.Add(str);                                                   \
+     in += length;                                                    \
+   }                                                                  \
+ }
+
+    while (s < end)
+    {
+        x = (unsigned char) *s++;
+        switch (x & ~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT))
+        {
+        case DOC_ID:
+            getnum(x, s, docID);
+            break;
+        case DOC_TIME:
+            getnum(x, s, docTime);
+            break;
+        case DOC_ACCESSED:
+            getnum(x, s, docAccessed);
+            break;
+        case DOC_STATE:
+            getnum(x, s, docState);
+            break;
+        case DOC_SIZE:
+            getnum(x, s, docSize);
+            break;
+        case DOC_IMAGESIZE: // No longer used; the value is read and dropped
+	    getnum(x, s, throwaway);
+	    break;
+        case DOC_LINKS:
+            getnum(x, s, docLinks);
+            break;
+        case DOC_HOPCOUNT:
+            getnum(x, s, docHopCount);
+            break;
+	case DOC_BACKLINKS:
+	    getnum(x, s, docBackLinks);
+	    break;
+	case DOC_SIG:
+	    getnum(x, s, docSig);
+	    break;
+        case DOC_URL:
+	    {
+	      // Read the stored (encoded) URL into a temporary, then
+	      // decode it before assigning to docURL.
+	      String tmps;
+	      getstring(x, s, tmps);
+
+	      docURL = HtURLCodec::instance()->decode(tmps);
+	    }
+	    break;
+        case DOC_HEAD:
+            getstring(x, s, docHead); docHeadIsSet = 1;
+            break;
+	case DOC_METADSC:
+	    getstring(x, s, docMetaDsc);
+	    break;
+        case DOC_TITLE:
+            getstring(x, s, docTitle);
+            break;
+        case DOC_DESCRIPTIONS:
+            getlist(x, s, descriptions);
+            break;
+        case DOC_ANCHORS:
+            getlist(x, s, docAnchors);
+            break;
+        case DOC_EMAIL:
+            getstring(x, s, docEmail);
+            break;
+        case DOC_NOTIFICATION:
+            getstring(x, s, docNotification);
+            break;
+        case DOC_SUBJECT:
+            getstring(x, s, docSubject);
+            break;
+	case DOC_STRING:
+	  // Debugging string: ignored.  NOTE(review): no payload is skipped here -- confirm.
+	    break;
+        default:
+            cerr << "BAD TAG IN SERIALIZED DATA: " << x << endl;
+            return;
+        }
+    }
+}
+
+
+//*****************************************************************************
+// void DocumentRef::AddDescription(const char *d, HtWordList &words)
+//   Index the words of link text d into words (flagged FLAG_LINK_TEXT, then
+//   flushed) and store the text in descriptions unless max_descriptions is
+//   reached or an equal entry (per mystrcasecmp) is already present.
+//   Null, empty or all-whitespace text is ignored.
+void DocumentRef::AddDescription(const char *d, HtWordList &words)
+{
+    if (!d || !*d)
+        return;
+
+    // isspace() must be given a value representable as unsigned char; a
+    // plain char with the high bit set may be negative, which is undefined.
+    while (isspace((unsigned char) *d))
+        d++;
+    if (!*d)
+        return;
+
+    String desc = d;
+    desc.chop(" \t");
+
+    // Add the description text to the word database with proper factor.
+    // Do this first because we may have reached the max_description limit.
+    // This also ensures we keep the proper weight on descriptions
+    // that occur many times.
+    // Parse words.  Both limits are statics: looked up on the first call only.
+    char         *p                   = desc;
+    HtConfiguration* config= HtConfiguration::config();
+    static int    minimum_word_length = config->Value("minimum_word_length", 3);
+    static int    max_descriptions    = config->Value("max_descriptions", 5);
+
+    String word;
+    HtWordReference wordRef;
+    wordRef.Flags(FLAG_LINK_TEXT);
+    wordRef.DocID(docID);
+
+    while (*p)
+    {
+      // Reset contents before adding chars each round.
+      word = 0;
+      while (*p && HtIsWordChar(*p))
+        word << *p++;
+      HtStripPunctuation(word);
+
+      if (word.length() >= minimum_word_length) {
+        // The wordlist takes care of lowercasing; just add it.
+        wordRef.Location((p - (char*)desc) - word.length());
+        wordRef.Word(word);
+        words.Replace(wordRef);
+      }
+
+      while (*p && !HtIsStrictWordChar(*p))
+        p++;
+    }
+
+    words.Flush();
+
+    // Now are we at the max_description limit?
+    if (descriptions.Count() >= max_descriptions)
+        return;
+
+    descriptions.Start_Get();
+    String	*description;
+    while ((description = (String *) descriptions.Get_Next()))
+    {
+        if (mystrcasecmp(description->get(), (char*)desc) == 0)
+            return;
+    }
+    descriptions.Add(new String(desc));
+}
+
+
+//*****************************************************************************
+// void DocumentRef::AddAnchor(const char *a)
+//   Append a newly allocated copy of a to docAnchors; a null a is ignored.
+void DocumentRef::AddAnchor(const char *a)
+{
+    if (!a) return;
+    docAnchors.Add(new String(a));
+}
+
+