summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc')
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc517
1 files changed, 517 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc
new file mode 100644
index 00000000..3f6d5e5f
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc
@@ -0,0 +1,517 @@
+//--------------------------------------------------------------------
+//
+// TextCollector.cc
+//
+// 2/6/2002 created for libhtdig
+//
+// Neal Richter nealr@rightnow.com
+//
+// TextCollector:
+// General Purpose Text Document Indexer.
+// Calls appropriate parsers.
+// The parser notifies the TextCollector object that it got something
+// (got_* functions) and the TextCollector object feeds the databases
+// and statistics accordingly.
+//
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: TextCollector.cc,v 1.4 2004/05/28 13:15:29 lha Exp $
+//
+//--------------------------------------------------------------------
+
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "TextCollector.h"
+#include "htdig.h"
+#include "HtWordList.h"
+#include "WordRecord.h"
+#include "URLRef.h"
+#include "Server.h"
+#include "Parsable.h"
+#include "BasicDocument.h"
+#include "StringList.h"
+#include "WordType.h"
+#include "md5.h"
+#include "defaults.h"
+
+#include <signal.h>
+#include <stdio.h>
+
+#include <sys/timeb.h>
+
+
+//*****************************************************************************
+// TextCollector::TextCollector(TextCollectorLog flags)
+//   Constructs the word list from the global configuration object, resets
+//   the per-collector counters, fills the heading-slot -> word-flag table
+//   and, when "check_unique_md5" is set, opens the MD5 database.
+//   The flags argument is not used by this constructor.
+//
+TextCollector::TextCollector(TextCollectorLog flags):
+words(*(HtConfiguration::config()))
+{
+    HtConfiguration *cfg = HtConfiguration::config();
+
+    currenthopcount = 0;
+
+    // Word tracking is always switched on for the collector.
+    trackWords = 1;
+
+    //
+    // Flag stored with a word, indexed by the heading slot the parser
+    // reports to got_word().
+    //
+    factor[0] = FLAG_TEXT;                  // plain text
+    factor[1] = FLAG_TITLE;                 // title
+    for (int slot = 2; slot <= 7; slot++)
+        factor[slot] = FLAG_HEADING;        // headings (now generic)
+    factor[8] = FLAG_TEXT;                  // img alt text: treated as plain
+                                            // text until it has its own
+                                            // FLAG and factor
+    factor[9] = FLAG_KEYWORDS;              // keywords
+    factor[10] = FLAG_DESCRIPTION;          // META description
+
+    doc = NULL;
+    minimumWordLength = cfg->Value("minimum_word_length", 3);
+
+    //TODO put document-index log file stuff here via logs like Retriever
+
+    check_unique_md5 = cfg->Boolean("check_unique_md5", 0);
+    check_unique_date = cfg->Boolean("check_unique_date", 0);
+
+    // The MD5 database object only exists when uniqueness checking is on.
+    d_md5 = 0;
+    if (check_unique_md5)
+    {
+        d_md5 = Database::getDatabaseInstance(DB_HASH);
+
+        // A failed open is reported on cerr only; d_md5 stays set.
+        if (d_md5->OpenReadWrite(cfg->Find("md5_db"), 0666) != OK)
+        {
+            cerr << "DocumentDB::Open: " << cfg->Find("md5_db") << " " << strerror(errno) << "\n";
+        }
+    }
+
+    temp_doc_count = 0;
+}
+
+
+//*****************************************************************************
+// TextCollector::~TextCollector()
+//   Closes the MD5 database when one was created, then flushes and closes
+//   the word list.
+//
+TextCollector::~TextCollector()
+{
+    if (d_md5 != 0)
+    {
+        d_md5->Close();
+    }
+    //delete doc;
+
+    // One flush for documents counted since the counter was last reset ...
+    if (temp_doc_count)
+    {
+        words.Flush();
+        temp_doc_count = 0;
+    }
+
+    // ... and one unconditional flush before the word list is closed.
+    words.Flush();
+    words.Close();
+}
+
+
+//*****************************************************************************
+// int TextCollector::IndexDoc(BasicDocument & a_basicdoc)
+// Indexes one document: finds or creates its DocumentRef, parses it through
+// RetrievedDocument(), stores the ref in docs and flushes the word list.
+// Always returns 1.
+int
+TextCollector::IndexDoc(BasicDocument & a_basicdoc)
+{
+ DocumentRef *ref;
+ time_t date; // written on both paths below but never read in this function
+ int old_document = 0; // written below but never read in this function
+ static int index = 0; // running number shown in the debug progress line
+
+ //struct timeb tb;
+
+ //HtConfiguration *config = HtConfiguration::config();
+
+ doc = &a_basicdoc; // doc now points at the caller's object
+
+ ref = docs[doc->Location()]; // It might be nice to have just an Exists() here
+ if (ref)
+ {
+ //
+ // We already have an entry for this document in our database.
+ // This means we can get the document ID and last modification
+ // time from there.
+ //
+ current_id = ref->DocID();
+ date = ref->DocTime();
+ if (ref->DocAccessed())
+ old_document = 1;
+ else // we haven't retrieved it yet, so we only have the first link
+ old_document = 0;
+ ref->DocBackLinks(ref->DocBackLinks() + 1); // we had a new link
+ ref->DocAccessed(time(0));
+ ref->DocState(Reference_normal);
+ currenthopcount = ref->DocHopCount();
+ }
+ else
+ {
+ //
+ // Never seen this document before. We need to create an
+ // entry for it. This implies that it gets a new document ID.
+ //
+ // NOTE(review): currenthopcount is not updated on this path, so it keeps its previous value.
+ date = 0;
+
+ current_id = docs.NextDocID();
+ ref = new DocumentRef;
+ ref->DocID(current_id);
+ ref->DocURL(doc->Location());
+ ref->DocState(Reference_normal);
+ ref->DocAccessed(time(0));
+ ref->DocHopCount(0);
+ ref->DocBackLinks(1); // We had to have a link to get here!
+ old_document = 0;
+ }
+
+ word_context.DocID(ref->DocID()); // words stored from now on carry this document's ID
+
+ if (debug > 0)
+ {
+ //
+ // Display progress
+ //
+ cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << doc->Location() <<
+ ": ";
+ cout.flush();
+ }
+
+ //printf("New Doc\n");
+ //ftime(&tb);
+ //fprintf(stderr, "[1] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+
+ RetrievedDocument(ref); // parse the document and fill in ref
+
+ //ftime(&tb);
+ //fprintf(stderr, "[2] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+
+ if(temp_doc_count > 250) // counter wraps past 250; the periodic flush itself is disabled
+ {
+ //words.Flush();
+ temp_doc_count = 0;
+ }
+ else
+ {
+ temp_doc_count++;
+ }
+
+ //ftime(&tb);
+ //fprintf(stderr, "[3] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+
+ docs.Add(*ref);
+
+ //ftime(&tb);
+ //fprintf(stderr, "[4] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+
+ delete ref; // released on both paths; presumably docs[] hands back a heap object - confirm
+
+ words.Flush(); // the word list is flushed after every document
+ //words.Close();
+
+ if (urls_seen) // one '|'-separated record per document when the urls_seen stream is set
+ {
+ fprintf(urls_seen, "%s|%d|%s|%d|0|1\n",
+ (const char *) doc->Location(), doc->Length(), doc->ContentType(), // NOTE(review): ContentType() goes to %s uncast - confirm it yields char *
+ (int) doc->ModTime());
+ }
+
+
+ return(1);
+}
+
+//*****************************************************************************
+// int TextCollector::FlushWordDB()
+//   Flushes the word list (twice when documents have been counted since
+//   the counter was last reset), closes it and returns 1.
+//
+int TextCollector::FlushWordDB()
+{
+    const bool pending = (temp_doc_count != 0);
+
+    // Extra flush for documents counted since the last reset.
+    if (pending)
+    {
+        words.Flush();
+        temp_doc_count = 0;
+    }
+
+    // Unconditional flush, then close the word list.
+    words.Flush();
+    words.Close();
+
+    return 1;
+}
+
+//*****************************************************************************
+// void TextCollector::RetrievedDocument(DocumentRef * ref)
+//   Parses the current document (the doc member) and copies what was
+//   collected into ref.  Only self-parseable documents are parsed; the
+//   path that would ask the document for a Parsable object is disabled.
+//
+void
+TextCollector::RetrievedDocument(DocumentRef * ref)
+{
+    // Reset the per-document state before any got_*() routine runs.
+    current_ref = ref;
+    n_links = 0;
+    current_title = 0;
+    current_head = 0;
+    current_meta_dsc = 0;
+    current_time = 0;
+    word_context.Anchor(0);
+
+    // A self-parseable document is handed this object as the callback
+    // target for all the got_*() routines.
+    if (doc->SelfParseable() == TRUE)
+    {
+        doc->internalParser(*this);
+    }
+
+    // Any other document is not parsed at all.  The code that would create
+    // a specific parser for it is disabled:
+    /*
+       Parsable *parsable = doc->getParsable();
+       if (parsable)
+           parsable->parse(*this, *base);
+       else
+       { // If we didn't get a parser, then we should get rid of this!
+           ref->DocState(Reference_noindex);
+           return;
+       }
+     */
+
+    //
+    // Copy the collected strings into the document reference.
+    //
+    ref->DocTitle((char *) current_title);
+    ref->DocHead((char *) current_head);
+    ref->DocMetaDsc((char *) current_meta_dsc);
+
+    // The document's own modification time wins; "now" is used when it is
+    // zero.  current_time is not consulted (that code is disabled).
+    const time_t modified = doc->ModTime();
+    ref->DocTime(modified != 0 ? modified : time(NULL));
+
+    ref->DocSize(doc->Length());
+    ref->DocAccessed(time(0));
+    ref->DocLinks(n_links);
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_word(const char *word, int location, int heading)
+// The location is normalized to be in the range 0 - 1000.
+// Stores the word, then the pieces of a compound word (see the loop below).
+void
+TextCollector::got_word(const char *word, int location, int heading)
+{
+ if (debug > 3)
+ cout << "word: " << word << '@' << location << endl;
+ if (heading >= 11 || heading < 0) // Current limits for headings
+ heading = 0; // Assume it's just normal text
+ // heading indexes factor[] below; assumes factor[] has 11 slots - confirm in the header
+ if ((trackWords) && (strlen(word) >= minimumWordLength)) // NOTE(review): size_t vs minimumWordLength - check its declared type
+ {
+ String w = word;
+ HtWordReference wordRef;
+
+ wordRef.Location(location);
+ wordRef.Flags(factor[heading]);
+
+ wordRef.Word(w);
+ words.Replace(WordReference::Merge(wordRef, word_context)); // store the whole word first
+
+#ifdef DEBUG
+ cout << "Adding: [" << w << "]"<< endl; //NEALR
+#endif
+
+ // Check for compound words: each pass stores every run of nparts consecutive word-character segments
+ String parts = word;
+ int added; // pieces considered in the current pass
+ int nparts = 1; // segments per piece; grows by one each pass
+ do
+ {
+ added = 0;
+ char *start = parts.get();
+ char *punctp = 0, *nextp = 0, *p;
+ char punct;
+ int n;
+ while (*start)
+ {
+ p = start;
+ for (n = 0; n < nparts; n++)
+ {
+ while (HtIsStrictWordChar((unsigned char) *p))
+ p++;
+ punctp = p; // first character after the current segment
+ if (!*punctp && n + 1 < nparts)
+ break; // string ended before nparts segments were found
+ while (*p && !HtIsStrictWordChar((unsigned char) *p))
+ p++;
+ if (n == 0)
+ nextp = p; // the next piece starts one segment further on
+ }
+ if (n < nparts)
+ break; // not enough segments left for another piece
+ punct = *punctp;
+ *punctp = '\0'; // temporarily end the buffer after this piece
+ if (*start && (*p || start > parts.get())) // skip empty pieces and the piece equal to the whole word
+ {
+ w = start;
+ HtStripPunctuation(w);
+ if (w.length() >= minimumWordLength)
+ {
+ wordRef.Word(w);
+ words.Replace(WordReference::Merge(wordRef, word_context));
+ if (debug > 3)
+ cout << "word part: " << start << '@' << location << endl;
+
+#ifdef DEBUG
+ cout << "Adding: [" << w << "]"<< endl; //NEALR
+#endif
+ }
+ added++; // counted even when the piece was too short to store
+ }
+ start = nextp;
+ *punctp = punct; // put back the character overwritten above
+ }
+ nparts++;
+ }
+ while (added > 2); // go on to longer pieces only if this pass considered more than two
+ }
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_title(const char *title)
+//   Records the title reported for the current document; it is echoed
+//   to cout when debug is above 1.
+//
+void
+TextCollector::got_title(const char *title)
+{
+    const bool verbose = (debug > 1);
+
+    if (verbose)
+    {
+        cout << "\ntitle: " << title << endl;
+    }
+
+    current_title = title;
+}
+
+//*****************************************************************************
+// void TextCollector::got_time(const char *time)
+//   Parses a date string in YYYY-MM-DD form (the Dublin Core layout) and
+//   stores the result in current_time.  Other formats, and the scheme
+//   portion, are not handled yet.
+//
+void
+TextCollector::got_time(const char *time)
+{
+    // Seeded with current_time so that, presumably, a string SetFTime()
+    // cannot convert leaves the value as it was - confirm in HtDateTime.
+    HtDateTime parsed(current_time);
+
+    if (debug > 1)
+    {
+        cout << "\ntime: " << time << endl;
+    }
+
+    parsed.SetFTime(time, "%Y-%m-%d");
+    current_time = parsed.GetTime_t();
+}
+
+//*****************************************************************************
+// void TextCollector::got_head(const char *head)
+//   Records the head text reported for the current document; it is
+//   echoed to cout when debug is above 4.
+//
+void
+TextCollector::got_head(const char *head)
+{
+    const bool verbose = (debug > 4);
+
+    if (verbose)
+    {
+        cout << "head: " << head << endl;
+    }
+
+    current_head = head;
+}
+
+//*****************************************************************************
+// void TextCollector::got_meta_dsc(const char *md)
+//   Records the META description reported for the current document; it
+//   is echoed to cout when debug is above 4.
+//
+void
+TextCollector::got_meta_dsc(const char *md)
+{
+    const bool verbose = (debug > 4);
+
+    if (verbose)
+    {
+        cout << "meta description: " << md << endl;
+    }
+
+    current_meta_dsc = md;
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_meta_email(const char *e)
+//   Passes the reported e-mail value to the current document reference;
+//   it is echoed to cout when debug is above 1.
+//
+void
+TextCollector::got_meta_email(const char *e)
+{
+    const bool verbose = (debug > 1);
+
+    if (verbose)
+    {
+        cout << "\nmeta email: " << e << endl;
+    }
+
+    current_ref->DocEmail(e);
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_meta_notification(const char *e)
+//   Passes the reported notification date to the current document
+//   reference; it is echoed to cout when debug is above 1.
+//
+void
+TextCollector::got_meta_notification(const char *e)
+{
+    const bool verbose = (debug > 1);
+
+    if (verbose)
+    {
+        cout << "\nmeta notification date: " << e << endl;
+    }
+
+    current_ref->DocNotification(e);
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_meta_subject(const char *e)
+//   Passes the reported subject to the current document reference; it is
+//   echoed to cout when debug is above 1.
+//
+void
+TextCollector::got_meta_subject(const char *e)
+{
+    if (debug > 1)
+    {
+        // The label used to read "subect"; spelled correctly now.
+        cout << "\nmeta subject: " << e << endl;
+    }
+
+    current_ref->DocSubject(e);
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_noindex()
+//   Sets the state of the current document reference to
+//   Reference_noindex; its URL is echoed to cout when debug is above 1.
+//
+void
+TextCollector::got_noindex()
+{
+    const bool verbose = (debug > 1);
+
+    if (verbose)
+    {
+        cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl;
+    }
+
+    current_ref->DocState(Reference_noindex);
+}