summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig/Retriever.h')
-rw-r--r--debian/htdig/htdig-3.2.0b6/htdig/Retriever.h183
1 files changed, 183 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h
new file mode 100644
index 00000000..b2fff24d
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Retriever.h
@@ -0,0 +1,183 @@
+//
+// Retriever.h
+//
+// Retriever: Crawls from a list of URLs and calls the appropriate parsers.
+// The parser notifies the Retriever object that it got something
+// (got_* functions) and the Retriever object feeds the databases
+// and statistics accordingly.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Retriever.h,v 1.28 2004/05/28 13:15:15 lha Exp $
+//
+
+#ifndef _Retriever_h_
+#define _Retriever_h_
+
+#include "DocumentRef.h"
+#include "Dictionary.h"
+#include "Queue.h"
+#include "HtWordReference.h"
+#include "List.h"
+#include "StringList.h"
+#include "DocumentDB.h"
+
+#define HTDIG_ERROR_TESTURL_EXCLUDE -109 // TESTURL_* codes: ten distinct negative ints, -109 .. -118
+#define HTDIG_ERROR_TESTURL_BADQUERY -110 // NOTE(review): the code that returns/consumes these values is not visible in this header
+#define HTDIG_ERROR_TESTURL_EXTENSION -111
+#define HTDIG_ERROR_TESTURL_EXTENSION2 -112
+#define HTDIG_ERROR_TESTURL_LIMITS -113
+#define HTDIG_ERROR_TESTURL_LIMITSNORM -114
+#define HTDIG_ERROR_TESTURL_SRCH_RESTRICT -115
+#define HTDIG_ERROR_TESTURL_SRCH_EXCLUDE -116
+#define HTDIG_ERROR_TESTURL_REWRITE_EMPTY -117
+#define HTDIG_ERROR_TESTURL_ROBOT_FORBID -118 // last value of the TESTURL_* group
+
+
+class URL; // forward declarations: URL, Document and URLRef are used below only through pointers/references
+class Document;
+class URLRef;
+class HtWordList; // NOTE(review): Retriever holds an HtWordList by value, so one of the includes above must supply the complete type — confirm
+
+enum RetrieverLog { // type of the Retriever constructor's flags argument, which defaults to Retriever_noLog
+ Retriever_noLog, // = 0
+ Retriever_logUrl, // = 1; NOTE(review): presumably turns on URL logging — confirm against the implementation
+ Retriever_Restart // = 2; NOTE(review): presumably resumes from an earlier log — confirm against the implementation
+};
+
+struct word_entry : public Object // Object-derived record bundling a location, a flags value and a word reference
+{
+ word_entry (int loc, int fl, HtWordReference& ref) : // ref is a non-const reference; here it is used only to initialize context
+ location (loc), flags (fl), context (ref) // arguments initialize the members in the same order
+ {};
+ int location; // initialized from loc
+ int flags; // initialized from fl
+ HtWordReference context; // this entry's own HtWordReference, initialized from ref
+};
+
+class Retriever
+{
+public:
+ //
+ // Construction/Destruction
+ // NOTE(review): raw pointers are held but no copy constructor/assignment is declared — confirm instances are never copied
+ Retriever(RetrieverLog flags = Retriever_noLog);
+ virtual ~Retriever();
+
+ //
+ // Getting it all started
+ // Initial() takes either one URL string or a List; checked defaults to 0 in both overloads
+ void Initial(const String& url, int checked = 0);
+ void Initial(List &list , int checked = 0);
+ void Start();
+
+ //
+ // Report statistics about the parser
+ //
+ void ReportStatistics(const String& name);
+
+ //
+ // Callbacks (got_* functions) through which the parser notifies this object of what it got
+ //
+ void got_word(const char *word, int location, int heading);
+ void got_href(URL &url, const char *description, int hops = 1);
+ void got_title(const char *title);
+ void got_author(const char *author);
+ void got_time(const char *time);
+ void got_head(const char *head);
+ void got_meta_dsc(const char *md);
+ void got_anchor(const char *anchor);
+ void got_image(const char *src);
+ void got_meta_email(const char *);
+ void got_meta_notification(const char *);
+ void got_meta_subject(const char *);
+ void got_noindex();
+
+ //
+ // Allow for the indexing of protected sites by using a
+ // username/password
+ //
+ void setUsernamePassword(const char *credentials);
+
+ //
+ // Routines for dealing with local filesystem access
+ // NOTE(review): who owns the returned StringList pointers is not stated in this header — confirm
+ StringList * GetLocal(const String &strurl);
+ StringList * GetLocalUser(const String &url, StringList *defaultdocs);
+ int IsLocalURL(const String &url);
+
+private:
+ //
+ // A hash to keep track of what we've seen
+ //
+ Dictionary visited;
+
+ URL *base; // NOTE(review): raw pointer; ownership is not visible in this header
+ String current_title;
+ String current_head;
+ String current_meta_dsc;
+ time_t current_time; // NOTE(review): no <time.h>/<ctime> include here; time_t presumably arrives via the project headers — confirm
+ int current_id;
+ DocumentRef *current_ref; // NOTE(review): raw pointer; ownership is not visible in this header
+ int current_anchor_number;
+ int trackWords;
+ int n_links;
+ String credentials;
+ HtWordReference word_context;
+ HtWordList words; // held by value, so the complete HtWordList type is required at this point
+
+ Dictionary words_to_add; // NOTE(review): presumably holds word_entry objects — confirm against the implementation
+
+ int check_unique_md5;
+ int check_unique_date;
+
+
+ RetrieverLog log; // NOTE(review): presumably keeps the constructor's flags argument — confirm
+ //
+ // These are weights for the words. The index is the heading level.
+ // The array has twelve slots (indices 0..11).
+ long int factor[12];
+ int currenthopcount;
+
+ //
+ // Some semi-constants...
+ //
+ int max_hop_count;
+
+ //
+ // The list of server-specific information objects is indexed by
+ // ip address and port number. The list contains Server objects.
+ //
+ Dictionary servers;
+
+ //
+ // For efficiency reasons, we will only use one document object which
+ // we reuse.
+ //
+ Document *doc;
+
+ Database *d_md5; // NOTE(review): raw pointer; ownership is not visible in this header
+
+ String notFound;
+
+ // Some useful constants (stored as a plain int member, not declared const)
+ int minimumWordLength;
+
+ //
+ // Helper routines
+ //
+ int Need2Get(const String &url);
+ int IsValidURL(const String &url);
+ void RetrievedDocument(Document &, const String &url, DocumentRef *ref);
+ void parse_url(URLRef &urlRef);
+ void got_redirect(const char *, DocumentRef *, const char * = 0); // third argument defaults to a null pointer
+ void recordNotFound(const String &url, const String &referer, int reason);
+};
+
+#endif
+
+