summary | refs | log | tree | commit | diff | stats
path: root/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc')
-rw-r--r-- debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc | 116
1 files changed, 116 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc
new file mode 100644
index 00000000..e7006fb1
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc
@@ -0,0 +1,116 @@
+//
+// Plaintext.cc
+//
+// Plaintext: Parses plaintext files. Not much to do, really.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Plaintext.cc,v 1.20 2004/05/28 13:15:15 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "Plaintext.h"
+#include "htdig.h"
+#include "htString.h"
+#include "WordType.h"
+
+#include <ctype.h>
+#include "defaults.h"
+
+
+//*****************************************************************************
+// Plaintext::Plaintext()
+//   Constructor.  The body is empty: nothing is initialised explicitly here.
+//
+Plaintext::Plaintext()
+{}
+
+
+//*****************************************************************************
+// Plaintext::~Plaintext()
+//   Destructor.  The body is empty: nothing is released explicitly here.
+//
+Plaintext::~Plaintext()
+{}
+
+
+//*****************************************************************************
+// void Plaintext::parse(Retriever &retriever, URL &)
+//   Scans the document text held in `contents`, reporting each sufficiently
+//   long word to retriever.got_word() and a whitespace-collapsed excerpt
+//   ("head") to retriever.got_head().  The URL argument is unused.  Returns
+//   without calling the retriever when there is no content.
+//
+void
+Plaintext::parse(Retriever &retriever, URL &)
+{
+    if (contents == 0 || contents->length() == 0)
+        return;
+
+    HtConfiguration* config= HtConfiguration::config();
+    unsigned char *position = (unsigned char *) contents->get();
+    // Function-local static: the setting is read on the first call only.
+    static int minimumWordLength = config->Value("minimum_word_length", 3);
+    int wordIndex = 1;   // index handed to got_word(); counts reported words
+    int in_space = 0;    // non-zero while inside a run of whitespace
+    String word;
+    String head;
+
+    while (*position)
+    {
+        word = 0;
+        if (HtIsStrictWordChar(*position))
+        {
+            // Start of a word: collect every following word character.
+            in_space = 0;
+            while (*position && HtIsWordChar(*position))
+            {
+                word << *position;
+                position++;
+            }
+            if (head.length() < max_head_length)
+            {
+                head << word;
+            }
+
+            if (word.length() >= minimumWordLength)
+            {
+                retriever.got_word((char*)word, wordIndex++, 0);
+            }
+        }
+
+        // The word scan may have stopped on the terminating NUL; leave now so
+        // that the NUL byte is not appended to the head below.
+        if (!*position)
+            break;
+        if (head.length() < max_head_length)
+        {
+            // Characters that are not part of a word.
+            if (isspace(*position))
+            {
+                // Reduce all multiple whitespace to a single space
+                if (!in_space)
+                {
+                    head << ' ';
+                }
+                in_space = 1;
+            }
+            else
+            {
+                head << *position;
+                in_space = 0;
+            }
+        }
+        position++;
+    }
+    retriever.got_head((char*)head);
+}
+
+