diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc | 116 |
1 files changed, 116 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc new file mode 100644 index 00000000..e7006fb1 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/Plaintext.cc @@ -0,0 +1,116 @@ +// +// Plaintext.cc +// +// Plaintext: Parses plaintext files. Not much to do, really. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Plaintext.cc,v 1.20 2004/05/28 13:15:15 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "Plaintext.h" +#include "htdig.h" +#include "htString.h" +#include "WordType.h" + +#include <ctype.h> +#include "defaults.h" + + +//***************************************************************************** +// Plaintext::Plaintext() +// +Plaintext::Plaintext() +{ +} + + +//***************************************************************************** +// Plaintext::~Plaintext() +// +Plaintext::~Plaintext() +{ +} + + +//***************************************************************************** +// void Plaintext::parse(Retriever &retriever, URL &) +// +void +Plaintext::parse(Retriever &retriever, URL &) +{ + if (contents == 0 || contents->length() == 0) + return; + + HtConfiguration* config= HtConfiguration::config(); + unsigned char *position = (unsigned char *) contents->get(); + static int minimumWordLength = config->Value("minimum_word_length", 3); + int wordIndex = 1; + int in_space = 0; + String word; + String head; + + while (*position) + { + word = 0; + + if (HtIsStrictWordChar(*position)) + { + // + // Start of a word. Try to find the whole thing + // + in_space = 0; + while (*position && HtIsWordChar(*position)) + { + word << *position; + position++; + } + + if (head.length() < max_head_length) + { + head << word; + } + + if (word.length() >= minimumWordLength) + { + retriever.got_word((char*)word, wordIndex++, 0); + } + } + + if (head.length() < max_head_length) + { + // + // Characters that are not part of a word + // + if (*position && isspace(*position)) + { + // + // Reduce all multiple whitespace to a single space + // + if (!in_space) + { + head << ' '; + } + in_space = 1; + } + else + { + head << *position; + in_space = 0; + } + } + if (*position) + position++; + } + retriever.got_head((char*)head); +} + + |