summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc')
-rw-r--r--debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc614
1 files changed, 614 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc
new file mode 100644
index 00000000..d967ba0b
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc
@@ -0,0 +1,614 @@
+//
+// ExternalParser.cc
+//
+// ExternalParser: Implementation of ExternalParser
+// Allows external programs to parse unknown document formats.
+// The parser is expected to return the document in a
+// specific format. The format is documented
+// in http://www.htdig.org/attrs.html#external_parser
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: ExternalParser.cc,v 1.29 2004/05/28 13:15:14 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "ExternalParser.h"
+#include "HTML.h"
+#include "Plaintext.h"
+#include "htdig.h"
+#include "htString.h"
+#include "QuotedStringList.h"
+#include "URL.h"
+#include "Dictionary.h"
+#include "good_strtok.h"
+
+#include <ctype.h>
+#include <stdio.h>
+
+#ifndef _MSC_VER /* _WIN32 */
+#include <unistd.h>
+#endif
+
+#include <stdlib.h>
+#ifdef HAVE_WAIT_H
+#include <wait.h>
+#elif HAVE_SYS_WAIT_H
+#include <sys/wait.h>
+#endif
+
+#ifdef _MSC_VER /* _WIN32 */
+#include <process.h>
+#endif
+
+
+#include "defaults.h"
+
+static Dictionary *parsers = 0;
+static Dictionary *toTypes = 0;
+extern String configFile;
+
+//*****************************************************************************
+// ExternalParser::ExternalParser(char *contentType)
+//
+ExternalParser::ExternalParser(char *contentType)
+{
+ String mime;
+ int sep;
+
+ if (canParse(contentType))
+ {
+ String mime = contentType;
+ mime.lowercase();
+ sep = mime.indexOf(';');
+ if (sep != -1)
+ mime = mime.sub(0, sep).get();
+
+ currentParser = ((String *)parsers->Find(mime))->get();
+ }
+ ExternalParser::contentType = contentType;
+}
+
+
+//*****************************************************************************
+// ExternalParser::~ExternalParser()
+//
+ExternalParser::~ExternalParser()
+{
+}
+
+
+//*****************************************************************************
+// int ExternalParser::readLine(FILE *in, String &line)
+//
+int
+ExternalParser::readLine(FILE *in, String &line)
+{
+ char buffer[2048];
+ int length;
+
+ line = 0; // read(in, buffer, sizeof(buffer)
+ while (fgets(buffer, sizeof(buffer), in))
+ {
+ length = strlen(buffer);
+ if (buffer[length - 1] == '\n')
+ {
+ //
+ // A full line has been read. Return it.
+ //
+ line << buffer;
+ line.chop('\n');
+ return 1;
+ }
+ else
+ {
+ //
+ // Only a partial line was read. Append it to the line
+ // and read some more.
+ //
+ line << buffer;
+ }
+ }
+ return line.length() > 0;
+}
+
+
+//*****************************************************************************
+// int ExternalParser::canParse(char *contentType)
+//
+int
+ExternalParser::canParse(char *contentType)
+{
+ HtConfiguration* config= HtConfiguration::config();
+ int sep;
+
+ if (!parsers)
+ {
+ parsers = new Dictionary();
+ toTypes = new Dictionary();
+
+ QuotedStringList qsl(config->Find("external_parsers"), " \t");
+ String from, to;
+ int i;
+
+ for (i = 0; qsl[i]; i += 2)
+ {
+ from = qsl[i];
+ to = "";
+ sep = from.indexOf("->");
+ if (sep != -1)
+ {
+ to = from.sub(sep+2).get();
+ from = from.sub(0, sep).get();
+ }
+ from.lowercase();
+ sep = from.indexOf(';');
+ if (sep != -1)
+ from = from.sub(0, sep).get();
+
+ parsers->Add(from, new String(qsl[i + 1]));
+ toTypes->Add(from, new String(to));
+ }
+ }
+
+ String mime = contentType;
+ mime.lowercase();
+ sep = mime.indexOf(';');
+ if (sep != -1)
+ mime = mime.sub(0, sep).get();
+ return parsers->Exists(mime);
+}
+
+//*****************************************************************************
+// void ExternalParser::parse(Retriever &retriever, URL &base)
+//
+void
+ExternalParser::parse(Retriever &retriever, URL &base)
+{
+// NEAL - ENABLE/REWRITE THIS ASAP FOR WIN32
+#ifndef _MSC_VER /* _WIN32 */
+ HtConfiguration* config= HtConfiguration::config();
+ if (contents == 0 || contents->length() == 0 ||
+ currentParser.length() == 0)
+ {
+ return;
+ }
+
+ //
+ // Write the contents to a temporary file.
+ //
+ String path = getenv("TMPDIR");
+ int fd;
+ if (path.length() == 0)
+ path = "/tmp";
+#ifndef HAVE_MKSTEMP
+ path << "/htdext." << getpid(); // This is unfortunately predictable
+
+#ifdef O_BINARY
+ fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL|O_BINARY);
+#else
+ fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL);
+#endif
+#else
+ path << "/htdex.XXXXXX";
+ fd = mkstemp((char*)path);
+ // can we force binary mode somehow under Cygwin, if it has mkstemp?
+#endif
+ if (fd < 0)
+ {
+ if (debug)
+ cout << "External parser error: Can't create temp file "
+ << (char *)path << endl;
+ return;
+ }
+
+ write(fd, contents->get(), contents->length());
+ close(fd);
+
+// unsigned int minimum_word_length = config->Value("minimum_word_length", 3);
+ String line;
+ char *token1, *token2, *token3;
+ int loc = 0, hd = 0;
+ URL url;
+ String mime = contentType;
+ mime.lowercase();
+ int sep = mime.indexOf(';');
+ if (sep != -1)
+ mime = mime.sub(0, sep).get();
+ String convertToType = ((String *)toTypes->Find(mime))->get();
+ int get_hdr = (convertToType.nocase_compare("user-defined") == 0);
+ int get_file = (convertToType.length() != 0);
+ String newcontent;
+
+ StringList cpargs(currentParser);
+ char **parsargs = new char * [cpargs.Count() + 5];
+ int argi;
+ for (argi = 0; argi < cpargs.Count(); argi++)
+ parsargs[argi] = (char *)cpargs[argi];
+ parsargs[argi++] = path.get();
+ parsargs[argi++] = contentType.get();
+ parsargs[argi++] = (char *)base.get().get();
+ parsargs[argi++] = configFile.get();
+ parsargs[argi++] = 0;
+
+ int stdout_pipe[2];
+ int fork_result = -1;
+ int fork_try;
+
+ if (pipe(stdout_pipe) == -1)
+ {
+ if (debug)
+ cout << "External parser error: Can't create pipe!" << endl;
+ unlink((char*)path);
+ delete [] parsargs;
+ return;
+ }
+
+ for (fork_try = 4; --fork_try >= 0;)
+ {
+ fork_result = fork(); // Fork so we can execute in the child process
+ if (fork_result != -1)
+ break;
+ if (fork_try)
+ sleep(3);
+ }
+ if (fork_result == -1)
+ {
+ if (debug)
+ cout << "Fork Failure in ExternalParser" << endl;
+ unlink((char*)path);
+ delete [] parsargs;
+ return;
+ }
+
+ if (fork_result == 0) // Child process
+ {
+ close(STDOUT_FILENO); // Close handle STDOUT to replace with pipe
+ dup(stdout_pipe[1]);
+ close(stdout_pipe[0]);
+ close(stdout_pipe[1]);
+ close(STDIN_FILENO); // Close STDIN to replace with file
+ open((char*)path, O_RDONLY);
+
+ // Call External Parser
+ execv(parsargs[0], parsargs);
+
+ exit(EXIT_FAILURE);
+ }
+
+ // Parent Process
+ delete [] parsargs;
+ close(stdout_pipe[1]); // Close STDOUT for writing
+#ifdef O_BINARY
+ FILE *input = fdopen(stdout_pipe[0], "rb");
+#else
+ FILE *input = fdopen(stdout_pipe[0], "r");
+#endif
+ if (input == NULL)
+ {
+ if (debug)
+ cout << "Fdopen Failure in ExternalParser" << endl;
+ unlink((char*)path);
+ return;
+ }
+
+ while ((!get_file || get_hdr) && readLine(input, line))
+ {
+ if (get_hdr)
+ {
+ line.chop('\r');
+ if (line.length() == 0)
+ get_hdr = false;
+ else if (mystrncasecmp((char*)line, "content-type:", 13) == 0)
+ {
+ token1 = line.get() + 13;
+ while (*token1 && isspace(*token1))
+ token1++;
+ token1 = strtok(token1, "\n\t");
+ convertToType = token1;
+ }
+ continue;
+ }
+#ifdef O_BINARY
+ line.chop('\r');
+#endif
+ token1 = strtok(line, "\t");
+ if (token1 == NULL)
+ token1 = "";
+ token2 = NULL;
+ token3 = NULL;
+ switch (*token1)
+ {
+ case 'w': // word
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ token2 = strtok(0, "\t");
+ if (token2 != NULL)
+ token3 = strtok(0, "\t");
+ if (token1 != NULL && token2 != NULL && token3 != NULL &&
+ (loc = atoi(token2)) >= 0 &&
+ (hd = atoi(token3)) >= 0 && hd < 12)
+ retriever.got_word(token1, loc, hd);
+ else
+ cerr<< "External parser error: expected word in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 'u': // href
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ token2 = strtok(0, "\t");
+ if (token1 != NULL && token2 != NULL)
+ {
+ url.parse(token1);
+ url.hopcount(base.hopcount() + 1);
+ retriever.got_href(url, token2);
+ }
+ else
+ cerr<< "External parser error: expected URL in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 't': // title
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ retriever.got_title(token1);
+ else
+ cerr<< "External parser error: expected title in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 'h': // head
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ retriever.got_head(token1);
+ else
+ cerr<< "External parser error: expected text in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 'a': // anchor
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ retriever.got_anchor(token1);
+ else
+ cerr<< "External parser error: expected anchor in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 'i': // image url
+ token1 = strtok(0, "\t");
+ if (token1 != NULL)
+ retriever.got_image(token1);
+ else
+ cerr<< "External parser error: expected image URL in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+
+ case 'm': // meta
+ {
+ // Using good_strtok means we can accept empty
+ // fields.
+ char *httpEquiv = good_strtok(token1+2, '\t');
+ char *name = good_strtok(0, '\t');
+ char *content = good_strtok(0, '\t');
+
+ if (httpEquiv != NULL && name != NULL && content != NULL)
+ {
+ // It would be preferable if we could share
+ // this part with HTML.cc, but it has other
+ // chores too, and I do not see a point where to
+ // split it up to get a common shared function
+ // (or class). This should not stop anybody from
+ // finding a better solution.
+ // For now, there is duplicated code.
+ static StringMatch *keywordsMatch = 0;
+ if (!keywordsMatch)
+ {
+ StringList kn(config->Find("keywords_meta_tag_names"), " \t");
+ keywordsMatch = new StringMatch();
+ keywordsMatch->IgnoreCase();
+ keywordsMatch->Pattern(kn.Join('|'));
+ }
+ static StringMatch *descriptionMatch = 0;
+ if (!descriptionMatch)
+ {
+ StringList dn(config->Find("description_meta_tag_names"), " \t");
+ descriptionMatch = new StringMatch();
+ descriptionMatch->IgnoreCase();
+ descriptionMatch->Pattern(dn.Join('|'));
+ }
+ static StringMatch *metadatetags = 0;
+ if (!metadatetags)
+ {
+ metadatetags = new StringMatch();
+ metadatetags->IgnoreCase();
+ metadatetags->Pattern("date|dc.date|dc.date.created|dc.data.modified");
+ }
+
+ // <URL:http://www.w3.org/MarkUp/html-spec/html-spec_5.html#SEC5.2.5>
+ // says that the "name" attribute defaults to
+ // the http-equiv attribute if empty.
+ if (*name == '\0')
+ name = httpEquiv;
+
+ if (*httpEquiv != '\0')
+ {
+ // <META HTTP-EQUIV=REFRESH case
+ if (mystrcasecmp(httpEquiv, "refresh") == 0
+ && *content != '\0')
+ {
+ char *q = (char*)mystrcasestr(content, "url");
+ if (q && *q)
+ {
+ q += 3; // skiping "URL"
+ while (*q && ((*q == '=') || isspace(*q))) q++;
+ char *qq = q;
+ while (*qq && (*qq != ';') && (*qq != '"') &&
+ !isspace(*qq))qq++;
+ *qq = 0;
+ URL href(q, base);
+ // I don't know why anyone would do this, but hey...
+ retriever.got_href(href, "");
+ }
+ }
+ }
+
+ //
+ // Now check for <meta name=... content=...> tags that
+ // fly with any reasonable DTD out there
+ //
+ if (*name != '\0' && *content != '\0')
+ {
+ if (keywordsMatch->CompareWord(name))
+ {
+ int wordindex = 1;
+ addKeywordString (retriever, content, wordindex);
+// // can this be merged with Parser::addKeywordString ?
+// char *w = strtok(content, " ,\t\r");
+// while (w)
+// {
+// if (strlen(w) >= minimum_word_length)
+// retriever.got_word(w, 1, 9);
+// w = strtok(0, " ,\t\r");
+// }
+ }
+ if (metadatetags->CompareWord(name) &&
+ config->Boolean("use_doc_date", 0))
+ {
+ retriever.got_time(content);
+ }
+ else if (mystrcasecmp(name, "author") == 0)
+ {
+ int wordindex = 1;
+ retriever.got_author(content);
+ addString (retriever, content, wordindex, 11);
+ }
+ else if (mystrcasecmp(name, "htdig-email") == 0)
+ {
+ retriever.got_meta_email(content);
+ }
+ else if (mystrcasecmp(name, "htdig-notification-date") == 0)
+ {
+ retriever.got_meta_notification(content);
+ }
+ else if (mystrcasecmp(name, "htdig-email-subject") == 0)
+ {
+ retriever.got_meta_subject(content);
+ }
+ else if (descriptionMatch->CompareWord(name)
+ && strlen(content) != 0)
+ {
+ //
+ // We need to do two things. First grab the description
+ //
+ String meta_dsc = content;
+
+ if (meta_dsc.length() > max_meta_description_length)
+ meta_dsc = meta_dsc.sub(0, max_meta_description_length).get();
+ if (debug > 1)
+ cout << "META Description: " << content << endl;
+ retriever.got_meta_dsc((char*)meta_dsc);
+
+ //
+ // Now add the words to the word list
+ // (slot 10 is the new slot for this)
+ //
+ int wordindex = 1;
+ addString (retriever, content, wordindex, 10);
+// // can this be merged with Parser::addString ?
+// char *w = strtok(content, " \t\r");
+// while (w)
+// {
+// if (strlen(w) >= minimum_word_length)
+// retriever.got_word(w, 1, 10);
+// w = strtok(0, " \t\r");
+// }
+ }
+ }
+ }
+ else
+ cerr<< "External parser error: expected metadata in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+ }
+
+ default:
+ cerr<< "External parser error: unknown field in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+ break;
+ }
+ } // while(readLine)
+ if (get_file)
+ {
+ if (!canParse(convertToType) &&
+ mystrncasecmp((char*)convertToType, "text/", 5) != 0)
+ {
+ if (mystrcasecmp((char*)convertToType, "user-defined") == 0)
+ cerr << "External parser error: no Content-Type given\n";
+ else
+ cerr << "External parser error: can't parse Content-Type \""
+ << convertToType << "\"\n";
+ cerr << " URL: " << base.get() << "\n";
+ }
+ else
+ {
+ char buffer[2048];
+ int length;
+ int nbytes = config->Value("max_doc_size");
+ while (nbytes > 0 &&
+ (length = fread(buffer, 1, sizeof(buffer), input)) > 0)
+ {
+ nbytes -= length;
+ if (nbytes < 0)
+ length += nbytes;
+ newcontent.append(buffer, length);
+ }
+ }
+ }
+ fclose(input);
+ // close(stdout_pipe[0]); // This is closed for us by the fclose()
+ int rpid, status;
+ while ((rpid = wait(&status)) != fork_result && rpid != -1)
+ ;
+ unlink((char*)path);
+
+ if (newcontent.length() > 0)
+ {
+ static HTML *html = 0;
+ static Plaintext *plaintext = 0;
+ Parsable *parsable = 0;
+
+ contentType = convertToType;
+ if (canParse(contentType))
+ {
+ currentParser = ((String *)parsers->Find(contentType))->get();
+ parsable = this;
+ }
+ else if (mystrncasecmp((char*)contentType, "text/html", 9) == 0)
+ {
+ if (!html)
+ html = new HTML();
+ parsable = html;
+ }
+ else if (mystrncasecmp((char*)contentType, "text/plain", 10) == 0)
+ {
+ if (!plaintext)
+ plaintext = new Plaintext();
+ parsable = plaintext;
+ }
+ else
+ {
+ if (!plaintext)
+ plaintext = new Plaintext();
+ parsable = plaintext;
+ if (debug)
+ cout << "External parser error: \"" << contentType <<
+ "\" not a recognized type. Assuming text/plain\n";
+ }
+ parsable->setContents(newcontent.get(), newcontent.length());
+ parsable->parse(retriever, base);
+ }
+#endif //ifndef _MSC_VER /* _WIN32 */
+}
+
+