1 files changed, 614 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc
new file mode 100644
index 00000000..d967ba0b
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htdig/ExternalParser.cc
@@ -0,0 +1,614 @@
+//
+// ExternalParser.cc
+//
+// ExternalParser: Implementation of ExternalParser
+//                 Allows external programs to parse unknown document formats.
+//                 The parser is expected to return the document in a 
+//                 specific format. The format is documented 
+//                 in http://www.htdig.org/attrs.html#external_parsers
+//
+// Part of the ht://Dig package   <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: ExternalParser.cc,v 1.29 2004/05/28 13:15:14 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "ExternalParser.h"
+#include "HTML.h"
+#include "Plaintext.h"
+#include "htdig.h"
+#include "htString.h"
+#include "QuotedStringList.h"
+#include "URL.h"
+#include "Dictionary.h"
+#include "good_strtok.h"
+
+#include <ctype.h>
+#include <stdio.h>
+
+#ifndef _MSC_VER /* _WIN32 */
+#include <unistd.h>
+#endif
+
+#include <stdlib.h>
+#ifdef HAVE_WAIT_H
+#include <wait.h>
+#elif HAVE_SYS_WAIT_H
+#include <sys/wait.h>
+#endif
+
+#ifdef _MSC_VER /* _WIN32 */
+#include <process.h>
+#endif
+
+
+#include "defaults.h"
+
+static Dictionary	*parsers = 0;	// MIME type -> parser command (String *), filled in by canParse()
+static Dictionary	*toTypes = 0;	// MIME type -> type the parser converts to (String *, "" if none)
+extern String		configFile;	// defined elsewhere; handed to the parser as its last argument
+
+//*****************************************************************************
+// ExternalParser::ExternalParser(char *contentType)
+//   Remember the content type and, when an external parser is configured
+//   for it, the command line of that parser.
+//
+ExternalParser::ExternalParser(char *contentType)
+{
+    if (canParse(contentType))
+    {
+	// Normalize exactly as canParse() does: lowercase, no ";parameters".
+	String	mime = contentType;
+	mime.lowercase();
+	int	sep = mime.indexOf(';');
+	if (sep != -1)
+	  mime = mime.sub(0, sep).get();
+
+	currentParser = ((String *)parsers->Find(mime))->get();
+    }
+    ExternalParser::contentType = contentType;
+}
+
+
+//*****************************************************************************
+// ExternalParser::~ExternalParser()
+//   Empty body: the destructor itself performs no cleanup.
+ExternalParser::~ExternalParser()
+{
+}
+
+
+//*****************************************************************************
+// int ExternalParser::readLine(FILE *in, String &line)
+//   Read one line of any length from `in` into `line`, without the
+//   trailing newline.  Returns 1 if a complete line was read, otherwise
+//   non-zero only if some text was read before fgets() stopped.
+//
+int
+ExternalParser::readLine(FILE *in, String &line)
+{
+    char	buffer[2048];
+    int		length;
+
+    line = 0;	// reset; assigning 0 presumably empties the String
+    while (fgets(buffer, sizeof(buffer), in))
+    {
+	length = strlen(buffer);
+	line << buffer;
+	//
+	// fgets() can hand back an empty string when the data begins with
+	// a NUL byte, so never look at buffer[length - 1] unless length > 0.
+	//
+	if (length > 0 && buffer[length - 1] == '\n')
+	{
+	    // A full line has been read.  Return it.
+	    line.chop('\n');
+	    return 1;
+	}
+	//
+	// Only a partial line was read; go round and read some more.
+	//
+    }
+    return line.length() > 0;
+}
+
+
+//*****************************************************************************
+// int ExternalParser::canParse(char *contentType)
+//   Tells whether the external_parsers attribute lists a parser for
+//   contentType.  The first call builds the lookup tables from it.
+//
+int
+ExternalParser::canParse(char *contentType)
+{
+    HtConfiguration	*config = HtConfiguration::config();
+
+    if (parsers == 0)
+    {
+	parsers = new Dictionary();
+	toTypes = new Dictionary();
+
+	// The attribute is a flat list of "type[->newtype]" / command pairs.
+	QuotedStringList	entries(config->Find("external_parsers"), " \t");
+	for (int idx = 0; entries[idx]; idx += 2)
+	{
+	    String	source = entries[idx];
+	    String	target = "";
+	    int		arrow = source.indexOf("->");
+	    if (arrow != -1)
+	    {
+		target = source.sub(arrow + 2).get();
+		source = source.sub(0, arrow).get();
+	    }
+	    source.lowercase();
+	    int		semi = source.indexOf(';');
+	    if (semi != -1)
+		source = source.sub(0, semi).get();
+
+	    parsers->Add(source, new String(entries[idx + 1]));
+	    toTypes->Add(source, new String(target));
+	}
+    }
+
+    // Look the type up lowercased and without any ";parameter" suffix.
+    String	wanted = contentType;
+    wanted.lowercase();
+    int		cut = wanted.indexOf(';');
+    if (cut != -1)
+	wanted = wanted.sub(0, cut).get();
+    return parsers->Exists(wanted);
+}
+
+//*****************************************************************************
+// void ExternalParser::parse(Retriever &retriever, URL &base)
+//   Run the configured external parser on the current contents.  They
+//   are written to a temporary file that the child process gets as its
+//   standard input and by name as an argument.  Its output is read line
+//   by line and handed to the retriever, unless a conversion type was
+//   configured: then the output is collected and parsed as that type.
+//
+void
+ExternalParser::parse(Retriever &retriever, URL &base)
+{
+// NEAL - ENABLE/REWRITE THIS ASAP FOR WIN32
+#ifndef _MSC_VER /* _WIN32 */
+	HtConfiguration* config= HtConfiguration::config();
+    if (contents == 0 || contents->length() == 0 ||
+	currentParser.length() == 0)
+    {
+	return;
+    }
+
+    // Write the contents to a temporary file.
+    String      path = getenv("TMPDIR");
+    int		fd;
+    if (path.length() == 0)
+      path = "/tmp";
+#ifndef HAVE_MKSTEMP
+    path << "/htdext." << getpid(); // This is unfortunately predictable
+    // O_CREAT requires the mode argument; keep the file private.
+#ifdef O_BINARY
+    fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL|O_BINARY, 0600);
+#else
+    fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL, 0600);
+#endif
+#else
+    path << "/htdex.XXXXXX";
+    fd = mkstemp((char*)path);
+    // can we force binary mode somehow under Cygwin, if it has mkstemp?
+#endif
+    if (fd < 0)
+    {
+      if (debug)
+	cout << "External parser error: Can't create temp file "
+	     << (char *)path << endl;
+      return;
+    }
+
+    // Do not run the parser on a partially written document.
+    int		written = write(fd, contents->get(), contents->length());
+    close(fd);
+    if (written != contents->length())
+    {
+      if (debug)
+	cout << "External parser error: Can't write temp file " << (char *)path << endl;
+      unlink((char*)path);
+      return;
+    }
+
+    String	line;
+    char	*token1, *token2, *token3;
+    int		loc = 0, hd = 0;
+    URL		url;
+    String mime = contentType;
+    mime.lowercase();
+    int	sep = mime.indexOf(';');
+    if (sep != -1)
+      mime = mime.sub(0, sep).get();
+    String	convertToType = ((String *)toTypes->Find(mime))->get();
+    int		get_hdr = (convertToType.nocase_compare("user-defined") == 0);
+    int		get_file = (convertToType.length() != 0);
+    String	newcontent;
+
+    StringList	cpargs(currentParser);
+    char   **parsargs = new char * [cpargs.Count() + 5];
+    int    argi;
+    for (argi = 0; argi < cpargs.Count(); argi++)
+	parsargs[argi] = (char *)cpargs[argi];
+    parsargs[argi++] = path.get();
+    parsargs[argi++] = contentType.get();
+    parsargs[argi++] = (char *)base.get().get();
+    parsargs[argi++] = configFile.get();
+    parsargs[argi++] = 0;
+
+    int    stdout_pipe[2];
+    int	   fork_result = -1;
+    int	   fork_try;
+
+    if (pipe(stdout_pipe) == -1)
+    {
+      if (debug)
+	cout << "External parser error: Can't create pipe!" << endl;
+      unlink((char*)path);
+      delete [] parsargs;
+      return;
+    }
+
+    for (fork_try = 4; --fork_try >= 0;)
+    {
+      fork_result = fork(); // Fork so we can execute in the child process
+      if (fork_result != -1)
+	break;
+      if (fork_try)
+	sleep(3);
+    }
+    if (fork_result == -1)
+    {
+      if (debug)
+	cout << "Fork Failure in ExternalParser" << endl;
+      close(stdout_pipe[0]);	// do not leak the pipe
+      close(stdout_pipe[1]);
+      unlink((char*)path);
+      delete [] parsargs;
+      return;
+    }
+
+    if (fork_result == 0) // Child process
+    {
+	// Route STDOUT into the pipe; dup2() closes the old STDOUT itself.
+	if (dup2(stdout_pipe[1], STDOUT_FILENO) == -1)
+	    _exit(EXIT_FAILURE);
+	close(stdout_pipe[0]);
+	close(stdout_pipe[1]);
+	close(STDIN_FILENO); // Close STDIN to replace with file
+	open((char*)path, O_RDONLY);
+	// Call External Parser
+	execv(parsargs[0], parsargs);
+	// Only reached if execv() failed.  _exit() rather than exit(), so
+	// the child does not flush stdio buffers copied from the parent.
+	_exit(EXIT_FAILURE);
+    }
+
+    // Parent Process
+    delete [] parsargs;
+    close(stdout_pipe[1]); // Close STDOUT for writing
+#ifdef O_BINARY
+    FILE *input = fdopen(stdout_pipe[0], "rb");
+#else
+    FILE *input = fdopen(stdout_pipe[0], "r");
+#endif
+    int rpid, status;
+    if (input == NULL)
+    {
+      if (debug)
+	cout << "Fdopen Failure in ExternalParser" << endl;
+      close(stdout_pipe[0]); // release the pipe, then reap the child
+      while ((rpid = wait(&status)) != fork_result && rpid != -1)
+	;
+      unlink((char*)path);
+      return;
+    }
+
+    while ((!get_file || get_hdr) && readLine(input, line))
+    {
+	if (get_hdr)
+	{
+	    line.chop('\r');
+	    if (line.length() == 0)
+		get_hdr = false;
+	    else if (mystrncasecmp((char*)line, "content-type:", 13) == 0)
+	    {
+		token1 = line.get() + 13;
+		while (*token1 && isspace((unsigned char)*token1))
+		    token1++;
+		token1 = strtok(token1, "\n\t");
+		convertToType = token1;
+	    }
+	    continue;
+	}
+#ifdef O_BINARY
+	line.chop('\r');
+#endif
+	token1 = strtok(line, "\t");
+	token2 = NULL;
+	token3 = NULL;
+	switch (token1 ? *token1 : '\0')	// no token at all: "unknown field"
+	{
+	    case 'w':	// word
+		token1 = strtok(0, "\t");
+		if (token1 != NULL)
+		  token2 = strtok(0, "\t");
+		if (token2 != NULL)
+		  token3 = strtok(0, "\t");
+		if (token1 != NULL && token2 != NULL && token3 != NULL &&
+			(loc = atoi(token2)) >= 0 &&
+			(hd = atoi(token3)) >= 0 && hd < 12)
+		  retriever.got_word(token1, loc, hd);
+		else
+		  cerr<< "External parser error: expected word in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+		break;
+
+	    case 'u':	// href
+		token1 = strtok(0, "\t");
+		if (token1 != NULL)
+		  token2 = strtok(0, "\t");
+		if (token1 != NULL && token2 != NULL)
+		{
+		  url.parse(token1);
+		  url.hopcount(base.hopcount() + 1);
+		  retriever.got_href(url, token2);
+		}
+		else
+		  cerr<< "External parser error: expected URL in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+		break;
+
+	    case 't':	// title
+		token1 = strtok(0, "\t");
+		if (token1 != NULL)
+		  retriever.got_title(token1);
+		else
+		  cerr<< "External parser error: expected title in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+		break;
+
+	    case 'h':	// head
+		token1 = strtok(0, "\t");
+		if (token1 != NULL)
+		  retriever.got_head(token1);
+		else
+		  cerr<< "External parser error: expected text in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+		break;
+
+	    case 'a':	// anchor
+		token1 = strtok(0, "\t");
+		if (token1 != NULL)
+		  retriever.got_anchor(token1);
+		else
+		  cerr<< "External parser error: expected anchor in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+		break;
+
+	    case 'i':	// image url
+		token1 = strtok(0, "\t");
+		if (token1 != NULL)
+		  retriever.got_image(token1);
+		else
+		  cerr<< "External parser error: expected image URL in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+		break;
+
+	    case 'm':	// meta
+	      {
+		// Using good_strtok means we can accept empty fields.
+		// The fields start right after the tab that strtok()
+		// overwrote; that tab only exists if the tag is the single
+		// character "m" and the line goes on after it.
+		char *httpEquiv = NULL, *name = NULL, *content = NULL;
+		if (token1[1] == '\0' && token1 + 1 < line.get() + line.length())
+		{
+		  httpEquiv = good_strtok(token1+2, '\t');
+		  name = good_strtok(0, '\t');
+		  content = good_strtok(0, '\t');
+		}
+
+		if (httpEquiv != NULL && name != NULL && content != NULL)
+		{
+		  // This duplicates logic from HTML.cc, which has other
+		  // chores too and offers no obvious point where to split
+		  // out a common shared function (or class).
+		  static StringMatch *keywordsMatch = 0;
+		  if (!keywordsMatch)
+		  {
+			StringList kn(config->Find("keywords_meta_tag_names"), " \t");
+			keywordsMatch = new StringMatch();
+			keywordsMatch->IgnoreCase();
+			keywordsMatch->Pattern(kn.Join('|'));
+		  }
+		  static StringMatch *descriptionMatch = 0;
+		  if (!descriptionMatch)
+		  {
+			StringList dn(config->Find("description_meta_tag_names"), " \t");
+			descriptionMatch = new StringMatch();
+			descriptionMatch->IgnoreCase();
+			descriptionMatch->Pattern(dn.Join('|'));
+		  }
+		  static StringMatch *metadatetags = 0;
+		  if (!metadatetags)
+		  {
+			metadatetags = new StringMatch();
+			metadatetags->IgnoreCase();
+			metadatetags->Pattern("date|dc.date|dc.date.created|dc.date.modified");
+		  }
+
+		  // <URL:http://www.w3.org/MarkUp/html-spec/html-spec_5.html#SEC5.2.5> 
+		  // says that the "name" attribute defaults to
+		  // the http-equiv attribute if empty.
+		  if (*name == '\0')
+		    name = httpEquiv;
+
+		  if (*httpEquiv != '\0')
+		  {
+		    // <META HTTP-EQUIV=REFRESH case
+		    if (mystrcasecmp(httpEquiv, "refresh") == 0
+			&& *content != '\0')
+		    {
+		      char *q = (char*)mystrcasestr(content, "url");
+		      if (q && *q)
+		      {
+			q += 3; // skipping "URL"
+			while (*q && ((*q == '=') || isspace((unsigned char)*q))) q++;
+			char *qq = q;
+			while (*qq && (*qq != ';') && (*qq != '"') &&
+			       !isspace((unsigned char)*qq))qq++;
+			*qq = 0;
+			URL href(q, base);
+			// I don't know why anyone would do this, but hey...
+			retriever.got_href(href, "");
+		      }
+		    }
+		  }
+
+		  // Now check for <meta name=...  content=...> tags that
+		  // fly with any reasonable DTD out there
+		  if (*name != '\0' && *content != '\0')
+		  {
+		    if (keywordsMatch->CompareWord(name))
+		    {
+			int wordindex = 1;
+			addKeywordString (retriever, content, wordindex);
+		    }
+		    if (metadatetags->CompareWord(name) &&
+					config->Boolean("use_doc_date", 0))
+		    {
+		      retriever.got_time(content);
+		    }
+		    else if (mystrcasecmp(name, "author") == 0)
+		    {
+			int wordindex = 1;
+			retriever.got_author(content);
+			addString (retriever, content, wordindex, 11);
+		    }
+		    else if (mystrcasecmp(name, "htdig-email") == 0)
+		    {
+		      retriever.got_meta_email(content);
+		    }
+		    else if (mystrcasecmp(name, "htdig-notification-date") == 0)
+		    {
+		      retriever.got_meta_notification(content);
+		    }
+		    else if (mystrcasecmp(name, "htdig-email-subject") == 0)
+		    {
+		      retriever.got_meta_subject(content);
+		    }
+		    else if (descriptionMatch->CompareWord(name)
+			     && strlen(content) != 0)
+		    {
+		      // We need to do two things. First grab the description
+		      String meta_dsc = content;
+		      if (meta_dsc.length() > max_meta_description_length)
+			meta_dsc = meta_dsc.sub(0, max_meta_description_length).get();
+		      if (debug > 1)
+			cout << "META Description: " << content << endl;
+		      retriever.got_meta_dsc((char*)meta_dsc);
+
+		      // Now add the words to the word list
+		      // (slot 10 is the new slot for this)
+		      int wordindex = 1;
+		      addString (retriever, content, wordindex, 10);
+		    }
+		  }
+		}
+		else
+		  cerr<< "External parser error: expected metadata in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+		break;
+	      }
+
+	    default:
+		  cerr<< "External parser error: unknown field in line "<<line<<"\n" << " URL: " << base.get() << "\n";
+		break;
+	}
+    } // while(readLine)
+    if (get_file)
+    {
+	if (!canParse(convertToType) &&
+	    mystrncasecmp((char*)convertToType, "text/", 5) != 0)
+	{
+	    if (mystrcasecmp((char*)convertToType, "user-defined") == 0)
+		cerr << "External parser error: no Content-Type given\n";
+	    else
+		cerr << "External parser error: can't parse Content-Type \""
+		     << convertToType << "\"\n";
+	    cerr << " URL: " << base.get() << "\n";
+	}
+	else
+	{
+	    char	buffer[2048];
+	    int		length;
+	    int		nbytes = config->Value("max_doc_size");
+	    while (nbytes > 0 &&
+			(length = fread(buffer, 1, sizeof(buffer), input)) > 0)
+	    {
+		nbytes -= length;
+		if (nbytes < 0)
+		    length += nbytes;
+		newcontent.append(buffer, length);
+	    }
+	}
+    }
+    fclose(input);
+    // close(stdout_pipe[0]); // This is closed for us by the fclose()
+    while ((rpid = wait(&status)) != fork_result && rpid != -1)
+	;
+    unlink((char*)path);
+
+    if (newcontent.length() > 0)
+    {
+	static HTML			*html = 0;
+	static Plaintext		*plaintext = 0;
+	Parsable			*parsable = 0;
+
+	contentType = convertToType;
+	if (canParse(contentType))
+	{
+	    // Look the parser up under the same normalized key canParse()
+	    // uses; the raw type may carry upper case or ";parameters".
+	    mime = contentType;
+	    mime.lowercase();
+	    sep = mime.indexOf(';');
+	    if (sep != -1)
+	      mime = mime.sub(0, sep).get();
+	    currentParser = ((String *)parsers->Find(mime))->get();
+	    parsable = this;
+	}
+	else if (mystrncasecmp((char*)contentType, "text/html", 9) == 0)
+	{
+	    if (!html)
+		html = new HTML();
+	    parsable = html;
+	}
+	else if (mystrncasecmp((char*)contentType, "text/plain", 10) == 0)
+	{
+	    if (!plaintext)
+		plaintext = new Plaintext();
+	    parsable = plaintext;
+	}
+	else 
+	{
+	    if (!plaintext)
+		plaintext = new Plaintext();
+	    parsable = plaintext;
+	    if (debug)
+		cout << "External parser error: \"" << contentType <<
+			"\" not a recognized type.  Assuming text/plain\n";
+	}
+	parsable->setContents(newcontent.get(), newcontent.length());
+	parsable->parse(retriever, base);
+    }
+#endif //ifndef _MSC_VER /* _WIN32 */
+}
+
+