3 files changed, 307 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/README b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/README
new file mode 100644
index 00000000..4ec0f6ab
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/README
@@ -0,0 +1,38 @@
+
+>    Subject: htdig: HTDIG: Searching Word files
+>         To: htdig@sdsu.edu
+>       From: Richard Jones <rjones@imcl.com>
+>       Date: Tue, 15 Jul 1997 12:44:03 +0100
+>
+> I'm currently trying to hack together a script to search
+> Word files. I have a little program called `catdoc' (attached)
+> which takes Word files and turns them into passable text files.
+> What I did was write a shell script around this called
+> `htparsedoc' (also attached) and add it as an external
+> parser:
+> 
+>         --- /usr/local/lib/htdig/conf/htdig.conf ---
+> 
+>         # External parser for Word documents.
+>         external_parsers:       "application/msword"
+> "/usr/local/lib/htdig/bin/htparsedoc"
+> 
+> This script produces output like this:
+> 
+>         t Word document http://annexia.imcl.com/test/comm.doc
+>         w INmEDIA 1 -
+>         w Investment 2 -
+>         w Ltd 3 -
+>         w Applications 4 -
+>         w Subproject 5 -
+>         w Terms 6 -
+>         w of 7 -
+>  [...]
+>         w Needed 994 -
+>         w Tbd 995 -
+>         w Resources 996 -
+>         w Needed 997 -
+>         w Tbd 998 -
+>         w i 1000 -
+> 
+
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/catdoc.c b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/catdoc.c
new file mode 100644
index 00000000..93bf02f8
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/catdoc.c
@@ -0,0 +1,197 @@
+/* Mail headers that accompanied this source:
+ * From VDiGiampietro@sansalvo.marelli.it Fri Jul  3 09:52:34 1998
+ * Date: Fri, 3 Jul 1998 17:20:50 +0200 (MET DST)
+ * From: Valerio Di Giampietro <VDiGiampietro@sansalvo.marelli.it>
+ * To: htdig@sdsu.edu
+ * Subject: htdig: Searching Word files */
+/* catdoc.c version 0.3 */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define TEXT_WIDTH 72
+/* #define LATIN1 */
+/* enable this define, if you don't want cyrillic code page translations */
+
+unsigned char specs[]={7, /* table column separator - handled specially */
+                       '\n',/* hook to handle end of line in tables */ 
+                       0x1E,/* non-breaking hyphen */
+                       0x1F,/* soft hyphen */
+                       0x85,/* ellipsis */
+                       0x91,/* opening single quote */
+                       0x92,/* closing single quote */
+                       0x93,/* opening double quote */
+                       0x94,/* closing double quote */
+                       0x96,/* en dash */
+                       0x97,/* em dash */
+                       0x99,/* trade mark sign */
+                       0xA0,/* non-breaking space */
+                       0xA9,/* copyright sign */
+                       0xAE,/* registered sign */
+                       0xAB,/* opening << quote */
+                       0xBB,/* closing >> quote */
+ /* The rest is translated into itself unless TeX mode is selected */
+                       '%','$','_','{','}','\\', 
+                    }; /* entry i is replaced by ascii_specs[i] or TeX_specs[i]; no NUL terminator */
+                         
+/* Plain-text and TeX replacements; entry i of each table replaces specs[i]. */
+char *ascii_specs[]={"\t","\n","-","","...","`","'","``","''","-","-","tm",
+ " ","(c)","(R)","\"","\"","%","$","_","{","}","\\"};
+char *TeX_specs[]={"\t&","\\\\\n","-","\\-","\\dots{}","`","'","``","''",
+ "--","---", /* 0x96 is the en dash, 0x97 the em dash */
+ "${}^{\\scriptscriptstyle\\mathrm{TM}}$","~","{\\copyright}",
+ "(R)", /* placeholder, to be replaced with a proper TeX command */
+ "<",">","\\%","\\$","\\_","$\\{$","$\\}$","$\\backslash$"};
+#ifndef LATIN1 /* recoding tables are compiled only when LATIN1 is not defined */
+#ifdef unix
+unsigned char table[256]={
+/* Windows cyrillic code page to KOI-8; the index is the input byte, the value the output byte */
+0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0D,0x0C,0x0D,0x0E,0x0F,
+0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x2D,0x20,
+0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
+0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
+0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
+0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
+0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
+0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
+0x80,0x81,0x82,0xAA,0x8F,0x90,0xA9,0x93,0x84,0x92,0x91,0x94,0x83,0x95,0x99,0x8B,
+0x98,0x60,0x27,0x22,0x22,0x9A,0x2D,0x2D,0x9E,0xA6,0x87,0xB0,0x8D,0x97,0x86,0xA2,
+0x20,0xA7,0xA5,0x88,0xA4,0x8E,0x96,0x85,0xB3,0xA1,0x9F,0x22,0xAB,0xAC,0xAD,0xAE,
+0xAF,0xB2,0xB1,'i',0xB5,0xB6,0xB7,0xB8,0xA3,0xB9,0xBA,0x22,0xBC,0xBD,0xBE,0x9B,
+0xE1,0xE2,0xF7,0xE7,0xE4,0xE5,0xF6,0xFA,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xF0,
+0xF2,0xF3,0xF4,0xF5,0xE6,0xE8,0xE3,0xFE,0xFB,0xFD,0xFF,0xF9,0xF8,0xFC,0xE0,0xF1,
+0xC1,0xC2,0xD7,0xC7,0xC4,0xC5,0xD6,0xDA,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xD0,
+0xD2,0xD3,0xD4,0xD5,0xC6,0xC8,0xC3,0xDE,0xDB,0xDD,0xDF,0xD9,0xD8,0xDC,0xC0,0xD1};
+#else /* !unix */
+unsigned char table[256]={ /* NOTE(review): rows 0xC0-0xFF look like a cp866 (DOS cyrillic) target - confirm */
+0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0D,0x0c,0x0d,0x0e,0x0f,
+0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x2D,0x20,
+0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
+0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
+0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
+0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
+0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
+0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
+0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
+0x90,0x60,0x27,0x22,0x22,0x95,0x2D,0x2D,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
+0x20,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0x22,0xac,0xad,0xae,0xaf,
+0xb0,0xb1,0xb2,0xb3,'i',0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0x22,0xbc,0xbd,0xbe,0xbf,
+0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
+0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
+0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
+0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef};
+#endif /* unix */
+#define recode_char(x) table[x] /* x indexes table directly, so it must lie in 0..255 */
+#else /* LATIN1 */
+#define recode_char(x) x
+#endif /* !LATIN1 */
+/* Return the replacement text for byte c: map[i] when c equals specs[i], else
+ * c recoded by recode_char() in a static buffer that the next call overwrites. */
+char *map_char(char **map,int c)
+{ static char buffer[2]="a";
+  unsigned char *ptr=memchr(specs,c,sizeof specs); /* specs[] has no NUL terminator */
+  if (ptr) return map[ptr-specs];
+  buffer[0]=recode_char(c);
+  return buffer;
+}
+/* Word-wrap the NUL-terminated paragraph in buf to TEXT_WIDTH columns on
+ * stdout. Every byte is first replaced by the string map_char() returns for
+ * it; a \007 directly after another \007 ends a table row. The last line is
+ * followed by an empty line. Relies on each replacement string being shorter
+ * than sizeof outstring - TEXT_WIDTH bytes. */
+void format(char *buf,char **map)
+{ char outstring[128]="";              /* line being assembled */
+  unsigned char *sp=(unsigned char *)buf;
+  int table=0;                         /* was the previous byte a \007? */
+  while (*sp)
+  { if (*sp==7&&table)
+    { printf("%s%s",outstring,map_char(map,'\n'));
+      outstring[0]=0;
+      table=0;sp++;
+      continue;
+    }
+    strcat(outstring,map_char(map,*sp));
+    /* Flush until the rest fits the margin again, so that the next strcat()
+     * cannot run past the end of outstring. Break at the last blank when
+     * there is one, otherwise cut the line hard at the margin. */
+    while (strlen(outstring)>TEXT_WIDTH)
+    { char *dp=strrchr(outstring,' ');
+      if (dp) { *(dp++)=0; printf("%s\n",outstring); }
+      else { dp=outstring+TEXT_WIDTH; printf("%.*s\n",TEXT_WIDTH,outstring); }
+      memmove(outstring,dp,strlen(dp)+1); /* regions overlap, so strcpy() is undefined */
+    }
+    table=*(sp++)==7;
+  }
+  if (outstring[0]==0) putc('\n',stdout);
+  else printf("%s\n\n",outstring);
+}
+/* Print the usage summary on standard output, then exit with status 2. */
+void help(void)
+{ fputs("catdoc - exctract text from MS-Word files and catenate it to stdout\n"
+        "Copyright (c) by Victor B. Wagner, 1996\n"
+        "Usage catdoc [-ast] files ...\n"
+        "\t-a - converts non-standard printable chars into readable form (default)\n"
+        "\t-t - converts them into TeX control sequences\n"
+        "\t-s - exits with code 1 if MSWordDoc signature not found before\n" "\t\tfirst printable paragraph\n\n"
+        "All options affects only files, specified AFTER them\n",stdout);
+  exit(2);
+}
+
+char buf[8192];
+/* Read f as runs of printable bytes and pass every run that ends in '\r' to
+ * format(). With search_sign set, exit(1) if such a paragraph is reached
+ * before a NUL-terminated "MSWordDoc" run has been seen. Once buf is full,
+ * further bytes overwrite its last slot: long paragraphs are truncated. */
+void do_file(FILE *f,char **map,int search_sign)
+{ int ok=!search_sign,bufptr,c;
+  const int last=(int)sizeof buf-1;      /* highest valid index into buf */
+  while(!feof(f))
+  { bufptr=-1;
+    do {
+      c=getc(f);
+      /* Special printable symbols: 7 - table separator, \r - paragraph end, 0x1E - short defis */
+      if ((c<=255&&c>=32)||c==7||c=='\t'||c=='\r'||c==0x1E)
+      { if (bufptr<last) bufptr++; buf[bufptr]=c; }
+      else if (c==0x0b)                  /* stored as \r, but does not end the paragraph */
+      { if (bufptr<last) bufptr++; buf[bufptr]='\r'; }
+      else
+      { if (!c)
+        { if (bufptr<last) bufptr++; buf[bufptr]=0;
+          if (!strcmp(buf,"MSWordDoc")) ok=1; }
+        if (c!=2) bufptr=-1;             /* \002 is Word's footnote mark */
+      }
+    } while (c!='\r'&&c!=EOF);
+    if (bufptr>0&&buf[bufptr]=='\r')
+    { if (!ok) exit(1); buf[bufptr]=0; format(buf,map); }
+  }
+}
+  
+/* Handle the arguments left to right: -s, -t and -a affect the files named after
+ * them, "-" reads standard input (once), the rest are file names. Errors exit with 2. */
+int main(int argc,char **argv)
+{ int search_sign=0;            /* exit with code 1 if MSWordDoc signature is not found? */
+  char **sequences=ascii_specs; /* strings that represent Word's special characters */
+  int i,stdin_processed=0;
+  if (argc<2) help();
+  for(i=1;i<argc;i++)
+  { if (!strcmp(argv[i],"-s")) search_sign=1;
+    else if (!strcmp(argv[i],"-t")) sequences=TeX_specs;
+    else if (!strcmp(argv[i],"-a")) sequences=ascii_specs;
+    else if (!strcmp(argv[i],"-"))
+    { if (stdin_processed)
+      { fprintf(stderr,"Cannot process standard input twice a row\n"); exit (2); }
+      do_file(stdin,sequences,search_sign);
+      stdin_processed=1;
+    }
+    else if (argv[i][0]=='-')
+    { fprintf(stderr,"Invalid option %s\n",argv[i]);
+      help();
+    }
+    else
+    { FILE *f=fopen(argv[i],"rb"); /* Word files are binary data */
+      if(!f) {fprintf(stderr,"Cannot open file %s\n",argv[i]);exit(2);}
+      do_file(f,sequences,search_sign);
+      fclose(f);                   /* release the stream before the next file */
+    }
+  }
+  return 0;
+}
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/htparsedoc b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/htparsedoc
new file mode 100755
index 00000000..9d47e85d
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/htparsedoc
@@ -0,0 +1,72 @@
+#!/bin/sh -
+
+#--
+# External parser for HTDIG that parses Word files so they can
+# be indexed.
+#--
+# Written by Richard W.M. Jones <rjones@imcl.com>. Distributed freely
+# under the terms of the GNU General Public License (GPL).
+# Modified by Andrew M. Bishop <amb@gedanken.demon.co.uk>
+#--
+
+#----------------------------------------------------------------------
+# Configurable stuff here:
+
+# The program that converts Word files into text. I use ``catdoc''
+# by Victor Wagner <vitus@agropc.msk.su>. You may wish to just use
+# ``strings''.
+CATDOC=/usr/local/bin/catdoc
+#CATDOC=strings
+
+# End of configurable stuff.
+#----------------------------------------------------------------------
+
+# Arguments are:
+#   $1 = input file
+#   $2 = content type (ignored)
+#   $3 = base URL
+#   $4 = HTDIG config file (ignored)
+# HTDIG expects us to print out:
+#   w WORD LOCATION HEADING    Word at location 0-1000 under heading
+#   u URL DESCRIPTION          URL with description
+#   t TITLE                    Title of document
+#   h HEAD                     Heading
+#   a ANCHOR                   Anchor (ie. like <a name="">)
+#   i IMAGE_URL                Image pointer
+
+#----------------------------------------------------------------------
+
+# Put each word on a line of its own: tr turns blanks (and the other bytes of
+# its set) into newlines, awk keeps only the lines that hold exactly one field.
+wordPerLine () {
+    tr '[ \010]' '\012' | awk 'NF == 1'
+}
+
+# Change every non-alphanumeric character into a space. Carriage returns are
+# included: catdoc emits them for line breaks and they must not stay in words.
+removeNonAlNum () {
+    tr -c 'a-zA-Z0-9' ' '
+}
+
+#----------------------------------------------------------------------
+
+# Parse input file to linear list of words. The list is kept in a private file
+# created by mktemp; a fixed name in /tmp could be pre-created by another user.
+wordlist=`mktemp "${TMPDIR:-/tmp}/htparsedoc.XXXXXX"` || exit 1
+trap 'rm -f "$wordlist"' 0
+trap 'exit 1' 1 2 15
+$CATDOC "$1" | removeNonAlNum | wordPerLine > "$wordlist"
+
+# Compute length of list.
+filelen=`wc -l < "$wordlist"`
+
+# We can't find the title or an excerpt in the document, so make them up.
+printf 't\t%s\n' "Binary Document $3"
+printf 'h\t%s\n' "No excerpt available"
+
+# Pass words to htdig, scaling each word's position into the range 0-1000.
+if [ ${filelen:-0} -gt 0 ]; then
+    awk -v total="$filelen" '{printf ("w\t%s\t%d\t-\t\n", $1, 1000*NR/total);}' \
+        < "$wordlist"
+fi
+# The temporary file is removed by the exit trap set above.