diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/contrib/acroconv.pl')
-rwxr-xr-x | debian/htdig/htdig-3.2.0b6/contrib/acroconv.pl | 93 |
1 files changed, 93 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/acroconv.pl b/debian/htdig/htdig-3.2.0b6/contrib/acroconv.pl new file mode 100755 index 00000000..ad7d4d79 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/acroconv.pl @@ -0,0 +1,93 @@ +#!/usr/local/bin/perl +# +# Sample external converter for htdig 3.1.4 or later, to convert PDFs +# using Adobe Acrobat 3's acroread -toPostScript option on UNIX systems. +# (Use it in place of conv_doc.pl if you have acroread but not pdftotext.) +# Written by Gilles Detillieux. +# +# Usage: (in htdig.conf) +# +# external_parsers: application/pdf->text/html /usr/local/bin/acroconv.pl +# +# This is a pretty quick and dirty implementation, but it does seem to +# give functionality equivalent to the now defunct htdig/PDF.cc parser. +# I'm not a Perl expert by any stretch of the imagination, so the code +# could probably use a lot of optimization to make it work better. +# + +$watch = 0; +$bigspace = 0; +$putspace = 0; +$putbody = 1; + +system("ln $ARGV[0] $ARGV[0].pdf; acroread -toPostScript $ARGV[0].pdf"); +open(INP, "< $ARGV[0].ps") || die "Can't open $ARGV[0].ps\n"; + +print "<HTML>\n<head>\n"; +while (<INP>) { + if (/^%%Title: / && $putbody) { + s/^%%Title: \((.*)\).*\n/$1/; + s/\\222/'/g; + s/\\267/*/g; + s/\\336/fi/g; + s/\\([0-7]{3})/pack(C, oct($1))/eig; + s/\\([0-7]{2})/pack(C, oct($1))/eig; + s/\\([0-7])/pack(C, oct($1))/eig; + s/\\[nrtbf]/ /g; + s/\\(.)/$1/g; + s/&/\&\;/g; + s/</\<\;/g; + s/>/\>\;/g; + print "<title>$_</title>\n"; + print "</head>\n<body>\n"; + $putbody = 0; + } elsif (/^BT/) { + $watch = 1; + } elsif (/^ET/) { + $watch = 0; + if ($putspace) { + print "\n"; + $putspace = 0; + } + } elsif ($watch) { + if (/T[Jj]$/) { + s/\)[^(]*\(//g; + s/^[^(]*\((.*)\).*\n/$1/; + s/\\222/'/g; + s/\\267/*/g; + s/\\336/fi/g; + s/\\([0-7]{3})/pack(C, oct($1))/eig; + s/\\([0-7]{2})/pack(C, oct($1))/eig; + s/\\([0-7])/pack(C, oct($1))/eig; + s/\\[nrtbf]/ /g; + s/\\(.)/$1/g; + if ($bigspace) { + s/(.)/$1 /g; + } + s/&/\&\;/g; + s/</\<\;/g; + s/>/\>\;/g; + if ($putbody) { + print "</head>\n<body>\n"; + $putbody = 0; + } + print "$_"; + $putspace = 1; + } elsif (/T[Ddm*]$/ && $putspace) { + print "\n"; + $putspace = 0; + } elsif (/Tc$/) { + $bigspace = 0; + if (/^([3-9]|[1-9][0-9]+)\..*Tc$/) { + $bigspace = 1; + } + } + } +} +if ($putbody) { + print "</head>\n<body>\n"; +} +print "</body>\n</HTML>\n"; + +close(INP); +system("rm -f $ARGV[0].pdf $ARGV[0].ps"); |