diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/contrib/urlindex/urlindex.pl')
-rwxr-xr-x | debian/htdig/htdig-3.2.0b6/contrib/urlindex/urlindex.pl | 285 |
1 files changed, 285 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/urlindex/urlindex.pl b/debian/htdig/htdig-3.2.0b6/contrib/urlindex/urlindex.pl new file mode 100755 index 00000000..436c5eef --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/urlindex/urlindex.pl @@ -0,0 +1,285 @@ +#!/usr/local/bin/perl + +## +## urlindex.pl (C) 1995 Andrew Scherpbier +## +## This program will build an index of all the URLs in the +## htdig document database. +## + +use GDBM_File; +require('SDSU_www.pl'); + +$dbfile = "/gopher/www/htdig/sdsu3d.docdb"; +$dbfile = "/tmp/db.docdb"; +$exclude = "rohan.sdsu.edu\\/home\\/"; + +tie(%docdb, GDBM_File, $dbfile, GDBM_READER, 0) || die "Unable to open $dbfile: $!"; + +print "Reading...\n"; + +## +## Read in all the relevant data. +## +while (($key, $value) = each %docdb) +{ + next if $key =~ /^nextDocID/; + %record = parse_ref_record($value); + next if $record{"STATE"} eq 1; + next if $key =~ /$exclude/; + + $title = $record{"TITLE"}; + + ## + ## Get rid of starting and trailing whitespace junk + ## + $title =~ s/^[ \t\n\r]*//; + $title =~ s/[ \t\n\r]*$//; + + ## + ## If the title starts with 'the', it will be taken out and added + ## to the end of the title. This means that a title like "The + ## Homepage of X" will become "Homepage of X, The" + ## + if ($title =~ /^the /i) + { + $title = substr($title, 4) . ", " . substr($title, 0, 3); + } + if ($title =~ /^SDSU /) + { + $title = substr($title, 5) . ", " . substr($title, 0, 4); + } + if ($title =~ /^San Diego State University /i) + { + $title = substr($title, 27) . ", " . substr($title, 0, 26); + } + $value = $title; + $value =~ tr/A-Z/a-z/; + $titles{$value} = "$title\001$key"; + push(@unsorted, $value); +} + +$current = " "; +open(M, ">index.html"); +print M "<html><head><title>Index of all documents at SDSU</title></head>\n"; +print M "<body>\n"; +print M &www_logo_2("Index of all documents at SDSU"); +print M "<p>This is a list of WWW documents that were found while indexing all\n"; +print M "the publicly available WWW servers at San Diego State University.\n"; +print M "The documents are indexed by their titles.\n"; +print M "</p><h2>\n"; + +$previous = ""; + +print "Writing...\n"; + +foreach $value (sort @unsorted) +{ + next if $value eq $previous; + $previous = $value; + next if !($value =~ /^[a-zA-Z]/); + + ($title, $url) = split('\001', $titles{$value}, 2); + + $first = substr($title, 0, 1); + if ($current =~ /$first/i) + { + print F "<li><a href=\"$url\">$title</a></li>\n"; + } + else + { + ## + ## New letter. Open a new file for it + ## + $current = $first; + $current =~ tr/a-z/A-Z/; + print F "</li></body></html>\n"; + close(F); + open(F, ">index$current.html"); + print F "<html><head><title>Index for $current</title></head>\n"; + print F "<body>\n"; + print F &www_logo_2("Index for $current"); + print F "<ul>\n"; + print F "<li><a href=\"$url\">$title</a></li>\n"; + + ## + ## Add a reference to the main index for this letter + ## + print M " <a href=\"index$current.html\">$current</a>\n"; + + print "Index of $current\n"; + } +} + +close(F); + +print M "</h2></body></html>\n"; +close(M); + + +sub parse_ref_record +{ + local($value) = @_; + local(%rec, $length, $count, $result); + + while (length($value) > 0) + { + $what = unpack("C", $value); + $value = substr($value, 1); + if ($what == 0) + { + # ID + $rec{"ID"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 1) + { + # TIME + $rec{"TIME"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 2) + { + # ACCESSED + $rec{"ACCESSED"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 3) + { + # STATE + $rec{"STATE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 4) + { + # SIZE + $rec{"SIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 5) + { + # LINKS + $rec{"LINKS"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 6) + { + # IMAGESIZE + $rec{"IMAGESIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 7) + { + # HOPCOUNT + $rec{"HOPCOUNT"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 8) + { + # URL + $length = unpack("i", $value); + $rec{"URL"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 9) + { + # HEAD + $length = unpack("i", $value); + $rec{"HEAD"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 10) + { + # TITLE + $length = unpack("i", $value); + $rec{"TITLE"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 11) + { + # DESCRIPTIONS + $count = unpack("i", $value); + $value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . ""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"DESCRIPTIONS"} = $result; + } + elsif ($what == 12) + { + # ANCHORS + $count = unpack("i", $value); + $value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . ""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"ANCHORS"} = $result; + } + elsif ($what == 13) + { + # EMAIL + $length = unpack("i", $value); + $rec{"EMAIL"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 14) + { + # NOTIFICATION + $length = unpack("i", $value); + $rec{"NOTIFICATION"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 15) + { + # SUBJECT + $length = unpack("i", $value); + $rec{"SUBJECT"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 16) + { + # STRING (ignore, but unpack) + $length = unpack("i", $value); + $rec{"STRING"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 17) + { + # METADSC + $length = unpack("i", $value); + $rec{"METADSC"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 18) + { + # BACKLINKS + $rec{"BACKLINKS"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 19) + { + # SIGNATURE + $rec{"SIG"} = unpack("i", $value); + $value = substr($value, 4); + } + } + print "title = $rec{'TITLE'}\n"; + return %rec; +} + + + + + + + |