summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/test/t_parsing
blob: d11a70688e192493d863d64f58c0eb65102954e0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#
# Part of the ht://Dig package   <http://www.htdig.org/>
# Copyright (c) 1999-2004 The ht://Dig Group
# For copyright details, see the file COPYING in your distribution
# or the GNU Library General Public License (LGPL) version 2 or later
# <http://www.gnu.org/copyleft/lgpl.html>
#
# $Id: t_parsing,v 1.4 2004/05/28 13:15:30 lha Exp $
#


# Tests (or should eventually test) the following config attributes:
#	description_meta_tag_names
#	ignore_alt_text
#	max_doc_size
#	max_keywords
#	max_meta_description_length
#	max_description_length
#	max_descriptions
#	max_head_length
#	noindex_end
#	noindex_start
#	external_parsers
#	external_protocols
#	use_meta_description


# Source the shared test helpers with the --start-apache action selected.
# NOTE(review): $testdir, $htdig, $htpurge, $htsearch, fail and set_attr are
# used by this script but not defined in it; presumably ./test_functions
# provides them and starts the test web server -- confirm there.
test_functions_action=--start-apache
. ./test_functions

# Working copy of the configuration used by every dig, purge and search below.
config=$testdir/conf/htdig.conf.tmp
# Per-process scratch file ($$ is the shell's PID) that receives htsearch
# output inside try().
tmp=/tmp/t_htsearch$$

# set up config file with chosen non-default values
# (start from a fresh copy of the stock test configuration)
cp $testdir/conf/htdig.conf $config

# try COMMENT QUERY PATTERN...
#
# Runs $htsearch with configuration file $config on QUERY, saving its
# standard output in $tmp, then checks that output against each PATTERN
# (a grep basic regular expression) in turn.  For every PATTERN that does
# not match:
#   - the search is re-run with -v (standard output discarded),
#   - the unmatched pattern is echoed,
#   - fail is called with the command line and COMMENT.
# fail is not defined in this file (presumably supplied by ./test_functions).
#
# Reads the globals $htsearch, $config and $tmp.
# Sets the globals $comment and $query ("local" is avoided because the
# script may be run by a plain POSIX sh).
try() {
    comment="$1"
    shift
    query="$1"
    shift
    # $htsearch stays unquoted so a value holding a command plus arguments
    # still word-splits as before; the file names are quoted so that paths
    # containing whitespace work (also as a redirection target).
    $htsearch -c "$config" "$query" > "$tmp"
    for pattern
    do
	# -e keeps a pattern that starts with '-' from being parsed as an
	# option.  Output goes to /dev/null rather than using grep -q, as
	# the original did.
	if grep -e "$pattern" "$tmp" > /dev/null
	then :
	else
	    $htsearch -v -c "$config" "$query" > /dev/null
	    echo "Output doesn't match \"$pattern\""
	    fail "$htsearch -c $config '$query' >> $tmp --
		  $comment"
	fi
    done
}


# Tests (or should eventually test) the following config attributes:
#	description_meta_tag_names
#	ignore_alt_text
#	max_doc_size
#	max_keywords
#	max_meta_description_length
#	max_description_length		(May put in t_templates)
#	max_descriptions		(May put in t_templates)
#	max_head_length
#	noindex_end
#	noindex_start
#	external_parsers		(TODO)
#	external_protocols
#	use_meta_description

# First pass: dig and purge with the unmodified copy of the configuration,
# then check the search results obtained before any set_attr call.
# Any extra arguments given to this script are passed through to htdig.
$htdig "$@" -t -i -c $config	|| fail "Couldn't do first dig"
$htpurge -c $config		|| fail "Couldn't do first purge"

# 'earth' must give exactly one hit, in site3.html.
try "Search for alt text 'earth'" \
    "words=earth" \
    '1 matches' 'site3.html'

# The quoted phrase (%22 is an encoded double quote) must be found in
# site4.html.
try "'claims and collections', unlimited doc size" \
    "words=%22claims+and+collections%22" \
    '1 matches' 'site4.html'

try "Search for keyword 'martial', default max_keywords" \
    "words=martial" \
    '1 matches' 'site2.html'

# The label names the word that is actually queried ('technical'), so a
# failure report points at the right search.
try "Search for 'technical', default noindex_start/end" \
    "words=technical" \
    '1 matches' 'site%201.html'

# With use_meta_description enabled (no re-dig is done first), the result
# for script.html must contain text matching 'call handling.*signalling'.
set_attr use_meta_description true
try "Search for 'call handling' with default max_meta_description_length" \
    "words=%22call+handling%22" \
    '1 matches' 'script.html' 'call handling.*signalling'

# Second pass: set non-default values for the attributes under test, then
# dig and purge again before repeating the searches.
set_attr ignore_alt_text true
set_attr max_doc_size 15112
set_attr max_keywords 5
set_attr noindex_start "'Software Distribution'"
set_attr noindex_end "'Contact Information'"
set_attr max_meta_description_length 80
set_attr description_meta_tag_names "description generator"
set_attr max_head_length 30

$htdig "$@" -t -i -c $config	|| fail "Couldn't do second dig"
$htpurge -c $config		|| fail "Couldn't do second purge"

# With ignore_alt_text=true the word 'earth' must no longer be found.
try "Search for alt text 'earth' with ignore_alt_text=true" \
    "words=earth" \
    'No matches'

# A max_doc_size of 15112 must still leave the phrase findable in site4.html.
try "'claims and collections', max_doc_size 15112" \
    "words=%22claims+and+collections%22" \
    '1 matches' 'site4.html'

# (Martial is 6th keyword listed in site 2, but "Fu" is too short and omitted.)
try "Search for keyword 'martial', max_keywords = 5" \
    "words=martial" \
    'No matches'

# Only occurrence of "technical" is between noindex_start and _end in site 1
try "Search for 'technical', noindex_start=Software Distribution, noindex_end=Contact Information" \
    "words=technical" \
    'No matches'

# "visitor" occurs after noindex_end, so it must still be indexed
try "Search for 'visitor', noindex_start=Software Distribution, noindex_end=Contact Information" \
    "words=visitor" \
    '2 matches' 'site%201.html' 'site3.html'

# Displaying meta description instead of excerpt, check it is truncated
# (the output is expected to contain 'means of<br>')
try "Search for 'call handling' with max_meta_description_length=80" \
    "words=%22call+handling%22" \
    '1 matches' 'script.html' 'means of<br>'

# Check <meta name="generator"...> counts as a description
try "Search for 'category', description_meta_tag_names includes 'generator'" \
    "words=category" \
    '1 matches' 'site3.html' 'FrontPage'

# Check that only specified number of bytes of header is stored.
# Header size is rounded up to contain the whole of the last word.
try "Search for 'also', max_head_length=30" \
    "words=also" \
    '4 matches' 'bad_local.htm' 'site2.html' 'script.html' 'site4.html' \
    'WHERE.*Copyright<br>'

# Third pass: adjust the limits again and re-index.
#   max_doc_size 15042  - the phrase search below must now find nothing
#   max_keywords 6      - 'martial' must be found again
#   noindex_start/end   - same markers as the previous pass, in lower case
set_attr max_doc_size 15042
set_attr max_keywords 6
set_attr noindex_start "'software distribution'"
set_attr noindex_end "'contact information'"

$htdig "$@" -t -i -c $config	|| fail "Couldn't do third dig"
$htpurge -c $config		|| fail "Couldn't do third purge"

try "Search for keyword 'martial', max_keywords = 6" \
    "words=martial" \
    '1 matches' 'site2.html'

try "'claims and collections', max_doc_size 15042" \
    "words=%22claims+and+collections%22" \
    'No matches'

# Check noindex_start/end are case-insensitive
try "Search for 'technical', noindex_start=software distribution, noindex_end=contact information" \
    "words=technical" \
    'No matches'

# external_protocols check: write a small handler script into the current
# directory, register it for the "echo:" scheme, and dig a URL using it.
PROTOCOL=my-protocol
# The generated handler prints an "s<TAB>200" line, a "t<TAB>text/html" line,
# an empty line, and then an HTML document wrapping its second argument.
# NOTE(review): presumably this is the response format htdig expects from an
# external protocol handler, and $2 carries the part of the URL containing
# "foo" -- confirm against the external_protocols documentation.
echo '#!/bin/sh
      echo "s	200"
      echo "t	text/html"
      echo
      echo "<html>$2</html>"' > $PROTOCOL
chmod 755 $PROTOCOL
# The handler is registered by absolute path ($PWD) for the "echo:" scheme.
set_attr external_protocols "echo: $PWD/$PROTOCOL"
set_attr start_url "echo:foo.html"
# NOTE(review): unlike the earlier passes, no htpurge is run after this dig.
$htdig "$@" -t -i -c $config	|| fail "Couldn't do fourth dig"
# Searching for 'foo' must return exactly one match, the echo:foo.html URL.
try "trying external protocol  echo" \
    "words=foo" \
    "1 matches" "echo:foo.html"


# Source the helpers again, this time with the --stop-apache action selected
# (presumably shuts down the server started at the top of the script).
test_functions_action=--stop-apache
. ./test_functions

# Remove the htsearch output file and the generated protocol handler.
# NOTE(review): this cleanup only runs if the script gets this far; if fail
# terminates the script, both files are presumably left behind -- confirm.
rm -f $tmp $PROTOCOL

exit 0