diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htdig/HTML.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htdig/HTML.cc | 1002 |
1 files changed, 1002 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc b/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc new file mode 100644 index 00000000..56e1d00f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htdig/HTML.cc @@ -0,0 +1,1002 @@ +// +// HTML.cc +// +// HTML: Class to parse HTML documents and return useful information +// to the Retriever +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HTML.cc,v 1.76 2004/06/09 17:35:34 grdetil Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "htdig.h" +#include "HTML.h" +#include "HtSGMLCodec.h" +#include "HtConfiguration.h" +#include "StringMatch.h" +#include "StringList.h" +#include "QuotedStringList.h" +#include "URL.h" +#include "WordType.h" + +#include <ctype.h> + +#include "defaults.h" + +// Flags for noindex & nofollow, indicating who turned indexing off/on... +#define TAGnoindex 0x0001 +#define TAGstyle 0x0002 +#define TAGscript 0x0004 +#define TAGmeta_htdig_noindex 0x0008 +#define TAGmeta_robots 0x0010 + +static StringMatch tags; +static StringMatch nobreaktags; +static StringMatch spacebeforetags; +static StringMatch spaceaftertags; +static StringMatch metadatetags; +static StringMatch descriptionMatch; +static StringMatch keywordsMatch; +//static int keywordsCount; +//static int max_keywords; + + +//***************************************************************************** +// ADDSPACE() macro, to insert space where needed in various strings +// Reduces all multiple whitespace to a single space + +#define ADDSPACE(in_space) \ + if (!in_space) \ + { \ + if (in_title && !noindex) \ + { \ + title << ' '; \ + } \ + if (in_ref && description.length() < max_description_length) \ + { \ + description << ' '; \ + } \ + if (head.length() < max_head_length && !noindex && !in_title) \ + { \ + head << ' '; \ + } \ + in_space = 1; \ + } + + +//***************************************************************************** +// HTML::HTML() +// +HTML::HTML() : + skip_start (HtConfiguration::config()->Find("noindex_start")," \t"), + skip_end (HtConfiguration::config()->Find("noindex_end"), " \t") +{ + HtConfiguration *config= HtConfiguration::config(); + // + // Initialize the patterns that we will try to match. + // The tags Match object is used to match tag commands while + // + tags.IgnoreCase(); + tags.Pattern("title|/title|a|/a|h1|h2|h3|h4|h5|h6|/h1|/h2|/h3|/h4|/h5|/h6|noindex|/noindex|img|li|meta|frame|area|base|embed|object|link|style|/style|script|/script"); + + // These tags don't cause a word break. They may also be in "tags" above, + // except for the "a" tag, which must be handled as a special case. + // Note that <sup> & <sub> should cause a word break. + nobreaktags.IgnoreCase(); + nobreaktags.Pattern("font|/font|em|/em|strong|/strong|i|/i|b|/b|u|/u|tt|/tt|abbr|/abbr|code|/code|q|/q|samp|/samp|kbd|/kbd|var|/var|dfn|/dfn|cite|/cite|blink|/blink|big|/big|small|/small|s|/s"); + + // These tags, which may also be in "tags" above, cause word breaks and + // therefore cause space to be inserted before (or after) do_tag() is done. + spacebeforetags.IgnoreCase(); + spacebeforetags.Pattern("title|h1|h2|h3|h4|h5|h6|address|blockquote|noindex|img|li|th|td|dt|dd|p|br|hr|center|spacer"); + spaceaftertags.IgnoreCase(); + spaceaftertags.Pattern("/title|/h1|/h2|/h3|/h4|/h5|/h6|/address|/blockquote"); + + // These are the name values of meta tags that carry date information. + metadatetags.IgnoreCase(); + metadatetags.Pattern("date|dc.date|dc.date.created|dc.date.modified"); + + // These are the name values of meta tags that carry descriptions. + StringList descrNames(config->Find("description_meta_tag_names"), " \t"); + descriptionMatch.IgnoreCase(); + descriptionMatch.Pattern(descrNames.Join('|')); + + // These are the name values of meta tags that carry keywords. + StringList keywordNames(config->Find("keywords_meta_tag_names"), " \t"); + keywordsMatch.IgnoreCase(); + keywordsMatch.Pattern(keywordNames.Join('|')); +// (now in Parser) +// max_keywords = config->Value("max_keywords", -1); +// if (max_keywords < 0) +// max_keywords = (int) ((unsigned int) ~1 >> 1); + + // skip_start/end mark sections of text to be ignored by ht://Dig + // Make sure there are equal numbers of each, and warn of deprecated + // syntax. + if (skip_start.Count() > 1 || skip_end.Count() > 1) + { + if (skip_start.Count() != 0 && skip_end.Count() != 0) + { + // check for old-style start/end which allowed unquoted spaces + // (Check noindex_start/end for exactly one "<" or/followed-by + // exactly one ">", and no leading quotes.) + // Can someone think of a better (or simpler) check?? + String noindex_end (config->Find ("noindex_end")); + char *first_left = strchr (noindex_end.get(), '<'); + char *secnd_left = first_left ? strchr(first_left+1,'<') : (char*)0; + char *first_right= strchr (noindex_end.get(), '>'); + char *secnd_right= first_right? strchr(first_right+1,'>'): (char*)0; + String noindex_start (config->Find ("noindex_start")); + char *first_lft = strchr (noindex_start.get(), '<'); + char *secnd_lft = first_left ? strchr (first_lft +1,'<') : (char*)0; + char *first_rght= strchr (noindex_start.get(), '>'); + char *secnd_rght= first_right? strchr (first_rght+1,'>') : (char*)0; + + if (((first_right && !secnd_right && first_right < first_left) || + (first_left && !secnd_left && !first_right) || + (first_rght && !secnd_rght && first_rght < first_lft) || + (first_lft && !secnd_lft && !first_rght)) && + noindex_end[0] != '\"' && noindex_start[0] != '\"') + { + cout << "\nWarning: To allow multiple noindex_start/end patterns, patterns containing\nspaces should now be in quotation marks. (If the entries are indended to be\nmultiple patterns, this warning can be suppressed by placing the first pattern\nin quotes.)\n\n"; + // Should we treat the patterns as if they had been quoted + // (as we assume was intended)? + } + } + } + + // check each start has an end + if (skip_start.Count() < skip_end.Count()) + { + cout << "Warning: " << skip_end.Count() + << " noindex_end patterns, but only " << skip_start.Count() + << " noindex_start patterns.\n"; + } else + { + while (skip_start.Count () > skip_end.Count()) + { + int missing = skip_end.Count() - 1; + skip_end.Add ((missing >= 0) ? skip_end [missing] + : "<!--/htdig_noindex-->"); + cout << "Warning: Copying " << skip_end [missing+1] + << " as noindex_end match for " << skip_start [missing+1] + << endl; + } + } + + word = 0; + href = 0; + title = 0; + description = 0; + head = 0; + meta_dsc = 0; + tag = 0; + in_title = 0; + in_ref = 0; + in_heading = 0; + base = 0; + noindex = 0; + nofollow = 0; +// minimumWordLength = config->Value("minimum_word_length", 3); +} + + +//***************************************************************************** +// HTML::~HTML() +// +HTML::~HTML() +{ +} + + +//***************************************************************************** +// void HTML::parse(Retriever &retriever, URL &baseURL) +// Parse the HTML document using the Retriever object for all the callbacks. +// The HTML document contents are contained in the contents String. +// +void +HTML::parse(Retriever &retriever, URL &baseURL) +{ + if (contents == 0 || contents->length() == 0) + return; + + base = &baseURL; + + // + // We have some variables which will contain the various items we + // are looking for + // + int wordindex = 1; + int in_space; + int in_punct; + String scratch, textified; + unsigned char *q, *start; + unsigned char *position = (unsigned char *) contents->get(); + unsigned char *text = (unsigned char *)new char[contents->length()+1]; + unsigned char *ptext = text; + + keywordsCount = 0; + title = 0; + head = 0; + meta_dsc = 0; + noindex = 0; + nofollow = 0; + in_heading = 0; + in_title = 0; + in_ref = 0; + in_space = 0; + in_punct = 0; + + while (*position) + { + + // + // Filter out section marked to be ignored for indexing. + // This can contain any HTML. + // On finding a noindex_start, skip to first occurrence of matching + // noindex_end. Any noindex_start within will be ignored. + // + int i; + for (i = 0; i < skip_start.Count(); i++) + { + if (mystrncasecmp((char *)position, skip_start[i], + ((String*)skip_start.Nth(i))->length()) == 0) + break; // break from this loop for "continue" below... + } + if (i < skip_start.Count()) // found a match; + { + q = (unsigned char*)mystrcasestr((char *)position, skip_end[i]); + if (!q) + *position = '\0'; // Rest of document will be skipped... + else + position = q + ((String*)skip_end.Nth(i))->length(); + continue; + } + // end of noindex_start/end code + + + if (strncmp((char *)position, "<!", 2) == 0) + { + // + // Possible comment declaration (but could be DTD declaration!) + // A comment can contain other '<' and '>': + // we have to ignore complete comment declarations + // but of course also DTD declarations. + // + position += 2; // Get past declaration start + if (strncmp((char *)position, "--", 2) == 0) + { + // Found start of comment - now find the end + position += 2; + do + { + q = (unsigned char*)strstr((char *)position, "--"); + if (!q) + { + *position = '\0'; + break; // Rest of document seems to be a comment... + } + else + { + position = q + 2; + // Skip extra dashes after a badly formed comment + while (*position == '-') + position++; + // Skip whitespace after an individual comment + while (isspace(*position)) + position++; + } + // if comment declaration hasn't ended, skip another comment + } + while (*position && *position != '>'); + if (*position == '>') + { + position++; // End of comment declaration + } + } + else + { + // Not a comment declaration after all + // but possibly DTD: get to the end + q = (unsigned char*)strchr((char *)position, '>'); + if (q) + { + position = q + 1; + // End of (whatever) declaration + } + else + { + *position = '\0'; // Rest of document is DTD? + } + } + continue; + } + + if (*position == '<') + { + // + // Start of a tag. Since tags cannot be nested, we can simply + // search for the closing '>' + // + q = (unsigned char*)strchr((char *)position, '>'); + if (q) + { // copy tag + while (position <= q) + *ptext++ = *position++; + } + else + { // copy rest of text, as tag does not end + while (*position) + *ptext++ = *position++; + } + } + else if (*position == '&') + { + q = (unsigned char*)strchr((char *)position, ';'); + if (q && q <= position+10) + { // got ending, looks like valid SGML entity + scratch = 0; + scratch.append((char*)position, q+1 - position); + textified = HtSGMLCodec::instance()->encode(scratch); + if (textified[0] != '&' || textified.length() == 1) + { // it was decoded, copy it + position = (unsigned char *)textified.get(); + while (*position) + { + if (*position == '<') + { // got a decoded <, make a fake tag for it + // to avoid confusing it with real tag start + *ptext++ = '<'; + *ptext++ = '~'; + *ptext++ = '>'; + position++; + } + else + *ptext++ = *position++; + } + position = q+1; + } + else // it wasn't decoded, copy '&', and rest will follow + *ptext++ = *position++; + } + else // not SGML entity, copy bare '&' + *ptext++ = *position++; + } + else + { + *ptext++ = *position++; + } + } + *ptext++ = '\0'; + + position = text; + start = position; + + while (*position) + { + if (*position == '<' && (position[1] != '~' || position[2] != '>')) + { + // + // Start of a tag. Since tags cannot be nested, we can simply + // search for the closing '>' + // + q = (unsigned char*)strchr((char *)position, '>'); + if (!q) + break; // Syntax error in the doc. Tag never ends. + position++; + if (noindex & TAGscript) + { // Special handling in case '<' is part of JavaScript code + while (isspace(*position)) + position++; + if (mystrncasecmp((char *)position, "/script", 7) != 0) + continue; + } + tag = 0; + tag.append((char*)position, q - position); + while (isspace(*position)) + position++; + if (!in_space && spacebeforetags.CompareWord((char *)position) + || !in_space && !in_punct && *position != '/') + { + // These opening tags cause a space to be inserted + // before anything they insert. + // Tags processed here (i.e. not in nobreaktags), like <a ...> + // tag, are a special case: they don't actually add space in + // formatted text, but because in our processing it causes + // a word break, we avoid word concatenation in "head" string. + ADDSPACE(in_space); + in_punct = 0; + } + do_tag(retriever, tag); + if (!in_space && spaceaftertags.CompareWord((char *)position)) + { + // These closing tags cause a space to be inserted + // after anything they insert. + ADDSPACE(in_space); + in_punct = 0; + } + position = q+1; + } + else if (*position > 0 && HtIsStrictWordChar(*position)) + { + // + // Start of a word. Try to find the whole thing + // + word = 0; + in_space = 0; + in_punct = 0; + while (*position && HtIsWordChar(*position)) + { + word << (char)*position; + // handle case where '<' is in extra_word_characters... + if (strncmp((char *)position, "<~>", 3) == 0) + position += 2; // skip over fake tag for decoded '<' + position++; + if (*position == '<') + { + q = position+1; + while (isspace(*q)) + q++; + // Does this tag cause a word break? + if (nobreaktags.CompareWord((char *)q)) + { + // These tags just change character formatting and + // don't break words. + q = (unsigned char*)strchr((char *)position, '>'); + if (q) + { + position++; + tag = 0; + tag.append((char*)position, q - position); + do_tag(retriever, tag); + position = q+1; + } + } + } + } + + if (in_title && !noindex) + { + title << word; + } + + if (in_ref) + { + if (description.length() < max_description_length) + { + description << word; + } + else + { + description << " ..."; + if (!nofollow) + retriever.got_href(*href, (char*)description); + in_ref = 0; + description = 0; + } + } + + if (head.length() < max_head_length && !noindex && !in_title) + { + // + // Capitalize H1 and H2 blocks + // + if (in_heading > 1 && in_heading < 4) + { + word.uppercase(); + } + + // + // Append the word to the head (excerpt) + // + head << word; + } + + if (word.length() >= (int)minimum_word_length && !noindex) + { + retriever.got_word((char*)word, wordindex++, in_heading); + } + } + else + { + // + // Characters that are not part of a word + // + if (isspace(*position)) + { + ADDSPACE(in_space); + in_punct = 0; + } + else + { + // + // Not whitespace + // + if (head.length() < max_head_length && !noindex && !in_title) + { + // We don't want to add random chars to the + // excerpt if we're in the title. + head << *position; + } + if (in_ref && description.length() < max_description_length) + { + description << *position; + } + if (in_title && !noindex) + { + title << *position; + } + in_space = 0; + in_punct = 1; + // handle normal case where decoded '<' is punctuation... + if (strncmp((char *)position, "<~>", 3) == 0) + position += 2; // skip over fake tag for decoded '<' + } + position++; + } + } + retriever.got_head((char*)head); + + delete [] text; +} + + +//***************************************************************************** +// void HTML::do_tag(Retriever &retriever, String &tag) +// +void +HTML::do_tag(Retriever &retriever, String &tag) +{ + HtConfiguration* config= HtConfiguration::config(); + int wordindex = 1; + char *position = tag.get(); + int which, length; + static int ignore_alt_text = config->Boolean("ignore_alt_text", 0); + + while (isspace(*position)) + position++; + + which = -1; + if (tags.CompareWord(position, which, length) < 0) + return; // Nothing matched. + + // Use the configuration code to match attributes as key-value pairs + HtConfiguration attrs; + attrs.NameValueSeparators("="); + attrs.Add(position); + + if (debug > 3) + cout << "Tag: <" << tag << ">, matched " << which << endl; + + switch (which) + { + case 0: // "title" + if (title.length()) + { + if (debug) + cout << "More than one <title> tag in document!" + << " (possible search engine spamming)" << endl; + break; + } + in_title = 1; + in_heading = 1; + break; + + case 1: // "/title" + if (!in_title) + break; + in_title = 0; + in_heading = 0; + retriever.got_title((char*)title); + break; + + case 2: // "a" + { + if (!attrs["href"].empty()) + { + // + // a href seen + // + if (in_ref) + { + if (debug > 1) + cout << "Terminating previous <a href=...> tag," + << " which didn't have a closing </a> tag." + << endl; + if (!nofollow) + retriever.got_href(*href, (char*)description); + in_ref = 0; + } + if (href) + delete href; + href = new URL(transSGML(attrs["href"]), *base); + in_ref = 1; + description = 0; + break; + } + + if (!attrs["title"].empty() && !attrs["href"].empty()) + { + // + // a title seen for href + // + retriever.got_href(*href, transSGML(attrs["title"])); + } + + if (!attrs["name"].empty()) + { + // + // a name seen + // + retriever.got_anchor(transSGML(attrs["name"])); + } + break; + } + + case 3: // "/a" + if (in_ref) + { + if (!nofollow) + retriever.got_href(*href, (char*)description); + in_ref = 0; + } + break; + + case 4: // "h1" + in_heading = 2; + break; + + case 5: // "h2" + in_heading = 3; + break; + + case 6: // "h3" + in_heading = 4; + break; + + case 7: // "h4" + in_heading = 5; + break; + + case 8: // "h5" + in_heading = 6; + break; + + case 9: // "h6" + in_heading = 7; + break; + + case 10: // "/h1" + case 11: // "/h2" + case 12: // "/h3" + case 13: // "/h4" + case 14: // "/h5" + case 15: // "/h6" + in_heading = 0; + break; + + case 16: // "noindex" + noindex |= TAGnoindex; + nofollow |= TAGnoindex; + if (!attrs["follow"].empty()) + nofollow &= ~TAGnoindex; + break; + + case 27: // "style" + noindex |= TAGstyle; + nofollow |= TAGstyle; + break; + + case 29: // "script" + noindex |= TAGscript; + nofollow |= TAGscript; + break; + + case 17: // "/noindex" + noindex &= ~TAGnoindex; + nofollow &= ~TAGnoindex; + break; + + case 28: // "/style" + noindex &= ~TAGstyle; + nofollow &= ~TAGstyle; + break; + + case 30: // "/script" + noindex &= ~TAGscript; + nofollow &= ~TAGscript; + break; + + case 19: // "li" + if (!noindex && !in_title && head.length() < max_head_length) + head << "* "; + break; + + case 20: // "meta" + { + // + // First test for old-style meta tags (these break any + // reasonable DTD...) + // + if (!attrs["htdig-noindex"].empty()) + { + retriever.got_noindex(); + noindex |= TAGmeta_htdig_noindex; + nofollow |= TAGmeta_htdig_noindex; + } + if (!attrs["htdig-index"].empty()) + { + noindex &= ~TAGmeta_htdig_noindex; + nofollow &= ~TAGmeta_htdig_noindex; + } + if (!attrs["htdig-email"].empty()) + retriever.got_meta_email(transSGML(attrs["htdig-email"])); + + if (!attrs["htdig-notification-date"].empty()) + retriever.got_meta_notification(transSGML(attrs["htdig-notification-date"])); + + if (!attrs["htdig-email-subject"].empty()) + retriever.got_meta_subject(transSGML(attrs["htdig-email-subject"])); + + if (!attrs["htdig-keywords"].empty() || !attrs["keywords"].empty()) + { + // + // Keywords are added as being at the very top of the + // document and have a weight factor of + // keywords-factor which is assigned to slot 9 in the + // factor table. + // + const String keywords = attrs["htdig-keywords"].empty() ? + attrs["htdig-keywords"] : + attrs["keywords"]; + if (!noindex) + { + String tmp = transSGML(keywords); + addKeywordString (retriever, tmp, wordindex); + } + } + + if (!attrs["http-equiv"].empty()) + { + + // <META HTTP-EQUIV=REFRESH case + if (mystrcasecmp(attrs["http-equiv"], "refresh") == 0 + && !attrs["content"].empty()) + { + String content = attrs["content"]; + char *q = (char*)mystrcasestr((char*)content, "url"); + if (q && *q) + { + q += 3; // skiping "URL" + while (*q && ((*q == '=') || isspace(*q))) q++; + char *qq = q; + while (*qq && (*qq != ';') && (*qq != '"') && + !isspace(*qq))qq++; + *qq = 0; + if (href) + delete href; + href = new URL(transSGML(q), *base); + // I don't know why anyone would do this, but hey... + if (!nofollow) + retriever.got_href(*href, ""); + } + } + } + + // + // Now check for <meta name=... content=...> tags that + // fly with any reasonable DTD out there + // + + if (!attrs["name"].empty() && !attrs["content"].empty()) + { + const String cache = attrs["name"]; + + // First of all, check for META description + + if (descriptionMatch.CompareWord(cache) + && !attrs["content"].empty()) + { + // + // We need to do two things. First grab the description + // and clean it up + // + meta_dsc = transSGML(attrs["content"]); + meta_dsc.replace('\n', ' '); + meta_dsc.replace('\r', ' '); + meta_dsc.replace('\t', ' '); + if (meta_dsc.length() > max_meta_description_length) + meta_dsc = meta_dsc.sub(0, max_meta_description_length).get(); + if (debug > 1) + cout << "META Description: " << attrs["content"] << endl; + retriever.got_meta_dsc((char*)meta_dsc); + + + // + // Now add the words to the word list + // Slot 10 is the current slot for this + // + if (!noindex) + { + String tmp = transSGML(attrs["content"]); + addString (retriever, tmp, wordindex, 10); + } + } + + if (keywordsMatch.CompareWord(cache) && !noindex) + { + String tmp = transSGML(attrs["content"]); + addKeywordString (retriever, tmp, wordindex); + } + else if (mystrcasecmp(cache, "author") == 0) + { + String author = transSGML(attrs["content"]); + retriever.got_author(author.get()); + if (!noindex) + addString (retriever, author, wordindex, 11); + } + else if (mystrcasecmp(cache, "htdig-email") == 0) + { + retriever.got_meta_email(transSGML(attrs["content"])); + } + else if (metadatetags.CompareWord(cache, which, length) && + (cache.get())[length] == '\0' && config->Boolean("use_doc_date",0)) + { + retriever.got_time(transSGML(attrs["content"])); + } + else if (mystrcasecmp(cache, "htdig-notification-date") == 0) + { + retriever.got_meta_notification(transSGML(attrs["content"])); + } + else if (mystrcasecmp(cache, "htdig-email-subject") == 0) + { + retriever.got_meta_subject(transSGML(attrs["content"])); + } + else if (mystrcasecmp(cache, "htdig-noindex") == 0) + { + retriever.got_noindex(); + noindex |= TAGmeta_htdig_noindex; + nofollow |= TAGmeta_htdig_noindex; + } + else if (mystrcasecmp(cache, "robots") == 0 + && !attrs["content"].empty()) + { + String content_cache = attrs["content"]; + content_cache.lowercase(); + if (content_cache.indexOf("noindex") != -1) + { + noindex |= TAGmeta_robots; + retriever.got_noindex(); + } + if (content_cache.indexOf("nofollow") != -1) + nofollow |= TAGmeta_robots; + if (content_cache.indexOf("none") != -1) + { + noindex |= TAGmeta_robots; + nofollow |= TAGmeta_robots; + retriever.got_noindex(); + } + } + } + else if (mystrcasecmp(attrs["name"], "htdig-noindex") == 0) + { + retriever.got_noindex(); + noindex |= TAGmeta_htdig_noindex; + nofollow |= TAGmeta_htdig_noindex; + } + break; + } + + case 21: // frame + case 24: // embed + { + if (!attrs["src"].empty()) + { + // + // src seen + // + if (!nofollow) + { + if (href) + delete href; + href = new URL(transSGML(attrs["src"]), *base); + // Frames have the same hopcount as the parent. + retriever.got_href(*href, transSGML(attrs["title"]), 0); + in_ref = 0; + } + } + break; + } + + case 25: // object + { + if (!attrs["data"].empty()) + { + // + // data seen + // + if (!nofollow) + { + if (href) + delete href; + href = new URL(transSGML(attrs["data"]), *base); + // Assume objects have the same hopcount as the parent. + retriever.got_href(*href, transSGML(attrs["title"]), 0); + in_ref = 0; + } + } + break; + } + + case 22: // area + case 26: // link + { + if (!attrs["href"].empty()) + { + // href seen + if (!nofollow) + { + if (href) + delete href; + href = new URL(transSGML(attrs["href"]), *base); + // area & link are like anchor tags -- one hopcount! + retriever.got_href(*href, transSGML(attrs["title"]), 1); + in_ref = 0; + } + } + break; + } + + case 23: // base + { + if (!attrs["href"].empty()) + { + URL tempBase(transSGML(attrs["href"])); + *base = tempBase; + } + break; + } + + case 18: // img + { + if (!ignore_alt_text && !attrs["alt"].empty()) + { + String tmp = transSGML(attrs["alt"]); + if (!noindex && in_title) + title << tmp << " "; + if (in_ref && description.length() < max_description_length) + description << tmp << " "; + if (!noindex && !in_title && head.length() < max_head_length) + head << tmp << " "; + if (!noindex) + addString (retriever, tmp, wordindex, 8); // slot for img_alt + } + if (!attrs["src"].empty()) + { + retriever.got_image(transSGML(attrs["src"])); + } + break; + } + + default: + return; // Nothing... + } +} + + +//***************************************************************************** +// const String HTML::transSGML(const String& str) +// +const String +HTML::transSGML(const String& str) +{ + return HtSGMLCodec::instance()->encode(str); +} |