// // Display.cc // // Display: Takes results of search and fills in the HTML templates // // Part of the ht://Dig package // Copyright (c) 1995-2004 The ht://Dig Group // For copyright details, see the file COPYING in your distribution // or the GNU Library General Public License (LGPL) version 2 or later // // // $Id: Display.cc,v 1.122 2004/05/28 13:15:24 lha Exp $ // #ifdef HAVE_CONFIG_H #include "htconfig.h" #endif /* HAVE_CONFIG_H */ #include "htsearch.h" #include "Display.h" #include "ResultMatch.h" #include "WeightWord.h" #include "StringMatch.h" #include "QuotedStringList.h" #include "URL.h" #include "HtSGMLCodec.h" #include "HtURLCodec.h" #include "HtURLRewriter.h" #include "WordType.h" #include "Collection.h" #include "HtURLSeedScore.h" //#include "HtURLRewriter.h" #include "SplitMatches.h" #ifdef HAVE_STD #include #ifdef HAVE_NAMESPACES using namespace std; #endif #else #include #endif /* HAVE_STD */ #include #include // for abs #include #ifndef _MSC_VER /* _WIN32 */ #include #endif #include #include #include #if !defined(DBL_MAX) # if defined (MAXDOUBLE) # define DBL_MAX MAXDOUBLE # elif defined(HUGE_VAL) # define DBL_MAX HUGE_VAL # elif defined(MAXFLOAT) # define DBL_MAX MAXFLOAT # else # define DBL_MAX 1e37 # endif #endif //***************************************************************************** // Display::Display(Dictionary *collections) { HtConfiguration* config= HtConfiguration::config(); selected_collections = collections; limitTo = 0; excludeFrom = 0; // needExcerpt = 0; templateError = 0; maxStars = config->Value("max_stars"); maxScore = -DBL_MAX; minScore = DBL_MAX; setupImages(); setupTemplates(); if (!templates.createFromString(config->Find("template_map"))) { // Error in createFromString. // Let's try the default template_map config->Add("template_map", "Long builtin-long builtin-long Short builtin-short builtin-short"); if (!templates.createFromString(config->Find("template_map"))) { // Unrecoverable Error // (No idea why this would happen) templateError = 1; } } currentTemplate = templates.get(config->Find("template_name")); if (!currentTemplate) { // // Must have been some error. Resort to the builtin-long (slot 0) // currentTemplate = (Template *) templates.templates[0]; } if (!currentTemplate) { // // Another error!? Time to bail out... // templateError = 1; } // if (mystrcasestr(currentTemplate->getMatchTemplate(), "excerpt")) // needExcerpt = 1; } //***************************************************************************** Display::~Display() { // docDB.Close(); } //***************************************************************************** // void Display::display(int pageNumber) { HtConfiguration* config= HtConfiguration::config(); int good_sort = 0; good_sort = ResultMatch::setSortType(config->Find("sort")); if (!good_sort) { // Must temporarily stash the message in a String, since // displaySyntaxError will overwrite the static temp used in form. String s(form("No such sort method: `%s'", (const char*)config->Find("sort"))); displaySyntaxError(s); return; } List *matches = buildMatchList(); int currentMatch = 0; int numberDisplayed = 0; ResultMatch *match = 0; int number = 0; number = config->Value("matches_per_page"); if (number <= 0) number = 10; int startAt = (pageNumber - 1) * number; if (config->Boolean("logging")) { logSearch(pageNumber, matches); } displayHTTPheaders(); setVariables(pageNumber, matches); // // The first match is guaranteed to have the highest score of // all the matches. We use this to compute the number of stars // to display for all the other matches. // match = (ResultMatch *) (*matches)[0]; if (!match) { // // No matches. // delete matches; // if( config->Boolean("nph") ) cout << "HTTP/1.0 200 OK\r\n"; // cout << "Content-type: text/html\r\n\r\n"; displayNomatch(); return; } // maxScore = match->getScore(); // now done in buildMatchList() // if( config->Boolean("nph") ) cout << "HTTP/1.0 200 OK\r\n"; // cout << "Content-type: text/html\r\n\r\n"; String wrap_file = config->Find("search_results_wrapper"); String *wrapper = 0; char *header = 0, *footer = 0; if (wrap_file.length()) { wrapper = readFile(wrap_file.get()); if (wrapper && wrapper->length()) { char wrap_sepr[] = "HTSEARCH_RESULTS"; char *h = wrapper->get(); char *p = strstr(h, wrap_sepr); if (p) { if (p > h && p[-1] == '$') { footer = p + strlen(wrap_sepr); header = h; p[-1] = '\0'; } else if (p > h+1 && p[-2] == '$' && (p[-1] == '(' || p[-1] == '{') && (p[strlen(wrap_sepr)] == ')' || p[strlen(wrap_sepr)] == '}')) { footer = p + strlen(wrap_sepr) + 1; header = h; p[-2] = '\0'; } } } } if (header) expandVariables(header); else displayHeader(); // // Display the window of matches requested. // if (!currentTemplate->getStartTemplate().empty()) { expandVariables(currentTemplate->getStartTemplate()); } matches->Start_Get(); while ((match = (ResultMatch *)matches->Get_Next()) && numberDisplayed < number) { if (currentMatch >= startAt) { // DocumentRef *ref = docDB[match->getID()]; Collection *collection = match->getCollection(); DocumentRef *ref = collection->getDocumentRef(match->getID()); if (!ref || ref->DocState() != Reference_normal) continue; // The document isn't present or shouldn't be displayed ref->DocAnchor(match->getAnchor()); ref->DocScore(match->getScore()); displayMatch(match, ref, currentMatch+1); numberDisplayed++; delete ref; } currentMatch++; } if (!currentTemplate->getEndTemplate().empty()) { expandVariables(currentTemplate->getEndTemplate()); } if (footer) expandVariables(footer); else displayFooter(); if (wrapper) delete wrapper; delete matches; } //***************************************************************************** // Return true if the specified URL should be counted towards the results. int Display::includeURL(const String& url) { if (limitTo && limitTo->match(url, 1, 0) == 0) return 0; else { if (excludeFrom && excludeFrom->match(url, 0, 0) != 0) return 0; else return 1; } } //***************************************************************************** void Display::displayMatch(ResultMatch *match, DocumentRef *ref, int current) { HtConfiguration* config= HtConfiguration::config(); String *str = 0; char *coded_url = ref->DocURL(); String url = HtURLCodec::instance()->decode(coded_url); HtURLRewriter::instance()->replace(url); ref->DocURL(url.get()); // for star_patterns & template_patterns match vars.Add("URL", new String(url.get())); vars.Remove("ANCHOR"); // get rid of any previous setting int iA = ref->DocAnchor(); String *anchor = 0; int fanchor = 0; if (iA > 0) // if an anchor was found { List *anchors = ref->DocAnchors(); if (anchors->Count() >= iA) { anchor = new String(); fanchor = 1; *anchor << "#" << ((String*) (*anchors)[iA-1])->get(); vars.Add("ANCHOR", anchor); } } // // no condition for determining excerpt any more: // we need it anyway to see if an anchor is relevant // int first = -1; String urlanchor(url); if (anchor) urlanchor << anchor; vars.Add("EXCERPT", excerpt(match, ref, urlanchor, fanchor, first)); // // anchor only relevant if an excerpt was found, i.e., // the search expression matches the body of the document // instead of only META keywords. // if (first < 0) { vars.Remove("ANCHOR"); } vars.Add("METADESCRIPTION", new String(ref->DocMetaDsc())); vars.Add("SCORE", new String(form("%f", ref->DocScore()))); vars.Add("CURRENT", new String(form("%d", current))); char *title = ref->DocTitle(); if (!title || !*title) { if ( strcmp(config->Find("no_title_text"), "filename") == 0 ) { // use actual file name title = strrchr(url.get(), '/'); if (title) { title++; // Skip slash str = new String(form("[%s]", title)); decodeURL(*str); // convert %20 to space, etc } else // URL without '/' ?? str = new String("[No title]"); } else // use configure 'no title' text str = new String(config->Find("no_title_text")); } else str = new String(title); vars.Add("TITLE", str); vars.Add("STARSRIGHT", generateStars(ref, 1)); vars.Add("STARSLEFT", generateStars(ref, 0)); vars.Add("SIZE", new String(form("%d", ref->DocSize()))); vars.Add("SIZEK", new String(form("%d", (ref->DocSize() + 1023) / 1024))); if (maxScore != 0 && maxScore != minScore) { int percent = (int)((ref->DocScore() - minScore) * 100 / (maxScore - minScore)); if (percent <= 0) percent = 1; vars.Add("PERCENT", new String(form("%d", percent))); } else vars.Add("PERCENT", new String("100")); { str = new String(); char buffer[100]; time_t t = ref->DocTime(); if (t) { struct tm *tm = localtime(&t); String datefmt = config->Find("date_format"); const String locale = config->Find("locale"); if (datefmt.empty()) { if (config->Boolean("iso_8601")) datefmt = "%Y-%m-%d %H:%M:%S %Z"; else datefmt = "%x"; } if (!locale.empty()) { setlocale(LC_TIME,locale); } strftime(buffer, sizeof(buffer), (char*)datefmt, tm); *str << buffer; } vars.Add("MODIFIED", str); } vars.Add("HOPCOUNT", new String(form("%d", ref->DocHopCount()))); vars.Add("DOCID", new String(form("%d", ref->DocID()))); vars.Add("BACKLINKS", new String(form("%d", ref->DocBackLinks()))); { str = new String(); List *list = ref->Descriptions(); int n = list->Count(); for (int i = 0; i < n; i++) { *str << ((String*) (*list)[i])->get() << "
"; } vars.Add("DESCRIPTIONS", str); String *description = new String(); if (list->Count()) *description << ((String*) (*list)[0]); vars.Add("DESCRIPTION", description); } int index = 0; int length = 0; int status = -1; if (URLtemplate.hasPattern()) status = URLtemplate.FindFirst(ref->DocURL(), index, length); if (status >= 0 && index >= 0) displayParsedFile( ((String*) URLtemplateList[index])->get() ); else expandVariables(currentTemplate->getMatchTemplate()); } //***************************************************************************** void Display::setVariables(int pageNumber, List *matches) { HtConfiguration* config= HtConfiguration::config(); String tmp; int i; int nMatches = 0; if (matches) nMatches = matches->Count(); int matchesPerPage = config->Value("matches_per_page"); if (matchesPerPage <= 0) matchesPerPage = 10; int nPages = (nMatches + matchesPerPage - 1) / matchesPerPage; if (nPages > config->Value("maximum_pages", 10)) nPages = config->Value("maximum_pages", 10); if (nPages < 1) nPages = 1; // We always have at least one page... vars.Add("MATCHES_PER_PAGE", new String(config->Find("matches_per_page"))); vars.Add("MAX_STARS", new String(config->Find("max_stars"))); vars.Add("CONFIG", new String(config->Find("config"))); vars.Add("VERSION", new String(config->Find("version"))); vars.Add("RESTRICT", new String(config->Find("restrict"))); vars.Add("EXCLUDE", new String(config->Find("exclude"))); vars.Add("KEYWORDS", new String(config->Find("keywords"))); vars.Add("MATCHES", new String(form("%d", nMatches))); vars.Add("PLURAL_MATCHES", new String((nMatches == 1) ? (char *)"" : (const char *) config->Find("plural_suffix"))); vars.Add("PAGE", new String(form("%d", pageNumber))); vars.Add("PAGES", new String(form("%d", nPages))); vars.Add("FIRSTDISPLAYED", new String(form("%d", (pageNumber - 1) * matchesPerPage + 1))); if (nPages > 1) vars.Add("PAGEHEADER", new String(config->Find("page_list_header"))); else vars.Add("PAGEHEADER", new String(config->Find("no_page_list_header"))); i = pageNumber * matchesPerPage; if (i > nMatches) i = nMatches; vars.Add("LASTDISPLAYED", new String(form("%d", i))); if (config->Find("script_name").length() != 0) { vars.Add("CGI", new String(config->Find("script_name"))); } else { vars.Add("CGI", new String(getenv("SCRIPT_NAME"))); } vars.Add("STARTYEAR", new String(config->Find("startyear"))); vars.Add("STARTMONTH", new String(config->Find("startmonth"))); vars.Add("STARTDAY", new String(config->Find("startday"))); vars.Add("ENDYEAR", new String(config->Find("endyear"))); vars.Add("ENDMONTH", new String(config->Find("endmonth"))); vars.Add("ENDDAY", new String(config->Find("endday"))); String *str; char *format = input->get("format"); String *in; vars.Add("SELECTED_FORMAT", new String(format)); str = new String(); *str << "\n"; vars.Add("FORMAT", str); str = new String(); tmp = config->Find("match_method"); vars.Add("SELECTED_METHOD", new String(tmp)); QuotedStringList ml(config->Find("method_names"), " \t\r\n"); *str << "\n"; vars.Add("METHOD", str); ////////////////// Multiple database support ////////////////////// // Emit collection table. Ensure that previously selected collections // are "checked". // Collections are specified in the config file with the // "collection_names" attribute. An example of the corresponding snippet // in the config file is as follows: // // collection_names: htdig_docs htdig_bugs // // htdig_bugs and htdig_docs are the two collections (databases) and // their corresponding config files are: $CONFIG_DIR/htdig_bugs.conf and // $CONFIG_DIR/htdig_docs.conf respectively. // QuotedStringList clist(config->Find("collection_names"), " \t\r\n"); for (i =0; i < clist.Count(); i++) { String config_name = clist[i]; for (int j=0; j < collectionList.Count(); j++) { if (strcmp(config_name.get(), collectionList[j]) == 0) { str = new String(); *str << "checked"; String collection_id = "COLLECTION_"; collection_id << config_name; vars.Add(collection_id, str); break; } } } ////////////////// Multiple database support ////////////////////// str = new String(); QuotedStringList sl(config->Find("sort_names"), " \t\r\n"); const String st = config->Find("sort"); StringMatch datetime; datetime.IgnoreCase(); datetime.Pattern("date|time"); *str << "\n"; vars.Add("SORT", str); vars.Add("SELECTED_SORT", new String(st)); // Handle user-defined select lists. // Uses octuples containing these values: // // // e.g.: // METHOD_LIST method method_names 2 1 2 match_method "" // FORMAT_LIST format template_map 3 2 1 template_name "" // EXCLUDE_LIST exclude exclude_names 2 1 2 exclude "" // MATCH_LIST matchesperpage matches_per_page_list 1 1 1 // matches_per_page "Previous Amount" QuotedStringList builds(config->Find("build_select_lists"), " \t\r\n"); for (int b = 0; b <= builds.Count()-8; b += 8) { int ntuple = atoi(builds[b+3]); int ivalue = atoi(builds[b+4]); int ilabel = atoi(builds[b+5]); int nsel = 0; int mult = 0, asinput = 0; const char *cp; char sepc = '\001'; String currval; String pre, post; QuotedStringList nameopt(builds[b], ",", 1); QuotedStringList namelist(config->Find(builds[b+2]), " \t\r\n"); if (ntuple > 0 && ivalue > 0 && ivalue <= ntuple && ilabel > 0 && ilabel <= ntuple && namelist.Count() % ntuple == 0 && nameopt.Count() > 0) { if (strcmp(builds[b+1], "restrict") == 0 || strcmp(builds[b+1], "exclude") == 0) sepc = '|'; if (nameopt.Count() == 1) ; // default is single select else if (mystrcasecmp(nameopt[1], "multiple") == 0) mult = 1; else if (mystrcasecmp(nameopt[1], "radio") == 0) asinput = 1; else if (mystrcasecmp(nameopt[1], "checkbox") == 0) { mult = 1; asinput = 1; } if (nameopt.Count() > 2) pre = nameopt[2]; else pre = ""; if (nameopt.Count() > 3) post = nameopt[3]; else post = ""; str = new String(); if (!asinput) { *str << "get(builds[b+1]) << "\" checked>" << builds[b+7] << post << '\n'; else *str << pre << "get(builds[b+1]) << "\" checked>" << builds[b+7] << post << '\n'; } if (!asinput) *str << "\n"; vars.Add(nameopt[0], str); } } // // If a paged output is required, set the appropriate variables // if (nPages > 1) { if (pageNumber > 1) { str = new String("" << config->Find("prev_page_text") << ""; } else { str = new String(config->Find("no_prev_page_text")); } vars.Add("PREVPAGE", str); if (pageNumber < nPages) { str = new String("" << config->Find("next_page_text") << ""; } else { str = new String(config->Find("no_next_page_text")); } vars.Add("NEXTPAGE", str); str = new String(); char *p; QuotedStringList pnt(config->Find("page_number_text"), " \t\r\n"); QuotedStringList npnt(config->Find("no_page_number_text"), " \t\r\n"); QuotedStringList sep(config->Find("page_number_separator"), " \t\r\n"); if (nPages > config->Value("maximum_page_buttons", 10)) nPages = config->Value("maximum_page_buttons", 10); for (i = 1; i <= nPages; i++) { if (i == pageNumber) { p = npnt[i - 1]; if (!p) p = form("%d", i); *str << p; } else { p = pnt[i - 1]; if (!p) p = form("%d", i); *str << "" << p << ""; } if (i != nPages && sep.Count() > 0) *str << sep[(i-1)%sep.Count()]; else if (i != nPages && sep.Count() <= 0) *str << " "; } vars.Add("PAGELIST", str); } StringList form_vars(config->Find("allow_in_form"), " \t\r\n"); String* key; for (i= 0; i < form_vars.Count(); i++) { if(!config->Find(form_vars[i]).empty()) { key= new String(form_vars[i]); key->uppercase(); vars.Add(key->get(), new String(config->Find(form_vars[i]))); } } } //***************************************************************************** void Display::createURL(String &url, int pageNumber) { HtConfiguration* config= HtConfiguration::config(); String s; int i; #define encodeInput(name) (s = input->get(name), encodeURL(s), s.get()) if (!config->Find("script_name").empty()) { url << config->Find("script_name"); } else { url << getenv("SCRIPT_NAME"); } url << '?'; if (input->exists("restrict")) url << "restrict=" << encodeInput("restrict") << ';'; if (input->exists("exclude")) url << "exclude=" << encodeInput("exclude") << ';'; // Not needed: The next loop below handles this output //if (input->exists("config")) // url << "config=" << encodeInput("config") << ';'; // Put out all specified collections. If none selected, resort to // default behaviour. char *config_name = collectionList[0]; String config_encoded; if (config_name && config_name[0] == '\0') config_name = NULL; if (config_name) { for (i = 0; i < collectionList.Count(); i++) { config_name = collectionList[i]; config_encoded = config_name; encodeURL(config_encoded); url << "config=" << config_encoded << ';'; } } if (input->exists("method")) url << "method=" << encodeInput("method") << ';'; if (input->exists("format")) url << "format=" << encodeInput("format") << ';'; if (input->exists("sort")) url << "sort=" << encodeInput("sort") << ';'; if (input->exists("matchesperpage")) url << "matchesperpage=" << encodeInput("matchesperpage") << ';'; if (input->exists("keywords")) url << "keywords=" << encodeInput("keywords") << ';'; if (input->exists("words")) url << "words=" << encodeInput("words") << ';'; if (input->exists("startyear")) url << "startyear=" << encodeInput("startyear") << ';'; if (input->exists("startmonth")) url << "startmonth=" << encodeInput("startmonth") << ';'; if (input->exists("startday")) url << "startday=" << encodeInput("startday") << ';'; if (input->exists("endyear")) url << "endyear=" << encodeInput("endyear") << ';'; if (input->exists("endmonth")) url << "endmonth=" << encodeInput("endmonth") << ';'; if (input->exists("endday")) url << "endday=" << encodeInput("endday") << ';'; StringList form_vars(config->Find("allow_in_form"), " \t\r\n"); for (i= 0; i < form_vars.Count(); i++) { if (input->exists(form_vars[i])) { s = form_vars[i]; encodeURL(s); // shouldn't be needed, but just in case url << s << '='; url << encodeInput(form_vars[i]) << ';'; } } url << "page=" << pageNumber; } //***************************************************************************** void Display::displayHTTPheaders() { HtConfiguration* config= HtConfiguration::config(); String content_type = config->Find("search_results_contenttype"); if (config->Boolean("nph")) cout << "HTTP/1.0 200 OK\r\n"; if (content_type.length()) cout << "Content-type: " << content_type << "\r\n\r\n"; } //***************************************************************************** void Display::displayHeader() { HtConfiguration* config= HtConfiguration::config(); displayParsedFile(config->Find("search_results_header")); } //***************************************************************************** void Display::displayFooter() { HtConfiguration* config= HtConfiguration::config(); displayParsedFile(config->Find("search_results_footer")); } //***************************************************************************** void Display::displayNomatch() { HtConfiguration* config= HtConfiguration::config(); displayParsedFile(config->Find("nothing_found_file")); } //***************************************************************************** void Display::displaySyntaxError(const String& message) { HtConfiguration* config= HtConfiguration::config(); displayHTTPheaders(); setVariables(0, 0); vars.Add("SYNTAXERROR", new String(message)); displayParsedFile(config->Find("syntax_error_file")); } //***************************************************************************** void Display::displayParsedFile(const String& filename) { FILE *fl = fopen(filename, "r"); char buffer[1000]; while (fl && fgets(buffer, sizeof(buffer), fl)) { expandVariables(buffer); } if (fl) fclose(fl); else if (debug) cerr << "displayParsedFile: Can't open " << filename << endl; } //***************************************************************************** // If the result templates need to depend on the URL of the match, we need // an efficient way to determine which template file to use. To do this, we // will build a StringMatch object with all the URL patterns and also // a List parallel to that pattern that contains the actual template file // names to use for each URL. // void Display::setupTemplates() { HtConfiguration* config= HtConfiguration::config(); String templatePatterns = config->Find("template_patterns"); if (!templatePatterns.empty()) { // // The templatePatterns string will have pairs of values. The first // value of a pair will be a pattern, the second value will be a // result template file name. // char *token = strtok(templatePatterns, " \t\r\n"); String pattern; while (token) { // // First token is a pattern... // pattern << token << '|'; // // Second token is an URL // token = strtok(0, " \t\r\n"); URLtemplateList.Add(new String(token)); if (token) token = strtok(0, " \t\r\n"); } pattern.chop(1); URLtemplate.Pattern(pattern); } } //***************************************************************************** // If the star images need to depend on the URL of the match, we need // an efficient way to determine which image to use. To do this, we // will build a StringMatch object with all the URL patterns and also // a List parallel to that pattern that contains the actual images to // use for each URL. // void Display::setupImages() { HtConfiguration* config= HtConfiguration::config(); String starPatterns = config->Find("star_patterns"); if (!starPatterns.empty()) { // // The starPatterns string will have pairs of values. The first // value of a pair will be a pattern, the second value will be an // URL to an image. // char *token = strtok(starPatterns, " \t\r\n"); String pattern; while (token) { // // First token is a pattern... // pattern << token << '|'; // // Second token is an URL // token = strtok(0, " \t\r\n"); URLimageList.Add(new String(token)); if (token) token = strtok(0, " \t\r\n"); } pattern.chop(1); URLimage.Pattern(pattern); } } //***************************************************************************** String * Display::generateStars(DocumentRef *ref, int right) { int i; String *result = new String(); HtConfiguration* config= HtConfiguration::config(); if (!config->Boolean("use_star_image", 1)) return result; String image = config->Find("star_image"); const String blank = config->Find("star_blank"); double score; if (maxScore != 0 && maxScore != minScore) { score = (ref->DocScore() - minScore) / (maxScore - minScore); if(debug) cerr << "generateStars: doc, min, max " << ref->DocScore() << ", " << minScore << ", " << maxScore <DocScore(); score = 1; } int nStars = int(score * (maxStars - 1) + 0.5) + 1; vars.Add("NSTARS", new String(form("%.d", nStars))); if(debug) cerr << "generateStars: nStars " << nStars << " of " << maxStars <"; } } int match = 0; int length = 0; int status; if (URLimage.hasPattern()) status = URLimage.FindFirst(ref->DocURL(), match, length); else status = -1; if (status >= 0 && match >= 0) { image = ((String*) URLimageList[match])->get(); } for (i = 0; i < nStars; i++) { *result << "\"*\""; } if (!right) { for (i = 0; i < maxStars - nStars; i++) { *result << "\""; } } return result; } //***************************************************************************** String * Display::readFile(const String& filename) { FILE *fl; String *s = new String(); char line[1024]; fl = fopen(filename, "r"); while (fl && fgets(line, sizeof(line), fl)) { *s << line; } if (fl) fclose(fl); else if (debug) cerr << "readFile: Can't open " << filename << endl; return s; } //***************************************************************************** void Display::expandVariables(const String& str_arg) { const char* str = str_arg; enum { StStart, StLiteral, StVarStart, StVarClose, StVarPlain, StGotVar } state = StStart; String var = ""; while (str && *str) { switch (state) { case StStart: if (*str == '\\') state = StLiteral; else if (*str == '$') state = StVarStart; else cout << *str; break; case StLiteral: cout << *str; state = StStart; break; case StVarStart: if (*str == '%' || *str == '=') var << *str; // code for URL-encoded/decoded variable else if (*str == '&') { var << *str; // code for SGML-encoded variable if (mystrncasecmp("&", str, 5) == 0) str += 4; } else if (*str == '(' || *str == '{') state = StVarClose; else if (isalnum(*str) || *str == '_' || *str == '-') { var << *str; state = StVarPlain; } else state = StStart; break; case StVarClose: if (*str == ')' || *str == '}') state = StGotVar; else if (isalnum(*str) || *str == '_' || *str == '-') var << *str; else state = StStart; break; case StVarPlain: if (isalnum(*str) || *str == '_' || *str == '-') var << *str; else { state = StGotVar; continue; } break; case StGotVar: // // We have a complete variable in var. Look it up and // see if we can find a good replacement for it. // outputVariable(var); var = ""; state = StStart; continue; } str++; } if (state == StGotVar || state == StVarPlain) { // // The end of string was reached, but we are still trying to // put a variable together. Since we now have a complete // variable, we will look up the value for it. // outputVariable(var); } } //***************************************************************************** void Display::outputVariable(const String& var) { String *temp; String value = ""; const char *ev, *name; // We have a complete variable name in var. Look it up and // see if we can find a good replacement for it, either in our // vars dictionary or in the environment variables. name = var; while (*name == '&' || *name == '%' || *name == '=') name++; temp = (String *) vars[name]; if (temp) value = *temp; else { ev = getenv(name); if (ev) value = ev; } while (--name >= var.get() && value.length()) { if (*name == '%') encodeURL(value); else if(*name == '&') value = HtSGMLCodec::instance()->decode(value); else // (*name == '=') decodeURL(value); } cout << value; } //***************************************************************************** List * Display::buildMatchList() { HtConfiguration* config= HtConfiguration::config(); char *cpid; String url; ResultMatch *thisMatch; SplitMatches matches(*config); double backlink_factor = config->Double("backlink_factor"); double date_factor = config->Double("date_factor"); double backlink_score = 0; double date_score = 0; double base_score = 0; // Additions made here by Mike Grommet ... tm startdate; // structure to hold the startdate specified by the user tm enddate; // structure to hold the enddate specified by the user time_t now = time((time_t *)0); // fill in all fields for mktime tm *lt = localtime(&now); // - Gilles's fix startdate = *lt; enddate = *lt; time_t eternity = ~(1<<(sizeof(time_t)*8-1)); // will be the largest value holdable by a time_t tm endoftime; // the time_t eternity will be converted into a tm, held by this variable time_t timet_startdate; time_t timet_enddate; int monthdays[] = {31,28,31,30,31,30,31,31,30,31,30,31}; // boolean to test to see if we need to build date information or not int dategiven = ((config->Value("startmonth")) || (config->Value("startday")) || (config->Value("startyear")) || (config->Value("endmonth")) || (config->Value("endday")) || (config->Value("endyear"))); // find the end of time lt = gmtime(&eternity); endoftime = *lt; if(dategiven) // user specified some sort of date information { int reldate = ((config->Value("startmonth") < 0) || (config->Value("startday") < 0) || (config->Value("startyear") < 0)); int t; // set up the startdate structure // see man mktime for details on the tm structure startdate.tm_sec = 0; startdate.tm_min = 0; startdate.tm_hour = 0; startdate.tm_yday = 0; startdate.tm_wday = 0; // The concept here is that if a user did not specify a part of a date, // then we will make assumtions... // For instance, suppose the user specified Feb, 1999 as the start // range, we take steps to make sure that the search range date starts // at Feb 1, 1999, // along these same lines: (these are in MM-DD-YYYY format) // Startdates: Date Becomes // 01-01 01-01-1970 // 01-1970 01-01-1970 // 04-1970 04-01-1970 // 1970 01-01-1970 // These things seem to work fine for start dates, as all months have // the same first day however the ending date can't work this way. if(config->Value("startday")) // form input specified a start day { t = config->Value("startday"); if (t < 0) { time_t then = now + (t * (24*60*60)); lt = localtime(&then); startdate.tm_mday = lt->tm_mday; startdate.tm_mon = lt->tm_mon; startdate.tm_year = lt->tm_year; } else startdate.tm_mday = t; // tm days are 1 based, they are passed in as 1 based } else if (!reldate) startdate.tm_mday = 1; // otherwise, no start day, default to 1 if(config->Value("startmonth")) // form input specified a start month { t = config->Value("startmonth"); if (t < 0) startdate.tm_mon += t; else startdate.tm_mon = t - 1; // tm months are zero based. They are passed in as 1 based while (startdate.tm_mon < 0) { startdate.tm_mon += 12; startdate.tm_year--; } } else if (!reldate) startdate.tm_mon = 0; // otherwise, no start month, default to 0 // year is handled a little differently... the tm_year structure // wants the tm_year in a format of year - 1900. // since we are going to convert these dates to a time_t, // a time_t value of zero, the earliest possible date // occurs Jan 1, 1970. If we allow dates < 1970, then we // could get negative time_t values right??? // (barring minor timezone offsets west of GMT, where Epoch is 12-31-69) if(config->Value("startyear")) // form input specified a start year { t = config->Value("startyear"); if (t < 0) startdate.tm_year += t; else { startdate.tm_year = config->Value("startyear") - 1900; if (startdate.tm_year < 69-1900) // correct for 2-digit years 00-68 startdate.tm_year += 2000; // - Gilles's fix if (startdate.tm_year < 0) // correct for 2-digit years 69-99 startdate.tm_year += 1900; } } else if (!reldate) startdate.tm_year = 1970-1900; // otherwise, no start day, specify start at 1970 reldate = ((config->Value("endmonth") < 0) || (config->Value("endday") < 0) || (config->Value("endyear") < 0)); // set up the enddate structure enddate.tm_sec = 59; // allow up to last second of end day enddate.tm_min = 59; // - Gilles's fix enddate.tm_hour = 23; enddate.tm_yday = 0; enddate.tm_wday = 0; if(config->Value("endday") < 0) // form input specified relative end day { // relative end day must be done before month or year t = config->Value("endday"); time_t then = now + (t * (24*60*60)); lt = localtime(&then); enddate.tm_mday = lt->tm_mday; enddate.tm_mon = lt->tm_mon; enddate.tm_year = lt->tm_year; } if(config->Value("endmonth")) // form input specified an end month { t = config->Value("endmonth"); if (t < 0) enddate.tm_mon += t; else enddate.tm_mon = t - 1; // tm months are zero based. They are passed in as 1 based while (enddate.tm_mon < 0) { enddate.tm_mon += 12; enddate.tm_year--; } } else if (!reldate) enddate.tm_mon = 11; // otherwise, no end month, default to 11 if(config->Value("endyear")) // form input specified a end year { t = config->Value("endyear"); if (t < 0) enddate.tm_year += t; else { enddate.tm_year = config->Value("endyear") - 1900; if (enddate.tm_year < 69-1900) // correct for 2-digit years 00-68 enddate.tm_year += 2000; // - Gilles's fix if (enddate.tm_year < 0) // correct for 2-digit years 69-99 enddate.tm_year += 1900; } } else if (!reldate) enddate.tm_year = endoftime.tm_year; // otherwise, no end year, specify end at the end of time allowable // Months have different number of days, and this makes things more // complicated than the startdate range. // Following the example above, here is what we want to happen: // Enddates: Date Becomes // 04-31 04-31-endoftime.tm_year // 05-1999 05-31-1999, may has 31 days... we want to search until the end of may so... // 1999 12-31-1999, search until the end of the year if(config->Value("endday") > 0) // form input specified an end day { enddate.tm_mday = config->Value("endday"); // tm days are 1 based, they are passed in as 1 based } else if (!reldate) { // otherwise, no end day, default to the end of the month enddate.tm_mday = monthdays[enddate.tm_mon]; if (enddate.tm_mon == 1) // February, so check for leap year if (((enddate.tm_year+1900) % 4 == 0 && (enddate.tm_year+1900) % 100 != 0) || (enddate.tm_year+1900) % 400 == 0) enddate.tm_mday += 1; // Feb. 29 - Gilles's fix } // Convert the tm values into time_t values. // Web servers specify modification times in GMT, but htsearch // displays these modification times in the server's local time zone. // For consistency, we would prefer to select based on this same // local time zone. - Gilles's fix timet_startdate = mktime(&startdate); timet_enddate = mktime(&enddate); // I'm not quite sure what behavior I want to happen if // someone reverses the start and end dates, and one of them is invalid. // for now, if there is a completely invalid date on the start or end // date, I will force the start date to time_t 0, and the end date to // the maximum that can be handled by a time_t. if(timet_startdate < 0) timet_startdate = 0; if(timet_enddate < 0) timet_enddate = eternity; // what if the user did something really goofy like choose an end date // that's before the start date if(timet_enddate < timet_startdate) // if so, then swap them so they are in order { time_t timet_temp = timet_enddate; timet_enddate = timet_startdate; timet_startdate = timet_temp; } } else // no date was specifed, so plug in some defaults { timet_startdate = 0; timet_enddate = eternity; } // ... MG URLSeedScore adjustments(*config); // If we knew where to pass it, this would be a good place to pass // on errors from adjustments.ErrMsg(). // Deal with all collections // selected_collections->Start_Get(); Collection *collection= NULL; while ((collection = (Collection *) selected_collections->Get_NextElement())) { ResultList *results = collection->getResultList(); if (results == NULL) continue; results->Start_Get(); while ((cpid = results->Get_Next())) { int id = atoi(cpid); // DocumentRef *thisRef = docDB[id]; DocMatch *dm = results->find(cpid); Collection *collection = NULL; if (dm) collection = dm->collection; if (collection == NULL) continue; DocumentRef *thisRef = collection->getDocumentRef(id); // // If it wasn't there, then ignore it // if (thisRef == 0) { continue; } url = thisRef->DocURL(); HtURLRewriter::instance()->replace(url); if (!includeURL(url.get())) { // Get rid of it to free the memory! delete thisRef; continue; } // Code added by Mike Grommet for date search ranges // check for valid date range. toss it out if it isn't relevant. if ((timet_startdate > 0 || timet_enddate < eternity) && (thisRef->DocTime() < timet_startdate || thisRef->DocTime() > timet_enddate)) { delete thisRef; continue; } thisMatch = ResultMatch::create(); thisMatch->setID(id); thisMatch->setCollection(collection); // // Assign the incomplete score to this match. This score was // computed from the word database only, no excerpt context was // known at that time, or info about the document itself, // so this still needs to be done. // // Moved up: DocMatch *dm = results->find(cpid); double score = dm->score; // We need to scale based on date relevance and backlinks // Other changes to the score can happen now // Or be calculated by the result match in getScore() // This formula derived through experimentation // We want older docs to have smaller values and the // ultimate values to be a reasonable size (max about 100) base_score = score; if (date_factor != 0.0) { // Macro for calculating the date factor (31536000 is the number of // seconds in a 365 days year). The formula gives less weight // as the distance between the date document and the current time // increases (the absolute value is for documents with future date) #define DATE_FACTOR(df, n, dd) ((df) * 100 / (1+(double)(abs((n) - (dd)) / 31536000))) date_score = DATE_FACTOR(date_factor, now, thisRef->DocTime()); score += date_score; } if (backlink_factor != 0.0) { int links = thisRef->DocLinks(); if (links == 0) links = 1; // It's a hack, but it helps... backlink_score = backlink_factor * (thisRef->DocBackLinks() / (double)links); score += backlink_score; } if (debug) { cerr << thisRef->DocURL() << "\n"; } thisMatch->setTime(thisRef->DocTime()); thisMatch->setTitle(thisRef->DocTitle()); score = adjustments.adjust_score(score, thisRef->DocURL()); score = log(1.0 + score); thisMatch->setScore(score); thisMatch->setAnchor(dm->anchor); // // Append this match to our list of matches. // if (score > 0.0) matches.Add(thisMatch, thisRef->DocURL()); // Get rid of it to free the memory! delete thisRef; if (debug) { cerr << " base_score " << base_score << " date_score " << date_score << " backlink_score " << backlink_score << "\n"; cerr << " score " << score << "(" << thisMatch->getScore() << "), maxScore " << maxScore <<", minScore " << minScore << endl; } if (maxScore < score) {if(debug) cerr << "Set maxScore = score" < score && score > 0.0) {if(debug) cerr << "Set minScore = score" <getCollection(); if (config->Boolean("use_meta_description",0) && strlen(ref->DocMetaDsc()) != 0) { // Set the head to point to description head = ref->DocMetaDsc(); use_meta_description = 1; } else { // docDB.ReadExcerpt(*ref); collection->ReadExcerpt(*ref); head = ref->DocHead(); // head points to the top } //head_string = HtSGMLCodec::instance()->decode(head); //head = head_string.get(); int which, length; char *temp = head; String part; String *text = new String(""); StringMatch *allWordsPattern = NULL; if (collection) allWordsPattern = collection->getSearchWordsPattern(); if (!allWordsPattern) return text; // htsearch displays the description when: // 1) a description has been found // 2) the option "use_meta_description" is set to true // If previous conditions are false and "excerpt_show_top" is set to true // it shows the whole head. Else, it acts as default. if (config->Boolean("excerpt_show_top", 0) || use_meta_description || !allWordsPattern->hasPattern()) first = 0; else first = allWordsPattern->FindFirstWord(head, which, length); if (first < 0 && config->Boolean("no_excerpt_show_top")) first = 0; // No excerpt, but we want to show the top. if (first < 0) { // // No excerpt available, don't show top, so display message // if (!config->Find("no_excerpt_text").empty()) { *text << config->Find("no_excerpt_text"); } } else if ( first == 0 || config->Value( "max_excerpts" ) == 1 ) { int headLength = strlen(head); int length = config->Value("excerpt_length", 50); char *start; char *end; WordType type(*config); if (!config->Boolean("add_anchors_to_excerpt")) // negate flag if it's on (anchor available) fanchor = 0; // // Figure out where to start the excerpt. Basically we go back // half the excerpt length from the first matched word // start = &temp[first] - length / 2; if (start < temp) start = temp; else { *text << config->Find("start_ellipses"); while (*start && type.IsStrictChar(*start)) start++; } // // Figure out the end of the excerpt. // end = start + length; if (end > temp + headLength) { end = temp + headLength; *text << hilight(match, start, urlanchor, fanchor); } else { while (*end && type.IsStrictChar(*end)) end++; *end = '\0'; *text << hilight(match, start, urlanchor, fanchor); *text << config->Find("end_ellipses"); } } else { *text = buildExcerpts( allWordsPattern, match, head, urlanchor, fanchor ); } return text; } // //***************************************************************************** // Handle cases where multiple document excerpts are requested. // const String Display::buildExcerpts( StringMatch *allWordsPattern, ResultMatch* match, char *head, String urlanchor, int fanchor ) { HtConfiguration* config= HtConfiguration::config(); if ( !config->Boolean( "add_anchors_to_excerpt" ) ) { fanchor = 0; } int headLength = strlen( head ); int excerptNum = config->Value( "max_excerpts", 1 ); int excerptLength = config->Value( "excerpt_length", 50 ); int lastPos = 0; int curPos = 0; String text; for ( int i = 0; i < excerptNum; ++i ) { int which, termLength; int nextPos = allWordsPattern->FindFirstWord( head + lastPos, which, termLength ); if ( nextPos < 0 ) { // Ran out of matching terms break; } else { // Determine offset from beginning of head curPos = lastPos + nextPos; } // Slip a break in since there is another excerpt coming if ( i != 0 ) { text << "
"; } // Determine where excerpt starts char *start = &head[curPos] - excerptLength / 2; if ( start < head ) { start = head; } else { text << config->Find("start_ellipses"); while ( *start && HtIsStrictWordChar( *start ) ) { start++; } } // Determine where excerpt ends char *end = start + excerptLength; if ( end > head + headLength ) { end = head + headLength; text << hilight( match, start, urlanchor, fanchor ); } else { while ( *end && HtIsStrictWordChar( *end ) ) { end++; } // Save end char so that it can be restored char endChar = *end; *end = '\0'; text << hilight(match, start, urlanchor, fanchor); text << config->Find("end_ellipses"); *end = endChar; } // No more words left to examine in head if ( (lastPos = curPos + termLength) > headLength ) break; } return text; } //***************************************************************************** String Display::hilight(ResultMatch *match, const String& str_arg, const String& urlanchor, int fanchor) { HtConfiguration* config= HtConfiguration::config(); const String start_highlight = config->Find("start_highlight"); const String end_highlight = config->Find("end_highlight"); const String anchor_target = config->Find("anchor_target"); const char *str = str_arg; String result; int pos = 0; int which, length; WeightWord *ww; int first = 1; String s; #define SGMLencodedChars(p, l) (s = 0, s.append(p, l), HtSGMLCodec::instance()->decode(s)) result = 0; Collection *collection = match->getCollection(); StringMatch *allWordsPattern = NULL; if (collection) allWordsPattern = collection->getSearchWordsPattern(); List *searchWords = NULL; if (collection) searchWords = collection->getSearchWords(); if (!allWordsPattern || !searchWords) return result; while (allWordsPattern->hasPattern() && (pos = allWordsPattern->FindFirstWord(str, which, length)) >= 0) { //result.append(str, pos); result << SGMLencodedChars(str, pos); ww = (WeightWord *) (*searchWords)[which]; result << start_highlight; if (first && fanchor) { result << " 0) result << "target=\"" << anchor_target << "\" "; result << "href=\"" << urlanchor << "\">"; } //result.append(str + pos, length); result << SGMLencodedChars(str + pos, length); if (first && fanchor) result << ""; result << end_highlight; str += pos + length; first = 0; } //result.append(str); result << SGMLencodedChars(str, strlen(str)); return result; } //***************************************************************************** void Display::sort(List *matches) { HtConfiguration* config= HtConfiguration::config(); int numberOfMatches = matches->Count(); int i; if (numberOfMatches <= 1) return; ResultMatch **array = new ResultMatch*[numberOfMatches]; for (i = 0; i < numberOfMatches; i++) { array[i] = (ResultMatch *)(*matches)[i]; } matches->Release(); qsort((char *) array, numberOfMatches, sizeof(ResultMatch *), array[0]->getSortFun()); const String st = config->Find("sort"); if (!st.empty() && mystrncasecmp("rev", st, 3) == 0) { for (i = numberOfMatches; --i >= 0; ) matches->Add(array[i]); } else { for (i = 0; i < numberOfMatches; i++) matches->Add(array[i]); } delete [] array; } //***************************************************************************** void Display::logSearch(int page, List *matches) { //Note: This is Posix and dependent on a running syslogd.. //does not work for Win32 //TODO: Look into using native windows system logs instead #ifndef _MSC_VER /* _WIN32 */ HtConfiguration* config= HtConfiguration::config(); // Currently unused time_t t; int nMatches = 0; int level = LOG_LEVEL; int facility = LOG_FACILITY; char *host = getenv("REMOTE_HOST"); char *ref = getenv("HTTP_REFERER"); if (host == NULL) host = getenv("REMOTE_ADDR"); if (host == NULL) host = "-"; if (ref == NULL) ref = "-"; if (matches) nMatches = matches->Count(); openlog("htsearch", LOG_PID, facility); syslog(level, "%s [%s] (%s) [%s] [%s] (%d/%s) - %d -- %s\n", host, input->exists("config") ? input->get("config") : "default", (const char*)config->Find("match_method"), input->exists("words") ? input->get("words") : "", logicalWords.get(), nMatches, (const char*)config->Find("matches_per_page"), page, ref ); #endif }