summary refs log tree commit diff stats
path: root/klinkstatus/src/parser/htmlparser.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'klinkstatus/src/parser/htmlparser.cpp')
-rw-r--r-- klinkstatus/src/parser/htmlparser.cpp 455
1 files changed, 455 insertions, 0 deletions
diff --git a/klinkstatus/src/parser/htmlparser.cpp b/klinkstatus/src/parser/htmlparser.cpp
new file mode 100644
index 00000000..6bc93761
--- /dev/null
+++ b/klinkstatus/src/parser/htmlparser.cpp
@@ -0,0 +1,455 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+#include "htmlparser.h"
+
+#include <kapplication.h>
+#include <kdebug.h>
+
+
+/**
+ * Builds a parser over a copy of the given HTML text and immediately runs
+ * every extraction pass, filling nodes_, the BASE node, the TITLE node and
+ * the Content-Type META node.
+ *
+ * @param documento  HTML source to parse; asserted to be non-empty.
+ */
+HtmlParser::HtmlParser(QString const& documento)
+    : is_content_type_set_(false),
+      document_(documento)
+{
+    Q_ASSERT(!documento.isEmpty());
+
+    // Scripts are removed first: comment markers inside scripts follow a
+    // different syntax, so comments are only stripped afterwards.
+    stripScriptContent();
+    stripComments();
+
+    // Pre-size the node list from an estimate based on the size of the
+    // original (unstripped) text; this is only a best guess.
+    nodes_.reserve(estimativaLinks(documento.length() * 2));
+
+    // The passes below append to nodes_ in this fixed order.
+    parseNodesOfTypeA();
+    parseNodesOfTypeAREA();
+    parseNodesOfTypeLINK();
+    parseNodesOfTypeMETA();
+    parseNodesOfTypeIMG();
+    parseNodesOfTypeFRAME();
+    parseNodesOfTypeIFRAME();
+    parseNodesOfTypeBASE();
+    parseNodesOfTypeTITLE();
+}
+
+/**
+ * @return true when the stored BASE node reports element type Node::BASE
+ *         and its URL is not empty.
+ */
+bool HtmlParser::hasBaseUrl() const
+{
+    if(node_BASE_.element() != Node::BASE)
+        return false;
+
+    return !node_BASE_.url().isEmpty();
+}
+
+/**
+ * @return the stored BASE node. Asserts that hasBaseUrl() holds, so callers
+ *         are expected to check it first.
+ */
+NodeBASE const& HtmlParser::baseUrl() const
+{
+    Q_ASSERT(hasBaseUrl());
+
+    return node_BASE_;
+}
+
+/**
+ * @return the META node recorded as carrying the Content-Type declaration.
+ *         Asserts that hasContentType() holds.
+ */
+NodeMETA const& HtmlParser::contentTypeMetaNode() const
+{
+    Q_ASSERT(hasContentType());
+
+    return node_META_content_type_;
+}
+
+/**
+ * @return true when the stored TITLE node reports element type Node::TITLE
+ *         and its title attribute is not empty.
+ */
+bool HtmlParser::hasTitle() const
+{
+    if(node_TITLE_.element() != Node::TITLE)
+        return false;
+
+    return !node_TITLE_.attributeTITLE().isEmpty();
+}
+
+/**
+ * @return the stored TITLE node. Asserts that hasTitle() holds, so callers
+ *         are expected to check it first.
+ */
+NodeTITLE const& HtmlParser::title() const
+{
+    Q_ASSERT(hasTitle());
+
+    return node_TITLE_;
+}
+
+/**
+ * Collects the raw text of every `element` tag found in this parser's
+ * document into the scratch vector aux_.
+ *
+ * @param element  tag name to look for (e.g. "A", "IMG").
+ * @return reference to aux_; its contents are overwritten by the next call.
+ */
+vector<QString> const& HtmlParser::parseNodesOfType(QString const& element)
+{
+    parseNodesOfType(element, document_, aux_);
+    return aux_;
+}
+
+/**
+ * Scans `document` for tags named `tipo` and stores the raw text of each
+ * one in `nodes` (which is cleared first).
+ *
+ * For "A" the stored text runs up to the position findWord() reports for
+ * "</A>"; for every other tag it runs up to the position endOfTag() reports
+ * for the closing '>'.
+ *
+ * @param tipo      tag name without the leading '<'.
+ * @param document  text to scan; it is copied, the argument is not modified.
+ * @param nodes     output vector receiving one string per tag found.
+ */
+void HtmlParser::parseNodesOfType(QString const& tipo, QString const& document, vector<QString>& nodes)
+{
+    QString doc(document);
+    QString const tag_opener = "<" + tipo;
+    bool const is_anchor = (upperCase(tipo) == "A");
+
+    // Distance from the index returned by findSeparableWord() back to the
+    // '<' that opens the tag.
+    int const tag_begining_go_back = (tipo.length() + QString("<").length());
+
+    nodes.clear();
+    if(is_anchor)
+        nodes.reserve(estimativaLinks(doc.length() * 2));
+
+    while(true)
+    {
+        // The returned index is used below as the position just past the
+        // matched "<tipo".
+        int const inicio = findSeparableWord(doc, tag_opener);
+        if(inicio == -1)
+            return;
+
+        // The tag name must be followed by whitespace; otherwise this is a
+        // longer tag name (e.g. "<AREA" while looking for "<A").
+        if(!::isSpace(doc[inicio]))
+        {
+            // Drop everything through the rejected match in one step.
+            // Trimming only a few characters from the front would make the
+            // next search find this same match again, which is quadratic on
+            // large documents.
+            doc.remove(0, inicio);
+            continue;
+        }
+
+        int const fim = is_anchor ? findWord(doc, "</A>", inicio)
+                                  : endOfTag(doc, inicio, '>');
+        if(fim == -1)
+        {
+            // Unterminated tag: skip past it and keep looking.
+            doc.remove(0, inicio);
+            continue;
+        }
+
+        nodes.push_back(doc.mid(inicio - tag_begining_go_back,
+                                fim - inicio + tag_begining_go_back));
+
+        // Continue scanning after the tag just consumed.
+        doc.remove(0, fim);
+    }
+}
+
+/**
+ * Finds the end of a tag starting the search at `index`, skipping over
+ * `end_of_tag` characters that sit inside a double-quoted span.
+ *
+ * @param s           text being scanned.
+ * @param index       position to start searching from.
+ * @param end_of_tag  character that terminates the tag.
+ * @return the index one past the terminating character, or -1 when `index`
+ *         is out of range, no terminator exists, or an opening quote sits
+ *         at the very end of the text.
+ */
+int HtmlParser::endOfTag(QString const& s, int index, QChar end_of_tag)
+{
+    if( (uint)index >= s.length() )
+        return -1;
+
+    int const terminator = s.find(end_of_tag, index);
+    if(terminator == -1)
+        return -1;
+
+    // No quote at all, or the terminator comes before the first quote:
+    // the terminator really ends the tag.
+    int const quote_open = s.find('"', index);
+    if(quote_open == -1 || terminator < quote_open)
+        return terminator + 1;
+
+    // Opening quote too close to the end of the text to hold a closing one.
+    if( ((uint)quote_open + 1) >= s.length() - 1 )
+        return -1;
+
+    int const quote_close = s.find('"', quote_open + 1);
+    if(quote_close != -1)
+    {
+        // Resume the search after the quoted span.
+        return endOfTag(s, quote_close + 1, end_of_tag);
+    }
+
+    // Unbalanced quote: report it and fall back to the first terminator.
+    kdDebug(23100) << "Mismatched quotes (\"): " << s.mid(index, terminator - index) << endl;
+    return terminator + 1;
+}
+
+/**
+ * @return read-only access to the list of nodes collected by the parse
+ *         passes run from the constructor.
+ */
+vector<Node*> const& HtmlParser::nodes() const
+{
+    return nodes_;
+}
+
+
+/** Appends one heap-allocated NodeA to nodes_ for each "A" tag found. */
+void HtmlParser::parseNodesOfTypeA()
+{
+    vector<QString> const& tags = parseNodesOfType("A");
+
+    for(vector<QString>::const_iterator it = tags.begin(); it != tags.end(); ++it)
+        nodes_.push_back( new NodeA(*it) );
+}
+
+/** Appends one heap-allocated NodeAREA to nodes_ for each "AREA" tag found. */
+void HtmlParser::parseNodesOfTypeAREA()
+{
+    vector<QString> const& tags = parseNodesOfType("AREA");
+
+    for(vector<QString>::const_iterator it = tags.begin(); it != tags.end(); ++it)
+        nodes_.push_back( new NodeAREA(*it) );
+}
+
+/** Appends one heap-allocated NodeLINK to nodes_ for each "LINK" tag found. */
+void HtmlParser::parseNodesOfTypeLINK()
+{
+    vector<QString> const& tags = parseNodesOfType("LINK");
+
+    for(vector<QString>::const_iterator it = tags.begin(); it != tags.end(); ++it)
+    {
+        nodes_.push_back( new NodeLINK(*it) );
+    }
+}
+
+/**
+ * Appends one heap-allocated NodeMETA to nodes_ for each "META" tag found.
+ * The first tag whose HTTP-EQUIV attribute equals "Content-Type"
+ * (compared in lower case) is also stored in node_META_content_type_.
+ */
+void HtmlParser::parseNodesOfTypeMETA()
+{
+    vector<QString> const& tags = parseNodesOfType("META");
+
+    for(vector<QString>::const_iterator it = tags.begin(); it != tags.end(); ++it)
+    {
+        NodeMETA* meta = new NodeMETA(*it);
+        nodes_.push_back(meta);
+
+        // Only the first Content-Type declaration is remembered.
+        if(is_content_type_set_)
+            continue;
+
+        if(meta->atributoHTTP_EQUIV().lower() == QString("Content-Type").lower())
+        {
+            is_content_type_set_ = true;
+            node_META_content_type_.setNode(*it);
+        }
+    }
+}
+
+/**
+ * Looks through the META tags of `html` for the first one whose HTTP-EQUIV
+ * attribute equals "Content-Type" (compared in lower case).
+ *
+ * @param html  text to scan.
+ * @return that node's charset(), or a null QString when no such tag exists.
+ */
+QString HtmlParser::findCharsetInMetaElement(QString const& html)
+{
+    vector<QString> meta_tags;
+    parseNodesOfType("META", html, meta_tags);
+
+    QString const wanted = QString("Content-Type").lower();
+
+    for(vector<QString>::const_iterator it = meta_tags.begin(); it != meta_tags.end(); ++it)
+    {
+        NodeMETA candidate(*it);
+
+        if(candidate.atributoHTTP_EQUIV().lower() != wanted)
+            continue;
+
+        return candidate.charset();
+    }
+
+    return QString();
+}
+
+/** Appends one heap-allocated NodeIMG to nodes_ for each "IMG" tag found. */
+void HtmlParser::parseNodesOfTypeIMG()
+{
+    vector<QString> const& tags = parseNodesOfType("IMG");
+
+    for(vector<QString>::const_iterator it = tags.begin(); it != tags.end(); ++it)
+    {
+        nodes_.push_back( new NodeIMG(*it) );
+    }
+}
+
+/** Appends one heap-allocated NodeFRAME to nodes_ for each "FRAME" tag found. */
+void HtmlParser::parseNodesOfTypeFRAME()
+{
+    vector<QString> const& tags = parseNodesOfType("FRAME");
+
+    for(vector<QString>::const_iterator it = tags.begin(); it != tags.end(); ++it)
+    {
+        nodes_.push_back( new NodeFRAME(*it) );
+    }
+}
+
+/**
+ * Appends one node to nodes_ for each "IFRAME" tag found. The node is
+ * built as a NodeFRAME, the same class used for "FRAME" tags.
+ */
+void HtmlParser::parseNodesOfTypeIFRAME()
+{
+    vector<QString> const& tags = parseNodesOfType("IFRAME");
+
+    for(vector<QString>::const_iterator it = tags.begin(); it != tags.end(); ++it)
+    {
+        nodes_.push_back( new NodeFRAME(*it) );
+    }
+}
+
+/**
+ * Looks for the first "<BASE" in the document and, when it is followed by
+ * whitespace and a later ">", hands the text between those two positions
+ * to node_BASE_. Leaves node_BASE_ untouched otherwise.
+ */
+void HtmlParser::parseNodesOfTypeBASE()
+{
+    QString doc = document_;
+
+    int const tag_pos = findSeparableWord(doc, "<BASE");
+    if(tag_pos == -1)
+        return;
+
+    // The tag name must be followed by whitespace.
+    if(!doc[tag_pos].isSpace())
+        return;
+
+    int const close_pos = doc.find(">", tag_pos);
+    if(close_pos == -1)
+        return;
+
+    QString base_text = doc.mid(tag_pos, close_pos - tag_pos);
+    node_BASE_.setNode(base_text);
+}
+
+/**
+ * Looks for "<TITLE>" and a following "</TITLE>" in the document and hands
+ * the text between the two positions reported by findSeparableWord() to
+ * node_TITLE_. Leaves node_TITLE_ untouched when either search fails.
+ */
+void HtmlParser::parseNodesOfTypeTITLE()
+{
+    QString doc = document_;
+
+    int const open_pos = findSeparableWord(doc, "<TITLE>");
+    if(open_pos == -1)
+        return;
+
+    int const close_pos = findSeparableWord(doc, "</TITLE>", open_pos);
+    if(close_pos == -1)
+        return;
+
+    QString title_text = doc.mid(open_pos, close_pos - open_pos);
+    node_TITLE_.setNode(title_text);
+}
+
+
+/**
+ * Removes every "<!-- ... -->" span from document_, appending each removed
+ * span (preceded by a newline) to comments_. When an opening marker has no
+ * closing marker, only the opening marker itself is removed.
+ */
+void HtmlParser::stripComments()
+{
+    QString const open_marker = "<!--";
+    QString const close_marker = "-->";
+    uint const open_marker_length = open_marker.length();
+
+    for(;;)
+    {
+        int const after_open = findWord(document_, open_marker);
+        if(after_open == -1)
+            break;
+
+        // after_open is used as the index just past the opening marker, so
+        // stepping back by the marker length gives where the span starts.
+        uint const span_start = after_open - open_marker_length;
+
+        int const after_close = findWord(document_, close_marker, after_open);
+        if(after_close == -1)
+        {
+            kdDebug(23100) << "End of comment is missing!" << endl;
+            document_.remove(span_start, open_marker_length);
+            continue;
+        }
+
+        uint const span_length = after_close - after_open + open_marker_length;
+        comments_ += "\n" + document_.mid(span_start, span_length);
+        document_.remove(span_start, span_length);
+    }
+}
+
+/**
+ * Removes every "<script ... </script>" span from document_, appending
+ * each removed span (preceded by a newline) to script_. When an opening
+ * "<script" has no closing "</script>", only the opening text is removed.
+ */
+void HtmlParser::stripScriptContent()
+{
+    QString const open_marker = "<script";
+    QString const close_marker = "</script>";
+    uint const open_marker_length = open_marker.length();
+
+    for(;;)
+    {
+        int const after_open = findWord(document_, open_marker);
+        if(after_open == -1)
+            break;
+
+        // after_open is used as the index just past the opening marker, so
+        // stepping back by the marker length gives where the span starts.
+        uint const span_start = after_open - open_marker_length;
+
+        int const after_close = findWord(document_, close_marker, after_open);
+        if(after_close == -1)
+        {
+            kdDebug(23100) << "Malformed script tag!" << endl;
+            document_.remove(span_start, open_marker_length);
+            continue;
+        }
+
+        uint const span_length = after_close - after_open + open_marker_length;
+        script_ += "\n" + document_.mid(span_start, span_length);
+        document_.remove(span_start, span_length);
+    }
+}
+
+
+
+
+#include <iostream>
+/**
+ * Debug helper ("mostra" is Portuguese for "show"): writes the collected
+ * A, LINK, META, IMG and FRAME nodes, followed by the BASE url, to kdDebug
+ * area 23100, one section per element type.
+ */
+void HtmlParser::mostra() const
+{
+    typedef vector<Node*>::size_type index_type;
+    index_type const total = nodes_.size();
+
+    kdDebug(23100) << "\nA:\n\n";
+    for(index_type n = 0; n != total; ++n)
+    {
+        Node* const current = nodes_[n];
+        if(current->element() != Node::A)
+            continue;
+        kdDebug(23100) << current->url() << "\t" << current->linkLabel() << endl;
+    }
+    kdDebug(23100) << "____________________________________________________________________" << endl;
+
+    kdDebug(23100) << "\nLINK:\n\n";
+    for(index_type n = 0; n != total; ++n)
+    {
+        Node* const current = nodes_[n];
+        if(current->element() != Node::LINK)
+            continue;
+        kdDebug(23100) << current->url() << "\t" << current->linkLabel() << endl;
+    }
+    kdDebug(23100) << "____________________________________________________________________" << endl;
+
+    kdDebug(23100) << "\nMETA:\n";
+    for(index_type n = 0; n != total; ++n)
+    {
+        if(nodes_[n]->element() != Node::META)
+            continue;
+
+        // The original build uses a plain cast on Q_WS_WIN and dynamic_cast
+        // everywhere else; that split is kept as is.
+#if defined Q_WS_WIN
+        NodeMETA* nm = (NodeMETA*)nodes_[n];
+#else
+
+        NodeMETA* nm = dynamic_cast<NodeMETA*>(nodes_[n]);
+#endif
+
+        kdDebug(23100) << nm->url() << endl
+        << nm->atributoHTTP_EQUIV() << endl
+        << nm->atributoNAME() << endl
+        << nm->atributoCONTENT() << endl;
+    }
+    kdDebug(23100) << "____________________________________________________________________" << endl;
+
+    kdDebug(23100) << "\nIMG:\n\n";
+    for(index_type n = 0; n != total; ++n)
+    {
+        Node* const current = nodes_[n];
+        if(current->element() != Node::IMG)
+            continue;
+        kdDebug(23100) << current->url() << "\t"
+        << current->linkLabel() << endl;
+    }
+    kdDebug(23100) << "____________________________________________________________________" << endl;
+
+    kdDebug(23100) << "\nFRAME:\n\n";
+    for(index_type n = 0; n != total; ++n)
+    {
+        Node* const current = nodes_[n];
+        if(current->element() != Node::FRAME)
+            continue;
+        kdDebug(23100) << current->url() << endl;
+    }
+    kdDebug(23100) << "____________________________________________________________________" << endl;
+
+    kdDebug(23100) << "\nBASE:\n\n";
+    kdDebug(23100) << node_BASE_.url() << endl;
+
+    kdDebug(23100) << "____________________________________________________________________" << endl;
+
+}
+
+#ifdef HTMLPARSER
+
+#include <fstream>
+
+/**
+ * Stand-alone test driver, compiled only when HTMLPARSER is defined.
+ * Reads a hard-coded HTML file, parses it, dumps the parser state with
+ * mostra() and then prints the CONTENT attribute of every META node.
+ */
+int main()
+{
+    //ifstream stream("aterraprometida.html");
+    //ifstream stream("/var/www/html/STL/standard_library.html");
+    //ifstream stream("/var/www/html/qt-doc/functions.html");
+    ifstream stream("/var/www/html/index.html");
+
+    // Test the result of get() itself: checking the stream state before the
+    // read would append one stale (or uninitialised) character once the end
+    // of the file is hit, or when the file cannot be opened.
+    QString content;
+    char c;
+    while(stream.get(c))
+        content += c;
+
+    // kdDebug(23100) << content << endl;
+    kdDebug(23100) << "__________________________________________________________" << endl;
+    HtmlParser parser(content);
+    parser.mostra();
+    kdDebug(23100) << "__________________________________________________________\n\n\n" << endl;
+
+    // Bind by reference: the parser owns the list, no copy is needed.
+    vector<Node*> const& nods = parser.nodes();
+    for(vector<Node*>::size_type i = 0; i != nods.size(); ++i)
+    {
+        if(nods[i]->element() == Node::META)
+        {
+            NodeMETA* nod_meta = (NodeMETA*)(nods[i]);
+
+            kdDebug(23100) << nod_meta->atributoCONTENT() << endl;
+        }
+    }
+}
+
+
+#endif