You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdepim/akregator/src/librss/feeddetector.cpp

180 lines
5.2 KiB

/*
This file is part of Akregator.
Copyright (C) 2004 Teemu Rytilahti <tpr@d5k.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
As a special exception, permission is given to link this program
with any edition of TQt, and distribute the resulting executable,
without including the source code for TQt in the source distribution.
*/
#include <tqregexp.h>
#include <tqstring.h>
#include <tqstringlist.h>
#include <tqvaluelist.h>
#include <kcharsets.h>
#include <kurl.h>
#include "feeddetector.h"
using namespace RSS;
/**
 * Scans an HTML document for <link> tags that advertise a feed.
 *
 * A tag is considered when its rel attribute is "alternate" or
 * "service.feed"; it is accepted when its type attribute is one of
 * application/rss+xml, application/rdf+xml, application/atom+xml or
 * text/xml. Attribute values are only recognised inside double quotes.
 *
 * @param s the HTML source to scan
 * @return one (url, title) entry per accepted tag that carries a non-empty
 *         href; when the tag has no title the URL is used as the title
 */
FeedDetectorEntryList FeedDetector::extractFromLinkTags(const TQString& s)
{
    // Collapse every run of whitespace to a single space, so the patterns
    // below only have to tolerate one optional blank around '=' and quotes.
    const TQString html = s.simplifyWhiteSpace();

    // All patterns are matched case-insensitively (second argument).
    // whole <link ... rel="alternate|service.feed" ...> tag
    TQRegExp reLinkTag("<[\\s]?LINK[^>]*REL[\\s]?=[\\s]?\\\"[\\s]?(ALTERNATE|SERVICE\\.FEED)[\\s]?\\\"[^>]*>", false);
    // href="..." -> capture 1 is the URL
    TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
    // type="..." -> capture 1 is the MIME type
    TQRegExp reType("TYPE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
    // title="..." -> capture 1 is the title
    TQRegExp reTitle("TITLE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);

    // First pass: collect the text of every candidate <link> tag.
    TQStringList linkTags;
    int cursor = 0;
    for (;;)
    {
        const int found = reLinkTag.search(html, cursor);
        if (found == -1)
            break;

        const int length = reLinkTag.matchedLength();
        linkTags.append(html.mid(found, length));
        cursor = found + length; // resume right behind this tag
    }

    // Second pass: pick the attributes out of each tag.
    FeedDetectorEntryList entries;
    for (TQStringList::Iterator it = linkTags.begin(); it != linkTags.end(); ++it)
    {
        const TQString& tag = *it;

        TQString mimeType;
        if (reType.search(tag, 0) != -1)
            mimeType = reType.cap(1).lower();

        // only type attributes that indicate a feed are accepted
        const bool isFeedType = mimeType == "application/rss+xml"
                             || mimeType == "application/rdf+xml"
                             || mimeType == "application/atom+xml"
                             || mimeType == "text/xml";
        if (!isFeedType)
            continue;

        TQString title;
        if (reTitle.search(tag, 0) != -1)
            title = reTitle.cap(1);
        title = KCharsets::resolveEntities(title);

        TQString url;
        if (reHref.search(tag, 0) != -1)
            url = reHref.cap(1);
        url = KCharsets::resolveEntities(url);

        // without a title, the URL serves as a preliminary one (until the feed is parsed)
        if (title.isEmpty())
            title = url;

        if (!url.isEmpty())
            entries.append(FeedDetectorEntry(url, title));
    }

    return entries;
}
/**
 * Fallback feed discovery: collects the targets of <a href="..."> tags whose
 * URL ends in "rss", "rdf" or "xml" (compared case-insensitively).
 *
 * Only double-quoted href values are recognised. The URLs are returned as
 * they appear in the document, after HTML entities have been resolved.
 *
 * @param s the HTML source to scan
 * @return the matching URLs in document order; duplicates are not removed
 */
TQStringList FeedDetector::extractBruteForce(const TQString& s)
{
    // Collapse every run of whitespace to a single space, so the patterns
    // below only have to tolerate one optional blank around '='.
    const TQString str = s.simplifyWhiteSpace();

    // Whole anchor tag. "[^>]*" between the tag name and HREF lets other
    // attributes precede href (e.g. <a class="x" href="feed.rss">); the
    // previous "[^>]?" allowed a single character only and silently skipped
    // such anchors. An optional blank before '=' is accepted as well, in line
    // with reHref below.
    TQRegExp reAhrefTag("<[\\s]?A[^>]*HREF[\\s]?=[\\s]?\\\"[^\\\"]*\\\"[^>]*>", false);
    // href="..." -> capture 1 is the URL
    TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
    // used with exactMatch(): the URL has to END in rss, rdf or xml
    TQRegExp rssrdfxml(".*(RSS|RDF|XML)", false);

    TQStringList list;
    int pos = 0;
    for (;;)
    {
        const int matchpos = reAhrefTag.search(str, pos);
        if (matchpos == -1)
            break;

        const int matchlen = reAhrefTag.matchedLength();
        const TQString ahref = str.mid(matchpos, matchlen);

        if (reHref.search(ahref, 0) != -1)
        {
            const TQString url = KCharsets::resolveEntities(reHref.cap(1));
            if (rssrdfxml.exactMatch(url))
                list.append(url);
        }

        pos = matchpos + matchlen; // resume right behind this tag
    }

    return list;
}
/**
 * Turns a URL found in a document into a full URL string, using the
 * document's own URL as the base where necessary.
 *
 * Cases, as decided by KURL::isRelativeURL() and the leading characters:
 *  - not relative: used as is
 *  - starts with "//": the protocol of @p baseurl is put in front
 *  - starts with "/": resolved against @p baseurl with its path and query
 *    removed
 *  - anything else: resolved against @p baseurl
 * The path of the result is cleaned with KURL::cleanPath() in every case.
 *
 * @param s       the URL as found in the document
 * @param baseurl the URL of the document @p s was found in
 * @return the resulting URL as a string
 */
TQString FeedDetector::fixRelativeURL(const TQString &s, const KURL &baseurl)
{
    TQString candidate = s;
    KURL resolved;

    if (!KURL::isRelativeURL(candidate))
    {
        resolved = candidate;
    }
    else if (candidate.startsWith("//"))
    {
        // protocol-relative: borrow the protocol of the base URL
        candidate.prepend(baseurl.protocol() + ":");
        resolved = candidate;
    }
    else if (candidate.startsWith("/"))
    {
        // strip path and query from the base, so that only protocol://host remains
        KURL hostOnly(baseurl);
        hostOnly.setPath(TQString());
        hostOnly.setQuery(TQString());
        resolved = KURL(hostOnly, candidate.remove(0, 1)); // drop the leading "/"
    }
    else
    {
        resolved = KURL(baseurl, candidate);
    }

    resolved.cleanPath();
    return resolved.url();
}