/* This file is part of the KDE libraries
    Copyright (C) 1997 Martin Jones (mjones@kde.org)
              (C) 1997 Torben Weis (weis@kde.org)
              (C) 1998 Waldo Bastian (bastian@kde.org)
              (C) 1999 Lars Knoll (knoll@kde.org)
              (C) 1999 Antti Koivisto (koivisto@kde.org)
              (C) 2001-2003 Dirk Mueller (mueller@kde.org)
              (C) 2004 Apple Computer, Inc.
              (C) 2006 Germain Garand (germain@ebooksfrance.org)

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/
//----------------------------------------------------------------------------
//
// KDE HTML Widget - Tokenizers

//#define TOKEN_DEBUG 1
//#define TOKEN_DEBUG 2

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "html/htmltokenizer.h"
#include "html/html_documentimpl.h"
#include "html/htmlparser.h"
#include "html/dtd.h"
#include "misc/loader.h"
#include "misc/htmlhashes.h"
#include "tdehtmlview.h"
#include "tdehtml_part.h"
#include "xml/dom_docimpl.h"
#include "css/csshelper.h"
#include "ecma/kjs_proxy.h"
#include
#include
#include
#include
#include
#include
#include

#include "kentities.c"

using namespace tdehtml;

static const TQChar commentStart [] = { '<','!','-','-', TQChar::null };

static const char scriptEnd [] = "deref(this);

    if ( buffer )
        TDEHTML_DELETE_QCHAR_VEC(buffer);
    buffer = dest = 0;
    size = 0;

    if ( scriptCode )
        TDEHTML_DELETE_QCHAR_VEC(scriptCode);
    scriptCode = 0;
    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;

    if (m_autoCloseTimer) {
        killTimer(m_autoCloseTimer);
        m_autoCloseTimer = 0;
    }

    currToken.reset();
}

void HTMLTokenizer::begin()
{
    m_executingScript = 0;
    onHold = false;
    reset();
    size = 254;
    buffer = TDEHTML_ALLOC_QCHAR_VEC( 255 );
    dest = buffer;
    tag = NoTag;
    pending = NonePending;
    discard = NoneDiscard;
    pre = false;
    prePos = 0;
    plaintext = false;
    xmp = false;
    processingInstruction = false;
    script = false;
    escaped = false;
    style = false;
    skipLF = false;
    select = false;
    comment = false;
    server = false;
    textarea = false;
    title = false;
    startTag = false;
    tquote = NoQuote;
    searchCount = 0;
    Entity = NoEntity;
    noMoreData = false;
    brokenComments = false;
    brokenServer = false;
    brokenScript = false;
    lineno = 0;
    scriptStartLineno = 0;
    tagStartLineno = 0;
}

void HTMLTokenizer::processListing(TokenizerString list)
{
    bool old_pre = pre;
    // This function adds the listing 'list' as
    // preformatted text-tokens to the token-collection
    // thereby converting TABs.
    if(!style) pre = true;
    prePos = 0;

    while ( !list.isEmpty() )
    {
        checkBuffer(3*TAB_SIZE);

        if (skipLF && ( *list != '\n' ))
        {
            skipLF = false;
        }

        if (skipLF)
        {
            skipLF = false;
            ++list;
        }
        else if (( *list == '\n' ) || ( *list == '\r' ))
        {
            if (discard == LFDiscard)
            {
                // Ignore this LF
                discard = NoneDiscard; // We have discarded 1 LF
            }
            else
            {
                // Process this LF
                if (pending)
                    addPending();

                // we used to do it not at all and we want to have
                // it fixed for textarea.
                // So here we are
                if ( textarea ) {
                    prePos++;
                    *dest++ = *list;
                } else
                    pending = LFPending;
            }
            /* Check for MS-DOS CRLF sequence */
            if (*list == '\r')
            {
                skipLF = true;
            }
            ++list;
        }
        else if (( *list == ' ' ) || ( *list == '\t'))
        {
            if (pending)
                addPending();
            if (*list == ' ')
                pending = SpacePending;
            else
                pending = TabPending;
            ++list;
        }
        else
        {
            discard = NoneDiscard;
            if (pending)
                addPending();

            prePos++;
            *dest++ = *list;
            ++list;
        }
    }

    if ((pending == SpacePending) || (pending == TabPending))
        addPending();
    else
        pending = NonePending;

    prePos = 0;
    pre = old_pre;
}

void HTMLTokenizer::parseSpecial(TokenizerString &src)
{
    assert( textarea || title || !Entity );
    assert( !tag );
    assert( xmp+textarea+title+style+script == 1 );

    if (script)
        scriptStartLineno = lineno+src.lineCount();

    if ( comment ) parseComment( src );

    while ( !src.isEmpty() ) {
        checkScriptBuffer();
        unsigned char ch = src->latin1();

        if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && ch == '-' && scriptCodeSize >= 3 && !src.escaped() &&
             TQConstString( scriptCode+scriptCodeSize-3, 3 ).string() == "' ) )
        {
            ++src;
            scriptCodeSize = scriptCodeResync-1;
            scriptCodeResync = 0;
            scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
            if ( script )
                scriptHandler();
            else {
                processListing(TokenizerString(scriptCode, scriptCodeSize));
                processToken();
                if ( style )         { currToken.tid = ID_STYLE + ID_CLOSE_TAG; }
                else if ( textarea ) { currToken.tid = ID_TEXTAREA + ID_CLOSE_TAG; }
                else if ( title )    { currToken.tid = ID_TITLE + ID_CLOSE_TAG; }
                else if ( xmp )      { currToken.tid = ID_XMP + ID_CLOSE_TAG; }
                processToken();
                script = style = textarea = title = xmp = false;
                tquote = NoQuote;
                scriptCodeSize = scriptCodeResync = 0;
            }
            return;
        }
        // possible end of tagname, lets check.
        if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
             scriptCodeSize >= searchStopperLen &&
             !TQConstString( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).string().find( searchStopper, 0, false ))
        {
            scriptCodeResync = scriptCodeSize-searchStopperLen+1;
            tquote = NoQuote;
            continue;
        }
        if ( scriptCodeResync && !escaped ) {
            if(ch == '\"')
                tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
            else if(ch == '\'')
                tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
            else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
                tquote = NoQuote;
        }
        escaped = ( !escaped && ch == '\\' );
        if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') {
            TQChar *scriptCodeDest = scriptCode+scriptCodeSize;
            ++src;
            parseEntity(src,scriptCodeDest,true);
            scriptCodeSize = scriptCodeDest-scriptCode;
        }
        else {
            scriptCode[ scriptCodeSize++ ] = *src;
            ++src;
        }
    }
}

void HTMLTokenizer::scriptHandler()
{
    TQString currentScriptSrc = scriptSrc;
    scriptSrc = TQString::null;

    processListing(TokenizerString(scriptCode, scriptCodeSize));
    TQString exScript( buffer, dest-buffer );

    processToken();
    currToken.tid = ID_SCRIPT + ID_CLOSE_TAG;
    processToken();

    // Scripts following a frameset element should not be executed,
    // or even loaded in the case of external scripts.
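    // (Added illustration, not part of the original sources: the case being
    //  guarded against below is markup along the lines of
    //
    //      <frameset cols="*"> ... </frameset>
    //      <script src="late.js"></script>        <- "late.js" is hypothetical
    //
    //  once the document body is a FRAMESET, such a script must neither be
    //  fetched nor run, so it is treated as not "effective".)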
    bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->id() == ID_FRAMESET);
    bool effectiveScript = !parser->skipMode() && !followingFrameset;
    bool deferredScript = false;

    if ( effectiveScript ) {
        CachedScript* cs = 0;

        // forget what we just got, load from src url instead
        if ( !currentScriptSrc.isEmpty() && javascript &&
             (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) )) {
            cachedScript.enqueue(cs);
        }

        if (cs) {
            pendingQueue.push(src);
            uint scriptCount = cachedScript.count();
            setSrc(TokenizerString());
            scriptCodeSize = scriptCodeResync = 0;
            cs->ref(this);
            if (cachedScript.count() == scriptCount)
                deferredScript = true;
        }
        else if (currentScriptSrc.isEmpty() && view && javascript ) {
            pendingQueue.push(src);
            setSrc(TokenizerString());
            scriptCodeSize = scriptCodeResync = 0;
            scriptExecution( exScript, TQString::null, tagStartLineno /*scriptStartLineno*/ );
        }
        else {
            // script was filtered or disallowed
            effectiveScript = false;
        }
    }

    script = false;
    scriptCodeSize = scriptCodeResync = 0;

    if ( !effectiveScript )
        return;

    if ( !m_executingScript && cachedScript.isEmpty() ) {
        src.append(pendingQueue.pop());
    }
    else if ( cachedScript.isEmpty() ) {
        write( pendingQueue.pop(), false );
    }
    else if ( !deferredScript && pendingQueue.count() > 1) {
        TokenizerString t = pendingQueue.pop();
        pendingQueue.top().prepend( t );
    }
}

void HTMLTokenizer::scriptExecution( const TQString& str, const TQString& scriptURL,
                                     int baseLine)
{
    bool oldscript = script;
    m_executingScript++;
    script = false;
    TQString url;
    if (scriptURL.isNull() && view)
        url = static_cast(view->part()->document().handle())->URL().url();
    else
        url = scriptURL;

    if (view)
        view->part()->executeScript(url,baseLine+1,Node(),str);

    m_executingScript--;
    script = oldscript;
}

void HTMLTokenizer::parseComment(TokenizerString &src)
{
    // SGML strict
    bool strict = parser->doc()->inStrictMode() && parser->doc()->htmlMode() != DocumentImpl::XHtml && !script && !style;
    int delimiterCount = 0;
    bool canClose = false;

    checkScriptBuffer(src.length());
    while ( src.length() ) {
        scriptCode[ scriptCodeSize++ ] = *src;

#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
        tqDebug("comment is now: *%s*", src.toString().left(16).latin1());
#endif

        if (strict)
        {
            if (src->unicode() == '-') {
                delimiterCount++;
                if (delimiterCount == 2) {
                    delimiterCount = 0;
                    canClose = !canClose;
                }
            }
            else
                delimiterCount = 0;
        }

        if ((!strict || canClose) && src->unicode() == '>')
        {
            bool handleBrokenComments = brokenComments && !( script || style );
            bool scriptEnd=false;
            if (!strict)
            {
                if ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' &&
                     scriptCode[scriptCodeSize-2] == '-' )
                    scriptEnd=true;
            }

            if (canClose || handleBrokenComments || scriptEnd ){
                ++src;
                if ( !( title || script || xmp || textarea || style) ) {
#ifdef COMMENTS_IN_DOM
                    checkScriptBuffer();
                    scriptCode[ scriptCodeSize ] = 0;
                    scriptCode[ scriptCodeSize + 1 ] = 0;
                    currToken.tid = ID_COMMENT;
                    processListing(DOMStringIt(scriptCode, scriptCodeSize - 2));
                    processToken();
                    currToken.tid = ID_COMMENT + ID_CLOSE_TAG;
                    processToken();
#endif
                    scriptCodeSize = 0;
                }
                comment = false;
                return; // Finished parsing comment
            }
        }
        ++src;
    }
}

void HTMLTokenizer::parseServer(TokenizerString &src)
{
    checkScriptBuffer(src.length());
    while ( !src.isEmpty() ) {
        scriptCode[ scriptCodeSize++ ] = *src;
        if (src->unicode() == '>' &&
            scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
            ++src;
            server = false;
            scriptCodeSize = 0;
            return; // Finished parsing server include
        }
        ++src;
    }
}
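// (Added note, not part of the original sources: the routine below discards
//  processing instructions without emitting a token.  Because of the
//  IE-compatible relaxation documented inside it, both of these inputs end
//  the PI state:
//
//      <?xml version="1.0" encoding="UTF-8"?>    proper '?>' close
//      <?php echo "x" >                          bare, unquoted '>'
//  )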
void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src)
{
    char oldchar = 0;
    while ( !src.isEmpty() )
    {
        unsigned char chbegin = src->latin1();
        if(chbegin == '\'') {
            tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
        }
        else if(chbegin == '\"') {
            tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
        }
        // Look for '?>'
        // some crappy sites omit the "?" before it, so
        // we look for an unquoted '>' instead. (IE compatible)
        else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) )
        {
            // We got a '?>' sequence
            processingInstruction = false;
            ++src;
            discard=LFDiscard;
            return; // Finished parsing the PI
        }
        ++src;
        oldchar = chbegin;
    }
}

void HTMLTokenizer::parseText(TokenizerString &src)
{
    while ( !src.isEmpty() )
    {
        // do we need to enlarge the buffer?
        checkBuffer();

        // ascii is okay because we only do ascii comparisons
        unsigned char chbegin = src->latin1();

        if (skipLF && ( chbegin != '\n' ))
        {
            skipLF = false;
        }

        if (skipLF)
        {
            skipLF = false;
            ++src;
        }
        else if (( chbegin == '\n' ) || ( chbegin == '\r' ))
        {
            if (chbegin == '\r')
                skipLF = true;

            *dest++ = '\n';
            ++src;
        }
        else {
            *dest++ = *src;
            ++src;
        }
    }
}

void HTMLTokenizer::parseEntity(TokenizerString &src, TQChar *&dest, bool start)
{
    if( start )
    {
        cBufferPos = 0;
        entityLen = 0;
        Entity = SearchEntity;
    }

    while( !src.isEmpty() )
    {
        ushort cc = src->unicode();
        switch(Entity) {
        case NoEntity:
            return;
            break;

        case SearchEntity:
            if(cc == '#') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = NumericSearch;
            }
            else
                Entity = EntityName;
            break;

        case NumericSearch:
            if(cc == 'x' || cc == 'X') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = Hexadecimal;
            }
            else if(cc >= '0' && cc <= '9')
                Entity = Decimal;
            else
                Entity = SearchSemicolon;
            break;

        case Hexadecimal:
        {
            int uc = EntityChar.unicode();
            int ll = kMin(src.length(), 8);
            while(ll--) {
                TQChar csrc(src->lower());
                cc = csrc.cell();

                if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
                    break;
                }
                uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            EntityChar = TQChar(uc);
            Entity = SearchSemicolon;
            break;
        }
        case Decimal:
        {
            int uc = EntityChar.unicode();
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                cc = src->cell();

                if(src->row() || !(cc >= '0' && cc <= '9')) {
                    Entity = SearchSemicolon;
                    break;
                }
                uc = uc * 10 + (cc - '0');
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            EntityChar = TQChar(uc);
            if(cBufferPos == 9)
                Entity = SearchSemicolon;
            break;
        }
        case EntityName:
        {
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                TQChar csrc = *src;
                cc = csrc.cell();

                if(csrc.row() || !((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') ||
                                   (cc >= 'A' && cc <= 'Z'))) {
                    Entity = SearchSemicolon;
                    break;
                }
                cBuffer[cBufferPos++] = cc;
                ++src;

                // be IE compatible and interpret even unterminated entities
                // outside tags. like "foo &nbsp stuff bla".
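                // (Added illustration, not part of the original sources:
                //  in body text "Fish &amp Chips" therefore becomes
                //  "Fish & Chips", whereas inside a tag an unterminated name
                //  such as the "&lang" in href="?a=1&lang=2" is only rewritten
                //  when the entity maps below 256 or a ';' follows -- see the
                //  checks below.)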
                if ( tag == NoTag ) {
                    const entity* e = kde_findEntity(cBuffer, cBufferPos);
                    if ( e && e->code < 256 ) {
                        EntityChar = e->code;
                        entityLen = cBufferPos;
                    }
                }
            }

            if(cBufferPos == 9)
                Entity = SearchSemicolon;
            if(Entity == SearchSemicolon) {
                if(cBufferPos > 1) {
                    const entity *e = kde_findEntity(cBuffer, cBufferPos);

                    // IE only accepts unterminated entities < 256,
                    // Gecko accepts them all, but only outside tags
                    if(e && ( tag == NoTag || e->code < 256 || *src == ';' )) {
                        EntityChar = e->code;
                        entityLen = cBufferPos;
                    }
                }
            }
            break;
        }
        case SearchSemicolon:
#ifdef TOKEN_DEBUG
            kdDebug( 6036 ) << "ENTITY " << EntityChar.unicode() << endl;
#endif
            fixUpChar(EntityChar);

            if (*src == ';')
                ++src;

            if ( !EntityChar.isNull() ) {
                checkBuffer();
                if (entityLen > 0 && entityLen < cBufferPos) {
                    int rem = cBufferPos - entityLen;
                    src.prepend( TokenizerString(TQString::fromAscii(cBuffer+entityLen, rem)) );
                }
                src.push( EntityChar );
            }
            else {
#ifdef TOKEN_DEBUG
                kdDebug( 6036 ) << "unknown entity!" << endl;
#endif
                checkBuffer(11);
                // ignore the sequence, add it to the buffer as plaintext
                *dest++ = '&';
                for(unsigned int i = 0; i < cBufferPos; i++)
                    dest[i] = cBuffer[i];
                dest += cBufferPos;
                if (pre)
                    prePos += cBufferPos+1;
            }

            Entity = NoEntity;
            EntityChar = TQChar::null;
            return;
        };
    }
}

void HTMLTokenizer::parseTag(TokenizerString &src)
{
    assert(!Entity );
    checkScriptBuffer( src.length() );

    while ( !src.isEmpty() )
    {
        checkBuffer();
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
        uint l = 0;
        while(l < src.length() && (src.toString()[l]).latin1() != '>')
            l++;
        tqDebug("src is now: *%s*, tquote: %d", src.toString().left(l).latin1(), tquote);
#endif
        switch(tag) {
        case NoTag:
            return;
        case TagName:
        {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            tqDebug("TagName");
#endif
            if (searchCount > 0)
            {
                if (*src == commentStart[searchCount])
                {
                    searchCount++;
                    if (searchCount == 4)
                    {
#ifdef TOKEN_DEBUG
                        kdDebug( 6036 ) << "Found comment" << endl;
#endif
                        // Found '
                        searchCount = 1; // Look for '
    " << name << " id = " << currToken.tid << endl;
    if (currToken.flat)
        kdDebug( 6036 ) << "Token is FLAT!" << endl;
    if(!text.isNull())
        kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
    unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
    if(l) {
        kdDebug( 6036 ) << "Attributes: " << l << endl;
        for (unsigned long i = 0; i < l; ++i) {
            NodeImpl::Id tid = currToken.attrs->idAt(i);
            DOMString value = currToken.attrs->valueAt(i);
            kdDebug( 6036 ) << " " << tid << " " << parser->doc()->getDocument()->getName(NodeImpl::AttributeId, tid).string() << "=\"" << value.string() << "\"" << endl;
        }
    }
    kdDebug( 6036 ) << endl;
#endif

    // In some cases, parseToken() can cause javascript code to be executed
    // (for example, when setting an attribute that causes an event handler
    // to be created). So we need to protect against re-entrancy into the parser.
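    // (Added illustration, not part of the original sources: handing the
    //  parser a start tag like <body onload="..."> makes it create an event
    //  listener through the JS proxy, which can run script while parseToken()
    //  is still on the stack; the counter below is what scriptHandler() and
    //  isExecutingScript() consult to detect that situation.)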
    m_executingScript++;
    // pass the token over to the parser, the parser DOES NOT delete the token
    parser->parseToken(&currToken);
    m_executingScript--;

    if ( currToken.flat && currToken.tid != ID_TEXT && !parser->noSpaces() )
        discard = NoneDiscard;

    currToken.reset();
    if (jsProxy)
        jsProxy->setEventHandlerLineno(1);
}

HTMLTokenizer::~HTMLTokenizer()
{
    reset();
    delete parser;
}

void HTMLTokenizer::enlargeBuffer(int len)
{
    int newsize = kMax(size*2, size+len);
    int oldoffs = (dest - buffer);

    buffer = TDEHTML_REALLOC_QCHAR_VEC(buffer, newsize);
    dest = buffer + oldoffs;
    size = newsize;
}

void HTMLTokenizer::enlargeScriptBuffer(int len)
{
    int newsize = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);
    scriptCode = TDEHTML_REALLOC_QCHAR_VEC(scriptCode, newsize);
    scriptCodeMaxSize = newsize;
}

void HTMLTokenizer::notifyFinished(CachedObject* /*finishedObj*/)
{
    assert(!cachedScript.isEmpty());
    bool done = false;
    while (!done && cachedScript.head()->isLoaded()) {

        kdDebug( 6036 ) << "Finished loading an external script" << endl;
        CachedScript* cs = cachedScript.dequeue();
        DOMString scriptSource = cs->script();
#ifdef TOKEN_DEBUG
        kdDebug( 6036 ) << "External script is:" << endl << scriptSource.string() << endl;
#endif
        setSrc(TokenizerString());

        // make sure we forget about the script before we execute the new one
        // infinite recursion might happen otherwise
        TQString cachedScriptUrl( cs->url().string() );
        cs->deref(this);
        scriptExecution( scriptSource.string(), cachedScriptUrl );
        done = cachedScript.isEmpty();

        // 'script' is true when we are called synchronously from
        // scriptHandler(). In that case scriptHandler() will take care
        // of 'scriptOutput'.
        if ( !script ) {
            while (pendingQueue.count() > 1) {
                TokenizerString t = pendingQueue.pop();
                pendingQueue.top().prepend( t );
            }
            if (done) {
                write(pendingQueue.pop(), false);
            }
            // we might be deleted at this point, do not
            // access any members.
        }
    }
}

bool HTMLTokenizer::isWaitingForScripts() const
{
    return cachedScript.count();
}

bool HTMLTokenizer::isExecutingScript() const
{
    return (m_executingScript > 0);
}

void HTMLTokenizer::setSrc(const TokenizerString& source)
{
    lineno += src.lineCount();
    src = source;
    src.resetLineCount();
}

void HTMLTokenizer::setOnHold(bool _onHold)
{
    if (onHold == _onHold)
        return;
    onHold = _onHold;
}
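//----------------------------------------------------------------------------
// Rough usage sketch, added for orientation only; it is not part of the
// original file, and in practice the tokenizer is presumably driven by the
// document/part code through the generic Tokenizer interface rather than
// called like this.  Only members defined or referenced above are shown.
//
//     // given an HTMLTokenizer 't' owned by the document (its construction
//     // is not shown in this excerpt):
//     t.begin();                       // reset state, allocate buffers
//     t.write( chunk, false );         // feed a decoded TokenizerString
//     if ( t.isWaitingForScripts() )   // an external script is still loading;
//         ;                            // notifyFinished() resumes tokenizing
//     t.setOnHold( true );             // temporarily pause the tokenizer
//----------------------------------------------------------------------------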