/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set sw=2 ts=2 et tw=78: */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Blake Kaplan
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

/**
 * @file nsHTMLTokenizer.cpp
 * This is an implementation of the nsITokenizer interface.
 * This file contains the implementation of a tokenizer to tokenize an HTML
 * document. It attempts to do so, making tradeoffs between compatibility with
 * older parsers and the SGML specification. Note that most of the real
 * "tokenization" takes place in nsHTMLTokens.cpp.
 */

#include "nsIAtom.h"
#include "nsHTMLTokenizer.h"
#include "nsScanner.h"
#include "nsElementTable.h"
#include "CParserContext.h"
#include "nsReadableUtils.h"
#include "nsUnicharUtils.h"

/************************************************************************
  And now for the main class -- nsHTMLTokenizer...
 ************************************************************************/

/**
 * Satisfy the nsISupports interface.
 */
NS_IMPL_ISUPPORTS1(nsHTMLTokenizer, nsITokenizer)
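// The constructor below folds the parse mode, document type, and parser
// command into mFlags. As an illustrative sketch only (this call site and the
// zero flags value are hypothetical, not taken from this file), a caller
// would create a tokenizer for a quirks-mode document roughly like so:
//
//   nsCOMPtr<nsITokenizer> theTokenizer =
//     new nsHTMLTokenizer(eDTDMode_quirks, eHTML_Quirks, eViewSource, 0);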
/**
 * Default constructor
 *
 * @param aParseMode The current mode the document is in (quirks, etc.)
 * @param aDocType The document type of the current document
 * @param aCommand What we are trying to do (view-source, parse a fragment,
 *                 etc.)
 */
nsHTMLTokenizer::nsHTMLTokenizer(PRInt32 aParseMode,
                                 eParserDocType aDocType,
                                 eParserCommands aCommand,
                                 PRUint16 aFlags)
  : nsITokenizer(), mTokenDeque(0), mFlags(aFlags)
{
  if (aParseMode == eDTDMode_full_standards ||
      aParseMode == eDTDMode_almost_standards) {
    mFlags |= NS_IPARSER_FLAG_STRICT_MODE;
  } else if (aParseMode == eDTDMode_quirks) {
    mFlags |= NS_IPARSER_FLAG_QUIRKS_MODE;
  } else if (aParseMode == eDTDMode_autodetect) {
    mFlags |= NS_IPARSER_FLAG_AUTO_DETECT_MODE;
  } else {
    mFlags |= NS_IPARSER_FLAG_UNKNOWN_MODE;
  }

  if (aDocType == ePlainText) {
    mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
  } else if (aDocType == eXML) {
    mFlags |= NS_IPARSER_FLAG_XML;
  } else if (aDocType == eHTML_Quirks || aDocType == eHTML3_Quirks ||
             aDocType == eHTML_Strict) {
    mFlags |= NS_IPARSER_FLAG_HTML;
  }

  mFlags |= aCommand == eViewSource
            ? NS_IPARSER_FLAG_VIEW_SOURCE
            : NS_IPARSER_FLAG_VIEW_NORMAL;

  NS_ASSERTION(!(mFlags & NS_IPARSER_FLAG_XML) ||
               (mFlags & NS_IPARSER_FLAG_VIEW_SOURCE),
               "Why isn't this XML document going through our XML parser?");

  mTokenAllocator = nsnull;
  mTokenScanPos = 0;
}

/**
 * The destructor ensures that we don't leak any left over tokens.
 */
nsHTMLTokenizer::~nsHTMLTokenizer()
{
  if (mTokenDeque.GetSize()) {
    CTokenDeallocator theDeallocator(mTokenAllocator->GetArenaPool());
    mTokenDeque.ForEach(theDeallocator);
  }
}

/*******************************************************************
  Here begins the real working methods for the tokenizer.
 *******************************************************************/

/**
 * Adds a token onto the end of the deque if aResult is a successful result.
 * Otherwise, this function frees aToken and sets it to nsnull.
 *
 * @param aToken The token that wants to be added.
 * @param aResult The error code that will be used to determine if we actually
 *                want to push this token.
 * @param aDeque The deque we want to push aToken onto.
 * @param aTokenAllocator The allocator we use to free aToken in case aResult
 *                        is not a success code.
 */
/* static */
void
nsHTMLTokenizer::AddToken(CToken*& aToken,
                          nsresult aResult,
                          nsDeque* aDeque,
                          nsTokenAllocator* aTokenAllocator)
{
  if (aToken && aDeque) {
    if (NS_SUCCEEDED(aResult)) {
      aDeque->Push(aToken);
    } else {
      IF_FREE(aToken, aTokenAllocator);
    }
  }
}

/**
 * Retrieve a pointer to the global token recycler...
 *
 * @return Pointer to recycler (or null)
 */
nsTokenAllocator*
nsHTMLTokenizer::GetTokenAllocator()
{
  return mTokenAllocator;
}

/**
 * This method provides access to the topmost token in the tokenDeque.
 * The token is not really removed from the list.
 *
 * @return Pointer to token
 */
CToken*
nsHTMLTokenizer::PeekToken()
{
  return (CToken*)mTokenDeque.PeekFront();
}

/**
 * This method provides access to the topmost token in the tokenDeque.
 * The token is really removed from the list; if the list is empty we return 0.
 *
 * @return Pointer to token or NULL
 */
CToken*
nsHTMLTokenizer::PopToken()
{
  return (CToken*)mTokenDeque.PopFront();
}

/**
 * Pushes a token onto the front of our deque such that the next call to
 * PopToken() or PeekToken() will return that token.
 *
 * @param theToken The next token to be processed
 * @return theToken
 */
CToken*
nsHTMLTokenizer::PushTokenFront(CToken* theToken)
{
  mTokenDeque.PushFront(theToken);
  return theToken;
}

/**
 * Pushes a token onto the deque.
 *
 * @param theToken the new token.
 * @return theToken
 */
CToken*
nsHTMLTokenizer::PushToken(CToken* theToken)
{
  mTokenDeque.Push(theToken);
  return theToken;
}
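// Illustrative sketch only: a hypothetical consumer draining the token queue.
// CanHandle() is not a real function in this codebase; it stands in for
// whatever readiness check the caller performs. A token that cannot be
// processed yet may be pushed back so the next PeekToken()/PopToken() returns
// it again:
//
//   CToken* theToken = aTokenizer->PopToken();
//   if (theToken && !CanHandle(theToken)) {
//     aTokenizer->PushTokenFront(theToken);  // Re-queue it for the next pass.
//   }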
/**
 * Returns the size of the deque.
 *
 * @return The number of remaining tokens.
 */
PRInt32
nsHTMLTokenizer::GetCount()
{
  return mTokenDeque.GetSize();
}

/**
 * Allows access to an arbitrary token in the deque. The accessed token is
 * left in the deque.
 *
 * @param anIndex The index of the target token. Token 0 would be the same as
 *                the result of a call to PeekToken()
 * @return The requested token.
 */
CToken*
nsHTMLTokenizer::GetTokenAt(PRInt32 anIndex)
{
  return (CToken*)mTokenDeque.ObjectAt(anIndex);
}

/**
 * This method is part of the "sandwich" that occurs when we want to tokenize
 * a document. This prepares us to be able to tokenize properly.
 *
 * @param aIsFinalChunk Whether this is the last chunk of data that we will
 *                      get to see.
 * @param aTokenAllocator The token allocator to use for this document.
 * @return Our success in setting up.
 */
nsresult
nsHTMLTokenizer::WillTokenize(PRBool aIsFinalChunk,
                              nsTokenAllocator* aTokenAllocator)
{
  mTokenAllocator = aTokenAllocator;
  mIsFinalChunk = aIsFinalChunk;

  // Cause ScanDocStructure to search from here for new tokens...
  mTokenScanPos = mTokenDeque.GetSize();
  return NS_OK;
}

/**
 * Pushes all of the tokens in aDeque onto the front of our deque so they
 * get processed before any other tokens.
 *
 * @param aDeque The deque with the tokens in it.
 */
void
nsHTMLTokenizer::PrependTokens(nsDeque& aDeque)
{
  PRInt32 aCount = aDeque.GetSize();

  for (PRInt32 anIndex = 0; anIndex < aCount; ++anIndex) {
    CToken* theToken = (CToken*)aDeque.Pop();
    PushTokenFront(theToken);
  }
}

/**
 * Copies the state flags from aTokenizer into this tokenizer. This is used
 * to pass information around between the main tokenizer and tokenizers
 * created for document.write() calls.
 *
 * @param aTokenizer The tokenizer with more information in it.
 * @return NS_OK
 */
nsresult
nsHTMLTokenizer::CopyState(nsITokenizer* aTokenizer)
{
  if (aTokenizer) {
    mFlags = ((nsHTMLTokenizer*)aTokenizer)->mFlags;
  }

  return NS_OK;
}

/**
 * This is a utility method for ScanDocStructure, which finds a given
 * tag in the stack. The return value is meant to be used with
 * nsDeque::ObjectAt() on aTagStack.
 *
 * @param aTag -- the ID of the tag we're seeking
 * @param aTagStack -- the stack to be searched
 * @return index position of tag in stack if found, otherwise kNotFound
 */
static PRInt32
FindLastIndexOfTag(eHTMLTags aTag, nsDeque &aTagStack)
{
  PRInt32 theCount = aTagStack.GetSize();

  while (0 < theCount) {
    CHTMLToken* theToken = (CHTMLToken*)aTagStack.ObjectAt(--theCount);
    if (theToken) {
      eHTMLTags theTag = (eHTMLTags)theToken->GetTypeID();
      if (theTag == aTag) {
        return theCount;
      }
    }
  }

  return kNotFound;
}

/**
 * This method scans the sequence of tokens to determine whether or not the
 * tag structure of the document is well formed. In well formed cases, we can
 * skip doing residual style handling and allow inlines to contain block-level
 * elements.
 *
 * @param aFinalChunk Is unused.
 * @return Success (currently, this function cannot fail).
 */
nsresult
nsHTMLTokenizer::ScanDocStructure(PRBool aFinalChunk)
{
  nsresult result = NS_OK;
  if (!mTokenDeque.GetSize()) {
    return result;
  }

  CHTMLToken* theToken = (CHTMLToken*)mTokenDeque.ObjectAt(mTokenScanPos);

  // Start by finding the first start tag that hasn't been reviewed.
  while (mTokenScanPos > 0) {
    if (theToken) {
      eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
      if (theType == eToken_start &&
          theToken->GetContainerInfo() == eFormUnknown) {
        break;
      }
    }
    theToken = (CHTMLToken*)mTokenDeque.ObjectAt(--mTokenScanPos);
  }

  // Now that we know where to start, let's walk through the
  // tokens to see which are well-formed. Stop when you run out
  // of fresh tokens.
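  // Illustrative sketch of what this walk computes (the markup below is an
  // example chosen for this comment, not taken from any test): for a balanced
  // inline sequence such as
  //
  //   <b><i></i></b>
  //
  // each start token is roughly marked eWellFormed when its matching end tag
  // shows up at the top of the stack. For a mis-nested sequence such as
  //
  //   <b><i></b>
  //
  // the </b> does not match the <i> on top of the stack, so both the <b> and
  // the <i> start tokens are marked eMalformed, which forces residual style
  // handling for them later on.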
  nsDeque theStack(0);
  nsDeque tempStack(0);
  PRInt32 theStackDepth = 0;
  // Don't bother if we get ridiculously deep.
  static const PRInt32 theMaxStackDepth = 200;

  while (theToken && theStackDepth < theMaxStackDepth) {
    eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
    eHTMLTags       theTag  = (eHTMLTags)theToken->GetTypeID();

    if (nsHTMLElement::IsContainer(theTag)) { // Bug 54117
      PRBool theTagIsBlock  = gHTMLElements[theTag].IsMemberOf(kBlockEntity);
      PRBool theTagIsInline = theTagIsBlock
                              ? PR_FALSE
                              : gHTMLElements[theTag].IsMemberOf(kInlineEntity);

      if (theTagIsBlock || theTagIsInline || eHTMLTag_table == theTag) {
        switch (theType) {
          case eToken_start:
            {
              if (gHTMLElements[theTag].ShouldVerifyHierarchy()) {
                PRInt32 earlyPos = FindLastIndexOfTag(theTag, theStack);
                if (earlyPos != kNotFound) {
                  // Uh-oh, we've found a tag that is not allowed to nest at
                  // all. Mark the previous one and all of its children as
                  // malformed to increase our chances of doing RS handling
                  // on all of them. We want to do this for cases such as:
                  // <a><div><a></a></div></a>.
                  // Note that we have to iterate through all of the children
                  // of the original malformed tag to protect against:
                  // <a><div><font><a></a></font></div></a>, so that the
                  // <font> is allowed to contain the <a>.
                  // XXX What about <a><b><a></a></b></a>, where the second
                  // <a> closes the <b>?
                  nsDequeIterator it(theStack, earlyPos), end(theStack.End());
                  while (it < end) {
                    CHTMLToken *theMalformedToken =
                      static_cast<CHTMLToken*>(it++);
                    theMalformedToken->SetContainerInfo(eMalformed);
                  }
                }
              }

              theStack.Push(theToken);
              ++theStackDepth;
            }
            break;

          case eToken_end:
            {
              CHTMLToken *theLastToken =
                static_cast<CHTMLToken*>(theStack.Peek());
              if (theLastToken) {
                if (theTag == theLastToken->GetTypeID()) {
                  theStack.Pop(); // Yank it for real
                  theStackDepth--;
                  theLastToken->SetContainerInfo(eWellFormed);
                } else {
                  // This token wasn't what we expected it to be! We need to
                  // go searching for its real start tag on our stack. Each
                  // tag in between the end tag and start tag must be
                  // malformed.
                  if (FindLastIndexOfTag(theTag, theStack) != kNotFound) {
                    // Find theTarget in the stack, marking each (malformed!)
                    // tag in our way.
                    theStack.Pop(); // Pop off theLastToken for real.
                    do {
                      theLastToken->SetContainerInfo(eMalformed);
                      tempStack.Push(theLastToken);
                      theLastToken =
                        static_cast<CHTMLToken*>(theStack.Pop());
                    } while (theLastToken &&
                             theTag != theLastToken->GetTypeID());
                    // XXX The above test can confuse two different
                    // user-defined tags.

                    NS_ASSERTION(theLastToken,
                                 "FindLastIndexOfTag lied to us!"
                                 " We couldn't find theTag on theStack");
                    theLastToken->SetContainerInfo(eMalformed);

                    // Great, now push all of the other tokens back onto the
                    // stack to preserve the general structure of the
                    // document. Note that we don't push the target token
                    // back onto the stack (since it was just closed).
                    while (tempStack.GetSize() != 0) {
                      theStack.Push(tempStack.Pop());
                    }
                  }
                }
              }
            }
            break;

          default:
            break;
        }
      }
    }

    theToken = (CHTMLToken*)mTokenDeque.ObjectAt(++mTokenScanPos);
  }

  return result;
}

/**
 * This method is called after we're done tokenizing a chunk of data.
 *
 * @param aFinalChunk Tells us if this was the last chunk of data.
 * @return Error result.
 */
nsresult
nsHTMLTokenizer::DidTokenize(PRBool aFinalChunk)
{
  return ScanDocStructure(aFinalChunk);
}

/**
 * This method is repeatedly called by the tokenizer.
 * Each time, we determine the kind of token we're about to
 * read, and then we call the appropriate method to handle
 * that token type.
 *
 * @param aScanner The source of our input.
 * @param aFlushTokens An OUT parameter to tell the caller whether it should
 *                     process our queued tokens up to now (e.g., when we
 *                     reach a