Click here to Skip to main content
15,884,099 members
Articles / Desktop Programming / MFC

The Ultimate Toolbox - Updates and User Contributions

Rate me:
Please Sign up or sign in to vote.
4.79/5 (26 votes)
12 Feb 2013CPOL8 min read 254.7K   23.6K   170  
Updates and User Contributions for the Ultimate Toolbox Libraries
// HTMLParser.cpp: implementation of the COXHTMLParser class.
//
//////////////////////////////////////////////////////////////////////
// Version: 9.3


#include "stdafx.h"
#include "OXHTMLParser.h"

#include "UTBStrOp.h"

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

// Limitations:
// - <a href=http://domain/directory/script.cgi?ord="string"> tags not parsed correctly
// - only VERY limited validation is done. For instance, there is no check on tag 
//   ordering or nesting (eg <head> may appear after <body>)
// - No real validation is performed. For instance, empty <p></p> tags are not removed.
// 
// The full solution is to make COXParser fully XML 1.0 compliant in regards to Document
// Type Definitions, then simply provide a HTML DTD for the given version of HTML to
// be parsed. One day...

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// Tags (up to HTML 4.0)
//
// Static table describing every HTML tag the parser knows about. The
// constructor loads each entry into m_HTMLTagTable (keyed by tag name), and
// ParseText() walks the table in order when it wraps text in text-modifier
// elements, so the order of the entries is significant. The table is
// terminated by an entry whose tag name is NULL.
//
// Tag Flags:
//
//   TAG_BLOCK   - tags that delimit blocks that signify the end of a paragraph 
//   TAG_OPTEND  - tags that have an optional closing tag eg <P>
//   TAG_EMPTY   - tags that do not have a closing tag at all (eg <HR>)
//   TAG_SECTION - tags for special sections (eg <BODY>)
//   TAG_CANCENTER - can be wrapped by a "center" tag
//   TAG_HEAD    - set on the tags grouped under HEAD below (TITLE, BASE, LINK,
//                 META, SCRIPT, STYLE); tested by IsHeadTag()
// 
// Text flags - for tags that modify text formatting (eg <B>). These flags
//              are not block elements, but merely font attribute "toggles"
//
// NOTE(review): DEL carries TEXT_DEFN while DFN carries no text flag - the two
// entries look swapped; confirm before changing.
TagDescriptor COXHTMLParser::m_HTMLTags[] = 
{
  //   Tag name            Tag Flags                           Tag Text Flag

     // Document root
     { TEXT("HTML"),       TAG_BLOCK|TAG_OPTEND|TAG_SECTION,   0,              },

     // Header section and the tags flagged as belonging to it
     { TEXT("HEAD"),       TAG_BLOCK|TAG_OPTEND|TAG_SECTION,   0,              },

     { TEXT("TITLE"),      TAG_HEAD|TAG_BLOCK,                 0,              },
     { TEXT("BASE"),       TAG_HEAD|TAG_BLOCK|TAG_EMPTY,       0,              },
     { TEXT("LINK"),       TAG_HEAD|TAG_EMPTY,                 0,              },
     { TEXT("META"),       TAG_HEAD|TAG_BLOCK|TAG_EMPTY,       0,              },
     { TEXT("SCRIPT"),     TAG_HEAD|TAG_BLOCK,                 0,              },
     { TEXT("STYLE"),      TAG_HEAD,                           0,              },

     // Body-level sections
     { TEXT("BODY"),       TAG_BLOCK|TAG_OPTEND|TAG_SECTION,   0,              },
     { TEXT("FRAMESET"),   TAG_BLOCK,                          0,              },

     // Everything else, in alphabetical order
     { TEXT("A"),          0,                                  0               },
     { TEXT("ABBR"),       0,                                  0,              },
     { TEXT("ACRONYM"),    0,                                  0,              },
     { TEXT("ADDRESS"),    TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("APPLET"),     TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("AREA"),       TAG_EMPTY,                          0,              },  
     { TEXT("B"),          0,                                  TEXT_BOLD,      },
     { TEXT("BASEFONT"),   TAG_BLOCK|TAG_EMPTY,                0,              },
     { TEXT("BDO"),        0,                                  0,              },
     { TEXT("BIG"),        0,                                  TEXT_BIG,       },
     { TEXT("BLOCKQUOTE"), TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("BR"),         TAG_EMPTY,                          0,              },
     { TEXT("BUTTON"),     TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("CAPTION"),    TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("CENTER"),     TAG_BLOCK,                          TEXT_CENTER,    },
     { TEXT("CITE"),       0,                                  TEXT_CITE,      },
     { TEXT("CODE"),       0,                                  TEXT_CODE,      },
     { TEXT("COL"),        TAG_BLOCK|TAG_EMPTY,                0,              },
     { TEXT("COLGROUP"),   TAG_BLOCK|TAG_OPTEND,               0,              },
     { TEXT("DD"),         TAG_BLOCK|TAG_OPTEND,               0,              },
     { TEXT("DEL"),        0,                                  TEXT_DEFN,      },
     { TEXT("DFN"),        0,                                  0,              },
     { TEXT("DIR"),        TAG_BLOCK,                          0,              },
     { TEXT("DIV"),        TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("DL"),         TAG_BLOCK,                          0,              },
     { TEXT("DT"),         TAG_BLOCK|TAG_OPTEND,               0,              },
     { TEXT("EM"),         0,                                  TEXT_EMPHASIS,  },
     { TEXT("FIELDSET"),   TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("FONT"),       0,                                  TEXT_FONT,      },
     { TEXT("FORM"),       TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("FRAME"),      TAG_BLOCK|TAG_EMPTY|TAG_CANCENTER,  0,              },
     { TEXT("H1"),         TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("H2"),         TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("H3"),         TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("H4"),         TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("H5"),         TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("H6"),         TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("HR"),         TAG_BLOCK|TAG_EMPTY,                0,              },
     { TEXT("I"),          0,                                  TEXT_ITALIC,    },
     { TEXT("IFRAME"),     TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("IMG"),        TAG_EMPTY|TAG_CANCENTER,            0,              },
     { TEXT("INPUT"),      TAG_BLOCK|TAG_EMPTY|TAG_CANCENTER,  0,              },
     { TEXT("INS"),        0,                                  0,              },
     { TEXT("ISINDEX"),    TAG_EMPTY,                          0,              },
     { TEXT("KBD"),        0,                                  TEXT_KEYBOARD,  },
     { TEXT("LABEL"),      TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("LEGEND"),     TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("LI"),         TAG_BLOCK|TAG_OPTEND,               0,              },
     { TEXT("MAP"),        TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("MENU"),       TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("NOFRAMES"),   TAG_BLOCK,                          0,              },
     { TEXT("NOSCRIPT"),   TAG_BLOCK,                          0,              },
     { TEXT("OBJECT"),     TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("OL"),         TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("OPTGROUP"),   0,                                  0,              },
     { TEXT("OPTION"),     TAG_OPTEND,                         0,              },
     { TEXT("P"),          TAG_BLOCK|TAG_OPTEND|TAG_CANCENTER, 0,              },
     { TEXT("PARAM"),      TAG_EMPTY,                          0,              },
     { TEXT("PRE"),        0,                                  TEXT_FORMATTED, },
     { TEXT("Q"),          TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("S"),          0,                                  TEXT_STRIKE,    },
     { TEXT("SAMP"),       0,                                  TEXT_SAMPLE,    },
     { TEXT("SELECT"),     0,                                  0,              },
     { TEXT("SMALL"),      0,                                  TEXT_SMALL,     },
     { TEXT("SPAN"),       0,                                  0,              },
     { TEXT("STRIKE"),     0,                                  TEXT_STRIKE,    },
     { TEXT("STRONG"),     0,                                  TEXT_STRONG,    },
     { TEXT("SUB"),        0,                                  TEXT_SUB,       },
     { TEXT("SUP"),        0,                                  TEXT_SUP,       },
     { TEXT("TABLE"),      TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("TBODY"),      TAG_BLOCK|TAG_OPTEND,               0,              },
     { TEXT("TD"),         TAG_BLOCK|TAG_OPTEND,               0,              },
     { TEXT("TEXTAREA"),   0,                                  0,              },
     { TEXT("TFOOT"),      TAG_OPTEND,                         0,              },
     { TEXT("TH"),         TAG_OPTEND,                         0,              },
     { TEXT("THEAD"),      TAG_OPTEND,                         0,              },
     { TEXT("TR"),         TAG_BLOCK|TAG_OPTEND,               0,              },
     { TEXT("TT"),         0,                                  TEXT_TELETYPE,  },
     { TEXT("U"),          0,                                  TEXT_UNDERLINE, },
     { TEXT("UL"),         TAG_BLOCK|TAG_CANCENTER,            0,              },
     { TEXT("VAR"),        0,                                  TEXT_VAR,       },

     // End-of-table marker: loops over this table stop at the NULL tag name
     { NULL,               0,                                  0,              },
};

// Predefined HTML character entities, loaded into the entity table by the
// constructor. The list is the ISO 8859-1 (Latin-1) entity set, code points
// 160-255 in order, taken from the "Tidy" program from the W3C site. Some of
// the following entities are not recognised by the major browsers, so the
// esoteric ones are left commented out for now.
//
// The replacement characters are written as \xNN escapes rather than as raw
// high-bit characters so that the table does not depend on the encoding of
// this source file. The table is terminated by an entry with a NULL name.
ParserEntity COXHTMLParser::m_HTMLEntity[] =
{
    { TEXT("nbsp"),   TEXT("\xA0"), },
    //{ TEXT("iexcl"),  TEXT("\xA1"), },
    { TEXT("cent"),   TEXT("\xA2"), },
    { TEXT("pound"),  TEXT("\xA3"), },
    //{ TEXT("curren"), TEXT("\xA4"), },
    //{ TEXT("yen"),    TEXT("\xA5"), },
    //{ TEXT("brvbar"), TEXT("\xA6"), },
    //{ TEXT("sect"),   TEXT("\xA7"), },
    //{ TEXT("uml"),    TEXT("\xA8"), },
    { TEXT("copy"),   TEXT("\xA9"), },
    //{ TEXT("ordf"),   TEXT("\xAA"), },
    //{ TEXT("laquo"),  TEXT("\xAB"), },
    //{ TEXT("not"),    TEXT("\xAC"), },
    //{ TEXT("shy"),    TEXT("\xAD"), },
    { TEXT("reg"),    TEXT("\xAE"), },
    //{ TEXT("macr"),   TEXT("\xAF"), },
    //{ TEXT("deg"),    TEXT("\xB0"), },
    //{ TEXT("plusmn"), TEXT("\xB1"), },
    //{ TEXT("sup2"),   TEXT("\xB2"), },
    //{ TEXT("sup3"),   TEXT("\xB3"), },
    //{ TEXT("acute"),  TEXT("\xB4"), },
    //{ TEXT("micro"),  TEXT("\xB5"), },
    //{ TEXT("para"),   TEXT("\xB6"), },
    //{ TEXT("middot"), TEXT("\xB7"), },
    //{ TEXT("cedil"),  TEXT("\xB8"), },
    //{ TEXT("sup1"),   TEXT("\xB9"), },
    //{ TEXT("ordm"),   TEXT("\xBA"), },
    //{ TEXT("raquo"),  TEXT("\xBB"), },
    //{ TEXT("frac14"), TEXT("\xBC"), },
    //{ TEXT("frac12"), TEXT("\xBD"), },
    //{ TEXT("frac34"), TEXT("\xBE"), },
    //{ TEXT("iquest"), TEXT("\xBF"), },
    //{ TEXT("Agrave"), TEXT("\xC0"), },
    //{ TEXT("Aacute"), TEXT("\xC1"), },
    //{ TEXT("Acirc"),  TEXT("\xC2"), },
    //{ TEXT("Atilde"), TEXT("\xC3"), },
    //{ TEXT("Auml"),   TEXT("\xC4"), },
    //{ TEXT("Aring"),  TEXT("\xC5"), },
    //{ TEXT("AElig"),  TEXT("\xC6"), },
    //{ TEXT("Ccedil"), TEXT("\xC7"), },
    //{ TEXT("Egrave"), TEXT("\xC8"), },
    //{ TEXT("Eacute"), TEXT("\xC9"), },
    //{ TEXT("Ecirc"),  TEXT("\xCA"), },
    //{ TEXT("Euml"),   TEXT("\xCB"), },
    //{ TEXT("Igrave"), TEXT("\xCC"), },
    //{ TEXT("Iacute"), TEXT("\xCD"), },
    //{ TEXT("Icirc"),  TEXT("\xCE"), },
    //{ TEXT("Iuml"),   TEXT("\xCF"), },
    //{ TEXT("ETH"),    TEXT("\xD0"), },
    //{ TEXT("Ntilde"), TEXT("\xD1"), },
    //{ TEXT("Ograve"), TEXT("\xD2"), },
    //{ TEXT("Oacute"), TEXT("\xD3"), },
    //{ TEXT("Ocirc"),  TEXT("\xD4"), },
    //{ TEXT("Otilde"), TEXT("\xD5"), },
    //{ TEXT("Ouml"),   TEXT("\xD6"), },
    //{ TEXT("times"),  TEXT("\xD7"), },
    //{ TEXT("Oslash"), TEXT("\xD8"), },
    //{ TEXT("Ugrave"), TEXT("\xD9"), },
    //{ TEXT("Uacute"), TEXT("\xDA"), },
    //{ TEXT("Ucirc"),  TEXT("\xDB"), },
    //{ TEXT("Uuml"),   TEXT("\xDC"), },
    //{ TEXT("Yacute"), TEXT("\xDD"), },
    //{ TEXT("THORN"),  TEXT("\xDE"), },
    //{ TEXT("szlig"),  TEXT("\xDF"), },
    //{ TEXT("agrave"), TEXT("\xE0"), },
    //{ TEXT("aacute"), TEXT("\xE1"), },
    //{ TEXT("acirc"),  TEXT("\xE2"), },
    //{ TEXT("atilde"), TEXT("\xE3"), },
    //{ TEXT("auml"),   TEXT("\xE4"), },
    //{ TEXT("aring"),  TEXT("\xE5"), },
    //{ TEXT("aelig"),  TEXT("\xE6"), },
    //{ TEXT("ccedil"), TEXT("\xE7"), },
    //{ TEXT("egrave"), TEXT("\xE8"), },
    //{ TEXT("eacute"), TEXT("\xE9"), },
    //{ TEXT("ecirc"),  TEXT("\xEA"), },
    //{ TEXT("euml"),   TEXT("\xEB"), },
    //{ TEXT("igrave"), TEXT("\xEC"), },
    //{ TEXT("iacute"), TEXT("\xED"), },
    //{ TEXT("icirc"),  TEXT("\xEE"), },
    //{ TEXT("iuml"),   TEXT("\xEF"), },
    //{ TEXT("eth"),    TEXT("\xF0"), },
    //{ TEXT("ntilde"), TEXT("\xF1"), },
    //{ TEXT("ograve"), TEXT("\xF2"), },
    //{ TEXT("oacute"), TEXT("\xF3"), },
    //{ TEXT("ocirc"),  TEXT("\xF4"), },
    //{ TEXT("otilde"), TEXT("\xF5"), },
    //{ TEXT("ouml"),   TEXT("\xF6"), },
    //{ TEXT("divide"), TEXT("\xF7"), },
    //{ TEXT("oslash"), TEXT("\xF8"), },
    //{ TEXT("ugrave"), TEXT("\xF9"), },
    //{ TEXT("uacute"), TEXT("\xFA"), },
    //{ TEXT("ucirc"),  TEXT("\xFB"), },
    //{ TEXT("uuml"),   TEXT("\xFC"), },
    //{ TEXT("yacute"), TEXT("\xFD"), },
    //{ TEXT("thorn"),  TEXT("\xFE"), },
    //{ TEXT("yuml"),   TEXT("\xFF"), },

    // End-of-table marker: the constructor stops at the NULL name
    { NULL, NULL },
};

// Set up an HTML parser: empty style stacks, case-insensitive matching, and
// the tag and entity tables preloaded from the static descriptor arrays.
COXHTMLParser::COXHTMLParser()
{
    // Both stacks start out empty
    m_FontStack.clear();
    m_TextStyleStack.clear();

    m_bErrorOnMissingTag = FALSE;

    // HTML tag names are matched without regard to case
    SetCaseSensitive(FALSE);
    m_HTMLTagTable.SetCaseSensitive(FALSE);

    // Register every known tag; the table value is the descriptor's address
    // (both arrays end with an entry whose name is NULL)
    int nTag = 0;
    while (m_HTMLTags[nTag].szTag)
    {
        m_HTMLTagTable.Add(m_HTMLTags[nTag].szTag, (DWORD)(INT_PTR) &(m_HTMLTags[nTag]));
        nTag++;
    }

    // Register the predefined HTML character entities
    int nEntity = 0;
    while (m_HTMLEntity[nEntity].szName)
    {
        m_EntityTable.Add(m_HTMLEntity[nEntity].szName, (DWORD)(INT_PTR) m_HTMLEntity[nEntity].szLiteral);
        nEntity++;
    }
}

// Release the font elements still held on the font stack.
// The stack stores raw pointers to elements allocated by DuplicateFontElement();
// in the code visible here they are otherwise freed only by Clear() and by a
// matching </font> in IgnoreEndTag(), so a parser destroyed after unbalanced
// <font> tags would leak them. If Clear() has already run the stack is empty
// and this loop does nothing.
COXHTMLParser::~COXHTMLParser()
{
    for (UINT i = 0; i < m_FontStack.size(); i++)
    {
        COXParserElement* pElm = (COXParserElement*) m_FontStack[i];
        delete pElm;
    }
    m_FontStack.clear();
}

void COXHTMLParser::Clear()
{
    COXParser::Clear();

    // Clear out the font stack
    for (UINT i = 0; i < m_FontStack.size(); i++)
    {
        COXParserElement* pElm = (COXParserElement*) m_FontStack[i];
        delete pElm;
    }
    m_FontStack.clear();
    m_TextStyleStack.clear();
}

// Prepare for a parse run. Returns the result of the base class Initialize().
BOOL COXHTMLParser::Initialize()
{
    const BOOL bBaseResult = COXParser::Initialize();

    // Start with one text style on the style stack
    PushTextStyle();

    // HTML browsers don't seem to support &apos;, so drop that entity
    // from the entity table.
    m_EntityTable.Remove(TEXT("apos"));

    return bBaseResult;
}

// Nothing HTML-specific to tidy up: simply forwards to the base class and
// returns its result.
BOOL COXHTMLParser::Cleanup()
{
    return COXParser::Cleanup();
}

// Build a parentless copy of a "font" element, duplicating the name and value
// of each of its attributes. Returns NULL if pElement is NULL or is not a
// "font" element. The caller owns the returned element.
COXParserElement* COXHTMLParser::DuplicateFontElement(COXParserElement* pElement)
{
    if (pElement == NULL)
        return NULL;
    if (!pElement->IsName(TEXT("font")))
        return NULL;

    COXParserElement* pCopy = new COXParserElement(NULL, pElement->GetName());

    int nAttr = 0;
    while (nAttr < pElement->NumAttributes())
    {
        COXAttribute* pSource = pElement->Attribute(nAttr);
        nAttr++;

        // Skip empty attribute slots
        if (pSource == NULL)
            continue;

        COXAttribute* pClone = new COXAttribute;
        if (pClone == NULL)
        {
            // Could not allocate: discard the partial copy
            delete pCopy;
            return NULL;
        }

        pClone->SetName(pSource->GetName());
        pClone->SetValue(pSource->GetStringValue());

        pCopy->AddAttribute(pClone);
    }

    return pCopy;
}

// Merge every font element on the font stack into one new "font" element.
// The stack is walked from the oldest entry to the newest, so when the same
// attribute appears more than once the most recently pushed value wins.
// Returns NULL when the font stack is empty. The caller owns the result.
COXParserElement* COXHTMLParser::ConstructFontElement()
{
    // Nothing to combine when no font tags are active
    if (!m_FontStack.size())
        return NULL;

    COXParserElement* pMerged = new COXParserElement(NULL, TEXT("font"));
    if (!pMerged)
        return pMerged;

    for (UINT nFont = 0; nFont < m_FontStack.size(); nFont++)
    {
        COXParserElement* pStacked = (COXParserElement*) m_FontStack[nFont];
        if (!pStacked)
            continue;

        // Fold this font's attributes into the merged element
        for (int nAttr = 0; nAttr < pStacked->NumAttributes(); nAttr++)
        {
            COXAttribute* pSource = pStacked->Attribute(nAttr);
            if (!pSource)
                continue;

            COXAttribute* pTarget = pMerged->FindAttribute(pSource->GetName());
            if (!pTarget)
            {
                // First time this attribute is seen: add a fresh copy
                pTarget = new COXAttribute;
                pTarget->SetName(pSource->GetName());
                pTarget->SetValue(pSource->GetStringValue());
                pMerged->AddAttribute(pTarget);
            }
            else
            {
                // Seen before: the later font tag overrides the value
                pTarget->SetValue(pSource->GetStringValue());
            }
        }
    }

    return pMerged;
}

// Parse a run of text and wrap it in elements that reflect the text style
// currently in force.
//
// The text returned by the base parser is stripped of whitespace (unless a
// formatted/PRE style is active) and given a trailing space. It is then nested
// inside one element per active text modifier (B, I, ...), inside a combined
// "font" element if any font tags are active, and finally - when pParent has
// no parent of its own - inside a "p" element, whose content is parsed via
// ParseElement().
//
// Returns the outermost object of that chain, or NULL if the base parser
// produced nothing or parsing the "p" wrapper failed.
COXParserObject* COXHTMLParser::ParseText(COXParserElement* pParent)
{
    COXParserObject* pObject = COXParser::ParseText(pParent);
    if (!pObject)
        return NULL;
    
    // Stop text modifiers working inside tables 
    //if (pParent->IsName(TEXT("TD")))
    //    return pObject;

    // Set this text element within nested text modifier elements, depending
    // on the current text modifiers that have been turned on.

    UINT nTextStyle = GetTextStyle(); 

    COXQuickString str = pObject->GetText();

    // Strip out whitespace (unless formatted)
    if (!(nTextStyle & TEXT_FORMATTED) && !(nTextStyle & TEXT_PRE))
        str.Strip();

    // Add a space to the end of the text, unless it already ends in a
    // non-breaking space. The character is spelled as an escape so that it is
    // character 160 (NOT character 32) whatever the source file encoding.
    const TCHAR chNBSP = TEXT('\xA0');
    if (str.GetLength() && str[str.GetLength()-1] != chNBSP)
        str.Append(TEXT(' '));

    pObject->SetText(str);
    
    // Check current text mode
    for (int i = 0; m_HTMLTags[i].szTag; i++)
    {
        // Don't add center tags here - they will be added as a wrap around
        // the element holding this text
        if (!m_HTMLTags[i].dwTextFlag || (m_HTMLTags[i].dwTextFlag & TEXT_CENTER))
            continue;

        // If this text modifier is on, then create a new element with its name,
        // and add it to the beginning of the chain.
        if (nTextStyle & m_HTMLTags[i].dwTextFlag)
        {
            COXParserElement* pElm = new COXParserElement(NULL, m_HTMLTags[i].szTag);
            if (!pElm)
                break;
            
            pElm->AddObject(pObject);   // Add new element to beginning of chain
            pObject = pElm;             // Move to beginning of chain
        }
    }
    
    // Wrap the chain in the combined font element, if any fonts are active
    COXParserElement* pNewFontElement = ConstructFontElement();
    if (pNewFontElement)
    {
        pNewFontElement->AddObject(pObject);
        pObject = pNewFontElement;
    }

    // Add a <p> wrapper if we have found standalone text
    if (pParent->GetParent() == NULL)
    {
        COXParserElement* pElm = new COXParserElement(pParent, TEXT("p"));
        if (pElm)
        {
            // Only touch the new element once we know it exists
            pElm->SetFlags(GetTextStyle());

            pElm->AddObject(pObject);
            pObject = pElm;             // Move to beginning of chain
            if (!ParseElement(pElm, 1))
            {
                delete pObject;
                pObject = NULL;
            }
        }
    }
    
    return pObject;
}

// Read an unquoted attribute value into str.
//
// Characters are consumed until whitespace, the end delimiter, or a tag-end
// character that is immediately followed by the end delimiter. The
// terminating character is pushed back so the caller sees it next.
//
// Returns TRUE on success. If the buffer ends first, an ERROR_END_OF_BUFFER
// error is reported, str is emptied and FALSE is returned.
BOOL COXHTMLParser::GetValueString(COXQuickString& str)
{
    str.Empty();
    str.SetLength(100);
    
    TCHAR ch = GetNextChar();
    while (ch && ch != m_chEndDelim && !_istspace(ch))
    {
        if (ch == m_chTagEnd)
        {
            // Peek at the following character: a tag-end character directly
            // before the end delimiter terminates the value
            TCHAR chNext = GetNextChar();
            UngetChar();
            if (chNext == m_chEndDelim)
                break;
        }
        str.Append(ch);
        ch = GetNextChar();
    }

    if (ch == m_chNULL)
    {
        ReportError(ERROR_END_OF_BUFFER,
                    TEXT("Unexpected end of buffer while parsing attribute value."));
        str.Empty();
        return FALSE;
    }

    // Leave the terminating character for the caller
    UngetChar();
    return TRUE;
}

// Attach pObject to pElement. An element object that was parsed while
// centering was in force, and whose tag may be centered, is first wrapped in
// a new "center" element. A NULL pObject is ignored.
void COXHTMLParser::AddObjectToElement(COXParserElement* pElement, COXParserObject* pObject)
{
    if (pObject == NULL)
        return;

    const BOOL bIsElement = (pObject->GetType() == COXParserObject::ELEMENT);
    if (bIsElement && (pObject->GetFlags() & TEXT_CENTER) && CanCenter(pObject->GetText()))
    {
        COXParserElement* pCenter = new COXParserElement(NULL, TEXT("center"));
        if (pCenter != NULL)
        {
            // The wrapper takes the object's place in what gets added
            pCenter->AddObject(pObject);
            pObject = pCenter;
        }
    }

    pElement->AddObject(pObject);
}

// Parse the attribute list of pElement, adding one COXAttribute per attribute.
//
// Differs from the base name/value parser in that all values are string
// values, value=<string value> (ie no quotes) is OK, and empty values are
// allowed, eg <tr nowrap>.
//
// Returns TRUE when the list ends normally (a non-string token is found where
// the next attribute name would be), FALSE on any error. On every failure
// path the attribute under construction is deleted, so nothing is leaked.
BOOL COXHTMLParser::ParseAttributes(COXParserElement* pElement)
{
    BOOL bResult = TRUE;
    while (bResult)
    {
        bResult = GetToken(m_Token);
        if (!bResult)
        {
            ReportError(ERROR_END_OF_BUFFER,
                        TEXT("Unexpected end of buffer while parsing attributes (Element %s)"), 
                        pElement->GetName());
            break;
        }

        // end of attribute list?
        if (m_Token.GetType() != COXToken::STRING)
            break;

        // Create a new name/value
        COXAttribute* pAttribute = new COXAttribute;
        if (!pAttribute)
        {
            ReportError(ERROR_OUT_OF_MEMORY, 
                        TEXT("Unable to create new attribute (Element %s)"), 
                        pElement->GetName());
            bResult = FALSE;
            break;
        }

        // Get the name of the name/value pair
        COXQuickString str;
        if (!GetNameToken(str))
        {
            delete pAttribute;
            bResult = FALSE;
            break;
        }
        pAttribute->SetName(str);

        // Remember where we are so the next token can be "un-read" if it
        // turns out not to be an "=" sign
        SAVEPOS pos;
        SaveBufferPos(pos);

        // May have "=" sign next
        if (!GetToken(m_Token))
        {
            ReportError(ERROR_BAD_TOKEN, 
                        TEXT("Error while parsing attribute (Element %s, name %s)."),
                        pElement->GetName(), pAttribute->GetName());
            delete pAttribute;
            bResult = FALSE;
            break;
        }

        // If an empty Attribute then continue on without searching for a value
        if (m_Token.GetType() != COXToken::EQUAL_SIGN)
        {
            pAttribute->SetValue(TEXT(""));
            pElement->AddAttribute(pAttribute);
            RestoreBufferPos(pos);
            continue;
        }

        // Should have a number, "string" or 'string' value next.
        if (!GetToken(m_Token))
        {
            ReportError(ERROR_BAD_TOKEN, 
                        TEXT("Error while parsing attribute (element %s, name %s)."),
                        pElement->GetName(), pAttribute->GetName());
            delete pAttribute;
            bResult = FALSE;
            break;
        }

        // Read the value according to how it is delimited
        BOOL bGotValue = FALSE;
        if (m_Token.GetType() == COXToken::STRING)
        {
            bGotValue = GetValueString(str);
        }
        else if (m_Token.GetType() == COXToken::QUOTE)
        {
            bGotValue = GetStringToken(str, TEXT('"'));
        }
        else if (m_Token.GetType() == COXToken::APOSTROPHE)
        {
            bGotValue = GetStringToken(str, TEXT('\''));
        }
        else
        {
            ReportError(ERROR_UNEXPECTED_TOKEN, 
                        TEXT("Unexpected token while parsing attribute (element %s, name %s)."),
                        pElement->GetName(), pAttribute->GetName());
        }

        if (!bGotValue)
        {
            // The attribute was never handed to the element, so free it here
            delete pAttribute;
            bResult = FALSE;
            break;
        }

        pAttribute->SetValue(str);
        pElement->AddAttribute(pAttribute);
    }

    return bResult;
}

// Parse a start tag via the base parser, then apply the HTML-specific rules:
//  - tags flagged as empty (HR, BR etc) are always reported as empty tags
//  - the element records the text style in force when it was opened
//  - an element directly under a parentless parent that is neither a block
//    tag, a head tag nor a text modifier is wrapped in a new "P" element
//  - opening a table pushes a new text style onto the style stack
// Returns the element (or its "P" wrapper), or NULL if the base parser failed.
COXParserElement* COXHTMLParser::ParseStartTag(COXParserElement* pParent, BOOL& bEmptyTag)
{
    COXParserElement* pElement = COXParser::ParseStartTag(pParent, bEmptyTag);
    if (pElement == NULL)
        return NULL;

    // "Simple" tags never have an end tag
    if (IsEmptyTag(pElement->GetName()))
        bEmptyTag = TRUE;

    // Remember the text mode that was active at this point
    pElement->SetFlags(GetTextStyle());

    // Decide whether this element needs a paragraph around it
    BOOL bWrapInParagraph = FALSE;
    if (pParent->GetParent() == NULL)
    {
        bWrapInParagraph = !(IsBlockTag(pElement->GetName()) ||
                             IsHeadTag(pElement->GetName())  ||
                             IsTextModifier(pElement->GetName()));
    }

    if (bWrapInParagraph)
    {
        COXParserElement* pParagraph = new COXParserElement(NULL, TEXT("P"));
        pParagraph->AddObject(pElement);
        pElement = pParagraph;
    }

    // Each new table gets its own text style
    if (_tcsicmp(pElement->GetName(), TEXT("table")) == 0)
        PushTextStyle();

    return pElement;
}

// Parse an end tag via the base parser. When the end tag is "table" and it
// matches the name of pElement, the text style pushed for that table is
// popped again. Returns FALSE if the base parser fails, TRUE otherwise.
BOOL COXHTMLParser::ParseEndTag(COXParserElement* pElement, COXQuickString& strEndTag)
{
    if (!COXParser::ParseEndTag(pElement, strEndTag))
        return FALSE;

    const BOOL bIsTableEnd = (_tcsicmp(strEndTag, TEXT("table")) == 0);
    if (bIsTableEnd && _tcsicmp(strEndTag, pElement->GetName()) == 0)
        PopTextStyle();

    return TRUE;
}

// Text modifier start tags (<b>, <i> etc) are not parsed as ordinary elements;
// they only switch a text attribute on. A "font" tag has a copy of itself
// pushed onto the font stack; any other tag with a text flag has that flag
// added to the current text style.
// Returns TRUE when the tag was handled here (telling the base parser to
// ignore it), FALSE otherwise.
BOOL COXHTMLParser::IgnoreStartTag(COXParserElement* pElement, BOOL bEmptyTag)
{
    UNUSED_ALWAYS(bEmptyTag);

    // Font tags go onto the top of the font stack
    if (pElement->IsName(TEXT("font")))
    {
        COXParserElement* pFontCopy = DuplicateFontElement(pElement);
        if (pFontCopy != NULL)
            m_FontStack.push_back(pFontCopy);

        return TRUE;
    }

    // Anything without a text flag is an ordinary element
    TagDescriptor* pDescriptor = GetTagDescriptor(pElement->GetName());
    if (pDescriptor == NULL || !pDescriptor->dwTextFlag)
        return FALSE;

    // Switch the modifier on in the current text style
    UINT nCurrentStyle = GetTextStyle();
    SetTextStyle(nCurrentStyle | pDescriptor->dwTextFlag);

    return TRUE;
}

// Text modifier end tags (</b>, </i> etc) are not checked for the normal
// pair - /pair nesting; they only switch a text attribute off. A "font" end
// tag pops and frees the top of the font stack (if there is one); any other
// tag with a text flag has that flag removed from the current text style.
// Returns TRUE when the tag is a text modifier, FALSE otherwise.
BOOL COXHTMLParser::IgnoreEndTag(LPCTSTR szEndTag)
{
    // </font>: discard the most recently pushed font
    if (_tcsicmp(szEndTag, TEXT("font")) == 0)
    {
        if (m_FontStack.size())
        {
            delete (COXParserElement*) m_FontStack.back();
            m_FontStack.pop_back();
        }
        return TRUE;
    }

    // Anything without a text flag is an ordinary end tag
    TagDescriptor* pDescriptor = GetTagDescriptor(szEndTag);
    if (pDescriptor == NULL || !pDescriptor->dwTextFlag)
        return FALSE;

    // Switch the modifier off in the current text style
    UINT nCurrentStyle = GetTextStyle();
    SetTextStyle(nCurrentStyle & ~(pDescriptor->dwTextFlag));

    return TRUE;
}

// Reports whether the descriptor is flagged TAG_EMPTY, ie the tag has no
// closing tag at all (eg HR, BR). A NULL descriptor yields FALSE.
BOOL COXHTMLParser::IsEmptyTag(TagDescriptor* pTag)
{
    return (pTag != NULL) &&
           ((pTag->dwTagFlag & COXHTMLParser::TAG_EMPTY) == COXHTMLParser::TAG_EMPTY);
}

// Reports whether the descriptor carries a text flag, ie the tag toggles a
// text attribute (eg B, I). A NULL descriptor yields FALSE.
BOOL COXHTMLParser::IsTextModifier(TagDescriptor* pTag)
{
    return (pTag != NULL) && (pTag->dwTextFlag > 0);
}

// Reports whether the descriptor is flagged TAG_OPTEND, ie the closing tag may
// be left out (eg P, LI). A NULL descriptor yields FALSE.
BOOL COXHTMLParser::IsOptionalEndTag(TagDescriptor* pTag)
{
    return (pTag != NULL) &&
           ((pTag->dwTagFlag & COXHTMLParser::TAG_OPTEND) == COXHTMLParser::TAG_OPTEND);
}

// Reports whether the descriptor is flagged TAG_SECTION, ie the tag opens one
// of the special sections (eg BODY, HEAD). A NULL descriptor yields FALSE.
BOOL COXHTMLParser::IsSectionTag(TagDescriptor* pTag)
{
    return (pTag != NULL) &&
           ((pTag->dwTagFlag & COXHTMLParser::TAG_SECTION) == COXHTMLParser::TAG_SECTION);
}

// Reports whether the descriptor is flagged TAG_BLOCK, ie the tag ends a
// paragraph (eg P, TABLE). A NULL descriptor yields FALSE.
BOOL COXHTMLParser::IsBlockTag(TagDescriptor* pTag)
{
    return (pTag != NULL) &&
           ((pTag->dwTagFlag & COXHTMLParser::TAG_BLOCK) == COXHTMLParser::TAG_BLOCK);
}

// Reports whether the descriptor is flagged TAG_HEAD.
// A NULL descriptor yields FALSE.
BOOL COXHTMLParser::IsHeadTag(TagDescriptor* pTag)
{
    return (pTag != NULL) &&
           ((pTag->dwTagFlag & COXHTMLParser::TAG_HEAD) == COXHTMLParser::TAG_HEAD);
}

// Reports whether the descriptor is flagged TAG_CANCENTER, ie the tag may be
// wrapped in a "center" element. A NULL descriptor yields FALSE.
BOOL COXHTMLParser::CanCenter(TagDescriptor* pTag)
{
    return (pTag != NULL) &&
           ((pTag->dwTagFlag & COXHTMLParser::TAG_CANCENTER) == COXHTMLParser::TAG_CANCENTER);
}

// Look up the descriptor for a tag name in the tag hash table.
// Returns NULL when the name is not in the table.
TagDescriptor* COXHTMLParser::GetTagDescriptor(LPCTSTR szTag) const
{
    // v9.3 - update 03 - 64-bit - HashNode uses DWORD, so revised this - TD
    // DWORD_PTR dwData;
    DWORD dwData;
    if (!m_HTMLTagTable.Lookup(szTag, dwData))
        return NULL;

    // The constructor stores each descriptor's address truncated to a DWORD,
    // so on a 64-bit build the stored value is only the low 32 bits of the
    // pointer and must not be cast back directly. Every stored address lies
    // inside m_HTMLTags, so recover the byte offset from the start of that
    // array (DWORD subtraction wraps modulo 2^32, which still yields the true
    // offset) and rebuild the full pointer from the array base. On a 32-bit
    // build this gives exactly the stored value.
    // NOTE(review): the cleaner long-term fix is to widen the hash table
    // payload to DWORD_PTR.
    const DWORD dwBase   = (DWORD)(INT_PTR) &(m_HTMLTags[0]);
    const DWORD dwOffset = dwData - dwBase;
    return (TagDescriptor*) ((char*) m_HTMLTags + dwOffset);
}

// Return TRUE if 
//  a) szCurrentTag has an optional end tag, and szNewTag specifies a new element, or
//  b) szNewTag is NULL and szCurrentTag has an optional end tag
// Returning TRUE means that an end tag should be inserted for szCurrentTag, and that
// szNewTag represents a new sibling element. Returning FALSE means that szNewTag
// represents a new child element of szCurrentTag.
// Since this function may be called when either a new start tag or a new end tag has
// been found, NewTagIsEndTag specifies whether szNewTag is an end tag
// (NewTagIsEndTag = TRUE) or a start tag (NewTagIsEndTag = FALSE).
BOOL COXHTMLParser::IsEndTagMissing(LPCTSTR szCurrentTag, LPCTSTR szNewTag, 
                                  BOOL NewTagIsEndTag)
{
    if (!szCurrentTag || *szCurrentTag == 0)
        return FALSE;

    // Main HTML tag can only be left off when there is nothing else after it
    if (_tcsicmp(szCurrentTag, TEXT("HTML")) == 0)
        return (szNewTag == NULL || *szNewTag == 0);

    TagDescriptor* pElementTag = GetTagDescriptor(szCurrentTag);
    if (!pElementTag)
        return FALSE;

    TagDescriptor* pObjectTag = NULL;
    if (szNewTag && *szNewTag)
    {
        pObjectTag = GetTagDescriptor(szNewTag);
        if (!pObjectTag)
            return FALSE;
    }

    // Deal with main sections (HEAD, BODY) first
    if ( IsSectionTag(pElementTag))
        return (pObjectTag? IsSectionTag(pObjectTag) : TRUE);

    // Check that the tag we are now dealing with has an optional end.
    if (!IsOptionalEndTag(pElementTag))
        return FALSE;

    // We have an optional end tag - if no more data then everything is fine
    if (szNewTag == NULL || *szNewTag == 0)
        return TRUE;

    // Text modifiers do not mean a new element
    if (IsTextModifier(pObjectTag))
        return FALSE;

    // Certain optionally ended tags can only be ended with certain other tags

    if ( _tcsicmp(szCurrentTag, TEXT("P")) == 0 )
        return IsBlockTag(pObjectTag);

    if ( _tcsicmp(szCurrentTag, TEXT("LI")) == 0 )
        return ( _tcsicmp(szNewTag, TEXT("UL")) == 0 ||
                 _tcsicmp(szNewTag, TEXT("OL")) == 0);

    if ( _tcsicmp(szCurrentTag, TEXT("TR")) == 0 )
    {
        if (NewTagIsEndTag)
            return (_tcsicmp(szNewTag, TEXT("TABLE")) == 0);
        else
            return (_tcsicmp(szNewTag, TEXT("TR")) == 0);
    } 

    if ( _tcsicmp(szCurrentTag, TEXT("TD")) == 0 )
    {
        if (NewTagIsEndTag)
            return ( _tcsicmp(szNewTag, TEXT("TR")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("TABLE")) == 0);
        else
            return ( _tcsicmp(szNewTag, TEXT("TD")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("TR")) == 0 );
    }

    if ( _tcsicmp(szCurrentTag, TEXT("DT")) == 0 || 
         _tcsicmp(szCurrentTag, TEXT("DD")) == 0 )
    {
        if (NewTagIsEndTag)
            return ( _tcsicmp(szNewTag, TEXT("DT")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("DD")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("DL")) == 0);
        else
            return ( _tcsicmp(szNewTag, TEXT("DT")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("DD")) == 0 );
    }

    if ( _tcsicmp(szCurrentTag, TEXT("THEAD")) == 0 ||
         _tcsicmp(szCurrentTag, TEXT("TFOOT")) == 0 ||
         _tcsicmp(szCurrentTag, TEXT("TBODY")) == 0 )
    {
        if (NewTagIsEndTag)
            return ( _tcsicmp(szNewTag, TEXT("TFOOT")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("THEAD")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("TBODY")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("TABLE")) == 0);
        else
            return ( _tcsicmp(szNewTag, TEXT("TFOOT")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("THEAD")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("TBODY")) == 0 );
    }

    if ( _tcsicmp(szCurrentTag, TEXT("COLGROUP")) == 0 )
    {
        if (NewTagIsEndTag)
            return ( _tcsicmp(szNewTag, TEXT("COLGROUP")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("TR")) == 0       ||
                     _tcsicmp(szNewTag, TEXT("TD")) == 0       ||
                     _tcsicmp(szNewTag, TEXT("THEAD")) == 0    ||
                     _tcsicmp(szNewTag, TEXT("TFOOT")) == 0    ||
                     _tcsicmp(szNewTag, TEXT("TABLE")) == 0);
        else
            return ( _tcsicmp(szNewTag, TEXT("COLGROUP")) == 0 ||
                     _tcsicmp(szNewTag, TEXT("TR")) == 0       ||
                     _tcsicmp(szNewTag, TEXT("TD")) == 0       ||
                     _tcsicmp(szNewTag, TEXT("THEAD")) == 0    ||
                     _tcsicmp(szNewTag, TEXT("TFOOT")) == 0 );
    }

    // pElement is optionally ended, followed by a new tag that is not a text
    // modifier. By default we close off pElement and start a new sibling element
    return TRUE;
}

// Writes every attribute of pElement to hFile in the form
//    <space>name            (attribute without a value, or with an empty value)
//    <space>name="value"    (attribute with a non-empty string value)
// Names and values are converted to ANSI before being written.
//
// Returns TRUE if everything was written, FALSE if pElement is NULL or as soon
// as any WriteFile call fails. NULL entries returned by Attribute(i) are skipped.
//
// The strings are written straight from their converted form rather than being
// formatted into a fixed-size scratch buffer first, so names/values of any
// length (eg long URLs) are emitted in full, and no static state is shared
// between calls.
BOOL COXHTMLParser::WriteAttributes(HANDLE hFile, COXParserElement* pElement)
{
    USES_CONVERSION;

    if (!pElement)
        return FALSE;

    DWORD nCount = 0;

    for (int i = 0; i < pElement->NumAttributes(); i++)
    {
        COXAttribute* pAttribute = pElement->Attribute(i);
        if (!pAttribute) continue;

        // " name"
        LPCSTR szName = T2A((LPTSTR) pAttribute->GetName());
        if (!szName)
            szName = "";

        if (!::WriteFile(hFile, " ", 1, &nCount, NULL) ||
            !::WriteFile(hFile, szName, static_cast<DWORD>(strlen(szName)), &nCount, NULL))
            return FALSE;

        // ="value" - only emitted when the attribute has a non-empty value
        LPCTSTR szValue = pAttribute->GetStringValue();
        if (szValue && *szValue)
        {
            LPCSTR szAnsiValue = T2A((LPTSTR) szValue);
            if (!szAnsiValue)
                szAnsiValue = "";

            if (!::WriteFile(hFile, "=\"", 2, &nCount, NULL) ||
                !::WriteFile(hFile, szAnsiValue, static_cast<DWORD>(strlen(szAnsiValue)), &nCount, NULL) ||
                !::WriteFile(hFile, "\"", 1, &nCount, NULL))
                return FALSE;
        }
    }

    return TRUE;
}

// Writes pElement to hFile as "<name attributes>", followed by all of its child
// objects, followed by "</name>" unless IsEmptyTag reports the tag as empty.
// Block, head and section tags, and "br", are surrounded by CR/LF pairs so that
// they start (and, when closed, end) on a line of their own.
//
// Parameters:
//   hFile    - handle passed to WriteFile
//   pElement - element to write; must be of type COXParserObject::ELEMENT
//   nLevel   - nesting level; passed on, incremented by one, for each child
//
// Returns TRUE on success, FALSE if pElement is NULL or not an ELEMENT, or if
// writing the element's own tags/attributes fails.
//
// The tag name is written straight from its converted form rather than being
// formatted into a fixed-size static buffer, so the name cannot be cut short
// and no static state is shared between the (recursive) calls.
BOOL COXHTMLParser::WriteElement(HANDLE hFile, COXParserElement* pElement, int nLevel)
{   
    USES_CONVERSION;

    if (!pElement || pElement->GetType() != COXParserObject::ELEMENT)
        return FALSE;

    DWORD nCount = 0;

    BOOL bSeparateLine = ( IsBlockTag(pElement->GetName())   || 
                           IsHeadTag(pElement->GetName())    || 
                           IsSectionTag(pElement->GetName()) ||
                           pElement->IsName(TEXT("br")) );

    if (bSeparateLine)
    {
        if (!::WriteFile(hFile, "\r\n", 2, &nCount, NULL))
            return FALSE;
    }

    // Convert the name once; it is needed for both the start and the end tag
    LPCSTR szName = T2A((LPTSTR)pElement->GetName());
    if (!szName)
        szName = "";
    const DWORD nNameLength = static_cast<DWORD>(strlen(szName));

    if (!::WriteFile(hFile, "<", 1, &nCount, NULL) ||
        !::WriteFile(hFile, szName, nNameLength, &nCount, NULL))
        return FALSE;
    
    if (!WriteAttributes(hFile, pElement))
        return FALSE;
    
    if (!::WriteFile(hFile, ">", 1, &nCount, NULL))
        return FALSE;

    // The result of writing a child is deliberately not checked: a FALSE here
    // is not necessarily an I/O error (WriteText, for one, also returns FALSE
    // for empty text), so a failing child must not abort the whole element.
    for (int i = 0; i < pElement->NumObjects(); i++)
        WriteObject(hFile, pElement->Object(i), nLevel+1);
        
    //if (!WriteTabs(hFile, nLevel))
    //    return FALSE;

    if (!IsEmptyTag(pElement->GetName()))
    {
        if (!::WriteFile(hFile, "</", 2, &nCount, NULL) ||
            !::WriteFile(hFile, szName, nNameLength, &nCount, NULL) ||
            !::WriteFile(hFile, ">", 1, &nCount, NULL))
            return FALSE;

        if (bSeparateLine)
        {
            if (!::WriteFile(hFile, "\r\n", 2, &nCount, NULL))
                return FALSE;
        }
    }

    return TRUE;
}

// Writes the (encoded) text of a PLAINTEXT object to hFile. When the object's
// parent is a block tag, a section tag or "br", the text is surrounded by
// CR/LF pairs so that it sits on a line of its own.
//
// Parameters:
//   hFile   - handle passed to WriteFile
//   pObject - object to write; must be of type COXParserObject::PLAINTEXT
//   nLevel  - unused
//
// Returns TRUE on success. Returns FALSE if pObject is NULL or not PLAINTEXT,
// if the encoded text is empty, or if a WriteFile call fails. Note that for
// empty text the leading CR/LF (if any) has already been written by then.
BOOL COXHTMLParser::WriteText(HANDLE hFile, COXParserObject* pObject, int nLevel)
{   
    USES_CONVERSION;
    UNUSED_ALWAYS(nLevel);

    if (!pObject || pObject->GetType() != COXParserObject::PLAINTEXT)
        return FALSE;

    BOOL bSeparateLine = FALSE;
    if (pObject->GetParent())
    {
        bSeparateLine = ( IsBlockTag(pObject->GetParent()->GetName())   ||
                          IsSectionTag(pObject->GetParent()->GetName()) ||
                          pObject->GetParent()->IsName(TEXT("br")) );
    }

    DWORD nCount = 0;
    if (bSeparateLine)
    {
        if (!::WriteFile(hFile, "\r\n", 2, &nCount, NULL))
            return FALSE;
    }

    COXQuickString str = EncodeText(pObject->GetText());
    if (str.IsEmpty())
        return FALSE;

    // The number of bytes to write must be taken from the converted string, not
    // from str.GetLength(): that length refers to the TCHAR string (presumably
    // a character count - confirm), and in Unicode builds the ANSI conversion
    // can occupy a different number of bytes (eg multi-byte characters).
    LPCSTR szAnsiText = T2A((LPTSTR)str.GetString());
    if (!szAnsiText)
        return FALSE;

    if (!::WriteFile(hFile, szAnsiText, static_cast<DWORD>(strlen(szAnsiText)), &nCount, NULL))
        return FALSE;

    if (bSeparateLine)
    {
        if (!::WriteFile(hFile, "\r\n", 2, &nCount, NULL))
            return FALSE;
    }

    return TRUE;
}

// Maps an error/warning code to a human readable message. Codes specific to
// the HTML parser are translated here; anything else is handed to the base
// class (COXParser) for translation.
LPCTSTR COXHTMLParser::TranslateErrorCode(int nErrorCode)
{
    if (nErrorCode == WARNING_UNKNOWN_TAG)
        return TEXT("Unknown tag found");

    // Not one of ours - defer to the base parser
    return COXParser::TranslateErrorCode(nErrorCode);
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Web Developer
Canada Canada
In January 2005, David Cunningham and Chris Maunder created TheUltimateToolbox.com, a new group dedicated to the continued development, support and growth of Dundas Software’s award winning line of MFC, C++ and ActiveX control products.

Ultimate Grid for MFC, Ultimate Toolbox for MFC, and Ultimate TCP/IP have been stalwarts of C++/MFC development for a decade. Thousands of developers have used these products to speed their time to market, improve the quality of their finished products, and enhance the reliability and flexibility of their software.
This is an Organisation

476 members

Comments and Discussions