65.9K
CodeProject is changing. Read more.
Home

PreParse XML using CString

starIconstarIconstarIconstarIcon
emptyStarIcon
starIcon

4.29/5 (3 votes)

Mar 5, 2004

viewsIcon

39092

An article on XML parsing using CString.

Introduction

When you try to load an XML-like document, the XML DOM may refuse to open it unless it is pre-parsed first, because it contains duplicate attribute-value pairs or because the author accidentally omitted some of the spaces between attributes.

Background

I have a lot of XML documents that were composed and uploaded by clients, and there are always some errors in them.

Using the code

Read the XML text into a CString, say strxml, and call PreFormatXML(strxml). After that, you can create an instance of the XML DOM and call LoadXML on the cleaned-up string.

The following are the functions involved:

// Rewrites an XML-like string in place so that a strict XML DOM parser has a
// better chance of loading it.
//
// Steps, in order:
//   1. Flatten the text to a single line (drop '\r', turn '\n' and '\t' into ' ').
//   2. Replace every <!-- ... --> comment with a single space.
//   3. Lower-case everything outside double-quoted strings (tag names, attribute
//      names and also text content), put "\r\n" after every '>' that is outside
//      quotes, and insert the missing space in  b="blah"c="x"  style attributes.
//   4. For every <...> tag: map the tag name through m_ArrPreDefTag (member
//      declared outside this listing; each entry is read via .tag, .oritag and
//      .preattrs) and pass tags with attributes to RemoveDuplicate().
//
// strxml: the document text; modified in place.
//
// Fixes relative to the original listing:
//   - strtemp was used by the comment loop before it was declared.
//   - The mask string was named strcmp, shadowing the C library function.
//   - Closing tags with trailing junk lost the last character of their name
//     (Left(n1-1) instead of Left(n1)).
//   - The leading '/' of a closing tag was not stripped when the tag also ended
//     in '/', and a space after the '/' was kept.
//   - A literal '*' outside quotes was mistaken for the quote mask.
//   - <?...?> and <!...> constructs were run through attribute rewriting, which
//     dropped the trailing '?' of an XML declaration.
void PreFormatXML(CString& strxml)
{
    // --- 1. flatten to one line (a tooltip value may contain \r\n) -----------
    strxml.Remove(_T('\r'));
    strxml.Replace(_T('\n'),_T(' '));
    strxml.Replace(_T('\t'),_T(' '));

    // Declared up front: the comment loop below already needs strtemp.
    CString strtemp,strfake,strleft,strright;

    // --- 2. strip comments ---------------------------------------------------
    // Replace() substitutes every identical occurrence, so the search simply
    // restarts from the beginning until no complete comment is left.
    for(;;)
    {
        int iOpen = strxml.Find(_T("<!--"));
        if(iOpen<0)
            break;
        int iClose = strxml.Find(_T("-->"),iOpen);
        if(iClose<0)
            break;      // unterminated comment: leave it alone

        strtemp = strxml.Mid(iOpen,iClose-iOpen+3);
        strxml.Replace(strtemp,_T(" "));
    }

    // --- 3a. build a mask: a copy with every "..." span overwritten by '*' ---
    CString strmask = strxml;
    int iStart = 0;
    for(;;)
    {
        int iQuote1 = strxml.Find(_T('\"'),iStart);
        if(iQuote1<0)
            break;
        int iQuote2 = strxml.Find(_T('\"'),iQuote1+1);
        if(iQuote2<0)
            break;      // odd number of quotes: the last one stays unmasked

        for(int i=iQuote1;i<=iQuote2;i++)
            strmask.SetAt(i,_T('*'));
        iStart = iQuote2+1;
    }

    // Only the unquoted part is lower-cased; quoted values keep their case
    // because they are never copied back from the mask.
    strmask.MakeLower();

    // --- 3b. copy the lower-cased text back, split tags, split attributes ----
    int iMask = 0;                          // read position in strmask
    int iXml  = 0;                          // same character's position in strxml;
                                            // runs ahead as characters are inserted
    int nMaskLen = strmask.GetLength();
    while(iMask<nMaskLen)
    {
        TCHAR ch1 = strmask.GetAt(iMask);
        if(ch1!=_T('*'))
            strxml.SetAt(iXml,ch1);

        if(ch1==_T('>'))
        {
            // one tag per line; step 4 relies on the ">\r\n" marker
            strxml.Insert(iXml+1,_T("\r\n"));
            iXml += 2;
        }
        else if(ch1==_T('*') && iMask<nMaskLen-1 && strxml.GetAt(iXml)==_T('\"'))
        {
            // strxml still holds the real character here. Requiring a '"'
            // restricts this to the closing quote of a value: an opening quote
            // is always followed by another masked character, and a literal '*'
            // in the text is not a quote at all.
            TCHAR ch2 = strmask.GetAt(iMask+1);
            if(ch2>=_T('a') && ch2<=_T('z'))
            {
                // "blah"c="x"  ->  "blah" c="x"
                strxml.Insert(iXml+1,_T(' '));
                iXml++;
            }
        }
        iMask++;
        iXml++;
    }

    // --- 4. normalise each tag -----------------------------------------------
    strxml += _T("\r\n");

    int size = m_ArrPreDefTag.GetSize();
    preTag pa;
    CString strnodename;

    iStart = 0;
    for(;;)
    {
        int nlen  = strxml.GetLength();
        int iEnd1 = strxml.Find(_T('<'),iStart);
        if(iEnd1<0)
            break;
        int iEnd2 = strxml.Find(_T(">\r\n"),iEnd1);
        if(iEnd2<0)
            break;

        // strleft ends with '<', strright starts with '>'; the tag body in
        // between is rebuilt and spliced back.
        strleft  = strxml.Left(iEnd1+1);
        strright = strxml.Right(nlen-iEnd2);
        strtemp  = strxml.Mid(iEnd1+1,iEnd2-iEnd1-1);
        strfake  = strtemp;
        strfake.TrimLeft();
        strfake.TrimRight();

        int lensub = strfake.GetLength();
        if(lensub<=0)
        {
            ATLASSERT(0);        // "<>" is not expected
            iStart = iEnd2+3;
            continue;
        }

        // Declarations and processing instructions are not element tags;
        // rewriting them as name + attributes would corrupt them.
        TCHAR chFirst = strfake.GetAt(0);
        if(chFirst==_T('?') || chFirst==_T('!'))
        {
            iStart = iEnd2+3;
            continue;
        }

        bool bselfClosed = _T('/')==strfake.GetAt(lensub-1);
        if(bselfClosed)
            strfake = strfake.Left(lensub-1);

        strfake.TrimLeft();
        strfake.TrimRight();
        bool breversetag = !strfake.IsEmpty() && _T('/')==strfake.GetAt(0);
        if(breversetag)
        {
            // drop the leading '/' and any space that follows it
            strfake = strfake.Mid(1);
            strfake.TrimLeft();

            // a closing tag has no attr-value pairs: keep the name only
            int n1 = strfake.Find(_T(' '));
            if(n1>0)
                strfake = strfake.Left(n1);

            for(int isize=0;isize<size;isize++)
            {
                pa = m_ArrPreDefTag[isize];
                if(strfake.CompareNoCase(pa.tag)==0)
                {
                    strfake = pa.oritag;
                    break;
                }
            }
            strfake.MakeLower();
        }
        else
        {
            int n0 = strfake.GetLength();
            int n1 = strfake.Find(_T(' '));
            if(n1>0)    // name followed by attributes
            {
                strnodename = strfake.Left(n1);
                strnodename.MakeLower();
                for(int isize=0;isize<size;isize++)
                {
                    pa = m_ArrPreDefTag[isize];
                    if(strnodename.CompareNoCase(pa.tag)==0)
                    {
                        // predefined attributes go first, so they win over
                        // same-named attributes written in the document
                        strnodename = pa.oritag + _T(" ") + pa.preattrs;
                        break;
                    }
                }
                ATLASSERT(n0>n1);
                strfake = strnodename+strfake.Right(n0-n1);
                RemoveDuplicate(strfake);
            }
            // a bare tag name without attributes is left as it is
        }

        // A tag written as </x/> is emitted as </x>, which is also what the
        // original listing produced for it.
        strtemp.Empty();
        if(breversetag)
            strtemp += _T('/');
        strtemp += strfake;
        if(bselfClosed && !breversetag)
            strtemp += _T('/');

        strxml = strleft + strtemp + strright;

        // continue right behind the ">\r\n" that closes the rewritten tag
        iStart = iEnd1 + 1 + strtemp.GetLength() + 3;
    }
    return;
}

// Rebuilds a tag body of the form  name attr="value" attr="value" ...  so that
// every attribute name appears only once.
//
// str: tag body without the surrounding '<' and '>'; modified in place.
//
// - The string is trimmed. If it contains no space (no attribute part), nothing
//   else is changed.
// - The node name and all attribute names are lower-cased; '=' and surrounding
//   whitespace are stripped from names; values are trimmed.
// - When an attribute name occurs more than once, the FIRST value is kept and
//   later ones are discarded.
// - Anything that is not part of a  name="value"  pair (for example text after
//   the last closing quote) is not carried over to the result.
//
// Fixes relative to the original listing:
//   - The second quote lookup tested m1 instead of m2, so an unterminated quote
//     reset the scan position to 0 and the loop never ended.
//   - The attribute name was cut one character short of the quote on the
//     assumption that this character is '='; since '=' and blanks are stripped
//     anyway, the name now runs right up to the quote.
void RemoveDuplicate(CString& str)
{
    str.TrimLeft();
    str.TrimRight();

    int n1 = str.Find(_T(' '));
    if(n1<=0)
        return;     // name only, nothing to de-duplicate

    CString strnodename = str.Left(n1);
    strnodename.MakeLower();

    // everything after the node name
    CString strfake = str.Mid(n1);
    strfake.TrimLeft();
    strfake.TrimRight();

    // CSimpleMap keeps insertion order, so attributes come out in the order of
    // their first appearance.
    CSimpleMap<CString,CString> attributes;
    CString strattr,strvalue;

    int m0 = 0;     // start of the current attribute name
    for(;;)
    {
        int m1 = strfake.Find(_T('\"'),m0);         // opening quote
        if(m1<0)
            break;

        int m2 = strfake.Find(_T('\"'),m1+1);       // closing quote
        if(m2<0)
            break;  // unterminated value: stop instead of rescanning from 0

        strattr  = strfake.Mid(m0,m1-m0);
        strattr.Remove(_T('='));
        strattr.MakeLower();
        strattr.TrimLeft();
        strattr.TrimRight();

        strvalue = strfake.Mid(m1+1,m2-m1-1);
        strvalue.TrimLeft();
        strvalue.TrimRight();

        // first occurrence wins
        if(attributes.FindKey(strattr)<0)
            attributes.Add(strattr,strvalue);

        m0 = m2+1;
    }

    // recompose:  name attr="value" attr="value" ...
    str = strnodename;
    int size = attributes.GetSize();
    for(int i=0;i<size;i++)
    {
        str += _T(" ");
        str += attributes.GetKeyAt(i);
        str += _T("=\"");
        str += attributes.GetValueAt(i);
        str += _T("\"");
    }
    attributes.RemoveAll();
    return;
}

As you can see above, all of the attribute-value pairs are recomposed using CSimpleMap. When a duplicate attribute is encountered, the first occurrence is kept and the later ones are discarded. In addition, in strings such as <a b="blah"c="interesting value">, a space is inserted between "blah" and c; without it, the document cannot be loaded.

This code has been tested under Windows XP with VS.NET 2002, WTL 7.1 and ATL 7.0. Any comments are appreciated.