PreParse XML using CString






4.29/5 (3 votes)
Mar 5, 2004

39092
An article on XML parsing using CString.
Introduction
An XML-like document sometimes cannot be loaded by the XML DOM without pre-processing, because it contains duplicate attribute-value pairs or because the author accidentally omitted required spaces.
Background
I have a lot of XML documents that were composed and uploaded by clients, and they almost always contain some errors.
Using the code
Read the XML text into a CString buffer, say strxml, and then call PreFormatXML(strxml). After that, you can create an XML DOM instance and call LoadXML with the cleaned-up string.
The following are the functions involved:
void PreFormatXML(CString& strxml) { // it is possible that the tooltip include a \r\n strxml.Remove(_T('\r')); strxml.Replace(_T('\n'),_T(' ')); strxml.Replace(_T('\t'),_T(' ')); //Dump(strxml,_T("e:\\bbb.xml")); //////////////////////// in the following, you should reassign \r\n ////////// now just one line int iStart = 0; int iEnd1,iEnd2; while(iStart>-1) { iEnd1 = strxml.Find(_T("<!--"),iStart); if(iEnd1<0) break; iEnd2 = strxml.Find(_T("-->"),iEnd1); if(iEnd2<0) break; if(iEnd2>iEnd1) { strtemp = strxml.Mid(iEnd1,iEnd2-iEnd1+3); int n = strxml.Replace(strtemp,_T(" ")); iStart = 0; continue; } iStart = iEnd2+1; } iStart = 0; iEnd1 = 0; iEnd2 = 0; CString strcmp = strxml; while(iEnd2>-1) { iEnd1 = strxml.Find(_T('\"'),iStart); iEnd2 = strxml.Find(_T('\"'),iEnd1+1); if(iEnd1>-1&&iEnd2>-1) { for(int i=iEnd1;i<iEnd2+1;i++) strcmp.SetAt(i,_T('*')); } else break; iStart = iEnd2+1; } strcmp.MakeLower(); // minimize all tag and split concat attr int i1 = 0; int i2 = 0; int nlen = strcmp.GetLength(); while(i1<nlen) { TCHAR ch1 = strcmp.GetAt(i1); if(ch1!=_T('*')) strxml.SetAt(i2,ch1); if(ch1==_T('>')) { strxml.Insert(i2+1,_T('\n')); strxml.Insert(i2+1,_T('\r')); i2 += 2; } if(ch1==_T('*')) { if(i1<nlen-1) { TCHAR ch2 = strcmp.GetAt(i1+1); if(ch2!=_T('*')) { /// the if(ch2>_T('a'-1)&&ch2<_T('z'+1)) { strxml.Insert(i2+1,_T(' ')); i2++; } } } } i1++; i2++; } ////// the following remove duplicate tag, and makelower // of everything except attribute value. 
////int nlen; nlen = strxml.GetLength(); strxml.Insert(nlen,_T("\r\n")); ////Dump(strxml,_T("e:\\aaaa.txt")); CString strtemp,strfake,strleft,strright; /////////////////////////////////////////////////////////////////// int size = m_ArrPreDefTag.GetSize(); preTag pa; CString strnodename,strattr; iStart = iEnd1 = iEnd2 = 0; while(iEnd1>-1) { nlen = strxml.GetLength(); iEnd1 = strxml.Find(_T('<'),iStart); if(iEnd1<0) break; iEnd2 = strxml.Find(_T(">\r\n"),iEnd1); if(iEnd2<0) break; //// keep left and right strleft.Empty(); strright.Empty(); strleft = strxml.Left(iEnd1+1); strright= strxml.Right(nlen-iEnd2); // pick out <> and process it strtemp = strxml.Mid(iEnd1+1,iEnd2-iEnd1-1); strfake = strtemp; strfake.TrimLeft(); strfake.TrimRight(); int lensub = strfake.GetLength(); if(lensub>0) { bool bselfClosed = _T('/')==strfake.GetAt(lensub-1); if(bselfClosed) strfake = strfake.Left(lensub-1); strfake.TrimLeft(); strfake.TrimRight(); bool breversetag =_T('/')==strfake.GetAt(0); if(breversetag) { strfake = strfake.Right(lensub-1); int n1 = strfake.Find(_T(' ')); // just truncate it. reverse tag has no attr-value pair if(n1>1) strfake = strfake.Left(n1-1); for(int isize=0;isize<size;isize++) { pa = m_ArrPreDefTag[isize]; if(strfake.CompareNoCase(pa.tag)==0) { strfake = pa.oritag; break; } } strfake.MakeLower(); } else { /// replace tag with predefined tag int n0 = strfake.GetLength(); int n1 = strfake.Find(_T(' ')); if(n1>0) /// yeah, it include serveral fields { strnodename = strfake.Left(n1); strnodename.MakeLower(); for(int isize=0;isize<size;isize++) { pa = m_ArrPreDefTag[isize]; if(strnodename.CompareNoCase(pa.tag)==0) { strnodename = pa.oritag + _T(" ") + pa.preattrs; break; } } // reconcat // find out nodename, attr-pair; ATLASSERT(n0>n1); strfake = strnodename+strfake.Right(n0-n1); // breplaced is possible to change in this tag. 
RemoveDuplicate(strfake); } ///only one tag, and no attr-value pair } strtemp = ((breversetag&&!bselfClosed)?_T("/"):_T("")) + strfake + ((bselfClosed&&!breversetag)?_T("/"):_T("")) ; strxml = strleft + strtemp + strright; iEnd2 = strtemp.GetLength()+iEnd1; } else ATLASSERT(0); // there must be no < > things iStart = iEnd2+3; } return; } void RemoveDuplicate(CString& str) { str.TrimLeft(); str.TrimRight(); // replace simplified tag and spawn attribute array. // fill out attr-pair map; CString strnodename; int n0 = str.GetLength(); int n1 = str.Find(_T(' ')); if(n1>0) { strnodename = str.Left(n1); strnodename.MakeLower(); CString strfake; CSimpleMap<CString,CString> attributes; CString strattr,strvalue; strfake = str.Right(n0-n1); strfake.TrimLeft(); strfake.TrimRight(); /// find attr-value in strfake int m0 = 0; int m1 = 0; int m2 = 0; int mlen; //////////////////// while(m0>-1) { mlen = strfake.GetLength(); m1 = strfake.Find(_T('\"'),m0); if(m1<0) break; m2 = strfake.Find(_T('\"'),m1+1); if(m1<0) break; strattr = strfake.Mid(m0,m1-m0-1); strattr.Remove(_T('=')); strattr.MakeLower(); strattr.TrimLeft(); strattr.TrimRight(); strvalue = strfake.Mid(m1+1,m2-m1-1); strvalue.TrimLeft(); strvalue.TrimRight(); int nd = attributes.FindKey(strattr); if(nd<0) attributes.Add(strattr,strvalue); m0 = m2+1; } /// process default id str = strnodename; int size = attributes.GetSize(); for(int i=0;i<size;i++) { strattr = attributes.GetKeyAt(i); strvalue = attributes.GetValueAt(i); str += _T(" "); str += strattr; str += _T("=\""); str += strvalue; str += _T("\""); } attributes.RemoveAll(); } return; }
As you can see above, all attribute-value pairs are recomposed using CSimpleMap. When a duplicate attribute is encountered, the first value is kept and the later duplicate is discarded. In addition, in a string such as <a b="blah"c="interesting value">, a space is inserted between "blah" and c; without it, the document cannot be loaded successfully.
This code has been tested under Windows XP with VS.NET 2002, WTL 7.1 and ATL 7.0. Any comments are appreciated.