Introduction
An XML-like document sometimes cannot be loaded by the XML DOM without pre-processing, because it contains duplicate attribute-value pairs or because the author omitted required spaces between attributes.
Background
I have a lot of XML documents that were composed and uploaded by clients, and they almost always contain some errors.
Using the code
Read the XML text into a string buffer, say strxml, and then call PreFormatXML(strxml). After that, you can create an XML DOM instance and call LoadXML with the repaired string.
The following are the functions involved:
// Repairs an XML-like string in place so that it has a better chance of being
// accepted by an XML DOM loader.
//
// The function works in several passes:
//   1. '\r' is removed and '\n' / '\t' are replaced by spaces.
//   2. Every "<!-- ... -->" comment is replaced by a single space.
//   3. Everything outside double-quoted strings is lower-cased (this includes
//      tag names, attribute names and the text between tags), "\r\n" is
//      inserted after every '>' that is outside quotes, and a space is
//      inserted between a closing quote and a directly following letter
//      (e.g. b="x"c="y" becomes b="x" c="y").
//   4. Each tag is rebuilt: the tag name is looked up in m_ArrPreDefTag and,
//      for tags that carry attributes, duplicate attributes are removed by
//      RemoveDuplicate().
//
// strxml: the document text; modified in place.
//
// NOTE(review): m_ArrPreDefTag / preTag are declared outside this block. From
// their use here, each entry provides 'tag' (the name to match, compared
// case-insensitively), 'oritag' (the replacement name) and 'preattrs'
// (attribute text placed in front of the tag's own attributes) - confirm
// against the declaration.
void PreFormatXML(CString& strxml)
{
    // Declared up front: these are used by the comment pass as well as by the
    // tag pass further down.
    CString strtemp,strfake,strleft,strright;
    int iStart = 0;
    int iEnd1 = 0;
    int iEnd2 = 0;

    // ---- Pass 1: normalise line breaks and tabs --------------------------
    strxml.Remove(_T('\r'));
    strxml.Replace(_T('\n'),_T(' '));
    strxml.Replace(_T('\t'),_T(' '));

    // ---- Pass 2: strip comments ------------------------------------------
    // Each round removes every copy of the first comment found, so the string
    // gets strictly shorter and the loop terminates.
    for(;;)
    {
        iEnd1 = strxml.Find(_T("<!--"));
        if(iEnd1<0)
            break;
        iEnd2 = strxml.Find(_T("-->"),iEnd1);
        if(iEnd2<0)
            break;
        strtemp = strxml.Mid(iEnd1,iEnd2-iEnd1+3);
        strxml.Replace(strtemp,_T(" "));
    }

    // ---- Pass 3a: build a mask with quoted strings blanked out -----------
    // A control character is used as the mask marker instead of a printable
    // one so that ordinary characters in the document are never mistaken for
    // masked positions.
    const TCHAR chMask = _T('\x01');
    CString strmask = strxml;
    iStart = 0;
    for(;;)
    {
        iEnd1 = strxml.Find(_T('\"'),iStart);
        if(iEnd1<0)
            break;
        iEnd2 = strxml.Find(_T('\"'),iEnd1+1);
        if(iEnd2<0)
            break;
        for(int i=iEnd1;i<=iEnd2;i++)
            strmask.SetAt(i,chMask);
        iStart = iEnd2+1;
    }
    strmask.MakeLower();

    // ---- Pass 3b: copy the lower-cased, unquoted characters back ---------
    // i1 walks the mask, i2 walks strxml; i2 runs ahead of i1 by the number
    // of characters inserted so far.
    int i1 = 0;
    int i2 = 0;
    int nlen = strmask.GetLength();
    while(i1<nlen)
    {
        TCHAR ch1 = strmask.GetAt(i1);
        if(ch1!=chMask)
            strxml.SetAt(i2,ch1);
        if(ch1==_T('>'))
        {
            // Mark the end of the tag; the tag pass searches for ">\r\n".
            strxml.Insert(i2+1,_T("\r\n"));
            i2 += 2;
        }
        else if(ch1==chMask && i1<nlen-1)
        {
            // A masked character followed by a letter can only be a closing
            // quote with an attribute name glued to it: separate them.
            TCHAR ch2 = strmask.GetAt(i1+1);
            if(ch2>=_T('a')&&ch2<=_T('z'))
            {
                strxml.Insert(i2+1,_T(' '));
                i2++;
            }
        }
        i1++;
        i2++;
    }
    strxml += _T("\r\n");

    // ---- Pass 4: rebuild every tag ---------------------------------------
    int size = m_ArrPreDefTag.GetSize();
    preTag pa;
    CString strnodename;
    iStart = 0;
    for(;;)
    {
        nlen = strxml.GetLength();
        iEnd1 = strxml.Find(_T('<'),iStart);
        if(iEnd1<0)
            break;
        iEnd2 = strxml.Find(_T(">\r\n"),iEnd1);
        if(iEnd2<0)
            break;

        strleft = strxml.Left(iEnd1+1);      // everything up to and including '<'
        strright = strxml.Right(nlen-iEnd2); // '>' and everything after it
        strfake = strxml.Mid(iEnd1+1,iEnd2-iEnd1-1);
        strfake.TrimLeft();
        strfake.TrimRight();

        if(strfake.IsEmpty())
        {
            // "<>" - nothing to normalise; flag it in debug builds and move on.
            ATLASSERT(0);
            iStart = iEnd2+3;
            continue;
        }

        TCHAR chFirst = strfake.GetAt(0);
        if(chFirst==_T('?')||chFirst==_T('!'))
        {
            // Processing instructions and declarations (<?xml ...?>,
            // <!DOCTYPE ...>) are not elements; rewriting their "attributes"
            // would drop the trailing '?' or mangle them, so leave them alone.
            iStart = iEnd2+3;
            continue;
        }

        int lensub = strfake.GetLength();
        bool bselfClosed = _T('/')==strfake.GetAt(lensub-1);
        if(bselfClosed)
        {
            strfake = strfake.Left(lensub-1);
            strfake.TrimRight();
        }
        // strfake may be empty here (input "</>"), so guard the GetAt.
        bool breversetag = !strfake.IsEmpty() && _T('/')==strfake.GetAt(0);

        if(breversetag)
        {
            // Closing tag: keep only the name.
            strfake = strfake.Mid(1);
            strfake.TrimLeft();
            int n1 = strfake.Find(_T(' '));
            if(n1>0)
                strfake = strfake.Left(n1);
            for(int isize=0;isize<size;isize++)
            {
                pa = m_ArrPreDefTag[isize];
                if(strfake.CompareNoCase(pa.tag)==0)
                {
                    strfake = pa.oritag;
                    break;
                }
            }
            strfake.MakeLower();
        }
        else
        {
            int n0 = strfake.GetLength();
            int n1 = strfake.Find(_T(' '));
            if(n1>0)
            {
                // Opening tag with attributes: substitute the predefined
                // name/attributes if there is a match, then de-duplicate.
                // The predefined attributes are placed first, so they win
                // over same-named attributes from the document.
                strnodename = strfake.Left(n1);
                strnodename.MakeLower();
                for(int isize=0;isize<size;isize++)
                {
                    pa = m_ArrPreDefTag[isize];
                    if(strnodename.CompareNoCase(pa.tag)==0)
                    {
                        strnodename = pa.oritag + _T(" ") + pa.preattrs;
                        break;
                    }
                }
                ATLASSERT(n0>n1);
                strfake = strnodename+strfake.Right(n0-n1);
                RemoveDuplicate(strfake);
            }
        }

        // Re-assemble the tag. A closing tag always gets its leading '/';
        // a stray trailing '/' on a closing tag ("</a/>") is dropped.
        strtemp.Empty();
        if(breversetag)
            strtemp += _T('/');
        strtemp += strfake;
        if(bselfClosed&&!breversetag)
            strtemp += _T('/');

        strxml = strleft + strtemp + strright;
        // Continue right after the ">\r\n" that terminates the rebuilt tag.
        iStart = strleft.GetLength()+strtemp.GetLength()+3;
    }
    return;
}
// Rebuilds the inside of a tag ("name attr="value" attr="value" ...") so that
// every attribute name appears only once.
//
// str: the tag text without the surrounding '<' and '>'; modified in place.
//
// Behaviour:
//   - Leading/trailing whitespace is trimmed.
//   - If the text contains no space (a tag without attributes) nothing else
//     is changed.
//   - Otherwise the tag name and all attribute names are lower-cased,
//     attribute values are trimmed, and the tag is re-emitted as
//     name attr="value" ... with single spaces.
//   - When an attribute name occurs more than once, the FIRST occurrence is
//     kept and later ones are dropped.
//   - Only name="value" pairs with both quotes present survive; anything
//     after an unterminated quote, and any text that is not followed by a
//     quoted value, is discarded.
void RemoveDuplicate(CString& str)
{
    str.TrimLeft();
    str.TrimRight();

    int n1 = str.Find(_T(' '));
    if(n1<=0)
        return; // no attributes: nothing to de-duplicate

    CString strnodename = str.Left(n1);
    strnodename.MakeLower();

    // Everything after the tag name.
    CString strfake = str.Mid(n1);
    strfake.TrimLeft();
    strfake.TrimRight();

    CSimpleMap<CString,CString> attributes;
    CString strattr,strvalue;
    int m0 = 0; // start of the current "name=" chunk
    for(;;)
    {
        int m1 = strfake.Find(_T('\"'),m0);   // opening quote
        if(m1<0)
            break;
        int m2 = strfake.Find(_T('\"'),m1+1); // closing quote
        if(m2<0)
            break; // unterminated value: stop instead of rescanning from 0 forever

        // Take the whole chunk in front of the quote and strip '=' and
        // whitespace from it, so a missing '=' does not eat the last
        // character of the name.
        strattr = strfake.Mid(m0,m1-m0);
        strattr.Remove(_T('='));
        strattr.MakeLower();
        strattr.TrimLeft();
        strattr.TrimRight();

        strvalue = strfake.Mid(m1+1,m2-m1-1);
        strvalue.TrimLeft();
        strvalue.TrimRight();

        // A value without a name cannot be written back as valid markup,
        // so it is skipped. For real names, the first occurrence wins.
        if(!strattr.IsEmpty() && attributes.FindKey(strattr)<0)
            attributes.Add(strattr,strvalue);

        m0 = m2+1;
    }

    // Re-emit the tag from the collected pairs, in first-seen order.
    str = strnodename;
    int size = attributes.GetSize();
    for(int i=0;i<size;i++)
    {
        str += _T(" ");
        str += attributes.GetKeyAt(i);
        str += _T("=\"");
        str += attributes.GetValueAt(i);
        str += _T("\"");
    }
    return;
}
As you can see above, all attribute-value pairs are recomposed using CSimpleMap. When a duplicate attribute is encountered, the first occurrence is kept and the later ones are discarded. In addition, in strings such as <a b="blah"c="interesting value">, a space is inserted between "blah" and c; without it, the document cannot be loaded successfully.
This code has been tested under Windows XP with VS.NET 2002, WTL 7.1, and ATL 7.0. Any comments are appreciated.