|
// -------------------------------------------------------
// PowerPointCreator by ElmüSoft
// www.netcult.ch/elmue
// -------------------------------------------------------
using System;
using System.Collections;
using System.Text;
namespace PowerPointCreator
{
/// <summary>
/// This class parses XML and HTML code and corrects errors
/// </summary>
public class HtmlParser
{
/// <summary>
/// Holds the parts of one parsed tag: the opening text, the attributes
/// and the closing text. ToString() glues them together again.
/// </summary>
private class HtmlNode
{
    // Start of the tag including its name, e.g. "<Font" or "</Font" or "<?XML"
    public string ms_Open = "";

    // Characters which terminate the tag, e.g. ">" or "/>" or "?>"
    public string ms_Close = "";

    // Attribute name --> attribute value (enumeration order is defined by the Hashtable)
    public Hashtable mi_Attributes = new Hashtable();

    /// <summary>
    /// Appends the tag to s_Html: the opening text, then every attribute
    /// as  name="value"  with a leading space, then the closing text.
    /// </summary>
    public void ToString(StringBuilder s_Html)
    {
        s_Html.Append(ms_Open);

        foreach (string s_Name in mi_Attributes.Keys)
        {
            object o_Value = mi_Attributes[s_Name];

            // The value is always wrapped in quotation marks because XML requires them
            s_Html.Append(' ')
                  .Append(s_Name)
                  .Append("=\"")
                  .Append(o_Value)
                  .Append('"');
        }

        s_Html.Append(ms_Close);
    }
}
/// <summary>
/// HTML --> XML
/// This function avoids errors when loading HTML code into the XML parser.
/// XML uses a much more strict syntax than HTML:
/// A single &lt;BR&gt; is not possible in XML. All tags must be closed, so it becomes &lt;BR /&gt;.
/// A &lt;P&gt; without a following &lt;/P&gt; causes an XML syntax error exception,
/// so &lt;P&gt; becomes &lt;BR /&gt;&lt;BR /&gt; and &lt;/P&gt; is removed.
/// The entity &amp;nbsp; is not declared in XML, so it becomes the plain text "#nbsp;".
/// &lt;FONT size=1&gt; must be replaced with &lt;FONT size="1"&gt; (done by re-writing every tag).
/// </summary>
/// <param name="s_Html">The HTML code to be corrected</param>
/// <returns>
/// The corrected code. One space character is appended to the input before parsing
/// and is normally still present at the end of the returned string.
/// </returns>
public static string HtmlToXml(string s_Html)
{
    // NOTE(review): Functions.Replace is defined elsewhere; presumably a plain
    // (maybe case insensitive) text replacement, so tags like <P align=left>
    // would not be matched here -- confirm against its implementation.
    s_Html = Functions.Replace(s_Html, "<BR>", "<BR />");
    s_Html = Functions.Replace(s_Html, "</P>", "");
    s_Html = Functions.Replace(s_Html, "<P>",  "<BR /><BR />");

    // This is required to avoid the "Undeclared entity 'nbsp'" exception of the XML parser.
    // See http://blogs.pingpoet.com/overflow/archive/2005/07/20/6607.aspx
    // Only the entity is replaced. Replacing ordinary blanks would also destroy
    // the blanks which separate a tag name from its attributes.
    s_Html = Functions.Replace(s_Html, "&nbsp;", "#nbsp;");

    // Sentinel: GetNextTag() reads single characters behind the current position without length check
    s_Html += " ";

    int s32_Pos = 0;
    StringBuilder s_Out = new StringBuilder(s_Html.Length * 2);

    while (true)
    {
        // find start of next tag
        int s32_Open = s_Html.IndexOf('<', s32_Pos);
        if (s32_Open < 0)
        {
            // No more tags: append the rest of the html code
            s_Out.Append(s_Html, s32_Pos, s_Html.Length - s32_Pos);
            break;
        }

        // copy the text which is contained between neighboured tags
        if (s32_Open > s32_Pos)
        {
            s_Out.Append(s_Html, s32_Pos, s32_Open - s32_Pos);
        }

        // read the tag and all its arguments (e.g. Color=red)
        // and write it back with quoted attribute values
        s32_Pos = s32_Open;
        HtmlNode i_Node = GetNextTag(ref s_Html, ref s32_Pos);
        i_Node.ToString(s_Out);
    }

    return s_Out.ToString();
}
/// <summary>
/// Retrieves the next Html tag
/// Start position must be on the "&lt;" character of the tag
/// returns with position set behind the "&gt;" character of the tag
/// (or at the end of the string if the tag is never closed).
/// Comments (&lt;!-- ... --&gt;) are not parsed: their whole text is stored in ms_Open.
/// </summary>
/// <param name="s_Html">
/// The code to parse. The characters directly behind the "&lt;" are read without length check,
/// so the caller has to append a trailing space (HtmlToXml does this).
/// </param>
/// <param name="s32_Pos">In: index of the "&lt;" character. Out: index of the first character behind the tag</param>
/// <returns>The node with the opening text, the attributes and the closing text of the tag</returns>
private static HtmlNode GetNextTag(ref string s_Html, ref int s32_Pos)
{
    HtmlNode i_Node = new HtmlNode();
    int s32_First = s32_Pos;
    s32_Pos++;

    // Special case comments: don't parse !!
    if (s_Html[s32_Pos] == '!' && s_Html[s32_Pos + 1] == '-' && s_Html[s32_Pos + 2] == '-')
    {
        // A comment ends with "-->" and may itself contain ">" characters,
        // so search the first ">" which is preceded by two hyphens.
        int s32_Close = s_Html.IndexOf('>', s32_Pos + 3);
        while (s32_Close >= 0 && (s_Html[s32_Close - 1] != '-' || s_Html[s32_Close - 2] != '-'))
        {
            s32_Close = s_Html.IndexOf('>', s32_Close + 1);
        }

        // Malformed comment without "-->": end it at the first ">" instead
        if (s32_Close < 0)
            s32_Close = s_Html.IndexOf('>', s32_Pos);

        // No ">" at all: the comment takes the rest of the code
        if (s32_Close < 0)
            s32_Pos = s_Html.Length;
        else
            s32_Pos = s32_Close + 1;

        i_Node.ms_Open = s_Html.Substring(s32_First, s32_Pos - s32_First);
        return i_Node;
    }

    // count valid tag name characters up to first space or ">"
    while (s32_Pos + 1 < s_Html.Length && !Char.IsWhiteSpace(s_Html[s32_Pos]) && s_Html[s32_Pos] != '>')
    {
        s32_Pos++;
    }

    int s32_Last = s32_Pos; // start of the attribute name / value which is currently read
    i_Node.ms_Open = s_Html.Substring(s32_First, s32_Last - s32_First);

    // read arguments
    bool   b_Quoted = false; // true while inside "..."
    string s_Attr   = null;  // name of the attribute whose value is expected next, otherwise null

    while (s32_Pos < s_Html.Length)
    {
        if (!b_Quoted)
        {
            char c_Cur = s_Html[s32_Pos];

            // Length of the closing text if the tag ends here: ">" or "/>" or "?>"
            int s32_End = 0;
            if (c_Cur == '>')
                s32_End = 1;
            else if ((c_Cur == '/' || c_Cur == '?') && s32_Pos + 1 < s_Html.Length && s_Html[s32_Pos + 1] == '>')
                s32_End = 2;

            bool b_NameDone  = (s_Attr == null && c_Cur == '=');                 // "name=" is complete
            bool b_ValueDone = (s_Attr != null && Char.IsWhiteSpace(c_Cur));     // unquoted or quoted value is complete

            if (s32_End > 0 || b_NameDone || b_ValueDone)
            {
                string s_Part = s_Html.Substring(s32_Last, s32_Pos - s32_Last);

                // "\t \" test \" \r\n"  --->  " test "
                s_Part = s_Part.Trim().Trim(new char[]{'\"'});

                if (s_Attr == null)
                {
                    // An attribute name (it is discarded if the tag ends before a value follows)
                    s_Attr = s_Part;
                }
                else
                {
                    i_Node.mi_Attributes[s_Attr] = s_Part;
                    s_Attr = null;
                }

                if (b_NameDone)
                {
                    // Skip the "=" and any blanks in front of the value, then examine the
                    // next character normally. Otherwise a ">" directly behind the "="
                    // would be overlooked and the parser would run into the following code.
                    s32_Pos++;
                    while (s32_Pos < s_Html.Length && Char.IsWhiteSpace(s_Html[s32_Pos]))
                    {
                        s32_Pos++;
                    }
                    s32_Last = s32_Pos;
                    continue;
                }

                s32_Last = s32_Pos;
            }

            if (s32_End > 0)
            {
                i_Node.ms_Close = s_Html.Substring(s32_Pos, s32_End);
                s32_Pos += s32_End;
                break;
            }
        }

        // Everything between two quotation marks belongs to the value, even ">" or blanks
        if (s_Html[s32_Pos] == '"')
            b_Quoted = !b_Quoted;

        s32_Pos++;
    }
    return i_Node;
}
}
}
|
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please
let us know and we'll add colourisation support for it.