|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Announcements
Chapters
Services
Feature Zones
|
Introduction
The articles on our web site are mainly in HTML 4.0; however, many of them don't conform to the W3C standard. There are a lot of bad tags in these articles, and I wanted to convert these files to XHTML files in order to conform to the W3C standard. Sometimes I also want to extract some information from web pages. If the web page is in XHTML format, I can get the information more easily, since I can use an XML document object to parse the file.
Background
There are several tools that can convert HTML to XHTML. Dreamweaver is able to convert a file by using the File-->Convert menu. But there are some issues with Dreamweaver: it's not free, and sometimes Dreamweaver is not able to fix some errors. Alternatively, you can use a well-known free tool called "HTML Tidy". However, HTML Tidy can process only some languages. This article is based on HTML Tidy. Since XHTML 2.0 is not compatible with HTML and XHTML 1.0, it's not universally used. For example, the default schema for a .NET web application is XHTML 1.0. In this article, XHTML refers to the XHTML 1.0 Transitional format.
Step I. Convert HTML file to UTF-8 format
In order to process all languages, we first have to convert the file to UTF-8 format. (Note: If the source file is already in UTF-8 format, you can skip this step.) We can use the following helper methods to read the file and decode it. Here we assume that the HTML file is encoded with the default encoding of the operating system.
/// <summary>
/// Reads the entire content of a file as a byte array.
/// </summary>
/// <param name="strFilePath">source file path</param>
/// <returns>the file content as a byte array</returns>
public static byte[] ReadFileAsBytes(String strFilePath)
{
    // FileShare.ReadWrite allows the file to be read while another handle
    // still has it open for reading or writing.
    // The using statement closes the stream on every exit path, including
    // exceptions thrown while allocating the buffer or reading.
    using (System.IO.FileStream fs = new System.IO.FileStream(strFilePath,
        System.IO.FileMode.Open, System.IO.FileAccess.Read,
        System.IO.FileShare.ReadWrite))
    {
        byte[] baResult = new byte[fs.Length];
        int nFilled = 0;
        // A single Read call is allowed to return fewer bytes than requested,
        // so keep reading until the buffer is full or the stream ends.
        while (nFilled < baResult.Length)
        {
            int nRead = fs.Read(baResult, nFilled, baResult.Length - nFilled);
            if (nRead == 0)
            {
                // End of stream before the expected length was reached
                // (the file was truncated after its length was queried).
                break;
            }
            nFilled += nRead;
        }
        if (nFilled < baResult.Length)
        {
            // Return only the bytes that were actually read rather than a
            // buffer padded with zeros.
            System.Array.Resize(ref baResult, nFilled);
        }
        return baResult;
    }
}
/// <summary>
/// Decodes a byte array into text using the default encoding
/// (the encoding returned for code page 0).
/// </summary>
/// <param name="bData">the bytes to decode</param>
/// <returns>the decoded text</returns>
public static String BytesToString(byte[] bData)
{
    System.Text.Encoding encDefault = System.Text.Encoding.GetEncoding(0);
    String strDecoded = encDefault.GetString(bData);
    return strDecoded;
}
Step II. Convert file to XHTML
We use HTML Tidy to convert HTML files to XHTML files. Tidy has lots of parameters. If you want to know the details, you can read the manual. If you want to convert a UTF-8 HTML file to an XHTML file, you can use it like this:
tidy.exe -raw -utf8 -asxhtml -i -f logfilename -o outputfilename inputfilename
By using the System.Diagnostics.Process class, we can run Tidy from our own code:
/// <summary>
/// This method converts HTML content to XHTML by running HTML Tidy.
/// </summary>
/// <param name="strOriginalContent">input HTML content</param>
/// <param name="strOutputPath">working directory for tidy.exe and the temporary
/// files; if this parameter is null, the temp path of the system is used</param>
/// <returns>converted XHTML content</returns>
public static String HTML2XHTML(String strOriginalContent,String strOutputPath)
{
    // Working directory: the caller-supplied path, or the system temp path
    // when null is passed.
    String strTempPath = strOutputPath != null ? strOutputPath :
        System.IO.Path.GetTempPath();

    // Path.Combine inserts a directory separator when strTempPath lacks a
    // trailing one; plain concatenation would yield e.g. "C:\worktidy.exe".
    String strFileName = System.IO.Path.Combine(strTempPath, "tidy.exe");

    // Extract the tidy executable from the project resources on first use.
    if (!System.IO.File.Exists(strFileName))
    {
        ChinaCars.Util.SysUtil.WriteFile(strFileName,
            ChinaCars.Util.App_GlobalResources.Resource.tidy);
    }

    // A fresh GUID gives this call its own input/output/log file names.
    String strMainFileName = System.Guid.NewGuid().ToString("N");
    String strInFileName =
        System.IO.Path.Combine(strTempPath, strMainFileName + ".in");
    String strOutFileName =
        System.IO.Path.Combine(strTempPath, strMainFileName + ".out");
    String strErrorFileName =
        System.IO.Path.Combine(strTempPath, strMainFileName + ".log");

    System.Diagnostics.ProcessStartInfo psiInfo =
        new System.Diagnostics.ProcessStartInfo();
    psiInfo.FileName = strFileName;
    psiInfo.CreateNoWindow = true;
    psiInfo.WindowStyle = System.Diagnostics.ProcessWindowStyle.Hidden;
    psiInfo.WorkingDirectory = strTempPath;
    // The file names are passed relative to WorkingDirectory.
    psiInfo.Arguments = String.Format(
        " -raw -utf8 -asxhtml -i -f {0}.log -o {0}.out {0}.in",
        strMainFileName);

    try
    {
        // A .NET string is already Unicode, so encode it straight to UTF-8.
        // Going through the system default code page first would replace
        // every character that code page cannot represent.
        byte[] baUTF8Data =
            System.Text.Encoding.UTF8.GetBytes(strOriginalContent);
        ChinaCars.Util.SysUtil.WriteFile(strInFileName, baUTF8Data);

        using (System.Diagnostics.Process proc =
            System.Diagnostics.Process.Start(psiInfo))
        {
            proc.WaitForExit();
        }

        // The output file is read as UTF-8 (tidy was started with -utf8),
        // decoded directly for the same reason as above.
        byte[] baResult =
            ChinaCars.Util.SysUtil.ReadFileAsBytes(strOutFileName);
        String strContent = System.Text.Encoding.UTF8.GetString(baResult);

        // We need a head for xhtml processing
        strContent = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">" + strContent;
        return strContent;
    }
    finally
    {
        // Remove the temporary files even when writing the input, running
        // tidy, or reading the output fails. File.Delete does not throw for
        // a file that does not exist.
        System.IO.File.Delete(strInFileName);
        System.IO.File.Delete(strErrorFileName);
        System.IO.File.Delete(strOutFileName);
    }
}
Step III. Developing your own XHTML resolver
Now you may use System.Xml.XmlDocument to load the XHTML. However, the DOCTYPE header in the XHTML tells the .NET XML parser to load the corresponding file resources from the World Wide Web Consortium (W3C), and this may take several or more rounds! Fortunately, the .NET Framework allows us to resolve XML files by ourselves. By overriding XmlResolver, we can supply these files ourselves:
/// <summary>
/// An <see cref="XmlResolver"/> that supplies the W3C XHTML 1.0 / 1.1 DTDs,
/// entity sets and modules from project resources, and falls back to
/// <see cref="XmlUrlResolver"/> for every other URI.
/// </summary>
public class XHTMLResolver : XmlResolver
{
    /// <summary>
    /// Credentials are not used by this resolver; the setter ignores the value.
    /// </summary>
    override public ICredentials Credentials
    {
        set { }
    }

    public XHTMLResolver()
    {
    }

    /// <summary>
    /// Maps the XHTML public identifiers to the URI of the matching DTD;
    /// any other value is resolved by the base class.
    /// </summary>
    /// <param name="baseUri">base URI used to resolve a relative URI</param>
    /// <param name="relativeUri">URI, or XHTML public identifier, to resolve</param>
    /// <returns>the resolved absolute URI</returns>
    public override Uri ResolveUri(Uri baseUri, String relativeUri)
    {
        // Identifiers are compared ordinally so the result does not depend on
        // the current culture. Each XHTML 1.0 flavour has its own public ID.
        if (String.Equals(relativeUri,
            "-//W3C//DTD XHTML 1.0 Transitional//EN",
            StringComparison.OrdinalIgnoreCase))
        {
            return new Uri("http://www.w3.org/tr/xhtml1/DTD/xhtml1-transitional.dtd");
        }
        else if (String.Equals(relativeUri,
            "-//W3C//DTD XHTML 1.0 Strict//EN",
            StringComparison.OrdinalIgnoreCase))
        {
            return new Uri("http://www.w3.org/tr/xhtml1/DTD/xhtml1-strict.dtd");
        }
        else if (String.Equals(relativeUri,
            "-//W3C//DTD XHTML 1.0 Frameset//EN",
            StringComparison.OrdinalIgnoreCase))
        {
            return new Uri("http://www.w3.org/tr/xhtml1/DTD/xhtml1-frameset.dtd");
        }
        else if (String.Equals(relativeUri,
            "-//W3C//DTD XHTML 1.1//EN",
            StringComparison.OrdinalIgnoreCase))
        {
            return new Uri("http://www.w3.org/tr/xhtml11/DTD/xhtml11.dtd");
        }
        return base.ResolveUri(baseUri, relativeUri);
    }

    /// <summary>
    /// Returns a <see cref="MemoryStream"/> over the matching project resource
    /// when the URI is one of the known XHTML files; otherwise delegates to a
    /// new <see cref="XmlUrlResolver"/>.
    /// </summary>
    /// <param name="absoluteUri">absolute URI of the entity to load</param>
    /// <param name="role">passed through to the fallback resolver</param>
    /// <param name="ofObjectToReturn">passed through to the fallback resolver;
    /// not consulted for the known XHTML files</param>
    /// <returns>a stream for known URIs, or the fallback resolver's result</returns>
    override public object GetEntity(Uri absoluteUri, string role,
        Type ofObjectToReturn)
    {
        Object entityObj = null;
        String strURI = absoluteUri.AbsoluteUri;
        System.IO.MemoryStream msStream = null;
        // The case labels are all lower case, so normalise the URI with the
        // invariant culture before matching.
        switch (strURI.ToLowerInvariant())
        {
            // XHTML 1.0 DTDs and entity sets
            case "http://www.w3.org/tr/xhtml1/dtd/xhtml1-transitional.dtd":
                msStream = new MemoryStream(Resource.xhtml1_transitional);
                break;
            case "http://www.w3.org/tr/xhtml1/dtd/xhtml1.dcl":
                msStream = new MemoryStream(Resource.xhtml1);
                break;
            case "http://www.w3.org/tr/xhtml1/dtd/xhtml-lat1.ent":
                msStream = new MemoryStream(Resource.xhtml_lat1);
                break;
            case "http://www.w3.org/tr/xhtml1/dtd/xhtml-special.ent":
                msStream = new MemoryStream(Resource.xhtml_special);
                break;
            case "http://www.w3.org/tr/xhtml1/dtd/xhtml-symbol.ent":
                msStream = new MemoryStream(Resource.xhtml_symbol);
                break;
            case "http://www.w3.org/tr/xhtml1/dtd/xhtml1-strict.dtd":
                msStream = new MemoryStream(Resource.xhtml1_strict);
                break;
            case "http://www.w3.org/tr/xhtml1/dtd/xhtml1-frameset.dtd":
                msStream = new MemoryStream(Resource.xhtml1_frameset);
                break;

            // XHTML 1.1 DTD and modules
            case "http://www.w3.org/tr/xhtml11/dtd/xhtml11.dtd":
                msStream = new MemoryStream(Resource.xhtml11);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-inlstyle-1.mod":
                msStream = new MemoryStream(Resource.xhtml_inlstyle_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-framework-1.mod":
                msStream = new MemoryStream(Resource.xhtml_framework_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-datatypes-1.mod":
                msStream = new MemoryStream(Resource.xhtml_datatypes_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-qname-1.mod":
                msStream = new MemoryStream(Resource.xhtml_qname_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-events-1.mod":
                msStream = new MemoryStream(Resource.xhtml_events_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-attribs-1.mod":
                msStream = new MemoryStream(Resource.xhtml_attribs_1);
                break;
            case "http://www.w3.org/tr/xhtml11/dtd/xhtml11-model-1.mod":
                msStream = new MemoryStream(Resource.xhtml11_model_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-charent-1.mod":
                msStream = new MemoryStream(Resource.xhtml_charent_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-lat1.ent":
                msStream = new MemoryStream(Resource.xhtml_lat11);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-symbol.ent":
                msStream = new MemoryStream(Resource.xhtml_symbol11);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-special.ent":
                msStream = new MemoryStream(Resource.xhtml_special11);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-text-1.mod":
                msStream = new MemoryStream(Resource.xhtml_text_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-inlstruct-1.mod":
                msStream = new MemoryStream(Resource.xhtml_inlstruct_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-inlphras-1.mod":
                msStream = new MemoryStream(Resource.xhtml_inlphras_1);
                break;
            case "http://www.w3.org/tr/ruby/xhtml-ruby-1.mod":
                msStream = new MemoryStream(Resource.xhtml_ruby_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-blkstruct-1.mod":
                msStream = new MemoryStream(Resource.xhtml_blkstruct_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-blkphras-1.mod":
                msStream = new MemoryStream(Resource.xhtml_blkphras_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-hypertext-1.mod":
                msStream = new MemoryStream(Resource.xhtml_hypertext_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-list-1.mod":
                msStream = new MemoryStream(Resource.xhtml_list_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-edit-1.mod":
                msStream = new MemoryStream(Resource.xhtml_edit_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-bdo-1.mod":
                msStream = new MemoryStream(Resource.xhtml_bdo_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-pres-1.mod":
                msStream = new MemoryStream(Resource.xhtml_pres_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-inlpres-1.mod":
                msStream = new MemoryStream(Resource.xhtml_inlpres_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-blkpres-1.mod":
                msStream = new MemoryStream(Resource.xhtml_blkpres_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-link-1.mod":
                msStream = new MemoryStream(Resource.xhtml_link_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-meta-1.mod":
                msStream = new MemoryStream(Resource.xhtml_meta_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-base-1.mod":
                msStream = new MemoryStream(Resource.xhtml_base_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-script-1.mod":
                msStream = new MemoryStream(Resource.xhtml_script_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-style-1.mod":
                msStream = new MemoryStream(Resource.xhtml_style_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-image-1.mod":
                msStream = new MemoryStream(Resource.xhtml_image_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-csismap-1.mod":
                msStream = new MemoryStream(Resource.xhtml_csismap_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-ssismap-1.mod":
                msStream = new MemoryStream(Resource.xhtml_ssismap_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-param-1.mod":
                msStream = new MemoryStream(Resource.xhtml_param_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-object-1.mod":
                msStream = new MemoryStream(Resource.xhtml_object_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-table-1.mod":
                msStream = new MemoryStream(Resource.xhtml_table_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-form-1.mod":
                msStream = new MemoryStream(Resource.xhtml_form_1);
                break;
            case "http://www.w3.org/tr/xhtml-modularization/dtd/xhtml-struct-1.mod":
                msStream = new MemoryStream(Resource.xhtml_struct_1);
                break;
        }
        if (msStream != null)
        {
            entityObj = msStream;
        }
        else
        {
            // Unknown URI: let the standard resolver fetch it.
            XmlUrlResolver xur = new XmlUrlResolver();
            entityObj = xur.GetEntity(absoluteUri, role, ofObjectToReturn);
        }
        return entityObj;
    }
}
Using the code
By using the HTML2XHTML method, you can convert an HTML file to an XHTML file.
System.Net.WebClient webClient = new System.Net.WebClient();
String strHTMLContent = webClient.DownloadString("http://www.codeproject.com");
// Passing null as the second argument makes HTML2XHTML work in the system temp path.
String strXHTMLContent = ChinaCars.Util.XMLUtil.HTML2XHTML(strHTMLContent, null);
By using the XHTMLResolver, you can resolve the XHTML file as XML very quickly.
System.Xml.XmlDocument xmlDoc=new System.Xml.XmlDocument();
xmlDoc.XmlResolver =new ChinaCars.Util.XHTMLResolver();
xmlDoc.LoadXml(xmlContent);
History
Mar 12th, 2007: Published the first version.
|
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||