Click here to Skip to main content
14,971,097 members
Please Sign up or sign in to vote.
3.17/5 (6 votes)
See more:
C#
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Xml.Linq;
using System.Text.RegularExpressions;
using System.Data;

/// <summary>
/// Page that downloads a remote HTML report, extracts the body of its first table,
/// converts it to a <see cref="DataSet"/> and binds the result to <c>GridView1</c>.
/// </summary>
/// <remarks>
/// The HTML is parsed with regular expressions, so nested tables and malformed markup
/// (for example cells without a closing tag) are not supported.
/// </remarks>
public partial class _Default : System.Web.UI.Page
{
    // Singleline lets "." span line breaks, which is required because table markup
    // is normally spread over many lines.
    private const RegexOptions HtmlOptions = RegexOptions.Singleline | RegexOptions.IgnoreCase;

    // "\b" after the tag name keeps "<th" from matching "<thead>" and similar look-alikes.
    private static readonly Regex TableBodyRegex = new Regex(@"<tbody\b[^>]*>(.*?)</tbody>", HtmlOptions);
    private static readonly Regex TableRegex = new Regex(@"<table\b[^>]*>(.*?)</table>", HtmlOptions);
    private static readonly Regex HeaderRegex = new Regex(@"<th\b[^>]*>(.*?)</th>", HtmlOptions);
    private static readonly Regex RowRegex = new Regex(@"<tr\b[^>]*>(.*?)</tr>", HtmlOptions);
    private static readonly Regex CellRegex = new Regex(@"<td\b[^>]*>(.*?)</td>", HtmlOptions);

    /// <summary>
    /// Downloads the report page and binds its table to the grid on every page load.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        GetHtmlPage("http://iaspub.epa.gov/triexplorer/release_chem?p_view=USCH&trilib=TRIQ1&sort=_VIEW_&sort_fmt=1&state=All+states&county=All+counties&chemical=All+chemicals&industry=ALL&year=2010&tab_rpt=1&fld=RELLBY&fld=TSFDSP");
    }

    /// <summary>
    /// Downloads the page at <paramref name="strURL"/>, parses the contents of its first
    /// <c>&lt;tbody&gt;</c> element as a table and binds it to the grid.
    /// </summary>
    /// <param name="strURL">Absolute URL of the HTML page to download.</param>
    /// <returns>The complete, unmodified HTML of the downloaded page.</returns>
    private string GetHtmlPage(string strURL)
    {
        string strResult;
        WebRequest objRequest = WebRequest.Create(strURL);

        // Dispose the response as well as the reader so the connection is released
        // even if reading fails.
        using (WebResponse objResponse = objRequest.GetResponse())
        using (Stream responseStream = objResponse.GetResponseStream())
        using (StreamReader sr = new StreamReader(responseStream))
        {
            strResult = sr.ReadToEnd();
        }

        // Use the first <tbody> when there is one; otherwise fall back to scanning the
        // whole page instead of failing with an index-out-of-range error.
        Match body = TableBodyRegex.Match(strResult);
        string tableHtml = body.Success
            ? "<table>" + body.Groups[1].Value + "</table>"
            : strResult;

        ConvertHTMLTablesToDataSet(tableHtml);

        return strResult;
    }

    /// <summary>
    /// Converts every <c>&lt;table&gt;</c> element found in <paramref name="HTML"/> into a
    /// <see cref="DataTable"/> and binds the resulting <see cref="DataSet"/> to <c>GridView1</c>.
    /// </summary>
    /// <param name="HTML">HTML fragment to scan; <c>null</c> is treated as an empty string.</param>
    /// <returns>A data set holding one table per matched HTML table, in document order.</returns>
    private DataSet ConvertHTMLTablesToDataSet(string HTML)
    {
        DataSet ds = new DataSet();

        foreach (Match table in TableRegex.Matches(HTML ?? string.Empty))
        {
            DataTable dt = new DataTable();
            MatchCollection rows = RowRegex.Matches(table.Value);
            MatchCollection headers = HeaderRegex.Matches(table.Value);
            bool headersExist = headers.Count > 0;

            if (headersExist)
            {
                // Name the columns after the header cells.
                foreach (Match header in headers)
                {
                    AddColumn(dt, header.Groups[1].Value);
                }
            }
            else if (rows.Count > 0)
            {
                // No headers: size the table from the first row and use default names.
                int cellCount = CellRegex.Matches(rows[0].Value).Count;
                for (int column = 1; column <= cellCount; column++)
                {
                    dt.Columns.Add("Column " + column);
                }
            }

            int rowIndex = 0;
            foreach (Match row in rows)
            {
                // When headers exist the first row is the header row and carries no data.
                if (!(rowIndex == 0 && headersExist))
                {
                    MatchCollection cells = CellRegex.Matches(row.Value);

                    // A row may be wider than the header/first row; grow the table
                    // rather than throwing when the extra cell is assigned.
                    while (dt.Columns.Count < cells.Count)
                    {
                        AddColumn(dt, "Column " + (dt.Columns.Count + 1));
                    }

                    DataRow dr = dt.NewRow();
                    for (int cell = 0; cell < cells.Count; cell++)
                    {
                        dr[cell] = cells[cell].Groups[1].Value;
                    }

                    dt.Rows.Add(dr);
                }

                rowIndex++;
            }

            ds.Tables.Add(dt);
        }

        GridView1.DataSource = ds;
        GridView1.DataBind();
        return ds;
    }

    /// <summary>
    /// Adds a column to <paramref name="table"/>, appending a numeric suffix when the
    /// requested name is already taken so that repeated header texts do not throw.
    /// </summary>
    private static void AddColumn(DataTable table, string name)
    {
        string candidate = name;
        int suffix = 2;

        // An empty name is passed through unchanged; DataColumnCollection then assigns
        // its own default name.
        while (!string.IsNullOrEmpty(candidate) && table.Columns.Contains(candidate))
        {
            candidate = name + " (" + suffix + ")";
            suffix++;
        }

        table.Columns.Add(candidate);
    }
}
Posted
Updated 6-Apr-16 22:16pm
v2
Comments
Prasad_Kulkarni 27-Aug-12 6:24am
   
..what to do with this code??

where's your question.?
Sandeep Mewara 27-Aug-12 7:08am
   
And the issue is?
ridoy 27-Aug-12 7:22am
   
where is your question?!
Mac12334 27-Aug-12 8:44am
   
Please explain briefly your question.

Instead of doing all that HTML parsing by hand, you might want to try the Html Agility Pack to do the brunt of the work for you.

Regards,

—Manfred
   
Comments
fjdiewornncalwe 27-Aug-12 10:01am
   
+5. One of the better solutions.
Manfred Rudolf Bihy 28-Aug-12 7:57am
   
Thanks Marcus!
Of course it's one of the better ones, since there is only one solution at all! :)
fjdiewornncalwe 28-Aug-12 9:58am
   
I see where you've gotten that from. Thankfully I can assume that you're joking even without the smiley at the end. To clarify, I have seen too many HTML parsing home grown solutions that do a bit here and a bit there, but I haven't found something that works quite as well as the Agility Pack as of yet. Cheers.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Xml.Linq;
namespace TGetWeb
{
    /// <summary>
    /// Form that downloads an HTML page, converts the body of its first table into a
    /// <see cref="DataTable"/> and shows it in <c>GridView1</c> together with a row count.
    /// </summary>
    /// <remarks>
    /// The HTML is parsed with regular expressions, so nested tables and malformed markup
    /// (for example cells without a closing tag) are not supported.
    /// </remarks>
    public partial class Form1 : Form
    {
        // Singleline lets "." span line breaks, which is required because table markup
        // is normally spread over many lines.
        private const RegexOptions HtmlOptions = RegexOptions.Singleline | RegexOptions.IgnoreCase;

        // "\b" after the tag name keeps "<th" from matching "<thead>" and similar look-alikes.
        private static readonly Regex TableBodyRegex = new Regex(@"<tbody\b[^>]*>(.*?)</tbody>", HtmlOptions);
        private static readonly Regex TableRegex = new Regex(@"<table\b[^>]*>(.*?)</table>", HtmlOptions);
        private static readonly Regex HeaderRegex = new Regex(@"<th\b[^>]*>(.*?)</th>", HtmlOptions);
        private static readonly Regex RowRegex = new Regex(@"<tr\b[^>]*>(.*?)</tr>", HtmlOptions);
        private static readonly Regex CellRegex = new Regex(@"<td\b[^>]*>(.*?)</td>", HtmlOptions);

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads and displays the test page when the form is loaded.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            GetHtmlPage("http://localhost/t/z3.htm");
        }

        /// <summary>
        /// Downloads the page at <paramref name="strURL"/>, parses the contents of its first
        /// <c>&lt;tbody&gt;</c> element as a table and displays it.
        /// </summary>
        /// <param name="strURL">Absolute URL of the HTML page to download.</param>
        /// <returns>The complete, unmodified HTML of the downloaded page.</returns>
        private string GetHtmlPage(string strURL)
        {
            string strResult;
            WebRequest objRequest = WebRequest.Create(strURL);

            // Dispose the response as well as the reader so the connection is released
            // even if reading fails.
            using (WebResponse objResponse = objRequest.GetResponse())
            using (Stream responseStream = objResponse.GetResponseStream())
            using (StreamReader sr = new StreamReader(responseStream))
            {
                strResult = sr.ReadToEnd();
            }

            // Use the first <tbody> when there is one; otherwise fall back to scanning the
            // whole page instead of failing with an index-out-of-range error.
            Match body = TableBodyRegex.Match(strResult);
            string tableHtml = body.Success
                ? "<table>" + body.Groups[1].Value + "</table>"
                : strResult;

            ConvertHTMLTablesToDataSet(tableHtml);

            return strResult;
        }

        /// <summary>
        /// Converts every <c>&lt;table&gt;</c> element found in <paramref name="HTML"/> into a
        /// <see cref="DataTable"/>, shows the last parsed table in <c>GridView1</c> and writes
        /// its row count to <c>lbRowCount</c>.
        /// </summary>
        /// <param name="HTML">HTML fragment to scan; <c>null</c> is treated as an empty string.</param>
        /// <returns>A data set holding one table per matched HTML table, in document order.</returns>
        private DataSet ConvertHTMLTablesToDataSet(string HTML)
        {
            DataSet ds = new DataSet();
            DataTable lastTable = null;

            foreach (Match table in TableRegex.Matches(HTML ?? string.Empty))
            {
                DataTable dt = new DataTable();
                MatchCollection rows = RowRegex.Matches(table.Value);
                MatchCollection headers = HeaderRegex.Matches(table.Value);
                bool headersExist = headers.Count > 0;

                if (headersExist)
                {
                    // Name the columns after the header cells.
                    foreach (Match header in headers)
                    {
                        AddColumn(dt, header.Groups[1].Value);
                    }
                }
                else if (rows.Count > 0)
                {
                    // No headers: size the table from the first row and use default names.
                    int cellCount = CellRegex.Matches(rows[0].Value).Count;
                    for (int column = 1; column <= cellCount; column++)
                    {
                        dt.Columns.Add("Column " + column);
                    }
                }

                int rowIndex = 0;
                foreach (Match row in rows)
                {
                    // When headers exist the first row is the header row and carries no data.
                    if (!(rowIndex == 0 && headersExist))
                    {
                        MatchCollection cells = CellRegex.Matches(row.Value);

                        // A row may be wider than the header/first row; grow the table
                        // rather than throwing when the extra cell is assigned.
                        while (dt.Columns.Count < cells.Count)
                        {
                            AddColumn(dt, "Column " + (dt.Columns.Count + 1));
                        }

                        DataRow dr = dt.NewRow();
                        for (int cell = 0; cell < cells.Count; cell++)
                        {
                            dr[cell] = cells[cell].Groups[1].Value;
                        }

                        dt.Rows.Add(dr);
                    }

                    rowIndex++;
                }

                ds.Tables.Add(dt);
                lastTable = dt;
            }

            // lastTable stays null when the HTML contained no table; report zero rows
            // instead of dereferencing it.
            GridView1.DataSource = lastTable;
            lbRowCount.Text = (lastTable == null ? 0 : lastTable.Rows.Count).ToString();
            return ds;
        }

        /// <summary>
        /// Adds a column to <paramref name="table"/>, appending a numeric suffix when the
        /// requested name is already taken so that repeated header texts do not throw.
        /// </summary>
        private static void AddColumn(DataTable table, string name)
        {
            string candidate = name;
            int suffix = 2;

            // An empty name is passed through unchanged; DataColumnCollection then assigns
            // its own default name.
            while (!string.IsNullOrEmpty(candidate) && table.Columns.Contains(candidate))
            {
                candidate = name + " (" + suffix + ")";
                suffix++;
            }

            table.Columns.Add(candidate);
        }
    }
}
   

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)




CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900