Click here to Skip to main content
15,034,292 members
Please Sign up or sign in to vote.
3.17/5 (6 votes)
See more:
C#
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Xml.Linq;
using System.Text.RegularExpressions;
using System.Data;

/// <summary>
/// Web page that downloads a remote HTML report, cuts out the content of its
/// first tbody section and shows the rows of that section in GridView1.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    // Singleline lets '.' cross line breaks inside a cell; IgnoreCase accepts
    // upper-case markup such as <TABLE> or <TD>.
    private const RegexOptions HtmlOptions =
        RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase;

    // Every pattern captures the text between an opening tag and its matching
    // closing tag. The \b after the tag name keeps "<th" from matching "<thead".
    // Elements whose closing tag is omitted are not recognised.
    private const string TableExpression = @"<table\b[^>]*>(.*?)</table>";
    private const string HeaderExpression = @"<th\b[^>]*>(.*?)</th>";
    private const string RowExpression = @"<tr\b[^>]*>(.*?)</tr>";
    private const string ColumnExpression = @"<td\b[^>]*>(.*?)</td>";

    /// <summary>
    /// Fetches the report page and binds its table to the grid on every page load.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        GetHtmlPage("http://iaspub.epa.gov/triexplorer/release_chem?p_view=USCH&trilib=TRIQ1&sort=_VIEW_&sort_fmt=1&state=All+states&county=All+counties&chemical=All+chemicals&industry=ALL&year=2010&tab_rpt=1&fld=RELLBY&fld=TSFDSP");
    }

    /// <summary>
    /// Downloads the document at <paramref name="strURL"/>, converts the content of
    /// its tbody section into a DataSet (which also binds it to GridView1) and
    /// returns the complete, unmodified response text.
    /// </summary>
    /// <param name="strURL">Absolute URL of the HTML page to download.</param>
    /// <returns>The full response body as text.</returns>
    private string GetHtmlPage(string strURL)
    {
        string strResult;
        WebRequest objRequest = WebRequest.Create(strURL);

        // Dispose the response as well as the reader so the connection is released.
        using (WebResponse objResponse = objRequest.GetResponse())
        using (StreamReader sr = new StreamReader(objResponse.GetResponseStream()))
        {
            strResult = sr.ReadToEnd();
        }

        string[] values = strResult.Split(new string[] { "<tbody>", "</tbody>" }, StringSplitOptions.RemoveEmptyEntries);

        // values[1] is taken to be the tbody content (the text after the first
        // "<tbody>" marker). A page without that marker yields a single element,
        // in which case there is nothing to convert.
        if (values.Length > 1)
        {
            ConvertHTMLTablesToDataSet("<table>" + values[1] + "</table>");
        }

        return strResult;
    }

    /// <summary>
    /// Converts every table element found in <paramref name="HTML"/> into a
    /// DataTable, binds the resulting DataSet to GridView1 and returns it.
    /// </summary>
    /// <param name="HTML">HTML fragment containing zero or more table elements.</param>
    /// <returns>A DataSet holding one DataTable per matched table element.</returns>
    private DataSet ConvertHTMLTablesToDataSet(string HTML)
    {
        DataSet ds = new DataSet();

        // Regex.Matches rejects a null input, so treat null/empty as "no tables".
        if (!string.IsNullOrEmpty(HTML))
        {
            foreach (Match Table in Regex.Matches(HTML, TableExpression, HtmlOptions))
            {
                ds.Tables.Add(BuildTable(Table.Groups[1].Value));
            }
        }

        GridView1.DataSource = ds;
        GridView1.DataBind();
        return ds;
    }

    /// <summary>
    /// Builds a DataTable from the inner markup of a single table element.
    /// Header cells (th) become column names and the first row is then treated as
    /// the header row and skipped; without header cells the columns are named
    /// "Column 1", "Column 2", ... after the cell count of the first row.
    /// </summary>
    private static DataTable BuildTable(string tableHtml)
    {
        DataTable dt = new DataTable();
        MatchCollection Rows = Regex.Matches(tableHtml, RowExpression, HtmlOptions);
        MatchCollection Headers = Regex.Matches(tableHtml, HeaderExpression, HtmlOptions);
        bool HeadersExist = Headers.Count > 0;

        if (HeadersExist)
        {
            foreach (Match Header in Headers)
            {
                AddColumn(dt, Header.Groups[1].Value);
            }
        }
        else if (Rows.Count > 0)
        {
            int firstRowCells = Regex.Matches(Rows[0].Value, ColumnExpression, HtmlOptions).Count;
            for (int iColumns = 1; iColumns <= firstRowCells; iColumns++)
            {
                AddColumn(dt, "Column " + iColumns);
            }
        }

        int iCurrentRow = 0;
        foreach (Match Row in Rows)
        {
            // Only add the row if it isn't the header row.
            if (!(iCurrentRow == 0 && HeadersExist))
            {
                MatchCollection Columns = Regex.Matches(Row.Value, ColumnExpression, HtmlOptions);

                // A row wider than the current schema would otherwise make the
                // indexer below throw; widen the table before creating the row.
                while (dt.Columns.Count < Columns.Count)
                {
                    AddColumn(dt, "Column " + (dt.Columns.Count + 1));
                }

                DataRow dr = dt.NewRow();
                for (int iCurrentColumn = 0; iCurrentColumn < Columns.Count; iCurrentColumn++)
                {
                    dr[iCurrentColumn] = Columns[iCurrentColumn].Groups[1].Value;
                }

                dt.Rows.Add(dr);
            }

            iCurrentRow++;
        }

        return dt;
    }

    /// <summary>
    /// Adds a column called <paramref name="name"/>, appending " (2)", " (3)", ...
    /// when that name is already taken so repeated header text cannot raise a
    /// DuplicateNameException. An empty name is passed through unchanged.
    /// </summary>
    private static void AddColumn(DataTable dt, string name)
    {
        string candidate = name;
        int suffix = 2;

        while (candidate.Length > 0 && dt.Columns.Contains(candidate))
        {
            candidate = name + " (" + suffix + ")";
            suffix++;
        }

        dt.Columns.Add(candidate);
    }
}
Posted
Updated 6-Apr-16 22:16pm
v2
Comments
Prasad_Kulkarni 27-Aug-12 6:24am
   
..what to do with this code??

where's your question.?
Sandeep Mewara 27-Aug-12 7:08am
   
And the issue is?
ridoy 27-Aug-12 7:22am
   
where is your question?!
Mac12334 27-Aug-12 8:44am
   
Please explain briefly your question.

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Xml.Linq;
namespace TGetWeb
{
    /// <summary>
    /// Form that downloads an HTML page when it loads, converts the content of the
    /// page's tbody section into a DataTable and shows it in GridView1 together
    /// with the row count in lbRowCount.
    /// </summary>
    public partial class Form1 : Form
    {
        // Singleline lets '.' cross line breaks inside a cell; IgnoreCase accepts
        // upper-case markup such as <TABLE> or <TD>.
        private const RegexOptions HtmlOptions =
            RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase;

        // Every pattern captures the text between an opening tag and its matching
        // closing tag. The \b after the tag name keeps "<th" from matching "<thead".
        // Elements whose closing tag is omitted are not recognised.
        private const string TableExpression = @"<table\b[^>]*>(.*?)</table>";
        private const string HeaderExpression = @"<th\b[^>]*>(.*?)</th>";
        private const string RowExpression = @"<tr\b[^>]*>(.*?)</tr>";
        private const string ColumnExpression = @"<td\b[^>]*>(.*?)</td>";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Fetches the test page and fills the grid as soon as the form is loaded.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            GetHtmlPage("http://localhost/t/z3.htm");
        }

        /// <summary>
        /// Downloads the document at <paramref name="strURL"/>, converts the content
        /// of its tbody section (which also updates the grid and the row counter)
        /// and returns the complete, unmodified response text.
        /// </summary>
        /// <param name="strURL">Absolute URL of the HTML page to download.</param>
        /// <returns>The full response body as text.</returns>
        private string GetHtmlPage(string strURL)
        {
            string strResult;
            WebRequest objRequest = WebRequest.Create(strURL);

            // Dispose the response as well as the reader so the connection is released.
            using (WebResponse objResponse = objRequest.GetResponse())
            using (StreamReader sr = new StreamReader(objResponse.GetResponseStream()))
            {
                strResult = sr.ReadToEnd();
            }

            string[] values = strResult.Split(new string[] { "<tbody>", "</tbody>" }, StringSplitOptions.RemoveEmptyEntries);

            // values[1] is taken to be the tbody content (the text after the first
            // "<tbody>" marker). A page without that marker yields a single element,
            // in which case there is nothing to convert.
            if (values.Length > 1)
            {
                ConvertHTMLTablesToDataSet("<table>" + values[1] + "</table>");
            }

            return strResult;
        }

        /// <summary>
        /// Converts every table element found in <paramref name="HTML"/> into a
        /// DataTable, shows the last converted table in GridView1, writes its row
        /// count to lbRowCount and returns all tables as a DataSet.
        /// </summary>
        /// <param name="HTML">HTML fragment containing zero or more table elements.</param>
        /// <returns>A DataSet holding one DataTable per matched table element.</returns>
        private DataSet ConvertHTMLTablesToDataSet(string HTML)
        {
            DataSet ds = new DataSet();
            DataTable dt = null;

            // Regex.Matches rejects a null input, so treat null/empty as "no tables".
            if (!string.IsNullOrEmpty(HTML))
            {
                foreach (Match Table in Regex.Matches(HTML, TableExpression, HtmlOptions))
                {
                    dt = BuildTable(Table.Groups[1].Value);
                    ds.Tables.Add(dt);
                }
            }

            // dt stays null when no table matched; clear the grid and report zero
            // rows instead of dereferencing it.
            // NOTE(review): GridView1 and lbRowCount are declared in the designer
            // file, which is not visible here - confirm a null DataSource is accepted.
            GridView1.DataSource = dt;
            lbRowCount.Text = (dt == null ? 0 : dt.Rows.Count).ToString();
            return ds;
        }

        /// <summary>
        /// Builds a DataTable from the inner markup of a single table element.
        /// Header cells (th) become column names and the first row is then treated
        /// as the header row and skipped; without header cells the columns are named
        /// "Column 1", "Column 2", ... after the cell count of the first row.
        /// </summary>
        private static DataTable BuildTable(string tableHtml)
        {
            DataTable dt = new DataTable();
            MatchCollection Rows = Regex.Matches(tableHtml, RowExpression, HtmlOptions);
            MatchCollection Headers = Regex.Matches(tableHtml, HeaderExpression, HtmlOptions);
            bool HeadersExist = Headers.Count > 0;

            if (HeadersExist)
            {
                foreach (Match Header in Headers)
                {
                    AddColumn(dt, Header.Groups[1].Value);
                }
            }
            else if (Rows.Count > 0)
            {
                int firstRowCells = Regex.Matches(Rows[0].Value, ColumnExpression, HtmlOptions).Count;
                for (int iColumns = 1; iColumns <= firstRowCells; iColumns++)
                {
                    AddColumn(dt, "Column " + iColumns);
                }
            }

            int iCurrentRow = 0;
            foreach (Match Row in Rows)
            {
                // Only add the row if it isn't the header row.
                if (!(iCurrentRow == 0 && HeadersExist))
                {
                    MatchCollection Columns = Regex.Matches(Row.Value, ColumnExpression, HtmlOptions);

                    // A row wider than the current schema would otherwise make the
                    // indexer below throw; widen the table before creating the row.
                    while (dt.Columns.Count < Columns.Count)
                    {
                        AddColumn(dt, "Column " + (dt.Columns.Count + 1));
                    }

                    DataRow dr = dt.NewRow();
                    for (int iCurrentColumn = 0; iCurrentColumn < Columns.Count; iCurrentColumn++)
                    {
                        dr[iCurrentColumn] = Columns[iCurrentColumn].Groups[1].Value;
                    }

                    dt.Rows.Add(dr);
                }

                iCurrentRow++;
            }

            return dt;
        }

        /// <summary>
        /// Adds a column called <paramref name="name"/>, appending " (2)", " (3)",
        /// ... when that name is already taken so repeated header text cannot raise
        /// a DuplicateNameException. An empty name is passed through unchanged.
        /// </summary>
        private static void AddColumn(DataTable dt, string name)
        {
            string candidate = name;
            int suffix = 2;

            while (candidate.Length > 0 && dt.Columns.Contains(candidate))
            {
                candidate = name + " (" + suffix + ")";
                suffix++;
            }

            dt.Columns.Add(candidate);
        }
    }
}
   
Instead of doing all that HTML parsing by hand, you might want to try the Html Agility Pack and let it do the bulk of the work for you.

Regards,

—Manfred
   
Comments
fjdiewornncalwe 27-Aug-12 10:01am
   
+5. One of the better solutions.
Manfred Rudolf Bihy 28-Aug-12 7:57am
   
Thanks Marcus!
Of course it's one of the better ones, since there is only one solution at all! :)
fjdiewornncalwe 28-Aug-12 9:58am
   
I see where you've gotten that from. Thankfully I can assume that you're joking even without the smiley at the end. To clarify, I have seen too many home-grown HTML parsing solutions that do a bit here and a bit there, but I haven't yet found anything that works quite as well as the Agility Pack. Cheers.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)




CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900