Click here to Skip to main content
15,891,778 members
Articles / Web Development / ASP.NET

HTTP Data Client - Web Scraping

Rate me:
Please Sign up or sign in to vote.
4.79/5 (8 votes)
21 Jul 2011 · CPOL · 12 min read · 47.4K · 1.7K · 56
An HttpWebRequest-based library that abstracts how data is retrieved from web sources.
using System;
using System.Windows.Forms;
using System.Threading;

namespace HttpData.Client.MsHtmlToXml
{
    /// <summary>
    /// Application context that runs an <see cref="HtmlLoader"/> on a dedicated
    /// single-threaded-apartment (STA) thread, reports the generated string through a
    /// callback, signals a wait handle, and then keeps a Windows Forms message loop
    /// running on that thread until the context is disposed.
    /// </summary>
    public class WebHost : ApplicationContext
    {
        #region Private Variables
        // Wait handle supplied by the caller; set once processing has finished (or could not start).
        private AutoResetEvent resultEvent;
        // Loader created on the worker thread by Process.
        private HtmlLoader loader;
        // Dedicated STA thread that runs Process followed by the message loop.
        private Thread loaderThread;
        #endregion

        #region Public Variables
        /// <summary>
        /// Callback invoked on the loader thread with the string returned by
        /// <c>HtmlLoader.GenerateXmlEx</c>.
        /// </summary>
        /// <param name="value">The string produced by the loader.</param>
        public delegate void ProcessingCompleted(string value);
        #endregion

        #region .ctor
        /// <summary>
        /// Creates the host and immediately starts the STA loader thread.
        /// </summary>
        /// <param name="htmlContent">Content passed to <c>HtmlLoader.GenerateXmlEx</c>.</param>
        /// <param name="disableScripting">Value copied to <see cref="DisableScripting"/> and from there to the loader.</param>
        /// <param name="options">Options passed to <c>HtmlLoader.GenerateXmlEx</c>.</param>
        /// <param name="completed">Callback that receives the generated string; may be null, in which case no callback is made.</param>
        /// <param name="resultEvent">Event set after the callback has run; may be null, in which case nothing is signalled.</param>
        /// <remarks>
        /// A failure to create or start the thread does not propagate to the caller.
        /// It is traced, and <paramref name="resultEvent"/> is set without
        /// <paramref name="completed"/> being invoked, so that a caller waiting on the
        /// event is not blocked indefinitely.
        /// </remarks>
        public WebHost(string htmlContent, bool disableScripting, HtmlFixOption options, ProcessingCompleted completed, AutoResetEvent resultEvent)
        {
            this.resultEvent = resultEvent;
            this.DisableScripting = disableScripting;

            try
            {
                loaderThread = new Thread(new ThreadStart(
                delegate
                {
                    // Do the work first, then pump messages on this thread until Dispose stops it.
                    Process(htmlContent, options, completed);
                    Application.Run(this);
                }));

                // The apartment state must be chosen before the thread is started.
                loaderThread.SetApartmentState(ApartmentState.STA);
                loaderThread.Start();
            }
            catch (Exception ex)
            {
                // Previously this failure was swallowed silently, leaving the caller waiting
                // on resultEvent forever. Keep the best-effort contract (no exception escapes
                // the constructor) but record the problem and release the waiter.
                System.Diagnostics.Trace.TraceError("Web Host: unable to start the loader thread. " + ex);
                loaderThread = null;
                SignalResult();
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Value assigned to <c>HtmlLoader.DisableScripting</c> when the loader is created.
        /// </summary>
        public bool DisableScripting { get; set; }
        #endregion

        #region Protected Methods
        /// <summary>
        /// Aborts the loader thread (if one exists) and then disposes the base context.
        /// </summary>
        /// <param name="disposing">Forwarded unchanged to the base implementation.</param>
        protected override void Dispose(bool disposing)
        {
            if (loaderThread != null)
            {
                try
                {
                    // NOTE(review): Thread.Abort is only supported on .NET Framework. On
                    // .NET Core / .NET 5+ it throws PlatformNotSupportedException, which the
                    // catch below swallows, so the thread would keep running there.
                    loaderThread.Abort();
                }
                // ReSharper disable EmptyGeneralCatchClause
                catch
                // ReSharper restore EmptyGeneralCatchClause
                {
                    // Deliberately ignored: Dispose must not throw.
                }
                finally
                {
                    loaderThread = null;
                }
            }

            base.Dispose(disposing);
        }
        #endregion

        #region Private Methods
        /// <summary>
        /// Creates the loader, generates the output string, passes it to
        /// <paramref name="completed"/> and finally signals the result event.
        /// Runs on the loader thread.
        /// </summary>
        /// <param name="htmlContent">Content passed to <c>HtmlLoader.GenerateXmlEx</c>.</param>
        /// <param name="options">Options passed to <c>HtmlLoader.GenerateXmlEx</c>.</param>
        /// <param name="completed">Callback that receives the generated string; skipped when null.</param>
        private void Process(string htmlContent, HtmlFixOption options, ProcessingCompleted completed)
        {
            try
            {
                loader = new HtmlLoader
                             {
                                 DisableScripting = this.DisableScripting
                             };
                string value = loader.GenerateXmlEx(htmlContent, options);

                if (completed != null)
                {
                    completed(value);
                }
            }
            finally
            {
                // Signal in a finally block so the waiter is also released when this method
                // exits through an exception (for example the ThreadAbortException raised
                // by Dispose while processing is still in progress).
                SignalResult();
            }
        }

        /// <summary>
        /// Sets the caller-supplied result event, if one was provided.
        /// </summary>
        private void SignalResult()
        {
            if (resultEvent != null)
            {
                resultEvent.Set();
            }
        }
        #endregion
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Software Developer (Senior)
Cyprus Cyprus
I am a senior software engineer with over 8 years of experience. I have worked for different international software companies using different technologies and programming languages, such as C/C++, LotusScript, Lotus API, C#, ASP.NET, WCF, MS-SQL, Oracle, Domino Server, and JavaScript.

Comments and Discussions