Click here to Skip to main content
15,889,834 members
Articles / Code generation

Semi generated crawler

Rate me:
Please Sign up or sign in to vote.
5.00/5 (3 votes)
22 Jun 2012CPOL4 min read 21.6K   599   10  
Leverage the Visual Studio Web Test Framework for your crawling needs...
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
namespace LightWebTestFramework
{
	/// <summary>
	/// Scans an HTML document held in a string and yields the inner text of every
	/// tag that matches a given name and, optionally, a given attribute name/value pair.
	/// </summary>
	/// <remarks>
	/// All parsing state (cursor, region bounds, scratch buffers) lives in instance
	/// fields and is mutated without synchronization, so a single instance must not
	/// be used from several threads at once.
	/// </remarks>
	public class HtmlTagInnerTextParser
	{
		private string m_documentContent;
		private int m_beginParseOffset;
		// Inclusive index of the last character that may be parsed.
		private int m_endParseOffset;
		// Index of the next character to be read.
		private int m_currentOffset;
		private StringBuilder m_currentAttributeName;
		private StringBuilder m_currentAttributeValue;
		private StringBuilder m_currentTag;
		private int m_currentTagBeginOffset;
		private int m_currentTagEndOffset;
		// True when the most recently parsed tag ended with "/>".
		private bool m_currentSelfClosingTag;
		private List<HtmlAttribute> m_currentAttributeList;

		/// <summary>
		/// Creates a parser over <paramref name="documentContent"/>.
		/// </summary>
		/// <param name="documentContent">The HTML text to scan.</param>
		public HtmlTagInnerTextParser(string documentContent)
		{
			this.m_documentContent = documentContent;
			this.m_currentAttributeList = new List<HtmlAttribute>();
			this.m_currentAttributeName = new StringBuilder();
			this.m_currentAttributeValue = new StringBuilder();
			this.m_currentTag = new StringBuilder();
		}

		/// <summary>
		/// Yields the inner text of every <paramref name="tagName"/> tag in the whole document.
		/// </summary>
		public IEnumerable<HtmlTagInnerText> GetInnerTextForHtmlTags(string tagName, bool hasClosingTag, bool removeInnerTags, bool collapseWhiteSpace)
		{
			return this.GetInnerTextForHtmlTags(tagName, null, null, hasClosingTag, removeInnerTags, collapseWhiteSpace, 0, this.m_documentContent.Length - 1);
		}

		/// <summary>
		/// Yields the inner text of every <paramref name="tagName"/> tag found between
		/// <paramref name="beginParseOffset"/> and <paramref name="endParseOffset"/> (inclusive).
		/// </summary>
		public IEnumerable<HtmlTagInnerText> GetInnerTextForHtmlTags(string tagName, bool hasClosingTag, bool removeInnerTags, bool collapseWhiteSpace, int beginParseOffset, int endParseOffset)
		{
			return this.GetInnerTextForHtmlTags(tagName, null, null, hasClosingTag, removeInnerTags, collapseWhiteSpace, beginParseOffset, endParseOffset);
		}

		/// <summary>
		/// Yields the inner text of every <paramref name="tagName"/> tag in the whole document
		/// whose <paramref name="attributeName"/> attribute equals <paramref name="attributeValue"/>.
		/// </summary>
		public IEnumerable<HtmlTagInnerText> GetInnerTextForHtmlTags(string tagName, string attributeName, string attributeValue, bool hasClosingTag, bool removeInnerTags, bool collapseWhiteSpace)
		{
			return this.GetInnerTextForHtmlTags(tagName, attributeName, attributeValue, hasClosingTag, removeInnerTags, collapseWhiteSpace, 0, this.m_documentContent.Length - 1);
		}

		/// <summary>
		/// Lazily yields one <see cref="HtmlTagInnerText"/> per matching tag inside the parse region.
		/// </summary>
		/// <param name="tagName">Tag name to look for; compared case-insensitively (ordinal).</param>
		/// <param name="attributeName">Attribute that must be present, or null/empty (together with
		/// <paramref name="attributeValue"/>) to accept any tag with the right name.</param>
		/// <param name="attributeValue">Required attribute value; compared case-insensitively (ordinal).</param>
		/// <param name="hasClosingTag">True when the element is terminated by a matching closing tag.
		/// False when the next tag of the same name, or a closing tag, ends the element.</param>
		/// <param name="removeInnerTags">Forwarded unchanged to the <see cref="HtmlTagInnerText"/> constructor.</param>
		/// <param name="collapseWhiteSpace">Forwarded unchanged to the <see cref="HtmlTagInnerText"/> constructor.</param>
		/// <param name="beginParseOffset">Index of the first character to parse.</param>
		/// <param name="endParseOffset">Index of the last character to parse (inclusive).</param>
		/// <returns>
		/// A sequence with one entry per match. A self-closing match yields a default-constructed
		/// <see cref="HtmlTagInnerText"/>; any other match yields one built from the span between
		/// the opening tag's end offset + 1 and the closing tag's begin offset - 1.
		/// </returns>
		public IEnumerable<HtmlTagInnerText> GetInnerTextForHtmlTags(string tagName, string attributeName, string attributeValue, bool hasClosingTag, bool removeInnerTags, bool collapseWhiteSpace, int beginParseOffset, int endParseOffset)
		{
			// The cursor the caller had before enumeration started; it is restored
			// before every yield so code running between two MoveNext calls sees it.
			int callerOffset = this.m_currentOffset;
			int resumeOffset = beginParseOffset;
			while (true)
			{
				// Install the region BEFORE testing for its end. Testing first would
				// compare against a stale (or default 0) end offset and silently yield
				// nothing whenever beginParseOffset is greater than that stale value.
				this.ResetOffsets(beginParseOffset, endParseOffset, resumeOffset);
				if (this.IsEndParseRegion())
				{
					break;
				}
				HtmlTagWithOffsets openingTag = this.GetSpecificOpeningTag(tagName, attributeName, attributeValue);
				if (openingTag == null)
				{
					break;
				}
				if (this.m_currentSelfClosingTag)
				{
					resumeOffset = this.m_currentOffset;
					this.m_currentOffset = callerOffset;
					yield return new HtmlTagInnerText();
					continue;
				}
				HtmlTagWithOffsets closingTag = this.GetSpecificClosingTag(tagName, hasClosingTag);
				if (closingTag == null)
				{
					break;
				}
				resumeOffset = this.m_currentOffset;
				this.m_currentOffset = callerOffset;
				yield return new HtmlTagInnerText(this.m_documentContent, openingTag.EndOffset + 1, closingTag.BeginOffset - 1, removeInnerTags, collapseWhiteSpace);
			}
			this.m_currentOffset = callerOffset;
		}

		/// <summary>Stores the attribute currently held in the name/value buffers.</summary>
		private void AddAttribute()
		{
			HtmlAttribute item = new HtmlAttribute(this.m_currentAttributeName.ToString(), this.m_currentAttributeValue.ToString());
			this.m_currentAttributeList.Add(item);
		}

		/// <summary>True when the cursor is past the parse region or past the document.</summary>
		private bool IsEndParseRegion()
		{
			return this.m_currentOffset > this.m_endParseOffset || this.m_currentOffset >= this.m_documentContent.Length;
		}

		/// <summary>Returns the character under the cursor; callers must check the end of region first.</summary>
		private char GetCurrentChar()
		{
			return this.m_documentContent[this.m_currentOffset];
		}

		/// <summary>Returns the character under the cursor and moves the cursor one step forward.</summary>
		private char GetCurrentCharAndAdvance()
		{
			return this.m_documentContent[this.m_currentOffset++];
		}

		/// <summary>
		/// Returns the character <paramref name="peek"/> positions after the cursor,
		/// or '\0' when that position lies outside the document.
		/// </summary>
		private char GetPeekChar(int peek)
		{
			// Bound by the document only: an end-of-region offset larger than the
			// document must never allow an out-of-range index.
			int index = this.m_currentOffset + peek;
			if (index >= 0 && index < this.m_documentContent.Length)
			{
				return this.m_documentContent[index];
			}
			return '\0';
		}

		/// <summary>Builds a nameless placeholder tag positioned just after the parse region.</summary>
		private HtmlTagWithOffsets GetEndParsingRegionTag()
		{
			return new HtmlTagWithOffsets
			{
				BeginOffset = this.m_endParseOffset + 1,
				EndOffset = this.m_endParseOffset + 1
			};
		}

		/// <summary>
		/// Parses the next complete tag after the cursor.
		/// </summary>
		/// <returns>The tag with its attributes, or null when no complete tag is left in the region.</returns>
		private HtmlTagWithOffsets GetNextTag()
		{
			if (!this.ParseUntilNextTag() || !this.ParseTag())
			{
				return null;
			}
			HtmlTagWithOffsets tag = new HtmlTagWithOffsets();
			tag.Name = this.m_currentTag.ToString();
			tag.BeginOffset = this.m_currentTagBeginOffset;
			tag.EndOffset = this.m_currentTagEndOffset;
			foreach (HtmlAttribute attribute in this.m_currentAttributeList)
			{
				tag.AddAttribute(attribute);
			}
			return tag;
		}

		/// <summary>
		/// Finds the tag that terminates the element whose opening tag was just parsed.
		/// </summary>
		/// <returns>
		/// The terminating tag, or the end-of-region placeholder when none is found. Never null.
		/// </returns>
		private HtmlTagWithOffsets GetSpecificClosingTag(string tagName, bool hasClosingTag)
		{
			int depth = 1;
			bool sawNestedTag = false;
			int offsetAfterOpeningTag = this.m_currentOffset;
			while (!this.IsEndParseRegion())
			{
				HtmlTagWithOffsets nextTag = this.GetNextTag();
				if (nextTag == null)
				{
					continue;
				}
				if (string.Equals(nextTag.Name, tagName, StringComparison.OrdinalIgnoreCase))
				{
					if (!hasClosingTag)
					{
						// The next sibling ends this element; rewind so it is parsed
						// again as the following opening tag.
						this.m_currentOffset = nextTag.BeginOffset;
						return nextTag;
					}
					sawNestedTag = true;
					// A self-closing nested tag has no closing counterpart, so it must
					// not raise the depth; otherwise the real closing tag is never matched.
					if (!this.m_currentSelfClosingTag)
					{
						depth++;
					}
					continue;
				}
				bool isMatchingClose = nextTag.Name.Length > 0
					&& nextTag.Name[0] == '/'
					&& string.Equals(nextTag.Name.Substring(1), tagName, StringComparison.OrdinalIgnoreCase);
				if (!isMatchingClose)
				{
					continue;
				}
				if (hasClosingTag)
				{
					depth--;
				}
				if (!hasClosingTag || depth == 0)
				{
					if (sawNestedTag)
					{
						// Rewind to just after the outer opening tag so nested tags of
						// the same name are reported by later iterations of the caller.
						this.m_currentOffset = offsetAfterOpeningTag;
					}
					return nextTag;
				}
			}
			return this.GetEndParsingRegionTag();
		}

		/// <summary>
		/// Advances to the next tag named <paramref name="tagName"/> that also satisfies the
		/// attribute filter.
		/// </summary>
		/// <returns>The matching tag, or null when the region is exhausted.</returns>
		private HtmlTagWithOffsets GetSpecificOpeningTag(string tagName, string attributeName, string attributeValue)
		{
			bool noAttributeFilter = string.IsNullOrEmpty(attributeName) && string.IsNullOrEmpty(attributeValue);
			while (!this.IsEndParseRegion())
			{
				HtmlTagWithOffsets nextTag = this.GetNextTag();
				if (nextTag == null || !string.Equals(nextTag.Name, tagName, StringComparison.OrdinalIgnoreCase))
				{
					continue;
				}
				if (noAttributeFilter)
				{
					return nextTag;
				}
				string actualValue = nextTag.GetAttributeValueAsString(attributeName);
				if (actualValue != null && string.Equals(actualValue, attributeValue, StringComparison.OrdinalIgnoreCase))
				{
					return nextTag;
				}
			}
			return null;
		}

		/// <summary>Moves the cursor past any tab, newline, carriage return or space characters.</summary>
		private void IgnoreWhiteSpace()
		{
			while (!this.IsEndParseRegion() && this.IsWhiteSpace(this.GetCurrentChar()))
			{
				this.m_currentOffset++;
			}
		}

		/// <summary>True for '>' and for a '/' immediately followed by '>'.</summary>
		private bool IsPartOfClosingTag(char ch)
		{
			return ch == '>' || (ch == '/' && this.GetPeekChar(1) == '>');
		}

		/// <summary>True for a '/' immediately followed by '>'.</summary>
		private bool IsPartOfSelfClosingTag(char ch)
		{
			return ch == '/' && this.GetPeekChar(1) == '>';
		}

		/// <summary>True for tab, line feed, carriage return and space.</summary>
		private bool IsWhiteSpace(char ch)
		{
			return "\t\n\r ".IndexOf(ch) != -1;
		}

		/// <summary>
		/// Appends characters to the attribute-name buffer until white space, '=',
		/// the end of the tag or the end of the region is reached.
		/// </summary>
		private void ParseAttributeName()
		{
			this.IgnoreWhiteSpace();
			while (!this.IsEndParseRegion())
			{
				char currentChar = this.GetCurrentChar();
				if (this.IsWhiteSpace(currentChar) || currentChar == '=' || this.IsPartOfClosingTag(currentChar))
				{
					break;
				}
				this.m_currentAttributeName.Append(currentChar);
				this.m_currentOffset++;
			}
			this.IgnoreWhiteSpace();
		}

		/// <summary>
		/// If the cursor is on '=', reads the following quoted or unquoted value into the
		/// attribute-value buffer. Does nothing otherwise.
		/// </summary>
		private void ParseAttributeValue()
		{
			// Guard every read: the attribute name may have run up to the end of the region.
			if (this.IsEndParseRegion() || this.GetCurrentChar() != '=')
			{
				return;
			}
			this.m_currentOffset++;
			this.IgnoreWhiteSpace();
			if (this.IsEndParseRegion())
			{
				return;
			}
			char firstChar = this.GetCurrentChar();
			if (firstChar == '\'' || firstChar == '"')
			{
				// Quoted value: copy everything up to the matching quote.
				this.m_currentOffset++;
				while (!this.IsEndParseRegion() && this.GetCurrentChar() != firstChar)
				{
					this.m_currentAttributeValue.Append(this.GetCurrentChar());
					this.m_currentOffset++;
				}
				// Step over the closing quote.
				this.m_currentOffset++;
			}
			else
			{
				// Unquoted value: copy up to white space or the end of the tag.
				while (!this.IsEndParseRegion())
				{
					char valueChar = this.GetCurrentChar();
					if (this.IsWhiteSpace(valueChar) || this.IsPartOfClosingTag(valueChar))
					{
						break;
					}
					this.m_currentAttributeValue.Append(valueChar);
					this.m_currentOffset++;
				}
			}
			this.IgnoreWhiteSpace();
		}

		/// <summary>Moves the cursor just past the next "-->" (or past the end of the region).</summary>
		private void ParseComment()
		{
			while (!this.IsEndParseRegion() && (this.GetCurrentChar() != '-' || this.GetPeekChar(1) != '-' || this.GetPeekChar(2) != '>'))
			{
				this.m_currentOffset++;
			}
			this.m_currentOffset += 3;
		}

		/// <summary>
		/// Parses the tag whose '&lt;' was just consumed: name, attributes and terminator.
		/// </summary>
		/// <returns>
		/// True when the tag was terminated by '>' or "/>". False when the region ended first,
		/// in which case the partially parsed tag must be discarded.
		/// </returns>
		private bool ParseTag()
		{
			this.m_currentTagBeginOffset = this.m_currentOffset - 1;
			this.m_currentTag.Length = 0;
			this.m_currentAttributeList.Clear();
			this.m_currentSelfClosingTag = false;
			this.IgnoreWhiteSpace();
			while (!this.IsEndParseRegion())
			{
				char nameChar = this.GetCurrentChar();
				if (this.IsWhiteSpace(nameChar) || this.IsPartOfClosingTag(nameChar))
				{
					break;
				}
				this.m_currentTag.Append(nameChar);
				this.m_currentOffset++;
			}
			this.IgnoreWhiteSpace();
			// The end-of-region test is essential here. Without it a tag cut off by the end
			// of the document indexes past the string, and a tag cut off by the end of the
			// region loops forever adding empty attributes.
			while (!this.IsEndParseRegion() && !this.IsPartOfClosingTag(this.GetCurrentChar()))
			{
				this.m_currentAttributeName.Length = 0;
				this.m_currentAttributeValue.Length = 0;
				this.ParseAttributeName();
				this.ParseAttributeValue();
				this.AddAttribute();
			}
			if (this.IsEndParseRegion())
			{
				// Unterminated tag.
				this.m_currentTagEndOffset = this.m_currentOffset - 1;
				return false;
			}
			if (this.IsPartOfSelfClosingTag(this.GetCurrentChar()))
			{
				this.m_currentSelfClosingTag = true;
				this.m_currentOffset += 2;
			}
			else
			{
				this.m_currentOffset++;
			}
			this.m_currentTagEndOffset = this.m_currentOffset - 1;
			return true;
		}

		/// <summary>
		/// Advances the cursor to the first character after the next '&lt;' that starts a tag
		/// (a letter, '!' or '/'), skipping comments.
		/// </summary>
		/// <returns>True when such a position was found, false when the region ended first.</returns>
		private bool ParseUntilNextTag()
		{
			while (!this.IsEndParseRegion())
			{
				if (this.GetCurrentCharAndAdvance() != '<')
				{
					continue;
				}
				if (this.IsEndParseRegion())
				{
					// '<' was the last character of the region: nothing follows it.
					break;
				}
				char next = this.GetCurrentChar();
				if (next == '!' && this.GetPeekChar(1) == '-' && this.GetPeekChar(2) == '-')
				{
					this.ParseComment();
					continue;
				}
				char upper = char.ToUpper(next, CultureInfo.InvariantCulture);
				if ((upper >= 'A' && upper <= 'Z') || upper == '!' || upper == '/')
				{
					return true;
				}
			}
			return false;
		}

		/// <summary>Installs the parse region bounds and positions the cursor.</summary>
		private void ResetOffsets(int beginParseOffset, int endParseOffset, int currentOffset)
		{
			this.m_beginParseOffset = beginParseOffset;
			this.m_endParseOffset = endParseOffset;
			this.m_currentOffset = currentOffset;
		}
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Software Developer Freelance
France France
I am currently the CTO of Metaco, we are leveraging the Bitcoin Blockchain for delivering financial services.

I also developed a tool that makes IaaS on Azure easier to use: IaaS Management Studio.

If you want to contact me, go this way Smile | :)

Comments and Discussions