using Rilling.Web.Mhtml;
using System.Text;
using Rilling.MhtmlLib.Media.Artifacts;
using Rilling.Web.Mhtml.Configuration;
using System.Xml;
using System.Diagnostics;
using System.Reflection;
using Rilling.MhtmlLib.Media;
using System.Text.RegularExpressions;
using System.Collections;
using System.IO;
using System;
namespace Rilling.MhtmlLib.Media.Resolvers
{
/// <summary>
/// This resolver is responsible for locating all image references
/// within the HTML or CSS content.
/// </summary>
/// <remarks>
/// Four switches control what is searched: <see cref="EvaluateCssFiles"/>
/// and <see cref="EvaluateHtmlFiles"/> select which content types are
/// processed, while <see cref="IncludeBackgroundImages"/> and
/// <see cref="IncludeForegroundImages"/> select which kinds of image
/// reference are collected. All four switches default to <c>true</c>
/// and can be overridden from configuration through
/// <see cref="InitializeContext"/>.
/// </remarks>
public class ImageLinkResolver : LinkResolverBase, IExpandedConfiguration
{
    /// <summary>
    /// Sets or retrieves whether background images are also located.
    /// </summary>
    /// <remarks>
    /// Background images are defined as the following.
    ///		1) CSS styles (in the style tag or inline attributes)
    ///		   that define the background-image or
    ///		   list-style-image attributes.
    ///		2) The BACKGROUND attribute that can exist on tags
    ///		   such as BODY and TD.
    /// </remarks>
    public bool IncludeBackgroundImages
    {
        get { return m_inclBackImgs; }
        set { m_inclBackImgs = value; }
    }

    /// <summary>
    /// Sets or retrieves whether foreground images are also located.
    /// </summary>
    /// <remarks>
    /// Foreground images are defined as the following:
    ///		1) Paths provided in the SRC attribute of the IMG tag.
    ///		2) Paths provided in an INPUT type=image tag.
    /// </remarks>
    public bool IncludeForegroundImages
    {
        get { return m_inclForeImgs; }
        set { m_inclForeImgs = value; }
    }

    /// <summary>
    /// Sets or retrieves whether CSS files are processed.
    /// </summary>
    /// <remarks>
    /// Images referenced from CSS content are always treated as
    /// background images, so CSS content is only searched when
    /// <see cref="IncludeBackgroundImages"/> is also enabled.
    /// </remarks>
    public bool EvaluateCssFiles
    {
        get { return m_srchCssFiles; }
        set { m_srchCssFiles = value; }
    }

    /// <summary>
    /// Sets or retrieves whether HTML files are processed.
    /// </summary>
    public bool EvaluateHtmlFiles
    {
        get { return m_srchHtmlFiles; }
        set { m_srchHtmlFiles = value; }
    }

    /// <summary>
    /// Parses the file contents and locates any potential external
    /// image references.
    /// </summary>
    /// <param name="contentData">
    /// The contents that will be parsed.
    /// </param>
    /// <param name="baseUrl">
    /// The <see cref="Uri"/> used to locate relative references.
    /// In order for a download to occur, an absolute reference is
    /// required.  If the reference is already absolute, this parameter
    /// is not used.
    /// </param>
    /// <returns>
    /// An array of <see cref="LinkIdentification"/> objects that
    /// describe the references such as its address and location
    /// within the text stream.  The array is empty when nothing was
    /// located or when the content type is not enabled for searching.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="contentData"/> or <paramref name="baseUrl"/>
    /// is <c>null</c>.
    /// </exception>
    /// <remarks>
    /// <para>
    /// Simply finding a reference does not, by itself, indicate
    /// whether or not the resource actually exists.  The purpose
    /// of this method is to only locate patterns within the text
    /// that could be a reference.  Once the potential links are
    /// returned, the calling engine will attempt to resolve them
    /// to physical resources.
    /// </para>
    /// <para>
    /// If parameter baseUrl ends in a forward-slash (/), then the
    /// path is considered to be a directory, which is then merged
    /// with the located reference.  If there is no forward-slash,
    /// then the base url is considered a file, in which case the parent
    /// directory is then used when forming an absolute address with
    /// the located reference.
    /// </para>
    /// <para>
    /// There is no guarantee of order when locating references.  Meaning
    /// they may not be returned in the same order that they existed
    /// within the documents.
    /// </para>
    /// <para>
    /// The hyperlink-found notification is raised once for every
    /// returned reference, after all searching has completed.
    /// </para>
    /// </remarks>
    public override LinkIdentification[]
        LocateReferences(TextArtifact contentData, Uri baseUrl)
    {
        if (contentData == null)
        {
            throw new ArgumentNullException("contentData");
        }
        if (baseUrl == null)
        {
            throw new ArgumentNullException("baseUrl");
        }

        ArrayList located = new ArrayList();
        string contentSource = contentData.ToString();

        //
        // Locate all references within a CSS document.  Note that CSS images are always
        // considered part of the background.
        //
        if (contentData.ContentType == ContentTypes.Css &&
            EvaluateCssFiles &&
            IncludeBackgroundImages)
        {
            located.AddRange(Utilities.SearchContent(m_linkPatternStyleAtt,
                                                     contentSource,
                                                     baseUrl));
        }

        //
        // Locate all references within an HTML document.
        //
        if (contentData.ContentType == ContentTypes.Html && EvaluateHtmlFiles)
        {
            //
            // Determine if background images are to be located.
            //
            if (IncludeBackgroundImages)
            {
                //
                // To simplify the pattern matching, each style section is isolated
                // so a second search can be performed in that limited scope.  There
                // can be more than one style section in a page.  Same for the
                // style attributes.
                //
                // NOTE(review): these nested searches run against the isolated
                // fragment (m.Value) rather than the full document.  If
                // Utilities.SearchContent records match positions, they would be
                // relative to the fragment and not to contentSource -- confirm
                // against the consumer of LinkIdentification.
                //
                foreach (Match m in m_styleTagContainment.Matches(contentSource))
                {
                    located.AddRange(Utilities.SearchContent(m_linkPatternStyleAtt,
                                                             m.Value,
                                                             baseUrl));
                }

                foreach (Match m in m_styleAttContainment.Matches(contentSource))
                {
                    located.AddRange(Utilities.SearchContent(m_linkPatternStyleAtt,
                                                             m.Value,
                                                             baseUrl));
                }

                // The BACKGROUND tag attribute is searched over the whole document.
                located.AddRange(Utilities.SearchContent(m_linkPatternBkTag,
                                                         contentSource,
                                                         baseUrl));
            }

            //
            // Determine if foreground images are to be located.
            //
            if (IncludeForegroundImages)
            {
                located.AddRange(Utilities.SearchContent(m_linkPatternImgTag,
                                                         contentSource,
                                                         baseUrl));

                located.AddRange(Utilities.SearchContent(m_linkPatternImgBntTag,
                                                         contentSource,
                                                         baseUrl));
            }
        }

        //
        // Convert to a strongly-typed array.
        //
        LinkIdentification[] linkInfos = new LinkIdentification[located.Count];
        located.CopyTo(linkInfos);

        //
        // Send event for each link found.
        //
        foreach (LinkIdentification link in linkInfos)
        {
            OnHyperlinkFound(new HyperlinkFoundEventArgs(link.Url));
        }

        return linkInfos;
    }

    /// <summary>
    /// Configures this resolver from the attributes of a configuration
    /// node.
    /// </summary>
    /// <param name="node">
    /// The configuration node.  The following optional attributes are
    /// recognized, each holding a boolean value: <c>evalCss</c>
    /// (<see cref="EvaluateCssFiles"/>), <c>evalHtml</c>
    /// (<see cref="EvaluateHtmlFiles"/>), <c>includeBackImg</c>
    /// (<see cref="IncludeBackgroundImages"/>) and
    /// <c>includeForeImg</c> (<see cref="IncludeForegroundImages"/>).
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="node"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="FormatException">
    /// A recognized attribute is present but its value is not a valid
    /// boolean.  Attributes evaluated before the offending one have
    /// already been applied.
    /// </exception>
    /// <remarks>
    /// Any attribute that is absent leaves the corresponding property
    /// at its current value.  A node that carries no attribute
    /// collection at all (any node that is not an element) is treated
    /// the same way: nothing is changed.
    /// </remarks>
    public void InitializeContext(XmlNode node)
    {
        if (node == null)
        {
            throw new ArgumentNullException("node");
        }

        //
        // XmlNode.Attributes is null for every node type other than
        // an element.  Such a node cannot carry any settings, so there
        // is nothing to apply.
        //
        XmlAttributeCollection attributes = node.Attributes;
        if (attributes == null)
        {
            return;
        }

        //
        // Extract the known parameters and configure this object.
        //
        XmlAttribute evalCssAtt = attributes["evalCss"];
        XmlAttribute evalHtmlAtt = attributes["evalHtml"];
        XmlAttribute includeBackImgAtt = attributes["includeBackImg"];
        XmlAttribute includeForeImgAtt = attributes["includeForeImg"];

        if (evalCssAtt != null)
        {
            this.EvaluateCssFiles = bool.Parse(evalCssAtt.Value);
        }

        if (evalHtmlAtt != null)
        {
            this.EvaluateHtmlFiles = bool.Parse(evalHtmlAtt.Value);
        }

        if (includeBackImgAtt != null)
        {
            this.IncludeBackgroundImages = bool.Parse(includeBackImgAtt.Value);
        }

        if (includeForeImgAtt != null)
        {
            this.IncludeForegroundImages = bool.Parse(includeForeImgAtt.Value);
        }
    }

    // Backing fields for the four configuration switches; everything is
    // enabled unless configured otherwise.
    private bool m_inclBackImgs = true;
    private bool m_inclForeImgs = true;
    private bool m_srchCssFiles = true;
    private bool m_srchHtmlFiles = true;

    //
    // The following expressions are used to locate areas of the page
    // to ensure the correct match context.  These areas are often containers
    // for actual references, such as the <STYLE> tag.
    //
    private static readonly Regex m_styleTagContainment =
        new Regex(Utilities.GetResolutionPattern("LR_STYLETAG_CONTAINMENT"),
                  RegexOptions.Compiled | RegexOptions.IgnoreCase);

    private static readonly Regex m_styleAttContainment =
        new Regex(Utilities.GetResolutionPattern("LR_STYLEATT_CONTAINMENT"),
                  RegexOptions.Compiled | RegexOptions.IgnoreCase);

    //
    // Expressions used to locate image references.
    //
    private static readonly Regex m_linkPatternBkTag =
        new Regex(LinkMatchPatterns.BackImagePattern1,
                  RegexOptions.Compiled | RegexOptions.IgnoreCase);

    private static readonly Regex m_linkPatternStyleAtt =
        new Regex(LinkMatchPatterns.BackImagePattern2,
                  RegexOptions.Compiled | RegexOptions.IgnoreCase);

    private static readonly Regex m_linkPatternImgTag =
        new Regex(LinkMatchPatterns.ForeImagePattern1,
                  RegexOptions.Compiled | RegexOptions.IgnoreCase);

    private static readonly Regex m_linkPatternImgBntTag =
        new Regex(LinkMatchPatterns.ForeImagePattern2,
                  RegexOptions.Compiled | RegexOptions.IgnoreCase);
}
}