// WebResourceProvider.cpp : implementation file
//
// Written by Ravi Bhavnani <ravib@ravib.com>
// Copyright (c) 2002. All Rights Reserved.
//
// This code may be used in compiled form in any way you desire. This
// file may be redistributed unmodified by any means PROVIDING it is
// not sold for profit without the author's written consent, and
// providing that this notice and the author's name and all copyright
// notices remains intact.
//
// An email letting me know how you are using it would be nice as well.
//
// This code is provided "as is" with no expressed or implied warranty.
// The author accepts no liability for any damage/loss of business that
// this product may cause.
//
// A description of this object and instructions on how to use it are
// provided at http://www.codeproject.com/internet/WebResourceProvider.asp.
//
// Revision history:
//
// 07 Aug 2002 - Bug fix: Fixed computation error in extractToEnd().
// Bug fix: Added missing call to init() to fetchResource().
// Added methods skipBackTo() and skipBackToExact().
//
// 08 May 2002 - Added methods urlExists(), getLinks(), removeComments(),
// removeScripts(), findNoCase() and findStringInArray().
// - Modified at(), skipTo(), and extractTo() to be case insensitive.
// - Added case sensitive analogs atExact(), skipToExact() and
// extractToExact().
//
// 29 Apr 2002 - Initial version
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "AmHttpSocket.h"
#include "WebGrab.h"
#include "WebResourceProvider.h"
#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
// Creates a provider with no fetch error recorded, the parse position at the
// start of the content and no status window pointer set.
CWebResourceProvider::CWebResourceProvider()
{
m_dwFetchStatus = 0; // getContent() sets this to 1 when a fetch fails
m_nIndex = 0; // parse position used by the at/skip/extract helpers
m_pWndStatus = NULL; // handed to CWebGrab by getContent()
}
// Nothing to release explicitly here; the body is intentionally empty.
CWebResourceProvider::~CWebResourceProvider()
{
}
/////////////////////////////////////////////////////////////////////////////
// Public methods (API)
// Fetches and parses the resource.
//
// The fetch status, error text and content are cleared and the fetch time is
// stamped before anything else happens. If init() succeeds, the
// construct-URL / fetch / parse cycle runs once and is then repeated for as
// long as moreAvailable() reports that further content is pending.
void CWebResourceProvider::fetchResource()
{
  // Start from a clean slate
  m_dwFetchStatus = 0;
  m_strFetchError.Empty();
  m_strContent.Empty();
  m_tmFetchTime = CTime::GetCurrentTime();

  // Give the provider a chance to veto the fetch
  if (init()) {
    for (;;) {
      // Build the URL for this round
      constructUrl (m_strUrl);

      // Fetch it and parse from the beginning of the new content
      resetIndex();
      getContent (m_strUrl);
      parseContent();

      // Stop as soon as the provider has nothing further to fetch
      if (!moreAvailable())
        break;
    }
  }
}
// Checks whether a URL can be reached.
//
// strUrl - URL to probe
// Returns true if CWebGrab could be initialised and GetFileInfo() succeeded
// for the URL, false otherwise.
bool CWebResourceProvider::urlExists
  (CString strUrl)
{
  CWebGrab grabber;
  if (grabber.Initialise ("", NULL)) {
    grabber.SetForceReload (true);

    // Only the success flag matters; the reported details are discarded
    CString strLastModified;
    DWORD dwLength;
    DWORD dwError;
    const BOOL bFound = grabber.GetFileInfo (strUrl, strLastModified, dwLength, dwError);
    if (bFound)
      return true;
  }
  return false;
}
/////////////////////////////////////////////////////////////////////////////
// Methods overridden by derived classes
// Hook run by fetchResource() before anything is fetched; returning false
// abandons the fetch. The base implementation always allows it.
bool CWebResourceProvider::init()
{
  return true;
}
// Tells getContent() whether to fetch with a POST request. The base
// implementation answers no, so the non-POST path is used.
bool CWebResourceProvider::isPost()
{
  return false;
}
// Supplies the data sent with a POST request. getContent() calls this only
// when isPost() returns true. The base implementation leaves strPostData
// untouched.
void CWebResourceProvider::getPostData
(CString& strPostData)
{
}
// Asked by fetchResource() after each fetch/parse round; returning true
// triggers another round. The base implementation stops after the first.
bool CWebResourceProvider::moreAvailable()
{
  return false;
}
// Called by fetchResource() after each getContent() call, with the parse
// position reset beforehand. The base implementation does nothing.
void CWebResourceProvider::parseContent()
{
}
/////////////////////////////////////////////////////////////////////////////
// Helper methods used by derived classes (see WebResourceProvider.h for
// descriptions)
// Reports whether the content at the current parse position starts with
// strText, ignoring case. The parse position is not moved.
//
// strText - text expected at the current position
// Returns true if strText occurs exactly at m_nIndex; false otherwise,
// including when m_nIndex lies outside the content.
bool CWebResourceProvider::at
  (CString strText)
{
  // A position outside the content can never be "at" anything; rejecting it
  // here also keeps a negative index away from the string routines
  if ((m_nIndex < 0) || (m_nIndex > m_strContent.GetLength()))
    return (false);

  // Only the characters at the current position matter, so compare just that
  // window instead of searching the whole remainder of the content.
  // findNoCase() is reused so that at() and skipTo() agree on what matches.
  CString strWindow = m_strContent.Mid (m_nIndex, strText.GetLength());
  return (findNoCase (strWindow, strText, 0) == 0);
}
// Case sensitive analog of at(): reports whether the content at the current
// parse position starts with strText. The parse position is not moved.
//
// strText - text expected at the current position
// Returns true if strText occurs exactly at m_nIndex; false otherwise,
// including when m_nIndex lies outside the content.
bool CWebResourceProvider::atExact
  (CString strText)
{
  // A position outside the content can never be "at" anything; rejecting it
  // here also keeps a negative index away from the string routines
  if ((m_nIndex < 0) || (m_nIndex > m_strContent.GetLength()))
    return (false);

  // Compare only the characters at the current position rather than
  // searching the whole remainder of the content for the text
  CString strWindow = m_strContent.Mid (m_nIndex, strText.GetLength());
  return (strWindow.Compare (strText) == 0);
}
// Moves the parse position to just past the next occurrence of strText,
// ignoring case.
//
// strText - text to skip to
// Returns true if the text was found; false (position unchanged) if not.
bool CWebResourceProvider::skipTo
  (CString strText)
{
  const long nMatch = findNoCase (m_strContent, strText, m_nIndex);
  if (nMatch != -1) {
    // Land on the first character after the match
    m_nIndex = nMatch + strText.GetLength();
    return true;
  }
  return false;
}
// Case sensitive analog of skipTo(): moves the parse position to just past
// the next occurrence of strText.
//
// strText - text to skip to
// Returns true if the text was found; false (position unchanged) if not.
bool CWebResourceProvider::skipToExact
  (CString strText)
{
  const long nMatch = m_strContent.Find (strText, m_nIndex);
  if (nMatch != -1) {
    // Land on the first character after the match
    m_nIndex = nMatch + strText.GetLength();
    return true;
  }
  return false;
}
// Moves the parse position backwards to the start of the nearest preceding
// occurrence of strText, ignoring case. The current position itself is not
// tested; the search begins one character before it.
//
// strText - text to skip back to
// Returns true with m_nIndex at the start of the match, or false with
// m_nIndex unchanged if the text does not occur before the current position.
bool CWebResourceProvider::skipBackTo
  (CString strText)
{
  const long nSavedIndex = m_nIndex;

  // Walk backwards one character at a time. The loop never lets the index go
  // below zero, so at() is never asked about a position before the content.
  for (long nCandidate = nSavedIndex - 1; nCandidate >= 0; nCandidate--) {
    m_nIndex = nCandidate;
    if (at (strText))
      return (true);
  }

  // Not found - leave the parse position where it was
  m_nIndex = nSavedIndex;
  return (false);
}
// Case sensitive analog of skipBackTo(): moves the parse position backwards
// to the start of the nearest preceding occurrence of strText. The current
// position itself is not tested; the search begins one character before it.
//
// strText - text to skip back to
// Returns true with m_nIndex at the start of the match, or false with
// m_nIndex unchanged if the text does not occur before the current position.
bool CWebResourceProvider::skipBackToExact
  (CString strText)
{
  const long nSavedIndex = m_nIndex;

  // Walk backwards one character at a time. The loop never lets the index go
  // below zero, so atExact() is never asked about a position before the
  // content.
  for (long nCandidate = nSavedIndex - 1; nCandidate >= 0; nCandidate--) {
    m_nIndex = nCandidate;
    if (atExact (strText))
      return (true);
  }

  // Not found - leave the parse position where it was
  m_nIndex = nSavedIndex;
  return (false);
}
// Extracts the text between the current parse position and the next
// occurrence of strTerminator (matched ignoring case), then moves the parse
// position to just past the terminator.
//
// strTerminator - text that ends the segment
// strResult     - receives the extracted segment; untouched on failure
// Returns true if the terminator was found; false (position unchanged) if
// the position is outside the content or the terminator does not occur.
bool CWebResourceProvider::extractTo
  (CString strTerminator,
   CString& strResult)
{
  // The position must address a character of the content. The last character
  // counts too - the terminator may be the very last thing in the content.
  const long nLength = m_strContent.GetLength();
  if ((m_nIndex < 0) || (m_nIndex >= nLength))
    return (false);

  const long nEndSegment = findNoCase (m_strContent, strTerminator, m_nIndex);
  if (nEndSegment == (-1))
    return (false);

  strResult = m_strContent.Mid (m_nIndex, nEndSegment - m_nIndex);
  m_nIndex = nEndSegment + strTerminator.GetLength();
  return (true);
}
// Case sensitive analog of extractTo(): extracts the text between the
// current parse position and the next occurrence of strTerminator, then
// moves the parse position to just past the terminator.
//
// strTerminator - text that ends the segment
// strResult     - receives the extracted segment; untouched on failure
// Returns true if the terminator was found; false (position unchanged) if
// the position is outside the content or the terminator does not occur.
bool CWebResourceProvider::extractToExact
  (CString strTerminator,
   CString& strResult)
{
  // The position must address a character of the content. The last character
  // counts too - the terminator may be the very last thing in the content.
  const long nLength = m_strContent.GetLength();
  if ((m_nIndex < 0) || (m_nIndex >= nLength))
    return (false);

  const long nEndSegment = m_strContent.Find (strTerminator, m_nIndex);
  if (nEndSegment == (-1))
    return (false);

  strResult = m_strContent.Mid (m_nIndex, nEndSegment - m_nIndex);
  m_nIndex = nEndSegment + strTerminator.GetLength();
  return (true);
}
// Copies everything from the current parse position to the end of the
// content into strResult. The parse position is not moved.
//
// strResult - receives the remaining text; set to an empty string when the
//             position is at or beyond the end of the content, so callers
//             never see a stale value from an earlier extraction
void CWebResourceProvider::extractToEnd
  (CString& strResult)
{
  const long nLength = m_strContent.GetLength();

  // Every position that addresses a character qualifies, including the last
  // one - a single remaining character must not be dropped
  if ((m_nIndex >= 0) && (m_nIndex < nLength))
    strResult = m_strContent.Mid (m_nIndex);
  else
    strResult.Empty();
}
void CWebResourceProvider::getLinks
(CStringArray& documents,
CStringArray& images)
{
// Remove comments and JavaScript and fix links
CString strOriginalContent = m_strContent;
removeComments();
removeScripts();
m_strContent = replaceEvery (m_strContent, "'", "\"");
// Create URL prefix - this will be used to create fully qualified URLs
CString strUrlPrefix = m_strUrl;
long nSlash = strUrlPrefix.ReverseFind ('/');
if (nSlash > 7) {
strUrlPrefix = strUrlPrefix.Left (nSlash);
}
// Extract HREF targets - prepend base URL if extracted URL is relative
CString strUrl;
CString strPrefix;
while (skipTo ("href=\""))
if (extractTo ("\"", strUrl)) {
trim (strUrl);
if (strUrl.Find ("mailto:") == -1) {
if (strUrl.Find ("http://") != 0) {
if (strUrl.GetAt (0) == '/')
strUrl = strUrlPrefix + strUrl;
else {
CString strFormattedUrl;
strFormattedUrl.Format ("%s/%s", strUrlPrefix, strUrl);
strUrl = strFormattedUrl;
}
}
if (findStringInArray (documents, strUrl) == -1) {
documents.Add (strUrl);
}
}
}
// Extract SRC targets - prepend base URL if extracted URL is relative
resetIndex();
while (skipTo ("src=\""))
if (extractTo ("\"", strUrl)) {
trim (strUrl);
if (strUrl.Find ("http://") != 0) {
if (strUrl.GetAt (0) == '/')
strUrl = strUrlPrefix + strUrl;
else {
CString strFormattedUrl;
strFormattedUrl.Format ("%s/%s", strUrlPrefix, strUrl);
strUrl = strFormattedUrl;
}
}
if (findStringInArray (images, strUrl) == -1) {
images.Add (strUrl);
}
}
// Restore original content
m_strContent = strOriginalContent;
resetIndex();
}
// Moves the parse position back to the start of the content.
void CWebResourceProvider::resetIndex()
{
m_nIndex = 0;
}
// Returns a copy of strTarget in which every occurrence of strOccurence has
// been replaced by strWith.
//
// strTarget    - text to search
// strOccurence - text to replace; matched ignoring case. An empty string
//                matches nothing, so the target is returned unchanged.
// strWith      - replacement text, inserted exactly as given
//
// Occurrences are replaced left to right and do not overlap; text inserted
// as a replacement is never searched again.
CString CWebResourceProvider::replaceEvery
  (LPCTSTR strTarget,
   LPCTSTR strOccurence,
   LPCTSTR strWith)
{
  CString strSource (strTarget);
  CString strNeedle (strOccurence);

  // An empty search string would "match" at every position and never let
  // the scan advance
  if (strNeedle.IsEmpty())
    return strSource;

  // Search in lower-case copies, copy from the original-case text
  CString strSourceLower (strSource);
  strSourceLower.MakeLower();
  strNeedle.MakeLower();
  const int nNeedleLength = strNeedle.GetLength();

  // Build the result by appending untouched stretches and replacements in
  // turn; the searched string is never modified, so positions stay valid
  CString strOutput;
  int nCopyFrom = 0;
  int nMatch = 0;
  while ((nMatch = strSourceLower.Find (strNeedle, nCopyFrom)) != -1) {
    strOutput += strSource.Mid (nCopyFrom, nMatch - nCopyFrom);
    strOutput += strWith;
    nCopyFrom = nMatch + nNeedleLength;
  }

  // Whatever follows the last occurrence is kept as it is
  strOutput += strSource.Mid (nCopyFrom);
  return strOutput;
}
// Removes every HTML comment ("<!--" up to and including "-->") from the
// content and resets the parse position. If a comment is never closed,
// everything from its opening marker onwards is dropped.
void CWebResourceProvider::removeComments()
{
  // Get comment-free version of content
  resetIndex();
  CString strContentBody;
  CString strContentSegment;
  while (extractTo ("<!--", strContentSegment)) {
    strContentBody += strContentSegment;
    if (!skipTo ("-->")) {
      // Unterminated comment - keep only what came before it
      m_strContent = strContentBody;
      resetIndex();
      return;
    }
  }

  // Append whatever follows the last comment. The segment is cleared first
  // so that text from the last loop iteration is never appended a second
  // time when extractToEnd() has nothing to store.
  strContentSegment.Empty();
  extractToEnd (strContentSegment);
  strContentBody += strContentSegment;

  // Replace existing content
  m_strContent = strContentBody;
  resetIndex();
}
// Removes every script element ("<script" up to and including "/script>",
// matched ignoring case) from the content and resets the parse position. If
// a script element is never closed, everything from its opening tag onwards
// is dropped.
void CWebResourceProvider::removeScripts()
{
  // Get script-free version of content
  resetIndex();
  CString strContentBody;
  CString strContentSegment;
  while (extractTo ("<script", strContentSegment)) {
    strContentBody += strContentSegment;
    if (!skipTo ("/script>")) {
      // Unterminated script - keep only what came before it
      m_strContent = strContentBody;
      resetIndex();
      return;
    }
  }

  // Append whatever follows the last script. The segment is cleared first
  // so that text from the last loop iteration is never appended a second
  // time when extractToEnd() has nothing to store.
  strContentSegment.Empty();
  extractToEnd (strContentSegment);
  strContentBody += strContentSegment;

  // Replace existing content
  m_strContent = strContentBody;
  resetIndex();
}
// Reduces text that starts with "<a" or "<A" to what lies between the end of
// that opening tag and the next '<'. Text that does not start that way, or
// whose opening tag is never closed with '>', is left unchanged.
//
// strText - text to strip, modified in place
void CWebResourceProvider::removeEnclosingAnchorTag
  (CString& strText)
{
  // Only text that begins with an opening anchor tag is touched
  const bool bStartsWithAnchor =
    (strText.Find ("<a") == 0) || (strText.Find ("<A") == 0);
  if (!bStartsWithAnchor)
    return;

  // Drop the opening tag, up to and including its '>'
  const long nTagEnd = strText.Find ('>');
  if (nTagEnd == -1)
    return;
  strText = strText.Mid (nTagEnd + 1);

  // Cut at the next tag, if there is one
  const long nNextTag = strText.Find ('<');
  if (nNextTag != -1)
    strText = strText.Left (nNextTag);
}
// Strips one pair of enclosing double quotes from strText, if present.
//
// strText - text to strip, modified in place. Text shorter than two
//           characters cannot be enclosed in quotes and is left unchanged.
void CWebResourceProvider::removeEnclosingQuotes
  (CString& strText)
{
  // Need room for both an opening and a closing quote; this also keeps
  // GetAt() away from an empty string
  const long nLength = strText.GetLength();
  if (nLength < 2)
    return;

  if ((strText.GetAt (0) == '\"') && (strText.GetAt (nLength - 1) == '\"'))
    strText = strText.Mid (1, nLength - 2);
}
// Strips markup from strText in place.
//
// First every "<...>" tag is removed: the text between tags is collected in
// a scratch string and the tags themselves are skipped. If a '<' has no
// closing '>', scanning stops there and the text following that '<' is kept.
// Afterwards a fixed list of literal substitutions is applied through
// replaceEvery().
//
// NOTE(review): several substitution pairs below replace a string with
// itself (e.g. "&" with "&"), which is a no-op as written. The literals look
// like HTML entity names that were decoded somewhere along the way - confirm
// the intended search strings against the original distribution.
void CWebResourceProvider::removeHtml
(CString& strText)
{
// Remove all tags
CString strClean;
long nIndex = 0; // start of the text not yet copied
long nStartTag = 0;
while ((nStartTag = strText.Find ('<', nIndex)) != -1) {
// Extract to start of tag
CString strSubstring = strText.Mid (nIndex, (nStartTag - nIndex));
strClean += strSubstring;
nIndex = nStartTag + 1;
// Skip over tag
long nEndTag = strText.Find ('>', nIndex);
if (nEndTag == (-1))
break; // unterminated tag: the rest is gathered below
nIndex = nEndTag + 1;
}
// Gather remaining text
if (nIndex < strText.GetLength())
strClean += strText.Right (strText.GetLength() - nIndex);
strText = strClean;
strClean.Empty();
// Do some common replacements (each one is applied only if its search
// string occurs in the text)
if (strText.Find (" ") != (-1))
strText = replaceEvery (strText, " ", " ");
if (strText.Find ("&") != (-1))
strText = replaceEvery (strText, "&", "&");
if (strText.Find ("å") != (-1))
strText = replaceEvery (strText, "å", "");
if (strText.Find ("ä") != (-1))
strText = replaceEvery (strText, "ä", "");
if (strText.Find ("é") != (-1))
strText = replaceEvery (strText, "é", "");
if (strText.Find ("í") != (-1))
strText = replaceEvery (strText, "í", "");
if (strText.Find ("ì") != (-1))
strText = replaceEvery (strText, "ì", "");
if (strText.Find ("ò") != (-1))
strText = replaceEvery (strText, "ò", "");
if (strText.Find ("ö") != (-1))
strText = replaceEvery (strText, "ö", "");
if (strText.Find ("ß") != (-1))
strText = replaceEvery (strText, "ß", "");
if (strText.Find ("�") != (-1))
strText = replaceEvery (strText, "�", "#");
if (strText.Find ("'") != (-1))
strText = replaceEvery (strText, "'", "'");
}
// Strips leading and trailing whitespace from strText and turns every
// remaining line feed character into a space.
//
// strText - text to clean up, modified in place
void CWebResourceProvider::trim
  (CString& strText)
{
  strText.TrimLeft();
  strText.TrimRight();

  // '\012' is the line feed character; replace all of them in one pass
  strText.Replace ('\012', ' ');
}
/////////////////////////////////////////////////////////////////////////////
// Implementation
// Finds the first occurrence of a substring in a string, ignoring case.
//
// strString    - text to search
// strSubstring - text to look for; an empty substring matches at nStart
// nStart       - index in strString at which to start searching; expected
//                to be non-negative
// Returns the index of the first match at or after nStart, or -1 if the
// substring does not occur there.
long CWebResourceProvider::findNoCase
  (CString strString,
   CString strSubstring,
   long nStart)
{
  // strSubstring is this function's own copy, so it can be lower-cased in
  // place
  strSubstring.MakeLower();
  const long nStringLength = strString.GetLength();
  const long nSubstringLength = strSubstring.GetLength();

  // An empty substring matches wherever the search starts
  if (nSubstringLength == 0)
    return (nStart);

  // Try every start position that still leaves room for the whole substring
  for (long nCandidate = nStart;
       nCandidate + nSubstringLength <= nStringLength;
       nCandidate++) {
    long nMatched = 0;
    while (nMatched < nSubstringLength) {
      // tolower() is only defined for values representable as unsigned char,
      // so characters above 127 must not reach it as negative numbers
      const unsigned char chRaw =
        static_cast<unsigned char> (strString.GetAt (nCandidate + nMatched));
      const char chString = static_cast<char> (tolower (chRaw));
      if (chString != strSubstring.GetAt (nMatched))
        break;
      nMatched++;
    }

    // Every character of the substring matched at this position
    if (nMatched == nSubstringLength)
      return (nCandidate);
  }

  // The substring was not found
  return (-1);
}
// Looks for a string in an array using a case sensitive comparison.
//
// stringArray - array to search
// string      - text to look for
// Returns the index of the first matching element, or -1 if there is none.
long CWebResourceProvider::findStringInArray
  (CStringArray& stringArray,
   CString string)
{
  long nPosition = 0;
  while (nPosition < stringArray.GetSize()) {
    const bool bSame = (string.Compare (stringArray.GetAt (nPosition)) == 0);
    if (bSame)
      return nPosition;
    ++nPosition;
  }
  return -1;
}
// Fetches a URL into m_strContent and records the outcome.
//
// strUrl - URL to fetch
//
// POST requests (isPost() returns true) go through CAmHttpSocket with the
// data supplied by getPostData(); all other requests go through CWebGrab.
// On success m_dwFetchStatus is 0 and m_strFetchError is empty; on failure
// m_dwFetchStatus is 1 and m_strFetchError describes the problem.
void CWebResourceProvider::getContent
  (CString strUrl)
{
  // Initialize fetch status
  m_dwFetchStatus = 0;
  m_strFetchError.Empty();

  // If this is a POST request ...
  if (isPost()) {
    // Use CAmHttpSocket to fetch the content
    CString strPostData;
    getPostData (strPostData);
    CAmHttpSocket http;
    http.setAgent (m_strAgent);
    char* pContent = http.GetPage (strUrl, true, strPostData);
    m_strContent = pContent;

    // Set fetch status: only the 2xx class of HTTP status codes is a
    // success, so test the hundreds digit rather than the last two digits
    long nStatus = http.GetPageStatusCode();
    if ((nStatus / 100) != 2) {
      m_dwFetchStatus = 1;
      m_strFetchError.Format ("Error %ld fetching resource %s",
                              nStatus, static_cast<LPCTSTR> (strUrl));
    }
    return;
  }

  // Otherwise, initialize web grabber - return on error
  CWebGrab webGrab;
  if (!webGrab.Initialise (m_strAgent, m_pWndStatus)) {
    m_dwFetchStatus = 1;
    m_strFetchError = "Unable to initialize resource provider";
    return;
  }
  webGrab.SetForceReload (true);
  // webGrab.SetTimeOut (1000);

  // Fetch url into m_strContent and set fetch status
  BOOL bFetched = webGrab.GetFile (strUrl, m_strContent, m_strAgent, m_pWndStatus);
  webGrab.Close();
  if (!bFetched) {
    m_dwFetchStatus = 1;
    m_strFetchError = "Error fetching resource " + strUrl;
  }
}
// End WebResourceProvider.cpp