Click here to Skip to main content
11,490,973 members (66,773 online)
Click here to Skip to main content
Add your own
alternative version

WebResourceProvider

Ravi Bhavnani, 23 Mar 2007 CPOL 259.3K 1.8K 143
A framework to allow public web services to be used as objects in your application.
// WebResourceProvider.cpp : implementation file
//
// Written by Ravi Bhavnani <ravib@ravib.com>
// Copyright (c) 2002.  All Rights Reserved.
//
// This code may be used in compiled form in any way you desire. This
// file may be redistributed unmodified by any means PROVIDING it is 
// not sold for profit without the author's written consent, and 
// providing that this notice and the author's name and all copyright 
// notices remains intact. 
//
// An email letting me know how you are using it would be nice as well. 
//
// This code is provided "as is" with no expressed or implied warranty.
// The author accepts no liability for any damage/loss of business that
// this product may cause.
//
// A description of this object and instructions on how to use it are
// provided at http://www.codeproject.com/internet/WebResourceProvider.asp.
//
// Revision history:
//
// 07 Aug 2002 - Bug fix: Fixed computation error in extractToEnd().
//               Bug fix: Added missing call to init() to fetchResource().
//               Added methods skipBackTo() and skipBackToExact().
//
// 08 May 2002 - Added methods urlExists(), getLinks(), removeComments(),
//               removeScripts(), findNoCase() and findStringInArray().
//             - Modified at(), skipTo(), and extractTo() to be case insensitive.
//             - Added case sensitive analogs atExact(), skipToExact() and
//               extractToExact().
//
// 29 Apr 2002 - Initial version
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "AmHttpSocket.h"
#include "WebGrab.h"
#include "WebResourceProvider.h"

#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif

//////////////////////////////////////////////////////////////////////
// Construction/Destruction

// Puts a new provider into its idle state: no status window pointer to hand
// to CWebGrab, the parse cursor at the start of the content, and a fetch
// status of 0 (the value getContent() leaves in place when nothing failed).
CWebResourceProvider::CWebResourceProvider()
{
  m_pWndStatus = NULL;
  m_nIndex = 0;
  m_dwFetchStatus = 0;
}

// Nothing to release explicitly: this file never allocates anything that the
// destructor would have to free.
CWebResourceProvider::~CWebResourceProvider()
{
}

/////////////////////////////////////////////////////////////////////////////
// Public methods (API)

// Fetches and parses the provider's resource.
//
// The previous fetch status, error text and content are cleared and the fetch
// time is stamped first.  If init() declines, nothing is fetched.  Otherwise
// the URL is (re)built, downloaded and parsed at least once, and the cycle
// repeats for as long as moreAvailable() reports further content.
void CWebResourceProvider::fetchResource()
{
  // Forget everything about the previous fetch
  m_tmFetchTime = CTime::GetCurrentTime();
  m_strContent.Empty();
  m_strFetchError.Empty();
  m_dwFetchStatus = 0;

  // Give the provider a chance to set itself up (or to veto the fetch)
  if (init()) {
      for (;;) {
          // Build the URL for this round
          constructUrl (m_strUrl);

          // Download it and hand the content to the parser, cursor at 0
          resetIndex();
          getContent (m_strUrl);
          parseContent();

          if (!moreAvailable())
             break;
      }
  }
}

// Reports whether CWebGrab can obtain file information for strUrl.
//
// Returns false if the grabber cannot be initialised or if GetFileInfo()
// fails; the modification date, size and server error it reports are
// discarded.
bool CWebResourceProvider::urlExists
  (CString strUrl)
{
  CWebGrab  grabber;
  if (grabber.Initialise ("", NULL)) {
      grabber.SetForceReload (true);

      // Only the success flag is of interest; the out-values are ignored
      CString strLastModified;
      DWORD   dwFileSize;
      DWORD   dwErrorCode;
      BOOL bFound = grabber.GetFileInfo (strUrl, strLastModified, dwFileSize, dwErrorCode);
      return (bFound != FALSE);
  }
  return (false);
}

/////////////////////////////////////////////////////////////////////////////
// Methods overridden by derived classes

// Default initialisation hook: always succeeds.  fetchResource() calls this
// before fetching and abandons the fetch when it returns false.
bool CWebResourceProvider::init()
{
  return true;
}

// Default request-type hook: not a POST.  getContent() fetches through
// CAmHttpSocket (with data from getPostData()) when this returns true and
// through CWebGrab otherwise.
bool CWebResourceProvider::isPost()
{
  return false;
}

// Default POST-data hook: leaves strPostData untouched.  getContent() calls
// this only when isPost() returns true and passes the resulting string on to
// CAmHttpSocket::GetPage().
void CWebResourceProvider::getPostData
  (CString& strPostData)
{
}

// Default continuation hook: a single fetch is enough.  fetchResource()
// repeats its construct/fetch/parse cycle while this returns true.
bool CWebResourceProvider::moreAvailable()
{
  return false;
}

// Default parsing hook: does nothing.  fetchResource() calls this after each
// getContent(), with the parse cursor reset to the start of the content.
void CWebResourceProvider::parseContent()
{
}

/////////////////////////////////////////////////////////////////////////////
// Helper methods used by derived classes (see WebResourceProvider.h for
// descriptions)

// Returns true when strText occurs (ignoring case) exactly at the parse
// cursor, i.e. when the first case-insensitive match at or after the cursor
// starts on the cursor itself.  The cursor is not moved.
bool CWebResourceProvider::at
  (CString strText)
{
  const long nFirstMatch = findNoCase (m_strContent, strText, m_nIndex);
  return (nFirstMatch == m_nIndex);
}

// Case-sensitive analog of at(): returns true when the first occurrence of
// strText at or after the parse cursor starts on the cursor itself.  The
// cursor is not moved.
bool CWebResourceProvider::atExact
  (CString strText)
{
  const long nFirstMatch = m_strContent.Find (strText, m_nIndex);
  return (nFirstMatch == m_nIndex);
}

// Moves the parse cursor just past the next case-insensitive occurrence of
// strText.  Returns false, leaving the cursor where it was, when there is no
// such occurrence at or after the cursor.
bool CWebResourceProvider::skipTo
  (CString strText)
{
  const long nMatch = findNoCase (m_strContent, strText, m_nIndex);
  const bool bFound = (nMatch != -1);
  if (bFound)
     m_nIndex = nMatch + strText.GetLength();
  return bFound;
}

// Case-sensitive analog of skipTo(): moves the parse cursor just past the
// next occurrence of strText.  Returns false, leaving the cursor where it
// was, when there is no such occurrence at or after the cursor.
bool CWebResourceProvider::skipToExact
  (CString strText)
{
  const long nMatch = m_strContent.Find (strText, m_nIndex);
  const bool bFound = (nMatch != -1);
  if (bFound)
     m_nIndex = nMatch + strText.GetLength();
  return bFound;
}

// Moves the parse cursor backwards to the start of the nearest preceding
// case-insensitive occurrence of strText.
//
// Candidate positions run from one before the current cursor down to 0.
// Returns true with the cursor ON the first character of the match, or false
// with the cursor restored to where it was.
//
// The search never probes position -1.  Doing so would read before the start
// of the content, and because "not found" is also reported as -1 the
// comparison inside at() could succeed and report a bogus match with the
// cursor left at -1.
bool CWebResourceProvider::skipBackTo
  (CString strText)
{
  const long nSavedIndex = m_nIndex;
  for (m_nIndex = nSavedIndex - 1; m_nIndex >= 0; m_nIndex--) {
      if (at (strText))
         return (true);
  }

  // Ran off the front of the content without a match
  m_nIndex = nSavedIndex;
  return (false);
}

// Case-sensitive analog of skipBackTo(): moves the parse cursor backwards to
// the start of the nearest preceding occurrence of strText.
//
// Candidate positions run from one before the current cursor down to 0.
// Returns true with the cursor ON the first character of the match, or false
// with the cursor restored to where it was.
//
// The search never probes position -1.  Searching from a negative start is
// out of range, and because "not found" is also reported as -1 the
// comparison inside atExact() could succeed and report a bogus match with
// the cursor left at -1.
bool CWebResourceProvider::skipBackToExact
  (CString strText)
{
  const long nSavedIndex = m_nIndex;
  for (m_nIndex = nSavedIndex - 1; m_nIndex >= 0; m_nIndex--) {
      if (atExact (strText))
         return (true);
  }

  // Ran off the front of the content without a match
  m_nIndex = nSavedIndex;
  return (false);
}

// Extracts the text between the parse cursor and the next case-insensitive
// occurrence of strTerminator.
//
// On success strResult receives the text (without the terminator), the
// cursor is moved just past the terminator and true is returned.  When the
// terminator is not found, or the cursor is outside the content, false is
// returned and neither strResult nor the cursor is changed.
//
// The cursor may sit on the very last character: a one-character terminator
// there is found and yields an empty result.  (The previous guard compared
// against length - 1 and so never looked at the final character.)
bool CWebResourceProvider::extractTo
  (CString  strTerminator,
   CString& strResult)
{
  const long nLength = m_strContent.GetLength();
  if ((m_nIndex < 0) || (m_nIndex >= nLength))
     return (false);

  const long nEndSegment = findNoCase (m_strContent, strTerminator, m_nIndex);
  if (nEndSegment == (-1))
     return (false);

  strResult = m_strContent.Mid (m_nIndex, nEndSegment - m_nIndex);
  m_nIndex = nEndSegment + strTerminator.GetLength();
  return (true);
}

// Case-sensitive analog of extractTo(): extracts the text between the parse
// cursor and the next occurrence of strTerminator.
//
// On success strResult receives the text (without the terminator), the
// cursor is moved just past the terminator and true is returned.  When the
// terminator is not found, or the cursor is outside the content, false is
// returned and neither strResult nor the cursor is changed.
//
// The cursor may sit on the very last character: a one-character terminator
// there is found and yields an empty result.  (The previous guard compared
// against length - 1 and so never looked at the final character.)
bool CWebResourceProvider::extractToExact
  (CString  strTerminator,
   CString& strResult)
{
  const long nLength = m_strContent.GetLength();
  if ((m_nIndex < 0) || (m_nIndex >= nLength))
     return (false);

  const long nEndSegment = m_strContent.Find (strTerminator, m_nIndex);
  if (nEndSegment == (-1))
     return (false);

  strResult = m_strContent.Mid (m_nIndex, nEndSegment - m_nIndex);
  m_nIndex = nEndSegment + strTerminator.GetLength();
  return (true);
}

// Copies everything from the parse cursor to the end of the content into
// strResult.  The cursor is not moved.
//
// strResult is ALWAYS assigned: it becomes empty when the cursor is at or
// beyond the end of the content (or negative).  Previously it was left
// untouched in that case, so callers that reuse one buffer - as
// removeComments() and removeScripts() do - appended stale text, and a
// single trailing character was never returned because the guard compared
// against length - 1.
void CWebResourceProvider::extractToEnd
  (CString& strResult)
{
  const long nLength = m_strContent.GetLength();
  if ((m_nIndex >= 0) && (m_nIndex < nLength))
     strResult = m_strContent.Mid (m_nIndex, nLength - m_nIndex);
  else
     strResult.Empty();
}

void CWebResourceProvider::getLinks
  (CStringArray& documents,
   CStringArray& images)
{
  // Remove comments and JavaScript and fix links
  CString strOriginalContent = m_strContent;
  removeComments();
  removeScripts();
  m_strContent = replaceEvery (m_strContent, "'", "\"");

  // Create URL prefix - this will be used to create fully qualified URLs
  CString strUrlPrefix = m_strUrl;
  long nSlash = strUrlPrefix.ReverseFind ('/');
  if (nSlash > 7) {
      strUrlPrefix = strUrlPrefix.Left (nSlash);
  }

  // Extract HREF targets - prepend base URL if extracted URL is relative
  CString strUrl;
  CString strPrefix;
  while (skipTo ("href=\""))
         if (extractTo ("\"", strUrl)) {
              trim (strUrl);
              if (strUrl.Find ("mailto:") == -1) {
                  if (strUrl.Find ("http://") != 0) {
                      if (strUrl.GetAt (0) == '/')
                         strUrl = strUrlPrefix + strUrl;
                      else {
                         CString strFormattedUrl;
                         strFormattedUrl.Format ("%s/%s", strUrlPrefix, strUrl);
                         strUrl = strFormattedUrl;
                      }
                  }
                  if (findStringInArray (documents, strUrl) == -1) {
                      documents.Add (strUrl);
                  }
              }
         }

  // Extract SRC targets - prepend base URL if extracted URL is relative
  resetIndex();
  while (skipTo ("src=\""))
         if (extractTo ("\"", strUrl)) {
              trim (strUrl);
              if (strUrl.Find ("http://") != 0) {
                  if (strUrl.GetAt (0) == '/')
                     strUrl = strUrlPrefix + strUrl;
                  else {
                     CString strFormattedUrl;
                     strFormattedUrl.Format ("%s/%s", strUrlPrefix, strUrl);
                     strUrl = strFormattedUrl;
                  }
              }
              if (findStringInArray (images, strUrl) == -1) {
                  images.Add (strUrl);
              }
         }

  // Restore original content
  m_strContent = strOriginalContent;
  resetIndex();
}

// Moves the parse cursor back to the start of the content.
void CWebResourceProvider::resetIndex()
{
  m_nIndex = 0;
}

// Returns a copy of strTarget in which every occurrence of strOccurence,
// matched WITHOUT regard to case, has been replaced by strWith.
//
//   strTarget    - text to search
//   strOccurence - text to look for; when empty (or NULL) the target is
//                  returned unchanged, since an empty pattern matches
//                  everywhere and the loop would otherwise never end
//   strWith      - replacement text; may be empty, NULL is treated as empty
//
// Replacement text is never rescanned, so a replacement that contains the
// search text does not cause runaway expansion.
CString CWebResourceProvider::replaceEvery
  (LPCTSTR  strTarget,
   LPCTSTR  strOccurence,
   LPCTSTR  strWith)
{
  CString output (strTarget);

  // Nothing to look for
  const int nOccurenceLength = lstrlen (strOccurence);
  if (nOccurenceLength == 0)
     return output;

  if (strWith == NULL)
     strWith = "";
  const int nWithLength = lstrlen (strWith);

  // Lowercase versions to search in
  CString input_lower (strTarget);
  CString oldone_lower (strOccurence);
  input_lower.MakeLower();
  oldone_lower.MakeLower();

  // Search in the lowercase versions, replace in the original-case version.
  // Both strings receive the same edit so that their offsets stay aligned.
  int pos = 0;
  while ((pos = input_lower.Find (oldone_lower, pos)) != -1) {
      input_lower.Delete (pos, nOccurenceLength);
      input_lower.Insert (pos, strWith);

      output.Delete (pos, nOccurenceLength);
      output.Insert (pos, strWith);

      // Continue after the text just inserted
      pos += nWithLength;
  }

  return output;
}

// Replaces the content with a copy from which every "<!-- ... -->" comment
// has been removed, then resets the parse cursor.
//
// If a comment is opened but never closed, everything from its opening
// marker onwards is dropped.
void CWebResourceProvider::removeComments()
{
  // Get comment-free version of content
  resetIndex();
  CString strContentBody;
  CString strContentSegment;
  while (extractTo ("<!--", strContentSegment)) {
      strContentBody += strContentSegment;
      if (!skipTo ("-->")) {
          // Unterminated comment - keep only what preceded it
          m_strContent = strContentBody;
          resetIndex();
          return;
      }
  }

  // Append whatever follows the last comment.  The buffer is cleared first
  // because extractToEnd() may leave it untouched when nothing remains, in
  // which case the segment appended above would be added a second time.
  strContentSegment.Empty();
  extractToEnd (strContentSegment);
  strContentBody += strContentSegment;

  // Replace existing content
  m_strContent = strContentBody;
  resetIndex();
}

// Replaces the content with a copy from which everything between "<script"
// and the following "/script>" (matched without regard to case) has been
// removed, then resets the parse cursor.
//
// If a script element is opened but never closed, everything from its
// opening tag onwards is dropped.
void CWebResourceProvider::removeScripts()
{
  // Get script-free version of content
  resetIndex();
  CString strContentBody;
  CString strContentSegment;
  while (extractTo ("<script", strContentSegment)) {
      strContentBody += strContentSegment;
      if (!skipTo ("/script>")) {
          // Unterminated script - keep only what preceded it
          m_strContent = strContentBody;
          resetIndex();
          return;
      }
  }

  // Append whatever follows the last script.  The buffer is cleared first
  // because extractToEnd() may leave it untouched when nothing remains, in
  // which case the segment appended above would be added a second time.
  strContentSegment.Empty();
  extractToEnd (strContentSegment);
  strContentBody += strContentSegment;

  // Replace existing content
  m_strContent = strContentBody;
  resetIndex();
}

// If strText begins with "<a" or "<A", reduces it to the text that lies
// between the end of that opening tag and the next '<'.
//
// Text that does not begin that way, or whose opening tag has no '>', is left
// unchanged.  When no further '<' follows, everything after the opening tag
// is kept.
void CWebResourceProvider::removeEnclosingAnchorTag
  (CString& strText)
{
  const bool bStartsWithAnchor =
    (strText.Find ("<a") == 0) || (strText.Find ("<A") == 0);
  if (!bStartsWithAnchor)
     return;

  // Drop the opening tag
  const long nTagEnd = strText.Find ('>');
  if (nTagEnd == (-1))
     return;
  strText = strText.Mid (nTagEnd + 1);

  // Drop the next tag and whatever follows it
  const long nNextTag = strText.Find ('<');
  if (nNextTag != (-1))
     strText = strText.Left (nNextTag);
}

// Strips one pair of double quotes from strText when it both starts and ends
// with one; otherwise strText is left unchanged.
//
// At least two characters are required.  This keeps an empty string from
// being indexed at 0 and -1, and keeps a lone quote character from being
// treated as both the opening and the closing quote.
void CWebResourceProvider::removeEnclosingQuotes
  (CString& strText)
{
  const long nLength = strText.GetLength();
  if (nLength < 2)
     return;
  if ((strText.GetAt (0) == '\"') && (strText.GetAt (nLength - 1) == '\"'))
     strText = strText.Mid (1, nLength - 2);
}

// Strips markup from strText in place.
//
// First everything from a '<' through the following '>' is removed.  If a
// '<' has no closing '>', only the '<' itself is dropped and the rest of the
// text is kept.  Then a small set of character entities is replaced:
//
//   &nbsp;                    -> space
//   accented-letter entities  -> removed
//   &#0;                      -> '#'  (mapping kept as found - TODO confirm
//                                      that '#' is really what was intended)
//   &#39;                     -> '\''
//   &amp;                     -> '&'
//
// The numeric references are matched INCLUDING their terminating ';', so no
// stray semicolon is left behind and longer references such as "&#039;" are
// not mangled.  "&amp;" is decoded last so that text produced by decoding it
// (e.g. "&amp;nbsp;" -> "&nbsp;") is not decoded a second time.
void CWebResourceProvider::removeHtml
  (CString& strText)
{
  // Remove all tags
  CString strClean;
  long nIndex = 0;
  long nStartTag = 0;
  while ((nStartTag = strText.Find ('<', nIndex)) != -1) {

      // Keep the text in front of the tag
      strClean += strText.Mid (nIndex, (nStartTag - nIndex));
      nIndex = nStartTag + 1;

      // Skip over tag
      long nEndTag = strText.Find ('>', nIndex);
      if (nEndTag == (-1))
         break;
      nIndex = nEndTag + 1;
  }

  // Gather remaining text
  if (nIndex < strText.GetLength())
     strClean += strText.Right (strText.GetLength() - nIndex);
  strText = strClean;

  // Do some common replacements ("&amp;" must stay last, see above)
  static const struct {
    LPCTSTR strEntity;
    LPCTSTR strReplacement;
  } replacements[] = {
    { "&nbsp;",   " "  },
    { "&aring;",  ""   },
    { "&auml;",   ""   },
    { "&eacute;", ""   },
    { "&iacute;", ""   },
    { "&igrave;", ""   },
    { "&ograve;", ""   },
    { "&ouml;",   ""   },
    { "&szlig;",  ""   },
    { "&#0;",     "#"  },
    { "&#39;",    "'"  },
    { "&amp;",    "&"  },
  };
  const int nReplacements =
    static_cast<int> (sizeof (replacements) / sizeof (replacements[0]));
  for (int nEntry = 0; nEntry < nReplacements; nEntry++) {
      // Cheap presence test first, so that the copy made by replaceEvery()
      // is only paid for when there is something to replace
      if (strText.Find (replacements[nEntry].strEntity) != (-1))
         strText = replaceEvery (strText,
                                 replacements[nEntry].strEntity,
                                 replacements[nEntry].strReplacement);
  }
}

// Trims leading and trailing whitespace from strText and then turns every
// remaining line feed (octal 012) into a space.
void CWebResourceProvider::trim
  (CString& strText)
{
  strText.TrimLeft();
  strText.TrimRight();

  // Each replaced position no longer holds a line feed, so the search can
  // simply resume right behind it
  for (long nLineFeed = strText.Find ('\012');
       nLineFeed != (-1);
       nLineFeed = strText.Find ('\012', nLineFeed + 1)) {
      strText.SetAt (nLineFeed, ' ');
  }
}

/////////////////////////////////////////////////////////////////////////////
// Implementation

// Case-insensitive substring search.
//
//   strString    - text to search
//   strSubstring - text to look for
//   nStart       - position in strString at which to start looking; negative
//                  values are treated as 0 instead of indexing in front of
//                  the string
//
// Returns the position of the first occurrence of strSubstring at or after
// nStart, or -1 if there is none.  An empty strSubstring matches immediately
// and yields the (adjusted) start position.
long CWebResourceProvider::findNoCase
  (CString  strString,
   CString  strSubstring,
   long     nStart)
{
  // Get lower-case version of substring
  CString strSubstringLCase = strSubstring;
  strSubstringLCase.MakeLower();

  const long nStringLength = strString.GetLength();
  const long nSubstringLength = strSubstringLCase.GetLength();
  if (nStart < 0)
     nStart = 0;

  // An empty substring is found right where the search starts
  if (nSubstringLength == 0)
     return (nStart);

  // Try every position at which the whole substring still fits
  for (long nCandidate = nStart;
       nCandidate + nSubstringLength <= nStringLength;
       nCandidate++) {

      long nMatched = 0;
      while (nMatched < nSubstringLength) {
          // tolower() is only defined for values representable as unsigned
          // char (or EOF); web content routinely contains bytes above 0x7F,
          // which are negative when char is signed
          const char chString = static_cast<char> (tolower (
            static_cast<unsigned char> (strString.GetAt (nCandidate + nMatched))));
          if (chString != strSubstringLCase.GetAt (nMatched))
             break;
          nMatched++;
      }

      if (nMatched == nSubstringLength)
         return (nCandidate);
  }

  // Otherwise indicate that the substring was not found
  return (-1);
}

// Looks for an element of stringArray that compares equal (case-sensitively)
// to string.  Returns the index of the first such element, or -1 when the
// array holds none.
long CWebResourceProvider::findStringInArray
  (CStringArray&  stringArray,
   CString        string)
{
  long nPosition = 0;
  while (nPosition < stringArray.GetSize()) {
      if (string.Compare (stringArray.GetAt (nPosition)) == 0)
         return nPosition;
      ++nPosition;
  }
  return -1;
}

// Downloads strUrl into m_strContent and records the outcome.
//
// m_dwFetchStatus is 0 and m_strFetchError empty on success; on failure the
// status is 1 and the error text describes the problem.  When isPost()
// returns true the request is sent through CAmHttpSocket with the data
// supplied by getPostData(); otherwise CWebGrab fetches the URL.
void CWebResourceProvider::getContent
  (CString strUrl)
{
  // Initialize fetch status
  m_dwFetchStatus = 0;
  m_strFetchError.Empty();

  // If this is a POST request ...
  if (isPost()) {

      // Use CAmHttpSocket to fetch the content
      CString   strPostData;
      getPostData (strPostData);
      CAmHttpSocket http;
      http.setAgent (m_strAgent);
      // NOTE(review): the returned buffer is copied and not freed here -
      // presumably CAmHttpSocket owns it; confirm against AmHttpSocket.h
      char* pContent = http.GetPage (strUrl, true, strPostData);
      m_strContent = pContent;

      // Set fetch status - anything outside the 2xx class is an error.
      // The class is the hundreds digit (status / 100); testing status % 100
      // would report a plain "200 OK" as a failure.
      const long nStatus = http.GetPageStatusCode();
      if ((nStatus / 100) != 2) {
         m_dwFetchStatus = 1;
         m_strFetchError.Format ("Error %ld fetching resource %s",
                                 nStatus, static_cast<LPCTSTR> (strUrl));
      }

  } else {
    // Otherwise, initialize web grabber - return on error
    CWebGrab  webGrab;
    if (!webGrab.Initialise (m_strAgent, m_pWndStatus))
       {
         m_dwFetchStatus = 1;
         m_strFetchError = "Unable to initialize resource provider";
         return;
       }
    webGrab.SetForceReload (true);
    // webGrab.SetTimeOut (1000);

    // Fetch url into m_strContent and set fetch status
    BOOL bFetched = webGrab.GetFile (strUrl, m_strContent, m_strAgent, m_pWndStatus);
    webGrab.Close();
    if (!bFetched)
       {
         m_dwFetchStatus = 1;
         m_strFetchError = "Error fetching resource " + strUrl;
       }
  }
}

// End WebResourceProvider.cpp

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Share

About the Author

Ravi Bhavnani
Technical Lead
Canada
Ravi Bhavnani is an ardent fan of Microsoft technologies who loves building Windows apps, especially PIMs, system utilities, and things that go bump on the Internet. During his career, Ravi has developed expert systems, desktop imaging apps, marketing automation software, EDA tools, a platform to help people find, analyze and understand information, trading software for institutional investors and advanced data visualization solutions. He currently works for a company that provides enterprise workforce management solutions to large clients.

His interests include the .NET framework, reasoning systems, financial analysis and algorithmic trading, NLP, HCI and UI design. Ravi holds a BS in Physics and Math and an MS in Computer Science and was a Microsoft MVP (C++ and C# in 2006 and 2007). He is also the co-inventor of 2 patents on software security and generating data visualization dashboards. His claim to fame is that he crafted CodeProject's "joke" forum post icon.

Ravi's biggest fear is that one day he might actually get a life, although the chances of that happening seem extremely remote.
Follow on   Google+   LinkedIn

| Advertise | Privacy | Terms of Use | Mobile
Web02 | 2.8.150520.1 | Last Updated 23 Mar 2007
Article Copyright 2002 by Ravi Bhavnani
Everything else Copyright © CodeProject, 1999-2015
Layout: fixed | fluid