65.9K
CodeProject is changing. Read more.
Home

How to identify the different elements in the selection of web pages?

starIconstarIconstarIconstarIcon
emptyStarIcon
starIcon

4.64/5 (4 votes)

Mar 22, 2005

viewsIcon

73143

downloadIcon

1122

How to identify the different elements in the selection of web pages?

Introduction

First, I get an IHTMLDocument2 interface from the browser component and then read its selection property to obtain an IHTMLSelectionObject interface. Now comes the tough part: actually parsing the selected content. I had assumed that if I was able to create a control range using the createRange method of the selection, I would be able to get a list with the HTML tags and their attributes neatly separated.

Now, using IMarkupServices, I can enumerate all the elements in the selected portion of the web browser. Here is the code:

//File:IESelection.CPP

#include <afxwin.h>
#include <afxdisp.h>

#include <atlbase.h>
//You may derive a class from CComModule and use it
//if you want to override
//something, but do not change the name of _Module
extern CComModule _Module;
#include <atlcom.h>

///
#include <mshtml.h>
#include <MsHtmcid.h>

///
// Forward declarations of the helpers defined later in this file.

// Appends a one-line description of pElement to msg.
HRESULT PrintElement(IHTMLElement *pElement, CString &msg);

// Obtains the IMarkupServices interface for the document behind pDocument.
HRESULT GetMarkupServices(IDispatch *pDocument, IMarkupServices **pMarkupServices);

// Collects into msg a description of the elements found in the current
// selection of the document behind pDocument.
HRESULT EnumSelectionElements(IDispatch *pDocument, CString &msg);
///
/// Describes the elements covered by the current selection of an HTML document.
///
/// The selection is converted to a text range, two markup pointers are placed
/// at the range boundaries, and the start pointer is advanced block by block
/// until it passes the end pointer. At each stop the element that scopes the
/// pointer is described with PrintElement and the text is appended to msg.
///
/// @param pDocument  Document object; must support IHTMLDocument2.
/// @param msg        Receives the accumulated description. It is cleared on
///                   entry and may hold partial output if a later step fails.
/// @return S_OK on success, E_FAIL if any step fails (including a selection
///         that is not a text range).
HRESULT EnumSelectionElements(IDispatch * pDocument, CString &msg)
{
    msg.Empty();

    CComQIPtr<IHTMLDocument2> pDoc(pDocument);
    if (!pDoc)
        return E_FAIL;

    CComPtr<IHTMLSelectionObject> pSel;
    HRESULT hr = pDoc->get_selection(&pSel);
    if (hr != S_OK || !pSel)
        return E_FAIL;

    // createRange returns a plain IDispatch whose concrete type depends on the
    // kind of selection. Query for IHTMLTxtRange instead of casting so that a
    // non-text selection is rejected rather than misinterpreted.
    CComPtr<IDispatch> pRangeDisp;
    hr = pSel->createRange(&pRangeDisp);
    if (hr != S_OK || !pRangeDisp)
        return E_FAIL;

    CComQIPtr<IHTMLTxtRange> pRange(pRangeDisp.p);
    if (!pRange)
        return E_FAIL;

    CComPtr<IMarkupServices> pMarkupServices;
    hr = GetMarkupServices(pDocument, &pMarkupServices);
    if (hr != S_OK || !pMarkupServices)
        return E_FAIL;

    CComPtr<IMarkupPointer> pHtmlStart;
    hr = pMarkupServices->CreateMarkupPointer(&pHtmlStart);
    if (hr != S_OK || !pHtmlStart)
        return E_FAIL;

    CComPtr<IMarkupPointer> pHtmlEnd;
    hr = pMarkupServices->CreateMarkupPointer(&pHtmlEnd);
    if (hr != S_OK || !pHtmlEnd)
        return E_FAIL;

    hr = pMarkupServices->MovePointersToRange(pRange, pHtmlStart, pHtmlEnd);
    if (hr != S_OK)
        return E_FAIL;

    for (;;)
    {
        // Stop once the moving pointer has gone beyond the end of the range.
        BOOL bPastEnd = FALSE;
        hr = pHtmlStart->IsRightOf(pHtmlEnd, &bPastEnd);
        if (hr != S_OK)
            return E_FAIL;
        if (bPastEnd)
            break;

        // Declared inside the loop so every iteration starts with an empty
        // smart pointer before it is used as an out-parameter.
        CComPtr<IHTMLElement> pElement;
        hr = pHtmlStart->CurrentScope(&pElement);
        if (hr != S_OK)
            return E_FAIL;

        // A pointer that is not scoped by any element yields NULL; there is
        // nothing to describe at such a position.
        if (pElement)
        {
            CString ele_msg;
            if (FAILED(PrintElement(pElement, ele_msg)))
                return E_FAIL;
            msg += ele_msg;
        }

        // Advance to the next block. Any non-S_OK result ends the walk with an
        // error so a pointer that did not move can never cause an endless loop.
        hr = pHtmlStart->MoveUnit(MOVEUNIT_NEXTBLOCK);
        if (hr != S_OK)
            return E_FAIL;
    }

    return S_OK;
}

/// Obtains the IMarkupServices interface for an HTML document.
///
/// The document's parent window is queried for IServiceProvider, which is then
/// asked for the markup services of the HTML document.
///
/// @param pDocument        Document object; must support IHTMLDocument2.
/// @param pMarkupServices  Receives the interface on success; set to NULL on
///                         failure. The caller owns the returned reference.
/// @return S_OK on success, E_POINTER if pMarkupServices is NULL, otherwise
///         E_FAIL.
HRESULT GetMarkupServices(IDispatch *pDocument, 
        IMarkupServices ** pMarkupServices)
{
    if (!pMarkupServices)
        return E_POINTER;
    *pMarkupServices = NULL;

    CComQIPtr<IHTMLDocument2> pDoc(pDocument);
    if (!pDoc)
        return E_FAIL;

    CComPtr<IHTMLWindow2> pWindow;
    HRESULT hr = pDoc->get_parentWindow(&pWindow);
    if (hr != S_OK || !pWindow)
        return E_FAIL;

    CComQIPtr<IServiceProvider> pService(pWindow.p);
    if (!pService)
        return E_FAIL;

    hr = pService->QueryService(CLSID_HTMLDocument,
        IID_IMarkupServices,
        reinterpret_cast<void **>(pMarkupServices));
    if (hr != S_OK || !*pMarkupServices)
        return E_FAIL;

    return S_OK;
}

////////////////////////////////////////
/// Appends a textual description of an element and its descendants to msg.
///
/// The line for the element itself has the form
/// "tagName=<tag>[,innerText=<text>][,src=<url>]\n"; "src" is only present for
/// image elements with a non-empty source. The lines of the direct children
/// follow, each produced by a recursive call, so every descendant is reported
/// exactly once.
///
/// @param pElement  Element to describe.
/// @param msg       String the description is appended to. It is left
///                  untouched if the function fails.
/// @return S_OK on success, E_POINTER if pElement is NULL, otherwise the
///         failing HRESULT of the underlying MSHTML call.
HRESULT PrintElement(IHTMLElement *pElement, CString &msg)
{
    if (!pElement)
        return E_POINTER;

    CComBSTR bstrTagName;
    HRESULT hr = pElement->get_tagName(&bstrTagName);
    if (FAILED(hr))
        return hr;

    CComBSTR bstrinnerText;
    hr = pElement->get_innerText(&bstrinnerText);
    if (FAILED(hr))
        return hr;

    // Only image elements expose a "src" to report.
    CComBSTR bstrSrc;
    CComQIPtr<IHTMLImgElement> pImg(pElement);
    if (pImg)
    {
        hr = pImg->get_src(&bstrSrc);
        if (FAILED(hr))
            return hr;
    }

    // Build the line by concatenation: no smart-pointer object is passed
    // through a varargs call and the code works in ANSI and Unicode builds.
    CString ele_msg(_T("tagName="));
    ele_msg += CString(bstrTagName.m_str);
    if (bstrinnerText.Length())
    {
        ele_msg += _T(",innerText=");
        ele_msg += CString(bstrinnerText.m_str);
    }
    if (bstrSrc.Length())
    {
        ele_msg += _T(",src=");
        ele_msg += CString(bstrSrc.m_str);
    }
    ele_msg += _T("\n");

    // Recurse over the direct children only. The "all" collection already
    // contains every descendant, so recursing over it would report nested
    // elements repeatedly.
    CComPtr<IDispatch> pChildrenDisp;
    hr = pElement->get_children(&pChildrenDisp);
    if (FAILED(hr))
        return hr;

    CComQIPtr<IHTMLElementCollection> pChildren(pChildrenDisp.p);
    if (pChildren)
    {
        long count = 0;
        hr = pChildren->get_length(&count);
        if (FAILED(hr))
            return hr;

        for (long i = 0; i < count; ++i)
        {
            CComVariant index(i);
            CComPtr<IDispatch> pdisp;
            hr = pChildren->item(index, index, &pdisp);
            if (FAILED(hr))
                return hr;

            CComQIPtr<IHTMLElement> pitem(pdisp.p);
            if (!pitem)
                continue;

            // Best effort: a child that cannot be described is skipped and
            // contributes nothing to the output.
            PrintElement(pitem, ele_msg);
        }
    }

    msg += ele_msg;

    // The text originates from the web page, so it must be passed as an
    // argument and never used as the format string itself.
    TRACE(_T("%s"), static_cast<LPCTSTR>(ele_msg));
    return S_OK;
}