views:

425

answers:

3

Hi, I've got an IE BHO plugin that sends out via a COM call the HTML of a page that was loaded in the window.

// Note all error handling removed for readability :)
// Returns, through pbstrHTML, the outerHTML of the document element of the
// page currently loaded in m_spWebBrowser.
//
// lMaxSize  - not consulted by this excerpt.
// pbstrHTML - receives a newly allocated BSTR copy of the markup.
//
// Returns the HRESULT of the final copy into pbstrHTML. Checks on the
// intermediate COM calls are intentionally left out of this excerpt.
STDMETHODIMP CPlugin::get_HTML(long lMaxSize, BSTR *pbstrHTML)
{
    CComPtr<IDispatch> pDispatch;
    MSHTML::IHTMLDocument3Ptr pDocument3 = NULL;

    HRESULT hr = m_spWebBrowser->get_Document(&pDispatch);
    hr = pDispatch->QueryInterface(IID_IHTMLDocument3, (void**)&pDocument3);

    // Serialising the DOM from its root element is the step the author
    // reports as taking minutes on very large pages.
    MSHTML::IHTMLElementPtr pRoot = pDocument3->documentElement;
    wstring strHTML = pRoot->outerHTML;

    CComBSTR bstrHTML = strHTML.c_str();
    hr = bstrHTML.CopyTo(pbstrHTML);
    return hr;
}

However when it encounters a very large page (e.g. "http://sitemap.zillow.com/uncompressed/ForSale_Hood_MedPri_1.xml"), it takes 3 minutes to create the HTML from the DOM.

Is there a way to access the raw HTML/XML?

When you do a 'view page source' in IE, it pops up almost immediately, so internally IE must be using some API that can do what I want.

Thanks,
Shane.

+1  A: 

Yes, you can QI for IPersistStream and save to a memory stream created by CreateStreamOnHGlobal. Note that the document must have finished downloading (its ready state needs to be complete).

Sheng Jiang 蒋晟
Ah, thank you for the pointer Sheng Jiang, I'll update this question when I have some working code.
Shane
I tried: CComPtr<IDispatch> pDispatch; MSHTML::IHTMLDocumentPtr pDocument = NULL; hr = m_spWebBrowser->get_Document(&pDispatch); hr = pDispatch->QueryInterface(IID_IHTMLDocument, (void**)&pDocument); IPersistStreamInitPtr persistStream = pDocument; IStreamPtr stream = NULL; hr = CreateStreamOnHGlobal(NULL, TRUE, &stream); hr = persistStream->Save(stream, FALSE); But that took the same 3 minutes to execute the 'Save'. So I'm guessing that this mechanism also interacts with the DOM, because doing a 'view page source' in IE still pops up the window almost immediately.
Shane
Strange. The returning data is the original data. See if you can dig the file out of IE's cache http://support.microsoft.com/kb/172607.
Sheng Jiang 蒋晟
+2  A: 

It seems that in old versions of MSHTML, outerHTML had O(n^2) performance. However, in newer versions (IE8) this problem is gone. If you have a choice, use IE8 or later.

Otherwise, using IPersistStream::Save is an option. But CreateStreamOnHGlobal won't help you since its implementation is also O(n^2). You'll have to use a custom IStream for that.

Included is an IStream implementation which was made for this purpose and supports quick writes:

#include <atlbase.h>
#include <atlcom.h>
#include <vector>

// MyStream: a write-only IStream whose Write() is declared here and defined
// out of line; written data is collected in the public `buf` member.
//
// Motivation given by the author: the stream returned by
// CreateStreamOnHGlobal copes badly with repeated resizes (N writes seem to
// take O(N^2) time), so a std::vector-backed sink is used instead.
//
// Every IStream method other than Write is inherited from IStreamImpl.
class MyStream :
    public CComObjectRootEx<CComSingleThreadModel>,
    public CComCoClass<MyStream>,
    public IStreamImpl
{
public:

    // Only IStream is exposed through QueryInterface.
BEGIN_COM_MAP(MyStream)
    COM_INTERFACE_ENTRY(IStream)
END_COM_MAP()

    // Receives cb bytes starting at pv; see the out-of-line definition.
    STDMETHOD(Write) (const void * pv, ULONG cb, ULONG *pcbWritten);

    // Bytes received so far, in the order they were written.
    std::vector<char> buf;
};
/*

Usage:

    CComPtr<IStream> stream;
    hr = MyStream::CreateInstance(&stream);
    // streamObj will be valid as long as IStream smart pointer lives
    MyStream *streamObj = (MyStream*)stream.p;
 */


// Appends cb bytes starting at pv to the end of buf.
//
// pv         - source bytes; must be non-null when cb is non-zero.
// cb         - number of bytes to append.
// pcbWritten - optional; when non-null it receives the number of bytes
//              actually appended (cb on success, 0 on failure).
//
// Returns S_OK on success, STG_E_INVALIDPOINTER when pv is null with a
// non-zero cb, and E_OUTOFMEMORY when the buffer could not be grown.
STDMETHODIMP MyStream::Write(const void * pv, ULONG cb, ULONG *pcbWritten) 
{
    // Callers are allowed to inspect *pcbWritten even when the call fails,
    // so give it a defined value up front.
    if (pcbWritten != NULL)
        *pcbWritten = 0;

    if (cb == 0)
        return S_OK;
    if (pv == NULL)
        return STG_E_INVALIDPOINTER;

    const char *first = static_cast<const char*>(pv);
    try
    {
        buf.insert(buf.end(), first, first + cb);
    }
    catch (...)
    {
        // A C++ exception must not propagate out of a COM method; report
        // the failed growth as an HRESULT instead.
        return E_OUTOFMEMORY;
    }

    if (pcbWritten != NULL)
        *pcbWritten = cb;
    return S_OK;
}
Amnon
Thank you Amnon, I'll try it out and then update this question with results.
Shane
Regarding the question below: the IPersistStream probably saves the source in its original encoding. So it can be single byte, two bytes or a variable number of bytes per character (utf-8) depending on the document's encoding.
Amnon
Ah, yes. Getting the charset from the IHTMLDocument2 pointer, I see that 'unicode' generally denotes UTF-16, so I hope IE sticks to this, and I hope that assuming everything else is UTF-8 compatible is not wrong and does not break stuff.
Shane
A: 

Thanks Amnon, the following code is mostly working for me.

// MyStream: a write-only IStream. Write() (defined out of line) is the only
// operation that does real work; written data is collected in `buf`.
//
// Motivation given by the author: the stream returned by
// CreateStreamOnHGlobal copes badly with repeated resizes (N writes seem to
// take O(N^2) time), so a std::vector-backed sink is used instead.
//
// The remaining IStream methods are stubs. Those that would have to produce
// data through an out-parameter report failure rather than returning S_OK
// with the out-parameter left uninitialised.
class MyStream :
    public CComObjectRootEx<CComSingleThreadModel>,
    public CComCoClass<MyStream>,
    public IStream
{
public: 

    // Bytes received so far, in the order they were written.
    std::vector<char> buf;

BEGIN_COM_MAP(MyStream)
    COM_INTERFACE_ENTRY(IStream)
END_COM_MAP()

    STDMETHOD(Write) (const void * pv, ULONG cb, ULONG *pcbWritten);

    // --- Unsupported: the stream cannot be read, repositioned or cloned ---

    STDMETHOD(Read) (void *pv, ULONG cb, ULONG *pcbRead)
    {
        if (pcbRead != NULL)
            *pcbRead = 0;   // nothing was read
        return E_NOTIMPL;
    }

    STDMETHOD(Seek) (LARGE_INTEGER dlibMove,DWORD dwOrigin,ULARGE_INTEGER *plibNewPosition)
    {
        return E_NOTIMPL;
    }

    STDMETHOD(CopyTo) (IStream *pstm,ULARGE_INTEGER cb,ULARGE_INTEGER *pcbRead,ULARGE_INTEGER *pcbWritten)
    {
        if (pcbRead != NULL)
            pcbRead->QuadPart = 0;
        if (pcbWritten != NULL)
            pcbWritten->QuadPart = 0;
        return E_NOTIMPL;
    }

    STDMETHOD(Stat) (__RPC__out STATSTG *pstatstg,DWORD grfStatFlag)
    {
        return E_NOTIMPL;
    }

    STDMETHOD(Clone) (__RPC__deref_out_opt IStream **ppstm)
    {
        if (ppstm != NULL)
            *ppstm = NULL;  // never hand back a garbage interface pointer
        return E_NOTIMPL;
    }

    // Region locking is not supported by this stream.
    STDMETHOD(LockRegion) (ULARGE_INTEGER libOffset,ULARGE_INTEGER cb,DWORD dwLockType)
    {
        return STG_E_INVALIDFUNCTION;
    }

    STDMETHOD(UnlockRegion) (ULARGE_INTEGER libOffset,ULARGE_INTEGER cb,DWORD dwLockType)
    {
        return STG_E_INVALIDFUNCTION;
    }

    // --- Harmless no-ops: there is no out-parameter to fill in and nothing
    //     is buffered outside `buf`, so these keep reporting success ---

    STDMETHOD(SetSize) (ULARGE_INTEGER libNewSize) { return S_OK; }
    STDMETHOD(Commit) (DWORD grfCommitFlags) { return S_OK; }
    STDMETHOD(Revert) () { return S_OK; }
};

// Appends cb bytes starting at pv to the end of buf.
//
// pv         - source bytes; must be non-null when cb is non-zero.
// cb         - number of bytes to append.
// pcbWritten - optional; when non-null it receives the number of bytes
//              actually appended (cb on success, 0 on failure).
//
// Returns S_OK on success, STG_E_INVALIDPOINTER when pv is null with a
// non-zero cb, and E_OUTOFMEMORY when the buffer could not be grown.
STDMETHODIMP MyStream::Write(const void * pv, ULONG cb, ULONG *pcbWritten) 
{
    // Give the out-parameter a defined value before any early return.
    if (pcbWritten != NULL)
        *pcbWritten = 0;

    if (cb == 0)
        return S_OK;
    if (pv == NULL)
        return STG_E_INVALIDPOINTER;

    const char *src = static_cast<const char*>(pv);
    try
    {
        buf.insert(buf.end(), src, src + cb);
    }
    catch (...)
    {
        // Keep C++ exceptions from crossing the COM method boundary.
        return E_OUTOFMEMORY;
    }

    if (pcbWritten != NULL)
        *pcbWritten = cb;
    return S_OK;
}

// Retrieves the HTML of the current page.
//
// Asks the document held by m_spWebBrowser to persist itself into a MyStream
// instance, then converts the captured bytes to a BSTR.
//
// lMaxSize  - not consulted by this implementation.
// pbstrHTML - receives a newly allocated BSTR on success; set to NULL on
//             failure. The caller frees it.
//
// Returns S_OK on success, E_POINTER when pbstrHTML is null, the failing
// HRESULT of whichever COM call failed, E_OUTOFMEMORY when the result cannot
// be allocated, or E_FAIL when no document is available or an exception was
// caught.
STDMETHODIMP CPlugin::get_HTML(long lMaxSize, BSTR *pbstrHTML)
{
    if (pbstrHTML == NULL)
        return E_POINTER;
    *pbstrHTML = NULL;

    HRESULT hr = S_OK;
    try
    {
        CComPtr<IStream> mystream;
        hr = MyStream::CreateInstance(&mystream);
        if (FAILED(hr))
            return hr;
        // streamObj will be valid as long as the IStream smart pointer lives
        MyStream *streamObj = static_cast<MyStream*>(mystream.p);

        CComPtr<IDispatch> pDispatch;
        hr = m_spWebBrowser->get_Document(&pDispatch);
        if (FAILED(hr))
            return hr;
        if (!pDispatch)
            return E_FAIL;  // call succeeded but produced no document object

        MSHTML::IHTMLDocumentPtr pDocument = NULL;
        hr = pDispatch->QueryInterface(IID_IHTMLDocument, reinterpret_cast<void**>(&pDocument));
        if (FAILED(hr))
            return hr;

        IPersistStreamInitPtr persistStream = pDocument;
        if (persistStream.GetInterfacePtr() == NULL)
            return E_NOINTERFACE;

        hr = persistStream->Save(mystream, FALSE);
        if (FAILED(hr))
            return hr;

        const std::vector<char> &bytes = streamObj->buf;
        if (bytes.empty())
        {
            *pbstrHTML = ::SysAllocString(L"");
        }
        else
        {
            const char *data = &bytes[0];
            const size_t size = bytes.size();
            const size_t kMaxInt = 0x7FFFFFFF;

            // NOTE(review): the encoding of the saved bytes is not established
            // by this code. The decoding below assumes "UTF-16LE when the data
            // starts with an FF FE byte-order mark, otherwise UTF-8" - confirm
            // against the document's charset before relying on it.
            const bool hasUtf16Bom = size >= 2 &&
                static_cast<unsigned char>(data[0]) == 0xFF &&
                static_cast<unsigned char>(data[1]) == 0xFE;

            if (hasUtf16Bom)
            {
                const size_t cch = (size - 2) / sizeof(OLECHAR);
                if (cch > kMaxInt)
                    return E_OUTOFMEMORY;
                *pbstrHTML = ::SysAllocStringLen(
                    reinterpret_cast<const OLECHAR*>(data + 2),
                    static_cast<UINT>(cch));
            }
            else
            {
                if (size > kMaxInt)
                    return E_OUTOFMEMORY;
                const int cb = static_cast<int>(size);

                // First call sizes the result, second call fills it.
                const int cch = ::MultiByteToWideChar(CP_UTF8, 0, data, cb, NULL, 0);
                if (cch <= 0)
                    return HRESULT_FROM_WIN32(::GetLastError());

                BSTR converted = ::SysAllocStringLen(NULL, static_cast<UINT>(cch));
                if (converted == NULL)
                    return E_OUTOFMEMORY;
                ::MultiByteToWideChar(CP_UTF8, 0, data, cb, converted, cch);
                *pbstrHTML = converted;
            }
        }

        hr = (*pbstrHTML != NULL) ? S_OK : E_OUTOFMEMORY;
    }
    catch(...)
    {
        TRACE_FN("Got exception somewhere");
        // Never report success after an exception.
        hr = E_FAIL;
    }
    return hr;
}

Now the only problem left is figuring out why it returns single-byte chars most of the time and double-byte chars at other times. Any ideas?

Thanks for the help.

Shane