HTML页面解析成DOM树:将网页解析成DOM树的几种方法
当需要将一个网页解析成比较直观的DOM树形式时,有以下几种方法。
(1)使用MSHTML中提供的IWebBrowser2接口,输入为网页的URL:
IHTMLDocument2 *pDoc = NULL;
CoInitialize(NULL);
//取得⽹页内容
IWebBrowser2* pWebBrowser = NULL;
HRESULT hr = CoCreateInstance(CLSID_InternetExplorer,NULL,CLSCTX_LOCAL_SERVER,IID_IWebBrowser2,
(void**)&pWebBrowser );
if( FAILED(hr) )
{
MessageBox(NULL,_T("WebBrowser2接⼝失败"),_T("Error"),NULL);
return -1 ;
}
string WebUrl("beiningsa.blog.sohu/130372778.html");
CComVariant varUrl( WebUrl.c_str() );
CComVariant var;
hr = pWebBrowser->Navigate2( &varUrl,&var,&var,&var,&var );
if( FAILED(hr) ) return -1;
READYSTATE readystate;
hr = pWebBrowser->get_ReadyState(&readystate);
if( FAILED( hr ) ) return -1;
DWORD t1 = GetTickCount();
while( READYSTATE_COMPLETE != readystate )
{
Sleep(50);
pWebBrowser->get_ReadyState(&readystate);
}
DWORD t2 = GetTickCount()-t1;
std::cout<
CComPtrpDisp;
pWebBrowser->get_Document( &pDisp );
CComQIPtr< IHTMLDocument2 >pDoc2( pDisp );
CComQIPtr< IHTMLElement >pBody;
pDoc2->get_body( &pBody );html document是什么
(2)借助MSHTML中IHTMLDocument2接口的write方法,输入为网页源码字符串:
Coinitialize(NULL);
IWebBrowser2* pWeb2 = NULL;
HRESULT hr = CoCreateInstance( CLSID_InternetExplorer,NULL,CLSCTX_LOCAL_SERVER,IID_IWebBrowser2,
(void**)&pWeb2 );
CComQIPtr< IHTMLDocument2 >pDoc2;
if( pWeb2 )
{
CComVariant var( "about:blank" );
CComVariant vEmpty;
HRESULT hr = pWeb2->Navigate2( &var,&vEmpty,&vEmpty,&vEmpty,&vEmpty );
CComPtr< IDispatch >pDisp;
pWeb2->get_Document( &pDisp );
pDoc2 = pDisp;
pDisp.Detach()->Release();
}
if(pDoc2 )
{
WriteDebugLog( 2,"InitDomTree","开始执⾏");
SAFEARRAY* psa = SafeArrayCreateVector( VT_VARIANT, 0, 1 );
VARIANT *param = NULL;
CComBSTR bsData = CComBSTR( m_strCode.c_str() );
HRESULT hr = SafeArrayAccessData( psa, (LPVOID*)¶m );
param->vt = VT_BSTR;
param->bstrVal = bsData.Copy();
pDoc2->write( psa );//取到document2接⼝指针,解析源码;
pDoc2->close();
SafeArrayUnaccessData( psa );
SafeArrayDestroyData( psa );
CComQIPtr< IHTMLElement >pBody;
pDoc2->get_body( &pBody );
}
(3)使用IMarkupServices接口中的ParseString方法。据jiangsheng说,该方法有内存泄漏,是这个接口的一个bug,官方已发布报告,解决方法是改用第二种方法。
// Method 3: create an in-process HTMLDocument object and parse the source
// string with IMarkupServices::ParseString.
// Inputs declared outside this fragment: szHTML (the page source, emptied
// after parsing) and hFile (an output stream that receives the readyState).
if( SUCCEEDED( CoInitialize( NULL ) ) )
{
    {
        // Inner scope: every COM smart pointer below is released when this
        // scope ends, i.e. before CoUninitialize is called.
        CComPtr< IHTMLDocument2 > pDoc;
        HRESULT hr = CoCreateInstance( CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,
                                       IID_IHTMLDocument2, reinterpret_cast<void**>( &pDoc ) );
        if( SUCCEEDED(hr) && pDoc )
        {
            // Initialize the new document through IPersistStreamInit.
            CComQIPtr< IPersistStreamInit > pPersist( pDoc );
            if( pPersist )
            {
                pPersist->InitNew();
                pPersist.Release();

                CComQIPtr< IMarkupServices > pMS( pDoc );
                if( pMS )
                {
                    CComPtr< IMarkupContainer > pMC;
                    CComPtr< IMarkupPointer > pMkStart;
                    CComPtr< IMarkupPointer > pMkFinish;
                    pMS->CreateMarkupPointer( &pMkStart );
                    pMS->CreateMarkupPointer( &pMkFinish );

                    // Parse the source; pMC receives the resulting markup container.
                    HRESULT hrParse = pMS->ParseString( szHTML, 0, &pMC, pMkStart, pMkFinish );
                    szHTML.Empty();

                    // Write the document's readyState string to hFile.
                    CComBSTR bvret;
                    pDoc->get_readyState( &bvret );
                    if( bvret )
                        hFile<< (char*)(_bstr_t)bvret;

                    if( SUCCEEDED(hrParse) && pMC )
                    {
                        // The container can be queried for IHTMLDocument2 to
                        // reach the parsed DOM.
                        CComQIPtr< IHTMLDocument2 > pNewDoc( pMC );
                        if( pNewDoc )
                        {
                            // Work with the parsed document here; the
                            // reference is released at scope exit.
                        }
                    }
                }
            }
        }
    }
    ::CoUninitialize();
}
(4)借助tidy库解析。
不多说了。
第一种方法:WebBrowser控件会将所有的页面元素全部下载下来,比较慢;若没有下载完,取到的相关UI属性就不是网页最终展现的样式。
这种方法解析网页很慢,但是对网页的分析非常详尽。
第二种方法比第一种方法快,而且也可以取到UI属性;不同的是,源码下载这一块可以自己单独控制。但是,对于含有iframe框架的网页,推荐用第一种方法来解析。
第三种方法解析后的DOM树没有UI属性,但是解析速度非常快;如果只是需要操作单独的元素,也就够用了。
但是,它有内存泄漏,在要求不很严格的情况下可以使用。
第四种方法的性能与第三种相同,详细信息见tidy库的网站。
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论