读书人

怎么从html代码中获取纯文本?在线多

发布时间: 2012-12-21 12:03:49 作者: rapoo

如何从html代码中获取纯文本?在线,谢谢
我想问一下:保存到数据库字段中的内容含有类似 <p></p> 的 HTML 标签,如何去掉这些标签,只获取纯文本?
qq:543644213
e-mail:xufy576@163.com
[最优解释]
已发邮箱:需要BOOST正则表达式,邮件里有

#include <process.h>
#include <iostream>
using namespace std;
#include <vector>
#include <boost/regex.hpp>
using namespace boost;
#include <afxinet.h>
#include <fstream>

// Character-set abstraction: pick the wide or narrow string type and console
// streams depending on whether _UNICODE is defined for the build.
// These are plain text substitutions (macros), not typedefs.
#ifdef _UNICODE
#define String std::wstring
#define COUT wcout
#define CIN wcin

#else
#define String std::string
#define COUT cout
#define CIN cin
#endif

// Size, in bytes, of the stack buffer GetHtml uses for each read of the
// HTTP response.
#define SIZE_HTTPCACHE 1024

// One software entry as it is progressively filled in by the analysis stages.
struct tag_SoftTopInfo
{
    // Software name (captured from the listing row, later replaced by the
    // <h4> heading text of the detail page).
    String szSoftName;
    // Bracketed link text captured from the listing row.
    String szGallery;
    // Content of the page's "description" <meta> tag.
    String szSynopsis;
    // Link to follow for the next stage; overwritten by each stage.
    String szUrl;
};

typedef tag_SoftTopInfo SoftTopInfo;
typedef tag_SoftTopInfo* lpSoftTopInfo;

typedef struct tag_WebPageData
{
int nSoftIndex;
WCHAR* pData;

tag_WebPageData()
{
pData = NULL;
}
}WebPageData, *lpWebPageData;

// NOTE(review): everything below is shared between the fetch and analysis
// threads; no lock is visible in this file, so concurrent push_back/read
// access is unsynchronized.

// Listing-page URLs to download (iterated by GetBaseHttpThread).
std::vector<String> g_vBaseUrl;

// Entries produced by analysis stage 1, 2 and 3 respectively.
std::vector<SoftTopInfo> g_vBaseSoftInfo;
std::vector<SoftTopInfo> g_vSecondSoftInfo;
std::vector<SoftTopInfo> g_vFinalSoftInfo;

// Downloaded page texts (new[]-allocated wide strings) queued for analysis
// stage 1, 2 and 3 respectively.
std::vector<WCHAR*> g_vBaseWebPageDataSet;
std::vector<WebPageData> g_vSecondWebPageDataSet;
std::vector<WebPageData> g_vFinalWebPageDataSet;

// Site root prepended to the relative links extracted from the pages.
String g_strUrlHead = _T("http://www.onlinedown.net");
// g_hAnsEvent[i] is set when analysis thread i+1 exits; g_hGetHttpEvent[i]
// is set when fetch thread i+1 exits. Consumers poll these to learn that
// their producer has finished.
HANDLE g_hAnsEvent[3], g_hGetHttpEvent[3];

// Items whose download or UTF-8 conversion failed, per fetch stage.
std::vector<String> g_vBaseGetHttpFailedItem;
std::vector<SoftTopInfo> g_vSecondGetHttpFailedItem;
std::vector<SoftTopInfo> g_vFinalGetHttpFailedItem;

// Page indices for which the stage-2 / stage-3 regex did not match, and page
// indices that contain no "官方下载1" text at all.
std::vector<int> g_vSecondAnsFailedItem;
std::vector<int> g_vFinalAnsFailedItem;
std::vector<int> g_vNoOfficalDownItem;

//////////////////////////////////////////////////////////////////////////

// Downloads the resource at strUrl and appends the raw response bytes to
// vBaseInfo.
//
// Parameters:
//   strUrl    - URL to fetch; its cache entry is deleted first so a fresh
//               copy is requested.
//   vBaseInfo - receives the response bytes, appended after any existing
//               content. The data is NOT NUL-terminated.
//
// Returns TRUE when the URL was opened and read to the end, FALSE when
// OpenURL returned NULL or a CInternetException was thrown. On FALSE,
// vBaseInfo may hold a partial response.
BOOL GetHtml(String strUrl,std::vector<char> &vBaseInfo)
{
    COUT<<endl;
    COUT<<_T("提取网页数据:")<<strUrl.c_str()<<endl;

    // Remove any cached copy of the page.
    DeleteUrlCacheEntry(strUrl.c_str());

    BOOL bReturn = FALSE;

    CInternetSession session(_T("SoftTop"));
    CInternetFile* httpFile = NULL;
    try
    {
        httpFile = (CInternetFile*) session.OpenURL(strUrl.c_str());
        if (httpFile)
        {
            char byContent[SIZE_HTTPCACHE];

            while (TRUE)
            {
                const int nReadSize = static_cast<int>(httpFile->Read(byContent,SIZE_HTTPCACHE));
                if (nReadSize <= 0)
                    break;

                // Append exactly the bytes read. The old code tracked its own
                // write offset starting at 0, which corrupted the buffer when
                // the vector was not empty on entry.
                vBaseInfo.insert(vBaseInfo.end(), byContent, byContent + nReadSize);
            }
            httpFile->Close();
            delete httpFile;
            httpFile = NULL;
            bReturn = TRUE;
        }
    }
    catch (CInternetException* m_pException)
    {
        // If Read() threw, the file object is still alive; release it
        // instead of dropping the pointer (the destructor closes the
        // handle). delete on NULL is a no-op when OpenURL itself threw.
        delete httpFile;
        httpFile = NULL;
        m_pException->Delete();
    }

    return bReturn;
}

// Converts a NUL-terminated UTF-8 string to a newly allocated wide string.
//
// Parameters:
//   pszUTF8 - NUL-terminated UTF-8 input; may be NULL.
//
// Returns a buffer allocated with new[] (caller releases it with delete[]),
// or NULL when the input is NULL or the conversion fails.
WCHAR* UTF8ToUnicode(char* pszUTF8)
{
    COUT<<_T("转换UTF8为UNICODE...")<<endl;

    if (pszUTF8 == NULL)
        return NULL;

    // First call only measures: the result counts wide characters including
    // the terminating NUL, and is 0 on failure.
    const int nUnicodeLen = MultiByteToWideChar(CP_UTF8,0,pszUTF8,-1,NULL,0);
    if (nUnicodeLen <= 0)
        return NULL;    // previously returned an unterminated zero-length buffer

    WCHAR* pwText = new WCHAR[nUnicodeLen];

    if (MultiByteToWideChar(CP_UTF8,0,pszUTF8,-1,pwText,nUnicodeLen) == 0)
    {
        // Conversion failed after allocation: do not hand out garbage.
        delete [] pwText;
        return NULL;
    }
    return pwText;
}

// Analysis stage 1 (thread entry point for _beginthread).
//
// Consumes the listing pages queued in g_vBaseWebPageDataSet, extracts every
// (relative url, software name, bracketed link text) triple with a regex and
// appends one SoftTopInfo per match to g_vBaseSoftInfo. Exits once
// g_hGetHttpEvent[0] is signalled and all queued pages have been processed,
// then signals g_hAnsEvent[0].
//
// pParam is unused.
//
// NOTE(review): the shared vectors are accessed without any lock while other
// threads push_back into them; this is an unsynchronized data race that needs
// a file-wide lock to fix properly.
void BaseInfoAnsThread( void *pParam)
{
    int nIndex = 0;

    while(TRUE)
    {
        int nCount = static_cast<int>(g_vBaseWebPageDataSet.size());
        if (nCount <= nIndex )
        {
            // Nothing to do right now: either wait for more pages or stop.
            if (WaitForSingleObject(g_hGetHttpEvent[0],0) == WAIT_OBJECT_0)
            {
                // Producer finished; re-read the final size. Only stop when
                // page nIndex really does not exist. The old test
                // (nIndex >= nCount - 1) dropped a page that arrived between
                // the size read above and the event check.
                nCount = static_cast<int>(g_vBaseWebPageDataSet.size());
                if (nIndex >= nCount)
                {
                    break;
                }
                continue;
            }
            Sleep(100);
            continue;
        }

        const WCHAR* pContent = g_vBaseWebPageDataSet[nIndex];

        boost::wregex reg(_T("<tr>.*?<a\\shref=\"(\\.\\./.*?\\.htm)\".*?>(.*?)</a>.*?<td>\\[<a.*?\">(.*?)</a>\\]</td>"));
        boost::wcmatch match;

        const WCHAR* pTemp = pContent;

        SoftTopInfo stInfo;

        // Walk the page, restarting the search right after each match.
        while(boost::regex_search(pTemp,match,reg))
        {
            COUT<<endl;
            COUT<<_T("步骤 1: ")<<match[1]<<_T("--")<<match[2]<<_T("--")<<match[3]<<endl;

            stInfo.szUrl = match[1];
            stInfo.szSoftName = match[2];
            stInfo.szGallery = match[3];

            g_vBaseSoftInfo.push_back(stInfo);

            pTemp = match[0].second;
        }

        // The page just handled was the last one and the producer is done.
        if (WaitForSingleObject(g_hGetHttpEvent[0],0) == WAIT_OBJECT_0)
        {
            nCount = static_cast<int>(g_vBaseWebPageDataSet.size());
            if (nIndex >= nCount - 1)
            {
                break;
            }
        }

        nIndex++;
    }

    SetEvent(g_hAnsEvent[0]);
    COUT<<_T("分析线程1退出")<<endl;
    _endthread();
}

// Analysis stage 2 (thread entry point for _beginthread).
//
// Consumes the detail pages queued in g_vSecondWebPageDataSet. For each page
// it copies the originating stage-1 entry, fills szSynopsis from the
// "description" <meta> tag when present, and, if the "[下载地址]" heading link
// is found, stores name and link and appends the entry to g_vSecondSoftInfo;
// otherwise the page index is recorded in g_vSecondAnsFailedItem. Exits once
// g_hGetHttpEvent[1] is signalled and all queued pages have been processed,
// then signals g_hAnsEvent[1].
//
// pParam is unused.
//
// NOTE(review): the shared vectors are accessed without any lock while other
// threads push_back into them; this is an unsynchronized data race that needs
// a file-wide lock to fix properly.
void SecondAnsThread( void *pParam)
{
    int nIndex = 0;

    while(TRUE)
    {
        int nCount = static_cast<int>(g_vSecondWebPageDataSet.size());
        if (nCount <= nIndex)
        {
            if (WaitForSingleObject(g_hGetHttpEvent[1],0) == WAIT_OBJECT_0)
            {
                // Producer finished; stop only when page nIndex does not
                // exist. The old test (nIndex >= nCount - 1) dropped a page
                // that arrived between the size read above and the event
                // check.
                nCount = static_cast<int>(g_vSecondWebPageDataSet.size());
                if (nIndex >= nCount)
                {
                    break;
                }
                continue;
            }
            Sleep(100);
            continue;
        }

        const WCHAR* pContent = g_vSecondWebPageDataSet[nIndex].pData;

        boost::wregex reg(_T("<meta name.*?description.*?content=\"(.*?)\".*?/>"));
        boost::wcmatch match;
        SoftTopInfo stInfo = g_vBaseSoftInfo[g_vSecondWebPageDataSet[nIndex].nSoftIndex];
        if(boost::regex_search(pContent,match,reg))
        {
            COUT<<endl;
            COUT<<_T("步骤 2-1: ")<<match[1]<<endl;

            stInfo.szSynopsis = match[1];
        }

        boost::wregex reg2(_T("<h4>(.*?)<a href=\"(.*?)\".*?\\[下载地址\\]</a></h4>"));
        boost::wcmatch match2;
        if(boost::regex_search(pContent,match2,reg2))
        {
            COUT<<endl;
            COUT<<_T("步骤 2-2: ")<<match2[1]<<_T("--")<<match2[2]<<endl;

            stInfo.szSoftName = match2[1];
            stInfo.szUrl = match2[2];

            g_vSecondSoftInfo.push_back(stInfo);
        }
        else
        {
            g_vSecondAnsFailedItem.push_back(nIndex);
        }

        // The page just handled was the last one and the producer is done.
        if (WaitForSingleObject(g_hGetHttpEvent[1],0) == WAIT_OBJECT_0)
        {
            nCount = static_cast<int>(g_vSecondWebPageDataSet.size());
            if (nIndex >= nCount - 1)
            {
                break;
            }
        }

        nIndex++;
    }

    SetEvent(g_hAnsEvent[1]);
    COUT<<_T("分析线程2退出")<<endl;
    _endthread();
}

void FinalAnsThread( void *pParam)
{
int nIndex = 0;

while(TRUE)
{
int nCount = g_vFinalWebPageDataSet.size();
if (nCount <= nIndex)
{
if (WaitForSingleObject(g_hGetHttpEvent[2],0) == WAIT_OBJECT_0)
{
nCount = g_vFinalWebPageDataSet.size();
if (nIndex >= nCount - 1)
{
break;
}
}
Sleep(100);
continue;
}

const WCHAR* pContent = g_vFinalWebPageDataSet[nIndex].pData;

boost::wregex reg(_T("官方下载1"));
boost::wcmatch match;

SoftTopInfo stInfo = g_vSecondSoftInfo[g_vFinalWebPageDataSet[nIndex].nSoftIndex];

if(boost::regex_search(pContent,match,reg))
{
boost::wregex reg2(_T("<a\\shref=\"(.*?)\"\\sclass=\"other\">官方下载1</a>"));

if (boost::regex_search(match[0].second-200,match,reg2))
{
COUT<<endl;
COUT<<_T("步骤 3: ")<<match[1]<<endl;

stInfo.szUrl = match[1];

g_vFinalSoftInfo.push_back(stInfo);
}
else
{
g_vFinalAnsFailedItem.push_back(nIndex);
}
}
else
{
g_vNoOfficalDownItem.push_back(nIndex);
}


if (WaitForSingleObject(g_hGetHttpEvent[2],0) == WAIT_OBJECT_0)
{
nCount = g_vFinalWebPageDataSet.size();
if (nIndex >= nCount - 1)
{
break;
}
}

nIndex++;
}


SetEvent(g_hAnsEvent[2]);
COUT<<_T("分析线程3退出")<<endl;
_endthread();
}

void GetBaseHttpThread(void* pParam)
{
for (vector<String>::iterator it = g_vBaseUrl.begin(); it != g_vBaseUrl.end(); it++)
{
std::vector<char> vBaseInfo;
BOOL bSuccess = GetHtml(*it,vBaseInfo);
if (!bSuccess)
{
COUT<<endl;
COUT<<_T("获取网页基础数据出错。")<<endl;
g_vBaseGetHttpFailedItem.push_back(*it);
continue;
}

WCHAR* pwBaseInfo = NULL;
pwBaseInfo = UTF8ToUnicode(&vBaseInfo.front());
if (pwBaseInfo == NULL)
{
COUT<<_T("转换出错。")<<endl;
g_vBaseGetHttpFailedItem.push_back(*it);
continue;
}
vBaseInfo.clear();

g_vBaseWebPageDataSet.push_back(pwBaseInfo);
}

SetEvent(g_hGetHttpEvent[0]);
COUT<<_T("抓取线程1退出")<<endl;
_endthread();
}


[其他解释]
该回复于2012-03-13 08:36:12被版主删除
[其他解释]
当做xml来处理也可以,试试CMarkup
[其他解释]
接楼上:
// Fetch stage 2 (thread entry point for _beginthread).
//
// For each entry that analysis stage 1 appends to g_vBaseSoftInfo, builds the
// absolute detail-page URL, downloads it, converts it to a wide string and
// queues it (with the entry's index) in g_vSecondWebPageDataSet. Entries
// whose download or conversion fails are copied to
// g_vSecondGetHttpFailedItem. Exits once g_hAnsEvent[0] is signalled and all
// entries have been handled, then signals g_hGetHttpEvent[1].
//
// pParam is unused.
//
// NOTE(review): the shared vectors are accessed without any lock while other
// threads push_back into them; this is an unsynchronized data race that needs
// a file-wide lock to fix properly.
void GetSecondHttpThread(void* pParam)
{
    int nIndex = 0;
    while(TRUE)
    {
        int nCount = static_cast<int>(g_vBaseSoftInfo.size());
        if (nCount <= nIndex)
        {
            if (WaitForSingleObject(g_hAnsEvent[0],0) == WAIT_OBJECT_0)
            {
                // Producer finished; stop only when entry nIndex does not
                // exist. The old test (nIndex >= nCount - 1) dropped an entry
                // that arrived between the size read above and the event
                // check.
                nCount = static_cast<int>(g_vBaseSoftInfo.size());
                if (nIndex >= nCount)
                {
                    break;
                }
                continue;
            }
            Sleep(100);
            continue;
        }

        // Stage-1 links look like "../xxx.htm": drop the leading ".." and
        // prepend the site root.
        String strSecondUrl = g_vBaseSoftInfo[nIndex].szUrl;
        strSecondUrl = g_strUrlHead+strSecondUrl.erase(0,2);

        std::vector<char> vInfo;
        BOOL bSuccess = GetHtml(strSecondUrl,vInfo);
        if (!bSuccess)
        {
            g_vSecondGetHttpFailedItem.push_back(g_vBaseSoftInfo[nIndex]);
        }
        else
        {
            // UTF8ToUnicode expects a NUL-terminated string, but the
            // downloaded bytes carry no terminator (and may be empty, which
            // made front() undefined). Terminate the buffer first.
            if (vInfo.empty() || vInfo.back() != '\0')
            {
                vInfo.push_back('\0');
            }

            WCHAR* pwInfo = UTF8ToUnicode(&vInfo.front());
            if (pwInfo == NULL)
            {
                g_vSecondGetHttpFailedItem.push_back(g_vBaseSoftInfo[nIndex]);
            }
            else
            {
                WebPageData stData;
                stData.nSoftIndex = nIndex;
                stData.pData = pwInfo;
                g_vSecondWebPageDataSet.push_back(stData);
            }
        }

        // The entry just handled was the last one and the producer is done.
        if (WaitForSingleObject(g_hAnsEvent[0],0) == WAIT_OBJECT_0)
        {
            nCount = static_cast<int>(g_vBaseSoftInfo.size());
            if (nIndex >= nCount -1)
            {
                break;
            }
        }

        nIndex++;
    }

    SetEvent(g_hGetHttpEvent[1]);
    COUT<<_T("抓取线程2退出")<<endl;
    _endthread();
}

// Fetch stage 3 (thread entry point for _beginthread).
//
// For each entry that analysis stage 2 appends to g_vSecondSoftInfo, builds
// the absolute download-page URL, downloads it, converts it to a wide string
// and queues it (with the entry's index) in g_vFinalWebPageDataSet. Entries
// whose download or conversion fails are copied to
// g_vFinalGetHttpFailedItem. Exits once g_hAnsEvent[1] is signalled and all
// entries have been handled, then signals g_hGetHttpEvent[2].
//
// pParam is unused.
//
// NOTE(review): the shared vectors are accessed without any lock while other
// threads push_back into them; this is an unsynchronized data race that needs
// a file-wide lock to fix properly.
void GetFinalHttpThread(void* pParam)
{
    int nIndex = 0;
    while(TRUE)
    {
        int nCount = static_cast<int>(g_vSecondSoftInfo.size());
        if (nCount <= nIndex)
        {
            if (WaitForSingleObject(g_hAnsEvent[1],0) == WAIT_OBJECT_0)
            {
                // Producer finished; stop only when entry nIndex does not
                // exist. The old test (nIndex >= nCount - 1) dropped an entry
                // that arrived between the size read above and the event
                // check.
                nCount = static_cast<int>(g_vSecondSoftInfo.size());
                if (nIndex >= nCount)
                {
                    break;
                }
                continue;
            }

            Sleep(100);
            continue;
        }

        // Stage-2 links are site-relative; prepend the site root.
        String strFinalUrl = g_vSecondSoftInfo[nIndex].szUrl;
        strFinalUrl = g_strUrlHead+strFinalUrl;

        std::vector<char> vInfo;
        BOOL bSuccess = GetHtml(strFinalUrl,vInfo);
        if (!bSuccess)
        {
            g_vFinalGetHttpFailedItem.push_back(g_vSecondSoftInfo[nIndex]);
        }
        else
        {
            // UTF8ToUnicode expects a NUL-terminated string, but the
            // downloaded bytes carry no terminator (and may be empty, which
            // made front() undefined). Terminate the buffer first.
            if (vInfo.empty() || vInfo.back() != '\0')
            {
                vInfo.push_back('\0');
            }

            WCHAR* pwInfo = UTF8ToUnicode(&vInfo.front());
            if (pwInfo == NULL)
            {
                g_vFinalGetHttpFailedItem.push_back(g_vSecondSoftInfo[nIndex]);
            }
            else
            {
                WebPageData stData;
                stData.nSoftIndex = nIndex;
                stData.pData = pwInfo;
                g_vFinalWebPageDataSet.push_back(stData);
            }
        }

        // The entry just handled was the last one and the producer is done.
        if (WaitForSingleObject(g_hAnsEvent[1],0) == WAIT_OBJECT_0)
        {
            nCount = static_cast<int>(g_vSecondSoftInfo.size());
            if (nIndex >= nCount -1)
            {
                break;
            }
        }
        nIndex++;
    }
    SetEvent(g_hGetHttpEvent[2]);
    COUT<<_T("抓取线程3退出")<<endl;
    _endthread();
}

int main()
{
#ifdef _UNICODE
wcout.imbue(locale("chs"));
wcin.imbue(locale("chs"));


#endif

String strUrl = _T("http://www.onlinedown.net/hits/month_1.htm");

g_vBaseUrl.push_back(strUrl);

g_hAnsEvent[0] = CreateEvent(NULL,TRUE,FALSE,NULL);
g_hAnsEvent[1] = CreateEvent(NULL,TRUE,FALSE,NULL);
g_hAnsEvent[2] = CreateEvent(NULL,TRUE,FALSE,NULL);

g_hGetHttpEvent[0] = CreateEvent(NULL,TRUE,FALSE,NULL);
g_hGetHttpEvent[1] = CreateEvent(NULL,TRUE,FALSE,NULL);
g_hGetHttpEvent[2] = CreateEvent(NULL,TRUE,FALSE,NULL);

COUT<<_T("开始基础数据分析线程")<<endl;
_beginthread(BaseInfoAnsThread, 0,NULL);

COUT<<_T("开始第二层分析线程")<<endl;
_beginthread(SecondAnsThread, 0,NULL);

COUT<<_T("开始第三层分析线程")<<endl;
_beginthread(FinalAnsThread, 0,NULL);

COUT<<_T("开始基础网页地址集抓取线程")<<endl;
_beginthread(GetBaseHttpThread, 0,NULL);

COUT<<_T("开始第二层网页地址集抓取线程")<<endl;
_beginthread(GetSecondHttpThread, 0,NULL);

COUT<<_T("开始第三层网页地址集抓取线程")<<endl;
_beginthread(GetFinalHttpThread, 0,NULL);

WaitForMultipleObjects(3,g_hAnsEvent,TRUE,INFINITE);
WaitForMultipleObjects(3,g_hGetHttpEvent,TRUE,INFINITE);

COUT<<_T("操作完成.")<<endl<<endl<<endl<<endl<<endl;

COUT<<_T("得到有效数据共:")<<g_vFinalSoftInfo.size()<<endl;
COUT<<endl;

COUT<<_T("没有官方下载项:")<<g_vNoOfficalDownItem.size()<<endl;
COUT<<endl;

COUT<<_T("网页数据提取错误:")<<endl;
COUT<<_T("1th网页:")<<g_vBaseGetHttpFailedItem.size()<<endl;
COUT<<_T("2th网页:")<<g_vSecondGetHttpFailedItem.size()<<endl;
COUT<<_T("3th网页:")<<g_vFinalGetHttpFailedItem.size()<<endl;
COUT<<_T("网页数据分析错误:")<<endl;
COUT<<_T("1th分析:")<<g_vBaseSoftInfo.size()<<endl;
COUT<<_T("2th分析")<<g_vSecondAnsFailedItem.size()<<endl;
COUT<<_T("3th分析")<<g_vFinalAnsFailedItem.size()<<endl;

for (vector<WCHAR*>::iterator it = g_vBaseWebPageDataSet.begin(); it != g_vBaseWebPageDataSet.end(); it++)
{
WCHAR* p = *it;
if (p)
{
delete []p;
p = NULL;
}
}
g_vBaseWebPageDataSet.clear();

for (vector<WebPageData>::iterator it = g_vSecondWebPageDataSet.begin(); it != g_vSecondWebPageDataSet.end(); it++)
{
WebPageData p = *it;

if ( p.pData)
{
delete []p.pData;
p.pData = NULL;
}
}
g_vSecondWebPageDataSet.clear();

for (vector<WebPageData>::iterator it = g_vFinalWebPageDataSet.begin(); it != g_vFinalWebPageDataSet.end(); it++)
{
WebPageData p = *it;

if ( p.pData)
{
delete []p.pData;
p.pData = NULL;
}
}
g_vFinalWebPageDataSet.clear();

COUT<<_T("是否保存? y/n :");
TCHAR cFlag;
CIN>>cFlag;
if (cFlag == L'y'

读书人网 >VC/MFC

热点推荐