读书人

获取网页源代码的办法!该如何解决

发布时间: 2012-03-22 17:43:57 作者: rapoo

获取网页源代码的办法!
我大概需要对15万个网页逐一进行查询(通过分析网页源代码来提取数据),例如:http://www.163.com/name=000001
其中000001是变量。请大家告诉我用什么方法比较好,能否给出一段简单的示例代码?

[解决办法]
可以用 WebBrowser 控件等来获得网页的 DOM。
不同的网页可以用 Navigate 方法依次打开:获得当前网页的 DOM 后,再打开下一个网页,如此继续。
[解决办法]
写了个类。试试用它来去获取网页代码。
用法也比较简单

C/C++ code
// httpGet.h
//
// Minimal blocking HTTP/1.1 GET client (plain http on port 80 only).
//
// Typical use:
//     XBBHttpGet page("http://www.example.com/index.html");
//     if (page.Get() == 0) { use(page.GetBuffer()); }
#ifndef httpGetH
#define httpGetH
//---------------------------------------
#ifdef _WIN32
// winsock2.h must precede windows.h: otherwise windows.h drags in the legacy
// winsock.h, whose declarations collide with winsock2.h.
#include <winsock2.h>
#include <windows.h>
#else
// POSIX fallback so the class can also be built and tested off Windows.
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <unistd.h>
#include <errno.h>
typedef char *LPSTR;
#endif
#include <stdio.h>
#include <string.h>
#include <string>
#include <vector>

// Retained only because existing includers may depend on it; the code in
// this header qualifies every standard name explicitly.
using namespace std;

// Platform glue used by XBBHttpGet. Not part of the public interface.
namespace XBBHttpGetDetail
{
#ifdef _WIN32
    typedef SOCKET SocketHandle;
#else
    typedef int SocketHandle;
#endif

    // Error code of the most recent failed socket call.
    inline int LastSocketError()
    {
#ifdef _WIN32
        return WSAGetLastError();
#else
        return errno;
#endif
    }

    // Error code of the most recent failed gethostbyname() call.
    inline int LastResolveError()
    {
#ifdef _WIN32
        return WSAGetLastError();
#else
        return h_errno;  // gethostbyname reports through h_errno, not errno
#endif
    }

    // Code reported when there is no usable host name to resolve.
    inline int HostNotFoundError()
    {
#ifdef _WIN32
        return WSAHOST_NOT_FOUND;
#else
        return HOST_NOT_FOUND;
#endif
    }

    // Get() uses 0 for success, so a failure must never be reported as 0.
    inline int NonZero(int code)
    {
        return code != 0 ? code : -1;
    }

    // Flags for send(): avoid SIGPIPE where the platform offers the option.
    inline int SendFlags()
    {
#if !defined(_WIN32) && defined(MSG_NOSIGNAL)
        return MSG_NOSIGNAL;
#else
        return 0;
#endif
    }

    // ASCII-only lowercase; host names and the scheme are ASCII, and this
    // avoids any dependence on the current locale.
    inline char LowerAscii(char c)
    {
        return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
    }

    // Pairs WSAStartup with WSACleanup (no-op on non-Windows builds).
    struct NetSession
    {
        int error;  // 0 when the socket library is ready for use

        NetSession() : error(0)
        {
#ifdef _WIN32
            WSADATA data;
            // WSAStartup returns its error code directly; WSAGetLastError
            // must not be used for it.
            error = ::WSAStartup(MAKEWORD(2, 2), &data);
#endif
        }

        ~NetSession()
        {
#ifdef _WIN32
            if (error == 0)
                ::WSACleanup();
#endif
        }

    private:
        NetSession(const NetSession &);             // non-copyable
        NetSession &operator=(const NetSession &);  // non-copyable
    };

    // Owns a socket handle and closes it on every exit path.
    struct SocketGuard
    {
        SocketHandle handle;

        explicit SocketGuard(SocketHandle h) : handle(h) {}

        bool valid() const
        {
#ifdef _WIN32
            return handle != INVALID_SOCKET;
#else
            return handle >= 0;
#endif
        }

        ~SocketGuard()
        {
            if (!valid())
                return;
#ifdef _WIN32
            ::closesocket(handle);
#else
            ::close(handle);
#endif
        }

    private:
        SocketGuard(const SocketGuard &);             // non-copyable
        SocketGuard &operator=(const SocketGuard &);  // non-copyable
    };
}

// Fetches one URL with a blocking HTTP GET and keeps the raw response.
//
// Limitations: plain http only, always port 80 (a ":port" suffix is not
// parsed), IPv4 only, no redirects, no decoding of chunked bodies.
class XBBHttpGet
{
private:
    std::string m_Host;       // lowercased host name taken from the URL
    std::string m_File;       // request target (path + query), case preserved
    std::vector<char> m_Buf;  // raw response + trailing '\0'; empty until Get() succeeds

    // True when `url` starts with "http://" (compared case-insensitively).
    static bool HasHttpScheme(const std::string &url)
    {
        static const char scheme[] = "http://";
        const size_t len = sizeof(scheme) - 1;
        if (url.size() < len)
            return false;
        for (size_t i = 0; i < len; ++i)
        {
            if (XBBHttpGetDetail::LowerAscii(url[i]) != scheme[i])
                return false;
        }
        return true;
    }

public:
    // Splits URL into host and request target. No network activity here.
    //
    // URL: "http://host/path?query" or "host/path"; the scheme is optional.
    //      A missing path becomes "/". NULL is treated as an empty URL, for
    //      which Get() later fails.
    //
    // Only the host is lowercased: paths and query strings are
    // case-sensitive and are sent exactly as given. The parameter is
    // `const char *` so literals are accepted; LPSTR arguments still convert.
    XBBHttpGet(const char *URL)
    {
        const std::string url(URL != NULL ? URL : "");

        // Only a leading scheme is stripped; an "http://" that appears later
        // (for example inside a query string) is part of the request target.
        const size_t hostStart = HasHttpScheme(url) ? strlen("http://") : 0;
        const size_t slash = url.find('/', hostStart);

        if (slash == std::string::npos)
        {
            m_Host = url.substr(hostStart);
            m_File = "/";
        }
        else
        {
            m_Host = url.substr(hostStart, slash - hostStart);
            m_File = url.substr(slash);
        }

        for (size_t i = 0; i < m_Host.size(); ++i)
            m_Host[i] = XBBHttpGetDetail::LowerAscii(m_Host[i]);
    }

    // Members own their storage, so the implicit destructor, copy
    // constructor and copy assignment are all correct.

    // Host name parsed from the URL (lowercased).
    const char *GetHost() const { return m_Host.c_str(); }

    // Request target parsed from the URL ("/" when the URL had no path).
    const char *GetFile() const { return m_File.c_str(); }

    // Raw response of the last successful Get(): status line, headers and
    // body, NUL-terminated. Returns NULL if no Get() has succeeded yet.
    // The pointer stays valid until the next Get() or until destruction.
    LPSTR GetBuffer()
    {
        if (m_Buf.empty())
            return NULL;
        return &m_Buf[0];
    }

    // Number of response bytes behind GetBuffer(), excluding the terminator.
    size_t GetLength() const
    {
        return m_Buf.empty() ? 0 : m_Buf.size() - 1;
    }

    // Performs the request and stores the whole response.
    //
    // Returns 0 on success. On failure returns a non-zero code: a Winsock
    // error on Windows; errno (or h_errno for a failed name lookup)
    // elsewhere; -1 when no specific code is available. After a failure
    // GetBuffer() returns NULL.
    int Get()
    {
        namespace net = XBBHttpGetDetail;

        m_Buf.clear();  // never expose a stale page after a failed refetch

        net::NetSession session;
        if (session.error != 0)
            return session.error;

        // On Windows gethostbyname("") resolves the local machine, which is
        // never what an empty URL means.
        if (m_Host.empty())
            return net::HostNotFoundError();

        // Resolve the host name to an IPv4 address.
        struct hostent *entry = ::gethostbyname(m_Host.c_str());
        if (entry == NULL)
            return net::NonZero(net::LastResolveError());
        if (entry->h_addrtype != AF_INET ||
            static_cast<size_t>(entry->h_length) != sizeof(struct in_addr) ||
            entry->h_addr_list[0] == NULL)
        {
            return -1;
        }

        struct sockaddr_in addr;
        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_port = htons(80);
        memcpy(&addr.sin_addr, entry->h_addr_list[0], sizeof(addr.sin_addr));

        // Create the socket; the guard closes it on every return below.
        net::SocketGuard sock(::socket(AF_INET, SOCK_STREAM, 0));
        if (!sock.valid())
            return net::NonZero(net::LastSocketError());

        if (::connect(sock.handle, reinterpret_cast<struct sockaddr *>(&addr),
                      sizeof(addr)) != 0)
        {
            return net::NonZero(net::LastSocketError());
        }

        // Build the HTTP request. "Connection: close" matters: HTTP/1.1
        // connections are persistent by default, and the receive loop below
        // relies on the server closing the connection to mark the end.
        std::string request;
        request += "GET ";
        request += m_File;
        request += " HTTP/1.1\r\n";
        request += "Host: ";
        request += m_Host;
        request += "\r\n";
        request += "User-Agent: XBBHttpGet\r\n";
        request += "Connection: close\r\n\r\n";

        // Send the request; send() may accept only part of the data.
        size_t sent = 0;
        while (sent < request.size())
        {
            const int n = static_cast<int>(
                ::send(sock.handle, request.data() + sent,
                       static_cast<int>(request.size() - sent),
                       net::SendFlags()));
            if (n <= 0)
                return net::NonZero(net::LastSocketError());
            sent += static_cast<size_t>(n);
        }

        // Receive until the peer closes the connection. Bytes are appended
        // by count, so a completely filled chunk or embedded NULs are safe.
        std::vector<char> response;
        char chunk[4096];
        for (;;)
        {
            const int n = static_cast<int>(
                ::recv(sock.handle, chunk, sizeof(chunk), 0));
            if (n == 0)
                break;
            if (n < 0)
                return net::NonZero(net::LastSocketError());
            response.insert(response.end(), chunk, chunk + n);
        }

        response.push_back('\0');  // callers treat GetBuffer() as a C string
        m_Buf.swap(response);
        return 0;
    }
};
#endif

读书人网 >C++ Builder

热点推荐