新手求助,C++实现HTTP协议出现中文乱码。
最近刚刚学HTTP协议,抓取网页时比如百度、新浪都没有问题,但抓取SOHU的时候就会出现中文乱码问题,求助。
源码:
- C/C++ code
#include <cstdio>#include <iostream>#include <cstring>#include <winsock.h>#pragma comment(lib,"ws2_32.lib")using namespace std;int main(){ WSADATA wsaData={0}; SOCKET sock; struct sockaddr_in addr; struct hostent *pUrl; char myurl[]="www.sohu.com"; char host[BUFSIZ],GET[BUFSIZ],*pHost=0; char header[BUFSIZ]=""; static char text[BUFSIZ]; if(WSAStartup(MAKEWORD(2,2),&wsaData)) { printf("WSA failed\n"); return 0; } for(pHost=myurl;*pHost!='/'&&*pHost!='\0';pHost++); if(int(pHost-myurl)==strlen(myurl)) { strcpy(GET,"/"); } else { strcpy(GET,pHost); } *pHost='\0'; strcpy(host,myurl); sock=socket(AF_INET,SOCK_STREAM,0); pUrl=gethostbyname(host); addr.sin_family=AF_INET; addr.sin_addr.s_addr=*((unsigned long*)pUrl->h_addr); addr.sin_port=htons(80); strcat(header, "GET "); strcat(header, GET); strcat(header, " HTTP/1.1\r\n"); strcat(header, "HOST: "); strcat(header, host); strcat(header,"\r\nAccept-language:zh-CN"); strcat(header, "\r\nConnection: Close\r\n\r\n"); connect(sock,(SOCKADDR*)&addr,sizeof(addr)); send(sock,header,strlen(header),0); while(recv(sock,text,BUFSIZ,0)>0) { cout<<text; strnset(text,'\0',BUFSIZ); } closesocket(sock); WSACleanup(); return 0;}
[解决办法]
用你的程序运行了,发现返回的内容中前面部分是:
host: www.sohu.com
HTTP/1.1 200 OK
Content-Type: text/html
Content-Length: 78418
Connection: close
Date: Wed, 28 Mar 2012 06:43:41 GMT
Server: SWS
Vary: Accept-Encoding,X-Up-Calling-Line-id,X-Source-ID,X-Up-Bearer-Type
Cache-Control: max-age=70
Expires: Wed, 28 Mar 2012 06:44:51 GMT
Last-Modified: Wed, 28 Mar 2012 06:42:51 GMT
Content-Encoding: gzip
FSS-Cache: HIT from 4086639.6118265.5480846
----------------------------------------------------
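正如楼上所料,响应头里有 Content-Encoding: gzip,说明正文是 gzip 压缩格式的!把压缩后的二进制数据直接当作文本输出,看到的自然就是"乱码"。解决办法:要么在收到数据后先解压(例如使用 zlib),要么在请求头中加上 Accept-Encoding: identity,要求服务器返回未压缩的内容。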