读书人

Socket下载页面部分出现乱码请大家帮

发布时间: 2012-01-07 21:41:56 作者: rapoo

Socket下载页面部分出现乱码,请大家帮忙解决
采集某一站点页面,该站点页面使用gb2312编码,但是发现采集后的部分页面中会出现少量的乱码。困扰很久了,请大家帮忙,以下是采集部分的代码。请大家看看,是哪里有问题?

#region public static string GetClientBySocket(string UrlString) //通过Socket取得页面
/// <summary>
/// 通过Socket取得页面
/// </summary>
/// <param name= "UrlString "> </param>
/// <returns> </returns>
public static string GetClientBySocket(string UrlString)
{
string HostName = URLHelper.GetHostName(UrlString);
IPAddress[] ips = Dns.GetHostAddresses(HostName);
IPAddress ip = ips[0];
IPEndPoint serverhost = new IPEndPoint(ip, 80);
Socket clientSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
clientSocket.Connect(serverhost);

string httpReq = "GET " + UrlString + " HTTP/1.0 \r\n ";
httpReq += "Host: " + HostName + " \r\n ";
httpReq += "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98) \r\n ";
httpReq += "Accept:*/* \r\n ";
httpReq += "Connection:Keep-Alive \r\n\r\n ";
string txtHTML= " ";
try
{
clientSocket.Send(System.Text.Encoding.ASCII.GetBytes(httpReq));

Byte[] buffer = new byte[10240];
int byteCount = clientSocket.Receive(buffer, buffer.Length, 0);


txtHTML = Encoding.ASCII.GetString(buffer, 0, byteCount);
while (byteCount > 0)
{
byteCount = clientSocket.Receive(buffer, buffer.Length, 0);
//txtHTML = txtHTML + Encoding.Default.GetString(buffer, 0, byteCount);
txtHTML = txtHTML + Encoding.GetEncoding( "gb2312 ").GetString(buffer, 0, byteCount);
}

//clientSocket.Close();
int index = txtHTML.IndexOf( "Location: ");
if (index > 0)
{
MatchCollection MatchList = RegexHelper.DoRegex(txtHTML, "Location:(? <URL> .*?)\r ");
string url = MatchList[0].Groups[ "URL "].Value;
txtHTML = GetClientBySocket(url);
}
}
catch (Exception err)
{
string errMessage = String.Format( "网络错误,Socket网络数据下载失败:{0} ", UrlString);
string Forder = ConfigurationManager.AppSettings[ "LogFolder "];
ILogWriter LogWriter = new WinLogWriter();


LogWriter.AddLog(err, errMessage, Forder);
}
return txtHTML;
}
#endregion

[解决办法]
try
{
clientSocket.Send(System.Text.Encoding.UTF8.GetBytes(httpReq));



[解决办法]
应该是frame页有不同编码的问题吧

[解决办法]
txtHTML = System.Text.Encoding.Default.GetString(buffer);
[解决办法]
也不写出最终解决方案,没道德的说

读书人网 >C#

热点推荐