抓网页的小程序,为什么只能取到比较少的一部分网页代码?
private void button1_Click(object sender, EventArgs e)
{
string str = " ";
string FileTxt = " ";
int count = 0;
FileTxt = this.filetxt.Text;
if (FileTxt== " ")
{ FileTxt = "html.txt "; }
int begindex = 0;
int endindex = 0;
string num = " ";
byte[] buf = new byte[38192000];
num = this.tb1.Text.ToString();
string keyword = this.keyword.Text;
string kwend = this.kwend.Text;
int index = num.IndexOf(keyword);
int indexend = num.IndexOf(kwend);
int start=0;
index= index + keyword.Length;
begindex = Convert.ToInt32(num.Substring(index, indexend-index));
endindex = Convert.ToInt32(this.tb2.Text);
for (; begindex <= endindex; begindex++)
{
string page = tb1.Text.Substring(0, index - 1) + begindex + tb1.Text.Substring(indexend);
HttpWebRequest request = (HttpWebRequest)
WebRequest.Create(page);
HttpWebResponse response = (HttpWebResponse)
request.GetResponse();
Stream resStream = response.GetResponseStream();
count = resStream.Read(buf, 0, buf.Length);
str = str + Encoding.Default.GetString(buf, 0, count);
resStream.Close();
}
this.LoadToText(str, FileTxt);
}
当点击抓取按钮时,程序根网页url(123.asp?id=456&page=1)里 id的范围抓取网页html源码,所有的网页能全部抓到了,但都不全,是 byte[]错了吗?高手指教!
[解决办法]
count = resStream.Read(buf, 0, buf.Length);
这个地方要分几次读, 你的缓冲区new byte[38192000] 太大了,
,内部socket的缓冲区一次根本读不下那么多, 所以你得循环几次来读取。
你不考虑下,你的内存么
[解决办法]
使用下边这个通用获取远程web函数来试验,专门使用StreamReader实例来获取流比较稳当点
/// <summary>
/// 判断页面是否存在,并返回页面快照信息
/// </summary>
/// <param name= "urlstr "> 远程地址 </param>
/// <param name= "strEncoding "> 编码格式 </param>
/// <returns> 获取的数据,没有则返回空 </returns>
private StringBuilder FValidAndGetURL(string urlstr,string strEncoding)
{
//lock_GetPageInfo.WaitOne(); //线程加锁
WebResponse response = null;
Stream stream = null;
StreamReader reader = null;
try
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(@urlstr.Trim());
request.Timeout=60000;
request.Method= "GET ";
request.AllowAutoRedirect=true;
request.UserAgent= "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727) ";
request.Accept=@ "*/* ";
//request.CookieContainer=this.Get_CookieContainer(urlstr.Trim());
response = request.GetResponse();
if (!response.ContentType.ToLower().StartsWith( "text/ "))
{
//SaveBinaryFile(response);
MessageBox.Show( "获取的不是web信息: " + response.ContentType.Trim());
return null;
}
stream = response.GetResponseStream();
System.Text.Encoding encoding;
switch(strEncoding.Trim().ToUpper())
{
case "UTF-8 ":
{
encoding = Encoding.UTF8;
break;
}
case "UTF-7 ":
{
encoding = Encoding.UTF7;
break;
}
case "UNICODE ":
{
encoding = Encoding.Unicode;
break;
}
default:
{
encoding = Encoding.Default;
break;
}
}
reader = new StreamReader(stream, encoding);
StringBuilder buffer=new StringBuilder();
buffer.Append(reader.ReadToEnd());
reader.Close();
stream.Close();
response.Close();
return buffer;
}
catch (WebException e)
{
MessageBox.Show(e.Message);
return null;
}
catch (IOException e)
{
MessageBox.Show(e.Message);
return null;
}
finally
{
if (reader != null)
{
reader.Close();
}
if (stream != null)
{
stream.Close();
}
if (response != null)
{
response.Close();
}
}
//lock_GetPageInfo.ReleaseMutex();//线程解锁
}