如何抓取查询结果中的列表信息
本帖最后由 chenxi0516 于 2012-11-28 11:05:38 编辑。如何抓取 http://search.anccnet.com/searchResult.aspx?keyword=%u9171%u6cb9 这类商品列表信息,并保存到数据库中?
给个思路,有参考程序的最好了,谢谢。。。没写过类似这样的程序,无从下手。。
在线等。。。。
[最优解释]
// Scrapes every result page of search.anccnet.com for one keyword into a
// DataTable (one row per product) and binds it to GridView1.
DataTable dt = new DataTable();
dt.Columns.Add("商品条码", typeof(string));
dt.Columns.Add("名称", typeof(string));
dt.Columns.Add("规格", typeof(string));
dt.Columns.Add("描述", typeof(string));
dt.Columns.Add("商标", typeof(string));
dt.Columns.Add("发布厂家", typeof(string));

// The keyword is URL-encoded with GB2312 and the response is decoded with the same encoding.
Encoding encoding = Encoding.GetEncoding("gb2312");
string keyword = HttpUtility.UrlEncode("麻油", encoding);
string url = @"http://search.anccnet.com/searchResult.aspx?keyword=" + keyword;
HtmlWeb htmlWeb = new HtmlWeb();
htmlWeb.OverrideEncoding = encoding;

// First request: POST a captured view-state payload, only to read the total page count.
// NOTE(review): the payload is a hard-coded capture — confirm it is still accepted by the site.
htmlWeb.PreRequest = x =>
{
    byte[] bytes = encoding.GetBytes("/wEPDwUKMTA5MTc1OTY3Mg9kFgICAQ9kFgYCAw8PFgIeBFRleHQFYjxmb250IGNvbG9yPSdibGFjayc+57O757uf5Lit56ym5ZCI5p2h5Lu2PC9mb250PiDphbHmsrk8Zm9udCBjb2xvcj0nYmxhY2snPiDnmoTllYblk4HmnInvvJo8L2ZvbnQ+ZGQCBw8PFgQeC1JlY29yZGNvdW50AowRHg5DdXN0b21JbmZvVGV4dAWTAeaAu+iusOW9leaVsO+8mjxmb250IGNvbG9yPSJibHVlIj48Yj4yMTg4PC9iPjwvZm9udD4g5oC76aG15pWw77yaPGZvbnQgY29sb3I9ImJsdWUiPjxiPjE0NjwvYj48L2ZvbnQ+IOW9k+WJjemhte+8mjxmb250IGNvbG9yPSJyZWQiPjxiPjE8L2I+PC9mb250PmRkAgkPFgIeB1Zpc2libGVoZGRegsQ/9sftuonRL+jiHPQitwQcKg==");
    x.ContentLength = bytes.Length;
    x.ContentType = "application/x-www-form-urlencoded";
    using (Stream stream = x.GetRequestStream())
    {
        stream.Write(bytes, 0, bytes.Length);
    }
    return true;
};
HtmlDocument htmlDoc = htmlWeb.Load(url, "POST");

// SelectSingleNode returns null when the pager is absent; fall back to a single
// page instead of throwing so that whatever the first page contains is still collected.
HtmlNode pageCountNode = htmlDoc.DocumentNode.SelectSingleNode("//table[@id='myPager']/tr[1]/td[1]/table/tr/td/font[2]");
int pageCount;
if (pageCountNode == null || !int.TryParse(pageCountNode.InnerText, out pageCount))
{
    pageCount = 1;
}

for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
{
    // Build the body outside the callback so each request captures its own payload
    // rather than the shared loop variable.
    byte[] postBody = encoding.GetBytes("__VIEWSTATE=%2FwEPDwUKMTA5MTc1OTY3Mg9kFgICAQ9kFgYCAw8PFgIeBFRleHQFYjxmb250IGNvbG9yPSdibGFjayc%2B57O757uf5Lit56ym5ZCI5p2h5Lu2PC9mb250PiDphbHmsrk8Zm9udCBjb2xvcj0nYmxhY2snPiDnmoTllYblk4HmnInvvJo8L2ZvbnQ%2BZGQCBw8PFgYeEEN1cnJlbnRQYWdlSW5kZXgCAh4OQ3VzdG9tSW5mb1RleHQFkwHmgLvorrDlvZXmlbDvvJo8Zm9udCBjb2xvcj0iYmx1ZSI%2BPGI%2BMjE4ODwvYj48L2ZvbnQ%2BIOaAu%2BmhteaVsO%2B8mjxmb250IGNvbG9yPSJibHVlIj48Yj4xNDY8L2I%2BPC9mb250PiDlvZPliY3pobXvvJo8Zm9udCBjb2xvcj0icmVkIj48Yj4yPC9iPjwvZm9udD4eC1JlY29yZGNvdW50AowRZGQCCQ8WAh4HVmlzaWJsZWhkZLozrKPv3rWTss7F9rFI0qlAJrVU&__EVENTTARGET=myPager&__EVENTARGUMENT=" + pageIndex + "&keyword=");
    htmlWeb.PreRequest = x =>
    {
        x.ContentLength = postBody.Length;
        x.ContentType = "application/x-www-form-urlencoded";
        using (Stream stream = x.GetRequestStream())
        {
            stream.Write(postBody, 0, postBody.Length);
        }
        return true;
    };
    htmlDoc = htmlWeb.Load(url, "POST");

    // The nodes come back in document order: a p-supplier <dl> followed by its p-info <dl>.
    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection dls = htmlDoc.DocumentNode.SelectNodes(@"//dl[@class='p-supplier' or @class='p-info']");
    if (dls != null)
    {
        // "i + 1 < Count" keeps an unpaired trailing node from causing an out-of-range access.
        for (int i = 0; i + 1 < dls.Count; i = i + 2)
        {
            HtmlNode supplier = dls[i];
            HtmlNode info = dls[i + 1];
            DataRow row = dt.NewRow();
            row["商标"] = supplier.SelectSingleNode(@"dd[1]").InnerText.Trim();
            row["发布厂家"] = supplier.SelectSingleNode(@"dd[2]/a").InnerText.Trim();
            row["商品条码"] = info.SelectSingleNode(@"dd[1]").InnerText.Trim();
            row["名称"] = info.SelectSingleNode(@"dd[2]").InnerText.Trim();
            row["规格"] = info.SelectSingleNode(@"dd[3]").InnerText.Trim();
            row["描述"] = info.SelectSingleNode(@"dd[4]").InnerText.Trim();
            dt.Rows.Add(row);
        }
    }

    // Progress log: one line per processed page.
    File.AppendAllText(@"c:\ttt.txt", DateTime.Now.ToString("HH:mm:ss") + ":第" + pageIndex + "页处理完\r\n");
}
GridView1.DataSource = dt;
GridView1.DataBind();
[其他解释]
通过网页源文件来找元素
<!-- Sample of one search result taken from the page source: one div.result per product. -->
<div class="result">
<p class="p-img" align="center"><a id="repList_ctl00_herl" target="_blank"> <img src="/img/empty_90-90.8.png" id="repList_ctl00_productimg" width="90" height="90" onload="limitImgSize(this, 90, 90,true);" /></a></p>
<!-- Supplier block: 1st dd = brand, 2nd dd = link whose text is the publishing manufacturer. -->
<dl class="p-supplier">
<dt>商标:</dt>
<dd>三富</dd>
<dt>发布厂家:</dt>
<dd><a id="repList_ctl00_firmLink" href="http://www.anccnet.com/info_search/factory/Detail.aspx?id=F25F56A9F703ED747435DEE7129FF5F679568959EBB9BF75849EDBE212FD6E31&temp=3" style="text-decoration:none;">辽宁三富酱油酿造有限公司</a></dd>
</dl>
<!-- Product block: dd elements in order are barcode, name, specification, description. -->
<!-- Note the leading whitespace/newline inside the dd values; callers need to trim them. -->
<dl class="p-info">
<dt>商品条码:</dt>
<dd>
06956306580019</dd>
<dt>名称:</dt>
<dd> 三富原味酿造酱油</dd>
<dt>规格型号:</dt>
<dd> 350ml袋装</dd>
<dt>描述:</dt>
<dd> ,酱油、三富、酿造、低盐固态</dd>
</dl>
<br clear="all" />
</div>
每一个class="result" 的div里面就是一条查询结果,<dl class="p-supplier">
是供应商信息,<dl class="p-info">是商品信息
[其他解释]
关键是,下一页,如何抓取,这个比较困难
[其他解释]
如果有下一页,模仿点击动作,点下一页,再采
[其他解释]
用下一页标记,在当前页面获取下一页的链接,再抓取,循环抓取,直到没有下一页标记为止!
[其他解释]
爬虫啊,好久没弄过了。学习下
[其他解释]
// Posts a barcode lookup form to liantu.com and writes the response body to the page.
System.Collections.Specialized.NameValueCollection PostVars = new System.Collections.Specialized.NameValueCollection();
// Add the form fields.
PostVars.Add("ean", "6923450601549");
PostVars.Add("verify", "c0af28e35d89117bccd58d16b192bec8");
try
{
    // WebClient is IDisposable; release it as soon as the upload is done.
    using (System.Net.WebClient WebClientObj = new System.Net.WebClient())
    {
        // UploadValues form-encodes the collection, sends the POST and returns the raw response.
        byte[] byRemoteInfo = WebClientObj.UploadValues("http://www.liantu.com/tiaoma/", "POST", PostVars);
        // Decode the response body and echo it to the page.
        string sRemoteInfo = System.Text.Encoding.UTF8.GetString(byRemoteInfo);
        Response.Write(sRemoteInfo);
    }
}
catch (System.Net.WebException ex)
{
    // Surface the failure instead of swallowing it: an empty catch made it
    // impossible to tell why no value was returned.
    Response.Write("请求失败:" + Server.HtmlEncode(ex.Message));
}
这样写,获取不到值,为什么呢
[其他解释]
用HtmlAgilityPack(网上下载最新版1.4.6),获取第一页的:
// Scrapes the first result page for one keyword into a DataTable and binds it to GridView1.
DataTable dt = new DataTable();
dt.Columns.Add("商品条码", typeof(string));
dt.Columns.Add("名称", typeof(string));
dt.Columns.Add("规格", typeof(string));
dt.Columns.Add("描述", typeof(string));
dt.Columns.Add("商标", typeof(string));
dt.Columns.Add("发布厂家", typeof(string));

// The keyword is URL-encoded with GB2312 and the response is decoded with the same encoding.
Encoding encoding = Encoding.GetEncoding("gb2312");
string keyword = HttpUtility.UrlEncode("酱油", encoding);
HtmlWeb htmlWeb = new HtmlWeb();
htmlWeb.OverrideEncoding = encoding;
HtmlDocument htmlDoc = htmlWeb.Load(@"http://search.anccnet.com/searchResult.aspx?keyword=" + keyword);

// The nodes come back in document order: a p-supplier <dl> followed by its p-info <dl>.
// SelectNodes returns null (not an empty collection) when nothing matches.
HtmlNodeCollection dls = htmlDoc.DocumentNode.SelectNodes(@"//dl[@class='p-supplier' or @class='p-info']");
if (dls != null)
{
    // "i + 1 < Count" keeps an unpaired trailing node from causing an out-of-range access.
    for (int i = 0; i + 1 < dls.Count; i = i + 2)
    {
        HtmlNode supplier = dls[i];
        HtmlNode info = dls[i + 1];
        DataRow row = dt.NewRow();
        // Trim: the page pads the <dd> values with leading whitespace/newlines.
        row["商标"] = supplier.SelectSingleNode(@"dd[1]").InnerText.Trim();
        row["发布厂家"] = supplier.SelectSingleNode(@"dd[2]/a").InnerText.Trim();
        row["商品条码"] = info.SelectSingleNode(@"dd[1]").InnerText.Trim();
        row["名称"] = info.SelectSingleNode(@"dd[2]").InnerText.Trim();
        row["规格"] = info.SelectSingleNode(@"dd[3]").InnerText.Trim();
        row["描述"] = info.SelectSingleNode(@"dd[4]").InnerText.Trim();
        dt.Rows.Add(row);
    }
}
GridView1.DataSource = dt;
GridView1.DataBind();
[其他解释]
后面的页面也取到了,这里我取3页:
// Scrapes result pages 1-3 for one keyword with raw HttpWebRequest POSTs,
// parses each response with HtmlAgilityPack, and binds the rows to GridView1.
DataTable dt = new DataTable();
dt.Columns.Add("商品条码", typeof(string));
dt.Columns.Add("名称", typeof(string));
dt.Columns.Add("规格", typeof(string));
dt.Columns.Add("描述", typeof(string));
dt.Columns.Add("商标", typeof(string));
dt.Columns.Add("发布厂家", typeof(string));

// The keyword is URL-encoded with GB2312 and the response is decoded with the same encoding.
Encoding encoding = Encoding.GetEncoding("gb2312");
string keyword = HttpUtility.UrlEncode("酱油", encoding);
string url = @"http://search.anccnet.com/searchResult.aspx?keyword=" + keyword;

for (int pageIndex = 1; pageIndex <= 3; pageIndex++)
{
    // A direct cast fails loudly if the URL scheme is not HTTP, instead of
    // yielding a null that blows up on the next line.
    HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
    httpWebRequest.Method = "POST";

    // Emulate the pager postback: captured view state plus the requested page index.
    // NOTE(review): the view state is a hard-coded capture — confirm it is still accepted by the site.
    byte[] bytes = encoding.GetBytes("__VIEWSTATE=%2FwEPDwUKMTA5MTc1OTY3Mg9kFgICAQ9kFgYCAw8PFgIeBFRleHQFYjxmb250IGNvbG9yPSdibGFjayc%2B57O757uf5Lit56ym5ZCI5p2h5Lu2PC9mb250PiDphbHmsrk8Zm9udCBjb2xvcj0nYmxhY2snPiDnmoTllYblk4HmnInvvJo8L2ZvbnQ%2BZGQCBw8PFgYeEEN1cnJlbnRQYWdlSW5kZXgCAh4OQ3VzdG9tSW5mb1RleHQFkwHmgLvorrDlvZXmlbDvvJo8Zm9udCBjb2xvcj0iYmx1ZSI%2BPGI%2BMjE4ODwvYj48L2ZvbnQ%2BIOaAu%2BmhteaVsO%2B8mjxmb250IGNvbG9yPSJibHVlIj48Yj4xNDY8L2I%2BPC9mb250PiDlvZPliY3pobXvvJo8Zm9udCBjb2xvcj0icmVkIj48Yj4yPC9iPjwvZm9udD4eC1JlY29yZGNvdW50AowRZGQCCQ8WAh4HVmlzaWJsZWhkZLozrKPv3rWTss7F9rFI0qlAJrVU&__EVENTTARGET=myPager&__EVENTARGUMENT=" + pageIndex + "&keyword=");
    httpWebRequest.ContentLength = bytes.Length;
    httpWebRequest.ContentType = "application/x-www-form-urlencoded";

    // using blocks guarantee the streams and the response are released even
    // when a write, read or parse throws.
    using (Stream requestStream = httpWebRequest.GetRequestStream())
    {
        requestStream.Write(bytes, 0, bytes.Length);
    }

    HtmlDocument htmlDoc = new HtmlDocument();
    using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
    using (Stream responseStream = httpWebResponse.GetResponseStream())
    {
        htmlDoc.Load(responseStream, encoding);
    }

    // The nodes come back in document order: a p-supplier <dl> followed by its p-info <dl>.
    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection dls = htmlDoc.DocumentNode.SelectNodes(@"//dl[@class='p-supplier' or @class='p-info']");
    if (dls == null)
    {
        continue;
    }

    // "i + 1 < Count" keeps an unpaired trailing node from causing an out-of-range access.
    for (int i = 0; i + 1 < dls.Count; i = i + 2)
    {
        HtmlNode supplier = dls[i];
        HtmlNode info = dls[i + 1];
        DataRow row = dt.NewRow();
        // Trim: the page pads the <dd> values with leading whitespace/newlines.
        row["商标"] = supplier.SelectSingleNode(@"dd[1]").InnerText.Trim();
        row["发布厂家"] = supplier.SelectSingleNode(@"dd[2]/a").InnerText.Trim();
        row["商品条码"] = info.SelectSingleNode(@"dd[1]").InnerText.Trim();
        row["名称"] = info.SelectSingleNode(@"dd[2]").InnerText.Trim();
        row["规格"] = info.SelectSingleNode(@"dd[3]").InnerText.Trim();
        row["描述"] = info.SelectSingleNode(@"dd[4]").InnerText.Trim();
        dt.Rows.Add(row);
    }
}
GridView1.DataSource = dt;
GridView1.DataBind();
[其他解释]
完全用HtmlAgilityPack写:
// Scrapes result pages 1-3 for one keyword using only HtmlAgilityPack's HtmlWeb
// (POST body injected through PreRequest) and binds the rows to GridView1.
DataTable dt = new DataTable();
dt.Columns.Add("商品条码", typeof(string));
dt.Columns.Add("名称", typeof(string));
dt.Columns.Add("规格", typeof(string));
dt.Columns.Add("描述", typeof(string));
dt.Columns.Add("商标", typeof(string));
dt.Columns.Add("发布厂家", typeof(string));

// The keyword is URL-encoded with GB2312 and the response is decoded with the same encoding.
Encoding encoding = Encoding.GetEncoding("gb2312");
string keyword = HttpUtility.UrlEncode("酱油", encoding);
string url = @"http://search.anccnet.com/searchResult.aspx?keyword=" + keyword;

// One HtmlWeb is enough for all pages; only the PreRequest callback changes per page.
HtmlWeb htmlWeb = new HtmlWeb();
htmlWeb.OverrideEncoding = encoding;

for (int pageIndex = 1; pageIndex <= 3; pageIndex++)
{
    // Emulate the pager postback: captured view state plus the requested page index.
    // Built outside the callback so each request captures its own payload.
    // NOTE(review): the view state is a hard-coded capture — confirm it is still accepted by the site.
    byte[] postBody = encoding.GetBytes("__VIEWSTATE=%2FwEPDwUKMTA5MTc1OTY3Mg9kFgICAQ9kFgYCAw8PFgIeBFRleHQFYjxmb250IGNvbG9yPSdibGFjayc%2B57O757uf5Lit56ym5ZCI5p2h5Lu2PC9mb250PiDphbHmsrk8Zm9udCBjb2xvcj0nYmxhY2snPiDnmoTllYblk4HmnInvvJo8L2ZvbnQ%2BZGQCBw8PFgYeEEN1cnJlbnRQYWdlSW5kZXgCAh4OQ3VzdG9tSW5mb1RleHQFkwHmgLvorrDlvZXmlbDvvJo8Zm9udCBjb2xvcj0iYmx1ZSI%2BPGI%2BMjE4ODwvYj48L2ZvbnQ%2BIOaAu%2BmhteaVsO%2B8mjxmb250IGNvbG9yPSJibHVlIj48Yj4xNDY8L2I%2BPC9mb250PiDlvZPliY3pobXvvJo8Zm9udCBjb2xvcj0icmVkIj48Yj4yPC9iPjwvZm9udD4eC1JlY29yZGNvdW50AowRZGQCCQ8WAh4HVmlzaWJsZWhkZLozrKPv3rWTss7F9rFI0qlAJrVU&__EVENTTARGET=myPager&__EVENTARGUMENT=" + pageIndex + "&keyword=");
    htmlWeb.PreRequest = x =>
    {
        x.ContentLength = postBody.Length;
        x.ContentType = "application/x-www-form-urlencoded";
        using (Stream stream = x.GetRequestStream())
        {
            stream.Write(postBody, 0, postBody.Length);
        }
        return true;
    };
    HtmlDocument htmlDoc = htmlWeb.Load(url, "POST");

    // The nodes come back in document order: a p-supplier <dl> followed by its p-info <dl>.
    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection dls = htmlDoc.DocumentNode.SelectNodes(@"//dl[@class='p-supplier' or @class='p-info']");
    if (dls == null)
    {
        continue;
    }

    // "i + 1 < Count" keeps an unpaired trailing node from causing an out-of-range access.
    for (int i = 0; i + 1 < dls.Count; i = i + 2)
    {
        HtmlNode supplier = dls[i];
        HtmlNode info = dls[i + 1];
        DataRow row = dt.NewRow();
        // Trim: the page pads the <dd> values with leading whitespace/newlines.
        row["商标"] = supplier.SelectSingleNode(@"dd[1]").InnerText.Trim();
        row["发布厂家"] = supplier.SelectSingleNode(@"dd[2]/a").InnerText.Trim();
        row["商品条码"] = info.SelectSingleNode(@"dd[1]").InnerText.Trim();
        row["名称"] = info.SelectSingleNode(@"dd[2]").InnerText.Trim();
        row["规格"] = info.SelectSingleNode(@"dd[3]").InnerText.Trim();
        row["描述"] = info.SelectSingleNode(@"dd[4]").InnerText.Trim();
        dt.Rows.Add(row);
    }
}
GridView1.DataSource = dt;
GridView1.DataBind();
[其他解释]
页数能智能抓取到么?如何写啊,谢谢。。
[其他解释]
采集“酱油”的所有页,注意连续发送请求的时间间隔不能太短,否则会采集不到,这里大致为500毫秒发送一次:
// Scrapes every result page for one keyword, pausing 500 ms between requests,
// and binds the collected rows to GridView1.
DataTable dt = new DataTable();
dt.Columns.Add("商品条码", typeof(string));
dt.Columns.Add("名称", typeof(string));
dt.Columns.Add("规格", typeof(string));
dt.Columns.Add("描述", typeof(string));
dt.Columns.Add("商标", typeof(string));
dt.Columns.Add("发布厂家", typeof(string));

// The keyword is URL-encoded with GB2312 and the response is decoded with the same encoding.
Encoding encoding = Encoding.GetEncoding("gb2312");
string keyword = HttpUtility.UrlEncode("酱油", encoding);
string url = @"http://search.anccnet.com/searchResult.aspx?keyword=" + keyword;
HtmlWeb htmlWeb = new HtmlWeb();
htmlWeb.OverrideEncoding = encoding;

// Plain GET of the first page, only to read the total page count from the pager.
HtmlDocument htmlDoc = htmlWeb.Load(url);

// SelectSingleNode returns null when the pager is absent; fall back to a single
// page instead of throwing so that whatever the first page contains is still collected.
HtmlNode pageCountNode = htmlDoc.DocumentNode.SelectSingleNode("//table[@id='myPager']//table[1]//td[1]/font[2]/b");
int pageCount;
if (pageCountNode == null || !int.TryParse(pageCountNode.InnerText, out pageCount))
{
    pageCount = 1;
}

for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
{
    // Emulate the pager postback: captured view state plus the requested page index.
    // Built outside the callback so each request captures its own payload.
    // NOTE(review): the view state is a hard-coded capture — confirm it is still accepted by the site.
    byte[] postBody = encoding.GetBytes("__VIEWSTATE=%2FwEPDwUKMTA5MTc1OTY3Mg9kFgICAQ9kFgYCAw8PFgIeBFRleHQFYjxmb250IGNvbG9yPSdibGFjayc%2B57O757uf5Lit56ym5ZCI5p2h5Lu2PC9mb250PiDphbHmsrk8Zm9udCBjb2xvcj0nYmxhY2snPiDnmoTllYblk4HmnInvvJo8L2ZvbnQ%2BZGQCBw8PFgYeEEN1cnJlbnRQYWdlSW5kZXgCAh4OQ3VzdG9tSW5mb1RleHQFkwHmgLvorrDlvZXmlbDvvJo8Zm9udCBjb2xvcj0iYmx1ZSI%2BPGI%2BMjE4ODwvYj48L2ZvbnQ%2BIOaAu%2BmhteaVsO%2B8mjxmb250IGNvbG9yPSJibHVlIj48Yj4xNDY8L2I%2BPC9mb250PiDlvZPliY3pobXvvJo8Zm9udCBjb2xvcj0icmVkIj48Yj4yPC9iPjwvZm9udD4eC1JlY29yZGNvdW50AowRZGQCCQ8WAh4HVmlzaWJsZWhkZLozrKPv3rWTss7F9rFI0qlAJrVU&__EVENTTARGET=myPager&__EVENTARGUMENT=" + pageIndex + "&keyword=");
    htmlWeb.PreRequest = x =>
    {
        x.ContentLength = postBody.Length;
        x.ContentType = "application/x-www-form-urlencoded";
        using (Stream stream = x.GetRequestStream())
        {
            stream.Write(postBody, 0, postBody.Length);
        }
        return true;
    };
    htmlDoc = htmlWeb.Load(url, "POST");

    // The nodes come back in document order: a p-supplier <dl> followed by its p-info <dl>.
    // SelectNodes returns null (not an empty collection) when nothing matches, e.g.
    // when the site returns no results because requests arrived too quickly.
    HtmlNodeCollection dls = htmlDoc.DocumentNode.SelectNodes(@"//dl[@class='p-supplier' or @class='p-info']");
    if (dls != null)
    {
        // "i + 1 < Count" keeps an unpaired trailing node from causing an out-of-range access.
        for (int i = 0; i + 1 < dls.Count; i = i + 2)
        {
            HtmlNode supplier = dls[i];
            HtmlNode info = dls[i + 1];
            DataRow row = dt.NewRow();
            row["商标"] = supplier.SelectSingleNode(@"dd[1]").InnerText.Trim();
            row["发布厂家"] = supplier.SelectSingleNode(@"dd[2]/a").InnerText.Trim();
            row["商品条码"] = info.SelectSingleNode(@"dd[1]").InnerText.Trim();
            row["名称"] = info.SelectSingleNode(@"dd[2]").InnerText.Trim();
            row["规格"] = info.SelectSingleNode(@"dd[3]").InnerText.Trim();
            row["描述"] = info.SelectSingleNode(@"dd[4]").InnerText.Trim();
            dt.Rows.Add(row);
        }
    }

    // Throttle: back-to-back requests come back without results, so wait between pages.
    Thread.Sleep(500);
}
GridView1.DataSource = dt;
GridView1.DataBind();
[其他解释]
哦,好像有点问题,等等。
[其他解释]
哈哈,研究了近半天时间,终于搞定了(前面的有误),楼主最好加到100分吧,我发给你代码。
[其他解释]
可以啊。 http://bbs.csdn.net/topics/390303081?page=1#post-393093696 这个帖子也是我开的,问题解决了,我都把分结给你啊。。
发我邮箱吧 liangguang0515@163.com 多谢了。
[其他解释]
想抓取淘宝搜索结果,怎么个修改办法,求dalmeeme 帮忙做个示例
[其他解释]
需要.net3.5版本。
[其他解释]
htmlWeb.PreRequest = x => 为什么我编译到这里的时候,提示错误啊。
错误 1:无效的表达式项“>”(文件:E:\test\Get_zgspxxpt_eg.aspx.cs,第 33 行,第 37 列,项目:E:\test\)
[其他解释]
.net 2.0可以用htmlWeb.PreRequest = delegate(HttpWebRequest x){....};