求教:C#关于网页抓取的问题
本人正在研究怎么抓取网页数据,并且插入到数据库中。目标是一个英文网站,网页地址指向的是一个期刊的信息页,我怎么把这个网页中的文章内容下载下来,并且插入到Access数据库中呢?请哪位高人指点,谢谢!有意可以加1192488985联系!
[解决办法]
帮顶!
[解决办法]
- C# code
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Data.SqlClient;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;

namespace GetQQData
{
    /// <summary>
    /// Form that downloads the HTML of a page, extracts QQ numbers from its
    /// <c>&lt;li&gt;</c> items and stores the ones not yet present in the QQ table.
    /// </summary>
    public partial class Form1 : Form
    {
        // Database connection string.
        // NOTE(review): credentials are hard-coded here; consider moving them to configuration.
        private static readonly string strConn = "server=.;database=QQData;uid=xxx;password=xxx";

        public Form1()
        {
            InitializeComponent();
            tb_WebAddress.Text = "地址,自己替换";
        }

        /// <summary>
        /// Extracts every QQ number from the HTML currently shown in the content box
        /// and inserts the ones that are not already stored, then reports how many
        /// rows were saved.
        /// </summary>
        private void btn_SaveAs_Click(object sender, EventArgs e)
        {
            int saved = 0;

            MatchCollection items = GetValues(tb_HtmlContent.Text, "<li>", "</li>");

            // GetValues returns null when matching fails; treat that as "nothing to save".
            if (items != null)
            {
                foreach (Match item in items)
                {
                    string qqNum = GetValue(item.Value, "qq=", "\"><img");

                    // An item without a QQ number yields an empty string; do not store a blank row.
                    if (string.IsNullOrEmpty(qqNum))
                    {
                        continue;
                    }

                    // SelectQQ returns a non-zero value both when the number exists
                    // and when the lookup failed, so either case skips the insert.
                    if (SelectQQ(qqNum) != 0)
                    {
                        continue;
                    }

                    // Values are passed as parameters: the number comes from scraped
                    // HTML and must never be concatenated into the SQL text.
                    saved += ExecuteNonQuery(
                        "insert into QQ(qqnum,addtime) values (@qqnum,@addtime)",
                        new SqlParameter("@qqnum", qqNum),
                        new SqlParameter("@addtime", DateTime.Now));
                }
            }

            lbl_Over.Text = "保存了" + saved + "个QQ";
        }

        /// <summary>
        /// Counts the rows of the QQ table whose qqnum equals <paramref name="qqnum"/>.
        /// </summary>
        /// <param name="qqnum">QQ number to look up.</param>
        /// <returns>
        /// The number of matching rows, or 1 when the query fails so that callers
        /// treat the number as already present and do not attempt an insert.
        /// </returns>
        public int SelectQQ(string qqnum)
        {
            const string sql = "select count(*) from QQ where qqnum=@qqnum";

            using (SqlConnection conn = new SqlConnection(strConn))
            using (SqlCommand cmd = new SqlCommand(sql, conn))
            {
                cmd.Parameters.AddWithValue("@qqnum", (object)qqnum ?? DBNull.Value);
                try
                {
                    conn.Open();
                    return Convert.ToInt32(cmd.ExecuteScalar());
                }
                catch (Exception ex)
                {
                    // Best effort: keep the original "assume it exists" fallback, but leave a trace.
                    Trace.TraceError("SelectQQ failed: {0}", ex);
                    return 1;
                }
            }
        }

        /// <summary>
        /// Finds every piece of text located between a start marker and an end marker.
        /// </summary>
        /// <param name="strHtml">Text to search.</param>
        /// <param name="strStart">Start marker; inserted into the pattern as a regex fragment (not escaped).</param>
        /// <param name="strEnd">End marker; inserted into the pattern as a regex fragment (not escaped).</param>
        /// <returns>
        /// The matches (markers excluded), or <c>null</c> when <paramref name="strHtml"/>
        /// is null or the resulting pattern is invalid.
        /// </returns>
        public static MatchCollection GetValues(string strHtml, string strStart, string strEnd)
        {
            try
            {
                return BuildBetweenRegex(strStart, strEnd).Matches(strHtml);
            }
            catch (ArgumentException)
            {
                // Null input or an invalid pattern built from the markers.
                return null;
            }
        }

        /// <summary>
        /// Finds the first piece of text located between a start marker and an end marker.
        /// </summary>
        /// <param name="strHtml">Text to search.</param>
        /// <param name="strStart">Start marker; inserted into the pattern as a regex fragment (not escaped).</param>
        /// <param name="strEnd">End marker; inserted into the pattern as a regex fragment (not escaped).</param>
        /// <returns>
        /// The first match (markers excluded), or an empty string when nothing matches,
        /// <paramref name="strHtml"/> is null, or the resulting pattern is invalid.
        /// </returns>
        public static string GetValue(string strHtml, string strStart, string strEnd)
        {
            try
            {
                return BuildBetweenRegex(strStart, strEnd).Match(strHtml).Value;
            }
            catch (ArgumentException)
            {
                // Null input or an invalid pattern built from the markers.
                return string.Empty;
            }
        }

        // Builds the lazy "everything between start and end" pattern shared by GetValue and GetValues.
        private static Regex BuildBetweenRegex(string strStart, string strEnd)
        {
            return new Regex(
                "(?<=(" + strStart + "))[.\\s\\S]*?(?=(" + strEnd + "))",
                RegexOptions.Multiline | RegexOptions.Singleline);
        }

        /// <summary>
        /// Executes an insert/update/delete statement.
        /// </summary>
        /// <param name="strSql">SQL statement to execute.</param>
        /// <returns>The number of affected rows, or 0 when execution fails.</returns>
        public static int ExecuteNonQuery(string strSql)
        {
            return ExecuteNonQuery(strSql, new SqlParameter[0]);
        }

        /// <summary>
        /// Executes a parameterized insert/update/delete statement.
        /// </summary>
        /// <param name="strSql">SQL statement containing parameter placeholders.</param>
        /// <param name="parameters">Values bound to the placeholders in <paramref name="strSql"/>.</param>
        /// <returns>The number of affected rows, or 0 when execution fails.</returns>
        public static int ExecuteNonQuery(string strSql, params SqlParameter[] parameters)
        {
            using (SqlConnection conn = new SqlConnection(strConn))
            using (SqlCommand cmd = new SqlCommand(strSql, conn))
            {
                if (parameters != null)
                {
                    cmd.Parameters.AddRange(parameters);
                }

                try
                {
                    conn.Open();
                    return cmd.ExecuteNonQuery();
                }
                catch (Exception ex)
                {
                    // Best effort: a failed statement counts as zero affected rows, but leave a trace.
                    Trace.TraceError("ExecuteNonQuery failed: {0}", ex);
                    return 0;
                }
            }
        }

        /// <summary>
        /// Clears the HTML content box and the status label.
        /// </summary>
        private void btn_EmptyContent_Click(object sender, EventArgs e)
        {
            tb_HtmlContent.Text = "";
            lbl_Over.Text = "";
        }

        /// <summary>
        /// Downloads the page whose address is in the address box and shows its source.
        /// </summary>
        private void btn_GetSoure_Click(object sender, EventArgs e)
        {
            string htmlUrl = tb_WebAddress.Text.Trim();
            try
            {
                tb_HtmlContent.Text = GetHtml(htmlUrl);
            }
            catch (Exception ex)
            {
                // UI boundary: an invalid address or a network failure must not
                // terminate the application, so report it to the user instead.
                lbl_Over.Text = "抓取失败:" + ex.Message;
            }
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <returns>The source text of the fetched page.</returns>
        /// <exception cref="UriFormatException">The address is not a valid URI.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string GetHtml(string url)
        {
            // using blocks release the response, stream and reader even when reading throws.
            using (WebResponse response = WebRequest.Create(url).GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("UTF-8")))
            {
                return reader.ReadToEnd();
            }
        }
    }
}
[解决办法]
问题解决了没 ?我没写过程序抓数据,但是知道怎么用工具来抓。
[解决办法]
使用HttpWebRequest或者WebClient来获取网页信息
然后利用正则匹配出你想要的数据
添加入库.
[解决办法]
其实就是跟你用浏览器请求一张页面一样
WebRequest和WebResponse可以
[解决办法]
做蜘蛛,可以考虑用xml文件的形式存放信息,这样可以分块,性能更好。首先正则要过硬,其次用多线程方式获取。整个实现还是比较麻烦的。
[解决办法]
为什么要抓取网页信息呢。
[解决办法]
学习 了。
[解决办法]
又一个弄采集的程序员,唉,努力吧!
[解决办法]
标记一下,可能我下一步也要研究这个。