读书人

winform中怎么提取指定网页中类似<a href="news.asp?id=3"的超链接

发布时间: 2012-01-06 22:55:18 作者: rapoo

winform中如何提取指定网页中的类似<a href="news.asp?id=3"的超链接
如标题
CSDN上都是如下的答案,但这样获得的超链接都是以http://开头的;而网页中形如 <a href="news.asp?id=3">内容</a> 的相对链接 news.asp?id=3 该如何获得?又如何检测它是否可以正常打开?
using System;
using System.Xml;
using System.Text;
using System.Net;
using System.IO;
using System.Collections;
using System.Text.RegularExpressions;

/// <summary>
/// Console utility that downloads a web page, extracts the absolute
/// http/https URLs found in its HTML and writes them to HyperLinks.xml.
/// </summary>
public class App
{
    /// <summary>
    /// Entry point: prompts for a page address, downloads the page, extracts
    /// its hyperlinks and writes them to an XML file in the working directory.
    /// </summary>
    public static void Main()
    {
        Console.Write("请输入一个网页地址:");
        string strURL = Console.ReadLine();

        // ReadLine returns null at end of input; an empty address cannot be fetched.
        if (strURL == null || strURL.Trim().Length == 0)
        {
            Console.WriteLine("未输入网页地址。");
            return;
        }

        strURL = strURL.Trim();

        // StartsWith is safe for inputs shorter than the prefix (Substring(0,7) was not),
        // and an explicit https:// address must not get http:// prepended.
        bool hasScheme =
            strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            strURL = "http://" + strURL;
        }

        Console.WriteLine("正在获取页面代码,请稍侯...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯...");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Downloads the HTML source of the given page.
    /// </summary>
    /// <param name="URL">Absolute address of the page to download.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);

        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // Request options must be set before GetResponse(); assigning them
        // afterwards cannot influence a request that has already been sent.
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // Dispose the response, its stream and the reader even if reading fails.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (Stream body = hwRes.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct absolute http/https URLs that occur in the HTML.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan; null or empty yields an empty list.</param>
    /// <returns>An <see cref="ArrayList"/> of unique URL strings in ordinal order.</returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (string.IsNullOrEmpty(htmlCode))
        {
            return al;
        }

        // The path character class deliberately excludes the space character:
        // with a space allowed, a match keeps running into the text that
        // follows the URL (e.g. unquoted attributes or surrounding prose).
        string strRegex = @"https?://([\w-]+\.)+[\w-]+(/[\w\-./?%&=]*)?";
        Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

        // A hash set makes duplicate filtering linear instead of rescanning
        // the result list for every match.
        System.Collections.Generic.HashSet<string> seen =
            new System.Collections.Generic.HashSet<string>(StringComparer.Ordinal);

        foreach (Match m in r.Matches(htmlCode))
        {
            if (seen.Add(m.Value))
            {
                al.Add(m.Value);
            }
        }

        // Ordinal ordering keeps the output independent of the current culture.
        al.Sort(StringComparer.Ordinal);

        return al;
    }

    /// <summary>
    /// Writes the extracted URLs to HyperLinks.xml, one element per URL,
    /// where the element name is the domain suffix returned by GetDomain.
    /// </summary>
    /// <param name="strURL">Address of the page the links were taken from (written into a comment).</param>
    /// <param name="alHyperLinks">URL strings to write.</param>
    static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // using guarantees the file handle is released even if writing throws.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + strURL + "的超链接");

            // Document layout kept as in the original: an outer HyperLinks
            // element wrapping an inner HyperLinks element that carries the timestamp.
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string link in alHyperLinks)
            {
                writer.WriteElementString(GetDomain(link), null, link);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL (com, net, cn, org or gov) when the
    /// suffix is directly followed by a slash; otherwise returns "other".
    /// </summary>
    /// <param name="strURL">URL to classify; null is treated as having no known suffix.</param>
    /// <returns>The suffix without dot and slash, or "other".</returns>
    static string GetDomain(string strURL)
    {
        if (strURL == null)
        {
            return "other";
        }

        Match m = Regex.Match(strURL, @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)", RegexOptions.IgnoreCase);

        // Strip the leading dot and trailing slash from the matched suffix;
        // an unsuccessful match yields the empty string.
        string retVal = Regex.Replace(m.Value, @"\.|/$", "");

        if (retVal == "")
        {
            retVal = "other";
        }

        return retVal;
    }
}


[解决办法]
正则表达式分组捕捉
href="(?<Url>.*?)"\s+[>]
[解决办法]
如果用webBrowser控件,非常简单就能获得。

private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
if (webBrowser1.Document.GetElementsByTagName( "A ") != null)
{
foreach (HtmlElement obj in webBrowser1.Document.GetElementsByTagName( "A "))
{
if (obj.GetAttribute( "href ") != null && obj.GetAttribute( "href ") != -1)
{
}
}
}

......



[解决办法]
" <a*?href=(\ "(? <href> [^\ "]*)\ "| '(? <href> [^ ']*) '|(? <href> *))[^> ]*?> (? <title> *?) </a> "

读书人网 >C#

热点推荐