请教用正则表达式取HTML中的字段
- HTML code
<tr> <td>
<img border='0' src='/_layouts/images/reportsup.gif' alt='Reports to' />
</td> <td>User1 </td>
</tr>
<tr> <td>
<img src="/_layouts/images/reportsdown.gif" alt="Manager">
</td> <td>User2 </td>
</tr>
<tr> <td>
<table> <tr> <td>
<span/>
</td> </tr> </table>
</td> <td>User3 </td>
</tr>
是否可以用正则表达式从上面这段HTML代码中取得下面的数组?只要取出图片的地址就可以。这个<img/>可能不存在,但是user名字肯定存在。可以用正则表达式一次取出来么?
arr[0][0]="/_layouts/images/reportsup.gif" //地址
arr[0][1]="User1"
arr[1][0]="/_layouts/images/reportsdown.gif"
arr[1][1]="User2"
arr[2][0]=""
arr[2][1]="User3"
多谢
[解决办法]
- C# code
// Extract the image paths (.jpg / .gif) from every <img> tag.
// Group 1 captures the src value; the surrounding quote is optional.
Regex r = new Regex(@"<img\b[^>]*?\bsrc\s*=\s*[""']?([^""'>\s]+\.(?:jpg|gif))", RegexOptions.IgnoreCase);
MatchCollection mc = r.Matches("这里填写你要找的HTML代码 赋值给一个字符串");
for (int i = 0; i < mc.Count; i++)
{
    // mc[i].Value is the whole matched text ("<img ... src='...gif");
    // the image path itself is capture group 1.
    string path = mc[i].Groups[1].Value;
}
[解决办法]
- C# code
// Capture the src value of every tag written as  src='...' alt  or  src="..." alt.
// The lazy quantifier plus the back-reference to the opening quote stops the capture
// at the matching closing quote instead of the last quote on the line.
Regex xxx = new Regex("src=(?<q>'|\")(?<text>.*?)\\k<q>( alt)");
MatchCollection matches = xxx.Matches(str); // str is the HTML document

// Size the result from the number of matches so any number of images fits.
string[,] arr = new string[matches.Count, 2];
for (int i = 0; i < matches.Count; i++)
{
    arr[i, 0] = matches[i].Groups["text"].Value;
    // NOTE: the name is synthesized from the 1-based match index ("User1", "User2", ...);
    // it is not read from the HTML, and rows without an <img> are not counted.
    arr[i, 1] = "User" + (i + 1).ToString();
}

// The following is for inspection only: dump the pairs, one value per line.
System.Text.StringBuilder x = new System.Text.StringBuilder();
for (int ix = 0; ix < matches.Count; ix++)
{
    x.Append(arr[ix, 0]).Append("\r\n");
    x.Append(arr[ix, 1]).Append("\r\n");
}
MessageBox.Show(x.ToString());
[解决办法]
- C# code
/// <summary>
/// Returns the lower-cased value of the named attribute found in the given tag markup.
/// The attribute value must be enclosed in double or single quotes.
/// </summary>
/// <param name="tag">Markup of the tag (or group of tags) to search.</param>
/// <param name="tagName">Name of the attribute to look up; compared case-insensitively.</param>
/// <returns>
/// The value of the first matching attribute, lower-cased and without its quotes,
/// or <c>null</c> when either argument is null/empty or no such attribute is found.
/// </returns>
private string TagVal(string tag, string tagName)
{
    if (string.IsNullOrEmpty(tag) || string.IsNullOrEmpty(tagName)) {
        return null;
    }

    // name = "value"  or  name = 'value'. Both alternatives capture into the same
    // "Value" group, with the quotes left outside the capture.
    // Group syntax must be written "(?<Name>" with no space after the '?',
    // otherwise the Regex constructor throws.
    Regex rt = new Regex(
        @"(?<Keyword>\w+)\s*=\s*(?:""(?<Value>[^""]*)""|'(?<Value>[^']*)')",
        RegexOptions.IgnoreCase);

    foreach (Match mt in rt.Matches(tag)) {
        if (string.Equals(mt.Groups["Keyword"].Value, tagName, System.StringComparison.OrdinalIgnoreCase)) {
            return mt.Groups["Value"].Value.ToLowerInvariant();
        }
    }
    return null;
}
/// <summary>
/// Removes all HTML tags from the input, e.g. <c>&lt;a&gt;aabcd &lt;/a&gt;</c> yields <c>aabcd </c>.
/// Text between tags, including its whitespace, is left untouched.
/// </summary>
/// <param name="strHtml">The HTML fragment to strip; must not be null.</param>
/// <returns>The input with every <c>&lt;...&gt;</c> sequence removed.</returns>
private string DropHTML(string strHtml)
{
    // The pattern must start directly with '<': a leading space would restrict the
    // replacement to tags that happen to be preceded by a space.
    return Regex.Replace(strHtml, "<[^>]*>", "");
}
下面的代码可以对你的内容进行匹配,不过对于第三行那种有嵌套td的情况则不能正确匹配。
// Split the document into <tr> rows, then each row into <td> cells.
// Both patterns are lazy, so a row or cell ends at the FIRST closing tag; rows that
// contain a nested table are therefore cut short and will not be read correctly.
Regex r = new Regex(@"<tr[^>]*>[\s\S]*?</tr>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
Regex td = new Regex(@"<td[^>]*>[\s\S]*?</td>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
foreach (Match m in r.Matches(str)) {
    MatchCollection cells = td.Matches(m.Value);
    if (cells.Count < 2) {
        // Not an "image cell + name cell" row; indexing [1] would throw.
        continue;
    }
    // src attribute of the first cell (TagVal returns null when the cell has none).
    string src = TagVal(cells[0].Value, "src");
    // Text of the second cell with its tags stripped.
    string secondCellText = DropHTML(cells[1].Value);
}
[解决办法]
刚我写的那个数组长度有限(其实你最好用ArrayList)而且不能显示空段如User3,我改了一下
- C# code
// One match per "image cell + name cell" pair:
//   - the <img ... src='...'> part is optional, so rows without an image still match
//     and yield an empty "text" group;
//   - "name" is the trimmed text of the cell that directly follows the closing </td>.
// Whitespace and line breaks between the tags are tolerated.
// Limitation: two adjacent plain cells inside a nested table would also match.
Regex xxx = new Regex(
    @"(?:<img\b[^>]*?\bsrc\s*=\s*(?<q>['""])(?<text>.*?)\k<q>[^>]*>\s*)?</td>\s*<td>\s*(?<name>[^<]*?)\s*</td>",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);
MatchCollection matches = xxx.Matches(str); // str is the HTML document

// Size the result from the number of matches instead of a fixed 100-row cap.
string[,] arr = new string[matches.Count, 2];
for (int i = 0; i < matches.Count; i++)
{
    arr[i, 0] = matches[i].Groups["text"].Value; // "" when the row has no <img>
    arr[i, 1] = matches[i].Groups["name"].Value;
}

// For inspection only: dump the pairs, one value per line.
System.Text.StringBuilder x = new System.Text.StringBuilder();
for (int ix = 0; ix < matches.Count; ix++)
{
    x.Append(arr[ix, 0]).Append("\r\n");
    x.Append(arr[ix, 1]).Append("\r\n");
}
MessageBox.Show(x.ToString());
[解决办法]
- C# code
// Extract each row's image path (if any) together with its user name.
// The <img> part of the pattern is optional: a row without an image still matches,
// and its "src" group is then the empty string.
// Limitation: two adjacent plain cells inside a nested table would also match.
String patterns = @"(?:<img\b[^>]*?\bsrc\s*=\s*(?<q>['""])(?<src>.*?)\k<q>[^>]*>\s*)?</td>\s*<td>\s*(?<name>[^<]*?)\s*</td>";
MatchCollection cols = Regex.Matches(
    @"<tr><td> <img border='0' src='/_layouts/images/reportsup.gif' alt='Reports to' /></td><td>User1</td></tr><tr><td> <img src=""/_layouts/images/reportsdown.gif"" alt=""Manager""></td><td>User2</td></tr><tr><td><table><tr><td> <span/></td></tr></table></td><td>User3</td></tr>",
    patterns,
    System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.Singleline);

System.Text.StringBuilder builder = new System.Text.StringBuilder();
foreach (Match mch in cols)
{
    String imgSrc = mch.Groups["src"].Value;   // "" when the row has no <img>
    String name = mch.Groups["name"].Value;
    builder.AppendFormat("{0}:\t{1}", name, imgSrc);
    builder.Append(Environment.NewLine);
}
System.Diagnostics.Debug.WriteLine(builder.ToString());