读书人

发一个解析HTML的代码.目前只能解析table与div

发布时间: 2012-04-16 16:20:04 作者: rapoo

发一个解析HTML的代码.目前只能解析table与div....
代码挺简单的，但是被解析的标签一定要配对出现，否则出现错误我不管。至少用来解析baidu搜索结果是没问题的。
有志于写解析器的同学可以拿去玩玩

C# code
    /// <summary>
    /// Very small HTML helper that recognises only <c>table</c> and <c>div</c> tags.
    /// It builds a tree of <see cref="Element"/> nodes from the positions of those tags
    /// in the input string; it is not a general-purpose HTML parser.
    /// </summary>
    public class SimpleHtmlParser
    {
        // Matches an opening or closing table/div tag. The "close" group is set for
        // closing tags and "name" holds the tag name. The look-ahead after the name
        // keeps longer names that merely start with "div"/"table" from matching.
        private static readonly Regex TagRegex = new Regex(
            @"<(?<close>/)?(?<name>table|div)(?=[\s/>])[^>]*>",
            RegexOptions.IgnoreCase);

        // Matches the whole head section. Singleline lets '.' cross line breaks,
        // which is required because a head section normally spans several lines.
        private static readonly Regex HeadRegex = new Regex(
            @"<head(?=[\s>])[^>]*>.*?</head\s*>",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Matches opening/closing font, span and a tags only; the look-ahead stops
        // "a" from also matching tags such as abbr, article or address.
        private static readonly Regex InlineTagRegex = new Regex(
            @"</?(?:font|span|a)(?=[\s/>])[^>]*>",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Scans <paramref name="s"/> for table and div tags and builds an element tree.
        /// </summary>
        /// <param name="s">HTML text to scan. A null or empty string yields an empty tree.</param>
        /// <param name="elements">
        /// Receives every element whose closing tag was found, in the order the closing
        /// tags appear (inner elements before the elements that contain them).
        /// </param>
        /// <returns>
        /// A synthetic root element whose <see cref="Element.Children"/> are the top-level
        /// table/div elements. The root's own OuterHtml and InnerHtml are never set.
        /// </returns>
        /// <remarks>
        /// Tags are assumed to be properly nested: a closing tag always closes the most
        /// recently opened element, whatever its name. A closing tag with no open element
        /// is ignored. Elements that are never closed stay in the tree but are not added
        /// to <paramref name="elements"/> and keep null OuterHtml/InnerHtml.
        /// </remarks>
        public static Element ParseHtml(string s, out List<Element> elements)
        {
            elements = new List<Element>();
            Element root = new Element();
            if (string.IsNullOrEmpty(s))
            {
                return root;
            }

            Stack<Element> openElements = new Stack<Element>();
            Element current = root;

            foreach (Match match in TagRegex.Matches(s))
            {
                string tag = match.Value;
                bool isClosingTag = match.Groups["close"].Success;

                if (!isClosingTag)
                {
                    // Self-closing tags (e.g. "<div/>", "<div />") have no content
                    // and no closing partner, so they never enter the tree.
                    if (tag.EndsWith("/>", System.StringComparison.Ordinal))
                    {
                        continue;
                    }

                    Element opened = CreateElement(match.Groups["name"].Value);
                    opened.StartTagIndex = match.Index;
                    opened.StartTagLength = match.Length;
                    opened.BegTag = tag;

                    // Attach to the innermost element that is still open.
                    opened.Parent = current;
                    current.Children.Add(opened);

                    openElements.Push(opened);
                    current = opened;
                }
                else
                {
                    // A closing tag with nothing open is malformed input; skip it
                    // instead of letting Stack.Pop throw.
                    if (openElements.Count == 0)
                    {
                        continue;
                    }

                    Element closed = openElements.Pop();
                    closed.EndTag = tag;
                    closed.EndIndex = match.Index;
                    closed.EndTagLength = match.Length;

                    int outerLength = (closed.EndIndex - closed.StartTagIndex) + closed.EndTagLength;
                    closed.OuterHtml = s.Substring(closed.StartTagIndex, outerLength);

                    int innerStart = closed.StartTagIndex + closed.StartTagLength;
                    closed.InnerHtml = s.Substring(innerStart, closed.EndIndex - innerStart);

                    elements.Add(closed);
                    current = closed.Parent;
                }
            }

            return root;
        }

        // Creates the element subtype for a matched tag name ("div" or "table", any case).
        private static Element CreateElement(string tagName)
        {
            if (string.Equals(tagName, "div", System.StringComparison.OrdinalIgnoreCase))
            {
                return new DivElement();
            }

            return new TableElement();
        }

        /// <summary>
        /// Removes the head section and all font, span and a tags from the markup.
        /// For font/span/a only the tags are removed; the text between them is kept.
        /// </summary>
        /// <param name="s">HTML text to clean. Null or empty input is returned unchanged.</param>
        /// <returns>The cleaned markup.</returns>
        public static string ReplaceFontSpan(string s)
        {
            if (string.IsNullOrEmpty(s))
            {
                return s;
            }

            s = HeadRegex.Replace(s, "");
            s = InlineTagRegex.Replace(s, "");
            return s;
        }

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its body as text,
        /// decoded with the system default encoding.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The page source, or an empty string if anything goes wrong.</returns>
        public static string DownLoadHtml(string url)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "get";

                // Dispose the response, stream and reader so the connection is released.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.Default))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (System.Exception)
            {
                // Deliberately best-effort: callers treat "" as "download failed".
                return "";
            }
        }
    }

    /// <summary>
    /// A node in the tree produced by <see cref="SimpleHtmlParser.ParseHtml"/>.
    /// All index and length values are character offsets into the parsed string.
    /// </summary>
    public class Element : StringElement
    {
        /// <summary>Offset of the first character of the opening tag.</summary>
        public int StartTagIndex { get; set; }

        /// <summary>Length of the opening tag text.</summary>
        public int StartTagLength { get; set; }

        /// <summary>Offset of the first character of the closing tag.</summary>
        public int EndIndex { get; set; }

        /// <summary>Length of the closing tag text.</summary>
        public int EndTagLength { get; set; }

        /// <summary>Text of the opening tag as it appears in the input.</summary>
        public string BegTag { get; set; }

        /// <summary>Text of the closing tag as it appears in the input.</summary>
        public string EndTag { get; set; }

        /// <summary>Elements opened while this element was the innermost open one.</summary>
        public List<Element> Children = new List<Element>();

        /// <summary>The element this one was opened inside (the synthetic root for top-level elements).</summary>
        public Element Parent { get; set; }
    }

    /// <summary>Element created for a div tag.</summary>
    public class DivElement : Element
    {
    }

    /// <summary>Element created for a table tag.</summary>
    public class TableElement : Element
    {
    }

    /// <summary>Element type for tr tags; the parser in this file does not create it.</summary>
    public class TrElement : Element
    {
    }

    /// <summary>Holds the markup text associated with an element.</summary>
    public class StringElement
    {
        /// <summary>Markup from the start of the opening tag to the end of the closing tag.</summary>
        public string OuterHtml { get; set; }

        /// <summary>Markup between the opening tag and the closing tag.</summary>
        public string InnerHtml { get; set; }
    }



调用代码
C# code
 /// <summary>
 /// Demo entry point: downloads a search-result page, strips the tags that are not
 /// needed, parses the remaining table/div structure and shows the results.
 /// </summary>
 private void Form1_Load(object sender, EventArgs e)
 {
     var url = "http://www.baidu.com/s?wd=惠阳妇科病医院&rsv_bp=0&rsv_spt=3&inputT=21000";
     var html = SimpleHtmlParser.DownLoadHtml(url);

     // Remove the tags that are of no interest before parsing.
     html = SimpleHtmlParser.ReplaceFontSpan(html);

     List<Element> closedElements;
     var root = SimpleHtmlParser.ParseHtml(html, out closedElements);

     // Show every parsed element that has no child elements.
     foreach (var leaf in closedElements)
     {
         if (leaf.Children.Count <= 0)
         {
             MessageBox.Show(leaf.OuterHtml);
         }
     }

     // Walk the whole element tree.
     List(root);
 }

 /// <summary>
 /// Shows the OuterHtml of every element below <paramref name="e"/> and then of
 /// <paramref name="e"/> itself, children first.
 /// </summary>
 /// <param name="e">Element whose subtree is shown.</param>
 public void List(Element e)
 {
     foreach (var child in e.Children)
     {
         List(child);
     }

     // Elements without a closing tag, and the root returned by ParseHtml, have no
     // OuterHtml; skip them rather than showing a blank message box.
     if (e.OuterHtml != null)
     {
         MessageBox.Show(e.OuterHtml);
     }
 }


[解决办法]
支持分享!我用HtmlAgilityPack。
[解决办法]
HtmlAgilityPack

+1
[解决办法]

读书人网 >C#

热点推荐