从网站中“扒”资讯

从网站中“扒”新闻

元旦放假三天，天气很冷没有打算出去玩，就在家里琢磨着弄一下扒网站新闻，主要是同寝室的一个同事在弄，所以想学点东西，自己也动手写了一个，思路很简单，下面就描述一下是怎么实现的吧！

首先进入网站主页，然后选择自己想“扒”的信息模块，例如新闻、经济、娱乐等等，这样就能找到自己需要的信息；接着把这个模块中的URL链接地址读取出来，再遍历这些URL地址，读取每条信息的内容。

现在的网站一般都是动态生成的，也就是说新闻信息页面有自己的模板，所有的信息肯定都放在某个DIV或者其他容器中，只要找到这个容器的ID，就能够把里面的数据提取出来。

下面的代码是我针对某网站测试过的，已经能读取到信息列表，先贴上来供大家参考。为了防止某些人恶意攻击，我删除了具体的链接地址。

package hb.downweb;import java.io.BufferedReader;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStreamReader;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import java.util.LinkedList;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;/* * 从网上扒新闻信息 */public class Main {//显示新闻列表的地址private static final String http_url = "网站导航列表网址";//找到需要扒的信息模块的IDprivate static final String summaryBlock = "id=\"blist\"";//显示的信息以什么HTML标签结束private static final String endSummaryBlock = "&lt;/table&gt;";//存储网页中的链接标签public static List&lt;String&gt; list = new LinkedList&lt;String&gt;();public static void main(String[] args) {//想要抓取信息的页面StringBuffer stringBuffer = new StringBuffer();try {//通过字符串得到URL对象URL url = new URL(http_url);//远程连接，得到URLConnection对象(它代表应用程序和 URL 之间的通信链接)URLConnection conn = url.openConnection();int find_flag = 0;//表示没有找到需要的内容//从连接中读取数据流，BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));String line;while((line = reader.readLine()) != null){//找到了需要下载链接模块if(line.indexOf(summaryBlock)!= -1){find_flag = 1;//表示找到了需要的内容}//需要新闻模块的结束标记if(line.indexOf(endSummaryBlock) != -1){find_flag = 2;//表示需要找的内容结束了}//将找到的信息放入stringBuffer中if(1 == find_flag){stringBuffer.append(line);}//需要找的信息已经结束if(2 == find_flag){System.out.println("over");find_flag = 0;}}System.out.println(stringBuffer);//使用正则表达式获取想要的字符串Pattern pattern = Pattern.compile("[0-9]{5}\\.htm");Matcher matcher = pattern.matcher(stringBuffer);System.out.println(matcher.find());while(matcher.find()) {//将连接的地址存储到list容器中list.add("显示网页内容的网址" + matcher.group());//下面显示匹配的内容//System.out.println(matcher.group());}//读取具体链接信息内容readNews(list);} catch (MalformedURLException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();} }/* * 读显示新闻的网页 */public static void readNews(List&lt;String&gt; list){String flagName = "news";for(int i = 0; i &lt; list.size(); 
i++){//得到的是每篇文章的链接地址 具体网页的地址String temp = list.get(i);String filename = "";filename = flagName + i+".txt";//将下载的网页信息保存到文件中getNewsContent(temp,filename);}}/* * 将显示新闻的网页的内容存放在本地文件中 */public static void getNewsContent(String httpLink,String fileName){try {System.out.println("getNewsContent : " + httpLink);//通过URL产生链接到具体的网页，然后读取数据URL url = new URL(httpLink);URLConnection conn = url.openConnection();//这里读取的网页内容一定要注意后面的编码，跟网页的报纸一致，否则在后面存储在文件中的也为乱码BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(),"UTF-8"));String tempStr;//根据显示具体网页个格式，找到对应的模块，然后读取出来存储在文件中File file = new File(fileName);FileOutputStream fos = new FileOutputStream(file);String class_name = "class=\"content2";String end_content = "&lt;/div&gt;";int readContentFlag = 0;StringBuffer strbuf = new StringBuffer();while((tempStr = reader.readLine()) != null){if(tempStr.indexOf(class_name)!= -1){readContentFlag = 1;}if(tempStr.indexOf(end_content)!= -1){readContentFlag = 2;}if(1 == readContentFlag){strbuf.append(tempStr);//System.out.println(line);}if(2 == readContentFlag){System.out.println("over");readContentFlag = 0;}tempStr = strbuf.toString();System.out.println("tempStr.indexOf(class_name)2: "+ tempStr.indexOf(class_name));tempStr = delHTMLTag(tempStr);tempStr = stripHtml(tempStr);fos.write(tempStr.getBytes("utf-8"));}//一定不要忘记了关闭数据流，否则出现异常情况fos.close();} catch (MalformedURLException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}public static String delHTMLTag(String htmlStr){         String regEx_script="&lt;script[^&gt;]*?&gt;[\\s\\S]*?&lt;\\/script&gt;"; //定义script的正则表达式         String regEx_style="&lt;style[^&gt;]*?&gt;[\\s\\S]*?&lt;\\/style&gt;"; //定义style的正则表达式         String regEx_html="&lt;[^&gt;]+&gt;"; //定义HTML标签的正则表达式                  Pattern p_script=Pattern.compile(regEx_script,Pattern.CASE_INSENSITIVE);         Matcher m_script=p_script.matcher(htmlStr);         
htmlStr=m_script.replaceAll(""); //过滤script标签                  Pattern p_style=Pattern.compile(regEx_style,Pattern.CASE_INSENSITIVE);         Matcher m_style=p_style.matcher(htmlStr);         htmlStr=m_style.replaceAll(""); //过滤style标签                  Pattern p_html=Pattern.compile(regEx_html,Pattern.CASE_INSENSITIVE);         Matcher m_html=p_html.matcher(htmlStr);         htmlStr=m_html.replaceAll(""); //过滤html标签         return htmlStr.trim(); //返回文本字符串     } public static String stripHtml(String content) { // &lt;p&gt;段落替换为换行 content = content.replaceAll("&lt;p .*?&gt;", "\r\n"); // &lt;br&gt;&lt;br/&gt;替换为换行 content = content.replaceAll("&lt;br\\s*/?&gt;", "\r\n"); // 去掉其它的&lt;&gt;之间的东西 content = content.replaceAll("\\&lt;.*?&gt;", ""); // 还原HTML // content = HTMLDecoder.decode(content); content = content.replaceAll("&nbsp;", "");return content; } }

备注：上面的程序是同步（单线程）运行的，为了提高用户体验，可以改为用多线程处理，这样体验会好很多；为了保持示例简单易懂，这里就不再赘述了。

我这种方法不是通用的，但是可以作为参考，希望大家多提意见！

从网站中“扒”资讯

热点推荐