读书人

jsoup解析html/依据关键词拿到论坛帖子

发布时间: 2013-04-21 15:31:38 作者: rapoo

jsoup解析html/根据关键词拿到论坛帖子信息
import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class KeyWordsSearchUtil {/** * 根据关键词查询论坛所需信息map * @param KeyWord 传入关键词 * @return */public static List<Map<String, Object>> findByKeyWord(String KeyWord) {List<Map<String, Object>>postsList=new ArrayList<Map<String,Object>>();Map<String, Object>postsOneMap=null;try {Document doc = Jsoup.connect("http://club.pchome.net/forum_1_15____md__1_"+java.net.URLEncoder.encode(KeyWord,"utf-8")+".html") .data("query", "Java") .userAgent("Mozilla") .cookie("auth", "token") .timeout(10000) .ignoreHttpErrors(true) .post();Elements postsLs=doc.select("li.i2").not(".h-bg");if (postsLs!=null&&postsLs.size()>0) {for (Element childPost : postsLs) {postsOneMap=new HashMap<String, Object>();postsOneMap.put("postsPopularity", childPost.select("li > span.n2").first().text());postsOneMap.put("postsTitle", childPost.select("span.n3 > a").attr("title"));postsOneMap.put("postsFloor", childPost.select("span.n4").first().text());postsOneMap.put("postsCname", childPost.select("a.bind_hover_card").first().text());postsOneMap.put("postsCtime", childPost.select("li > span.n6").first().text());postsOneMap.put("postsUrl", "http://club.pchome.net"+childPost.select("span.n3 a").attr("href"));postsOneMap.put("postsContents", getContentsByUrl("http://club.pchome.net"+childPost.select("span.n3 a").attr("href")));postsList.add(postsOneMap);}}} catch (Exception e) {e.printStackTrace();}return postsList;}/** * 根据帖子的url获取帖子的文本内容 * @param url 帖子的路径 * @return */public static String getContentsByUrl(String url) {String contents="11";try {Document doc = Jsoup.connect(url) .data("query", "Java") .userAgent("Mozilla") .cookie("auth", "token") .timeout(10000) .ignoreHttpErrors(true) .post();if(doc.select("div.mc").first()!=null){Element contentsEle=doc.select("div.mc 
div").first();contents=contentsEle.select("div").first().text();if (contents.contains("[向左转]??[向右转]??[原图]")) {contents=contents.replace("[向左转]??[向右转]??[原图]", "");}}} catch (Exception e) {e.printStackTrace();}return contents;}public static void main(String[] args) throws Exception {List<Map<String, Object>>postsList=KeyWordsSearchUtil.findByKeyWord("电影");System.out.println("http://club.pchome.net/forum_1_15____md__1_"+java.net.URLEncoder.encode("电影","utf-8")+".html");System.out.println(postsList.size()+"/////");for (int i = 0; i < postsList.size(); i++) {for(Map.Entry<String, Object>entry:postsList.get(i).entrySet()){System.out.println("key="+entry.getKey()+"| value="+entry.getValue());}System.out.println("-----------------");}//http://club.pchome.net/thread_1_15_7519679.html//String str=getContentsByUrl("http://club.pchome.net/thread_1_15_7519679.html");//System.out.println(str);}}

??????????????????????

以上代码能成功抓取宽带山论坛中关键词为“电影”的相关帖子列表，main 方法中已有测试，网络畅通时可测试通过。但上面的代码仅为完成功能，性能较差，在项目中使用时需重写或优化。

1 楼 muscle1990 10 小时前 LZ威武霸气啊!!!学习了

读书人网 >CSS

热点推荐