读书人

使用java将网页保存为mht格式(1)

发布时间: 2009-01-05 12:18:04 作者: liuhuituzi

package com.tag;
  import java.io.BufferedInputStream;
  import java.io.BufferedOutputStream;
  import java.io.BufferedReader;
  import java.io.ByteArrayInputStream;
  import java.io.DataOutputStream;
  import java.io.File;
  import java.io.FileInputStream;
  import java.io.FileOutputStream;
  import java.io.FileWriter;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.InputStreamReader;
  import java.io.OutputStream;
  import java.io.Reader;
  import java.net.MalformedURLException;
  import java.net.URL;
  import java.util.*;
  import org.htmlparser.Parser;
  import org.htmlparser.Tag;
  import org.htmlparser.filters.TagNameFilter;
  import org.htmlparser.lexer.Lexer;
  import org.htmlparser.lexer.Page;
  import org.htmlparser.util.DefaultParserFeedback;
  import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;
  import toptrack.tools.JQuery;
  import javax.activation.DataHandler;
  import javax.activation.DataSource;
  import javax.activation.MimetypesFileTypeMap;
  import javax.mail.Message;
  import javax.mail.MessagingException;
  import javax.mail.Multipart;
  import javax.mail.Session;
  import javax.mail.internet.InternetAddress;
  import javax.mail.internet.MimeBodyPart;
  import javax.mail.internet.MimeMessage;
  import javax.mail.internet.MimeMultipart;
  import javax.mail.internet.MimePartDataSource;
  /**
  * mht文件解析类
  * @author examda
  */
  public class Html2MHTCompiler {
  private URL strWeb = null; // URL of the web page; also the base for resolving relative links
  private String strText = null; // HTML text of the web page
  private String strFileName = null; // local file name
  private String strEncoding = null; // character encoding of the web page
  // Additional message-header information for the MHT format
  private String from = "dongle2001@126.com"; // used as the From address of the MIME message
  private String to; // TO recipients; only applied when non-null
  private String subject = "mht compile"; // subject of the MIME message
  private String cc; // CC recipients; only applied when non-null
  private String bcc; // BCC recipients; only applied when non-null
  private String smtp = "localhost"; // value stored under the "mail.smtp.host" session property
  /**
   * Demo entry point: obtains the HTML of a fixed page through {@code JQuery.getHtmlText}
   * and compiles it into {@code test.mht}.
   *
   * <p>Nothing is written when {@code JQuery.getHtmlText} returns {@code null}.
   *
   * @param args command-line arguments (unused)
   */
  public static void main(String[] args) {
    final String pageUrl = "http://www.mtime.com/my/tropicofcancer/blog/843555/";
    final String pageEncoding = "utf-8";
    String html = JQuery.getHtmlText(pageUrl, pageEncoding, null);
    if (html != null) {
      Html2MHTCompiler compiler = new Html2MHTCompiler(html, pageUrl, pageEncoding, "test.mht");
      compiler.compile();
    }
    //Html2MHTCompiler.mht2html("test.mht", "a.html");
  }
  /**
   * Initializes the compiler.
   *
   * <p>If {@code strUrl} is not a well-formed URL, the stack trace is printed and every
   * field is left unset, which makes {@link #compile()} return {@code false}.
   *
   * @param strText HTML text of the web page
   * @param strUrl address of the web page
   * @param strEncoding character encoding of the web page
   * @param strFileName local file name
   */
  public Html2MHTCompiler(String strText, String strUrl, String strEncoding, String strFileName) {
    URL pageUrl;
    try {
      pageUrl = new URL(strUrl);
    } catch (MalformedURLException e) {
      e.printStackTrace();
      // Leave the instance unconfigured; compile() checks for this.
      return;
    }
    this.strWeb = pageUrl;
    this.strText = strText;
    this.strEncoding = strEncoding;
    this.strFileName = strFileName;
  }
  /**
   * Runs the conversion: parses the page, collects its script, stylesheet and image links,
   * substitutes the links in the page text and hands everything to
   * {@code createMhtArchive}.
   *
   * <p>A parse failure is only logged; the conversion then continues with an empty node list.
   *
   * @return {@code true} if the archive step finished without an exception; {@code false}
   *     if the instance is not fully initialized or the archive step threw
   */
  public boolean compile() {
    boolean initialized =
        strWeb != null && strText != null && strFileName != null && strEncoding != null;
    if (!initialized) {
      return false;
    }

    // Maps absolute URL -> link as written in the page.
    HashMap urlMap = new HashMap();
    NodeList nodes = new NodeList();
    try {
      Parser parser = createParser(strText);
      parser.setEncoding(strEncoding);
      nodes = parser.parse(null);
    } catch (ParserException e) {
      e.printStackTrace();
    }

    // Must run first: a BASE tag may replace strWeb, which the link resolution below uses.
    extractAllScriptNodes(nodes);
    ArrayList urlScriptList = extractAllScriptNodes(nodes, urlMap);
    ArrayList urlImageList = extractAllImageNodes(nodes, urlMap);

    // Hand each (text, original link, absolute link) triple to JHtmlClear.replace
    // (presumably a substring replacement; confirm against JHtmlClear).
    Iterator entries = urlMap.entrySet().iterator();
    while (entries.hasNext()) {
      Map.Entry entry = (Map.Entry) entries.next();
      String absoluteLink = (String) entry.getKey();
      String originalLink = (String) entry.getValue();
      strText = JHtmlClear.replace(strText, originalLink, absoluteLink);
    }

    boolean archived;
    try {
      createMhtArchive(strText, urlScriptList, urlImageList);
      archived = true;
    } catch (Exception e) {
      e.printStackTrace();
      archived = false;
    }
    return archived;
  }
  /**
   * Creates an HTML parser over the given page text.
   *
   * @param inputHTML HTML text of the web page
   * @return a parser reading {@code inputHTML}, configured with quiet feedback
   */
  private Parser createParser(String inputHTML) {
    Page page = new Page(inputHTML);
    Lexer lexer = new Lexer(page);
    DefaultParserFeedback quietFeedback = new DefaultParserFeedback(DefaultParserFeedback.QUIET);
    return new Parser(lexer, quietFeedback);
  }
  /**
   * Extracts the base URL of the page.
   *
   * <p>Despite its name, this overload only looks at {@code BASE} tags: when the first one
   * carries a non-empty, well-formed {@code href}, that value replaces {@code strWeb}.
   * A malformed {@code href} is logged and {@code strWeb} is left unchanged.
   *
   * @param nodes tags of the web page
   */
  private void extractAllScriptNodes(NodeList nodes) {
    NodeList baseTags = nodes.extractAllNodesThatMatch(new TagNameFilter(
        "BASE"), true);
    if (baseTags == null || baseTags.size() <= 0) {
      return;
    }

    Tag firstBase = (Tag) baseTags.elementAt(0);
    String href = firstBase.getAttribute("href");
    if (href == null || href.length() <= 0) {
      return;
    }

    try {
      strWeb = new URL(href);
    } catch (MalformedURLException e) {
      e.printStackTrace();
    }
  }
  /**
   * Collects the external script and stylesheet links of the page.
   *
   * <p>Every {@code script} tag with a non-empty {@code src} and every {@code link} tag
   * recognized as a stylesheet with a non-empty {@code href} has its link resolved against
   * {@code strWeb}; the tag attribute is then set to the resolved value.
   *
   * @param nodes tags of the web page
   * @param urlMap absolute URLs seen so far, mapped to the link as written in the page;
   *     newly found links are added to it
   * @return one two-element list per newly found link: the original link followed by its
   *     absolute form
   */
  private ArrayList extractAllScriptNodes(NodeList nodes, HashMap urlMap) {
    ArrayList urlList = new ArrayList();

    // External scripts: <script src="...">
    NodeList scriptTags = nodes.extractAllNodesThatMatch(new TagNameFilter("script"), true);
    for (int i = 0; i < scriptTags.size(); i++) {
      Tag scriptTag = (Tag) scriptTags.elementAt(i);
      String src = scriptTag.getAttribute("src");
      if (src == null || src.length() == 0) {
        continue;
      }
      scriptTag.setAttribute("src", recordScriptOrCssUrl(src, urlMap, urlList));
    }

    // External stylesheets: <link rel="stylesheet" href="..."> or <link type="text/css" ...>
    NodeList linkTags = nodes.extractAllNodesThatMatch(new TagNameFilter("link"), true);
    for (int i = 0; i < linkTags.size(); i++) {
      Tag linkTag = (Tag) linkTags.elementAt(i);
      String rel = linkTag.getAttribute("rel");
      String type = linkTag.getAttribute("type");
      String href = linkTag.getAttribute("href");

      // The type attribute is only consulted when rel is absent.
      boolean isCssFile;
      if (rel != null) {
        isCssFile = rel.indexOf("stylesheet") >= 0;
      } else {
        isCssFile = type != null && type.indexOf("text/css") >= 0;
      }
      if (!isCssFile || href == null || href.length() == 0) {
        continue;
      }
      linkTag.setAttribute("href", recordScriptOrCssUrl(href, urlMap, urlList));
    }
    return urlList;
  }

  /**
   * Resolves one link and, if its absolute form is new, records it in both collections.
   *
   * @param innerURL link as written in the page
   * @param urlMap absolute URL to original link map, updated in place
   * @param urlList list receiving an {@code [original, absolute]} pair for a new link
   * @return the value returned by {@code makeAbsoluteURL}, which may be {@code null}
   */
  private String recordScriptOrCssUrl(String innerURL, HashMap urlMap, ArrayList urlList) {
    String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
    if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {
      urlMap.put(absoluteURL, innerURL);
      ArrayList urlInfo = new ArrayList();
      urlInfo.add(innerURL);
      urlInfo.add(absoluteURL);
      urlList.add(urlInfo);
    }
    return absoluteURL;
  }
  /**
   * Collects the image links of the page.
   *
   * <p>Every {@code IMG} tag with a non-empty {@code src} has that link resolved against
   * {@code strWeb}; the tag attribute is then set to the resolved value.
   *
   * @param nodes tags of the web page
   * @param urlMap absolute URLs seen so far, mapped to the link as written in the page;
   *     newly found links are added to it
   * @return one two-element list per newly found image: the original link followed by its
   *     absolute form
   */
  private ArrayList extractAllImageNodes(NodeList nodes, HashMap urlMap) {
    ArrayList imageUrls = new ArrayList();
    NodeList imageTags = nodes.extractAllNodesThatMatch(new TagNameFilter("IMG"), true);
    int count = imageTags.size();
    for (int index = 0; index < count; index++) {
      Tag imageTag = (Tag) imageTags.elementAt(index);
      String src = imageTag.getAttribute("src");
      if (src == null || src.length() == 0) {
        continue;
      }

      String absoluteURL = makeAbsoluteURL(strWeb, src);
      boolean isNew = absoluteURL != null && !urlMap.containsKey(absoluteURL);
      if (isNew) {
        urlMap.put(absoluteURL, src);
        ArrayList pair = new ArrayList();
        pair.add(src);
        pair.add(absoluteURL);
        imageUrls.add(pair);
      }
      imageTag.setAttribute("src", absoluteURL);
    }
    return imageUrls;
  }
  /**
   * Converts a link found in the page into an absolute URL.
   *
   * <p>The query string (everything from the first {@code ?}) is dropped first. A link that
   * already starts with {@code http://} or {@code https://} (case-insensitive) is returned
   * unchanged; any other link is resolved against {@code strWeb}. The resulting URL is
   * printed to standard output.
   *
   * @param strWeb address of the web page, used as the base for relative links
   * @param innerURL link as written in the page; may be relative
   * @return the absolute URL, or {@code null} when {@code innerURL} is {@code null} or
   *     cannot be resolved into a well-formed URL
   */
  public static String makeAbsoluteURL(URL strWeb, String innerURL) {
    // Check for null before touching the string; previously this threw NullPointerException.
    if (innerURL == null) {
      return null;
    }

    // Strip the query string.
    int pos = innerURL.indexOf('?');
    if (pos != -1) {
      innerURL = innerURL.substring(0, pos);
    }

    // Require a full scheme prefix, so a relative path that merely begins with "http"
    // (e.g. "httpd/logo.png") is still resolved against the base URL.
    boolean alreadyAbsolute =
        innerURL.regionMatches(true, 0, "http://", 0, 7)
            || innerURL.regionMatches(true, 0, "https://", 0, 8);
    if (alreadyAbsolute) {
      System.out.println(innerURL);
      return innerURL;
    }

    URL linkUri;
    try {
      linkUri = new URL(strWeb, innerURL);
    } catch (MalformedURLException e) {
      e.printStackTrace();
      return null;
    }

    // Remove leftover "../" and "./" sequences from the resolved URL.
    // NOTE(review): JHtmlClear.replace is presumably a plain substring replacement - confirm.
    String absURL = linkUri.toString();
    absURL = JHtmlClear.replace(absURL, "../", "");
    absURL = JHtmlClear.replace(absURL, "./", "");
    System.out.println(absURL);
    return absURL;
  }
  /**
  *方法说明:创建mht文件
  *输入参数:content 网页文本内容; urlScriptList 脚本链接集合; urlImageList 图片链接集合
  *返回类型:
  */
  private void createMhtArchive(String content, ArrayList urlScriptList, ArrayList urlImageList) throws Exception {
  //Instantiate a Multipart object
  MimeMultipart mp = new MimeMultipart("related");
  Properties props = new Properties();
  props.put("mail.smtp.host", smtp);
  Session session = Session.getDefaultInstance(props, null);
  MimeMessage msg = new MimeMessage(session);
  // set mailer
  msg.setHeader("X-Mailer", "Code Manager .SWT");
  // set from
  if (from != null) {
  msg.setFrom(new InternetAddress(from));
  }
  // set subject
  if (subject != null) {
  msg.setSubject(subject);
  }
  // to
  if (to != null) {
  InternetAddress[] toAddresses = getInetAddresses(to);
  msg.setRecipients(Message.RecipientType.TO, toAddresses);
  }
  // cc
  if (cc != null) {
  InternetAddress[] ccAddresses = getInetAddresses(cc);
  msg.setRecipients(Message.RecipientType.CC, ccAddresses);
  }
  // bcc
  if (bcc != null) {
  InternetAddress[] bccAddresses = getInetAddresses(bcc);
  msg.setRecipients(Message.RecipientType.BCC, bccAddresses);
  }

3COME考试频道为您精心整理,希望对您有所帮助,更多信息在http://www.reader8.net/exam/

读书人网 >复习指导

热点推荐