读书人

使用java将网页保存为mht格式(2)

发布时间: 2009-01-05 12:38:24 作者: liuhuituzi

//设置网页正文
  MimeBodyPart bp = new MimeBodyPart();
  bp.setText(content, strEncoding);
  bp.addHeader("Content-Type", "text/html;charset=" + strEncoding);
  bp.addHeader("Content-Location", strWeb.toString());
  mp.addBodyPart(bp);
  int urlCount = urlScriptList.size();
  for (int i = 0; i < urlCount; i++) {
  bp = new MimeBodyPart();
  ArrayList urlInfo = (ArrayList) urlScriptList.get(i);
  // String url = urlInfo.get(0).toString();
  String absoluteURL = urlInfo.get(1).toString();
  bp
  .addHeader("Content-Location",
  javax.mail.internet.MimeUtility
  .encodeWord(java.net.URLDecoder
  .decode(absoluteURL, strEncoding)));
  DataSource source = new AttachmentDataSource(absoluteURL, "text");
  bp.setDataHandler(new DataHandler(source));
  mp.addBodyPart(bp);
  }
  urlCount = urlImageList.size();
  for (int i = 0; i < urlCount; i++) {
  bp = new MimeBodyPart();
  ArrayList urlInfo = (ArrayList) urlImageList.get(i);
  // String url = urlInfo.get(0).toString();
  String absoluteURL = urlInfo.get(1).toString();
  bp
  .addHeader("Content-Location",
  javax.mail.internet.MimeUtility
  .encodeWord(java.net.URLDecoder
  .decode(absoluteURL, strEncoding)));
  DataSource source = new AttachmentDataSource(absoluteURL, "image");
  bp.setDataHandler(new DataHandler(source));
  mp.addBodyPart(bp);
  }
  msg.setContent(mp);
  // write the mime multi part message to a file
  msg.writeTo(new FileOutputStream(strFileName));
  }
  /**
   * Converts an MHT (MIME HTML) archive into an HTML file plus a directory of resources.
   *
   * <p>The first body part of the message is read as the HTML page. Every further body part
   * that has a Content-Location value is saved into the directory {@code <strHtml>.files},
   * and the page text is rewritten through {@code JHtmlClear.replace(text, location, savedPath)}
   * for each part that was saved successfully. Nothing is written when the message content
   * is not multipart, when the page text cannot be read, or when the resource directory
   * cannot be created. Exceptions are reported with {@code printStackTrace()} and not
   * propagated.
   *
   * @param strMht  path of the MHT file to read
   * @param strHtml path of the HTML file to write
   */
  public static void mht2html(String strMht, String strHtml) {
      InputStream fis = null;
      try {
          fis = new FileInputStream(strMht);
          Session mailSession = Session.getDefaultInstance(System.getProperties(), null);
          MimeMessage msg = new MimeMessage(mailSession, fis);
          Object content = msg.getContent();
          if (!(content instanceof Multipart)) {
              return;
          }
          MimeMultipart mp = (MimeMultipart) content;
          MimeBodyPart page = (MimeBodyPart) mp.getBodyPart(0);
          String strEncodng = getEncoding(page);
          String strText = getHtmlText(page, strEncodng);
          if (strText == null) {
              return;
          }

          // The resource directory is only needed when there is something besides the page.
          File parent = null;
          if (mp.getCount() > 1) {
              parent = new File(new File(strHtml).getAbsolutePath() + ".files");
              parent.mkdirs();
              if (!parent.exists()) {
                  return;
              }
          }

          for (int i = 1; i < mp.getCount(); ++i) {
              MimeBodyPart bp = (MimeBodyPart) mp.getBodyPart(i);
              String strUrl = getResourcesUrl(bp);
              if (strUrl == null) {
                  continue;
              }
              File resources = new File(parent.getAbsolutePath() + File.separator + getName(strUrl, i));
              // Only rewrite the link when the resource really was written to disk.
              if (saveResourcesFile(resources, bp.getInputStream())) {
                  strText = JHtmlClear.replace(strText, strUrl, resources.getAbsolutePath());
              }
          }
          saveHtml(strText, strHtml);
      } catch (Exception e) {
          e.printStackTrace();
      } finally {
          // The source stream was previously never closed; release it on every path.
          if (fis != null) {
              try {
                  fis.close();
              } catch (IOException e) {
                  e.printStackTrace();
              }
          }
      }
  }
  /**
  *方法说明:得到资源文件的name
  *输入参数:strName 资源文件链接, ID 资源文件的序号
  *返回类型:资源文件的本地临时文件名
  */
  public static String getName(String strName, int ID) {
  char separator = ’/’;
  System.out.println(strName);
  System.out.println(separator);
  if( strName.lastIndexOf(separator) >= 0)
  return format(strName.substring(strName.lastIndexOf(separator) + 1));
  return "temp" + ID;
  }
  /**
   * Reads the character set declared in the Content-Type header of a body part.
   *
   * <p>The header name and the {@code charset=} parameter name are matched without regard
   * to case. The value ends at the next {@code ';'} (if any); surrounding whitespace and
   * double quotes are removed. {@code gb2312} is reported as {@code gbk}.
   *
   * @param bp body part holding the page; may be {@code null}
   * @return the charset name, or {@code null} when {@code bp} is {@code null}, no charset
   *         is declared, or the headers cannot be read
   */
  private static String getEncoding(MimeBodyPart bp) {
      if (bp == null) {
          return null;
      }
      try {
          Enumeration headers = bp.getAllHeaders();
          while (headers.hasMoreElements()) {
              javax.mail.Header head = (javax.mail.Header) headers.nextElement();
              if (!"Content-Type".equalsIgnoreCase(head.getName())) {
                  continue;
              }
              String strType = head.getValue();
              if (strType == null) {
                  continue;
              }
              // Fixed locale so the parameter name matches regardless of the default locale.
              int pos = strType.toLowerCase(java.util.Locale.ENGLISH).indexOf("charset=");
              if (pos == -1) {
                  continue;
              }
              String strEncoding = strType.substring(pos + 8);
              // Drop any parameters that follow the charset, e.g. "; boundary=...".
              int end = strEncoding.indexOf(';');
              if (end != -1) {
                  strEncoding = strEncoding.substring(0, end);
              }
              strEncoding = strEncoding.trim();
              // The value may be a quoted string: charset="utf-8".
              if (strEncoding.length() >= 2 && strEncoding.startsWith("\"") && strEncoding.endsWith("\"")) {
                  strEncoding = strEncoding.substring(1, strEncoding.length() - 1).trim();
              }
              if (strEncoding.length() == 0) {
                  continue;
              }
              if ("gb2312".equalsIgnoreCase(strEncoding)) {
                  strEncoding = "gbk";
              }
              return strEncoding;
          }
      } catch (MessagingException e) {
          e.printStackTrace();
      }
      return null;
  }
  /**
   * Returns the value of the first Content-Location header of a body part.
   *
   * <p>The header name is matched without regard to case, since MIME header field names
   * are case-insensitive.
   *
   * @param bp body part holding a resource; may be {@code null}
   * @return the header value, or {@code null} when {@code bp} is {@code null}, the header
   *         is absent, or the headers cannot be read
   */
  private static String getResourcesUrl(MimeBodyPart bp) {
      if (bp == null) {
          return null;
      }
      try {
          Enumeration headers = bp.getAllHeaders();
          while (headers.hasMoreElements()) {
              javax.mail.Header head = (javax.mail.Header) headers.nextElement();
              if ("Content-Location".equalsIgnoreCase(head.getName())) {
                  return head.getValue();
              }
          }
      } catch (MessagingException e) {
          e.printStackTrace();
      }
      return null;
  }
  /**
  *方法说明:格式化文件名
  *输入参数:strName 文件名
  *返回类型:经过处理的符合命名规则的文件名
  */
  private static String format(String strName) {
  if (strName == null)
  return null;
  strName = strName.replaceAll(" ", " ");
  String strText = "/:*?"<>|^___FCKpd___0quot;;
  for (int i = 0; i < strName.length(); ++i) {
  String ch = String.valueOf(strName.charAt(i));
  if (strText.indexOf(ch) != -1) {
  strName = strName.replace(strName.charAt(i), ’-’);
  }
  }
  return strName;
  }
  /**
   * Copies a resource stream into a file and closes the stream.
   *
   * <p>Both the input stream and the output file are closed on every path, and a failure
   * to close one does not prevent closing the other. When the stream delivers no data the
   * (empty) file is deleted again; the method still returns {@code true} in that case.
   * NOTE(review): the caller then rewrites the link to a file that no longer exists —
   * confirm that this is intended.
   *
   * @param resources   file to create; may be {@code null}
   * @param inputStream data to write into the file; may be {@code null}
   * @return {@code false} when either argument is {@code null} or when copying or closing
   *         fails, {@code true} otherwise
   */
  private static boolean saveResourcesFile(File resources, InputStream inputStream) {
      if (resources == null || inputStream == null) {
          return false;
      }
      BufferedInputStream in = new BufferedInputStream(inputStream);
      BufferedOutputStream osw = null;
      boolean copied = false;
      boolean isEmpty = true;
      try {
          osw = new BufferedOutputStream(new FileOutputStream(resources));
          byte[] chunk = new byte[1024];
          int read;
          while ((read = in.read(chunk)) != -1) {
              isEmpty = false;
              osw.write(chunk, 0, read);
          }
          // One flush at the end; the buffered stream handles the intermediate writes.
          osw.flush();
          copied = true;
      } catch (Exception e) {
          e.printStackTrace();
          System.out.println("解析mht失败");
      } finally {
          // Close each side independently so one failure cannot leak the other stream.
          if (osw != null) {
              try {
                  osw.close();
              } catch (IOException e) {
                  e.printStackTrace();
                  System.out.println("解析mht失败");
                  copied = false;
              }
          }
          try {
              // Closing the wrapper also closes the caller's inputStream.
              in.close();
          } catch (IOException e) {
              e.printStackTrace();
              System.out.println("解析mht失败");
              copied = false;
          }
      }
      if (!copied) {
          return false;
      }
      // Delete only after the file handle is released.
      if (isEmpty) {
          resources.delete();
      }
      return true;
  }
  /**
   * Extracts the page title from an MHT file.
   *
   * <p>The first body part of the message is read as HTML and the text between the first
   * {@code <title>} and the following {@code </title>} is returned, trimmed. The tags are
   * matched without regard to case; the title itself is returned with its original case.
   * Exceptions are reported with {@code printStackTrace()} and not propagated.
   *
   * @param mhtFilename path of the MHT file
   * @return the title, or {@code null} when the message content is not multipart, the page
   *         text cannot be read, no title element is found, or an error occurs
   */
  public static String getTitle(String mhtFilename) {
      InputStream fis = null;
      try {
          fis = new FileInputStream(mhtFilename);
          Session mailSession = Session.getDefaultInstance(System.getProperties(), null);
          MimeMessage msg = new MimeMessage(mailSession, fis);
          Object content = msg.getContent();
          if (!(content instanceof Multipart)) {
              return null;
          }
          MimeMultipart mp = (MimeMultipart) content;
          MimeBodyPart bp1 = (MimeBodyPart) mp.getBodyPart(0);
          String strEncodng = getEncoding(bp1);
          String strText = getHtmlText(bp1, strEncodng);
          if (strText == null) {
              return null;
          }
          // Search case-insensitively but cut from the untouched text, so the title is
          // not lowercased as a side effect of the search.
          int open = indexOfIgnoreCase(strText, "<title>", 0);
          if (open == -1) {
              return null;
          }
          int start = open + "<title>".length();
          int close = indexOfIgnoreCase(strText, "</title>", start);
          if (close == -1) {
              return null;
          }
          return strText.substring(start, close).trim();
      } catch (Exception e) {
          e.printStackTrace();
          return null;
      } finally {
          // The source stream was previously never closed; release it on every path.
          if (fis != null) {
              try {
                  fis.close();
              } catch (IOException e) {
                  e.printStackTrace();
              }
          }
      }
  }

  /** Returns the index of {@code token} in {@code text} at or after {@code from}, ignoring case, or -1. */
  private static int indexOfIgnoreCase(String text, String token, int from) {
      int last = text.length() - token.length();
      for (int i = Math.max(from, 0); i <= last; i++) {
          if (text.regionMatches(true, i, token, 0, token.length())) {
              return i;
          }
      }
      return -1;
  }
  /**
   * Reads the content of a body part as text.
   *
   * <p>The content is read line by line and every line is terminated with {@code "\r\n"}
   * in the result. When {@code strEncoding} is {@code null} or empty the platform default
   * charset is used. Exceptions are reported with {@code printStackTrace()} and not
   * propagated.
   *
   * @param bp          body part holding the page
   * @param strEncoding charset name used to decode the content; may be {@code null}
   * @return the decoded text, or {@code null} when reading fails
   */
  private static String getHtmlText(MimeBodyPart bp, String strEncoding) {
      InputStream textStream = null;
      BufferedReader br = null;
      try {
          textStream = bp.getInputStream();
          InputStream buff = new BufferedInputStream(textStream);
          Reader r;
          if (strEncoding == null || strEncoding.length() == 0) {
              // No declared charset: fall back to the platform default instead of failing.
              r = new InputStreamReader(buff);
          } else {
              r = new InputStreamReader(buff, strEncoding);
          }
          br = new BufferedReader(r);
          StringBuffer strHtml = new StringBuffer();
          String strLine;
          while ((strLine = br.readLine()) != null) {
              strHtml.append(strLine).append("\r\n");
          }
          return strHtml.toString();
      } catch (Exception e) {
          e.printStackTrace();
          return null;
      } finally {
          try {
              // Closing the outermost reader closes the whole chain; fall back to the raw
              // stream when the reader was never created.
              if (br != null) {
                  br.close();
              } else if (textStream != null) {
                  textStream.close();
              }
          } catch (Exception e) {
              System.out.println("解析mht失败");
          }
      }
  }
  /**
   * Writes the page text to a file using {@code FileWriter}.
   *
   * <p>The writer is closed even when writing fails. I/O errors are reported on the
   * console and not propagated.
   *
   * @param strText HTML text to write
   * @param strHtml path of the file to write
   */
  private static void saveHtml(String strText, String strHtml) {
      try {
          FileWriter fw = new FileWriter(strHtml);
          try {
              fw.write(strText);
          } finally {
              // Previously the writer stayed open when write() threw.
              fw.close();
          }
      } catch (IOException e) {
          e.printStackTrace();
          System.out.println("解析mht失败");
      }
  }
  /**
   * Splits a comma-separated list into {@code InternetAddress} objects.
   *
   * <p>Tokens are produced by a {@code StringTokenizer} with {@code ","} as the delimiter
   * and are passed to the {@code InternetAddress} constructor unchanged, in order.
   *
   * @param emails comma-separated addresses
   * @return one address per token, in the order of appearance
   * @throws Exception when an {@code InternetAddress} cannot be constructed from a token
   */
  private InternetAddress[] getInetAddresses(String emails) throws Exception {
      StringTokenizer tokens = new StringTokenizer(emails, ",");
      // Gather the raw tokens first so that no address is built before splitting is done.
      String[] parts = new String[tokens.countTokens()];
      for (int idx = 0; idx < parts.length; idx++) {
          parts[idx] = tokens.nextToken();
      }
      InternetAddress[] result = new InternetAddress[parts.length];
      for (int idx = 0; idx < parts.length; idx++) {
          result[idx] = new InternetAddress(parts[idx]);
      }
      return result;
  }
  /**
   * {@code DataSource} for a remote resource (script or image) that is embedded in the
   * MHT message.
   *
   * <p>The resource bytes are fetched once, in the constructor, through
   * {@code JQuery.downBinaryFile} and served from memory afterwards.
   */
  class AttachmentDataSource implements DataSource {
      /** Extension-based lookup, used only when {@code strType} has no entry in {@code normalMap}. */
      private MimetypesFileTypeMap map = new MimetypesFileTypeMap();
      /** Resource URL, trimmed. */
      private String strUrl;
      /** Coarse kind of the resource as given by the caller; the keys of {@code normalMap}. */
      private String strType;
      /** Downloaded resource bytes; despite the name this is the data itself, not a size. */
      private byte[] dataSize = null;
      /**
       * Fixed content types per coarse resource kind.
       */
      private Map normalMap = new HashMap();
      {
          normalMap.put("image", "image/jpeg");
          normalMap.put("text", "text/plain");
      }

      /**
       * Stores the URL and kind and downloads the resource.
       *
       * @param strUrl  URL of the resource; spaces are sent as {@code %20} when downloading
       * @param strType coarse kind of the resource, e.g. {@code "text"} or {@code "image"}
       */
      public AttachmentDataSource(String strUrl, String strType) {
          this.strType = strType;
          String trimmed = strUrl.trim();
          this.strUrl = trimmed;
          // Only the request URL is escaped; the stored URL keeps its spaces.
          dataSize = JQuery.downBinaryFile(trimmed.replaceAll(" ", "%20"), null);
      }

      /**
       * Returns the content type.
       */
      public String getContentType() {
          return getMimeType(getName());
      }

      /**
       * Returns the part of the URL after the last {@code '/'}, or the whole URL when it
       * contains none.
       */
      public String getName() {
          // URLs always use '/', whatever the platform's file separator is.
          int lastSlash = strUrl.lastIndexOf('/');
          if (lastSlash >= 0) {
              return strUrl.substring(lastSlash + 1);
          }
          return strUrl;
      }

      /**
       * Resolves the content type: the fixed type for {@code strType} when there is one,
       * otherwise the type looked up from the file name, otherwise
       * {@code application/octet-stream}.
       */
      private String getMimeType(String fileName) {
          String type = (String) normalMap.get(strType);
          if (type == null) {
              try {
                  type = map.getContentType(fileName);
              } catch (Exception ignored) {
                  // Best effort: fall through to the generic type below.
              }
              if (type == null) {
                  type = "application/octet-stream";
              }
          }
          return type;
      }

      /**
       * Returns a new stream over the downloaded bytes; an empty stream when nothing was
       * downloaded.
       */
      public InputStream getInputStream() throws IOException {
          if (dataSize == null) {
              dataSize = new byte[0];
          }
          return new ByteArrayInputStream(dataSize);
      }

      /**
       * Returns a fresh in-memory stream; whatever is written to it is not stored in this
       * data source.
       */
      public OutputStream getOutputStream() throws IOException {
          return new java.io.ByteArrayOutputStream();
      }
  }
  }

3COME考试频道为您精心整理,希望对您有所帮助,更多信息在http://www.reader8.net/exam/

读书人网 >复习指导

热点推荐