读书人

用java实现将html保存为txt文本时,怎样去掉body样式

发布时间: 2013-09-18 14:17:40 作者: rapoo

用java实现将html保存为txt文本时,怎样去掉body { font-family: SimSun; font-size:22px; .....}
编写了一个java类,将一个html网页保存为txt文本,保存后的txt文本内容都正确,但是总是带着

body {
font-family: SimSun;
font-size:22px;
font-style:italic;
font-weight:bold;
color:#00F;
}

不知道该怎样去掉,求大侠帮忙

java部分代码:
package format.conversion;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;

import javax.servlet.jsp.tagext.BodyTag;
import javax.swing.JFileChooser;
import javax.swing.filechooser.FileNameExtensionFilter;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

public class HtmlToTxt {

public static void main(String[] args) throws Exception {
HtmlToTxt test=new HtmlToTxt();
test.go();
}

public void go(){
try{

JFileChooser fileSave=new JFileChooser(".");

FileNameExtensionFilter extension=new FileNameExtensionFilter("txt Files(.txt)","txt");
fileSave.setFileFilter(extension);

fileSave.showSaveDialog(null);
File file=fileSave.getSelectedFile();
if(!file.getPath().endsWith(".txt")){
file=new File(file.getPath()+".txt");
}
String outputFile =file.toString();



FileWriter writer=new FileWriter(outputFile);
String content = readTextFile("WebRoot/Report.html","UTF-8");
String txtcontent=getText(content);
writer.write(txtcontent);
writer.close();
System.out.println("txt文件保存成功!");
System.out.println("文件保存路径为:"+new File(outputFile).toURI().toURL());
}catch(IOException ex){
System.out.println("txt文件保存失败!");
}catch(ParserException ex){
System.out.println("字符转换失败");
}
}
/*----------------获取文本内容和标题----------------------*/
public static String getText(String content) throws ParserException {
Parser myParser; //htmlParser对html页面解析
NodeList nodeList = null;
StringBuilder result = new StringBuilder();
myParser = Parser.createParser(content, "UTF-8");
NodeFilter textFilter = new NodeClassFilter(TextNode.class NodeFilter linkFilter = new NodeClassFilter(LinkTag.class

OrFilter lastFilter = new OrFilter();
lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter});
nodeList = myParser.parse(lastFilter);//获取节点列表
Node[] nodes = nodeList.toNodeArray(); //获取节点数组
String line = "";

for (int i = 0; i < nodes.length; i++) {
Node anode = (Node) nodes[i];
if (anode instanceof TextNode) { TextNode textnode = (TextNode) anode;
line = textnode.getText();
} else if (anode instanceof LinkTag) {
LinkTag linknode = (LinkTag) anode;

line = linknode.getLink();
}

if (isTrimEmpty(line))
continue;

result.append(line);
}

return result.toString();
}

/*-------------------读取html文件-------------------*/
public static String readTextFile(String sFileName, String sEncode) {
StringBuffer sbStr = new StringBuffer(); //字符串变量
try {
File ff = new File(sFileName);
InputStreamReader read = new InputStreamReader(new FileInputStream(
ff), sEncode); BufferedReader ins = new BufferedReader(read);


String dataLine = "";
while (null != (dataLine = ins.readLine())) {
sbStr.append(dataLine);
sbStr.append("\r\n");
}
ins.close();
} catch (Exception e) {
e.printStackTrace();
}
return sbStr.toString();
}
public static boolean isTrimEmpty(String astr) {
if ((null == astr) || (astr.length() == 0)) {
return true;
}
if (isBlank(astr.trim())) {
return true;
}
return false;
}
public static boolean isBlank(String astr) {
if ((null == astr) || (astr.length() == 0)) {
return true;
} else {
return false;
}
}

}




java html
[解决办法]

引用:
Quote: 引用:

Quote: 引用:

Quote: 引用:

html.replace((?s)body {(.*?)},"");

报错
Exception in thread "main" java.lang.Error: Unresolved compilation problems:
The method replace(char, char) in the type String is not applicable for the arguments (s)
Syntax error on token "?", delete this token
s cannot be resolved to a type
Syntax error, insert ")" to complete MethodInvocation
Syntax error, insert ";" to complete Statement
body cannot be resolved to a variable
Syntax error on tokens, delete these tokens
Syntax error, insert "}" to complete Block

at format.conversion.HtmlToTxt.go(HtmlToTxt.java:60)
at format.conversion.HtmlToTxt.main(HtmlToTxt.java:39)

你用的编译工具是什么?不是Eclipse吧
html.replace("(?s)body {(.*?)}","");


少加了两个"号



我用的是myEclipse,我试过html.replace("(?s)body {(.*?)}","");虽然没报错,但是txt文本里还是无法去掉body{ }

你确定一下好么
你的是body{}
还是body {}
中间有没有空格。
或者String str = html.replaceAll("(?s)body\\s*\\{.*?\\}","");这样就不担心有没有空格了。
另外{为特殊字符,上面我忘了转义了

读书人网 >Java相关

热点推荐