`
ruantongsheng
  • 浏览: 21146 次
  • 来自: ...
社区版块
存档分类
最新评论

parser html

阅读更多
package com.test;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;

import org.eclipse.swt.SWT;
import org.eclipse.swt.browser.Browser;
import org.eclipse.swt.browser.ProgressEvent;
import org.eclipse.swt.browser.ProgressListener;
import org.eclipse.swt.widgets.Display;
import org.eclipse.swt.widgets.Shell;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;


public class HtmlParser {

public static Object parserLockObj=new Object();
public static Object getHtmlLockObj=new Object();
public static List<String> BeingDownloadWordList=new ArrayList<String>();
public static List<String> DownloadedWordList=new ArrayList<String>();

public static void main(String[] args) {
try {
initial();
initialList();

GetHtmlThread gt=new GetHtmlThread();
ParserHtmlThread pt=new ParserHtmlThread();
gt.start();
pt.start();

} catch (Exception e) {
e.printStackTrace();
}
}
public static void parserHtml(String word)throws Exception{
StringBuffer parsedText=new StringBuffer();
StringBuffer newWord=new StringBuffer();
String d=getProjectRootPath()+"newhtml/"+word+".html";
Parser parser=new Parser(readFile(d));
NodeFilter filter = new HasAttributeFilter("class","tab_content");
        NodeList nodes = parser.extractAllNodesThatMatch(filter);
        System.out.println(nodes.size());
int len=nodes.size();
for(int i=0;i<len;i++){
//outputAll1(nodes.elementAt(i));
Parser parserSub=new Parser(nodes.elementAt(i).toHtml());
NodeFilter   textFilter   =   new   NodeClassFilter(TextNode.class);
NodeList nl=parserSub.extractAllNodesThatMatch(textFilter);
for(int j=0;j<nl.size();j++){
String s=nl.elementAt(j).getText();
if(s!=null&&!s.trim().equals("")){
s=s.replace("&nbsp;","");
boolean isSeq=Pattern.matches("[0-9\\.]{1,3}",s);
if(isSeq){
//System.out.print(s);
parsedText.append(s).append("\r\n");
}else{
if(isWord(s)){
BeingDownloadWordList.add(s);
newWord.append(s.trim()).append("\r\n");
}
//System.out.println(s);
parsedText.append(s).append("\r\n");
}

}
}
System.out.println("\n\n\n###############################################");
}
writeFile(parsedText.toString(),getProjectRootPath()+"text/"+word+".txt");
appendWordToAllWordFile(newWord.toString());

}
private static void writeFile(String content,String path)throws Exception{

BufferedWriter w=new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path)));
w.write(content);
w.flush();
w.close();

}
private static String readFile(String path)throws Exception{
BufferedReader r=new BufferedReader(new InputStreamReader(new FileInputStream(path)));
String result="";
while(r.ready()){
result+=r.readLine();
}
return result;
}

private static boolean isWord(String s)throws Exception{
if(s==null||s.trim().equals("")){
return false;
}else{
s=s.trim();
Pattern p=Pattern.compile("[a-zA-Z]*");
return p.matches("[a-zA-Z]*",s);
}

}

private static void initial()throws Exception{
String rootPath=getProjectRootPath();
rootPath=rootPath.replace("\\","/");
rootPath=rootPath.trim();
if(rootPath.startsWith("/")){
rootPath=rootPath.substring(1);
}
File newFile=new File(rootPath+"newhtml");
File textFile=new File(rootPath+"text");

if(!newFile.exists()){
newFile.mkdirs();
}if(!textFile.exists()){
textFile.mkdirs();
}
System.out.println(rootPath);
}
public static  String getProjectRootPath(){
String projectRootPath="";
try {
projectRootPath=HtmlParser.class.getResource("/").getPath();
projectRootPath=projectRootPath.substring(0,projectRootPath.length()-1);
if(projectRootPath.endsWith("bin")){
projectRootPath=projectRootPath.substring(0,projectRootPath.length()-4)+  File.separator;

}else{
projectRootPath=projectRootPath +  File.separator ;

}

if(projectRootPath!=null&&projectRootPath.indexOf("%20")!=-1){
projectRootPath=projectRootPath.replace("%20", " ");
}
} catch (Exception e) {
e.printStackTrace();
}
return projectRootPath;
}



private static void appendWordToAllWordFile(String content)throws Exception{
String path=getProjectRootPath()+"text/AllWord.txt";
BufferedWriter w=new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path, true)));
w.write(content);
w.flush();
w.close();
}
private static void initialList()throws Exception{
String path=getProjectRootPath()+"text/AllWord.txt";
File f=new File(path);
if(f.exists()){
BufferedReader buf=new BufferedReader(new InputStreamReader(new FileInputStream(path)));
while(buf.ready()){
String line=buf.readLine();
if(line!=null&&!line.trim().equals("")){
line=line.trim();
BeingDownloadWordList.add(line);
}
}
}
}

public static boolean newHtmlIsExist(String word)throws Exception{
String path=getProjectRootPath()+"newhtml/"+word+".html";
File f=new File(path);
return f.exists();
}
public static String getContent(String word)throws Exception{
String result="";
if(!newHtmlIsExist(word)){
Display display = new Display();  
    final Shell shell = new Shell(display);  
    final Browser browser = new Browser(shell, SWT.NONE);
    try {
    String url="http://www.iciba.com/" +word;




    browser.setToolTipText("process料");
    browser.setUrl(url);
   
    browser.addProgressListener(new ProgressListener() {
public void completed(ProgressEvent paramProgressEvent) {
//System.out.println(browser.getText());
browser.setToolTipText("completed");
}
public void changed(ProgressEvent paramProgressEvent) {}
});
    while (!shell.isDisposed()&&!browser.getToolTipText().equals("completed")) {  
    // while (!shell.isDisposed()) {
        if (!display.readAndDispatch()){
        //System.out.println(browser.getText());
          display.sleep();
        }
      }
    result=browser.getText();
    saveFile(result, word);
    DownloadedWordList.add(word);
    //System.out.println(result);
} catch (Exception e) {
e.printStackTrace();
}finally
{
browser.dispose();
shell.dispose();
//shell.close();
display.dispose();
//display.close();

}

    return result;
}else{
return "";
}

   
}
private static void saveFile(String content,String name)throws Exception{
BufferedWriter w=new BufferedWriter(new OutputStreamWriter(new FileOutputStream(getProjectRootPath()+"newhtml/"+name+".html")));
w.write(content);
w.flush();
w.close();
}
}

/**
 * Worker that takes words from {@code HtmlParser.DownloadedWordList} one at a
 * time and parses their saved page with {@code HtmlParser.parserHtml}. When the
 * list is empty it sleeps for 500 ms before checking again.
 *
 * <p>A failure while parsing one word is printed and the worker carries on with
 * the next word. The worker stops when its thread is interrupted.
 */
class ParserHtmlThread extends Thread{
    // Not used by run(); kept because the field is visible to other classes in the package.
    HtmlParser hp = new HtmlParser();

    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                if (HtmlParser.DownloadedWordList.size() > 0) {
                    String word = HtmlParser.DownloadedWordList.remove(0);
                    HtmlParser.parserHtml(word);
                } else {
                    Thread.sleep(500);
                }
            } catch (InterruptedException e) {
                // Restore the flag so the owner of the thread can observe the interrupt.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // One unparsable page must not stop the remaining words from being processed.
                e.printStackTrace();
            }
        }
    }
}

/**
 * Worker that takes words from {@code HtmlParser.BeingDownloadWordList} one at
 * a time and downloads their page with {@code HtmlParser.getContent}. When the
 * list is empty it sleeps for 500 ms before checking again.
 *
 * <p>A failure while downloading one word is printed and the worker carries on
 * with the next word. The worker stops when its thread is interrupted.
 */
class GetHtmlThread extends Thread {
    // Not used by run(); kept because the field is visible to other classes in the package.
    HtmlParser hp = new HtmlParser();

    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                if (HtmlParser.BeingDownloadWordList.size() > 0) {
                    String word = HtmlParser.BeingDownloadWordList.remove(0);
                    HtmlParser.getContent(word);
                } else {
                    Thread.sleep(500);
                }
            } catch (InterruptedException e) {
                // Restore the flag so the owner of the thread can observe the interrupt.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // One failed download must not stop the remaining words from being fetched.
                e.printStackTrace();
            }
        }
    }
}
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics