// lyq / ChineseParticiple
							package me.yoqi.participle;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.nlpcn.commons.lang.util.IOUtil;

public class Participle {

	public static final String TAG_START_CONTENT = "<content>";  
    public static final String TAG_END_CONTENT = "</content>";  
      
    public static void main(String[] args) {  
        String temp = null ;  
          
        BufferedReader reader = null;  
        PrintWriter pw = null;  
        try {  
            reader = IOUtil.getReader("corpus.txt", "UTF-8") ;  
            ToAnalysis.parse("test 123 孙") ;  
            pw = new PrintWriter("resultbig.txt");  
            long start = System.currentTimeMillis()  ;  
            int allCount =0 ;  
            int termcnt = 0;  
            Set<String> set = new HashSet<String>();  
            while((temp=reader.readLine())!=null){  
                temp = temp.trim();  
                if (temp.startsWith(TAG_START_CONTENT)) {  
                    int end = temp.indexOf(TAG_END_CONTENT);  
                    String content = temp.substring(TAG_START_CONTENT.length(), end);  
                    //System.out.println(content);  
                    if (content.length() > 0) {  
                        allCount += content.length() ;  
                        List<Term> result =  ToAnalysis.parse(content);  
                        for (Term term: result) {  
                            String item = term.getName().trim();  
                            if (item.length() > 0) {  
                                termcnt++;  
                                pw.print(item.trim() + " ");  
                                set.add(item);  
                            }  
                        }  
                        pw.println();  
                    }  
                }  
            }  
            long end = System.currentTimeMillis() ;  
            System.out.println("共" + termcnt + "个term，" + set.size() + "个不同的词，共 "  
                    +allCount+" 个字符，每秒处理了:"+(allCount*1000.0/(end-start)));  
        } catch (IOException e) {   
            e.printStackTrace();  
        } finally {  
            if (null != reader) {  
                try {  
                    reader.close();  
                } catch (IOException e) {  
                    e.printStackTrace();  
                }  
            }  
            if (null != pw) {  
                pw.close();  
            }  
        }  	

}