package me.yoqi.participle; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; import java.util.HashSet; import java.util.List; import java.util.Set; import org.ansj.domain.Term; import org.ansj.splitWord.analysis.ToAnalysis; import org.nlpcn.commons.lang.util.IOUtil; public class Participle { public static final String TAG_START_CONTENT = ""; public static final String TAG_END_CONTENT = ""; public static void main(String[] args) { String temp = null ; BufferedReader reader = null; PrintWriter pw = null; try { reader = IOUtil.getReader("corpus.txt", "UTF-8") ; ToAnalysis.parse("test 123 孙") ; pw = new PrintWriter("resultbig.txt"); long start = System.currentTimeMillis() ; int allCount =0 ; int termcnt = 0; Set set = new HashSet(); while((temp=reader.readLine())!=null){ temp = temp.trim(); if (temp.startsWith(TAG_START_CONTENT)) { int end = temp.indexOf(TAG_END_CONTENT); String content = temp.substring(TAG_START_CONTENT.length(), end); //System.out.println(content); if (content.length() > 0) { allCount += content.length() ; List result = ToAnalysis.parse(content); for (Term term: result) { String item = term.getName().trim(); if (item.length() > 0) { termcnt++; pw.print(item.trim() + " "); set.add(item); } } pw.println(); } } } long end = System.currentTimeMillis() ; System.out.println("共" + termcnt + "个term," + set.size() + "个不同的词,共 " +allCount+" 个字符,每秒处理了:"+(allCount*1000.0/(end-start))); } catch (IOException e) { e.printStackTrace(); } finally { if (null != reader) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } if (null != pw) { pw.close(); } } }