package me.yoqi.participle;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.nlpcn.commons.lang.util.IOUtil;
public class Participle {
/**
 * Marker a trimmed corpus line must start with for {@code main} to process it.
 * NOTE(review): this is the empty string here, so every line matches; possibly an
 * original markup tag was lost from this literal - confirm against the corpus format.
 */
public static final String TAG_START_CONTENT = "";
/**
 * Marker that {@code main} looks up in a line to find where the content ends.
 * NOTE(review): this is also the empty string here - confirm the intended value.
 */
public static final String TAG_END_CONTENT = "";
/**
 * Segments the content lines of {@code corpus.txt} (read as UTF-8) with
 * {@link ToAnalysis#parse} and writes the non-blank term names to
 * {@code resultbig.txt} (written as UTF-8), space-separated, one output line per
 * processed input line. Finally prints the term count, the number of distinct
 * terms, the number of characters processed and the characters-per-second rate.
 *
 * <p>A line is processed only if, after trimming, it starts with
 * {@link #TAG_START_CONTENT}. Its content is the text between that marker and the
 * first following {@link #TAG_END_CONTENT}; when the end marker is the empty string
 * the content runs to the end of the line. Lines whose end marker is missing, and
 * lines with empty content, are skipped.
 *
 * <p>I/O failures are reported via {@code printStackTrace()}; nothing is rethrown.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    BufferedReader reader = null;
    PrintWriter pw = null;
    try {
        reader = IOUtil.getReader("corpus.txt", "UTF-8");
        // Throwaway parse issued before the clock starts; presumably it triggers the
        // analyzer's one-time initialisation so that cost is not timed - confirm.
        ToAnalysis.parse("test 123 孙");
        // Write with the same charset the corpus is read with, instead of the
        // platform default, so the segmented text is not mangled on non-UTF-8 systems.
        pw = new PrintWriter("resultbig.txt", "UTF-8");

        long start = System.currentTimeMillis();
        int allCount = 0;
        int termcnt = 0;
        Set<String> set = new HashSet<String>();

        String temp;
        while ((temp = reader.readLine()) != null) {
            temp = temp.trim();
            if (!temp.startsWith(TAG_START_CONTENT)) {
                continue;
            }

            int contentStart = TAG_START_CONTENT.length();
            int contentEnd;
            if (TAG_END_CONTENT.length() == 0) {
                // An empty end marker delimits nothing: take the rest of the line.
                contentEnd = temp.length();
            } else {
                // Search after the start marker so the result can never precede it.
                contentEnd = temp.indexOf(TAG_END_CONTENT, contentStart);
            }
            if (contentEnd < 0) {
                // End marker missing: skip the line rather than let substring() throw.
                continue;
            }

            String content = temp.substring(contentStart, contentEnd);
            if (content.length() == 0) {
                continue;
            }

            allCount += content.length();
            List<Term> result = ToAnalysis.parse(content);
            for (Term term : result) {
                String item = term.getName().trim();
                if (item.length() > 0) {
                    termcnt++;
                    pw.print(item + " ");
                    set.add(item);
                }
            }
            pw.println();
        }

        // Clamp to 1 ms so a very fast (or empty) run does not print Infinity/NaN.
        long elapsedMs = Math.max(1L, System.currentTimeMillis() - start);
        System.out.println("共" + termcnt + "个term," + set.size() + "个不同的词,共 "
                + allCount + " 个字符,每秒处理了:" + (allCount * 1000.0 / elapsedMs));

        // PrintWriter never throws on write failures; checkError() flushes and reports them.
        if (pw.checkError()) {
            System.err.println("Warning: an I/O error occurred while writing resultbig.txt");
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (null != reader) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (null != pw) {
            pw.close();
        }
    }
}