1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071 |
- package me.yoqi.participle;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.PrintWriter;
- import java.util.HashSet;
- import java.util.List;
- import java.util.Set;
- import org.ansj.domain.Term;
- import org.ansj.splitWord.analysis.ToAnalysis;
- import org.nlpcn.commons.lang.util.IOUtil;
- public class Participle {
- public static final String TAG_START_CONTENT = "<content>";
- public static final String TAG_END_CONTENT = "</content>";
-
- public static void main(String[] args) {
- String temp = null ;
-
- BufferedReader reader = null;
- PrintWriter pw = null;
- try {
- reader = IOUtil.getReader("corpus.txt", "UTF-8") ;
- ToAnalysis.parse("test 123 孙") ;
- pw = new PrintWriter("resultbig.txt");
- long start = System.currentTimeMillis() ;
- int allCount =0 ;
- int termcnt = 0;
- Set<String> set = new HashSet<String>();
- while((temp=reader.readLine())!=null){
- temp = temp.trim();
- if (temp.startsWith(TAG_START_CONTENT)) {
- int end = temp.indexOf(TAG_END_CONTENT);
- String content = temp.substring(TAG_START_CONTENT.length(), end);
- //System.out.println(content);
- if (content.length() > 0) {
- allCount += content.length() ;
- List<Term> result = ToAnalysis.parse(content);
- for (Term term: result) {
- String item = term.getName().trim();
- if (item.length() > 0) {
- termcnt++;
- pw.print(item.trim() + " ");
- set.add(item);
- }
- }
- pw.println();
- }
- }
- }
- long end = System.currentTimeMillis() ;
- System.out.println("共" + termcnt + "个term," + set.size() + "个不同的词,共 "
- +allCount+" 个字符,每秒处理了:"+(allCount*1000.0/(end-start)));
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if (null != reader) {
- try {
- reader.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- if (null != pw) {
- pw.close();
- }
- }
- }
|