Participle.java 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. package me.yoqi.participle;
  2. import java.io.BufferedReader;
  3. import java.io.IOException;
  4. import java.io.PrintWriter;
  5. import java.util.HashSet;
  6. import java.util.List;
  7. import java.util.Set;
  8. import org.ansj.domain.Term;
  9. import org.ansj.splitWord.analysis.ToAnalysis;
  10. import org.nlpcn.commons.lang.util.IOUtil;
  11. public class Participle {
  12. public static final String TAG_START_CONTENT = "<content>";
  13. public static final String TAG_END_CONTENT = "</content>";
  14. public static void main(String[] args) {
  15. String temp = null ;
  16. BufferedReader reader = null;
  17. PrintWriter pw = null;
  18. try {
  19. reader = IOUtil.getReader("corpus.txt", "UTF-8") ;
  20. ToAnalysis.parse("test 123 孙") ;
  21. pw = new PrintWriter("resultbig.txt");
  22. long start = System.currentTimeMillis() ;
  23. int allCount =0 ;
  24. int termcnt = 0;
  25. Set<String> set = new HashSet<String>();
  26. while((temp=reader.readLine())!=null){
  27. temp = temp.trim();
  28. if (temp.startsWith(TAG_START_CONTENT)) {
  29. int end = temp.indexOf(TAG_END_CONTENT);
  30. String content = temp.substring(TAG_START_CONTENT.length(), end);
  31. //System.out.println(content);
  32. if (content.length() > 0) {
  33. allCount += content.length() ;
  34. List<Term> result = ToAnalysis.parse(content);
  35. for (Term term: result) {
  36. String item = term.getName().trim();
  37. if (item.length() > 0) {
  38. termcnt++;
  39. pw.print(item.trim() + " ");
  40. set.add(item);
  41. }
  42. }
  43. pw.println();
  44. }
  45. }
  46. }
  47. long end = System.currentTimeMillis() ;
  48. System.out.println("共" + termcnt + "个term," + set.size() + "个不同的词,共 "
  49. +allCount+" 个字符,每秒处理了:"+(allCount*1000.0/(end-start)));
  50. } catch (IOException e) {
  51. e.printStackTrace();
  52. } finally {
  53. if (null != reader) {
  54. try {
  55. reader.close();
  56. } catch (IOException e) {
  57. e.printStackTrace();
  58. }
  59. }
  60. if (null != pw) {
  61. pw.close();
  62. }
  63. }
  64. }