liuyuqi-dellpc 6 years ago
parent
commit
a40570160e
5 changed files with 123 additions and 1 deletions
  1. 22 1
      README.md
  2. BIN
      bin/me/yoqi/participle/Participle.class
  3. 28 0
      pom.xml
  4. 2 0
      shell/rmTag.sh
  5. 71 0
      src/me/yoqi/participle/Participle.java

+ 22 - 1
README.md

@@ -1,3 +1,24 @@
+
 # ChineseParticiple
 
-word2vec对搜狗中文新闻进行聚类
+word2vec对搜狗中文新闻进行聚类
+
+(1)下载搜狗数据
+http://www.sogou.com/labs/sogoudownload/SogouCA/news_tensite_xml.full.zip
+
+(2)去除html标签
+cat news_tensite_xml.dat | iconv -f gbk -t utf-8 -c | grep "<content>"  > corpus.txt  
+
+(3)分词
+可以通过java包:ANSJ对文本分词。
+
+(4)训练word2vec词向量模型
+./word2vec -train resultbig.txt -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1  
+
+(5)计算距离
+./distance vectors.bin  
+
+(6)聚类
+./word2vec -train resultbig.txt -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500  
+
+sort classes.txt -k 2 -n > classes.sorted.txt

BIN
bin/me/yoqi/participle/Participle.class


+ 28 - 0
pom.xml

@@ -0,0 +1,28 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>ChineseParticiple</groupId>
+  <artifactId>ChineseParticiple</artifactId>
+  <version>0.0.1-SNAPSHOT</version>
+  <description>中文分词项目</description>
+  <build>
+    <sourceDirectory>src</sourceDirectory>
+    <plugins>
+      <plugin>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.5.1</version>
+        <configuration>
+          <source>1.7</source>
+          <target>1.7</target>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+  <dependencies>
+    <dependency>
+      <groupId>org.ansj</groupId>
+      <artifactId>ansj_seg</artifactId>
+      <version>5.1.1</version>
+      <!-- default (compile) scope: Participle.java imports org.ansj classes,
+           so the jar must be on the compile classpath; "runtime" scope would
+           make "mvn compile" fail with unresolved org.ansj imports -->
+    </dependency>
+  </dependencies>
+</project>

+ 2 - 0
shell/rmTag.sh

@@ -0,0 +1,2 @@
+#把news_tensite_xml.dat转换为utf8,然后取content内容存储到corpus.txt 
+cat news_tensite_xml.dat | iconv -f gbk -t utf-8 -c | grep "<content>"  > corpus.txt  

+ 71 - 0
src/me/yoqi/participle/Participle.java

@@ -0,0 +1,71 @@
+package me.yoqi.participle;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.ansj.domain.Term;
+import org.ansj.splitWord.analysis.ToAnalysis;
+import org.nlpcn.commons.lang.util.IOUtil;
+
+public class Participle {
+
+	public static final String TAG_START_CONTENT = "<content>";  
+    public static final String TAG_END_CONTENT = "</content>";  
+      
+    public static void main(String[] args) {  
+        String temp = null ;  
+          
+        BufferedReader reader = null;  
+        PrintWriter pw = null;  
+        try {  
+            reader = IOUtil.getReader("corpus.txt", "UTF-8") ;  
+            ToAnalysis.parse("test 123 孙") ;  
+            pw = new PrintWriter("resultbig.txt");  
+            long start = System.currentTimeMillis()  ;  
+            int allCount =0 ;  
+            int termcnt = 0;  
+            Set<String> set = new HashSet<String>();  
+            while((temp=reader.readLine())!=null){  
+                temp = temp.trim();  
+                if (temp.startsWith(TAG_START_CONTENT)) {  
+                    int end = temp.indexOf(TAG_END_CONTENT);  
+                    String content = temp.substring(TAG_START_CONTENT.length(), end);  
+                    //System.out.println(content);  
+                    if (content.length() > 0) {  
+                        allCount += content.length() ;  
+                        List<Term> result =  ToAnalysis.parse(content);  
+                        for (Term term: result) {  
+                            String item = term.getName().trim();  
+                            if (item.length() > 0) {  
+                                termcnt++;  
+                                pw.print(item.trim() + " ");  
+                                set.add(item);  
+                            }  
+                        }  
+                        pw.println();  
+                    }  
+                }  
+            }  
+            long end = System.currentTimeMillis() ;  
+            System.out.println("共" + termcnt + "个term," + set.size() + "个不同的词,共 "  
+                    +allCount+" 个字符,每秒处理了:"+(allCount*1000.0/(end-start)));  
+        } catch (IOException e) {   
+            e.printStackTrace();  
+        } finally {  
+            if (null != reader) {  
+                try {  
+                    reader.close();  
+                } catch (IOException e) {  
+                    e.printStackTrace();  
+                }  
+            }  
+            if (null != pw) {  
+                pw.close();  
+            }  
+        }  	
+
+}