liuyuqi-dellpc 6 years ago
parent
commit
a40570160e
5 changed files with 123 additions and 1 deletions
  1. 22 1
      README.md
  2. BIN
      bin/me/yoqi/participle/Participle.class
  3. 28 0
      pom.xml
  4. 2 0
      shell/rmTag.sh
  5. 71 0
      src/me/yoqi/participle/Participle.java

+ 22 - 1
README.md

@@ -1,3 +1,24 @@
+
 # ChineseParticiple
 
-word2vec对搜狗中文新闻进行聚类
+word2vec对搜狗中文新闻进行聚类
+
+(1)下载搜狗数据
+http://www.sogou.com/labs/sogoudownload/SogouCA/news_tensite_xml.full.zip
+
+(2)去除html标签
+cat news_tensite_xml.dat | iconv -f gbk -t utf-8 -c | grep "<content>"  > corpus.txt  
+
+(3)分词
+可以通过java包:ANSJ对文本分词。
+
+(4)训练word2vec词向量模型
+./word2vec -train resultbig.txt -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1  
+
+(5)计算距离
+./distance vectors.bin  
+
+(6)聚类
+./word2vec -train resultbig.txt -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500  
+
+sort classes.txt -k 2 -n > classes.sorted.txt

BIN
bin/me/yoqi/participle/Participle.class


+ 28 - 0
pom.xml

@@ -0,0 +1,28 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>ChineseParticiple</groupId>
+  <artifactId>ChineseParticiple</artifactId>
+  <version>0.0.1-SNAPSHOT</version>
+  <description>中文分词项目</description>
+  <build>
+    <sourceDirectory>src</sourceDirectory>
+    <plugins>
+      <plugin>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.5.1</version>
+        <configuration>
+          <source>1.7</source>
+          <target>1.7</target>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+  <dependencies>
+    <dependency>
+      <groupId>org.ansj</groupId>
+      <artifactId>ansj_seg</artifactId>
+      <version>5.1.1</version>
+      <!-- default (compile) scope: Participle.java imports org.ansj classes,
+           so the jar must be on the compile classpath; "runtime" scope would
+           make "mvn compile" fail with unresolved org.ansj imports -->
+    </dependency>
+  </dependencies>
+</project>

+ 2 - 0
shell/rmTag.sh

@@ -0,0 +1,2 @@
+#把news_tensite_xml.dat转换为utf8,然后取content内容存储到corpus.txt 
+cat news_tensite_xml.dat | iconv -f gbk -t utf-8 -c | grep "<content>"  > corpus.txt  

+ 71 - 0
src/me/yoqi/participle/Participle.java

@@ -0,0 +1,71 @@
+package me.yoqi.participle;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.ansj.domain.Term;
+import org.ansj.splitWord.analysis.ToAnalysis;
+import org.nlpcn.commons.lang.util.IOUtil;
+
+public class Participle {
+
+	public static final String TAG_START_CONTENT = "<content>";  
+    public static final String TAG_END_CONTENT = "</content>";  
+      
+    public static void main(String[] args) {  
+        String temp = null ;  
+          
+        BufferedReader reader = null;  
+        PrintWriter pw = null;  
+        try {  
+            reader = IOUtil.getReader("corpus.txt", "UTF-8") ;  
+            ToAnalysis.parse("test 123 孙") ;  
+            pw = new PrintWriter("resultbig.txt");  
+            long start = System.currentTimeMillis()  ;  
+            int allCount =0 ;  
+            int termcnt = 0;  
+            Set<String> set = new HashSet<String>();  
+            while((temp=reader.readLine())!=null){  
+                temp = temp.trim();  
+                if (temp.startsWith(TAG_START_CONTENT)) {  
+                    int end = temp.indexOf(TAG_END_CONTENT);  
+                    String content = temp.substring(TAG_START_CONTENT.length(), end);  
+                    //System.out.println(content);  
+                    if (content.length() > 0) {  
+                        allCount += content.length() ;  
+                        List<Term> result =  ToAnalysis.parse(content);  
+                        for (Term term: result) {  
+                            String item = term.getName().trim();  
+                            if (item.length() > 0) {  
+                                termcnt++;  
+                                pw.print(item.trim() + " ");  
+                                set.add(item);  
+                            }  
+                        }  
+                        pw.println();  
+                    }  
+                }  
+            }  
+            long end = System.currentTimeMillis() ;  
+            System.out.println("共" + termcnt + "个term," + set.size() + "个不同的词,共 "  
+                    +allCount+" 个字符,每秒处理了:"+(allCount*1000.0/(end-start)));  
+        } catch (IOException e) {   
+            e.printStackTrace();  
+        } finally {  
+            if (null != reader) {  
+                try {  
+                    reader.close();  
+                } catch (IOException e) {  
+                    e.printStackTrace();  
+                }  
+            }  
+            if (null != pw) {  
+                pw.close();  
+            }  
+        }  	
+
+}