liuyuqi-dellpc 7 years ago
commit
355c7b4780
5 changed files with 419 additions and 0 deletions
  1. 2 0
      .gitignore
  2. 36 0
      README.md
  3. 10 0
      requirements.txt
  4. 42 0
      src/Python/Untitled1.ipynb
  5. 329 0
      src/Python/movie-analysis.ipynb

+ 2 - 0
.gitignore

@@ -0,0 +1,2 @@
+/input
+.ipynb_checkpoints

+ 36 - 0
README.md

@@ -0,0 +1,36 @@
+# 深度学习在情感分析中的应用 
+
+《Keras快速上手:基于Python的深度学习实战》第七章
+## 赛题介绍
+* 首先下载http://ai.stanford.edu/~amaas/data/sentiment/中的数据。
+* 安装依赖:
+pip install numpy scipy
+pip install scikit-learn
+pip install pillow
+pip install h5py
+---
+## 项目介绍
+
+* 项目结构
+
+src/Python
+
+* 运行和结果
+安装依赖:   
+pip install -r requirements.txt 
+
+下载数据,然后解压放到input目录中。    
+aclImdb_v1.tar.gz(斯坦福 IMDB 影评情感数据集)    
+imdb.npz(Keras 预处理格式,notebook 中直接加载)   
+
+项目由 Jupyter Notebook 组成,执行顺序:    
+src/Python/movie-analysis.ipynb  导入数据,依次构建全连接、卷积、LSTM 情感分析模型并训练。   
+
+* 相关文档
+
+本文博客:[深度学习情感分析](http://blog.yoqi.me)  
+项目地址:[movie-analysis](http://git.yoqi.me:3000/lyq/movie-analysis) 

+ 10 - 0
requirements.txt

@@ -0,0 +1,10 @@
+ipython==5.1.0
+jupyter==1.0.0
+Markdown==2.6.9
+matplotlib==2.0.2
+notebook==5.1.0
+numpy==1.13.3
+opencv-python==3.3.0.10
+tensorflow==1.3.0
+tifffile==0.12.1
+Shapely==1.6.1

+ 42 - 0
src/Python/Untitled1.ipynb

@@ -0,0 +1,42 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "323\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(323)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

+ 329 - 0
src/Python/movie-analysis.ipynb

@@ -0,0 +1,329 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import csv\n",
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import matplotlib as mpt\n",
+    "\n",
+    "\n",
+    "# 设置编码\n",
+    "reload(sys)\n",
+    "sys.setdefaultencoding('utf8')\n",
+    "\n",
+    "%matplotlib inline\n",
+    "# 设置plt中文编码\n",
+    "plt.rcParams['font.sans-serif'] = ['SimHei']\n",
+    "#用来正常显示负号\n",
+    "plt.rcParams['axes.unicode_minus'] = False\n",
+    "\n",
+    "# 设置项目目录\n",
+    "os.getcwd()\n",
+    "os.chdir(\"/media/sf_share/linux/movie-analysis\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 多层全连接神经网络训练情感分析"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from keras.models import Sequential\n",
+    "from keras.layers import Dense\n",
+    "from keras.layers import Flatten\n",
+    "from keras.layers.embeddings import Embedding\n",
+    "from keras.preprocessing import sequence\n",
+    "import keras\n",
+    "import numpy as np\n",
+    "from keras.datasets import imdb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ACLIMDB_V1 = '/media/sf_share/linux/movie-analysis/input/imdb.npz'\n",
+    "(X_train, y_train), (X_test, y_test) = imdb.load_data(path=ACLIMDB_V1,\n",
+    "                                                      nb_words=None,\n",
+    "                                                      skip_top=0,\n",
+    "                                                      maxlen=None,\n",
+    "                                                      seed=113,\n",
+    "                                                      start_char=1,\n",
+    "                                                      oov_char=2,\n",
+    "                                                      index_from=3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 先看一看数据长什么样子的。输入命令:\n",
+    "X_train[0]\n",
+    "print(y_train[:10])\n",
+    "X_train.shape\n",
+    "# (25000,)\n",
+    "y_train.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 看一看平均每个评论有多少个字:\n",
+    "avg_len = list(map(len, X_train))\n",
+    "\n",
+    "# 可以看到平均字长为238.714。\n",
+    "np.mean(avg_len)\n",
+    "# 中位数178.0\n",
+    "np.median(avg_len)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 为了直观显示,这里画一个条形图(见图7.1):\n",
+    "\n",
+    "plt.hist(avg_len, bins = range(min(avg_len), max(avg_len) + 50, 50))\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 计算最长的文本长度:\n",
+    "m = max(list(map(len, X_train)), list(map(len, X_test)))\n",
+    "m"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 从中我们会发现有一个文本特别长,居然有2494 个字符。这种异常值需要排除,考虑到文本的平均长度为230 个字符,可以设定最多输入的文本长度为400 个字符,不足400 个字符的文本用空格填充,超过400 个字符的文本截取400 个字符,Keras 默认截取后400 个字符。\n",
+    "maxword = 400\n",
+    "X_train = sequence.pad_sequences(X_train, maxlen = maxword)\n",
+    "X_test = sequence.pad_sequences(X_test, maxlen = maxword)\n",
+    "vocab_size = np.max([np.max(X_train[i]) for i in range(X_train.shape[0])]) + 1\n",
+    "# 这里1 代表空格,其索引被认为是0。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 首先建立序列模型,逐步往上搭建网络。\n",
+    "model = Sequential()\n",
+    "model.add(Embedding(vocab_size, 64, input_length = maxword))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 第一层是嵌入层,定义了嵌入层的矩阵为vocab_size 64。每个训练段落为其中的maxword 64 矩阵,作为数据的输入,填入输入层。\n",
+    "\n",
+    "model.add(Flatten())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 把输入层压平,原来是maxword × 64 的矩阵,现在变成一维的长度为maxword × 64的向量。\n",
+    "\n",
+    "# 接下来不断搭建全连接神经网络,使用relu 函数。relu 是简单的非线性函数:f(x) =max(0; x)。注意到神经网络的本质是把输入进行非线性变换。\n",
+    "\n",
+    "model.add(Dense(2000, activation = 'relu'))\n",
+    "model.add(Dense(500, activation = 'relu'))\n",
+    "model.add(Dense(200, activation = 'relu'))\n",
+    "model.add(Dense(50, activation = 'relu'))\n",
+    "model.add(Dense(1, activation = 'sigmoid'))\n",
+    "# 这里最后一层用Sigmoid,预测0,1 变量的概率,类似于logistic regression 的链接函数,目的是把线性变成非线性,并把目标值控制在0~1。因此这里计算的是最后输出的是0 或者1 的概率。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])\n",
+    "print(model.summary())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keras 提供的建模API 让我们既能训练数据,又能在验证数据时看到模型测试效果。\n",
+    "\n",
+    "model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 20,batch_size = 100, verbose = 1)\n",
+    "score = model.evaluate(X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 卷积神经网络训练情感分析\n",
+    "全连接神经网络几乎对网络模型没有任何限制,但缺点是过度拟合,即拟合了过多噪声。全连接神经网络模型的特点是灵活、参数多。在实际应用中,我们可能会对模型加上一些限制,使其适合数据的特点。并且由于模型的限制,其参数会大幅减少。这降低了模型的复杂度,模型的普适性进而会提高。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 接下来介绍如何利用Keras 搭建卷积神经网络来处理情感分析的分类问题。下面的代码构造了卷积神经网络的结构。\n",
+    "\n",
+    "from keras.layers import Dense, Dropout, Activation, Flatten , Conv1D, MaxPooling1D\n",
+    "model = Sequential()\n",
+    "model.add(Embedding(vocab_size, 64, input_length = maxword))\n",
+    "model.add(Conv1D(filters = 64, kernel_size = 3, padding = 'same', activation= 'relu'))\n",
+    "model.add(MaxPooling1D(pool_size = 2))\n",
+    "model.add(Dropout(0.25))\n",
+    "model.add(Conv1D(filters = 128, kernel_size = 3, padding = 'same',activation= 'relu'))\n",
+    "model.add(MaxPooling1D(pool_size = 2))\n",
+    "model.add(Dropout(0.25))\n",
+    "model.add(Flatten())\n",
+    "model.add(Dense(64, activation = 'relu'))\n",
+    "model.add(Dense(32, activation = 'relu'))\n",
+    "model.add(Dense(1, activation = 'sigmoid'))\n",
+    "model.compile(loss = 'binary_crossentropy', optimizer = 'rmsprop', metrics =['accuracy'])\n",
+    "print(model.summary())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 下面对模型进行拟合。\n",
+    "\n",
+    "model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 20,batch_size = 100)\n",
+    "scores = model.evaluate(X_test, y_test, verbose = 1)\n",
+    "print(scores)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 循环神经网络训练情感分析\n",
+    "LSTM 是循环神经网络的一种。本质上,它按照时间顺序,把信息进行有效的整合和筛选,有的信息得到保留,有的信息被丢弃。在时间t,你获得到的信息(比如对段落文字的理解)理所应当会包含之前的信息(之前提到的事件、人物等)。LSTM 说,根据我手里的训练数据,我得找出一个方法来如何进行有效的信息取舍,从而把最有价值的信息保留到最后。那么最自然的想法是总结出一个规律用来处理前一时刻的信息。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 构造LSTM 神经网络的结构可以使用如下的代码。\n",
+    "\n",
+    "from keras.layers import LSTM\n",
+    "model = Sequential()\n",
+    "model.add(Embedding(vocab_size, 64, input_length = maxword))\n",
+    "model.add(LSTM(128, return_sequences=True))\n",
+    "model.add(Dropout(0.2))\n",
+    "model.add(LSTM(64, return_sequences=True))\n",
+    "model.add(Dropout(0.2))\n",
+    "model.add(LSTM(32))\n",
+    "model.add(Dropout(0.2))\n",
+    "model.add(Dense(1, activation = 'sigmoid'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 然后把模型打包。\n",
+    "\n",
+    "model.compile(loss = 'binary_crossentropy', optimizer = 'rmsprop', metrics =['accuracy'])\n",
+    "print(model.summary())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 最后输入数据集训练模型。\n",
+    "\n",
+    "model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 5,batch_size = 100)\n",
+    "scores = model.evaluate(X_test, y_test)\n",
+    "print(scores)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}