|
@@ -0,0 +1,329 @@
|
|
|
+{
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": []
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import csv\n",
|
|
|
+ "import sys\n",
|
|
|
+ "import os\n",
|
|
|
+ "\n",
|
|
|
+ "import matplotlib.pyplot as plt\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "import matplotlib as mpt\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "# 设置编码\n",
|
|
|
+ "reload(sys)\n",
|
|
|
+ "sys.setdefaultencoding('utf8')\n",
|
|
|
+ "\n",
|
|
|
+ "%matplotlib inline\n",
|
|
|
+ "# 设置plt中文编码\n",
|
|
|
+ "plt.rcParams['font.sans-serif'] = ['SimHei']\n",
|
|
|
+ "#用来正常显示负号\n",
|
|
|
+ "plt.rcParams['axes.unicode_minus'] = False\n",
|
|
|
+ "\n",
|
|
|
+ "# 设置项目目录\n",
|
|
|
+ "os.getcwd()\n",
|
|
|
+ "os.chdir(\"/media/sf_share/linux/movie-analysis\")"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "## 多层全连接神经网络训练情感分析"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "from keras.models import Sequential\n",
|
|
|
+ "from keras.layers import Dense\n",
|
|
|
+ "from keras.layers import Flatten\n",
|
|
|
+ "from keras.layers.embeddings import Embedding\n",
|
|
|
+ "from keras.preprocessing import sequence\n",
|
|
|
+ "import keras\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "from keras.datasets import imdb"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "ACLIMDB_V1 = '/media/sf_share/linux/movie-analysis/input/imdb.npz'\n",
|
|
|
+ "(X_train, y_train), (X_test, y_test) = imdb.load_data(path=ACLIMDB_V1,\n",
|
|
|
+ " nb_words=None,\n",
|
|
|
+ " skip_top=0,\n",
|
|
|
+ " maxlen=None,\n",
|
|
|
+ " seed=113,\n",
|
|
|
+ " start_char=1,\n",
|
|
|
+ " oov_char=2,\n",
|
|
|
+ " index_from=3)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 先看一看数据长什么样子的。输入命令:\n",
|
|
|
+ "X_train[0]\n",
|
|
|
+ "print(y_train[:10])\n",
|
|
|
+ "X_train.shape\n",
|
|
|
+ "# (25000,)\n",
|
|
|
+ "y_train.shape"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 看一看平均每个评论有多少个字:\n",
|
|
|
+ "avg_len = list(map(len, X_train))\n",
|
|
|
+ "\n",
|
|
|
+ "# 可以看到平均字长为238.714。\n",
|
|
|
+ "np.mean(avg_len)\n",
|
|
|
+ "# 中位数178.0\n",
|
|
|
+ "np.median(avg_len)\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 为了直观显示,这里画一个直方图(见图7.1):\n",
|
|
|
+ "\n",
|
|
|
+ "plt.hist(avg_len, bins = range(min(avg_len), max(avg_len) + 50, 50))\n",
|
|
|
+ "plt.show()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
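+ "# 计算最长的文本长度。注意:max 作用于两个列表时是按字典序比较这两个列表,返回的是其中一个列表而不是最大长度;应写成 max(max(map(len, X_train)), max(map(len, X_test)))。\n",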
|
|
|
+ "m = max(list(map(len, X_train)), list(map(len, X_test)))\n",
|
|
|
+ "m"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 从中我们会发现有一个文本特别长,居然有2494 个字符。这种异常值需要排除,考虑到文本的平均长度约为239 个字符,可以设定最多输入的文本长度为400 个字符,不足400 个字符的文本在开头用0(代表空格)填充,超过400 个字符的文本只保留400 个字符,Keras 默认从开头截断、保留最后400 个字符。\n",
|
|
|
+ "maxword = 400\n",
|
|
|
+ "X_train = sequence.pad_sequences(X_train, maxlen = maxword)\n",
|
|
|
+ "X_test = sequence.pad_sequences(X_test, maxlen = maxword)\n",
|
|
|
+ "vocab_size = np.max([np.max(X_train[i]) for i in range(X_train.shape[0])]) + 1\n",
|
|
|
+ "# 这里加1 是因为索引0 用于填充(代表空格),词表大小应为最大索引加1。"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 首先建立序列模型,逐步往上搭建网络。\n",
|
|
|
+ "model = Sequential()\n",
|
|
|
+ "model.add(Embedding(vocab_size, 64, input_length = maxword))"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 第一层是嵌入层,定义了嵌入层的矩阵为vocab_size × 64。每个训练段落为其中的maxword × 64 矩阵,作为数据的输入,填入输入层。\n",
|
|
|
+ "\n",
|
|
|
+ "model.add(Flatten())"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 把输入层压平,原来是maxword × 64 的矩阵,现在变成一维的长度为maxword × 64的向量。\n",
|
|
|
+ "\n",
|
|
|
+ "# 接下来不断搭建全连接神经网络,使用relu 函数。relu 是简单的非线性函数:f(x) = max(0, x)。注意到神经网络的本质是把输入进行非线性变换。\n",
|
|
|
+ "\n",
|
|
|
+ "model.add(Dense(2000, activation = 'relu'))\n",
|
|
|
+ "model.add(Dense(500, activation = 'relu'))\n",
|
|
|
+ "model.add(Dense(200, activation = 'relu'))\n",
|
|
|
+ "model.add(Dense(50, activation = 'relu'))\n",
|
|
|
+ "model.add(Dense(1, activation = 'sigmoid'))\n",
|
|
|
+ "# 这里最后一层用Sigmoid,预测0,1 变量的概率,类似于logistic regression 的链接函数,目的是把线性变成非线性,并把目标值控制在0~1。因此这里最后输出的是标签为1 的概率。"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])\n",
|
|
|
+ "print(model.summary())"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Keras 提供的建模API 让我们既能训练数据,又能在验证数据时看到模型测试效果。\n",
|
|
|
+ "\n",
|
|
|
+ "model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 20,batch_size = 100, verbose = 1)\n",
|
|
|
+ "score = model.evaluate(X_test, y_test)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "## 卷积神经网络训练情感分析\n",
|
|
|
+ "全连接神经网络几乎对网络模型没有任何限制,但缺点是容易过度拟合,即拟合了过多噪声。全连接神经网络模型的特点是灵活、参数多。在实际应用中,我们可能会对模型加上一些限制,使其适合数据的特点。并且由于模型的限制,其参数会大幅减少。这降低了模型的复杂度,模型的普适性进而会提高。"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 接下来介绍如何利用Keras 搭建卷积神经网络来处理情感分析的分类问题。下面的代码构造了卷积神经网络的结构。\n",
|
|
|
+ "\n",
|
|
|
+ "from keras.layers import Dense, Dropout, Activation, Flatten , Conv1D, MaxPooling1D\n",
|
|
|
+ "model = Sequential()\n",
|
|
|
+ "model.add(Embedding(vocab_size, 64, input_length = maxword))\n",
|
|
|
+ "model.add(Conv1D(filters = 64, kernel_size = 3, padding = 'same', activation= 'relu'))\n",
|
|
|
+ "model.add(MaxPooling1D(pool_size = 2))\n",
|
|
|
+ "model.add(Dropout(0.25))\n",
|
|
|
+ "model.add(Conv1D(filters = 128, kernel_size = 3, padding = 'same',activation= 'relu'))\n",
|
|
|
+ "model.add(MaxPooling1D(pool_size = 2))\n",
|
|
|
+ "model.add(Dropout(0.25))\n",
|
|
|
+ "model.add(Flatten())\n",
|
|
|
+ "model.add(Dense(64, activation = 'relu'))\n",
|
|
|
+ "model.add(Dense(32, activation = 'relu'))\n",
|
|
|
+ "model.add(Dense(1, activation = 'sigmoid'))\n",
|
|
|
+ "model.compile(loss = 'binary_crossentropy', optimizer = 'rmsprop', metrics =['accuracy'])\n",
|
|
|
+ "print(model.summary())\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 下面对模型进行拟合。\n",
|
|
|
+ "\n",
|
|
|
+ "model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 20,batch_size = 100)\n",
|
|
|
+ "scores = model.evaluate(X_test, y_test, verbose = 1)\n",
|
|
|
+ "print(scores)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "## 循环神经网络训练情感分析\n",
|
|
|
+ "LSTM 是循环神经网络的一种。本质上,它按照时间顺序,把信息进行有效的整合和筛选,有的信息得到保留,有的信息被丢弃。在时间t,你获得的信息(比如对段落文字的理解)理所应当会包含之前的信息(之前提到的事件、人物等)。LSTM 的思路是:根据手里的训练数据,找出一个方法来进行有效的信息取舍,从而把最有价值的信息保留到最后。那么最自然的想法是总结出一个规律用来处理前一时刻的信息。"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 构造LSTM 神经网络的结构可以使用如下的代码。\n",
|
|
|
+ "\n",
|
|
|
+ "from keras.layers import LSTM\n",
|
|
|
+ "model = Sequential()\n",
|
|
|
+ "model.add(Embedding(vocab_size, 64, input_length = maxword))\n",
|
|
|
+ "model.add(LSTM(128, return_sequences=True))\n",
|
|
|
+ "model.add(Dropout(0.2))\n",
|
|
|
+ "model.add(LSTM(64, return_sequences=True))\n",
|
|
|
+ "model.add(Dropout(0.2))\n",
|
|
|
+ "model.add(LSTM(32))\n",
|
|
|
+ "model.add(Dropout(0.2))\n",
|
|
|
+ "model.add(Dense(1, activation = 'sigmoid'))"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 然后把模型打包。\n",
|
|
|
+ "\n",
|
|
|
+ "model.compile(loss = 'binary_crossentropy', optimizer = 'rmsprop', metrics =['accuracy'])\n",
|
|
|
+ "print(model.summary())"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 最后输入数据集训练模型。\n",
|
|
|
+ "\n",
|
|
|
+ "model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 5,batch_size = 100)\n",
|
|
|
+ "scores = model.evaluate(X_test, y_test)\n",
|
|
|
+ "print(scores)"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "kernelspec": {
|
|
|
+ "display_name": "Python 2",
|
|
|
+ "language": "python",
|
|
|
+ "name": "python2"
|
|
|
+ },
|
|
|
+ "language_info": {
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 2
|
|
|
+ },
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "nbconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython2",
|
|
|
+ "version": "2.7.12"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 2
|
|
|
+}
|