|
@@ -0,0 +1,429 @@
|
|
|
+{
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 2,
|
|
|
+ "metadata": {
|
|
|
+ "language_info": {
|
|
|
+ "name": "python",
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 3
|
|
|
+ },
|
|
|
+ "version": "3.6.0-final"
|
|
|
+ },
|
|
|
+ "orig_nbformat": 2,
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "npconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython3",
|
|
|
+ "version": 3,
|
|
|
+ "kernelspec": {
|
|
|
+ "name": "python36064bitrootconda12dcd85ef9c147fdbdf4c10492696076",
|
|
|
+ "display_name": "Python 3.6.0 64-bit ('root': conda)"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import numpy as np\n",
|
|
|
+ "import pandas as pd\n",
|
|
|
+ "import matplotlib.pyplot as plt\n",
|
|
|
+ "import seaborn as sns\n",
|
|
|
+ "import re,os,sys\n",
|
|
|
+ ""
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "\n",
|
|
|
+ "df =pd.read_excel(\"data/李子柒螺蛳粉评论.xlsx\")\n",
|
|
|
+ "df.head()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 去除重复值\n",
|
|
|
+ "df.drop_duplicates(inplace=True)\n",
|
|
|
+ "df.info() "
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "## 时间-热度分析\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 时间走势图\n",
|
|
|
+ "df['comment_time'] = pd.to_datetime(df['comment_time'])\n",
|
|
|
+ "df['comment_date'] = df['comment_time'].dt.date\n",
|
|
|
+ "comment_num = df['comment_date'].value_counts().sort_index()\n",
|
|
|
+ "\n",
|
|
|
+ "from pyecharts.charts import Line\n",
|
|
|
+ "from pyecharts import options as opts \n",
|
|
|
+ "\n",
|
|
|
+ "# 折线图\n",
|
|
|
+ "line1 = Line(init_opts=opts.InitOpts(width='1350px', height='750px'))\n",
|
|
|
+ "line1.add_xaxis(comment_num.index.tolist())\n",
|
|
|
+ "line1.add_yaxis('热度', comment_num.values.tolist(),\n",
|
|
|
+ " areastyle_opts=opts.AreaStyleOpts(opacity=0.5),\n",
|
|
|
+ " label_opts=opts.LabelOpts(is_show=False))\n",
|
|
|
+ "line1.set_global_opts(title_opts=opts.TitleOpts(title='商品评价数量走势图'), \n",
|
|
|
+ " xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate='30')),\n",
|
|
|
+ " toolbox_opts=opts.ToolboxOpts(),\n",
|
|
|
+ " visualmap_opts=opts.VisualMapOpts(max_=400))\n",
|
|
|
+ "line1.set_series_opts(linestyle_opts=opts.LineStyleOpts(width=3))\n",
|
|
|
+ "line1.render() \n",
|
|
|
+ ""
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "def _has_any(text, keywords):\n",
+ "    \"\"\"Return True when at least one keyword is a substring of text.\"\"\"\n",
+ "    return any(keyword in text for keyword in keywords)\n",
+ "\n",
+ "\n",
+ "def judge_comment(df, result):\n",
+ "    \"\"\"Flag the topics each comment mentions and the sentiment expressed.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    df : pandas.DataFrame\n",
+ "        Comment table, one row per comment.\n",
+ "    result : iterable of str\n",
+ "        Comment texts in the same row order as df (e.g. df.content).\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    pandas.DataFrame\n",
+ "        df with 13 extra float columns; 1.0 means the topic/sentiment was\n",
+ "        detected by keyword matching, 0.0 means it was not.\n",
+ "    \"\"\"\n",
+ "    columns = ['品牌', '物流正面', '物流负面', '包装正面', '包装负面', '原料正面',\n",
+ "               '原料负面', '口感正面', '口感负面', '日期正面', '日期负面',\n",
+ "               '性价比正面', '性价比负面']\n",
+ "    col = {name: pos for pos, name in enumerate(columns)}\n",
+ "\n",
+ "    # Fill a plain array and wrap it once at the end: chained writes such as\n",
+ "    # judges.iloc[i][name] = 1 may land on a temporary copy and be lost.\n",
+ "    flags = np.zeros((len(df), len(columns)))\n",
+ "\n",
+ "    # enumerate() walks the texts by position, so rows still line up when\n",
+ "    # df.index has gaps (e.g. after drop_duplicates).\n",
+ "    for row, word in enumerate(result):\n",
+ "        # A missing comment (NaN) has no text to match; leave its row at zero.\n",
+ "        if not isinstance(word, str):\n",
+ "            continue\n",
+ "\n",
+ "        # Brand: mentions are almost always positive, so only the mention\n",
+ "        # itself is counted, not its sentiment.\n",
+ "        if _has_any(word, ('李子柒', '子柒', '小柒', '李子七', '小七')):\n",
+ "            flags[row, col['品牌']] = 1\n",
+ "\n",
+ "        # Logistics: first check the topic, then the sentiment.\n",
+ "        if _has_any(word, ('物流', '快递', '配送', '取货')):\n",
+ "            if _has_any(word, ('好', '不错', '棒', '满意', '迅速')):\n",
+ "                flags[row, col['物流正面']] = 1\n",
+ "            elif _has_any(word, ('慢', '龟速', '暴力', '差')):\n",
+ "                flags[row, col['物流负面']] = 1\n",
+ "\n",
+ "        # Packaging.\n",
+ "        if _has_any(word, ('包装', '盒子', '袋子', '外观')):\n",
+ "            if _has_any(word, ('高端', '大气', '还行', '完整', '好',\n",
+ "                               '严实', '紧', '精致')):\n",
+ "                flags[row, col['包装正面']] = 1\n",
+ "            elif _has_any(word, ('破', '破损', '瘪', '简陋')):\n",
+ "                flags[row, col['包装负面']] = 1\n",
+ "\n",
+ "        # Ingredients: rice noodles, broth and toppings.\n",
+ "        if _has_any(word, ('米粉', '汤', '配料', '腐竹', '花生')):\n",
+ "            if _has_any(word, ('劲道', '多', '足', '香', '才', '脆', 'nice')):\n",
+ "                flags[row, col['原料正面']] = 1\n",
+ "            elif _has_any(word, ('小', '少', '没')):\n",
+ "                flags[row, col['原料负面']] = 1\n",
+ "\n",
+ "        # Taste, when the comment names the topic explicitly.\n",
+ "        if _has_any(word, ('口味', '味道', '口感', '吃起来')):\n",
+ "            # '不鲜' contains the positive keyword '鲜', so it has to be\n",
+ "            # tested first or it could never match.\n",
+ "            if '不鲜' in word:\n",
+ "                flags[row, col['口感负面']] = 1\n",
+ "            elif _has_any(word, ('不错', '浓鲜', '十足', '鲜',\n",
+ "                                 '可以', '喜欢', '符合')):\n",
+ "                flags[row, col['口感正面']] = 1\n",
+ "            elif _has_any(word, ('不好', '不行', '太烂')):\n",
+ "                flags[row, col['口感负面']] = 1\n",
+ "\n",
+ "        # Taste verdicts that need no topic word. The negative check comes\n",
+ "        # first because '不好吃' contains '好吃'.\n",
+ "        if _has_any(word, ('难吃', '不好吃')):\n",
+ "            flags[row, col['口感负面']] = 1\n",
+ "        elif _has_any(word, ('好吃', '香')):\n",
+ "            flags[row, col['口感正面']] = 1\n",
+ "\n",
+ "        # Production date / freshness. '过期' (expired) counts as negative\n",
+ "        # even when no date topic word is present.\n",
+ "        if _has_any(word, ('日期', '时间', '保质期')):\n",
+ "            if '新鲜' in word:\n",
+ "                flags[row, col['日期正面']] = 1\n",
+ "            elif _has_any(word, ('久', '长')):\n",
+ "                flags[row, col['日期负面']] = 1\n",
+ "        elif '过期' in word:\n",
+ "            flags[row, col['日期负面']] = 1\n",
+ "\n",
+ "        # Value for money. '不划算' / '不便宜' contain the positive keywords\n",
+ "        # '划算' / '便宜', so they are tested before the positive list;\n",
+ "        # '不贵' is positive and therefore tested before the bare '贵'.\n",
+ "        if _has_any(word, ('不划算', '不便宜')):\n",
+ "            flags[row, col['性价比负面']] = 1\n",
+ "        elif _has_any(word, ('划算', '便宜', '赚了', '囤货', '超值', '太值',\n",
+ "                             '物美价廉', '实惠', '性价比高', '不贵')):\n",
+ "            flags[row, col['性价比正面']] = 1\n",
+ "        elif _has_any(word, ('贵', '不值', '亏了')):\n",
+ "            flags[row, col['性价比负面']] = 1\n",
+ "\n",
+ "    # Reuse df's own index so the column-wise concat pairs each comment with\n",
+ "    # its flags instead of aligning on mismatched labels.\n",
+ "    judges = pd.DataFrame(flags, index=df.index, columns=columns)\n",
+ "    return pd.concat([df, judges], axis=1)\n",
|
|
|
+ ""
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 得到数据框\n",
|
|
|
+ "judge = judge_comment(df, result=df.content)\n",
|
|
|
+ "judge.head() \n",
|
|
|
+ ""
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Summarise how often each topic/sentiment flag was raised.\n",
+ "# The indicator columns are selected by label, starting at '品牌' (the first\n",
+ "# indicator column), rather than by assuming the comment table has exactly\n",
+ "# five columns in front of them.\n",
+ "rank = judge.loc[:, '品牌':].sum().reset_index().sort_values(0, ascending=False)\n",
+ "rank.columns = ['分类', '提及次数']\n",
+ "rank['占比'] = rank['提及次数'] / rank['提及次数'].sum()\n",
+ "# Drop the two-character sentiment suffix to get the parent topic.\n",
+ "rank['高级分类'] = rank['分类'].str[:-2]\n",
+ "rank"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "rank.loc[0, '高级分类'] = '品牌'\n",
|
|
|
+ "rank \n",
|
|
|
+ ""
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "df.shape"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "rank_num = rank.groupby('高级分类')['提及次数'].sum().sort_values(ascending=False)\n",
|
|
|
+ "rank_num\n",
|
|
|
+ ""
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "\n",
|
|
|
+ "data_pair = [list(z) for z in zip(rank_num.index, rank_num.values)]\n",
|
|
|
+ "data_pair"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "from pyecharts.charts import Pie\n",
|
|
|
+ "\n",
|
|
|
+ "pie1 = Pie(init_opts=opts.InitOpts(width='1350px', height='750px'))\n",
|
|
|
+ "pie1.add( \n",
|
|
|
+ " series_name=\"num\",\n",
|
|
|
+ " radius=[\"35%\", \"55%\"],\n",
|
|
|
+ " data_pair=data_pair,\n",
|
|
|
+ " label_opts=opts.LabelOpts(\n",
|
|
|
+ " position=\"outside\",\n",
|
|
|
+ " formatter=\"{a|{a}}{abg|}\\n{hr|}\\n {b|{b}: }{c} {per|{d}%} \",\n",
|
|
|
+ " background_color=\"#eee\",\n",
|
|
|
+ " border_color=\"#aaa\",\n",
|
|
|
+ " border_width=1,\n",
|
|
|
+ " border_radius=4,\n",
|
|
|
+ " rich={\n",
|
|
|
+ " \"a\": {\"color\": \"#999\", \"lineHeight\": 22, \"align\": \"center\"},\n",
|
|
|
+ " \"abg\": {\n",
|
|
|
+ " \"backgroundColor\": \"#e3e3e3\",\n",
|
|
|
+ " \"width\": \"100%\",\n",
|
|
|
+ " \"align\": \"right\",\n",
|
|
|
+ " \"height\": 22,\n",
|
|
|
+ " \"borderRadius\": [4, 4, 0, 0],\n",
|
|
|
+ " },\n",
|
|
|
+ " \"hr\": {\n",
|
|
|
+ " \"borderColor\": \"#aaa\",\n",
|
|
|
+ " \"width\": \"100%\",\n",
|
|
|
+ " \"borderWidth\": 0.5,\n",
|
|
|
+ " \"height\": 0,\n",
|
|
|
+ " },\n",
|
|
|
+ " \"b\": {\"fontSize\": 16, \"lineHeight\": 33},\n",
|
|
|
+ " \"per\": {\n",
|
|
|
+ " \"color\": \"#eee\",\n",
|
|
|
+ " \"backgroundColor\": \"#334455\",\n",
|
|
|
+ " \"padding\": [2, 4],\n",
|
|
|
+ " \"borderRadius\": 2,\n",
|
|
|
+ " },\n",
|
|
|
+ " },\n",
|
|
|
+ " ),\n",
|
|
|
+ ")\n",
|
|
|
+ "pie1.set_global_opts(legend_opts=opts.LegendOpts(pos_left=\"left\", pos_top='30%', orient=\"vertical\"), \n",
|
|
|
+ " toolbox_opts=opts.ToolboxOpts(),\n",
|
|
|
+ " title_opts=opts.TitleOpts(title='消费者关注占比分布'))\n",
|
|
|
+ "pie1.set_series_opts(\n",
|
|
|
+ " tooltip_opts=opts.TooltipOpts(trigger=\"item\", formatter=\"{a} <br/>{b}: {c} ({d}%)\")\n",
|
|
|
+ " )\n",
|
|
|
+ "pie1.render()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "from pyecharts import options as opts\n",
|
|
|
+ "from pyecharts.charts import Bar\n",
|
|
|
+ "from pyecharts.commons.utils import JsCode\n",
|
|
|
+ "from pyecharts.globals import ThemeType\n",
|
|
|
+ "\n",
|
|
|
+ "list2 = [\n",
|
|
|
+ " {\"value\": 1484.0, \"percent\": 1484.0 / (1484.0 + 27.0)},\n",
|
|
|
+ " {\"value\": 692.0, \"percent\": 692.0 / (692.0 + 3.0)},\n",
|
|
|
+ " {\"value\": 539.0, \"percent\": 539.0 / (539.0 + 63.0)},\n",
|
|
|
+ " {\"value\": 422.0, \"percent\": 422.0 / (422.0 + 0)},\n",
|
|
|
+ " {\"value\": 142.0, \"percent\": 142.0 / (142.0 + 66.0)},\n",
|
|
|
+ " {\"value\": 124.0, \"percent\": 124.0 / (124.0 + 22.0)},\n",
|
|
|
+ " {\"value\": 58.0, \"percent\": 58.0 / (58.0 + 3.0)},\n",
|
|
|
+ "]\n",
|
|
|
+ "\n",
|
|
|
+ "list3 = [\n",
|
|
|
+ " {\"value\": 27.0, \"percent\": 27.0 / (27.0 + 1484.0)},\n",
|
|
|
+ " {\"value\": 3.0, \"percent\": 3.0 / (3.0 + 692.0)},\n",
|
|
|
+ " {\"value\": 63.0, \"percent\": 63.0 / (63.0 + 539.0)},\n",
|
|
|
+ " {\"value\": 0, \"percent\": 0 / (0 + 422.0)},\n",
|
|
|
+ " {\"value\": 66.0, \"percent\": 66.0 / (66.0 + 142.0)},\n",
|
|
|
+ " {\"value\": 22.0, \"percent\": 22.0 / (22.0 + 124.0)},\n",
|
|
|
+ " {\"value\": 3.0, \"percent\": 3.0 / (3.0 + 58.0)},\n",
|
|
|
+ "]\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "bar1 = Bar(init_opts=opts.InitOpts(width='1350px', height='750px', theme=ThemeType.LIGHT))\n",
|
|
|
+ "bar1.add_xaxis(['口感', '包装', '原料', '品牌', '日期', '性价比', '物流'])\n",
|
|
|
+ "bar1.add_yaxis(\"正面评论\", list2, stack=\"stack1\", category_gap=\"50%\")\n",
|
|
|
+ "bar1.add_yaxis(\"负面评论\", list3, stack=\"stack1\", category_gap=\"50%\")\n",
|
|
|
+ "bar1.set_global_opts(title_opts=opts.TitleOpts(title='关注点细分占比分布')) \n",
|
|
|
+ "bar1.set_series_opts(\n",
|
|
|
+ " label_opts=opts.LabelOpts(\n",
|
|
|
+ " position=\"right\",\n",
|
|
|
+ " formatter=JsCode(\n",
|
|
|
+ " \"function(x){return Number(x.data.percent * 100).toFixed() + '%';}\"\n",
|
|
|
+ " ),\n",
|
|
|
+ " )\n",
|
|
|
+ " )\n",
|
|
|
+ "bar1.render()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import jieba \n",
|
|
|
+ "import jieba.analyse\n",
|
|
|
+ "\n",
|
|
|
+ "txt = df['content'].str.cat(sep='。')\n",
|
|
|
+ "\n",
|
|
|
+ "# 添加关键词\n",
|
|
|
+ "jieba.add_word('李子柒') \n",
|
|
|
+ "\n",
|
|
|
+ "# 读入停用词表\n",
|
|
|
+ "stop_words = []\n",
|
|
|
+ "with open('stop_words.txt', 'r', encoding='utf-8') as f:\n",
|
|
|
+ " lines = f.readlines()\n",
|
|
|
+ " for line in lines:\n",
|
|
|
+ " stop_words.append(line.strip())\n",
|
|
|
+ "\n",
|
|
|
+ "# 添加停用词\n",
|
|
|
+ "stop_words.extend(['40', 'hellip', '一袋', '一包', '一个月', \n",
|
|
|
+ " '一点', '一个多月', '第一次', '哈哈哈', \n",
|
|
|
+ " '螺狮粉', '螺蛳']) \n",
|
|
|
+ "\n",
|
|
|
+ "# 评论字段分词处理\n",
|
|
|
+ "word_num = jieba.analyse.extract_tags(txt,\n",
|
|
|
+ " topK=100,\n",
|
|
|
+ " withWeight=True,\n",
|
|
|
+ " allowPOS=())\n",
|
|
|
+ "\n",
|
|
|
+ "# 去停用词\n",
|
|
|
+ "word_num_selected = []\n",
|
|
|
+ "\n",
|
|
|
+ "for i in word_num:\n",
|
|
|
+ " if i[0] not in stop_words:\n",
|
|
|
+ " word_num_selected.append(i)\n",
|
|
|
+ "\n",
|
|
|
+ "key_words = pd.DataFrame(word_num_selected, columns=['words','num'])\n",
|
|
|
+ "key_words.head()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "from pyecharts.charts import WordCloud\n",
+ "from pyecharts.globals import SymbolType\n",
+ "\n",
+ "# Word cloud of the keywords, sized by their num weight.\n",
+ "word1 = WordCloud(init_opts=opts.InitOpts(width='1350px', height='750px'))\n",
+ "word1.add(\"\", [*zip(key_words.words, key_words.num)],\n",
+ "          word_size_range=[20, 200],\n",
+ "          shape=SymbolType.DIAMOND)\n",
+ "# The title is passed by keyword, like every other chart in this notebook,\n",
+ "# so it does not depend on the positional parameter order of TitleOpts.\n",
+ "word1.set_global_opts(title_opts=opts.TitleOpts(title='评论分布词云图'),\n",
+ "                      toolbox_opts=opts.ToolboxOpts())\n",
+ "word1.render()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "from pyecharts.charts import Page\n",
|
|
|
+ "\n",
|
|
|
+ "page = Page() \n",
|
|
|
+ "page.add(pie1, bar1, word1)\n",
|
|
|
+ "page.render('评论分析.html') \n",
|
|
|
+ ""
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ]
|
|
|
+}
|