Browse Source

add 李子柒螺蛳粉评论分析

liuyuqi-dellpc 5 years ago
parent
commit
5887607d52

+ 3 - 0
liziqi/README.md

@@ -0,0 +1,3 @@
+## 李子柒天猫店螺蛳粉销售分析
+
+对销售评论进行情感分析,跟踪用户反馈,并据此改进产品。

+ 429 - 0
liziqi/analysis_and_plot.ipynb

@@ -0,0 +1,429 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "version": "3.6.0-final"
+  },
+  "orig_nbformat": 2,
+  "file_extension": ".py",
+  "mimetype": "text/x-python",
+  "name": "python",
+  "npconvert_exporter": "python",
+  "pygments_lexer": "ipython3",
+  "version": 3,
+  "kernelspec": {
+   "name": "python36064bitrootconda12dcd85ef9c147fdbdf4c10492696076",
+   "display_name": "Python 3.6.0 64-bit ('root': conda)"
+  }
+ },
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import re,os,sys\n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "df =pd.read_excel(\"data/李子柒螺蛳粉评论.xlsx\")\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 去除重复值\n",
+    "df.drop_duplicates(inplace=True)\n",
+    "df.info() "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 时间-热度分析\n"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 时间走势图\n",
+    "df['comment_time'] = pd.to_datetime(df['comment_time'])\n",
+    "df['comment_date'] = df['comment_time'].dt.date\n",
+    "comment_num = df['comment_date'].value_counts().sort_index()\n",
+    "\n",
+    "from pyecharts.charts import Line\n",
+    "from pyecharts import options as opts \n",
+    "\n",
+    "# 折线图\n",
+    "line1 = Line(init_opts=opts.InitOpts(width='1350px', height='750px'))\n",
+    "line1.add_xaxis(comment_num.index.tolist())\n",
+    "line1.add_yaxis('热度', comment_num.values.tolist(),\n",
+    "                areastyle_opts=opts.AreaStyleOpts(opacity=0.5),\n",
+    "                label_opts=opts.LabelOpts(is_show=False))\n",
+    "line1.set_global_opts(title_opts=opts.TitleOpts(title='商品评价数量走势图'), \n",
+    "                      xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate='30')),\n",
+    "                      toolbox_opts=opts.ToolboxOpts(),\n",
+    "                      visualmap_opts=opts.VisualMapOpts(max_=400))\n",
+    "line1.set_series_opts(linestyle_opts=opts.LineStyleOpts(width=3))\n",
+    "line1.render() \n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def judge_comment(df, result):\n",
+    "\n",
+    "    # 创建一个空数据框\n",
+    "    judges = pd.DataFrame(np.zeros(13 * len(df)).reshape(len(df),13),\n",
+    "                      columns = ['品牌','物流正面','物流负面','包装正面','包装负面','原料正面',\n",
+    "                                 '原料负面','口感正面','口感负面','日期正面','日期负面',\n",
+    "                                 '性价比正面','性价比负面'])\n",
+    "\n",
+    "    for i in range(len(result)):\n",
+    "        word = result[i]\n",
+    "        #李子柒的产品具有强IP属性,基本都是正面评价,这里不统计情绪,只统计提及次数\n",
+    "        if '李子柒' in word or '子柒' in word or '小柒' in word or '李子七' in word or '小七' in word:\n",
+    "                judges.iloc[i]['品牌'] = 1\n",
+    "\n",
+    "        #先判断是不是物流相关的\n",
+    "        if '物流' in word or '快递' in word or '配送' in word or '取货' in word:\n",
+    "            #再判断是正面还是负面情感\n",
+    "            if '好' in word or '不错' in word or '棒' in word or '满意' in word or '迅速' in word:\n",
+    "                judges.iloc[i]['物流正面'] = 1\n",
+    "            elif '慢' in word or '龟速' in word or '暴力' in word or '差' in word:\n",
+    "                judges.iloc[i]['物流负面'] = 1\n",
+    "\n",
+    "        #判断是否包装相关\n",
+    "        if '包装' in word or '盒子' in word or '袋子' in word or '外观' in word:\n",
+    "            if '高端' in word or '大气' in word or '还行' in word or '完整' in word or '好' in word or\\\n",
+    "               '严实' in word or '紧' in word or '精致' in word:\n",
+    "                judges.iloc[i]['包装正面'] = 1\n",
+    "            elif  '破' in word or '破损' in word or '瘪' in word or '简陋' in word:\n",
+    "                judges.iloc[i]['包装负面'] = 1\n",
+    "\n",
+    "        #产品\n",
+    "        #产品原料以米粉、汤、配料、腐竹、花生为主,且评价大多会提到这些原料,因此我们把这个单独拎出来分析\n",
+    "        if '米粉' in word or '汤' in word or '配料' in word or '腐竹' in word or '花生' in word:\n",
+    "            if '劲道' in word or '多' in word or '足' in word or '香' in word or '才' in word or\\\n",
+    "                '脆' in word or 'nice' in word:\n",
+    "                judges.iloc[i]['原料正面'] = 1\n",
+    "            elif '小' in word or '少' in word or '没' in word:\n",
+    "                judges.iloc[i]['原料负面'] = 1\n",
+    "\n",
+    "        #口感的情绪\n",
+    "        if '口味' in word or '味道' in word or '口感' in word or '吃起来' in word:\n",
+    "            if '不错' in word or '浓鲜' in word or '十足' in word or '鲜' in word or\\\n",
+    "                '可以' in word or '喜欢' in word or '符合' in word:\n",
+    "                judges.iloc[i]['口感正面'] = 1\n",
+    "            elif '不好' in word or '不行' in word or '不鲜' in word or\\\n",
+    "                '太烂' in word:\n",
+    "                judges.iloc[i]['口感负面'] = 1\n",
+    "\n",
+    "        #口感方面,有些是不需要出现前置词,消费者直接评价好吃难吃的,例如:\n",
+    "        if '难吃' in word or '不好吃' in word:\n",
+    "            judges.iloc[i]['口感负面'] = 1\n",
+    "        elif '好吃' in word or '香' in word:\n",
+    "            judges.iloc[i]['口感正面'] = 1\n",
+    "\n",
+    "        #日期是不是新鲜\n",
+    "        if '日期' in word or '时间' in word or '保质期' in word:\n",
+    "            if '新鲜' in word:\n",
+    "                judges.iloc[i]['日期正面'] = 1\n",
+    "            elif '久' in word or '长' in word:\n",
+    "                judges.iloc[i]['日期负面'] = 1\n",
+    "        elif '过期' in word:\n",
+    "            judges.iloc[i]['日期负面'] = 1\n",
+    "\n",
+    "        #性价比\n",
+    "        if '划算' in word or '便宜' in word or '赚了' in word or '囤货' in word or '超值' in word or \\\n",
+    "            '太值' in word or '物美价廉' in word or '实惠' in word or '性价比高' in word or '不贵' in word: \n",
+    "            judges.iloc[i]['性价比正面'] = 1\n",
+    "        elif  '贵' in word or '不值' in word or '亏了' in word or '不划算' in word or '不便宜' in word:\n",
+    "            judges.iloc[i]['性价比负面'] = 1\n",
+    "\n",
+    "    final_result = pd.concat([df,judges],axis = 1)\n",
+    "\n",
+    "    return final_result\n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 得到数据框\n",
+    "judge = judge_comment(df, result=df.content)\n",
+    "judge.head() \n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 结果汇总\n",
+    "rank = judge.iloc[:, 5:].sum().reset_index().sort_values(0, ascending=False) \n",
+    "rank.columns = ['分类', '提及次数']\n",
+    "rank['占比'] = rank['提及次数'] / rank['提及次数'].sum()\n",
+    "rank['高级分类'] = rank['分类'].str[:-2]\n",
+    "rank"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rank.loc[0, '高级分类'] = '品牌'\n",
+    "rank \n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rank_num = rank.groupby('高级分类')['提及次数'].sum().sort_values(ascending=False)\n",
+    "rank_num\n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "data_pair = [list(z) for z in zip(rank_num.index, rank_num.values)]\n",
+    "data_pair"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyecharts.charts import Pie\n",
+    "\n",
+    "pie1 = Pie(init_opts=opts.InitOpts(width='1350px', height='750px'))\n",
+    "pie1.add( \n",
+    "        series_name=\"num\",\n",
+    "        radius=[\"35%\", \"55%\"],\n",
+    "        data_pair=data_pair,\n",
+    "        label_opts=opts.LabelOpts(\n",
+    "            position=\"outside\",\n",
+    "            formatter=\"{a|{a}}{abg|}\\n{hr|}\\n {b|{b}: }{c}  {per|{d}%}  \",\n",
+    "            background_color=\"#eee\",\n",
+    "            border_color=\"#aaa\",\n",
+    "            border_width=1,\n",
+    "            border_radius=4,\n",
+    "            rich={\n",
+    "                \"a\": {\"color\": \"#999\", \"lineHeight\": 22, \"align\": \"center\"},\n",
+    "                \"abg\": {\n",
+    "                    \"backgroundColor\": \"#e3e3e3\",\n",
+    "                    \"width\": \"100%\",\n",
+    "                    \"align\": \"right\",\n",
+    "                    \"height\": 22,\n",
+    "                    \"borderRadius\": [4, 4, 0, 0],\n",
+    "                },\n",
+    "                \"hr\": {\n",
+    "                    \"borderColor\": \"#aaa\",\n",
+    "                    \"width\": \"100%\",\n",
+    "                    \"borderWidth\": 0.5,\n",
+    "                    \"height\": 0,\n",
+    "                },\n",
+    "                \"b\": {\"fontSize\": 16, \"lineHeight\": 33},\n",
+    "                \"per\": {\n",
+    "                    \"color\": \"#eee\",\n",
+    "                    \"backgroundColor\": \"#334455\",\n",
+    "                    \"padding\": [2, 4],\n",
+    "                    \"borderRadius\": 2,\n",
+    "                },\n",
+    "            },\n",
+    "        ),\n",
+    ")\n",
+    "pie1.set_global_opts(legend_opts=opts.LegendOpts(pos_left=\"left\", pos_top='30%', orient=\"vertical\"), \n",
+    "                     toolbox_opts=opts.ToolboxOpts(),\n",
+    "                     title_opts=opts.TitleOpts(title='消费者关注占比分布'))\n",
+    "pie1.set_series_opts(\n",
+    "    tooltip_opts=opts.TooltipOpts(trigger=\"item\", formatter=\"{a} <br/>{b}: {c} ({d}%)\")\n",
+    "    )\n",
+    "pie1.render()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyecharts import options as opts\n",
+    "from pyecharts.charts import Bar\n",
+    "from pyecharts.commons.utils import JsCode\n",
+    "from pyecharts.globals import ThemeType\n",
+    "\n",
+    "list2 = [\n",
+    "    {\"value\": 1484.0, \"percent\": 1484.0 / (1484.0 + 27.0)},\n",
+    "    {\"value\": 692.0, \"percent\": 692.0 / (692.0 + 3.0)},\n",
+    "    {\"value\": 539.0, \"percent\": 539.0 / (539.0 + 63.0)},\n",
+    "    {\"value\": 422.0, \"percent\": 422.0 / (422.0 + 0)},\n",
+    "    {\"value\": 142.0, \"percent\": 142.0 / (142.0 + 66.0)},\n",
+    "    {\"value\": 124.0, \"percent\": 124.0 / (124.0 + 22.0)},\n",
+    "    {\"value\": 58.0, \"percent\": 58.0 / (58.0 + 3.0)},\n",
+    "]\n",
+    "\n",
+    "list3 = [\n",
+    "    {\"value\": 27.0, \"percent\": 27.0 / (27.0 + 1484.0)},\n",
+    "    {\"value\": 3.0, \"percent\": 3.0 / (3.0 + 692.0)},\n",
+    "    {\"value\": 63.0, \"percent\": 63.0 / (63.0 + 539.0)},\n",
+    "    {\"value\": 0, \"percent\": 0 / (0 + 422.0)},\n",
+    "    {\"value\": 66.0, \"percent\": 66.0 / (66.0 + 142.0)},\n",
+    "    {\"value\": 22.0, \"percent\": 22.0 / (22.0 + 124.0)},\n",
+    "    {\"value\": 3.0, \"percent\": 3.0 / (3.0 + 58.0)},\n",
+    "]\n",
+    "\n",
+    "\n",
+    "bar1 = Bar(init_opts=opts.InitOpts(width='1350px', height='750px', theme=ThemeType.LIGHT))\n",
+    "bar1.add_xaxis(['口感', '包装', '原料', '品牌', '日期', '性价比', '物流'])\n",
+    "bar1.add_yaxis(\"正面评论\", list2, stack=\"stack1\", category_gap=\"50%\")\n",
+    "bar1.add_yaxis(\"负面评论\", list3, stack=\"stack1\", category_gap=\"50%\")\n",
+    "bar1.set_global_opts(title_opts=opts.TitleOpts(title='关注点细分占比分布')) \n",
+    "bar1.set_series_opts(\n",
+    "        label_opts=opts.LabelOpts(\n",
+    "            position=\"right\",\n",
+    "            formatter=JsCode(\n",
+    "                \"function(x){return Number(x.data.percent * 100).toFixed() + '%';}\"\n",
+    "            ),\n",
+    "        )\n",
+    "    )\n",
+    "bar1.render()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import jieba \n",
+    "import jieba.analyse\n",
+    "\n",
+    "txt = df['content'].str.cat(sep='。')\n",
+    "\n",
+    "# 添加关键词\n",
+    "jieba.add_word('李子柒')  \n",
+    "\n",
+    "# 读入停用词表\n",
+    "stop_words = []\n",
+    "with open('stop_words.txt', 'r', encoding='utf-8') as f:\n",
+    "    lines = f.readlines()\n",
+    "    for line in lines:\n",
+    "        stop_words.append(line.strip())\n",
+    "\n",
+    "# 添加停用词\n",
+    "stop_words.extend(['40', 'hellip', '一袋', '一包', '一个月', \n",
+    "                   '一点', '一个多月', '第一次', '哈哈哈', \n",
+    "                   '螺狮粉', '螺蛳'])      \n",
+    "\n",
+    "# 评论字段分词处理\n",
+    "word_num = jieba.analyse.extract_tags(txt,\n",
+    "                                      topK=100,\n",
+    "                                      withWeight=True,\n",
+    "                                      allowPOS=())\n",
+    "\n",
+    "# 去停用词\n",
+    "word_num_selected = []\n",
+    "\n",
+    "for i in word_num:\n",
+    "    if i[0] not in stop_words:\n",
+    "        word_num_selected.append(i)\n",
+    "\n",
+    "key_words = pd.DataFrame(word_num_selected, columns=['words','num'])\n",
+    "key_words.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyecharts.charts import WordCloud\n",
+    "from pyecharts.globals import SymbolType\n",
+    "\n",
+    "# 词云图\n",
+    "word1 = WordCloud(init_opts=opts.InitOpts(width='1350px', height='750px'))\n",
+    "word1.add(\"\", [*zip(key_words.words, key_words.num)],\n",
+    "          word_size_range=[20, 200],\n",
+    "          shape=SymbolType.DIAMOND)\n",
+    "word1.set_global_opts(title_opts=opts.TitleOpts('评论分布词云图'),\n",
+    "                      toolbox_opts=opts.ToolboxOpts())\n",
+    "word1.render()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyecharts.charts import Page\n",
+    "\n",
+    "page = Page() \n",
+    "page.add(pie1, bar1, word1)\n",
+    "page.render('评论分析.html')  \n",
+    ""
+   ]
+  }
+ ]
+}

BIN
liziqi/data/李子柒螺蛳粉评论.xlsx


BIN
liziqi/data/评论处理后数据.xlsx


+ 62 - 0
liziqi/get_comment.py

@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2020/04/06 21:18:40
+@Version :   1.0
+@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
+@Desc    :   天猫爬取商品评论
+'''
+
+# 导入所需库
+import pandas as pd
+import requests
+import re
+import time
+import json
+from jsonpath import jsonpath
+from pprint import pprint
+
+df_all = pd.DataFrame()
+
+for i in range(1, 101):
+	true_url = "https://rate.tmall.com/list_detail_rate.htm?itemId=598614273525&spuId=0&sellerId=3965833216&order=3&currentPage={}&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098%23E1hvFvvpvoUvUvCkvvvvvjiPn25y0jtRRsFh6jEUPmPwzj3URFdWtjibRLFOtjrPdphvmpmvqRkJvvvWUghCvCWpvREwsDsNzYGUTnAYAZrqv6ruRphvCvvvphvPvpvhvv2MMQhCvvOv9hCvvvvEvpCWvhDi4Bz6VXu4hAx%2F0jZ7%2Bu0Owmz6%2Ff8r58t%2Bm7zydigXe5xLD76fd34AVAllY2%2FAdXQaWXxr58TJ%2B3%2BuQjZL%2Bu6fjLVxfBKK5FGDNdyCvm9vvhCvvvvvvvvvBJZvvUVavvCHtpvv9ZUvvhcDvvmCp9vvBJZvvUHmuphvmvvvpLvEk6nskphvC9hvpyP9Q8wCvvpvvhHh3QhvCvmvphmrvpvEvvEN7GOvvvExRphvCvvvphmrvpvEvvjKMngvvEbp9phvHnMS01gH7rMNz15bMH1btqjN%2FnsvRphvCvvvphv%3D&needFold=0&_ksTS=1585965084590_619&callback=jsonp620".format(i)
+	headers ={
+		'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
+		'referer': 'https://detail.tmall.com/item.htm?spm=a230r.1.14.6.62982a9d1ZzQky&id=598614273525&cm_id=140105335569ed55e27b&abbucket=10&skuId=4416255191724',
+		'cookie': 'cna=6aIDF0UBwyMCAd9IWWP0ve2T; hng=CN%7Czh-CN%7CCNY%7C156; _m_h5_tk=352faf1e2c7459bb9be6dcfb1a686859_1585650890558; _m_h5_tk_enc=f98da3ebe54c27135860fad3eb0dd30f; enc=7ukbCamgQiUCkbWFKHSyWs3%2FuaYf2BwLnE%2FrqYLwkifjMrzkW7Z9o9ZnyLldLtq72TZm4k67jLO3g3pY8WCUTg%3D%3D; t=fc84a8e7475ba65cba01390893141481; tracknick=%5Cu738B%5Cu771F%5Cu8FBE; lgc=%5Cu738B%5Cu771F%5Cu8FBE; _tb_token_=ee45eb07d7e8a; cookie2=108528103da2fb86166fe94f4ec6d16c; x5sec=7b22726174656d616e616765723b32223a223664363835376639313134633235396364656133353961346431343332383436434f4c4c6e2f514645497659677553626973487066513d3d227d; dnk=%5Cu738B%5Cu771F%5Cu8FBE; uc1=cookie15=VFC%2FuZ9ayeYq2g%3D%3D&existShop=false&tag=8&pas=0&cookie16=UtASsssmPlP%2Ff1IHDsDaPRu%2BPw%3D%3D&lng=zh_CN&cookie14=UoTUP2uetwoTUg%3D%3D&cookie21=V32FPkk%2FgPzW; uc3=id2=W8g1q36CK3mT&nk2=rpB%2B19XZ&lg2=URm48syIIVrSKA%3D%3D&vt3=F8dBxdAXsSeyaM9RvMI%3D; _l_g_=Ug%3D%3D; uc4=nk4=0%40rMpGHW%2BbjZbmUWy3pSf%2FTjQ%3D&id4=0%40WeuSdm3pqpKqyJu1mxC59Ozb5YU%3D; unb=816423751; cookie1=VvaOTBfnXLrXE%2FHlUE7SD0YJwBoh4uUMhGIHJ9cs6KA%3D; login=true; cookie17=W8g1q36CK3mT; _nk_=%5Cu738B%5Cu771F%5Cu8FBE; sgcookie=E61hfOKws3lgRAqU1v%2Fvg; sg=%E8%BE%BE19; csg=0a7d2065; l=dBM9vnp4Qcg5Mq6wBOfgCkjmkJ_t6IRf1sPzt2XL0ICP_H5JLsJNWZfuxtTvCnGVn6y6R35mgkfgBjLtxy4EhZXRFJXn9MpOLd8h.; isg=BNPTDT2niX35UEVSwcvpZWTrYlf9iGdK353nFoXw2_JOBPCmDVoIm3gSPnRqpL9C'
+	}
+
+	# 发起请求
+	data = requests.get(true_url, headers=headers).text
+
+	# 提取内容
+	json_data = re.findall(r'jsonp620\((.*)\)', data)[0]
+	# 解析数据
+	js_data = json.loads(json_data)
+
+	# 获取数据
+	UserNick = jsonpath(js_data, '$..rateList..displayUserNick')
+	comment_time = jsonpath(js_data, '$..rateList..rateDate')
+	content = jsonpath(js_data, '$..rateList..rateContent')
+	auctionSku = jsonpath(js_data, '$..rateList..auctionSku')
+
+	df_one = pd.DataFrame({
+		'UserNick': UserNick,
+		'comment_time': comment_time,
+		'content': content,
+		'auctionSku': auctionSku
+	})
+
+	# 循环追加
+	df_all = df_all.append(df_one, ignore_index=True)
+
+	# 休眠5秒
+	time.sleep(5)
+
+	# 打印进度
+	print('我正在获取第{}页的信息'.format(i))
+
+# 读出数据
+df_all.to_excel('data/李子柒螺蛳粉评论.xlsx', index=False)

+ 99 - 0
liziqi/get_luosi.py

@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2020/04/06 22:13:01
+@Version :   1.0
+@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
+@Desc    :   爬取螺蛳粉天猫数据
+'''
+
+# 导入所需包
+from selenium import webdriver
+import parsel
+import re
+import time
+import pandas as pd
+
+
+def login_taobao_acount():
+	# 打开浏览器
+	global browser
+	browser = webdriver.Chrome()
+	# 登录URL
+	login_url = 'https://login.taobao.com/member/login.jhtml'
+
+	# 打开网页
+	browser.get(login_url)
+	# 支付宝登录
+	browser.find_element_by_class_name('alipay-login').click()
+
+
+def get_assigned_page(key_words):
+	# 获取淘宝URL
+	tb_url = 'https://www.taobao.com/'
+	# 打开淘宝网
+	browser.get(tb_url)
+	# 定位搜索框,输入数据
+	s_bar = browser.find_element_by_xpath('//*[@id="q"]')
+	s_bar.send_keys('{}'.format(key_words))
+	# 点击搜索
+	browser.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button').click()
+
+
+def get_one_page():
+	# 先获取第一页的信息
+	html = parsel.Selector(browser.page_source)
+
+	# 获取数据
+	goods_name = html.xpath('//div[@class="grid g-clearfix"]//img/@alt').extract()
+	shop_name = html.xpath('//div[@class="grid g-clearfix"]//div[@class="shop"]/a/span[2]/text()').extract()
+	price = html.xpath('//div[@class="grid g-clearfix"]//div[contains(@class,"price")]/strong/text()').extract()
+	purchase_num = [re.findall(r'<div class="deal-cnt">(.*?)</div>', i)
+					for i in html.xpath('//div[@class="grid g-clearfix"]//div[@class="row row-1 g-clearfix"]').extract()]
+	location = html.xpath('//div[@class="grid g-clearfix"]//div[@class="location"]/text()').extract()
+
+	# 存储数据
+	df_one = pd.DataFrame({
+		'goods_name': goods_name,
+		'shop_name': shop_name,
+		'price': price,
+		'purchase_num': purchase_num,
+		'location': location
+	})
+
+	return df_one
+
+
+def get_all_page(page_num):
+	df_all = pd.DataFrame()
+
+	# 循环翻页
+	for i in range(1, page_num):
+		# 运行函数
+		df_one = get_one_page()
+		# 循环追加
+		df_all = df_all.append(df_one, ignore_index=True)
+		# 100页的时候打断翻页
+		if page_num==100:
+			break
+		else:
+			# 点击翻页
+			browser.find_element_by_css_selector('#mainsrp-pager > div > div > div > ul > li.item.next > a').click()
+			# 打印进度
+			print('我正在获取第{}页的数据'.format(i))
+			# 休眠一秒
+			time.sleep(10)
+	return df_all
+
+
+# Script entry point: log in, search, scrape the result pages, save to Excel.
+if __name__=='__main__':
+	# Log in first; this creates the shared global browser session.
+	login_taobao_acount()
+	# Then run the search for the keyword.
+	get_assigned_page(key_words='螺蛳粉')
+	# Then page through the results and collect them.
+	df_all = get_all_page(page_num=101)
+	# Write the collected data to disk.
+	df_all.to_excel('data/螺蛳粉店铺数据.xlsx', index=False)

+ 4 - 0
liziqi/requirements.txt

@@ -0,0 +1,4 @@
+jieba
+pyecharts
+pandas 
+numpy