top_topic.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author : liuyuqi
@Contact : liuyuqi.gov@msn.cn
@Time : 2019/08/27 15:30:32
@Version : 1.0
@License : (C)Copyright 2019
@Desc : Weibo hot-search (热搜) crawler and analysis
'''
## Imports
from pyecharts.charts import Calendar, Page, Pie, TreeMap, WordCloud
from pyecharts.globals import SymbolType, ThemeType
from pyecharts import options as opts
import json
import random
import requests
import time
import pandas as pd
import os
import jieba
import datetime
from collections import Counter

os.chdir('D:/爬虫/微博热搜')
## Build the list of dates between two dates (inclusive), formatted as YYYY/MM/DD
def getBetweenDay(begin_date, end_date):
    date_list = []
    begin_date = datetime.datetime.strptime(begin_date, "%Y/%m/%d")
    end_date = datetime.datetime.strptime(end_date, "%Y/%m/%d")
    while begin_date <= end_date:
        date_str = begin_date.strftime("%Y/%m/%d")
        date_list.append(date_str)
        begin_date += datetime.timedelta(days=1)
    return date_list
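# Example:
#   getBetweenDay('2019/01/01', '2019/01/03')
#   -> ['2019/01/01', '2019/01/02', '2019/01/03']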
## Tokenize each hot-search title with jieba (search-engine mode);
## applied row-wise via DataFrame.apply(axis=1) below
def get_words_list(row):
    row['words_list'] = list(jieba.cut_for_search(row['title']))
    return row
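# jieba's search-engine mode emits overlapping, fine-grained tokens; per
# jieba's README, cut_for_search on "中国科学院计算所" yields
# 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所.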
## Explode a list-valued column into one row per element.
## Lists are converted to tuples first so they are hashable and can serve
## as a merge key; a mapping of tuple -> element is then merged back on.
def dataframe_explode(dataframe, fieldname):
    temp_fieldname = fieldname + '_made_tuple_'
    dataframe[temp_fieldname] = dataframe[fieldname].apply(tuple)
    list_of_dataframes = []
    for values in dataframe[temp_fieldname].unique().tolist():
        list_of_dataframes.append(pd.DataFrame({
            temp_fieldname: [values] * len(values),
            fieldname: list(values),
        }))
    dataframe = dataframe[list(set(dataframe.columns) - set([fieldname]))].merge(
        pd.concat(list_of_dataframes), how='left', on=temp_fieldname)
    del dataframe[temp_fieldname]
    return dataframe
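# A minimal sketch of what the helper does (pandas >= 0.25 ships the
# equivalent built-in DataFrame.explode):
#   df = pd.DataFrame({'title': ['a b'], 'words_list': [['a', 'b']]})
#   dataframe_explode(df, 'words_list')
#   -> two rows: ('a b', 'a') and ('a b', 'b')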
## Render a word cloud of the hot-search titles containing `word`;
## relies on the module-level `resou_word` frame built further below
def draw_word_cloud(word):
    word_title = resou_word[resou_word['title'].str.contains(word)]
    word_title = word_title.groupby(
        ['title'], as_index=False).agg({'searchCount': ['max']})
    word_title.columns = ['title', 'count']
    # scale the raw search counts down to millions for readable sizing
    data = [(word_title['title'][i], word_title['count'][i] / 1000000)
            for i in range(word_title.shape[0])]
    wc = (WordCloud(init_opts=opts.InitOpts(theme=ThemeType.ROMA))
          .add("", data, word_size_range=[20, 50], shape='pentagon')
          .set_global_opts(title_opts=opts.TitleOpts(title=''))
          .render('{}词云.html'.format(word))
          )
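# Note: draw_word_cloud reads the module-level `resou_word` frame that is
# only built further below, so call it after that point, e.g.:
#   draw_word_cloud('结婚')   # hypothetical keyword; writes 结婚词云.html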
## Set request headers and cookies, then crawl the data
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win32; x32; rv:54.0) Gecko/20100101 Firefox/54.0',
          'Connection': 'keep-alive'}
cookies = 'v=3; iuuid=1A6E888B4A4B29B16FBA1299108DBE9CDCB327A9713C232B36E4DB4FF222CF03; webp=true; ci=1%2C%E5%8C%97%E4%BA%AC; __guid=26581345.3954606544145667000.1530879049181.8303; _lxsdk_cuid=1646f808301c8-0a4e19f5421593-5d4e211f-100200-1646f808302c8; _lxsdk=1A6E888B4A4B29B16FBA1299108DBE9CDCB327A9713C232B36E4DB4FF222CF03; monitor_count=1; _lxsdk_s=16472ee89ec-de2-f91-ed0%7C%7C5; __mta=189118996.1530879050545.1530936763555.1530937843742.18'
cookie = {}
for line in cookies.split(';'):
    # parse each "name=value" pair, splitting on the first '=' only
    name, value = line.strip().split('=', 1)
    cookie[name] = value
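# After parsing, `cookie` maps each cookie name to its value, e.g.
# {'v': '3', 'webp': 'true', ...}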
resou = pd.DataFrame(columns=['date', 'title', 'searchCount', 'rank'])
resou_date = getBetweenDay('2019/01/01', '2019/07/12')
for i in resou_date:
    print(i)
    url = 'https://www.enlightent.cn/research/top/getWeiboHotSearchDayAggs.do?date={}'.format(i)
    html = requests.get(url=url, cookies=cookie, headers=header).content
    data = json.loads(html.decode('utf-8'))
    # keep the top 100 entries for each day
    for j in range(100):
        # DataFrame.append was removed in pandas 2.0; this targets older pandas
        resou = resou.append({'date': i, 'title': data[j]['keyword'],
                              'searchCount': data[j]['searchCount'], 'rank': j + 1}, ignore_index=True)
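# The loop above assumes the endpoint returns a JSON array of at least
# 100 objects per day, shaped roughly like (inferred from the fields
# accessed, not from any API documentation):
#   [{"keyword": "...", "searchCount": 1234567, ...}, ...]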
## Per-day statistics
resou = resou.apply(get_words_list, axis=1)
resou.to_excel('热搜数据.xlsx')
resou_dt = resou.groupby('date', as_index=False).agg({'searchCount': ['mean']})
resou_dt.columns = ['date', 'avg_count']
## Draw the calendar heat map
data = [
    # normalize dates to YYYY-MM-DD so they match the calendar range below
    [resou_dt['date'][i].replace('/', '-'), resou_dt['avg_count'][i]]
    for i in range(resou_dt.shape[0])
]
calendar = (
    Calendar(init_opts=opts.InitOpts(width='1800px', height='1500px'))
    .add("", data, calendar_opts=opts.CalendarOpts(range_=['2019-01-01', '2019-07-12']))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="2019每日热搜平均指数", pos_left='15%'),
        visualmap_opts=opts.VisualMapOpts(
            max_=3600000,
            min_=0,
            orient="horizontal",
            is_piecewise=False,
            pos_top="230px",
            pos_left="100px",
            pos_right="10px"
        )
    )
    .render('日期热力图.html')
)
## Word-frequency statistics
resou_word = dataframe_explode(resou, 'words_list')
resou_word_stat = resou_word.groupby(
    ['words_list'], as_index=False).agg({'title': ['count']})
resou_word_stat.columns = ['word', 'num']
resou_word_stat.to_excel('词频统计.xlsx')
func = pd.read_excel('名词.xlsx')
love = pd.read_excel('婚恋.xlsx')
person = pd.read_excel('人物.xlsx')
resou_word_func = pd.merge(resou_word_stat, func, how='inner', on='word')
resou_word_love = pd.merge(resou_word_stat, love, how='inner', on='word')
resou_word_person = pd.merge(resou_word_stat, person, how='inner', on='word')
resou_word_func = resou_word_func.sort_values(
    'num', ascending=False).reset_index()   # the frequency column is 'num', not 'count'
words = [(resou_word_func['word'][i], resou_word_func['num'][i])
         for i in range(resou_word_func.shape[0])]
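# pyecharts' Pie.add accepts a list of (name, value) pairs, so `words`
# looks like [('电影', 42), ...] (keyword and count here are illustrative).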
pie = (
    Pie(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
    .add("", words)
    .set_global_opts(title_opts=opts.TitleOpts(title="微博常用词出现次数", pos_left='center'),
                     legend_opts=opts.LegendOpts(is_show=False))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}", font_size=16))
    .render('热搜词饼图.html')
)
resou_word_love = resou_word_love.sort_values(
    'num', ascending=False).reset_index()
words = [(resou_word_love['word'][i], resou_word_love['num'][i])
         for i in range(resou_word_love.shape[0])]
pie = (
    Pie(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
    .add("", words)
    .set_global_opts(title_opts=opts.TitleOpts(title="微博婚恋类词语出现次数", pos_left='center'),
                     legend_opts=opts.LegendOpts(is_show=False))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}", font_size=16))
    .render('婚恋类热搜词饼图.html')
)
resou_word_person = resou_word_person.sort_values(
    'num', ascending=False).reset_index()
## TreeMap expects a list of {"value": ..., "name": ...} dicts
words = [{"value": int(resou_word_person['num'][i]),
          "name": resou_word_person['name'][i]}
         for i in range(resou_word_person.shape[0])]
tree = (
    TreeMap(init_opts=opts.InitOpts(theme=ThemeType.ESSOS))
    .add("", words, pos_left=0, pos_right=0, pos_top=50, pos_bottom=50)
    .set_global_opts(title_opts=opts.TitleOpts(title="热搜明星出现次数排名", pos_left='center'),
                     legend_opts=opts.LegendOpts(is_show=False))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}\n\n {c}", font_size=17,
                                               color='black', position='inside', font_weight='bolder'))
    .render('排序.html')
)