get_video.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File    :   get_video.py
@Time    :   2019/05/15 17:09:18
@Author  :   Liuyuqi
@Version :   1.0
@Contact :   liuyuqi.gov@msn.cn
@License :   (C)Copyright 2019
@Desc    :   Crawl Bilibili (B 站) video metadata (av17501-av99999) into MySQL
'''
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import time
import sys
import re
import json
import MySQLdb

# Python 2 idiom: make implicit str/unicode conversions default to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Columns written per video:
# id av cid title tminfo time click danmu coins favourites duration honor_click honor_coins honor_favourites
# mid name article fans tags[3] common
urls = []
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
}
time1 = time.time()

# Build the work list: one video page URL per av number in the crawl range.
for i in range(17501, 100000):
    url = 'http://bilibili.com/video/av' + str(i)
    urls.append(url)


def spider(url):
    """Fetch one video page, scrape its metadata, and insert a row into MySQL."""
    html = requests.get(url, headers=head)
    selector = etree.HTML(html.text)
    content = selector.xpath("//html")
    for each in content:
        title = each.xpath('//div[@class="v-title"]/h1/@title')
        if title:
            av = url.replace("http://bilibili.com/video/av", "")
            title = title[0]
            # Category breadcrumb (zone / sub-zone / tag), joined with '-'.
            tminfo1_log = each.xpath('//div[@class="tminfo"]/a/text()')
            tminfo2_log = each.xpath('//div[@class="tminfo"]/span[1]/a/text()')
            tminfo3_log = each.xpath('//div[@class="tminfo"]/span[2]/a/text()')
            tminfo1 = tminfo1_log[0] if tminfo1_log else ""
            tminfo2 = tminfo2_log[0] if tminfo2_log else ""
            tminfo3 = tminfo3_log[0] if tminfo3_log else ""
            tminfo = tminfo1 + '-' + tminfo2 + '-' + tminfo3
            # Upload date, uploader id/name, and the uploader's submission and fan counts.
            time_log = each.xpath('//div[@class="tminfo"]/time/i/text()')
            mid_log = each.xpath('//div[@class="b-btn f hide"]/@mid')
            name_log = each.xpath('//div[@class="usname"]/a/@title')
            article_log = each.xpath('//div[@class="up-video-message"]/div[1]/text()')
            fans_log = each.xpath('//div[@class="up-video-message"]/div[2]/text()')
            # Renamed from `time` to avoid shadowing the time module.
            pub_time = time_log[0] if time_log else ""
            mid = mid_log[0] if mid_log else ""
            name = name_log[0] if name_log else ""
            article = article_log[0].replace(u"投稿:", "") if article_log else "-1"
            fans = fans_log[0].replace(u"粉丝:", "") if fans_log else "-1"
            # Up to three tags from the tag list.
            tag1_log = each.xpath('//ul[@class="tag-list"]/li[1]/a/text()')
            tag2_log = each.xpath('//ul[@class="tag-list"]/li[2]/a/text()')
            tag3_log = each.xpath('//ul[@class="tag-list"]/li[3]/a/text()')
            tag1 = tag1_log[0] if tag1_log else ""
            tag2 = tag2_log[0] if tag2_log else ""
            tag3 = tag3_log[0] if tag3_log else ""
            # The cid (player/danmaku id) is embedded in the comment-area iframe or script.
            cid_html_1 = each.xpath('//div[@class="scontent"]/iframe/@src')
            cid_html_2 = each.xpath('//div[@class="scontent"]/script/text()')
            if cid_html_1 or cid_html_2:
                cid_html = cid_html_1[0] if cid_html_1 else cid_html_2[0]
                cids = re.findall(r'cid=.+&aid', cid_html)
                cid = cids[0].replace("cid=", "").replace("&aid", "")
                # The player interface returns XML with view/danmaku/coin/favourite stats.
                info_url = "http://interface.bilibili.com/player?id=cid:" + \
                    str(cid) + "&aid=" + av
                video_info = requests.get(info_url)
                video_selector = etree.HTML(video_info.text)
                for video_each in video_selector:
                    click_log = video_each.xpath('//click/text()')
                    danmu_log = video_each.xpath('//danmu/text()')
                    coins_log = video_each.xpath('//coins/text()')
                    favourites_log = video_each.xpath('//favourites/text()')
                    duration_log = video_each.xpath('//duration/text()')
                    honor_click_log = video_each.xpath('//honor[@t="click"]/text()')
                    honor_coins_log = video_each.xpath('//honor[@t="coins"]/text()')
                    honor_favourites_log = video_each.xpath('//honor[@t="favourites"]/text()')
                    honor_click = honor_click_log[0] if honor_click_log else 0
                    honor_coins = honor_coins_log[0] if honor_coins_log else 0
                    honor_favourites = honor_favourites_log[0] if honor_favourites_log else 0
                    click = click_log[0] if click_log else -1
                    danmu = danmu_log[0] if danmu_log else -1
                    coins = coins_log[0] if coins_log else -1
                    favourites = favourites_log[0] if favourites_log else -1
                    duration = duration_log[0] if duration_log else ""
                # Comment count comes from the reply API (data.page.acount).
                json_url = "http://api.bilibili.com/x/reply?jsonp=jsonp&type=1&sort=0&pn=1&nohot=1&oid=" + av
                jsoncontent = requests.get(json_url, headers=head).content
                jsDict = json.loads(jsoncontent)
                if jsDict['code'] == 0:
                    jsData = jsDict['data']
                    jsPages = jsData['page']
                    common = jsPages['acount']
                    try:
                        conn = MySQLdb.connect(
                            host='localhost', user='root', passwd='', port=3306, charset='utf8')
                        cur = conn.cursor()
                        conn.select_db('python')
                        cur.execute('INSERT INTO video VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                    [str(av), str(av), cid, title, tminfo, pub_time, click, danmu, coins, favourites, duration,
                                     mid, name, article, fans, tag1, tag2, tag3, str(common), honor_click, honor_coins, honor_favourites])
                        # Commit and close, otherwise the insert is lost when the connection is garbage-collected.
                        conn.commit()
                        conn.close()
                        print "Succeed: av" + str(av)
                    except MySQLdb.Error, e:
                        print "Mysql Error %d: %s" % (e.args[0], e.args[1])
                else:
                    print "Error_Json: " + url
            else:
                print "Error_noCid:" + url
        else:
            print "Error_404: " + url


# Crawl with 10 worker threads; on a network error, wait five minutes and retry the whole list once.
pool = ThreadPool(10)
try:
    results = pool.map(spider, urls)
except Exception, e:
    print e
    time.sleep(300)
    results = pool.map(spider, urls)
pool.close()
pool.join()
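
The INSERT above assumes a database named python containing a 22-column video table. The repository shown here does not include that schema, so the bootstrap below is only a sketch: every column name and type is an assumption inferred from the order of the inserted values, not taken from the original project.

# create_table.py -- hypothetical helper, not part of the original repo.
# Creates the `python` database and a 22-column `video` table whose column
# order mirrors the positional INSERT in get_video.py (names/types assumed).
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, charset='utf8')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS python DEFAULT CHARACTER SET utf8')
conn.select_db('python')
cur.execute('''CREATE TABLE IF NOT EXISTS video (
    id VARCHAR(16), av VARCHAR(16), cid VARCHAR(16),
    title VARCHAR(255), tminfo VARCHAR(128), `time` VARCHAR(32),
    click VARCHAR(16), danmu VARCHAR(16), coins VARCHAR(16),
    favourites VARCHAR(16), duration VARCHAR(16),
    mid VARCHAR(16), name VARCHAR(64), article VARCHAR(16), fans VARCHAR(16),
    tag1 VARCHAR(64), tag2 VARCHAR(64), tag3 VARCHAR(64),
    common VARCHAR(16),
    honor_click VARCHAR(16), honor_coins VARCHAR(16), honor_favourites VARCHAR(16)
)''')
conn.commit()
conn.close()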