get_video.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File : get_video.py
@Time : 2019/05/15 17:09:18
@Author : Liuyuqi
@Version : 1.0
@Contact : liuyuqi.gov@msn.cn
@License : (C)Copyright 2019
@Desc : Crawl Bilibili videos
'''
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import time
import sys
import re
import json
import MySQLdb

# Python 2-only workaround: default str/unicode conversions to UTF-8 so the
# scraped Chinese text can be handled without explicit encoding everywhere.
reload(sys)
sys.setdefaultencoding('utf-8')
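
# Fields stored in the MySQL `video` table (filled by the INSERT in spider()):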
# id av cid title tminfo time click danmu coins favourites duration honor_click honor_coins honor_favourites
# mid name article fans tags[3] common

urls = []
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
}
time1 = time.time()

for i in range(17501, 100000):
    url = 'http://bilibili.com/video/av' + str(i)
    urls.append(url)
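

# spider(url): fetch one video page, scrape title / category / upload time /
# uploader / tags, pull playback stats from the player interface and the
# comment count from the reply API, then insert one row into MySQL.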
def spider(url):
    html = requests.get(url, headers=head)
    selector = etree.HTML(html.text)
    content = selector.xpath("//html")
    for each in content:
        title = each.xpath('//div[@class="v-title"]/h1/@title')
        if title:
            av = url.replace("http://bilibili.com/video/av", "")
            title = title[0]
            tminfo1_log = each.xpath('//div[@class="tminfo"]/a/text()')
            tminfo2_log = each.xpath('//div[@class="tminfo"]/span[1]/a/text()')
            tminfo3_log = each.xpath('//div[@class="tminfo"]/span[2]/a/text()')
            if tminfo1_log:
                tminfo1 = tminfo1_log[0]
            else:
                tminfo1 = ""
            if tminfo2_log:
                tminfo2 = tminfo2_log[0]
            else:
                tminfo2 = ""
            if tminfo3_log:
                tminfo3 = tminfo3_log[0]
            else:
                tminfo3 = ""
            tminfo = tminfo1 + '-' + tminfo2 + '-' + tminfo3
            time_log = each.xpath('//div[@class="tminfo"]/time/i/text()')
            mid_log = each.xpath('//div[@class="b-btn f hide"]/@mid')
            name_log = each.xpath('//div[@class="usname"]/a/@title')
            article_log = each.xpath(
                '//div[@class="up-video-message"]/div[1]/text()')
            fans_log = each.xpath(
                '//div[@class="up-video-message"]/div[2]/text()')
            if time_log:
                # upload date; shadows the time module, but only inside this function
                time = time_log[0]
            else:
                time = ""
            if mid_log:
                mid = mid_log[0]
            else:
                mid = ""
            if name_log:
                name = name_log[0]
            else:
                name = ""
            if article_log:
                # strip the "投稿:" ("uploads:") label, keep only the number
                article = article_log[0].replace(u"投稿:", "")
            else:
                article = "-1"
            if fans_log:
                # strip the "粉丝:" ("fans:") label
                fans = fans_log[0].replace(u"粉丝:", "")
            else:
                fans = "-1"
            tag1_log = each.xpath('//ul[@class="tag-list"]/li[1]/a/text()')
            tag2_log = each.xpath('//ul[@class="tag-list"]/li[2]/a/text()')
            tag3_log = each.xpath('//ul[@class="tag-list"]/li[3]/a/text()')
            if tag1_log:
                tag1 = tag1_log[0]
            else:
                tag1 = ""
            if tag2_log:
                tag2 = tag2_log[0]
            else:
                tag2 = ""
            if tag3_log:
                tag3 = tag3_log[0]
            else:
                tag3 = ""
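
            # Playback stats are not embedded in the page itself: pull the cid
            # out of the player iframe/script URL, then query the player interface.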
            cid_html_1 = each.xpath('//div[@class="scontent"]/iframe/@src')
            cid_html_2 = each.xpath('//div[@class="scontent"]/script/text()')
            if cid_html_1 or cid_html_2:
                if cid_html_1:
                    cid_html = cid_html_1[0]
                else:
                    cid_html = cid_html_2[0]
                cids = re.findall(r'cid=.+&aid', cid_html)
                cid = cids[0].replace("cid=", "").replace("&aid", "")
                info_url = "http://interface.bilibili.com/player?id=cid:" + \
                    str(cid) + "&aid=" + av
                video_info = requests.get(info_url)
                video_selector = etree.HTML(video_info.text)
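
                # The player interface returns XML; read each stat element by tag
                # name, falling back to -1 (or 0 for honors, "" for duration) when absent.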
                for video_each in video_selector:
                    click_log = video_each.xpath('//click/text()')
                    danmu_log = video_each.xpath('//danmu/text()')
                    coins_log = video_each.xpath('//coins/text()')
                    favourites_log = video_each.xpath('//favourites/text()')
                    duration_log = video_each.xpath('//duration/text()')
                    honor_click_log = video_each.xpath(
                        '//honor[@t="click"]/text()')
                    honor_coins_log = video_each.xpath(
                        '//honor[@t="coins"]/text()')
                    honor_favourites_log = video_each.xpath(
                        '//honor[@t="favourites"]/text()')
                    if honor_click_log:
                        honor_click = honor_click_log[0]
                    else:
                        honor_click = 0
                    if honor_coins_log:
                        honor_coins = honor_coins_log[0]
                    else:
                        honor_coins = 0
                    if honor_favourites_log:
                        honor_favourites = honor_favourites_log[0]
                    else:
                        honor_favourites = 0
                    if click_log:
                        click = click_log[0]
                    else:
                        click = -1
                    if danmu_log:
                        danmu = danmu_log[0]
                    else:
                        danmu = -1
                    if coins_log:
                        coins = coins_log[0]
                    else:
                        coins = -1
                    if favourites_log:
                        favourites = favourites_log[0]
                    else:
                        favourites = -1
                    if duration_log:
                        duration = duration_log[0]
                    else:
                        duration = ""
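
                    # Comment count comes from the reply API; 'acount' on the
                    # page object is the total number of replies for this av.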
                    json_url = "http://api.bilibili.com/x/reply?jsonp=jsonp&type=1&sort=0&pn=1&nohot=1&oid=" + av
                    jsoncontent = requests.get(json_url, headers=head).content
                    jsDict = json.loads(jsoncontent)
                    if jsDict['code'] == 0:
                        jsData = jsDict['data']
                        jsPages = jsData['page']
                        common = jsPages['acount']
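
                        # Open a short-lived connection per insert so the worker
                        # threads never share a MySQLdb handle.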
                        try:
                            conn = MySQLdb.connect(
                                host='localhost', user='root', passwd='', port=3306, charset='utf8')
                            cur = conn.cursor()
                            conn.select_db('python')
                            cur.execute('INSERT INTO video VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                        [str(av), str(av), cid, title, tminfo, time, click, danmu, coins, favourites, duration,
                                         mid, name, article, fans, tag1, tag2, tag3, str(common), honor_click, honor_coins, honor_favourites])
                            conn.commit()  # MySQLdb autocommit is off by default; without this the row is lost
                            conn.close()
                            print "Succeed: av" + str(av)
                        except MySQLdb.Error, e:
                            print "Mysql Error %d: %s" % (e.args[0], e.args[1])
                    else:
                        print "Error_Json: " + url
            else:
                print "Error_noCid: " + url
        else:
            print "Error_404: " + url


pool = ThreadPool(10)
# results = pool.map(spider, urls)
try:
    results = pool.map(spider, urls)
except Exception, e:
    # print 'ConnectionError'
    print e
    # wait five minutes, then retry the whole URL list once
    time.sleep(300)
    results = pool.map(spider, urls)
pool.close()
pool.join()
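
# time1 is captured at the top but never used above; presumably meant for a
# run-time report, e.g.:
print "Done in %.2f s" % (time.time() - time1)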