Finish video and barrage (danmaku) crawling

liuyuqi-dellpc committed 6 years ago
commit 76b8988c41

5 changed files with 85 additions and 50 deletions
  1. README.md (+29 −0)
  2. barrage/get_barrage.py (+5 −11)
  3. requirements.txt (+2 −0)
  4. user/get_face.py (+17 −3)
  5. video/get_video.py (+32 −36)

+ 29 - 0
README.md

@@ -15,6 +15,35 @@
 
 ### Usage
 
+Start the database; spinning up a MySQL container is enough here.
+
+```
+docker volume create --name datadir
+docker run --name my-mysql -v datadir:/var/lib/mysql -e MYSQL_ROOT_PASSWORD=123456 -p 3306:3306 -d daocloud.io/mysql:5.7.4
+```
+
+* Crawl users
+
+```
+cd crawl_bilibili
+python user/get_user.py
+python user/get_face.py
+```
+
+* Crawl videos
+
+```
+python video/get_video.py
+```
+
+* Crawl video barrages
+
+```
+python barrage/get_barrage.py
+```
+
+* Data analysis (still a stub; see the sketch after this diff)
+
 
 
 ### Update history
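
The data-analysis step above is left open. As a placeholder, here is a minimal sketch of a first query against the `video` table that video/get_video.py fills; the database name `python` (taken from the `select_db('python')` call removed in get_video.py below) and the root password from the docker run line are assumptions:

```
import pymysql

# hypothetical first analysis: the ten uploaders with the most fans among
# crawled videos (db name and password are assumptions, see above)
conn = pymysql.connect(host='localhost', user='root', passwd='123456',
                       port=3306, db='python', charset='utf8')
with conn.cursor() as cur:
    cur.execute('SELECT name, MAX(fans) FROM video '
                'GROUP BY name ORDER BY MAX(fans) DESC LIMIT 10')
    for name, fans in cur.fetchall():
        print(name, fans)
conn.close()
```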

+ 5 - 11
barrage/get_barrage.py

@@ -7,26 +7,20 @@
 @Version :   1.0
 @Contact :   liuyuqi.gov@msn.cn
 @License :   (C)Copyright 2019
-@Desc    :   Barrage crawler
+@Desc    :   Barrage crawler. The barrage pool refreshes every 30 s, so a video's full barrage history can be fetched by varying the time parameter.
 '''
-
-# -*-coding:utf8-*-
-
 from lxml import etree
 import requests
 import sys
 import re
 
-#reload(sys)
-
-#sys.setdefaultencoding('utf-8')
-
 head = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
+    'User-Agent':
+    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
 }
 
 
-def spider(av):
+def crawlBarrage(av):
     url = 'http://bilibili.com/video/av' + str(av)
     print(url)
     html = requests.get(url, headers=head)
@@ -62,4 +56,4 @@ def spider(av):
 if __name__ == '__main__':
     av = input('input av:')
     f = open(av + '.txt', 'w', encoding='utf-8')
-    spider(av)
+    crawlBarrage(av)
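
For reference, a minimal sketch of the fetch step behind the new @Desc, assuming the classic `comment.bilibili.com/{cid}.xml` endpoint that served a video's current barrage pool as XML (a historical interface, so treat the URL as an assumption):

```
import requests
from lxml import etree

def fetch_barrage_pool(cid):
    """Return (timing-metadata, text) pairs for one barrage pool."""
    url = 'http://comment.bilibili.com/%s.xml' % cid
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    root = etree.fromstring(resp.content)
    # one <d> element per barrage; the p attribute packs time/mode/color/date
    return [(d.get('p'), d.text) for d in root.findall('d')]
```

Sliding the time parameter mentioned in the docstring would then page through older pools; the exact dated endpoint is not shown in this commit.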

+ 2 - 0
requirements.txt

@@ -1,2 +1,4 @@
 PyMySQL==0.9.3
 requests==2.21.0
+urllib3==1.24.3
+lxml==4.3.3

+ 17 - 3
user/get_face.py

@@ -1,9 +1,23 @@
-# -*-coding:utf8-*-
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@File    :   get_face.py
+@Time    :   2019/05/16 05:44:10
+@Author  :   Liuyuqi 
+@Version :   1.0
+@Contact :   liuyuqi.gov@msn.cn
+@License :   (C)Copyright 2019
+@Desc    :   Crawl user avatars: export the avatar-URL column from the User table, then download each URL.
+'''
 
 import urllib
 import re
+import os, sys
+src = "C:/Users/liuyuqi/Desktop/crawl_bilibili"
+os.chdir(src)
+sys.path.append(src)
 
-f = open("../data/bilibili_user_face.txt")
+f = open("data/bilibili_user_face.txt")
 line = f.readline()
 for i in range(1, 1000):
     print(line,)
@@ -12,7 +26,7 @@ for i in range(1, 1000):
         print('noface:' + str(i))
     else:
         path = r"../data/face/" + str(i) + ".jpg"
-        data = urllib.urlretrieve(line, path)
+        data = urllib.request.urlretrieve(line, path)    # download to the given path
         line = f.readline()
         print('succeed:' + str(i))
 

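Two pitfalls in the hunk above: on Python 3, `import urllib` alone does not make `urllib.request` available, and `line = f.readline()` advances only in the `else` branch, so a no-face line repeats forever. A corrected sketch of the loop (the `noface` substring test stands in for the script's elided condition, and the error handling is an addition):

```
import os
import urllib.request

os.makedirs('data/face', exist_ok=True)
with open('data/bilibili_user_face.txt', encoding='utf-8') as f:
    # iterating the file directly advances the cursor on every branch
    for i, line in enumerate(f, start=1):
        url = line.strip()
        if not url or 'noface' in url:
            print('noface:' + str(i))
            continue
        try:
            urllib.request.urlretrieve(url, 'data/face/%d.jpg' % i)
            print('succeed:' + str(i))
        except OSError as e:
            print('failed:' + str(i) + ' ' + str(e))
```
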
+ 32 - 36
video/get_video.py

@@ -7,7 +7,9 @@
 @Version :   1.0
 @Contact :   liuyuqi.gov@msn.cn
 @License :   (C)Copyright 2019
-@Desc    :   Crawl Bilibili videos
+@Desc    :   Crawl Bilibili videos. Note that many links are dead and need checking. Endpoint:
+https://www.bilibili.com/video/av100500
+
 '''
 from lxml import etree
 from multiprocessing.dummy import Pool as ThreadPool
@@ -16,17 +18,9 @@ import time
 import sys
 import re
 import json
-import MySQLdb
-
-reload(sys)
-
-sys.setdefaultencoding('utf-8')
-
-# id av cid title tminfo time click danmu coins favourites duration honor_click honor_coins honor_favourites
-# mid name article fans tags[3] common
+import pymysql
 
 urls = []
-
 head = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
 }
@@ -37,8 +31,15 @@ for i in range(17501, 100000):
     url = 'http://bilibili.com/video/av' + str(i)
     urls.append(url)
 
+# connect to the database
+conn = pymysql.connect(host='localhost',
+                       user='root',
+                       passwd='',
+                       port=3306,
+                       charset='utf8')
+cur = conn.cursor()
 
-def spider(url):
+def crawlVideo(url):
     html = requests.get(url, headers=head)
     selector = etree.HTML(html.text)
     content = selector.xpath("//html")
@@ -177,34 +178,29 @@ def spider(url):
                         jsPages = jsData['page']
                         common = jsPages['acount']
                         try:
-                            conn = MySQLdb.connect(
-                                host='localhost', user='root', passwd='', port=3306, charset='utf8')
-                            cur = conn.cursor()
-                            conn.select_db('python')
                             cur.execute('INSERT INTO video VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                         [str(av), str(av), cid, title, tminfo, time, click, danmu, coins, favourites, duration,
                                          mid, name, article, fans, tag1, tag2, tag3, str(common), honor_click, honor_coins, honor_favourites])
-
-                            print "Succeed: av" + str(av)
-                        except MySQLdb.Error, e:
-                            print "Mysql Error %d: %s" % (e.args[0], e.args[1])
+                            print("Succeed: av" + str(av))
+                        except pymysql.Error as e:
+                            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
                     else:
-                        print "Error_Json: " + url
+                        print("Error_Json: " + url)
             else:
-                print "Error_noCid:" + url
+                print("Error_noCid:" + url)
         else:
-            print "Error_404: " + url
-
-
-pool = ThreadPool(10)
-# results = pool.map(spider, urls)
-try:
-    results = pool.map(spider, urls)
-except Exception, e:
-    # print 'ConnectionError'
-    print e
-    time.sleep(300)
-    results = pool.map(spider, urls)
-
-pool.close()
-pool.join()
+            print("Error_404: " + url)
+
+if __name__ == "__main__":
+    # run the crawl on 10 threads
+    pool = ThreadPool(10)
+    try:
+        results = pool.map(crawlVideo, urls)
+    except Exception as e:
+        # print 'ConnectionError'
+        print(e)
+        time.sleep(300)
+        results = pool.map(crawlVideo, urls)
+    conn.close()  # close the database connection
+    pool.close()
+    pool.join()
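
A caveat on the refactor above: the removed per-insert connect carried `conn.select_db('python')`, but the new module-level connect selects no database; the single connection and cursor are then shared by ten threads even though PyMySQL connections are not thread-safe, and nothing ever commits. A minimal sketch of a safer insert path, assuming the `python` database name from the removed call and the README's root password:

```
import threading
import pymysql

db_lock = threading.Lock()
conn = pymysql.connect(host='localhost', user='root', passwd='123456',
                       port=3306, db='python', charset='utf8')

def insert_video(row):
    placeholders = ','.join(['%s'] * len(row))
    # serialize access: a PyMySQL connection must not be used concurrently
    with db_lock:
        with conn.cursor() as cur:
            cur.execute('INSERT INTO video VALUES (' + placeholders + ')', row)
        conn.commit()  # PyMySQL does not autocommit by default
```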