@@ -7,7 +7,9 @@
 @Version : 1.0
 @Contact : liuyuqi.gov@msn.cn
 @License : (C)Copyright 2019
-@Desc : Crawl Bilibili videos
+@Desc : Crawl Bilibili videos. Note: many links are dead and need checking. API:
+https://www.bilibili.com/video/av100500
+
 '''
 from lxml import etree
 from multiprocessing.dummy import Pool as ThreadPool
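
Note (editor's sketch, not part of this patch): the @Desc line says many av links are dead and need checking. Below is a minimal first-pass liveness check, assuming requests is available; the function name and timeout are assumptions, and dead videos can still return HTTP 200 with an error page, so a real check should also inspect the response body.

import requests

def is_alive(av_id, timeout=10):
    # Treat network errors and non-200 responses as dead links.
    url = 'https://www.bilibili.com/video/av' + str(av_id)
    try:
        resp = requests.get(url, timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False
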
@@ -16,17 +18,9 @@ import time
 import sys
 import re
 import json
-import MySQLdb
-
-reload(sys)
-
-sys.setdefaultencoding('utf-8')
-
-# id av cid title tminfo time click danmu coins favourites duration honor_click honor_coins honor_favourites
-# mid name article fans tags[3] common
+import pymysql
 
 urls = []
-
 head = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
 }
@@ -37,8 +31,17 @@ for i in range(17501, 100000):
     url = 'http://bilibili.com/video/av' + str(i)
     urls.append(url)
 
+# Connect to the database; select the 'python' db up front, since the old
+# conn.select_db('python') call inside spider() is dropped below
+conn = pymysql.connect(host='localhost',
+                       user='root',
+                       passwd='',
+                       db='python',
+                       port=3306,
+                       charset='utf8')
+cur = conn.cursor()
 
-def spider(url):
+def crawlVideo(url):
     html = requests.get(url, headers=head)
     selector = etree.HTML(html.text)
     content = selector.xpath("//html")
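
Note (editor's sketch, not part of this patch): pymysql connections are not thread-safe, and crawlVideo() runs on 10 threads that all share the one module-level cursor created above. One minimal way to keep the INSERTs safe is to serialize them behind a lock; the helper name is an assumption.

import threading

db_lock = threading.Lock()

def safe_execute(sql, params):
    # Only one thread at a time may touch the shared connection.
    with db_lock:
        cur.execute(sql, params)
        conn.commit()
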
@@ -177,34 +180,30 @@ def spider(url):
                 jsPages = jsData['page']
                 common = jsPages['acount']
                 try:
-                    conn = MySQLdb.connect(
-                        host='localhost', user='root', passwd='', port=3306, charset='utf8')
-                    cur = conn.cursor()
-                    conn.select_db('python')
                     cur.execute('INSERT INTO video VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                 [str(av), str(av), cid, title, tminfo, time, click, danmu, coins, favourites, duration,
                                  mid, name, article, fans, tag1, tag2, tag3, str(common), honor_click, honor_coins, honor_favourites])
-
-                    print "Succeed: av" + str(av)
-                except MySQLdb.Error, e:
-                    print "Mysql Error %d: %s" % (e.args[0], e.args[1])
+                    conn.commit()  # pymysql does not autocommit by default
+                    print("Succeed: av" + str(av))
+                except pymysql.Error as e:
+                    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
             else:
-                print "Error_Json: " + url
+                print("Error_Json: " + url)
         else:
-            print "Error_noCid:" + url
+            print("Error_noCid:" + url)
     else:
-        print "Error_404: " + url
-
-
-pool = ThreadPool(10)
-# results = pool.map(spider, urls)
-try:
-    results = pool.map(spider, urls)
-except Exception, e:
-    # print 'ConnectionError'
-    print e
-    time.sleep(300)
-    results = pool.map(spider, urls)
-
-pool.close()
-pool.join()
+        print("Error_404: " + url)
+
+if __name__ == "__main__":
+    # Run the crawl on 10 threads
+    pool = ThreadPool(10)
+    try:
+        results = pool.map(crawlVideo, urls)
+    except Exception as e:
+        # print 'ConnectionError'
+        print(e)
+        time.sleep(300)
+        results = pool.map(crawlVideo, urls)
+    conn.close()  # Close the database connection
+    pool.close()
+    pool.join()
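
Note (editor's sketch, not part of this patch): the INSERT above expects a 'video' table with 22 columns. The column names come from the comment list this patch removes ("id av cid title tminfo time click danmu ... mid name article fans tags[3] common"); every type below is an assumption, sketched as a one-off setup script.

import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='',
                       db='python', port=3306, charset='utf8')
cur = conn.cursor()
# 22 columns, in the same order as the INSERT's parameter list.
cur.execute('''CREATE TABLE IF NOT EXISTS video (
    id VARCHAR(20), av VARCHAR(20), cid VARCHAR(20), title VARCHAR(255),
    tminfo VARCHAR(100), time VARCHAR(50), click VARCHAR(20), danmu VARCHAR(20),
    coins VARCHAR(20), favourites VARCHAR(20), duration VARCHAR(20),
    mid VARCHAR(20), name VARCHAR(100), article VARCHAR(20), fans VARCHAR(20),
    tag1 VARCHAR(50), tag2 VARCHAR(50), tag3 VARCHAR(50), common VARCHAR(20),
    honor_click VARCHAR(20), honor_coins VARCHAR(20), honor_favourites VARCHAR(20)
)''')
conn.commit()
conn.close()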