liuyuqi-dellpc 6 years ago
commit
dc982a6fba
13 changed files with 612 additions and 0 deletions
  1. .gitignore (+3 -0)
  2. README.md (+17 -0)
  3. barrage/get_barrage.py (+65 -0)
  4. conf/user.sql (+26 -0)
  5. conf/video.sql (+25 -0)
  6. requirements.txt (+3 -0)
  7. threads.py (+31 -0)
  8. user/get_face.py (+19 -0)
  9. user/get_user.py (+155 -0)
  10. user/main.py (+39 -0)
  11. utils/__init__.py (+0 -0)
  12. utils/user_agent.py (+18 -0)
  13. video/get_video.py (+213 -0)

+ 3 - 0
.gitignore

@@ -0,0 +1,3 @@
+/.idea
+/__pycache__
+/data

+ 17 - 0
README.md

@@ -0,0 +1,17 @@
+## crawl_bilibili
+
+A bilibili crawler, mainly covering:
+
+Public user data (user/sex/region/registration time/)
+
+Video data ()
+
+Danmaku (bullet comment) data ()
+
+Comment data ()
+
+Payment data ()
+
+### Usage
+
+### Changelog
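
A plausible way to run the crawlers, inferred from the scripts in this commit (entry points and output paths are assumptions, not documented behavior):

    # Run from the repository root; each crawler is a standalone script.
    # python barrage/get_barrage.py   -> danmaku for one av id into <av>.txt
    # python user/get_user.py         -> user profiles into MySQL (schema: conf/user.sql)
    # python video/get_video.py       -> video metadata into MySQL (schema: conf/video.sql)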

+ 65 - 0
barrage/get_barrage.py

@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@File    :   get_barrage.py
+@Time    :   2019/05/15 17:10:38
+@Author  :   Liuyuqi 
+@Version :   1.0
+@Contact :   liuyuqi.gov@msn.cn
+@License :   (C)Copyright 2019
+@Desc    :   Danmaku (bullet comment) crawler
+'''
+
+from lxml import etree
+import requests
+import re
+
+head = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
+}
+
+
+def spider(av, f):
+    url = 'http://bilibili.com/video/av' + str(av)
+    print(url)
+    html = requests.get(url, headers=head)
+    selector = etree.HTML(html.text)
+    content = selector.xpath("//html")
+    for each in content:
+        title = each.xpath('//*[@id="viewbox_report"]/h1/span')
+        if title:
+            print(title[0].text)
+            cid_html_1 = each.xpath('//*[@id="link2"]/@value')
+            if cid_html_1:
+                cid_html = cid_html_1[0]
+                cids = re.findall(r'cid=.+&page', cid_html)
+                cid = cids[0].replace("cid=", "").replace("&page", "")
+                comment_url = 'http://comment.bilibili.com/' + \
+                    str(cid) + '.xml'
+                print(comment_url)
+                comment_text = requests.get(comment_url, headers=head)
+                comment_selector = etree.HTML(comment_text.content)
+                # Each danmaku is a <d> element in the XML payload.
+                comments = comment_selector.xpath('//d/text()')
+                for comment in comments:
+                    print(comment)
+                    f.write(comment + '\n')
+            else:
+                print('cid not found!')
+        else:
+            print('video not found!')
+
+
+if __name__ == '__main__':
+    av = input('input av:')
+    with open(av + '.txt', 'w', encoding='utf-8') as f:
+        spider(av, f)
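
The comment.bilibili.com/{cid}.xml payload stores each danmaku as a <d> element whose p attribute packs metadata, with the appear time in seconds as the first field. A minimal parsing sketch, assuming that historical layout:

    # Hedged sketch: turn a danmaku XML payload into (seconds, text) pairs.
    # Assumes <d p="time,mode,fontsize,color,timestamp,...">text</d>.
    from lxml import etree

    def parse_danmaku(xml_bytes):
        root = etree.fromstring(xml_bytes)
        for d in root.xpath('//d'):
            p = d.get('p', '')
            seconds = float(p.split(',')[0]) if p else 0.0
            yield seconds, (d.text or '')

    # e.g. for sec, text in parse_danmaku(comment_text.content): print(sec, text)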

+ 26 - 0
conf/user.sql

@@ -0,0 +1,26 @@
+DROP TABLE IF EXISTS `user`;
+CREATE TABLE `user` (
+  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
+  `mid` int(20) unsigned NOT NULL,
+  `name` varchar(45) NOT NULL,
+  `sex` varchar(45) NOT NULL,
+  `rank` varchar(45) NOT NULL,
+  `face` varchar(200) NOT NULL,
+  `regtime` varchar(45) NOT NULL,
+  `spacesta` varchar(45) NOT NULL,
+  `birthday` varchar(45) NOT NULL,
+  `sign` varchar(300) NOT NULL,
+  `level` varchar(45) NOT NULL,
+  `OfficialVerifyType` varchar(45) NOT NULL,
+  `OfficialVerifyDesc` varchar(100) NOT NULL,
+  `vipType` varchar(45) NOT NULL,
+  `vipStatus` varchar(45) NOT NULL,
+  `toutu` varchar(200) NOT NULL,
+  `toutuId` int(20) unsigned NOT NULL,
+  `coins` int(20) unsigned NOT NULL,
+  `following` int(20) unsigned NOT NULL,
+  `fans` int(20) unsigned NOT NULL,
+  `archiveview` int(20) unsigned NOT NULL,
+  `article` int(20) unsigned NOT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
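
A quick way to load this schema, assuming the same local MySQL credentials that user/get_user.py hard-codes:

    # Hedged sketch: execute conf/user.sql statement by statement via pymysql.
    import pymysql

    conn = pymysql.connect(host='localhost', user='root', password='123456',
                           db='bilibili', charset='utf8')
    with open('conf/user.sql') as f:
        for stmt in f.read().split(';'):  # naive split is fine for this file
            if stmt.strip():
                conn.cursor().execute(stmt)
    conn.commit()
    conn.close()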

+ 25 - 0
conf/video.sql

@@ -0,0 +1,25 @@
+CREATE TABLE `video` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `av` int(11) DEFAULT NULL,
+  `cid` int(11) DEFAULT NULL,
+  `title` varchar(150) DEFAULT NULL,
+  `tminfo` varchar(45) DEFAULT NULL,
+  `time` varchar(45) DEFAULT NULL,
+  `click` int(11) DEFAULT NULL,
+  `danmu` int(11) DEFAULT NULL,
+  `coins` int(11) DEFAULT NULL,
+  `favourites` int(11) DEFAULT NULL,
+  `duration` varchar(45) DEFAULT NULL,
+  `mid` int(11) DEFAULT NULL,
+  `name` varchar(45) DEFAULT NULL,
+  `article` int(11) DEFAULT NULL,
+  `fans` int(11) DEFAULT NULL,
+  `tag1` varchar(45) DEFAULT NULL,
+  `tag2` varchar(45) DEFAULT NULL,
+  `tag3` varchar(45) DEFAULT NULL,
+  `common` int(11) DEFAULT NULL,
+  `honor_click` int(11) DEFAULT NULL,
+  `honor_coins` int(11) DEFAULT NULL,
+  `honor_favourites` int(11) DEFAULT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;

+ 3 - 0
requirements.txt

@@ -0,0 +1,3 @@
+pymysql
+requests
+lxml

+ 31 - 0
threads.py

@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+'''
+Download multiple files on multiple threads; download a single file in
+segments on multiple threads.
+@Author :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+from threading import Lock
+from threading import Thread
+
+threadLock = Lock()
+threads = []
+
+
+class MyThread(Thread):
+    def __init__(self, name, func, *args, lock=False):
+        Thread.__init__(self)
+        self.name = name
+        self.func = func
+        self.args = args
+        self.lock = lock
+
+    def run(self):
+        print("Starting thread: " + self.name)
+        if self.lock:
+            # Context manager guarantees the lock is released on exceptions.
+            with threadLock:
+                self.func(*self.args)
+        else:
+            self.func(*self.args)
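
A minimal usage sketch for MyThread; fetch_file is a hypothetical stand-in for whatever work you want fanned out across worker threads:

    # Hedged sketch: run a function on worker threads via MyThread.
    def fetch_file(url):
        print("downloading " + url)

    for n, u in enumerate(["http://example.com/a", "http://example.com/b"]):
        t = MyThread("worker-%d" % n, fetch_file, u, lock=False)
        threads.append(t)
        t.start()

    for t in threads:
        t.join()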

+ 19 - 0
user/get_face.py

@@ -0,0 +1,19 @@
+# -*-coding:utf8-*-
+
+import re
+import urllib.request
+
+# Download the avatars listed in bilibili_user_face.txt, skipping the
+# default "noface" placeholder hosted under http://static.*
+with open("../data/bilibili_user_face.txt") as f:
+    for i, line in enumerate(f, start=1):
+        url = line.strip()
+        print(url)
+        if re.match(r'http://static.*', url):
+            print('noface:' + str(i))
+        else:
+            path = r"../data/face/" + str(i) + ".jpg"
+            urllib.request.urlretrieve(url, path)
+            print('succeed:' + str(i))

+ 155 - 0
user/get_user.py

@@ -0,0 +1,155 @@
+# -*-coding:utf8-*-
+
+import requests
+import json
+import random
+import pymysql
+import time
+from multiprocessing.dummy import Pool as ThreadPool
+
+
+def current_milli_time():
+    # Millisecond timestamp used as the cache-busting `_` query parameter.
+    return int(round(time.time() * 1000))
+
+
+def LoadUserAgents(uafile):
+    uas = []
+    with open(uafile, 'rb') as uaf:
+        for ua in uaf.readlines():
+            if ua:
+                uas.append(ua.strip()[:-1])
+    random.shuffle(uas)
+    return uas
+
+
+uas = LoadUserAgents("user_agents.txt")
+head = {
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
+    'X-Requested-With': 'XMLHttpRequest',
+    'Referer': 'http://space.bilibili.com/45388',
+    'Origin': 'http://space.bilibili.com',
+    'Host': 'space.bilibili.com',
+    'AlexaToolbar-ALX_NS_PH': 'AlexaToolbar/alx-4.0',
+    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4',
+    'Accept': 'application/json, text/javascript, */*; q=0.01',
+}
+
+# Please replace with your own proxy. A dict keeps only one value per
+# scheme (duplicate 'http' keys silently collapse to the last entry),
+# so rotate a list of proxies yourself if you have several.
+proxies = {
+    'http': 'http://120.26.110.59:8080',
+}
+time1 = time.time()
+
+urls = []
+
+# Please change the range data by yourself.
+for m in range(5214, 5215):
+    for i in range(m * 100, (m + 1) * 100):
+        url = 'https://space.bilibili.com/' + str(i)
+        urls.append(url)
+
+
+def getsource(url):
+    mid_str = url.replace('https://space.bilibili.com/', '')
+    payload = {
+        '_': current_milli_time(),
+        'mid': mid_str
+    }
+    ua = random.choice(uas)
+    head = {
+        'User-Agent': ua,
+        'Referer': url + '?from=search&seid=' + str(random.randint(10000, 50000))
+    }
+    jscontent = requests \
+        .session() \
+        .post('http://space.bilibili.com/ajax/member/GetInfo',
+              headers=head,
+              data=payload,
+              proxies=proxies) \
+        .text
+    time2 = time.time()
+    try:
+        jsDict = json.loads(jscontent)
+        if jsDict.get('status') and 'data' in jsDict:
+            jsData = jsDict['data']
+            mid = jsData['mid']
+            name = jsData['name']
+            sex = jsData['sex']
+            rank = jsData['rank']
+            face = jsData['face']
+            regtime = time.strftime("%Y-%m-%d %H:%M:%S",
+                                    time.localtime(jsData['regtime']))
+            spacesta = jsData['spacesta']
+            birthday = jsData.get('birthday', 'nobirthday')
+            sign = jsData['sign']
+            level = jsData['level_info']['current_level']
+            OfficialVerifyType = jsData['official_verify']['type']
+            OfficialVerifyDesc = jsData['official_verify']['desc']
+            vipType = jsData['vip']['vipType']
+            vipStatus = jsData['vip']['vipStatus']
+            toutu = jsData['toutu']
+            toutuId = jsData['toutuId']
+            coins = jsData['coins']
+            print("Succeed get user info: " + str(mid) + "\t" + str(time2 - time1))
+            try:
+                res = requests.get(
+                    'https://api.bilibili.com/x/relation/stat?vmid=' + str(mid) + '&jsonp=jsonp').text
+                viewinfo = requests.get(
+                    'https://api.bilibili.com/x/space/upstat?mid=' + str(mid) + '&jsonp=jsonp').text
+                js_fans_data = json.loads(res)
+                js_viewdata = json.loads(viewinfo)
+                following = js_fans_data['data']['following']
+                fans = js_fans_data['data']['follower']
+                archiveview = js_viewdata['data']['archive']['view']
+                article = js_viewdata['data']['article']['view']
+            except Exception:
+                following = 0
+                fans = 0
+                archiveview = 0
+                article = 0
+            try:
+                # Please write your MySQL's information.
+                conn = pymysql.connect(
+                    host='localhost', user='root', passwd='123456', db='bilibili', charset='utf8')
+                cur = conn.cursor()
+                # Parameterized INSERT into the `user` table created by
+                # conf/user.sql; `rank` is backticked (reserved word).
+                cur.execute('INSERT INTO `user` (mid, name, sex, `rank`, face, regtime, spacesta, '
+                            'birthday, sign, level, OfficialVerifyType, OfficialVerifyDesc, vipType, vipStatus, '
+                            'toutu, toutuId, coins, following, fans, archiveview, article) '
+                            'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
+                            (mid, name, sex, rank, face, regtime, spacesta,
+                             birthday, sign, level, OfficialVerifyType, OfficialVerifyDesc, vipType, vipStatus,
+                             toutu, toutuId, coins, following, fans, archiveview, article))
+                conn.commit()
+                conn.close()
+            except Exception as e:
+                print(e)
+        elif jsDict.get('status'):
+            print('no data now')
+        else:
+            print("Error: " + url)
+    except Exception as e:
+        print(e)
+
+if __name__ == "__main__":
+    pool = ThreadPool(1)
+    try:
+        results = pool.map(getsource, urls)
+    except Exception as e:
+        print(e)
+
+    pool.close()
+    pool.join()
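
If the pool size is raised above 1, some throttling is prudent; a hedged wrapper (the 0.5 s delay is an arbitrary politeness value, not from the source):

    def getsource_throttled(url):
        getsource(url)
        time.sleep(0.5)

    # results = pool.map(getsource_throttled, urls)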

+ 39 - 0
user/main.py

@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+'''
+@Author :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+import json
+import os
+import re
+from contextlib import closing
+
+import requests
+
+import DownloadProgress
+import user_agent
+
+# src = "D:/PycharmProjects/crawl_xuexi/"
+# os.chdir(src)
+
+
+
+def crawl():
+    # get_video_links and downloadVideo are presumably provided by the
+    # DownloadProgress module imported above (not part of this commit).
+    with open("data/ml.json", "r", encoding="utf8") as f:
+        mlData = json.loads(f.read())
+        for i in range(len(mlData["fpe1ki18v228w00"])):
+            frst_name = mlData["fpe1ki18v228w00"][i]["frst_name"].replace('\t', ' ')
+            static_page_url = mlData["fpe1ki18v228w00"][i]["static_page_url"]
+            # Open the mp4 video page with a User-Agent from the pool
+            # (getheaders() returns a shuffled list of UA strings).
+            resData = requests.get(static_page_url,
+                                   headers={"User-Agent": user_agent.getheaders()[0]}).content.decode("utf8")
+            preUrl = static_page_url.split("/")[3]
+            pattern = r'src="./data(.*?)"></script>'
+            url = "https://www.xuexi.cn/" + preUrl + "/data" + re.findall(pattern, resData, re.I)[0]
+            res = get_video_links(url)[0]
+            downloadVideo(res, file_name=frst_name)
+
+
+if __name__ == '__main__':
+    crawl()
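
The script expects get_video_links and downloadVideo from the DownloadProgress module, which is not included in this commit. Hypothetical stand-ins so the flow is clear (names, signatures, and the .mp4 pattern are all assumptions):

    import re
    import requests

    def get_video_links(data_url):
        # Assumption: the data JSON embeds direct .mp4 URLs.
        return re.findall(r'https?://[^"\']+?\.mp4', requests.get(data_url).text)

    def downloadVideo(video_url, file_name):
        with open(file_name + ".mp4", "wb") as f:
            f.write(requests.get(video_url).content)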

+ 0 - 0
utils/__init__.py


+ 18 - 0
utils/user_agent.py

@@ -0,0 +1,18 @@
+# -*-coding:utf-8 -*-
+
+import random
+
+# Return a randomly shuffled list of User-Agent strings.
+
+
+def getheaders():
+    useragent = []
+    with open("data/user_agents.txt", "rb") as f:
+        for ua in f.readlines():
+            if ua:
+                # strip whitespace, then drop the trailing character
+                # (each line in the UA list is assumed to end with a comma)
+                useragent.append(ua.strip()[:-1])
+    random.shuffle(useragent)  # randomize order
+    return useragent
+
+# print({"User-Agent": getheaders()[1]})
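
Wiring the pool into a request, assuming the repo root is on sys.path; note the entries are bytes because the file is opened in "rb" mode, which requests accepts as header values:

    # Hedged sketch: attach a pooled User-Agent to a request.
    import requests
    from utils import user_agent

    ua_pool = user_agent.getheaders()
    resp = requests.get("https://www.bilibili.com", headers={"User-Agent": ua_pool[0]})
    print(resp.status_code)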

+ 213 - 0
video/get_video.py

@@ -0,0 +1,213 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@File    :   get_video.py
+@Time    :   2019/05/15 17:09:18
+@Author  :   Liuyuqi 
+@Version :   1.0
+@Contact :   liuyuqi.gov@msn.cn
+@License :   (C)Copyright 2019
+@Desc    :   Crawl bilibili video metadata
+'''
+
+from lxml import etree
+from multiprocessing.dummy import Pool as ThreadPool
+import requests
+import time
+import re
+import json
+import pymysql
+
+# id av cid title tminfo time click danmu coins favourites duration honor_click honor_coins honor_favourites
+# mid name article fans tags[3] common
+
+urls = []
+
+head = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
+}
+
+time1 = time.time()
+
+for i in range(17501, 100000):
+    url = 'http://bilibili.com/video/av' + str(i)
+    urls.append(url)
+
+
+def spider(url):
+    html = requests.get(url, headers=head)
+    selector = etree.HTML(html.text)
+    content = selector.xpath("//html")
+    for each in content:
+        title = each.xpath('//div[@class="v-title"]/h1/@title')
+        if title:
+            av = url.replace("http://bilibili.com/video/av", "")
+            title = title[0]
+            tminfo1_log = each.xpath('//div[@class="tminfo"]/a/text()')
+            tminfo2_log = each.xpath('//div[@class="tminfo"]/span[1]/a/text()')
+            tminfo3_log = each.xpath('//div[@class="tminfo"]/span[2]/a/text()')
+            if tminfo1_log:
+                tminfo1 = tminfo1_log[0]
+            else:
+                tminfo1 = ""
+            if tminfo2_log:
+                tminfo2 = tminfo2_log[0]
+            else:
+                tminfo2 = ""
+            if tminfo3_log:
+                tminfo3 = tminfo3_log[0]
+            else:
+                tminfo3 = ""
+            tminfo = tminfo1 + '-' + tminfo2 + '-' + tminfo3
+            time_log = each.xpath('//div[@class="tminfo"]/time/i/text()')
+            mid_log = each.xpath('//div[@class="b-btn f hide"]/@mid')
+            name_log = each.xpath('//div[@class="usname"]/a/@title')
+            article_log = each.xpath(
+                '//div[@class="up-video-message"]/div[1]/text()')
+            fans_log = each.xpath(
+                '//div[@class="up-video-message"]/div[2]/text()')
+
+            if time_log:
+                # pub_time, not `time`, so the time module is not shadowed
+                pub_time = time_log[0]
+            else:
+                pub_time = ""
+            if mid_log:
+                mid = mid_log[0]
+            else:
+                mid = ""
+            if name_log:
+                name = name_log[0]
+            else:
+                name = ""
+            if article_log:
+                article = article_log[0].replace(u"投稿:", "")
+            else:
+                article = "-1"
+            if fans_log:
+                fans = fans_log[0].replace(u"粉丝:", "")
+            else:
+                fans = "-1"
+
+            tag1_log = each.xpath('//ul[@class="tag-list"]/li[1]/a/text()')
+            tag2_log = each.xpath('//ul[@class="tag-list"]/li[2]/a/text()')
+            tag3_log = each.xpath('//ul[@class="tag-list"]/li[3]/a/text()')
+            if tag1_log:
+                tag1 = tag1_log[0]
+            else:
+                tag1 = ""
+            if tag2_log:
+                tag2 = tag2_log[0]
+            else:
+                tag2 = ""
+            if tag3_log:
+                tag3 = tag3_log[0]
+            else:
+                tag3 = ""
+
+            cid_html_1 = each.xpath('//div[@class="scontent"]/iframe/@src')
+            cid_html_2 = each.xpath('//div[@class="scontent"]/script/text()')
+            if cid_html_1 or cid_html_2:
+                if cid_html_1:
+                    cid_html = cid_html_1[0]
+                else:
+                    cid_html = cid_html_2[0]
+
+                cids = re.findall(r'cid=.+&aid', cid_html)
+                cid = cids[0].replace("cid=", "").replace("&aid", "")
+                info_url = "http://interface.bilibili.com/player?id=cid:" + \
+                    str(cid) + "&aid=" + av
+                video_info = requests.get(info_url)
+                video_selector = etree.HTML(video_info.text)
+                for video_each in video_selector:
+                    click_log = video_each.xpath('//click/text()')
+                    danmu_log = video_each.xpath('//danmu/text()')
+                    coins_log = video_each.xpath('//coins/text()')
+                    favourites_log = video_each.xpath('//favourites/text()')
+                    duration_log = video_each.xpath('//duration/text()')
+                    honor_click_log = video_each.xpath(
+                        '//honor[@t="click"]/text()')
+                    honor_coins_log = video_each.xpath(
+                        '//honor[@t="coins"]/text()')
+                    honor_favourites_log = video_each.xpath(
+                        '//honor[@t="favourites"]/text()')
+
+                    if honor_click_log:
+                        honor_click = honor_click_log[0]
+                    else:
+                        honor_click = 0
+                    if honor_coins_log:
+                        honor_coins = honor_coins_log[0]
+                    else:
+                        honor_coins = 0
+                    if honor_favourites_log:
+                        honor_favourites = honor_favourites_log[0]
+                    else:
+                        honor_favourites = 0
+
+                    if click_log:
+                        click = click_log[0]
+                    else:
+                        click = -1
+                    if danmu_log:
+                        danmu = danmu_log[0]
+                    else:
+                        danmu = -1
+                    if coins_log:
+                        coins = coins_log[0]
+                    else:
+                        coins = -1
+                    if favourites_log:
+                        favourites = favourites_log[0]
+                    else:
+                        favourites = -1
+                    if duration_log:
+                        duration = duration_log[0]
+                    else:
+                        duration = ""
+
+                    json_url = "http://api.bilibili.com/x/reply?jsonp=jsonp&type=1&sort=0&pn=1&nohot=1&oid=" + av
+                    jsoncontent = requests.get(json_url, headers=head).content
+                    jsDict = json.loads(jsoncontent)
+                    if jsDict['code'] == 0:
+                        jsData = jsDict['data']
+                        jsPages = jsData['page']
+                        common = jsPages['acount']
+                        try:
+                            conn = pymysql.connect(
+                                host='localhost', user='root', passwd='', port=3306, charset='utf8')
+                            cur = conn.cursor()
+                            conn.select_db('python')
+                            cur.execute('INSERT INTO video VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
+                                        [str(av), str(av), cid, title, tminfo, pub_time, click, danmu, coins, favourites, duration,
+                                         mid, name, article, fans, tag1, tag2, tag3, str(common), honor_click, honor_coins, honor_favourites])
+                            conn.commit()
+                            print("Succeed: av" + str(av))
+                        except pymysql.Error as e:
+                            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
+                    else:
+                        print("Error_Json: " + url)
+            else:
+                print("Error_noCid:" + url)
+        else:
+            print("Error_404: " + url)
+
+
+pool = ThreadPool(10)
+try:
+    results = pool.map(spider, urls)
+except Exception as e:
+    # likely a connection error; wait five minutes, then retry once
+    print(e)
+    time.sleep(300)
+    results = pool.map(spider, urls)
+
+pool.close()
+pool.join()