liuyuqi-dellpc 6 years ago
commit
dc982a6fba
13 changed files with 612 additions and 0 deletions
  1. .gitignore (+3 -0)
  2. README.md (+17 -0)
  3. barrage/get_barrage.py (+65 -0)
  4. conf/user.sql (+26 -0)
  5. conf/video.sql (+25 -0)
  6. requirements.txt (+3 -0)
  7. threads.py (+31 -0)
  8. user/get_face.py (+19 -0)
  9. user/get_user.py (+155 -0)
  10. user/main.py (+39 -0)
  11. utils/__init__.py (+0 -0)
  12. utils/user_agent.py (+18 -0)
  13. video/get_video.py (+213 -0)

+ 3 - 0
.gitignore

@@ -0,0 +1,3 @@
+/.idea
+/__pycache__
+/data

+ 17 - 0
README.md

@@ -0,0 +1,17 @@
+## crawl_bilibili
+
+A bilibili crawler, mainly covering:
+
+Public user data (user/sex/region/registration time/)
+
+Video data ()
+
+Danmaku (bullet comment) data ()
+
+Comment data ()
+
+Payment data ()
+
+### Usage
+
+### Changelog
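
A plausible way to run the crawlers, inferred from the scripts in this commit (entry points and output paths are assumptions, not documented behavior):

    # Run from the repository root; each crawler is a standalone script.
    # python barrage/get_barrage.py   -> danmaku for one av id into <av>.txt
    # python user/get_user.py         -> user profiles into MySQL (schema: conf/user.sql)
    # python video/get_video.py       -> video metadata into MySQL (schema: conf/video.sql)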

+ 65 - 0
barrage/get_barrage.py

@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@File    :   get_barrage.py
+@Time    :   2019/05/15 17:10:38
+@Author  :   Liuyuqi 
+@Version :   1.0
+@Contact :   liuyuqi.gov@msn.cn
+@License :   (C)Copyright 2019
+@Desc    :   Danmaku (bullet comment) crawler
+'''
+
+from lxml import etree
+import requests
+import re
+
+head = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
+}
+
+
+def spider(av, f):
+    url = 'http://bilibili.com/video/av' + str(av)
+    print(url)
+    html = requests.get(url, headers=head)
+    selector = etree.HTML(html.text)
+    content = selector.xpath("//html")
+    for each in content:
+        title = each.xpath('//*[@id="viewbox_report"]/h1/span')
+        if title:
+            print(title[0].text)
+            cid_html_1 = each.xpath('//*[@id="link2"]/@value')
+            if cid_html_1:
+                cid_html = cid_html_1[0]
+                cids = re.findall(r'cid=.+&page', cid_html)
+                cid = cids[0].replace("cid=", "").replace("&page", "")
+                comment_url = 'http://comment.bilibili.com/' + \
+                    str(cid) + '.xml'
+                print(comment_url)
+                comment_text = requests.get(comment_url, headers=head)
+                comment_selector = etree.HTML(comment_text.content)
+                # Each danmaku is a <d> element in the XML payload.
+                comments = comment_selector.xpath('//d/text()')
+                for comment in comments:
+                    print(comment)
+                    f.write(comment + '\n')
+            else:
+                print('cid not found!')
+        else:
+            print('video not found!')
+
+
+if __name__ == '__main__':
+    av = input('input av:')
+    with open(av + '.txt', 'w', encoding='utf-8') as f:
+        spider(av, f)
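
The comment.bilibili.com/{cid}.xml payload stores each danmaku as a <d> element whose p attribute packs metadata, with the appear time in seconds as the first field. A minimal parsing sketch, assuming that historical layout:

    # Hedged sketch: turn a danmaku XML payload into (seconds, text) pairs.
    # Assumes <d p="time,mode,fontsize,color,timestamp,...">text</d>.
    from lxml import etree

    def parse_danmaku(xml_bytes):
        root = etree.fromstring(xml_bytes)
        for d in root.xpath('//d'):
            p = d.get('p', '')
            seconds = float(p.split(',')[0]) if p else 0.0
            yield seconds, (d.text or '')

    # e.g. for sec, text in parse_danmaku(comment_text.content): print(sec, text)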

+ 26 - 0
conf/user.sql

@@ -0,0 +1,26 @@
+DROP TABLE IF EXISTS `user`;
+CREATE TABLE `user` (
+  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
+  `mid` int(20) unsigned NOT NULL,
+  `name` varchar(45) NOT NULL,
+  `sex` varchar(45) NOT NULL,
+  `rank` varchar(45) NOT NULL,
+  `face` varchar(200) NOT NULL,
+  `regtime` varchar(45) NOT NULL,
+  `spacesta` varchar(45) NOT NULL,
+  `birthday` varchar(45) NOT NULL,
+  `sign` varchar(300) NOT NULL,
+  `level` varchar(45) NOT NULL,
+  `OfficialVerifyType` varchar(45) NOT NULL,
+  `OfficialVerifyDesc` varchar(100) NOT NULL,
+  `vipType` varchar(45) NOT NULL,
+  `vipStatus` varchar(45) NOT NULL,
+  `toutu` varchar(200) NOT NULL,
+  `toutuId` int(20) unsigned NOT NULL,
+  `coins` int(20) unsigned NOT NULL,
+  `following` int(20) unsigned NOT NULL,
+  `fans` int(20) unsigned NOT NULL,
+  `archiveview` int(20) unsigned NOT NULL,
+  `article` int(20) unsigned NOT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
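
A quick way to load this schema, assuming the same local MySQL credentials that user/get_user.py hard-codes:

    # Hedged sketch: execute conf/user.sql statement by statement via pymysql.
    import pymysql

    conn = pymysql.connect(host='localhost', user='root', password='123456',
                           db='bilibili', charset='utf8')
    with open('conf/user.sql') as f:
        for stmt in f.read().split(';'):  # naive split is fine for this file
            if stmt.strip():
                conn.cursor().execute(stmt)
    conn.commit()
    conn.close()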

+ 25 - 0
conf/video.sql

@@ -0,0 +1,25 @@
+CREATE TABLE `video` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `av` int(11) DEFAULT NULL,
+  `cid` int(11) DEFAULT NULL,
+  `title` varchar(150) DEFAULT NULL,
+  `tminfo` varchar(45) DEFAULT NULL,
+  `time` varchar(45) DEFAULT NULL,
+  `click` int(11) DEFAULT NULL,
+  `danmu` int(11) DEFAULT NULL,
+  `coins` int(11) DEFAULT NULL,
+  `favourites` int(11) DEFAULT NULL,
+  `duration` varchar(45) DEFAULT NULL,
+  `mid` int(11) DEFAULT NULL,
+  `name` varchar(45) DEFAULT NULL,
+  `article` int(11) DEFAULT NULL,
+  `fans` int(11) DEFAULT NULL,
+  `tag1` varchar(45) DEFAULT NULL,
+  `tag2` varchar(45) DEFAULT NULL,
+  `tag3` varchar(45) DEFAULT NULL,
+  `common` int(11) DEFAULT NULL,
+  `honor_click` int(11) DEFAULT NULL,
+  `honor_coins` int(11) DEFAULT NULL,
+  `honor_favourites` int(11) DEFAULT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;

+ 3 - 0
requirements.txt

@@ -0,0 +1,3 @@
+pymysql
+requests
+lxml

+ 31 - 0
threads.py

@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+'''
+Download multiple files on multiple threads; download a single file in
+segments on multiple threads.
+@Author :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+from threading import Lock
+from threading import Thread
+
+threadLock = Lock()
+threads = []
+
+
+class MyThread(Thread):
+    def __init__(self, name, func, *args, lock=False):
+        Thread.__init__(self)
+        self.name = name
+        self.func = func
+        self.args = args
+        self.lock = lock
+
+    def run(self):
+        print("Starting thread: " + self.name)
+        if self.lock:
+            # Context manager guarantees the lock is released on exceptions.
+            with threadLock:
+                self.func(*self.args)
+        else:
+            self.func(*self.args)
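
A minimal usage sketch for MyThread; fetch_file is a hypothetical stand-in for whatever work you want fanned out across worker threads:

    # Hedged sketch: run a function on worker threads via MyThread.
    def fetch_file(url):
        print("downloading " + url)

    for n, u in enumerate(["http://example.com/a", "http://example.com/b"]):
        t = MyThread("worker-%d" % n, fetch_file, u, lock=False)
        threads.append(t)
        t.start()

    for t in threads:
        t.join()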

+ 19 - 0
user/get_face.py

@@ -0,0 +1,19 @@
+# -*-coding:utf8-*-
+
+import re
+import urllib.request
+
+# Download the avatars listed in bilibili_user_face.txt, skipping the
+# default "noface" placeholder hosted under http://static.*
+with open("../data/bilibili_user_face.txt") as f:
+    for i, line in enumerate(f, start=1):
+        url = line.strip()
+        print(url)
+        if re.match(r'http://static.*', url):
+            print('noface:' + str(i))
+        else:
+            path = r"../data/face/" + str(i) + ".jpg"
+            urllib.request.urlretrieve(url, path)
+            print('succeed:' + str(i))

+ 155 - 0
user/get_user.py

@@ -0,0 +1,155 @@
+# -*-coding:utf8-*-
+
+import requests
+import json
+import random
+import pymysql
+import time
+from multiprocessing.dummy import Pool as ThreadPool
+
+
+def current_milli_time():
+    # Millisecond timestamp used as the cache-busting `_` query parameter.
+    return int(round(time.time() * 1000))
+
+
+def LoadUserAgents(uafile):
+    uas = []
+    with open(uafile, 'rb') as uaf:
+        for ua in uaf.readlines():
+            if ua:
+                uas.append(ua.strip()[:-1])
+    random.shuffle(uas)
+    return uas
+
+
+uas = LoadUserAgents("user_agents.txt")
+head = {
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
+    'X-Requested-With': 'XMLHttpRequest',
+    'Referer': 'http://space.bilibili.com/45388',
+    'Origin': 'http://space.bilibili.com',
+    'Host': 'space.bilibili.com',
+    'AlexaToolbar-ALX_NS_PH': 'AlexaToolbar/alx-4.0',
+    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4',
+    'Accept': 'application/json, text/javascript, */*; q=0.01',
+}
+
+# Please replace with your own proxy. A dict keeps only one value per
+# scheme (duplicate 'http' keys silently collapse to the last entry),
+# so rotate a list of proxies yourself if you have several.
+proxies = {
+    'http': 'http://120.26.110.59:8080',
+}
+time1 = time.time()
+
+urls = []
+
+# Please change the range data by yourself.
+for m in range(5214, 5215):
+    for i in range(m * 100, (m + 1) * 100):
+        url = 'https://space.bilibili.com/' + str(i)
+        urls.append(url)
+
+
+def getsource(url):
+    mid_str = url.replace('https://space.bilibili.com/', '')
+    payload = {
+        '_': current_milli_time(),
+        'mid': mid_str
+    }
+    ua = random.choice(uas)
+    head = {
+        'User-Agent': ua,
+        'Referer': url + '?from=search&seid=' + str(random.randint(10000, 50000))
+    }
+    jscontent = requests \
+        .session() \
+        .post('http://space.bilibili.com/ajax/member/GetInfo',
+              headers=head,
+              data=payload,
+              proxies=proxies) \
+        .text
+    time2 = time.time()
+    try:
+        jsDict = json.loads(jscontent)
+        if jsDict.get('status') and 'data' in jsDict:
+            jsData = jsDict['data']
+            mid = jsData['mid']
+            name = jsData['name']
+            sex = jsData['sex']
+            rank = jsData['rank']
+            face = jsData['face']
+            regtime = time.strftime("%Y-%m-%d %H:%M:%S",
+                                    time.localtime(jsData['regtime']))
+            spacesta = jsData['spacesta']
+            birthday = jsData.get('birthday', 'nobirthday')
+            sign = jsData['sign']
+            level = jsData['level_info']['current_level']
+            OfficialVerifyType = jsData['official_verify']['type']
+            OfficialVerifyDesc = jsData['official_verify']['desc']
+            vipType = jsData['vip']['vipType']
+            vipStatus = jsData['vip']['vipStatus']
+            toutu = jsData['toutu']
+            toutuId = jsData['toutuId']
+            coins = jsData['coins']
+            print("Succeed get user info: " + str(mid) + "\t" + str(time2 - time1))
+            try:
+                res = requests.get(
+                    'https://api.bilibili.com/x/relation/stat?vmid=' + str(mid) + '&jsonp=jsonp').text
+                viewinfo = requests.get(
+                    'https://api.bilibili.com/x/space/upstat?mid=' + str(mid) + '&jsonp=jsonp').text
+                js_fans_data = json.loads(res)
+                js_viewdata = json.loads(viewinfo)
+                following = js_fans_data['data']['following']
+                fans = js_fans_data['data']['follower']
+                archiveview = js_viewdata['data']['archive']['view']
+                article = js_viewdata['data']['article']['view']
+            except Exception:
+                following = 0
+                fans = 0
+                archiveview = 0
+                article = 0
+            try:
+                # Please write your MySQL's information.
+                conn = pymysql.connect(
+                    host='localhost', user='root', passwd='123456', db='bilibili', charset='utf8')
+                cur = conn.cursor()
+                # Parameterized INSERT into the `user` table created by
+                # conf/user.sql; `rank` is backticked (reserved word).
+                cur.execute('INSERT INTO `user` (mid, name, sex, `rank`, face, regtime, spacesta, '
+                            'birthday, sign, level, OfficialVerifyType, OfficialVerifyDesc, vipType, vipStatus, '
+                            'toutu, toutuId, coins, following, fans, archiveview, article) '
+                            'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
+                            (mid, name, sex, rank, face, regtime, spacesta,
+                             birthday, sign, level, OfficialVerifyType, OfficialVerifyDesc, vipType, vipStatus,
+                             toutu, toutuId, coins, following, fans, archiveview, article))
+                conn.commit()
+                conn.close()
+            except Exception as e:
+                print(e)
+        elif jsDict.get('status'):
+            print('no data now')
+        else:
+            print("Error: " + url)
+    except Exception as e:
+        print(e)
+
+if __name__ == "__main__":
+    pool = ThreadPool(1)
+    try:
+        results = pool.map(getsource, urls)
+    except Exception as e:
+        print(e)
+
+    pool.close()
+    pool.join()
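
If the pool size is raised above 1, some throttling is prudent; a hedged wrapper (the 0.5 s delay is an arbitrary politeness value, not from the source):

    def getsource_throttled(url):
        getsource(url)
        time.sleep(0.5)

    # results = pool.map(getsource_throttled, urls)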

+ 39 - 0
user/main.py

@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+'''
+@Author :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+import json
+import os
+import re
+from contextlib import closing
+
+import requests
+
+import DownloadProgress
+import user_agent
+
+# src = "D:/PycharmProjects/crawl_xuexi/"
+# os.chdir(src)
+
+
+
+def crawl():
+    # get_video_links and downloadVideo are presumably provided by the
+    # DownloadProgress module imported above (not part of this commit).
+    with open("data/ml.json", "r", encoding="utf8") as f:
+        mlData = json.loads(f.read())
+        for i in range(len(mlData["fpe1ki18v228w00"])):
+            frst_name = mlData["fpe1ki18v228w00"][i]["frst_name"].replace('\t', ' ')
+            static_page_url = mlData["fpe1ki18v228w00"][i]["static_page_url"]
+            # Open the mp4 video page with a User-Agent from the pool
+            # (getheaders() returns a shuffled list of UA strings).
+            resData = requests.get(static_page_url,
+                                   headers={"User-Agent": user_agent.getheaders()[0]}).content.decode("utf8")
+            preUrl = static_page_url.split("/")[3]
+            pattern = r'src="./data(.*?)"></script>'
+            url = "https://www.xuexi.cn/" + preUrl + "/data" + re.findall(pattern, resData, re.I)[0]
+            res = get_video_links(url)[0]
+            downloadVideo(res, file_name=frst_name)
+
+
+if __name__ == '__main__':
+    crawl()
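
The script expects get_video_links and downloadVideo from the DownloadProgress module, which is not included in this commit. Hypothetical stand-ins so the flow is clear (names, signatures, and the .mp4 pattern are all assumptions):

    import re
    import requests

    def get_video_links(data_url):
        # Assumption: the data JSON embeds direct .mp4 URLs.
        return re.findall(r'https?://[^"\']+?\.mp4', requests.get(data_url).text)

    def downloadVideo(video_url, file_name):
        with open(file_name + ".mp4", "wb") as f:
            f.write(requests.get(video_url).content)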

+ 0 - 0
utils/__init__.py


+ 18 - 0
utils/user_agent.py

@@ -0,0 +1,18 @@
+# -*-coding:utf-8 -*-
+
+import random
+
+# Return a randomly shuffled list of User-Agent strings.
+
+
+def getheaders():
+    useragent = []
+    with open("data/user_agents.txt", "rb") as f:
+        for ua in f.readlines():
+            if ua:
+                # strip whitespace, then drop the trailing character
+                # (each line in the UA list is assumed to end with a comma)
+                useragent.append(ua.strip()[:-1])
+    random.shuffle(useragent)  # randomize order
+    return useragent
+
+# print({"User-Agent": getheaders()[1]})
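
Wiring the pool into a request, assuming the repo root is on sys.path; note the entries are bytes because the file is opened in "rb" mode, which requests accepts as header values:

    # Hedged sketch: attach a pooled User-Agent to a request.
    import requests
    from utils import user_agent

    ua_pool = user_agent.getheaders()
    resp = requests.get("https://www.bilibili.com", headers={"User-Agent": ua_pool[0]})
    print(resp.status_code)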

+ 213 - 0
video/get_video.py

@@ -0,0 +1,213 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@File    :   get_video.py
+@Time    :   2019/05/15 17:09:18
+@Author  :   Liuyuqi 
+@Version :   1.0
+@Contact :   liuyuqi.gov@msn.cn
+@License :   (C)Copyright 2019
+@Desc    :   Crawl bilibili video metadata
+'''
+
+from lxml import etree
+from multiprocessing.dummy import Pool as ThreadPool
+import requests
+import time
+import re
+import json
+import pymysql
+
+# id av cid title tminfo time click danmu coins favourites duration honor_click honor_coins honor_favourites
+# mid name article fans tags[3] common
+
+urls = []
+
+head = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
+}
+
+time1 = time.time()
+
+for i in range(17501, 100000):
+    url = 'http://bilibili.com/video/av' + str(i)
+    urls.append(url)
+
+
+def spider(url):
+    html = requests.get(url, headers=head)
+    selector = etree.HTML(html.text)
+    content = selector.xpath("//html")
+    for each in content:
+        title = each.xpath('//div[@class="v-title"]/h1/@title')
+        if title:
+            av = url.replace("http://bilibili.com/video/av", "")
+            title = title[0]
+            tminfo1_log = each.xpath('//div[@class="tminfo"]/a/text()')
+            tminfo2_log = each.xpath('//div[@class="tminfo"]/span[1]/a/text()')
+            tminfo3_log = each.xpath('//div[@class="tminfo"]/span[2]/a/text()')
+            if tminfo1_log:
+                tminfo1 = tminfo1_log[0]
+            else:
+                tminfo1 = ""
+            if tminfo2_log:
+                tminfo2 = tminfo2_log[0]
+            else:
+                tminfo2 = ""
+            if tminfo3_log:
+                tminfo3 = tminfo3_log[0]
+            else:
+                tminfo3 = ""
+            tminfo = tminfo1 + '-' + tminfo2 + '-' + tminfo3
+            time_log = each.xpath('//div[@class="tminfo"]/time/i/text()')
+            mid_log = each.xpath('//div[@class="b-btn f hide"]/@mid')
+            name_log = each.xpath('//div[@class="usname"]/a/@title')
+            article_log = each.xpath(
+                '//div[@class="up-video-message"]/div[1]/text()')
+            fans_log = each.xpath(
+                '//div[@class="up-video-message"]/div[2]/text()')
+
+            if time_log:
+                # pub_time, not `time`, so the time module is not shadowed
+                pub_time = time_log[0]
+            else:
+                pub_time = ""
+            if mid_log:
+                mid = mid_log[0]
+            else:
+                mid = ""
+            if name_log:
+                name = name_log[0]
+            else:
+                name = ""
+            if article_log:
+                article = article_log[0].replace(u"投稿:", "")
+            else:
+                article = "-1"
+            if fans_log:
+                fans = fans_log[0].replace(u"粉丝:", "")
+            else:
+                fans = "-1"
+
+            tag1_log = each.xpath('//ul[@class="tag-list"]/li[1]/a/text()')
+            tag2_log = each.xpath('//ul[@class="tag-list"]/li[2]/a/text()')
+            tag3_log = each.xpath('//ul[@class="tag-list"]/li[3]/a/text()')
+            if tag1_log:
+                tag1 = tag1_log[0]
+            else:
+                tag1 = ""
+            if tag2_log:
+                tag2 = tag2_log[0]
+            else:
+                tag2 = ""
+            if tag3_log:
+                tag3 = tag3_log[0]
+            else:
+                tag3 = ""
+
+            cid_html_1 = each.xpath('//div[@class="scontent"]/iframe/@src')
+            cid_html_2 = each.xpath('//div[@class="scontent"]/script/text()')
+            if cid_html_1 or cid_html_2:
+                if cid_html_1:
+                    cid_html = cid_html_1[0]
+                else:
+                    cid_html = cid_html_2[0]
+
+                cids = re.findall(r'cid=.+&aid', cid_html)
+                cid = cids[0].replace("cid=", "").replace("&aid", "")
+                info_url = "http://interface.bilibili.com/player?id=cid:" + \
+                    str(cid) + "&aid=" + av
+                video_info = requests.get(info_url)
+                video_selector = etree.HTML(video_info.text)
+                for video_each in video_selector:
+                    click_log = video_each.xpath('//click/text()')
+                    danmu_log = video_each.xpath('//danmu/text()')
+                    coins_log = video_each.xpath('//coins/text()')
+                    favourites_log = video_each.xpath('//favourites/text()')
+                    duration_log = video_each.xpath('//duration/text()')
+                    honor_click_log = video_each.xpath(
+                        '//honor[@t="click"]/text()')
+                    honor_coins_log = video_each.xpath(
+                        '//honor[@t="coins"]/text()')
+                    honor_favourites_log = video_each.xpath(
+                        '//honor[@t="favourites"]/text()')
+
+                    if honor_click_log:
+                        honor_click = honor_click_log[0]
+                    else:
+                        honor_click = 0
+                    if honor_coins_log:
+                        honor_coins = honor_coins_log[0]
+                    else:
+                        honor_coins = 0
+                    if honor_favourites_log:
+                        honor_favourites = honor_favourites_log[0]
+                    else:
+                        honor_favourites = 0
+
+                    if click_log:
+                        click = click_log[0]
+                    else:
+                        click = -1
+                    if danmu_log:
+                        danmu = danmu_log[0]
+                    else:
+                        danmu = -1
+                    if coins_log:
+                        coins = coins_log[0]
+                    else:
+                        coins = -1
+                    if favourites_log:
+                        favourites = favourites_log[0]
+                    else:
+                        favourites = -1
+                    if duration_log:
+                        duration = duration_log[0]
+                    else:
+                        duration = ""
+
+                    json_url = "http://api.bilibili.com/x/reply?jsonp=jsonp&type=1&sort=0&pn=1&nohot=1&oid=" + av
+                    jsoncontent = requests.get(json_url, headers=head).content
+                    jsDict = json.loads(jsoncontent)
+                    if jsDict['code'] == 0:
+                        jsData = jsDict['data']
+                        jsPages = jsData['page']
+                        common = jsPages['acount']
+                        try:
+                            conn = pymysql.connect(
+                                host='localhost', user='root', passwd='', port=3306, charset='utf8')
+                            cur = conn.cursor()
+                            conn.select_db('python')
+                            cur.execute('INSERT INTO video VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
+                                        [str(av), str(av), cid, title, tminfo, pub_time, click, danmu, coins, favourites, duration,
+                                         mid, name, article, fans, tag1, tag2, tag3, str(common), honor_click, honor_coins, honor_favourites])
+                            conn.commit()
+                            print("Succeed: av" + str(av))
+                        except pymysql.Error as e:
+                            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
+                    else:
+                        print("Error_Json: " + url)
+            else:
+                print("Error_noCid:" + url)
+        else:
+            print("Error_404: " + url)
+
+
+pool = ThreadPool(10)
+try:
+    results = pool.map(spider, urls)
+except Exception as e:
+    # likely a connection error; wait five minutes, then retry once
+    print(e)
+    time.sleep(300)
+    results = pool.map(spider, urls)
+
+pool.close()
+pool.join()