
Merge branch 'develop' of lyq/crawl-bilibili into master

liuyuqi, 5 years ago
parent · commit 55cc9dbd16

+ 2 - 1
README.md

@@ -1,6 +1,6 @@
 ## crawl_bilibili
 
- bilibili crawler, mainly covering:
+bilibili crawler. Real-time data is not needed, so the crawler is throttled to 6 requests per minute. **That works out to roughly 250,000 users per month.** Because of the long time span, resumable (checkpointed) crawling is supported (a minimal sketch of such a loop follows this diff). It mainly covers:
 
 Public user data (user / gender / region / registration time /)
 
@@ -48,3 +48,4 @@ python barrage/get_barrage.py
 
 ### Update history
 
+20190907 Added danmaku download for episodes 1-30 of the Tencent Video drama "All Is Well" (《都挺好》).
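The resumable, rate-limited crawl described in the README above is not spelled out anywhere in this commit. A minimal sketch of such a loop, assuming a hypothetical `crawl_user(mid)` callable and a `checkpoint.txt` file (neither is part of this repo):

```python
import time

CHECKPOINT = "checkpoint.txt"   # hypothetical checkpoint file, not part of this repo
INTERVAL = 60 / 6               # the README's rate of 6 requests per minute


def load_checkpoint(start=521400):
    # Resume from the last crawled mid if a checkpoint exists, otherwise start fresh
    try:
        with open(CHECKPOINT) as f:
            return int(f.read().strip())
    except (FileNotFoundError, ValueError):
        return start


def save_checkpoint(mid):
    with open(CHECKPOINT, "w") as f:
        f.write(str(mid))


def run(crawl_user, end=999999):
    mid = load_checkpoint()
    while mid <= end:
        crawl_user(mid)        # assumed to fetch and store one user's public data
        save_checkpoint(mid)   # record progress so an interrupted run can resume
        mid += 1
        time.sleep(INTERVAL)   # throttle to roughly 6 users per minute
```

At 6 users per minute this gives about 260,000 users in 30 days, which matches the README's estimate.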

+ 11 - 0
conf/mysql.conf

@@ -0,0 +1,11 @@
+[db1]
+host = h5.yoqi.me
+port = 3306
+user = root
+pwd = 123456
+database = bilibili
+charset = utf8
+
+[project]
+workspace = C:/Users/liuyuqi/Desktop/crawl-bilibili
+

+ 24 - 13
user/get_user.py

@@ -7,28 +7,35 @@
 @Version :   1.0
 @Contact :   liuyuqi.gov@msn.cn
 @License :   (C)Copyright 2019
-@Desc    :   Crawl user info. Endpoint:
-https://space.bilibili.com/521400
+@Desc    :   Crawl user info. Endpoint: https://space.bilibili.com/521400 ; the trailing number is enumerated to collect every user's name, gender, age and other public fields.
+
 http://space.bilibili.com/ajax/member/GetInfo
 """
 
+import sys
+import os
+src = "C:/Users/liuyuqi/Desktop/crawl-bilibili"
+os.chdir(src)
+sys.path.append(src)
+
+import utils.config as conf
+from utils.user_agent import getheaders
+
 import requests
 import json
 import random
 import pymysql
 import datetime
 import time
-import os, sys
 
-src = "C:/Users/liuyuqi/Desktop/crawl_bilibili"
-os.chdir(src)
-sys.path.append(src)
 
-from utils.user_agent import getheaders
+
 
 # Connect to the database
 conn = pymysql.connect(
-    host="192.168.99.100", user="root", passwd="123456", db="bilibili", charset="utf8"
+    host=conf.readConf("db1", "host"), user=conf.readConf("db1", "user"), passwd=conf.readConf("db1", "pwd"), db="bilibili", charset="utf8"
 )
 cur = conn.cursor()
 # cur.execute("sql")
@@ -96,7 +103,8 @@ def getsource(url, i):
                 regtime = "2018-05-06 12:22:23"
                 spacesta = jsData["spacesta"]
                 birthday = (
-                    jsData["birthday"] if "birthday" in jsData.keys() else "nobirthday"
+                    jsData["birthday"] if "birthday" in jsData.keys(
+                    ) else "nobirthday"
                 )
                 sign = jsData["sign"]
                 level = jsData["level_info"]["current_level"]
@@ -107,7 +115,8 @@ def getsource(url, i):
                 toutu = jsData["toutu"]
                 toutuId = jsData["toutuId"]
                 coins = jsData["coins"]
-                print("Succeed get user info: " + str(mid) + "\t" + str(time2 - time1))
+                print("Succeed get user info: " +
+                      str(mid) + "\t" + str(time2 - time1))
                 try:
                     res = requests.get(
                         "https://api.bilibili.com/x/relation/stat?vmid="
@@ -179,8 +188,11 @@ def crawlUser():
     param :
     return:
     """
+    # Fetch the resume index so the crawl can pick up where it left off
+    cur.execute("sql")  # placeholder statement, not yet implemented (see the sketch after this diff)
+    res = conn.commit()
     m = 5214
-    for i in range(m * 100, ((m * 100 )+ 1)):  # range(521400,521500)
+    for i in range(m * 100, ((m * 100) + 1)):  # range(521400,521500)
         url = "https://space.bilibili.com/" + str(i)
         # urls.append(url)
         getsource(url, i)
@@ -188,5 +200,4 @@ def crawlUser():
 
 if __name__ == "__main__":
     src = "C:/Users/liuyuqi/Desktop/crawl_bilibili"
-    os.chdir(src)
-    crawlUser()
+    crawlUser()
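The placeholder `cur.execute("sql")` in `crawlUser` presumably fetches the resume index mentioned in the README. A minimal sketch of that query, assuming a `user` table with a `mid` column (the schema is not shown in this commit):

```python
def get_resume_mid(cur, default=521400):
    # Resume from the largest mid already stored; fall back to the default start value
    cur.execute("SELECT MAX(mid) FROM user")  # table and column names are assumptions
    row = cur.fetchone()
    return int(row[0]) + 1 if row and row[0] is not None else default
```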

+ 97 - 0
utils/config.py

@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/07/23 10:58:48
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   Database configuration helpers
+'''
+
+import configparser
+import os
+import pymysql
+
+src = "C:/Users/liuyuqi/Desktop/crawl-bilibili"
+os.chdir(src)
+
+conf_dir = "conf/"
+config_file = os.path.join(conf_dir, "mysql.conf")
+section_name = "db1"
+
+
+def writeConf(user, pwd, database, host="localhost", port=3306, charset="utf8"):
+    cf = configparser.RawConfigParser()
+
+    cf.add_section('db1')
+    cf.set('db1', 'host', host)
+    cf.set('db1', 'port', str(port))  # configparser only accepts string values
+    cf.set('db1', 'user', user)
+    cf.set('db1', 'pwd', pwd)
+    cf.set("db1", "database", database)
+    cf.set("db1", "charset", charset)
+
+    cf.add_section("project")
+    cf.set("project", "workspace", "C:/Users/liuyuqi/Desktop/crawl-bilibili")
+    with open(config_file, 'w') as configfile:
+        cf.write(configfile)
+
+
+def readConf(section, key):
+    config = configparser.RawConfigParser()
+    config.read(config_file)
+    # Return the value so callers such as get_user.py can use it
+    return config.get(section, key)
+
+
+def readSQL(path):
+    with open(path, "r", encoding="utf-8") as f:
+        sql = ""
+        for line in f.readlines():
+            if not line or line == "\n":
+                continue
+            sql = sql+line
+        return sql
+
+def getDBServer():
+    pass
+
+def getWorkSpace():
+    return readConf("project", "workspace")
+
+def initDB1():
+    '''
+    Import the schema with pymysql. pymysql cannot execute a whole .sql file in one call,
+    so the file has to be read and its statements executed one by one (see the sketch after this diff).
+    '''
+    conn = pymysql.connect("localhost", "lyq", "123456", "bilibili")
+    cursor = conn.cursor()
+
+    # Create the database first if it does not exist
+    # cursor.execute("CREATE DATABASE  bilibili;")
+
+    userSQL = os.path.join(conf_dir, "user.sql")
+    sql = readSQL(userSQL)
+    cursor.execute(sql)
+
+    videoSQL = os.path.join(conf_dir, "video.sql")
+    sql = readSQL(videoSQL)
+    cursor.execute(sql)
+
+
+def initDB():
+    '''
+    Import the schema by shelling out to the mysql command-line client.
+    '''
+    userSQL = os.path.join(conf_dir, "user.sql")
+    videoSQL = os.path.join(conf_dir, "video.sql")
+    os.system(
+        "D:/Program-Files/MySQL/mysql-5.7.17-winx64/bin/mysql.exe -uroot -p123456  --default-character-set=utf8 bilibili < "+userSQL)
+    os.system(
+        "D:/Program-Files/MySQL/mysql-5.7.17-winx64/bin/mysql.exe -uroot -p123456  --default-character-set=utf8 bilibili < "+videoSQL)
+
+if __name__ == '__main__':
+    writeConf("root", "123456", "bilibili", host="h5.yoqi.me")
+    print(readConf("db1", "host"))
+    # initDB()
+
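As the `initDB1` docstring notes, pymysql's `execute()` runs a single statement at a time, so a whole .sql file has to be split first. A minimal sketch of that split, reusing `readSQL` and assuming statements are separated by semicolons:

```python
def run_sql_file(cursor, path):
    # readSQL returns the whole file as one string; split it on ';' and
    # execute the statements one by one, skipping empty fragments.
    for statement in readSQL(path).split(";"):
        if statement.strip():
            cursor.execute(statement)
```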

+ 143 - 0
video/get_barrage.py

@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/09/07 19:00:20
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   Danmaku (bullet comment) download for episodes 1-30 of "All Is Well" (《都挺好》)
+'''
+import requests
+import json
+import pandas as pd
+import os
+import time
+import random
+
+
+# Parse the basic page info and extract the suffix IDs, view counts and episode numbers needed to build the danmaku URLs.
+def parse_base_info(url, headers):
+    df = pd.DataFrame()
+
+    html = requests.get(url, headers=headers)
+    bs = json.loads(html.text[html.text.find('{'):-1])
+
+    for i in bs['results']:
+        v_id = i['id']
+        title = i['fields']['title']
+        view_count = i['fields']['view_all_count']
+        episode = int(i['fields']['episode'])
+        if episode == 0:
+            pass
+        else:
+            cache = pd.DataFrame({'id': [v_id], 'title': [title], '播放量': [
+                                 view_count], '第几集': [episode]})
+            df = pd.concat([df, cache])
+    return df
+
+
+# Given a suffix ID (v_id), fetch and return that episode's target_id.
+def get_episode_danmu(v_id, headers):
+    base_url = 'https://access.video.qq.com/danmu_manage/regist?vappid=97767206&vsecret=c0bdcbae120669fff425d0ef853674614aa659c605a613a4&raw=1'
+    pay = {"wRegistType": 2, "vecIdList": [v_id],
+           "wSpeSource": 0, "bIsGetUserCfg": 1,
+           "mapExtData": {v_id: {"strCid": "wu1e7mrffzvibjy", "strLid": ""}}}
+    html = requests.post(base_url, data=json.dumps(pay), headers=headers)
+    bs = json.loads(html.text)
+    danmu_key = bs['data']['stMap'][v_id]['strDanMuKey']
+    target_id = danmu_key[danmu_key.find(
+        'targetid') + 9: danmu_key.find('vid') - 1]
+    return [v_id, target_id]
+
+
+# Parse one danmaku page; takes target_id, v_id (suffix ID) and the episode number (for matching), and returns the danmaku records.
+def parse_danmu(url, target_id, v_id, headers, period):
+    html = requests.get(url, headers=headers)
+    bs = json.loads(html.text, strict=False)
+    df = pd.DataFrame()
+    for i in bs['comments']:
+        content = i['content']
+        name = i['opername']
+        upcount = i['upcount']
+        user_degree = i['uservip_degree']
+        timepoint = i['timepoint']
+        comment_id = i['commentid']
+        cache = pd.DataFrame({'用户名': [name], '内容': [content], '会员等级': [user_degree],
+                              '弹幕时间点': [timepoint], '弹幕点赞': [upcount], '弹幕id': [comment_id], '集数': [period]})
+        df = pd.concat([df, cache])
+    return df
+
+
+# Build the paged danmaku URLs for one episode from target_id and v_id; paging is done by stepping the timestamp parameter.
+def format_url(target_id, v_id, end=85):
+    urls = []
+    base_url = 'https://mfm.video.qq.com/danmu?otype=json&timestamp={}&target_id={}%26vid%3D{}&count=80&second_count=5'
+
+    for num in range(15, end * 30 + 15, 30):
+        url = base_url.format(num, target_id, v_id)
+        urls.append(url)
+    return urls
+
+
+def get_all_ids(part1_url, part2_url, headers):
+    # Fetch all suffix IDs (v_id) for episodes 1-30 and 31-46 separately
+    part_1 = parse_base_info(part1_url, headers)
+    part_2 = parse_base_info(part2_url, headers)
+    df = pd.concat([part_1, part_2])
+    df.sort_values('第几集', ascending=True, inplace=True)
+    count = 1
+    # Collect the target_ids in a list
+    info_lst = []
+    for i in df['id']:
+        info = get_episode_danmu(i, headers)
+        info_lst.append(info)
+        print('正在努力爬取第 %d 集的target_id' % count)
+        count += 1
+        time.sleep(2 + random.random())
+    print('是不是发现多了一集?别担心,会去重的')
+    # Merge the target_id list with the episode table on the suffix ID
+    info_lst = pd.DataFrame(info_lst)
+    info_lst.columns = ['v_id', 'target_id']
+    combine = pd.merge(df, info_lst, left_on='id',
+                       right_on='v_id', how='inner')
+    # Drop duplicate rows
+    combine = combine.loc[combine.duplicated('id') == False, :]
+    return combine
+
+
+# Takes the table containing v_id and target_id, plus how many episodes to crawl.
+def crawl_all(combine, num, page, headers):
+    c = 1
+    final_result = pd.DataFrame()
+    #print('Bro,马上要开始循环爬取每一集的弹幕了')
+    for v_id, target_id in zip(combine['v_id'][:num], combine['target_id'][:num]):
+        count = 1
+        urls = format_url(target_id, v_id, page)
+        for url in urls:
+            result = parse_danmu(url, target_id, v_id, headers, c)
+            final_result = pd.concat([final_result, result])
+            time.sleep(2 + random.random())
+            print('这是 %d 集的第 %d 页爬取..' % (c, count))
+            count += 1
+        print('-------------------------------------')
+        c += 1
+    return final_result
+
+
+if __name__ == '__main__':
+
+    # URLs for episodes 1-30 and episodes 31-46 of "All Is Well"
+    # To crawl another show, find the source URL that stores its suffix IDs as described in the original article.
+    part1_url = 'https://union.video.qq.com/fcgi-bin/data?otype=json&tid=682&appid=20001238&appkey=6c03bbe9658448a4&idlist=x003061htl5,t00306i1e62,x003061htl5,b0030velala,w0030ilim7z,i0030r7v63u,z003044noq2,m0030sfinyr,c0030u884k7,k0030m5zbr7,l0030e5nglm,h0030b060vn,j003090ci7w,n0030falyoi,s00308u9kwx,p0030fohijf,g00303ob0cx,v0030960y6n,x0030bl84xw,v0030keuav1,t0030kups1i,n0030y2o52i,x0030s52mev,d0030xuekgw,o0030md1a2a,x0030peo3sk,d00303l5j4k,t0030aexmnt,a0030ybi45z,y0030wpe2wu&callback=jQuery191020844423583354543_1554200358596&_=1554200358597'
+    part2_url = 'https://union.video.qq.com/fcgi-bin/data?otype=json&tid=682&appid=20001238&appkey=6c03bbe9658448a4&idlist=t0030epjqsi,g003035mi84,n00301fxqbh,h0030zivlrq,d0030qc1yu2,m0030q9ywxj,h0030j0eq19,j0030jks835,a00308xw434,l0030tb319m,x0030xogl32,g0030fju3w3,a0030vrcww0,l0030jzi1mi,c0030mq8yjr,u00302fdo8v,a0030w9g57k,n0030wnj6i8,j0030h91ouj,j00304eu73n,t00305kc1f5,i0030x490o2,u0030jtmlj2,d003031ey5h,w0850w594k6,l0854pfn9lg,f08546r7l7a,d0854s0oq1z,m08546pcd9k,p0854r1nygj&callback=jQuery191020844423583354543_1554200358598&_=1554200358599'
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
+
+    # Get all suffix IDs, then fetch the target_id for each
+    combine = get_all_ids(part1_url, part2_url, headers)
+
+    # num = how many episodes to crawl, page = how many danmaku pages per episode (1-85); by default this crawls 5 pages of episode 1.
+    # For example, to crawl all 30 episodes with 85 pages each, set num = 30 and page = 85.
+    final_result = crawl_all(combine, num=1, page=5, headers=headers)
+    final_result.to_excel('xxx.xlsx')

+ 1 - 1
video/get_video.py → video/get_video_info.py

@@ -40,7 +40,7 @@ conn = pymysql.connect(host='localhost',
 cur = conn.cursor()
 
 def crawlVideo(url):
-    html = requests.get(url, headers=head)
+    html = requests.get(url, headers=head)
     selector = etree.HTML(html.text)
     content = selector.xpath("//html")
     for each in content:

+ 0 - 1
video_download/Dockerfile

@@ -1,7 +1,6 @@
 FROM python:3.6
 
 LABEL Name=bilibili_video_download Version=1.0.1
-EXPOSE 3000
 WORKDIR /app
 
 RUN mkdir -p ~/.pip

+ 33 - 0
video_download/Pic/download_bangumi.py

@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/09/03 04:31:23
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   Documentary download
+'''
+import requests
+import os
+import sys
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
+}
+
+
+def down_video():
+    pass
+
+
+def crawl_bangumi():
+    '''
+    Download bangumi (overseas anime) episodes
+    '''
+    down_video()
+
+
+if __name__ == "__main__":
+    print('*' * 30 + 'B站视频下载小助手' + '*' * 30)
+    crawl_bangumi()

+ 52 - 44
video_download/README.md

@@ -1,61 +1,69 @@
 Bilibili (b站) video download ![enter image description here](Pic/logo.png)
 ===========================
-![](https://img.shields.io/badge/Python-3.6.3-green.svg) ![](https://img.shields.io/badge/requests-2.18.4-green.svg) ![](https://img.shields.io/badge/moviepy-0.2.3.2-green.svg)
-### Bilibili official site - https://www.bilibili.com/
-|Author|:sunglasses:Henryhaohao:sunglasses:|
-|---|---
-|Email|:hearts:1073064953@qq.com:hearts:
-
-    
-****
-## :dolphin:Disclaimer
-### This software is for learning and exchange only. Please do not use it for any commercial purpose. Thanks!
+
 ## :dolphin:Introduction
 ### This project downloads videos from [Bilibili (b站)](https://www.bilibili.com/) (multi-part videos are supported!)
- **For single-part videos: just pass the av number or the video URL (e.g. 49842011 or https://www.bilibili.com/video/av49842011)**
- **For multi-part videos:**
-  > **1. Download all parts: pass the av number or the video URL (e.g. 49842011 or https://www.bilibili.com/video/av49842011)**<br>
-  > **2. Download a single part: pass that part's URL (e.g. https://www.bilibili.com/video/av19516333/?p=2)**
-## :dolphin:Downloader versions
-- **Version 1: bilibili_video_download_v1.py**
-  > **Signed-API version; no cookie needed, downloads 1080p directly<br>**
-- **Version 2: bilibili_video_download_v2.py**
-  > **1. Unsigned-API version; the SESSDATA field from a logged-in cookie is required to download 720p and above**<br>
-  > **2. To download 1080p+ you need the SESSDATA from a Bilibili premium (大会员) account's cookie; an ordinary account's SESSDATA can download at most 1080p. Replace the SESSDATA value in the cookie on line 31 of the code regularly: log in to Bilibili in a browser, open developer tools (F12) --> Application --> Cookies, find the SESSDATA value and paste it in; it stays valid for about a month**
-- **Version 3: bilibili_video_download_v3.py**
-  > **An upgraded version 2: multithreaded (Threading) downloading, much faster!**<br>
-## :dolphin:Environment
-Version: Python3
-## :dolphin:Install dependencies
-```
-pip3 install -r requirements.txt
-
-```
-## :dolphin:Screenshots
-> - **Downloading**<br><br>
-![enter image description here](Pic/run.png)
-> - **Download finished**<br><br>
-![enter image description here](Pic/video.png)
-## :dolphin:**Summary**
-> **Finally, if you find this project useful or helpful, please give it a Star; it is encouragement along my learning path!<br>
- Hahaha, thanks everyone!**:cupid::cupid:
+- For single-part videos: just pass the av number or the video URL (e.g. 49842011 or https://www.bilibili.com/video/av49842011)
+- For multi-part videos:
+  > 1. Download all parts: pass the av number or the video URL (e.g. 49842011 or https://www.bilibili.com/video/av49842011)<br>
+  > 2. Download a single part: pass that part's URL (e.g. https://www.bilibili.com/video/av19516333/?p=2)
 
+## :dolphin:Downloader versions
+- Version 3: bilibili_video_download_v3.py
+  > Signed-API version; no cookie needed, downloads 1080p directly<br>
+- Version 2: bilibili_video_download_v2.py
+  > 1. Unsigned-API version; the SESSDATA field from a logged-in cookie is required to download 720p and above<br>
+  > 2. To download 1080p+ you need the SESSDATA from a Bilibili premium (大会员) account's cookie; an ordinary account's SESSDATA can download at most 1080p. Replace the SESSDATA value in the cookie on line 31 of the code regularly: log in to Bilibili in a browser, open developer tools (F12) --> Application --> Cookies, find the SESSDATA value and paste it in; it stays valid for about a month
 
 ## Run with Docker
 
 Install Docker on Windows, then:
 
-git clone https://xxx 本项目
+    git clone https://xxx    # this project
+    cd video_download
+    //wget https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg.linux64
+    wget http://cdn.yoqi.me/direct/2019-07-09/ffmpeg.linux64
+    docker build -t bilibili_video_download:latest .
+    docker-compose up -d
+    docker exec -it a93ac1 /bin/sh -c "[ -e /bin/bash ] && /bin/bash || /bin/sh"
+    python3 download_v2.py
+
+## Bilibili videos
+
+User-uploaded videos (music, dance, tech, digital, etc.):
+
+A user can upload many videos. To organize them, the user can create multiple channels, each holding different videos. In addition, a single video can be split into several parts ("视频选集").
+
+https://www.bilibili.com/video/av66302052
+
+https://www.bilibili.com/video/av65216716
+
+https://www.bilibili.com/video/av19516333/?p=2
+
+?p=2 selects a part of the video. av19516333 is the av number (aid); the part ids (cid) can be fetched through the API, and each part is downloaded like a normal video (see the sketch after this diff).
+
+
+Paid videos and documentaries cannot be downloaded.
+
+Cinema and anime (bangumi):
+
+https://www.bilibili.com/bangumi/play/ss28186
+
+https://www.bilibili.com/bangumi/play/ep281758
+
+https://www.bilibili.com/bangumi/play/ss27001
+
+https://www.bilibili.com/bangumi/play/ss27002
 
-wget https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg.linux64
+Live streaming:
 
-wget http://cdn.yoqi.me/direct/2019-07-09/ffmpeg.linux64
+https://live.bilibili.com/1562910
 
-docker build -t bilibili_video_download:latest .
 
-docker-compose up -d
+Manga:
 
-docker exec -it a93ac1 /bin/sh -c "[ -e /bin/bash ] && /bin/bash || /bin/sh"
+https://manga.bilibili.com/mc26826/348428
 
-python3 bilibili_video_download_v1.py
+## API overview
 
+http://api.bilibili.com/
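As the README above explains, the av number (aid) maps to one cid per part. A minimal sketch of that lookup using the `x/web-interface/view` endpoint already used elsewhere in this repo (the helper name `get_cids` is ours, not the project's):

```python
import re
import requests


def get_cids(av_or_url):
    # Accept either a bare av number or a full URL such as
    # https://www.bilibili.com/video/av19516333/?p=2
    s = str(av_or_url)
    aid = s if s.isdigit() else re.search(r'/av(\d+)', s).group(1)
    api = 'https://api.bilibili.com/x/web-interface/view?aid=' + aid
    headers = {'User-Agent': 'Mozilla/5.0'}
    data = requests.get(api, headers=headers).json()['data']
    # Each entry in data['pages'] is one part ("视频选集") with its own cid
    return [(p['page'], p['cid'], p['part']) for p in data['pages']]
```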

+ 0 - 220
video_download/bilibili_video_download_v1.py

@@ -1,220 +0,0 @@
-# !/usr/bin/python
-# -*- coding:utf-8 -*-
-# time: 2019/04/17--08:12
-__author__ = 'Henry'
-
-
-'''
-Project: Bilibili video download
-
-Version 1: signed-API version; no cookie needed, downloads 1080p directly
-
-20190422 - added downloading a single part of a multi-part video
-'''
-import imageio
-imageio.plugins.ffmpeg.download()
-
-import requests, time, hashlib, urllib.request, re, json
-from moviepy.editor import *
-import os, sys
-
-
-# Call the playurl API
-def get_play_list(start_url, cid, quality):
-    entropy = 'rbMCKn@KuamXWlPMoJGsKcbiJKUfkPF_8dABscJntvqhRSETg'
-    appkey, sec = ''.join([chr(ord(i) + 2) for i in entropy[::-1]]).split(':')
-    params = 'appkey=%s&cid=%s&otype=json&qn=%s&quality=%s&type=' % (appkey, cid, quality, quality)
-    chksum = hashlib.md5(bytes(params + sec, 'utf8')).hexdigest()
-    url_api = 'https://interface.bilibili.com/v2/playurl?%s&sign=%s' % (params, chksum)
-    headers = {
-        'Referer': start_url,  # the Referer header is required
-        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
-    }
-    # print(url_api)
-    html = requests.get(url_api, headers=headers).json()
-    # print(json.dumps(html))
-    video_list = [html['durl'][0]['url']]
-    # print(video_list)
-    return video_list
-
-
-# Download the video
-'''
- urllib.urlretrieve callback:
-def callbackfunc(blocknum, blocksize, totalsize):
-    @blocknum:  number of data blocks downloaded so far
-    @blocksize: size of each data block
-    @totalsize: size of the remote file
-'''
-
-
-def Schedule_cmd(blocknum, blocksize, totalsize):
-    speed = (blocknum * blocksize) / (time.time() - start_time)
-    # speed_str = " Speed: %.2f" % speed
-    speed_str = " Speed: %s" % format_size(speed)
-    recv_size = blocknum * blocksize
-
-    # Render the download progress bar
-    f = sys.stdout
-    pervent = recv_size / totalsize
-    percent_str = "%.2f%%" % (pervent * 100)
-    n = round(pervent * 50)
-    s = ('#' * n).ljust(50, '-')
-    f.write(percent_str.ljust(8, ' ') + '[' + s + ']' + speed_str)
-    f.flush()
-    # time.sleep(0.1)
-    f.write('\r')
-
-
-def Schedule(blocknum, blocksize, totalsize):
-    speed = (blocknum * blocksize) / (time.time() - start_time)
-    # speed_str = " Speed: %.2f" % speed
-    speed_str = " Speed: %s" % format_size(speed)
-    recv_size = blocknum * blocksize
-
-    # Render the download progress bar
-    f = sys.stdout
-    pervent = recv_size / totalsize
-    percent_str = "%.2f%%" % (pervent * 100)
-    n = round(pervent * 50)
-    s = ('#' * n).ljust(50, '-')
-    print(percent_str.ljust(6, ' ') + '-' + speed_str)
-    f.flush()
-    time.sleep(2)
-    # print('\r')
-
-
-# Convert bytes to K/M/G
-def format_size(bytes):
-    try:
-        bytes = float(bytes)
-        kb = bytes / 1024
-    except:
-        print("传入的字节格式不对")
-        return "Error"
-    if kb >= 1024:
-        M = kb / 1024
-        if M >= 1024:
-            G = M / 1024
-            return "%.3fG" % (G)
-        else:
-            return "%.3fM" % (M)
-    else:
-        return "%.3fK" % (kb)
-
-
-#  Download the video
-def down_video(video_list, title, start_url, page):
-    num = 1
-    print('[正在下载P{}段视频,请稍等...]:'.format(page) + title)
-    currentVideoPath = os.path.join(sys.path[0], 'bilibili_video', title)  # the current directory is used as the download directory
-    for i in video_list:
-        opener = urllib.request.build_opener()
-        # Request headers
-        opener.addheaders = [
-            # ('Host', 'upos-hz-mirrorks3.acgvideo.com'),  # adjust Host if needed (optional)
-            ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0) Gecko/20100101 Firefox/56.0'),
-            ('Accept', '*/*'),
-            ('Accept-Language', 'en-US,en;q=0.5'),
-            ('Accept-Encoding', 'gzip, deflate, br'),
-            ('Range', 'bytes=0-'),  # Range must be bytes=0- to download the complete video
-            ('Referer', start_url),  # Referer is required
-            ('Origin', 'https://www.bilibili.com'),
-            ('Connection', 'keep-alive'),
-        ]
-        urllib.request.install_opener(opener)
-        # Create the folder for the downloaded videos
-        if not os.path.exists(currentVideoPath):
-            os.makedirs(currentVideoPath)
-        # Start downloading
-        if len(video_list) > 1:
-            urllib.request.urlretrieve(url=i, filename=os.path.join(currentVideoPath, r'{}-{}.flv'.format(title, num)),reporthook=Schedule_cmd)  # .mp4 also works; title + '-' + num + '.flv'
-        else:
-            urllib.request.urlretrieve(url=i, filename=os.path.join(currentVideoPath, r'{}.flv'.format(title)),reporthook=Schedule_cmd)  # .mp4 also works; title + '-' + num + '.flv'
-        num += 1
-
-# Merge the video segments
-def combine_video(video_list, title):
-    currentVideoPath = os.path.join(sys.path[0], 'bilibili_video', title)  # the current directory is used as the download directory
-    if len(video_list) >= 2:
-        # Only merge when there is more than one segment
-        print('[下载完成,正在合并视频...]:' + title)
-        # Define a list to hold the clips
-        L = []
-        # Walk the video folder (all segments are assumed to be in here)
-        root_dir = currentVideoPath
-        # Iterate over all files
-        for file in sorted(os.listdir(root_dir), key=lambda x: int(x[x.rindex("-") + 1:x.rindex(".")])):
-            # If the extension is .mp4/.flv
-            if os.path.splitext(file)[1] == '.flv':
-                # Build the full path
-                filePath = os.path.join(root_dir, file)
-                # Load the clip
-                video = VideoFileClip(filePath)
-                # Append to the list
-                L.append(video)
-        # Concatenate the clips
-        final_clip = concatenate_videoclips(L)
-        # Write the merged output file
-        final_clip.to_videofile(os.path.join(root_dir, r'{}.mp4'.format(title)), fps=24, remove_temp=False)
-        print('[视频合并完成]' + title)
-
-    else:
-        # Only one segment, just report that the download is complete
-        print('[视频合并完成]:' + title)
-
-
-if __name__ == '__main__':
-    # The user enters an av number or a video URL
-    print('*' * 30 + 'B站视频下载小助手' + '*' * 30)
-    start = input('请输入您要下载的B站av号或者视频链接地址:')
-    if start.isdigit() == True:  # the input is an av number
-        # API for fetching cids; just pass the aid
-        start_url = 'https://api.bilibili.com/x/web-interface/view?aid=' + start
-    else:
-        # https://www.bilibili.com/video/av46958874/?spm_id_from=333.334.b_63686965665f7265636f6d6d656e64.16
-        start_url = 'https://api.bilibili.com/x/web-interface/view?aid=' + re.search(r'/av(\d+)/*', start).group(1)
-
-    # Video quality
-    # <accept_format><![CDATA[flv,flv720,flv480,flv360]]></accept_format>
-    # <accept_description><![CDATA[高清 1080P,高清 720P,清晰 480P,流畅 360P]]></accept_description>
-    # <accept_quality><![CDATA[80,64,32,16]]></accept_quality>
-    quality = input('请输入您要下载视频的清晰度(1080p:80;720p:64;480p:32;360p:16)(填写80或64或32或16):')
-    # Fetch the video's cid and title
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
-    }
-    html = requests.get(start_url, headers=headers).json()
-    data = html['data']
-    video_title=data["title"].replace(" ","_")
-    cid_list = []
-    if '?p=' in start:
-        # Download a single part of a multi-part video
-        p = re.search(r'\?p=(\d+)',start).group(1)
-        cid_list.append(data['pages'][int(p) - 1])
-    else:
-        # No p parameter means download all parts
-        cid_list = data['pages']
-    # print(cid_list)
-    for item in cid_list:
-        cid = str(item['cid'])
-        title = item['part']
-        if not title:
-            title = video_title
-        title = re.sub(r'[\/\\:*?"<>|]', '', title)  # strip characters not allowed in filenames
-        print('[下载视频的cid]:' + cid)
-        print('[下载视频的标题]:' + title)
-        page = str(item['page'])
-        start_url = start_url + "/?p=" + page
-        video_list = get_play_list(start_url, cid, quality)
-        start_time = time.time()
-        down_video(video_list, title, start_url, page)
-        combine_video(video_list, title)
-
-    # On Windows, open the download folder when finished
-    currentVideoPath = os.path.join(sys.path[0], 'bilibili_video')  # the current directory is used as the download directory
-    if (sys.platform.startswith('win')):
-        os.startfile(currentVideoPath)
-
-
-# 分P视频下载测试: https://www.bilibili.com/video/av19516333/

+ 1 - 3
video_download/docker-compose.yml

@@ -2,9 +2,7 @@ version: '2.1'
 
 services:
   bilibili_video_download:
-    image: bilibili_video_download
+    image: video_download
     build: .
-    ports:
-      - 3000:3000
     volumes:
       - "./:/app"

+ 0 - 7
video_download/bilibili_video_download_v2.py → video_download/download_v2.py

@@ -1,9 +1,6 @@
 # !/usr/bin/python
 # -*- coding:utf-8 -*-
 # time: 2019/04/16--17:12
-__author__ = 'Henry'
-
-
 '''
 Project: Bilibili video download
 
@@ -51,8 +48,6 @@ def callbackfunc(blocknum, blocksize, totalsize):
     @blocksize: size of each data block
     @totalsize: size of the remote file
 '''
-
-
 def Schedule_cmd(blocknum, blocksize, totalsize):
     speed = (blocknum * blocksize) / (time.time() - start_time)
     # speed_str = " Speed: %.2f" % speed
@@ -86,8 +81,6 @@ def Schedule(blocknum, blocksize, totalsize):
     print(percent_str.ljust(6, ' ') + '-' + speed_str)
     f.flush()
     time.sleep(2)
-    # print('\r')
-
 
 # Convert bytes to K/M/G
 def format_size(bytes):

+ 11 - 10
video_download/bilibili_video_download_v3.py → video_download/download_v3.py

@@ -1,16 +1,14 @@
-# !/usr/bin/python
-# -*- coding:utf-8 -*-
-# time: 2019/07/02--08:12
-__author__ = 'Henry'
-
-
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
 '''
-Project: Bilibili video download - multithreaded download
+@Time    :   2019/07/18 04:54:35
+@Author  :   Liuyuqi 
+@Version :   1.0
+@Contact :   liuyuqi.gov@msn.cn
+@License :   (C)Copyright 2019
+@Desc    :   Project: Bilibili video download - multithreaded downloader
 
 Version 1: signed-API version; no cookie needed, downloads 1080p directly
-
-20190422 - added downloading a single part of a multi-part video
-20190702 - added multithreaded downloading, a big speed improvement
 '''
 
 import requests, time, hashlib, urllib.request, re, json
@@ -188,6 +186,7 @@ if __name__ == '__main__':
     }
     html = requests.get(start_url, headers=headers).json()
     data = html['data']
+    video_title=data["title"].replace(" ","_")
     cid_list = []
     if '?p=' in start:
         # Download a single part of a multi-part video
@@ -202,6 +201,8 @@ if __name__ == '__main__':
     for item in cid_list:
         cid = str(item['cid'])
         title = item['part']
+        if not title:
+            title = video_title
         title = re.sub(r'[\/\\:*?"<>|]', '', title)  # strip characters not allowed in filenames
         print('[下载视频的cid]:' + cid)
         print('[下载视频的标题]:' + title)