liuyuqi-dellpc 6 years ago
commit
75004a27f2
8 changed files with 252 additions and 0 deletions
  1. 3 0
      .gitignore
  2. 39 0
      DownloadProgress.py
  3. 32 0
      README.md
  4. 0 0
      data/ml.json
  5. 68 0
      main.py
  6. BIN
      screenshot/BaiduHi_2019-4-8_16-26-42.png
  7. 31 0
      threads.py
  8. 79 0
      user_agent.py

+ 3 - 0
.gitignore

@@ -0,0 +1,3 @@
+/.idea
+/__pycache__
+/data

+ 39 - 0
DownloadProgress.py

@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+'''
+下载进度
+@Author :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+
+class DownloadProgress(object):
+    """Single-line console progress indicator.
+
+    Every call to ``refresh`` advances the counter and reprints the line
+    ``[title] status count unit sep total unit``. ``count`` and ``total`` are
+    divided by ``chunk_size`` before they are displayed.
+    """
+
+    def __init__(self, title, count=0.0, run_status=None, fin_status=None, total=100.0, unit='', sep='/',
+                 chunk_size=1.0):
+        """Store the display settings.
+
+        :param title: label printed inside the square brackets
+        :param count: starting value of the progress counter
+        :param run_status: status text shown while running (empty when omitted)
+        :param fin_status: status text shown once ``count`` reaches ``total``;
+            when omitted, spaces as long as the running status are used
+        :param total: counter value at which the progress is complete
+        :param unit: unit label printed after both numbers
+        :param sep: separator printed between the current value and the total
+        :param chunk_size: divisor applied to ``count`` and ``total`` for display
+        """
+        super().__init__()
+        self.title = title
+        self.count = count
+        self.total = total
+        self.chunk_size = chunk_size
+        self.unit = unit
+        self.seq = sep
+        self.status = run_status if run_status else ""
+        self.fin_status = fin_status if fin_status else " " * len(self.status)
+        self.info = "[%s] %s %.2f %s %s %.2f %s"
+
+    def __get_info(self):
+        """Return the progress line built from the current state."""
+        # Field order: [title] status progress unit separator total unit
+        shown_count = self.count / self.chunk_size
+        shown_total = self.total / self.chunk_size
+        fields = (self.title, self.status, shown_count, self.unit, self.seq, shown_total, self.unit)
+        return self.info % fields
+
+    def refresh(self, count=1, status=None):
+        """Advance the counter by ``count`` and reprint the progress line.
+
+        :param count: amount added to the counter
+        :param status: optional status text that replaces the current one
+        """
+        self.count += count
+        finished = self.count >= self.total
+        if status:
+            self.status = status
+        elif finished:
+            self.status = self.fin_status
+        # "\r" keeps the cursor on the same line so the next call overwrites it;
+        # the line is only terminated once the total has been reached.
+        print(self.__get_info(), end="\n" if finished else "\r")

+ 32 - 0
README.md

@@ -0,0 +1,32 @@
+## crawl_xuexi
+
+下载 学习强国 www.xuexi.cn 上面的机器学习课程.注意,该网站所有文章的内容存在 dataxxx.js 中.《大数据机器学习》视频链接:
+
+https://www.xuexi.cn/9f584b49d8a7386a4cf248ce16f5e667/9b0f04ec6509904be734f5f609a3604a.html
+
+《大数据机器学习》课程百度云分享:
+
+
+由于"学习强国APP"上视频采用cdn分发,而百度云下载限速,所以建议直接用本项目程序下载视频,方便,迅速.
+
+![](screenshot/BaiduHi_2019-4-8_16-26-42.png)
+
+### 介绍
+
+1. 视频列表js整理为ml.json
+2. 对ml.json中每条抓包过滤 dataxxx.js ,获取视频链接
+3. 下载视频(原计划异步分 5 线程队列下载,目前为单线程顺序下载).
+
+### 使用
+
+1. 配置main.py中的项目目录.视频将下载到项目目录中Video文件夹.
+2. 执行 python main.py
+
+
+### 更新历史
+
+没啥需求,多线程下载暂不实现 .
+
+### 版权说明
+
+本项目开源,免费使用,无任何限制,下载的课件视频建议个人使用,请勿以盈利性目的分发哦.

File diff suppressed because it is too large
+ 0 - 0
data/ml.json


+ 68 - 0
main.py

@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+'''
+@Author :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+import json
+import os
+import re
+from contextlib import closing
+
+import requests
+
+import DownloadProgress
+import user_agent
+
+# Work from the directory that contains this script so the relative paths used
+# below ("data/ml.json", "./Video/") resolve wherever the project is checked
+# out. Assign an absolute path to `src` to use a different directory.
+src = os.path.dirname(os.path.abspath(__file__))
+os.chdir(src)
+
+
+def get_video_links(url):
+    """Fetch ``url`` and return the mp4 links found in the response body.
+
+    :param url: address of the resource to scan for video links
+    :return: list of ``https://video.xuexi.cn/...mp4`` links, in reverse order
+        of their appearance in the response text
+    """
+    # A timeout keeps an unresponsive server from blocking the crawl forever.
+    response = requests.get(url=url, headers=user_agent.getheaders(), timeout=30)
+    video = response.content.decode("utf8")
+    # Dots in the host name are escaped so they only match a literal ".".
+    pattern = r'https://video\.xuexi\.cn/[^,"]*mp4'
+    link = re.findall(pattern, video, re.I)
+    link.reverse()
+    return link
+
+
+def downloadVideo(url, file_name):
+    """Stream the video at ``url`` into ``./Video/<file_name>.mp4``.
+
+    The download is skipped when a file of that name already exists and its
+    size equals the ``content-length`` reported by the server.
+
+    :param url: address of the video to download
+    :param file_name: name of the target file, without the ``.mp4`` extension
+    :return: None
+    :raises requests.HTTPError: if the server answers with an error status
+    """
+    # open() does not create missing directories, so make sure the target exists.
+    os.makedirs('./Video', exist_ok=True)
+    with closing(requests.get(url=url, stream=True, timeout=30)) as response:
+        # Fail here rather than saving an error page under an .mp4 name.
+        response.raise_for_status()
+        chunk_size = 1024
+        # The content-length header is optional; None means the size is unknown.
+        raw_size = response.headers.get('content-length')
+        content_size = int(raw_size) if raw_size is not None else None
+        file_D = './Video/' + file_name + '.mp4'
+        if content_size is not None and os.path.exists(file_D) and os.path.getsize(file_D) == content_size:
+            print('跳过' + file_name)
+            return
+        # The progress line needs a known, non-zero total; without one the
+        # file is still downloaded, just without progress output.
+        progress = None
+        if content_size:
+            progress = DownloadProgress.DownloadProgress(file_name, total=content_size, unit="KB",
+                                                         chunk_size=chunk_size,
+                                                         run_status="正在下载", fin_status="下载完成")
+        with open(file_D, "wb") as file:
+            for data in response.iter_content(chunk_size=chunk_size):
+                file.write(data)
+                if progress is not None:
+                    progress.refresh(count=len(data))
+
+
+def crawl():
+    """Download the video of every entry listed in ``data/ml.json``.
+
+    For each entry under the ``fpe1ki18v228w00`` key, the static page is
+    fetched, the ``data*.js`` script it references is located, the first video
+    link returned by ``get_video_links`` is taken, and the video is saved under
+    the entry's ``frst_name``. Entries for which no script or no video link is
+    found are reported and skipped instead of aborting the whole run.
+    """
+    # Read the list up front so the file is closed before any network work.
+    with open("data/ml.json", "r", encoding="utf8") as f:
+        mlData = json.load(f)
+    for item in mlData["fpe1ki18v228w00"]:
+        frst_name = item["frst_name"].replace('\t', ' ')
+        static_page_url = item["static_page_url"]
+        # Open the web page that embeds the mp4 video.
+        resData = requests.get(static_page_url, headers=user_agent.getheaders(),
+                               timeout=30).content.decode("utf8")
+        # Fourth "/"-separated piece of the page URL; reused as the data script's directory.
+        preUrl = static_page_url.split("/")[3]
+        pattern = r'src="./data(.*?)"></script>'
+        data_paths = re.findall(pattern, resData, re.I)
+        if not data_paths:
+            print('未找到数据脚本,跳过' + frst_name)
+            continue
+        url = "https://www.xuexi.cn/" + preUrl + "/data" + data_paths[0]
+        links = get_video_links(url)
+        if not links:
+            print('未找到视频链接,跳过' + frst_name)
+            continue
+        downloadVideo(links[0], file_name=frst_name)
+
+
+# Start the crawl only when this file is run as a script, not when it is imported.
+if __name__ == '__main__':
+    crawl()

BIN
screenshot/BaiduHi_2019-4-8_16-26-42.png


+ 31 - 0
threads.py

@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+'''
+多线程下载多文件;多线程分段下载单文件.
+@Author :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+from threading import Lock
+from threading import Thread
+
+# Lock shared by every MyThread that is created with lock=True.
+threadLock = Lock()
+# Module-level list; nothing in this file adds to it.
+threads = []
+
+
+class MyThread(Thread):
+    """Thread that runs ``func(*args)``, optionally while holding ``threadLock``."""
+
+    def __init__(self, name, func, *args, lock=False):
+        """Remember what to run.
+
+        :param name: thread name, also printed when the thread starts
+        :param func: callable executed by ``run``
+        :param args: positional arguments passed to ``func``
+        :param lock: when true, ``func`` runs while the module-level
+            ``threadLock`` is held, so such threads execute one at a time
+        """
+        Thread.__init__(self)
+        self.name = name
+        self.func = func
+        self.args = args
+        self.lock = lock
+
+    def run(self):
+        """Print a start message, then call ``func(*args)``."""
+        print("开启: " + self.name)
+        if self.lock:
+            # The with-statement releases the lock even if func raises, so a
+            # failing thread cannot leave the other locked threads blocked.
+            with threadLock:
+                self.func(*self.args)
+        else:
+            self.func(*self.args)

+ 79 - 0
user_agent.py

@@ -0,0 +1,79 @@
+# -*-coding:utf-8 -*-
+
+import random
+
+# User-Agent strings of desktop browsers.
+_PC_USER_AGENTS = (
+    # Opera
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
+    "Opera/8.0 (Windows NT 5.1; U; en)",
+    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
+    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
+    # Firefox
+    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
+    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
+    # Safari
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
+    # Chrome
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.2171.71 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/72.0.1271.64 Safari/537.11",
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/66.0.648.133 Safari/534.16",
+    # 360
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
+    # TaoBrowser
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
+    # Liebao browser (LBBROWSER)
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
+    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
+    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
+    # QQ Browser
+    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
+    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
+    # Sogou browser
+    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
+    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
+    # Maxthon browser
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
+    # UC Browser
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
+)
+
+# User-Agent strings of mobile browsers.
+_MOBILE_USER_AGENTS = (
+    # iPhone
+    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+    # iPod
+    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+    # iPad
+    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
+    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+    # Android
+    "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+    # QQ Browser for Android
+    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+    # Android Opera Mobile
+    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
+    # Android Pad Moto Xoom
+    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
+    # BlackBerry
+    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
+    # WebOS HP Touchpad
+    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
+    # Nokia N97
+    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
+    # Windows Phone Mango
+    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
+    # UC Browser
+    "UCWEB7.0.2.37/28/999",
+    "NOKIA5700/ UCWEB7.0.2.37/28/999",
+    # UCOpenwave
+    "Openwave/ UCWEB7.0.2.37/28/999",
+    # UC Opera
+    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
+)
+
+
+def getheaders(mobile=False):
+    """Return request headers carrying a randomly chosen User-Agent.
+
+    :param mobile: when true, choose from the mobile browser strings; by
+        default a desktop browser string is chosen
+    :return: dict with the single key ``'User-Agent'``
+    """
+    pool = _MOBILE_USER_AGENTS if mobile else _PC_USER_AGENTS
+    UserAgent = random.choice(pool)
+    headers = {'User-Agent': UserAgent}
+    return headers

Some files were not shown because too many files changed in this diff