liuyuqi-dellpc 1 year ago
commit
395e2b9b18

+ 0 - 0
.github/workflows/main.yml


+ 1 - 0
.gitignore

@@ -0,0 +1 @@
+/crawl_ouchn/__pycache__/*.pyc

+ 6 - 0
README.md

@@ -0,0 +1,6 @@
+# crawl_ouchn
+
+["国家开放大学终身教育平台"](http://le.ouchn.cn/)课程视频批量下载
+
+
+

+ 0 - 0
Video/.gitkeep


+ 7 - 0
config/config.json

@@ -0,0 +1,7 @@
+{
+  "username": "15016215661",
+  "password": "123456",
+  "courseUrl": [
+    "http://le.ouchn.cn/#/courseDetails/CAAA010000023530"
+  ]
+}

+ 39 - 0
crawl_ouchn/DownloadProgress.py

@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+'''
+下载进度
+@Author :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+
+class DownloadProgress(object):
+    """Single-line console progress indicator.
+
+    Each ``refresh`` call re-prints ``[title] status done unit sep total unit``
+    followed by a carriage return, so the terminal line is overwritten in
+    place; once ``count`` reaches ``total`` the line ends with a newline.
+    """
+
+    def __init__(self, title, count=0.0, run_status=None, fin_status=None, total=100.0, unit='', sep='/',
+                 chunk_size=1.0):
+        """Store the display settings.
+
+        :param title: label printed inside the square brackets
+        :param count: amount already processed
+        :param run_status: status text shown while in progress
+        :param fin_status: status text shown once finished; defaults to
+            blanks as wide as the running status
+        :param total: amount at which the progress counts as finished
+        :param unit: unit label printed after both numbers
+        :param sep: separator printed between the two numbers
+        :param chunk_size: divisor applied to ``count`` and ``total`` for display
+        """
+        super(DownloadProgress, self).__init__()
+        running = run_status if run_status else ""
+        finished = fin_status if fin_status else " " * len(running)
+        self.info = "[%s] %s %.2f %s %s %.2f %s"
+        self.title, self.total, self.count = title, total, count
+        self.chunk_size = chunk_size
+        self.status = running
+        self.fin_status = finished
+        self.unit = unit
+        self.seq = sep
+
+    def __get_info(self):
+        # Layout: [title] status done unit separator total unit
+        done = self.count / self.chunk_size
+        overall = self.total / self.chunk_size
+        return self.info % (self.title, self.status, done, self.unit, self.seq, overall, self.unit)
+
+    def refresh(self, count=1, status=None):
+        """Advance the counter by ``count`` and re-print the progress line.
+
+        :param count: amount to add to the running counter
+        :param status: optional status text that replaces the current one
+        """
+        self.count += count
+        finished = self.count >= self.total
+        if status:
+            self.status = status
+        elif finished:
+            # No explicit status given: switch to the "finished" text.
+            self.status = self.fin_status
+        print(self.__get_info(), end='\n' if finished else "\r")

+ 0 - 0
crawl_ouchn/__init__.py


+ 23 - 0
crawl_ouchn/api.py

@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/23 14:49:22
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+'''
+
+# Base URL of the le.ouchn.cn platform; every endpoint below is built on it.
+_host = r"http://le.ouchn.cn"
+
+# NOTE(review): "/xx" is a placeholder path; the real endpoints are not filled in yet.
+getMessageCode = _host + "/xx"
+# Fixed: the path needs a leading "/" (as in getMessageCode above), otherwise
+# it is glued onto the host name ("http://le.ouchn.cnxx").
+login = _host + "/xx"
+
+# Example course page: http://le.ouchn.cn/#/courseDetails/CAAA010000040861
+# Regular videos; %s is presumably the course id (last segment of the URL above).
+getCommonVideoList = _host + "/api/Course/%s/MicroCourse/Details"
+
+# "Interest" menu bar endpoint: GET /api/VisualPage/Channel/interest HTTP/1.1
+getInterest = _host + "/api/VisualPage/Channel/interest"
+
+# Videos that require login (path not filled in yet).
+getVipVideoList = _host + ""

+ 102 - 0
crawl_ouchn/crawl_ouchn.py

@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/23 13:15:38
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   批量下载
+'''
+from multiprocessing import pool
+import requests
+import os
+import sys
+import re
+import json
+import logging
+from contextlib import closing
+from crawl_ouchn import DownloadProgress, api, user_agent
+from concurrent.futures import ThreadPoolExecutor
+
+
+class CrawlOuchn():
+    """Batch downloader for course videos from le.ouchn.cn.
+
+    ``crawl`` reads a JSON config whose ``courseUrl`` list names the course
+    pages to process and stores every downloaded video under ``./Video/``.
+    """
+
+    def __init__(self, configPath=r'config/config.json'):
+        """Create the shared HTTP session.
+
+        :param configPath: path of the JSON config file read by ``crawl``
+        """
+        self.sess = requests.Session()
+        self.configPath = configPath
+
+    def checkNet(self):
+        """Probe connectivity by fetching http://baidu.com.
+
+        :return: True when the request completes, False when it fails
+        """
+        try:
+            res = self.sess.get("http://baidu.com")
+        except requests.RequestException as e:
+            # Previously a network failure raised, so False was never returned.
+            logging.debug(e)
+            return False
+        logging.debug(res.text)
+        return True
+
+    def getCode(self, phone):
+        '''
+        Request a verification code.
+
+        :param phone: phone number the code is meant for
+        '''
+        # NOTE(review): the payload is still empty and ``phone`` is not sent;
+        # fill it in once the endpoint's parameters are known.
+        data = {}
+        # Fixed: the literal string "url" was requested instead of the endpoint.
+        res = self.sess.get(api.getMessageCode, data=data, headers=user_agent.getheaders())
+        logging.debug(res.text)
+
+    def login(self, username, password):
+        """Post the login form to ``api.login``.
+
+        :param username: account name sent as ``username``
+        :param password: currently unused
+        """
+        # NOTE(review): "code" is still a placeholder and ``password`` is not
+        # sent; confirm the real form fields against the site.
+        data = {
+            "username": username,
+            "code": "xx"
+        }
+        res = self.sess.post(api.login, data=data, headers=user_agent.getheaders())
+        logging.debug(res.text)
+
+    def getVIPVideoLinks(self, url):
+        """Not implemented yet; returns None."""
+        pass
+
+    def getCommonVideoLinks(self, url):
+        """Fetch the course details for a course page URL.
+
+        :param url: course page URL; its last path segment is used as the id
+        :return: list of video links (currently always empty, see TODO)
+        """
+        # Tolerate a trailing "/" so the id is never the empty string.
+        courseId = url.rstrip('/').split('/')[-1]
+        jsonData = self.sess.get(
+            url=api.getCommonVideoList % courseId, headers=user_agent.getheaders())
+        # Log instead of print: the raw body is debug output like elsewhere in the class.
+        logging.debug(jsonData.text)
+        res = json.loads(jsonData.text)
+        if not res["State"]:
+            logging.debug(res["Message"])
+        else:
+            logging.debug(res["Data"]["Modules"][0]["Title"])
+            logging.debug(res["Data"]["Url"])
+        # TODO: link extraction is not implemented; an empty list is returned.
+        link = []
+        return link
+
+    def downloadVideo(self, url, fileName):
+        '''
+        Download one video to ``./Video/<fileName>.mp4`` with a progress line.
+
+        The download is skipped when a file of the advertised size already
+        exists.
+
+        :param url: download URL
+        :param fileName: target file name without directory or extension
+        :raises requests.HTTPError: when the server answers with an error status
+        '''
+        with closing(requests.get(url=url, stream=True)) as response:
+            # Fail early so an HTTP error page is never saved as a video.
+            response.raise_for_status()
+            chunk_size = 1024
+            content_size = int(response.headers['content-length'])
+            file_D = './Video/' + fileName + '.mp4'
+            if (os.path.exists(file_D) and os.path.getsize(file_D) == content_size):
+                print('跳过' + fileName)
+            else:
+                progress = DownloadProgress.DownloadProgress(fileName, total=content_size, unit="KB",
+                                                             chunk_size=chunk_size,
+                                                             run_status="正在下载", fin_status="下载完成")
+                with open(file_D, "wb") as file:
+                    for data in response.iter_content(chunk_size=chunk_size):
+                        file.write(data)
+                        progress.refresh(count=len(data))
+
+    def crawl(self):
+        """Download every video of every course listed in the config file.
+
+        Errors raised while processing the courses are logged and end the
+        run; a missing or unreadable config file still raises to the caller.
+        """
+        with open(self.configPath, "r", encoding="utf8") as f:
+            rawConfig = f.read()
+        try:
+            myConfig = json.loads(rawConfig)
+            for courseLink in myConfig["courseUrl"]:
+                for videoLink in self.getCommonVideoLinks(courseLink):
+                    # Fixed: downloadVideo() needs a file name. Derive it from
+                    # the link; the ".mp4" suffix is appended by downloadVideo.
+                    baseName = os.path.basename(videoLink.split("?", 1)[0])
+                    fileName = os.path.splitext(baseName)[0]
+                    self.downloadVideo(videoLink, fileName)
+        except Exception as e:
+            # Top-level boundary: keep the traceback instead of a bare print.
+            logging.exception("crawl aborted: %s", e)

+ 31 - 0
crawl_ouchn/threads.py

@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+'''
+多线程下载多文件;多线程分段下载单文件.
+@Author :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+from threading import Lock
+from threading import Thread
+
+# Module-wide lock shared by every MyThread created with lock=True.
+threadLock = Lock()
+# Nothing in this module fills this list; presumably callers register threads here.
+threads = []
+
+
+class MyThread(Thread):
+    """Thread that runs ``func(*args)``, optionally serialised by ``threadLock``."""
+
+    def __init__(self, name, func, *args, lock=False):
+        """Store the callable and its arguments.
+
+        :param name: thread name, printed when the thread starts running
+        :param func: callable executed by ``run``
+        :param args: positional arguments passed to ``func``
+        :param lock: when true, ``func`` runs while holding ``threadLock``
+        """
+        Thread.__init__(self)
+        self.name = name
+        self.func = func
+        self.args = args
+        self.lock = lock
+
+    def run(self):
+        """Announce the thread and call ``func``, holding the lock if requested."""
+        print("开启: " + self.name)
+        if self.lock:
+            # ``with`` releases the lock even when func raises; the manual
+            # acquire/release pair left it held forever in that case.
+            with threadLock:
+                self.func(*self.args)
+        else:
+            self.func(*self.args)

+ 79 - 0
crawl_ouchn/user_agent.py

@@ -0,0 +1,79 @@
+# -*-coding:utf-8 -*-
+
+import random
+
+# 返回一个随机的请求头 headers
+def getheaders():
+    """Return a headers dict with a randomly chosen ``User-Agent``.
+
+    The value is picked with ``random.choice`` from the desktop list
+    (``user_agent_list_2``); the mobile list (``user_agent_list_3``) is
+    defined but not used.
+    """
+    # Desktop (PC) browsers
+    user_agent_list_2 = [
+        # Opera
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
+        "Opera/8.0 (Windows NT 5.1; U; en)",
+        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
+        # Firefox
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
+        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
+        # Safari
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
+        # Chrome
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.2171.71 Safari/537.36",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/72.0.1271.64 Safari/537.11",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/66.0.648.133 Safari/534.16",
+        # 360 Browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
+        # Taobao Browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
+        # Liebao (Cheetah) Browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
+        # QQ Browser
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
+        # Sogou Browser
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
+        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
+        # Maxthon Browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
+        # UC Browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
+    ]
+    # Mobile browsers
+    user_agent_list_3 = [
+        # IPhone
+        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+        # IPod
+        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+        # IPAD
+        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
+        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+        # Android
+        "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+        # QQ Browser for Android
+        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+        # Android Opera Mobile
+        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
+        # Android Pad Moto Xoom
+        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
+        # BlackBerry
+        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
+        # WebOS HP Touchpad
+        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
+        # Nokia N97
+        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
+        # Windows Phone Mango
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
+        # UC Browser
+        "UCWEB7.0.2.37/28/999",
+        "NOKIA5700/ UCWEB7.0.2.37/28/999",
+        # UCOpenwave
+        "Openwave/ UCWEB7.0.2.37/28/999",
+        # UC Opera
+        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
+    ]
+    UserAgent = random.choice(user_agent_list_2) # only the desktop list (user_agent_list_2) is sampled
+    headers = {'User-Agent': UserAgent}
+    return headers

+ 0 - 0
crawl_ouchn/version.py


+ 18 - 0
le.ouchn.cn.http

@@ -0,0 +1,18 @@
+### 全局变量
+@hostname = le.ouchn.cn
+# @hostname= localhost:89
+@host = http://{{hostname}}
+@api={{host}}/api
+@contentType = application/json
+@createdAt = {{$datetime iso8601}}
+@useragent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36
+@accept=text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
+@uid=10022
+@token=d627520092e04f958fa2f4a72ecb6170
+@course=CAAA010000023530
+
+### 获取配置
+GET {{api}}/Course/{{course}}/MicroCourse/Details
+# GET {{api}}/checkApiParams.php?service=Home.getConfig
+User-Agent: {{useragent}}
+Accept-Language: en-GB,en-US;q=0.8,en;q=0.6,zh-CN;q=0.4

+ 17 - 0
main.py

@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/23 14:33:19
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   入口
+'''
+
+import time
+from crawl_ouchn.crawl_ouchn import CrawlOuchn
+
+if __name__ == '__main__':
+    # Time the whole crawl and report the elapsed seconds once it returns.
+    began = time.time()
+    CrawlOuchn(configPath=r"config/config.json").crawl()
+    print("last time: {} s".format(time.time() - began))

+ 1 - 0
requirements.txt

@@ -0,0 +1 @@
+requests

+ 6 - 0
setup.cfg

@@ -0,0 +1,6 @@
+[wheel]
+universal = True
+
+[flake8]
+exclude = setup.py,build,.git,venv
+ignore = E402,E501,E731,E741,W503

+ 147 - 0
setup.py

@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import os.path
+import warnings
+import sys
+
+try:
+    from setuptools import setup, Command
+    setuptools_available = True
+except ImportError:
+    from distutils.core import setup, Command
+    setuptools_available = False
+from distutils.spawn import spawn
+
+try:
+    # This will create an exe that needs Microsoft Visual C++ 2008
+    # Redistributable Package
+    import py2exe
+except ImportError:
+    if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe':
+        print('Cannot import py2exe', file=sys.stderr)
+        # sys.exit instead of the site-provided exit(), which is not
+        # guaranteed to exist (e.g. when Python runs with -S).
+        sys.exit(1)
+
+# Options handed to py2exe under the 'py2exe' key of the setup() options.
+py2exe_options = {
+    'bundle_files': 1,
+    'compressed': 1,
+    'optimize': 2,
+    'dist_dir': '.',
+    'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'],
+}
+
+# Get the version from youtube_dl/version.py without importing the package.
+# NOTE(review): this commit adds crawl_ouchn/version.py and no youtube_dl
+# directory - confirm which version file is meant to be read here.
+# The file is now closed deterministically instead of being left open.
+with open('youtube_dl/version.py') as version_file:
+    exec(compile(version_file.read(),
+                 'youtube_dl/version.py', 'exec'))
+
+# Short and long package descriptions passed to setup() and py2exe.
+DESCRIPTION = 'YouTube video downloader'
+LONG_DESCRIPTION = 'Command-line program to download videos from YouTube.com and other video sites'
+
+# Console executable definition used by the py2exe build.
+py2exe_console = [{
+    'script': './youtube_dl/__main__.py',
+    'dest_base': 'youtube-dl',
+    'version': __version__,
+    'description': DESCRIPTION,
+    'comments': LONG_DESCRIPTION,
+    'product_name': 'youtube-dl',
+    'product_version': __version__,
+}]
+
+py2exe_params = {
+    'console': py2exe_console,
+    'options': {'py2exe': py2exe_options},
+    'zipfile': None
+}
+
+# ``params`` holds the extra keyword arguments for setup(): the py2exe set
+# when invoked as "setup.py py2exe", otherwise data files plus entry points.
+building_exe = len(sys.argv) >= 2 and sys.argv[1] == 'py2exe'
+if not building_exe:
+    files_spec = [
+        ('etc/bash_completion.d', ['youtube-dl.bash-completion']),
+        ('etc/fish/completions', ['youtube-dl.fish']),
+        ('share/doc/youtube_dl', ['README.txt']),
+        ('share/man/man1', ['youtube-dl.1'])
+    ]
+    root = os.path.dirname(os.path.abspath(__file__))
+    data_files = []
+    for dirname, files in files_spec:
+        resfiles = []
+        for fn in files:
+            if os.path.exists(fn):
+                resfiles.append(fn)
+                continue
+            # Generated files may be absent; warn and leave them out.
+            warnings.warn(
+                'Skipping file %s since it is not present. Type  make  to build all automatically generated files.' % fn)
+        data_files.append((dirname, resfiles))
+
+    params = {
+        'data_files': data_files,
+    }
+    if setuptools_available:
+        params['entry_points'] = {
+            'console_scripts': ['youtube-dl = youtube_dl:main']}
+    else:
+        params['scripts'] = ['bin/youtube-dl']
+else:
+    params = py2exe_params
+
+class build_lazy_extractors(Command):
+    """Setup command that runs the lazy-extractor generator script."""
+
+    description = 'Build the extractor lazy loading module'
+    user_options = []
+
+    def initialize_options(self):
+        """This command has no options to initialise."""
+
+    def finalize_options(self):
+        """This command has no options to finalise."""
+
+    def run(self):
+        """Spawn devscripts/make_lazy_extractors.py with the current interpreter."""
+        generator_cmd = [
+            sys.executable,
+            'devscripts/make_lazy_extractors.py',
+            'youtube_dl/extractor/lazy_extractors.py',
+        ]
+        spawn(generator_cmd, dry_run=self.dry_run)
+
+
+# NOTE(review): name, packages and descriptions still describe youtube_dl,
+# while the url points at another project - confirm the intended metadata.
+setup(
+    name='youtube_dl',
+    version=__version__,
+    description=DESCRIPTION,
+    long_description=LONG_DESCRIPTION,
+    url='https://github.com/jianboy/crawl-xuexi',
+    author='jianboy',
+    author_email='liuyuqi.gov@msn.cn',
+    maintainer='jianboy',
+    maintainer_email='liuyuqi.gov@msn.cn',
+    license='Unlicense',
+    packages=[
+        'youtube_dl',
+        'youtube_dl.extractor', 'youtube_dl.downloader',
+        'youtube_dl.postprocessor'],
+
+    classifiers=[
+        'Topic :: Multimedia :: Video',
+        'Development Status :: 5 - Production/Stable',
+        'Environment :: Console',
+        'License :: Public Domain',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.2',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: Implementation',
+        'Programming Language :: Python :: Implementation :: CPython',
+        'Programming Language :: Python :: Implementation :: IronPython',
+        # Fixed: a stray "setup.cfg" had been pasted into this classifier,
+        # which made it an invalid trove classifier.
+        'Programming Language :: Python :: Implementation :: Jython',
+        'Programming Language :: Python :: Implementation :: PyPy',
+    ],
+
+    # Registers the custom command defined in this file.
+    cmdclass={'build_lazy_extractors': build_lazy_extractors},
+    **params
+)