#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact :   liuyuqi.gov@msn.cn
@Time    :   2022/05/23 13:15:38
@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    :   Muti-threads crawler for ouchn.
'''
from multiprocessing import pool  # NOTE(review): unused here; kept in case other code imports it via this module
import requests
import os
import sys
import re
import json
import logging
from contextlib import closing
from crawl_ouchn import DownloadProgress, api, user_agent
from concurrent.futures import ThreadPoolExecutor


class CrawlOuchn():
    '''Crawler for the ouchn site: logs in, collects course video links
    from a JSON config, and downloads them with a progress bar.'''

    def __init__(self, configPath=r'config/config.json'):
        # One shared session so cookies from login() carry over to later requests.
        self.sess = requests.Session()
        self.configPath = configPath

    def checkNet(self):
        '''Probe network connectivity.

        Fix: the original returned True unconditionally, even when the
        request raised; now a connection failure yields False.

        :return: True if the probe request succeeds, False otherwise
        '''
        try:
            res = self.sess.get("http://baidu.com")
            logging.debug(res.text)
            return True
        except requests.RequestException as e:
            logging.debug(e)
            return False

    def getCode(self, phone):
        '''Request a verify code for *phone*.

        NOTE(review): "url" is a placeholder endpoint and *phone* is not
        sent yet — fill in the real API before use.
        '''
        data = {}
        res = self.sess.get("url", data=data, headers=user_agent.getheaders())
        logging.debug(res.text)

    def login(self, username, password):
        '''Log in to the site.

        Fix: the original ignored *username* and posted a hard-coded "x".
        "code" is still a placeholder — presumably the SMS code from
        getCode(); TODO confirm the real field names against the API.
        '''
        data = {
            "username": username,
            "code": "xx"
        }
        res = self.sess.post(api.login, data=data, headers=user_agent.getheaders())
        logging.debug(res.text)

    def getVIPVideoLinks(self, url):
        '''Collect download links for a VIP course. Not implemented yet.'''
        pass

    def getCommonVideoLinks(self, url):
        '''Collect download links for a common (free) course.

        :param url: course page URL; its last path segment is the course id
        :return: list of video URLs — currently always empty, link
                 extraction from res["Data"] is not implemented yet (TODO)
        '''
        jsonData = self.sess.get(
            url=api.getCommonVideoList % (url.split('/')[-1]),
            headers=user_agent.getheaders())
        res = json.loads(jsonData.text)
        if not res["State"]:
            logging.debug(res["Message"])
        else:
            logging.info(res["Data"]["Modules"][0]["Title"])
            logging.debug(res["Data"]["Url"])
        link = []
        return link

    def downloadVideo(self, url, fileName=None):
        '''Download one video, streaming it to ./Video/<fileName>.mp4.

        Skips the download when a complete file of the same size already
        exists.

        :param url: direct download URL
        :param fileName: basename for the saved file; when omitted it is
                         derived from the URL's last path segment
                         (backward-compatible new default — fixes crawl()
                         calling this with a single argument)
        '''
        if fileName is None:
            # e.g. "http://host/path/abc.mp4" -> "abc"
            fileName = os.path.splitext(url.split('/')[-1])[0]
        with closing(requests.get(url=url, stream=True)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            # Fix: open() below failed when ./Video did not exist yet.
            os.makedirs('./Video', exist_ok=True)
            file_D = './Video/' + fileName + '.mp4'
            if os.path.exists(file_D) and os.path.getsize(file_D) == content_size:
                print('跳过' + fileName)
            else:
                progress = DownloadProgress.DownloadProgress(
                    fileName, total=content_size, unit="KB",
                    chunk_size=chunk_size, run_status="正在下载",
                    fin_status="下载完成")
                with open(file_D, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        progress.refresh(count=len(data))

    def crawl(self):
        '''Crawl every course URL listed in the config file and download
        its videos in parallel.

        Fixes: the original created a ThreadPoolExecutor but never
        submitted work to it (downloads ran serially), and it called
        downloadVideo(videoLink) without the then-required fileName
        argument, which raised TypeError.
        '''
        with open(self.configPath, "r", encoding="utf8") as f:
            try:
                myConfig = json.loads(f.read())
                courseUrls = myConfig["courseUrl"]
                with ThreadPoolExecutor(max_workers=10) as executor:
                    for courseLink in courseUrls:
                        videoLinks = self.getCommonVideoLinks(courseLink)
                        for videoLink in videoLinks:
                            executor.submit(self.downloadVideo, videoLink)
            except Exception as e:
                # Top-level boundary: report and swallow so one bad course
                # does not kill the whole crawl.
                print(e)