#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2022/05/23 13:15:38
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    : Multi-threads crawler for ouchn.
'''
import json
import logging
import os
import re
import sys
from concurrent.futures import ThreadPoolExecutor
from contextlib import closing
from multiprocessing import pool

import requests

from crawl_ouchn import DownloadProgress, api, user_agent
class CrawlOuchn():
    """Multi-threaded crawler for the ouchn website.

    Reads course URLs from a JSON config file (``courseUrl`` key), resolves
    each course's video links, and downloads them into ``./Video/``.
    """

    def __init__(self, configPath=r'config/config.json'):
        # One shared Session so login cookies persist across all requests.
        self.sess = requests.Session()
        self.configPath = configPath

    def checkNet(self):
        """Return True when the network is reachable, False otherwise.

        Probes http://baidu.com. The original returned True unconditionally
        and raised on connection failure; now failures yield False.
        """
        try:
            res = self.sess.get("http://baidu.com", timeout=10)
            logging.debug(res.text)
            return res.ok
        except requests.RequestException:
            return False

    def getCode(self, phone):
        '''
        get verify code

        :param phone: phone number the SMS code is sent to
        NOTE(review): the endpoint is still the placeholder string "url";
        fill in the real API address before use.
        '''
        # BUG FIX: the original sent an empty payload and ignored `phone`.
        data = {"phone": phone}
        res = self.sess.get("url", data=data, headers=user_agent.getheaders())
        logging.debug(res.text)

    def login(self, username, password):
        """Log in to the site via the configured login API.

        :param username: account name
        :param password: account password
        """
        # BUG FIX: the original ignored both parameters and posted the
        # hard-coded placeholders {"username": "x", "code": "xx"}.
        # NOTE(review): field names assumed from the original stub — confirm
        # the real API's expected keys (it may want an SMS "code" field).
        data = {
            "username": username,
            "password": password,
        }
        res = self.sess.post(api.login, data=data, headers=user_agent.getheaders())
        logging.debug(res.text)

    def getVIPVideoLinks(self, url):
        # Not implemented yet (VIP courses need a paid session).
        pass

    def getCommonVideoLinks(self, url):
        """Return a list of downloadable video links for a free course page.

        The course id is taken from the last path segment of *url*.

        :param url: course page URL
        :return: list of video URLs (empty when the API reports failure)
        """
        jsonData = self.sess.get(
            url=api.getCommonVideoList % (str.split(url, r'/')[-1]),
            headers=user_agent.getheaders())
        res = json.loads(jsonData.text)
        links = []
        if not res["State"]:
            logging.debug(res["Message"])
        else:
            logging.info(res["Data"]["Modules"][0]["Title"])
            logging.debug(res["Data"]["Url"])
            # BUG FIX: the original never populated the result list, so
            # crawl() always iterated over nothing and downloaded no videos.
            # TODO(review): confirm "Url" is the direct video link; extend to
            # one link per module if the API returns several.
            links.append(res["Data"]["Url"])
        return links

    def downloadVideo(self, url, fileName):
        '''
        download video
        :param url: download url
        :param fileName: basename (without extension) for the saved .mp4
        :return: None
        '''
        with closing(requests.get(url=url, stream=True)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            # Ensure the target directory exists (original crashed on a
            # fresh checkout without ./Video).
            os.makedirs('./Video', exist_ok=True)
            file_D = './Video/' + fileName + '.mp4'
            # Skip files that are already fully downloaded (size match).
            if os.path.exists(file_D) and os.path.getsize(file_D) == content_size:
                print('跳过' + fileName)
            else:
                progress = DownloadProgress.DownloadProgress(
                    fileName, total=content_size, unit="KB",
                    chunk_size=chunk_size,
                    run_status="正在下载", fin_status="下载完成")
                with open(file_D, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        progress.refresh(count=len(data))

    def crawl(self):
        """
        crawl ouchn website: read config, resolve video links, download all.
        param :
        return:
        """
        with open(self.configPath, "r", encoding="utf8") as f:
            try:
                myConfig = json.loads(f.read())
                courseUrls = myConfig["courseUrl"]
                # BUG FIX: the original created an executor, never submitted
                # work to it, never shut it down, and called downloadVideo()
                # without the required fileName argument (TypeError).
                with ThreadPoolExecutor(max_workers=10) as executor:
                    for courseLink in courseUrls:
                        videoLinks = self.getCommonVideoLinks(courseLink)
                        for index, videoLink in enumerate(videoLinks):
                            # Derive a stable file name from the course id
                            # plus the link's position within the course.
                            name = str.split(courseLink, r'/')[-1] + '_' + str(index)
                            executor.submit(self.downloadVideo, videoLink, name)
            except Exception as e:
                # Top-level boundary: log with traceback instead of bare print.
                logging.exception(e)