crawl_ouchn.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time : 2022/05/23 13:15:38
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc : Multi-threaded crawler for ouchn.
'''
import requests
import os
import sys
import re
import json
import logging
from contextlib import closing
from crawl_ouchn import DownloadProgress, api, user_agent
from concurrent.futures import ThreadPoolExecutor


class CrawlOuchn():
    def __init__(self, configPath=r'config/config.json'):
        self.sess = requests.Session()
        self.configPath = configPath

    def checkNet(self):
        '''check network connectivity with a simple request'''
        res = self.sess.get("http://baidu.com")
        logging.debug(res.text)
        return True

    def getCode(self, phone):
        '''
        get verify code
        '''
        data = {}
        # NOTE: "url" below is a placeholder kept from the original source
        res = self.sess.get("url", data=data, headers=user_agent.getheaders())
        logging.debug(res.text)

    def login(self, username, password):
        # NOTE: the payload still carries the original placeholder values;
        # username/password are not wired into the request yet
        data = {
            "username": "x",
            "code": "xx"
        }
        res = self.sess.post(api.login, data=data, headers=user_agent.getheaders())
        logging.debug(res.text)

    def getVIPVideoLinks(self, url):
        pass

    def getCommonVideoLinks(self, url):
        jsonData = self.sess.get(
            url=api.getCommonVideoList % (url.split('/')[-1]),
            headers=user_agent.getheaders())
        res = json.loads(jsonData.text)
        if not res["State"]:
            logging.debug(res["Message"])
        else:
            logging.info(res["Data"]["Modules"][0]["Title"])
            logging.debug(res["Data"]["Url"])
        link = []  # TODO: collect video links from res["Data"]
        return link

    def downloadVideo(self, url, fileName):
        '''
        download video
        :param url: download url
        :param fileName: name of the saved file (without extension)
        '''
        with closing(requests.get(url=url, stream=True)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            file_D = './Video/' + fileName + '.mp4'
            if os.path.exists(file_D) and os.path.getsize(file_D) == content_size:
                # file already fully downloaded, skip it
                print('跳过' + fileName)
            else:
                progress = DownloadProgress.DownloadProgress(
                    fileName, total=content_size, unit="KB",
                    chunk_size=chunk_size,
                    run_status="正在下载", fin_status="下载完成")
                with open(file_D, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        progress.refresh(count=len(data))

    def crawl(self):
        """
        crawl ouchn website: read course urls from config and download videos
        """
        with open(self.configPath, "r", encoding="utf8") as f:
            try:
                myConfig = json.loads(f.read())
                courseUrls = myConfig["courseUrl"]
                with ThreadPoolExecutor(max_workers=10) as pool:
                    for courseLink in courseUrls:
                        videoLinks = self.getCommonVideoLinks(courseLink)
                        for videoLink in videoLinks:
                            # the original call omitted fileName; deriving it
                            # from the link is an assumption
                            pool.submit(self.downloadVideo, videoLink,
                                        videoLink.split('/')[-1])
            except Exception as e:
                logging.error(e)
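

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original module):
# a minimal driver assuming config/config.json holds a "courseUrl" list of
# course page URLs, e.g. {"courseUrl": ["https://example.ouchn.cn/course/1"]}.
# The example URL is hypothetical.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    crawler = CrawlOuchn()
    if crawler.checkNet():
        crawler.crawl()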