crawl_ouchn.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time : 2022/05/23 13:15:38
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc : Multi-threaded crawler for ouchn.
'''
import requests
import os
import sys
import re
import json
import logging
from contextlib import closing
from crawl_ouchn import DownloadProgress, api, user_agent
from concurrent.futures import ThreadPoolExecutor


class CrawlOuchn():
    def __init__(self, configPath=r'config/config.json'):
        self.sess = requests.Session()
        self.configPath = configPath

    def checkNet(self):
        '''check network connectivity with a simple request'''
        res = self.sess.get("http://baidu.com")
        logging.debug(res.text)
        return True

    def getCode(self, phone):
        '''
        get verify code
        '''
        data = {}
        # NOTE: "url" below is a placeholder kept from the original source
        res = self.sess.get("url", data=data, headers=user_agent.getheaders())
        logging.debug(res.text)

    def login(self, username, password):
        # NOTE: the payload still carries the original placeholder values;
        # username/password are not wired into the request yet
        data = {
            "username": "x",
            "code": "xx"
        }
        res = self.sess.post(api.login, data=data, headers=user_agent.getheaders())
        logging.debug(res.text)

    def getVIPVideoLinks(self, url):
        pass

    def getCommonVideoLinks(self, url):
        jsonData = self.sess.get(
            url=api.getCommonVideoList % (url.split('/')[-1]),
            headers=user_agent.getheaders())
        res = json.loads(jsonData.text)
        if not res["State"]:
            logging.debug(res["Message"])
        else:
            logging.info(res["Data"]["Modules"][0]["Title"])
            logging.debug(res["Data"]["Url"])
        link = []  # TODO: collect video links from res["Data"]
        return link

    def downloadVideo(self, url, fileName):
        '''
        download video
        :param url: download url
        :param fileName: name of the saved file (without extension)
        '''
        with closing(requests.get(url=url, stream=True)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            file_D = './Video/' + fileName + '.mp4'
            if os.path.exists(file_D) and os.path.getsize(file_D) == content_size:
                # file already fully downloaded, skip it
                print('跳过' + fileName)
            else:
                progress = DownloadProgress.DownloadProgress(
                    fileName, total=content_size, unit="KB",
                    chunk_size=chunk_size,
                    run_status="正在下载", fin_status="下载完成")
                with open(file_D, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        progress.refresh(count=len(data))

    def crawl(self):
        """
        crawl ouchn website: read course urls from config and download videos
        """
        with open(self.configPath, "r", encoding="utf8") as f:
            try:
                myConfig = json.loads(f.read())
                courseUrls = myConfig["courseUrl"]
                with ThreadPoolExecutor(max_workers=10) as pool:
                    for courseLink in courseUrls:
                        videoLinks = self.getCommonVideoLinks(courseLink)
                        for videoLink in videoLinks:
                            # the original call omitted fileName; deriving it
                            # from the link is an assumption
                            pool.submit(self.downloadVideo, videoLink,
                                        videoLink.split('/')[-1])
            except Exception as e:
                logging.error(e)
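

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original module):
# a minimal driver assuming config/config.json holds a "courseUrl" list of
# course page URLs, e.g. {"courseUrl": ["https://example.ouchn.cn/course/1"]}.
# The example URL is hypothetical.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    crawler = CrawlOuchn()
    if crawler.checkNet():
        crawler.crawl()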