main.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. # -*- coding: utf-8 -*-
  2. '''
  3. @Author :liuyuqi.gov@msn.cn
  4. @date :2019/4/8
  5. '''
  6. __author__ = "liuyuqi"
  7. import json
  8. import os
  9. import re
  10. from contextlib import closing
  11. import requests
  12. import DownloadProgress
  13. import user_agent
  14. import threading
  15. from concurrent.futures import ThreadPoolExecutor
  16. import time
  17. # src = "D:/PycharmProjects/crawl_xuexi/"
  18. # os.chdir(src)
  19. def get_video_links(url):
  20. video = requests.get(url=url, headers=user_agent.getheaders()).content.decode("utf8")
  21. pattern = r'https://video.xuexi.cn/[^,"]*mp4'
  22. link = re.findall(pattern, video, re.I)
  23. link.reverse()
  24. return link
  25. def downloadVideo(url, file_name):
  26. '''
  27. 下载视频
  28. :param url: 下载url路径
  29. :return: 文件
  30. '''
  31. with closing(requests.get(url=url, stream=True)) as response:
  32. chunk_size = 1024
  33. content_size = int(response.headers['content-length'])
  34. file_D = './Video/' + file_name + '.mp4'
  35. if (os.path.exists(file_D) and os.path.getsize(file_D) == content_size):
  36. print('跳过' + file_name)
  37. else:
  38. progress = DownloadProgress.DownloadProgress(file_name, total=content_size, unit="KB",
  39. chunk_size=chunk_size,
  40. run_status="正在下载", fin_status="下载完成")
  41. with open(file_D, "wb") as file:
  42. for data in response.iter_content(chunk_size=chunk_size):
  43. file.write(data)
  44. progress.refresh(count=len(data))
  45. def crawl():
  46. with open("data/ml.json", "r", encoding="utf8") as f:
  47. mlData = json.loads(f.read())
  48. pool = ThreadPoolExecutor(max_workers=10) # 创建一个最大可容纳10个task的线程池
  49. for i in range((len(mlData["fpe1ki18v228w00"]))):
  50. frst_name = mlData["fpe1ki18v228w00"][i]["frst_name"].replace('\t', ' ')
  51. static_page_url = mlData["fpe1ki18v228w00"][i]["static_page_url"]
  52. # 打开 mp4 视频网页链接
  53. resData = requests.get(static_page_url, headers=user_agent.getheaders()).content.decode("utf8")
  54. preUrl = static_page_url.split("/")[3]
  55. pattern = r'src="./data(.*?)"></script>'
  56. url = "https://www.xuexi.cn/" + preUrl + "/data" + re.findall(pattern, resData, re.I)[0]
  57. res = get_video_links(url)[0]
  58. future1 = pool.submit(downloadVideo,
  59. res, frst_name) # 往线程池里面加入一个task
  60. if __name__ == '__main__':
  61. start_time = time.time()
  62. crawl()
  63. print("last time: {} s".format(time.time() - start_time))