main.py

# -*- coding: utf-8 -*-
'''
@Author : liuyuqi.gov@msn.cn
@date   : 2019/4/8
'''
__author__ = "liuyuqi"

import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from contextlib import closing

import requests

from crawl_xuexi import DownloadProgress, user_agent
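# Note (assumption): DownloadProgress and user_agent are project-local helpers;
# user_agent.getheaders() is taken to return a headers dict for requests, and
# DownloadProgress.DownloadProgress to render a console progress bar, as inferred
# from how they are used below.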
# src = "D:/PycharmProjects/crawl_xuexi/"
# os.chdir(src)
def get_video_links(url):
    '''Extract the mp4 video links from the data page (returned in reverse order).'''
    video = requests.get(url=url, headers=user_agent.getheaders()).content.decode("utf8")
    pattern = r'https://video.xuexi.cn/[^,"]*mp4'
    link = re.findall(pattern, video, re.I)
    link.reverse()
    return link
def downloadVideo(url, file_name):
    '''
    Download a video.
    :param url: download URL
    :param file_name: name of the saved file (without extension)
    :return: None; the video is written to ./Video/<file_name>.mp4
    '''
    with closing(requests.get(url=url, stream=True)) as response:
        chunk_size = 1024
        content_size = int(response.headers['content-length'])
        os.makedirs('./Video', exist_ok=True)  # make sure the output directory exists
        file_D = './Video/' + file_name + '.mp4'
        if os.path.exists(file_D) and os.path.getsize(file_D) == content_size:
            print('Skipping ' + file_name)
        else:
            progress = DownloadProgress.DownloadProgress(file_name, total=content_size, unit="KB",
                                                         chunk_size=chunk_size,
                                                         run_status="downloading", fin_status="download complete")
            with open(file_D, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    progress.refresh(count=len(data))
def crawl():
    with open("data/ml.json", "r", encoding="utf8") as f:
        mlData = json.loads(f.read())
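    # Assumption: ml.json is shaped like
    # {"fpe1ki18v228w00": [{"frst_name": "...", "static_page_url": "..."}, ...]},
    # inferred from the field accesses below.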
    pool = ThreadPoolExecutor(max_workers=10)  # thread pool running at most 10 downloads concurrently
    for i in range(len(mlData["fpe1ki18v228w00"])):
        frst_name = mlData["fpe1ki18v228w00"][i]["frst_name"].replace('\t', ' ')
        static_page_url = mlData["fpe1ki18v228w00"][i]["static_page_url"]
        # fetch the static page that references the mp4 video data file
        resData = requests.get(static_page_url, headers=user_agent.getheaders()).content.decode("utf8")
        preUrl = static_page_url.split("/")[3]
        pattern = r'src="./data(.*?)"></script>'
        url = "https://www.xuexi.cn/" + preUrl + "/data" + re.findall(pattern, resData, re.I)[0]
        res = get_video_links(url)[0]
        pool.submit(downloadVideo, res, frst_name)  # queue one download task in the pool
    pool.shutdown(wait=True)  # wait for all downloads to finish
if __name__ == '__main__':
    start_time = time.time()
    crawl()
    print("elapsed time: {} s".format(time.time() - start_time))