main.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. # -*- coding: utf-8 -*-
  2. '''
  3. @Author :liuyuqi.gov@msn.cn
  4. @date :2019/4/8
  5. '''
  6. __author__ = "liuyuqi"
  7. import json
  8. import os
  9. import re
  10. from contextlib import closing
  11. import requests
  12. import DownloadProgress
  13. import user_agent
  14. # src = "D:/PycharmProjects/crawl_xuexi/"
  15. # os.chdir(src)
  def get_video_links(url):
      """Fetch ``url`` and return the video.xuexi.cn mp4 links found in its body.

      :param url: address of the page or data script to scan
      :return: list of matching mp4 URLs, in reverse order of appearance
          (empty when nothing matches)
      :raises requests.RequestException: on connection failure, timeout or
          an HTTP error status
      """
      # A timeout keeps one stalled connection from hanging the whole crawl.
      response = requests.get(url=url, headers=user_agent.getheaders(), timeout=30)
      # Fail loudly on 4xx/5xx instead of scanning an error page for links.
      response.raise_for_status()
      page = response.content.decode("utf8")
      # Dots in the host are escaped so they match literally, not "any char".
      pattern = r'https://video\.xuexi\.cn/[^,"]*mp4'
      links = re.findall(pattern, page, re.I)
      links.reverse()
      return links
  def downloadVideo(url, file_name):
      """Download a video to ``./Video/<file_name>.mp4`` with a progress display.

      The download is skipped when the target file already exists and its
      size equals the Content-Length reported by the server.

      :param url: download URL of the video
      :param file_name: base name of the target file, without extension
      :return: None
      :raises requests.RequestException: on connection failure, timeout or
          an HTTP error status
      :raises KeyError: if the response carries no Content-Length header
      """
      with closing(requests.get(url=url, stream=True, timeout=30)) as response:
          # Do not save an HTTP error body under an .mp4 name.
          response.raise_for_status()
          chunk_size = 1024
          content_size = int(response.headers['content-length'])
          video_dir = './Video'
          # open() below fails if the output directory does not exist yet.
          os.makedirs(video_dir, exist_ok=True)
          file_D = video_dir + '/' + file_name + '.mp4'
          if os.path.exists(file_D) and os.path.getsize(file_D) == content_size:
              print('跳过' + file_name)
              return
          progress = DownloadProgress.DownloadProgress(
              file_name,
              total=content_size,
              unit="KB",
              chunk_size=chunk_size,
              run_status="正在下载",
              fin_status="下载完成",
          )
          with open(file_D, "wb") as file:
              for data in response.iter_content(chunk_size=chunk_size):
                  file.write(data)
                  progress.refresh(count=len(data))
  def crawl():
      """Download the video of every entry listed in ``data/ml.json``.

      For each entry under the ``fpe1ki18v228w00`` key, the static page is
      fetched, the ``./data...`` script it references is located, and the
      first link returned by ``get_video_links`` for that script is passed
      to ``downloadVideo`` under the entry's ``frst_name``. Entries whose
      page yields no data script or no video link are reported and skipped
      rather than aborting the run with an IndexError.

      :return: None
      :raises requests.RequestException: on connection failure or timeout
      """
      with open("data/ml.json", "r", encoding="utf8") as f:
          mlData = json.load(f)
      # Loop-invariant, so built once outside the loop.
      pattern = r'src="./data(.*?)"></script>'
      for item in mlData["fpe1ki18v228w00"]:
          # Tabs in the title are replaced since it is used as a file name.
          frst_name = item["frst_name"].replace('\t', ' ')
          static_page_url = item["static_page_url"]
          # Open the page that embeds the mp4 video.
          resData = requests.get(
              static_page_url, headers=user_agent.getheaders(), timeout=30
          ).content.decode("utf8")
          # Fourth "/"-separated piece of the URL, i.e. its first path segment.
          preUrl = static_page_url.split("/")[3]
          matches = re.findall(pattern, resData, re.I)
          if not matches:
              print('未找到数据脚本，跳过' + frst_name)
              continue
          url = "https://www.xuexi.cn/" + preUrl + "/data" + matches[0]
          links = get_video_links(url)
          if not links:
              print('未找到视频链接，跳过' + frst_name)
              continue
          downloadVideo(links[0], file_name=frst_name)
  # Start the crawl only when this file is run as a script, not on import.
  if __name__ == "__main__":
      crawl()