download.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @Contact : liuyuqi.gov@msn.cn
  5. @Time : 2023/05/17 12:45:38
  6. @License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
  7. @Desc : download csdn file
  8. '''
  9. from bs4 import BeautifulSoup
  10. import time
  11. import re
  12. import os
  13. import requests
  14. import sys
  15. from import print_msg
  if sys.version_info[0] < 3:
      # Python 2 only: switch the implicit str/unicode conversion codec from
      # ASCII to UTF-8. `reload` is needed because site.py removes
      # sys.setdefaultencoding at start-up. On Python 3 neither name exists
      # and the default encoding is already UTF-8, so the step is skipped.
      reload(sys)  # noqa: F821 - builtin on Python 2
      sys.setdefaultencoding('utf8')
  class CsdnDownloader:
      """Log in to CSDN and download resource files through one HTTP session.

      Typical use: ``CsdnDownloader(username, password).download(url, local_dir)``.
      The first ``download`` call performs the login; later calls on the same
      instance reuse the session created in ``__init__``.
      """

      # Kept for backward compatibility; nothing in this class reads them.
      driver = None
      action = None
      # CSDN account name
      __username = ""
      # Login password
      __password = ""
      # HTTP session. Created per instance in __init__ so that two downloaders
      # (possibly for different accounts) never share one cookie jar.
      __session = None
      # Number of download() calls made
      download_count = 0
      # Whether the login request has completed successfully
      __is_logined = False
      __login_url = "https://passport.csdn.net/account/login"
      # How many times download() tries before giving up
      _MAX_ATTEMPTS = 3

      def __init__(self, username, password):
          """Store the credentials and open a fresh HTTP session.

          :param username: CSDN account name.
          :param password: CSDN account password.
          """
          self.__username = username
          self.__password = password
          self.__session = requests.session()

      @staticmethod
      def _filename_from_header(content_disposition):
          """Extract a safe local file name from a Content-Disposition value.

          Only the quoted form (``filename="name.ext"`` at the end of the
          header) is understood. Directory components are dropped so that a
          server-supplied name cannot escape the download directory, and
          whitespace is replaced with underscores.

          :param content_disposition: raw header value, or None if absent.
          :return: the sanitized file name, or None when no usable name exists.
          """
          if not content_disposition:
              return None
          match = re.search(r".*\"(.*)\"$", content_disposition)
          if match is None:
              return None
          # Treat both separator styles as directories, then keep the last part.
          name = os.path.basename(match.group(1).replace("\\", "/"))
          name = re.sub(r"\s", "_", name)
          if name in ("", ".", ".."):
              return None
          return name

      def download(self, remote_url, local_dir):
          """Download the file behind a CSDN download page into ``local_dir``.

          Logs in first if this instance has not done so yet, then makes up to
          ``_MAX_ATTEMPTS`` attempts to resolve the real download link and
          fetch the file.

          :param remote_url: URL of the CSDN download page.
          :param local_dir: target directory; created when it does not exist.
          :return: path of the saved file, or None if every attempt failed.
          """
          # 1. Log in on first use
          if not self.__is_logined:
              self.__login()
          # Count this download request
          self.download_count += 1

          for _ in range(self._MAX_ATTEMPTS):
              # 2. Resolve the real download URL from the page
              page_text = self.__session.get(remote_url).text
              page = BeautifulSoup(page_text, "html5lib")
              link = page.find("a", id="vip_btn")
              if link is None or not link.attrs.get("href"):
                  # Page did not contain the download button; try again.
                  continue

              # 3. Fetch the file
              source = self.__session.get(link.attrs["href"])
              # 3.1 Work out the file name announced by the server
              filename = self._filename_from_header(
                  source.headers.get("Content-Disposition"))
              if filename is None:
                  continue

              # 3.2 Prepare the local file location
              if not os.path.exists(local_dir):
                  os.makedirs(local_dir)
              local_path = os.path.join(local_dir, filename)
              # Python 2 callers got a GBK-encoded byte path (presumably for
              # Chinese-locale Windows); Python 3 takes the text path as-is.
              if sys.version_info[0] >= 3:
                  target = local_path
              else:
                  target = local_path.encode("gbk")

              # 3.3 Write the body chunk by chunk; `with` closes the file
              # even if a write fails.
              with open(target, "wb") as local_file:
                  for chunk in source.iter_content(chunk_size=512):
                      if chunk:
                          local_file.write(chunk)
              return local_path
          return None

      def __login(self):
          """Submit the CSDN login form using this instance's session.

          Both the form page and the form submission go through the session,
          so cookies set by the login page are sent with the POST and cookies
          set by the login response are kept for later downloads.
          """
          # 1. Request the login page and read the hidden form parameters
          page_text = self.__session.get(self.__login_url).text
          page = BeautifulSoup(page_text, "html5lib")
          form = page.find("form", id="fm1")
          # The form action carries a random suffix on every request
          location = form.attrs["action"]
          params = {"username": self.__username, "password": self.__password}
          for field in ("lt", "execution", "_eventId"):
              selector = "input[name=%s]" % field
              params[field] = form.select(selector)[0].attrs["value"]
          # CSDN seems to detect bots; a short pause improves the success rate
          time.sleep(1)
          # 2. Submit the login form
          response = self.__session.post(location, params)
          # 3. Cookies are already stored on the session. Only the HTTP status
          # is checked here; the response body is not inspected.
          self.__is_logined = response.ok
  if __name__ == '__main__':
      # Demo run: download one sample resource with placeholder credentials
      # and report where it was saved (or that it failed).
      downloader = CsdnDownloader("test", "123456")
      saved_to = downloader.download(
          'http://download.csdn.net/download/lqkitten/10113904',
          "c://Robot_Download/")
      if saved_to is None:
          print_msg("CSDN下载失败")
      else:
          print_msg("CSDN下载完成,本地路径:" + saved_to)