download.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @Author : liuyuqi
  5. @Contact : liuyuqi.gov@msn.cn
  6. @Time : 2020/05/03 19:10:15
  7. @Version : 1.0
  8. @License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
  9. @Desc : teaching.applysquare.com
  10. '''
  11. import json
  12. import logging
  13. import os
  14. import re
  15. import time
  16. from contextlib import closing
  17. import requests
  18. from selenium import webdriver
  19. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  20. # Function dealing with illegal characters of windows filename
  21. def filename_filter(name:str):
  22. illegal_list = list('/\:*?”"<>|')
  23. for char in illegal_list:
  24. name = name.replace(char, ' ')
  25. return name
  26. def construct_attchment_list(driver, token, pid, uid, cid):
  27. attachment_list = list()
  28. attachment_info_url = attachment_url_fmt.format(token, pid, 1, uid, cid)
  29. driver.get(attachment_info_url)
  30. raw_info = re.search(r'\{.*\}', driver.page_source).group(0)
  31. info = json.loads(raw_info).get('message')
  32. file_num = info.get('count')
  33. current_page = 1
  34. # Add attachment path to attachment_list
  35. while len(attachment_list) < file_num:
  36. current_url = attachment_url_fmt.format(token, pid, current_page, uid, cid)
  37. driver.get(current_url)
  38. raw_info = re.search(r'\{.*\}', driver.page_source).group(0)
  39. info = json.loads(raw_info).get('message')
  40. attachment_list.extend(info.get('list'))
  41. current_page += 1
  42. return attachment_list
  43. # Load config from config.json
  44. with open('config.json', 'r') as f:
  45. config = json.loads(f.read())
  46. user_name = config.get('username')
  47. user_passwd = config.get('password')
  48. headless_mode = config.get('headless_mode')
  49. download_all_ext = config.get('download_all_ext')
  50. download_all_courses = config.get('download_all_courses')
  51. ext_list = config.get('ext_list')
  52. ext_expel_list = config.get('ext_expel_list')
  53. cid_list = config.get('cid_list')
  54. # auto_restart = True
  55. # speed_threshold = 50 * 1024
  56. # Some metadata
  57. login_url = r"https://teaching.applysquare.com/Home/User/login"
  58. attachment_url_fmt = r'https://teaching.applysquare.com/Api/CourseAttachment/getList/token/{}?parent_id={}&page={}&plan_id=-1&uid={}&cid={}'
  59. course_info_url_fmt = r'https://teaching.applysquare.com/Api/Public/getIndexCourseList/token/{}?type=1&usertype=1&uid={}'
  60. token_pattern = r'(https://teaching\.applysquare\.com/Api/Public/getIndexCourseList/token/.*?)"'
  61. # Start the webdriver
  62. caps = DesiredCapabilities.CHROME
  63. caps['loggingPrefs'] = {'performance': 'ALL'}
  64. opt = webdriver.ChromeOptions()
  65. opt.add_experimental_option('w3c', False)
  66. opt.add_argument('log-level=3')
  67. if headless_mode:
  68. opt.add_argument("--headless")
  69. driver = webdriver.Chrome(options=opt, desired_capabilities=caps)
  70. # Login to Pedagogy Square
  71. driver.get(login_url)
  72. time.sleep(1)
  73. driver.find_element_by_xpath(r"/html/body/div[2]/div/div[2]/div/div/div/div/div[2]/div/div/div[1]/input").send_keys(user_name) # Send username
  74. driver.find_element_by_xpath(r'//*[@id="id_login_password"]').send_keys(user_passwd) # Send password
  75. driver.find_element_by_xpath(r'//*[@id="id_login_button"]').click() # Submit
  76. time.sleep(0.5)
  77. # Dealing with student-teacher selection
  78. try:
  79. driver.find_element_by_xpath(r'/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[2]/div[1]/i').click() # Choose student
  80. driver.find_element_by_xpath(r'/html/body/div[2]/div/div[2]/div/div/div[1]/div[4]/a').click() # Submit
  81. except Exception:
  82. pass
  83. time.sleep(0.5)
  84. if (driver.current_url == r'https://teaching.applysquare.com/S/Index/index'):
  85. print("Login Successfully!")
  86. else:
  87. print("Login Error --- Please check your username & password")
  88. print("Disable headless mode for detailed information")
  89. # Get token for authorization
  90. token = None
  91. while not token:
  92. for entry in driver.get_log('performance'):
  93. match_obj = re.search(token_pattern, entry.get('message'))
  94. if match_obj:
  95. temp_url = match_obj.group(1)
  96. token = re.search(r'token/(.*?)\?', temp_url).group(1)
  97. uid = re.search(r'uid=(.*?)', temp_url).group(1)
  98. break
  99. cid2name_dict = dict()
  100. course_info_url = course_info_url_fmt.format(token, uid)
  101. driver.get(course_info_url)
  102. raw_info = re.search(r'\{.*\}', driver.page_source).group(0)
  103. info = json.loads(raw_info).get('message')
  104. for entry in info:
  105. cid2name_dict[entry.get('cid')] = entry.get('name')
  106. if download_all_courses:
  107. cid_list = cid2name_dict.keys()
  108. for cid in cid_list:
  109. cid = str(cid) # Prevent bug caused by wrong type of cid
  110. course_name = filename_filter(cid2name_dict[cid])
  111. print("\nDownloading files of course {}".format(course_name))
  112. # Create dir for this course
  113. try:
  114. os.chdir("./{}".format(course_name))
  115. except FileNotFoundError:
  116. os.mkdir("{}".format(course_name))
  117. os.chdir("./{}".format(course_name))
  118. # Construct attachment list, with some dirs in it
  119. course_attachment_list = construct_attchment_list(driver=driver, token=token, pid=0, uid=uid, cid=cid)
  120. # Iteratively add files in dirs to global attachment list
  121. dir_counter = 0
  122. for entry in course_attachment_list:
  123. if (entry.get('ext') == 'dir'):
  124. dir_counter += 1
  125. # Add dir content to attachment list
  126. dir_id = entry.get('id')
  127. course_attachment_list.extend(construct_attchment_list(driver=driver, token=token, pid=dir_id, uid=uid, cid=cid))
  128. print("Get {:d} files, with {:d} dirs".format(len(course_attachment_list)-dir_counter, dir_counter))
  129. # Download attachments
  130. for entry in course_attachment_list:
  131. ext = entry.get('ext')
  132. if (ext == 'dir') or (ext in ext_expel_list) or (not download_all_ext and ext not in ext_list):
  133. continue
  134. if (ext in entry.get('title')):
  135. filename = filename_filter(entry.get('title'))
  136. else:
  137. filename = filename_filter("{}.{}".format(entry.get('title'), ext))
  138. filesize = entry.get('size')
  139. with closing(requests.get(entry.get('path').replace('amp;', ''), stream=True)) as res:
  140. content_size = eval(res.headers['content-length'])
  141. if filename in os.listdir():
  142. # If file is up-to date, continue; else, delete and re-download
  143. if os.path.getsize(filename) == content_size:
  144. print("File \"{}\" is up-to-date".format(filename))
  145. continue
  146. else:
  147. print("Updating File {}".format(filename))
  148. os.remove(filename)
  149. print("Downloading {}, filesize = {}".format(filename, filesize))
  150. chunk_size = min(content_size, 10240)
  151. with open(filename, "wb") as f:
  152. chunk_count = 0
  153. start_time = time.time()
  154. # previous_time = time.time()
  155. # lag_counter = 0
  156. total = content_size / 1024 / 1024
  157. for data in res.iter_content(chunk_size=chunk_size):
  158. chunk_count += 1
  159. processed = len(data) * chunk_count / 1024 / 1024
  160. current_time = time.time()
  161. if chunk_count < 5:
  162. print(r" Total: {:.2f} MB Processed: {:.2f} MB ({:.2f}%)".format(total, processed, processed/total*100), end = '\r')
  163. else:
  164. remaining = (current_time-start_time)/processed*(total-processed)
  165. print(r" Total: {:.2f} MB Processed: {:.2f} MB ({:.2f}%), ETA {:.2f}s".format(total, processed, processed/total*100, remaining), end = '\r')
  166. f.write(data)
  167. # speed = chunk_size / 1.0 * (current_time - previous_time)
  168. # if speed < speed_threshold:
  169. # lag_counter += 1
  170. # else:
  171. # lag_counter = 0
  172. # if lag_counter > 10:
  173. # print("Restart downloading of file {}".format(filename))
  174. # attachment_list.append(entry)
  175. # continue
  176. os.chdir(r'../') # Switch directory
  177. print("Done!")