spider.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428
  1. import os
  2. import requests
  3. import time
  4. import re
  5. import json
  6. import schedule
  7. import tempfile
  8. from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
  9. from selenium.webdriver import DesiredCapabilities, Chrome, ChromeOptions
  10. from datetime import datetime
  11. from random import randrange, choice
  12. from selenium.webdriver.common import utils
# Settings for the third-party verification-code (CAPTCHA image) recognition
# service that get_code() posts to.
code_url = 'http://apigateway.jianjiaoshuju.com/api/v_1/yzmCustomized.html'
# Credentials sent as HTTP headers with every request to code_url.
# The 'X' values are placeholders.
code_headers = {
    'appCode': 'X',
    'appKey': 'X',
    'appSecret': 'X'
}
# Browser-like request headers used for the plain `requests.get` calls in
# set_global_chek_timeout() and get_ip_list().
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'deflate',
    'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'cache-control': 'max-age=0',
    'dnt': '1',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
}
# Page opened by register_email() and probed through each proxy in
# set_global_chek_timeout().
url = 'https://signup.live.com/signup'
# Shared wait, in seconds. It is passed to time.sleep() between UI retries and
# used as the `timeout` of the proxy probe request; set_global_chek_timeout()
# and get_ip_list() overwrite it at runtime.
check_timeout = 0
# Symbols that get_email_password() draws from when building a password.
# NOTE(review): '_+' is a single two-character entry -- possibly a missing
# comma between '_' and '+'; confirm before changing.
sign_list = ['~', '!', '@', '#', '$', '%', '^', '&', '*',
             '(', ')', '_+', '<', '>', '?', ':', '"', '{', '}', '|']
# Pool of given names. register_email() samples it with choice() for both the
# '#LastName' and the '#FirstName' form fields.
name_list = [
    'Emma',
    'Olivia',
    'Ava',
    'Isabella',
    'Sophia',
    'Charlotte',
    'Mia',
    'Amelia',
    'Harper',
    'Evelyn',
    'Abigail',
    'Emily',
    'Elizabeth',
    'Mila',
    'Ella',
    'Avery',
    'Sofia',
    'Camila',
    'Aria',
    'Scarlett',
    'Victoria',
    'Madison',
    'Luna',
    'Grace',
    'Chloe',
    'Penelope',
    'Layla',
    'Riley',
    'Zoey',
    'Nora',
    'Lily',
    'Eleanor',
    'Hannah',
    'Lillian',
    'Addison',
    'Aubrey',
    'Ellie',
    'Stella',
    'Natalie',
    'Zoe',
    'Leah',
    'Hazel',
    'Violet',
    'Aurora',
    'Savannah',
    'Audrey',
    'Brooklyn',
    'Bella',
    'Claire',
    'Skylar'
]
# One-entry cache used by get_code(): the most recently submitted image
# payload and the code that was returned for it.
last_image_data = ''
last_code = ''
  91. def find_element_by_css_selector(driver, css_selector):
  92. try:
  93. time.sleep(0.1)
  94. return driver.find_element_by_css_selector(css_selector)
  95. except Exception:
  96. time.sleep(check_timeout)
  97. return find_element_by_css_selector(driver, css_selector)
  98. def find_elements_by_css_selector(driver, css_selector):
  99. try:
  100. time.sleep(0.1)
  101. return driver.find_elements_by_css_selector(css_selector)
  102. except Exception:
  103. time.sleep(check_timeout)
  104. return find_elements_by_css_selector(driver, css_selector)
  105. def find_element_by_link_text(driver, link_text):
  106. try:
  107. time.sleep(0.1)
  108. return driver.find_element_by_link_text(link_text)
  109. except Exception:
  110. time.sleep(check_timeout)
  111. return find_element_by_link_text(driver, link_text)
  112. def find_elements_by_tag_name(driver, tag_name, target_number=None, try_times=None):
  113. time.sleep(0.1)
  114. if try_times <= 0:
  115. raise Exception('Can not find element.')
  116. if try_times is not None:
  117. try_times -= 1
  118. try:
  119. l = driver.find_elements_by_tag_name(tag_name)
  120. if target_number is None:
  121. return l
  122. if len(l) == target_number:
  123. return l
  124. else:
  125. time.sleep(check_timeout)
  126. return find_elements_by_tag_name(driver, tag_name, target_number, try_times)
  127. except Exception:
  128. time.sleep(check_timeout)
  129. return find_elements_by_tag_name(driver, tag_name, target_number, try_times)
  130. def find_element_by_tag_name(driver, tag_name):
  131. try:
  132. time.sleep(0.1)
  133. return driver.find_element_by_tag_name(tag_name)
  134. except Exception:
  135. time.sleep(check_timeout)
  136. return find_element_by_tag_name(driver, tag_name)
  137. def get_code(img_data):
  138. global last_image_data
  139. global last_code
  140. if last_image_data == img_data:
  141. return last_code
  142. last_image_data = img_data
  143. data = {
  144. 'v_pic': img_data,
  145. 'pri_id': 'ne',
  146. }
  147. response = requests.post(code_url, headers=code_headers, data=data)
  148. code = json.loads(response.text)['v_code']
  149. last_code = code
  150. print('code is {}'.format(code))
  151. return code
  152. def set_global_chek_timeout(ip_list, max_bad_number=None):
  153. global check_timeout
  154. if not ip_list:
  155. print(ip_list)
  156. raise Exception('ip_list is None')
  157. bad_number = 0
  158. success_number = 0
  159. if max_bad_number is None:
  160. max_bad_number = len(ip_list)/2
  161. for ip, port, tp in ip_list:
  162. tp = tp.lower()
  163. try:
  164. ip_url = '{}:{}'.format(ip, port)
  165. requests.get(url=url, headers=headers, proxies={
  166. tp: '{}://'.format(tp)+ip_url}, timeout=check_timeout)
  167. success_number += 1
  168. except Exception:
  169. bad_number += 1
  170. print('try to set timeout={} \t success:{} \t fail:{}'.format(
  171. check_timeout, success_number, bad_number))
  172. if bad_number > max_bad_number:
  173. check_timeout += 1.5
  174. if check_timeout >= 10:
  175. raise TimeoutError
  176. return set_global_chek_timeout(ip_list, max_bad_number)
  177. if success_number > len(ip_list) - max_bad_number:
  178. d = datetime.now()
  179. with open('check_timeout.txt', 'w') as f:
  180. f.write('{} {} {} {} {}'.format(
  181. d.year, d.month, d.day, d.hour, check_timeout))
  182. return check_timeout
  183. def get_ip_list():
  184. global check_timeout
  185. d = datetime.now()
  186. year = d.year
  187. month = d.month
  188. day = d.day
  189. hour = d.hour
  190. req_url = 'https://ip.ihuan.me/today/{}/{:02d}/{{:02d}}/{{:02d}}.html'.format(
  191. year,
  192. month
  193. )
  194. ip_list = []
  195. while True:
  196. try:
  197. cur_url = req_url.format(day, hour)
  198. print('try to get ip list page: ' + cur_url)
  199. response = requests.get(cur_url, headers=headers)
  200. ip_list = re.findall(
  201. r'<br>([\d\.]*?):(\d*)@(.*?)#',
  202. response.text,
  203. re.S
  204. )
  205. if ip_list:
  206. break
  207. hour -= 1
  208. except Exception as ex:
  209. print(ex)
  210. time.sleep(2)
  211. hour -= 1
  212. if hour == 0:
  213. raise Exception('get ip timeout')
  214. print('get ip list page success')
  215. if os.path.exists('check_timeout.txt'):
  216. l = []
  217. with open('check_timeout.txt', 'r') as f:
  218. f_read = f.read()
  219. if f_read:
  220. l = list(map(float, f_read.split(' ')))
  221. if l and [year, month, day, hour] == l[:-1]:
  222. check_timeout = l[-1]
  223. else:
  224. set_global_chek_timeout(ip_list[-5:])
  225. else:
  226. set_global_chek_timeout(ip_list[-5:])
  227. used_ip = ''
  228. if os.path.exists('used_ip.txt'):
  229. with open('used_ip.txt', 'r') as f:
  230. used_ip = f.read()
  231. return [[tp, '{}:{}'.format(ip, port)] for ip, port, tp in ip_list if ip not in used_ip]
def register_email(driver, email, password):
    """Drive the signup form at ``url`` once with the given credentials.

    Args:
        driver: Selenium WebDriver controlling the browser.
        email: Text typed into the '#MemberName' field.
        password: Text typed into the '#PasswordInput' field.

    Returns:
        ``True`` when the page URL changes after the verification code was
        submitted; ``'exist'`` when the password page title does not show up
        within the wait budget (start_register() reports this as an already
        registered account); ``False`` when '#liveSwitch' cannot be clicked
        or the verification step raises.
    """
    driver.get(url)
    time.sleep(check_timeout*2)
    sleep_timess = 0
    # Step 1: click '#liveSwitch'; give up after more than 10 failed clicks.
    while True:
        try:
            driver.find_element_by_css_selector('#liveSwitch').click()
            break
        except Exception:
            time.sleep(check_timeout)
            sleep_timess += 1
            if sleep_timess > 10:
                return False
    # Step 2: enter the member name and submit; repeated until no step raises.
    while True:
        try:
            find_element_by_css_selector(driver, '#MemberName').clear()
            find_element_by_css_selector(
                driver, '#MemberName').send_keys(email)
            find_element_by_css_selector(driver, '#iSignupAction').click()
            break
        except Exception as e:
            time.sleep(check_timeout)
    sleep_timess = 0
    # Step 3: wait for the password page, recognised by its title (English or
    # Chinese); more than 15 waits without it is reported as 'exist'.
    while driver.title != 'Create a password' and driver.title != '创建密码':
        time.sleep(check_timeout)
        sleep_timess += 1
        if sleep_timess > 15:
            return 'exist'
    find_element_by_css_selector(
        driver, '#PasswordInput').send_keys(password)
    find_element_by_css_selector(driver, '#iOptinEmail').click()
    find_element_by_css_selector(driver, '#iSignupAction').click()
    time.sleep(check_timeout)
    # Step 4: both name fields are filled with random entries of name_list.
    find_element_by_css_selector(
        driver, '#LastName').send_keys(choice(name_list))
    find_element_by_css_selector(
        driver, '#FirstName').send_keys(choice(name_list))
    find_element_by_css_selector(driver, '#iSignupAction').click()
    time.sleep(check_timeout)
    # Step 5: pick a random <option> by position in each birth-date selector.
    find_element_by_css_selector(
        driver, '#BirthYear option:nth-child({})'.format(randrange(2, 25))).click()
    find_element_by_css_selector(
        driver, '#BirthMonth option:nth-child({})'.format(randrange(2, 11))).click()
    find_element_by_css_selector(
        driver, '#BirthDay option:nth-child({})'.format(randrange(2, 22))).click()
    find_element_by_css_selector(driver, '#iSignupAction').click()
    time.sleep(check_timeout)
    # URL of the verification page; leaving it is treated as success below.
    t_url = driver.current_url
    try:
        while True:
            # The lookup only succeeds when exactly 5 <input> elements are
            # present; the first one receives the code.
            code_element = find_elements_by_tag_name(driver, 'input', 5, 2)[0]
            code_element.clear()
            # The last of exactly 5 <img> elements is screenshotted and sent
            # to get_code().
            code_element.send_keys(
                get_code(
                    find_elements_by_tag_name(
                        driver, 'img', 5, 2
                    )[-1].screenshot_as_base64
                )
            )
            find_element_by_css_selector(driver, '#iSignupAction').click()
            time.sleep(check_timeout)
            try_times = 0
            # Keep clicking submit while the URL is unchanged, at most 9 times.
            while t_url == driver.current_url:
                time.sleep(check_timeout)
                try:
                    driver.find_element_by_css_selector(
                        '#iSignupAction').click()
                except Exception:
                    pass
                try_times += 1
                if try_times > 8:
                    break
            if t_url != driver.current_url:
                return True
            # Still on the same page: loop and read the image again.
    except Exception as e:
        print(e)
    return False
  309. def get_email_password():
  310. def _get_random_sign():
  311. return choice(sign_list)
  312. def _get_random_char(is_low=None):
  313. if is_low is None:
  314. r = randrange(0, 2)
  315. elif is_low is True:
  316. r = 0
  317. else:
  318. r = 1
  319. if r == 0:
  320. return chr(randrange(65, 91))
  321. else:
  322. return chr(randrange(97, 123))
  323. email = str(randrange(0, 10)).join(_get_random_char() for i in range(5))
  324. password = (
  325. str(randrange(0, 100)) + _get_random_sign()
  326. ).join(
  327. _get_random_char(True) + _get_random_char() + _get_random_char(False) for i in range(3)
  328. )
  329. return email, password
  330. def start_register(driver):
  331. email, password = get_email_password()
  332. result = register_email(driver, email, password)
  333. while result == 'exist':
  334. print('Already Register Account {}@outlook.com'.format(email))
  335. email, password = get_email_password()
  336. result = register_email(driver, email, password)
  337. if result:
  338. sleep_times = 0
  339. while 'account.microsoft.com' not in driver.current_url:
  340. sleep_times += 1
  341. time.sleep(check_timeout)
  342. if sleep_times >= 20:
  343. break
  344. if sleep_times >= 10:
  345. print('waiting time too long')
  346. else:
  347. with open('pass.txt', 'a+') as f:
  348. f.write(email + '@outlook.com ' + password + '\n')
  349. print('Success Register Account {}@outlook.com'.format(email))
  350. return True
  351. else:
  352. print('Fail')
  353. return False
  354. def create_driver(tp, ip_port):
  355. port = utils.free_port()
  356. options = ChromeOptions()
  357. desired_capabilities = DesiredCapabilities().CHROME
  358. desired_capabilities['pageLoadStrategy'] = 'none'
  359. os.popen(
  360. 'chrome.exe --remote-debugging-port={} --user-data-dir={} --proxy-server={}={}'.format(
  361. port, tempfile.mkdtemp(), tp, ip_port)
  362. )
  363. options.add_experimental_option(
  364. 'debuggerAddress', '127.0.0.1:{}'.format(port))
  365. driver = Chrome(
  366. options=options, desired_capabilities=desired_capabilities)
  367. driver.set_window_position(0, 0)
  368. driver.set_window_size(700, 600)
  369. return driver
  370. def run_driver():
  371. consecutive_fail_number = 0
  372. try:
  373. for tp, ip_port in get_ip_list():
  374. if datetime.now().minute == 10 or consecutive_fail_number > 10:
  375. break
  376. print('Proxy: {}://{}'.format(tp, ip_port))
  377. driver = create_driver(tp, ip_port)
  378. register_success = start_register(driver)
  379. driver.close()
  380. if register_success:
  381. consecutive_fail_number = 0
  382. if not register_success:
  383. consecutive_fail_number += 1
  384. except Exception as e:
  385. print(e)
  386. return schedule.every().minute.at(':10').do(run_driver)
# Script entry point: perform a single run. run_driver() registers a
# `schedule` job before returning.
# NOTE(review): no schedule.run_pending() loop is visible here, so the process
# appears to exit after this one call -- confirm.
if __name__ == '__main__':
    run_driver()