get_user.py 1.9 KB
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @Author : liuyuqi
  5. @Contact : liuyuqi.gov@msn.cn
  6. @Time : 2019/08/11 06:41:06
  7. @Version : 1.0
  8. @License : (C)Copyright 2019
  9. @Desc : 获取所有相亲用户数据
  10. https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&page=3&pageSize=20&sortType=rank
  11. 总共 447 信息,每页20条,共23页。
  12. '''
  13. # import pandas
  14. # import josn
  15. # import os,sys,re
  16. # import requests
import json
import threading
import time
import urllib.parse
import urllib.request
  20. url_seed = ""
  21. url_login = ""
  22. url_cache = set()
  23. headers = {
  24. 'User-Agent': "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36",
  25. 'Cookie': 'did=web_34abffaccc51410a45a2f09bee712ec6; didv=2; Hm_lvt_86a27b7db2c5c0ae37fee4a8a35033ee=1549878747,1549878930,1549878956; Hm_lpvt_86a27b7db2c5c0ae37fee4a8a35033ee=1549879170',
  26. 'Host': 'id.kuaishou.com',
  27. 'Referer': 'https://www.kuaishou.com/account/login/?redirectURL=https%3A%2F%2Fverify.kuaishou.com%2F%23%2Fverify%2Fpersonal',
  28. 'Upgrade-Insecure-Requests': '1',
  29. }
  30. def crawl():
  31. for i in range(1, 24):
  32. print(i)
  33. def getUser():
  34. data = {
  35. "": "",
  36. "": "",
  37. "": "",
  38. "": "",
  39. "": "",
  40. "": "",
  41. "": "",
  42. }
  43. try:
  44. req = urllib.request.Request(
  45. url=url_seed, data=urllib.parse.urlencode(data).encode(encoding='UTF8'), headers=headers)
  46. with urllib.request.urlopen(req) as res:
  47. print(res.read().decode('utf-8'))
  48. for i in res["d"]["list"].length:
  49. saveUser(res[i])
  50. except Exception as err:
  51. print(err)
  52. def saveUser(jsonUser):
  53. '''
  54. 保存到mongodb中
  55. '''
  56. pass
  57. if __name__ == "__main__":
  58. start_time = time.time()
  59. crawl()
  60. print("last time: {} s".format(time.time() - start_time))