get_user.py 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author : liuyuqi
@Contact : liuyuqi.gov@msn.cn
@Time : 2019/08/11 06:41:06
@Version : 1.0
@License : (C)Copyright 2019
@Desc : Fetch the data of all users in the matchmaking (blind-date) topic.
https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&page=3&pageSize=20&sortType=rank
447 entries in total, 20 per page, 23 pages.
'''
# import pandas
# import josn
# import os,sys,re
# import requests
import threading
import time
import urllib.parse
import urllib.request

import pymongo
  21. # url_seed = "https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&pageSize=20&sortType=rank&page="
  22. url_seed = "https://baidu.com"
  23. url_login = ""
  24. url_cache = set()
  25. headers = {
  26. 'User-Agent': "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36",
  27. 'Cookie': '_ga=GA1.2.543338178.1565470742;_gid=GA1.2.1886010917.1565470742;gr_session_id_89669d96c88aefbc=79649999-e8c0-470a-9677-82496ff889a4;gr_session_id_89669d96c88aefbc_79649999-e8c0-470a-9677-82496ff889a4=true;gr_user_id=5ead73a1-13db-4b49-85bd-ff1ee44188bd;Hm_lpvt_93bbd335a208870aa1f296bcd6842e5e=1565478332;Hm_lvt_93bbd335a208870aa1f296bcd6842e5e=1565471101,1565471795,1565474508,1565478332;ab={};MEIQIA_TRACK_ID=1PFYEEcl0GseQQFT5TpFEZroHGg;QINGCLOUDELB=7c5122b6c6517c59163563fe189d391bab7e48fb3972913efd95d72fe838c4fb|XU9Nv|XU9Nv;',
  28. 'Host': 'juejin.im',
  29. 'Referer': 'https://juejin.im/pins/topic/5abcaa67092dcb4620ca335c',
  30. 'Sec-Fetch-Mode': 'cors',
  31. }
  32. client = pymongo.MongoClient("mongodb://admin:password@localhost:27017/")
  33. db_juejin = client.juejin_date
  34. def crawl():
  35. for i in range(0, 1):
  36. getUser(i)
  37. def getUser(page):
  38. url_page = url_seed+str(page)
  39. data = {
  40. "uid": "",
  41. "device_id": "",
  42. "token": "",
  43. "src": "web",
  44. "topicId": "5abcaa67092dcb4620ca335c",
  45. "page": "0",
  46. "pageSize": "20",
  47. "sortType": "rank",
  48. }
  49. try:
  50. req = urllib.request.Request(method="get",
  51. url=url_page, data=urllib.parse.urlencode(data).encode(encoding='UTF8'), headers=headers)
  52. except Exception as err:
  53. print(err)
  54. try:
  55. with urllib.request.urlopen(req) as res:
  56. print(url_page)
  57. print(res)
  58. except Exception as e:
  59. print(e)
  60. # print(res.read().decode('utf-8'))
  61. # for i in res["d"]["list"].length:
  62. # saveUser(res[i])
  63. def saveUser(jsonUser):
  64. '''
  65. 保存到mongodb中
  66. '''
  67. student1 = {
  68. 'id': '20170101',
  69. 'name': 'Jordan',
  70. 'age': 20,
  71. 'gender': 'male'
  72. }
  73. # result = db_juejin.students.insert(student1)
  74. res1 = db_juejin.students.insert_one(student1)
  75. print(res1.inserted_id)
  76. if __name__ == "__main__":
  77. start_time = time.time()
  78. crawl()
  79. print("last time: {} s".format(time.time() - start_time))