#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author  :   liuyuqi
@Contact :   liuyuqi.gov@msn.cn
@Time    :   2019/08/11 06:41:06
@Version :   1.0
@License :   (C)Copyright 2019
@Desc    :   Fetch the data of every user who posted in the matchmaking topic.

Endpoint:
https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&page=3&pageSize=20&sortType=rank

447 entries in total, 20 per page, 23 pages altogether.
'''

import threading
import time
import urllib.parse
import urllib.request

import pymongo

# Real endpoint (currently disabled in favour of the placeholder below):
# url_seed = "https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&pageSize=20&sortType=rank&page="
url_seed = "https://baidu.com"
url_login = ""
url_cache = set()

# NOTE(review): 'Host' is pinned to juejin.im, so urllib will send that value
# even when url_seed points at a different host -- confirm this is intended.
# NOTE(review): the Cookie below is a captured browser session; it probably
# should not live in source control.
headers = {
    'User-Agent': "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36",
    'Cookie': '_ga=GA1.2.543338178.1565470742;_gid=GA1.2.1886010917.1565470742;gr_session_id_89669d96c88aefbc=79649999-e8c0-470a-9677-82496ff889a4;gr_session_id_89669d96c88aefbc_79649999-e8c0-470a-9677-82496ff889a4=true;gr_user_id=5ead73a1-13db-4b49-85bd-ff1ee44188bd;Hm_lpvt_93bbd335a208870aa1f296bcd6842e5e=1565478332;Hm_lvt_93bbd335a208870aa1f296bcd6842e5e=1565471101,1565471795,1565474508,1565478332;ab={};MEIQIA_TRACK_ID=1PFYEEcl0GseQQFT5TpFEZroHGg;QINGCLOUDELB=7c5122b6c6517c59163563fe189d391bab7e48fb3972913efd95d72fe838c4fb|XU9Nv|XU9Nv;',
    'Host': 'juejin.im',
    'Referer': 'https://juejin.im/pins/topic/5abcaa67092dcb4620ca335c',
    'Sec-Fetch-Mode': 'cors',
}

# Seconds to wait for the server before giving up on a page.
REQUEST_TIMEOUT = 10

# NOTE(review): credentials are hard-coded in the connection URI; consider
# reading them from the environment instead.
client = pymongo.MongoClient("mongodb://admin:password@localhost:27017/")
db_juejin = client.juejin_date


def crawl(pages=1):
    '''
    Fetch the first ``pages`` result pages, one after another.

    :param pages: number of pages to request, starting at page 0.
                  Defaults to 1, which requests page 0 only.
    '''
    for page in range(pages):
        getUser(page)


def _build_page_url(page):
    '''
    Return ``url_seed`` with the topic query parameters for ``page`` merged in.

    Parameters already present in ``url_seed`` are kept unless they are
    overridden here, so both a bare host and a seed that already carries a
    query string (including a trailing empty ``page=``) produce a valid URL.
    '''
    parts = urllib.parse.urlsplit(url_seed)
    # keep_blank_values preserves the empty uid/device_id/token parameters.
    params = dict(urllib.parse.parse_qsl(parts.query, keep_blank_values=True))
    params.update({
        "uid": "",
        "device_id": "",
        "token": "",
        "src": "web",
        "topicId": "5abcaa67092dcb4620ca335c",
        "pageSize": "20",
        "sortType": "rank",
    })
    params["page"] = str(page)
    query = urllib.parse.urlencode(params)
    return urllib.parse.urlunsplit(parts._replace(query=query))


def getUser(page):
    '''
    Request one page of the topic listing and print the URL and response.

    Any failure while building or sending the request is printed rather than
    raised, so one bad page does not abort the crawl.

    :param page: page number to request.
    '''
    url_page = _build_page_url(page)
    try:
        # The parameters travel in the query string: this is a GET, so no
        # request body is sent, and the method token must be upper case.
        req = urllib.request.Request(url=url_page, headers=headers,
                                     method="GET")
        with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as res:
            print(url_page)
            print(res)
    except Exception as err:
        # Best-effort crawler: report the problem and carry on.
        print(err)
    # TODO: decode the JSON body and pass each entry of its user list to
    # saveUser(); the response schema still has to be confirmed.


def saveUser(jsonUser):
    '''
    Save a record to MongoDB and print the id of the inserted document.

    NOTE(review): ``jsonUser`` is not used yet; a fixed sample document is
    inserted into the ``students`` collection instead. Presumably a
    placeholder until response parsing is implemented -- confirm.

    :param jsonUser: user record to store (currently ignored).
    '''
    student1 = {
        'id': '20170101',
        'name': 'Jordan',
        'age': 20,
        'gender': 'male'
    }
    res1 = db_juejin.students.insert_one(student1)
    print(res1.inserted_id)


if __name__ == "__main__":
    start_time = time.time()
    crawl()
    print("last time: {} s".format(time.time() - start_time))