|
@@ -17,56 +17,82 @@ https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&t
|
|
# import os,sys,re
|
|
# import os,sys,re
|
|
# import requests
|
|
# import requests
|
|
|
|
|
|
|
|
+import pymongo
|
|
import time
|
|
import time
|
|
import threading
|
|
import threading
|
|
import urllib.request
|
|
import urllib.request
|
|
|
|
|
|
# Base URL; getUser() appends the page number to it.
# The juejin.im endpoint is kept here for reference while a placeholder
# URL is active.
# url_seed = "https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&pageSize=20&sortType=rank&page="
url_seed = "https://baidu.com"

url_login = ""
url_cache = set()

# HTTP headers attached to every request built by getUser().
headers = {
    'User-Agent': "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36",
    'Cookie': '_ga=GA1.2.543338178.1565470742;_gid=GA1.2.1886010917.1565470742;gr_session_id_89669d96c88aefbc=79649999-e8c0-470a-9677-82496ff889a4;gr_session_id_89669d96c88aefbc_79649999-e8c0-470a-9677-82496ff889a4=true;gr_user_id=5ead73a1-13db-4b49-85bd-ff1ee44188bd;Hm_lpvt_93bbd335a208870aa1f296bcd6842e5e=1565478332;Hm_lvt_93bbd335a208870aa1f296bcd6842e5e=1565471101,1565471795,1565474508,1565478332;ab={};MEIQIA_TRACK_ID=1PFYEEcl0GseQQFT5TpFEZroHGg;QINGCLOUDELB=7c5122b6c6517c59163563fe189d391bab7e48fb3972913efd95d72fe838c4fb|XU9Nv|XU9Nv;',
    'Host': 'juejin.im',
    'Referer': 'https://juejin.im/pins/topic/5abcaa67092dcb4620ca335c',
    'Sec-Fetch-Mode': 'cors',
}
|
|
|
|
|
|
|
|
|
|
|
|
# MongoDB client and the database handle used by saveUser().
# NOTE(review): the credentials are hard-coded in the connection URI;
# consider loading them from configuration before sharing this script.
client = pymongo.MongoClient("mongodb://admin:password@localhost:27017/")
db_juejin = client.juejin_date
|
|
|
|
+
|
|
|
|
+
|
|
def crawl():
    """Call getUser() once for each page number in the crawl range.

    The range is currently ``range(0, 1)``, so only page 0 is requested.
    """
    for page in range(0, 1):
        getUser(page)
|
|
|
|
|
|
|
|
|
|
def getUser(page):
    """Request one page and print the URL and the response object.

    Args:
        page: Page number. It is appended to ``url_seed`` to build the
            request URL and is also sent as the ``page`` form field.

    Returns:
        None. Network, URL and HTTP-protocol errors are printed and
        swallowed so that one failing page does not abort the crawl.
    """
    # Imported locally so the block does not rely on submodules that the
    # file's top-level imports only pull in implicitly.
    import http.client
    import urllib.parse
    import urllib.request

    url_page = url_seed + str(page)
    data = {
        "uid": "",
        "device_id": "",
        "token": "",
        "src": "web",
        "topicId": "5abcaa67092dcb4620ca335c",
        # Follow the requested page instead of always sending "0".
        "page": str(page),
        "pageSize": "20",
        "sortType": "rank",
    }
    body = urllib.parse.urlencode(data).encode(encoding='UTF8')

    # Build and send inside a single try block so a failure while building
    # the request can never leave ``req`` unbound for the send step.
    try:
        # HTTP method tokens are case-sensitive: send "GET", not "get".
        # NOTE(review): the fields are still sent as a request body, as
        # before; they look like query-string parameters - confirm intent.
        req = urllib.request.Request(
            url=url_page, data=body, headers=headers, method="GET")
        with urllib.request.urlopen(req) as res:
            print(url_page)
            print(res)
            # TODO: decode res.read() and hand each entry of the returned
            # list to saveUser().
    except (OSError, ValueError, http.client.HTTPException) as err:
        # OSError covers URLError/HTTPError, timeouts and socket errors;
        # ValueError covers malformed URLs.
        print(err)
|
|
|
|
|
|
|
|
|
|
def saveUser(jsonUser):
    """Save a record to MongoDB.

    NOTE(review): ``jsonUser`` is not used yet. A hard-coded placeholder
    document is inserted into the ``students`` collection instead;
    confirm the intended document and collection before wiring this into
    the crawl.

    Args:
        jsonUser: The user record meant to be stored (currently ignored).
    """
    placeholder = {
        'id': '20170101',
        'name': 'Jordan',
        'age': 20,
        'gender': 'male',
    }
    outcome = db_juejin.students.insert_one(placeholder)
    # Print the generated _id so a successful insert is visible.
    print(outcome.inserted_id)
|
|
|
|
+
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the crawl once and report how many seconds it took.
    start_time = time.time()
    crawl()
    print("last time: {} s".format(time.time() - start_time))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|