liuyuqi-dellpc 4 years ago
parent
commit
d2a4f96d39

+ 1 - 0
conf/config.yaml

@@ -0,0 +1 @@
+url_date_seed: https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&page=3&pageSize=20&sortType=rank

+ 48 - 22
date/get_user.py

@@ -17,56 +17,82 @@ https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&t
 # import os,sys,re
 # import requests
 
+import pymongo
 import time
 import threading
 import urllib.request
 
-url_seed = ""
+# url_seed = "https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&pageSize=20&sortType=rank&page="
+url_seed = "https://baidu.com"
+
 url_login = ""
 url_cache = set()
 
 headers = {
     'User-Agent': "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36",
-    'Cookie': 'did=web_34abffaccc51410a45a2f09bee712ec6; didv=2; Hm_lvt_86a27b7db2c5c0ae37fee4a8a35033ee=1549878747,1549878930,1549878956; Hm_lpvt_86a27b7db2c5c0ae37fee4a8a35033ee=1549879170',
-    'Host': 'id.kuaishou.com',
-    'Referer': 'https://www.kuaishou.com/account/login/?redirectURL=https%3A%2F%2Fverify.kuaishou.com%2F%23%2Fverify%2Fpersonal',
-    'Upgrade-Insecure-Requests': '1',
+    'Cookie': '_ga=GA1.2.543338178.1565470742;_gid=GA1.2.1886010917.1565470742;gr_session_id_89669d96c88aefbc=79649999-e8c0-470a-9677-82496ff889a4;gr_session_id_89669d96c88aefbc_79649999-e8c0-470a-9677-82496ff889a4=true;gr_user_id=5ead73a1-13db-4b49-85bd-ff1ee44188bd;Hm_lpvt_93bbd335a208870aa1f296bcd6842e5e=1565478332;Hm_lvt_93bbd335a208870aa1f296bcd6842e5e=1565471101,1565471795,1565474508,1565478332;ab={};MEIQIA_TRACK_ID=1PFYEEcl0GseQQFT5TpFEZroHGg;QINGCLOUDELB=7c5122b6c6517c59163563fe189d391bab7e48fb3972913efd95d72fe838c4fb|XU9Nv|XU9Nv;',
+    'Host': 'juejin.im',
+    'Referer': 'https://juejin.im/pins/topic/5abcaa67092dcb4620ca335c',
+    'Sec-Fetch-Mode': 'cors',
 }
 
 
+client = pymongo.MongoClient("mongodb://admin:password@localhost:27017/")
+db_juejin = client.juejin_date
+
+
 def crawl():
-    for i in range(1, 24):
-        print(i)
+    for i in range(0, 1):
+        getUser(i)
 
 
-def getUser():
+def getUser(page):
+    url_page = url_seed+str(page)
     data = {
-        "": "",
-            "": "",
-            "": "",
-            "": "",
-            "": "",
-            "": "",
-            "": "",
+        "uid": "",
+        "device_id": "",
+        "token": "",
+        "src": "web",
+        "topicId": "5abcaa67092dcb4620ca335c",
+        "page": "0",
+        "pageSize": "20",
+        "sortType": "rank",
     }
     try:
-        req = urllib.request.Request(
-            url=url_seed, data=urllib.parse.urlencode(data).encode(encoding='UTF8'), headers=headers)
-        with urllib.request.urlopen(req) as res:
-            print(res.read().decode('utf-8'))
-            for i in res["d"]["list"].length:                
-                saveUser(res[i])
+        req = urllib.request.Request(method="get",
+            url=url_page, data=urllib.parse.urlencode(data).encode(encoding='UTF8'), headers=headers)
     except Exception as err:
         print(err)
+    try:
+        with urllib.request.urlopen(req) as res:
+            print(url_page)
+            print(res)
+    except Exception as e:
+        print(e)
+        # print(res.read().decode('utf-8'))
+        # for i in res["d"]["list"].length:
+        #     saveUser(res[i])
 
 
 def saveUser(jsonUser):
     '''
     保存到mongodb中
     '''
-    pass
+    student1 = {
+        'id': '20170101',
+        'name': 'Jordan',
+        'age': 20,
+        'gender': 'male'
+    }
+    # result = db_juejin.students.insert(student1)
+    res1 = db_juejin.students.insert_one(student1)
+    print(res1.inserted_id)
+
 
 if __name__ == "__main__":
     start_time = time.time()
     crawl()
     print("last time: {} s".format(time.time() - start_time))
+
+
+    

+ 2 - 0
date/user_plot.py

@@ -10,3 +10,5 @@
 '''
 import matlib as plt
 
+if __name__ == "__main__":
+    pass

+ 2 - 0
events/get_events.py

@@ -8,3 +8,5 @@
 @License :   (C)Copyright 2019
 @Desc    :   获取所有events,不过没多大意义。
 '''
+if __name__ == "__main__":
+    pass

+ 2 - 0
main.py

@@ -8,3 +8,5 @@
 @License :   (C)Copyright 2019
 @Desc    :   掘金爬虫
 '''
+if __name__ == "__main__":
+    pass

+ 9 - 6
test/init_conf.py

@@ -13,7 +13,7 @@ import os
 
 
 # current_path = os.path.dirname(os.path.abspath(__file__))
-config_path = os.path.join('..', "conf", 'config.yaml')
+config_path = os.path.join( "conf", 'config.yaml')
 
 
 class YamlConf:
@@ -54,11 +54,14 @@ class YamlConf:
             result = YamlConf.load()[key]
             return result
         except Exception as e:
-            return default_val
+            print(e)
 
 
 if __name__ == '__main__':
-    config = {"user": "小舟", "pass": "123", "address": ["shanghai", "beijing"]}
-    print(YamlConf.get("addresss", "default_val"))
-    config2 = {"sex": "girl", "user": "xiaobai"}
-    YamlConf.set(config2)
+    # config = {"user": "小舟", "pass": "123", "address": ["shanghai", "beijing"]}
+    # print(YamlConf.get("addresss", "default_val"))
+    config2 = {"url_date_seed": "https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&page=3&pageSize=20&sortType=rank"}
+    YamlConf.set(config2)
+
+
+    

+ 66 - 0
test/string_join.py

@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 09:47:45
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   字符串拼接
+'''
+
+url1 = "https://juejin.im/pins/topic/5abcaa67092dcb4620ca335c?page="
+tmp = 1
+
+
+def t1():
+    '''
+    join 字符串链接。。。垃圾!!
+    '''
+    print(url1.join(str(tmp)))
+    print("".join([url1, str(tmp)]))
+    print("ni hao ya".join("wo bu hao"))
+
+    try:
+        print("".join([url1, tmp]))
+    except Exception as e:
+        print(e)
+
+t1()
+
+
+def t2():
+    '''
+    , 逗号字符串链接,垃圾!!
+    '''
+    a1 = url1, tmp
+    print(a1)
+
+
+t2()
+
+
+def t3():
+    '''
+    +,加号字符串链接,垃圾!!
+    '''
+    try:
+        a2 = url1+tmp
+        print(a2)
+    except Exception as e:
+        print(e)
+    print(url1+str(tmp))
+
+
+t3()
+
+
+def t4():
+    '''
+    format 格式化字符串链接,垃圾!!
+    '''
+    res = '{0}{1}'.format(url1, tmp)
+    print(res)
+
+
+t4()

+ 28 - 0
test/test_db.py

@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 08:52:06
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   mongo操作
+'''
+import pymongo
+
+client = pymongo.MongoClient("mongodb://admin:password@localhost:27017/")
+db_juejin = client.juejin_date
+student1 = {
+    'id': '20170101',
+    'name': 'Jordan',
+    'age': 20,
+    'gender': 'male'
+}
+result = db_juejin.students.insert(student1)
+res1=db_juejin.students.insert_one(student1)
+res2=db_juejin.students.find({})
+
+print(res1.inserted_id)
+
+
+

+ 14 - 0
test/test_urllib copy.py

@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 10:27:41
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   垃圾的 urllib.request
+
+404 页面不存在,就错误!
+'''
+import urllib3
+

+ 25 - 0
test/test_urllib.py

@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 10:27:41
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   垃圾的 urllib.request
+
+404 页面不存在,就错误!
+'''
+
+import urllib.request
+
+url_page = "http://blog.yoqi.me"
+data = {}
+
+try:
+    req = urllib.request.Request(
+        url=url_page, data=urllib.parse.urlencode(data).encode(encoding='UTF8'))
+    with urllib.request.urlopen(req) as res:
+        print("http res:", res)
+except Exception as err:
+    print("error:", err)

+ 2 - 1
user/get_user.py

@@ -11,6 +11,7 @@
 '''
 
 
-
+if __name__ == "__main__":
+    pass
 
 

+ 2 - 1
utils/send_msg.py

@@ -10,5 +10,6 @@
 '''
 import pandas as pd
 
-
+if __name__ == "__main__":
+    pass
 

+ 2 - 0
video/download_video.py

@@ -8,3 +8,5 @@
 @License :   (C)Copyright 2019
 @Desc    :   掘金视频下载
 '''
+if __name__ == "__main__":
+    pass