liuyuqi-dellpc 5 years ago
commit
9adca349c5
12 changed files with 268 additions and 0 deletions
  1. 1 0
      .gitignore
  2. 50 0
      README.md
  3. 72 0
      date/get_user.py
  4. 8 0
      date/user_plot.R
  5. 12 0
      date/user_plot.py
  6. 10 0
      events/get_events.py
  7. 10 0
      main.py
  8. 1 0
      requirements.txt
  9. 64 0
      test/init_conf.py
  10. 16 0
      user/get_user.py
  11. 14 0
      utils/send_msg.py
  12. 10 0
      video/download_video.py

+ 1 - 0
.gitignore

@@ -0,0 +1 @@
+/.vscode

+ 50 - 0
README.md

@@ -0,0 +1,50 @@
+## crawl-juejin
+
+掘金(juejin.im)爬虫。
+
+- [x] 用户爬虫,分析
+  - [x] 获取所有相亲用户
+  - [x] 作图表分析
+  - [x] 定时更新,自动增加新数据。
+- [ ] 文章爬虫,分析
+  - [ ] 
+  - [ ] 
+  - [ ] 
+- [ ] 开源关注爬虫,分析
+  - [ ] 
+  - [ ] 
+  - [ ] 
+- [ ] 应用安利
+  - [ ] 
+  - [ ] 
+  - [ ] 
+- [ ] New资讯
+  - [ ] 
+  - [ ] 
+  - [ ] 
+- [ ] 付费用户
+  - [ ] 
+  - [ ] 
+  - [ ] 
+
+主动工具
+
+- [ ] 批量发送消息
+  - [ ] 
+  - [ ] 
+  - [ ] 
+- [ ] 批量发送评论
+  - [ ] 
+  - [ ] 
+  - [ ] 
+
+
+## Run
+
+```
+cd /d C:/Users/dell/Desktop/crawl-juejin
+pip install -r requirements.txt
+python main.py
+
+```
+

+ 72 - 0
date/get_user.py

@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 06:41:06
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   获取所有相亲用户数据
+
+https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&page=3&pageSize=20&sortType=rank
+
+总共 447 信息,每页20条,共23页。
+'''
+# import pandas
+# import json
+# import os,sys,re
+# import requests
+
+import time
+import threading
+import urllib.request
+
+# Endpoint URLs are still unset; they must be filled in before any request
+# can succeed (urllib rejects an empty URL).
+url_seed = ""
+url_login = ""
+# Set of URLs; not referenced by any code visible in this file yet.
+url_cache = set()
+
+# Default headers sent with every request.
+# There is deliberately no 'Host' entry: urllib derives Host from the request
+# URL, and the previous hard-coded 'id.kuaishou.com' would have been sent to
+# whatever host url_seed points at.
+# NOTE(review): Cookie and Referer also carry kuaishou.com values although
+# this project targets juejin.im - confirm whether they are needed at all.
+headers = {
+    'User-Agent': "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36",
+    'Cookie': 'did=web_34abffaccc51410a45a2f09bee712ec6; didv=2; Hm_lvt_86a27b7db2c5c0ae37fee4a8a35033ee=1549878747,1549878930,1549878956; Hm_lpvt_86a27b7db2c5c0ae37fee4a8a35033ee=1549879170',
+    'Referer': 'https://www.kuaishou.com/account/login/?redirectURL=https%3A%2F%2Fverify.kuaishou.com%2F%23%2Fverify%2Fpersonal',
+    'Upgrade-Insecure-Requests': '1',
+}
+
+
+def crawl():
+    '''
+    Print the page numbers 1 through 23, one per line.
+
+    No request is issued here; only the numbers are printed.
+    '''
+    page = 1
+    while page < 24:
+        print(page)
+        page += 1
+
+
+def getUser():
+    '''
+    Request ``url_seed`` and hand every entry of the response list to saveUser.
+
+    The response body is printed, decoded as JSON, and each element found
+    under ``["d"]["list"]`` is passed to ``saveUser``.
+    NOTE(review): the ``d``/``list`` layout is taken from the original
+    indexing code - confirm it against a real response.
+
+    Network, decoding and layout errors are printed instead of raised, so a
+    failed request does not abort the caller.
+    '''
+    import json
+    from urllib.parse import urlencode
+
+    # Placeholder form fields. The original literal repeated the empty key
+    # seven times, which Python collapses to this single entry.
+    # NOTE(review): passing ``data`` makes urllib send a POST - confirm that
+    # is what the endpoint expects.
+    data = {"": ""}
+    try:
+        req = urllib.request.Request(
+            url=url_seed,
+            data=urlencode(data).encode(encoding='UTF8'),
+            headers=headers)
+        with urllib.request.urlopen(req) as res:
+            body = res.read().decode('utf-8')
+        print(body)
+        # The HTTP response object is not subscriptable; the JSON payload
+        # has to be decoded before it can be indexed.
+        for user in json.loads(body)["d"]["list"]:
+            saveUser(user)
+    except (OSError, ValueError, KeyError, TypeError) as err:
+        # OSError covers URLError/HTTPError; ValueError covers an invalid URL
+        # and malformed JSON; KeyError/TypeError cover an unexpected layout.
+        print(err)
+
+
+def saveUser(jsonUser):
+    '''
+    Save the user record to MongoDB.
+
+    Not implemented yet: the body is a no-op.
+    '''
+    pass
+
+if __name__ == "__main__":
+    # Time the crawl() call and print the elapsed wall-clock seconds.
+    began = time.time()
+    crawl()
+    elapsed = time.time() - began
+    print("last time:", elapsed, "s")

+ 8 - 0
date/user_plot.R

@@ -0,0 +1,8 @@
+# @Author  :   liuyuqi
+# @Contact :   liuyuqi.gov@msn.cn
+# @Time    :   2019/08/11 07:03:53
+# @Version :   1.0
+# @License :   (C)Copyright 2019 liuyuqi.
+# @Desc    :   用户分析
+###############################################################################
+

+ 12 - 0
date/user_plot.py

@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 07:02:57
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   用户作图
+'''
+# NOTE(review): `matlib` does not look like an installable top-level package;
+# the `plt` alias suggests `matplotlib.pyplot` was intended - confirm and fix.
+import matlib as plt
+

+ 10 - 0
events/get_events.py

@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 07:08:22
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   获取所有events,不过没多大意义。
+'''

+ 10 - 0
main.py

@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 06:38:50
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   掘金爬虫
+'''

+ 1 - 0
requirements.txt

@@ -0,0 +1 @@
+

+ 64 - 0
test/init_conf.py

@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 06:48:32
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   初始化配置文件
+'''
+import yaml
+import os
+
+
+# current_path = os.path.dirname(os.path.abspath(__file__))
+# Location of the YAML config file. The path is relative, so it resolves
+# against the process's current working directory, not this file's directory.
+config_path = os.path.join('..', "conf", 'config.yaml')
+
+
+class YamlConf:
+    '''
+    Static helpers for reading and writing the YAML file at ``config_path``.
+
+    Every call re-reads or re-writes the file; nothing is cached.
+    '''
+    @staticmethod
+    def save(data):
+        '''
+        Serialize ``data`` to the config file, replacing its contents.
+
+        Failures are printed rather than raised.
+        '''
+        try:
+            # Write with the same encoding load() reads with; the context
+            # manager closes the file even if dumping fails.
+            with open(config_path, "w", encoding="utf-8") as stream:
+                yaml.dump(data, stream)
+        except Exception as e:
+            # Best-effort by design: report the problem and carry on.
+            print(e)
+
+    @staticmethod
+    def load():
+        '''
+        Return the parsed config file.
+
+        An empty dict is returned when the file is empty or when reading or
+        parsing fails (the error is printed).
+        '''
+        config = {}
+        try:
+            with open(config_path, "r", encoding="utf-8") as stream:
+                config = yaml.load(stream, Loader=yaml.SafeLoader)
+            if config is None:
+                # An empty YAML document parses to None.
+                config = {}
+        except Exception as e:
+            # Best-effort by design: report the problem and carry on.
+            print(e)
+        return config
+
+    @staticmethod
+    def set(data_dict):
+        '''
+        Merge ``data_dict`` into the stored config and write it back.
+
+        Keys already present in the file are overwritten.
+        '''
+        json_obj = YamlConf.load()
+        json_obj.update(data_dict)
+        YamlConf.save(json_obj)
+
+    @staticmethod
+    def get(key, default_val=""):
+        '''
+        Return the stored value for ``key``, or ``default_val`` if the lookup fails.
+        '''
+        try:
+            return YamlConf.load()[key]
+        except (KeyError, IndexError, TypeError):
+            return default_val
+
+
+if __name__ == '__main__':
+    # Manual smoke test of YamlConf against the file at config_path.
+    # NOTE(review): `config` is never passed to YamlConf, so it has no effect.
+    config = {"user": "小舟", "pass": "123", "address": ["shanghai", "beijing"]}
+    # NOTE(review): "addresss" (three s) is not a key of `config` above -
+    # possibly deliberate to exercise the default value; confirm.
+    print(YamlConf.get("addresss", "default_val"))
+    config2 = {"sex": "girl", "user": "xiaobai"}
+    # Merges config2 into the stored config and rewrites the file.
+    YamlConf.set(config2)

+ 16 - 0
user/get_user.py

@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 07:10:54
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   获取所有用户信息
+姓名/性别/公司/行业/年龄/备注等等
+'''
+
+
+
+
+

+ 14 - 0
utils/send_msg.py

@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 06:52:23
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   批量给指定用户发送消息,发送评论
+'''
+import pandas as pd
+
+
+

+ 10 - 0
video/download_video.py

@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/08/11 07:07:24
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   掘金视频下载
+'''