liuyuqi-dellpc 1 year ago
commit
e79ece6a80

+ 2 - 0
.gitignore

@@ -0,0 +1,2 @@
+/build
+*.pyc

+ 0 - 0
bin/crawl_baidu


+ 12 - 0
conf/config.json

@@ -0,0 +1,12 @@
+{
+    "cookie": "BIDUPSID=94CF38B6E2B133BB95CFD7692279A2E1; PSTM=1656423206; BA_HECTOR=0l8l8g252kag0kal201hbm0p715; ZFY=T8w70A:ACckELtYIsYWBHLbsz7sugWUFeW3MhaAA1ojU:C; BDUSS=JxRGVZY1EyY2cxSHZoN1VES2tJRGszWHVTcXpLRWdqWUFITlB4TXdLRlNrT0ppRVFBQUFBJCQAAAAAAAAAAAEAAADrBCoHwffLrrfJt8m3ybfJAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFIDu2JSA7ticG; H_PS_PSSID=36555_36461_36501_36455_36414_36690_36167_36695_36697_36622_26350_36467_22159; BAIDUID=94CF38B6E2B133BB56AD32A9AF8AAAB5:SL=0:NR=10:FG=1; BAIDUID_BFESS=94CF38B6E2B133BB56AD32A9AF8AAAB5:SL=0:NR=10:FG=1; bdindexid=a17n716aovv6sp31nffonloj57; RT=\"z=1&dm=baidu.com&si=lw6v60mezv8&ss=l4y7n19q&sl=b&tt=a43&bcn=https://fclog.baidu.com/log/weirwood?type=perf\"; BDUSS_BFESS=JxRGVZY1EyY2cxSHZoN1VES2tJRGszWHVTcXpLRWdqWUFITlB4TXdLRlNrT0ppRVFBQUFBJCQAAAAAAAAAAAEAAADrBCoHwffLrrfJt8m3ybfJAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFIDu2JSA7ticG; ab_sr=1.0.1_NWY1MTE0ZTBiNWU4ZGFlNTQwMWU2NWEzYTRhYTEzZGMyMjI0NWZiY2EwNTUzZjg2NjMzM2I5Y2FjYzYxOTQ0ZDM5YTY1MzExMzk0NmE0ODRmNjM2NzdjNDJjZGYyYmUyMmUzZjRlYTcwMDkyYTVmNmNiMTlkNzhlNmRlOGI3N2RiZWQwOTVkNTQ1OTNkNThkNzQ2NjU3ZmY0MTQ1ZmFmNA==",
+    "CliperText": "1656399607736_1656431430319_4AB9QFVS9yPO/04xkfvTZwBTR5dvR0vc288+m8IVjlgN8bMIpU6QuC2bRux22CB2Nbng1A6MN7m8IX2FO6oF15RHawNhPCcwq4Mw3QS9dBt7F9xmt4Dgk58vhHOe5x0mM54r6A4ynFeQusA66u5tdjDf15Di88ToI9hMKhcoVN+nItCTH7oEvHXKzf2R8DIwu0QcCyWgcQl4cdjq/Dg6doApJ0uejKu+ptYHJSEBLtEfQm98gTv/fOcvYcfJKYDmKQFDg3Eso9n6By/2bWBSkZIm+1gOW+80Cy448DaLJPvxso9NGiqElawftwr2kdWrqNRGDepgI5gKFpTea6SJ6Jofg/R/lWGUR76IXhlnitvurTY55t3Sli64QRxux7SUL1ghVwQln9SjlD1o4TuNskX7D6fW5d7lqXluc+KtxpeSwQtnQvgYE0cb9smSubzF7jPvkAyY3qFOKDW2eTzU6Q==",
+    "words": [
+        [
+            {
+                "name": "气候变化",
+                "wordType": 1
+            }
+        ]
+    ]
+}

+ 5 - 0
crawl_baidu/__init__.py

@@ -0,0 +1,5 @@
+from crawl_baidu.crawl_baidu import CrawlBaidu
+
+def main():
+    '''Build a ``CrawlBaidu`` instance and run a single index download with its default date range.'''
+    CrawlBaidu().get_index_data()

+ 3 - 0
crawl_baidu/api.py

@@ -0,0 +1,3 @@
+
+# Root of the Baidu Index site; the endpoint template below is built on it.
+_host = "https://index.baidu.com"
+# Search endpoint template. The three ``{}`` slots are positional and map to
+# the ``word``, ``startDate`` and ``endDate`` query parameters.
+# NOTE(review): ``area=0`` appears twice in the query string; kept as-is.
+search = "".join((
+    _host,
+    "/api/SearchApi/index",
+    "?area=0&word={}&area=0&startDate={}&endDate={}",
+))

+ 93 - 0
crawl_baidu/crawl_baidu.py

@@ -0,0 +1,93 @@
+
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/06/28 23:15:05
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   Baidu Index crawler: downloads and decodes index data for the configured keywords.
+'''
+
+from http import cookies
+import requests
+import sys,os,json
+from crawl_baidu.lib.json_conf import JsonConf
+import time
+# Request headers passed explicitly to every HTTP call made in this module.
+headers = {
+    # Content negotiation
+    "Accept": "application/json, text/plain, */*",
+    "Accept-Encoding": "gzip, deflate",
+    "Accept-Language": "zh-CN,zh;q=0.9",
+    # Caching is disabled on both the HTTP/1.1 and the legacy header
+    "Cache-Control": "no-cache",
+    "DNT": "1",
+    "Host": "index.baidu.com",
+    "Pragma": "no-cache",
+    "Proxy-Connection": "keep-alive",
+    # Browser-like identification
+    "Referer": "https://index.baidu.com/v2/main/index.html",
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
+    "X-Requested-With": "XMLHttpRequest",
+}
+
+class CrawlBaidu():
+    '''Download and decode Baidu Index data for the configured keywords.
+
+    The cookie, the ``Cipher-Text`` header value and the keyword list are read
+    from the configuration returned by ``JsonConf.load()``.
+    '''
+
+    # Seconds to wait for a single HTTP response before requests gives up.
+    REQUEST_TIMEOUT = 30
+    # Upper bound on key requests and the pause between them, so that an
+    # empty answer can no longer spin forever against the server.
+    PTBK_MAX_ATTEMPTS = 5
+    PTBK_RETRY_DELAY = 1.0
+
+    def __init__(self):
+        '''Load the configuration and prepare an HTTP session carrying the credentials.'''
+        self.sess = requests.Session()
+        self.jsonConf = JsonConf()
+        self.conf = self.jsonConf.load()
+        self.words = self.conf.get('words')
+        # The raw cookie string is sent as a plain header. The config key is
+        # spelled 'CliperText' in the configuration file, so it is read under
+        # that name.
+        self.sess.headers.update({
+            "Cipher-Text": self.conf.get('CliperText'),
+            "Cookie": self.conf.get('cookie'),
+        })
+
+    @staticmethod
+    def decrypt(t, e):
+        '''Decode ``e`` with the substitution key ``t``.
+
+        The first half of ``t`` lists the cipher characters; the second half
+        lists, position by position, the characters they stand for.
+
+        Args:
+            t: Key string.
+            e: Encoded text.
+
+        Returns:
+            The decoded string (empty when ``e`` is empty).
+
+        Raises:
+            ValueError: If ``e`` contains a character the key does not map.
+        '''
+        half = len(t) // 2
+        mapping = dict(zip(t[:half], t[half:]))
+        try:
+            return ''.join(mapping[ch] for ch in e)
+        except KeyError as err:
+            raise ValueError(
+                'character {!r} is not covered by the key'.format(err.args[0])
+            ) from err
+
+    def get_ptbk(self, uniqid):
+        '''Request the decryption key that belongs to ``uniqid``.
+
+        Returns:
+            The ``data`` field of the JSON response (may be ``None`` or empty).
+
+        Exits the process with status 1 when the HTTP status is not 200.
+        '''
+        url = 'http://index.baidu.com/Interface/ptbk?uniqid={}'
+        resp = self.sess.get(url.format(uniqid), headers=headers,
+                             timeout=self.REQUEST_TIMEOUT)
+        if resp.status_code != 200:
+            print('获取uniqid失败')
+            sys.exit(1)
+        return resp.json().get('data')
+
+    def get_index_data(self, start='2011-01-03', end='2022-08-05'):
+        '''Fetch the index series for the configured keywords and store it.
+
+        The decoded, comma-split values are printed and written as a JSON list
+        to ``data/res.txt``.
+
+        Args:
+            start: Value sent as the ``startDate`` query parameter.
+            end: Value sent as the ``endDate`` query parameter.
+
+        Exits the process with status 1 when a request fails, when the
+        response carries no usable data, or when no key could be obtained.
+        '''
+        # json.dumps yields the same text the old str()/replace() trick did
+        # for plain keyword lists, but stays valid JSON for apostrophes,
+        # booleans and None.
+        keyword = json.dumps(self.words, ensure_ascii=False)
+        url = f'http://index.baidu.com/api/SearchApi/index?area=0&word={keyword}&area=0&startDate={start}&endDate={end}'
+        resp = self.sess.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
+        if resp.status_code != 200:
+            print('获取指数失败')
+            sys.exit(1)
+
+        data = resp.json().get('data')
+        # A 200 response can still come without a usable payload; fail the
+        # same way as a bad status instead of raising AttributeError below.
+        if not isinstance(data, dict) or not data.get('userIndexes'):
+            print('获取指数失败')
+            sys.exit(1)
+        user_indexes = data['userIndexes'][0]
+        uniqid = data.get('uniqid')
+
+        ptbk = None
+        for attempt in range(self.PTBK_MAX_ATTEMPTS):
+            if attempt:
+                time.sleep(self.PTBK_RETRY_DELAY)
+            ptbk = self.get_ptbk(uniqid)
+            if ptbk:
+                break
+        if not ptbk:
+            print('获取ptbk失败')
+            sys.exit(1)
+
+        all_data = user_indexes.get('all').get('data')
+        result = CrawlBaidu.decrypt(ptbk, all_data).split(',')
+
+        print(result)
+
+        os.makedirs("data", exist_ok=True)
+        with open("data/res.txt", "w", encoding="utf-8") as file:
+            file.write(json.dumps(result))

+ 0 - 0
crawl_baidu/lib/__init__.py


+ 66 - 0
crawl_baidu/lib/json_conf.py

@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/24 15:07:14
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   json util
+'''
+import os
+import json
+
+# Location of the JSON configuration file. The path is relative, so it is
+# resolved against the current working directory of the process.
+config_path = "conf/config.json"
+
+
+class JsonConf:
+    '''Static helpers for reading and writing the JSON configuration file.'''
+
+    # Marks "argument not supplied" in ``get`` so both of its call forms can
+    # share one definition (two ``get`` methods would shadow each other).
+    _MISSING = object()
+
+    @staticmethod
+    def save(data):
+        '''Write ``data`` to the configuration file as indented JSON.
+
+        The parent directory is created when it does not exist yet.
+        '''
+        parent = os.path.dirname(config_path)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+        with open(config_path, 'w', encoding="utf-8") as json_file:
+            json_file.write(json.dumps(data, indent=4, ensure_ascii=False))
+
+    @staticmethod
+    def load():
+        '''Return the parsed configuration.
+
+        Returns:
+            The decoded JSON value, or an empty dict when the file is missing
+            or contains only whitespace.
+
+        Raises:
+            json.JSONDecodeError: If the file holds malformed JSON.
+        '''
+        if not os.path.exists(config_path):
+            return {}
+        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM, so no
+        # second attempt with another codec is needed.
+        with open(config_path, encoding="utf-8-sig") as json_file:
+            text = json_file.read()
+        if not text.strip():
+            return {}
+        return json.loads(text)
+
+    @staticmethod
+    def set(data_dict):
+        '''Merge ``data_dict`` into the stored configuration, save it and print the result.'''
+        json_obj = JsonConf.load()
+        json_obj.update(data_dict)
+        JsonConf.save(json_obj)
+        print(json.dumps(json_obj, indent=4))
+
+    @staticmethod
+    def get(jsonData, key=_MISSING, default_val=""):
+        '''Look up a value, falling back to ``default_val``.
+
+        Two call forms are supported:
+
+        * ``get(jsonData, key[, default_val])`` reads ``jsonData[key]``.
+        * ``get(key)`` / ``get(key, default_val=...)`` reads ``key`` from the
+          configuration file.
+
+        Returns:
+            The value found, or ``default_val`` when the lookup fails. In the
+            one-argument form a configuration file that cannot be read or
+            parsed is reported with ``print`` and also yields ``default_val``.
+        '''
+        if key is JsonConf._MISSING:
+            # One-argument form: the first positional argument is the key.
+            key = jsonData
+            try:
+                jsonData = JsonConf.load()
+            except (OSError, ValueError) as e:
+                print(e)
+                return default_val
+        try:
+            return jsonData[key]
+        except (LookupError, TypeError):
+            return default_val

+ 50 - 0
crawl_baidu/lib/yml_conf.py

@@ -0,0 +1,50 @@
+'''
+Created on 2019年4月30日
+
+yaml一般用来写配置文件。
+@author: liuyuqi
+'''
+
+import yaml
+import os
+
+# Location of the YAML configuration file. The path is relative, so it is
+# resolved against the current working directory of the process.
+config_path ="conf/config.yml"
+
+class YamlConf:
+    '''Static helpers for reading and writing the YAML configuration file.'''
+
+    @staticmethod
+    def save(data):
+        '''Dump ``data`` to the configuration file.
+
+        Best effort: any failure is printed and otherwise ignored.
+        '''
+        try:
+            # The context manager closes the handle even when dumping fails;
+            # the encoding matches the one ``load`` reads with.
+            with open(config_path, "w", encoding="utf-8") as yaml_file:
+                yaml.dump(data, yaml_file)
+        except Exception as e:
+            print(e)
+
+    @staticmethod
+    def load():
+        '''Return the parsed configuration.
+
+        Returns:
+            The loaded document, or an empty dict when the document is empty
+            or when reading/parsing fails (the error is printed).
+        '''
+        config = {}
+        try:
+            with open(config_path, "r", encoding="utf-8") as yaml_file:
+                config = yaml.safe_load(yaml_file)
+            if config is None:
+                config = {}
+        except Exception as e:
+            print(e)
+        return config
+
+    @staticmethod
+    def set(data_dict):
+        '''Merge ``data_dict`` into the stored configuration and save it.'''
+        config = YamlConf.load()
+        config.update(data_dict)
+        YamlConf.save(config)
+
+    @staticmethod
+    def get(key, default_val=""):
+        '''Return the configured value for ``key``, or ``default_val`` when the lookup fails.'''
+        try:
+            return YamlConf.load()[key]
+        except (LookupError, TypeError):
+            return default_val

File diff suppressed because it is too large
+ 0 - 0
data/res.txt


+ 44 - 0
demo.spec

@@ -0,0 +1,44 @@
+# -*- mode: python ; coding: utf-8 -*-
+
+
+# Forwarded as ``cipher`` to both Analysis and PYZ below.
+# NOTE(review): presumably None means no bytecode encryption — confirm
+# against the PyInstaller documentation.
+block_cipher = None
+
+
+# NOTE(review): Analysis, PYZ and EXE are not imported in this file;
+# presumably PyInstaller supplies them when it executes the spec — confirm.
+# The only script listed is demo.py; every other option is empty or False.
+a = Analysis(
+    ['demo.py'],
+    pathex=[],
+    binaries=[],
+    datas=[],
+    hiddenimports=[],
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    win_no_prefer_redirects=False,
+    win_private_assemblies=False,
+    cipher=block_cipher,
+    noarchive=False,
+)
+# Built from the ``pure`` and ``zipped_data`` results of the analysis above.
+pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
+
+# Scripts, binaries, zipfiles and datas from the analysis are all handed to
+# EXE directly. NOTE(review): presumably this yields a single-file build — confirm.
+exe = EXE(
+    pyz,
+    a.scripts,
+    a.binaries,
+    a.zipfiles,
+    a.datas,
+    [],
+    name='demo',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    runtime_tmpdir=None,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)

+ 12 - 0
dist/conf/config.json

@@ -0,0 +1,12 @@
+{
+    "cookie": "BIDUPSID=94CF38B6E2B133BB95CFD7692279A2E1; PSTM=1656423206; BA_HECTOR=0l8l8g252kag0kal201hbm0p715; ZFY=T8w70A:ACckELtYIsYWBHLbsz7sugWUFeW3MhaAA1ojU:C; BDUSS=JxRGVZY1EyY2cxSHZoN1VES2tJRGszWHVTcXpLRWdqWUFITlB4TXdLRlNrT0ppRVFBQUFBJCQAAAAAAAAAAAEAAADrBCoHwffLrrfJt8m3ybfJAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFIDu2JSA7ticG; H_PS_PSSID=36555_36461_36501_36455_36414_36690_36167_36695_36697_36622_26350_36467_22159; BAIDUID=94CF38B6E2B133BB56AD32A9AF8AAAB5:SL=0:NR=10:FG=1; BAIDUID_BFESS=94CF38B6E2B133BB56AD32A9AF8AAAB5:SL=0:NR=10:FG=1; bdindexid=a17n716aovv6sp31nffonloj57; RT=\"z=1&dm=baidu.com&si=lw6v60mezv8&ss=l4y7n19q&sl=b&tt=a43&bcn=https://fclog.baidu.com/log/weirwood?type=perf\"; BDUSS_BFESS=JxRGVZY1EyY2cxSHZoN1VES2tJRGszWHVTcXpLRWdqWUFITlB4TXdLRlNrT0ppRVFBQUFBJCQAAAAAAAAAAAEAAADrBCoHwffLrrfJt8m3ybfJAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFIDu2JSA7ticG; ab_sr=1.0.1_NWY1MTE0ZTBiNWU4ZGFlNTQwMWU2NWEzYTRhYTEzZGMyMjI0NWZiY2EwNTUzZjg2NjMzM2I5Y2FjYzYxOTQ0ZDM5YTY1MzExMzk0NmE0ODRmNjM2NzdjNDJjZGYyYmUyMmUzZjRlYTcwMDkyYTVmNmNiMTlkNzhlNmRlOGI3N2RiZWQwOTVkNTQ1OTNkNThkNzQ2NjU3ZmY0MTQ1ZmFmNA==",
+    "CliperText": "1656399607736_1656431430319_4AB9QFVS9yPO/04xkfvTZwBTR5dvR0vc288+m8IVjlgN8bMIpU6QuC2bRux22CB2Nbng1A6MN7m8IX2FO6oF15RHawNhPCcwq4Mw3QS9dBt7F9xmt4Dgk58vhHOe5x0mM54r6A4ynFeQusA66u5tdjDf15Di88ToI9hMKhcoVN+nItCTH7oEvHXKzf2R8DIwu0QcCyWgcQl4cdjq/Dg6doApJ0uejKu+ptYHJSEBLtEfQm98gTv/fOcvYcfJKYDmKQFDg3Eso9n6By/2bWBSkZIm+1gOW+80Cy448DaLJPvxso9NGiqElawftwr2kdWrqNRGDepgI5gKFpTea6SJ6Jofg/R/lWGUR76IXhlnitvurTY55t3Sli64QRxux7SUL1ghVwQln9SjlD1o4TuNskX7D6fW5d7lqXluc+KtxpeSwQtnQvgYE0cb9smSubzF7jPvkAyY3qFOKDW2eTzU6Q==",
+    "words": [
+        [
+            {
+                "name": "气候变化",
+                "wordType": 1
+            }
+        ]
+    ]
+}

File diff suppressed because it is too large
+ 0 - 0
dist/data/res.txt


+ 12 - 0
main.py

@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/06/28 23:14:30
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   main
+'''
+from crawl_baidu import main
+
+# Start the crawler only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

+ 44 - 0
main.spec

@@ -0,0 +1,44 @@
+# -*- mode: python ; coding: utf-8 -*-
+
+
+# Forwarded as ``cipher`` to both Analysis and PYZ below.
+# NOTE(review): presumably None means no bytecode encryption — confirm
+# against the PyInstaller documentation.
+block_cipher = None
+
+
+# NOTE(review): Analysis, PYZ and EXE are not imported in this file;
+# presumably PyInstaller supplies them when it executes the spec — confirm.
+# The only script listed is main.py; every other option is empty or False.
+a = Analysis(
+    ['main.py'],
+    pathex=[],
+    binaries=[],
+    datas=[],
+    hiddenimports=[],
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    win_no_prefer_redirects=False,
+    win_private_assemblies=False,
+    cipher=block_cipher,
+    noarchive=False,
+)
+# Built from the ``pure`` and ``zipped_data`` results of the analysis above.
+pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
+
+# Scripts, binaries, zipfiles and datas from the analysis are all handed to
+# EXE directly. NOTE(review): presumably this yields a single-file build — confirm.
+exe = EXE(
+    pyz,
+    a.scripts,
+    a.binaries,
+    a.zipfiles,
+    a.datas,
+    [],
+    name='main',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    runtime_tmpdir=None,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)

+ 1 - 0
requirements.txt

@@ -0,0 +1 @@
+requests

Some files were not shown because too many files changed in this diff