#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time : 2022/06/28 23:15:05
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc : Crawl keyword search-index data from Baidu Index (index.baidu.com).
'''
import os
import sys
import json
import time

import requests

from crawl_baidu.lib.json_conf import JsonConf

headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'DNT': '1',
    'Host': 'index.baidu.com',
    'Pragma': 'no-cache',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'https://index.baidu.com/v2/main/index.html',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}


class CrawlBaidu():
    def __init__(self):
        self.sess = requests.Session()
        self.jsonConf = JsonConf()
        # Load the cookie, Cipher-Text value and query words from the JSON config.
        self.conf = self.jsonConf.load()
        cookie = self.conf.get('cookie')
        CliperText = self.conf.get('CliperText')
        self.words = self.conf.get('words')
        # self.sess.cookies.update(cookie)
        self.sess.headers.update({
            "Cipher-Text": CliperText,
            "Cookie": cookie
        })
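
    # A minimal sketch of the expected config content. Only the keys read
    # above ('cookie', 'CliperText', 'words') are certain; the value shapes
    # shown here are assumptions:
    #
    #   {
    #       "cookie": "BAIDUID=...; BDUSS=...",
    #       "CliperText": "<Cipher-Text request header copied from the browser>",
    #       "words": [[{"name": "python", "wordType": 1}]]
    #   }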

    @staticmethod
    def decrypt(t, e):
        '''Decode the encrypted index data e using the key t (the "ptbk").

        The first half of t lists the cipher characters and the second half
        the plaintext characters they stand for; build that lookup table and
        translate e through it.
        '''
        n = list(t)
        a = {}
        result = []
        ln = len(n) // 2
        start = n[ln:]
        end = n[:ln]
        for j, k in zip(start, end):
            a.update({k: j})
        for j in e:
            result.append(a.get(j))
        return ''.join(result)
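
    # Worked example with a hypothetical key (not real API output):
    # decrypt("abcd,1234,", "ba,dc") builds the table
    # {'a': '1', 'b': '2', 'c': '3', 'd': '4', ',': ','} and returns "21,43".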

    def get_ptbk(self, uniqid):
        '''Fetch the decryption key ("ptbk") matching a response's uniqid.'''
        url = 'http://index.baidu.com/Interface/ptbk?uniqid={}'
        resp = self.sess.get(url.format(uniqid), headers=headers)
        if resp.status_code != 200:
            print('Failed to fetch ptbk')
            sys.exit(1)
        return resp.json().get('data')

    def get_index_data(self, start='2011-01-03', end='2022-08-05'):
        '''Download the search-index series for the configured words between
        start and end, decode it, and save it to data/res.txt.
        '''
        # The API expects the word list serialized as JSON; the config stores
        # a Python literal, so swap single quotes for double quotes.
        keyword = str(self.words).replace("'", '"')
        url = f'http://index.baidu.com/api/SearchApi/index?word={keyword}&area=0&startDate={start}&endDate={end}'
        resp = self.sess.get(url, headers=headers)
        if resp.status_code != 200:
            print('Failed to fetch index data')
            sys.exit(1)
        content = resp.json()
        data = content.get('data')
        user_indexes = data.get('userIndexes')[0]
        uniqid = data.get('uniqid')
        # Retry until a non-empty key comes back, pausing between attempts
        # so the endpoint is not hammered.
        ptbk = self.get_ptbk(uniqid)
        while ptbk is None or ptbk == '':
            time.sleep(1)
            ptbk = self.get_ptbk(uniqid)
        all_data = user_indexes.get('all').get('data')
        result = CrawlBaidu.decrypt(ptbk, all_data)
        result = result.split(',')
        print(result)
        if not os.path.exists("data"):
            os.mkdir("data")
        with open("data/res.txt", "w") as file:
            file.write(json.dumps(result))
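
# A minimal usage sketch (assumes the config file read by JsonConf already
# holds a valid cookie, Cipher-Text value, and word list):
if __name__ == '__main__':
    crawl_baidu = CrawlBaidu()
    crawl_baidu.get_index_data(start='2022-01-01', end='2022-08-05')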