cmsNavDetailCertificate.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
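'''
@Contact : liuyuqi.gov@msn.cn
@Time : 2021/03/10 11:04:33
@License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc : Certificate crawler for the procurement-personnel competency evaluation site.
70 pages in total, 6919 records, 100 records per page.
Step 1: loop over every certificate list page. The request parameters include a signature, so ... (the original note breaks off here).
Step 2: use the crawled certificate ids to fetch each detail page and download the certificate image, as name - ID-card number - professional-title level (presumably the file-name pattern; the original note breaks off here).
'''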
  12. import os
  13. import sys
  14. import re
  15. import json
  16. import time
  17. # from spliter import Browser
  18. import requests
  19. headers = {
  20. 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
  21. "Authorization": "Basic YXBwOlZtMHhkMUl4YkZoVFdHaFRWMGQ0VjFsWGM=",
  22. "Cookie": "acw_tc=2760820416153446998782758e6f918d30a3b14e84cdf23375d461edeec5cd",
  23. "Sign": "ef1bffdaa27c5e0219fedface033b001",
  24. "Tenant-Id": "000000",
  25. "Timestamp": "1615345938461"}
  26. def downloadCert():
  27. pass
  28. def getCert():
  29. url = r"http://tpp.ctba.org.cn/cmsNavDetail/open/certificateDetail?id=1368587249407119362"
  30. res=requests.get(url,headers=headers)
  31. print(res.text.encode(res.encoding).decode("utf-8"))
  32. print(res.content)
  33. def getCertificateListPage():
  34. for i in range(1, 2):
  35. # with Browser(driver_name='chrome', executable_path="chromedriver.exe") as browser:
  36. # browser.visit()
  37. res = requests.get(
  38. r'http://tpp.ctba.org.cn/api/ctpsp-public/user-certificate/endpoint/publicity-pager?current='+str(i)+'&size=100&level=0&status=3&keyword=' + str(i), headers=headers)
  39. print(res.text.encode(res.encoding).decode("utf-8"))
  40. print(res.content)
  41. def crawl():
  42. getCert()
  43. if __name__ == "__main__":
  44. crawl()