notices.py

# -*- coding: utf-8 -*-
import scrapy
import tushare as ts
import urllib.parse
import copy
import requests
from pymongo import MongoClient
import re
import hashlib

from spiderNotices.items import NoticeItem
from spiderNotices.utils import ashx_json
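
# ashx_json is a project-local helper (spiderNotices.utils); it is assumed to strip the
# wrapper around the .ashx response and return the payload as a Python dict.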


class NoticesSpider(scrapy.Spider):
    name = 'notices'
    allowed_domains = ['eastmoney.com']
    start_urls = ['http://eastmoney.com/']

    # Stock lists
    shangshi = list(ts.pro_api().stock_basic(list_status='L')['ts_code'].drop_duplicates())
    tuishi = list(ts.pro_api().stock_basic(list_status='D')['ts_code'].drop_duplicates())
    zanting = list(ts.pro_api().stock_basic(list_status='P')['ts_code'].drop_duplicates())
    ts_code_list = list(set(shangshi + tuishi + zanting))
    code_list = [x.split('.')[0] for x in ts_code_list]  # drop the exchange suffix, e.g. '000001.SZ' -> '000001'
    code_list.sort()
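    # Note: the ts.pro_api() calls above require a tushare token configured beforehand
    # (ts.set_token(...)); otherwise the stock_basic queries fail when this module is imported.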
    # code_list = ['000001', '000002']

    url_ashx = "http://data.eastmoney.com/notices/getdata.ashx"

    # Target database, assigned in start_requests()
    db = None
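    # Expected settings: REMOTEMONGO = {'uri': <MongoDB URI>, 'notices': <database name>};
    # PAGE_SIZE is optional and limits how many announcements are fetched per stock.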

    def start_requests(self):
        """
        Initial data request. Uses PAGE_SIZE if specified; otherwise requests all data for the stock.
        """
        self.db = MongoClient(self.settings.get('REMOTEMONGO')['uri'])[self.settings.get('REMOTEMONGO')['notices']]
        self.logger.info('Number of stocks to crawl: {}'.format(len(self.code_list)))
        for stk in self.code_list:
            item = NoticeItem()
            item['code'] = stk
            if self.settings.get('PAGE_SIZE'):
                params = {
                    'StockCode': stk,
                    'CodeType': 1,
                    'PageIndex': 1,
                    'PageSize': self.settings.get('PAGE_SIZE'),
                }
                url = self.url_ashx + '?' + urllib.parse.urlencode(params)
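                # Each request carries its own deep copy of the item via meta, so
                # callbacks do not share (and overwrite) the same mutable dict.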
                yield scrapy.Request(
                    url=url, callback=self.parse, meta={'item': copy.deepcopy(item)}
                )
            else:
                params = {
                    'StockCode': stk,
                    'CodeType': 1,  # Market type: 1 for hsa (A-shares). Required, otherwise TotalCount is wrong.
                    'PageIndex': 1,
                    'PageSize': 50,
                }
                url = self.url_ashx + '?' + urllib.parse.urlencode(params)
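                # Probe the endpoint synchronously with requests to learn TotalCount,
                # then fetch every announcement for the stock in a single page below.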
                first = requests.get(url)
                page_size = ashx_json(first.text)['TotalCount']
                self.logger.warning('{} total records: {}'.format(item['code'], page_size))
                if page_size == 0:  # Some securities have no data on the site; parse() would fail on them, so skip.
                    continue
                params = {
                    'StockCode': stk,
                    'CodeType': 1,
                    'PageIndex': 1,
                    'PageSize': page_size,
                }
                url = self.url_ashx + '?' + urllib.parse.urlencode(params)
                yield scrapy.Request(
                    url=url, callback=self.parse, meta={'item': copy.deepcopy(item)}
                )

    def parse(self, response):
        """
        Parse the returned data structure and extract the announcement summaries.
        """
        item = response.meta['item']
        assert item['code'] == re.findall(r'StockCode=(.*?)&', response.url)[0]
        # Records already stored for this stock whose content is non-empty.
        # TODO set the rule for what counts as valid data as needed, e.g. PDF handling
        exist_md5 = self.db[item['code']].find({'content_source': {'$ne': 0}}, {'_id': 1, 'href_md5': 1})
        exist_md5 = [x.get('href_md5') for x in exist_md5]
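        # href_md5 (the MD5 of the announcement URL) serves as the deduplication key;
        # announcements whose content was already fetched successfully are skipped below.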
        total = ashx_json(response.text)
        for each in total.get('data'):
            item['ann_date'] = each.get('NOTICEDATE')
            item['ann_title'] = each.get('NOTICETITLE')
            item['ann_type'] = each.get('ANN_RELCOLUMNS')[0].get('COLUMNNAME')  # Some types are not announcement categories but labels such as '其它' or '股票'.
            item['href'] = each.get('Url')
            item['href_md5'] = hashlib.md5(item['href'].encode('utf8')).hexdigest()
            if item['href_md5'] in exist_md5:
                continue
            copy_item = copy.deepcopy(item)
            yield scrapy.Request(
                copy_item['href'], callback=self.parse_content, meta={'item': copy_item}
            )

    def parse_content(self, response):
        """ Fetch the text content of the announcement. """
        item = response.meta['item']
        temp = response.xpath("//div[@class='detail-body']/div/text()").extract()
        temp = [x for x in temp if str(x).strip()]
        if temp:
            item['content'] = '\r\n'.join(temp)
            item['content_source'] = 1
        else:
            # No extractable text, e.g. a PDF-only announcement.
            self.logger.warning('Empty text for link {}'.format(item['href']))  # TODO extract PDF content
            item['content'] = ''
            item['content_source'] = 0
        return item
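

# Usage sketch (assuming the standard Scrapy project layout for spiderNotices):
#   scrapy crawl notices                  # fetch the full announcement history per stock
#   scrapy crawl notices -s PAGE_SIZE=50  # only the most recent 50 announcements per stock
# The MongoDB connection comes from the REMOTEMONGO setting; items are expected to be
# written by the project's pipeline into one collection per stock code.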