@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider and downloader middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+
+import random
+from logging import getLogger
+from scrapy import signals
+from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
+from scrapy.http import HtmlResponse
+
+from selenium import webdriver
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import requests
+
+from .utils import user_agent_list
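+# user_agent_list lives in this project's utils module (not shown in this diff);
+# it is assumed to be a plain list of User-Agent strings, e.g.:
+#
+#     user_agent_list = [
+#         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ...',
+#         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ...',
+#     ]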
+
+
+class SpidernoticesSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class SpidernoticesDownloaderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
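+
+    # from_crawler is the usual hook for reading project settings and passing
+    # them to the middleware instance. For example, the SeleniumMiddleware below
+    # could take its timeout from settings with something like this (a sketch;
+    # the SELENIUM_TIMEOUT setting name is an assumption, not defined in this
+    # project):
+    #
+    #     @classmethod
+    #     def from_crawler(cls, crawler):
+    #         return cls(timeout=crawler.settings.getint('SELENIUM_TIMEOUT', 10))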
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class SeleniumMiddleware(object):
+    """
+    Load dynamic pages with Selenium.
+    1. process_request: decide from request.url whether the page needs rendering.
+    """
+
+    def __init__(self, timeout=10, service_args=None):
+        self.logger = getLogger(__name__)
+
+        self.browser = webdriver.PhantomJS(service_args=service_args or [])
+        self.timeout = timeout
+        self.wait = WebDriverWait(self.browser, self.timeout)
+
+    def __del__(self):
+        # quit() shuts down the browser process, not just the current window
+        self.browser.quit()
+
+    def process_request(self, request, spider):
+        if 'data.eastmoney.com/notices/stock/' in request.url:
+            # Per-stock announcement list page; at this point AJAX has only
+            # loaded the first page of results.
+            self.logger.debug('Selenium is starting')
+            try:
+                self.browser.get(request.url)
+                self.wait.until(
+                    EC.presence_of_element_located((By.ID, 'PageCont'))
+                )
+                return HtmlResponse(url=request.url, body=self.browser.page_source,
+                                    request=request, encoding='utf-8')
+            except TimeoutException:
+                return HtmlResponse(url=request.url, status=500, request=request)
+        else:
+            return None
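+
+    # Note: webdriver.PhantomJS is deprecated and was removed in Selenium 4. A
+    # minimal sketch of a headless-Chrome replacement for the browser created in
+    # __init__ above, assuming chromedriver is installed (not part of the
+    # original code):
+    #
+    #     from selenium.webdriver.chrome.options import Options
+    #     options = Options()
+    #     options.add_argument('--headless')
+    #     self.browser = webdriver.Chrome(options=options)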
+
+
+class ProxyIpMiddleware(object):
+    """
+    Use an ip:port proxy fetched from 47.103.1.245:5010.
+    """
+
+    def process_request(self, request, spider):
+        ip = requests.get('http://47.103.1.245:5010/get/').json()
+        if ip.get("proxy"):
+            request.meta["proxy"] = "http://" + ip["proxy"]
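+
+    # The proxy-pool endpoint above is assumed to return JSON shaped roughly like
+    # {"proxy": "1.2.3.4:8080", ...}; when no proxy comes back, the request is
+    # simply sent without request.meta["proxy"] set.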
+
+
+class RandomUserAgent(UserAgentMiddleware):
+    """
+    Set a random User-Agent on each request.
+    """
+
+    def process_request(self, request, spider):
+        ua = random.choice(user_agent_list)
+        request.headers.setdefault('User-Agent', ua)
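+
+    # setdefault only applies the random UA when the request does not already
+    # carry a User-Agent header; assign request.headers['User-Agent'] = ua
+    # instead to force it on every request.
+
+
+# A minimal sketch of enabling these middlewares in settings.py. The module path
+# "spiderNotices.middlewares" is an assumption based on the class names here;
+# adjust it to the project's real package name and tune priorities as needed:
+#
+#     DOWNLOADER_MIDDLEWARES = {
+#         'spiderNotices.middlewares.RandomUserAgent': 543,
+#         'spiderNotices.middlewares.ProxyIpMiddleware': 544,
+#         'spiderNotices.middlewares.SeleniumMiddleware': 545,
+#     }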