liuyuqi-dellpc 5 years ago
commit
4c3ae741ec

+ 107 - 0
.gitignore

@@ -0,0 +1,107 @@
+.idea
+.idea/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/

+ 46 - 0
README.md

@@ -0,0 +1,46 @@
+# spiderNotices
+Crawls the announcement lists of listed companies and their text content from the Eastmoney website (data.eastmoney.com).
+
+## How to run
+- 1. Install MongoDB locally and install the Scrapy framework, requests and the other dependencies into your Python environment.
+- 2. Configure spiderNotices.settings.REMOTEMONGO (a sketch follows this list):
+    - database URI: REMOTEMONGO['uri']
+    - name of the target database: REMOTEMONGO['notices']
+- 3. Run the project: execute main.py with Python, or change to the project root and run
+```
+scrapy crawl notices
+```
+- 4. Result: data is written to the database and a log folder is created under the project path.
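+
+A minimal sketch of the REMOTEMONGO block in spiderNotices/settings.py; the URI below is a placeholder, point it at your own MongoDB instance:
+```python
+# spiderNotices/settings.py -- placeholder values, adjust to your deployment
+REMOTEMONGO = {
+    'uri': 'mongodb://user:password@127.0.0.1:27017',  # MongoDB connection string
+    'notices': 'aiNotices',  # database that holds one collection per stock code
+}
+```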
+
+## Data access
+- Methods of the spiderNotices.text_mongo.TextMongo object:
+    - get_notices_stk: list the collections that exist in the notices database.
+    - get_notices: fetch data for several stocks from MongoDB.
+    - get_notices_single: fetch data for a single stock.
+    
+```python
+from spiderNotices.text_mongo import TextMongo
+# fetch a single stock
+result = TextMongo().get_notices_single('000001.SZ', '2010-01-01', '2012-12-31')
+result = TextMongo().get_notices_single('000001.SZ')
+
+# fetch several stocks
+result = TextMongo().get_notices(['000001.SZ', '000002.SZ'])
+
+# list the stocks that have stored data
+result = TextMongo().get_notices_stk()
+
+```
+
+
+## Crawler settings
+- settings.PAGE_SIZE: page size requested for the first page.
+    - On the first run, set it to None to crawl the full history.
+    - For later incremental updates, set it to 50 or another small value.
+- DOWNLOADER_MIDDLEWARES can enable `SeleniumMiddleware`, `RandomUserAgent` and `ProxyIpMiddleware` (see the sketch below).
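+
+A sketch of enabling the optional downloader middlewares in spiderNotices/settings.py; the class paths and priorities mirror the commented-out entries already present in that file:
+```python
+# spiderNotices/settings.py -- enable the optional downloader middlewares
+DOWNLOADER_MIDDLEWARES = {
+    'spiderNotices.middlewares.SeleniumMiddleware': 100,  # render the AJAX-loaded notice list pages
+    'spiderNotices.middlewares.RandomUserAgent': 200,     # rotate the User-Agent header per request
+    'spiderNotices.middlewares.ProxyIpMiddleware': 201,   # route requests through the proxy pool
+}
+```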
+
+
+## TODO
+- [ ] Text extraction from PDFs and OCR of embedded images
+
+- [ ] Crawling market news

+ 7 - 0
requirements.txt

@@ -0,0 +1,7 @@
+Scrapy==1.7.4
+beautifulsoup4==4.8.1
+bs4==0.0.1
+pandas==0.25.2
+pymongo==3.9.0
+requests==2.22.0
+tushare==1.2.48

+ 11 - 0
scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = spiderNotices.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = spiderNotices

+ 0 - 0
spiderNotices/__init__.py


+ 27 - 0
spiderNotices/items.py

@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class SpidernoticesItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
+
+
+class NoticeItem(scrapy.Item):
+    code = scrapy.Field()  # six-digit security code (xxxxxx)
+
+    ann_date = scrapy.Field()  # announcement date
+    ann_title = scrapy.Field()
+    ann_type = scrapy.Field()
+
+    href = scrapy.Field()
+    href_md5 = scrapy.Field()
+    content = scrapy.Field()
+    content_source = scrapy.Field()  # source of the announcement content: 0 empty, 1 web page text, 2 parsed from PDF

+ 12 - 0
spiderNotices/main.py

@@ -0,0 +1,12 @@
+"""
+Entry point: run the `notices` spider through Scrapy's command line interface.
+"""
+from scrapy import cmdline
+
+
+def run_notices():
+    cmdline.execute("scrapy crawl notices".split())
+
+
+if __name__ == '__main__':
+    run_notices()

+ 172 - 0
spiderNotices/middlewares.py

@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+import random
+from logging import getLogger
+from scrapy import signals
+from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
+from scrapy.http import HtmlResponse
+
+from selenium import webdriver
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import requests
+
+from .utils import user_agent_list
+
+
+class SpidernoticesSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class SpidernoticesDownloaderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class SeleniumMiddleware(object):
+    """
+    Load dynamic pages with Selenium (PhantomJS).
+    process_request decides from request.url whether the page needs to be rendered in the browser.
+
+    """
+
+    def __init__(self, timeout=10, service_args=None):
+        self.logger = getLogger(__name__)
+
+        self.browser = webdriver.PhantomJS(service_args=service_args)
+        self.timeout = timeout
+        self.wait = WebDriverWait(self.browser, self.timeout)
+
+    def __del__(self):
+        self.browser.close()
+
+    def process_request(self, request, spider):
+        if request.url.find(r'data.eastmoney.com/notices/stock/') != -1:
+            # Per-stock announcement list; at this point AJAX has only loaded the first page
+            self.logger.debug('Selenium is Starting')
+            try:
+                self.browser.get(request.url)
+                # time.sleep(3)
+                self.wait.until(
+                    EC.presence_of_element_located((By.ID, 'PageCont'))
+                )
+                return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8',
+                                    )
+            except TimeoutException:
+                return HtmlResponse(url=request.url, status=500, request=request)
+        else:
+            return None
+
+
+class ProxyIpMiddleware(object):
+    """
+    Use an ip:port proxy fetched from the proxy pool at 47.103.1.245:5010.
+    """
+
+    def process_request(self, request, spider):
+        ip = requests.get('http://47.103.1.245:5010/get/').json()
+        if ip.get("proxy"):
+            request.meta["proxy"] = "http://" + ip["proxy"]
+
+
+class RandomUserAgent(UserAgentMiddleware):
+    """
+    Pick a random User-Agent for each request.
+    """
+
+    def process_request(self, request, spider):
+        ua = random.choice(user_agent_list)
+        request.headers.setdefault('User-Agent', ua)

+ 47 - 0
spiderNotices/pipelines.py

@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+from pymongo import MongoClient
+from dateutil.parser import parse
+import hashlib
+
+
+class SpidernoticesPipeline(object):
+    def process_item(self, item, spider):
+        return item
+
+
+class ItemToMongo(object):
+
+    def __init__(self, uri, db_name):
+        self.client = MongoClient(uri)
+        self.db_name = db_name
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(
+            uri=crawler.settings.get('REMOTEMONGO')['uri'],
+            db_name=crawler.settings.get('REMOTEMONGO')['notices']
+        )
+
+    def close_spider(self, spider):
+        self.client.close()
+
+    def process_item(self, item, spider):
+        """ Store the item in MongoDB (the database configured as REMOTEMONGO['notices']).
+        One collection per stock; the collection name is just the six-digit security code.
+
+        """
+        post = dict(item)
+        coll = self.client[self.db_name][post['code']]
+
+        temp = parse(post['ann_date']).strftime('%Y-%m-%d')  # the site only shows the date part; time zone is ignored
+        post['ann_date'] = parse(temp)
+        post['_id'] = hashlib.md5(post['href'].encode('utf8')).hexdigest()
+        # coll.insert_one(post)  # would raise pymongo.errors.DuplicateKeyError when the document already exists
+        coll.update_one({'_id': post.pop('_id')}, {'$set': post}, upsert=True)
+
+        return item

+ 126 - 0
spiderNotices/settings.py

@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+import os
+import datetime
+
+
+# TODO: analyse the logs and further automate the crawl
+
+
+# Page size for the first request; None means crawl the full history
+PAGE_SIZE = None
+
+# Remote MongoDB configuration
+REMOTEMONGO = {
+    # 'uri': 'mongodb://read:read123456@120.92.189.17:27027',  # /?authSource=admin&authMechanism=SCRAM-SHA-1
+    #            'host': '120.92.189.17',
+    #            'port': 27027,
+    'uri': 'mongodb://read:read123456@127.0.0.1:27017',  # local instance
+    'host': '127.0.0.1',
+    'port': 27017,
+
+    'username': 'admin',
+    'password': 'admin',
+    # 'tushare': 'aiTushare',  # database name for tushare data
+    # 'factor': 'aiFactor',
+    'notices': 'aiNotices',
+}
+
+# --------------------------------------------------------------------------------------------
+# Scrapy settings for spiderNotices project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'spiderNotices'
+
+SPIDER_MODULES = ['spiderNotices.spiders']
+NEWSPIDER_MODULE = 'spiderNotices.spiders'
+
+DOWNLOAD_TIMEOUT = 120  # download timeout in seconds
+
+today = datetime.datetime.now()
+SETTINGS_PATH = os.path.abspath(__file__)
+LOG_FILE = os.path.join(os.path.dirname(SETTINGS_PATH), 'log', f'{today.year}_{today.month}_{today.day}.log')  # daily log file under the project's log folder
+LOG_LEVEL = 'WARNING'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+# USER_AGENT = 'spiderNotices (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+# CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+# COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+# DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+# }
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#    'spiderNotices.middlewares.SpidernoticesSpiderMiddleware': 543,
+# }
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+    # 'spiderNotices.middlewares.SpidernoticesDownloaderMiddleware': 543,
+    #  'spiderNotices.middlewares.SeleniumMiddleware': 100,
+    # 'spiderNotices.middlewares.RandomUserAgent': 200,
+    # 'spiderNotices.middlewares.ProxyIpMiddleware': 201,
+}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+# }
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    # 'spiderNotices.pipelines.SpidernoticesPipeline': 300,
+    'spiderNotices.pipelines.ItemToMongo': 400,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

+ 4 - 0
spiderNotices/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

+ 121 - 0
spiderNotices/spiders/notices.py

@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import tushare as ts
+import urllib
+import copy
+import requests
+from pymongo import MongoClient
+import re
+import hashlib
+from spiderNotices.items import NoticeItem
+from spiderNotices.utils import ashx_json
+
+
+class NoticesSpider(scrapy.Spider):
+    name = 'notices'
+    allowed_domains = ['eastmoney.com']
+    start_urls = ['http://eastmoney.com/']
+
+    # Stock lists from tushare: listed (L), delisted (D) and suspended (P) securities
+    shangshi = list(ts.pro_api().stock_basic(list_status='L')['ts_code'].drop_duplicates())
+    tuishi = list(ts.pro_api().stock_basic(list_status='D')['ts_code'].drop_duplicates())
+    zanting = list(ts.pro_api().stock_basic(list_status='P')['ts_code'].drop_duplicates())
+    ts_code_list = list(set(shangshi + tuishi + zanting))
+    code_list = [x.split('.')[0] for x in ts_code_list]
+    code_list.sort()
+    # code_list = ['000001', '000002']
+
+    url_ashx = "http://data.eastmoney.com/notices/getdata.ashx"
+
+    # MongoDB database handle, set in start_requests
+    db = None
+
+    def start_requests(self):
+        """
+        First request for each stock: use PAGE_SIZE when it is set, otherwise
+        request the stock's complete history in a single page.
+        """
+
+        self.db = MongoClient(self.settings.get('REMOTEMONGO')['uri'])[self.settings.get('REMOTEMONGO')['notices']]
+        self.logger.info('Number of stocks to crawl: {}'.format(len(self.code_list)))
+
+        for stk in self.code_list:
+            item = NoticeItem()
+            item['code'] = stk
+            if self.settings.get('PAGE_SIZE'):
+                params = {
+                    'StockCode': stk,
+                    'CodeType': 1,
+                    'PageIndex': 1,
+                    'PageSize': self.settings.get('PAGE_SIZE'),
+                }
+                url = self.url_ashx + '?' + urllib.parse.urlencode(params)
+                yield scrapy.Request(
+                    url=url, callback=self.parse, meta={'item': copy.deepcopy(item)}
+                )
+            else:
+                params = {
+                    'StockCode': stk,
+                    'CodeType': 1,  # market type, 1 for A-shares; must be present, otherwise TotalCount is wrong
+                    'PageIndex': 1,
+                    'PageSize': 50,
+                }
+                url = self.url_ashx + '?' + urllib.parse.urlencode(params)
+                first = requests.get(url)
+                page_size = ashx_json(first.text)['TotalCount']
+                self.logger.warning('{}: total number of records {}'.format(item['code'], page_size))
+                if page_size == 0:  # some securities have no data on the site; parse() would fail on an empty page, so skip them
+                    continue
+
+                params = {
+                    'StockCode': stk,
+                    'CodeType': 1,
+                    'PageIndex': 1,
+                    'PageSize': page_size,
+                }
+                url = self.url_ashx + '?' + urllib.parse.urlencode(params)
+                yield scrapy.Request(
+                    url=url, callback=self.parse, meta={'item': copy.deepcopy(item)}
+                )
+
+    def parse(self, response):
+        """
+        Parse the returned data structure and extract the announcement summary fields.
+        """
+        item = response.meta['item']
+        assert item['code'] == re.findall(r'StockCode=(.*?)&', response.url)[0]
+
+        # Records that already exist and have non-empty content.
+        # TODO refine the rule for what counts as valid data, e.g. PDF handling
+        exist_md5 = self.db[item['code']].find({'content_source': {'$ne': 0}}, {'_id': 1, 'href_md5': 1})
+        exist_md5 = [x.get('href_md5') for x in exist_md5]
+
+        total = ashx_json(response.body_as_unicode())
+        for each in total.get('data'):
+            item['ann_date'] = each.get('NOTICEDATE')
+            item['ann_title'] = each.get('NOTICETITLE')
+            item['ann_type'] = each.get('ANN_RELCOLUMNS')[0].get('COLUMNNAME')  # some types are not announcement categories but labels such as '其它' (other) or '股票' (stock)
+            item['href'] = each.get('Url')
+            item['href_md5'] = hashlib.md5(item['href'].encode('utf8')).hexdigest()
+            if item['href_md5'] in exist_md5:
+                continue
+
+            copy_item = copy.deepcopy(item)
+            yield scrapy.Request(
+                copy_item['href'], callback=self.parse_content, meta={'item': copy_item}
+            )
+
+    def parse_content(self, response):
+        """ Fetch the text content of the announcement page. """
+        item = response.meta['item']
+        temp = response.xpath("//div[@class='detail-body']/div/text()").extract()
+        temp = [x for x in temp if str(x).strip()]
+        temp = '\r\n'.join(temp)
+        if temp:
+            item['content'] = temp
+            item['content_source'] = 1
+        else:
+            self.logger.warning('Empty announcement text at {}'.format(item['href']))  # TODO extract the text from the PDF instead
+            item['content'] = ''
+            item['content_source'] = 0
+
+        return item

+ 90 - 0
spiderNotices/text_mongo.py

@@ -0,0 +1,90 @@
+"""
+Text data stored in MongoDB.
+
+"""
+import pandas as pd
+from dateutil.parser import parse
+from pymongo import MongoClient
+
+from spiderNotices.settings import REMOTEMONGO
+
+
+class TextMongo(object):
+    """ Read-only access to the stored announcement data. """
+
+    def __init__(self, uri=REMOTEMONGO['uri']):
+        self.client = MongoClient(uri)
+        # database holding listed-company announcements
+        self.db_notices = self.client[REMOTEMONGO['notices']]
+
+    def get_notices_stk(self):
+        """ List the collections that exist in the notices database. """
+        coll_names = self.db_notices.list_collection_names(session=None)
+        coll_names.sort()
+        return coll_names
+
+    def get_notices(self, stk_list=[], begin='', end='', columns=[]):
+        """
+        Fetch announcement data from MongoDB for several stocks.
+        :param stk_list: codes in xxxxxx.zz or xxxxxx.zzzz format; only the numeric part before the dot is used.
+        :param begin: start date (inclusive); any string dateutil can parse.
+        :param end: end date (inclusive).
+        :param columns: fields to return; empty means all fields.
+        :return: DataFrame
+        """
+        # iterate over the stock list
+        stk_list = list(set(stk_list))
+        stk_list.sort()
+        each_list = []
+        for stk in stk_list:
+            each = self.get_notices_single(stk, begin=begin, end=end, columns=columns)
+            if not each.empty:
+                each_list.append(each)
+        if not each_list:
+            return pd.DataFrame()
+        df = pd.concat(each_list).reset_index(drop=True)
+        return df
+
+    def get_notices_single(self, stk, begin='', end='', columns=[]):
+        # collection for this stock
+        coll = self.db_notices[stk.split('.')[0]]
+
+        # query filter
+        query = {}
+        if begin:
+            begin = parse(begin)
+            if end:
+                end = parse(end)
+                query['ann_date'] = {"$gte": begin, "$lte": end}
+            else:
+                query['ann_date'] = {"$gte": begin}
+        else:
+            if end:
+                end = parse(end)
+                query['ann_date'] = {"$lte": end}
+            else:
+                pass
+
+        # projected columns
+        if columns:
+            cursor = coll.find(query, {x: 1 for x in columns})  # an empty query {} returns everything
+        else:
+            cursor = coll.find(query)
+        df = pd.DataFrame(list(cursor))
+
+        # tidy up the result
+        if '_id' in df.columns:
+            del df['_id']
+        df.reset_index(drop=True, inplace=True)
+
+        return df
+
+
+if __name__ == '__main__':
+    # fetch a single stock
+    result = TextMongo().get_notices_single('000001.SZ', '2010-01-01', '2012-12-31')
+    result = TextMongo().get_notices_single('000001.SZ')
+
+    # fetch several stocks
+    result = TextMongo().get_notices(['000001.SZ', '000002.SZ'])
+
+    # list the stocks that have stored data
+    result = TextMongo().get_notices_stk()

+ 71 - 0
spiderNotices/utils/__init__.py

@@ -0,0 +1,71 @@
+"""
+Shared helpers for spiderNotices: .ashx response parsing and a User-Agent pool.
+"""
+import re
+import json
+
+
+def ashx_json(ashx_str):
+    """Parse the JSON object embedded in an .ashx response (everything between the outermost braces)."""
+    json_str = re.findall(r'\{(.*)\}', ashx_str)[0]
+    json_str = '{' + json_str + '}'
+    return json.loads(json_str)
+
+
+# Pool of User-Agent strings used by RandomUserAgent
+user_agent_list = [
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
+        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
+        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
+        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
+        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
+        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
+        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
+        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
+        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
+        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
+        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
+        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
+        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
+]