# settings.py — Scrapy project settings for spiderNotices
  1. # -*- coding: utf-8 -*-
  2. import os
  3. import datetime
  4. # TODO log分析,完善爬取自动化
  5. # 一页数据量
  6. PAGE_SIZE = None
  7. # 远程数据库
  8. REMOTEMONGO = {
  9. # 'uri': 'mongodb://read:read123456@120.92.189.17:27027', # /?authSource=amdin&authMechanism=SCRAM-SHA-1
  10. # 'host': '120.92.189.17',
  11. # 'port': 27027,
  12. 'uri': 'mongodb://read:read123456@127.0.0.1:27017', # 本地配置
  13. 'host': '127.0.0.1',
  14. 'port': 27017,
  15. 'username': 'admin',
  16. 'password': 'admin',
  17. # 'tushare': 'aiTushare', # tushare数据的数据库名称
  18. # 'factor': 'aiFactor',
  19. 'notices': 'aiNotices',
  20. }
  21. # --------------------------------------------------------------------------------------------
  22. # Scrapy settings for spiderNotices project
  23. #
  24. # For simplicity, this file contains only settings considered important or
  25. # commonly used. You can find more settings consulting the documentation:
  26. #
  27. # https://docs.scrapy.org/en/latest/topics/settings.html
  28. # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
  29. # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
  30. BOT_NAME = 'spiderNotices'
  31. SPIDER_MODULES = ['spiderNotices.spiders']
  32. NEWSPIDER_MODULE = 'spiderNotices.spiders'
  33. DOWNLOAD_TIMEOUT = 120 # 下载超时时间
  34. today = datetime.datetime.now()
  35. SETTINGS_PATH = os.path.abspath(__file__)
  36. LOG_FILE = os.path.join(os.path.dirname(SETTINGS_PATH), f'log\\{today.year}_{today.month}_{today.day}.log')
  37. LOG_LEVEL = 'WARNING'
  38. # Crawl responsibly by identifying yourself (and your website) on the user-agent
  39. # USER_AGENT = 'spiderNotices (+http://www.yourdomain.com)'
  40. # Obey robots.txt rules
  41. ROBOTSTXT_OBEY = False
  42. # Configure maximum concurrent requests performed by Scrapy (default: 16)
  43. # CONCURRENT_REQUESTS = 32
  44. # Configure a delay for requests for the same website (default: 0)
  45. # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
  46. # See also autothrottle settings and docs
  47. # DOWNLOAD_DELAY = 3
  48. # The download delay setting will honor only one of:
  49. # CONCURRENT_REQUESTS_PER_DOMAIN = 16
  50. # CONCURRENT_REQUESTS_PER_IP = 16
  51. # Disable cookies (enabled by default)
  52. # COOKIES_ENABLED = False
  53. # Disable Telnet Console (enabled by default)
  54. # TELNETCONSOLE_ENABLED = False
  55. # Override the default request headers:
  56. # DEFAULT_REQUEST_HEADERS = {
  57. # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  58. # 'Accept-Language': 'en',
  59. # }
  60. # Enable or disable spider middlewares
  61. # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
  62. # SPIDER_MIDDLEWARES = {
  63. # 'spiderNotices.middlewares.SpidernoticesSpiderMiddleware': 543,
  64. # }
  65. # Enable or disable downloader middlewares
  66. # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
  67. DOWNLOADER_MIDDLEWARES = {
  68. # 'spiderNotices.middlewares.SpidernoticesDownloaderMiddleware': 543,
  69. # 'spiderNotices.middlewares.SeleniumMiddleware': 100,
  70. # 'spiderNotices.middlewares.RandomUserAgent': 200,
  71. # 'spiderNotices.middlewares.ProxyIpMiddleware': 201,
  72. }
  73. # Enable or disable extensions
  74. # See https://docs.scrapy.org/en/latest/topics/extensions.html
  75. # EXTENSIONS = {
  76. # 'scrapy.extensions.telnet.TelnetConsole': None,
  77. # }
  78. # Configure item pipelines
  79. # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
  80. ITEM_PIPELINES = {
  81. # 'spiderNotices.pipelines.SpidernoticesPipeline': 300,
  82. 'spiderNotices.pipelines.ItemToMongo': 400,
  83. }
  84. # Enable and configure the AutoThrottle extension (disabled by default)
  85. # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
  86. # AUTOTHROTTLE_ENABLED = True
  87. # The initial download delay
  88. # AUTOTHROTTLE_START_DELAY = 5
  89. # The maximum download delay to be set in case of high latencies
  90. # AUTOTHROTTLE_MAX_DELAY = 60
  91. # The average number of requests Scrapy should be sending in parallel to
  92. # each remote server
  93. # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
  94. # Enable showing throttling stats for every response received:
  95. # AUTOTHROTTLE_DEBUG = False
  96. # Enable and configure HTTP caching (disabled by default)
  97. # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
  98. # HTTPCACHE_ENABLED = True
  99. # HTTPCACHE_EXPIRATION_SECS = 0
  100. # HTTPCACHE_DIR = 'httpcache'
  101. # HTTPCACHE_IGNORE_HTTP_CODES = []
  102. # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'