liuyuqi-dellpc committed 1 year ago
commit 20788980df
5 changed files with 37 additions and 18 deletions
  1. README.md (+2 -1)
  2. crawl_mrdx/__init__.py (+2 -2)
  3. crawl_mrdx/api.py (+3 -1)
  4. crawl_mrdx/crawl_mrdx.py (+29 -12)
  5. main.py (+1 -2)

+ 2 - 1
README.md

@@ -8,7 +8,7 @@ cd my_project_dir
 virtualenv -p /opt/python/bin/python3 venv
 source venv/bin/activate
 pip install -r requirements.txt
-python main.py
+python main.py --start 20210525 --end 20210525
 
 ```
 
@@ -19,5 +19,6 @@ python main.py
 Downloaded so far: ./data/20130822/07.pdf, covering 2275 days of the news daily, 16 GB in total.
 
 
+## History
 
 

+ 2 - 2
crawl_mrdx/__init__.py

@@ -11,6 +11,6 @@ import time
 from crawl_mrdx.crawl_mrdx import CrawlMrdx
 
 
-def main(argv=None):
+def main(start: str, end: str):
     crawl = CrawlMrdx()
-    crawl.crawl()
+    crawl.crawl(start, end)

+ 3 - 1
crawl_mrdx/api.py

@@ -8,6 +8,8 @@
 '''
 
 _host = r"http://mrdx.cn"
-pdfUrl = _host + "/PDF/%s/0%s.pdf"
+api_host = r"http://xx.com"  # placeholder host for the update API
 
+pdfUrl = _host + "/PDF/%s/0%s.pdf"
+get_version = api_host + "/api/get_version"
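
The new `get_version` URL is defined but not yet called anywhere in this commit; below is a minimal sketch of how the `update()` stub added in crawl_mrdx.py might poll it. The `check_for_update` helper and the JSON response shape (`{"version": ...}`) are assumptions, and `http://xx.com` is left as the placeholder it is.

```python
import requests

from crawl_mrdx import api


def check_for_update(current_version: str) -> bool:
    """Return True if the server reports a different version (assumed schema)."""
    try:
        resp = requests.get(api.get_version, timeout=5)
        resp.raise_for_status()
        latest = resp.json().get("version")  # "version" field is an assumption
        return latest is not None and latest != current_version
    except requests.RequestException:
        return False  # on network errors, assume no update is available
```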
 

+ 29 - 12
crawl_mrdx/crawl_mrdx.py

@@ -28,9 +28,18 @@ class CrawlMrdx():
     def __init__(self):
         self.jsonConf = JsonConf()
         self.conf = self.jsonConf.load()
-        self.currentDate = self.conf.get('currentDate')
+        self.start_date = self.conf.get('startDate')
+        self.end_date = self.conf.get("endDate")
+
+    def update(self):
+        '''check the server for a newer version and update the app (stub)'''
+        pass
 
     def downNews(self, url, fileName):
+        '''download news
+        :param url: news url
+        :param fileName: saved file name
+        '''
         with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
             chunkSize = 1024
             contentSize = int(response.headers["content-length"])
@@ -46,29 +55,37 @@ class CrawlMrdx():
                         file.write(data)
                         progress.refresh(count=len(data))
 
-    def crawl(self):
-        start_time = time.time()
+    def crawl(self, start: str, end: str):
+        '''crawl news
+        :param start: start date
+        :param end: end date
+        '''
+        start_time = time.time()  # record start time to measure elapsed time
         if not os.path.exists("data"):
             os.makedirs("data")
         pool = ThreadPoolExecutor(max_workers=10)  # thread pool capped at 10 concurrent download tasks
         index = 1
-        while True:
-            yestday = (datetime.datetime.strptime(
-                self.currentDate, "%Y%m%d").date() + datetime.timedelta(index)).strftime("%Y%m%d")
+        start_date = datetime.datetime.strptime(start, "%Y%m%d")
+        end_date = datetime.datetime.strptime(end, "%Y%m%d")
+
+        current_date = start_date
+        while current_date <= end_date:
+            current_date_str = current_date.strftime("%Y%m%d")
+            print(current_date_str)
             for j in range(1, 17):
-                fileName = r"./data/%s/0%s.pdf" % (yestday, j)
+                fileName = r"./data/%s/0%s.pdf" % (current_date_str, j)
                 if(os.path.exists(fileName)):
                     print("跳过" + fileName)
                 else:
-                    url = api.pdfUrl % (yestday, j)
+                    url = api.pdfUrl % (current_date_str, j)
                     # 检查链接有效性
                     response = requests.head(url)
                     if response.status_code == 200:
                         # downNews(url,fileName)
                         future1 = pool.submit(self.downNews, url, fileName)
                          time.sleep(random.randint(1, 2))  # be a polite crawler: throttle requests
-            self.jsonConf.set({"currentDate": yestday})
-            if(yestday == self.conf.get('endDate')):
+            self.jsonConf.set({"currentDate": current_date_str})
+            if current_date_str == self.conf.get('endDate'):
                 break
-            index += 1
-        print("last time: {} s".format(time.time() - start_time))
+            current_date += datetime.timedelta(days=1)
+        print("last time: {} s".format(time.time() - start_time))

+ 1 - 2
main.py

@@ -9,6 +9,5 @@
 @Desc    :   Download PDF files by the URL pattern, stopping once requests become invalid
 '''
 import crawl_mrdx
-
 if __name__ == '__main__':
-    crawl_mrdx.main()
+    crawl_mrdx.main(start='20210525', end='20210525')
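
The README now documents `--start`/`--end` flags, while this `main.py` passes literal dates. A hedged sketch of argparse wiring that would match the documented invocation (flag names taken from the README; the defaults and help strings are assumptions):

```python
# Hypothetical CLI wiring to match "python main.py --start ... --end ...";
# this commit itself hardcodes the dates instead.
import argparse

import crawl_mrdx

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Crawl mrdx.cn daily PDFs')
    parser.add_argument('--start', required=True, help='start date, YYYYMMDD')
    parser.add_argument('--end', required=True, help='end date, YYYYMMDD')
    args = parser.parse_args()
    crawl_mrdx.main(start=args.start, end=args.end)
```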