
Optimize the project structure

liuyuqi-dellpc committed 2 years ago
commit fc64b0de08
10 changed files with 186 additions and 57 deletions
  1. .gitignore  (+ 1 - 0)
  2. README.md  (+ 1 - 1)
  3. bin/crawl_mrdx  (+ 6 - 0)
  4. conf/config.json  (+ 5 - 0)
  5. crawl_mrdx/__init__.py  (+ 16 - 0)
  6. crawl_mrdx/api.py  (+ 13 - 0)
  7. crawl_mrdx/crawl_mrdx.py  (+ 74 - 0)
  8. crawl_mrdx/libs/json_conf.py  (+ 66 - 0)
  9. main.py  (+ 4 - 56)
  10. README_files/1.jpg → screenshot/1.jpg  (+ 0 - 0)

+ 1 - 0
.gitignore

@@ -1 +1,2 @@
 *.pyc
+/data/

+ 1 - 1
README.md

@@ -14,7 +14,7 @@ python main.py
 
 ### Screenshot
 
-![](README_files/1.jpg)
+![](screenshot/1.jpg)
 
 So far the crawl has reached ./data/20130822/07.pdf, covering 2,275 days of the daily paper, about 16 GB in total.
 

+ 6 - 0
bin/crawl_mrdx

@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+
+import crawl_mrdx
+
+if __name__ == '__main__':
+    crawl_mrdx.main()

+ 5 - 0
conf/config.json

@@ -0,0 +1,5 @@
+{
+    "startDate": "20191111",
+    "currentDate": "20191114",
+    "endDate": "20211111"
+}
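
A note on the new config: crawl_mrdx.py below treats currentDate as a resume cursor, advancing it one day at a time and stopping once it reaches endDate; startDate is not read by the crawler in this commit. A minimal standalone sketch of that date walk (not part of the commit):

    import datetime

    current = "20191114"   # conf["currentDate"], the resume cursor
    end = "20211111"       # conf["endDate"]

    day = datetime.datetime.strptime(current, "%Y%m%d").date()
    while day.strftime("%Y%m%d") != end:
        day += datetime.timedelta(days=1)
        print(day.strftime("%Y%m%d"))   # one issue per date; pages 01-16 are tried for each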

+ 16 - 0
crawl_mrdx/__init__.py

@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/25 18:05:04
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   main
+'''
+
+import time
+from crawl_mrdx.crawl_mrdx import CrawlMrdx
+
+
+def main(argv=None):
+    crawl = CrawlMrdx()
+    crawl.crawl()

+ 13 - 0
crawl_mrdx/api.py

@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/25 18:39:36
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   api
+'''
+
+_host = r"http://mrdx.cn"
+pdfUrl = _host + "/PDF/%s/0%s.pdf"
+
+
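
For reference, the URL template expands like this (standalone illustration, values are arbitrary):

    # api.pdfUrl == "http://mrdx.cn/PDF/%s/0%s.pdf"
    url = "http://mrdx.cn/PDF/%s/0%s.pdf" % ("20191114", 7)
    # -> http://mrdx.cn/PDF/20191114/07.pdf
    # the literal "0" prefix applies to every page number, so page 16 expands to .../016.pdf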

+ 74 - 0
crawl_mrdx/crawl_mrdx.py

@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/25 18:06:22
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+'''
+
+import os
+import sys
+import json
+import re
+import time
+import datetime
+from contextlib import closing
+from crawl_mrdx import api
+from crawl_mrdx.libs.json_conf import JsonConf
+import requests
+import utils.DownloadProgress as DownloadProgress
+from concurrent.futures import ThreadPoolExecutor
+import random
+import utils.user_agent as user_agent
+
+
+class CrawlMrdx():
+
+    def __init__(self):
+        self.jsonConf = JsonConf()
+        self.conf = self.jsonConf.load()
+        self.currentDate = self.conf.get('currentDate')
+
+    def downNews(self, url, fileName):
+        with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
+            chunkSize = 1024
+            contentSize = int(response.headers["content-length"])
+            if(os.path.exists(fileName) and os.path.getsize(fileName) == contentSize):
+                print("跳过" + fileName)
+            else:
+                progress = DownloadProgress.DownloadProgress(fileName, total=contentSize, unit="KB",
+                                                             chunk_size=chunkSize, run_status="downloading", fin_status="downloaded")
+                if not os.path.exists(os.path.dirname(fileName)):
+                    os.makedirs(os.path.dirname(fileName))
+                with open(fileName, "wb") as file:
+                    for data in response.iter_content(chunk_size=chunkSize):
+                        file.write(data)
+                        progress.refresh(count=len(data))
+
+    def crawl(self):
+        start_time = time.time()
+        if not os.path.exists("data"):
+            os.makedirs("data")
+        pool = ThreadPoolExecutor(max_workers=10)  # thread pool that runs at most 10 download tasks at a time
+        index = 1
+        while True:
+            yestday = (datetime.datetime.strptime(
+                self.currentDate, "%Y%m%d").date() + datetime.timedelta(index)).strftime("%Y%m%d")
+            for j in range(1, 17):
+                fileName = r"./data/%s/0%s.pdf" % (yestday, j)
+                if(os.path.exists(fileName)):
+                    print("跳过" + fileName)
+                else:
+                    url = api.pdfUrl % (yestday, j)
+                    # check that the link is valid before queueing the download
+                    response = requests.head(url)
+                    if response.status_code == 200:
+                        # downNews(url,fileName)
+                        future1 = pool.submit(self.downNews, url, fileName)
+                        time.sleep(random.randint(1, 2))  # be polite: throttle requests
+            self.jsonConf.set({"currentDate": yestday})
+            if(yestday == self.conf.get('endDate')):
+                break
+            index += 1
+        print("last time: {} s".format(time.time() - start_time))

+ 66 - 0
crawl_mrdx/libs/json_conf.py

@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/24 15:07:14
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   json util
+'''
+import os
+import json
+
+
+class JsonConf:
+    def __init__(self, config_path="conf/config.json"):
+        self.config_path = config_path
+
+    def save(self, data):
+        with open(self.config_path, 'w') as json_file:
+            json_file.write(json.dumps(data, indent=4))
+
+    def load(self):
+        if not os.path.exists(self.config_path):
+            with open(self.config_path, 'w') as json_file:
+                pass
+        with open(self.config_path, encoding="utf-8") as json_file:
+            try:
+                data = json.load(json_file)
+            except Exception as e:
+                if("utf-8-sig" in str(e)):
+                    with open(self.config_path, encoding="utf-8-sig") as json_file:
+                        data = json.load(json_file)
+                        return data
+                else:
+                    print(e)
+            return data
+
+    def set(self, data_dict):
+        json_obj = self.load()
+        for key in data_dict:
+            json_obj[key] = data_dict[key]
+        self.save(json_obj)
+        print(json.dumps(json_obj, indent=4))
+
+    def get(self, key, default_val=""):
+        '''
+        配置文件获取key对象的值,如果没有设置就返回默认值
+        '''
+        try:
+            result = self.load()[key]
+            return result
+        except Exception as e:
+            print(e)
+            return default_val
+
+    def get(self, jsonData, key, default_val=""):
+        try:
+            return jsonData[key]
+        except Exception as e:
+            return default_val
+
+    @staticmethod
+    def get(jsonData, key, default_val=""):
+        try:
+            return jsonData[key]
+        except Exception as e:
+            return default_val
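
Typical use of the helper, mirroring how CrawlMrdx calls it above (illustrative values):

    from crawl_mrdx.libs.json_conf import JsonConf

    jsonConf = JsonConf()                  # defaults to conf/config.json
    conf = jsonConf.load()                 # returns the parsed dict
    current = conf.get("currentDate")      # plain dict lookup, as done in crawl()
    jsonConf.set({"currentDate": "20191115"})   # merge one key and rewrite the file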

+ 4 - 56
main.py

@@ -6,61 +6,9 @@
 @Time    :   2019/11/11 04:40:28
 @Version :   1.0
 @License :   (C)Copyright 2019
-@Desc    :   按照规则下载pdf文件,知道请求无效停止
+@Desc    :   按照规则下载pdf文件,直到请求无效停止
 '''
-import os
-import sys
-import json
-import re
-import time
-import datetime
-from contextlib import closing
-import requests
-import utils.DownloadProgress as DownloadProgress
-from concurrent.futures import ThreadPoolExecutor
-import random
-import utils.user_agent as user_agent
+import crawl_mrdx
 
-def get_link():
-    pass
-
-def downNews(url, fileName):
-    with closing(requests.get(url=url,headers=user_agent.getheaders(),stream=True)) as response:
-        chunkSize=1024
-        contentSize=int(response.headers["content-length"])
-        if(os.path.exists(fileName) and os.path.getsize(fileName)==contentSize):
-            print("跳过"+fileName)
-        else:
-            progress=DownloadProgress.DownloadProgress(fileName,total=contentSize,unit="KB" ,
-                                                        chunk_size=chunkSize,run_status="downloading",fin_status="downloaded")
-            if not os.path.exists(os.path.dirname(fileName)):
-                os.makedirs(os.path.dirname(fileName))
-            with open(fileName,"wb") as file:
-                for data in response.iter_content(chunk_size=chunkSize):
-                    file.write(data)
-                    progress.refresh(count=len(data))
-
-def crawl():
-    pool = ThreadPoolExecutor(max_workers=10)  # 创建一个最大可容纳10个task的线程池
-    for i in range(1,3650):
-        # yestday = (datetime.date.today() + datetime.timedelta(-i)).strftime("%Y%m%d")
-        yestday = (datetime.datetime.strptime("2019-11-11","%Y-%m-%d").date() +  datetime.timedelta(-i)).strftime("%Y%m%d")
-        for j in range(1, 17):
-            fileName=r"./data/%s/0%s.pdf" %(yestday,j)
-            if(os.path.exists(fileName)):
-                print("跳过"+fileName)
-            else:
-                url = r"http://mrdx.cn/PDF/%s/0%s.pdf" % (yestday, j)
-                # 检查链接有效性
-                response=requests.head(url)
-                if response.status_code==200:
-                    # downNews(url,fileName)
-                    future1 = pool.submit(downNews,url, fileName)
-                    # time.sleep(random.randint(1,2))  # 文明爬虫
-
-if __name__ == "__main__":
-    start_time = time.time()
-    if not os.path.exists("data"):
-        os.makedirs("data")
-    crawl()
-    print("last time: {} s".format(time.time() - start_time))
+if __name__ == '__main__':
+    crawl_mrdx.main()

+ 0 - 0
README_files/1.jpg → screenshot/1.jpg