|
@@ -0,0 +1,74 @@
|
|
|
|
+#!/usr/bin/env python
|
|
|
|
+# -*- encoding: utf-8 -*-
|
|
|
|
+'''
|
|
|
|
+@Contact : liuyuqi.gov@msn.cn
|
|
|
|
+@Time : 2022/05/25 18:06:22
|
|
|
|
+@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
|
|
|
|
+@Desc :
|
|
|
|
+'''
|
|
|
|
+
|
|
|
|
+import os
|
|
|
|
+import sys
|
|
|
|
+import json
|
|
|
|
+import re
|
|
|
|
+import time
|
|
|
|
+import datetime
|
|
|
|
+from contextlib import closing
|
|
|
|
+from crawl_mrdx import api
|
|
|
|
+from crawl_mrdx.libs.json_conf import JsonConf
|
|
|
|
+import requests
|
|
|
|
+import utils.DownloadProgress as DownloadProgress
|
|
|
|
+from concurrent.futures import ThreadPoolExecutor
|
|
|
|
+import random
|
|
|
|
+import utils.user_agent as user_agent
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class CrawlMrdx():
    """Crawler that downloads the daily PDF pages of the mrdx newspaper.

    State (current/end date) is persisted through a JSON config file, so
    an interrupted crawl resumes from the last completed day.
    """

    def __init__(self):
        # Load persisted crawl state; 'currentDate' marks the last day
        # already processed (format "YYYYMMDD").
        self.jsonConf = JsonConf()
        self.conf = self.jsonConf.load()
        self.currentDate = self.conf.get('currentDate')

    def downNews(self, url, fileName):
        """Stream-download one PDF from *url* into *fileName*.

        The download is skipped when the local file already exists and its
        size matches the server-reported Content-Length. Progress is shown
        via DownloadProgress. Parent directories are created as needed.
        """
        with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
            chunkSize = 1024
            # .get() guards against a missing Content-Length header,
            # which previously raised KeyError; 0 forces a re-download.
            contentSize = int(response.headers.get("content-length", 0))
            if os.path.exists(fileName) and os.path.getsize(fileName) == contentSize:
                print("跳过" + fileName)
            else:
                progress = DownloadProgress.DownloadProgress(
                    fileName, total=contentSize, unit="KB",
                    chunk_size=chunkSize, run_status="downloading",
                    fin_status="downloaded")
                # exist_ok avoids a race when several pool workers create
                # the same day-directory concurrently.
                os.makedirs(os.path.dirname(fileName), exist_ok=True)
                with open(fileName, "wb") as file:
                    for data in response.iter_content(chunk_size=chunkSize):
                        file.write(data)
                        progress.refresh(count=len(data))

    def crawl(self):
        """Walk forward one day at a time from currentDate until endDate,
        downloading every page (1-16) of each day's issue.

        Downloads are fanned out to a thread pool; the pool is now used as
        a context manager so all downloads finish before the elapsed time
        is reported (previously the pool was never shut down).
        """
        start_time = time.time()
        os.makedirs("data", exist_ok=True)
        # At most 10 concurrent downloads.
        with ThreadPoolExecutor(max_workers=10) as pool:
            index = 1
            while True:
                # Date being processed: currentDate + index days.
                day = (datetime.datetime.strptime(
                    self.currentDate, "%Y%m%d").date()
                    + datetime.timedelta(days=index)).strftime("%Y%m%d")
                for page in range(1, 17):
                    # %02d zero-pads properly: the old "0%s" produced
                    # "010.pdf".."016.pdf" for pages >= 10.
                    fileName = r"./data/%s/%02d.pdf" % (day, page)
                    if os.path.exists(fileName):
                        print("跳过" + fileName)
                    else:
                        url = api.pdfUrl % (day, page)
                        # Probe the URL first; the timeout keeps an
                        # unresponsive server from hanging the crawl.
                        response = requests.head(url, timeout=10)
                        if response.status_code == 200:
                            pool.submit(self.downNews, url, fileName)
                            time.sleep(random.randint(1, 2))  # polite crawling
                # Persist progress so a restart resumes from this day.
                self.jsonConf.set({"currentDate": day})
                if day == self.conf.get('endDate'):
                    break
                index += 1
        print("last time: {} s".format(time.time() - start_time))
|