
Optimize the project structure

liuyuqi-dellpc committed 2 years ago
commit fc64b0de08
10 changed files with 186 additions and 57 deletions
  1. .gitignore  (+ 1 - 0)
  2. README.md  (+ 1 - 1)
  3. bin/crawl_mrdx  (+ 6 - 0)
  4. conf/config.json  (+ 5 - 0)
  5. crawl_mrdx/__init__.py  (+ 16 - 0)
  6. crawl_mrdx/api.py  (+ 13 - 0)
  7. crawl_mrdx/crawl_mrdx.py  (+ 74 - 0)
  8. crawl_mrdx/libs/json_conf.py  (+ 66 - 0)
  9. main.py  (+ 4 - 56)
  10. README_files/1.jpg → screenshot/1.jpg  (+ 0 - 0)

+ 1 - 0
.gitignore

@@ -1 +1,2 @@
 *.pyc
+/data/

+ 1 - 1
README.md

@@ -14,7 +14,7 @@ python main.py
 
 ### Screenshot
 
-![](README_files/1.jpg)
+![](screenshot/1.jpg)
 
 So far the crawl has reached ./data/20130822/07.pdf, covering 2,275 days of the daily paper, about 16 GB in total.
 

+ 6 - 0
bin/crawl_mrdx

@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+
+import crawl_mrdx
+
+if __name__ == '__main__':
+    crawl_mrdx.main()

+ 5 - 0
conf/config.json

@@ -0,0 +1,5 @@
+{
+    "startDate": "20191111",
+    "currentDate": "20191114",
+    "endDate": "20211111"
+}
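
A note on the new config: crawl_mrdx.py below treats currentDate as a resume cursor, advancing it one day at a time and stopping once it reaches endDate; startDate is not read by the crawler in this commit. A minimal standalone sketch of that date walk (not part of the commit):

    import datetime

    current = "20191114"   # conf["currentDate"], the resume cursor
    end = "20211111"       # conf["endDate"]

    day = datetime.datetime.strptime(current, "%Y%m%d").date()
    while day.strftime("%Y%m%d") != end:
        day += datetime.timedelta(days=1)
        print(day.strftime("%Y%m%d"))   # one issue per date; pages 01-16 are tried for each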

+ 16 - 0
crawl_mrdx/__init__.py

@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/25 18:05:04
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   main
+'''
+
+import time
+from crawl_mrdx.crawl_mrdx import CrawlMrdx
+
+
+def main(argv=None):
+    crawl = CrawlMrdx()
+    crawl.crawl()

+ 13 - 0
crawl_mrdx/api.py

@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/25 18:39:36
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   api
+'''
+
+_host = r"http://mrdx.cn"
+pdfUrl = _host + "/PDF/%s/0%s.pdf"
+
+
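
For reference, the URL template expands like this (standalone illustration, values are arbitrary):

    # api.pdfUrl == "http://mrdx.cn/PDF/%s/0%s.pdf"
    url = "http://mrdx.cn/PDF/%s/0%s.pdf" % ("20191114", 7)
    # -> http://mrdx.cn/PDF/20191114/07.pdf
    # the literal "0" prefix applies to every page number, so page 16 expands to .../016.pdf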

+ 74 - 0
crawl_mrdx/crawl_mrdx.py

@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/25 18:06:22
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+'''
+
+import os
+import sys
+import json
+import re
+import time
+import datetime
+from contextlib import closing
+from crawl_mrdx import api
+from crawl_mrdx.libs.json_conf import JsonConf
+import requests
+import utils.DownloadProgress as DownloadProgress
+from concurrent.futures import ThreadPoolExecutor
+import random
+import utils.user_agent as user_agent
+
+
+class CrawlMrdx():
+
+    def __init__(self):
+        self.jsonConf = JsonConf()
+        self.conf = self.jsonConf.load()
+        self.currentDate = self.conf.get('currentDate')
+
+    def downNews(self, url, fileName):
+        with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
+            chunkSize = 1024
+            contentSize = int(response.headers["content-length"])
+            if(os.path.exists(fileName) and os.path.getsize(fileName) == contentSize):
+                print("跳过" + fileName)
+            else:
+                progress = DownloadProgress.DownloadProgress(fileName, total=contentSize, unit="KB",
+                                                             chunk_size=chunkSize, run_status="downloading", fin_status="downloaded")
+                if not os.path.exists(os.path.dirname(fileName)):
+                    os.makedirs(os.path.dirname(fileName))
+                with open(fileName, "wb") as file:
+                    for data in response.iter_content(chunk_size=chunkSize):
+                        file.write(data)
+                        progress.refresh(count=len(data))
+
+    def crawl(self):
+        start_time = time.time()
+        if not os.path.exists("data"):
+            os.makedirs("data")
+        pool = ThreadPoolExecutor(max_workers=10)  # thread pool that runs at most 10 download tasks at a time
+        index = 1
+        while True:
+            yestday = (datetime.datetime.strptime(
+                self.currentDate, "%Y%m%d").date() + datetime.timedelta(index)).strftime("%Y%m%d")
+            for j in range(1, 17):
+                fileName = r"./data/%s/0%s.pdf" % (yestday, j)
+                if(os.path.exists(fileName)):
+                    print("跳过" + fileName)
+                else:
+                    url = api.pdfUrl % (yestday, j)
+                    # check that the link is valid before queueing the download
+                    response = requests.head(url)
+                    if response.status_code == 200:
+                        # downNews(url,fileName)
+                        future1 = pool.submit(self.downNews, url, fileName)
+                        time.sleep(random.randint(1, 2))  # be polite: throttle requests
+            self.jsonConf.set({"currentDate": yestday})
+            if(yestday == self.conf.get('endDate')):
+                break
+            index += 1
+        print("last time: {} s".format(time.time() - start_time))

+ 66 - 0
crawl_mrdx/libs/json_conf.py

@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/24 15:07:14
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   json util
+'''
+import os
+import json
+
+
+class JsonConf:
+    def __init__(self, config_path="conf/config.json"):
+        self.config_path = config_path
+
+    def save(self, data):
+        with open(self.config_path, 'w') as json_file:
+            json_file.write(json.dumps(data, indent=4))
+
+    def load(self):
+        if not os.path.exists(self.config_path):
+            with open(self.config_path, 'w') as json_file:
+                pass
+        with open(self.config_path, encoding="utf-8") as json_file:
+            try:
+                data = json.load(json_file)
+            except Exception as e:
+                if("utf-8-sig" in str(e)):
+                    with open(self.config_path, encoding="utf-8-sig") as json_file:
+                        data = json.load(json_file)
+                        return data
+                else:
+                    print(e)
+            return data
+
+    def set(self, data_dict):
+        json_obj = self.load()
+        for key in data_dict:
+            json_obj[key] = data_dict[key]
+        self.save(json_obj)
+        print(json.dumps(json_obj, indent=4))
+
+    def get(self, key, default_val=""):
+        '''
+        配置文件获取key对象的值,如果没有设置就返回默认值
+        '''
+        try:
+            result = self.load()[key]
+            return result
+        except Exception as e:
+            print(e)
+            return default_val
+
+    def get(self, jsonData, key, default_val=""):
+        try:
+            return jsonData[key]
+        except Exception as e:
+            return default_val
+
+    @staticmethod
+    def get(jsonData, key, default_val=""):
+        try:
+            return jsonData[key]
+        except Exception as e:
+            return default_val
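
Typical use of the helper, mirroring how CrawlMrdx calls it above (illustrative values):

    from crawl_mrdx.libs.json_conf import JsonConf

    jsonConf = JsonConf()                  # defaults to conf/config.json
    conf = jsonConf.load()                 # returns the parsed dict
    current = conf.get("currentDate")      # plain dict lookup, as done in crawl()
    jsonConf.set({"currentDate": "20191115"})   # merge one key and rewrite the file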

+ 4 - 56
main.py

@@ -6,61 +6,9 @@
 @Time    :   2019/11/11 04:40:28
 @Version :   1.0
 @License :   (C)Copyright 2019
-@Desc    :   按照规则下载pdf文件,知道请求无效停止
+@Desc    :   按照规则下载pdf文件,直到请求无效停止
 '''
-import os
-import sys
-import json
-import re
-import time
-import datetime
-from contextlib import closing
-import requests
-import utils.DownloadProgress as DownloadProgress
-from concurrent.futures import ThreadPoolExecutor
-import random
-import utils.user_agent as user_agent
+import crawl_mrdx
 
-def get_link():
-    pass
-
-def downNews(url, fileName):
-    with closing(requests.get(url=url,headers=user_agent.getheaders(),stream=True)) as response:
-        chunkSize=1024
-        contentSize=int(response.headers["content-length"])
-        if(os.path.exists(fileName) and os.path.getsize(fileName)==contentSize):
-            print("跳过"+fileName)
-        else:
-            progress=DownloadProgress.DownloadProgress(fileName,total=contentSize,unit="KB" ,
-                                                        chunk_size=chunkSize,run_status="downloading",fin_status="downloaded")
-            if not os.path.exists(os.path.dirname(fileName)):
-                os.makedirs(os.path.dirname(fileName))
-            with open(fileName,"wb") as file:
-                for data in response.iter_content(chunk_size=chunkSize):
-                    file.write(data)
-                    progress.refresh(count=len(data))
-
-def crawl():
-    pool = ThreadPoolExecutor(max_workers=10)  # 创建一个最大可容纳10个task的线程池
-    for i in range(1,3650):
-        # yestday = (datetime.date.today() + datetime.timedelta(-i)).strftime("%Y%m%d")
-        yestday = (datetime.datetime.strptime("2019-11-11","%Y-%m-%d").date() +  datetime.timedelta(-i)).strftime("%Y%m%d")
-        for j in range(1, 17):
-            fileName=r"./data/%s/0%s.pdf" %(yestday,j)
-            if(os.path.exists(fileName)):
-                print("跳过"+fileName)
-            else:
-                url = r"http://mrdx.cn/PDF/%s/0%s.pdf" % (yestday, j)
-                # 检查链接有效性
-                response=requests.head(url)
-                if response.status_code==200:
-                    # downNews(url,fileName)
-                    future1 = pool.submit(downNews,url, fileName)
-                    # time.sleep(random.randint(1,2))  # 文明爬虫
-
-if __name__ == "__main__":
-    start_time = time.time()
-    if not os.path.exists("data"):
-        os.makedirs("data")
-    crawl()
-    print("last time: {} s".format(time.time() - start_time))
+if __name__ == '__main__':
+    crawl_mrdx.main()

+ 0 - 0
README_files/1.jpg → screenshot/1.jpg