liuyuqi-dellpc committed 1 year ago
commit 20788980df
5 changed files with 37 additions and 18 deletions
  1. README.md (+2 -1)
  2. crawl_mrdx/__init__.py (+2 -2)
  3. crawl_mrdx/api.py (+3 -1)
  4. crawl_mrdx/crawl_mrdx.py (+29 -12)
  5. main.py (+1 -2)

+ 2 - 1
README.md

@@ -8,7 +8,7 @@ cd my_project_dir
 virtualenv -p /opt/python/bin/python3 venv
 source venv/bin/activate
 pip install -r requirements.txt
-python main.py
+python main.py --start 20210525 --end 20210525
 
 ```
 
@@ -19,5 +19,6 @@ python main.py
 Downloaded so far: ./data/20130822/07.pdf, covering 2275 days of the news daily, 16 GB in total.
 
 
+## History
 
 

+ 2 - 2
crawl_mrdx/__init__.py

@@ -11,6 +11,6 @@ import time
 from crawl_mrdx.crawl_mrdx import CrawlMrdx
 
 
-def main(argv=None):
+def main(start: str, end: str):
     crawl = CrawlMrdx()
-    crawl.crawl()
+    crawl.crawl(start, end)

+ 3 - 1
crawl_mrdx/api.py

@@ -8,6 +8,8 @@
 '''
 
 _host = r"http://mrdx.cn"
-pdfUrl = _host + "/PDF/%s/0%s.pdf"
+api_host = r"http://xx.com"  # placeholder host for the update API
 
+pdfUrl = _host + "/PDF/%s/0%s.pdf"
+get_version = api_host + "/api/get_version"
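
The new `get_version` URL is defined but not yet called anywhere in this commit; below is a minimal sketch of how the `update()` stub added in crawl_mrdx.py might poll it. The `check_for_update` helper and the JSON response shape (`{"version": ...}`) are assumptions, and `http://xx.com` is left as the placeholder it is.

```python
import requests

from crawl_mrdx import api


def check_for_update(current_version: str) -> bool:
    """Return True if the server reports a different version (assumed schema)."""
    try:
        resp = requests.get(api.get_version, timeout=5)
        resp.raise_for_status()
        latest = resp.json().get("version")  # "version" field is an assumption
        return latest is not None and latest != current_version
    except requests.RequestException:
        return False  # on network errors, assume no update is available
```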
 

+ 29 - 12
crawl_mrdx/crawl_mrdx.py

@@ -28,9 +28,18 @@ class CrawlMrdx():
     def __init__(self):
         self.jsonConf = JsonConf()
         self.conf = self.jsonConf.load()
-        self.currentDate = self.conf.get('currentDate')
+        self.start_date = self.conf.get('startDate')
+        self.end_date = self.conf.get("endDate")
+
+    def update(self):
+        '''check the server for a newer version and update the app (stub)'''
+        pass
 
     def downNews(self, url, fileName):
+        '''download news
+        :param url: news url
+        :param fileName: saved file name
+        '''
         with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
             chunkSize = 1024
             contentSize = int(response.headers["content-length"])
@@ -46,29 +55,37 @@ class CrawlMrdx():
                         file.write(data)
                         progress.refresh(count=len(data))
 
-    def crawl(self):
-        start_time = time.time()
+    def crawl(self, start: str, end: str):
+        '''crawl news
+        :param start: start date
+        :param end: end date
+        '''
+        start_time = time.time()  # record start time to measure elapsed time
         if not os.path.exists("data"):
             os.makedirs("data")
         pool = ThreadPoolExecutor(max_workers=10)  # thread pool capped at 10 concurrent download tasks
         index = 1
-        while True:
-            yestday = (datetime.datetime.strptime(
-                self.currentDate, "%Y%m%d").date() + datetime.timedelta(index)).strftime("%Y%m%d")
+        start_date = datetime.datetime.strptime(start, "%Y%m%d")
+        end_date = datetime.datetime.strptime(end, "%Y%m%d")
+
+        current_date = start_date
+        while current_date <= end_date:
+            current_date_str = current_date.strftime("%Y%m%d")
+            print(current_date_str)
             for j in range(1, 17):
-                fileName = r"./data/%s/0%s.pdf" % (yestday, j)
+                fileName = r"./data/%s/0%s.pdf" % (current_date_str, j)
                 if(os.path.exists(fileName)):
                     print("跳过" + fileName)
                 else:
-                    url = api.pdfUrl % (yestday, j)
+                    url = api.pdfUrl % (current_date_str, j)
                     # 检查链接有效性
                     response = requests.head(url)
                     if response.status_code == 200:
                         # downNews(url,fileName)
                         future1 = pool.submit(self.downNews, url, fileName)
                          time.sleep(random.randint(1, 2))  # be a polite crawler: throttle requests
-            self.jsonConf.set({"currentDate": yestday})
-            if(yestday == self.conf.get('endDate')):
+            self.jsonConf.set({"currentDate": current_date_str})
+            if current_date_str == self.conf.get('endDate'):
                 break
-            index += 1
-        print("last time: {} s".format(time.time() - start_time))
+            current_date += datetime.timedelta(days=1)
+        print("last time: {} s".format(time.time() - start_time))

+ 1 - 2
main.py

@@ -9,6 +9,5 @@
 @Desc    :   Download PDF files by the URL pattern, stopping once requests become invalid
 '''
 import crawl_mrdx
-
 if __name__ == '__main__':
-    crawl_mrdx.main()
+    crawl_mrdx.main(start='20210525', end='20210525')
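
The README now documents `--start`/`--end` flags, while this `main.py` passes literal dates. A hedged sketch of argparse wiring that would match the documented invocation (flag names taken from the README; the defaults and help strings are assumptions):

```python
# Hypothetical CLI wiring to match "python main.py --start ... --end ...";
# this commit itself hardcodes the dates instead.
import argparse

import crawl_mrdx

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Crawl mrdx.cn daily PDFs')
    parser.add_argument('--start', required=True, help='start date, YYYYMMDD')
    parser.add_argument('--end', required=True, help='end date, YYYYMMDD')
    args = parser.parse_args()
    crawl_mrdx.main(start=args.start, end=args.end)
```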