5 years ago · 4b265fa82a
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
 
				+*.pyc
			
--- a/main.py
+++ b/main.py
@@ -28,32 +28,36 @@ def downNews(url, fileName):
 
				     with closing(requests.get(url=url,headers=user_agent.getheaders(),stream=True)) as response:
			
 
				         chunkSize=1024
			
 
				         contentSize=int(response.headers["content-length"])
			
 
				-        fileD="./data/"+fileName
			
 
				-        if(os.path.exists(fileD) and os.path.getsize(fileD)==contentSize):
			
 
				+        if(os.path.exists(fileName) and os.path.getsize(fileName)==contentSize):
			
 
				             print("跳过"+fileName)
			
 
				         else:
			
 
				 
			
 
				             progress=DownloadProgress.DownloadProgress(fileName,total=contentSize,unit="KB" ,
			
 
				                                                         chunk_size=chunkSize,run_status="downloading",fin_status="downloaded")
			
 
				-            if not os.path.exists(os.path.dirname(fileD)):
			
 
				-                os.makedirs(os.path.dirname(fileD))
			
 
				-            with open(fileD,"wb") as file:
			
 
				+            if not os.path.exists(os.path.dirname(fileName)):
			
 
				+                os.makedirs(os.path.dirname(fileName))
			
 
				+            with open(fileName,"wb") as file:
			
 
				                 for data in response.iter_content(chunk_size=chunkSize):
			
 
				                     file.write(data)
			
 
				                     progress.refresh(count=len(data))
			
 
				 
			
 
				 def crawl():
			
 
				+    pool = ThreadPoolExecutor(max_workers=10)  # 创建一个最大可容纳10个task的线程池
			
 
				     for i in range(1,3650):
			
 
				         yestday = (datetime.date.today() +
			
 
				                 datetime.timedelta(-i)).strftime("%Y%m%d")
			
 
				         for j in range(1, 17):
			
 
				-            url = r"http://mrdx.cn/PDF/%s/0%s.pdf" % (yestday, j)
			
 
				-            # 检查链接有效性
			
 
				-            response=requests.head(url)
			
 
				-            if response.status_code==200:
			
 
				-                fileName=r"%s/0%s.pdf" %(yestday,j)
			
 
				-                downNews(url,fileName)
			
 
				-                time.sleep(random.randint(1,2))  # 文明爬虫
			
 
				+            fileName=r"./data/%s/0%s.pdf" %(yestday,j)
			
 
				+            if(os.path.exists(fileName)):
			
 
				+                print("跳过"+fileName)
			
 
				+            else:
			
 
				+                url = r"http://mrdx.cn/PDF/%s/0%s.pdf" % (yestday, j)
			
 
				+                # 检查链接有效性
			
 
				+                response=requests.head(url)
			
 
				+                if response.status_code==200:
			
 
				+                    # downNews(url,fileName)
			
 
				+                    future1 = pool.submit(downNews,url, fileName)
			
 
				+                    # time.sleep(random.randint(1,2))  # 文明爬虫
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     start_time = time.time()