
Add multithreading

liuyuqi-dellpc 5 years ago
commit 4b265fa82a
2 changed files with 17 additions and 12 deletions
  1. .gitignore: 1 addition, 0 deletions
  2. main.py: 16 additions, 12 deletions

.gitignore: +1 -0

@@ -0,0 +1 @@
+*.pyc

main.py: +16 -12

@@ -28,32 +28,36 @@ def downNews(url, fileName):
     with closing(requests.get(url=url,headers=user_agent.getheaders(),stream=True)) as response:
         chunkSize=1024
         contentSize=int(response.headers["content-length"])
-        fileD="./data/"+fileName
-        if(os.path.exists(fileD) and os.path.getsize(fileD)==contentSize):
+        if(os.path.exists(fileName) and os.path.getsize(fileName)==contentSize):
             print("跳过"+fileName)
         else:
 
             progress=DownloadProgress.DownloadProgress(fileName,total=contentSize,unit="KB" ,
                                                         chunk_size=chunkSize,run_status="downloading",fin_status="downloaded")
-            if not os.path.exists(os.path.dirname(fileD)):
-                os.makedirs(os.path.dirname(fileD))
-            with open(fileD,"wb") as file:
+            if not os.path.exists(os.path.dirname(fileName)):
+                os.makedirs(os.path.dirname(fileName))
+            with open(fileName,"wb") as file:
                 for data in response.iter_content(chunk_size=chunkSize):
                     file.write(data)
                     progress.refresh(count=len(data))
 
 def crawl():
+    pool = ThreadPoolExecutor(max_workers=10)  # create a thread pool that can hold at most 10 tasks
     for i in range(1,3650):
         yestday = (datetime.date.today() +
                 datetime.timedelta(-i)).strftime("%Y%m%d")
         for j in range(1, 17):
-            url = r"http://mrdx.cn/PDF/%s/0%s.pdf" % (yestday, j)
-            # check that the link is valid
-            response=requests.head(url)
-            if response.status_code==200:
-                fileName=r"%s/0%s.pdf" %(yestday,j)
-                downNews(url,fileName)
-                time.sleep(random.randint(1,2))  # be a polite crawler
+            fileName=r"./data/%s/0%s.pdf" %(yestday,j)
+            if(os.path.exists(fileName)):
+                print("Skipping "+fileName)
+            else:
+                url = r"http://mrdx.cn/PDF/%s/0%s.pdf" % (yestday, j)
+                # check that the link is valid
+                response=requests.head(url)
+                if response.status_code==200:
+                    # downNews(url,fileName)
+                    future1 = pool.submit(downNews,url, fileName)
+                    # time.sleep(random.randint(1,2))  # be a polite crawler
 
 if __name__ == "__main__":
     start_time = time.time()
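The core change of this commit is moving the per-page download out of the sequential loop and submitting it as tasks to a concurrent.futures.ThreadPoolExecutor. Below is a minimal, self-contained sketch of that pattern, assuming plain requests in place of the project's own user_agent and DownloadProgress helpers; the download_pdf/crawl names, the date range, and the timeouts are illustrative, not the repository's exact code, and the sketch uses a with-block so it waits for all queued downloads to finish (the diff itself leaves the pool running and discards the returned future).

import datetime
import os
from concurrent.futures import ThreadPoolExecutor

import requests


def download_pdf(url, file_name):
    # Stream the PDF to disk in 1 KB chunks; skip files that already match
    # the server-reported content-length, mirroring the diff's downNews().
    with requests.get(url, stream=True, timeout=30) as response:
        content_size = int(response.headers["content-length"])
        if os.path.exists(file_name) and os.path.getsize(file_name) == content_size:
            print("Skipping " + file_name)
            return
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                f.write(chunk)


def crawl(days=7):
    # One shared pool; at most 10 downloads run concurrently.
    with ThreadPoolExecutor(max_workers=10) as pool:
        for i in range(1, days + 1):
            day = (datetime.date.today() - datetime.timedelta(days=i)).strftime("%Y%m%d")
            for j in range(1, 17):
                file_name = "./data/%s/0%s.pdf" % (day, j)
                if os.path.exists(file_name):
                    print("Skipping " + file_name)
                    continue
                url = "http://mrdx.cn/PDF/%s/0%s.pdf" % (day, j)
                # Cheap HEAD request to confirm the page exists before queueing it.
                if requests.head(url, timeout=10).status_code == 200:
                    pool.submit(download_pdf, url, file_name)
    # Exiting the with-block calls shutdown(wait=True), so crawl() returns
    # only after every submitted download has finished.


if __name__ == "__main__":
    crawl()

One design note: the commit also comments out the per-request time.sleep. With 10 workers hitting the same host, some throttling (for example a small delay inside the download task) may still be worth keeping to stay polite to the server.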