|
@@ -28,32 +28,36 @@ def downNews(url, fileName):
|
|
|
with closing(requests.get(url=url,headers=user_agent.getheaders(),stream=True)) as response:
|
|
|
chunkSize=1024
|
|
|
contentSize=int(response.headers["content-length"])
|
|
|
- fileD="./data/"+fileName
|
|
|
- if(os.path.exists(fileD) and os.path.getsize(fileD)==contentSize):
|
|
|
+ if(os.path.exists(fileName) and os.path.getsize(fileName)==contentSize):
|
|
|
print("跳过"+fileName)
|
|
|
else:
|
|
|
|
|
|
progress=DownloadProgress.DownloadProgress(fileName,total=contentSize,unit="KB" ,
|
|
|
chunk_size=chunkSize,run_status="downloading",fin_status="downloaded")
|
|
|
- if not os.path.exists(os.path.dirname(fileD)):
|
|
|
- os.makedirs(os.path.dirname(fileD))
|
|
|
- with open(fileD,"wb") as file:
|
|
|
+ if not os.path.exists(os.path.dirname(fileName)):
|
|
|
+ os.makedirs(os.path.dirname(fileName))
|
|
|
+ with open(fileName,"wb") as file:
|
|
|
for data in response.iter_content(chunk_size=chunkSize):
|
|
|
file.write(data)
|
|
|
progress.refresh(count=len(data))
|
|
|
|
|
|
def crawl():
    """Download every missing page PDF for the previous 3649 days.

    For each day (yesterday back to 3649 days ago) and each page number
    1..16, the target path ``./data/<YYYYMMDD>/0<page>.pdf`` is checked.
    Files that already exist are skipped; otherwise the remote URL is
    probed with a HEAD request and, on HTTP 200, ``downNews`` is queued on
    a pool of 10 worker threads.

    The function returns only after every queued download has finished, so
    callers that time ``crawl()`` measure the real download duration.
    Downloads that raised are reported on stdout instead of being lost.
    """
    # The ``with`` block calls pool.shutdown(wait=True) on exit, so no worker
    # is still writing a file once crawl() returns.
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = {}  # Future -> target file name, kept for error reporting
        today = datetime.date.today()
        for i in range(1, 3650):
            yestday = (today + datetime.timedelta(-i)).strftime("%Y%m%d")
            for j in range(1, 17):
                # NOTE(review): "0%s" yields "010".."016" for pages >= 10;
                # confirm that this matches the site's naming scheme.
                fileName = r"./data/%s/0%s.pdf" % (yestday, j)
                # NOTE(review): only existence is checked here, so a file
                # truncated by an interrupted run is never re-downloaded.
                if os.path.exists(fileName):
                    print("跳过" + fileName)
                    continue
                url = r"http://mrdx.cn/PDF/%s/0%s.pdf" % (yestday, j)
                # Check that the link is valid before queuing the download.
                response = requests.head(url)
                if response.status_code == 200:
                    pending[pool.submit(downNews, url, fileName)] = fileName

        # Future.exception() blocks until the task is done; surface failures
        # that would otherwise vanish inside the worker threads.
        for future, fileName in pending.items():
            error = future.exception()
            if error is not None:
                print("下载失败" + fileName + ": " + repr(error))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
start_time = time.time()
|