|
@@ -31,7 +31,6 @@ def downNews(url, fileName):
|
|
if(os.path.exists(fileName) and os.path.getsize(fileName)==contentSize):
|
|
if(os.path.exists(fileName) and os.path.getsize(fileName)==contentSize):
|
|
print("跳过"+fileName)
|
|
print("跳过"+fileName)
|
|
else:
|
|
else:
|
|
-
|
|
|
|
progress=DownloadProgress.DownloadProgress(fileName,total=contentSize,unit="KB" ,
|
|
progress=DownloadProgress.DownloadProgress(fileName,total=contentSize,unit="KB" ,
|
|
chunk_size=chunkSize,run_status="downloading",fin_status="downloaded")
|
|
chunk_size=chunkSize,run_status="downloading",fin_status="downloaded")
|
|
if not os.path.exists(os.path.dirname(fileName)):
|
|
if not os.path.exists(os.path.dirname(fileName)):
|
|
@@ -44,8 +43,8 @@ def downNews(url, fileName):
|
|
def crawl():
|
|
def crawl():
|
|
pool = ThreadPoolExecutor(max_workers=10) # 创建一个最大可容纳10个task的线程池
|
|
pool = ThreadPoolExecutor(max_workers=10) # 创建一个最大可容纳10个task的线程池
|
|
for i in range(1,3650):
|
|
for i in range(1,3650):
|
|
- yestday = (datetime.date.today() +
|
|
|
|
- datetime.timedelta(-i)).strftime("%Y%m%d")
|
|
|
|
|
|
+ # yestday = (datetime.date.today() + datetime.timedelta(-i)).strftime("%Y%m%d")
|
|
|
|
+ yestday = (datetime.datetime.strptime("2019-11-11","%Y-%m-%d").date() + datetime.timedelta(-i)).strftime("%Y%m%d")
|
|
for j in range(1, 17):
|
|
for j in range(1, 17):
|
|
fileName=r"./data/%s/0%s.pdf" %(yestday,j)
|
|
fileName=r"./data/%s/0%s.pdf" %(yestday,j)
|
|
if(os.path.exists(fileName)):
|
|
if(os.path.exists(fileName)):
|