@@ -28,9 +28,18 @@ class CrawlMrdx():
     def __init__(self):
         self.jsonConf = JsonConf()
         self.conf = self.jsonConf.load()
-        self.currentDate = self.conf.get('currentDate')
+        self.start_date = self.conf.get('startDate')
+        self.end_date = self.conf.get("endDate")
+
+    def update(self):
+        '''update app'''
+        pass
 
     def downNews(self, url, fileName):
+        '''download news
+        :param url: news url
+        :param fileName: saved file name
+        '''
         with closing(requests.get(url=url, headers=user_agent.getheaders(), stream=True)) as response:
             chunkSize = 1024
             contentSize = int(response.headers["content-length"])
@@ -46,29 +55,37 @@ class CrawlMrdx():
                     file.write(data)
                     progress.refresh(count=len(data))
 
-    def crawl(self):
-        start_time = time.time()
+    def crawl(self, start: str, end: str):
+        '''crawl news
+        :param start: start date (YYYYMMDD)
+        :param end: end date (YYYYMMDD)
+        '''
+        start_time = time.time()  # track elapsed time
         if not os.path.exists("data"):
             os.makedirs("data")
         pool = ThreadPoolExecutor(max_workers=10)  # thread pool capped at 10 concurrent tasks
         index = 1
-        while True:
-            yestday = (datetime.datetime.strptime(
-                self.currentDate, "%Y%m%d").date() + datetime.timedelta(index)).strftime("%Y%m%d")
+        start_date = datetime.datetime.strptime(start, "%Y%m%d")
+        end_date = datetime.datetime.strptime(end, "%Y%m%d")
+
+        current_date = start_date
+        while current_date <= end_date:
+            current_date_str = current_date.strftime("%Y%m%d")
+            print(current_date_str)
             for j in range(1, 17):
-                fileName = r"./data/%s/0%s.pdf" % (yestday, j)
+                fileName = r"./data/%s/0%s.pdf" % (current_date_str, j)
                 if(os.path.exists(fileName)):
                     print("跳过" + fileName)
                 else:
-                    url = api.pdfUrl % (yestday, j)
+                    url = api.pdfUrl % (current_date_str, j)
                     # check that the link is still valid
                     response = requests.head(url)
                     if response.status_code == 200:
                         # downNews(url,fileName)
                         future1 = pool.submit(self.downNews, url, fileName)
                         time.sleep(random.randint(1, 2))  # polite crawl delay
-            self.jsonConf.set({"currentDate": yestday})
-            if(yestday == self.conf.get('endDate')):
+            self.jsonConf.set({"currentDate": current_date_str})
+            if(current_date_str == self.conf.get('endDate')):
                 break
-            index += 1
-            print("last time: {} s".format(time.time() - start_time))
+            current_date += datetime.timedelta(days=1)
+        print("last time: {} s".format(time.time() - start_time))
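A minimal usage sketch (not part of the patch): assuming the startDate/endDate keys read in __init__ hold "%Y%m%d" strings, the new crawl(start, end) signature could be driven roughly like this.

if __name__ == "__main__":
    crawler = CrawlMrdx()
    # both dates come from the JSON config loaded by JsonConf, as "%Y%m%d" strings
    crawler.crawl(crawler.start_date, crawler.end_date)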