|
@@ -15,6 +15,7 @@ from crawl_xiaohua.libs.json_conf import JsonConf
|
|
|
import requests
|
|
|
from crawl_xiaohua import api
|
|
|
import bs4
|
|
|
+import pandas as pd
|
|
|
|
|
|
headers = {
|
|
|
"Authority": "img.xiaohua.com",
|
|
@@ -38,6 +39,7 @@ class CrawlXiaohua():
|
|
|
self.jsonConf = JsonConf()
|
|
|
self.conf = self.jsonConf.load()
|
|
|
self.indexPage = self.conf.get('indexPage')
|
|
|
+ self.indexDuanziPage = self.conf.get('indexDuanziPage')
|
|
|
# self.s.cookies.update(JsonConf().get_cookies())
|
|
|
|
|
|
def crawl(self):
|
|
@@ -45,6 +47,26 @@ class CrawlXiaohua():
|
|
|
self.getPicList()
|
|
|
time.sleep(random.randint(1, 5))
|
|
|
|
|
|
def crawlDuanzi(self, maxPage=10000):
    """Crawl joke ("duanzi") list pages, resuming from the saved page.

    Starts at ``self.indexDuanziPage`` (page 0 when it is missing or not a
    number) and walks pages up to, but not including, ``maxPage``. After
    each page is fetched, the *next* page number is stored through
    ``self.jsonConf.set`` so an interrupted run continues where it stopped
    instead of restarting from page 0.

    Args:
        maxPage: Exclusive upper bound for the page number.
    """
    try:
        startPage = int(self.indexDuanziPage)
    except (TypeError, ValueError):
        # No usable saved progress: start from the first page.
        startPage = 0

    for page in range(startPage, maxPage):
        # Select the page *before* fetching; getDuanziList reads it from
        # self.indexDuanziPage.
        self.indexDuanziPage = str(page)
        self.getDuanziList()

        # Persist the next page to fetch, so a restart neither repeats
        # this page nor loses progress.
        self.indexDuanziPage = str(page + 1)
        self.jsonConf.set({"indexDuanziPage": self.indexDuanziPage})

        # Random pause between requests; nothing to wait for after the
        # last page.
        if page + 1 < maxPage:
            time.sleep(random.randint(1, 5))
|
|
|
+
|
|
|
def getDuanziList(self):
    """Fetch the current joke list page and save the joke texts.

    Requests ``api.startDuanziUrl`` with ``?page=<self.indexDuanziPage>``,
    collects the text of the first ``<a>`` inside the first ``<p>`` of
    every ``div.one-cont`` element, and passes the resulting list to
    ``self.saveDuanZiList``.

    Raises:
        ValueError: If ``self.indexDuanziPage`` is not set.
    """
    page = self.indexDuanziPage
    if page is None:
        raise ValueError("indexDuanziPage is not configured")

    # str() keeps the concatenation working when the page was loaded from
    # the configuration as an int rather than a string.
    res = self.s.get(api.startDuanziUrl + "?page=" + str(page))
    resHtml = bs4.BeautifulSoup(res.text, 'html.parser')

    duanziList = []
    for div in resHtml.find_all('div', {'class': 'one-cont'}):
        # find() returns None when the tag is absent; skip entries that
        # do not have the expected <p><a> structure instead of crashing.
        paragraph = div.find('p')
        link = paragraph.find('a') if paragraph is not None else None
        if link is None:
            continue
        duanziList.append(link.text)

    self.saveDuanZiList(duanziList)
|
|
|
+
|
|
|
def saveDuanZiList(self, duanziList):
    """Append the given joke texts to ``data/duanzhi.csv``.

    Each item becomes one CSV row made of the item's position in
    ``duanziList`` followed by its text. No header row is written and the
    file is opened in append mode, so repeated calls accumulate rows.

    Args:
        duanziList: List of joke strings to append.
    """
    import os

    # Appending fails if the target directory is missing, so create it on
    # first use.
    os.makedirs("data", exist_ok=True)

    pd.DataFrame(duanziList).to_csv(
        r"data/duanzhi.csv",
        mode='a',
        encoding='utf-8',
        header=False,
    )
|
|
|
+
|
|
|
def getPicList(self):
|
|
|
res = self.s.get(api.startUrl + self.indexPage)
|
|
|
resHtml = bs4.BeautifulSoup(res.text, 'html.parser')
|