Browse Source

增加段子爬虫 (Add "duanzi" joke crawler)

liuyuqi-dellpc 2 years ago
parent
commit
9ff190de23

+ 2 - 1
crawl_xiaohua/conf/config.json

@@ -1,4 +1,5 @@
 {
     "cookie": "",
-    "indexPage": "123650"
+    "indexPage": "123650",
+    "indexDuanziPage": "845"
 }

+ 5 - 0
crawl_xiaohua/crawl_xiaohua/__init__.py

@@ -13,3 +13,8 @@ from crawl_xiaohua.crawl_xiaohua import CrawlXiaohua
 def main(argv=None):
     crawl = CrawlXiaohua()
     crawl.crawl()
+
+
def crawlDuanzi(argv=None):
    """Console entry point: run the joke ("duanzi") crawler.

    ``argv`` is accepted for console-script compatibility but unused.
    """
    CrawlXiaohua().crawlDuanzi()

+ 2 - 1
crawl_xiaohua/crawl_xiaohua/api.py

@@ -9,7 +9,8 @@
 
 _host = r"http://www.xiaohua.com"
 startUrl = _host + "/detail/"
-
+# http://www.xiaohua.com/duanzi?page=2
+startDuanziUrl=_host+"/duanzi/"
 
 
 

+ 22 - 0
crawl_xiaohua/crawl_xiaohua/crawl_xiaohua.py

@@ -15,6 +15,7 @@ from crawl_xiaohua.libs.json_conf import JsonConf
 import requests
 from crawl_xiaohua import api
 import bs4
+import pandas as pd
 
 headers = {
     "Authority": "img.xiaohua.com",
@@ -38,6 +39,7 @@ class CrawlXiaohua():
         self.jsonConf = JsonConf()
         self.conf = self.jsonConf.load()
         self.indexPage = self.conf.get('indexPage')
+        self.indexDuanziPage = self.conf.get('indexDuanziPage')
         # self.s.cookies.update(JsonConf().get_cookies())
 
     def crawl(self):
@@ -45,6 +47,26 @@ class CrawlXiaohua():
             self.getPicList()
             time.sleep(random.randint(1, 5))
 
def crawlDuanzi(self):
    """Crawl up to 10000 joke listing pages, persisting progress.

    Resumes from the page number stored in the config under
    ``indexDuanziPage``. The original loop iterated ``range(10000)`` from
    0 and saved the counter only *after* fetching, so the persisted resume
    point was ignored (crawling restarted at page 0) and the stored value
    always lagged one page behind the last fetch.
    """
    start = int(self.indexDuanziPage or 0)
    for page in range(start, start + 10000):
        # Record the page we are about to fetch so an interrupted run
        # resumes exactly here.
        self.indexDuanziPage = str(page)
        self.jsonConf.set({"indexDuanziPage": self.indexDuanziPage})
        self.getDuanziList()
        # Randomized delay to stay polite to the server.
        time.sleep(random.randint(1, 5))
+
def getDuanziList(self):
    """Fetch one joke listing page and append its texts to the CSV.

    GETs ``api.startDuanziUrl + "?page=" + self.indexDuanziPage`` and
    extracts the text of the first ``<a>`` inside the first ``<p>`` of
    each ``div.one-cont`` block. Blocks whose markup lacks that structure
    are skipped instead of raising AttributeError (the original chained
    ``.find('p').find('a')`` and crashed on a missing tag).
    """
    res = self.s.get(api.startDuanziUrl + "?page=" + self.indexDuanziPage)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    texts = []
    for block in soup.find_all('div', {'class': 'one-cont'}):
        p = block.find('p')
        a = p.find('a') if p is not None else None
        if a is not None:
            texts.append(a.text)
    self.saveDuanZiList(texts)
+
def saveDuanZiList(self, duanziList):
    """Append scraped joke texts to ``data/duanzhi.csv``.

    Writes UTF-8 with no header row; pandas also emits its integer index
    as the first column, so each row is ``<index>,<text>``. Creates the
    ``data`` directory first so a fresh checkout does not fail with
    FileNotFoundError on the append.
    """
    from pathlib import Path  # local import keeps the module imports untouched
    Path("data").mkdir(parents=True, exist_ok=True)
    pd.DataFrame(duanziList).to_csv(r"data/duanzhi.csv",
                                    mode='a', encoding='utf-8', header=False)
+
     def getPicList(self):
         res = self.s.get(api.startUrl + self.indexPage)
         resHtml = bs4.BeautifulSoup(res.text, 'html.parser')

+ 1 - 1
crawl_xiaohua/crawl_xiaohua/libs/json_conf.py

@@ -39,7 +39,7 @@ class JsonConf:
         for key in data_dict:
             json_obj[key] = data_dict[key]
         self.save(json_obj)
-        print(json.dumps(json_obj, indent=4))
+        # print(json.dumps(json_obj, indent=4))
 
     def get(self, key, default_val=""):
         '''

+ 2 - 1
crawl_xiaohua/main.py

@@ -9,4 +9,5 @@
 import crawl_xiaohua
 
 if __name__ == '__main__':
-    crawl_xiaohua.main()
+    # crawl_xiaohua.main()
+    crawl_xiaohua.crawlDuanzi()

+ 2 - 1
crawl_xiaohua/requirements.txt

@@ -1,2 +1,3 @@
 requests
-bs4
+bs4
+pandas