liuyuqi-dellpc 1 year ago
parent
commit
83b94e59e8

+ 1 - 1
README.md

@@ -1,4 +1,4 @@
-# xiaohua-crawl
+# xiaohua_crawl
 
 目前本项目包含两个小项目:校花网爬虫,妹子网爬虫。后续爬取结果陆续发布,并定时追加更新。
 

+ 0 - 0
crawl_xiaohua/crawl_xiaohua/downloader/__init__.py


+ 0 - 0
crawl_xiaohua/crawl_xiaohua/extractor/__init__.py


+ 16 - 0
crawl_xiaohua/crawl_xiaohua/extractor/base_extractor.py

@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/09/14 07:15:00
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   Defines BaseExtractor, a placeholder base class whose methods are no-ops.
+'''
+
+class BaseExtractor(object):
+    
+    def __init__(self):
+        pass
+
+    def download(self):
+        pass

+ 93 - 0
crawl_xiaohua/crawl_xiaohua/extractor/qinimg.py

@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/09/14 07:14:07
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   Qinimg class: fetches pages from the site below and prints the image URLs found.
+https://www.qinimg.com/
+'''
+import requests
+import sys
+import os
+import re
+import json
+from bs4 import BeautifulSoup
+import sqlite3
+
+class Qinimg(object):
+    '''  '''
+    _host = "https://www.qinimg.com/"
+    _url = "https://www.qinimg.com/random"
+    _name = "qinimg"
+    _header = {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+               "Referer": "https://www.qinimg.com/",
+               "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
+               "Accept-Encoding": "gzip, deflate, br",
+               "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+               "Cache-Control": "no-cache",
+               "Connection": "keep-alive",
+               "Host": "www.qinimg.com"
+               }
+    max_page = 1000
+
+    def __init__(self):
+        self.sess = requests.Session()
+        # init sqlite database
+
+    def get_list(self,page=1):
+        ''' get all pic list '''
+        res=self.sess.get(self._host, headers=self._header)
+        res.encoding = 'utf-8'
+        soup = BeautifulSoup(res.text, 'html.parser')
+        img = soup.find('img')
+        # find all img
+        # /html/body/div[3]/div[2]/ul[1]/li[1]/a/img
+        # /html/body/div[3]/div[2]/ul[4]/li[1]/a/img
+        # div.list_box > ul:nth-child(5) > a
+        imgs = soup.find_all('img')
+
+        # save to sqlite, qinimg_girl
+        for img in imgs:
+            print(img['src'])
+            # if not exist in sqlite, add to sqlite
+            # if exist, skip
+
+        # find the next button
+        next = soup.find('a', {'class': 'next'})
+        # if exist next button and page < max_page
+        if next and int(next.text) < self.max_page:
+            self.get_list(page=int(next.text))
+        else:
+            return
+    
+    def get_pic_detail(self, url):
+        ''' get pic detail '''
+        res=self.sess.get(url, headers=self._header)
+        res.encoding = 'utf-8'
+        soup = BeautifulSoup(res.text, 'html.parser')
+        img = soup.find('img')
+        # find all img
+        imgs = soup.find_all('img')
+
+        # save to sqlite, qinimg_girl_detail
+        for img in imgs:
+            print(img['src'])
+            # if not exist in sqlite, add to sqlite
+            # if exist, check whether the pic is downloaded
+            self.download(img['src'])
+
+    def download(self, url):
+        self.sess.get(url, headers=self._header)
+        # save to local
+
+
+    def run(self):
+        ''' run '''
+        self.get_list()
+
+
+if __name__ == "__main__":
+    qinimg = Qinimg()
+    qinimg.run()

+ 8 - 0
crawl_xiaohua/crawl_xiaohua/update.py

@@ -0,0 +1,8 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/09/14 07:12:53
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+'''

+ 8 - 0
crawl_xiaohua/crawl_xiaohua/utils.py

@@ -0,0 +1,8 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/09/14 07:12:42
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+'''

+ 1 - 1
crawl_xiaohua/crawl_xiaohua/version.py

@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2022.05.24'
+__version__ = '2023.09.09'

+ 2 - 1
crawl_xiaohua/requirements.txt

@@ -1,3 +1,4 @@
 requests
 bs4
-pandas
+pandas
+lxml