1 year ago · 83b94e59e8
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
 
				-# xiaohua-crawl
			
 
				+# xiaohua_crawl
			
 
				 
			
 
				 目前本项目包含两个小项目：校花网爬虫，妹子网爬虫。后续爬取结果陆续发布，并定时追加更新。
			
 
				 
			
--- a/crawl_xiaohua/crawl_xiaohua/downloader/__init__.py
+++ b/crawl_xiaohua/crawl_xiaohua/downloader/__init__.py
--- a/crawl_xiaohua/crawl_xiaohua/extractor/__init__.py
+++ b/crawl_xiaohua/crawl_xiaohua/extractor/__init__.py
--- a/crawl_xiaohua/crawl_xiaohua/extractor/base_extractor.py
+++ b/crawl_xiaohua/crawl_xiaohua/extractor/base_extractor.py
@@ -0,0 +1,16 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- encoding: utf-8 -*-
			
 
				+'''
			
 
				+@Contact :   liuyuqi.gov@msn.cn
			
 
				+@Time    :   2023/09/14 07:15:00
			
 
				+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
			
 
				+@Desc    :   
			
 
				+'''
			
 
				+
			
 
				+class BaseExtractor(object):
			
 
				+    
			
 
				+    def __init__(self):
			
 
				+        pass
			
 
				+
			
 
				+    def download(self):
			
 
				+        pass
			
--- a/crawl_xiaohua/crawl_xiaohua/extractor/qinimg.py
+++ b/crawl_xiaohua/crawl_xiaohua/extractor/qinimg.py
@@ -0,0 +1,93 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- encoding: utf-8 -*-
			
 
				+'''
			
 
				+@Contact :   liuyuqi.gov@msn.cn
			
 
				+@Time    :   2023/09/14 07:14:07
			
 
				+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
			
 
				+@Desc    :   
			
 
				+https://www.qinimg.com/
			
 
				+'''
			
 
				+import requests
			
 
				+import sys
			
 
				+import os
			
 
				+import re
			
 
				+import json
			
 
				+from bs4 import BeautifulSoup
			
 
				+import sqlite3
			
 
				+
			
 
				+class Qinimg(object):
			
 
				+    '''  '''
			
 
				+    _host = "https://www.qinimg.com/"
			
 
				+    _url = "https://www.qinimg.com/random"
			
 
				+    _name = "qinimg"
			
 
				+    _header = {
			
 
				+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
			
 
				+               "Referer": "https://www.qinimg.com/",
			
 
				+               "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
			
 
				+               "Accept-Encoding": "gzip, deflate, br",
			
 
				+               "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
			
 
				+               "Cache-Control": "no-cache",
			
 
				+               "Connection": "keep-alive",
			
 
				+               "Host": "www.qinimg.com"
			
 
				+               }
			
 
				+    max_page = 1000
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self.sess = requests.Session()
			
 
				+        # init sqlite database
			
 
				+
			
 
				+    def get_list(self,page=1):
			
 
				+        ''' get all pic list '''
			
 
				+        res=self.sess.get(self._host, headers=self._header)
			
 
				+        res.encoding = 'utf-8'
			
 
				+        soup = BeautifulSoup(res.text, 'html.parser')
			
 
				+        img = soup.find('img')
			
 
				+        # find all img
			
 
				+        # /html/body/div[3]/div[2]/ul[1]/li[1]/a/img
			
 
				+        # /html/body/div[3]/div[2]/ul[4]/li[1]/a/img
			
 
				+        # div.list_box > ul:nth-child(5) > a
			
 
				+        imgs = soup.find_all('img')
			
 
				+
			
 
				+        # save to sqlite, qinimg_girl
			
 
				+        for img in imgs:
			
 
				+            print(img['src'])
			
 
				+            # if not exist in sqlite, add to sqlite
			
 
				+            # if exist, skip
			
 
				+
			
 
				+        # find the next button
			
 
				+        next = soup.find('a', {'class': 'next'})
			
 
				+        # if exist next button and page < max_page
			
 
				+        if next and int(next.text) < self.max_page:
			
 
				+            self.get_list(page=int(next.text))
			
 
				+        else:
			
 
				+            return
			
 
				+    
			
 
				+    def get_pic_detail(self, url):
			
 
				+        ''' get pic detail '''
			
 
				+        res=self.sess.get(url, headers=self._header)
			
 
				+        res.encoding = 'utf-8'
			
 
				+        soup = BeautifulSoup(res.text, 'html.parser')
			
 
				+        img = soup.find('img')
			
 
				+        # find all img
			
 
				+        imgs = soup.find_all('img')
			
 
				+
			
 
				+        # save to sqlite, qinimg_girl_detail
			
 
				+        for img in imgs:
			
 
				+            print(img['src'])
			
 
				+            # if not exist in sqlite, add to sqlite
			
 
				+            # if exist, check whether the pic is downloaded
			
 
				+            self.download(img['src'])
			
 
				+
			
 
				+    def download(self, url):
			
 
				+        self.sess.get(url, headers=self._header)
			
 
				+        # save to local
			
 
				+
			
 
				+
			
 
				+    def run(self):
			
 
				+        ''' run '''
			
 
				+        self.get_list()
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    qinimg = Qinimg()
			
 
				+    qinimg.run()
			
--- a/crawl_xiaohua/crawl_xiaohua/update.py
+++ b/crawl_xiaohua/crawl_xiaohua/update.py
@@ -0,0 +1,8 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- encoding: utf-8 -*-
			
 
				+'''
			
 
				+@Contact :   liuyuqi.gov@msn.cn
			
 
				+@Time    :   2023/09/14 07:12:53
			
 
				+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
			
 
				+@Desc    :   
			
 
				+'''
			
--- a/crawl_xiaohua/crawl_xiaohua/utils.py
+++ b/crawl_xiaohua/crawl_xiaohua/utils.py
@@ -0,0 +1,8 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- encoding: utf-8 -*-
			
 
				+'''
			
 
				+@Contact :   liuyuqi.gov@msn.cn
			
 
				+@Time    :   2023/09/14 07:12:42
			
 
				+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
			
 
				+@Desc    :   
			
 
				+'''
			
--- a/crawl_xiaohua/crawl_xiaohua/version.py
+++ b/crawl_xiaohua/crawl_xiaohua/version.py
@@ -1,3 +1,3 @@
 
				 from __future__ import unicode_literals
			
 
				 
			
 
				-__version__ = '2022.05.24'
			
 
				+__version__ = '2023.09.09'
			
--- a/crawl_xiaohua/requirements.txt
+++ b/crawl_xiaohua/requirements.txt
@@ -1,3 +1,4 @@
 
				 requests
			
 
				 bs4
			
 
				-pandas
			
 
				+pandas
			
 
				+lxml