liuyuqi-dellpc 1 year ago
parent
commit
2cf3afa49a
2 changed files with 156 additions and 27 deletions
  1. 2 2
      README.md
  2. 154 25
      crawl_xiaohua/crawl_xiaohua/extractor/qinimg.py

+ 2 - 2
README.md

@@ -23,8 +23,8 @@ python xiaohua/main.py
 ```
 > cd /d C:/Users/dell/Desktop/xiaohua-crawl/meizi
 > sudo easy_install virtualenv
-> virtualenv venv
-> source venv/bin/activate
+> virtualenv .venv
+> source .venv/bin/activate
 > python setup.py --requires | xargs pip install
 
 * 妹子图:`python run.py crawl meizitu`

+ 154 - 25
crawl_xiaohua/crawl_xiaohua/extractor/qinimg.py

@@ -11,13 +11,15 @@ import requests
 import sys
 import os
 import re
-import json
+import json,time
 from bs4 import BeautifulSoup
 import sqlite3
+from concurrent.futures import ThreadPoolExecutor
 
 class Qinimg(object):
     '''  '''
-    _host = "https://www.qinimg.com/"
+    # _host = "https://www.qinimg.com"
+    _host="http://localhost:88"
     _url = "https://www.qinimg.com/random"
     _name = "qinimg"
     _header = {
@@ -34,34 +36,102 @@ class Qinimg(object):
 
     def __init__(self):
+        ''' Create the HTTP session, open the sqlite database and run init_database(). '''
         self.sess = requests.Session()
-        # init sqlite database
+        # 'xiaohua.db' is a relative path, so the file is resolved against the
+        # current working directory of the process
+        self.conn = sqlite3.connect('xiaohua.db')
+        # a single cursor is shared by every method of this class
+        self.cursor = self.conn.cursor()
+        self.init_database()
+        
+    def init_database(self):
+        ''' Create the crawler's sqlite tables when they do not exist yet.
+
+        qinimg_girl         entries collected from the listing pages (see save_girl)
+        qinimg_girl_detail  picture urls with an is_download flag (see download_all_pic)
+        qinimg_config       key/value settings used by get_config / set_config
+
+        The connection is left open on purpose; it is closed by __release__.
+        '''
+        # IF NOT EXISTS makes every statement safe to run on each start-up, so
+        # no separate lookup in sqlite_master is needed
+        statements = (
+            '''
+    CREATE TABLE IF NOT EXISTS "qinimg_girl" (
+    "id"  INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
+    "name"  TEXT,
+    "uid"  INTEGER,
+    "createtime"  TEXT,
+    "isdelete"  INTEGER NOT NULL DEFAULT 0,
+    "url"  TEXT
+    );
+    ''',
+            '''
+    CREATE TABLE IF NOT EXISTS "qinimg_girl_detail" (
+    "id"  INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
+    "url"  TEXT,
+    "is_download"  INTEGER NOT NULL DEFAULT 0,
+    "createtime"  TEXT
+    );
+    ''',
+            # get_config / set_config read and write this table
+            '''
+    CREATE TABLE IF NOT EXISTS "qinimg_config" (
+    "key"  TEXT PRIMARY KEY NOT NULL,
+    "value"  TEXT
+    );
+    ''',
+        )
+        for statement in statements:
+            self.cursor.execute(statement)
+        self.conn.commit()
 
-    def get_list(self,page=1):
+
-        ''' get all pic list '''
-        res=self.sess.get(self._host, headers=self._header)
-        res.encoding = 'utf-8'
-        soup = BeautifulSoup(res.text, 'html.parser')
-        img = soup.find('img')
-        # find all img
-        # /html/body/div[3]/div[2]/ul[1]/li[1]/a/img
-        # /html/body/div[3]/div[2]/ul[4]/li[1]/a/img
-        # div.list_box > ul:nth-child(5) > a
-        imgs = soup.find_all('img')
-
-        # save to sqlite, qinimg_girl
-        for img in imgs:
-            print(img['src'])
-            # if not exist in sqlite, add to sqlite
-            # if exist, skip
-
-        # find the next button
-        next = soup.find('a', {'class': 'next'})
-        # if exist next button and page < max_page
-        if next and int(next.text) < self.max_page:
-            self.get_list(page=int(next.text))
+    def get_list(self, next_page=None):
+        ''' Crawl the listing pages and store every entry via save_girl().
+
+        next_page: page to start from, either a path relative to ``_host`` or
+            an absolute url; ``self._url`` is used when it is None.
+
+        The pager's "Next" link is followed page after page. After a page has
+        been processed its url is saved under the "init_page" config key, so
+        the value always names the last page that was crawled.
+        '''
+        from urllib.parse import urljoin
+
+        # urljoin leaves absolute urls untouched, so a url stored by a previous
+        # run does not get the host prepended a second time
+        page_url = self._url if next_page is None else urljoin(self._host, next_page)
+        visited = set()
+        # iterate instead of recursing: one stack frame per page would hit the
+        # recursion limit on long listings
+        while page_url and page_url not in visited:
+            visited.add(page_url)
+            res = self.sess.get(page_url, headers=self._header)
+            res.encoding = 'utf-8'
+            soup = BeautifulSoup(res.text, 'html.parser')
+
+            # save to sqlite, qinimg_girl: one <ul> of div.list_box per entry
+            for item in soup.select('div.list_box ul'):
+                try:
+                    link = item.select('a')[0]
+                    url = link['href'].strip()
+                    title = link['title'].strip()
+                    createtime = item.select('span')[1].text.strip()
+                    self.save_girl(url, title, createtime)
+                except Exception as e:
+                    # best effort: a malformed entry must not abort the page
+                    print(e)
+
+            # checkpoint the page that was just processed
+            self.set_config("init_page", page_url)
+
+            # find the next button
+            # NOTE(review): index is the position of the link inside the pager,
+            # not a page number - confirm this is the intended max_page check
+            page_url = None
+            for index, pagenumber in enumerate(soup.select('div.pages a')):
+                if pagenumber.text == 'Next' and index < self.max_page:
+                    page_url = urljoin(self._host, pagenumber['href'])
+                    break
+
+    def save_girl(self, url:str, title:str, createtime:str):
+        ''' Insert one listing entry into qinimg_girl unless it is already stored.
+
+        url: link of the entry as found on the page; ``_host`` is prepended
+            before it is stored.
+        title: stored in the "name" column.
+        createtime: stored as text, exactly as scraped.
+
+        The first run of digits in ``url`` is used as uid. Raises IndexError
+        when ``url`` contains no digits.
+        '''
+        uid=re.findall(r'(\d+)', url)[0]
+        url=self._host+url
+        # values come from scraped html: bind them as parameters instead of
+        # formatting them into the statement, so quotes cannot break the sql
+        self.cursor.execute(
+            'SELECT 1 FROM qinimg_girl WHERE uid = ? AND isdelete = 0',
+            (int(uid),))
+        exists = self.cursor.fetchone()
+        if not exists:
+            self.cursor.execute(
+                'INSERT INTO qinimg_girl ("name", "uid", "createtime", "url") '
+                'VALUES (?, ?, ?, ?)',
+                (title, int(uid), createtime, url))
+            self.conn.commit()
         else:
-            return
-    
+            print(str(uid)+" is exist.")
+
     def get_pic_detail(self, url):
         ''' get pic detail '''
         res=self.sess.get(url, headers=self._header)
@@ -76,18 +146,77 @@ class Qinimg(object):
             print(img['src'])
             # if not exist in sqlite, add to sqlite
             # if exist, check whether the pic is downloaded
-            self.download(img['src'])
+            self.download_all_pic(img['src'])
 
-    def download(self, url):
-        self.sess.get(url, headers=self._header)
-        # save to local
+    def download_all_pic(self):
+        ''' Download every picture of qinimg_girl_detail that is still pending.
+
+        Rows with is_download = 0 are handed to a pool of five worker threads,
+        one submission per second. Each picture is written by _download_pic to
+        data/<self._name>/<row id>.jpg, and rows whose download finished
+        without an exception are flagged with is_download = 1.
+        '''
+        # NOTE(review): get_pic_detail calls this method with an image url
+        # argument, which this signature does not accept - confirm the
+        # intended contract with that caller.
+        # rows are plain tuples, so select the needed columns by name
+        self.cursor.execute(
+            "select id, url from qinimg_girl_detail where is_download = 0")
+        details = self.cursor.fetchall()
+        if not details:
+            return
+        futures = {}
+        # one shared pool; leaving the with-block waits for all downloads
+        with ThreadPoolExecutor(max_workers=5) as pool:
+            for detail_id, url in details:
+                # the detail table has no name/index columns: the crawler name
+                # is used as folder and the row id as file name
+                future = pool.submit(self._download_pic, url, self._name, detail_id)
+                futures[future] = detail_id
+                time.sleep(1)
+        # the sqlite connection belongs to this thread, so the flags are
+        # updated here and not inside the workers
+        for future, detail_id in futures.items():
+            error = future.exception()
+            if error is not None:
+                print(error)
+                continue
+            self.cursor.execute(
+                "update qinimg_girl_detail set is_download = 1 where id = ?",
+                (detail_id,))
+        self.conn.commit()
 
+    def _download_pic(self, url, name,index):
+        ''' Fetch ``url`` and save the response body as data/<name>/<index>.jpg.
+
+        Raises the requests exception for a failed or timed-out request and
+        for a 4xx/5xx answer; nothing is written to disk in that case.
+        '''
+        folder = os.path.join("data", str(name))
+        # exist_ok: several worker threads may create the same folder at once
+        os.makedirs(folder, exist_ok=True)
+        # fetch before opening the file so a failed request leaves no empty file
+        res = self.sess.get(url, headers=self._header, timeout=30)
+        res.raise_for_status()
+        with open(os.path.join(folder, "{}.jpg".format(index)), "wb") as f:
+            f.write(res.content)
 
     def run(self):
-        ''' run '''
-        self.get_list()
+        ''' Crawl the listing pages, then download every pending picture.
+
+        Crawling starts from the page stored under the "init_page" config key;
+        get_list falls back to its default start page when nothing is stored.
+        The database handles are released even when a step raises.
+        '''
+        try:
+            # get all pic
+            init_page=self.get_config("init_page")
+            self.get_list(next_page=init_page)
+            # download pic
+            self.download_all_pic()
+        finally:
+            self.__release__()
+
+    def set_config(self, key, value):
+        ''' Store ``value`` under ``key`` in qinimg_config.
+
+        An existing row for ``key`` is overwritten, otherwise a new row is
+        inserted. Database errors (for example a missing table) are printed
+        and not raised.
+        '''
+        try:
+            # parameters are bound by position: value first, then key
+            self.cursor.execute(
+                "update qinimg_config set value = ? where key = ?",
+                (value, key))
+            # rowcount 0 means no row for this key exists yet
+            if self.cursor.rowcount == 0:
+                self.cursor.execute(
+                    "insert into qinimg_config (key, value) values (?, ?)",
+                    (key, value))
+            self.conn.commit()
+        except sqlite3.Error as e:
+            print(e)
 
+    
+    def get_config(self, key):
+        ''' Return the value stored under ``key`` in qinimg_config.
+
+        Returns None when no row exists for ``key`` or when the query fails
+        (for example because the table has not been created).
+        '''
+        try:
+            self.cursor.execute(
+                "select value from qinimg_config where key = ?", (key,))
+            row = self.cursor.fetchone()
+        except sqlite3.Error:
+            return None
+        # only the value column is selected, so it is always at position 0
+        return row[0] if row is not None else None
 
+    def __release__(self):
+        ''' Close the shared cursor first, then the sqlite connection. '''
+        for handle in (self.cursor, self.conn):
+            handle.close()
+        
+# Script entry point: build the crawler and call run() once.
 if __name__ == "__main__":
     qinimg = Qinimg()
     qinimg.run()