liuyuqi-dellpc 1 year ago
parent
commit
2c6fda0b2f
1 changed file with 27 additions and 15 deletions
  1. 27 15
      crawl_xiaohua/crawl_xiaohua/extractor/qinimg.py

+ 27 - 15
crawl_xiaohua/crawl_xiaohua/extractor/qinimg.py

@@ -18,10 +18,8 @@ from concurrent.futures import ThreadPoolExecutor
 
 class Qinimg(object):
     '''  '''
-    # _host = "https://www.qinimg.com"
-    _host="http://localhost:88"
-    _url = "https://www.qinimg.com/random"
-    _name = "qinimg"
+    _host = "https://www.qinimg.com"
+    # _host="http://localhost:88"
     _header = {
                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
                "Referer": "https://www.qinimg.com/",
@@ -79,10 +77,10 @@ class Qinimg(object):
     def get_list(self, next_page=None):
         ''' get all pic list '''
         if next_page is None:
-            next_page = self._url
+            next_page = self._host
         else:
             next_page = self._host + next_page
-        res=self.sess.get(self._host, headers=self._header)
+        res=self.sess.get(next_page, headers=self._header)
         res.encoding = 'utf-8'
         soup = BeautifulSoup(res.text, 'html.parser')
         imgs = soup.select('div.list_box ul')
@@ -93,7 +91,7 @@ class Qinimg(object):
                 title=img.select('a')[0]['title'].strip()
                 createtime=img.select('span')[1].text.strip()
                 self.save_girl(url,title,createtime)
-
+                self.get_pic_detail(url,title,createtime)
             except Exception as e:
                 print(e)
             finally:
@@ -132,21 +130,30 @@ Select * from qinimg_girl where uid = '{}' and isdelete = 0
         else:
             print(str(uid)+" is exist.")
 
-    def get_pic_detail(self, url):
+    def get_pic_detail(self, url, name, createtime):
         ''' get pic detail '''
+        uid=re.findall(r'(\d+)', url)[0]
+        index=0
+        url=self._host+url
         res=self.sess.get(url, headers=self._header)
         res.encoding = 'utf-8'
         soup = BeautifulSoup(res.text, 'html.parser')
-        img = soup.find('img')
         # find all img
-        imgs = soup.find_all('img')
-
+        imgs=soup.select('div#image p a')
         # save to sqlite, qinimg_girl_detail
         for img in imgs:
-            print(img['src'])
+            print(img['href'])
             # if not exist in sqlite, add to sqlite
             # if exist, check whether the pic is downloaded
-            self.download_all_pic(img['src'])
+            # self.download_all_pic(img['href'])
+            self.save_girl_detail(uid,name,index,createtime,url)
+
+    def save_girl_detail(self, uid,name,index,createtime,url):
+        sql='''
+insert into qinimg_girl_detail (uid,name,index,createtime,url) values ('{}', '{}', '{}', '{}', '{}')
+        '''
+        self.cursor.execute(sql.format(uid,name,index,createtime,url))
+        self.conn.commit()
 
     def download_all_pic(self):
         ''' download pic '''
@@ -169,14 +176,19 @@ Select * from qinimg_girl where uid = '{}' and isdelete = 0
             os.mkdir("data/{}".format(name))
         with open("data/{}/{}.jpg".format(name, index), "wb") as f:
             f.write(self.sess.get(url, headers=self._header).content)
+        sql='''
+        update qinimg_girl_detail set is_download = 1 where url = '{}'
+        '''
+        self.cursor.execute(sql.format(url))
+        self.conn.commit()
 
     def run(self):
         ''' run '''
         # get all pic
         init_page=self.get_config("init_page")
-        self.get_list(next_page=init_page)
+        self.get_list(next_page = init_page)
         # download pic
-        self.download_all_pic()
+        # self.download_all_pic()
         self.__release__()
 
     def set_config(self, key, value):