liuyuqi-dellpc 1 year ago
parent
commit
a5144e3f3b
2 changed files with 55 additions and 32 deletions
  1. 3 0
      .gitignore
  2. 52 32
      crawl_xiaohua/crawl_xiaohua/extractor/qinimg.py

+ 3 - 0
.gitignore

@@ -82,3 +82,6 @@ vignettes/*.pdf
 
 /images
 /crawl_xiaohua/data
+data/
+xiaohua.db
+xiaohua.db-journal

+ 52 - 32
crawl_xiaohua/crawl_xiaohua/extractor/qinimg.py

@@ -20,17 +20,25 @@ class Qinimg(object):
     '''  '''
     _host = "https://www.qinimg.com"
     # _host="http://localhost:88"
+    
     _header = {
                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
                "Referer": "https://www.qinimg.com/",
-               "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
-               "Accept-Encoding": "gzip, deflate, br",
-               "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-               "Cache-Control": "no-cache",
-               "Connection": "keep-alive",
-               "Host": "www.qinimg.com"
+               "authority": "www.qinimg.com",
+                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+                "accept-language": "en-US,en;q=0.9",
+                "cache-control": "max-age=0",
+                "dnt": "1",
+                "sec-ch-ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
+                "sec-ch-ua-mobile": "?0",
+                "sec-ch-ua-platform": '"Windows"',
+                "sec-fetch-dest": "document",
+                "sec-fetch-mode": "navigate",
+                "sec-fetch-site": "same-origin",
+                "sec-fetch-user": "?1",
+                "upgrade-insecure-requests": "1"
                }
-    max_page = 1
+    max_page = 10
 
     def __init__(self):
         self.sess = requests.Session()
@@ -77,16 +85,15 @@ class Qinimg(object):
 
     def get_list(self, next_page=None):
         ''' get all pic list '''
-        print("init page: "+next_page)
+        print("init page: " + next_page)
         if next_page is None:
             next_page = self._host
         else:
-            next_page = self._host + next_page
+            next_page = next_page
+
         res=self.sess.get(next_page, headers=self._header)
-        # res.encoding = 'utf-8'
+        res.encoding = 'utf-8'
         soup = BeautifulSoup(res.text, 'html.parser')
-        with open("data/list.html", "w", encoding="utf-8") as f:
-            f.write(res.text)
         imgs = soup.select('div.list_box ul')
         # save to sqlite, qinimg_girl
         for img in imgs:
@@ -107,7 +114,8 @@ class Qinimg(object):
         for pagenumber in pagenumbers:
             if pagenumber.text == 'Next' and index < self.max_page:
                 next = pagenumber
-                self.get_list(next_page = next['href'])
+                next_page = self._host + next['href']
+                self.get_list(next_page=next_page)
                 break
             index=index + 1
         self.set_config("init_page", next_page)
@@ -137,29 +145,36 @@ Select * from qinimg_girl where uid = '{}' and isdelete = 0
 
     def get_pic_detail(self, url, name, createtime):
         ''' get pic detail '''
-        uid=re.findall(r'(\d+)', url)[0]
-        index=0
-        url=self._host+url
-        res=self.sess.get(url, headers=self._header)
-        res.encoding = 'utf-8'
-        soup = BeautifulSoup(res.text, 'html.parser')
-        # find all img
-        imgs=soup.select('div#image p a')
-        # save to sqlite, qinimg_girl_detail
-        for img in imgs:
-            print(img['href'])
-            # if not exist in sqlite, add to sqlite
-            # if exist, check whether the pic is downloaded
-            # self.download_all_pic(img['href'])
-            self.save_girl_detail(uid,name,index,createtime,url)
+        uid = re.findall(r'(\d+)', url)[0]
+        pic_index = 0
+        url = self._host + url
+        try:
+            res=self.sess.get(url, headers=self._header)
+            res.encoding = 'utf-8'
+            soup = BeautifulSoup(res.text, 'html.parser')
+            # find all img
+            imgs=soup.select('div#image p a')
+            # save to sqlite, qinimg_girl_detail
+            for img in imgs:
+                # if not exist in sqlite, add to sqlite
+                # if exist, check whether the pic is downloaded
+                # self.download_all_pic(img['href'])
+                self.save_girl_detail(uid, name, pic_index, createtime, img['href'])
+                pic_index = pic_index + 1
+        except Exception as e:
+            print(e)
+        finally:
+            pass
+
 
-    def save_girl_detail(self, uid,name,index,createtime,url):
+    def save_girl_detail(self, uid,name,pic_index,createtime,url):
         ''' save girl detail to sqlite '''
         sql='''
-insert into qinimg_girl_detail (uid,name,index,createtime,url) values ('{}', '{}', '{}', '{}', '{}')
+insert into qinimg_girl_detail (uid,name,pic_index,createtime,url) values ('{}', '{}', '{}', '{}', '{}')
         '''
-        self.cursor.execute(sql.format(uid,name,index,createtime,url))
+        self.cursor.execute(sql.format(uid,name,str(pic_index),createtime,url))
         self.conn.commit()
+        # print(sql.format(uid,name,str(pic_index),createtime,url))
 
     def download_all_pic(self):
         ''' download pic '''
@@ -169,9 +184,14 @@ insert into qinimg_girl_detail (uid,name,index,createtime,url) values ('{}', '{}
         # get all pic detail
         self.cursor.execute(sql)
         details = self.cursor.fetchall()
+        index=1
         for detail in details:
             pool = ThreadPoolExecutor(max_workers=5)
-            future1 = pool.submit(self._download_pic, detail["url"], detail["name"], detail["index"])
+            future1 = pool.submit(self._download_pic, detail[4], detail[2], detail[3])
+            # print(detail)
+            # if index>2:
+            #     return
+            # index = index + 1
             time.sleep(1)
 
     def _download_pic(self, url, name,index):