Browse Source

增加校花爬虫

liuyuqi-dellpc 2 years ago
parent
commit
500a4caf49

+ 4 - 0
crawl_xiaohua/.gitignore

@@ -0,0 +1,4 @@
+/build/
+/__pycache__
+/dist/
+*.pyc

+ 8 - 0
crawl_xiaohua/README.md

@@ -0,0 +1,8 @@
+## crawl_xiaohua
+
+xiaohua.com 爬虫,获取美女图片
+
+## Development
+
+Read this [Development](./docs/Development.md) document.
+

+ 3 - 0
crawl_xiaohua/conf/config.json

@@ -0,0 +1,3 @@
+[
+
+]

+ 39 - 0
crawl_xiaohua/crawl_xiaohua/DownloadProgress.py

@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+'''
+下载进度
+@Author :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+
+class DownloadProgress(object):
+    '''Single-line console progress indicator.
+
+    Each call to :meth:`refresh` advances the counter and reprints a line of
+    the form ``[title] status count unit sep total unit``. The line ends with
+    a carriage return while in progress, so the next refresh overwrites it,
+    and with a newline once ``count`` reaches ``total``.
+
+    Args:
+        title: Label printed between square brackets.
+        count: Initial amount already processed.
+        run_status: Status text shown while in progress (default: empty).
+        fin_status: Status text shown once finished; defaults to blanks of
+            the same width as ``run_status``.
+        total: Amount at which the progress is considered finished.
+        unit: Unit label printed after both numbers.
+        sep: Separator printed between the current and total amounts.
+        chunk_size: Divisor applied to ``count`` and ``total`` before
+            printing (must be non-zero).
+    '''
+
+    def __init__(self, title, count=0.0, run_status=None, fin_status=None, total=100.0, unit='', sep='/',
+                 chunk_size=1.0):
+        super(DownloadProgress, self).__init__()
+        # [title] status count unit separator total unit
+        self.info = "[%s] %s %.2f %s %s %.2f %s"
+        self.title = title
+        self.total = total
+        self.count = count
+        self.chunk_size = chunk_size
+        self.status = run_status or ""
+        self.fin_status = fin_status or " " * len(self.status)
+        self.unit = unit
+        # NOTE: the attribute is spelled "seq" (not "sep"); the name is kept
+        # because external code may already read it.
+        self.seq = sep
+
+    def __get_info(self):
+        '''Return the formatted progress line for the current state.'''
+        return self.info % (
+            self.title,
+            self.status,
+            self.count / self.chunk_size,
+            self.unit,
+            self.seq,
+            self.total / self.chunk_size,
+            self.unit,
+        )
+
+    def refresh(self, count=1, status=None):
+        '''Advance the counter by ``count`` and reprint the progress line.
+
+        Args:
+            count: Amount to add to the running counter.
+            status: Optional new status text; a falsy value keeps the current
+                status (or switches to ``fin_status`` when finished).
+        '''
+        self.count += count
+        finished = self.count >= self.total
+        if finished:
+            self.status = status or self.fin_status
+        else:
+            self.status = status or self.status
+        # Flush explicitly: a line ending in "\r" contains no newline, so a
+        # line-buffered stdout would otherwise not display it immediately.
+        print(self.__get_info(), end='\n' if finished else '\r', flush=True)

+ 15 - 0
crawl_xiaohua/crawl_xiaohua/__init__.py

@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/24 13:07:28
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   main function
+'''
+import time
+from crawl_xiaohua.crawl_xiaohua import CrawlXiaohua
+
+
+def main(argv=None):
+    '''Package entry point: build a crawler and run it.
+
+    Args:
+        argv: Accepted for interface compatibility; currently unused.
+    '''
+    CrawlXiaohua().crawl()

+ 24 - 0
crawl_xiaohua/crawl_xiaohua/api.py

@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/24 15:02:12
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   api
+'''
+
+# Base URL of the crawled site.
+_host = r"http://www.xiaohua.com"
+# Prefix of a detail page; the crawler appends the page id to it.
+startUrl = _host + "/detail/"
+
+
+
+
+
+
+
+
+
+
+
+
+

+ 0 - 0
xiaohua/__init__.py → crawl_xiaohua/crawl_xiaohua/conf/config.json


+ 77 - 0
crawl_xiaohua/crawl_xiaohua/crawl_xiaohua.py

@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/02/23 04:21:56
+@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
+@Desc    :   crawl xiaohua.com
+'''
+
+from contextlib import closing
+import os
+import random
+import time
+import requests
+from crawl_xiaohua import api
+import bs4
+
+# Default HTTP headers installed on the crawler's requests session
+# (see CrawlXiaohua.__init__). The Accept / Sec-Fetch-* values describe an
+# image request with a Referer pointing at the site itself.
+headers = {
+    "Authority": "img.xiaohua.com",
+    "Accept": "image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
+    "Dnt": "1",
+    "Referer": "http://www.xiaohua.com/",
+    "Sec-Ch-Ua-Mobile": "?0",
+    "Sec-Ch-Ua-Platform": "Windows",
+    "Sec-Fetch-Dest": "image",
+    "Sec-Fetch-Mode": "no-cors",
+    "Sec-Fetch-Site": "cross-site",
+}
+
+
+class CrawlXiaohua():
+    '''Crawler that walks detail pages and saves their images to disk.
+
+    Starting from ``indexPage``, each step downloads every image found in the
+    ``divPic`` container of the page and then moves on to the page referenced
+    by the ``hylPrev`` link.
+    '''
+
+    # Directory the images are written to (relative to the working directory).
+    data_dir = './data/'
+    # Seconds to wait for the server before giving up on a request.
+    timeout = 30
+    # Characters that must not end up in a file name built from page text.
+    _unsafe_chars = '\\/:*?"<>|\r\n\t'
+
+    def __init__(self):
+        self.s = requests.Session()
+        self.s.headers.update(headers)
+        # Id of the detail page to fetch next.
+        self.indexPage = "123655"
+        # TODO: load cookies into the session once getCookie is implemented.
+
+    def crawl(self, pages=2):
+        '''Crawl ``pages`` consecutive detail pages.
+
+        Args:
+            pages: Number of pages to process (default 2, the previously
+                hard-coded value).
+        '''
+        for _ in range(pages):
+            if self.getPicList() is None:
+                # No following page could be determined; stop instead of
+                # fetching the same page again.
+                break
+            # Random 0.1-3.0 s pause between pages.
+            time.sleep(random.randint(1, 30) / 10)
+
+    def getPicList(self):
+        '''Download all images of the current page and advance to the next.
+
+        Returns:
+            The id of the next page (also stored in ``indexPage``), or
+            ``None`` when the page has no usable ``hylPrev`` link.
+        '''
+        res = self.s.get(api.startUrl + self.indexPage, timeout=self.timeout)
+        soup = bs4.BeautifulSoup(res.text, 'html.parser')
+        pic_div = soup.find('div', {'id': 'divPic'})
+        content_left = soup.find('div', {'class': 'content-left'})
+
+        img_urls = []
+        if pic_div is not None:
+            img_urls = [img.get('data-src') for img in pic_div.find_all('img')]
+
+        title_tag = content_left.find('p') if content_left is not None else None
+        title = title_tag.text if title_tag is not None else ""
+        # Long titles are cut down to their first 10 characters.
+        title = title if len(title) <= 20 else title[:10]
+
+        # Download the images; the index in the name is the image's position
+        # on the page, and the extension comes from that image's own URL.
+        for index, img_url in enumerate(img_urls):
+            if not img_url:
+                continue  # <img> without a data-src attribute
+            self.downloadPic(
+                img_url,
+                self._build_filename(title, self.indexPage, index, img_url))
+
+        # Next page: the last path segment of the "hylPrev" link.
+        prev_link = soup.find('a', {'id': 'hylPrev'})
+        href = prev_link.get("href") if prev_link is not None else None
+        if not href:
+            return None
+        next_page = href.rstrip("/").split("/")[-1]
+        self.indexPage = next_page
+        return next_page
+
+    @staticmethod
+    def _build_filename(title, page, index, img_url):
+        '''Build ``<title>_<page>_<index><ext>`` for one image.
+
+        Path-unsafe characters in ``title`` are replaced by ``_``; the
+        extension is taken from the path part of ``img_url`` (``.jpg`` when
+        the URL has none).
+        '''
+        from urllib.parse import urlparse
+
+        ext = os.path.splitext(urlparse(img_url).path)[1] or ".jpg"
+        safe_title = "".join(
+            "_" if ch in CrawlXiaohua._unsafe_chars else ch for ch in title)
+        return safe_title + "_" + page + "_" + str(index) + ext
+
+    def downloadPic(self, imgUrl, title):
+        '''Save the image at ``imgUrl`` as ``<data_dir>/<title>``.
+
+        Existing files are skipped before any request is made. Responses with
+        an error status are reported and not written to disk.
+        '''
+        file_D = os.path.join(self.data_dir, title)
+        if os.path.exists(file_D):
+            print('跳过' + title)
+            return
+        os.makedirs(self.data_dir, exist_ok=True)
+        with closing(self.s.get(url=imgUrl, stream=True,
+                                timeout=self.timeout)) as response:
+            if not response.ok:
+                print('下载失败' + title)
+                return
+            with open(file_D, "wb") as file:
+                for data in response.iter_content(chunk_size=1024):
+                    file.write(data)
+
+    def getCookie(self):
+        '''Placeholder; cookie handling is not implemented.'''
+        pass

+ 0 - 0
crawl_xiaohua/crawl_xiaohua/data/.gitkeep


+ 64 - 0
crawl_xiaohua/crawl_xiaohua/libs/json_conf.py

@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/24 15:07:14
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   JSON config util
+'''
+import os,json
+
+# Location of the JSON configuration file, relative to the working directory.
+config_path = "conf/config.json"
+
+class JsonConf:
+    '''Helpers for reading and writing the JSON configuration file.
+
+    All methods are static and operate on the module-level ``config_path``.
+    '''
+    @staticmethod
+    def save(data):
+        '''Write ``data`` to the configuration file as indented JSON.
+
+        The parent directory is created when it does not exist yet.
+        '''
+        parent = os.path.dirname(config_path)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+        with open(config_path, 'w', encoding="utf-8") as json_file:
+            json_file.write(json.dumps(data, indent=4))
+
+    @staticmethod
+    def load():
+        '''Return the parsed content of the configuration file.
+
+        A missing or blank file yields an empty dict. The file is decoded as
+        ``utf-8-sig`` so that it parses with or without a UTF-8 BOM.
+
+        Raises:
+            json.JSONDecodeError: If the file contains malformed JSON.
+        '''
+        if not os.path.exists(config_path):
+            return {}
+        with open(config_path, encoding="utf-8-sig") as json_file:
+            text = json_file.read()
+        if not text.strip():
+            return {}
+        return json.loads(text)
+
+    @staticmethod
+    def set(data_dict):
+        '''Merge ``data_dict`` into the stored configuration and save it.
+
+        The merged configuration is printed after saving.
+
+        Raises:
+            TypeError: If the file holds non-empty JSON that is not an object.
+        '''
+        json_obj = JsonConf.load()
+        if not isinstance(json_obj, dict):
+            if json_obj:
+                raise TypeError("configuration root must be a JSON object")
+            # An empty placeholder such as [] is replaced by an object.
+            json_obj = {}
+        json_obj.update(data_dict)
+        JsonConf.save(json_obj)
+        print(json.dumps(json_obj, indent=4))
+
+    @staticmethod
+    def get_config(key, default_val=""):
+        '''Return the value stored under ``key`` in the configuration file.
+
+        Returns ``default_val`` when the key is not set or the file cannot be
+        read. This is the file-backed lookup that used to be defined as a
+        first ``get`` and was shadowed by the second definition of that name.
+        '''
+        try:
+            loaded = JsonConf.load()
+        except (OSError, ValueError) as e:
+            print(e)
+            return default_val
+        return JsonConf.get(loaded, key, default_val)
+
+    @staticmethod
+    def get(jsonData, key, default_val=""):
+        '''Return ``jsonData[key]``, or ``default_val`` if the lookup fails.'''
+        try:
+            return jsonData[key]
+        except (KeyError, IndexError, TypeError):
+            return default_val

+ 31 - 0
crawl_xiaohua/crawl_xiaohua/threads.py

@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+'''
+多线程下载多文件;多线程分段下载单文件.
+@Author :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+from threading import Lock
+from threading import Thread
+
+# Lock shared by every MyThread created with lock=True.
+threadLock = Lock()
+# Module-level list available to callers for keeping track of threads.
+threads = []
+
+
+class MyThread(Thread):
+    '''Thread that runs ``func(*args)``, optionally under ``threadLock``.
+
+    Args:
+        name: Thread name, printed when the thread starts running.
+        func: Callable executed by the thread.
+        *args: Positional arguments passed to ``func``.
+        lock: When true, ``func`` runs while holding the module-level
+            ``threadLock``.
+    '''
+
+    def __init__(self, name, func, *args, lock=False):
+        Thread.__init__(self)
+        self.name = name
+        self.func = func
+        self.args = args
+        self.lock = lock
+
+    def run(self):
+        '''Announce the thread and call ``func`` with the stored arguments.'''
+        print("开启: " + self.name)
+        if self.lock:
+            # The context manager releases the lock even if func raises, so
+            # one failing task cannot block every other locked thread.
+            with threadLock:
+                self.func(*self.args)
+        else:
+            self.func(*self.args)

+ 79 - 0
crawl_xiaohua/crawl_xiaohua/user_agent.py

@@ -0,0 +1,79 @@
+# -*-coding:utf-8 -*-
+
+import random
+
+def getheaders(mobile=False):
+    '''Return request headers containing a randomly chosen User-Agent.
+
+    Args:
+        mobile: When true, pick from the mobile user agents; by default a
+            desktop (PC) user agent is used, as before.
+
+    Returns:
+        A dict with the single key ``'User-Agent'``.
+    '''
+    # Desktop (PC) browsers
+    user_agent_list_2 = (
+        # Opera
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
+        "Opera/8.0 (Windows NT 5.1; U; en)",
+        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
+        # Firefox
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
+        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
+        # Safari
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
+        # Chrome
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.2171.71 Safari/537.36",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/72.0.1271.64 Safari/537.11",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/66.0.648.133 Safari/534.16",
+        # 360 browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
+        # Taobao browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
+        # Liebao (Cheetah) browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
+        # QQ browser
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
+        # Sogou browser
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
+        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
+        # Maxthon browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
+        # UC browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
+    )
+    # Mobile devices
+    user_agent_list_3 = (
+        # iPhone
+        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+        # iPod
+        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+        # iPad
+        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
+        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+        # Android
+        "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+        # QQ browser, Android version
+        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+        # Android Opera Mobile
+        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
+        # Android Pad Moto Xoom
+        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
+        # BlackBerry
+        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
+        # WebOS HP Touchpad
+        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
+        # Nokia N97
+        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
+        # Windows Phone Mango
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
+        # UC browser
+        "UCWEB7.0.2.37/28/999",
+        "NOKIA5700/ UCWEB7.0.2.37/28/999",
+        # UC Openwave
+        "Openwave/ UCWEB7.0.2.37/28/999",
+        # UC Opera
+        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
+    )
+    # The desktop list stays the default; the mobile list is opt-in.
+    candidates = user_agent_list_3 if mobile else user_agent_list_2
+    UserAgent = random.choice(candidates)
+    headers = {'User-Agent': UserAgent}
+    return headers

+ 3 - 0
crawl_xiaohua/crawl_xiaohua/version.py

@@ -0,0 +1,3 @@
+from __future__ import unicode_literals
+
+# Package version string (formatted like a YYYY.MM.DD date).
+__version__ = '2022.05.24'

+ 13 - 0
crawl_xiaohua/docs/Development.md

@@ -0,0 +1,13 @@
+## Development
+
+
+```
+virtualenv .venv 
+
+pip install -r requirements.txt
+
+pip install pyinstaller
+pyinstaller -F -c -i launch200.ico main.py
+
+
+```

+ 12 - 0
crawl_xiaohua/main.py

@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2022/05/23 14:33:19
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   main
+'''
+import crawl_xiaohua
+
+# Run the crawler only when this file is executed as a script.
+if __name__ == '__main__':
+    crawl_xiaohua.main()

+ 1 - 0
crawl_xiaohua/requirements.txt

@@ -0,0 +1 @@
+requests

+ 71 - 0
crawl_xiaohua/www.xiaohua.com.http

@@ -0,0 +1,71 @@
+### 全局变量
+@hostname = www.xiaohua.com
+# @hostname= localhost:89
+@host = http://{{hostname}}
+@imghost=https://img.xiaohua.com
+@api={{host}}/api
+@contentType = application/json
+@createdAt = {{$datetime iso8601}}
+@useragent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53
+@accept=text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
+
+
+### 打开页面
+GET {{host}}/detail/110625
+User-Agent: {{useragent}}
+Authority: img.xiaohua.com
+Accept: image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8
+Accept-Language: en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7
+Dnt: 1
+# Referer: http://www.xiaohua.com/
+Sec-Ch-Ua: " Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"
+Sec-Ch-Ua-Mobile: ?0
+Sec-Ch-Ua-Platform: "Windows"
+Sec-Fetch-Dest: image
+Sec-Fetch-Mode: no-cors
+Sec-Fetch-Site: cross-site
+
+
+### 下载图片
+GET {{imghost}}/Picture/201905156369351846864271015176836.jpg
+User-Agent: {{useragent}}
+Authority: img.xiaohua.com
+Accept: image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8
+Accept-Language: en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7
+Dnt: 1
+Referer: http://www.xiaohua.com/
+Sec-Ch-Ua: " Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"
+Sec-Ch-Ua-Mobile: ?0
+Sec-Ch-Ua-Platform: "Windows"
+Sec-Fetch-Dest: image
+Sec-Fetch-Mode: no-cors
+Sec-Fetch-Site: cross-site
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ 0 - 0
xiaohuar/__init__.py


+ 0 - 0
xiaohua/main.py → xiaohuar/main.py