|
@@ -0,0 +1,116 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- encoding: utf-8 -*-
|
|
|
+'''
|
|
|
+@Contact : liuyuqi.gov@msn.cn
|
|
|
+@Time : 2023/09/21 14:25:08
|
|
|
+@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
|
|
|
+@Desc    :   Crawler that downloads the image galleries published on mzsock.com
|
|
|
+'''
|
|
|
+
|
|
|
+import requests
|
|
|
+import re,os
|
|
|
+import time
|
|
|
+from urllib import request
|
|
|
+from fake_useragent import UserAgent
|
|
|
+
|
|
|
+# Target site: http://mzsock.com (美足船袜网)
|
|
|
+
|
|
|
class Mzsock():
    """Crawler that downloads the image galleries published on mzsock.com.

    The crawl is a four-stage pipeline in which each stage feeds the next:
    ``get_categroy_url`` -> ``get_urllist`` -> ``get_contentlist`` ->
    ``get_content`` (which stores every image through ``bctp``).

    A page that cannot be fetched or does not match the expected markup is
    reported on stdout and skipped, so one bad page does not abort the crawl.

    Relies on the module-level imports ``requests``, ``re``, ``os``, ``time``
    and ``fake_useragent.UserAgent``.
    """

    # Entry page holding the category menu.
    BASE_URL = "http://mzsock.com"
    # Divisor used to turn a post count into a number of listing pages.
    # NOTE(review): assumes the site lists 20 posts per page - confirm.
    POSTS_PER_PAGE = 20
    # Seconds to wait for an HTML page before giving up on it.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Pick one random User-Agent and reuse it for every request."""
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    def _fetch(self, url, **kwargs):
        """Return the body of ``url`` as text, or ``None`` if the request fails.

        Extra keyword arguments are forwarded to ``requests.get``. A timeout
        is always applied so a stalled connection cannot hang the crawl.
        """
        try:
            return requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT,
                **kwargs).text
        except requests.RequestException as e:
            print(f'请求 {url} 失败:{e}')
            return None

    @staticmethod
    def _parse_category_urls(html):
        """Extract the category links from the home page menu.

        Returns an empty list when the menu is not present. The first and the
        last menu entries are dropped, exactly as the slice ``[1:-1]`` says;
        presumably they are not gallery categories - verify against the site.
        """
        menus = re.findall(
            r'<ul id="chenxing_menu" class="cx_menu l">(.+?)</ul>', html, re.S)
        if not menus:
            return []
        return re.findall(
            r'<li id=".+?"><a href="(.+?)">.+?</a></li>', menus[0], re.S)[1:-1]

    @staticmethod
    def _page_count(total_posts, per_page=20):
        """Return how many listing pages are needed for ``total_posts`` posts.

        Uses ceiling division: a partially filled last page still counts.
        (``round`` would drop it, e.g. 25 posts / 20 per page -> 1 page.)
        """
        if total_posts <= 0:
            return 0
        return (total_posts + per_page - 1) // per_page

    @staticmethod
    def _parse_post_links(html):
        """Return the post links found in the ``post-list`` of a listing page."""
        lists = re.findall(
            r'<ul class="post-list cl" id="post-list">(.+?)</ul>', html, re.S)
        if not lists:
            return []
        return re.findall(
            r'<a class="img" href="(.+?)" title=".+?" target="_blank">',
            lists[0], re.S)

    @staticmethod
    def _parse_title_and_pages(html):
        """Return ``(directory-safe title, page count)`` of a post, or ``None``.

        The ``<h1>`` is expected to look like ``title(pager)``. The pager has
        6 leading and 7 trailing characters stripped and is then split on
        ``/``; presumably it is ``<span>current/total</span>`` - verify
        against the site. ``None`` is returned when the heading is missing or
        the pager does not yield an integer.
        """
        heads = re.findall(r'<h1>(.+?)[(](.+?)[)]</h1>', html, re.S)
        if not heads:
            return None
        raw_title, pager = heads[0]
        try:
            page_num = int(pager[6:-7].split('/')[1])
        except (IndexError, ValueError):
            return None
        # Replace characters that are not allowed in file names.
        title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", raw_title)
        return title, page_num

    @staticmethod
    def _parse_image_urls(html):
        """Return the image URLs inside the picture box of a post page."""
        boxes = re.findall(
            r'<div class="picsbox picsboxcenter chenxing_pic_images">(.+?)</div>',
            html, re.S)
        if not boxes:
            return []
        return re.findall(r'<img src="(.+?)" alt=".+?" width', boxes[0], re.S)

    @staticmethod
    def _image_name(page, index, img_url):
        """Build the local file name ``{page}_{index}{ext}`` for an image.

        The extension is taken from the URL path (query string and fragment
        ignored) so that ``.jpeg`` or ``.webp`` are not truncated; ``.jpg`` is
        used when the URL carries no extension at all.
        """
        path = img_url.split('?', 1)[0].split('#', 1)[0]
        ext = os.path.splitext(path)[1] or '.jpg'
        return f'{page}_{index}{ext}'

    def get_categroy_url(self):
        """Return the category URLs listed in the home page menu.

        Returns an empty list when the home page cannot be fetched or parsed.
        """
        html = self._fetch(self.BASE_URL)
        if html is None:
            return []
        return self._parse_category_urls(html)

    def get_urllist(self, categroy_urls):
        """Return the URL of every listing page of every given category.

        Args:
            categroy_urls: category URLs as returned by ``get_categroy_url``;
                each is expected to end with ``/`` because ``page/<n>/`` is
                appended directly.
        """
        urllist = []
        for url in categroy_urls:
            # verify=False is kept from the original implementation.
            # NOTE(review): confirm whether certificate checks can be enabled.
            html = self._fetch(url, verify=False)
            if html is None:
                continue
            found = re.findall(
                r'</i>共找到.+?>(.+?)</em>篇帖子</span>', html, re.S)
            try:
                total = int(found[0])
            except (IndexError, ValueError):
                print(f'解析 {url} 失败,已跳过')
                continue
            pagenum = self._page_count(total, self.POSTS_PER_PAGE)
            print(pagenum)
            urllist.extend(f'{url}page/{i}/' for i in range(1, pagenum + 1))
        return urllist

    def get_contentlist(self, urllist):
        """Return the post URLs collected from the given listing pages."""
        contentlist = []
        for url in urllist:
            html = self._fetch(url)
            if html is None:
                continue
            hrefs = self._parse_post_links(html)
            contentlist.extend(hrefs)
            print(hrefs)
        return contentlist

    def get_content(self, contentlist):
        """Download every image of every post into ``mzsock/<title>/``.

        Args:
            contentlist: post URLs as returned by ``get_contentlist``; each is
                expected to end with ``.html`` because the last five
                characters are replaced by ``_<page>.html``.
        """
        for url in contentlist:
            html = self._fetch(url)
            if html is None:
                continue
            parsed = self._parse_title_and_pages(html)
            if parsed is None:
                print(f'解析 {url} 失败,已跳过')
                continue
            title, page_num = parsed
            print(title)
            folder = f'mzsock/{title}/'
            os.makedirs(folder, exist_ok=True)
            print(page_num)
            for i in range(1, page_num + 1):
                content_url = f'{url[:-5]}_{i}.html'
                page_html = self._fetch(content_url)
                if page_html is None:
                    continue
                img_urls = self._parse_image_urls(page_html)
                for x, img_url in enumerate(img_urls, start=1):
                    self.bctp(folder, img_url, self._image_name(i, x, img_url))

    def bctp(self, lj, img_url, img_name):
        """Download one image and save it as ``img_name`` inside ``lj``.

        A timed-out request is retried once after a two second pause. Any
        other failure, or a second timeout, is printed and appended to
        ``spider.txt`` in ``lj``; the method does not raise for those.

        Args:
            lj: destination directory (created if missing).
            img_url: URL of the image.
            img_name: file name to store the image under.
        """
        print("开始下载图片!")
        if lj:
            os.makedirs(lj, exist_ok=True)
        target = os.path.join(lj, img_name)
        error = None
        for attempt in range(2):
            try:
                r = requests.get(img_url, timeout=5, headers=self.headers)
                # Do not store an HTTP error page as if it were the image.
                r.raise_for_status()
                with open(target, 'wb') as f:
                    f.write(r.content)
            except requests.exceptions.Timeout as e:
                error = e
                if attempt == 0:
                    time.sleep(2)
                    continue
                break
            except Exception as e:
                # Deliberate best-effort boundary: one broken image must not
                # stop the crawl, so the failure is only logged below.
                error = e
                break
            else:
                print(f'下载{img_name}图片成功!')
                time.sleep(1)  # be gentle with the server between images
                return
        print(f'下载{img_name}图片失败!')
        print(f'错误代码:{error}')
        with open(os.path.join(lj, 'spider.txt'), 'a+', encoding='utf-8') as f:
            f.write(f'错误代码:{error}---下载 {img_url} 图片失败\n')
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Run the whole crawl: categories -> listing pages -> posts -> images.
    crawler = Mzsock()
    listing_pages = crawler.get_urllist(crawler.get_categroy_url())
    post_links = crawler.get_contentlist(listing_pages)
    crawler.get_content(post_links)
|