# -*- coding: UTF-8 -*-
import requests
import re,os
import time
from urllib import request
from fake_useragent import UserAgent
# http://mzsock.com — "美足船袜网" (beautiful-feet / socks picture site)
class Mzsock():
def __init__(self):
    """Prepare a randomized browser User-Agent header for all HTTP requests."""
    ua_factory = UserAgent()
    self.ua = ua_factory
    # Every request reuses this header dict so the crawl looks like one browser.
    self.headers = {"User-Agent": ua_factory.random}
def get_categroy_url(self):
    """Fetch the site homepage and return the category-page URLs.

    Returns:
        list[str]: hrefs of the category links, with the first and last
        navigation entries (home / misc links) dropped via ``[1:-1]``.

    NOTE(review): the original regex literals were corrupted — the HTML
    tags inside them were stripped from this source, splitting the string
    across two lines (a SyntaxError). The patterns below are a plausible
    reconstruction and MUST be verified against the live page markup.
    """
    url = "http://mzsock.com"
    response = requests.get(url, headers=self.headers).text
    # Isolate the navigation <ul> that holds the category links.
    # TODO(review): confirm the exact <ul> attributes on the real page.
    ul = re.findall(r'<ul[^>]*>(.+?)</ul>', response, re.S)[0]
    # Pull every href inside it; trim the first/last non-category entries.
    categroy_urls = re.findall(r'<a href="(.+?)"', ul, re.S)[1:-1]
    return categroy_urls
def get_urllist(self, categroy_urls):
    """Expand each category URL into its full list of paginated listing URLs.

    Each category page reports its total post count ("共找到 N 篇帖子");
    at 20 posts per page the page count is ceil(N / 20).

    Args:
        categroy_urls: iterable of category base URLs (trailing slash expected,
            since pages are built as ``{url}page/{i}/``).

    Returns:
        list[str]: every ``.../page/<i>/`` URL across all categories.
    """
    urllist = []
    for url in categroy_urls:
        # verify=False: site has a broken TLS cert; kept as in the original.
        response = requests.get(url, verify=False, headers=self.headers).text
        num = re.findall(r'共找到.+?>(.+?)篇帖子', response, re.S)[0]
        # BUGFIX: round() under-counted pages (e.g. 21 posts -> round(1.05) == 1,
        # losing the final page). Use ceiling division instead.
        pagenum = (int(num) + 19) // 20
        print(pagenum)
        for i in range(1, pagenum + 1):
            urllist.append(f'{url}page/{i}/')
    return urllist
def get_contentlist(self, urllist):
    """Collect detail-page URLs from every paginated listing page.

    NOTE(review): both regex patterns below are EMPTY strings — the HTML
    tags inside them appear to have been stripped out of this source file,
    so as written this method extracts nothing useful (an empty pattern
    matches at every position). The intended patterns must be restored
    from the site's actual markup before this code is run.
    """
    contentlist = []
    for url in urllist:
        response = requests.get(url, headers=self.headers).text
        # NOTE(review): pattern lost — presumably matched the post-list <div>.
        div = re.findall(r'', response, re.S)[0]
        # NOTE(review): pattern lost — presumably captured the <a href> links.
        hrefs = re.findall(r'', div, re.S)
        contentlist.extend(hrefs)
        print(hrefs)
    return contentlist
def get_content(self,contentlist):
for url in contentlist:
response = requests.get(url,headers=self.headers).text
h1=re.findall(r'(.+?)[(](.+?)[)]
',response,re.S)[0]
title=h1[0]
title= re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", title) # 剔除不合法字符
print(title)
os.makedirs(f'mzsock/{title}/',exist_ok=True) #创建目录
page_num = h1[1][6:-7]
page_num = page_num.split('/')[1]
print(page_num)
for i in range(1,int(page_num)+1):
content_url=f'{url[:-5]}_{i}.html'
content_response = requests.get(content_url, headers=self.headers).text
div=re.findall(r'(.+?)
',content_response,re.S)[0]
img_urls=re.findall(r')