# -*- coding: UTF-8 -*-
import os
import re
import time
from urllib import request  # kept from original import list; may be used further down

import requests
from fake_useragent import UserAgent

# http://mzsock.com -- scraper for the "美足船袜网" photo forum.
#
# NOTE(review): this file was recovered from a whitespace/HTML-mangled paste.
# Every regex literal marked TODO(review) below was destroyed in the mangling
# and has been reconstructed best-effort from the surrounding code -- each one
# must be verified against the live site's HTML before trusting this scraper.


class Mzsock():
    """Crawler for mzsock.com.

    Pipeline: get_categroy_url() -> get_urllist() -> get_contentlist()
    -> get_content(), which downloads every image of every post into
    mzsock/<title>/.
    """

    def __init__(self):
        # One random User-Agent, fixed for the whole session.
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    def get_categroy_url(self):
        """Return the category index URLs scraped from the site nav menu."""
        url = "http://mzsock.com"
        response = requests.get(url, headers=self.headers).text
        # TODO(review): both patterns below were lost in the source mangling
        # (the originals rendered as bullet-list markup); reconstructed
        # best-effort -- confirm against the page HTML.
        ul = re.findall(r'<ul class="menu">(.*?)</ul>', response, re.S)[0]
        # [1:-1] drops the first and last menu entries (non-category links),
        # matching the original's slice.
        categroy_urls = re.findall(r'<li.*?><a href="(.+?)".*?>.+?</a>', ul, re.S)[1:-1]
        return categroy_urls

    def get_urllist(self, categroy_urls):
        """Expand each category URL into the full list of listing-page URLs."""
        urllist = []
        for url in categroy_urls:
            # verify=False was deliberate in the original (bad certificate);
            # it will emit InsecureRequestWarning at runtime.
            response = requests.get(url, verify=False, headers=self.headers).text
            num = re.findall(r'共找到.+?>(.+?)篇帖子', response, re.S)[0]
            # 20 posts per page. The original used round(int(num)/20), which
            # silently dropped the final partial page whenever num % 20 < 10;
            # ceiling division keeps every page.
            pagenum = -(-int(num) // 20)
            print(pagenum)
            for i in range(1, pagenum + 1):
                pageurl = f'{url}page/{i}/'
                urllist.append(pageurl)
        return urllist

    def get_contentlist(self, urllist):
        """Collect individual post URLs from every listing page."""
        contentlist = []
        for url in urllist:
            response = requests.get(url, headers=self.headers).text
            # TODO(review): both patterns below were lost in the mangling;
            # reconstructed best-effort -- verify against site HTML.
            div = re.findall(r'<div class="post_list.*?">(.*?)</div>', response, re.S)[0]
            hrefs = re.findall(r'<a href="(.+?\.html)"', div, re.S)
            contentlist.extend(hrefs)
            print(hrefs)
        return contentlist

    def get_content(self, contentlist):
        """Download every image of every post into mzsock/<title>/."""
        for url in contentlist:
            response = requests.get(url, headers=self.headers).text
            # Group 1: post title; group 2: a tag-wrapped "x/y" page marker.
            # TODO(review): the tags surrounding this pattern were lost in the
            # mangling; only the capture core "(.+?)[(](.+?)[)]" survived.
            h1 = re.findall(r'<h1>(.+?)[(](.+?)[)]</h1>', response, re.S)[0]
            title = h1[0]
            # Replace characters that are illegal in Windows file names.
            title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", title)
            print(title)
            os.makedirs(f'mzsock/{title}/', exist_ok=True)  # one folder per post
            # [6:-7] peels a wrapping tag (e.g. <span>...</span>) off the
            # marker, leaving "x/y"; the total page count is after the slash.
            page_num = h1[1][6:-7]
            page_num = page_num.split('/')[1]
            print(page_num)
            for i in range(1, int(page_num) + 1):
                # Paged posts: foo.html -> foo_1.html, foo_2.html, ...
                # ([: -5] strips the ".html" suffix.)
                content_url = f'{url[:-5]}_{i}.html'
                content_response = requests.get(content_url, headers=self.headers).text
                # TODO(review): reconstructed pattern -- verify against HTML.
                div = re.findall(r'<div class="picture">(.+?)</div>', content_response, re.S)[0]
                img_urls = re.findall(r'<img src="(.+?)"', div, re.S)
                # TODO(review): the download loop was truncated in the
                # recovered source; rewritten minimally below.
                for index, img_url in enumerate(img_urls, 1):
                    suffix = os.path.splitext(img_url)[1]
                    path = f'mzsock/{title}/{title}_{i}_{index}{suffix}'
                    with open(path, 'wb') as f:
                        f.write(requests.get(img_url, headers=self.headers).content)
                    time.sleep(0.1)  # be polite to the server