get_pic.py 4.2 KB
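Dependencies: requests and fake-useragent (pip install requests fake-useragent). urllib3 ships with requests; the optional BeautifulSoup sketch below would additionally need beautifulsoup4.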

# -*- coding: UTF-8 -*-
import math
import os
import re
import time

import requests
import urllib3
from fake_useragent import UserAgent

# Target site: http://mzsock.com (美足船袜网, a feet/socks photo gallery)

# get_urllist() below requests with verify=False; silence the
# InsecureRequestWarning that would otherwise be printed on every call.
urllib3.disable_warnings()


class Mzsock():
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    def get_categroy_url(self):
        url = "http://mzsock.com"
        response = requests.get(url, headers=self.headers).text
        ul = re.findall(r'<ul id="chenxing_menu" class="cx_menu l">(.+?)</ul>', response, re.S)[0]
        categroy_urls = re.findall(r'<li id=".+?"><a href="(.+?)">.+?</a></li>', ul, re.S)[1:-1]
        return categroy_urls
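
    def get_categroy_url_bs4(self):
        # Optional, hypothetical alternative to get_categroy_url (not in the
        # original and not called from __main__): regex over HTML is brittle
        # against markup changes, while BeautifulSoup does the same extraction
        # more robustly. Sketch only; assumes beautifulsoup4 is installed.
        from bs4 import BeautifulSoup
        html = requests.get("http://mzsock.com", headers=self.headers).text
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.select('ul#chenxing_menu li a')
        return [a['href'] for a in links[1:-1]]  # same slice as the regex version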

    def get_urllist(self, categroy_urls):
        urllist = []
        for url in categroy_urls:
            response = requests.get(url, verify=False, headers=self.headers).text
            num = re.findall(r'</i>共找到.+?>(.+?)</em>篇帖子</span>', response, re.S)[0]
            # 20 posts per listing page; ceil (rather than round) so a partial
            # last page is still crawled.
            pagenum = math.ceil(int(num) / 20)
            print(pagenum)
            for i in range(1, pagenum + 1):
                pageurl = f'{url}page/{i}/'
                urllist.append(pageurl)
        return urllist

    def get_contentlist(self, urllist):
        contentlist = []
        for url in urllist:
            response = requests.get(url, headers=self.headers).text
            div = re.findall(r'<ul class="post-list cl" id="post-list">(.+?)</ul>', response, re.S)[0]
            hrefs = re.findall(r'<a class="img" href="(.+?)" title=".+?" target="_blank">', div, re.S)
            contentlist.extend(hrefs)
            print(hrefs)
        return contentlist

    def get_content(self, contentlist):
        for url in contentlist:
            response = requests.get(url, headers=self.headers).text
            h1 = re.findall(r'<h1>(.+?)[(](.+?)[)]</h1>', response, re.S)[0]
            title = h1[0]
            title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", title)  # strip characters that are illegal in filenames
            print(title)
            os.makedirs(f'mzsock/{title}/', exist_ok=True)  # create the album directory
            page_num = h1[1][6:-7]             # drop the fixed text around the page counter
            page_num = page_num.split('/')[1]  # keep the total page count after the "/"
            print(page_num)
            for i in range(1, int(page_num) + 1):
                # Detail pages follow the pattern <post>_2.html, <post>_3.html, ...
                content_url = f'{url[:-5]}_{i}.html'
                content_response = requests.get(content_url, headers=self.headers).text
                div = re.findall(r'<div class="picsbox picsboxcenter chenxing_pic_images">(.+?)</div>', content_response, re.S)[0]
                img_urls = re.findall(r'<img src="(.+?)" alt=".+?" width', div, re.S)
                for x, img_url in enumerate(img_urls, start=1):
                    img_name = f'{i}_{x}{img_url[-4:]}'  # last 4 chars of the URL are the extension, e.g. ".jpg"
                    self.bctp(f'mzsock/{title}/', img_url, img_name)
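
    def ext_of(self, img_url):
        # Hypothetical helper (not in the original, and not called above):
        # img_url[-4:] in get_content assumes a three-letter extension, so
        # ".jpeg" would be truncated; os.path.splitext handles both.
        ext = os.path.splitext(img_url.split('?')[0])[1]
        return ext if ext else '.jpg'  # fall back to .jpg when the URL has no extension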

    def bctp(self, lj, img_url, img_name):
        # bctp = 保存图片 (save image); lj = 路径 (path of the target directory).
        print("Downloading image...")
        try:
            r = requests.get(img_url, timeout=5, headers=self.headers)
            with open(f'{lj}/{img_name}', 'wb') as f:
                f.write(r.content)
            print(f'Downloaded {img_name} successfully!')
            time.sleep(1)
        except Exception as e:
            if "port=443): Read timed out" in str(e):
                # Retry once after a short pause when the request timed out.
                time.sleep(2)
                try:
                    r = requests.get(img_url, timeout=5, headers=self.headers)
                    with open(f'{lj}/{img_name}', 'wb') as f:
                        f.write(r.content)
                    print(f'Downloaded {img_name} successfully!')
                except Exception as e:
                    print(f'Failed to download {img_name}!')
                    print(f'Error: {e}')
                    with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'Error: {e} --- failed to download {img_url}\n')
            else:
                print(f'Failed to download {img_name}!')
                print(f'Error: {e}')
                with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'Error: {e} --- failed to download {img_url}\n')
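
    def make_session(self):
        # Hypothetical alternative (not in the original) to the hand-rolled
        # retry in bctp: a requests Session mounted with urllib3's Retry gets
        # backoff and retries on connection errors and the listed 5xx statuses.
        # Sketch only; not wired into the methods above.
        from requests.adapters import HTTPAdapter
        from urllib3.util.retry import Retry
        session = requests.Session()
        retries = Retry(total=2, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
        session.mount('http://', HTTPAdapter(max_retries=retries))
        session.mount('https://', HTTPAdapter(max_retries=retries))
        session.headers.update(self.headers)
        return session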


if __name__ == '__main__':
    spider = Mzsock()
    categroy_urls = spider.get_categroy_url()
    urllist = spider.get_urllist(categroy_urls)
    contentlist = spider.get_contentlist(urllist)
    spider.get_content(contentlist)