@@ -4,21 +4,23 @@
@Contact : liuyuqi.gov@msn.cn
@Time : 2023/09/21 14:25:08
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
-@Desc :
+@Desc :
+http://mzsock.com 美足船袜网
'''

import requests
import re,os
import time
from urllib import request
-from fake_useragent import UserAgent
+from crawl_xiaohua.extractor.base_extractor import BaseExtractor
-
-
-class Mzsock():
+
+class Mzsock(BaseExtractor):
+    ''' extract mzsock.com '''
+
+    _headers = {}
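+    # editor's note: class-level default request headers, empty here; assumed
+    # to be populated elsewhere (e.g. by BaseExtractor), which this diff does not show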

    def __init__(self):
-        self.ua = UserAgent()
-        self.headers = {"User-Agent": self.ua.random}
+        self.headers = self._headers
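+        # the former per-instance random User-Agent from fake_useragent is
+        # replaced by the class-level default headers defined above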

    def get_categroy_url(self):
        url = "http://mzsock.com"
@@ -81,6 +83,11 @@ class Mzsock():
            x = x+1

    def bctp(self, lj, img_url, img_name):
+        '''Save an image to disk.
+        :param lj: directory to save the image into
+        :param img_url: image URL
+        :param img_name: image file name
+        '''
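+        # hypothetical usage, with illustrative values only:
+        #   self.bctp('mzsock/socks', 'http://mzsock.com/uploads/1.jpg', '1.jpg')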
        print("Starting image download!")
        try:
            r = requests.get(img_url, timeout=5, headers=self.headers)
@@ -105,12 +112,4 @@ class Mzsock():
            print(f'Failed to download image {img_name}!')
            print(f'Error code: {e}')
            with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
-                f.write(f'Error code: {e} --- failed to download image {img_url}\n')
+                f.write(f'Error code: {e} --- failed to download image {img_url}\n')
-
-
-if __name__ == '__main__':
-    spider = Mzsock()
-    categroy_urls = spider.get_categroy_url()
-    urllist = spider.get_urllist(categroy_urls)
-    contentlist = spider.get_contentlist(urllist)
-    spider.get_content(contentlist)