Browse Source

优化项目结构

liuyuqi-dellpc 7 months ago
parent
commit
17ac653d70

+ 7 - 0
crawl_xiaohua/__init__.py

@@ -11,6 +11,7 @@ import sys
 import re
 import os
 from crawl_xiaohua.crawl_xiaohua import CrawlXiaohua
+from crawl_xiaohua.extractor.mzsock import Mzsock
 from crawl_xiaohua.extractor.xiaohuar import Xiaohuar
 from flask import Flask
 
@@ -38,5 +39,11 @@ def run(extractor: str, cmd: str, argv=None):
     elif extractor == 'xiaohuar':
         crawl = Xiaohuar()
         crawl.run()
+    elif extractor == 'mzsock':
+        crawl = Mzsock()
+        categroy_urls = crawl.get_categroy_url()
+        urllist = crawl.get_urllist(categroy_urls)
+        contentlist = crawl.get_contentlist(urllist)
+        crawl.get_content(contentlist)
     else:
         print('unknown extractor: %s' % extractor)

+ 2 - 10
crawl_xiaohua/api.py

@@ -7,19 +7,11 @@
 @Desc    :   api
 '''
 
+# 校花网
 _host = r"http://www.xiaohua.com"
 startUrl = _host + "/detail/"
 # http://www.xiaohua.com/duanzi?page=2
 startDuanziUrl=_host+"/duanzi/"
 
 
-
-
-
-
-
-
-
-
-
-
+# 

+ 15 - 16
crawl_xiaohua/extractor/mzsock.py

@@ -4,21 +4,23 @@
 @Contact :   liuyuqi.gov@msn.cn
 @Time    :   2023/09/21 14:25:08
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
-@Desc    :   
+@Desc    :    
+http://mzsock.com[/url] 美足船袜网
 '''
 
 import requests
 import re,os
 import time
 from urllib import request
-from fake_useragent import UserAgent
- 
-#[url]http://mzsock.com[/url] 美足船袜网
+from crawl_xiaohua.extractor.base_extractor import BaseExtractor
+
+class Mzsock(BaseExtractor):
+    ''' extract mzsock.com '''
+
+    _headers = {}
 
-class Mzsock():
     def __init__(self):
-        self.ua = UserAgent()
-        self.headers = {"User-Agent": self.ua.random}
+        self.headers = self._headers
 
     def get_categroy_url(self):
         url = "http://mzsock.com"
@@ -81,6 +83,11 @@ class Mzsock():
                     x = x+1
 
     def bctp(self, lj, img_url, img_name):
+        '''保存图片
+            :param lj: 保存路径
+            :param img_url: 图片链接
+            :param img_name: 图片名
+        '''
         print("开始下载图片!")
         try:
             r = requests.get(img_url, timeout=5, headers=self.headers)
@@ -105,12 +112,4 @@ class Mzsock():
                 print(f'下载{img_name}图片失败!')
                 print(f'错误代码:{e}')
                 with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
-                    f.write(f'错误代码:{e}---下载 {img_url} 图片失败\n')
-
-
-if __name__ == '__main__':
-    spider = Mzsock()
-    categroy_urls = spider.get_categroy_url()
-    urllist = spider.get_urllist(categroy_urls)
-    contentlist = spider.get_contentlist(urllist)
-    spider.get_content(contentlist)
+                    f.write(f'错误代码:{e}---下载 {img_url} 图片失败\n')

+ 0 - 0
crawl_xiaohua/threads.py → crawl_xiaohua/libs/threads.py


+ 0 - 0
crawl_xiaohua/user_agent.py → crawl_xiaohua/libs/user_agent.py


+ 6 - 6
docs/Development.md

@@ -11,7 +11,12 @@
 
 **flask web ui操作;**
 
-```
+```python
+cd /d C:/Users/dell/Desktop/xiaohua-crawl
+virtualenv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+
 python main.py server
 python main.py server --config xx
 ```
@@ -19,11 +24,6 @@ python main.py server --config xx
 **命令行模式:**
 
 ```
-cd /d C:/Users/dell/Desktop/xiaohua-crawl
-virtualenv .venv
-source .venv/bin/activate
-pip install -r requirements.txt
-
 #python setup.py --requires | xargs pip install
 
 # 校花网

+ 1 - 0
requirements.txt

@@ -2,3 +2,4 @@ requests
 bs4
 pandas
 lxml
+flask