Browse Source

remove mzsock and xiaohuar

liuyuqi-dellpc 1 year ago
parent
commit
3359095f1f

+ 22 - 9
crawl_xiaohua/crawl_xiaohua/__init__.py

@@ -6,15 +6,28 @@
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @Desc    :   main function
 '''
-import time
+import time,sys,re,os
 from crawl_xiaohua.crawl_xiaohua import CrawlXiaohua
+from flask import Flask
 
def server(config: str, argv=None):
    '''Web-server mode: start a Flask app (blocks until stopped).

    :param config: path to a config file (currently unused -- TODO: load it)
    :param argv: optional extra CLI arguments appended to sys.argv's copy
    '''
    if argv is None:
        argv = sys.argv
    else:
        # combine without mutating the global sys.argv (the original
        # extend() polluted argv for every later reader)
        argv = sys.argv + list(argv)
    app = Flask(__name__)
    app.run()
 
-def main(argv=None):
-    crawl = CrawlXiaohua()
-    crawl.crawl()
-
-
-def crawlDuanzi(argv=None):
-    crawl = CrawlXiaohua()
-    crawl.crawlDuanzi()
def run(extractor: str, cmd: str, argv=None):
    '''Shell mode: dispatch a crawl job to the named extractor.

    :param extractor: extractor name; only 'xiaohua' is supported
    :param cmd: action to perform ('duanzi' for jokes, anything else crawls pictures)
    :param argv: optional CLI arguments; defaults to sys.argv
    '''
    argv = sys.argv if argv is None else argv
    if extractor != 'xiaohua':
        # unsupported extractor: report and bail out
        print('unknown extractor: %s' % extractor)
        return
    crawler = CrawlXiaohua()
    if cmd == 'duanzi':
        crawler.crawlDuanzi()
    else:
        crawler.crawl()

+ 4 - 80
crawl_xiaohua/crawl_xiaohua/crawl_xiaohua.py

@@ -16,88 +16,12 @@ import requests
 from crawl_xiaohua import api
 import bs4
 import pandas as pd
-
-headers = {
-    "Authority": "img.xiaohua.com",
-    "Accept": "image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
-    "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
-    "Dnt": "1",
-    "Referer": "http://www.xiaohua.com/",
-    "Sec-Ch-Ua-Mobile": "?0",
-    "Sec-Ch-Ua-Platform": "Windows",
-    "Sec-Fetch-Dest": "image",
-    "Sec-Fetch-Mode": "no-cors",
-    "Sec-Fetch-Site": "cross-site",
-}
-
+from flask import Flask
 
class CrawlXiaohua():
    ''' crawl xiaohua '''

    def __init__(self):
        ''' init '''
        # NOTE(review): the crawl logic (session setup, crawl(),
        # crawlDuanzi(), page downloads) was moved into the extractor
        # module (class Xiaohua); this class is now an empty shell, yet
        # the package-level run() still instantiates it and calls
        # crawl() / crawlDuanzi() -- confirm those call sites were
        # updated to use the new extractor.
        pass

+ 116 - 0
crawl_xiaohua/crawl_xiaohua/extractor/mzsock.py

@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/09/21 14:25:08
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+'''
+
+import requests
+import re,os
+import time
+from urllib import request
+from fake_useragent import UserAgent
+ 
+#[url]http://mzsock.com[/url] 美足船袜网
+
class Mzsock():
    '''Scraper for mzsock.com: walks the site's category menu, expands
    each category into its paginated listing pages, collects post URLs,
    and downloads every image of every post into mzsock/<title>/.'''

    def __init__(self):
        # random desktop User-Agent per instance, to look like a browser
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    def get_categroy_url(self):
        '''Return the category URLs parsed from the site's main menu.'''
        url = "http://mzsock.com"
        response = requests.get(url, headers=self.headers).text
        ul = re.findall(
            r'<ul id="chenxing_menu" class="cx_menu l">(.+?)</ul>', response, re.S)[0]
        # [1:-1] skips the first and last menu items -- presumably
        # non-category links such as the home page; verify
        categroy_urls = re.findall(
            r'<li id=".+?"><a href="(.+?)">.+?</a></li>', ul, re.S)[1:-1]
        return categroy_urls

    def get_urllist(self, categroy_urls):
        '''Expand each category URL into its per-page listing URLs.'''
        urllist = []
        for url in categroy_urls:
            response = requests.get(
                url, verify=False, headers=self.headers).text
            # total post count displayed in the category header
            num = re.findall(
                r'</i>共找到.+?>(.+?)</em>篇帖子</span>', response, re.S)[0]
            pagenum = round(int(num)/20)  # round to nearest page count
            # NOTE(review): round() can undercount pages (e.g. 21 posts
            # -> 1 page) and yields 0 pages when num < 10 -- confirm
            print(pagenum)
            for i in range(1, pagenum+1):
                pageurl = f'{url}page/{i}/'
                urllist.append(pageurl)
        return urllist

    def get_contentlist(self, urllist):
        '''Collect the post (detail-page) URLs from every listing page.'''
        contentlist = []
        for url in urllist:
            response = requests.get(url, headers=self.headers).text
            div = re.findall(
                r'<ul class="post-list cl" id="post-list">(.+?)</ul>', response, re.S)[0]
            hrefs = re.findall(
                r'<a class="img" href="(.+?)" title=".+?" target="_blank">', div, re.S)
            contentlist.extend(hrefs)
            print(hrefs)
        return contentlist

    def get_content(self, contentlist):
        '''Download every image of every post into mzsock/<title>/.'''
        for url in contentlist:
            response = requests.get(url, headers=self.headers).text
            # post header looks like <h1>title(...page info...)</h1>
            h1 = re.findall(r'<h1>(.+?)[(](.+?)[)]</h1>', response, re.S)[0]
            title = h1[0]
            title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", title)  # strip characters illegal in file names
            print(title)
            os.makedirs(f'mzsock/{title}/', exist_ok=True)  # create target directory
            # h1[1][6:-7] trims surrounding markup to "current/total" --
            # presumably; take the total page count after '/'
            page_num = h1[1][6:-7]
            page_num = page_num.split('/')[1]
            print(page_num)
            for i in range(1, int(page_num)+1):
                # per-page URL: drop ".html" and append "_<i>.html"
                content_url = f'{url[:-5]}_{i}.html'
                content_response = requests.get(
                    content_url, headers=self.headers).text
                div = re.findall(
                    r'<div class="picsbox picsboxcenter chenxing_pic_images">(.+?)</div>', content_response, re.S)[0]
                img_urls = re.findall(
                    r'<img src="(.+?)"  alt=".+?" width', div, re.S)
                x = 1
                for img_url in img_urls:
                    # file name <page>_<index><ext>; img_url[-4:] assumes
                    # a 3-letter extension like .jpg -- verify for .jpeg
                    img_name = f'{i}_{x}{img_url[-4:]}'
                    self.bctp(f'mzsock/{title}/', img_url, img_name)
                    x = x+1

    def bctp(self, lj, img_url, img_name):
        '''Download one image into directory *lj*, retrying once on a
        read timeout; failures are appended to <lj>/spider.txt.'''
        print("开始下载图片!")
        try:
            r = requests.get(img_url, timeout=5, headers=self.headers)
            with open(f'{lj}/{img_name}', 'wb') as f:
                f.write(r.content)
                print(f'下载{img_name}图片成功!')
                time.sleep(1)  # throttle between downloads
        except Exception as e:
            if "port=443): Read timed out" in str(e):
                # transient timeout: wait briefly and retry exactly once
                time.sleep(2)
                try:
                    r = requests.get(img_url, timeout=5, headers=self.headers)
                    with open(f'{lj}/{img_name}', 'wb') as f:
                        f.write(r.content)
                        print(f'下载{img_name}图片成功!')
                except Exception as e:
                    print(f'下载{img_name}图片失败!')
                    print(f'错误代码:{e}')
                    with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'错误代码:{e}---下载 {img_url} 图片失败\n')
            else:
                # non-timeout failure: log and move on (best-effort crawl)
                print(f'下载{img_name}图片失败!')
                print(f'错误代码:{e}')
                with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'错误代码:{e}---下载 {img_url} 图片失败\n')
+
if __name__ == '__main__':
    # full pipeline: category menu -> listing pages -> post URLs -> images
    spider = Mzsock()
    categroy_urls = spider.get_categroy_url()
    urllist = spider.get_urllist(categroy_urls)
    contentlist = spider.get_contentlist(urllist)
    spider.get_content(contentlist)

+ 102 - 0
crawl_xiaohua/crawl_xiaohua/extractor/xiaohua.py

@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/09/21 14:12:51
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   校花网
+'''
+from crawl_xiaohua.extractor.base_extractor import BaseExtractor
+from contextlib import closing
+import os
+import random
+import time
+from crawl_xiaohua.libs.json_conf import JsonConf
+import requests
+from crawl_xiaohua import api
+import bs4
+import pandas as pd
+
class Xiaohua(BaseExtractor):
    '''Extractor for xiaohua.com: downloads picture posts (crawl) and
    joke text ("duanzi", crawlDuanzi), resuming from page indexes
    persisted in a JSON config file.'''

    # browser-like request headers installed on the session for every call
    _header={
        "Authority": "img.xiaohua.com",
        "Accept": "image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
        "Dnt": "1",
        "Referer": "http://www.xiaohua.com/",
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "Windows",
        "Sec-Fetch-Dest": "image",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Site": "cross-site",
    }

    def __init__(self):
        # one session so headers (and any cookies) persist across requests
        self.s = requests.Session()
        self.s.headers.update(self._header)
        self.jsonConf = JsonConf()
        self.conf = self.jsonConf.load()
        # resume points persisted between runs via the JSON config
        self.indexPage = self.conf.get('indexPage')
        self.indexDuanziPage = self.conf.get('indexDuanziPage')
        # self.s.cookies.update(JsonConf().get_cookies())

    def crawl(self):
        '''Download picture pages, following "next page" links, with a
        random 1-5 s delay between pages.'''
        for i in range(10000):
            self.getPicList()
            time.sleep(random.randint(1, 5))

    def crawlDuanzi(self):
        '''Download joke listing pages sequentially, persisting the page
        index after each page so the crawl can resume.'''
        for i in range(10000):
            self.getDuanziList()
            self.indexDuanziPage = str(i)
            self.jsonConf.set({"indexDuanziPage": self.indexDuanziPage})
            time.sleep(random.randint(1, 5))

    def getDuanziList(self):
        '''Fetch one joke listing page and append its texts to CSV.'''
        res = self.s.get(api.startDuanziUrl + "?page=" + self.indexDuanziPage)
        resHtml = bs4.BeautifulSoup(res.text, 'html.parser')
        divContentLeft = resHtml.find_all('div', {
            'class': 'one-cont'})
        # the first <a> inside the first <p> of each item holds the text
        divContentLefts = [div.find('p').find(
            'a').text for div in divContentLeft]
        self.saveDuanZiList(divContentLefts)

    def saveDuanZiList(self, duanziList):
        '''Append the collected joke texts to data/duanzhi.csv.'''
        pd.DataFrame(duanziList).to_csv(r"data/duanzhi.csv",
                                        mode='a', encoding='utf-8', header=False)

    def getPicList(self):
        '''Download all images of the current picture page, persist the
        page index, and advance self.indexPage; returns the next page id.'''
        res = self.s.get(api.startUrl + self.indexPage)
        resHtml = bs4.BeautifulSoup(res.text, 'html.parser')
        divPic = resHtml.find('div', {'id': 'divPic'})
        divContentLeft = resHtml.find('div', {'class': 'content-left'})
        imgDiv = divPic.find_all('img')
        # lazy-loaded images keep the real URL in the data-src attribute
        imgs = [img.get('data-src') for img in imgDiv]
        titleDesc = divContentLeft.find('p').text
        # NOTE(review): truncates to 10 chars only when longer than 20;
        # the 20/10 mismatch looks unintended -- confirm
        titleDesc = titleDesc if len(titleDesc) <= 20 else titleDesc[:10]
        # download images; NOTE(review): the extension is taken from
        # imgs[0] for every image -- confirm all share one extension
        for i in range(len(imgs)):
            self.downloadPic(imgs[i], titleDesc + "_" + self.indexPage + "_" +
                             str(i) + "." + imgs[0].split('.')[-1])

        self.jsonConf.set({"indexPage": self.indexPage})
        # next page id comes from the '#hylPrev' anchor's href path
        nextPan = str.split(resHtml.find(
            'a', {'id': 'hylPrev'})["href"], r"/")[2]
        self.indexPage = nextPan
        return nextPan

    def downloadPic(self, imgUrl, title):
        '''Stream one image into ./data/<title>, skipping existing files.'''
        with closing(self.s.get(url=imgUrl, stream=True)) as response:
            chunk_size = 1024
            file_D = './data/' + title
            if (os.path.exists(file_D)):
                print('跳过' + title)
            else:
                with open(file_D, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)

    def getCookie(self):
        '''Placeholder for cookie retrieval -- not implemented.'''
        pass

+ 0 - 0
xiaohuar/main.py → crawl_xiaohua/crawl_xiaohua/extractor/xiaohuar.py


+ 59 - 0
crawl_xiaohua/crawl_xiaohua/update.py

@@ -6,3 +6,62 @@
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @Desc    :   
 '''
+
+# check update from github
+import os
+import sys
+import time
+import requests
+import json
+import re
+import subprocess
+
def checkUpdate():
    '''Check GitHub for a newer release of this package.

    Returns:
        False when there is no new version (or on any network/parse
        error), otherwise the new release tag name (e.g. "v1.2.3").
        (The original ``-> str`` annotation was wrong: False is also
        returned.)
    '''
    repo = 'jianboy/crawl_xiaohua'
    url = 'https://api.github.com/repos/%s/releases/latest' % repo
    try:
        # timeout so a hung network call cannot block the app forever
        res = requests.get(url, timeout=10)
        if res.status_code != 200:
            return False
        data = json.loads(res.text)
        tag_name = data['tag_name']
        # tags are expected in "v<version>" form; same version => no update
        if tag_name == 'v%s' % getVersion():
            return False
        return tag_name
    except Exception as e:
        # best-effort: any failure means "no update available"
        print(e)
        return False
+
def getVersion():
    '''Read the package version from setup.py in the working directory.

    Returns:
        The version string, or '0.0.0' when setup.py is missing or has
        no version="..." entry.
    '''
    try:
        with open('setup.py', 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        # running outside the source tree -- fall back to a null version
        return '0.0.0'
    res = re.search(r'version="(.*)"', content)
    return res.group(1) if res else '0.0.0'
+
def update():
    '''Check for a new release; if one exists, upgrade via pip from the
    tagged GitHub revision, restart the app, and exit.'''
    tag_name = checkUpdate()
    if tag_name:
        print('new version: %s' % tag_name)
        print('updating...')
        # BUG FIX: the original built "pip install --upgrade git+<tag>",
        # which is not a valid VCS URL; pip needs git+<repo-url>@<tag>.
        cmd = ('pip install --upgrade '
               'git+https://github.com/jianboy/crawl_xiaohua@%s'
               '#egg=crawl_xiaohua' % tag_name)
        print(cmd)
        os.system(cmd)
        print('update success, restart app...')
        os.system('python -m crawl_xiaohua')
        sys.exit(0)
    else:
        print('no new version')
        sys.exit(0)

+ 17 - 8
crawl_xiaohua/docs/Development.md

@@ -1,9 +1,25 @@
 ## Development
 
+
+**flask web ui操作;**
+
+```
+python main.py server
python main.py server --config xx
+```
+
 **命令行模式:**
 
 ```
-python main.py cli --server aa --bb ss
+# 校花网 pictures
+python main.py run --extractor xiaohua --cmd crawl
+
+# 校花网 jokes (duanzi)
+python main.py run --extractor xiaohua --cmd duanzi
+
+
+
+
 ```
 
 **打包exe:**
@@ -16,10 +32,3 @@ pip install pyinstaller
 pyinstaller -F -c -i launch200.ico main.py
 ```
 
-
-**flask web ui操作;**
-
-```
-python main.py server
-python mian.py server --config xx
-```

+ 22 - 2
crawl_xiaohua/main.py

@@ -7,7 +7,27 @@
 @Desc    :   main
 '''
 import crawl_xiaohua
+import argparse
+
# CLI definition: one positional sub command plus mode-specific options.
parser = argparse.ArgumentParser(description='crawl_xiaohua')
parser.add_argument('command', type=str, help='sub command: server or run')
parser.add_argument('--config', type=str, help='config file (server mode)')
parser.add_argument('--extractor', type=str, help='extractor name (run mode)')
parser.add_argument('--cmd', type=str,
                    help='extractor action, e.g. crawl or duanzi (run mode)')

if __name__ == '__main__':
    # Parse errors already make argparse print usage and exit non-zero;
    # the previous broad `except Exception` swallowed every runtime error
    # and exited 0, hiding real failures.
    args = parser.parse_args()
    if args.command == 'server':
        # web-server mode (blocks until the server stops)
        crawl_xiaohua.server(args.config)
    elif args.command == 'run':
        # shell mode: dispatch to the named extractor
        crawl_xiaohua.run(args.extractor, args.cmd)
    else:
        print('unknown command: %s' % args.command)
        parser.print_help()
        raise SystemExit(2)

+ 0 - 0
xiaohuar/__init__.py → crawl_xiaohua/setup.py


+ 0 - 100
mzsock/get_pic.py

@@ -1,100 +0,0 @@
-# -*- coding: UTF-8 -*-
- 
-import requests
-import re,os
-import time
-from urllib import request
-from fake_useragent import UserAgent
- 
-#[url]http://mzsock.com[/url] 美足船袜网
-
-class Mzsock():
-    def __init__(self):
-        self.ua=UserAgent()
-        self.headers={"User-Agent":self.ua.random}
- 
-    def get_categroy_url(self):
-        url="http://mzsock.com"
-        response=requests.get(url,headers=self.headers).text
-        ul=re.findall(r'<ul id="chenxing_menu" class="cx_menu l">(.+?)</ul>',response,re.S)[0]
-        categroy_urls=re.findall(r'<li id=".+?"><a href="(.+?)">.+?</a></li>',ul,re.S)[1:-1]
-        return categroy_urls
- 
-    def get_urllist(self,categroy_urls):
-        urllist=[]
-        for url in categroy_urls:
-            response=requests.get(url,verify=False,headers=self.headers).text
-            num=re.findall(r'</i>共找到.+?>(.+?)</em>篇帖子</span>',response,re.S)[0]
-            pagenum=round(int(num)/20) #取整,四舍五入
-            print(pagenum)
-            for i in range(1,pagenum+1):
-                pageurl=f'{url}page/{i}/'
-                urllist.append(pageurl)
-        return urllist
- 
-    def get_contentlist(self,urllist):
-        contentlist=[]
-        for url in urllist:
-            response = requests.get(url,headers=self.headers).text
-            div=re.findall(r'<ul class="post-list cl" id="post-list">(.+?)</ul>',response,re.S)[0]
-            hrefs=re.findall(r'<a class="img" href="(.+?)" title=".+?" target="_blank">',div,re.S)
-            contentlist.extend(hrefs)
-            print(hrefs)
-        return contentlist
- 
-    def get_content(self,contentlist):
-        for url in contentlist:
-            response = requests.get(url,headers=self.headers).text
-            h1=re.findall(r'<h1>(.+?)[(](.+?)[)]</h1>',response,re.S)[0]
-            title=h1[0]
-            title= re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", title)  # 剔除不合法字符
-            print(title)
-            os.makedirs(f'mzsock/{title}/',exist_ok=True) #创建目录
-            page_num = h1[1][6:-7]
-            page_num = page_num.split('/')[1]
-            print(page_num)
-            for i in range(1,int(page_num)+1):
-                content_url=f'{url[:-5]}_{i}.html'
-                content_response = requests.get(content_url, headers=self.headers).text
-                div=re.findall(r'<div class="picsbox picsboxcenter chenxing_pic_images">(.+?)</div>',content_response,re.S)[0]
-                img_urls=re.findall(r'<img src="(.+?)"  alt=".+?" width',div,re.S)
-                x=1
-                for img_url in img_urls:
-                    img_name=f'{i}_{x}{img_url[-4:]}'
-                    self.bctp(f'mzsock/{title}/', img_url, img_name)
-                    x=x+1
- 
-    def bctp(self,lj, img_url, img_name):
-        print("开始下载图片!")
-        try:
-            r = requests.get(img_url, timeout=5, headers=self.headers)
-            with open(f'{lj}/{img_name}', 'wb') as f:
-                f.write(r.content)
-                print(f'下载{img_name}图片成功!')
-                time.sleep(1)
-        except Exception as e:
-            if "port=443): Read timed out" in str(e):
-                time.sleep(2)
-                try:
-                    r = requests.get(img_url, timeout=5, headers=self.headers)
-                    with open(f'{lj}/{img_name}', 'wb') as f:
-                        f.write(r.content)
-                        print(f'下载{img_name}图片成功!')
-                except Exception as e:
-                    print(f'下载{img_name}图片失败!')
-                    print(f'错误代码:{e}')
-                    with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
-                        f.write(f'错误代码:{e}---下载 {img_url} 图片失败\n')
-            else:
-                print(f'下载{img_name}图片失败!')
-                print(f'错误代码:{e}')
-                with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
-                    f.write(f'错误代码:{e}---下载 {img_url} 图片失败\n')
- 
- 
-if __name__ == '__main__':
-    spider=Mzsock()
-    categroy_urls=spider.get_categroy_url()
-    urllist=spider.get_urllist(categroy_urls)
-    contentlist=spider.get_contentlist(urllist)
-    spider.get_content(contentlist)