Browse Source

Modify xiaohuar extractor

liuyuqi-dellpc 8 months ago
parent
commit
9597fda8c8

+ 2 - 1
conf/config.json

@@ -1,5 +1,6 @@
 {
     "cookie": "",
     "indexPage": "123650",
-    "indexDuanziPage": "845"
+    "indexDuanziPage": "845",
+    "data_dir":"data"
 }

+ 14 - 5
crawl_xiaohua/__init__.py

@@ -6,20 +6,26 @@
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @Desc    :   main function
 '''
-import time,sys,re,os
+import time
+import sys
+import re
+import os
 from crawl_xiaohua.crawl_xiaohua import CrawlXiaohua
+from crawl_xiaohua.extractor.xiaohuar import Xiaohuar
 from flask import Flask
 
-def server(config:str, argv=None):
+
+def server(config: str, argv=None):
     ''' web server mode '''
     if argv is None:
         argv = sys.argv
     else:
         sys.argv.extend(argv)
-    app=Flask(__name__)
+    app = Flask(__name__)
     app.run()
 
-def run(extractor:str, cmd:str, argv=None):
+
+def run(extractor: str, cmd: str, argv=None):
     ''' shell mode '''
     if argv is None:
         argv = sys.argv
@@ -29,5 +35,8 @@ def run(extractor:str, cmd:str, argv=None):
             crawl.crawlDuanzi()
         else:
             crawl.crawl()
+    elif extractor == 'xiaohuar':
+        crawl = Xiaohuar()
+        crawl.run()
     else:
-        print('unknown extractor: %s' % extractor)
+        print('unknown extractor: %s' % extractor)

+ 0 - 0
crawl_xiaohua/DownloadProgress.py → crawl_xiaohua/downloader/DownloadProgress.py


+ 3 - 1
crawl_xiaohua/extractor/base_extractor.py

@@ -6,11 +6,13 @@
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @Desc    :   
 '''
+import requests
+from bs4 import BeautifulSoup
 
 class BaseExtractor(object):
     
     def __init__(self):
-        pass
+        self.sess=requests.Session()
 
     def download(self):
         pass

+ 46 - 49
crawl_xiaohua/extractor/xiaohuar.py

@@ -9,52 +9,49 @@ import os
 from time import sleep
 import random
 from urllib import request
-
-project_dir = "C:/Users/dell/Desktop/xiaohua-crawl"
-img_dir = project_dir+"/images"
-data_dir = project_dir+"/data"
-
-
-def downloadImg(imgUrl, fileName):
-    try:
-        headers = {
-            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
-            'Referer': 'http://www.xiaohuar.com'
-        }
-        req = request.Request(url=imgUrl)
-        for i in headers:
-            req.add_header(i, headers[i])
-        res = request.urlopen(req)
-        with open(img_dir+"/"+fileName+imgUrl[-4:], "wb") as code:
-            code.write(res.read())
-#         sleep(random.randint(1,5))
-    except Exception as err:
-        print(err)
-    finally:
-        print("pic:" + fileName+".jpg")
-
-
-def __init__():
-    if(os.path.exists(img_dir) != True):
-        os.mkdir(img_dir)
-    if(os.path.exists(data_dir) != True):
-        os.mkdir(data_dir)
-
-
-def main():
-    file = data_dir+"/result.csv"
-    with open(file, 'r') as f:
-        #         data=csv.reader(f, csv.excel_tab)
-        data = csv.reader(f)
-        for row in data:
-            imgUrl = ""
-            fileName = ""
-            for i in range(len(row)):
-                fileName = row[4]+"-"+row[3]
-                imgUrl = "http://www.xiaohuar.com"+row[2]
-
-            downloadImg(imgUrl, fileName)
-
-
-__init__()
-main()
+from crawl_xiaohua.extractor.base_extractor import BaseExtractor
+
+class Xiaohuar(BaseExtractor):
+    ''' extract xiaohuar.com '''
+    project_dir = "C:/Users/dell/Desktop/xiaohua-crawl"
+    img_dir = project_dir+"/images"
+    
+    _headers = {
+     'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
+                'Referer': 'http://www.xiaohuar.com'
+            }
+    def __init__(self):
+        if(os.path.exists(self.img_dir) != True):
+            os.mkdir(self.img_dir)
+        if(os.path.exists(self.data_dir) != True):
+            os.mkdir(self.data_dir)
+
+    def downloadImg(self, imgUrl, fileName):
+        ''' download image
+            :param imgUrl: image url
+            :param fileName: file name
+        '''
+        try:
+            res = self.sess.get(imgUrl, headers=self._headers)
+            with open(self.img_dir+"/"+fileName+imgUrl[-4:], "wb") as file:
+                file.write(res.read())
+    #         sleep(random.randint(1,5))
+        except Exception as err:
+            print(err)
+        finally:
+            print("pic:" + fileName+".jpg")
+
+    def run(self):
+        ''' run '''
+        file = self.img_dir+"/result_xiaohuar.csv"
+        with open(file, 'r') as f:
+            #         data=csv.reader(f, csv.excel_tab)
+            data = csv.reader(f)
+            for row in data:
+                imgUrl = ""
+                fileName = ""
+                for i in range(len(row)):
+                    fileName = row[4]+"-"+row[3]
+                    imgUrl = "http://www.xiaohuar.com"+row[2]
+
+                self.downloadImg(imgUrl, fileName)

+ 8 - 14
main.py

@@ -16,18 +16,12 @@ parser.add_argument('--extractor', type=str, help='extractor name')
 parser.add_argument('--cmd', type=str, help='shell command')
 
 if __name__ == '__main__':
-    try:
-        args = parser.parse_args()
-        if args.command == 'server':
-            crawl_xiaohua.server(args.config)
-            print('server')
-        elif args.command == 'run':
-            crawl_xiaohua.run(args.extractor, args.cmd)
-        else:
-            print('unknown command')
-            parser.print_help()
-    except Exception as e:
-        print(e)
+    args = parser.parse_args()
+    if args.command == 'server':
+        crawl_xiaohua.server(args.config)
+        print('server')
+    elif args.command == 'run':
+        crawl_xiaohua.run(args.extractor, args.cmd)
+    else:
+        print('unknown command')
         parser.print_help()
-    finally:
-        pass