|
@@ -9,52 +9,49 @@ import os
|
|
from time import sleep
|
|
from time import sleep
|
|
import random
|
|
import random
|
|
from urllib import request
|
|
from urllib import request
|
|
-
|
|
|
|
-project_dir = "C:/Users/dell/Desktop/xiaohua-crawl"
|
|
|
|
-img_dir = project_dir+"/images"
|
|
|
|
-data_dir = project_dir+"/data"
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-def downloadImg(imgUrl, fileName):
|
|
|
|
- try:
|
|
|
|
- headers = {
|
|
|
|
- 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
|
|
|
|
- 'Referer': 'http://www.xiaohuar.com'
|
|
|
|
- }
|
|
|
|
- req = request.Request(url=imgUrl)
|
|
|
|
- for i in headers:
|
|
|
|
- req.add_header(i, headers[i])
|
|
|
|
- res = request.urlopen(req)
|
|
|
|
- with open(img_dir+"/"+fileName+imgUrl[-4:], "wb") as code:
|
|
|
|
- code.write(res.read())
|
|
|
|
-# sleep(random.randint(1,5))
|
|
|
|
- except Exception as err:
|
|
|
|
- print(err)
|
|
|
|
- finally:
|
|
|
|
- print("pic:" + fileName+".jpg")
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-def __init__():
|
|
|
|
- if(os.path.exists(img_dir) != True):
|
|
|
|
- os.mkdir(img_dir)
|
|
|
|
- if(os.path.exists(data_dir) != True):
|
|
|
|
- os.mkdir(data_dir)
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-def main():
|
|
|
|
- file = data_dir+"/result.csv"
|
|
|
|
- with open(file, 'r') as f:
|
|
|
|
- # data=csv.reader(f, csv.excel_tab)
|
|
|
|
- data = csv.reader(f)
|
|
|
|
- for row in data:
|
|
|
|
- imgUrl = ""
|
|
|
|
- fileName = ""
|
|
|
|
- for i in range(len(row)):
|
|
|
|
- fileName = row[4]+"-"+row[3]
|
|
|
|
- imgUrl = "http://www.xiaohuar.com"+row[2]
|
|
|
|
-
|
|
|
|
- downloadImg(imgUrl, fileName)
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-__init__()
|
|
|
|
-main()
|
|
|
|
|
|
+from crawl_xiaohua.extractor.base_extractor import BaseExtractor
|
|
|
|
+
|
|
|
|
class Xiaohuar(BaseExtractor):
    """Download the images listed in a crawled xiaohuar.com result CSV.

    NOTE(review): ``self.sess`` and ``self.data_dir`` are used below but are
    not defined in this class; presumably ``BaseExtractor`` provides them --
    confirm against the base class.
    """

    project_dir = "C:/Users/dell/Desktop/xiaohua-crawl"
    img_dir = project_dir + "/images"

    # Headers sent with every image request.
    _headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        'Referer': 'http://www.xiaohuar.com'
    }

    def __init__(self):
        """Create the image and data directories when they are missing."""
        # NOTE(review): BaseExtractor.__init__ is not invoked here; if the
        # base class sets up ``self.sess`` in its constructor, a
        # ``super().__init__()`` call is needed -- confirm.
        # makedirs(exist_ok=True) avoids the check-then-create race and also
        # creates a missing parent directory.
        os.makedirs(self.img_dir, exist_ok=True)
        os.makedirs(self.data_dir, exist_ok=True)

    def downloadImg(self, imgUrl, fileName):
        """Download one image into ``img_dir``.

        Errors are printed rather than raised, so one failed download does
        not stop the caller's loop.

        :param imgUrl: image url; its last four characters become the
            extension of the saved file
        :param fileName: file name without extension
        """
        try:
            res = self.sess.get(imgUrl, headers=self._headers)
            # requests-style responses expose the body as ``.content`` and
            # have no ``.read()``; fall back to ``.read()`` for file-like
            # (urllib-style) responses.
            payload = res.content if hasattr(res, "content") else res.read()
            # The body is fetched before the file is opened, so a failed
            # download does not leave an empty image file behind.
            with open(self.img_dir + "/" + fileName + imgUrl[-4:], "wb") as img_file:
                img_file.write(payload)
        except Exception as err:
            print(err)
        finally:
            print("pic:" + fileName+".jpg")

    def run(self):
        """Read the result CSV and download the image referenced by each row.

        Each non-empty row must have at least five columns: column 2 is the
        image path relative to the site root, columns 4 and 3 form the saved
        file name.
        """
        # NOTE(review): the CSV is read from ``img_dir`` rather than
        # ``data_dir`` -- confirm this is the intended location.
        csv_path = self.img_dir+"/result_xiaohuar.csv"
        # newline='' is what the csv module expects from the file object.
        with open(csv_path, 'r', newline='') as f:
            for row in csv.reader(f):
                if not row:
                    # Blank line in the CSV: nothing to download.
                    continue
                fileName = row[4]+"-"+row[3]
                imgUrl = "http://www.xiaohuar.com"+row[2]
                self.downloadImg(imgUrl, fileName)
|