#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author  :   liuyuqi
@Contact :   liuyuqi.gov@msn.cn
@Time    :   2020/04/06 22:13:01
@Version :   1.0
@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc    :   爬取螺蛳粉天猫数据
'''

# 导入所需包
import os
import re
import time

import pandas as pd
import parsel
from selenium import webdriver
from selenium.webdriver.common.by import By


def login_taobao_acount():
	'''Open Chrome on the Taobao login page and choose the Alipay login option.

	A new Chrome WebDriver is created and stored in the module-level
	``browser`` global, which the other scraping functions in this file read.

	NOTE(review): the function returns immediately after the click and does
	not wait for the login to finish; presumably the user completes the
	Alipay login by hand in the opened window -- confirm.
	'''
	global browser
	browser = webdriver.Chrome()

	login_url = 'https://login.taobao.com/member/login.jhtml'
	browser.get(login_url)

	# find_element(By.<strategy>, value) replaces the find_element_by_*
	# helpers, which were removed in Selenium 4.3.
	browser.find_element(By.CLASS_NAME, 'alipay-login').click()


def get_assigned_page(key_words):
	'''Search Taobao for ``key_words`` in the shared ``browser`` session.

	Loads the Taobao home page, types the keywords into the search box and
	clicks the search button. The module-level ``browser`` global must
	already hold a WebDriver instance.

	Args:
		key_words: Search term; it is converted with ``str()`` before being
			typed into the search box.
	'''
	tb_url = 'https://www.taobao.com/'
	browser.get(tb_url)

	# find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
	# removed in Selenium 4.3.
	search_box = browser.find_element(By.XPATH, '//*[@id="q"]')
	search_box.send_keys(str(key_words))

	# Submit the search form.
	browser.find_element(By.XPATH, '//*[@id="J_TSearchForm"]/div[1]/button').click()


def get_one_page():
	'''Parse the page currently loaded in ``browser`` into a DataFrame.

	Reads ``browser.page_source`` and extracts five columns from the
	``grid g-clearfix`` container: ``goods_name``, ``shop_name``, ``price``,
	``purchase_num`` and ``location``.

	Each ``purchase_num`` entry is the list returned by ``re.findall`` for
	one row block, not a plain string.

	Returns:
		pandas.DataFrame holding the five columns above. The columns are
		extracted independently of each other, so pandas raises
		``ValueError`` if they end up with different lengths.
	'''
	selector = parsel.Selector(browser.page_source)

	# The sales figure is cut out of each row block's raw HTML with a regex.
	row_blocks = selector.xpath('//div[@class="grid g-clearfix"]//div[@class="row row-1 g-clearfix"]').extract()
	deal_counts = []
	for block_html in row_blocks:
		deal_counts.append(re.findall(r'<div class="deal-cnt">(.*?)</div>', block_html))

	# Key order fixes the column order of the resulting frame.
	columns = {
		'goods_name': selector.xpath('//div[@class="grid g-clearfix"]//img/@alt').extract(),
		'shop_name': selector.xpath('//div[@class="grid g-clearfix"]//div[@class="shop"]/a/span[2]/text()').extract(),
		'price': selector.xpath('//div[@class="grid g-clearfix"]//div[contains(@class,"price")]/strong/text()').extract(),
		'purchase_num': deal_counts,
		'location': selector.xpath('//div[@class="grid g-clearfix"]//div[@class="location"]/text()').extract(),
	}
	return pd.DataFrame(columns)


def get_all_page(page_num):
	'''Scrape consecutive result pages and combine them into one DataFrame.

	Starting from the page currently shown in ``browser``, pages
	``1 .. page_num - 1`` are scraped with ``get_one_page()``. Scraping
	stops after page 100 at the latest, and the "next" link is never
	clicked after the final scraped page.

	Args:
		page_num: Exclusive upper bound of the page counter, i.e.
			``page_num=101`` scrapes 100 pages.

	Returns:
		pandas.DataFrame with the rows of every scraped page, re-indexed
		from 0. An empty DataFrame is returned when no page was scraped
		(``page_num <= 1``).
	'''
	# The original code intended to stop paging at page 100 but compared
	# page_num instead of the loop counter, so the stop never triggered for
	# page_num=101 and triggered after a single page for page_num=100.
	max_page = 100
	last_page = min(page_num - 1, max_page)

	frames = []
	for i in range(1, last_page + 1):
		# Progress message for the page about to be parsed.
		print('我正在获取第{}页的数据'.format(i))
		frames.append(get_one_page())

		if i < last_page:
			# Move on to the next result page. find_element(By.CSS_SELECTOR, ...)
			# replaces find_element_by_css_selector (removed in Selenium 4.3).
			browser.find_element(By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.next > a').click()
			# Pause 10 seconds before the next page is parsed.
			time.sleep(10)

	if not frames:
		# pd.concat raises on an empty list; keep returning an empty frame.
		return pd.DataFrame()

	# DataFrame.append was removed in pandas 2.0; concatenate once instead.
	return pd.concat(frames, ignore_index=True)


if __name__ == '__main__':
	# Script entry point: log in, search for the keyword, scrape the result
	# pages and write them to an Excel file.
	output_path = 'data/螺蛳粉店铺数据.xlsx'
	try:
		# Log in first.
		login_taobao_acount()
		# Then run the search.
		get_assigned_page(key_words='螺蛳粉')
		# Then page through the results and collect them.
		df_all = get_all_page(page_num=101)
		# Make sure the output directory exists so the scraped data is not
		# lost to a missing-directory error at the very end.
		os.makedirs(os.path.dirname(output_path), exist_ok=True)
		# Write the data out.
		df_all.to_excel(output_path, index=False)
	finally:
		# Always close the browser; the global only exists once the login
		# step has created the driver.
		if 'browser' in globals():
			browser.quit()