import re
import json
import logging
from typing import Dict, List, Optional, Any
from urllib.parse import urlencode, urlparse, parse_qs
from .base import BaseCrawler
from utils.helpers import clean_price, parse_sales_volume
class TaobaoCrawler(BaseCrawler):
    """
    Taobao platform crawler.

    NOTE: Taobao has strong anti-scraping measures; a logged-in Cookie
    and/or a proxy usually needs to be configured for requests to succeed.
    """

    # Platform identifier consumed by the crawler framework.
    platform = 'taobao'
def __init__(self, cookie: str = None, proxy: str = None):
"""
初始化淘宝爬虫
:param cookie: 淘宝登录后的 Cookie 字符串
:param proxy: 代理服务器地址,如 'http://127.0.0.1:7890'
"""
super().__init__()
self.cookie = cookie
self.proxy = proxy
if cookie:
self.headers['Cookie'] = cookie
if proxy:
self.session.proxies = {
'http': proxy,
'https': proxy
}
def search(self, keyword: str, page: int = 1, sort: str = 'default', **kwargs) -> List[Dict[str, Any]]:
"""
搜索淘宝商品
:param keyword: 搜索关键词
:param page: 页码,从1开始
:param sort: 排序方式: default(综合), sale-desc(销量), price-asc(价格升序), price-desc(价格降序)
:return: 商品列表
"""
self.logger.info(f"搜索淘宝商品: keyword={keyword}, page={page}")
params = {
'q': keyword,
's': (page - 1) * 44,
'sort': sort
}
search_url = f"{self.config.get('search_url')}?{urlencode(params)}"
self.logger.debug(f"搜索URL: {search_url}")
response = self._make_request('GET', search_url)
if not response:
self.logger.warning(f"搜索请求失败: {keyword}")
return []
products = self._parse_search_result(response.text, search_url)
self.logger.info(f"从搜索结果中解析到 {len(products)} 个商品")
return products
def _parse_search_result(self, html: str, source_url: str) -> List[Dict[str, Any]]:
"""
解析淘宝搜索结果页面
注意:淘宝搜索结果主要通过 JavaScript 渲染,需要从页面中的 JSON 数据提取
"""
products = []
g_page_config_match = re.search(r'g_page_config\s*=\s*({.*?});', html, re.DOTALL)
if g_page_config_match:
try:
g_page_config = json.loads(g_page_config_match.group(1))
auctions = g_page_config.get('mods', {}).get('itemlist', {}).get('data', {}).get('auctions', [])
for auction in auctions:
product = self._parse_auction_item(auction, source_url)
if product:
products.append(product)
except json.JSONDecodeError as e:
self.logger.error(f"解析 g_page_config 失败: {e}")
if not products:
products = self._parse_html_directly(html, source_url)
return products
def _parse_auction_item(self, auction: Dict, source_url: str) -> Optional[Dict[str, Any]]:
"""
解析单个拍卖商品数据
"""
try:
nid = auction.get('nid', '')
if not nid:
return None
title = auction.get('raw_title', '') or auction.get('title', '')
if not title:
return None
price = clean_price(auction.get('view_price', '0'))
original_price = clean_price(auction.get('view_fee', '0')) or None
pic_url = auction.get('pic_url', '')
if pic_url and not pic_url.startswith('http'):
pic_url = 'https:' + pic_url
detail_url = auction.get('detail_url', '')
if detail_url and not detail_url.startswith('http'):
detail_url = 'https:' + detail_url
sales_str = auction.get('view_sales', '')
sales_volume = parse_sales_volume(sales_str)
shop_name = auction.get('nick', '')
is_tmall = auction.get('shopcard', {}).get('isTmall', False) if auction.get('shopcard') else False
return {
'product_id': str(nid),
'name': title.strip(),
'price': price,
'original_price': original_price,
'image_url': pic_url,
'url': detail_url,
'shop_name': shop_name,
'sales_volume': sales_volume,
'is_tmall': is_tmall,
'source_url': source_url,
'is_wholesale': False,
'currency': 'CNY'
}
except Exception as e:
self.logger.error(f"解析商品数据失败: {e}")
return None
def _parse_html_directly(self, html: str, source_url: str) -> List[Dict[str, Any]]:
"""
直接从 HTML 中解析商品(备用方法)
"""
products = []
item_pattern = r'
]*class="[^"]*item[^"]*"[^>]*data-id="(\d+)"[^>]*>(.*?)
\s*\s*'
items = re.findall(item_pattern, html, re.DOTALL | re.IGNORECASE)
for item_id, item_html in items:
try:
title_match = re.search(r']*class="[^"]*J_ClickStat[^"]*"[^>]*>(.*?)', item_html, re.DOTALL)
title = title_match.group(1) if title_match else ''
title = re.sub(r'<[^>]+>', '', title).strip() if title else ''
price_match = re.search(r']*data-price="([\d.]+)"', item_html)
price = clean_price(price_match.group(1)) if price_match else 0
sales_match = re.search(r']*class="deal-cnt"[^>]*>([^<]+)
', item_html)
sales_str = sales_match.group(1) if sales_match else ''
sales_volume = parse_sales_volume(sales_str)
shop_match = re.search(r']*class="shopname[^"]*"[^>]*>(.*?)', item_html, re.DOTALL)
shop_name = shop_match.group(1) if shop_match else ''
shop_name = re.sub(r'<[^>]+>', '', shop_name).strip() if shop_name else ''
img_match = re.search(r'
]*data-src="([^"]+)"', item_html)
img_url = img_match.group(1) if img_match else ''
if img_url and not img_url.startswith('http'):
img_url = 'https:' + img_url
if title and item_id:
products.append({
'product_id': str(item_id),
'name': title,
'price': price,
'image_url': img_url,
'shop_name': shop_name,
'sales_volume': sales_volume,
'source_url': source_url,
'is_wholesale': False,
'currency': 'CNY'
})
except Exception as e:
self.logger.error(f"解析HTML商品失败: {e}")
continue
return products
def get_product_detail(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
"""
获取淘宝商品详情
:param product_id: 商品ID (nid)
:return: 商品详情
"""
self.logger.info(f"获取淘宝商品详情: product_id={product_id}")
detail_url = f"https://item.taobao.com/item.htm?id={product_id}"
response = self._make_request('GET', detail_url)
if not response:
self.logger.warning(f"获取商品详情失败: {product_id}")
return None
return self._parse_product_detail(response.text, product_id, detail_url)
def _parse_product_detail(self, html: str, product_id: str, source_url: str) -> Optional[Dict[str, Any]]:
"""
解析商品详情页面
"""
try:
title_match = re.search(r'([^<]+)', html)
title = title_match.group(1).split('-')[0].strip() if title_match else ''
price_match = re.search(r'"price"\s*:\s*"([\d.]+)"', html) or \
re.search(r'"defaultItemPrice"\s*:\s*"([\d.]+)"', html)
price = clean_price(price_match.group(1)) if price_match else 0
shop_match = re.search(r'"nick"\s*:\s*"([^"]+)"', html) or \
re.search(r'shopName["\']\s*[:=]\s*["\']([^"\']+)["\']', html)
shop_name = shop_match.group(1) if shop_match else ''
sales_match = re.search(r'"sellCount"\s*:\s*(\d+)', html) or \
re.search(r'"totalSoldQuantity"\s*:\s*(\d+)', html)
sales_volume = int(sales_match.group(1)) if sales_match else None
return {
'product_id': str(product_id),
'name': title,
'price': price,
'url': source_url,
'shop_name': shop_name,
'sales_volume': sales_volume,
'is_wholesale': False,
'currency': 'CNY'
}
except Exception as e:
self.logger.error(f"解析商品详情失败: {e}")
return None
def get_price(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
"""
获取商品价格
:param product_id: 商品ID
:return: 价格信息
"""
detail = self.get_product_detail(product_id, **kwargs)
if detail:
return {
'product_id': str(product_id),
'price': detail.get('price', 0),
'original_price': detail.get('original_price'),
'currency': 'CNY',
'platform': self.platform,
'source_url': detail.get('url', ''),
'price_type': 'retail'
}
return None
def set_cookie(self, cookie: str):
"""
设置 Cookie
:param cookie: Cookie 字符串
"""
self.cookie = cookie
self.headers['Cookie'] = cookie
def set_proxy(self, proxy: str):
"""
设置代理
:param proxy: 代理服务器地址
"""
self.proxy = proxy
self.session.proxies = {
'http': proxy,
'https': proxy
}