import re
import json
import logging
from typing import Dict, List, Optional, Any
from urllib.parse import urlencode, urlparse, parse_qs

from .base import BaseCrawler
from utils.helpers import clean_price, parse_sales_volume


class TaobaoCrawler(BaseCrawler):
    """Taobao platform crawler.

    NOTE: Taobao has strong anti-bot protection; a logged-in Cookie
    and/or an HTTP proxy is usually required for requests to succeed.
    """

    platform = 'taobao'

    def __init__(self, cookie: Optional[str] = None, proxy: Optional[str] = None):
        """Initialize the Taobao crawler.

        :param cookie: Cookie string captured after logging in to Taobao.
        :param proxy: Proxy server address, e.g. 'http://127.0.0.1:7890'.
        """
        super().__init__()
        self.cookie = cookie
        self.proxy = proxy
        if cookie:
            self.headers['Cookie'] = cookie
        if proxy:
            # Route both schemes through the same proxy endpoint.
            self.session.proxies = {
                'http': proxy,
                'https': proxy
            }

    def search(self, keyword: str, page: int = 1, sort: str = 'default',
               **kwargs) -> List[Dict[str, Any]]:
        """Search Taobao for products.

        :param keyword: Search keyword.
        :param page: Page number, starting at 1.
        :param sort: Sort order: default (relevance), sale-desc (sales),
                     price-asc (price ascending), price-desc (price descending).
        :return: List of product dicts (empty list on request failure).
        """
        self.logger.info(f"搜索淘宝商品: keyword={keyword}, page={page}")

        params = {
            'q': keyword,
            # Taobao paginates by item offset; 44 items per result page.
            's': (page - 1) * 44,
            'sort': sort
        }

        search_url = f"{self.config.get('search_url')}?{urlencode(params)}"
        self.logger.debug(f"搜索URL: {search_url}")

        response = self._make_request('GET', search_url)
        if not response:
            self.logger.warning(f"搜索请求失败: {keyword}")
            return []

        products = self._parse_search_result(response.text, search_url)
        self.logger.info(f"从搜索结果中解析到 {len(products)} 个商品")
        return products

    def _parse_search_result(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """Parse a Taobao search-result page.

        Taobao search results are mostly rendered by JavaScript, so item
        data is extracted from the embedded ``g_page_config`` JSON blob,
        falling back to raw-HTML scraping if the blob is absent or invalid.
        """
        products = []

        g_page_config_match = re.search(r'g_page_config\s*=\s*({.*?});', html, re.DOTALL)
        if g_page_config_match:
            try:
                g_page_config = json.loads(g_page_config_match.group(1))
                auctions = (g_page_config.get('mods', {})
                            .get('itemlist', {})
                            .get('data', {})
                            .get('auctions', []))
                for auction in auctions:
                    product = self._parse_auction_item(auction, source_url)
                    if product:
                        products.append(product)
            except json.JSONDecodeError as e:
                self.logger.error(f"解析 g_page_config 失败: {e}")

        if not products:
            # Fallback: scrape item markup directly from the HTML.
            products = self._parse_html_directly(html, source_url)

        return products

    def _parse_auction_item(self, auction: Dict, source_url: str) -> Optional[Dict[str, Any]]:
        """Parse a single auction (item) entry from g_page_config.

        Returns None if the entry lacks an item id or title, or on any
        parsing error (logged).
        """
        try:
            nid = auction.get('nid', '')
            if not nid:
                return None

            title = auction.get('raw_title', '') or auction.get('title', '')
            if not title:
                return None

            price = clean_price(auction.get('view_price', '0'))
            original_price = clean_price(auction.get('view_fee', '0')) or None

            # Taobao returns protocol-relative URLs (``//img...``); prefix https.
            pic_url = auction.get('pic_url', '')
            if pic_url and not pic_url.startswith('http'):
                pic_url = 'https:' + pic_url

            detail_url = auction.get('detail_url', '')
            if detail_url and not detail_url.startswith('http'):
                detail_url = 'https:' + detail_url

            sales_str = auction.get('view_sales', '')
            sales_volume = parse_sales_volume(sales_str)

            shop_name = auction.get('nick', '')
            is_tmall = (auction.get('shopcard', {}).get('isTmall', False)
                        if auction.get('shopcard') else False)

            return {
                'product_id': str(nid),
                'name': title.strip(),
                'price': price,
                'original_price': original_price,
                'image_url': pic_url,
                'url': detail_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'is_tmall': is_tmall,
                'source_url': source_url,
                'is_wholesale': False,
                'currency': 'CNY'
            }
        except Exception as e:
            self.logger.error(f"解析商品数据失败: {e}")
            return None

    def _parse_html_directly(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """Scrape products straight from result-page HTML (fallback path).

        NOTE(review): the regex patterns here were reconstructed — the
        original file had its HTML tag fragments stripped (patterns began
        with a bare ``]*``). Tag names should be verified against a live
        search-result page.
        """
        products = []

        # One match per item card: captures (data-id, inner HTML).
        item_pattern = (r'<div[^>]*class="[^"]*item[^"]*"[^>]*data-id="(\d+)"[^>]*>'
                        r'(.*?)</div>\s*</div>\s*')
        items = re.findall(item_pattern, html, re.DOTALL | re.IGNORECASE)

        for item_id, item_html in items:
            try:
                title_match = re.search(
                    r'<a[^>]*class="[^"]*J_ClickStat[^"]*"[^>]*>(.*?)</a>',
                    item_html, re.DOTALL)
                title = title_match.group(1) if title_match else ''
                # Strip any nested markup (e.g. <span> keyword highlights).
                title = re.sub(r'<[^>]+>', '', title).strip() if title else ''

                price_match = re.search(r'<[^>]*data-price="([\d.]+)"', item_html)
                price = clean_price(price_match.group(1)) if price_match else 0

                sales_match = re.search(
                    r'<[^>]*class="deal-cnt"[^>]*>([^<]+)', item_html)
                sales_str = sales_match.group(1) if sales_match else ''
                sales_volume = parse_sales_volume(sales_str)

                shop_match = re.search(
                    r'<a[^>]*class="shopname[^"]*"[^>]*>(.*?)</a>',
                    item_html, re.DOTALL)
                shop_name = shop_match.group(1) if shop_match else ''
                shop_name = re.sub(r'<[^>]+>', '', shop_name).strip() if shop_name else ''

                img_match = re.search(r'<img[^>]*data-src="([^"]+)"', item_html)
                img_url = img_match.group(1) if img_match else ''
                if img_url and not img_url.startswith('http'):
                    img_url = 'https:' + img_url

                if title and item_id:
                    products.append({
                        'product_id': str(item_id),
                        'name': title,
                        'price': price,
                        'image_url': img_url,
                        'shop_name': shop_name,
                        'sales_volume': sales_volume,
                        'source_url': source_url,
                        'is_wholesale': False,
                        'currency': 'CNY'
                    })
            except Exception as e:
                self.logger.error(f"解析HTML商品失败: {e}")
                continue

        return products

    def get_product_detail(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """Fetch a Taobao product detail page and parse it.

        :param product_id: Item id (nid).
        :return: Product detail dict, or None on request failure.
        """
        self.logger.info(f"获取淘宝商品详情: product_id={product_id}")

        detail_url = f"https://item.taobao.com/item.htm?id={product_id}"

        response = self._make_request('GET', detail_url)
        if not response:
            self.logger.warning(f"获取商品详情失败: {product_id}")
            return None

        return self._parse_product_detail(response.text, product_id, detail_url)

    def _parse_product_detail(self, html: str, product_id: str,
                              source_url: str) -> Optional[Dict[str, Any]]:
        """Parse a product detail page.

        Extracts title, price, shop name and sales count from the page
        title and embedded JSON fields. Returns None on parsing error.
        """
        try:
            # Page title is "<item name>-<suffix>"; keep the part before '-'.
            title_match = re.search(r'<title>([^<]+)</title>', html)
            title = title_match.group(1).split('-')[0].strip() if title_match else ''

            price_match = (re.search(r'"price"\s*:\s*"([\d.]+)"', html) or
                           re.search(r'"defaultItemPrice"\s*:\s*"([\d.]+)"', html))
            price = clean_price(price_match.group(1)) if price_match else 0

            shop_match = (re.search(r'"nick"\s*:\s*"([^"]+)"', html) or
                          re.search(r'shopName["\']\s*[:=]\s*["\']([^"\']+)["\']', html))
            shop_name = shop_match.group(1) if shop_match else ''

            sales_match = (re.search(r'"sellCount"\s*:\s*(\d+)', html) or
                           re.search(r'"totalSoldQuantity"\s*:\s*(\d+)', html))
            sales_volume = int(sales_match.group(1)) if sales_match else None

            return {
                'product_id': str(product_id),
                'name': title,
                'price': price,
                'url': source_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'is_wholesale': False,
                'currency': 'CNY'
            }
        except Exception as e:
            self.logger.error(f"解析商品详情失败: {e}")
            return None

    def get_price(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """Get price information for a product.

        :param product_id: Item id.
        :return: Price-info dict, or None if the detail fetch failed.
        """
        detail = self.get_product_detail(product_id, **kwargs)
        if detail:
            return {
                'product_id': str(product_id),
                'price': detail.get('price', 0),
                'original_price': detail.get('original_price'),
                'currency': 'CNY',
                'platform': self.platform,
                'source_url': detail.get('url', ''),
                'price_type': 'retail'
            }
        return None

    def set_cookie(self, cookie: str):
        """Set the Cookie used for subsequent requests.

        :param cookie: Cookie string.
        """
        self.cookie = cookie
        self.headers['Cookie'] = cookie

    def set_proxy(self, proxy: str):
        """Set the proxy used for subsequent requests.

        :param proxy: Proxy server address.
        """
        self.proxy = proxy
        self.session.proxies = {
            'http': proxy,
            'https': proxy
        }