| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285 |
- import re
- import json
- import logging
- from typing import Dict, List, Optional, Any
- from urllib.parse import urlencode, urlparse, parse_qs
- from .base import BaseCrawler
- from utils.helpers import clean_price, parse_sales_volume
class TaobaoCrawler(BaseCrawler):
    """
    Crawler for the Taobao platform.

    NOTE: Taobao employs strong anti-scraping measures; configure a logged-in
    Cookie and/or an HTTP proxy for reliable operation.
    """

    platform = 'taobao'

    def __init__(self, cookie: Optional[str] = None, proxy: Optional[str] = None):
        """
        Initialize the Taobao crawler.

        :param cookie: Cookie string captured after logging in to Taobao.
        :param proxy: Proxy server address, e.g. 'http://127.0.0.1:7890'.
        """
        super().__init__()
        self.cookie = cookie
        self.proxy = proxy

        # Delegate to the public setters so the header/proxy wiring lives in
        # exactly one place (it was previously duplicated here).
        if cookie:
            self.set_cookie(cookie)
        if proxy:
            self.set_proxy(proxy)

    def search(self, keyword: str, page: int = 1, sort: str = 'default', **kwargs) -> List[Dict[str, Any]]:
        """
        Search Taobao for products matching *keyword*.

        :param keyword: search keyword
        :param page: page number, 1-based
        :param sort: sort order: default (relevance), sale-desc (sales),
                     price-asc (price ascending), price-desc (price descending)
        :return: list of parsed product dicts; empty list on failure
        """
        self.logger.info(f"搜索淘宝商品: keyword={keyword}, page={page}")

        base_url = self.config.get('search_url')
        if not base_url:
            # Guard: without this, a missing config entry used to produce the
            # literal request URL "None?q=...".
            self.logger.error("search_url 未配置")
            return []

        params = {
            'q': keyword,
            # Taobao paginates by item offset; each result page holds 44 items.
            's': (page - 1) * 44,
            'sort': sort
        }
        search_url = f"{base_url}?{urlencode(params)}"
        self.logger.debug(f"搜索URL: {search_url}")

        response = self._make_request('GET', search_url)
        if not response:
            self.logger.warning(f"搜索请求失败: {keyword}")
            return []

        products = self._parse_search_result(response.text, search_url)
        self.logger.info(f"从搜索结果中解析到 {len(products)} 个商品")
        return products

    def _parse_search_result(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """
        Parse a Taobao search-result page.

        The result list is rendered client-side, so the primary strategy is to
        extract the embedded ``g_page_config`` JSON blob; raw-HTML scraping is
        kept as a fallback.
        """
        products: List[Dict[str, Any]] = []

        # Non-greedy match stops at the first '};' — assumed not to occur
        # inside the JSON payload itself; TODO confirm against live pages.
        g_page_config_match = re.search(r'g_page_config\s*=\s*({.*?});', html, re.DOTALL)
        if g_page_config_match:
            try:
                g_page_config = json.loads(g_page_config_match.group(1))
                auctions = g_page_config.get('mods', {}).get('itemlist', {}).get('data', {}).get('auctions', [])
                for auction in auctions:
                    product = self._parse_auction_item(auction, source_url)
                    if product:
                        products.append(product)
            except json.JSONDecodeError as e:
                self.logger.error(f"解析 g_page_config 失败: {e}")

        # Fallback: scrape the HTML directly when the JSON route yields nothing.
        if not products:
            products = self._parse_html_directly(html, source_url)

        return products

    def _parse_auction_item(self, auction: Dict, source_url: str) -> Optional[Dict[str, Any]]:
        """
        Convert one ``auctions`` entry from ``g_page_config`` into the common
        product dict shape. Returns None when the entry lacks an id or title,
        or when parsing raises.
        """
        try:
            nid = auction.get('nid', '')
            if not nid:
                return None

            title = auction.get('raw_title', '') or auction.get('title', '')
            if not title:
                return None

            price = clean_price(auction.get('view_price', '0'))
            # NOTE(review): 'view_fee' looks like a shipping fee rather than an
            # original/list price — confirm the field semantics upstream.
            original_price = clean_price(auction.get('view_fee', '0')) or None

            # Taobao returns protocol-relative URLs ("//img...."); normalize.
            pic_url = auction.get('pic_url', '')
            if pic_url and not pic_url.startswith('http'):
                pic_url = 'https:' + pic_url

            detail_url = auction.get('detail_url', '')
            if detail_url and not detail_url.startswith('http'):
                detail_url = 'https:' + detail_url

            sales_volume = parse_sales_volume(auction.get('view_sales', ''))
            shop_name = auction.get('nick', '')

            # 'shopcard' may be absent or None; a single guarded lookup replaces
            # the previous redundant double .get().
            is_tmall = (auction.get('shopcard') or {}).get('isTmall', False)

            return {
                'product_id': str(nid),
                'name': title.strip(),
                'price': price,
                'original_price': original_price,
                'image_url': pic_url,
                'url': detail_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'is_tmall': is_tmall,
                'source_url': source_url,
                'is_wholesale': False,
                'currency': 'CNY'
            }
        except Exception as e:
            self.logger.error(f"解析商品数据失败: {e}")
            return None

    def _parse_html_directly(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """
        Fallback parser that scrapes product items straight out of the HTML
        with regular expressions. Best-effort only; items missing a title or
        id are skipped.
        """
        products: List[Dict[str, Any]] = []

        item_pattern = r'<div[^>]*class="[^"]*item[^"]*"[^>]*data-id="(\d+)"[^>]*>(.*?)</div>\s*</div>\s*</div>'
        items = re.findall(item_pattern, html, re.DOTALL | re.IGNORECASE)

        for item_id, item_html in items:
            try:
                title_match = re.search(r'<a[^>]*class="[^"]*J_ClickStat[^"]*"[^>]*>(.*?)</a>', item_html, re.DOTALL)
                title = title_match.group(1) if title_match else ''
                # Strip any nested tags (e.g. <span> highlights) from the title.
                title = re.sub(r'<[^>]+>', '', title).strip() if title else ''

                price_match = re.search(r'<strong[^>]*data-price="([\d.]+)"', item_html)
                price = clean_price(price_match.group(1)) if price_match else 0

                sales_match = re.search(r'<div[^>]*class="deal-cnt"[^>]*>([^<]+)</div>', item_html)
                sales_volume = parse_sales_volume(sales_match.group(1) if sales_match else '')

                shop_match = re.search(r'<a[^>]*class="shopname[^"]*"[^>]*>(.*?)</a>', item_html, re.DOTALL)
                shop_name = shop_match.group(1) if shop_match else ''
                shop_name = re.sub(r'<[^>]+>', '', shop_name).strip() if shop_name else ''

                img_match = re.search(r'<img[^>]*data-src="([^"]+)"', item_html)
                img_url = img_match.group(1) if img_match else ''
                if img_url and not img_url.startswith('http'):
                    img_url = 'https:' + img_url

                if title and item_id:
                    products.append({
                        'product_id': str(item_id),
                        'name': title,
                        'price': price,
                        'image_url': img_url,
                        'shop_name': shop_name,
                        'sales_volume': sales_volume,
                        'source_url': source_url,
                        'is_wholesale': False,
                        'currency': 'CNY'
                    })
            except Exception as e:
                self.logger.error(f"解析HTML商品失败: {e}")
                continue

        return products

    def get_product_detail(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """
        Fetch and parse a Taobao product detail page.

        :param product_id: product id (nid)
        :return: product detail dict, or None when the request fails
        """
        self.logger.info(f"获取淘宝商品详情: product_id={product_id}")

        detail_url = f"https://item.taobao.com/item.htm?id={product_id}"
        response = self._make_request('GET', detail_url)
        if not response:
            self.logger.warning(f"获取商品详情失败: {product_id}")
            return None

        return self._parse_product_detail(response.text, product_id, detail_url)

    def _parse_product_detail(self, html: str, product_id: str, source_url: str) -> Optional[Dict[str, Any]]:
        """
        Extract title, price, shop name and sales count from a detail page.
        Returns None when parsing raises.
        """
        try:
            title_match = re.search(r'<title>([^<]+)</title>', html)
            # Page titles are "<product name>-<suffix>"; keep only the name.
            title = title_match.group(1).split('-')[0].strip() if title_match else ''

            price_match = re.search(r'"price"\s*:\s*"([\d.]+)"', html) or \
                          re.search(r'"defaultItemPrice"\s*:\s*"([\d.]+)"', html)
            price = clean_price(price_match.group(1)) if price_match else 0

            shop_match = re.search(r'"nick"\s*:\s*"([^"]+)"', html) or \
                         re.search(r'shopName["\']\s*[:=]\s*["\']([^"\']+)["\']', html)
            shop_name = shop_match.group(1) if shop_match else ''

            sales_match = re.search(r'"sellCount"\s*:\s*(\d+)', html) or \
                          re.search(r'"totalSoldQuantity"\s*:\s*(\d+)', html)
            sales_volume = int(sales_match.group(1)) if sales_match else None

            return {
                'product_id': str(product_id),
                'name': title,
                'price': price,
                # The detail page exposes no list price; emit the key anyway so
                # get_price()'s detail.get('original_price') reads a real field.
                'original_price': None,
                'url': source_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'is_wholesale': False,
                'currency': 'CNY'
            }
        except Exception as e:
            self.logger.error(f"解析商品详情失败: {e}")
            return None

    def get_price(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """
        Fetch just the price information for a product.

        :param product_id: product id
        :return: price dict, or None when the detail fetch fails
        """
        detail = self.get_product_detail(product_id, **kwargs)
        if not detail:
            return None
        return {
            'product_id': str(product_id),
            'price': detail.get('price', 0),
            'original_price': detail.get('original_price'),
            'currency': 'CNY',
            'platform': self.platform,
            'source_url': detail.get('url', ''),
            'price_type': 'retail'
        }

    def set_cookie(self, cookie: str):
        """
        Set the Cookie used for subsequent requests.

        :param cookie: Cookie string
        """
        self.cookie = cookie
        self.headers['Cookie'] = cookie

    def set_proxy(self, proxy: str):
        """
        Set the HTTP/HTTPS proxy used for subsequent requests.

        :param proxy: proxy server address
        """
        self.proxy = proxy
        self.session.proxies = {
            'http': proxy,
            'https': proxy
        }
|