| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285 |
- import re
- import json
- import logging
- from typing import Dict, List, Optional, Any
- from urllib.parse import urlencode, urlparse, parse_qs
- from .base import BaseCrawler
- from utils.helpers import clean_price, parse_sales_volume
class TaobaoCrawler(BaseCrawler):
    """
    Crawler for the Taobao platform.

    NOTE: Taobao employs strong anti-scraping measures; configure a logged-in
    Cookie and/or an HTTP proxy for reliable operation.
    """

    platform = 'taobao'

    def __init__(self, cookie: Optional[str] = None, proxy: Optional[str] = None):
        """
        Initialize the Taobao crawler.

        :param cookie: Cookie string captured after logging in to Taobao.
        :param proxy: Proxy server address, e.g. 'http://127.0.0.1:7890'.
        """
        super().__init__()
        self.cookie = cookie
        self.proxy = proxy

        # Delegate to the public setters so the header/proxy wiring lives in
        # exactly one place (it was previously duplicated here).
        if cookie:
            self.set_cookie(cookie)
        if proxy:
            self.set_proxy(proxy)

    def search(self, keyword: str, page: int = 1, sort: str = 'default', **kwargs) -> List[Dict[str, Any]]:
        """
        Search Taobao for products matching *keyword*.

        :param keyword: search keyword
        :param page: page number, 1-based
        :param sort: sort order: default (relevance), sale-desc (sales),
                     price-asc (price ascending), price-desc (price descending)
        :return: list of parsed product dicts; empty list on failure
        """
        self.logger.info(f"搜索淘宝商品: keyword={keyword}, page={page}")

        base_url = self.config.get('search_url')
        if not base_url:
            # Guard: without this, a missing config entry used to produce the
            # literal request URL "None?q=...".
            self.logger.error("search_url 未配置")
            return []

        params = {
            'q': keyword,
            # Taobao paginates by item offset; each result page holds 44 items.
            's': (page - 1) * 44,
            'sort': sort
        }
        search_url = f"{base_url}?{urlencode(params)}"
        self.logger.debug(f"搜索URL: {search_url}")

        response = self._make_request('GET', search_url)
        if not response:
            self.logger.warning(f"搜索请求失败: {keyword}")
            return []

        products = self._parse_search_result(response.text, search_url)
        self.logger.info(f"从搜索结果中解析到 {len(products)} 个商品")
        return products

    def _parse_search_result(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """
        Parse a Taobao search-result page.

        The result list is rendered client-side, so the primary strategy is to
        extract the embedded ``g_page_config`` JSON blob; raw-HTML scraping is
        kept as a fallback.
        """
        products: List[Dict[str, Any]] = []

        # Non-greedy match stops at the first '};' — assumed not to occur
        # inside the JSON payload itself; TODO confirm against live pages.
        g_page_config_match = re.search(r'g_page_config\s*=\s*({.*?});', html, re.DOTALL)
        if g_page_config_match:
            try:
                g_page_config = json.loads(g_page_config_match.group(1))
                auctions = g_page_config.get('mods', {}).get('itemlist', {}).get('data', {}).get('auctions', [])
                for auction in auctions:
                    product = self._parse_auction_item(auction, source_url)
                    if product:
                        products.append(product)
            except json.JSONDecodeError as e:
                self.logger.error(f"解析 g_page_config 失败: {e}")

        # Fallback: scrape the HTML directly when the JSON route yields nothing.
        if not products:
            products = self._parse_html_directly(html, source_url)

        return products

    def _parse_auction_item(self, auction: Dict, source_url: str) -> Optional[Dict[str, Any]]:
        """
        Convert one ``auctions`` entry from ``g_page_config`` into the common
        product dict shape. Returns None when the entry lacks an id or title,
        or when parsing raises.
        """
        try:
            nid = auction.get('nid', '')
            if not nid:
                return None

            title = auction.get('raw_title', '') or auction.get('title', '')
            if not title:
                return None

            price = clean_price(auction.get('view_price', '0'))
            # NOTE(review): 'view_fee' looks like a shipping fee rather than an
            # original/list price — confirm the field semantics upstream.
            original_price = clean_price(auction.get('view_fee', '0')) or None

            # Taobao returns protocol-relative URLs ("//img...."); normalize.
            pic_url = auction.get('pic_url', '')
            if pic_url and not pic_url.startswith('http'):
                pic_url = 'https:' + pic_url

            detail_url = auction.get('detail_url', '')
            if detail_url and not detail_url.startswith('http'):
                detail_url = 'https:' + detail_url

            sales_volume = parse_sales_volume(auction.get('view_sales', ''))
            shop_name = auction.get('nick', '')

            # 'shopcard' may be absent or None; a single guarded lookup replaces
            # the previous redundant double .get().
            is_tmall = (auction.get('shopcard') or {}).get('isTmall', False)

            return {
                'product_id': str(nid),
                'name': title.strip(),
                'price': price,
                'original_price': original_price,
                'image_url': pic_url,
                'url': detail_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'is_tmall': is_tmall,
                'source_url': source_url,
                'is_wholesale': False,
                'currency': 'CNY'
            }
        except Exception as e:
            self.logger.error(f"解析商品数据失败: {e}")
            return None

    def _parse_html_directly(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """
        Fallback parser that scrapes product items straight out of the HTML
        with regular expressions. Best-effort only; items missing a title or
        id are skipped.
        """
        products: List[Dict[str, Any]] = []

        item_pattern = r'<div[^>]*class="[^"]*item[^"]*"[^>]*data-id="(\d+)"[^>]*>(.*?)</div>\s*</div>\s*</div>'
        items = re.findall(item_pattern, html, re.DOTALL | re.IGNORECASE)

        for item_id, item_html in items:
            try:
                title_match = re.search(r'<a[^>]*class="[^"]*J_ClickStat[^"]*"[^>]*>(.*?)</a>', item_html, re.DOTALL)
                title = title_match.group(1) if title_match else ''
                # Strip any nested tags (e.g. <span> highlights) from the title.
                title = re.sub(r'<[^>]+>', '', title).strip() if title else ''

                price_match = re.search(r'<strong[^>]*data-price="([\d.]+)"', item_html)
                price = clean_price(price_match.group(1)) if price_match else 0

                sales_match = re.search(r'<div[^>]*class="deal-cnt"[^>]*>([^<]+)</div>', item_html)
                sales_volume = parse_sales_volume(sales_match.group(1) if sales_match else '')

                shop_match = re.search(r'<a[^>]*class="shopname[^"]*"[^>]*>(.*?)</a>', item_html, re.DOTALL)
                shop_name = shop_match.group(1) if shop_match else ''
                shop_name = re.sub(r'<[^>]+>', '', shop_name).strip() if shop_name else ''

                img_match = re.search(r'<img[^>]*data-src="([^"]+)"', item_html)
                img_url = img_match.group(1) if img_match else ''
                if img_url and not img_url.startswith('http'):
                    img_url = 'https:' + img_url

                if title and item_id:
                    products.append({
                        'product_id': str(item_id),
                        'name': title,
                        'price': price,
                        'image_url': img_url,
                        'shop_name': shop_name,
                        'sales_volume': sales_volume,
                        'source_url': source_url,
                        'is_wholesale': False,
                        'currency': 'CNY'
                    })
            except Exception as e:
                self.logger.error(f"解析HTML商品失败: {e}")
                continue

        return products

    def get_product_detail(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """
        Fetch and parse a Taobao product detail page.

        :param product_id: product id (nid)
        :return: product detail dict, or None when the request fails
        """
        self.logger.info(f"获取淘宝商品详情: product_id={product_id}")

        detail_url = f"https://item.taobao.com/item.htm?id={product_id}"
        response = self._make_request('GET', detail_url)
        if not response:
            self.logger.warning(f"获取商品详情失败: {product_id}")
            return None

        return self._parse_product_detail(response.text, product_id, detail_url)

    def _parse_product_detail(self, html: str, product_id: str, source_url: str) -> Optional[Dict[str, Any]]:
        """
        Extract title, price, shop name and sales count from a detail page.
        Returns None when parsing raises.
        """
        try:
            title_match = re.search(r'<title>([^<]+)</title>', html)
            # Page titles are "<product name>-<suffix>"; keep only the name.
            title = title_match.group(1).split('-')[0].strip() if title_match else ''

            price_match = re.search(r'"price"\s*:\s*"([\d.]+)"', html) or \
                          re.search(r'"defaultItemPrice"\s*:\s*"([\d.]+)"', html)
            price = clean_price(price_match.group(1)) if price_match else 0

            shop_match = re.search(r'"nick"\s*:\s*"([^"]+)"', html) or \
                         re.search(r'shopName["\']\s*[:=]\s*["\']([^"\']+)["\']', html)
            shop_name = shop_match.group(1) if shop_match else ''

            sales_match = re.search(r'"sellCount"\s*:\s*(\d+)', html) or \
                          re.search(r'"totalSoldQuantity"\s*:\s*(\d+)', html)
            sales_volume = int(sales_match.group(1)) if sales_match else None

            return {
                'product_id': str(product_id),
                'name': title,
                'price': price,
                # The detail page exposes no list price; emit the key anyway so
                # get_price()'s detail.get('original_price') reads a real field.
                'original_price': None,
                'url': source_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'is_wholesale': False,
                'currency': 'CNY'
            }
        except Exception as e:
            self.logger.error(f"解析商品详情失败: {e}")
            return None

    def get_price(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """
        Fetch just the price information for a product.

        :param product_id: product id
        :return: price dict, or None when the detail fetch fails
        """
        detail = self.get_product_detail(product_id, **kwargs)
        if not detail:
            return None
        return {
            'product_id': str(product_id),
            'price': detail.get('price', 0),
            'original_price': detail.get('original_price'),
            'currency': 'CNY',
            'platform': self.platform,
            'source_url': detail.get('url', ''),
            'price_type': 'retail'
        }

    def set_cookie(self, cookie: str):
        """
        Set the Cookie used for subsequent requests.

        :param cookie: Cookie string
        """
        self.cookie = cookie
        self.headers['Cookie'] = cookie

    def set_proxy(self, proxy: str):
        """
        Set the HTTP/HTTPS proxy used for subsequent requests.

        :param proxy: proxy server address
        """
        self.proxy = proxy
        self.session.proxies = {
            'http': proxy,
            'https': proxy
        }
|