# lyq / crawl_price
import json
import logging
import re
from html import unescape
from typing import Dict, List, Optional, Any, Tuple
from urllib.parse import urlencode, quote, urlparse, parse_qs

from .base import BaseCrawler
from utils.helpers import clean_price, parse_sales_volume, extract_numbers


class Alibaba1688Crawler(BaseCrawler):
    """
    Crawler for the 1688 wholesale platform.

    Concentrates on wholesale-specific data: tiered prices, minimum order
    quantity and the sales unit.
    """

    platform = 'alibaba1688'

    # Unit reported when a page does not name one.
    _DEFAULT_UNIT = '件'

    def __init__(self, cookie: Optional[str] = None, proxy: Optional[str] = None):
        """
        Initialise the 1688 crawler.

        :param cookie: Cookie string of a logged-in 1688 session
        :param proxy: proxy server address
        """
        super().__init__()
        self.cookie = cookie
        self.proxy = proxy

        if cookie:
            self.headers['Cookie'] = cookie

        if proxy:
            self.session.proxies = {
                'http': proxy,
                'https': proxy,
            }

    def search(self, keyword: str, page: int = 1, sort: str = 'default',
               price_range: Optional[Tuple[float, float]] = None, **kwargs) -> List[Dict[str, Any]]:
        """
        Search 1688 for products.

        :param keyword: search keyword
        :param page: page number, starting at 1
        :param sort: sort order, sent as ``sortType`` unless it is 'default':
            - default (comprehensive)
            - va (sales volume)
            - price_asc (price ascending)
            - price_desc (price descending)
            - bookTime (newest listings)
        :param price_range: price range (min_price, max_price); a falsy bound
            is not sent as a filter
        :return: list of product dicts, empty when the request fails
        """
        self.logger.info(f"搜索1688商品: keyword={keyword}, page={page}")

        params = {
            'keywords': keyword,
            'pageSize': 40,
            'beginPage': page,
        }

        if sort and sort != 'default':
            params['sortType'] = sort

        if price_range:
            min_price, max_price = price_range
            if min_price:
                params['filtPriceMin'] = min_price
            if max_price:
                params['filtPriceMax'] = max_price

        search_url = f"{self.config.get('search_url')}?{urlencode(params)}"

        self.logger.debug(f"搜索URL: {search_url}")

        response = self._make_request('GET', search_url)

        if not response:
            self.logger.warning(f"搜索请求失败: {keyword}")
            return []

        products = self._parse_search_result(response.text, search_url)

        self.logger.info(f"从搜索结果中解析到 {len(products)} 个商品")

        return products

    def _parse_search_result(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """
        Parse a 1688 search result page.

        Reads the ``window.pageData`` JSON embedded in the page first and
        falls back to regex-based HTML parsing when that yields no products.

        :param html: raw page HTML
        :param source_url: URL the page was fetched from
        :return: list of product dicts
        """
        products = []

        json_pattern = r'window\.pageData\s*=\s*({.*?});\s*</script>'
        json_match = re.search(json_pattern, html, re.DOTALL)

        if json_match:
            try:
                page_data = json.loads(json_match.group(1))
            except json.JSONDecodeError as e:
                self.logger.error(f"解析pageData失败: {e}")
                page_data = None

            if isinstance(page_data, dict):
                # 'data' may be missing or JSON null; treat both as empty so
                # the HTML fallback below still gets its chance.
                data = page_data.get('data')
                if not isinstance(data, dict):
                    data = {}

                items = data.get('searchList') or \
                    data.get('offerList') or \
                    page_data.get('offerList')

                if isinstance(items, list):
                    for item in items:
                        product = self._parse_json_item(item, source_url)
                        if product:
                            products.append(product)

        if not products:
            products = self._parse_html_directly(html, source_url)

        return products

    @staticmethod
    def _absolute_url(url: Any) -> Any:
        """Prefix ``https:`` to a non-empty URL that does not start with 'http'."""
        if url and not url.startswith('http'):
            return 'https:' + url
        return url

    def _parse_json_item(self, item: Dict, source_url: str) -> Optional[Dict[str, Any]]:
        """
        Convert one offer entry of the embedded search JSON into a product dict.

        :param item: one element of the offer list
        :param source_url: URL of the search page the item came from
        :return: product dict, or None when the entry is not a dict, has no
            id/title, or cannot be parsed
        """
        if not isinstance(item, dict):
            return None

        try:
            offer_id = item.get('offerId') or item.get('id')
            if not offer_id:
                return None

            title = str(item.get('subject') or item.get('title') or '').strip()
            if not title:
                return None

            price_info = self._extract_price_info(item)

            image_url = self._absolute_url(item.get('imgUrl') or item.get('imageUrl') or '')

            detail_url = self._absolute_url(
                item.get('detailUrl') or f"https://detail.1688.com/offer/{offer_id}.html"
            )

            shop_name = item.get('companyName', '') or item.get('shopName', '')

            # bookedCount is the last resort; None/'' must not be stringified
            # (str(None) would hand the literal 'None' to the parser).
            sales_raw = item.get('soldQuantity') or item.get('salesNum')
            if not sales_raw:
                sales_raw = item.get('bookedCount')
            if sales_raw is None or sales_raw == '':
                sales_volume = None
            else:
                sales_volume = parse_sales_volume(str(sales_raw))

            min_order = item.get('minOrder') or item.get('startQuantity')
            min_order_quantity = self._parse_quantity(min_order)

            unit = item.get('unit') or item.get('quantityUnit') or self._DEFAULT_UNIT

            return {
                'product_id': str(offer_id),
                'name': title,
                'price': price_info.get('min_price', 0),
                'original_price': price_info.get('original_price'),
                'price_ranges': price_info.get('price_ranges', []),
                'image_url': image_url,
                'url': detail_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'min_order_quantity': min_order_quantity,
                'unit': unit,
                'source_url': source_url,
                'is_wholesale': True,
                'currency': 'CNY'
            }

        except Exception as e:
            # Best effort: one malformed item must not abort the whole page.
            self.logger.error(f"解析JSON商品失败: {e}")
            return None

    def _parse_price_ranges(self, raw_ranges: Any) -> List[Dict[str, Any]]:
        """
        Normalise a raw list of price tiers.

        :param raw_ranges: value of ``priceRanges`` / ``priceRangeList``
        :return: list of ``{'min_quantity', 'max_quantity', 'price'}`` dicts;
            entries that are not dicts are skipped
        """
        tiers = []
        if not isinstance(raw_ranges, list):
            return tiers

        for entry in raw_ranges:
            if not isinstance(entry, dict):
                continue
            tiers.append({
                'min_quantity': entry.get('quantity', 0) or entry.get('startQuantity', 0),
                'max_quantity': entry.get('endQuantity'),
                'price': clean_price(str(entry.get('price', '0')))
            })
        return tiers

    def _extract_price_info(self, item: Dict,
                            single_price_keys: Tuple[str, ...] = ('price', 'displayPrice')) -> Dict[str, Any]:
        """
        Extract price information from a product/price dict.

        1688 prices are usually tiered: the unit price depends on the
        ordered quantity. When no tiers are present, a single price is read
        from the first truthy key in ``single_price_keys``.

        :param item: dict holding the price fields
        :param single_price_keys: keys tried, in order, for a non-tiered price
        :return: dict with ``min_price``, ``max_price``, ``original_price``
            and ``price_ranges``
        """
        price_info = {
            'min_price': 0,
            'max_price': 0,
            'original_price': None,
            'price_ranges': []
        }

        tiers = self._parse_price_ranges(item.get('priceRanges') or item.get('priceRangeList'))

        if tiers:
            price_info['price_ranges'] = tiers
            # clean_price is a project helper; skip None defensively so that
            # min()/max() cannot fail on mixed values.
            prices = [tier['price'] for tier in tiers if tier['price'] is not None]
            if prices:
                price_info['min_price'] = min(prices)
                price_info['max_price'] = max(prices)
        else:
            raw_price = None
            for key in single_price_keys:
                raw_price = item.get(key)
                if raw_price:
                    break
            if raw_price:
                price = clean_price(str(raw_price))
                price_info['min_price'] = price
                price_info['max_price'] = price

        original_price_raw = item.get('originalPrice') or item.get('marketPrice')
        if original_price_raw:
            price_info['original_price'] = clean_price(str(original_price_raw))

        return price_info

    def _parse_quantity(self, quantity_str: Any) -> Optional[int]:
        """
        Parse a quantity value into an integer.

        :param quantity_str: raw quantity (any type; it is stringified)
        :return: the first number found, truncated to int, or None when the
            value is empty or holds no usable number
        """
        if not quantity_str:
            return None

        numbers = extract_numbers(str(quantity_str).strip())
        if not numbers:
            return None

        first = numbers[0]
        try:
            return int(first)
        except (TypeError, ValueError):
            pass
        # A decimal string such as '1.5' is rejected by int(); go via float.
        try:
            return int(float(first))
        except (TypeError, ValueError):
            return None

    def _parse_html_directly(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """
        Parse products straight from the HTML (fallback method).

        Emits the same keys as the JSON path so callers can treat both
        result shapes alike; fields the HTML does not expose get neutral
        values (``original_price`` None, ``price_ranges`` empty).

        :param html: raw page HTML
        :param source_url: URL the page was fetched from
        :return: list of product dicts
        """
        products = []

        offer_pattern = r'data-offerid="(\d+)"[^>]*>(.*?)</div>\s*</div>\s*</div>'
        offers = re.findall(offer_pattern, html, re.DOTALL | re.IGNORECASE)

        for offer_id, offer_html in offers:
            try:
                title_match = re.search(r'<a[^>]*title="([^"]+)"', offer_html) or \
                    re.search(r'<div[^>]*class="[^"]*title[^"]*"[^>]*>.*?<a[^>]*>(.*?)</a>', offer_html, re.DOTALL)
                title = ''
                if title_match:
                    # Drop nested tags, then decode entities such as &amp;.
                    title = re.sub(r'<[^>]+>', '', title_match.group(1))
                    title = unescape(title).strip()

                if not title:
                    continue

                price_match = re.search(r'<div[^>]*class="[^"]*price[^"]*"[^>]*>.*?<span[^>]*>([\d¥.,]+)</span>', offer_html, re.DOTALL | re.IGNORECASE) or \
                    re.search(r'¥([\d.]+)', offer_html)
                price = clean_price(price_match.group(1)) if price_match else 0

                img_match = re.search(r'<img[^>]*data-src="([^"]+)"', offer_html) or \
                    re.search(r'<img[^>]*src="([^"]+)"', offer_html)
                img_url = self._absolute_url(img_match.group(1) if img_match else '')

                shop_match = re.search(r'<div[^>]*class="[^"]*company[^"]*"[^>]*>.*?<a[^>]*>(.*?)</a>', offer_html, re.DOTALL | re.IGNORECASE)
                shop_name = ''
                if shop_match:
                    shop_name = re.sub(r'<[^>]+>', '', shop_match.group(1))
                    shop_name = unescape(shop_name).strip()

                sales_match = re.search(r'<span[^>]*class="[^"]*sales[^"]*"[^>]*>([^<]+)</span>', offer_html, re.IGNORECASE) or \
                    re.search(r'成交量[：:]\s*([\d万+]+)', offer_html)
                sales_str = sales_match.group(1) if sales_match else ''
                # None (not a parsed empty string) when nothing was found,
                # matching the JSON path.
                sales_volume = parse_sales_volume(sales_str) if sales_str else None

                min_order_match = re.search(r'起订量?[：:]\s*([\d]+)', offer_html) or \
                    re.search(r'<span[^>]*class="[^"]*moq[^"]*"[^>]*>([^<]+)</span>', offer_html, re.IGNORECASE)
                min_order_quantity = self._parse_quantity(min_order_match.group(1)) if min_order_match else None

                detail_url = f"https://detail.1688.com/offer/{offer_id}.html"

                products.append({
                    'product_id': str(offer_id),
                    'name': title,
                    'price': price,
                    'original_price': None,
                    'price_ranges': [],
                    'image_url': img_url,
                    'url': detail_url,
                    'shop_name': shop_name,
                    'sales_volume': sales_volume,
                    'min_order_quantity': min_order_quantity,
                    'unit': self._DEFAULT_UNIT,
                    'source_url': source_url,
                    'is_wholesale': True,
                    'currency': 'CNY'
                })

            except Exception as e:
                # Best effort: skip the offending offer, keep the rest.
                self.logger.error(f"解析HTML商品失败: {e}")
                continue

        return products

    def get_product_detail(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """
        Fetch the detail of a 1688 product.

        :param product_id: product id (offerId)
        :return: product detail dict, or None when the request or parsing fails
        """
        self.logger.info(f"获取1688商品详情: product_id={product_id}")

        detail_url = f"https://detail.1688.com/offer/{product_id}.html"

        response = self._make_request('GET', detail_url)

        if not response:
            self.logger.warning(f"获取商品详情失败: {product_id}")
            return None

        return self._parse_product_detail(response.text, product_id, detail_url)

    def _parse_product_detail(self, html: str, product_id: str, source_url: str) -> Optional[Dict[str, Any]]:
        """
        Parse a product detail page.

        Data comes from the ``<title>`` tag and the embedded
        ``window.iDetailData`` JSON; when no positive price is found there,
        a regex over the raw HTML is tried.

        :param html: raw page HTML
        :param product_id: product id (offerId)
        :param source_url: URL the page was fetched from
        :return: product detail dict, or None on an unexpected parsing error
        """
        try:
            title_match = re.search(r'<title>([^<]+)</title>', html)
            # NOTE(review): everything after the first '-' is dropped, which
            # also truncates product names that contain a hyphen - confirm
            # the page title format before relying on this.
            title = unescape(title_match.group(1).split('-')[0]).strip() if title_match else ''

            json_pattern = r'window\.iDetailData\s*=\s*({.*?});\s*</script>'
            json_match = re.search(json_pattern, html, re.DOTALL)

            price_info = {'min_price': 0, 'original_price': None, 'price_ranges': []}
            shop_name = ''
            min_order_quantity = None
            unit = self._DEFAULT_UNIT
            sales_volume = None

            detail_data = None
            if json_match:
                try:
                    detail_data = json.loads(json_match.group(1))
                except json.JSONDecodeError as e:
                    self.logger.error(f"解析商品详情JSON失败: {e}")

            if isinstance(detail_data, dict):
                if not title:
                    title = detail_data.get('subject') or detail_data.get('title') or ''

                # Sub-modules may be missing, null or of an unexpected type;
                # only dicts are read so one bad field cannot void the page.
                price_module = detail_data.get('price')
                if isinstance(price_module, dict) and price_module:
                    price_info = self._extract_price_info(
                        price_module, single_price_keys=('showPrice', 'price')
                    )

                shop_name = detail_data.get('companyName') or detail_data.get('shopName') or ''

                sales_data = detail_data.get('trade')
                if isinstance(sales_data, dict):
                    sold = sales_data.get('soldQuantity') or sales_data.get('totalSoldQuantity')
                    # Parsed the same way as in search results.
                    sales_volume = parse_sales_volume(str(sold)) if sold else None

                moq_data = detail_data.get('moq')
                if isinstance(moq_data, dict):
                    min_order_quantity = self._parse_quantity(moq_data.get('minOrderQuantity'))
                    unit = moq_data.get('unit') or self._DEFAULT_UNIT

            min_price = price_info.get('min_price')
            if min_price is None or min_price <= 0:
                price_match = re.search(r'["\']price["\']\s*:\s*["\']?([\d.]+)["\']?', html) or \
                    re.search(r'¥([\d.]+)', html)
                if price_match:
                    price_info['min_price'] = clean_price(price_match.group(1))
                elif min_price is None:
                    price_info['min_price'] = 0

            return {
                'product_id': str(product_id),
                'name': title,
                'price': price_info['min_price'],
                'original_price': price_info.get('original_price'),
                'price_ranges': price_info.get('price_ranges', []),
                'url': source_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'min_order_quantity': min_order_quantity,
                'unit': unit,
                'is_wholesale': True,
                'currency': 'CNY'
            }

        except Exception as e:
            self.logger.error(f"解析商品详情失败: {e}")
            return None

    def get_price(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """
        Get the price of a product.

        :param product_id: product id
        :return: price information dict, or None when the detail is unavailable
        """
        detail = self.get_product_detail(product_id, **kwargs)
        if not detail:
            return None

        return {
            'product_id': str(product_id),
            'price': detail.get('price', 0),
            'original_price': detail.get('original_price'),
            'currency': 'CNY',
            'platform': self.platform,
            'source_url': detail.get('url', ''),
            'price_type': 'wholesale',
            'price_ranges': detail.get('price_ranges', []),
            'min_quantity': detail.get('min_order_quantity')
        }

    def get_wholesale_prices(self, product_id: str, **kwargs) -> List[Dict[str, Any]]:
        """
        Get the wholesale price tiers of a product.

        :param product_id: product id
        :return: list of price tiers; a single open-ended tier built from the
            product price when the page lists none; empty list when the
            detail is unavailable
        """
        detail = self.get_product_detail(product_id, **kwargs)
        if not detail:
            return []

        price_ranges = detail.get('price_ranges', [])
        if price_ranges:
            return price_ranges

        # The key is always present in a detail dict (possibly None), so a
        # .get() default would never apply; fall back to 1 explicitly.
        min_qty = detail.get('min_order_quantity') or 1
        return [{
            'min_quantity': min_qty,
            'max_quantity': None,
            'price': detail.get('price', 0)
        }]

    def set_cookie(self, cookie: str):
        """
        Set the Cookie header; a falsy value removes it.

        :param cookie: Cookie string
        """
        self.cookie = cookie
        if cookie:
            self.headers['Cookie'] = cookie
        else:
            # Never leave a None/empty header value behind.
            self.headers.pop('Cookie', None)

    def set_proxy(self, proxy: str):
        """
        Set the proxy for both http and https; a falsy value clears it.

        :param proxy: proxy server address
        """
        self.proxy = proxy
        if proxy:
            self.session.proxies = {
                'http': proxy,
                'https': proxy
            }
        else:
            self.session.proxies = {}