| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453 |
- import re
- import json
- import logging
- from typing import Dict, List, Optional, Any, Tuple
- from urllib.parse import urlencode, quote, urlparse, parse_qs
- from .base import BaseCrawler
- from utils.helpers import clean_price, parse_sales_volume, extract_numbers
class Alibaba1688Crawler(BaseCrawler):
    """Crawler for the 1688 wholesale marketplace.

    Concentrates on wholesale-specific data: tiered prices, minimum order
    quantity and sales unit.

    The class uses ``self.logger``, ``self.headers``, ``self.session``,
    ``self.config`` and ``self._make_request``; none of them is defined here,
    so they are expected to come from ``BaseCrawler`` (not visible in this
    file -- verify against the base class).
    """

    platform = 'alibaba1688'

    # Unit reported when the payload does not name one.
    _DEFAULT_UNIT = '件'

    def __init__(self, cookie: Optional[str] = None, proxy: Optional[str] = None):
        """Create the crawler and optionally install a cookie and a proxy.

        :param cookie: Cookie string of a logged-in 1688 session.
        :param proxy: Proxy server address used for both http and https.
        """
        super().__init__()
        self.cookie = cookie
        self.proxy = proxy

        # Delegate to the setters so construction and later updates share
        # exactly one code path.
        if cookie:
            self.set_cookie(cookie)
        if proxy:
            self.set_proxy(proxy)

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _with_scheme(url: Any) -> Any:
        """Prefix ``https:`` to a non-empty URL that does not start with ``http``.

        Falsy values are returned unchanged.
        """
        if url and not url.startswith('http'):
            return 'https:' + url
        return url

    @staticmethod
    def _as_dict(value: Any) -> Dict[str, Any]:
        """Return *value* if it is a dict, otherwise an empty dict.

        Guards ``.get`` chains against JSON ``null`` or scalar nodes.
        """
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _parse_price_ranges(raw_ranges: Any) -> List[Dict[str, Any]]:
        """Convert raw price-tier entries into normalized tier dicts.

        Each returned dict has ``min_quantity``, ``max_quantity`` and
        ``price``. Entries that are not dicts are skipped.
        """
        tiers = []
        for tier in raw_ranges or []:
            if not isinstance(tier, dict):
                continue
            tiers.append({
                'min_quantity': tier.get('quantity', 0) or tier.get('startQuantity', 0),
                'max_quantity': tier.get('endQuantity'),
                'price': clean_price(str(tier.get('price', '0'))),
            })
        return tiers

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    def search(self, keyword: str, page: int = 1, sort: str = 'default',
               price_range: Optional[Tuple[float, float]] = None,
               **kwargs) -> List[Dict[str, Any]]:
        """Search 1688 for products.

        :param keyword: Search keyword.
        :param page: Page number, starting at 1.
        :param sort: Sort mode, sent as ``sortType`` unless it is ``default``:
            - default (comprehensive)
            - va (sales volume)
            - price_asc (price ascending)
            - price_desc (price descending)
            - bookTime (newest first)
        :param price_range: ``(min_price, max_price)``; a falsy bound is omitted.
        :return: List of product dicts; empty when the request fails or the
            search URL is not configured.
        """
        self.logger.info(f"搜索1688商品: keyword={keyword}, page={page}")

        base_url = self.config.get('search_url')
        if not base_url:
            # Without this guard the request would go to the literal "None?...".
            self.logger.error("未配置 search_url,无法搜索1688商品")
            return []

        params = {
            'keywords': keyword,
            'pageSize': 40,
            'beginPage': page,
        }
        if sort and sort != 'default':
            params['sortType'] = sort
        if price_range:
            min_price, max_price = price_range
            if min_price:
                params['filtPriceMin'] = min_price
            if max_price:
                params['filtPriceMax'] = max_price

        search_url = f"{base_url}?{urlencode(params)}"
        self.logger.debug(f"搜索URL: {search_url}")

        response = self._make_request('GET', search_url)
        if not response:
            self.logger.warning(f"搜索请求失败: {keyword}")
            return []

        products = self._parse_search_result(response.text, search_url)
        self.logger.info(f"从搜索结果中解析到 {len(products)} 个商品")
        return products

    def _parse_search_result(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """Parse a search result page.

        Tries the embedded ``window.pageData`` JSON first and falls back to
        regex-based HTML parsing when that yields no products.
        """
        products = []

        json_match = re.search(
            r'window\.pageData\s*=\s*({.*?});\s*</script>', html, re.DOTALL)
        if json_match:
            try:
                page_data = json.loads(json_match.group(1))
            except json.JSONDecodeError as e:
                self.logger.error(f"解析pageData失败: {e}")
                page_data = None

            # A null/scalar payload or "data" node must not abort the search;
            # it simply produces no JSON items and the HTML fallback runs.
            page_data = self._as_dict(page_data)
            data = self._as_dict(page_data.get('data'))
            items = (data.get('searchList')
                     or data.get('offerList')
                     or page_data.get('offerList')
                     or [])
            if not isinstance(items, list):
                items = []

            for item in items:
                if not isinstance(item, dict):
                    continue
                product = self._parse_json_item(item, source_url)
                if product:
                    products.append(product)

        if not products:
            products = self._parse_html_directly(html, source_url)

        return products

    def _parse_json_item(self, item: Dict, source_url: str) -> Optional[Dict[str, Any]]:
        """Build a product dict from one JSON search item.

        :return: The product dict, or ``None`` when the item has no id, no
            title, or raises while being parsed.
        """
        try:
            offer_id = item.get('offerId', '') or item.get('id', '')
            if not offer_id:
                return None

            title = str(item.get('subject') or item.get('title') or '').strip()
            if not title:
                return None

            price_info = self._extract_price_info(item)

            image_url = self._with_scheme(
                item.get('imgUrl', '') or item.get('imageUrl', ''))
            detail_url = self._with_scheme(
                item.get('detailUrl', '')
                or f"https://detail.1688.com/offer/{offer_id}.html")

            shop_name = item.get('companyName', '') or item.get('shopName', '')

            # Fetch bookedCount without stringifying it first, so an explicit
            # null does not become the text 'None'.
            sales_raw = item.get('soldQuantity') or item.get('salesNum')
            if not sales_raw:
                sales_raw = item.get('bookedCount')
            if sales_raw in (None, ''):
                sales_volume = None
            else:
                sales_volume = parse_sales_volume(str(sales_raw))

            min_order = item.get('minOrder', '') or item.get('startQuantity', '')
            min_order_quantity = self._parse_quantity(min_order)

            # Empty or null unit fields fall through to the default.
            unit = (item.get('unit')
                    or item.get('quantityUnit')
                    or self._DEFAULT_UNIT)

            return {
                'product_id': str(offer_id),
                'name': title,
                'price': price_info.get('min_price', 0),
                'original_price': price_info.get('original_price'),
                'price_ranges': price_info.get('price_ranges', []),
                'image_url': image_url,
                'url': detail_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'min_order_quantity': min_order_quantity,
                'unit': unit,
                'source_url': source_url,
                'is_wholesale': True,
                'currency': 'CNY'
            }

        except Exception as e:
            # Best effort per item: one malformed entry must not sink the page.
            self.logger.error(f"解析JSON商品失败: {e}")
            return None

    def _extract_price_info(self, item: Dict) -> Dict[str, Any]:
        """Extract price information from a JSON search item.

        Tiered prices (``priceRanges`` / ``priceRangeList``) take precedence;
        otherwise the single ``price`` / ``displayPrice`` value is used.

        :return: Dict with ``min_price``, ``max_price``, ``original_price``
            and ``price_ranges``.
        """
        price_info = {
            'min_price': 0,
            'max_price': 0,
            'original_price': None,
            'price_ranges': []
        }

        tiers = self._parse_price_ranges(
            item.get('priceRanges') or item.get('priceRangeList'))

        if tiers:
            price_info['price_ranges'] = tiers
            prices = [t['price'] for t in tiers if t['price'] is not None]
            if prices:
                price_info['min_price'] = min(prices)
                price_info['max_price'] = max(prices)
        else:
            price_str = item.get('price', '') or item.get('displayPrice', '')
            if price_str:
                price = clean_price(str(price_str))
                price_info['min_price'] = price
                price_info['max_price'] = price

        original_price_str = item.get('originalPrice', '') or item.get('marketPrice', '')
        if original_price_str:
            price_info['original_price'] = clean_price(str(original_price_str))

        return price_info

    def _parse_quantity(self, quantity_str: Any) -> Optional[int]:
        """Return the first number found in *quantity_str* as an int.

        :return: The quantity, or ``None`` when the input is empty, contains
            no number, or the number cannot be converted.
        """
        if not quantity_str:
            return None

        numbers = extract_numbers(str(quantity_str).strip())
        if not numbers:
            return None

        first = numbers[0]
        try:
            return int(first)
        except (TypeError, ValueError):
            pass
        try:
            # Tolerate decimal tokens such as '12.0' instead of raising.
            return int(float(first))
        except (TypeError, ValueError):
            return None

    def _parse_html_directly(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """Parse products straight from the HTML (fallback path).

        Offers are located by their ``data-offerid`` attribute; an offer
        that fails to parse is logged and skipped.
        """
        products = []

        offer_pattern = r'data-offerid="(\d+)"[^>]*>(.*?)</div>\s*</div>\s*</div>'
        offers = re.findall(offer_pattern, html, re.DOTALL | re.IGNORECASE)

        for offer_id, offer_html in offers:
            try:
                product = self._parse_html_offer(offer_id, offer_html, source_url)
            except Exception as e:
                self.logger.error(f"解析HTML商品失败: {e}")
                continue
            if product:
                products.append(product)

        return products

    def _parse_html_offer(self, offer_id: str, offer_html: str,
                          source_url: str) -> Optional[Dict[str, Any]]:
        """Build a product dict from one offer's HTML fragment.

        :return: The product dict, or ``None`` when no title can be found.
        """
        title_match = re.search(r'<a[^>]*title="([^"]+)"', offer_html) or \
            re.search(r'<div[^>]*class="[^"]*title[^"]*"[^>]*>.*?<a[^>]*>(.*?)</a>', offer_html, re.DOTALL)
        title = ''
        if title_match:
            title = re.sub(r'<[^>]+>', '', title_match.group(1)).strip()
        if not title:
            return None

        price_match = re.search(r'<div[^>]*class="[^"]*price[^"]*"[^>]*>.*?<span[^>]*>([\d¥.,]+)</span>', offer_html, re.DOTALL | re.IGNORECASE) or \
            re.search(r'¥([\d.]+)', offer_html)
        price = clean_price(price_match.group(1)) if price_match else 0

        img_match = re.search(r'<img[^>]*data-src="([^"]+)"', offer_html) or \
            re.search(r'<img[^>]*src="([^"]+)"', offer_html)
        img_url = self._with_scheme(img_match.group(1) if img_match else '')

        shop_match = re.search(r'<div[^>]*class="[^"]*company[^"]*"[^>]*>.*?<a[^>]*>(.*?)</a>', offer_html, re.DOTALL | re.IGNORECASE)
        shop_name = shop_match.group(1) if shop_match else ''
        # Remove tags first, then trim, so whitespace around nested tags goes too.
        shop_name = re.sub(r'<[^>]+>', '', shop_name).strip()

        # The label may be followed by an ASCII or a full-width colon (U+FF1A).
        sales_match = re.search(r'<span[^>]*class="[^"]*sales[^"]*"[^>]*>([^<]+)</span>', offer_html, re.IGNORECASE) or \
            re.search(r'成交量[:\uff1a]\s*([\d万+]+)', offer_html)
        sales_str = sales_match.group(1) if sales_match else ''
        # Same contract as the JSON path: no sales text means None.
        sales_volume = parse_sales_volume(sales_str) if sales_str else None

        min_order_match = re.search(r'起订量?[:\uff1a]\s*([\d]+)', offer_html) or \
            re.search(r'<span[^>]*class="[^"]*moq[^"]*"[^>]*>([^<]+)</span>', offer_html, re.IGNORECASE)
        min_order_quantity = self._parse_quantity(min_order_match.group(1)) if min_order_match else None

        # original_price / price_ranges / unit are not extracted on this path;
        # they are filled with defaults so both search paths return the same keys.
        return {
            'product_id': str(offer_id),
            'name': title,
            'price': price,
            'original_price': None,
            'price_ranges': [],
            'image_url': img_url,
            'url': f"https://detail.1688.com/offer/{offer_id}.html",
            'shop_name': shop_name,
            'sales_volume': sales_volume,
            'min_order_quantity': min_order_quantity,
            'unit': self._DEFAULT_UNIT,
            'source_url': source_url,
            'is_wholesale': True,
            'currency': 'CNY'
        }

    # ------------------------------------------------------------------
    # Product detail
    # ------------------------------------------------------------------

    def get_product_detail(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """Fetch and parse the detail page of one product.

        :param product_id: Product id (offerId).
        :return: Product detail dict, or ``None`` when the request or the
            parsing fails.
        """
        self.logger.info(f"获取1688商品详情: product_id={product_id}")

        # Percent-encode the id so stray characters cannot alter the URL path.
        safe_id = quote(str(product_id), safe='')
        detail_url = f"https://detail.1688.com/offer/{safe_id}.html"

        response = self._make_request('GET', detail_url)
        if not response:
            self.logger.warning(f"获取商品详情失败: {product_id}")
            return None

        return self._parse_product_detail(response.text, product_id, detail_url)

    def _parse_product_detail(self, html: str, product_id: str, source_url: str) -> Optional[Dict[str, Any]]:
        """Parse a product detail page.

        Reads the ``<title>`` tag and the embedded ``window.iDetailData``
        JSON; when no positive price results, a regex over the raw HTML is
        tried as a last resort.

        :return: Product detail dict, or ``None`` on an unexpected error.
        """
        try:
            title_match = re.search(r'<title>([^<]+)</title>', html)
            # Drop only the last "-"-separated segment (the site suffix), so
            # hyphens inside the product name are kept.
            title = title_match.group(1).rsplit('-', 1)[0].strip() if title_match else ''

            price_info = {'min_price': 0, 'price_ranges': []}
            shop_name = ''
            min_order_quantity = None
            unit = self._DEFAULT_UNIT
            sales_volume = None

            json_match = re.search(
                r'window\.iDetailData\s*=\s*({.*?});\s*</script>', html, re.DOTALL)
            detail_data = None
            if json_match:
                try:
                    detail_data = json.loads(json_match.group(1))
                except json.JSONDecodeError as e:
                    self.logger.error(f"解析商品详情JSON失败: {e}")

            if isinstance(detail_data, dict):
                if not title:
                    title = detail_data.get('subject', '') or detail_data.get('title', '')

                price_module = detail_data.get('price')
                if isinstance(price_module, dict):
                    tiers = self._parse_price_ranges(price_module.get('priceRanges'))
                    if tiers:
                        price_info['price_ranges'] = tiers
                        prices = [t['price'] for t in tiers if t['price'] is not None]
                        if prices:
                            price_info['min_price'] = min(prices)
                    else:
                        price_str = price_module.get('showPrice', '') or price_module.get('price', '')
                        if price_str:
                            price_info['min_price'] = clean_price(str(price_str))
                elif price_module:
                    # A bare scalar under "price" is treated as the price itself.
                    price_info['min_price'] = clean_price(str(price_module))

                shop_name = detail_data.get('companyName', '') or detail_data.get('shopName', '')

                # null "trade" / "moq" nodes must not discard the whole result.
                sales_data = self._as_dict(detail_data.get('trade'))
                sales_volume = sales_data.get('soldQuantity') or sales_data.get('totalSoldQuantity')

                moq_data = self._as_dict(detail_data.get('moq'))
                min_order_quantity = moq_data.get('minOrderQuantity')
                unit = moq_data.get('unit') or self._DEFAULT_UNIT

            if (price_info['min_price'] or 0) <= 0:
                price_match = re.search(r'["\']price["\']\s*:\s*["\']?([\d.]+)["\']?', html) or \
                    re.search(r'¥([\d.]+)', html)
                if price_match:
                    price_info['min_price'] = clean_price(price_match.group(1))

            return {
                'product_id': str(product_id),
                'name': title,
                'price': price_info['min_price'],
                'price_ranges': price_info.get('price_ranges', []),
                'url': source_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'min_order_quantity': min_order_quantity,
                'unit': unit,
                'is_wholesale': True,
                'currency': 'CNY'
            }

        except Exception as e:
            self.logger.error(f"解析商品详情失败: {e}")
            return None

    # ------------------------------------------------------------------
    # Price accessors
    # ------------------------------------------------------------------

    def get_price(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """Return the price record of a product.

        :param product_id: Product id.
        :return: Price dict built from the product detail, or ``None`` when
            the detail cannot be obtained.
        """
        detail = self.get_product_detail(product_id, **kwargs)
        if not detail:
            return None

        return {
            'product_id': str(product_id),
            'price': detail.get('price', 0),
            'original_price': detail.get('original_price'),
            'currency': 'CNY',
            'platform': self.platform,
            'source_url': detail.get('url', ''),
            'price_type': 'wholesale',
            'price_ranges': detail.get('price_ranges', []),
            'min_quantity': detail.get('min_order_quantity')
        }

    def get_wholesale_prices(self, product_id: str, **kwargs) -> List[Dict[str, Any]]:
        """Return the wholesale price tiers of a product.

        :param product_id: Product id.
        :return: The parsed price tiers; when the detail has none, a single
            tier built from its price and minimum order quantity; an empty
            list when the detail cannot be obtained.
        """
        detail = self.get_product_detail(product_id, **kwargs)
        if not detail:
            return []

        price_ranges = detail.get('price_ranges', [])
        if price_ranges:
            return price_ranges

        return [{
            # The key is always present in the detail dict (possibly None),
            # so a .get() default would never apply; fall back explicitly.
            'min_quantity': detail.get('min_order_quantity') or 1,
            'max_quantity': None,
            'price': detail.get('price', 0)
        }]

    # ------------------------------------------------------------------
    # Runtime configuration
    # ------------------------------------------------------------------

    def set_cookie(self, cookie: Optional[str]):
        """Set the Cookie header; a falsy value removes the header."""
        self.cookie = cookie
        if cookie:
            self.headers['Cookie'] = cookie
        else:
            self.headers.pop('Cookie', None)

    def set_proxy(self, proxy: Optional[str]):
        """Set the proxy for http and https; a falsy value clears the proxies."""
        self.proxy = proxy
        if proxy:
            self.session.proxies = {
                'http': proxy,
                'https': proxy
            }
        else:
            self.session.proxies = {}
|