"""Crawler for the 1688.com wholesale platform."""
import re
import json
import logging
from typing import Dict, List, Optional, Any, Tuple
from urllib.parse import urlencode, quote, urlparse, parse_qs

from .base import BaseCrawler
from utils.helpers import clean_price, parse_sales_volume, extract_numbers
  8. class Alibaba1688Crawler(BaseCrawler):
  9. """
  10. 1688批发平台爬虫
  11. 重点关注批发价格、起订量等批发相关信息
  12. """
  13. platform = 'alibaba1688'
  14. def __init__(self, cookie: str = None, proxy: str = None):
  15. """
  16. 初始化1688爬虫
  17. :param cookie: 1688登录后的 Cookie 字符串
  18. :param proxy: 代理服务器地址
  19. """
  20. super().__init__()
  21. self.cookie = cookie
  22. self.proxy = proxy
  23. if cookie:
  24. self.headers['Cookie'] = cookie
  25. if proxy:
  26. self.session.proxies = {
  27. 'http': proxy,
  28. 'https': proxy
  29. }
  30. def search(self, keyword: str, page: int = 1, sort: str = 'default',
  31. price_range: Tuple[float, float] = None, **kwargs) -> List[Dict[str, Any]]:
  32. """
  33. 搜索1688商品
  34. :param keyword: 搜索关键词
  35. :param page: 页码,从1开始
  36. :param sort: 排序方式:
  37. - default (综合)
  38. - va (销量)
  39. - price_asc (价格升序)
  40. - price_desc (价格降序)
  41. - bookTime (最新发布)
  42. :param price_range: 价格范围 (min_price, max_price)
  43. :return: 商品列表
  44. """
  45. self.logger.info(f"搜索1688商品: keyword={keyword}, page={page}")
  46. params = {
  47. 'keywords': keyword,
  48. 'pageSize': 40,
  49. 'beginPage': page,
  50. }
  51. if sort and sort != 'default':
  52. params['sortType'] = sort
  53. if price_range:
  54. min_price, max_price = price_range
  55. if min_price:
  56. params['filtPriceMin'] = min_price
  57. if max_price:
  58. params['filtPriceMax'] = max_price
  59. search_url = f"{self.config.get('search_url')}?{urlencode(params)}"
  60. self.logger.debug(f"搜索URL: {search_url}")
  61. response = self._make_request('GET', search_url)
  62. if not response:
  63. self.logger.warning(f"搜索请求失败: {keyword}")
  64. return []
  65. products = self._parse_search_result(response.text, search_url)
  66. self.logger.info(f"从搜索结果中解析到 {len(products)} 个商品")
  67. return products
  68. def _parse_search_result(self, html: str, source_url: str) -> List[Dict[str, Any]]:
  69. """
  70. 解析1688搜索结果页面
  71. """
  72. products = []
  73. json_pattern = r'window\.pageData\s*=\s*({.*?});\s*</script>'
  74. json_match = re.search(json_pattern, html, re.DOTALL)
  75. if json_match:
  76. try:
  77. page_data = json.loads(json_match.group(1))
  78. items = page_data.get('data', {}).get('searchList', []) or \
  79. page_data.get('data', {}).get('offerList', []) or \
  80. page_data.get('offerList', [])
  81. for item in items:
  82. product = self._parse_json_item(item, source_url)
  83. if product:
  84. products.append(product)
  85. except json.JSONDecodeError as e:
  86. self.logger.error(f"解析pageData失败: {e}")
  87. if not products:
  88. products = self._parse_html_directly(html, source_url)
  89. return products
  90. def _parse_json_item(self, item: Dict, source_url: str) -> Optional[Dict[str, Any]]:
  91. """
  92. 解析JSON格式的商品数据
  93. """
  94. try:
  95. offer_id = item.get('offerId', '') or item.get('id', '')
  96. if not offer_id:
  97. return None
  98. title = item.get('subject', '') or item.get('title', '')
  99. title = title.strip()
  100. if not title:
  101. return None
  102. price_info = self._extract_price_info(item)
  103. image_url = item.get('imgUrl', '') or item.get('imageUrl', '')
  104. if image_url and not image_url.startswith('http'):
  105. image_url = 'https:' + image_url
  106. detail_url = item.get('detailUrl', '') or f"https://detail.1688.com/offer/{offer_id}.html"
  107. if detail_url and not detail_url.startswith('http'):
  108. detail_url = 'https:' + detail_url
  109. shop_name = item.get('companyName', '') or item.get('shopName', '')
  110. sales_str = item.get('soldQuantity', '') or item.get('salesNum', '') or str(item.get('bookedCount', ''))
  111. sales_volume = parse_sales_volume(str(sales_str)) if sales_str else None
  112. min_order = item.get('minOrder', '') or item.get('startQuantity', '')
  113. min_order_quantity = self._parse_quantity(min_order)
  114. unit = item.get('unit', '') or item.get('quantityUnit', '件')
  115. is_wholesale = True
  116. return {
  117. 'product_id': str(offer_id),
  118. 'name': title,
  119. 'price': price_info.get('min_price', 0),
  120. 'original_price': price_info.get('original_price'),
  121. 'price_ranges': price_info.get('price_ranges', []),
  122. 'image_url': image_url,
  123. 'url': detail_url,
  124. 'shop_name': shop_name,
  125. 'sales_volume': sales_volume,
  126. 'min_order_quantity': min_order_quantity,
  127. 'unit': unit,
  128. 'source_url': source_url,
  129. 'is_wholesale': is_wholesale,
  130. 'currency': 'CNY'
  131. }
  132. except Exception as e:
  133. self.logger.error(f"解析JSON商品失败: {e}")
  134. return None
  135. def _extract_price_info(self, item: Dict) -> Dict[str, Any]:
  136. """
  137. 从商品数据中提取价格信息
  138. 1688的价格通常是区间价格,根据起订量不同价格不同
  139. """
  140. price_info = {
  141. 'min_price': 0,
  142. 'max_price': 0,
  143. 'original_price': None,
  144. 'price_ranges': []
  145. }
  146. price_ranges = item.get('priceRanges', []) or item.get('priceRangeList', [])
  147. if price_ranges:
  148. prices = []
  149. for pr in price_ranges:
  150. price = clean_price(str(pr.get('price', '0')))
  151. min_quantity = pr.get('quantity', 0) or pr.get('startQuantity', 0)
  152. max_quantity = pr.get('endQuantity')
  153. prices.append(price)
  154. price_info['price_ranges'].append({
  155. 'min_quantity': min_quantity,
  156. 'max_quantity': max_quantity,
  157. 'price': price
  158. })
  159. if prices:
  160. price_info['min_price'] = min(prices)
  161. price_info['max_price'] = max(prices)
  162. else:
  163. price_str = item.get('price', '') or item.get('displayPrice', '')
  164. if price_str:
  165. price = clean_price(str(price_str))
  166. price_info['min_price'] = price
  167. price_info['max_price'] = price
  168. original_price_str = item.get('originalPrice', '') or item.get('marketPrice', '')
  169. if original_price_str:
  170. price_info['original_price'] = clean_price(str(original_price_str))
  171. return price_info
  172. def _parse_quantity(self, quantity_str: Any) -> Optional[int]:
  173. """
  174. 解析数量字符串
  175. """
  176. if not quantity_str:
  177. return None
  178. quantity_str = str(quantity_str).strip()
  179. numbers = extract_numbers(quantity_str)
  180. if numbers:
  181. return int(numbers[0])
  182. return None
  183. def _parse_html_directly(self, html: str, source_url: str) -> List[Dict[str, Any]]:
  184. """
  185. 直接从HTML解析商品(备用方法)
  186. """
  187. products = []
  188. offer_pattern = r'data-offerid="(\d+)"[^>]*>(.*?)</div>\s*</div>\s*</div>'
  189. offers = re.findall(offer_pattern, html, re.DOTALL | re.IGNORECASE)
  190. for offer_id, offer_html in offers:
  191. try:
  192. title_match = re.search(r'<a[^>]*title="([^"]+)"', offer_html) or \
  193. re.search(r'<div[^>]*class="[^"]*title[^"]*"[^>]*>.*?<a[^>]*>(.*?)</a>', offer_html, re.DOTALL)
  194. title = ''
  195. if title_match:
  196. title = title_match.group(1)
  197. title = re.sub(r'<[^>]+>', '', title).strip()
  198. if not title:
  199. continue
  200. price_match = re.search(r'<div[^>]*class="[^"]*price[^"]*"[^>]*>.*?<span[^>]*>([\d¥.,]+)</span>', offer_html, re.DOTALL | re.IGNORECASE) or \
  201. re.search(r'¥([\d.]+)', offer_html)
  202. price = clean_price(price_match.group(1)) if price_match else 0
  203. img_match = re.search(r'<img[^>]*data-src="([^"]+)"', offer_html) or \
  204. re.search(r'<img[^>]*src="([^"]+)"', offer_html)
  205. img_url = img_match.group(1) if img_match else ''
  206. if img_url and not img_url.startswith('http'):
  207. img_url = 'https:' + img_url
  208. shop_match = re.search(r'<div[^>]*class="[^"]*company[^"]*"[^>]*>.*?<a[^>]*>(.*?)</a>', offer_html, re.DOTALL | re.IGNORECASE)
  209. shop_name = shop_match.group(1).strip() if shop_match else ''
  210. shop_name = re.sub(r'<[^>]+>', '', shop_name)
  211. sales_match = re.search(r'<span[^>]*class="[^"]*sales[^"]*"[^>]*>([^<]+)</span>', offer_html, re.IGNORECASE) or \
  212. re.search(r'成交量[::]\s*([\d万+]+)', offer_html)
  213. sales_str = sales_match.group(1) if sales_match else ''
  214. sales_volume = parse_sales_volume(sales_str)
  215. min_order_match = re.search(r'起订量?[::]\s*([\d]+)', offer_html) or \
  216. re.search(r'<span[^>]*class="[^"]*moq[^"]*"[^>]*>([^<]+)</span>', offer_html, re.IGNORECASE)
  217. min_order_quantity = self._parse_quantity(min_order_match.group(1)) if min_order_match else None
  218. detail_url = f"https://detail.1688.com/offer/{offer_id}.html"
  219. products.append({
  220. 'product_id': str(offer_id),
  221. 'name': title,
  222. 'price': price,
  223. 'image_url': img_url,
  224. 'url': detail_url,
  225. 'shop_name': shop_name,
  226. 'sales_volume': sales_volume,
  227. 'min_order_quantity': min_order_quantity,
  228. 'source_url': source_url,
  229. 'is_wholesale': True,
  230. 'currency': 'CNY'
  231. })
  232. except Exception as e:
  233. self.logger.error(f"解析HTML商品失败: {e}")
  234. continue
  235. return products
  236. def get_product_detail(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
  237. """
  238. 获取1688商品详情
  239. :param product_id: 商品ID (offerId)
  240. :return: 商品详情
  241. """
  242. self.logger.info(f"获取1688商品详情: product_id={product_id}")
  243. detail_url = f"https://detail.1688.com/offer/{product_id}.html"
  244. response = self._make_request('GET', detail_url)
  245. if not response:
  246. self.logger.warning(f"获取商品详情失败: {product_id}")
  247. return None
  248. return self._parse_product_detail(response.text, product_id, detail_url)
  249. def _parse_product_detail(self, html: str, product_id: str, source_url: str) -> Optional[Dict[str, Any]]:
  250. """
  251. 解析商品详情页面
  252. """
  253. try:
  254. title_match = re.search(r'<title>([^<]+)</title>', html)
  255. title = title_match.group(1).split('-')[0].strip() if title_match else ''
  256. json_pattern = r'window\.iDetailData\s*=\s*({.*?});\s*</script>'
  257. json_match = re.search(json_pattern, html, re.DOTALL)
  258. price_info = {'min_price': 0, 'price_ranges': []}
  259. shop_name = ''
  260. min_order_quantity = None
  261. unit = '件'
  262. sales_volume = None
  263. if json_match:
  264. try:
  265. detail_data = json.loads(json_match.group(1))
  266. if not title:
  267. title = detail_data.get('subject', '') or detail_data.get('title', '')
  268. price_module = detail_data.get('price', {})
  269. if price_module:
  270. price_ranges = price_module.get('priceRanges', [])
  271. if price_ranges:
  272. prices = []
  273. for pr in price_ranges:
  274. price = clean_price(str(pr.get('price', '0')))
  275. min_qty = pr.get('quantity', 0)
  276. max_qty = pr.get('endQuantity')
  277. prices.append(price)
  278. price_info['price_ranges'].append({
  279. 'min_quantity': min_qty,
  280. 'max_quantity': max_qty,
  281. 'price': price
  282. })
  283. if prices:
  284. price_info['min_price'] = min(prices)
  285. else:
  286. price_str = price_module.get('showPrice', '') or price_module.get('price', '')
  287. if price_str:
  288. price_info['min_price'] = clean_price(str(price_str))
  289. shop_name = detail_data.get('companyName', '') or detail_data.get('shopName', '')
  290. sales_data = detail_data.get('trade', {})
  291. sales_volume = sales_data.get('soldQuantity') or sales_data.get('totalSoldQuantity')
  292. moq_data = detail_data.get('moq', {})
  293. min_order_quantity = moq_data.get('minOrderQuantity')
  294. unit = moq_data.get('unit', '件')
  295. except json.JSONDecodeError as e:
  296. self.logger.error(f"解析商品详情JSON失败: {e}")
  297. if price_info['min_price'] <= 0:
  298. price_match = re.search(r'["\']price["\']\s*:\s*["\']?([\d.]+)["\']?', html) or \
  299. re.search(r'¥([\d.]+)', html)
  300. if price_match:
  301. price_info['min_price'] = clean_price(price_match.group(1))
  302. return {
  303. 'product_id': str(product_id),
  304. 'name': title,
  305. 'price': price_info['min_price'],
  306. 'price_ranges': price_info.get('price_ranges', []),
  307. 'url': source_url,
  308. 'shop_name': shop_name,
  309. 'sales_volume': sales_volume,
  310. 'min_order_quantity': min_order_quantity,
  311. 'unit': unit,
  312. 'is_wholesale': True,
  313. 'currency': 'CNY'
  314. }
  315. except Exception as e:
  316. self.logger.error(f"解析商品详情失败: {e}")
  317. return None
  318. def get_price(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
  319. """
  320. 获取商品价格
  321. :param product_id: 商品ID
  322. :return: 价格信息
  323. """
  324. detail = self.get_product_detail(product_id, **kwargs)
  325. if detail:
  326. return {
  327. 'product_id': str(product_id),
  328. 'price': detail.get('price', 0),
  329. 'original_price': detail.get('original_price'),
  330. 'currency': 'CNY',
  331. 'platform': self.platform,
  332. 'source_url': detail.get('url', ''),
  333. 'price_type': 'wholesale',
  334. 'price_ranges': detail.get('price_ranges', []),
  335. 'min_quantity': detail.get('min_order_quantity')
  336. }
  337. return None
  338. def get_wholesale_prices(self, product_id: str, **kwargs) -> List[Dict[str, Any]]:
  339. """
  340. 获取批发价格区间
  341. :param product_id: 商品ID
  342. :return: 价格区间列表
  343. """
  344. detail = self.get_product_detail(product_id, **kwargs)
  345. if detail:
  346. price_ranges = detail.get('price_ranges', [])
  347. if price_ranges:
  348. return price_ranges
  349. price = detail.get('price', 0)
  350. min_qty = detail.get('min_order_quantity', 1)
  351. return [{
  352. 'min_quantity': min_qty,
  353. 'max_quantity': None,
  354. 'price': price
  355. }]
  356. return []
  357. def set_cookie(self, cookie: str):
  358. """
  359. 设置 Cookie
  360. """
  361. self.cookie = cookie
  362. self.headers['Cookie'] = cookie
  363. def set_proxy(self, proxy: str):
  364. """
  365. 设置代理
  366. """
  367. self.proxy = proxy
  368. self.session.proxies = {
  369. 'http': proxy,
  370. 'https': proxy
  371. }