# jd.py
import re
import json
import random
import time
from typing import Dict, List, Optional, Any
from urllib.parse import urlencode

from .base import BaseCrawler
from utils.helpers import clean_price, parse_sales_volume

class JdCrawler(BaseCrawler):
    """
    Crawler for the JD.com (京东) platform.

    Note: JD has strong anti-bot measures; a logged-in Cookie and/or a
    proxy may be required for reliable results.
    """

    platform = 'jd'

    def __init__(self, cookie: Optional[str] = None, proxy: Optional[str] = None):
        """
        Initialize the JD crawler.

        :param cookie: Cookie string from a logged-in JD session
        :param proxy: proxy server address
        """
        super().__init__()
        self.cookie = cookie
        self.proxy = proxy
        if cookie:
            self.headers['Cookie'] = cookie
        if proxy:
            self.session.proxies = {
                'http': proxy,
                'https': proxy
            }

    def search(self, keyword: str, page: int = 1, sort: str = 'sort_totalsales15_desc', **kwargs) -> List[Dict[str, Any]]:
        """
        Search JD for products.

        :param keyword: search keyword
        :param page: page number, starting from 1
        :param sort: sort order:
            - sort_totalsales15_desc (sales volume)
            - sort_price_asc (price ascending)
            - sort_price_desc (price descending)
            - sort_discount_desc (discount)
        :return: list of products
        """
        self.logger.info(f"Searching JD: keyword={keyword}, page={page}")
        params = {
            'keyword': keyword,
            'wq': keyword,
            'pvid': self._generate_pvid(),
            'page': page,
            # JD paginates by item offset, 30 items per page.
            's': (page - 1) * 30 + 1,
        }
        if sort and sort != 'default':
            params['psort'] = sort
        search_url = f"{self.config.get('search_url')}?{urlencode(params)}"
        self.logger.debug(f"Search URL: {search_url}")
        response = self._make_request('GET', search_url)
        if not response:
            self.logger.warning(f"Search request failed: {keyword}")
            return []
        products = self._parse_search_result(response.text, search_url)
        if products:
            # Search pages often render prices lazily; backfill them via the price API.
            self._fill_product_prices(products)
        self.logger.info(f"Parsed {len(products)} products from search results")
        return products
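
    # For orientation, search("手机", page=2) produces a URL of roughly this
    # shape, assuming config['search_url'] points at JD's search endpoint
    # (the keyword is percent-encoded by urlencode; pvid varies per call):
    #
    #   https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA
    #       &pvid=1731000000000123456&page=2&s=31&psort=sort_totalsales15_desc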

    def _generate_pvid(self) -> str:
        """
        Generate a JD pvid (page-view id): a millisecond timestamp followed
        by a six-digit random suffix.
        """
        timestamp = int(time.time() * 1000)
        random_num = random.randint(100000, 999999)
        return f"{timestamp}{random_num}"
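
    # e.g. _generate_pvid() -> '1731000000000123456'
    # (13-digit millisecond timestamp + 6 random digits)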

    def _parse_search_result(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """
        Parse a JD search-result page.
        """
        products = []
        # Each result is an <li> carrying the SKU id in a data-sku attribute.
        item_pattern = r'<li[^>]*data-sku="(\d+)"[^>]*>(.*?)</li>'
        items = re.findall(item_pattern, html, re.DOTALL | re.IGNORECASE)
        self.logger.debug(f"Found {len(items)} product items")
        for sku_id, item_html in items:
            try:
                product = self._parse_item_html(sku_id, item_html, source_url)
                if product:
                    products.append(product)
            except Exception as e:
                self.logger.error(f"Failed to parse product: {e}")
                continue
        if not products:
            # Fall back to the JSON payload embedded in the page.
            products = self._parse_from_json(html, source_url)
        return products

    def _parse_item_html(self, sku_id: str, item_html: str, source_url: str) -> Optional[Dict[str, Any]]:
        """
        Parse a single product's HTML fragment.
        """
        try:
            title_match = re.search(r'<div[^>]*class="p-name[^"]*"[^>]*>.*?<a[^>]*>(.*?)</a>', item_html, re.DOTALL | re.IGNORECASE)
            title = ''
            if title_match:
                title = title_match.group(1)
                title = re.sub(r'<[^>]+>', '', title).strip()
                title = re.sub(r'\s+', ' ', title)
            if not title:
                return None
            price_match = re.search(r'<div[^>]*class="p-price[^"]*"[^>]*>.*?<i[^>]*>([\d.]*)</i>', item_html, re.DOTALL | re.IGNORECASE)
            price = clean_price(price_match.group(1)) if price_match else 0
            shop_match = re.search(r'<div[^>]*class="p-shop[^"]*"[^>]*>.*?<a[^>]*>(.*?)</a>', item_html, re.DOTALL | re.IGNORECASE)
            shop_name = ''
            if shop_match:
                shop_name = shop_match.group(1)
                shop_name = re.sub(r'<[^>]+>', '', shop_name).strip()
            sales_match = re.search(r'<div[^>]*class="p-commit[^"]*"[^>]*>.*?<a[^>]*>([^<]+)</a>', item_html, re.DOTALL | re.IGNORECASE)
            sales_str = sales_match.group(1) if sales_match else ''
            sales_volume = parse_sales_volume(sales_str)
            # Lazy-loaded images keep the real URL in data-lazy-img; fall back to src.
            img_match = re.search(r'<div[^>]*class="p-img[^"]*"[^>]*>.*?<img[^>]*data-lazy-img="([^"]+)"', item_html, re.DOTALL | re.IGNORECASE) or \
                        re.search(r'<div[^>]*class="p-img[^"]*"[^>]*>.*?<img[^>]*src="([^"]+)"', item_html, re.DOTALL | re.IGNORECASE)
            img_url = img_match.group(1) if img_match else ''
            if img_url and not img_url.startswith('http'):
                img_url = 'https:' + img_url
            detail_url = f"https://item.jd.com/{sku_id}.html"
            # Parenthesized for clarity: self-operated if the shop name says so,
            # or if the icon block flags 自营.
            is_self = ('京东自营' in shop_name) or ('p-icons' in item_html and '自营' in item_html)
            return {
                'product_id': str(sku_id),
                'name': title,
                'price': price,
                'image_url': img_url,
                'url': detail_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'is_jd_self': is_self,
                'source_url': source_url,
                'is_wholesale': False,
                'currency': 'CNY'
            }
        except Exception as e:
            self.logger.error(f"Failed to parse product HTML: {e}")
            return None
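
    # For reference, a minimal sketch of the search-result markup the regexes
    # above target. Class names and attributes come from the patterns in this
    # file; the values are illustrative, not captured from a live JD page:
    #
    #   <li data-sku="100012043978" class="gl-item">
    #     <div class="p-img"><a><img data-lazy-img="//img.360buyimg.com/example.jpg"></a></div>
    #     <div class="p-price"><strong><em>¥</em><i>5999.00</i></strong></div>
    #     <div class="p-name"><a><em>Example product title</em></a></div>
    #     <div class="p-commit"><strong><a>20万+</a></strong></div>
    #     <div class="p-shop"><span><a>Example flagship store</a></span></div>
    #   </li>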

    def _parse_from_json(self, html: str, source_url: str) -> List[Dict[str, Any]]:
        """
        Parse products from the JSON payload embedded in the page.
        """
        products = []
        json_pattern = r'window\.__SEARCH_RESULT__\s*=\s*({.*?});'
        json_match = re.search(json_pattern, html, re.DOTALL)
        if json_match:
            try:
                data = json.loads(json_match.group(1))
                items = data.get('wareList', {}).get('wareInfo', [])
                for item in items:
                    product = self._parse_json_item(item, source_url)
                    if product:
                        products.append(product)
            except json.JSONDecodeError as e:
                self.logger.error(f"Failed to parse JSON: {e}")
        return products
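
    # A rough sketch of the embedded payload this expects, assumed from the
    # keys read in _parse_json_item below (real responses carry many more fields):
    #
    #   window.__SEARCH_RESULT__ = {
    #     "wareList": {
    #       "wareInfo": [
    #         {"wareId": "100012043978", "wname": "Example product",
    #          "price": "5999.00", "oprice": "6299.00",
    #          "imgurl": "//img.360buyimg.com/example.jpg",
    #          "goodShop": {"shopName": "Example store"}, "reviews": "20万+"}
    #       ]
    #     }
    #   };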

    def _parse_json_item(self, item: Dict, source_url: str) -> Optional[Dict[str, Any]]:
        """
        Parse a product record from the embedded JSON data.
        """
        try:
            # 'wareId' is the SKU id; 'wname' is the title (read below).
            sku_id = item.get('wareId', '')
            if not sku_id:
                return None
            title = item.get('wname', '').strip()
            if not title:
                return None
            price = clean_price(str(item.get('price', '0')))
            original_price = clean_price(str(item.get('oprice', '0'))) or None
            img_url = item.get('imgurl', '')
            if img_url and not img_url.startswith('http'):
                img_url = 'https:' + img_url
            shop_name = item.get('goodShop', {}).get('shopName', '') if item.get('goodShop') else ''
            sales_str = item.get('reviews', '')
            sales_volume = parse_sales_volume(sales_str)
            detail_url = f"https://item.jd.com/{sku_id}.html"
            return {
                'product_id': str(sku_id),
                'name': title,
                'price': price,
                'original_price': original_price,
                'image_url': img_url,
                'url': detail_url,
                'shop_name': shop_name,
                'sales_volume': sales_volume,
                'source_url': source_url,
                'is_wholesale': False,
                'currency': 'CNY'
            }
        except Exception as e:
            self.logger.error(f"Failed to parse JSON product: {e}")
            return None

    def _fill_product_prices(self, products: List[Dict[str, Any]]):
        """
        Batch-fetch missing prices via JD's price API (p.3.cn).
        """
        if not products:
            return
        sku_ids = [p.get('product_id') for p in products if p.get('product_id') and p.get('price', 0) <= 0]
        if not sku_ids:
            return
        self.logger.debug(f"Batch-fetching prices for {len(sku_ids)} products")
        sku_str = ','.join([f'J_{sku}' for sku in sku_ids])
        price_url = f"https://p.3.cn/prices/mgets?skuIds={sku_str}&type=1"
        response = self._make_request('GET', price_url, headers={
            'Referer': 'https://www.jd.com'
        })
        if response:
            try:
                price_data = response.json()
                price_map = {}
                for item in price_data:
                    sku = item.get('id', '').replace('J_', '')
                    price = clean_price(item.get('p', '0'))
                    original_price = clean_price(item.get('op', '0')) or None
                    price_map[sku] = {'price': price, 'original_price': original_price}
                for product in products:
                    sku = product.get('product_id')
                    if sku in price_map:
                        if product.get('price', 0) <= 0:
                            product['price'] = price_map[sku]['price']
                        if not product.get('original_price'):
                            product['original_price'] = price_map[sku]['original_price']
            except json.JSONDecodeError as e:
                self.logger.error(f"Failed to parse price data: {e}")
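
    # The price endpoint is expected to answer with one record per SKU, e.g.
    # (illustrative values; 'p' is the current price, 'op' the original price,
    # matching the keys read above):
    #
    #   [{"id": "J_100012043978", "p": "5999.00", "op": "6299.00"}]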

    def get_product_detail(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """
        Fetch JD product details.

        :param product_id: product id (SKU)
        :return: product detail dict
        """
        self.logger.info(f"Fetching JD product detail: product_id={product_id}")
        detail_url = f"https://item.jd.com/{product_id}.html"
        response = self._make_request('GET', detail_url)
        if not response:
            self.logger.warning(f"Failed to fetch product detail: {product_id}")
            return None
        return self._parse_product_detail(response.text, product_id, detail_url)

    def _parse_product_detail(self, html: str, product_id: str, source_url: str) -> Optional[Dict[str, Any]]:
        """
        Parse a product detail page.
        """
        try:
            title_match = re.search(r'<title>([^<]+)</title>', html) or \
                          re.search(r'<div[^>]*class="sku-name[^"]*"[^>]*>(.*?)</div>', html, re.DOTALL | re.IGNORECASE)
            title = ''
            if title_match:
                title = title_match.group(1)
                title = re.sub(r'<[^>]+>', '', title).strip()
                # <title> tags carry a "- 京东" style suffix; keep the first segment.
                title = title.split('-')[0].strip() if '-' in title else title
            shop_match = re.search(r'<div[^>]*class="name[^"]*"[^>]*>.*?<a[^>]*>(.*?)</a>', html, re.DOTALL | re.IGNORECASE) or \
                         re.search(r'shopName\s*:\s*"([^"]+)"', html)
            shop_name = shop_match.group(1).strip() if shop_match else ''
            # Detail pages render prices client-side, so fetch via the price API
            # and keep the filled-in result so the returned dict carries a price.
            placeholder = {'product_id': product_id, 'price': 0}
            self._fill_product_prices([placeholder])
            return {
                'product_id': str(product_id),
                'name': title,
                'price': placeholder.get('price', 0),
                'original_price': placeholder.get('original_price'),
                'url': source_url,
                'shop_name': shop_name,
                'is_wholesale': False,
                'currency': 'CNY'
            }
        except Exception as e:
            self.logger.error(f"Failed to parse product detail: {e}")
            return None

    def get_price(self, product_id: str, **kwargs) -> Optional[Dict[str, Any]]:
        """
        Fetch a product's price.

        :param product_id: product id
        :return: price info
        """
        self.logger.info(f"Fetching JD price: product_id={product_id}")
        sku_str = f'J_{product_id}'
        price_url = f"https://p.3.cn/prices/mgets?skuIds={sku_str}&type=1"
        response = self._make_request('GET', price_url, headers={
            'Referer': 'https://www.jd.com'
        })
        if response:
            try:
                price_data = response.json()
                if price_data and len(price_data) > 0:
                    item = price_data[0]
                    price = clean_price(item.get('p', '0'))
                    original_price = clean_price(item.get('op', '0')) or None
                    return {
                        'product_id': str(product_id),
                        'price': price,
                        'original_price': original_price,
                        'currency': 'CNY',
                        'platform': self.platform,
                        'source_url': f"https://item.jd.com/{product_id}.html",
                        'price_type': 'retail'
                    }
            except json.JSONDecodeError as e:
                self.logger.error(f"Failed to parse price data: {e}")
        return None

    def set_cookie(self, cookie: str):
        """
        Set the Cookie used for requests.
        """
        self.cookie = cookie
        self.headers['Cookie'] = cookie

    def set_proxy(self, proxy: str):
        """
        Set the proxy used for requests.
        """
        self.proxy = proxy
        self.session.proxies = {
            'http': proxy,
            'https': proxy
        }
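

# A minimal usage sketch, assuming BaseCrawler wires up self.session,
# self.headers, self.config and self.logger as the methods above expect.
# The cookie/proxy values are placeholders, not working credentials; because
# of the relative import at the top, run this as part of its package
# (e.g. python -m <package>.jd) rather than as a standalone script.
if __name__ == '__main__':
    crawler = JdCrawler()
    # crawler.set_cookie('your_jd_cookie_here')   # often needed to avoid anti-bot blocks
    # crawler.set_proxy('http://127.0.0.1:8080')  # optional proxy

    results = crawler.search('机械键盘', page=1, sort='sort_totalsales15_desc')
    for product in results[:5]:
        print(product['product_id'], product['price'], product['name'])

    if results:
        print(crawler.get_price(results[0]['product_id']))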