liuyuqi-dellpc 11 months ago
parent
commit
855bb14018
9 changed files with 153 additions and 1 deletions
  1. 21 1
      README.md
  2. 0 0
      conf/.gitkeep
  3. 4 0
      conf/config.ini
  4. 1 0
      conf/urls.txt
  5. 2 0
      crawl_meituan/.gitignore
  6. 1 0
      crawl_meituan/__init__.py
  7. 66 0
      crawl_meituan/meituan.py
  8. 26 0
      main.py
  9. 32 0
      test.py

+ 21 - 1
README.md

@@ -1,3 +1,23 @@
 # crawl_meituan
 
-美团
+美团爬虫,获取店铺名
+
+## Usage
+
+1、登录美团,浏览器获取cookie
+2、打开conf/config.ini,填入cookie
+3、填入url,运行main.py
+4、输出 result.txt结果
+
+```
+python main.py
+```
+
+**批量生成:**
+
+1、登录美团,浏览器获取cookie
+2、打开conf/config.ini,填入cookie
+3、填入urls_path,默认为conf/urls.txt,每行一个url
+4、打开conf/urls.txt,填入url,运行main.py
+5、输出 result.txt结果
+

+ 0 - 0
conf/.gitkeep


+ 4 - 0
conf/config.ini

@@ -0,0 +1,4 @@
+[base]
+cookie = "wm_order_channel=default; request_source=openh5; au_trace_key_net=default; _lxsdk_cuid=1889c1945f5c8-0b5a22a3ae49df-26031d51-100200-1889c1945f5c8; isIframe=false; WEBDFPID=v00y48y4546u5uyzzu7wzw2y005z32918110x80450w979582u69v309-2001606085819-1686246081061SOYQGMGfd79fef3d01d5e9aadc18ccd4d0c95071417; channelType={%22default%22:%220%22}; channelConfig={%22channel%22:%22default%22%2C%22type%22:0%2C%22fixedReservation%22:{%22reservationTimeStatus%22:0%2C%22startReservationTime%22:0%2C%22endReservationTime%22:0}}; showTopHeader=show; isTcMpa=false; iuuid=53C8215B347576A2B868D71E1C5E09F54AFA444953A0F488BEA7A38383494DA5; openh5_uuid=53C8215B347576A2B868D71E1C5E09F54AFA444953A0F488BEA7A38383494DA5; _lxsdk=53C8215B347576A2B868D71E1C5E09F54AFA444953A0F488BEA7A38383494DA5; token=AgHUIWaP3j8Xnl4svTPUBhJ8W_n_x8c2TbACQ3FAFsdYYTV8hV8ULaeypKcFHD5GCFyONe_wNmpUbAAAAADZGAAAQsXGBmwQtiNFTPmZwsYfJFL_xcSseJ_wRtuaHzfAMMsqndzqj4cEPebFmAKe0KRI; mt_c_token=AgHUIWaP3j8Xnl4svTPUBhJ8W_n_x8c2TbACQ3FAFsdYYTV8hV8ULaeypKcFHD5GCFyONe_wNmpUbAAAAADZGAAAQsXGBmwQtiNFTPmZwsYfJFL_xcSseJ_wRtuaHzfAMMsqndzqj4cEPebFmAKe0KRI; oops=AgHUIWaP3j8Xnl4svTPUBhJ8W_n_x8c2TbACQ3FAFsdYYTV8hV8ULaeypKcFHD5GCFyONe_wNmpUbAAAAADZGAAAQsXGBmwQtiNFTPmZwsYfJFL_xcSseJ_wRtuaHzfAMMsqndzqj4cEPebFmAKe0KRI; userId=3551010057; userId=3551010057; userIdCanceled=0; uuidCanceled=0; userName=kIv431145404; userFace=; logan_session_token=wuin052kvbsz4fucgo1l; _lx_utm=utm_source%3D; cssVersion=20a70494; uuid=2159b3e8ad4d40f99665.1686249025.1.0.0; mtcdn=K; _lxsdk_s=1889c3b484f-cfa-777-d7d%7C%7C114"
+url = "https://h5.waimai.meituan.com/waimai/mindex/menu?mtShopId=1000088679487895&utm_source=&channel=default&source=shoplist&initialLat=39.940816&initialLng=116.444976&actualLat=39.940816&actualLng=116.444976"
+urls_path=conf/urls.txt

+ 1 - 0
conf/urls.txt

@@ -0,0 +1 @@
+https://h5.waimai.meituan.com/waimai/mindex/menu?mtShopId=1000088679487895&utm_source=&channel=default&source=shoplist&initialLat=39.940816&initialLng=116.444976&actualLat=39.940816&actualLng=116.444976

+ 2 - 0
crawl_meituan/.gitignore

@@ -0,0 +1,2 @@
+*.pyc
+__pycache__/

+ 1 - 0
crawl_meituan/__init__.py

@@ -0,0 +1 @@
+from crawl_meituan.meituan import Meituan

+ 66 - 0
crawl_meituan/meituan.py

@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/06/09 01:46:55
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   None
+'''
+import os,sys,re,logging
+import requests
+from lxml import etree
+from concurrent.futures import ThreadPoolExecutor
+import asyncio
+import aiohttp
+
+class Meituan(object):
+    ''' meituan spider '''
+
+    def __init__(self, cookie:str,url:str,url_path="conf/urls.txt" ) -> None:
+        self.url=url
+        self.sess=requests.session()
+        self.headers = {
+            'Cookie': cookie,
+            'Host': 'www.meituan.com',
+            'Referer': 'https://www.meituan.com/',
+            "Content-Type": "application/x-www-form-urlencoded",
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
+        }
+
+        # https://i.waimai.meituan.com/openapi/v1/poi/food?_=1686248568250
+        
+
+        
+    def run(self):
+        # with ThreadPoolExecutor(max_workers=10) as executor:
+        #     for i in range(1, 10):
+        #         executor.submit(self.get_list, i)
+        uid="c3d2d2c0-4b5e-4b1e-8b0e-2b0b5b4b5b4b"
+        sid=re.findall(r"mtShopId=(\d+)", self.url)[0]
+        params = {
+            "page_offset": "0",
+            "page_size": "10",
+            "filter": "1",
+            "poiid": sid, # 替换成你要查询的商家ID
+            "type": "0",
+            "source": "1",
+            "platform": "1",
+            "append": "0",
+            "partner": "4",
+            "originUrl": "http://www.meituan.com/meishi/"
+        }
+        url="https://www.meituan.com/meishi/api/poi/getMerchantComment"
+        res=self.sess.post(url,headers=self.headers,data=params)
+        # save index.html
+        with open("index.html","w",encoding="utf-8") as f:
+            f.write(res.text)
+        print(res.text)
+        
+    def get_list(self, page):
+        pass
+
+    def get_shop_info(self, shop_id):
+        pass
+
+    def save(self, data):
+        pass

+ 26 - 0
main.py

@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/06/09 01:45:08
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   enter point
+'''
+import os,sys,re,logging
+import argparse
+import configparser
+from crawl_meituan import Meituan
+
+def read_conf(config: configparser.ConfigParser):
+    config.read('conf/config.ini', encoding='utf-8')
+    if config.has_section('base'):
+        cookie = config.get("base", "cookie",raw=True)
+        url = config.get("base", "url",raw=True)
+        urls_path = config['base']['urls_path']
+        return cookie,url,urls_path
+
+if __name__=='__main__':
+    config = configparser.ConfigParser()
+    cookie,url,url_path =read_conf(config)
+    meituan = Meituan(cookie,url,url_path)
+    meituan.run()

+ 32 - 0
test.py

@@ -0,0 +1,32 @@
+# import configparser
+# import os
+# config = configparser.ConfigParser()
+
+# # read config from conf/config.ini
+# def read_conf():
+#     config.read(os.path.join("conf","config.ini"),encoding="utf-8")
+#     if config.has_section('db'):
+#         host = config['db']['host']
+#         port = config['db']['port']
+#         user = config['db']['user']
+#         password = config['db']['password']
+#         db = config['db']['db']
+#         print(host, port, user, password, db)
+#         return host, port, user, password, db
+
+# def write_conf():
+#     # write config to conf/config.ini
+#     config.add_section('db')
+#     config.set('db', 'host', '122')
+#     config.set('db', 'port', '333306')
+
+#     config.write(open('conf/config.ini', 'w'))
+
+# read_conf()
+
+import re
+url = "https://h5.waimai.meituan.com/waimai/mindex/menu?mtShopId=1000088679487895&utm_source=&channel=default&source=shoplist&initialLat=39.940816&initialLng=116.444976&actualLat=39.940816&actualLng=116.444976"
+# get mtShopId
+mtShopId = re.findall(r"mtShopId=(\d+)", url)[0]
+print(mtShopId)
+