liuyuqi-dellpc 11 months ago
parent
commit
6735665d63
4 changed files with 229 additions and 9 deletions
  1. 1 0
      .gitignore
  2. 2 0
      crawl_meituan/api.py
  3. 223 4
      crawl_meituan/meituan.py
  4. 3 5
      main.py

+ 1 - 0
.gitignore

@@ -0,0 +1 @@
+data.csv

+ 2 - 0
crawl_meituan/api.py

@@ -0,0 +1,2 @@
+
+# Proxy host URL; only the scheme prefix is set here (presumably completed elsewhere; TODO confirm).
+proxy_host='http://'

+ 223 - 4
crawl_meituan/meituan.py

@@ -6,12 +6,17 @@
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @Desc    :   None
 '''
+import base64
+import datetime
 import os,sys,re,logging
-import requests
+import zlib
+import requests,json
 from lxml import etree
 from concurrent.futures import ThreadPoolExecutor
 import asyncio
-import aiohttp
+import aiohttp,time,random
+import pandas as pd
+from fontTools.ttLib import TTFont
 
 class Meituan(object):
     ''' meituan spider '''
@@ -59,8 +64,222 @@ class Meituan(object):
     def get_list(self, page):
+        # Not implemented yet: the body is a bare ``pass`` and ``page`` is unused.
         pass
 
+    def proxy(self):
+        """Fetch one proxy address from the IP-provider API.
+
+        Returns:
+            dict: A ``requests``-style proxies mapping with a single
+            ``"https"`` entry of the form ``"<ip>:<port>"``.
+
+        Raises:
+            requests.RequestException: On network failure, timeout or a
+                non-2xx HTTP status.
+            ValueError: If the response body is not valid JSON.
+            KeyError, IndexError: If the JSON has no ``data[0]`` entry with
+                ``ip`` and ``port`` fields.
+        """
+        IPURL = "http://http.tiqu.alicdns.com/getip3?num=1&type=2&pro=&city=0&yys=0&port=1&pack=99107&ts=0&ys=0&cs=0&lb=1&sb=0&pb=45&mr=1&regions=&gm=4"
+        # Without a timeout a stalled provider would hang the whole crawl.
+        response = requests.get(url=IPURL, timeout=10)
+        # Fail here on an HTTP error instead of later on a confusing JSON/KeyError.
+        response.raise_for_status()
+        first = json.loads(response.content.decode())["data"][0]
+        return {"https": str(first["ip"]) + ":" + str(first["port"])}
+    
     def get_shop_info(self, shop_id):
+        # Not implemented yet: the body is a bare ``pass`` and ``shop_id`` is unused.
         pass
+    
+    @staticmethod
+    def fontPackage():
+        """Download the page's web font and map its glyph names to digits.
+
+        Requests a hard-coded shop menu page, then:
+
+        * stores every ``Set-Cookie`` fragment found in the response headers
+          (text up to ``; path``) in the module-level global ``cssVersion``;
+        * downloads the first ``.woff`` linked from the page to
+          ``maoyan.woff`` in the working directory;
+        * compares the first ten glyphs of that font with the first ten
+          glyphs of the local reference font ``./abc.woff`` (whose
+          glyph-to-digit table is hard-coded below) and stores the resulting
+          ``{glyph_name: digit}`` mapping in the module-level global
+          ``data2``.
+
+        Raises:
+            requests.RequestException: On network failure or timeout.
+            IndexError: If the page contains no ``.woff`` link.
+            OSError: If ``./abc.woff`` is missing or ``maoyan.woff`` cannot
+                be written.
+        """
+        global cssVersion
+        global data2
+        W_headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Accept-Language': 'zh-CN,zh;q=0.9',
+            'Cache-Control': 'max-age=0',
+            'Connection': 'keep-alive',
+            'Cookie': '_lxsdk_cuid=16d05dddafcc8-0442284bc8cf6c-78494577-49a10-16d05dddafc36; iuuid=3E841C72DA9D196B93B63BFFDCC6C65597046BF55AABFB11C539943F89A5B274; ci=20; cityname=%E5%B9%BF%E5%B7%9E; _lxsdk=3E841C72DA9D196B93B63BFFDCC6C65597046BF55AABFB11C539943F89A5B274; __utmz=74597006.1567761829.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _hc.v=d661dd6c-769e-42f0-c1f5-cd1dabd2087f.1567761838; IJSESSIONID=1l1elx4cck96mxaf6kdgehuuu; latlng=; __utma=74597006.1732546303.1567761829.1571106477.1575878401.7; __utmc=74597006; __utmb=74597006.2.9.1575878402052; i_extend=C_b1Gimthomepagecategory1394H__a; wm_order_channel=mtib; utm_source=60030; au_trace_key_net=default; openh5_uuid=3E841C72DA9D196B93B63BFFDCC6C65597046BF55AABFB11C539943F89A5B274; uuid=3E841C72DA9D196B93B63BFFDCC6C65597046BF55AABFB11C539943F89A5B274; showTopHeader=show; cssVersion=3c4250a3; _lx_utm=utm_source%3D60030; _lxsdk_s=16ee9ac132b-4aa-2ea-364%7C%7C84',
+            'Host': 'h5.waimai.meituan.com',
+            'Referer': 'http://i.meituan.com/?city=guangzhou',
+            'Upgrade-Insecure-Requests': '1',
+            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
+        }
+
+        url = 'https://h5.waimai.meituan.com/waimai/mindex/menu?mtShopId=1076985719664217&initialLat=23.170906&initialLng=113.342259&actualLat=&actualLng=&source=searchresult'
+        # A timeout keeps a stalled server from blocking the crawl forever.
+        responts = requests.get(url=url, headers=W_headers, timeout=30)
+        get_css = re.compile(r"(?<='Set-Cookie': ').*?(?=; path)")
+        cssVersion = get_css.findall(str(responts.headers))
+
+        # Locate the .woff link in the page and download it.
+        # Raw string: "\(" is a regex escape, not a Python string escape.
+        woff = re.compile(r'(?<=,url\(").*?(?=")')
+        woff_links = woff.findall(responts.text)
+        if not woff_links:
+            # Same exception type as the unguarded index, but with a usable message.
+            raise IndexError("no .woff font link found in the shop page")
+        file_woff = requests.get("http:" + woff_links[0], timeout=30)
+        with open("maoyan.woff", "wb") as font_file:
+            for chunk in file_woff.iter_content(chunk_size=1024):
+                if chunk:
+                    font_file.write(chunk)
+
+        font = TTFont('./abc.woff')
+        font2 = TTFont('./maoyan.woff')
+        # Glyph-name -> digit table for the reference font ./abc.woff.
+        data = {'uniF676': '0', 'uniF742': '3', 'uniED63': '1', 'uniE892': '5', 'uniEA7C': '9', 'uniF583': '7',
+                'uniF137': '4', 'uniE120': '8', 'uniE63B': '2', 'uniE9C9': '6'}
+        # The first two glyph-order entries are skipped (presumably non-digit
+        # placeholder glyphs; TODO confirm). Slicing to ten entries keeps the
+        # original "first ten glyphs" rule without an IndexError on short fonts.
+        reference = [(name, font['glyf'][name]) for name in font.getGlyphOrder()[2:12]]
+        data2 = {}
+        for name2 in font2.getGlyphOrder()[2:12]:
+            glyph2 = font2['glyf'][name2]
+            for name, glyph in reference:
+                if glyph2 == glyph:
+                    data2[name2] = data.get(name)
+
+    # Request shop information
+    @staticmethod
+    def storeInformation(ShopID,Token,uuid,test,cur,proxies,terse):
+        """Scrape one shop's menu and address and persist them through *cur*.
+
+        Relies on the module-level globals ``cssVersion`` and ``data2`` that
+        ``fontPackage`` populates, so that method must have run first.
+
+        Steps: POST to the ``food`` endpoint and collect every product of
+        every category; sleep 2-5 seconds; POST to the ``info`` endpoint for
+        the shop address; update the shop's row in ``business``; insert one
+        ``waimai_product_data`` row per product.
+
+        Args:
+            ShopID: Shop id; converted to ``str`` before use.
+            Token: Value sent as the ``token`` form field.
+            uuid (str): Device uuid spliced into the cookie and form data.
+            test: DB connection; only ``commit()`` is called on it.
+            cur: DB cursor; ``execute()`` and ``fetchone()`` are called on
+                it with ``%s`` placeholders.
+            proxies: ``requests`` proxies mapping, used only when *terse* is 1.
+            terse: ``1`` to send requests through *proxies*, ``0`` to
+                connect directly.
+
+        Raises:
+            ValueError: If *terse* is neither 0 nor 1, or a response body is
+                not valid JSON.
+            requests.RequestException: On network failure or timeout.
+            KeyError: If a response lacks the expected fields.
+        """
+        if terse not in (0, 1):
+            raise ValueError("terse must be 0 (direct) or 1 (via proxy)")
+        ShopID = str(ShopID)
+
+        HEADERS = {
+            # NOTE(review): hard-coded length, presumably copied from a captured request; confirm it is still wanted.
+            'Content-Length': '1650',
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'Connection': 'close',
+            'Cookie': 'terminal=i; w_utmz="utm_campaign=(direct)&utm_source=5000&utm_medium=(none)&utm_content=(none)&utm_term=(none)"; _lxsdk_cuid=16d05dddafcc8-0442284bc8cf6c-78494577-49a10-16d05dddafc36; iuuid='+ uuid +'; ci=20; cityname=%E5%B9%BF%E5%B7%9E; _lxsdk='+ uuid +'; __utmz=74597006.1567761829.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _hc.v=d661dd6c-769e-42f0-c1f5-cd1dabd2087f.1567761838; openh5_uuid='+uuid+'; wm_order_channel=mtib; utm_source=60030; token=zRdDEPPd-amzgXCz7AdlN5iN2joAAAAAWgoAAI8syfghZMrRr2dtzEMQvKuaZS0M7elQ8cym90Yi_NwSiEWZmkXWKzz1zEjTA0zbTg; mt_c_token=zRdDEPPd-amzgXCz7AdlN5iN2joAAAAAWgoAAI8syfghZMrRr2dtzEMQvKuaZS0M7elQ8cym90Yi_NwSiEWZmkXWKzz1zEjTA0zbTg; userId=803523721; w_token=zRdDEPPd-amzgXCz7AdlN5iN2joAAAAAWgoAAI8syfghZMrRr2dtzEMQvKuaZS0M7elQ8cym90Yi_NwSiEWZmkXWKzz1zEjTA0zbTg; isid=771E183918F5DE9700D34E21F1651E76; logintype=normal; t_lxid=17185eb92cfb8-0f4b8553f77b4a-2d604637-3d10d-17185eb92d0c8-tid; service-off=0; IJSESSIONID=1hagrhzjuwkbd1wznzf86tmy4p; oops=zRdDEPPd-amzgXCz7AdlN5iN2joAAAAAWgoAAI8syfghZMrRr2dtzEMQvKuaZS0M7elQ8cym90Yi_NwSiEWZmkXWKzz1zEjTA0zbTg; u=803523721; __utmc=74597006; au_trace_key_net=default; openh5_uuid='+ uuid +'; uuid='+ uuid +'; __utma=74597006.1732546303.1567761829.1587094243.1587103566.14; ci3=1; latlng=23.138153,113.367997,1587103569336; __utmb=74597006.4.9.1587103574718; i_extend=C_b1Gimthomepagecategory1394H__a; ' +
+                    cssVersion[0] + '; _lx_utm=utm_source%3D60030; w_latlng=0,0; w_visitid=325bea04-5a15-465d-9776-79827bad5bd3; channelType={%22mtib%22:%220%22}; w_actual_lat=23135645; w_actual_lng=113373507; _lxsdk_s=17186be8729-ed4-2e4-001%7C%7C8',
+            'Host': 'i.waimai.meituan.com',
+            'Origin': 'https://h5.waimai.meituan.com',
+            'Referer': 'https://h5.waimai.meituan.com/waimai/mindex/menu?mtShopId=' + ShopID + '&utm_source=60030&channel=mtib&source=shoplist&initialLat=&initialLng=&actualLat=23.135645&actualLng=113.373507',
+            'Sec-Fetch-Mode': 'cors',
+            'Sec-Fetch-Site': 'same-site',
+            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
+        }
+        DATA = {
+            'shopId': '0',
+            'orderPlatform': '',
+            'mtWmPoiId': ShopID,
+            'source': 'searchresult',
+            'address': '',
+            'cityId': '',
+            'channel': 6,
+            'gpsLng': 113.373507,
+            'gpsLat': 23.135645,
+            'uuid': uuid,
+            'platform': 3,
+            'partner': 4,
+            'originUrl': 'https://h5.waimai.meituan.com/waimai/mindex/menu?mtShopId=' + ShopID + '&utm_source=60030&channel=mtib&source=shoplist&initialLat=&initialLng=&actualLat=23.135645&actualLng=113.373507',
+            'riskLevel': 71,
+            'optimusCode': 10,
+            'wm_latitude': 23135645,
+            'wm_longitude': 113373507,
+            'wm_actual_latitude': 23135645,
+            'wm_actual_longitude': 113373507,
+            'openh5_uuid': uuid,
+            '_token': 'eJxNUdmuokoU/ReS64tGihlOQjqKIM6iiEPn5oShhAIBoYrB7vS/d3lvOqeTSvaa9sNe9ZOpFxHzwQFOAdyIaWHNfDDcGIxlZsQQTB1JVTggSIrMy8KICf/WNB5o0ogJam/GfHwXFGkky8q/b+FA+ZfwhXiRvndiQQNMQsgTf7BsIo07H+U+GucQkcYvxmGZs/9LbI6KCPbfyOsJdcqLz6cfw0FD8k9cNnUIdRkAAQzCxC8K+NBzgoIBJij6DHRuEMIWFkRHOfmHt5Iyh+9lCkOfwLisXxzFgiYOahijsvhEkc4BAEQRKAD8EWknmE79qwlOVgYIHyE5lA2B9b4u+9cBVg3ERCd1A2l1DD0yd99HCkAe0XKpxmnqSODkt5W9LTr9vyOj2cL7iv3HaIT8iW7oz1Abo7igCC5fJDOtNjEmTnJn13aKKrSZ+cnMrbjKXdrp7Zkn4tbzq2wS+x3PKgomHmlIpmmpr27VnTp0fVWW+y4tLV60qnV2sJ1jy3L8zHZcXtnvT+QwnZr5/C4eoo0aH6ZzZBjhabUfolnmz4M4dnPL2piGI7meafwwXBAvifRIuNlMwxsvdSPrnu2kyt2cTg/nHEGlQguvc1FxPAWrebNDqx0+dnGV7jea04ZgfVldgSFpUnW+XfKT2M33Ml8Ehp3dsXDtT2tXe6zZc29vi5htAizaxRRI1rKfH7OEDXuOm5bemlvh504xb/LLLJqhIp+zDgwfvhTdK/m5vlWqf+MD38iOzrJLck9tF169ENedvAV2FpMWuimK+tb2J1OhvIbi6+J4Acl2hV2TYIIn2l5y4iOOdpqiHubnU5yGZnqU+BaweDGsztcFd70MY0tChW8Kav9jJ1/Ul3O/AigscboVSkEQ9qhHps78+g1EWBa6'
+        }
+        DATA['token'] = Token
+        url = 'https://i.waimai.meituan.com/openh5/poi/food?_=' + ShopID + '&X-FOR-WITH=AAHm3WbILqxiZy6hgSgM9ClbQk%2FJKeGERSqeqt%2FF2I2kJYuiU9FbmEmFBqkAneOLyBCIfKwQOy6AyDoxT60wQbYlf6%2FDlCbPvTj54%2Bz1RudyTBXOq%2F%2BrIQXcOCrLXIXxMLNeV7jK1LNQOtYOEkpuhObQNL4aFKXdHDaLo2iyzhhRulVuEvTXTDa3u%2BnouiYS4nXPynTtpYzHyTgevxBTog%3D%3D'
+
+        def post(target):
+            # terse == 1 routes through the supplied proxy; terse == 0 connects directly.
+            if terse == 1:
+                return requests.post(target, data=DATA, headers=HEADERS, proxies=proxies, timeout=30)
+            return requests.post(target, data=DATA, headers=HEADERS, timeout=30)
+
+        def decode_digits(raw):
+            # Digits arrive as "&#x<code>;" entities of the custom font; turn
+            # each entity into its glyph name and swap in the mapped digit.
+            text = raw.replace("&#x", "uni")
+            for glyph_name, digit in data2.items():
+                marker = str(glyph_name).lower() + ';'
+                # A glyph without a known digit is left untouched instead of crashing.
+                if digit is not None and marker in text:
+                    text = text.replace(marker, digit)
+            return text
+
+        commodity_text = decode_digits(post(url).content.decode('utf-8'))
+        print(commodity_text)
+        commodity_json = json.loads(commodity_text)
+
+        shopName = commodity_json['data']['shopInfo']['shopName']       # shop name
+        print(shopName)
+
+        # One tuple per product: name, description, monthly sales, attribute
+        # names (the Chinese-character runs found in spuAttrList), original
+        # price, current price.
+        chinese_run = re.compile("[\u4e00-\u9fa5]+")
+        products = []
+        for classification in commodity_json['data']['categoryList']:
+            for spu in classification['spuList']:
+                products.append((
+                    spu['spuName'],
+                    str(spu['spuDesc']),
+                    spu['saleVolumeDecoded'],
+                    str(chinese_run.findall(str(spu['spuAttrList']))),
+                    spu['originPrice'],
+                    spu['currentPrice'],
+                ))
+
+        # Pause between the two requests.
+        time.sleep(random.randint(2,5))
+        business_text = decode_digits(post(url.replace('food', 'info')).content.decode('utf-8'))
+        business_json = json.loads(business_text)
+        shopAddress = business_json['data']['shopAddress']          # shop address
+        nowTime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")     # crawl time
+
+        # Parameterized queries: scraped names and addresses may contain quotes,
+        # which would break (or inject into) string-built SQL.
+        cur.execute("SELECT trading_area_type, brand_id FROM business WHERE id=%s", (ShopID,))
+        tradingAreaId, brandid = cur.fetchone()[:2]  # trading-area type, brand id
+        cur.execute("SELECT name FROM brand WHERE id = %s", (brandid,))
+        brandname = cur.fetchone()[0]  # brand name
+        cur.execute(
+            "UPDATE business SET name=%s,address=%s,brand_name=%s,create_time=%s WHERE id=%s",
+            (str(shopName), str(shopAddress), str(brandname), str(nowTime), ShopID),
+        )
+        test.commit()
+
+        # Write the products to the database, committing after every row.
+        sqllan = "INSERT INTO waimai_product_data ( product_name, introduce, sales, standards, origin_price, current_price, trading_area_type, bussiness_id, bussiness_name, bussiness_address, brand_id, brand_name, create_time )VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+        for name, desc, sales, attrs, origin, current in products:
+            cur.execute(sqllan, (name, desc, sales, attrs, origin, current, tradingAreaId, ShopID, shopName, shopAddress, brandid, brandname, nowTime))
+            test.commit()
+
+    @staticmethod
+    def changeToken(uuid,shopId):
+        """Build the request token for one shop.
+
+        The query-style sign string is zlib-compressed and base64-encoded,
+        embedded as the ``sign`` field of a JSON payload together with the
+        current millisecond timestamp (``ts``) and ``ts + 80000`` (``cts``),
+        and that payload is compressed and base64-encoded again.
+
+        Args:
+            uuid (str): Device uuid placed in the sign string.
+            shopId: Shop id; converted to ``str`` before use.
+
+        Returns:
+            bytes: The base64-encoded, zlib-compressed JSON token.
+        """
+        shopId = str(shopId)
+        signData = 'dpShopId=-1&geoType=2&mtWmPoiId='+ shopId +'&openh5_uuid='+ uuid +'&optimusCode=10&originUrl=https://h5.waimai.meituan.com/waimai/mindex/menu?dpShopId=&mtShopId='+ shopId +'&utm_source=60030&channel=mtib&source=shoplist&initialLat=&initialLng=&actualLat=23.135645&actualLng=113.373507&partner=4&platform=3&riskLevel=71&skuId=&source=shoplist&uuid=' + uuid + '&wm_actual_latitude=23135645&wm_actual_longitude=113373507&wm_latitude=0&wm_longitude=0'
+        # Decode to text: str() on bytes would embed the literal b'...' wrapper
+        # in the JSON instead of the base64 payload itself.
+        sign = base64.b64encode(zlib.compress(signData.encode("utf-8"))).decode("ascii")
+        timeStamp = int(round(time.time() * 1000))
+        tokenData = '{"rId":101701,"ver":"1.0.6","ts":'+ str(timeStamp) +',"cts":'+ str(timeStamp + 80000)+',"brVD":[375,667],"brR":[[375,667],[375,667],24,24],"bI":["https://h5.waimai.meituan.com/waimai/mindex?type=main_page&utm_source=60030&channel=mtib&stid_b=1&cevent=imt%2Fhomepage%2Fcategory1%2F394","https://i.meituan.com/?cevent=imt%2Fguide%2Fi%2FiPhone&city=guangzhou"],"mT":["174,309"],"kT":[],"aT":["174,309,DIV"],"tT":[],"aM":"","sign":"'+ sign +'"}'
+        return base64.b64encode(zlib.compress(tokenData.encode("utf-8")))
+
+    # Read the shop ids and walk over the shops to crawl
+    def createShopID(self, shop_ids=None):
+        """Prepare the font map and iterate over the shops to crawl.
+
+        The MySQL lookup that used to supply the id list and the
+        ``storeInformation`` calls that consume each id are disabled in this
+        revision (no DB connection is created here), so the ids must be
+        passed in and the loop currently only builds a token and sleeps.
+
+        Args:
+            shop_ids: Optional iterable of rows whose first element is the
+                shop id (the shape ``cursor.fetchall()`` returns for
+                ``SELECT id FROM business``). ``None`` means no shops.
+        """
+        Meituan.fontPackage()
+        uuids = ['09BB857AE6740F9973DA0A635474792559E4534D4CBE0929A7CA23BFFEEAD75A','3E841C72DA9D196B93B63BFFDCC6C65597046BF55AABFB11C539943F89A5B274','39F32AD369D12C7D544EB76B5EC1D6C04F069642F44B4BBDB060537931998F98','3E0416149E317B185D28312A78CED3F72CC0BF6FC0F89F28F019617CAACDFDED']
+        for ShopID in (shop_ids or ()):
+            shop_id = str(ShopID[0])
+            # random.choice can return every uuid; randint(0, 2) never reached the last one.
+            uuid = random.choice(uuids)
+            Token = Meituan.changeToken(uuid, shop_id)
+            try:
+                # TODO: re-enable once a DB connection (test/cur) is available:
+                # storeInformation(shop_id, Token, uuid, test, cur, proxies, 1)
+                time.sleep(random.randint(5, 10))
+            except Exception:
+                try:
+                    # TODO: direct retry, disabled for the same reason:
+                    # storeInformation(shop_id, Token, uuid, test, cur, 0, 0)
+                    # Fetch a fresh proxy for the following shops.
+                    proxies = self.proxy()
+                except Exception:
+                    # Skip the failed shop and record it in the log file.
+                    with open('log.txt', mode='a+', encoding='utf-8') as file_handle:
+                        file_handle.write(shop_id + '爬取失败\t' + str(datetime.datetime.now()) +'\n')
+                    time.sleep(random.randint(3, 7))
+                    continue
+        
+    def save(self, data:pd.DataFrame):
+        """Write *data* to ``data.csv`` without the index, as UTF-8 with a BOM."""
+        output_path = "data.csv"
+        csv_options = {"index": False, "encoding": "utf-8-sig"}
+        data.to_csv(output_path, **csv_options)
+
 
-    def save(self, data):
-        pass
+if __name__ == '__main__':
+    # Script entry point: build the spider and start it.
+    meituan = Meituan()
+    meituan.run()

+ 3 - 5
main.py

@@ -15,12 +15,10 @@ def read_conf(config: configparser.ConfigParser):
     config.read('conf/config.ini', encoding='utf-8')
     if config.has_section('base'):
         cookie = config.get("base", "cookie",raw=True)
-        url = config.get("base", "url",raw=True)
-        urls_path = config['base']['urls_path']
-        return cookie,url,urls_path
+        return cookie
 
 if __name__=='__main__':
+    # Read the cookie from conf/config.ini, then build and start the spider.
     config = configparser.ConfigParser()
-    cookie,url,url_path =read_conf(config)
-    meituan = Meituan(cookie,url,url_path)
+    cookie =read_conf(config)
+    meituan = Meituan(cookie)
     meituan.run()