|
@@ -0,0 +1,63 @@
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+import requests
|
|
|
+import warnings
|
|
|
+import re
|
|
|
+from datetime import datetime
|
|
|
+import json
|
|
|
+import pandas as pd
|
|
|
+import random
|
|
|
+import time
|
|
|
+from datetime import datetime
|
|
|
+#from selenium import webdriver
|
|
|
+
|
|
|
+
|
|
|
# Before each crawl:
#   * paste a fresh cookie string into ``cookies`` (it must be renewed
#     for every run);
#   * ``url`` is the search results page URL; its trailing ``s={}``
#     placeholder is filled in with the item offset of each page;
#   * change the output file name.

# Request headers sent with every page fetch.
headers = {
    'User-Agent': '',
    'Referer': 'https://www.taobao.com/',
    'Connection': 'keep-alive',
}

# Search URL template; adjacent literals are concatenated into one string.
url = (
    'https://s.taobao.com/search'
    '?q=%E5%8F%A3%E7%BA%A2'
    '&imgfile=&commend=all&ssid=s5-e&search_type=item'
    '&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1'
    '&ie=utf8&initiative_id=tbindexz_20170306'
    '&sort=sale-desc&bcoffset=0&p4ppushleft=%2C44'
    '&s={}'
)

cookies = {'cookie': ''}

# One accumulator list per scraped field; each is a distinct list object.
price, name, address, count = [], [], [], []
title, id_, detail_url, sale = [], [], [], []

# Counter used for the progress message; starts at 1.
ji = 1
|
|
|
+
|
|
|
# Captures the text assigned to ``g_page_config`` in the page source, up to
# (but not including) the following ``g_srp_loadCss`` token. ``re.S`` lets
# the capture span multiple lines.
PAGE_CONFIG_RE = re.compile(r'g_page_config =(.*?)g_srp_loadCss', re.S)


def extract_auctions(page_text):
    """Return the list of item dicts embedded in one search results page.

    Args:
        page_text: Decoded text of a search results page.

    Returns:
        The value stored under ``mods -> itemlist -> data -> auctions`` of
        the JSON object assigned to ``g_page_config`` in the page.

    Raises:
        ValueError: If the page contains no ``g_page_config`` assignment,
            or the captured text is not valid JSON.
        KeyError: If the decoded object lacks the expected nesting.
    """
    match = PAGE_CONFIG_RE.search(page_text)
    if match is None:
        # Previously this surfaced as a bare IndexError from ``findall()[0]``.
        raise ValueError(
            'g_page_config not found in the response; '
            'the cookies may need to be pasted again'
        )
    # The assignment is terminated by ';', which is not part of the JSON.
    payload = match.group(1).strip().rstrip(';')
    # ``json.loads`` takes no ``encoding`` argument for ``str`` input; the
    # keyword was removed in Python 3.9 and passing it raises TypeError.
    config = json.loads(payload)
    return config['mods']['itemlist']['data']['auctions']


# Page indices to crawl, as ``range(first_page, stop_page)``. The ``s``
# offset placed in the URL is 44 times the page index.
# NOTE(review): with both bounds at 51 the range is empty, so no page is
# fetched as written. Set the bounds to the pages you want before running.
first_page, stop_page = 51, 51

for i in range(first_page, stop_page):
    url_range = url.format(44 * i)
    res = requests.get(url_range, headers=headers, cookies=cookies, timeout=10)
    res.encoding = 'utf-8'
    print('正在爬取第'+str(ji)+'页')
    for item in extract_auctions(res.text):
        name.append(item['nick'])
        price.append(item['view_price'])
        address.append(item['item_loc'])
        # Strip the textual suffix from the sales figure.
        count.append(item['view_sales'].replace('人收货', ''))
        title.append(item['raw_title'])
        id_.append(item['nid'])
        detail_url.append(item['detail_url'])
    ji += 1
    # Random pause of 3 to 103 seconds between page requests.
    time.sleep(random.uniform(3, 103))
|
|
|
+
|
|
|
print('爬取完成')

# Assemble the scraped columns into a table. Keys are the column headers
# written to the spreadsheet, in this order.
# (An earlier variant sliced every column to its first 177 entries.)
result = {
    '店铺名称': name,
    '商品标题': title,
    '价格': price,
    '地址': address,
    '商品编号': id_,
    '收货人数': count,
    '详情页网址': detail_url,
}
results = pd.DataFrame(result)

# Print a summary of the frame, then write it out as an Excel workbook.
results.info()
results.to_excel('口红ID.xlsx')
|
|
|
+
|