'''
Created on 2017-05-16
@version: python3.6
@author: liuyuqi
'''
import csv

import requests
from bs4 import BeautifulSoup
# Root of the second-hand listing index; a district slug and an optional
# "d<page>" suffix are appended to it.
BASE_URL = 'http://sh.lianjia.com/ershoufang'
# CSV file the scraped rows are written to (overwritten on every run).
OUTPUT_PATH = 'lianjia.csv'
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 30

# Listing-page URLs collected by main(); filled in district by district.
urls = []
# Full list of district slugs; kept for reference, only `citys` is scraped.
citys1 = ['pudongxinqu', 'minhang', 'baoshan', 'xuhui', 'putuo', 'yangpu', 'changning', 'songjiang',
          'jiading', 'huangpu', 'jinan', 'zhabei', 'hongkou', 'qingpu', 'fengxian', 'jinshan', 'chongming']
# District slugs that are actually scraped.
citys = ['pudongxinqu']
# Not referenced anywhere in this script.
data = {"user": "user", "password": "pass"}
# Browser-like request headers sent with every page fetch.
headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
           "Accept-Encoding": "gzip",
           "Accept-Language": "zh-CN,zh;q=0.8",
           "Referer": "http://www.example.com/",
           "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
           }


def stripped_lines(text: str) -> list:
    '''Split *text* on newlines and strip surrounding whitespace from each piece.'''
    return [line.strip() for line in text.split('\n')]


def count_pages(pagination_text: str) -> int:
    '''Return the page count encoded in the text of a pagination block.

    The text is split into stripped lines; the third-from-last line is used
    when there are more than three lines, otherwise the second-from-last.

    Raises:
        IndexError: if the text yields fewer than two lines.
        ValueError: if the selected line is not an integer.
    '''
    tokens = stripped_lines(pagination_text)
    chosen = tokens[-3] if len(tokens) > 3 else tokens[-2]
    return int(chosen)


def fetch_soup(url: str):
    '''Download *url* with the shared headers and return it parsed by BeautifulSoup.

    The body is decoded as UTF-8 straight from the raw bytes, so the result
    does not depend on the encoding requests guessed from the response
    headers (which may be wrong or missing).
    '''
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.content.decode('utf-8'), 'html.parser')


def listing_urls(city: str) -> list:
    '''Return the URL of every result page for the district slug *city*.'''
    soup = fetch_soup('%s/%s/' % (BASE_URL, city))
    pagination = soup.find_all('div', {'class': 'c-pagination'})
    # NOTE(review): a missing pagination block is treated as "one page only";
    # confirm against the live site that this is what its absence means.
    total = count_pages(pagination[0].text) if pagination else 1
    return ['%s/%s/d%s' % (BASE_URL, city, number) for number in range(1, total + 1)]


def parse_details(con_text: str) -> tuple:
    '''Extract (region, loucheng, chaoxiang, builtdate) from a "con" div's text.

    The fields are picked by fixed line position; all but the region have
    their first two characters dropped (presumably a separator prefix).
    '''
    info = stripped_lines(con_text)
    return info[1], info[2][2:], info[5][2:], info[-3][2:]


def listing_row(panel) -> tuple:
    '''Build one CSV row from a single "info-panel" element.

    Column order: name, room_type, size, region, loucheng, chaoxiang,
    price, price_union, builtdate.

    Raises:
        IndexError / AttributeError: if the panel lacks an expected element.
    '''
    where_spans = panel.find_all('div', {'class': 'where'})[0].find_all('span')
    name = where_spans[0].text
    room_type = where_spans[1].text
    # Trailing characters are cut off the raw texts below (presumably unit
    # suffixes); the slice widths are unchanged from the original script.
    size = where_spans[2].text[:-3]

    con_text = panel.find_all('div', {'class': 'con'})[0].text
    region, loucheng, chaoxiang, builtdate = parse_details(con_text)

    price = panel.find('div', {'class': 'price'}).text.strip()[:-1]
    price_union = panel.find('div', {'class': 'price-pre'}).text.strip()[:-3]
    return (name, room_type, size, region, loucheng, chaoxiang, price, price_union, builtdate)


def main():
    '''Collect every result page of the configured districts and dump the listings to CSV.'''
    for city in citys:
        urls.extend(listing_urls(city))

    # `with` guarantees the file is flushed and closed even when a page fails
    # to parse half-way through the run.
    with open(OUTPUT_PATH, 'w', encoding='utf-8', newline='') as out:
        # csv.writer quotes fields that contain commas; rows still end in "\n".
        writer = csv.writer(out, lineterminator='\n')
        for url in urls:
            soup = fetch_soup(url)
            for panel in soup.find_all(name='div', attrs={'class': 'info-panel'}):
                writer.writerow(listing_row(panel))


if __name__ == '__main__':
    main()