|
@@ -15,11 +15,14 @@ import logging
|
|
from selenium import webdriver
|
|
from selenium import webdriver
|
|
import pandas as pd
|
|
import pandas as pd
|
|
|
|
|
|
|
|
+
|
|
class Enum(tuple):
    """Minimal tuple-backed enumeration.

    Each member name is stored as an element of the tuple, and reading the
    name as an attribute returns its position, e.g.
    ``Enum(['A', 'B']).B == 1``.

    Raises:
        AttributeError: when the requested name is not a member.
    """

    def __getattr__(self, name):
        # __getattr__ is only consulted after normal attribute lookup fails,
        # so real tuple attributes (index, count, ...) are unaffected.
        try:
            return self.index(name)
        except ValueError:
            # tuple.index signals a miss with ValueError, but the attribute
            # protocol (hasattr, getattr with a default) only understands
            # AttributeError.
            raise AttributeError(name)
|
|
|
|
|
|
|
|
+
|
|
# Browsers the crawler can be asked to drive; each name is stored as one
# element of the Enum tuple.
BrowserType = Enum([
    'FIREFOX',
    'CHROME',
    'IE',
    'SAFARI',
    'PHANTOMJS',
])
|
|
|
|
|
|
|
|
+
|
|
class CrawlCar():
|
|
class CrawlCar():
|
|
def __init__(self, site, save_folder="data", browser=BrowserType.FIREFOX, driver=None):
|
|
def __init__(self, site, save_folder="data", browser=BrowserType.FIREFOX, driver=None):
|
|
self.__site = site
|
|
self.__site = site
|
|
@@ -128,7 +131,7 @@ class CrawlCar():
|
|
tmp.append(col.text)
|
|
tmp.append(col.text)
|
|
reportData = reportData.append(
|
|
reportData = reportData.append(
|
|
pd.Series(tmp, index=reportData.columns), ignore_index=True) # 增加一行
|
|
pd.Series(tmp, index=reportData.columns), ignore_index=True) # 增加一行
|
|
-
|
|
|
|
|
|
+
|
|
# 点击下一步 ,这里有一个问题,第一页 span/a[2] 第二页之后就是 span/a[3]
|
|
# 点击下一步 ,这里有一个问题,第一页 span/a[2] 第二页之后就是 span/a[3]
|
|
if i > 0:
|
|
if i > 0:
|
|
self.__browser.find_element_by_xpath(
|
|
self.__browser.find_element_by_xpath(
|
|
@@ -141,5 +144,18 @@ class CrawlCar():
|
|
reportData.to_csv(self.__save_folder +
|
|
reportData.to_csv(self.__save_folder +
|
|
"/report" + month1 + ".csv", header=False)
|
|
"/report" + month1 + ".csv", header=False)
|
|
|
|
|
|
|
|
def merge(self):
    """Merge the report CSV files under the save folder into one CSV.

    Walks ``self.__save_folder`` recursively and reads every file whose
    name starts with ``report`` using ``pandas.read_csv``. The frames are
    concatenated, rows that merely repeat the header (their ``日期`` column
    holds the literal text ``日期``) are removed, and the result is written
    without a header row to ``res.csv`` inside the save folder.

    If no report file is found, nothing is written.

    Raises:
        KeyError: if none of the report files has a ``日期`` column.
    """
    frames = []
    for parent, _dirnames, filenames in os.walk(self.__save_folder):
        # Sorted so the row order of the merged file does not depend on
        # the directory listing order of the file system.
        for filename in sorted(filenames):
            if filename.startswith("report"):
                frames.append(pd.read_csv(os.path.join(parent, filename)))

    if not frames:
        # Nothing to merge; an empty frame has no "日期" column to filter on.
        return

    # Concatenate once instead of growing a frame inside the loop.
    df = pd.concat(frames)

    # Filter with a boolean mask rather than drop(index): every file brings
    # its own 0..n index, so dropping by label would also remove unrelated
    # rows from other files that happen to share the label.
    df = df[df["日期"] != "日期"].reset_index()

    # Join with the folder so the file lands inside it (plain string
    # concatenation produced e.g. "datares.csv" next to the folder).
    df.to_csv(os.path.join(self.__save_folder, "res.csv"), header=False)
|
|
|
|
+
|
|
def start(self):
|
|
def start(self):
|
|
self.crawl()
|
|
self.crawl()
|