Browse Source

增加数据处理

liuyuqi-dellpc 3 years ago
parent
commit
6751fb3711
4 changed files with 132 additions and 1 deletions
  1. 17 1
      crawl_car/crawl_car.py
  2. 0 0
      notebook/car-R.ipynb
  3. 0 0
      notebook/car-py.ipynb
  4. 115 0
      notebook/驾考分析.ipynb

+ 17 - 1
crawl_car/crawl_car.py

@@ -15,11 +15,14 @@ import logging
 from selenium import webdriver
 import pandas as pd
 
+
 class Enum(tuple):
     """Tuple of names whose members can be read as attributes.
     Accessing ``Enum(['A', 'B']).B`` yields the position of ``'B'`` in the
     tuple (here ``1``). A name that is not in the tuple raises ``ValueError``,
     propagated unchanged from ``tuple.index``.
     """
     def __getattr__(self, name):
         # Python calls __getattr__ only after normal lookup has failed, so the
         # regular tuple attributes (count, index, ...) keep working as usual.
         return tuple.index(self, name)
 
+
 # Browser names accepted by CrawlCar; each attribute (e.g. BrowserType.CHROME)
 # evaluates to the position of that name in this sequence.
 BrowserType = Enum((
     'FIREFOX',
     'CHROME',
     'IE',
     'SAFARI',
     'PHANTOMJS',
 ))
 
+
 class CrawlCar():
     def __init__(self, site, save_folder="data", browser=BrowserType.FIREFOX, driver=None):
         self.__site = site
@@ -128,7 +131,7 @@ class CrawlCar():
                                 tmp.append(col.text)
                             reportData = reportData.append(
                                 pd.Series(tmp, index=reportData.columns), ignore_index=True)  # 增加一行
-                        
+
                         # 点击下一步 ,这里有一个问题,第一页 span/a[2] 第二页之后就是 span/a[3]
                         if i > 0:
                             self.__browser.find_element_by_xpath(
@@ -141,5 +144,18 @@ class CrawlCar():
                 reportData.to_csv(self.__save_folder +
                                   "/report" + month1 + ".csv", header=False)
 
+    def merge(self):
+        '''
+        合并多个csv文件
+        '''
+        df = pd.DataFrame()
+        for parent, dirnames, filenames in os.walk(self.__save_folder):
+            for filename in filenames:
+                if filename.startswith("report"):
+                    df1 = pd.read_csv(os.path.join(parent, filename))
+                    df = pd.concat([df, df1])
+        df = df.drop(df[df["日期"] == "日期"].index).reset_index()
+        df.to_csv(self.__save_folder + "res.csv", header=False)
+
     def start(self):
         '''
         Entry point: run the crawler by delegating to ``self.crawl()``.
         '''
         self.crawl()

+ 0 - 0
car-R.ipynb → notebook/car-R.ipynb


+ 0 - 0
car-py.ipynb → notebook/car-py.ipynb


+ 115 - 0
notebook/驾考分析.ipynb

@@ -0,0 +1,115 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.0-final"
+  },
+  "orig_nbformat": 2,
+  "kernelspec": {
+   "name": "python_defaultSpec_1599503433913",
+   "display_name": "Python 3.6.0 64-bit ('root': conda)"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd \n",
+    "import numpy as np \n",
+    "import matplotlib.pyplot as plt\n",
+    "import os,sys,re,time\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "0   2015-07-01\n1   2015-07-01\nName: 3, dtype: datetime64[ns]\n"
+    }
+   ],
+   "source": [
+    "df =pd.read_csv(\"data/res.csv\",header=None)\n",
+    "df2= pd.to_datetime(df[3],format='%Y%m%d')\n",
+    "\n",
+    "print(df2.head(2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "index         3            4   5   6   7   8\n0      0  20150701  科目一第01考试点申通   3  --  --  --\n1      1  20150701  科目一第05考试点马陆  83  --  --  --\n"
+    }
+   ],
+   "source": [
+    "df2=df.drop([0,1,2],axis=1).reset_index()\n",
+    "\n",
+    "print(df2.head(2))\n",
+    "\n",
+    "\n",
+    "# df_all = df_all[df_all['purchase_num'].str.contains('人付款')] \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ]
+}