{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 特征工程" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.model_selection import train_test_split,StratifiedKFold\n", "from sklearn.metrics import auc,roc_auc_score\n", "import xgboost as xgb \n", "import os" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# 切换项目目录\n", "os.getcwd()\n", "os.chdir(\"/media/sf_share/linux/haoxin\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# 把特征个数少的列全部丢弃,使用和不适用效果影响不大,建议使用,省时间内存\n", "def null_ratio(df):\n", " features=df.columns\n", " feature_selected=[]\n", " drop_index=[]\n", " sz=df.size\n", " for feat in features:\n", " sz_null=df[df[feat].isnull()].size\n", " ratio=float(sz_null)/sz\n", " if ratio > 0.9:\n", " feature_selected.append((feat,ratio))\n", " drop_index.append(feat) \n", " return feature_selected,drop_index" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def dumyuserfeature(train):\n", " pre_feat = 'UserInfo_'\n", " # 检查离散变量和连续变量\n", " asd = 0\n", " index=[]\n", " train_copy = train.copy()\n", " for i ,col in enumerate(train.columns):\n", " print(i)\n", " cofe = len(train.groupby(col).count())\n", " if cofe < 20: #10,15都一样\n", " feikong = np.sum([train[col] != -999] )\n", " if feikong < len(train) * 0.1:\n", " continue\n", " \n", " train_copy = train_copy.join(pd.get_dummies(train[col], prefix=col+'_'))\n", " index.append(col)\n", " asd += 1 \n", " return train_copy" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# 导入数据\n", "a_train = pd.read_csv('data/A_train.csv')\n", "# a_train = pd.read_csv('data/A_train.csv',nrows =5)\n", "b_train = pd.read_csv('data/B_train.csv')\n", "b_test = pd.read_csv('data/B_test.csv')" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# 用字符串-999代替缺失值\n", "b_train = b_train.fillna(-999)\n", "b_test = b_test.fillna(-999)\n", "flags = b_train['flag']" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "1\n" ] }, { "ename": "ValueError", "evalue": "columns overlap but no suffix specified: Index([u'UserInfo_1__2054.47'], dtype='object')", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mb_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdumyuserfeature\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mb_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdumyuserfeature\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mb_test\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mdumyuserfeature\u001b[0;34m(train)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mtrain_copy\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_copy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_dummies\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'_'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0masd\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36mjoin\u001b[0;34m(self, other, on, how, lsuffix, rsuffix, sort)\u001b[0m\n\u001b[1;32m 4667\u001b[0m \u001b[0;31m# For SparseDataFrame's benefit\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4668\u001b[0m return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,\n\u001b[0;32m-> 4669\u001b[0;31m rsuffix=rsuffix, sort=sort)\n\u001b[0m\u001b[1;32m 4670\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4671\u001b[0m def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',\n", "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_join_compat\u001b[0;34m(self, other, on, how, lsuffix, rsuffix, sort)\u001b[0m\n\u001b[1;32m 4682\u001b[0m return merge(self, other, left_on=on, how=how,\n\u001b[1;32m 4683\u001b[0m \u001b[0mleft_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mon\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mright_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4684\u001b[0;31m suffixes=(lsuffix, rsuffix), sort=sort)\n\u001b[0m\u001b[1;32m 4685\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4686\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mon\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pandas/core/reshape/merge.pyc\u001b[0m in \u001b[0;36mmerge\u001b[0;34m(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0mright_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mright_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msort\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msort\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msuffixes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msuffixes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m copy=copy, indicator=indicator)\n\u001b[0;32m---> 54\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 55\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pandas/core/reshape/merge.pyc\u001b[0m in \u001b[0;36mget_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 573\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 574\u001b[0m llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf,\n\u001b[0;32m--> 575\u001b[0;31m rdata.items, rsuf)\n\u001b[0m\u001b[1;32m 576\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 577\u001b[0m \u001b[0mlindexers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mleft_indexer\u001b[0m\u001b[0;34m}\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mleft_indexer\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc\u001b[0m in \u001b[0;36mitems_overlap_with_suffix\u001b[0;34m(left, lsuffix, right, rsuffix)\u001b[0m\n\u001b[1;32m 4699\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mlsuffix\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mrsuffix\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4700\u001b[0m raise ValueError('columns overlap but no suffix specified: %s' %\n\u001b[0;32m-> 4701\u001b[0;31m to_rename)\n\u001b[0m\u001b[1;32m 4702\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4703\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mlrenamer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: columns overlap but no suffix specified: Index([u'UserInfo_1__2054.47'], dtype='object')" ] } ], "source": [ "b_train = dumyuserfeature(b_train)\n", "b_test = dumyuserfeature(b_test)\n", "b_test" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# 导出预处理数据的结果,删除了很多缺失值。\n", "b_train.to_csv('output/B_train_dummy.csv',index=False)\n", "b_test.to_csv('output/B_test_dummy.csv',index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }