· kaggle python scikit-learn data-science random-forest

scikit-learn: Random forests - Feature Importance


import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# We'll use this library to make the display pretty
from tabulate import tabulate

# Load the training data (expects a 'train.csv' file in the working directory).
train = pd.read_csv('train.csv')

# The model can only handle numeric values, so filter out the rest.
# Gaps are filled by interpolation; any row that still contains a NaN
# afterwards (e.g. a leading gap that interpolation cannot fill) is dropped.
data = train.select_dtypes(include=[np.number]).interpolate().dropna()

# Take the target from the *cleaned* frame, not from `train`: if dropna()
# removed any rows, `train.SalePrice` would no longer have the same length
# as X and train_test_split would fail on inconsistent sample counts.
y = data["SalePrice"]
X = data.drop(columns=["SalePrice", "Id"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=.33
)

# Seed the forest so the reported importances are reproducible between runs.
clf = RandomForestRegressor(n_jobs=2, n_estimators=1000, random_state=42)
model = clf.fit(X_train, y_train)

# Pair every feature name with its importance and list the most important first.
headers = ["name", "score"]
values = sorted(
    zip(X_train.columns, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
print(tabulate(values, headers, tablefmt="plain"))

name                 score
OverallQual    0.553829
GrLivArea      0.131
BsmtFinSF1     0.0374779
TotalBsmtSF    0.0372076
1stFlrSF       0.0321814
GarageCars     0.0226189
GarageArea     0.0215719
LotArea        0.0214979
YearBuilt      0.0184556
2ndFlrSF       0.0127248
YearRemodAdd   0.0126581
WoodDeckSF     0.0108077
OpenPorchSF    0.00945239
LotFrontage    0.00873811
TotRmsAbvGrd   0.00803121
GarageYrBlt    0.00760442
BsmtUnfSF      0.00715158
MasVnrArea     0.00680341
ScreenPorch    0.00618797
Fireplaces     0.00521741
OverallCond    0.00487722
MoSold         0.00461165
MSSubClass     0.00458496
BedroomAbvGr   0.00253031
FullBath       0.0024245
YrSold         0.00211638
HalfBath       0.0014954
KitchenAbvGr   0.00140786
BsmtFullBath   0.00137335
BsmtFinSF2     0.00107147
EnclosedPorch  0.000951266
3SsnPorch      0.000501238
PoolArea       0.000261668
LowQualFinSF   0.000241304
BsmtHalfBath   0.000179506
MiscVal        0.000154799
  • LinkedIn
  • Tumblr
  • Reddit
  • Google+
  • Pinterest
  • Pocket