Untitled1
In [19]:
from train_model import *
In [7]:
df.shape
Out[7]:
(1460, 81)
In [13]:
features[‘Neighborhood’][0]
Out[13]:
‘CollgCr’
In [10]:
features.columns
Out[10]:
Index([‘MSSubClass’, ‘MSZoning’, ‘LotFrontage’, ‘LotArea’, ‘Street’, ‘Alley’,
‘LotShape’, ‘LandContour’, ‘Utilities’, ‘LotConfig’, ‘LandSlope’,
‘Neighborhood’, ‘Condition1’, ‘Condition2’, ‘BldgType’, ‘HouseStyle’,
‘OverallQual’, ‘OverallCond’, ‘YearBuilt’, ‘YearRemodAdd’, ‘RoofStyle’,
‘RoofMatl’, ‘Exterior1st’, ‘Exterior2nd’, ‘MasVnrType’, ‘MasVnrArea’,
‘ExterQual’, ‘ExterCond’, ‘Foundation’, ‘BsmtQual’, ‘BsmtCond’,
‘BsmtExposure’, ‘BsmtFinType1’, ‘BsmtFinSF1’, ‘BsmtFinType2’,
‘BsmtFinSF2’, ‘BsmtUnfSF’, ‘TotalBsmtSF’, ‘Heating’, ‘HeatingQC’,
‘CentralAir’, ‘Electrical’, ‘1stFlrSF’, ‘2ndFlrSF’, ‘LowQualFinSF’,
‘GrLivArea’, ‘BsmtFullBath’, ‘BsmtHalfBath’, ‘FullBath’, ‘HalfBath’,
‘BedroomAbvGr’, ‘KitchenAbvGr’, ‘KitchenQual’, ‘TotRmsAbvGrd’,
‘Functional’, ‘Fireplaces’, ‘FireplaceQu’, ‘GarageType’, ‘GarageYrBlt’,
‘GarageFinish’, ‘GarageCars’, ‘GarageArea’, ‘GarageQual’, ‘GarageCond’,
‘PavedDrive’, ‘WoodDeckSF’, ‘OpenPorchSF’, ‘EnclosedPorch’, ‘3SsnPorch’,
‘ScreenPorch’, ‘PoolArea’, ‘PoolQC’, ‘Fence’, ‘MiscFeature’, ‘MiscVal’,
‘MoSold’, ‘YrSold’, ‘SaleType’, ‘SaleCondition’],
dtype=’object’)
In [14]:
df_test, features_test = readData(‘test.csv’)
nf, nf_test = transformFeatures(features, features_test)
In [15]:
nf.shape
Out[15]:
(1460, 288)
In [16]:
nf_test.shape
Out[16]:
(1459, 288)
In [17]:
nf.columns
Out[17]:
Index([‘MSSubClass’, ‘LotFrontage’, ‘LotArea’, ‘OverallQual’, ‘OverallCond’,
‘YearBuilt’, ‘YearRemodAdd’, ‘MasVnrArea’, ‘BsmtFinSF1’, ‘BsmtFinSF2’,
…
‘SaleType_ConLw’, ‘SaleType_New’, ‘SaleType_Oth’, ‘SaleType_WD’,
‘SaleCondition_Abnorml’, ‘SaleCondition_AdjLand’,
‘SaleCondition_Alloca’, ‘SaleCondition_Family’, ‘SaleCondition_Normal’,
‘SaleCondition_Partial’],
dtype=’object’, length=288)
In [18]:
nf[‘Neighborhood_CollgCr’][0]
Out[18]:
1.0
In [21]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.linear_model import LassoCV
import sklearn.cross_validation as cv
import matplotlib.pyplot as plt
def readData(fn):
# read the file
# remove the first id and last price to get the feature data
df = pd.read_csv(fn)
features = df.loc[:, ‘MSSubClass’:’SaleCondition’]
return df, features
def transformFeatures(features, features_test):
all_features = pd.concat([features, features_test])
names = []
# for every feature
for feature in all_features:
# test if its numerical
if all_features[feature].dtype != “object”:
# test if it is too skew
if skew(features[feature].dropna()) > 0.8:
names.append(feature)
# print(‘skewed’)
# print(feature)
# log transformation
all_features[feature] = np.log1p(features[feature])
# convert categority to indicator variable
all_features = pd.get_dummies(all_features)
# print(all_features.head(10))
# replace empty value with mean
all_features = all_features.fillna(all_features.mean())
# split to train and test feature
features = all_features[:features.shape[0]]
features_test = all_features[features.shape[0]:]
return features, features_test
def modelTraining(features, target):
linear_model = LassoCV(alphas=[0.00001, 0.0001, 0.001, 0.01, 0.1], max_iter=10000).fit(features, target)
# print selected alpha
# print(linear_model.alpha_)
# the cross validation score
crossScore = cv.cross_val_score(linear_model, features, target, scoring=”mean_squared_error”)
return linear_model, linear_model.alpha_, crossScore
def getTopCoefficient(linear_model):
model_coefficient = pd.Series(linear_model.coef_, index=features.columns)
# non-zero coefficient features are selected
numSelect = sum(model_coefficient != 0)
removed = model_coefficient.shape[0] – numSelect
print(“select %d variables and remove %d variables” % (numSelect, removed))
# smallest 10
neg10 = model_coefficient.sort_values(ascending=True).head(10)
# largest 10
pos10 = model_coefficient.sort_values(ascending=False).head(10)
# print(neg10)
# print(pos10)
topCoefficent = pd.concat([pos10, neg10])
return topCoefficent
def modelTesting(linear_model, features, features_test):
# prediction on train data
train_pred = np.expm1(linear_model.predict(features))
pd.DataFrame({“pred”: train_pred, “correct”: df[“SalePrice”]}).to_csv(“compare.csv”,
index=False, header=True)
# compute the relative error
meanRela = np.zeros_like(train_pred)
for i in range(meanRela.shape[0]):
meanRela[i] = np.abs(train_pred[i] – df[“SalePrice”][i]) / df[“SalePrice”][i]
# print the mean relative error
print(np.mean(meanRela))
# save train prediction to file
outFrameTrain = pd.DataFrame({“id”: df.Id, “SalePrice”: train_pred})
outFrameTrain.to_csv(“train_pred.csv”, index=False, header=True)
# do prediction on test data and save to file
test_pred = np.expm1(linear_model.predict(features_test))
outFrame = pd.DataFrame({“id”: df_test.Id, “SalePrice”: test_pred})
outFrame.to_csv(“test_pred.csv”, index=False, header=True)
df, features = readData(‘train.csv’)
df_test, features_test = readData(‘test.csv’)
# log transform skewed features, convert categority to indicator variable,
# replance empty value by mean
features, features_test = transformFeatures(features, features_test)
# transfom sale price
target = np.log1p(df[“SalePrice”])
linear_model, alpha, crossScore = modelTraining(features, target)
topCoefficent = getTopCoefficient(linear_model)
select 79 variables and remove 209 variables
In [22]:
topCoefficent
Out[22]:
GrLivArea 0.369302
Neighborhood_Crawfor 0.087217
Neighborhood_StoneBr 0.077048
LotArea 0.071276
Neighborhood_NoRidge 0.063047
Exterior1st_BrkFace 0.060827
OverallQual 0.059689
Neighborhood_NridgHt 0.057854
Functional_Typ 0.056103
Condition1_Norm 0.052286
MSZoning_C (all) -0.202997
SaleCondition_Abnorml -0.050854
Neighborhood_Edwards -0.049865
MSZoning_RM -0.038466
CentralAir_N -0.030392
LandContour_Bnk -0.030336
BsmtExposure_No -0.014004
SaleType_WD -0.013042
MSSubClass -0.012221
ExterQual_TA -0.010308
dtype: float64
In [23]:
topCoefficent[‘GrLivArea’]
Out[23]:
0.36930151600431799
In [ ]: