程序代写代做代考 Untitled1

Untitled1

In [19]:

from train_model import *

In [7]:

df.shape

Out[7]:

(1460, 81)

In [13]:

features[‘Neighborhood’][0]

Out[13]:

‘CollgCr’

In [10]:

features.columns

Out[10]:

Index([‘MSSubClass’, ‘MSZoning’, ‘LotFrontage’, ‘LotArea’, ‘Street’, ‘Alley’,
‘LotShape’, ‘LandContour’, ‘Utilities’, ‘LotConfig’, ‘LandSlope’,
‘Neighborhood’, ‘Condition1’, ‘Condition2’, ‘BldgType’, ‘HouseStyle’,
‘OverallQual’, ‘OverallCond’, ‘YearBuilt’, ‘YearRemodAdd’, ‘RoofStyle’,
‘RoofMatl’, ‘Exterior1st’, ‘Exterior2nd’, ‘MasVnrType’, ‘MasVnrArea’,
‘ExterQual’, ‘ExterCond’, ‘Foundation’, ‘BsmtQual’, ‘BsmtCond’,
‘BsmtExposure’, ‘BsmtFinType1’, ‘BsmtFinSF1’, ‘BsmtFinType2’,
‘BsmtFinSF2’, ‘BsmtUnfSF’, ‘TotalBsmtSF’, ‘Heating’, ‘HeatingQC’,
‘CentralAir’, ‘Electrical’, ‘1stFlrSF’, ‘2ndFlrSF’, ‘LowQualFinSF’,
‘GrLivArea’, ‘BsmtFullBath’, ‘BsmtHalfBath’, ‘FullBath’, ‘HalfBath’,
‘BedroomAbvGr’, ‘KitchenAbvGr’, ‘KitchenQual’, ‘TotRmsAbvGrd’,
‘Functional’, ‘Fireplaces’, ‘FireplaceQu’, ‘GarageType’, ‘GarageYrBlt’,
‘GarageFinish’, ‘GarageCars’, ‘GarageArea’, ‘GarageQual’, ‘GarageCond’,
‘PavedDrive’, ‘WoodDeckSF’, ‘OpenPorchSF’, ‘EnclosedPorch’, ‘3SsnPorch’,
‘ScreenPorch’, ‘PoolArea’, ‘PoolQC’, ‘Fence’, ‘MiscFeature’, ‘MiscVal’,
‘MoSold’, ‘YrSold’, ‘SaleType’, ‘SaleCondition’],
dtype=’object’)

In [14]:

df_test, features_test = readData(‘test.csv’)

nf, nf_test = transformFeatures(features, features_test)

In [15]:

nf.shape

Out[15]:

(1460, 288)

In [16]:

nf_test.shape

Out[16]:

(1459, 288)

In [17]:

nf.columns

Out[17]:

Index([‘MSSubClass’, ‘LotFrontage’, ‘LotArea’, ‘OverallQual’, ‘OverallCond’,
‘YearBuilt’, ‘YearRemodAdd’, ‘MasVnrArea’, ‘BsmtFinSF1’, ‘BsmtFinSF2’,
…
‘SaleType_ConLw’, ‘SaleType_New’, ‘SaleType_Oth’, ‘SaleType_WD’,
‘SaleCondition_Abnorml’, ‘SaleCondition_AdjLand’,
‘SaleCondition_Alloca’, ‘SaleCondition_Family’, ‘SaleCondition_Normal’,
‘SaleCondition_Partial’],
dtype=’object’, length=288)

In [18]:

nf[‘Neighborhood_CollgCr’][0]

Out[18]:

1.0

In [21]:

import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.linear_model import LassoCV
import sklearn.cross_validation as cv
import matplotlib.pyplot as plt

def readData(fn):
# read the file
# remove the first id and last price to get the feature data
df = pd.read_csv(fn)

features = df.loc[:, ‘MSSubClass’:’SaleCondition’]

return df, features

def transformFeatures(features, features_test):
all_features = pd.concat([features, features_test])

names = []

# for every feature
for feature in all_features:
# test if its numerical
if all_features[feature].dtype != “object”:
# test if it is too skew
if skew(features[feature].dropna()) > 0.8:
names.append(feature)
# print(‘skewed’)
# print(feature)
# log transformation
all_features[feature] = np.log1p(features[feature])

# convert categority to indicator variable
all_features = pd.get_dummies(all_features)

# print(all_features.head(10))
# replace empty value with mean
all_features = all_features.fillna(all_features.mean())

# split to train and test feature
features = all_features[:features.shape[0]]
features_test = all_features[features.shape[0]:]

return features, features_test

def modelTraining(features, target):
linear_model = LassoCV(alphas=[0.00001, 0.0001, 0.001, 0.01, 0.1], max_iter=10000).fit(features, target)

# print selected alpha
# print(linear_model.alpha_)

# the cross validation score
crossScore = cv.cross_val_score(linear_model, features, target, scoring=”mean_squared_error”)

return linear_model, linear_model.alpha_, crossScore

def getTopCoefficient(linear_model):
model_coefficient = pd.Series(linear_model.coef_, index=features.columns)

# non-zero coefficient features are selected
numSelect = sum(model_coefficient != 0)
removed = model_coefficient.shape[0] – numSelect
print(“select %d variables and remove %d variables” % (numSelect, removed))

# smallest 10
neg10 = model_coefficient.sort_values(ascending=True).head(10)
# largest 10
pos10 = model_coefficient.sort_values(ascending=False).head(10)

# print(neg10)
# print(pos10)

topCoefficent = pd.concat([pos10, neg10])

return topCoefficent

def modelTesting(linear_model, features, features_test):
# prediction on train data
train_pred = np.expm1(linear_model.predict(features))
pd.DataFrame({“pred”: train_pred, “correct”: df[“SalePrice”]}).to_csv(“compare.csv”,
index=False, header=True)
# compute the relative error
meanRela = np.zeros_like(train_pred)
for i in range(meanRela.shape[0]):
meanRela[i] = np.abs(train_pred[i] – df[“SalePrice”][i]) / df[“SalePrice”][i]

# print the mean relative error
print(np.mean(meanRela))

# save train prediction to file
outFrameTrain = pd.DataFrame({“id”: df.Id, “SalePrice”: train_pred})
outFrameTrain.to_csv(“train_pred.csv”, index=False, header=True)

# do prediction on test data and save to file
test_pred = np.expm1(linear_model.predict(features_test))
outFrame = pd.DataFrame({“id”: df_test.Id, “SalePrice”: test_pred})
outFrame.to_csv(“test_pred.csv”, index=False, header=True)

df, features = readData(‘train.csv’)
df_test, features_test = readData(‘test.csv’)

# log transform skewed features, convert categority to indicator variable,
# replance empty value by mean
features, features_test = transformFeatures(features, features_test)

# transfom sale price
target = np.log1p(df[“SalePrice”])
linear_model, alpha, crossScore = modelTraining(features, target)
topCoefficent = getTopCoefficient(linear_model)

select 79 variables and remove 209 variables

In [22]:

topCoefficent

Out[22]:

GrLivArea 0.369302
Neighborhood_Crawfor 0.087217
Neighborhood_StoneBr 0.077048
LotArea 0.071276
Neighborhood_NoRidge 0.063047
Exterior1st_BrkFace 0.060827
OverallQual 0.059689
Neighborhood_NridgHt 0.057854
Functional_Typ 0.056103
Condition1_Norm 0.052286
MSZoning_C (all) -0.202997
SaleCondition_Abnorml -0.050854
Neighborhood_Edwards -0.049865
MSZoning_RM -0.038466
CentralAir_N -0.030392
LandContour_Bnk -0.030336
BsmtExposure_No -0.014004
SaleType_WD -0.013042
MSSubClass -0.012221
ExterQual_TA -0.010308
dtype: float64

In [23]:

topCoefficent[‘GrLivArea’]

Out[23]:

0.36930151600431799

In [ ]:

Related Posts