ML case studies 3 - UCI auto data regression performance evaluation

last updated: 03 Jun 2019

Pre-requisite Reading



Goal

Build a multi-variate linear regression model and a small neural net regressor on the cleaned UCI automobile data, then compare their in-sample and out-of-sample accuracy using residual plots, the R^2 score, and mean squared error (MSE).

Python Libraries

### IMPORT ML LIBRARIES: 
import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.linear_model import LinearRegression
import keras

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error



Data Pre-Processing

### LOAD DATA: 

data_path = '../csv/00-cleaned-up-data.csv'
df = pd.read_csv(data_path)



Feature Selection

### FEATURE SELECTION: 

# keep five predictors plus the price target:
group = [
    'engine-size', 'horsepower', 'city-mpg', 'highway-mpg', 'body-style',
    'price'
]

df = df[group]
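
Of the selected columns, body-style is the only non-numeric feature, so it is the one the one-hot-encoding step below will expand. A quick sanity check (a sketch, assuming the cleaned CSV stores body-style as strings):

# confirm which selected columns are categorical:
print(df.dtypes)
print(df['body-style'].unique())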



One-Hot-Encoding and Normalization

### ONE-HOT-ENCODING: 

# expand the categorical body-style column into indicator columns;
# numeric columns pass through unchanged:
ohe = pd.get_dummies(df)
print(ohe.shape)
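
For intuition, a minimal sketch of what get_dummies does to a hypothetical three-row frame (the values are made up for illustration):

# toy illustration of get_dummies; values are hypothetical:
toy = pd.DataFrame({'body-style': ['sedan', 'hatchback', 'sedan'],
                    'price': [13950, 7609, 16500]})
print(pd.get_dummies(toy))
#    price  body-style_hatchback  body-style_sedan
# 0  13950                     0                 1
# 1   7609                     1                 0
# 2  16500                     0                 1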

### NORMALIZE DATA: 

# split into train and test sets (the original listing omits this step;
# an 80/20 random split is assumed here):
train_data = ohe.sample(frac=0.8, random_state=0)
test_data = ohe.drop(train_data.index)

# separate the target from the features:
train_target = train_data.pop('price')
test_target = test_data.pop('price')

# normalize every feature with statistics computed on the training set
# only, so no information leaks in from the test set:
train_stats = train_data.describe().transpose()
# print(train_stats)

normed_train_data = (train_data - train_stats['mean']) / train_stats['std']
normed_test_data = (test_data - train_stats['mean']) / train_stats['std']

# print(normed_train_data.dtypes)
# print(normed_test_data)
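
A quick check that the scaling behaved as expected: the training features should now have roughly zero mean and unit standard deviation (the test set will deviate slightly, since it was scaled with training statistics):

# sanity check on the normalization:
print(normed_train_data.mean().round(2))
print(normed_train_data.std().round(2))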



Model Initialization and Training


linear regression model

## MULTI-LINEAR REGRESSION: 

# init model:
lm = LinearRegression()

# train model:
lm.fit(normed_train_data, train_target)
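
Because the inputs are normalized, the fitted coefficients are directly comparable across features. A small sketch to inspect them (coef_ and intercept_ are standard scikit-learn attributes):

# inspect the learned weights:
for name, coef in zip(normed_train_data.columns, lm.coef_):
    print(f'{name}: {coef:.2f}')
print(f'intercept: {lm.intercept_:.2f}')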


neural net regression model

## NEURAL NET REGRESSION: 

# MLP model-generating function: a small 32 -> 8 -> 1 funnel with dropout
# between layers and fixed seeds for reproducibility
def build_mlp_model():

    model = keras.Sequential([
        keras.layers.Dense(
            32,
            activation='sigmoid',
            kernel_initializer=keras.initializers.glorot_normal(seed=3),
            input_dim=len(normed_train_data.keys())),
        keras.layers.Dropout(rate=0.25, noise_shape=None, seed=7),
        keras.layers.Dense(8, activation='relu'),
        keras.layers.Dropout(rate=0.001, noise_shape=None, seed=3),
        keras.layers.Dense(1)  # single linear output for regression
    ])

    # Adam with an aggressive initial learning rate tempered by decay;
    # track MAE and MSE during training:
    model.compile(loss='mse',
                  optimizer=keras.optimizers.Adam(lr=0.09,
                                                  beta_1=0.9,
                                                  beta_2=0.999,
                                                  epsilon=None,
                                                  decay=0.03,
                                                  amsgrad=True),
                  metrics=['mae', 'mse'])

    return model


# initialize model and view details:

nn = build_mlp_model()  # standalone Keras model
nn.summary()

# nn model verification: the untrained model should already return one
# prediction per input row

example_batch = normed_train_data[:10]
example_result = nn.predict(example_batch)
print(example_result)
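
The listing above never trains the network, yet the residual and accuracy sections below treat nn as a fitted model. A hedged reconstruction of the missing training step (the epoch count and validation split are assumptions, not from the original):

# train model (assumed step; hyperparameters are guesses):
history = nn.fit(normed_train_data, train_target,
                 epochs=500, validation_split=0.2, verbose=0)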



In-Sample Accuracy


linear model residual

fig: residual plot for linear regression model


neural net residual

fig: residual plot for neural net model


insights

A well-fit model leaves residuals scattered randomly about zero; visible curvature or a funnel shape in either plot points to unmodeled structure or non-constant variance.

### RESIDUALS (in-sample accuracy): 

## MULTI-LINEAR REGRESSION: 
plt.figure(0, figsize=(12, 10))
sns.residplot(lm.predict(normed_train_data), train_target)
plt.savefig('plots/a-resid-plot-multi-lin.png')

## NEURAL NET REGRESSION: 
plt.figure(1, figsize=(12, 10))
sns.residplot(nn.predict(normed_train_data).flatten(), train_target)
plt.savefig('plots/a-resid-plot-nn.png')

### ACCURACY (in-sample test): 

## MULTI-LINEAR REGRESSION: 

print('\nMulti-variate Linear Regression Accuracy Metrics')

# R^2 score:
print('in-sample R^2')
print(r2_score(train_target, lm.predict(normed_train_data)))

# MSE score:
print('in-sample MSE')
print(mean_squared_error(train_target, lm.predict(normed_train_data)))

## NEURAL NET REGRESSION: 

print('\nNeural Net Regression Accuracy Metrics')

# R^2 score:
print('in-sample R^2')
print(r2_score(train_target, nn.predict(normed_train_data).flatten()))

# MSE score:
print('in-sample MSE')
print(mean_squared_error(train_target, nn.predict(normed_train_data).flatten()))
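
For reference, the two scores printed above have their standard definitions, with $y_i$ the true price, $\hat{y}_i$ the model prediction, and $\bar{y}$ the mean of the true prices:

$$R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2},
\qquad
\mathrm{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$$

An R^2 near 1 means the model explains most of the variance in price; MSE is reported in squared dollars.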



Out-of-Sample Accuracy


linear model


neural net


comparison

fig: distribution plot for true (black), linear model (cyan) and neural net predictions (yellow)


insights

The closer a model's predicted-price distribution tracks the true distribution, the better it captures the overall spread of prices; systematic offsets or missing modes show where a model over- or under-predicts.

### ACCURACY (out-of-sample test): 

## MULTI-LINEAR REGRESSION: 

print('\nMulti-variate Linear Regression Accuracy Metrics')

# R^2 score:
print('out-of-sample R^2')
print(r2_score(test_target, lm.predict(normed_test_data)))

# MSE score:
print('out-of-sample MSE')
print(mean_squared_error(test_target, lm.predict(normed_test_data)))

## NEURAL NET REGRESSION: 

print('\nNeural Net Regression Accuracy Metrics')

# R^2 score:
print('out-of-sample R^2')
print(r2_score(test_target, nn.predict(normed_test_data).flatten()))

# MSE score:
print('out-of-sample MSE')
print(mean_squared_error(test_target, nn.predict(normed_test_data).flatten()))
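
MSE is in squared dollars, which is hard to read directly; the square root puts the error back into dollars (a small convenience sketch, reusing the numpy import from the top of the post):

# RMSE in dollars:
print('out-of-sample RMSE (linear):',
      np.sqrt(mean_squared_error(test_target, lm.predict(normed_test_data))))
print('out-of-sample RMSE (neural net):',
      np.sqrt(mean_squared_error(test_target, nn.predict(normed_test_data).flatten())))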

## DISTRIBUTION PLOTS: 

plt.figure(2, figsize=(12, 10))

ax1 = sns.distplot(test_target, hist=False, color="k", label='True Values')

ax2 = sns.distplot(lm.predict(normed_test_data),
                   hist=False,
                   color="c",
                   label='Linear Model Prediction',
                   ax=ax1)

ax3 = sns.distplot(nn.predict(normed_test_data).flatten(),
                   hist=False,
                   color="y",
                   label='Neural Net Prediction',
                   ax=ax1)

plt.title(
    "['engine-size', 'horsepower', 'city-mpg', 'highway-mpg', 'body-style']:Price"
)
plt.xlabel('Price (in dollars)')
plt.ylabel('Density')
plt.savefig('plots/b-dist-lin-nn-compare.png')



References


created: 01 Jun 2019
today's track: We Know (Vintage & Morelli Remix) by Boom Jinx & Sound Prank feat. Katrine Stenbekk