Saturday, August 1, 2020

Machine Learning || Predictive Analytics


Let's practice some multiple regression on a car sales dataset today.

1. Importing the libraries and loading the data
# For this practical example we will need the following libraries and modules
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set()

# Load the data from a .csv in the same folder
raw_data = pd.read_csv('1.04. Real-life example.csv')
# Let's explore the top 5 rows of the df
raw_data.head()


   Brand          Price    Body       Mileage  EngineV  Engine Type  Registration  Year  Model
0  BMW            4200.0   sedan      277      2.0      Petrol       yes           1991  320
1  Mercedes-Benz  7900.0   van        427      2.9      Diesel       yes           1999  Sprinter 212
2  Mercedes-Benz  13300.0  sedan      358      5.0      Gas          yes           2003  S 500
3  Audi           23000.0  crossover  240      4.2      Petrol       yes           2007  Q7
4  Toyota         18300.0  crossover  120      2.0      Petrol       yes           2011  Rav 4


Preprocessing
We know that 'Model' is largely determined by the combination of the other features (Brand, Body, EngineV, etc.), so we can drop it without losing much information.
data = raw_data.drop(['Model'],axis=1)
# Let's check the descriptives without 'Model'
data.describe(include='all')

Next, we remove the observations with missing values.
data_no_mv = data.dropna(axis=0)
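
Before moving on, it can help to see how much data dropna() actually discarded. A minimal sketch, using only the frames defined above:

# Missing values per column, before dropping
data.isnull().sum()
# Number of observations removed by dropna()
len(data) - len(data_no_mv)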


Next, let's deal with the outliers.

# Outliers are a big issue for OLS, so we must deal with them in some way
# Let's declare a variable equal to the 99th percentile of the 'Price' variable
q = data_no_mv['Price'].quantile(0.99)
# Then we create a new df with the condition that all prices must be below the 99th percentile of 'Price'
data_1 = data_no_mv[data_no_mv['Price']<q]
# In this way we have essentially removed the top 1% of observations by 'Price'
data_1.describe(include='all')
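
To see what this cut does to the distribution, one can plot 'Price' before and after removing the top 1% (using the same distplot call the post uses for 'Mileage' below):

# 'Price' before the cut -- typically a long right tail of very expensive cars
sns.distplot(data_no_mv['Price'])
plt.show()
# 'Price' after the cut -- the tail is much shorter
sns.distplot(data_1['Price'])
plt.show()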

# We apply the same idea to 'Mileage', again cutting off the top 1%
q = data_1['Mileage'].quantile(0.99)
data_2 = data_1[data_1['Mileage']<q]
sns.distplot(data_2['Mileage'])

# A simple Google search can indicate the natural domain of this variable
# Car engine volumes are usually (always?) below 6.5l
# This is a prime example of the fact that a domain expert (a person working in the car industry)
# may find it much easier to determine problems with the data than an outsider
data_3 = data_2[data_2['EngineV']<6.5]
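
A quick way to spot such implausible values, before settling on a cutoff, is to look at the top of the variable's range. A minimal sketch:

# The largest engine volumes -- anything far above ~6.5l is almost certainly a data-entry problem
data_2['EngineV'].sort_values(ascending=False).head(10)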

# Finally, the oldest cars (bottom 1% of 'Year') behave like outliers, so we remove them, too
q = data_3['Year'].quantile(0.01)
data_4 = data_3[data_3['Year']>q]
sns.distplot(data_4['Year'])


With the outliers handled, we reset the index so it no longer reflects the dropped rows.
data_cleaned = data_4.reset_index(drop=True)
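
It is also worth checking how much of the original data survived the cleaning; a minimal sketch comparing the cleaned frame against the raw one:

# Share of the original observations remaining after dropping missing values and outliers
len(data_cleaned) / len(raw_data)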


# We could simply call plt.scatter() for each pair of variables
# But since 'Price' is the y-axis of all three plots, it makes sense to place them side by side so we can compare them
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize =(15,3)) #sharey -> share 'Price' as y
ax1.scatter(data_cleaned['Year'],data_cleaned['Price'])
ax1.set_title('Price and Year')
ax2.scatter(data_cleaned['EngineV'],data_cleaned['Price'])
ax2.set_title('Price and EngineV')
ax3.scatter(data_cleaned['Mileage'],data_cleaned['Price'])
ax3.set_title('Price and Mileage')

plt.show()



# None of these relationships looks linear; the patterns look exponential rather than linear
# A log transformation of the target is a common remedy, so let's transform 'Price' with np.log
log_price = np.log(data_cleaned['Price'])
# Then we add it to our data frame
data_cleaned['log_price'] = log_price
data_cleaned

# Let's check the three scatters once again
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize =(15,3))
ax1.scatter(data_cleaned['Year'],data_cleaned['log_price'])
ax1.set_title('Log Price and Year')
ax2.scatter(data_cleaned['EngineV'],data_cleaned['log_price'])
ax2.set_title('Log Price and EngineV')
ax3.scatter(data_cleaned['Mileage'],data_cleaned['log_price'])
ax3.set_title('Log Price and Mileage')

plt.show()
# After the transformation, the relationships look much more linear
# This is good linear regression material
# Alternatively, we could have transformed each of the independent variables instead (a sketch of that follows)
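
As a sketch of that alternative (not what we will use below), one could log-transform an independent variable such as 'Mileage' instead and inspect its scatter against the untransformed price; np.log1p is used here because 'Mileage' can be 0:

# Hypothetical alternative: transform an independent variable instead of the target
log_mileage = np.log1p(data_cleaned['Mileage'])
plt.scatter(log_mileage, data_cleaned['Price'])
plt.xlabel('log(1 + Mileage)')
plt.ylabel('Price')
plt.show()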

Multicollinearity
# sklearn does not have a built-in way to check for multicollinearity
# one of the main reasons is that this issue is well covered in statistical frameworks rather than ML ones
# it is an issue nonetheless, so we will deal with it here
# Here's the relevant module
# full documentation: http://www.statsmodels.org/dev/_modules/statsmodels/stats/outliers_influence.html#variance_inflation_factor
from statsmodels.stats.outliers_influence import variance_inflation_factor
# To make this as easy as possible to use, we declare a variable holding the features we want to check
# since our categorical data is not yet preprocessed, we will only take the numerical ones
variables = data_cleaned[['Mileage','Year','EngineV']]
# we create a new data frame which will include all the VIFs
# note that each variable has its own variance inflation factor as this measure is variable specific (not model specific)
vif = pd.DataFrame()
# here we make use of the variance_inflation_factor, which will basically output the respective VIFs
vif["VIF"] = [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])]
# Finally, I like to include names so it is easier to explore the result
vif["Features"] = variables.columns


         VIF Features
0   3.791584  Mileage
1  10.354854     Year
2   7.662068  EngineV

# A common rule of thumb is that VIFs above 10 (some say above 5) signal problematic multicollinearity
# Since 'Year' has the highest VIF, I will remove it from the model
# This will drive the VIF of the other variables down
# So even though 'EngineV' also seems to have a high VIF, once 'Year' is gone that will no longer be the case
data_no_multicollinearity = data_cleaned.drop(['Year'],axis=1)

# Create dummies for the categorical variables; drop_first=True avoids the dummy variable trap
data_with_dummies = pd.get_dummies(data_no_multicollinearity, drop_first=True)
# From here on, this fully preprocessed frame is referred to as 'data_preprocessed'
data_preprocessed = data_with_dummies
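
To see exactly what get_dummies produced, one can list the columns of the resulting frame; a minimal sketch:

# Each categorical level (minus the dropped reference category) becomes its own 0/1 column
data_with_dummies.columns.values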


Linear regression model

# The target (dependent variable) is 'log_price'
targets = data_preprocessed['log_price']
# The inputs are everything BUT the dependent variable, so we can simply drop it
inputs = data_preprocessed.drop(['log_price'],axis=1)

# Import the scaling module
from sklearn.preprocessing import StandardScaler
# Create a scaler object
scaler = StandardScaler()
# Fit the inputs (calculate the mean and standard deviation feature-wise)
scaler.fit(inputs)

inputs_scaled = scaler.transform(inputs)
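
To make explicit what the scaler is doing, here is a small check (a sketch, not part of the original walkthrough) verifying that transform() is simply (x - mean) / std, using the mean_ and scale_ attributes a fitted StandardScaler exposes:

# Standardize manually with the statistics the scaler learned in fit()
manual_scaled = (inputs - scaler.mean_) / scaler.scale_
# Should print True: the manual result matches scaler.transform()
print(np.allclose(manual_scaled.values, inputs_scaled))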

# Import the module for the split
from sklearn.model_selection import train_test_split
# Split the variables with an 80-20 split and some random state
# To have the same split as mine, use random_state = 365
x_train, x_test, y_train, y_test = train_test_split(inputs_scaled, targets, test_size=0.2, random_state=365)

# Create a linear regression object
reg = LinearRegression()
# Fit the regression with the scaled TRAIN inputs and targets
reg.fit(x_train,y_train)
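
To interpret the fitted model, it helps to line up the learned weights with the feature names (keeping in mind they refer to standardized inputs and a log-transformed target). A minimal sketch:

# Intercept (bias) of the regression
reg.intercept_
# Pair each coefficient with its feature name for easier reading
reg_summary = pd.DataFrame(inputs.columns.values, columns=['Features'])
reg_summary['Weights'] = reg.coef_
reg_summary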

y_hat = reg.predict(x_train)

# The simplest way to compare the targets (y_train) and the predictions (y_hat) is to plot them on a scatter plot
# The closer the points to the 45-degree line, the better the prediction
plt.scatter(y_train, y_hat)
# Let's also name the axes
plt.xlabel('Targets (y_train)',size=18)
plt.ylabel('Predictions (y_hat)',size=18)
# Sometimes the plot will have different scales of the x-axis and the y-axis
# This is an issue as we won't be able to interpret the '45-degree line'
# We want the x-axis and the y-axis to be the same
plt.xlim(6,13)
plt.ylim(6,13)
plt.show()
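
Another quick diagnostic is the distribution of the residuals (targets minus predictions); ideally it is roughly normal and centred on zero. A minimal sketch:

# Residuals on the training set: actual log prices minus predicted log prices
sns.distplot(y_train - y_hat)
plt.title('Residuals PDF')
plt.show()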


# Find the R-squared of the model
reg.score(x_train,y_train)

0.744996578792662

The model has an R-squared of roughly 0.745 on the training data, i.e. it explains about 75% of the variability of the log price (note this is not the same thing as "75% accuracy").
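
Since the plain R-squared can only go up as more features are added, the adjusted R-squared is a fairer summary, and the real test is the held-out test set. A minimal sketch of both:

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), penalizing for the number of predictors
r2 = reg.score(x_train, y_train)
n, p = x_train.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(adjusted_r2)
# Performance on data the model has never seen
print(reg.score(x_test, y_test))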
