5. Example for Linear Regression
Story (the procedure of applied ML):

- Preparation
  - Make the purpose (goal) clear.
  - Make the task concrete.
  - Check whether existing services could solve the problem instead.
- Prepare the dataset
- Select a model
- Repeat training, evaluation, and tuning
# Prepare the dataset
# Load the diabetes dataset
# ref. https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset
from sklearn import datasets
diabetes = datasets.load_diabetes()
orig_X = diabetes.data          # get the feature matrix (input dataset)
print(type(orig_X))             # check the type
print(orig_X.shape)             # check the size of the feature matrix
print(orig_X[:5])               # display the first 5 samples
print(diabetes.feature_names)   # display the names of the features
<class 'numpy.ndarray'>
(442, 10)
[[ 0.03807591 0.05068012 0.06169621 0.02187235 -0.0442235 -0.03482076
-0.04340085 -0.00259226 0.01990842 -0.01764613]
[-0.00188202 -0.04464164 -0.05147406 -0.02632783 -0.00844872 -0.01916334
0.07441156 -0.03949338 -0.06832974 -0.09220405]
[ 0.08529891 0.05068012 0.04445121 -0.00567061 -0.04559945 -0.03419447
-0.03235593 -0.00259226 0.00286377 -0.02593034]
[-0.08906294 -0.04464164 -0.01159501 -0.03665645 0.01219057 0.02499059
-0.03603757 0.03430886 0.02269202 -0.00936191]
[ 0.00538306 -0.04464164 -0.03638469 0.02187235 0.00393485 0.01559614
0.00814208 -0.00259226 -0.03199144 -0.04664087]]
['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
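Note that the features have already been preprocessed: according to the scikit-learn documentation, each of the 10 columns has been mean-centered and scaled so that its sum of squares equals 1. A minimal sketch to verify this, reusing orig_X from the cell above:

# Verify the preprocessing described in the scikit-learn documentation:
# each column is mean-centered and scaled so that its sum of squares is 1
import numpy as np
print(orig_X.mean(axis=0).round(6))        # approximately 0 for every column
print((orig_X ** 2).sum(axis=0).round(6))  # approximately 1 for every column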
# Convert the np.ndarray to a pd.DataFrame for readability
import pandas as pd
df = pd.DataFrame(orig_X, columns=diabetes.feature_names)
df.head()
|   | age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 |
|---|-----|-----|-----|----|----|----|----|----|----|----|
| 0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019908 | -0.017646 |
| 1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068330 | -0.092204 |
| 2 | 0.085299 | 0.050680 | 0.044451 | -0.005671 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002864 | -0.025930 |
| 3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022692 | -0.009362 |
| 4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031991 | -0.046641 |
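As an aside, scikit-learn 0.23 and later can return pandas objects directly via the as_frame option, which makes the manual conversion above unnecessary; a minimal sketch:

# Load the dataset as pandas objects directly (requires scikit-learn >= 0.23)
from sklearn import datasets
diabetes_frame = datasets.load_diabetes(as_frame=True)
print(type(diabetes_frame.data))  # <class 'pandas.core.frame.DataFrame'>
print(diabetes_frame.data.head())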
# Here, use just one feature, 'bmi', as a simple exercise
import numpy as np
X = diabetes.data[:, np.newaxis, 2]  # column index 2 is 'bmi'; np.newaxis keeps X two-dimensional
print(X.shape)
print(X[:5])
(442, 1)
[[ 0.06169621]
[-0.05147406]
[ 0.04445121]
[-0.01159501]
[-0.03638469]]
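The slice with np.newaxis keeps X two-dimensional, because scikit-learn estimators expect a 2-D feature matrix. An equivalent sketch using reshape, reusing the variables above:

# An equivalent way to extract the 'bmi' column as a (442, 1) matrix
X_alt = diabetes.data[:, 2].reshape(-1, 1)
print(np.array_equal(X, X_alt))  # True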
y = diabetes.target  # target: a quantitative measure of disease progression one year after baseline
print(y.shape)
print(y[:5])
(442,)
[151. 75. 141. 206. 135.]
# Split the dataset into training and testing sets
num_of_training = 400  # use the first 400 samples for training and the remaining 42 for testing
X_train = X[:num_of_training]
X_test = X[num_of_training:]
y_train = y[:num_of_training]
y_test = y[num_of_training:]
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(400, 1)
(42, 1)
(400,)
(42,)
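This split simply takes the first 400 samples for training; in practice a shuffled split is usually preferred. A minimal sketch with scikit-learn's train_test_split (the random_state value is an arbitrary choice):

# A shuffled alternative to the manual slicing above
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=42, random_state=0)
print(X_tr.shape, X_te.shape)  # (400, 1) (42, 1)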
# Select a model
# => Linear Regression
from sklearn import linear_model
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training set
regr.fit(X_train, y_train)
# Make predictions using the testing set
predicted = regr.predict(X_test)
# Compare the predictions with the true answers (left: predicted, right: actual)
print(np.c_[predicted, y_test])
[[196.51241167 175. ]
[109.98667708 93. ]
[121.31742804 168. ]
[245.95568858 275. ]
[204.75295782 293. ]
[270.67732703 281. ]
[ 75.99442421 72. ]
[241.8354155 140. ]
[104.83633574 189. ]
[141.91879342 181. ]
[126.46776938 209. ]
[208.8732309 136. ]
[234.62493762 261. ]
[152.21947611 113. ]
[159.42995399 131. ]
[161.49009053 174. ]
[229.47459628 257. ]
[221.23405012 55. ]
[129.55797419 84. ]
[100.71606266 42. ]
[118.22722323 146. ]
[168.70056841 212. ]
[227.41445974 233. ]
[115.13701842 91. ]
[163.55022706 111. ]
[114.10695016 152. ]
[120.28735977 120. ]
[158.39988572 67. ]
[237.71514243 310. ]
[121.31742804 94. ]
[ 98.65592612 183. ]
[123.37756458 66. ]
[205.78302609 173. ]
[ 95.56572131 72. ]
[154.27961264 49. ]
[130.58804246 64. ]
[ 82.17483382 48. ]
[171.79077322 178. ]
[137.79852034 104. ]
[137.79852034 132. ]
[190.33200206 220. ]
[ 83.20490209 57. ]]
# Check the differences (errors)
print(y_test - predicted)
[ -21.51241167 -16.98667708 46.68257196 29.04431142 88.24704218
10.32267297 -3.99442421 -101.8354155 84.16366426 39.08120658
82.53223062 -72.8732309 26.37506238 -39.21947611 -28.42995399
12.50990947 27.52540372 -166.23405012 -45.55797419 -58.71606266
27.77277677 43.29943159 5.58554026 -24.13701842 -52.55022706
37.89304984 -0.28735977 -91.39988572 72.28485757 -27.31742804
84.34407388 -57.37756458 -32.78302609 -23.56572131 -105.27961264
-66.58804246 -34.17483382 6.20922678 -33.79852034 -5.79852034
29.66799794 -26.20490209]
# Check the sum of absolute errors
print(sum(np.abs(y_test - predicted)))
# MAE (Mean Absolute Error): the average absolute error per test sample
print(sum(np.abs(y_test - predicted)) / len(X_test))
1890.1633693291935
45.00388974593318
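The same metric, and related ones, are available in sklearn.metrics, which avoids hand-rolled formulas; a minimal sketch reusing y_test and predicted:

# Compute standard regression metrics with sklearn.metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print(mean_absolute_error(y_test, predicted))  # same value as the manual MAE above
print(mean_squared_error(y_test, predicted))   # MSE penalizes large errors more strongly
print(r2_score(y_test, predicted))             # coefficient of determination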
# Plot the testing set and the predictions
%matplotlib inline
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.scatter(X_test, y_test, color='black', label='test data')
ax.plot(X_test, predicted, color='blue', linewidth=3, label='predicted')
ax.legend(loc='best', fontsize=20)
ax.set_xlabel('bmi', fontsize=20)
ax.set_ylabel('disease progression', fontsize=20)
plt.show()
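A residual plot is another common sanity check: if the linear model fits well, the residuals should scatter around zero with no obvious pattern. A minimal sketch reusing the variables above:

# Plot the residuals (errors) against the predicted values
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.scatter(predicted, y_test - predicted, color='black')
ax.axhline(0, color='blue', linewidth=1)
ax.set_xlabel('predicted', fontsize=20)
ax.set_ylabel('residual (y_test - predicted)', fontsize=20)
plt.show()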
# Check the model (obtained parameters)
print('coefficients: ', regr.coef_)
print('intercept: ', regr.intercept_)
coefficients: [955.70303385]
intercept: 153.00018395675963
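In other words, the fitted model is approximately predicted = 955.70 * bmi + 153.00. As a final check, regr.predict can be reproduced by hand from coef_ and intercept_; a minimal sketch:

# Reproduce the predictions manually from the learned parameters
manual = X_test @ regr.coef_ + regr.intercept_
print(np.allclose(manual, predicted))  # True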