Example for Linear Regression

5. Example for Linear Regression

  • story (Procedure of applied ML)

    • Preparation

      • Make the purpose (goal) clear.

      • Make the task concrete.

      • Check whether it is possible to replace the existing services.

    • Prepare the dataset

    • Select a model

    • Iterate on training, evaluation, and tuning

  • ref.

# Prepare the dataset
# Load the diabetes dataset through scikit-learn's dataset loader
# ref. https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset

from sklearn import datasets
diabetes = datasets.load_diabetes()

orig_X = diabetes.data # get the feature matrix (input dataset), one row per sample
print(type(orig_X))    # check the type
print(orig_X.shape)    # check the size of the feature matrix: (samples, features)
print(orig_X[:5])      # display the first 5 samples
print(diabetes.feature_names)    # display the name of each feature column
<class 'numpy.ndarray'>
(442, 10)
[[ 0.03807591  0.05068012  0.06169621  0.02187235 -0.0442235  -0.03482076
  -0.04340085 -0.00259226  0.01990842 -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 -0.02632783 -0.00844872 -0.01916334
   0.07441156 -0.03949338 -0.06832974 -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 -0.00567061 -0.04559945 -0.03419447
  -0.03235593 -0.00259226  0.00286377 -0.02593034]
 [-0.08906294 -0.04464164 -0.01159501 -0.03665645  0.01219057  0.02499059
  -0.03603757  0.03430886  0.02269202 -0.00936191]
 [ 0.00538306 -0.04464164 -0.03638469  0.02187235  0.00393485  0.01559614
   0.00814208 -0.00259226 -0.03199144 -0.04664087]]
['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
# Convert the np.ndarray to a pd.DataFrame with named columns for readability

import pandas as pd
df = pd.DataFrame(orig_X, columns=diabetes.feature_names)
df.head()  # preview the first rows of the table
age sex bmi bp s1 s2 s3 s4 s5 s6
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019908 -0.017646
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068330 -0.092204
2 0.085299 0.050680 0.044451 -0.005671 -0.045599 -0.034194 -0.032356 -0.002592 0.002864 -0.025930
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022692 -0.009362
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031991 -0.046641
# Here, use just one feature ('bmi') to keep the exercise simple.
# The column is looked up by name rather than by a hard-coded position,
# so the selection does not depend on remembering the column order.
import numpy as np
bmi_index = diabetes.feature_names.index('bmi')
# np.newaxis keeps X two-dimensional (one column), instead of collapsing
# the selected column into a 1-D vector.
X = diabetes.data[:, np.newaxis, bmi_index]
print(X.shape)
print(X[:5])
(442, 1)
[[ 0.06169621]
 [-0.05147406]
 [ 0.04445121]
 [-0.01159501]
 [-0.03638469]]
# Target vector (the value to predict), one entry per sample
y = diabetes.target
print(y.shape)
print(y[:5])
(442,)
[151.  75. 141. 206. 135.]
# Split the dataset into a training set and a testing set.
# This is a plain positional split: the first `num_of_training` samples are
# used for training and everything after them is held out for testing.
num_of_training = 400

X_train, X_test = X[:num_of_training], X[num_of_training:]
y_train, y_test = y[:num_of_training], y[num_of_training:]

# Show the shape of each part, in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)
(400, 1)
(42, 1)
(400,)
(42,)
# Select a model
# => Linear Regression

from sklearn import linear_model

# Create the linear regression object and train it on the training set
# (fit() returns the fitted estimator, so both steps can be chained)
regr = linear_model.LinearRegression().fit(X_train, y_train)

# Make predictions for the held-out testing set
predicted = regr.predict(X_test)

# Show the predictions (left column) next to the true answers (right column)
print(np.column_stack((predicted, y_test)))
[[196.51241167 175.        ]
 [109.98667708  93.        ]
 [121.31742804 168.        ]
 [245.95568858 275.        ]
 [204.75295782 293.        ]
 [270.67732703 281.        ]
 [ 75.99442421  72.        ]
 [241.8354155  140.        ]
 [104.83633574 189.        ]
 [141.91879342 181.        ]
 [126.46776938 209.        ]
 [208.8732309  136.        ]
 [234.62493762 261.        ]
 [152.21947611 113.        ]
 [159.42995399 131.        ]
 [161.49009053 174.        ]
 [229.47459628 257.        ]
 [221.23405012  55.        ]
 [129.55797419  84.        ]
 [100.71606266  42.        ]
 [118.22722323 146.        ]
 [168.70056841 212.        ]
 [227.41445974 233.        ]
 [115.13701842  91.        ]
 [163.55022706 111.        ]
 [114.10695016 152.        ]
 [120.28735977 120.        ]
 [158.39988572  67.        ]
 [237.71514243 310.        ]
 [121.31742804  94.        ]
 [ 98.65592612 183.        ]
 [123.37756458  66.        ]
 [205.78302609 173.        ]
 [ 95.56572131  72.        ]
 [154.27961264  49.        ]
 [130.58804246  64.        ]
 [ 82.17483382  48.        ]
 [171.79077322 178.        ]
 [137.79852034 104.        ]
 [137.79852034 132.        ]
 [190.33200206 220.        ]
 [ 83.20490209  57.        ]]
# Check the differences (errors): true value minus prediction,
# so a positive entry means the model under-predicted that sample
print(y_test - predicted)
[ -21.51241167  -16.98667708   46.68257196   29.04431142   88.24704218
   10.32267297   -3.99442421 -101.8354155    84.16366426   39.08120658
   82.53223062  -72.8732309    26.37506238  -39.21947611  -28.42995399
   12.50990947   27.52540372 -166.23405012  -45.55797419  -58.71606266
   27.77277677   43.29943159    5.58554026  -24.13701842  -52.55022706
   37.89304984   -0.28735977  -91.39988572   72.28485757  -27.31742804
   84.34407388  -57.37756458  -32.78302609  -23.56572131 -105.27961264
  -66.58804246  -34.17483382    6.20922678  -33.79852034   -5.79852034
   29.66799794  -26.20490209]
# Absolute error of every test sample, computed once and reused below
abs_errors = np.abs(y_test - predicted)

# Check the sum of absolute error
total_abs_error = sum(abs_errors)
print(total_abs_error)

# MAE (Mean Absolute Error): the total divided by the number of test samples
print(total_abs_error / len(X_test))
1890.1633693291935
45.00388974593318
# Plot testing set and predictions
# (IPython magic: render matplotlib figures inline in the notebook)
%matplotlib inline
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
# True test targets as black points
ax.scatter(X_test, y_test,  color='black', label='test data')
# Model predictions for the same bmi values, drawn as a blue line
ax.plot(X_test, predicted, color='blue', linewidth=3, label='predicted')
ax.legend(loc='best', fontsize=20)
ax.set_xlabel('bmi', fontsize=20)
ax.set_ylabel('disease progression', fontsize=20)
plt.show()
../_images/regression_diabetes_9_0.png
# Check the model (obtained parameters)
# coef_ holds one fitted coefficient per input feature (only 'bmi' here);
# intercept_ is the fitted constant term
print('coefficients: ', regr.coef_)
print('intercept: ', regr.intercept_)
coefficients:  [955.70303385]
intercept:  153.00018395675963