import scipy as sp
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, precision_score, recall_score, log_loss
from sklearn.linear_model import LinearRegression, Ridge
import sklearn
Some data
df = pd.read_csv("https://rcambier.github.io/blog/assets/california_housing_train.csv")
df = df[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']]
df
Linear Regression
scaled_df = (df - df.min()) / (df.max() - df.min())
X = scaled_df[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']].values
y = scaled_df['median_house_value'].values

X_with_intercept = np.hstack((np.ones((len(X), 1)), X))
B = np.linalg.inv(X_with_intercept.T @ X_with_intercept) @ (X_with_intercept.T @ y.reshape(-1, 1))
print("Manual weights: ", B.reshape(-1))
print("Manual score: ", r2_score(y, (X_with_intercept @ B).reshape(-1)))
Manual weights: [-0.07556544 0.19769139 -1.56087573 1.32234017 -2.57610401 1.59516284
1.43606576]
Manual score: 0.5713482748283873
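For reference, the cell above is just the normal equation written with numpy (B stacks the intercept and the feature weights, and X is the design matrix with a leading column of ones):

$$ B = (X^\top X)^{-1} X^\top y $$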
from sklearn.metrics import r2_score

RSS = (((X_with_intercept @ B).reshape(-1) - y)**2).sum()  # Squared distance from our new regression line
TSS = ((y.mean() - y)**2).sum()  # Squared distance from the mean
r2 = 1 - RSS / TSS  # How much error did we remove? Are we closer to the actual values than the mean is?
r2, r2_score(y, (X_with_intercept @ B).reshape(-1))
(0.5713482748283873, 0.5713482748283873)
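In equation form, with the predictions written as ŷ and the mean of y as ȳ, this is:

$$ R^2 = 1 - \frac{RSS}{TSS} = 1 - \frac{\sum_i (\hat{y}_i - y_i)^2}{\sum_i (\bar{y} - y_i)^2} $$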
Let's compare these results with sklearn's linear regression.
lr = LinearRegression().fit(X, y)
print("")
print("Sklearn weights: ", [lr.intercept_] + lr.coef_.tolist() )
print("Sklearn score: ", r2_score(y, lr.predict(X)))
Sklearn weights: [-0.07556543642855085, 0.19769138728528463, -1.560875734209466, 1.3223401715433833, -2.5761040065353327, 1.5951628411047127, 1.4360657609756604]
Sklearn score: 0.5713482748283873
Linear regression with regularization (Ridge regression)
Regularization means adding to the loss a term that depends on the weight values. That way the weights are pushed to stay small, which helps avoid overfitting.
Let's look at the ordinary least squares loss and then add the square of each weight to build the regularized loss. Adding the square of each weight gives the Ridge regression loss; adding the absolute value of each weight instead would give the Lasso regression loss.
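Written out, with λ as the regularization strength (the code below uses λ = 0.3), the three losses are:

$$ L_{OLS}(B) = \lVert XB - y \rVert_2^2, \qquad L_{Ridge}(B) = \lVert XB - y \rVert_2^2 + \lambda \lVert B \rVert_2^2, \qquad L_{Lasso}(B) = \lVert XB - y \rVert_2^2 + \lambda \lVert B \rVert_1 $$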
loss = (((X_with_intercept @ B) - y.reshape(-1, 1)).T @ ((X_with_intercept @ B) - y.reshape(-1, 1))).reshape(-1)  # residual sum of squares
regularized_loss = loss + 0.3 * (B.T @ B).reshape(-1)  # add the squared-weight penalty (Ridge)
loss, regularized_loss
Adding this term to the loss changes the closed-form formula for the weights in the following way.
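As a sketch of the math the next cell implements, the ridge solution replaces XᵀX with XᵀX + λI (with the top-left entry of I zeroed out so the intercept is not penalized):

$$ B = (X^\top X + \lambda I)^{-1} X^\top y $$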
scaled_df = (df - df.min()) / (df.max() - df.min())
X = scaled_df[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']].values
y = scaled_df['median_house_value'].values

X_with_intercept = np.hstack((np.ones((len(X), 1)), X))

I = np.identity(X_with_intercept.shape[1])
I[0, 0] = 0  # do not regularize the intercept
B = np.linalg.inv(X_with_intercept.T @ X_with_intercept + 0.3 * I) @ (X_with_intercept.T @ y.reshape(-1, 1))
print("Manual weights: ", B.reshape(-1))
print("Manual score: ", r2_score(y, (X_with_intercept @ B).reshape(-1)))
Manual weights: [-0.07457501 0.19926227 -1.4614579 1.30386275 -2.31228351 1.40463349
1.42708759]
Manual score: 0.5710213053584052
lr = Ridge(alpha=0.3).fit(X, y)
print("")
print("Sklearn weights: ", [lr.intercept_] + lr.coef_.tolist() )
print("Sklearn score: ", r2_score(y, lr.predict(X)))
Sklearn weights: [-0.07457500943073836, 0.19926227134208827, -1.4614578956147966, 1.3038627486538301, -2.3122835137561593, 1.4046334910837222, 1.4270875901070914]
Sklearn score: 0.5710213053584055
Logistic Regression
For logistic regression, we combine the X values in the same way, but we add a sigmoid transform at the end in order to map the output to values between 0 and 1.
We can no longer use the normal equation to compute the weights; we have to resort to other techniques like gradient descent.
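A sketch of the model and loss the code below implements, where ŷ = σ(XB) and the loss is the average binary cross-entropy:

$$ \sigma(z) = \frac{1}{1 + e^{-z}}, \qquad L(B) = -\frac{1}{N} \sum_i \left[ y_i \log \hat{y}_i + (1 - y_i) \log(1 - \hat{y}_i) \right] $$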
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def log_likelihood(y_hat, y_true):
# Being far away from the correct class is penalized heavily.
return - np.mean( y_true * np.log(y_hat) + (1-y_true) * np.log(1-y_hat) )
def gradient_sigmoid(x):
    return sigmoid(x) * (1 - sigmoid(x))

def gradients(X, y, y_hat):
    # Loss = -(y * log(h) + (1 - y) * log(1 - h))
    # where h = sigmoid(z)
    # and z = X @ B
    # deriv_loss_to_h = -(y / h - (1 - y) / (1 - h)) = (h - y) / (h * (1 - h))
    # deriv_h_to_z = h * (1 - h)
    # deriv_z_to_b = X.T
    # Through the chain rule, the final derivative is
    # deriv_z_to_b @ (deriv_loss_to_h * deriv_h_to_z) = X.T @ (h - y) = X.T @ (y_hat - y)
    dw = (1/len(X)) * (X.T @ (y_hat - y))
    return dw
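This gives the gradient and the update rule used in the training loop below (with learning rate η = 0.3):

$$ \nabla_B L = \frac{1}{N} X^\top (\hat{y} - y), \qquad B \leftarrow B - \eta \, \nabla_B L $$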
df['median_house_value_cat'] = (df['median_house_value'] > 150_000).astype(int)

scaled_df = (df - df.min()) / (df.max() - df.min())
X = scaled_df[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']].values
y = df['median_house_value_cat'].values

X_with_intercept = np.hstack((np.ones((len(X), 1)), X))

B = np.random.normal(0, 0.1, (7, 1))

for i in range(50_000):
    y_hat = sigmoid(X_with_intercept @ B).reshape(-1)
    if i % 5000 == 0 or i == 0:
        print("loss: ", log_likelihood(y_hat, y))
    deltas = gradients(X_with_intercept, y, y_hat)
    B -= 0.3 * deltas.reshape(-1, 1)

lr = sklearn.linear_model.LogisticRegression().fit(X, y)
loss: 0.6821755251690551
loss: 0.46252854158211454
loss: 0.4541283502467595
loss: 0.4510517001241438
loss: 0.4487185438489886
loss: 0.4466409644195144
loss: 0.44474208933519344
loss: 0.4429996946665173
loss: 0.4413999204051543
loss: 0.43993073757337586
print("Manual weights: ", B.reshape(-1))
print("Manual score: ",
@ B).reshape(-1) > 0.5).astype(int) ),
precision_score(y, (sigmoid(X_with_intercept @ B).reshape(-1) > 0.5).astype(int) ),
recall_score(y, (sigmoid(X_with_intercept
)print()
print("Sklearn log loss: ", log_loss(y, (sigmoid(X_with_intercept @ B).reshape(-1))))
print("Sklearn weights: ", lr.intercept_.tolist() + lr.coef_.reshape(-1).tolist())
print("Sklearn score",
precision_score(y, lr.predict(X)),
recall_score(y, lr.predict(X)) )
Manual weights: [ -4.74650191 2.09565859 -11.5802056 7.08212211 -3.00163538
8.00035044 18.85028779]
Manual score: 0.8215249055925193 0.8514583915758084
Manual log loss: 0.43866521703018374
Sklearn weights: [-4.385273936252051, 1.9603353158729624, -10.78106293024599, 6.882196980429938, -2.868679885031378, 7.251350300146187, 17.41000987846787]
Sklearn score 0.8186773905272565 0.8536949026185817
The weights are not exactly the same, but the performance is very similar. The differences come from the random initialization and the finite number of gradient-descent steps, and also from the fact that sklearn's LogisticRegression applies L2 regularization by default, which shrinks its weights slightly.
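As a rough check of that last point (a sketch, not part of the original notebook; penalty=None requires scikit-learn 1.2 or newer), refitting without the default penalty should bring sklearn's weights closer to the manually trained ones:

# Hypothetical check: refit without the default L2 penalty (requires scikit-learn >= 1.2)
lr_unreg = sklearn.linear_model.LogisticRegression(penalty=None, max_iter=5000).fit(X, y)
print("Unregularized sklearn weights: ", lr_unreg.intercept_.tolist() + lr_unreg.coef_.reshape(-1).tolist())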