Linear regression & Logistic regression

Linear regression, ridge regression, logistic regression with r2 score from scratch in Python
ai
Published

September 14, 2021

import scipy as sp 
import numpy as np 
import pandas as pd
from sklearn.metrics import r2_score, precision_score, recall_score, log_loss
from sklearn.linear_model import LinearRegression, Ridge
import sklearn

Some data

df = pd.read_csv("https://rcambier.github.io/blog/assets/california_housing_train.csv")
df = df[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']]

Linear Regression

scaled_df = (df - df.min()) / (df.max() - df.min())
X = scaled_df[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']].values
y = scaled_df['median_house_value'].values
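
As a side note, this min-max scaling is the same transformation as sklearn's MinMaxScaler; a minimal sketch to double check, reusing df and scaled_df from above:

from sklearn.preprocessing import MinMaxScaler

np.allclose(MinMaxScaler().fit_transform(df), scaled_df.values)   # expected to be True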

X_with_intercept = np.hstack((np.ones((len(X), 1)),X))
B = np.linalg.inv(X_with_intercept.T @ X_with_intercept) @ (X_with_intercept.T @ y.reshape(-1, 1))

print("Manual weights: ", B.reshape(-1))
print("Manual score: ", r2_score(y, (X_with_intercept @ B).reshape(-1)))
Manual weights:  [-0.07556544  0.19769139 -1.56087573  1.32234017 -2.57610401  1.59516284
  1.43606576]
Manual score:  0.5713482748283873
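
As a sanity check, the same least-squares solution can be obtained without explicitly inverting X.T @ X, for example with np.linalg.lstsq (a minimal sketch, reusing X_with_intercept and y from above):

B_lstsq, *_ = np.linalg.lstsq(X_with_intercept, y, rcond=None)   # solves the same least-squares problem
print("lstsq weights: ", B_lstsq)
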
from sklearn.metrics import r2_score

RSS = (((X_with_intercept @ B).reshape(-1) - y)**2).sum() # Residual sum of squares: squared distance from our regression line
TSS = ((y.mean() - y)**2).sum()                           # Total sum of squares: squared distance from the mean
r2 = 1 - RSS / TSS                                        # How much of that distance did we gain? Did we reduce the error compared to just predicting the mean?
r2, r2_score(y, (X_with_intercept @ B).reshape(-1))
(0.5713482748283873, 0.5713482748283873)

Let’s compare those results with sklearn’s LinearRegression


lr = LinearRegression().fit(X, y)

print("")
print("Sklearn weights: ", [lr.intercept_] + lr.coef_.tolist() )
print("Sklearn score: ", r2_score(y, lr.predict(X)))

Sklearn weights:  [-0.07556543642855085, 0.19769138728528463, -1.560875734209466, 1.3223401715433833, -2.5761040065353327, 1.5951628411047127, 1.4360657609756604]
Sklearn score:  0.5713482748283873

Linear regression with regularization (Ridge regression)

Regularization is the act of adding to the loss a term that contains the weight values, so that the weights are forced to stay small. This helps avoid overfitting.

Let’s look at the ordinary least squares loss and then add a weight-based term to build the regularized loss. Adding the square of each weight gives the Ridge regression loss; adding the absolute value of each weight instead gives the Lasso regression loss.


# Sum of squared residuals, then add the squared weights scaled by the penalty strength 0.3
# (in the closed form below, the intercept is excluded from the penalty)
residuals = X_with_intercept @ B - y.reshape(-1, 1)
loss = (residuals.T @ residuals).reshape(-1)
regularized_loss = loss + 0.3 * (B.T @ B).reshape(-1)
loss, regularized_loss
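
For comparison, the Lasso loss mentioned above replaces the squared weights with their absolute values. A minimal sketch, reusing loss and B from the cell above (unlike Ridge, Lasso has no closed form and is usually fit iteratively, e.g. with sklearn.linear_model.Lasso):

lasso_loss = loss + 0.3 * np.abs(B).sum()   # L1 penalty instead of L2
lasso_loss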

Adding this penalty changes the closed-form solution: instead of B = inv(X.T @ X) @ X.T @ y we now compute B = inv(X.T @ X + alpha * I) @ X.T @ y, where alpha is the penalty strength and I is the identity matrix with its first diagonal entry set to 0 so that the intercept is not penalized.

scaled_df = (df - df.min()) / (df.max() - df.min())
X = scaled_df[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']].values
y = scaled_df['median_house_value'].values


X_with_intercept = np.hstack((np.ones((len(X), 1)),X))

I = np.identity(X_with_intercept.shape[1])
I[0,0] = 0
B = np.linalg.inv(X_with_intercept.T @ X_with_intercept + 0.3 * I) @ (X_with_intercept.T @ y.reshape(-1, 1))

print("Manual weights: ", B.reshape(-1))
print("Manual score: ", r2_score(y, (X_with_intercept @ B).reshape(-1)))
Manual weights:  [-0.07457501  0.19926227 -1.4614579   1.30386275 -2.31228351  1.40463349
  1.42708759]
Manual score:  0.5710213053584052

lr = Ridge(alpha=0.3).fit(X, y)

print("")
print("Sklearn weights: ", [lr.intercept_] + lr.coef_.tolist() )
print("Sklearn score: ", r2_score(y, lr.predict(X)))

Sklearn weights:  [-0.07457500943073836, 0.19926227134208827, -1.4614578956147966, 1.3038627486538301, -2.3122835137561593, 1.4046334910837222, 1.4270875901070914]
Sklearn score:  0.5710213053584055
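
To see the shrinkage effect described earlier (the penalty forces the weights to stay small), we can sweep the penalty strength. A quick sketch, reusing X_with_intercept, y and I from above:

# Larger penalties shrink the (non-intercept) weights towards zero
for alpha in [0.0, 0.3, 3.0, 30.0]:
  B_alpha = np.linalg.inv(X_with_intercept.T @ X_with_intercept + alpha * I) @ (X_with_intercept.T @ y.reshape(-1, 1))
  print(f"alpha={alpha:>4}: norm of weights = {np.linalg.norm(B_alpha[1:]):.3f}")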

Logistic Regression

For logistic regression, we transform the X values in the same way, but we add a sigmoid at the end to map the output to values between 0 and 1.

We can no longer use the normal equation to compute the weights. We have to resort to iterative techniques like gradient descent.
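
As a quick illustration of that squashing, large negative inputs map close to 0, zero maps to 0.5, and large positive inputs map close to 1:

1 / (1 + np.exp(-np.array([-5.0, 0.0, 5.0])))   # roughly [0.007, 0.5, 0.993]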

def sigmoid(x):
  return  1 / (1 + np.exp(-x)) 

def log_likelihood(y_hat, y_true):
  # Being far away from the correct class is penalized heavily. 
  return - np.mean( y_true * np.log(y_hat) + (1-y_true) * np.log(1-y_hat) )

def gradient_sigmoid(x):
  return sigmoid(x) * (1 - sigmoid(x))


def gradients(X, y, y_hat):
    # Loss = - ( y * log(h) + (1 - y) * log(1 - h) )
    # where h = sigmoid(z)
    # and z = X @ B

    # deriv_loss_to_h = -( y / h - (1 - y) / (1 - h) ) = (h - y) / (h * (1 - h))
    # deriv_h_to_z    = h * (1 - h)
    # deriv_z_to_B    = X.T
    # Through the chain rule:
    # final_derivative = X.T @ ((h - y) / (h * (1 - h)) * h * (1 - h)) = X.T @ (h - y) = X.T @ (y_hat - y)
    dw = (1/len(X)) * (X.T @ (y_hat - y))  # divided by len(X) because the loss is a mean over the samples
    return dw
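
The chain-rule derivation in the comments above can be checked numerically with finite differences. Below is a minimal sketch on a small made-up problem (the names X_check, y_check, B_check are just for this check):

rng = np.random.default_rng(0)
X_check = rng.normal(size=(20, 3))
y_check = rng.integers(0, 2, size=20)
B_check = rng.normal(size=3)
eps = 1e-6

def loss_at(b):
  return log_likelihood(sigmoid(X_check @ b), y_check)

analytic = gradients(X_check, y_check, sigmoid(X_check @ B_check))
numeric = np.array([(loss_at(B_check + eps * e) - loss_at(B_check - eps * e)) / (2 * eps) for e in np.eye(3)])
print("max difference: ", np.abs(analytic - numeric).max())  # should be very close to zero
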
df['median_house_value_cat'] = (df['median_house_value'] > 150_000).astype(int)

scaled_df = (df - df.min()) / (df.max() - df.min())
X = scaled_df[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']].values
y = df['median_house_value_cat'].values 

X_with_intercept = np.hstack((np.ones((len(X), 1)),X))

B = np.random.normal(0, 0.1 ,(7, 1))

for i in range(50_000):
  y_hat = sigmoid(X_with_intercept @ B).reshape(-1)
  if i % 5000 == 0:
    print("loss: ", log_likelihood(y_hat, y))
  deltas = gradients(X_with_intercept, y, y_hat)
  B -= 0.3 * deltas.reshape(-1, 1)   # gradient descent step, learning rate 0.3


lr = sklearn.linear_model.LogisticRegression().fit(X, y)
loss:  0.6821755251690551
loss:  0.46252854158211454
loss:  0.4541283502467595
loss:  0.4510517001241438
loss:  0.4487185438489886
loss:  0.4466409644195144
loss:  0.44474208933519344
loss:  0.4429996946665173
loss:  0.4413999204051543
loss:  0.43993073757337586
print("Manual weights: ", B.reshape(-1))
print("Manual score: ", 
        precision_score(y, (sigmoid(X_with_intercept @ B).reshape(-1) > 0.5).astype(int) ),
        recall_score(y, (sigmoid(X_with_intercept @ B).reshape(-1) > 0.5).astype(int) ),
      )
print()
print("Sklearn log loss: ", log_loss(y, (sigmoid(X_with_intercept @ B).reshape(-1))))
print("Sklearn weights: ", lr.intercept_.tolist() + lr.coef_.reshape(-1).tolist())
print("Sklearn score", 
      precision_score(y, lr.predict(X)),
      recall_score(y, lr.predict(X))
      )
Manual weights:  [ -4.74650191   2.09565859 -11.5802056    7.08212211  -3.00163538
   8.00035044  18.85028779]
Manual score:  0.8215249055925193 0.8514583915758084

Manual log loss (computed with sklearn's log_loss):  0.43866521703018374
Sklearn weights:  [-4.385273936252051, 1.9603353158729624, -10.78106293024599, 6.882196980429938, -2.868679885031378, 7.251350300146187, 17.41000987846787]
Sklearn score 0.8186773905272565 0.8536949026185817

The weights are not exactly the same but the performance is very similar. The small differences come from stopping gradient descent after a finite number of iterations and from the fact that sklearn's LogisticRegression applies L2 regularization by default, which shrinks its weights slightly.
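
One way to check this is to weaken sklearn's default regularization by raising C (a minimal sketch; the solvers and stopping criteria still differ, so the weights will not match exactly):

lr_unreg = sklearn.linear_model.LogisticRegression(C=1e9, max_iter=10_000).fit(X, y)
print("Sklearn weights (almost no regularization): ", lr_unreg.intercept_.tolist() + lr_unreg.coef_.reshape(-1).tolist())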