Gradient Boosting trees

Simple boosted trees for regression and classification, written from scratch in Python
ai
Published

September 18, 2021

Boosting trees

Regression

from sklearn.datasets import load_wine, load_breast_cancer, load_boston
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, mean_absolute_error
from sklearn.ensemble._gb_losses import BinomialDeviance
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so
# this script needs an older release (or a different regression dataset).
raw = load_boston(return_X_y=True)

X = pd.DataFrame(raw[0])
y = pd.DataFrame(raw[1])

# Baseline model: predict the mean target for every sample.
initial_predictions = [y.mean()[0]] * len(y)


print("Error with mean: ", mean_absolute_error(y, initial_predictions))

learning_rate = 0.3


def _negative_squared_error_gradient(targets, predictions):
    """Return the negative gradient of the squared error w.r.t. the predictions.

    With loss = (y_hat - y) ** 2, d(loss)/d(y_hat) = 2 * (y_hat - y), so the
    direction that decreases the loss is 2 * (y - y_hat).
    """
    return 2 * (targets - predictions)


# Let's build some trees !
predictions_so_far = initial_predictions
# The first round uses the same gradient as every later round (factor 2
# included). Numbers recorded from runs that omitted the factor on the first
# round will differ.
gradient_of_loss = _negative_squared_error_gradient(
    y.values.reshape(-1), predictions_so_far
)
trees = []
for i in range(5):

    # Train a depth-1 tree (a stump) on the latest negative gradient.
    tree = DecisionTreeRegressor(max_depth=1)
    tree.fit(X, gradient_of_loss)
    trees.append(tree)

    # Each tree tries to predict the remaining error; take a damped step
    # in that direction.
    predictions_so_far = predictions_so_far + learning_rate * tree.predict(X).reshape(-1)

    # Recompute the negative gradient at the updated predictions.
    # This is what the next tree is fit on.
    gradient_of_loss = _negative_squared_error_gradient(
        y.values.reshape(-1), predictions_so_far
    )

print("Error with boosting: ", mean_absolute_error(y, predictions_so_far))
Error with mean:  6.647207423956011
Error with boosting:  3.3369627690621475

Classification

from sklearn.datasets import load_wine, load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
def update_lead_values(tree):
    """Placeholder for the leaf-value update step; currently does nothing.

    The original definition had no body, which made the whole script fail to
    parse. It is kept as a no-op stub so the file runs; the name ("lead",
    presumably meant as "leaf") is left unchanged.

    Args:
        tree: The fitted tree whose leaf values would be updated. Unused.

    Returns:
        None.
    """

raw = load_breast_cancer(return_X_y=True)

X = pd.DataFrame(raw[0])
y = pd.DataFrame(raw[1])

# Mean of the label column, used as the base rate p of the positive class.
p = y.mean()[0]
# Initial prediction is the log-odds of y, repeated for every sample.
initial_predictions = np.array([np.log(p / (1 - p))] * len(y))
# The predictions live in log-odds space, where the decision boundary is 0
# (equivalent to a probability of 0.5), not 0.5.
print("Initial score: ", f1_score(y, (initial_predictions > 0) * 1))


learning_rate = 0.3

def sigmoid(x):
    """Map log-odds to probabilities with the logistic function.

    Computes 1 / (1 + exp(-x)) in a numerically stable way: the exponential
    is only ever evaluated on non-positive values, so large-magnitude inputs
    do not overflow.

    Args:
        x: A scalar or array-like of real values (log-odds).

    Returns:
        A NumPy scalar for scalar input, otherwise an ndarray with the same
        shape as ``x``, with values in [0, 1].
    """
    x = np.asarray(x)
    # exp(-|x|) is always in (0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d array into a scalar and leaves arrays as-is.
    return result[()]

# Flat vector of true labels; reused on every boosting round.
labels = y.values.reshape(-1)

# Probabilities implied by the initial log-odds, and the resulting gradient.
y_hat = sigmoid(initial_predictions)
gradient_of_loss = y_hat - labels


trees = []
predictions_so_far = initial_predictions
for i in range(5):

    # Fit a depth-1 regression tree to the current gradient.
    tree = DecisionTreeRegressor(max_depth=1)
    tree.fit(X, gradient_of_loss)

    # TODO: Here you need to update the values of the tree leaves
    # to equal a specific value each.

    trees.append(tree)

    # Step against the gradient: subtract the damped tree output from the
    # running log-odds predictions.
    tree_output = tree.predict(X).reshape(-1)
    predictions_so_far = predictions_so_far - learning_rate * tree_output

    # Assuming log-loss (binomial deviance), the gradient with respect to the
    # log-odds prediction is sigmoid(prediction) - y.
    y_hat = sigmoid(predictions_so_far)
    gradient_of_loss = y_hat - labels

# Convert the final log-odds to hard 0/1 labels at probability 0.5.
boosted_labels = 1 * (sigmoid(predictions_so_far) > 0.5)
print("Score with boosting: ", f1_score(y, boosted_labels))
Initial score:  0.7710583153347732
Score with boosting:  0.922279792746114
trees[0].tree_
<sklearn.tree._tree.Tree at 0x7faeec0f2180>