from sklearn.datasets import load_wine, load_breast_cancer, load_boston
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, mean_absolute_error
from sklearn.ensemble._gb_losses import BinomialDeviance
# Boosting trees
## Regression
# Hand-rolled gradient boosting for regression using depth-1 decision trees.
#
# NOTE(review): `load_boston` was removed from scikit-learn in release 1.2, so
# this cell only runs on an older scikit-learn (or with a substitute dataset).
raw = load_boston(return_X_y=True)
X = pd.DataFrame(raw[0])
y = pd.DataFrame(raw[1])

# Flat 1-D view of the target, reused for every residual computation below.
y_true = y.values.reshape(-1)

# Baseline model: predict the mean of the target for every sample.
initial_predictions = np.full(len(y), y.mean().iloc[0])
print("Error with mean: ", mean_absolute_error(y, initial_predictions))

# Shrinkage applied to each tree's contribution.
learning_rate = 0.3

# Let's build some trees!
predictions_so_far = initial_predictions
# NOTE(review): these first residuals are NOT scaled by 2, whereas the ones
# recomputed inside the loop are. Kept as-is so the results do not change;
# confirm whether the factor should be applied consistently.
gradient_of_loss = y_true - predictions_so_far
trees = []
for _ in range(5):
    # Train a tree on the latest residuals.
    tree = DecisionTreeRegressor(max_depth=1)
    tree.fit(X, gradient_of_loss)
    trees.append(tree)

    # Add this tree's (shrunken) correction to the running prediction.
    # Each tree tries to predict the error left by the previous ones.
    predictions_so_far = predictions_so_far + learning_rate * tree.predict(X).reshape(-1)

    # Get the new residuals. This is what we fit the next tree on.
    # Residuals are the NEGATIVE gradient of the loss with respect to the
    # current predictions. In this case the loss is squared error:
    #   loss               = (y_hat - y) ** 2
    #   d(loss) / d(y_hat) = 2 * (y_hat - y)
    #   negative gradient  = 2 * (y - y_hat)
    gradient_of_loss = 2 * (y_true - predictions_so_far)

print("Error with boosting: ", mean_absolute_error(y, predictions_so_far))
# Output:
# Error with mean: 6.647207423956011
# Error with boosting: 3.3369627690621475
## Classification
from sklearn.datasets import load_wine, load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
def update_lead_values(tree):
    """Placeholder: only this function's signature is present in the source.

    NOTE(review): presumably intended for the leaf-value update described by
    the TODO inside the boosting loop below -- confirm before implementing.
    It is not called anywhere in the visible code.
    """
    raise NotImplementedError("update_lead_values has no implementation yet")


def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of ``x``."""
    return 1 / (1 + np.exp(-x))


# Hand-rolled gradient boosting for binary classification using depth-1
# regression trees fitted in log-odds space.
raw = load_breast_cancer(return_X_y=True)
X = pd.DataFrame(raw[0])
y = pd.DataFrame(raw[1])

# Flat 1-D view of the labels, reused for every gradient computation below.
y_true = y.values.reshape(-1)

# Initial prediction is the log-odds of the mean of y, repeated for every sample.
p = y.mean().iloc[0]
initial_predictions = np.full(len(y), np.log(p / (1 - p)))

# Predictions live in log-odds space, so convert to probabilities before
# thresholding at 0.5 -- the same rule used for the final score below.
print("Initial score: ", f1_score(y, (sigmoid(initial_predictions) > 0.5) * 1))

# Shrinkage applied to each tree's contribution.
learning_rate = 0.3

y_hat = sigmoid(initial_predictions)
gradient_of_loss = y_hat - y_true

trees = []
predictions_so_far = initial_predictions
for _ in range(5):
    # Train a tree on the latest gradient.
    tree = DecisionTreeRegressor(max_depth=1)
    tree.fit(X, gradient_of_loss)
    # TODO: Here you need to update the values of the tree leaves
    # to equal a specific value each.
    trees.append(tree)

    # The tree approximates the gradient (y_hat - y), so step against it:
    # subtract its shrunken prediction from the running log-odds.
    predictions_so_far = predictions_so_far - learning_rate * tree.predict(X).reshape(-1)

    # The gradient of the log loss with respect to the log-odds
    # (i.e. predictions_so_far) is y_hat - y. Neat.
    y_hat = sigmoid(predictions_so_far)
    gradient_of_loss = y_hat - y_true

print("Score with boosting: ", f1_score(y, 1 * (sigmoid(predictions_so_far) > 0.5)))
# Output:
# Initial score: 0.7710583153347732
# Score with boosting: 0.922279792746114
# Inspect the underlying tree object of the first fitted booster.
trees[0].tree_
# Output: <sklearn.tree._tree.Tree at 0x7faeec0f2180>