from sklearn.datasets import load_wine, load_breast_cancer, load_boston
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, mean_absolute_error
from sklearn.ensemble._gb_losses import BinomialDeviance

# Boosting trees

## Regression
# Gradient boosting for regression, built by hand from depth-1 regression trees.
#
# NOTE(review): `load_boston` was removed in scikit-learn 1.2, so this cell
# needs an older scikit-learn (or a different regression dataset) to run.
raw = load_boston(return_X_y=True)
X = pd.DataFrame(raw[0])
y = pd.DataFrame(raw[1])
targets = y.values.reshape(-1)  # 1-D view of the target column

# Baseline model: predict the mean target for every row.
initial_predictions = np.full(len(y), y.mean()[0])
print("Error with mean: ", mean_absolute_error(y, initial_predictions))

learning_rate = 0.3

# Let's build some trees !
#
# Each tree is fit on the negative gradient of the loss with respect to the
# current predictions. In this case the loss is the squared error:
#   loss               = (y_hat - y) ** 2
#   d loss / d y_hat   = 2 * (y_hat - y)
#   negative gradient  = 2 * (y - y_hat)
# The same expression is used before the loop and after every step so that
# every tree, including the first one, is fit on identically scaled residuals.
predictions_so_far = initial_predictions
gradient_of_loss = 2 * (targets - predictions_so_far)
trees = []
for i in range(5):
    # Train a tree on the latest residuals.
    tree = DecisionTreeRegressor(max_depth=1)
    tree.fit(X, gradient_of_loss)
    trees.append(tree)

    # Each tree tries to predict the error, so add a damped version of its
    # output to the running prediction (a new array; the baseline is untouched).
    predictions_so_far = predictions_so_far + learning_rate * tree.predict(X)

    # Get the new residuals. This is what we fit the next tree on.
    gradient_of_loss = 2 * (targets - predictions_so_far)

print("Error with boosting: ", mean_absolute_error(y, predictions_so_far))

# Output recorded in the original notebook. It was produced while the first
# tree was still fit on unscaled residuals, so the boosting figure may differ
# on a re-run:
#   Error with mean: 6.647207423956011
#   Error with boosting: 3.3369627690621475
## Classification
from sklearn.datasets import load_wine, load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score


def update_lead_values(tree):
    """Placeholder for rewriting the leaf values of a fitted boosting tree.

    Only the signature of this function is present in the source; no body is
    visible. It is kept as an explicit stub so the module parses and so that
    an accidental call fails loudly instead of silently doing nothing.

    NOTE(review): the name looks like a typo for ``update_leaf_values`` and
    presumably corresponds to the leaf-update TODO in the classification
    loop -- confirm before renaming or implementing.

    Args:
        tree: The fitted tree whose leaves would be updated.

    Raises:
        NotImplementedError: Always, until the leaf update is implemented.
    """
    raise NotImplementedError("update_lead_values is not implemented yet")
# Gradient boosting for binary classification, built by hand from depth-1
# regression trees that operate on log-odds.
raw = load_breast_cancer(return_X_y=True)
X = pd.DataFrame(raw[0])
y = pd.DataFrame(raw[1])
labels = y.values.reshape(-1)  # 1-D view of the label column


def sigmoid(x):
    """Map log-odds to probabilities with the logistic function."""
    return 1 / (1 + np.exp(-x))


# Initial prediction is the log-odds of y, identical for every row.
p = y.mean()[0]
initial_predictions = np.full(len(y), np.log(p / (1 - p)))

# The running predictions are log-odds, so convert them to probabilities
# before applying the 0.5 decision threshold (the same rule used for the
# final score below).
print("Initial score: ", f1_score(y, (sigmoid(initial_predictions) > 0.5) * 1))

learning_rate = 0.3

# The gradient of the log loss with respect to the log-odds prediction
# is sigmoid(prediction) - y, i.e. y_hat - y. Neat.
predictions_so_far = initial_predictions
y_hat = sigmoid(predictions_so_far)
gradient_of_loss = y_hat - labels
trees = []
for i in range(5):
    # Train a tree on the latest gradient.
    tree = DecisionTreeRegressor(max_depth=1)
    tree.fit(X, gradient_of_loss)
    # TODO: Here you need to update the values of the tree leaves
    # to equal a specific value each.
    trees.append(tree)

    # The tree predicts the gradient itself (not its negative), so step
    # against it to reduce the loss.
    predictions_so_far = predictions_so_far - learning_rate * tree.predict(X).reshape(-1)

    # Recompute probabilities and the gradient for the next tree.
    y_hat = sigmoid(predictions_so_far)
    gradient_of_loss = y_hat - labels

print("Score with boosting: ", f1_score(y, 1 * (sigmoid(predictions_so_far) > 0.5)))

# Output recorded in the original notebook:
#   Initial score: 0.7710583153347732
#   Score with boosting: 0.922279792746114
# Inspect the underlying tree object of the first fitted tree.
# Recorded notebook output: <sklearn.tree._tree.Tree at 0x7faeec0f2180>
trees[0].tree_