from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import random
# Load the breast-cancer dataset; with return_X_y=True the loader returns a
# (data, target) pair, which is wrapped into DataFrames below.
raw = load_breast_cancer(return_X_y=True)
X = pd.DataFrame(raw[0])
y = pd.DataFrame(raw[1])

# Each tree will be trained on roughly sqrt(number of columns) features.
features = X.columns
n_features = len(features)
n_features_to_consider = int(round(np.sqrt(n_features)))
print((features, n_features_to_consider))
# Recorded output:
# (RangeIndex(start=0, stop=30, step=1), 5)
# Build a small random forest by hand: 10 shallow decision trees, each fitted
# on a random subset of the features and a bootstrap sample of the rows.
# Each entry of `trees` is a (fitted_tree, feature_subset) pair so the same
# columns can be selected again at prediction time.
trees = []
for _ in range(10):
    # RANDOMly select features to train on
    feature_subset = random.sample(features.tolist(), k=n_features_to_consider)
    tree = DecisionTreeClassifier(max_depth=2)
    # RANDOMly select data to train on. Sampling WITH replacement gives a
    # true bootstrap; frac=1 without replacement would only shuffle the rows,
    # so every tree would see exactly the same data.
    sampling_index = X.sample(frac=1, replace=True).index
    tree.fit(X.loc[sampling_index, feature_subset], y.loc[sampling_index])
    trees.append((tree, feature_subset))

# Forest output: per-row mean of the individual tree predictions, where each
# tree predicts from the feature subset it was trained on.
rf_predictions = np.mean(
    [tree.predict(X.loc[:, subset]) for tree, subset in trees], axis=0
)
# The F1 score of a single tree (first five trees), measured on the same
# data X / y that the trees were fitted from.
for fitted_tree, subset in trees[:5]:
    one_tree_predictions = fitted_tree.predict(X.loc[:, subset])
    print(f1_score(y, one_tree_predictions))
# Recorded output from one run:
# 0.9195088676671215
# 0.9346879535558781
# 0.9439124487004104
# 0.9482517482517482
# 0.9410187667560322
# The F1 score of the forest: a row is labelled 1 when the mean of the tree
# predictions is above 0.5, otherwise 0.
print(f1_score(y, (rf_predictions > 0.5) * 1))
# Recorded output from one run:
# 0.9665738161559889