This code is heavily derived from Decision Tree from a Scratch (ID3 algorithm)
import numpy as np
import pandas as pd
eps = np.finfo(float).eps
dataset = {'Taste': ['Salty', 'Spicy', 'Spicy', 'Spicy', 'Spicy', 'Sweet', 'Salty', 'Sweet', 'Spicy', 'Salty'],
'Temperature': ['Hot', 'Hot', 'Hot', 'Cold', 'Hot', 'Cold', 'Cold', 'Hot', 'Cold', 'Hot'],
'Texture': ['Soft', 'Soft', 'Hard', 'Hard', 'Hard', 'Soft', 'Soft', 'Soft', 'Soft', 'Hard'],
'Eat': ['No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes']}
df = pd.DataFrame(dataset, columns=['Taste', 'Temperature', 'Texture', 'Eat'])
def find_entropy(df):
entropy = 0
for value in ('Yes', 'No'):
fraction = df['Eat'].value_counts()[value] / len(df['Eat'])
entropy += -fraction * np.log2(fraction)
return entropy
df['Eat'].value_counts()['Yes']
6
df['Taste'][df['Taste'] == 'Salty']
0 Salty 6 Salty 9 Salty Name: Taste, dtype: object
len(df[df['Eat'] == 'Yes'])
6
find_entropy(df)
0.9709505944546686
def find_entropy_attribute(df, attribute):
entropy_sum = 0
for variable in df[attribute].unique():
entropy = 0
for target_variable in ('Yes', 'No'):
num = len(df[attribute][df[attribute] == variable][df['Eat'] == target_variable])
den = len(df[attribute][df[attribute] == variable])
fraction = num / (den + eps)
entropy += -fraction * np.log2(fraction + eps)
entropy_weights = den / len(df)
entropy_sum += -entropy_weights * entropy
return abs(entropy_sum)
def build_tree(df):
IG = []
for key in df.keys()[:-1]:
IG.append(find_entropy(df) - find_entropy_attribute(df, key))
node = df.keys()[:-1][np.argmax(IG)] # Get attribute with maximum information gain
tree = {}
tree[node] = {}
for value in np.unique(df[node]):
sub_table = df[df[node] == value].reset_index(drop=True)
cls, counts = np.unique(sub_table['Eat'], return_counts=True)
if len(counts) == 1: # Checking purity of subset
tree[node][value] = cls[0]
else:
tree[node][value] = build_tree(sub_table) # Calling the function recursively
return tree
tree = build_tree(df)
import pprint
pprint.pprint(tree)
{'Taste': {'Salty': {'Texture': {'Hard': 'Yes', 'Soft': 'No'}}, 'Spicy': {'Temperature': {'Cold': {'Texture': {'Hard': 'No', 'Soft': 'Yes'}}, 'Hot': {'Texture': {'Hard': 'Yes', 'Soft': 'No'}}}}, 'Sweet': 'Yes'}}
def predict(inst, tree):
for nodes in tree.keys():
value = inst[nodes]
tree = tree[nodes][value]
prediction = 0
if type(tree) is dict:
prediction = predict(inst, tree)
else:
prediction = tree
break
return prediction
inst = df.iloc[6][:-1]
inst
Taste Salty Temperature Cold Texture Soft Name: 6, dtype: object
df.iloc[6][-1]
'No'
prediction = predict(inst, tree)
prediction
'No'