This code is heavily derived from Decision Tree from Scratch (ID3 algorithm).
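
ID3 builds the tree greedily: at each node it computes the entropy of the target, H(S) = -Σ p_i · log2(p_i), and splits on the attribute A with the highest information gain, IG(S, A) = H(S) - Σ_v (|S_v| / |S|) · H(S_v), where S_v is the subset of rows with A = v.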

In [1]:
import numpy as np
import pandas as pd

eps = np.finfo(float).eps  # machine epsilon; guards against log2(0) and division by zero

dataset = {'Taste': ['Salty', 'Spicy', 'Spicy', 'Spicy', 'Spicy', 'Sweet', 'Salty', 'Sweet', 'Spicy', 'Salty'],
           'Temperature': ['Hot', 'Hot', 'Hot', 'Cold', 'Hot', 'Cold', 'Cold', 'Hot', 'Cold', 'Hot'],
           'Texture': ['Soft', 'Soft', 'Hard', 'Hard', 'Hard', 'Soft', 'Soft', 'Soft', 'Soft', 'Hard'],
           'Eat': ['No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes']}

df = pd.DataFrame(dataset, columns=['Taste', 'Temperature', 'Texture', 'Eat'])

In [2]:
def find_entropy(df):
    """Entropy of the target column 'Eat' over the given DataFrame."""
    entropy = 0
    for value in ('Yes', 'No'):
        fraction = df['Eat'].value_counts()[value] / len(df['Eat'])
        entropy += -fraction * np.log2(fraction)
    return entropy
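
As a quick sanity check (assuming SciPy is available), the same value can be obtained from scipy.stats.entropy:

from scipy.stats import entropy as scipy_entropy

# value_counts() gives the class counts (6 Yes, 4 No); base=2 yields bits
scipy_entropy(df['Eat'].value_counts(), base=2)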

In [3]:
df['Eat'].value_counts()['Yes']

Out[3]:
6
In [4]:
df['Taste'][df['Taste'] == 'Salty']

Out[4]:
0    Salty
6    Salty
9    Salty
Name: Taste, dtype: object
In [5]:
len(df[df['Eat'] == 'Yes'])

Out[5]:
6
In [6]:
find_entropy(df)

Out[6]:
0.9709505944546686
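
This matches the hand computation: with 6 'Yes' and 4 'No' out of 10 rows, H = -0.6 · log2(0.6) - 0.4 · log2(0.4) ≈ 0.9710 bits.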
In [7]:
def find_entropy_attribute(df, attribute):
    """Weighted average entropy of 'Eat' after splitting on the given attribute."""
    entropy_sum = 0
    for variable in df[attribute].unique():
        den = len(df[df[attribute] == variable])  # size of this attribute-value subset
        entropy = 0
        for target_variable in ('Yes', 'No'):
            num = len(df[(df[attribute] == variable) & (df['Eat'] == target_variable)])
            fraction = num / (den + eps)
            entropy += -fraction * np.log2(fraction + eps)  # eps avoids log2(0)
        entropy_sum += -(den / len(df)) * entropy  # weight by subset size
    return abs(entropy_sum)
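
With both helpers in place, the information gain of every candidate attribute can be inspected directly; the attribute with the largest gain becomes the split at the root:

for key in df.keys()[:-1]:  # every column except the target 'Eat'
    print(key, find_entropy(df) - find_entropy_attribute(df, key))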

In [8]:
def build_tree(df):
    IG = []
    for key in df.keys()[:-1]:
        IG.append(find_entropy(df) - find_entropy_attribute(df, key))
    node = df.keys()[:-1][np.argmax(IG)]  # Get attribute with maximum information gain

    tree = {}
    tree[node] = {}

    for value in np.unique(df[node]):
        sub_table = df[df[node] == value].reset_index(drop=True)
        cls, counts = np.unique(sub_table['Eat'], return_counts=True)

        if len(counts) == 1:  # Checking purity of subset
            tree[node][value] = cls[0]
        else:
            tree[node][value] = build_tree(sub_table)  # Calling the function recursively

    return tree

In [9]:
tree = build_tree(df)

import pprint
pprint.pprint(tree)

{'Taste': {'Salty': {'Texture': {'Hard': 'Yes', 'Soft': 'No'}},
           'Spicy': {'Temperature': {'Cold': {'Texture': {'Hard': 'No',
                                                          'Soft': 'Yes'}},
                                     'Hot': {'Texture': {'Hard': 'Yes',
                                                         'Soft': 'No'}}}},
           'Sweet': 'Yes'}}
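
The fitted tree is a plain nested dict: each key names the attribute tested at that node, and each value is either a class label (a leaf) or another subtree. Branches can be read off directly:

tree['Taste']['Sweet']  # 'Yes' -- a pure leaf
tree['Taste']['Salty']  # {'Texture': {'Hard': 'Yes', 'Soft': 'No'}} -- a subtree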

In [10]:
def predict(inst, tree):
    # Each level of the tree is a dict with a single key: the attribute to test
    for nodes in tree.keys():
        value = inst[nodes]        # the instance's value for that attribute
        tree = tree[nodes][value]  # descend into the matching branch
        prediction = 0

        if type(tree) is dict:
            prediction = predict(inst, tree)  # internal node: keep descending
        else:
            prediction = tree  # leaf: the class label
            break

    return prediction

In [11]:
inst = df.iloc[6][:-1]
inst

Out[11]:
Taste          Salty
Temperature     Cold
Texture         Soft
Name: 6, dtype: object
In [12]:
df.iloc[6][-1]

Out[12]:
'No'
In [13]:
prediction = predict(inst, tree)
prediction

Out[13]:
'No'
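
As a final check, predict can be applied row by row over the training set (a score of 1.0 is expected here, since build_tree recurses until every leaf is pure and this dataset is consistent):

predictions = df.apply(lambda row: predict(row, tree), axis=1)
(predictions == df['Eat']).mean()  # 1.0 on the training data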