The city is for Nominal Onehot Encoding. The Size is for Ordinla Encoding

Nominal OneHotEncoding¶

In [144]:

import pandas as pd

In [186]:

d = {'sales': [100000, 222000, 10000000, 525000, 111111, 200000, 75000, 9000, 109000, 10000],
    'city': ['Tampa', 'Tampa', 'Orlando', 'Jacksonville', 'Miami', 'Miami', 'Orlando', 'Jacksonville', 'Jacksonville', 'Orlando' ],
    'size':['Small', 'Medium', 'Large', 'Medium', 'Medium', 'Large', 'Small', 'Small','Medium', 'Small'],
    }

In [187]:

df = pd.DataFrame(d)

In [188]:

df.head()

Out[188]:

	sales	city	size
0	100000	Tampa	Small
1	222000	Tampa	Medium
2	10000000	Orlando	Large
3	525000	Jacksonville	Medium
4	111111	Miami	Medium

In [189]:

df['city'].unique()

Out[189]:

array(['Tampa', 'Orlando', 'Jacksonville', 'Miami'], dtype=object)

In [190]:

from sklearn.preprocessing import OneHotEncoder

In [191]:

ohe = OneHotEncoder(handle_unknown = 'ignore', sparse=False)

In this code:

handle_unknown='ignore' specifies that if unknown categories are encountered during transform, they should be ignored.
sparse=False specifies that the output should be a dense array rather than a sparse matrix.

In [192]:

ohe_transform_city = ohe_city.fit_transform(df[['city']])
# This fits the OneHotEncoder to the 'city' column of the DataFrame df and transforms it into a one-hot encoded representation.
ohe_transform_city

Out[192]:

array([[0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]])

In [193]:

feature_names_city = ohe_city.get_feature_names_out(input_features=['city'])
# This retrieves the feature names for the one-hot encoded 'city' column. It ensures that the column name is included in the feature names.
feature_names_city

Out[193]:

array(['city_Jacksonville', 'city_Miami', 'city_Orlando', 'city_Tampa'],
      dtype=object)

In [194]:

ohe_df_city = pd.DataFrame(ohe_transform_city, columns=feature_names_city)
# This converts the transformed array of the one-hot encoded 'city' column into a pandas DataFrame using the feature names obtained earlier.
ohe_df_city

Out[194]:

	city_Jacksonville	city_Miami	city_Orlando	city_Tampa
0	0.0	0.0	0.0	1.0
1	0.0	0.0	0.0	1.0
2	0.0	0.0	1.0	0.0
3	1.0	0.0	0.0	0.0
4	0.0	1.0	0.0	0.0
5	0.0	1.0	0.0	0.0
6	0.0	0.0	1.0	0.0
7	1.0	0.0	0.0	0.0
8	1.0	0.0	0.0	0.0
9	0.0	0.0	1.0	0.0

In [195]:

df_encoded = pd.concat([df.drop(columns=['city']), ohe_df_city], axis=1)
# This concatenates the original DataFrame df after dropping the 'city' column with the one-hot encoded 'city' DataFrame ohe_df_city, resulting in the final DataFrame df_encoded.
df_encoded

Out[195]:

	sales	size	city_Jacksonville	city_Miami	city_Orlando	city_Tampa
0	100000	Small	0.0	0.0	0.0	1.0
1	222000	Medium	0.0	0.0	0.0	1.0
2	10000000	Large	0.0	0.0	1.0	0.0
3	525000	Medium	1.0	0.0	0.0	0.0
4	111111	Medium	0.0	1.0	0.0	0.0
5	200000	Large	0.0	1.0	0.0	0.0
6	75000	Small	0.0	0.0	1.0	0.0
7	9000	Small	1.0	0.0	0.0	0.0
8	109000	Medium	1.0	0.0	0.0	0.0
9	10000	Small	0.0	0.0	1.0	0.0

In [196]:

df_encoded.drop('city_Tampa', axis=1)

Out[196]:

	sales	size	city_Jacksonville	city_Miami	city_Orlando
0	100000	Small	0.0	0.0	0.0
1	222000	Medium	0.0	0.0	0.0
2	10000000	Large	0.0	0.0	1.0
3	525000	Medium	1.0	0.0	0.0
4	111111	Medium	0.0	1.0	0.0
5	200000	Large	0.0	1.0	0.0
6	75000	Small	0.0	0.0	1.0
7	9000	Small	1.0	0.0	0.0
8	109000	Medium	1.0	0.0	0.0
9	10000	Small	0.0	0.0	1.0

OR¶

In [197]:

df_encoded = pd.get_dummies(df, columns=['city'])
df_encoded

# This is straight forward

Out[197]:

	sales	size	city_Jacksonville	city_Miami	city_Orlando	city_Tampa
0	100000	Small	0	0	0	1
1	222000	Medium	0	0	0	1
2	10000000	Large	0	0	1	0
3	525000	Medium	1	0	0	0
4	111111	Medium	0	1	0	0
5	200000	Large	0	1	0	0
6	75000	Small	0	0	1	0
7	9000	Small	1	0	0	0
8	109000	Medium	1	0	0	0
9	10000	Small	0	0	1	0

Nominal OneHotEncoding¶

In [211]:

df

Out[211]:

	sales	city	size
0	100000	Tampa	Small
1	222000	Tampa	Medium
2	10000000	Orlando	Large
3	525000	Jacksonville	Medium
4	111111	Miami	Medium
5	200000	Miami	Large
6	75000	Orlando	Small
7	9000	Jacksonville	Small
8	109000	Jacksonville	Medium
9	10000	Orlando	Small

In [212]:

df['size'].unique()

Out[212]:

array(['Small', 'Medium', 'Large'], dtype=object)

In [213]:

sizes = ['Small', 'Medium', 'Large']

In [214]:

from sklearn.preprocessing import OrdinalEncoder

In [219]:

enc = OrdinalEncoder(categories = [sizes])

In [221]:

enc.fit_transform(df[['size']])

Out[221]:

array([[0.],
       [1.],
       [2.],
       [1.],
       [1.],
       [2.],
       [0.],
       [0.],
       [1.],
       [0.]])

In [217]:

df.head()

Out[217]:

	sales	city	size
0	100000	Tampa	Small
1	222000	Tampa	Medium
2	10000000	Orlando	Large
3	525000	Jacksonville	Medium
4	111111	Miami	Medium

In [222]:

df['size '] = enc.fit_transform(df[['size']])

In [224]:

df.head(10)

Out[224]:

	sales	city	size	size
0	100000	Tampa	Small	0.0
1	222000	Tampa	Medium	1.0
2	10000000	Orlando	Large	2.0
3	525000	Jacksonville	Medium	1.0
4	111111	Miami	Medium	1.0
5	200000	Miami	Large	2.0
6	75000	Orlando	Small	0.0
7	9000	Jacksonville	Small	0.0
8	109000	Jacksonville	Medium	1.0
9	10000	Orlando	Small	0.0

To now convert the Nominal Data- City¶

In [206]:

df_encoded = pd.get_dummies(df[['city']])

In [225]:

df_encoded

Out[225]:

	city_Jacksonville	city_Miami	city_Orlando	city_Tampa
0	0	0	0	1
1	0	0	0	1
2	0	0	1	0
3	1	0	0	0
4	0	1	0	0
5	0	1	0	0
6	0	0	1	0
7	1	0	0	0
8	1	0	0	0
9	0	0	1	0

In [226]:

final_df = pd.concat([df, df_encoded], axis=1)
final_df

Out[226]:

	sales	city	size	size	city_Jacksonville	city_Miami	city_Orlando	city_Tampa
0	100000	Tampa	Small	0.0	0	0	0	1
1	222000	Tampa	Medium	1.0	0	0	0	1
2	10000000	Orlando	Large	2.0	0	0	1	0
3	525000	Jacksonville	Medium	1.0	1	0	0	0
4	111111	Miami	Medium	1.0	0	1	0	0
5	200000	Miami	Large	2.0	0	1	0	0
6	75000	Orlando	Small	0.0	0	0	1	0
7	9000	Jacksonville	Small	0.0	1	0	0	0
8	109000	Jacksonville	Medium	1.0	1	0	0	0
9	10000	Orlando	Small	0.0	0	0	1	0

In [232]:

# Select columns of type 'object' (string)
string_size_columns = final_df.select_dtypes(include=['object']).columns

# Drop the column containing string values, all string values
final_df = final_df.drop(columns=string_size_columns)

In [233]:

final_df

Out[233]:

	sales	size	city_Jacksonville	city_Miami	city_Orlando
0	100000	0.0	0	0	0
1	222000	1.0	0	0	0
2	10000000	2.0	0	0	1
3	525000	1.0	1	0	0
4	111111	1.0	0	1	0
5	200000	2.0	0	1	0
6	75000	0.0	0	0	1
7	9000	0.0	1	0	0
8	109000	1.0	1	0	0
9	10000	0.0	0	0	1

In [ ]:

	city_Jacksonville	city_Miami	city_Orlando	city_Tampa
0	0.0	0.0	0.0	1.0
1	0.0	0.0	0.0	1.0
2	0.0	0.0	1.0	0.0
3	1.0	0.0	0.0	0.0
4	0.0	1.0	0.0	0.0
5	0.0	1.0	0.0	0.0
6	0.0	0.0	1.0	0.0
7	1.0	0.0	0.0	0.0
8	1.0	0.0	0.0	0.0
9	0.0	0.0	1.0	0.0

	city_Jacksonville	city_Miami	city_Orlando	city_Tampa
0	0.0	0.0	0.0	1.0
1	0.0	0.0	0.0	1.0
2	0.0	0.0	1.0	0.0
3	1.0	0.0	0.0	0.0
4	0.0	1.0	0.0	0.0
5	0.0	1.0	0.0	0.0
6	0.0	0.0	1.0	0.0
7	1.0	0.0	0.0	0.0
8	1.0	0.0	0.0	0.0
9	0.0	0.0	1.0	0.0

	city_Jacksonville	city_Miami	city_Orlando	city_Tampa
0	0.0	0.0	0.0	1.0
1	0.0	0.0	0.0	1.0
2	0.0	0.0	1.0	0.0
3	1.0	0.0	0.0	0.0
4	0.0	1.0	0.0	0.0
5	0.0	1.0	0.0	0.0
6	0.0	0.0	1.0	0.0
7	1.0	0.0	0.0	0.0
8	1.0	0.0	0.0	0.0
9	0.0	0.0	1.0	0.0