The city is for Nominal Onehot Encoding. The Size is for Ordinla Encoding
import pandas as pd
d = {'sales': [100000, 222000, 10000000, 525000, 111111, 200000, 75000, 9000, 109000, 10000],
'city': ['Tampa', 'Tampa', 'Orlando', 'Jacksonville', 'Miami', 'Miami', 'Orlando', 'Jacksonville', 'Jacksonville', 'Orlando' ],
'size':['Small', 'Medium', 'Large', 'Medium', 'Medium', 'Large', 'Small', 'Small','Medium', 'Small'],
}
df = pd.DataFrame(d)
df.head()
sales | city | size | |
---|---|---|---|
0 | 100000 | Tampa | Small |
1 | 222000 | Tampa | Medium |
2 | 10000000 | Orlando | Large |
3 | 525000 | Jacksonville | Medium |
4 | 111111 | Miami | Medium |
df['city'].unique()
array(['Tampa', 'Orlando', 'Jacksonville', 'Miami'], dtype=object)
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse=False)
In this code:
ohe_transform_city = ohe_city.fit_transform(df[['city']])
# This fits the OneHotEncoder to the 'city' column of the DataFrame df and transforms it into a one-hot encoded representation.
ohe_transform_city
array([[0., 0., 0., 1.], [0., 0., 0., 1.], [0., 0., 1., 0.], [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [1., 0., 0., 0.], [1., 0., 0., 0.], [0., 0., 1., 0.]])
feature_names_city = ohe_city.get_feature_names_out(input_features=['city'])
# This retrieves the feature names for the one-hot encoded 'city' column. It ensures that the column name is included in the feature names.
feature_names_city
array(['city_Jacksonville', 'city_Miami', 'city_Orlando', 'city_Tampa'], dtype=object)
ohe_df_city = pd.DataFrame(ohe_transform_city, columns=feature_names_city)
# This converts the transformed array of the one-hot encoded 'city' column into a pandas DataFrame using the feature names obtained earlier.
ohe_df_city
city_Jacksonville | city_Miami | city_Orlando | city_Tampa | |
---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 0.0 | 0.0 | 0.0 | 1.0 |
2 | 0.0 | 0.0 | 1.0 | 0.0 |
3 | 1.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 1.0 | 0.0 | 0.0 |
5 | 0.0 | 1.0 | 0.0 | 0.0 |
6 | 0.0 | 0.0 | 1.0 | 0.0 |
7 | 1.0 | 0.0 | 0.0 | 0.0 |
8 | 1.0 | 0.0 | 0.0 | 0.0 |
9 | 0.0 | 0.0 | 1.0 | 0.0 |
df_encoded = pd.concat([df.drop(columns=['city']), ohe_df_city], axis=1)
# This concatenates the original DataFrame df after dropping the 'city' column with the one-hot encoded 'city' DataFrame ohe_df_city, resulting in the final DataFrame df_encoded.
df_encoded
sales | size | city_Jacksonville | city_Miami | city_Orlando | city_Tampa | |
---|---|---|---|---|---|---|
0 | 100000 | Small | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 222000 | Medium | 0.0 | 0.0 | 0.0 | 1.0 |
2 | 10000000 | Large | 0.0 | 0.0 | 1.0 | 0.0 |
3 | 525000 | Medium | 1.0 | 0.0 | 0.0 | 0.0 |
4 | 111111 | Medium | 0.0 | 1.0 | 0.0 | 0.0 |
5 | 200000 | Large | 0.0 | 1.0 | 0.0 | 0.0 |
6 | 75000 | Small | 0.0 | 0.0 | 1.0 | 0.0 |
7 | 9000 | Small | 1.0 | 0.0 | 0.0 | 0.0 |
8 | 109000 | Medium | 1.0 | 0.0 | 0.0 | 0.0 |
9 | 10000 | Small | 0.0 | 0.0 | 1.0 | 0.0 |
df_encoded.drop('city_Tampa', axis=1)
sales | size | city_Jacksonville | city_Miami | city_Orlando | |
---|---|---|---|---|---|
0 | 100000 | Small | 0.0 | 0.0 | 0.0 |
1 | 222000 | Medium | 0.0 | 0.0 | 0.0 |
2 | 10000000 | Large | 0.0 | 0.0 | 1.0 |
3 | 525000 | Medium | 1.0 | 0.0 | 0.0 |
4 | 111111 | Medium | 0.0 | 1.0 | 0.0 |
5 | 200000 | Large | 0.0 | 1.0 | 0.0 |
6 | 75000 | Small | 0.0 | 0.0 | 1.0 |
7 | 9000 | Small | 1.0 | 0.0 | 0.0 |
8 | 109000 | Medium | 1.0 | 0.0 | 0.0 |
9 | 10000 | Small | 0.0 | 0.0 | 1.0 |
df_encoded = pd.get_dummies(df, columns=['city'])
df_encoded
# This is straight forward
sales | size | city_Jacksonville | city_Miami | city_Orlando | city_Tampa | |
---|---|---|---|---|---|---|
0 | 100000 | Small | 0 | 0 | 0 | 1 |
1 | 222000 | Medium | 0 | 0 | 0 | 1 |
2 | 10000000 | Large | 0 | 0 | 1 | 0 |
3 | 525000 | Medium | 1 | 0 | 0 | 0 |
4 | 111111 | Medium | 0 | 1 | 0 | 0 |
5 | 200000 | Large | 0 | 1 | 0 | 0 |
6 | 75000 | Small | 0 | 0 | 1 | 0 |
7 | 9000 | Small | 1 | 0 | 0 | 0 |
8 | 109000 | Medium | 1 | 0 | 0 | 0 |
9 | 10000 | Small | 0 | 0 | 1 | 0 |
df
sales | city | size | |
---|---|---|---|
0 | 100000 | Tampa | Small |
1 | 222000 | Tampa | Medium |
2 | 10000000 | Orlando | Large |
3 | 525000 | Jacksonville | Medium |
4 | 111111 | Miami | Medium |
5 | 200000 | Miami | Large |
6 | 75000 | Orlando | Small |
7 | 9000 | Jacksonville | Small |
8 | 109000 | Jacksonville | Medium |
9 | 10000 | Orlando | Small |
df['size'].unique()
array(['Small', 'Medium', 'Large'], dtype=object)
sizes = ['Small', 'Medium', 'Large']
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories = [sizes])
enc.fit_transform(df[['size']])
array([[0.], [1.], [2.], [1.], [1.], [2.], [0.], [0.], [1.], [0.]])
df.head()
sales | city | size | |
---|---|---|---|
0 | 100000 | Tampa | Small |
1 | 222000 | Tampa | Medium |
2 | 10000000 | Orlando | Large |
3 | 525000 | Jacksonville | Medium |
4 | 111111 | Miami | Medium |
df['size '] = enc.fit_transform(df[['size']])
df.head(10)
sales | city | size | size | |
---|---|---|---|---|
0 | 100000 | Tampa | Small | 0.0 |
1 | 222000 | Tampa | Medium | 1.0 |
2 | 10000000 | Orlando | Large | 2.0 |
3 | 525000 | Jacksonville | Medium | 1.0 |
4 | 111111 | Miami | Medium | 1.0 |
5 | 200000 | Miami | Large | 2.0 |
6 | 75000 | Orlando | Small | 0.0 |
7 | 9000 | Jacksonville | Small | 0.0 |
8 | 109000 | Jacksonville | Medium | 1.0 |
9 | 10000 | Orlando | Small | 0.0 |
df_encoded = pd.get_dummies(df[['city']])
df_encoded
city_Jacksonville | city_Miami | city_Orlando | city_Tampa | |
---|---|---|---|---|
0 | 0 | 0 | 0 | 1 |
1 | 0 | 0 | 0 | 1 |
2 | 0 | 0 | 1 | 0 |
3 | 1 | 0 | 0 | 0 |
4 | 0 | 1 | 0 | 0 |
5 | 0 | 1 | 0 | 0 |
6 | 0 | 0 | 1 | 0 |
7 | 1 | 0 | 0 | 0 |
8 | 1 | 0 | 0 | 0 |
9 | 0 | 0 | 1 | 0 |
final_df = pd.concat([df, df_encoded], axis=1)
final_df
sales | city | size | size | city_Jacksonville | city_Miami | city_Orlando | city_Tampa | |
---|---|---|---|---|---|---|---|---|
0 | 100000 | Tampa | Small | 0.0 | 0 | 0 | 0 | 1 |
1 | 222000 | Tampa | Medium | 1.0 | 0 | 0 | 0 | 1 |
2 | 10000000 | Orlando | Large | 2.0 | 0 | 0 | 1 | 0 |
3 | 525000 | Jacksonville | Medium | 1.0 | 1 | 0 | 0 | 0 |
4 | 111111 | Miami | Medium | 1.0 | 0 | 1 | 0 | 0 |
5 | 200000 | Miami | Large | 2.0 | 0 | 1 | 0 | 0 |
6 | 75000 | Orlando | Small | 0.0 | 0 | 0 | 1 | 0 |
7 | 9000 | Jacksonville | Small | 0.0 | 1 | 0 | 0 | 0 |
8 | 109000 | Jacksonville | Medium | 1.0 | 1 | 0 | 0 | 0 |
9 | 10000 | Orlando | Small | 0.0 | 0 | 0 | 1 | 0 |
# Select columns of type 'object' (string)
string_size_columns = final_df.select_dtypes(include=['object']).columns
# Drop the column containing string values, all string values
final_df = final_df.drop(columns=string_size_columns)
final_df
sales | size | city_Jacksonville | city_Miami | city_Orlando | |
---|---|---|---|---|---|
0 | 100000 | 0.0 | 0 | 0 | 0 |
1 | 222000 | 1.0 | 0 | 0 | 0 |
2 | 10000000 | 2.0 | 0 | 0 | 1 |
3 | 525000 | 1.0 | 1 | 0 | 0 |
4 | 111111 | 1.0 | 0 | 1 | 0 |
5 | 200000 | 2.0 | 0 | 1 | 0 |
6 | 75000 | 0.0 | 0 | 0 | 1 |
7 | 9000 | 0.0 | 1 | 0 | 0 |
8 | 109000 | 1.0 | 1 | 0 | 0 |
9 | 10000 | 0.0 | 0 | 0 | 1 |