#!/usr/bin/env python
# coding: utf-8

# Categorical encoding walkthrough (exported notebook).
#
# The city column is used for Nominal One-Hot Encoding.
# The size column is used for Ordinal Encoding.
#
# Bare expressions such as ``df.head()`` are notebook display cells; they
# have no effect when this file is run as a plain script.

# ### Nominal OneHotEncoding

# In[144]:

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


# In[186]:

d = {
    'sales': [100000, 222000, 10000000, 525000, 111111,
              200000, 75000, 9000, 109000, 10000],
    'city': ['Tampa', 'Tampa', 'Orlando', 'Jacksonville', 'Miami',
             'Miami', 'Orlando', 'Jacksonville', 'Jacksonville', 'Orlando'],
    'size': ['Small', 'Medium', 'Large', 'Medium', 'Medium',
             'Large', 'Small', 'Small', 'Medium', 'Small'],
}


# In[187]:

df = pd.DataFrame(d)


# In[188]:

df.head()


# In[189]:

df['city'].unique()


# In[191]:

# **In this code:**
#
# - handle_unknown='ignore' specifies that if unknown categories are
#   encountered during transform, they should be ignored.
# - sparse_output=False specifies that the output should be a dense array
#   rather than a sparse matrix.
#
# The keyword was called ``sparse`` before scikit-learn 1.2 and the old
# name was removed in 1.4, so try the current spelling first and fall back
# to the legacy one on older installations.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)


# In[192]:

# This fits the OneHotEncoder to the 'city' column of the DataFrame df and
# transforms it into a one-hot encoded representation.
ohe_transform_city = ohe.fit_transform(df[['city']])
ohe_transform_city


# In[193]:

# This retrieves the feature names for the one-hot encoded 'city' column.
# Passing input_features ensures the column name is included in each
# feature name.
feature_names_city = ohe.get_feature_names_out(input_features=['city'])
feature_names_city


# In[194]:

# This converts the transformed array of the one-hot encoded 'city' column
# into a pandas DataFrame using the feature names obtained earlier.
ohe_df_city = pd.DataFrame(ohe_transform_city, columns=feature_names_city)
ohe_df_city


# In[195]:

# This concatenates the original DataFrame df (after dropping the 'city'
# column) with the one-hot encoded 'city' DataFrame ohe_df_city, resulting
# in the final DataFrame df_encoded.
df_encoded = pd.concat([df.drop(columns=['city']), ohe_df_city], axis=1)
df_encoded


# In[196]:

# Display-only: drop() returns a new frame, df_encoded itself is unchanged.
df_encoded.drop('city_Tampa', axis=1)


# ### OR

# In[197]:

# This is straightforward: pandas builds the dummy columns in one call.
df_encoded = pd.get_dummies(df, columns=['city'])
df_encoded


# ### Ordinal Encoding

# In[211]:

df


# In[212]:

df['size'].unique()


# In[213]:

# Explicit category order: Small -> 0, Medium -> 1, Large -> 2.
sizes = ['Small', 'Medium', 'Large']


# In[219]:

enc = OrdinalEncoder(categories=[sizes])


# In[221]:

enc.fit_transform(df[['size']])


# In[217]:

df.head()


# In[222]:

# Store the codes in a separate, clearly named column. The original string
# 'size' column is kept here and removed together with 'city' further down.
# (The notebook used the key 'size ' with a trailing space, which is
# visually indistinguishable from 'size'.)
df['size_encoded'] = enc.fit_transform(df[['size']])


# In[224]:

df.head(10)


# ##### To now convert the Nominal Data - City

# In[206]:

df_encoded = pd.get_dummies(df[['city']])


# In[225]:

df_encoded


# In[226]:

final_df = pd.concat([df, df_encoded], axis=1)
final_df


# In[232]:

# Select columns of type 'object' (string)
string_size_columns = final_df.select_dtypes(include=['object']).columns

# Drop every column that still contains string values
final_df = final_df.drop(columns=string_size_columns)


# In[233]:

final_df