#!/usr/bin/env python # coding: utf-8 # ## Resignation in Australian Civil Service # Employee retention has many benefits. It can help with efficiency, customer satisfaction, and workplace diversity. Understanding why people choose to resign is key in building a stable group. #
This paper looks at the Australian Department of Education, Training, and Employment (DETE) as well as the Technical and Further Education (TAFE) institute in Queensland, Australia. It compares rates of resignation to the length of employment and to the age of employee. # In[1]: # import necessary libraries and set display options import pandas as pd import numpy as np import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') pd.options.display.float_format = '{:20,.4f}'.format pd.set_option('display.max_rows', None) pd.set_option('display.max_columns', None) pd.set_option('display.width', 3000) pd.set_option('display.colheader_justify', 'center') pd.set_option('display.precision', 3) # open files dete_survey = pd.read_csv("dete_survey.csv", na_values="Not Stated") tafe_survey = pd.read_csv("tafe_survey.csv") # In[2]: # dete_survey display(dete_survey.head()) display(dete_survey.info()) # In[3]: # tafe_survey display(tafe_survey.head()) display(tafe_survey.info()) # ## Initial Look & Header Standardization # There are many columns in both files that are not needed to answer the research questions. Making the header style more uniform allows for code that is easier to read. # In[4]: # drop unnecessary columns dete_survey_updated = dete_survey.drop(dete_survey.columns[28:49], axis=1) tafe_survey_updated = tafe_survey.drop(tafe_survey.columns[17:66], axis=1) # modify & standardize header map = {'Record ID': 'id', 'CESSATION YEAR': 'cease_date', 'Reason for ceasing employment': 'separationtype', 'Gender. What is your Gender?': 'gender', 'CurrentAge. Current Age': 'age', 'Employment Type. Employment Type': 'employment_status', 'Classification. Classification': 'position', 'LengthofServiceOverall. Overall Length of Service at Institute (in years)': 'institute_service', 'LengthofServiceCurrent. Length of Service at current workplace (in years)': 'role_service'} tafe_survey_updated = tafe_survey_updated.rename(columns=map) dete_survey_updated.columns = dete_survey_updated.columns.str.lower().str.replace(' ', '_').str.strip() tafe_survey_updated.columns = tafe_survey_updated.columns.str.lower().str.replace(' ', '_').str.strip() display(dete_survey_updated.columns.to_list()) display(tafe_survey_updated.columns.to_list()) # ## Filtering Year Columns # The code cell below shows the responses in the ```separationtype``` column. This study is focused only on resignations. The DETE survey has multiple resignation codes. All of these rows, plus the corresponding rows in the tafe survey, will be copied to new data frames. This reduces 822 DETE rows and 702 TAFE rows to 262 and 340 respectively. from The relevant values will be cleaned and the rows checked to see if they are corrupted by any outlier values. # In[5]: # values coded into speparationtype column display(dete_survey_updated["separationtype"].value_counts(dropna=False)) display(tafe_survey_updated["separationtype"].value_counts(dropna=False)) # In[6]: # copy separationtype rows using regex and vectorized string methods pattern = r"Resignation" dete_resignations = dete_survey_updated[dete_survey_updated["separationtype"].str.contains(pattern, regex=True)].copy() tafe_resignations = tafe_survey_updated[tafe_survey_updated["separationtype"].str.contains(pattern, regex=True, na=False)].copy() display(dete_resignations["separationtype"].value_counts(dropna=False)) display(tafe_resignations["separationtype"].value_counts(dropna=False)) # In[7]: # dete_start_date > 1940 dete_resignations = dete_resignations[1940 < dete_resignations["dete_start_date"]] # dete_start_date < role_start_date < cease_date dete_resignations = dete_resignations[1940 < dete_resignations["role_start_date"]] dete_resignations = dete_resignations[dete_resignations["role_start_date"] < 2021] # cease_date < current date pattern = r"[0-9][0-9]/" dete_resignations["cease_date"] = dete_resignations["cease_date"].str.replace(pattern, "", regex=True).astype(float) dete_resignations = dete_resignations[dete_resignations["cease_date"] < 2021] # In[8]: dete_resignations.boxplot(column=["dete_start_date", "role_start_date", "cease_date"]) # ## Filtering Year Columns Continued # The DETE dataframe had one row in the `role_start_date` column with a year entry of 200. This row was excluded. There are a number of rows with NaN, but they can stay in because Pandas vectorized methods accommodate them. The first question is to assess connections between resignation and newer employees. The TAFE dataframe includes a employment duration column. A similar column needs to created for the DETE information. # In[9]: # display TAFE institute service display(tafe_resignations["institute_service"].value_counts()) display(tafe_resignations["institute_service"].describe()) # In[10]: # create and display DETE institute service column # the dictionary reassigns discrete numbers into a more useable range dete_resignations["institute_service"] = dete_resignations["cease_date"] - dete_resignations["dete_start_date"] di = {0.0: "Less than 1 year", 1.0: "1-2", 2.0:"1-2", 3.0:"3-4", 4.0:"3-4", 5.0:"5-6", 6.0:"5-6", 7.0:"7-10", 8.0:"7-10", 9.0:"7-10", 10.0:"7-10", 11.0:"11-20", 12.0:"11-20", 13.0:"11-20", 14.0:"11-20", 15.0:"11-20", 16.0:"11-20", 17.0:"11-20", 18.0:"11-20", 19.0:"11-20", 20.0:"11-20", 21:"More than 20 years", 22:"More than 20 years", 23:"More than 20 years", 24:"More than 20 years", 25:"More than 20 years", 26:"More than 20 years", 27:"More than 20 years", 28:"More than 20 years", 29:"More than 20 years", 30:"More than 20 years", 31:"More than 20 years", 32:"More than 20 years", 33:"More than 20 years", 34:"More than 20 years", 35:"More than 20 years", 36:"More than 20 years", 38:"More than 20 years", 39:"More than 20 years", 41:"More than 20 years", 42:"More than 20 years", 49:"More than 20 years"} dete_resignations['institute_service'] = dete_resignations['institute_service'].map(di) display(dete_resignations["institute_service"].value_counts()) dete_resignations["institute_service"].describe() # ## Aggregating Dissatisfaction # Each dataset has multiple resignation columns that could be considered as dissatisfaction, as opposed to other reasons such as `ill_health`, `study`, or `maternity`. These columns will be aggregated into a single `dissatisfied` column for each dataset. Values will be coded to True or False. At this time, the NaN values are being coded as False. # In[11]: # inspect TAFE resignation values display(tafe_resignations["contributing_factors._dissatisfaction"].value_counts(dropna=False)) display(tafe_resignations["contributing_factors._job_dissatisfaction"].value_counts(dropna=False)) # In[12]: # convert TAFE resignation values to boolean values # vectorized methods are amazing def boolean_bummer(element): if pd.isnull(element): return np.nan elif element == "-": return False else: return True tafe_resig_factors = ["contributing_factors._dissatisfaction", "contributing_factors._job_dissatisfaction"] tafe_resignations[tafe_resig_factors] = tafe_resignations[tafe_resig_factors].applymap(boolean_bummer) # In[13]: # inspect TAFE resignation values after converting to boolean display(tafe_resignations["contributing_factors._dissatisfaction"].value_counts(dropna=False)) display(tafe_resignations["contributing_factors._job_dissatisfaction"].value_counts(dropna=False)) # In[14]: # create new TAFE column `dissatisfied` using .any() # trying to preserve NaN as NaN does not work, gets converted to True or False tafe_resignations["dissatisfied"] = tafe_resignations[tafe_resig_factors].any(axis=1, skipna=True) # In[15]: # inspect all three columns for logical errors tafe_resignations = tafe_resignations.reindex() display(tafe_resignations.loc[39:53,["contributing_factors._dissatisfaction", "contributing_factors._job_dissatisfaction", "dissatisfied"]]) # In[16]: # create new DETE column `dissatisfied` using .any() dete_resig_factors = ["job_dissatisfaction", "dissatisfaction_with_the_department", "physical_work_environment", "lack_of_recognition", "lack_of_job_security", "work_location", "employment_conditions", "work_life_balance", "workload"] dete_resignations["dissatisfied"] = dete_resignations[dete_resig_factors].any(axis=1, skipna=True) display(dete_resignations["dissatisfied"].value_counts()) # ## Combine Datasets # Both datasets are ready to combine. Vertical concatenation will work best for the purpose of this study. Only the relevant columns will be used. They will be used to create new two new columns, one of age groups, and the other length of employment groups. # In[17]: # make two new dataframes containing only relevant columns dete_resignations_up = dete_resignations[["age", "institute_service", "dissatisfied"]].copy() tafe_resignations_up = tafe_resignations[["age", "institute_service", "dissatisfied"]].copy() # add institute column to each dataset w/ dataset label dete_resignations_up["institute"] = "DETE" tafe_resignations_up["institute"] = "TAFE" # combine dataframes vertically combined = pd.concat([dete_resignations_up, tafe_resignations_up], ignore_index=True) display(combined.head()) display(combined.tail()) # In[18]: # use institute_service to create new service_cat column # function to reassign institute_service values def service_changer(element): if element == "Less than 1 year" or element == "1-2": return "New" elif element == "3-4" or element == "5-6": return "Experienced" elif element == "7-10": return "Established" elif pd.isnull(element): return np.nan else: return "Veteran" combined["service_cat"] = combined["institute_service"].apply(service_changer) display(combined["institute_service"].value_counts(dropna=False)) # In[19]: # use age to create new age_groups column alpha = r"[a-z]*" combined["age_groups"] = combined["age"] combined["age_groups"] = combined["age_groups"].str.replace(" ","-").str.replace(alpha, "", regex=True).str.replace("--", "-") combined["age_groups"] = combined["age_groups"].str.replace("20-", "Under 30").str.replace("21-25", "Under 30").str.replace("26-30", "Under 30") combined["age_groups"] = combined["age_groups"].str.replace("31-35", "31-40").str.replace("36-40", "31-40").str.replace("41-45", "41-50").str.replace("46-50", "41-50") combined["age_groups"] = combined["age_groups"].str.replace("51-55", "51-60").str.replace("56-60", "51-60").str.replace("56-", "51-60").str.replace("61-", "Over 60") display(combined["age_groups"].value_counts(dropna=False)) # In[20]: # heatmap showing missing values in pink import seaborn as sns display(sns.heatmap(combined.isnull(), cbar=False)) # In[21]: # bar chart showing resignation vs length of employment categories pv_service_cat = combined.pivot_table(values="dissatisfied", index="service_cat") ax = pv_service_cat.plot(kind="barh", legend=False, title="Resignations Due to Dissatisfaction, Grouped by Length of Employment", color="darkblue") ax.set_ylabel("") plt.show() # In[22]: # bar chart showing resignation vs age categories pv_age_groups = combined.pivot_table(values="dissatisfied", index="age_groups") ax = pv_age_groups.plot(kind="barh", legend=False, title="Resignations Due to Dissatisfaction, Grouped by Age", color="green") ax.set_ylabel("") plt.show() # In[23]: # bar chart showing resignation vs insitute pv_institute = combined.pivot_table(values="dissatisfied", index="institute") ax = pv_institute.plot(kind="barh", legend=False, title="Resignations Due to Dissatisfaction, Grouped by Institute", color="orange") ax.set_ylabel("") plt.show() # ## Conclusions # While this is only a primary survey, it appears that both age and length of employment are correlated to resignation due to job dissatisfaction. The high rate seen in workers over the age of 60 suggests reconsidering inclusion factors. The data also show an almost double rate of resignation due to job dissatisfaction in the DETE.