import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Build a one-column frame ("ages") holding 50 random integers.
# np.random.randint treats `high` as exclusive, so values range over 10..84.
df = pd.DataFrame(
    data=np.random.randint(10, 85, size=50),
    columns=["ages"],
)
# Print a summary of the frame (index, column dtype, non-null count).
df.info()
# Bare expression: shows the frame when run as a notebook cell.
df
# The age column will be divided into 10 intervals.
# Histogram of the raw ages split into 10 bins (any integer bin count works).
plt.hist(x=df["ages"], bins=10)
plt.show()

# Bar chart with one bar per distinct age; the bar height is how many times
# that age occurs. The counts are computed once and reused for both axes.
age_counts = df["ages"].value_counts()
plt.bar(age_counts.index, age_counts.values)
plt.show()
# Count how many ages fall into each of 10 bins, ordered by bin position
# rather than by count.
temp_series = df["ages"].value_counts(bins=10).sort_index()

fig, ax = plt.subplots(figsize=(15, 5))
# Use the string form of each interval as the categorical x position.
ax.bar(x=temp_series.index.astype(str), height=temp_series.values)

# Set the ticks first before setting the labels, so the label list is
# attached to fixed tick positions.
ax.set_xticks(ax.get_xticks())

# Label each bar with the upper bound of its interval, prefixed with 'xyz'.
# The bound is read from Interval.right instead of slicing the interval's
# string form, which only worked when the bound printed as exactly four
# characters (e.g. "17.4") and truncated anything longer (e.g. "32.275").
ax.set_xticklabels(["xyz " + str(interval.right) for interval in temp_series.index])
plt.show()
# define function to create custom intervals
def age_intervals(age) -> str:
    """Return the label of the custom age bracket that *age* falls into.

    Brackets are upper-inclusive:
    "17 or below" (age <= 17), "17 - 34" (17 < age <= 34),
    "34 - 51" (34 < age <= 51), "51 - 71" (51 < age <= 71),
    and "71+" for everything above 71.
    """
    if age <= 17:
        return "17 or below"
    if age <= 34:
        return "17 - 34"
    # Upper bound is inclusive: an age of exactly 51 belongs here. The
    # previous strict `< 51` let 51 fall through to the "71+" bucket.
    if age <= 51:
        return "34 - 51"
    if age <= 71:
        return "51 - 71"
    return "71+"
# Add an "age_group" column by mapping every age to its custom bracket label.
df["age_group"] = df["ages"].map(age_intervals)
# Bare expression: shows the number of rows per bracket in a notebook cell.
df["age_group"].value_counts()

# Histogram over the bracket labels, which plots the count of each group.
# Rows are sorted by age first so the brackets appear in ascending order
# along the x axis.
plt.hist(x=df.sort_values("ages").loc[:, "age_group"])
plt.show()