Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Set up the data

In [2]:
# fix the seed so that "Restart Kernel -> Run All" reproduces the same ages every time
SEED = 42
np.random.seed(SEED)

# create a dataframe with one column "ages" holding 50 randomly generated integers;
# `high` is exclusive, so the values range from 10 to 84 inclusive
df = pd.DataFrame({"ages" : np.random.randint(low = 10, high = 85, size = 50)})

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   ages    50 non-null     int32
dtypes: int32(1)
memory usage: 328.0 bytes

Dataframe

In [3]:
# show the dataframe as the cell output; it only holds the 50 rows generated above
df
Out[3]:
ages
0 50
1 73
2 21
3 61
4 57
5 75
6 82
7 47
8 69
9 19
10 27
11 37
12 16
13 36
14 81
15 44
16 58
17 45
18 35
19 41
20 30
21 62
22 34
23 44
24 17
25 50
26 82
27 50
28 21
29 65
30 30
31 37
32 23
33 63
34 32
35 68
36 35
37 24
38 77
39 13
40 11
41 61
42 64
43 42
44 13
45 58
46 83
47 77
48 36
49 49

Option 1. Create a histogram of ages with bins set to 10

The age column will be divided into 10 intervals

In [4]:
# draw the distribution of ages on an explicit Axes so the figure can be labelled
fig, ax = plt.subplots()

ax.hist(df["ages"], bins = 10) # you can customize bins using any integer

# a figure should be readable on its own, so name both axes and the plot
ax.set_xlabel("age")
ax.set_ylabel("count")
ax.set_title("Histogram of ages (10 bins)")

plt.show()

Option 2A. Series.value_counts() without bins argument

In [5]:
# count how often each distinct age occurs; compute it once and reuse it for x and height
age_counts = df["ages"].value_counts()

fig, ax = plt.subplots()

# one bar per distinct age, positioned at that age on the x-axis
ax.bar(x = age_counts.index,
       height = age_counts.values)

ax.set_xlabel("age")
ax.set_ylabel("count")
ax.set_title("Occurrences of each age")

plt.show()

Option 2B. Series.value_counts() with bins argument and custom xticklabels defined

In [ ]:
# count how many ages fall into each of 10 intervals, ordered from the lowest interval to the highest
temp_series = df["ages"].value_counts(bins = 10).sort_index()

fig, ax = plt.subplots(figsize = (15, 5))

# the interval index is cast to strings so that every interval becomes one categorical bar
ax.bar(x = temp_series.index.astype(str),
       height = temp_series.values)

# set the ticks first before setting the labels
ax.set_xticks(ax.get_xticks())

# set the labels as the upper bound of each interval prefixed with 'xyz';
# the bound is read from Interval.right instead of slicing the interval's string form,
# because slicing returns the wrong characters when the bound is not exactly 4 characters long
ax.set_xticklabels([f"xyz {interval.right}" for interval in temp_series.index])

ax.set_xlabel("upper bound of age interval")
ax.set_ylabel("count")

plt.show()

Option 3. Divide the age column into custom intervals

In [7]:
# define function to create custom intervals
def age_intervals(age):
    """Return the label of the custom age group that `age` belongs to.

    Every group includes its upper bound:
    "17 or below" (age <= 17), "17 - 34" (17 < age <= 34),
    "34 - 51" (34 < age <= 51), "51 - 71" (51 < age <= 71)
    and "71+" for everything else.
    """
    # the checks run in ascending order, so each one only needs its upper bound
    if age <= 17:
        return "17 or below"
    if age <= 34:
        return "17 - 34"
    # the upper bound is inclusive: an age of exactly 51 belongs to this group
    # and must not fall through to the "71+" catch-all
    if age <= 51:
        return "34 - 51"
    if age <= 71:
        return "51 - 71"
    return "71+"

# label every row with its age group by passing each age through age_intervals
df["age_group"] = df["ages"].map(age_intervals)

# number of rows per age group, most frequent group first
df["age_group"].value_counts()
Out[7]:
34 - 51        16
51 - 71        11
17 - 34        10
71+             8
17 or below     5
Name: age_group, dtype: int64
In [8]:
# a histogram of the age_group column counts the rows of each group by itself;
# sorting by "ages" first makes the groups appear on the x-axis from youngest to oldest
ordered_groups = df.sort_values(by = "ages")["age_group"]

# use one bin per group, centred on the group's tick position (0, 1, 2, ...);
# with the default of 10 bins the bars are unevenly spaced and do not line up with the labels
group_bins = np.arange(ordered_groups.nunique() + 1) - 0.5

fig, ax = plt.subplots()

ax.hist(x = ordered_groups, bins = group_bins, rwidth = 0.9)

ax.set_xlabel("age group")
ax.set_ylabel("count")
ax.set_title("Count of ages per age group")

plt.show()