Visualizing The Gender Gap In College Degrees

In this project we will be working on visualizing the data from The Department of Education Statistics.

The department releases a data set annually containing the percentage of bachelor's degrees granted to women from 1970 to 2012. The data set is broken up into 17 categories of degrees, with each column as a separate category. The dataset for this project, compiled by Randal Olson, a data scientist at the University of Pennsylvania, can be downloaded here.

Aim

To compare the gender gap in all degree categories using data visualization.

1. Open the dataset and visualize the gap across STEM fields

In [32]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset: a 'Year' column plus one column per degree category
# holding the percentage of bachelor's degrees granted to women.
women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')

# Line colours as RGB tuples, scaled from 0-255 to the 0-1 range matplotlib expects.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']

fig = plt.figure(figsize=(18, 3))

# One chart per STEM category, laid out side by side in a single row.
for sp, category in enumerate(stem_cats):
    ax = fig.add_subplot(1, len(stem_cats), sp + 1)
    ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women', linewidth=3)
    # The men's share is the complement of the women's share.
    ax.plot(women_degrees['Year'], 100 - women_degrees[category], c=cb_orange, label='Men', linewidth=3)

    # Remove the frame around the chart.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    # tick_params expects booleans; the legacy "off" string is truthy and
    # would leave the tick marks visible on current matplotlib releases.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

    # Annotate the lines directly on the first and the last chart only.
    if sp == 0:
        ax.text(2005, 87, 'Men')
        ax.text(2002, 8, 'Women')
        ax.legend(loc='best')
    elif sp == 5:
        ax.text(2005, 62, 'Men')
        ax.text(2001, 35, 'Women')
plt.show()

2. Compare all degrees

In [33]:
fig = plt.figure(figsize=(18, 20))
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']

lib_arts_cats = ['Foreign Languages', 'English', 'Communications and Journalism',
                 'Art and Performance', 'Social Sciences and History']
other_cats = ['Health Professions', 'Public Administration', 'Education',
              'Agriculture', 'Business', 'Architecture']

# One entry per grid column: the categories stacked top to bottom, and the
# (x, y, text) annotations drawn on that column's first chart.
columns = [
    (stem_cats, [(2005, 87, 'Men'), (2002, 8, 'Women')]),
    (lib_arts_cats, [(2005, 73, 'Women'), (2002, 14, 'Men')]),
    (other_cats, [(2005, 87, 'Women'), (2002, 8, 'Men')]),
]
# Annotations drawn on the last chart of every column.
last_chart_labels = [(2005, 62, 'Men'), (2001, 35, 'Women')]

for col, (cats, first_chart_labels) in enumerate(columns):
    for row, cat in enumerate(cats):
        # Subplot positions are numbered row by row in the 6x3 grid.
        ax = fig.add_subplot(6, 3, row * 3 + col + 1)
        ax.plot(women_degrees['Year'], women_degrees[cat], c=cb_dark_blue, label='Women', linewidth=3)
        # The men's share is the complement of the women's share.
        ax.plot(women_degrees['Year'], 100 - women_degrees[cat], c=cb_orange, label='Men', linewidth=3)
        ax.set_title(cat)

        # Remove the frame around the chart.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)

        if row == 0:
            for x, y, text in first_chart_labels:
                ax.text(x, y, text)
            ax.legend(loc='best')
        elif row == len(cats) - 1:
            for x, y, text in last_chart_labels:
                ax.text(x, y, text)
            # Keep the x-axis labels on the bottommost chart of the column.
            ax.tick_params(labelbottom=True)

plt.show()

3. Hide x-axis labels

To hide the x-axis labels, we turn off labelbottom in the call to Axes.tick_params():

ax.tick_params(bottom="off", top="off", left="off", right="off", labelbottom='off')

To declutter the charts, we disable the x-axis labels of all the line charts except the bottommost line chart in each column.

In [34]:
fig = plt.figure(figsize=(18, 20))
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']

lib_arts_cats = ['Foreign Languages', 'English', 'Communications and Journalism',
                 'Art and Performance', 'Social Sciences and History']
other_cats = ['Health Professions', 'Public Administration', 'Education',
              'Agriculture', 'Business', 'Architecture']

# One entry per grid column: the categories stacked top to bottom, and the
# (x, y, text) annotations drawn on that column's first chart.
columns = [
    (stem_cats, [(2005, 87, 'Men'), (2002, 8, 'Women')]),
    (lib_arts_cats, [(2005, 73, 'Women'), (2002, 14, 'Men')]),
    (other_cats, [(2005, 87, 'Women'), (2002, 8, 'Men')]),
]
# Annotations drawn on the last chart of every column.
last_chart_labels = [(2005, 62, 'Men'), (2001, 35, 'Women')]

for col, (cats, first_chart_labels) in enumerate(columns):
    for row, cat in enumerate(cats):
        # Subplot positions are numbered row by row in the 6x3 grid.
        ax = fig.add_subplot(6, 3, row * 3 + col + 1)
        ax.plot(women_degrees['Year'], women_degrees[cat], c=cb_dark_blue, label='Women', linewidth=3)
        # The men's share is the complement of the women's share.
        ax.plot(women_degrees['Year'], 100 - women_degrees[cat], c=cb_orange, label='Men', linewidth=3)

        ax.set_title(cat)
        # tick_params expects booleans; the legacy 'off' string is truthy and
        # would leave ticks and x labels visible on current matplotlib releases.
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=False)

        # Remove the frame around the chart.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)

        if row == 0:
            for x, y, text in first_chart_labels:
                ax.text(x, y, text)
            ax.legend(loc='best')
        elif row == len(cats) - 1:
            for x, y, text in last_chart_labels:
                ax.text(x, y, text)
            # Show the x-axis labels only on the bottommost chart of the column.
            ax.tick_params(labelbottom=True)

plt.show()

4. Setting y-axis labels

For all the plots, we keep only the first and last y-axis tick labels (0 and 100).

In [35]:
fig = plt.figure(figsize=(18, 20))
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']

lib_arts_cats = ['Foreign Languages', 'English', 'Communications and Journalism',
                 'Art and Performance', 'Social Sciences and History']
other_cats = ['Health Professions', 'Public Administration', 'Education',
              'Agriculture', 'Business', 'Architecture']

# One entry per grid column: the categories stacked top to bottom, and the
# (x, y, text) annotations drawn on that column's first chart.
columns = [
    (stem_cats, [(2005, 87, 'Men'), (2002, 8, 'Women')]),
    (lib_arts_cats, [(2005, 73, 'Women'), (2002, 14, 'Men')]),
    (other_cats, [(2005, 87, 'Women'), (2002, 8, 'Men')]),
]
# Annotations drawn on the last chart of every column.
last_chart_labels = [(2005, 62, 'Men'), (2001, 35, 'Women')]

for col, (cats, first_chart_labels) in enumerate(columns):
    for row, cat in enumerate(cats):
        # Subplot positions are numbered row by row in the 6x3 grid.
        ax = fig.add_subplot(6, 3, row * 3 + col + 1)
        ax.plot(women_degrees['Year'], women_degrees[cat], c=cb_dark_blue, label='Women', linewidth=3)
        # The men's share is the complement of the women's share.
        ax.plot(women_degrees['Year'], 100 - women_degrees[cat], c=cb_orange, label='Men', linewidth=3)

        ax.set_title(cat)
        # tick_params expects booleans; the legacy 'off' string is truthy and
        # would leave ticks and x labels visible on current matplotlib releases.
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=False)

        # Remove the frame around the chart.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)

        # Keep only the starting and ending y-axis labels (0 and 100).
        ax.set_yticks([0, 100])

        if row == 0:
            for x, y, text in first_chart_labels:
                ax.text(x, y, text)
            ax.legend(loc='best')
        elif row == len(cats) - 1:
            for x, y, text in last_chart_labels:
                ax.text(x, y, text)
            # Show the x-axis labels only on the bottommost chart of the column.
            ax.tick_params(labelbottom=True)

plt.show()

5. Adding a horizontal line

For all plots, to generate a horizontal line we use Axes.axhline()

In [36]:
fig = plt.figure(figsize=(18, 20))
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']

lib_arts_cats = ['Foreign Languages', 'English', 'Communications and Journalism',
                 'Art and Performance', 'Social Sciences and History']
other_cats = ['Health Professions', 'Public Administration', 'Education',
              'Agriculture', 'Business', 'Architecture']

# One entry per grid column: the categories stacked top to bottom, and the
# (x, y, text) annotations drawn on that column's first chart.
columns = [
    (stem_cats, [(2005, 87, 'Men'), (2002, 8, 'Women')]),
    (lib_arts_cats, [(2005, 73, 'Women'), (2002, 14, 'Men')]),
    (other_cats, [(2005, 87, 'Women'), (2002, 8, 'Men')]),
]
# Annotations drawn on the last chart of every column.
last_chart_labels = [(2005, 62, 'Men'), (2001, 35, 'Women')]
# Light grey used for the 50% reference line.
midline_grey = (171/255, 171/255, 171/255)

for col, (cats, first_chart_labels) in enumerate(columns):
    for row, cat in enumerate(cats):
        # Subplot positions are numbered row by row in the 6x3 grid.
        ax = fig.add_subplot(6, 3, row * 3 + col + 1)
        ax.plot(women_degrees['Year'], women_degrees[cat], c=cb_dark_blue, label='Women', linewidth=3)
        # The men's share is the complement of the women's share.
        ax.plot(women_degrees['Year'], 100 - women_degrees[cat], c=cb_orange, label='Men', linewidth=3)

        ax.set_title(cat)
        # tick_params expects booleans; the legacy 'off' string is truthy and
        # would leave ticks and x labels visible on current matplotlib releases.
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=False)

        # Remove the frame around the chart.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)

        # Keep only the starting and ending y-axis labels (0 and 100).
        ax.set_yticks([0, 100])

        # Faint horizontal line at y=50, where both shares are equal.
        ax.axhline(50, c=midline_grey, alpha=0.3)

        if row == 0:
            for x, y, text in first_chart_labels:
                ax.text(x, y, text)
            ax.legend(loc='best')
        elif row == len(cats) - 1:
            for x, y, text in last_chart_labels:
                ax.text(x, y, text)
            # Show the x-axis labels only on the bottommost chart of the column.
            ax.tick_params(labelbottom=True)

plt.show()

6. Export the plot as a .png file

In [37]:
fig = plt.figure(figsize=(18, 20))
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']

lib_arts_cats = ['Foreign Languages', 'English', 'Communications and Journalism',
                 'Art and Performance', 'Social Sciences and History']
other_cats = ['Health Professions', 'Public Administration', 'Education',
              'Agriculture', 'Business', 'Architecture']

# One entry per grid column: the categories stacked top to bottom, the
# (x, y, text) annotations drawn on that column's first chart, and whether
# that first chart also gets a legend.
# NOTE(review): the first column has no legend while the other two do; this
# mirrors the existing output. Confirm whether the remaining legends should
# also be dropped, since the text annotations already identify the lines.
columns = [
    (stem_cats, [(2005, 87, 'Men'), (2002, 8, 'Women')], False),
    (lib_arts_cats, [(2005, 73, 'Women'), (2002, 14, 'Men')], True),
    (other_cats, [(2005, 87, 'Women'), (2002, 8, 'Men')], True),
]
# Annotations drawn on the last chart of every column.
last_chart_labels = [(2005, 62, 'Men'), (2001, 35, 'Women')]
# Light grey used for the 50% reference line.
midline_grey = (171/255, 171/255, 171/255)

for col, (cats, first_chart_labels, show_legend) in enumerate(columns):
    for row, cat in enumerate(cats):
        # Subplot positions are numbered row by row in the 6x3 grid.
        ax = fig.add_subplot(6, 3, row * 3 + col + 1)
        ax.plot(women_degrees['Year'], women_degrees[cat], c=cb_dark_blue, label='Women', linewidth=3)
        # The men's share is the complement of the women's share.
        ax.plot(women_degrees['Year'], 100 - women_degrees[cat], c=cb_orange, label='Men', linewidth=3)

        ax.set_title(cat)
        # tick_params expects booleans; the legacy 'off' string is truthy and
        # would leave ticks and x labels visible on current matplotlib releases.
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=False)

        # Remove the frame around the chart.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)

        # Keep only the starting and ending y-axis labels (0 and 100).
        ax.set_yticks([0, 100])

        # Faint horizontal line at y=50, where both shares are equal.
        ax.axhline(50, c=midline_grey, alpha=0.3)

        if row == 0:
            for x, y, text in first_chart_labels:
                ax.text(x, y, text)
            if show_legend:
                ax.legend(loc='best')
        elif row == len(cats) - 1:
            for x, y, text in last_chart_labels:
                ax.text(x, y, text)
            # Show the x-axis labels only on the bottommost chart of the column.
            ax.tick_params(labelbottom=True)

# Write the figure to disk before displaying it.
fig.savefig("gender_degrees.png")
plt.show()