# Examining the distribution of shared data elements across forms.
# Data source: https://repository.usaspending.gov/cder_library/authorized/report/common
import xml.etree.ElementTree as et
import matplotlib.pyplot as plt
import seaborn as sns
# Data downloaded from https://repository.usaspending.gov/cder_library/authorized/report/common
# Open in binary mode so ElementTree decodes the bytes using the encoding
# declared in the XML document itself, rather than whatever default text
# encoding the platform would apply in 'r' mode.
with open('/content/CDER_Library_DataElements_202001261038.xml', 'rb') as fp:
    tree = et.parse(fp)
# Report how many direct children the XML root has; each child is
# treated as one data element.
d = tree.getroot()
element_total = len(d)
print(f"There are {element_total} data elements that appear on 2 or more forms.")
# Output: There are 4278 data elements that appear on 2 or more forms.
def _count_forms(element):
    """Return the number of <form> descendants found under *element*."""
    return len(element.findall(".//form"))


# One entry per data element: how many <form> nodes it contains.
common_list = [_count_forms(elem) for elem in d]
# Plotting the histogram
# Note: the same histogram is drawn twice -- once with linear counts and once
# with log-scaled counts -- to demonstrate that the long tail has some texture.
# In both panels the x-axis is the number of forms an element is shared on;
# only the y-axis (count of data elements) differs in scale.
# The histogram range is (1.5, 19.5); values outside it are not drawn.
# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases; it is
# kept here so the notebook still runs on the seaborn version it was written for.
fig, axs = plt.subplots(ncols=2)
axs[0].set_ylabel("Number of data elements")
axs[1].set_ylabel("Number of data elements (log scale)")
sns.distplot(
    common_list,
    hist=True,
    hist_kws={"range": (1.5, 19.5), "log": False},
    kde=False,
    norm_hist=False,
    axlabel="Number shared forms",
    ax=axs[0],
)
sns.distplot(
    common_list,
    hist=True,
    hist_kws={"range": (1.5, 19.5), "log": True},
    kde=False,
    norm_hist=False,
    # The x-axis is not log-transformed; `log=True` only affects the counts.
    axlabel="Number shared forms",
    ax=axs[1],
)
# Output: <matplotlib.axes._subplots.AxesSubplot at 0x7fe7085f4cc0>