#!/usr/bin/env python
# coding: utf-8

# # Benford's Law
# ## Purpose
# To take an iterable object (assumed to contain numbers) and plot the
# frequency of their leading digits. Based on
# [Benford's Law](https://en.wikipedia.org/wiki/Benford%27s_law) (also called
# the first-digit law), if it is a "natural dataset," we should see the
# following distribution of leading digits:
#
# | d | P(d)  |
# |---|------:|
# | 1 | 30.1% |
# | 2 | 17.6% |
# | 3 | 12.5% |
# | 4 |  9.7% |
# | 5 |  7.9% |
# | 6 |  6.7% |
# | 7 |  5.8% |
# | 8 |  5.1% |
# | 9 |  4.6% |
#
# ## Application
# In data science, this pattern is used to detect fraud, primarily for tax
# purposes. It can also be used to detect
# [deepfakes](https://en.wikipedia.org/wiki/Deepfake) or altered images.

# In[1]:


import re
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:


world = pd.read_csv('world_population_data.csv')
world.head()


# In[3]:


# Matches the first significant (non-zero) digit of a value's text form.
# Leading whitespace, signs, '$', zeros and decimal/thousands separators are
# skipped, so '-$0.042' yields '4'.  Text with no such digit in that position
# ('nan', 'inf', 'None', '0', '') does not match at all.
_LEADING_DIGIT_RE = re.compile(r"^[\s$+\-]*[0.,]*([1-9])")

# Fixed x-axis categories: every digit gets a bar even when its count is zero.
_DIGITS = [str(d) for d in range(1, 10)]


def _leading_digit(value):
    """Return the first significant digit of *value* as a string, or None."""
    match = _LEADING_DIGIT_RE.match(str(value))
    return match.group(1) if match else None


def digit_widget(list):
    """Plot the relative frequency of the leading digits 1-9 in *list*.

    Each element is converted with ``str()`` and its first significant digit
    is extracted; signs, a ``$`` prefix and leading zeros are ignored, so
    ``-42``, ``'$4.20'`` and ``0.042`` all count towards digit 4.  Elements
    with no significant digit (zero, NaN, infinities, empty strings, other
    non-numeric text) are skipped.

    Args:
        list: Iterable of numbers or numeric strings, e.g. a pandas Series.
            The name shadows the builtin; it is kept so existing keyword
            callers continue to work.

    Returns:
        None.  The bar chart is displayed with ``plt.show()``.
    """
    digits = (_leading_digit(value) for value in list)
    counts = Counter(digit for digit in digits if digit is not None)
    total = sum(counts.values())

    # Bar heights are true proportions (they sum to 1).  A 9-bin density
    # histogram over the category positions divides by the bin width instead,
    # which overstates every bar and misaligns the bins when a digit is absent.
    heights = [counts[digit] / total if total else 0.0 for digit in _DIGITS]

    _, ax = plt.subplots()
    ax.bar(_DIGITS, heights)
    ax.set_yticks([0.10, 0.20, 0.30])
    ax.set_xlabel('Leading digit')
    ax.set_ylabel('Relative frequency')
    plt.show()


# In[4]:


digit_widget(world['Population_2020'])


# In[5]:


digit_widget(world['Migrants'])


# In[6]:


digit_widget(world['Net_Change'])