#!/usr/bin/env python
# coding: utf-8

# # Visual exploratory data analysis
# > A Summary of lecture "Analyzing Police Activity with pandas", via datacamp
#
# - toc: true
# - badges: true
# - comments: true
# - author: Chanseok Kang
# - categories: [Python, Datacamp, Data_Science, Visualization]
# - image: images/k-zones-plot.png

# In[1]:


# Import the pandas library as pd
import pandas as pd

# Read 'police.csv' into a DataFrame named ri
ri = pd.read_csv('./dataset/police.csv')


# ## Does time of day affect arrest rate?
#
#
# ### Calculating the hourly arrest rate
#
# When a police officer stops a driver, a small percentage of those stops ends in an arrest. This is known as the arrest rate. In this exercise, you'll find out whether the arrest rate varies by time of day.
#
# First, you'll calculate the arrest rate across all stops in the ri DataFrame. Then, you'll calculate the hourly arrest rate by using the hour attribute of the index. The hour ranges from 0 to 23, in which:
#
# - 0 = midnight
# - 12 = noon
# - 23 = 11 PM

# ### Preprocess

# In[2]:


# Drop stops whose arrest outcome is missing BEFORE casting to bool.
# A missing value (NaN) is truthy, so `astype(bool)` would silently turn it
# into True and count every stop with no recorded outcome as an arrest,
# inflating the rates computed below. This is a no-op when the column has
# no missing values. Note that all later cells therefore operate only on
# stops with a recorded outcome.
ri.dropna(subset=['is_arrested'], inplace=True)
ri['is_arrested'] = ri['is_arrested'].astype(bool)

# Join the separate date and time strings with a space and parse the result
# into a single timestamp column.
combined = ri.stop_date.str.cat(ri.stop_time, sep=' ')
ri['stop_datetime'] = pd.to_datetime(combined)

# Index by timestamp so that `.index.hour` and `.resample()` work later on.
ri.set_index('stop_datetime', inplace=True)


# In[3]:


# Calculate the overall arrest rate (mean of a boolean column = share of True)
print(ri.is_arrested.mean())

# Calculate the hourly arrest rate once, keep it for plotting, and print it
hourly_arrest_rate = ri.groupby(ri.index.hour).is_arrested.mean()
print(hourly_arrest_rate)


# ### Plotting the hourly arrest rate
# In this exercise, you'll create a line plot from the hourly_arrest_rate object. A line plot is appropriate in this case because you're showing how a quantity changes over time.
#
# This plot should help you to spot some trends that may not have been obvious when examining the raw numbers!
# In[4]:


import matplotlib.pyplot as plt

# Draw the hourly arrest rate as a line plot
hourly_arrest_rate.plot()

# Title the figure and label both axes
plt.title('Arrest Rate by Time of Day')
plt.xlabel('Hour')
plt.ylabel('Arrest Rate')


# ## Are drug-related stops on the rise?
#
# ### Plotting drug-related stops
#
# In a small portion of traffic stops, drugs are found in the vehicle during a search. In this exercise, you'll assess whether these drug-related stops are becoming more common over time.
#
# The Boolean column drugs_related_stop indicates whether drugs were found during a given stop. You'll calculate the annual drug rate by resampling this column, and then you'll use a line plot to visualize how the rate has changed over time.

# In[5]:


# Resample the drug-related flag by year and take the mean to get the annual
# rate; compute it once, then print and plot the saved Series.
# NOTE(review): recent pandas releases appear to deprecate the 'A' alias in
# favour of 'YE' -- confirm against the pandas version in use before changing.
annual_drug_rate = ri['drugs_related_stop'].resample('A').mean()
print(annual_drug_rate)

# Create a line plot of 'annual_drug_rate'
annual_drug_rate.plot()


# ### Comparing drug and search rates
# As you saw in the last exercise, the rate of drug-related stops increased significantly between 2005 and 2015. You might hypothesize that the rate of vehicle searches was also increasing, which would have led to an increase in drug-related stops even if more drivers were not carrying drugs.
#
# You can test this hypothesis by calculating the annual search rate, and then plotting it against the annual drug rate. If the hypothesis is true, then you'll see both rates increasing over time.

# In[6]:


# Annual search rate, computed the same way as the annual drug rate
annual_search_rate = ri['search_conducted'].resample('A').mean()

# Place the two annual Series side by side as columns of one DataFrame
annual = pd.concat([annual_drug_rate, annual_search_rate], axis=1)

# One subplot per column
annual.plot(subplots=True)


# ## What violations are caught in each district?
#
# ### Tallying violations by district
# The state of Rhode Island is broken into six police districts, also known as zones. How do the zones compare in terms of what violations are caught by police?
#
# In this exercise, you'll create a frequency table to determine how many violations of each type took place in each of the six zones. Then, you'll filter the table to focus on the "K" zones, which you'll examine further in the next exercise.
#
#

# In[7]:


# Build the district-by-violation frequency table once, then print it
all_zones = pd.crosstab(ri['district'], ri['violation'])
print(all_zones)

# Keep only rows 'Zone K1' through 'Zone K3' (label slice, both ends
# inclusive) and print the smaller table
k_zones = all_zones.loc['Zone K1':'Zone K3']
print(k_zones)


# ### Plotting violations by district
# Now that you've created a frequency table focused on the "K" zones, you'll visualize the data to help you compare what violations are being caught in each zone.
#
# First you'll create a bar plot, which is an appropriate plot type since you're comparing categorical data. Then you'll create a stacked bar plot in order to get a slightly different look at the data. Which plot do you find to be more insightful?

# In[10]:


# Grouped bar plot of 'k_zones', saved to the images directory
k_zones.plot.bar()
plt.savefig('../images/k-zones-plot.png')


# In[11]:


# Same data as a stacked bar plot
k_zones.plot.bar(stacked=True)


# ## How long might you be stopped for a violation?
#
# ### Converting stop durations to numbers
# In the traffic stops dataset, the stop_duration column tells you approximately how long the driver was detained by the officer. Unfortunately, the durations are stored as strings, such as '0-15 Min'. How can you make this data easier to analyze?
#
# In this exercise, you'll convert the stop durations to integers.
# Because the precise durations are not available, you'll have to estimate the numbers using reasonable values:
#
# - Convert '0-15 Min' to 8
# - Convert '16-30 Min' to 23
# - Convert '30+ Min' to 45

# In[40]:


# Show which duration labels occur in 'stop_duration'
print(ri['stop_duration'].unique())

# Estimated minutes for each duration label
mapping = {
    '0-15 Min': 8,
    '16-30 Min': 23,
    '30+ Min': 45,
}

# Convert the 'stop_duration' strings to integers using the 'mapping'
ri['stop_minutes'] = ri['stop_duration'].map(mapping)

# Show the values that ended up in 'stop_minutes'
print(ri['stop_minutes'].unique())


# ### Plotting stop length
# If you were stopped for a particular violation, how long might you expect to be detained?
#
# In this exercise, you'll visualize the average length of time drivers are stopped for each type of violation. Rather than using the violation column in this exercise, you'll use violation_raw since it contains more detailed descriptions of the violations.

# In[43]:


# Mean 'stop_minutes' for each value of 'violation_raw'; compute once,
# then print the saved Series
stop_length = ri.groupby('violation_raw')['stop_minutes'].mean()
print(stop_length)

# Sort by value and draw a horizontal bar plot
stop_length.sort_values().plot.barh()