#!/usr/bin/env python # coding: utf-8 # # Module 3 # # ## Video 15: Filtering and Modifying DataFrames # **Python for the Energy Industry** # # ## Filtering DataFrames # # A DataFrame can be filtered on a condition in a similar way to filtering numpy arrays. We will use the OPEC data example again: # In[1]: import pandas as pd import numpy as np OPEC_df = pd.DataFrame({ 'Country': ['Algeria','Angola','Equatorial Guinea','Gabon','Iran','Iraq','Kuwait','Libya','Nigeria','Republic of the Congo','Saudi Arabia','UAE','Venezuela'], 'Region': ['North Africa','Southern Africa','Central Africa','Central Africa','Middle East','Middle East','Middle East','North Africa','West Africa','Central Africa','Middle East','Middle East','South America'], 'Population': [42228408,30809787,1308975,2119275,81800188,38433600,4137312,6678559,195874685,5125821,33702756,9630959,28887118], 'Oil Production': [1348361,1769615,np.nan,210820,3990956,4451516,2923825,384686,1999885,260000,10460710,3106077,2276967], 'Proven Reserves': [12.2e9,8.423e9,np.nan,2e9,157.53e9,143.069e9,101.5e9,48.363e9,37.07e9,1.6e9,266.578e9,97.8e9,299.953e9] }) # We can do boolean operations on a column of a DataFrame to obtain an array of boolean values. This boolean array can be used to filter a DataFrame - all entries corresponding to a False value are dropped from the DataFrame. # In[4]: # Get Middle Eastern countries data only is_ME = (OPEC_df['Region'] == "Middle East") OPEC_df[is_ME] # Let's say we want to filter using some other condition (e.g. countries with production over 2M bbl/day): # In[5]: prod_over_2M = (OPEC_df["Oil Production"] > 2e6) OPEC_df[prod_over_2M] # The logical_and function we saw before can be used to filter on multiple conditions. # In[6]: OPEC_df[np.logical_and(is_ME,prod_over_2M)] # ## Modifying DataFrames # # ### Adding to DataFrames # # We can use the 'append' function to add a new entry to DataFrame, by appending a dictionary with keys corresponding to the column names. # In[7]: # Include countries with lapsed OPEC membership OPEC_df = OPEC_df.append({'Country':'Ecuador', 'Region':'South America', 'Population': 17084358, 'Oil Production': 548421, 'Proven Reserves': 8.273e9}, ignore_index=True) OPEC_df = OPEC_df.append({'Country':'Indonesia', 'Region':'Southeast Asia', 'Population': 267670543, 'Oil Production': 833667, 'Proven Reserves': 3.6925e9}, ignore_index=True) OPEC_df = OPEC_df.append({'Country':'Qatar', 'Region':'Middle East', 'Population': 2781682, 'Oil Production': 1522902, 'Proven Reserves': 25.244e9}, ignore_index=True) OPEC_df # New columns in a DataFrame can be created by setting a column of that name to a list/array of the right length: # In[8]: OPEC_df['Membership Active'] = [True]*13 + [False]*3 OPEC_df # New columns can also be created from operations on existing columns: # In[9]: OPEC_df['Production per cap'] = OPEC_df['Oil Production'] / OPEC_df['Population'] OPEC_df # ### Sorting DataFrames # # We can sort a DataFrame by the values in a given column: # In[10]: OPEC_df.sort_values(by='Oil Production', ascending=False) # ### Grouping # # We can also group entries by a common property. In this dataset, Region is the natural grouping. We have to specify what operation is applied to numeric data, e.g. adding or averaging the values for a given group. # In[11]: OPEC_df.groupby('Region').sum() # In[12]: OPEC_df.groupby('Region').mean() # ### Exercise # # Add to the DataFrame a column for proven reserves per capita. Output: # - The DataFrame in descending order by proven reserves per capita # - The average value of proven reserves per capita for each region # In[ ]: