#!/usr/bin/env python # coding: utf-8 # dsi logo # # # Pandas 101 - Series & DataFrames # # Get to know the basic operations on series and dataframes. # # For more details check the [User Guide](https://pandas.pydata.org/docs/user_guide/index.html) or the [API reference](https://pandas.pydata.org/docs/reference/index.html) from https://pandas.pydata.org # # # In[ ]: import pandas as pd # *** # # Series # # # ### Creation # # A `Series` object can be created by `pd.Series()` with a list of values as parameter. You can get the values by the `values` attribute of the `Series`, and the index values by the `index.values` attribute. An index can be a number o a string and it can be passed by the named parameter `index` of the `pd.Series()` method. # In[ ]: series1 = pd.Series([1,2,3,4]) print("series1 values: " + str(series1.values)) print("series1 indexes: " + str(series1.index.values)) series2 = pd.Series([1,2,3,4], index = ['a', 'b', 'c', 'd']) print("series2 values: " + str(series2.values)) print("series2 indexes: " + str(series2.index.values)) # *** # #
# Exercise: Create a Series # # Create a series of values with the five largest countries on earth with the name of the country as index # #
# # # | Country | Size ($km^2$) | # | --------------- | ------------ | # | Russia | 17,098,246 | # | Canada | 9,984,670 | # | China | 9,596,961 | # | United States | 9,525,067 | # | Brazil | 8,515,767 | # # # *** # # # In[ ]: # ### Selecting and Filtering Entries # # Single entries of a `Series` can be selected by the index or the index number. # # A filter can be used to select elements of the `Series` that match a given criteria. The result of the filtered `Series` are `Series` again. # In[ ]: # In[ ]: byIndex = series2['c'] print("selecting by index: "+ str(byIndex)) byIndexNumber = series2[3] print("selecting by index number: "+ str(byIndexNumber)) byFilter = series2[series2 > 2] print("Filtering values:\n" + str(byFilter)) # *** # #
# Exercise: Select & Filter a Series # # 1. Select the value for Brazil # # 2. Create a filter that select all countries that have more than $9,600,000~km^2$ #
# # # *** # # # In[ ]: # *** # # # DataFrame # # ### Creation # # A `DataFrame` can be created in several ways. One of the simplest ways to create one from scratch is the pass a `dict` object to `pd.DataFrame()`. In Jupypter the `display()` function can be used to render a `DataFrame` as a pretty table. To print information about the metadata of your `DataFrame` you can use the `.info()` on your dataset. The get only the first or last rows you can use `.head()`and `tail()`, which is especially useful on large datasets. # In[ ]: df = pd.DataFrame({ "Column1": [1, 4, 8, 7, 9], "Column2": ['a', 'b', 'c', 'd', 'f'], "Column3": [1.1, 23.5, 45.6, 32.1234, 89.453], "Column4": [True, False, True, False, True] }) print("Essential Information on DataFrame:\n") df.info() print("\n\nFull DataFrame:") display(df) print("\n\nHead of DataFrame:") display(df.head(2)) # In[ ]: # *** # #
# Exercise: Create a DataFrame # # Create a DataFrame for the land and water size of the largest countries. What are the datatypes of your columns # #
# # | Country | Land | Water | # | -------------- | ----------- | -------- | # | Russia | 16,377,742 | 720,500 | # | Canada | 9,093,507 | 891,163 | # | China | 9,326,410 | 270,550 | # | United States | 9,147,593 | 377,424 | # | Brazil | 8,460,415 | 55,352 | # # *** # # # In[ ]: # ### Selecting data by column # # A single column an be selected by `df['column_name']` (the result is a `Series` object), multiple columns by `df[['column_name1', 'column_name2']]` (the result is a `DataFrame` object) # In[ ]: display(df['Column1']) display(df[['Column2', 'Column3']]) # *** # #
# Exercise: Select columns of DataFrame # # Use your country dataframe. # 1. Select all country names as a series # 2. Create a new DataFrame that contains only two columns, the country name and the size of the land. #
# # *** # # # In[ ]: # ### Selecting data by row # # Every row has an index. This index can be used to select one or multiple rows with the `.loc` function. # # A filter can be used to select elements of the `DataFrame` that match a given criteria. The result of the filtered `DataFrame` are `DataFrame` again. # # In[ ]: print("A single row as Series object:") print(df.loc[2]) print("\n\nMultiple rows as DataFrame object:") display(df.loc[[2, 4]]) print("\n\nMultiple rows (as range) as DataFrame object: \n") display(df.loc[2:5]) print("\n\nRows filtered (Column3 > 42):") filtered = df[df['Column3'] < 42] display(filtered) # *** # #
# Exercise: Filter rows of DataFrame # # 1. Filter your country dataframe to only display rows that have a water size of $500000~km^2$ # # 2. Filter your country dataframe to only display rows that have a land size that more than 20 times larger than the water size #
# # *** # # # ### Adding / Removing a Series to a DataFrame # # A `Series` can easily be added to a `DataFrame` as a column with the assignment operation. For each index in the `DataFrame` the value at the index in the `Series` is assigned. If an index is not found the cell will be empty. # # With the `.drop` function a column (axis=1) or a row (axis=0) can be removed from a `DataFrame` # In[ ]: df['myserie'] = series1 display(df) df.drop('myserie', axis=1, inplace=True) display(df) # In[ ]: df.set_index('Column3') # *** # #
# Exercise: Add a Series to a DataFrame # # 1. Add your Series with the total size of the country as a column named 'Total' to your DataFrame. # # Note: Your DataFrame must have the same index as your Series. Use df.set_index('columnname') to transform a regular column to an index. (This can reverted by the .reset_index() function) # # 2. Create a new column named 'LandAndWater' that contains the sum of Land and Water. Is it equal to the 'Total' column? #
# # *** # # # In[ ]: