#!/usr/bin/env python
# coding: utf-8
#
#
# # Pandas 101 - Series & DataFrames
#
# Get to know the basic operations on series and dataframes.
#
# For more details check the [User Guide](https://pandas.pydata.org/docs/user_guide/index.html) or the [API reference](https://pandas.pydata.org/docs/reference/index.html) from https://pandas.pydata.org
#
#
# In[ ]:
import pandas as pd
# ***
# # Series
#
#
# ### Creation
#
# A `Series` object can be created by `pd.Series()` with a list of values as parameter. You can get the values by the `values` attribute of the `Series`, and the index values by the `index.values` attribute. An index can be a number o a string and it can be passed by the named parameter `index` of the `pd.Series()` method.
# In[ ]:
series1 = pd.Series([1,2,3,4])
print("series1 values: " + str(series1.values))
print("series1 indexes: " + str(series1.index.values))
series2 = pd.Series([1,2,3,4], index = ['a', 'b', 'c', 'd'])
print("series2 values: " + str(series2.values))
print("series2 indexes: " + str(series2.index.values))
# ***
#
#
# Exercise: Create a Series
#
# Create a series of values with the five largest countries on earth with the name of the country as index
#
#
#
#
# | Country | Size ($km^2$) |
# | --------------- | ------------ |
# | Russia | 17,098,246 |
# | Canada | 9,984,670 |
# | China | 9,596,961 |
# | United States | 9,525,067 |
# | Brazil | 8,515,767 |
#
#
# ***
#
#
# In[ ]:
# ### Selecting and Filtering Entries
#
# Single entries of a `Series` can be selected by the index or the index number.
#
# A filter can be used to select elements of the `Series` that match a given criteria. The result of the filtered `Series` are `Series` again.
# In[ ]:
# In[ ]:
byIndex = series2['c']
print("selecting by index: "+ str(byIndex))
byIndexNumber = series2[3]
print("selecting by index number: "+ str(byIndexNumber))
byFilter = series2[series2 > 2]
print("Filtering values:\n" + str(byFilter))
# ***
#
#
# Exercise: Select & Filter a Series
#
# 1. Select the value for Brazil
#
# 2. Create a filter that select all countries that have more than $9,600,000~km^2$
#
#
#
# ***
#
#
# In[ ]:
# ***
#
# # DataFrame
#
# ### Creation
#
# A `DataFrame` can be created in several ways. One of the simplest ways to create one from scratch is the pass a `dict` object to `pd.DataFrame()`. In Jupypter the `display()` function can be used to render a `DataFrame` as a pretty table. To print information about the metadata of your `DataFrame` you can use the `.info()` on your dataset. The get only the first or last rows you can use `.head()`and `tail()`, which is especially useful on large datasets.
# In[ ]:
df = pd.DataFrame({
"Column1": [1, 4, 8, 7, 9],
"Column2": ['a', 'b', 'c', 'd', 'f'],
"Column3": [1.1, 23.5, 45.6, 32.1234, 89.453],
"Column4": [True, False, True, False, True]
})
print("Essential Information on DataFrame:\n")
df.info()
print("\n\nFull DataFrame:")
display(df)
print("\n\nHead of DataFrame:")
display(df.head(2))
# In[ ]:
# ***
#
#
# Exercise: Create a DataFrame
#
# Create a DataFrame for the land and water size of the largest countries. What are the datatypes of your columns
#
#
#
# | Country | Land | Water |
# | -------------- | ----------- | -------- |
# | Russia | 16,377,742 | 720,500 |
# | Canada | 9,093,507 | 891,163 |
# | China | 9,326,410 | 270,550 |
# | United States | 9,147,593 | 377,424 |
# | Brazil | 8,460,415 | 55,352 |
#
# ***
#
#
# In[ ]:
# ### Selecting data by column
#
# A single column an be selected by `df['column_name']` (the result is a `Series` object), multiple columns by `df[['column_name1', 'column_name2']]` (the result is a `DataFrame` object)
# In[ ]:
display(df['Column1'])
display(df[['Column2', 'Column3']])
# ***
#
#
# Exercise: Select columns of DataFrame
#
# Use your country dataframe.
# 1. Select all country names as a series
# 2. Create a new DataFrame that contains only two columns, the country name and the size of the land.
#
#
# ***
#
#
# In[ ]:
# ### Selecting data by row
#
# Every row has an index. This index can be used to select one or multiple rows with the `.loc` function.
#
# A filter can be used to select elements of the `DataFrame` that match a given criteria. The result of the filtered `DataFrame` are `DataFrame` again.
#
# In[ ]:
print("A single row as Series object:")
print(df.loc[2])
print("\n\nMultiple rows as DataFrame object:")
display(df.loc[[2, 4]])
print("\n\nMultiple rows (as range) as DataFrame object: \n")
display(df.loc[2:5])
print("\n\nRows filtered (Column3 > 42):")
filtered = df[df['Column3'] < 42]
display(filtered)
# ***
#
#
# Exercise: Filter rows of DataFrame
#
# 1. Filter your country dataframe to only display rows that have a water size of $500000~km^2$
#
# 2. Filter your country dataframe to only display rows that have a land size that more than 20 times larger than the water size
#
#
# ***
#
#
# ### Adding / Removing a Series to a DataFrame
#
# A `Series` can easily be added to a `DataFrame` as a column with the assignment operation. For each index in the `DataFrame` the value at the index in the `Series` is assigned. If an index is not found the cell will be empty.
#
# With the `.drop` function a column (axis=1) or a row (axis=0) can be removed from a `DataFrame`
# In[ ]:
df['myserie'] = series1
display(df)
df.drop('myserie', axis=1, inplace=True)
display(df)
# In[ ]:
df.set_index('Column3')
# ***
#
#
# Exercise: Add a Series to a DataFrame
#
# 1. Add your Series with the total size of the country as a column named 'Total' to your DataFrame.
#
# Note: Your DataFrame must have the same index as your Series. Use df.set_index('columnname') to transform a regular column to an index. (This can reverted by the .reset_index() function)
#
# 2. Create a new column named 'LandAndWater' that contains the sum of Land and Water. Is it equal to the 'Total' column?
#
#
# ***
#
#
# In[ ]: