#!/usr/bin/env python
# coding: utf-8

# # Introduction to Python for Data Sciences
#
# Franck Iutzeler
#
# ## Chap. 3 - Data Handling with Pandas

# # 1- Pandas
#
# In a previous chapter, we explored some features of NumPy and notably its
# arrays. Here we will take a look at the data structures provided by the
# **Pandas** library.
#
# Pandas is a newer package built on top of NumPy which provides an efficient
# implementation of **DataFrames**. DataFrames are essentially
# multidimensional arrays with attached row and column labels, and often with
# heterogeneous types and/or missing data. As well as offering a convenient
# storage interface for labeled data, Pandas implements a number of powerful
# data operations.
#
# Just as we generally import NumPy under the alias ``np``, we will import
# Pandas under the alias ``pd``.

# In[1]:


import pandas as pd
import numpy as np


# ## Pandas Series
#
# A Pandas `Series` is a one-dimensional array of indexed data.

# In[2]:


data = pd.Series([0.25, 0.5, 0.75, 1.0])
data


# The contents can be accessed in the same way as for NumPy arrays, with the
# difference that when more than one value is selected, the type remains a
# Pandas ``Series``.

# In[3]:


print(data[0], type(data[0]))


# In[4]:


print(data[2:], type(data[2:]))


# The type ``Series`` wraps both a sequence of values and a sequence of
# indices, which we can access with the values and index attributes.
#
# * ``values`` are the contents of the series as a NumPy array

# In[5]:


print(data.values, type(data.values))


# * ``index`` are the indices of the series

# In[6]:


print(data.index, type(data.index))


# ### Series Indices
#
# The main difference between NumPy arrays and Pandas Series is the presence
# of this index field. By default, it is set (as in NumPy arrays) as
# 0, 1, ..., size_of_the_series - 1 but a Series index can be explicitly
# defined. The indices may be numbers but also strings. Then, the contents of
# the series *have to* be accessed using these defined indices.
# In[7]:


# A Series indexed by explicit string labels instead of the default 0..n-1.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data)


# In[8]:


print(data['c'])


# In[9]:


# A Series indexed by explicit (unordered) integer labels.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[1, 3, 4, 2])
print(data)


# In[10]:


# The key 2 is looked up as a *label* of the explicit index, not as a
# position: this prints the value stored under label 2 (the last one).
print(data[2])


# ### Series and Python Dictionaries [\*]
#
# Pandas Series and Python Dictionaries are close semantically: mapping keys
# to values. However, the implementation of Pandas series is usually more
# efficient than dictionaries in the context of data science. Naturally,
# Series can be constructed from dictionaries.

# In[11]:


population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(population_dict)
print(population_dict, type(population_dict))
print(population, type(population))


# In[12]:


population['California']


# In[13]:


# Label-based slices include both endpoints.
population['California':'Illinois']


# ## Pandas DataFrames
#
# DataFrames is a fundamental object of Pandas that mimics what can be found
# in `R` for instance. Dataframes can be seen as an array of Series: to each
# `index` (corresponding to an individual for instance or a line in a table),
# a Dataframe maps multiple values; these values correspond to the `columns`
# of the DataFrame which each have a name (as a string).
#
# In the following example, we will construct a Dataframe from two Series
# with common indices.
# In[14]:


area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
population = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})


# In[15]:


# The two Series share the same index, so they align row by row.
states = pd.DataFrame({'Population': population, 'Area': area})
print(states, type(states))


# In Jupyter notebooks, DataFrames are displayed in a fancier way when the
# name of the dataframe is typed (instead of using print)

# In[16]:


states


# DataFrames have
# * index that are the defined indices as in Series
# * columns that are the column names
# * values that return a (2D) NumPy array with the contents

# In[17]:


print(states.index)
print(states.columns)
print(states.values, type(states.values), states.values.shape)


# *Warning:* When accessing a Dataframe, `dataframe_name[column_name]`
# returns the corresponding column as a Series. `dataframe_name[index_name]`
# returns an error! We will see later how to access a specific index.

# In[18]:


print(states['Area'], type(states['Area']))


# In[26]:


# 'California' is a row label, not a column name, so [] raises a KeyError.
try:
    print(states['California'])
except KeyError as error:
    print("KeyError: ", error)


# ### Dataframe creation
#
# To create DataFrames, the main methods are:
# * from Series (as above)

# In[27]:


print(population, type(population))
states = pd.DataFrame({'Population': population, 'Area': area})
states


# * from NumPy arrays (the columns and indices are taken as the array's ones)

# In[28]:


A = np.random.randn(5, 3)
print(A, type(A))
dfA = pd.DataFrame(A)
dfA


# * from a *list* of *dictionaries*. Be careful, each element of the list is
#   an example (corresponding to an automatic index 0,1,...) while each key of
#   the dictionary corresponds to a column.

# In[29]:


data = [{'a': i, 'b': 2 * i} for i in range(3)]
print(data, type(data))
print(data[0], type(data[0]))


# In[30]:


df = pd.DataFrame(data)
df


# * from a *file*, typically a csv file (for comma separated values),
#   possibly with the names of the columns as a first line.
#
#     col_1_name,col_2_name,col_3_name
#     col_1_v1,col_2_v1,col_3_v1
#     col_1_v2,col_2_v2,col_3_v2
#     ...
#
# For other file types (MS Excel, libSVM, any other separator) see this
# [part of the doc](https://pandas.pydata.org/pandas-docs/stable/api.html#input-output)

# In[31]:


# NOTE: `get_ipython()` only exists when this file is run by IPython/Jupyter
# (it is how the notebook's `!head ...` shell escape is exported); under a
# plain `python` interpreter this line raises a NameError.
get_ipython().system('head -4 data/president_heights.csv # Jupyter bash command to see the first 4 lines of the file')


# In[32]:


data = pd.read_csv('data/president_heights.csv')
data


# ### Names and Values
#
# Notice there can be missing values in DataFrames.

# In[33]:


# Keys absent from one of the dictionaries produce missing entries (NaN).
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])


# You can set indices and column names *a posteriori*

# In[34]:


dfA.columns = ['a', 'b', 'c']
dfA.index = [i**2 for i in range(1, 6)]
dfA


# ## Indexing

# In[35]:


area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
population = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})
states = pd.DataFrame({'Population': population, 'Area': area})
states


# You may access columns directly with names, *then* you can access
# individuals with their index.

# In[36]:


states['Area']


# In[37]:


states['Area']['Texas']


# To ease the access, Pandas offers dedicated methods:
# * iloc gives access to subparts of the dataframe as if it was a NumPy array.

# In[38]:


states.iloc[:2]


# In[39]:


states.iloc[:2, 0]


# * loc does the same but with the explicit names (the last one is included)

# In[40]:


states.loc[:'New York']


# In[41]:


states.loc[:, 'Population':]