#!/usr/bin/env python
# coding: utf-8

# # pandas I/O tools and examples

# In[1]:

import addutils.toc
addutils.toc.js(ipy_notebook=True)

# In[2]:

from addutils import css_notebook
css_notebook()

# ## 1 Matlab Variables
# ### 1.1 Import a Matlab variable from file

# In[3]:

import re
import os
import scipy.io
import numpy as np
import pandas as pd
import sqlite3
#from pandas.io.data import DataFrame, read_csv
import pandas.io.sql as psql
from time import time
from IPython.display import display
from addutils import css_notebook
css_notebook()

# In[4]:

import sys

# Import from '.mat' files

# In[5]:

# scipy.io.loadmat returns a dict mapping variable names to numpy arrays.
mat_contents = scipy.io.loadmat('example_data/matlab_variable.mat')

# In[6]:

matlab_array = mat_contents['a']
matlab_array

# The Matlab variable is passed to a pandas DataFrame:

# In[7]:

df_matlab = pd.DataFrame(matlab_array)
df_matlab

# ## 2 Importing a compressed CSV
# The following example shows how to import directly a compressed csv file,
# in this case with multiple separators:

# In[8]:

# The multi-character separator is a regex, which requires the python engine.
df_csv = pd.read_csv('example_data/pd12_peopl.csv.gz',
                     sep=r'\,\;\.',
                     skipinitialspace=True,
                     compression='gzip',
                     engine='python',
                     encoding='iso8859_15')
df_csv.head()

# **TODO**: utf-8 has problems decoding euro sign. "\u00e2\u0082\u00ac" are the
# three chars you get when the UTF-8 encoded \u20ac (EURO SIGN) is mistakenly
# interpreted as ISO-8859-1.
# In[9]:

# Split the 'Coordinates' column ("lat,lon") into separate lat/lon columns.
# A vectorized str.split replaces the original row-by-row loop: it is O(n)
# in pandas C code and yields NaN (instead of raising IndexError) for rows
# that lack a comma.
coord_parts = df_csv['Coordinates'].astype(str).str.split(',', expand=True)
df_csv['lat'] = coord_parts[0]
df_csv['lon'] = coord_parts[1]
df_csv.head()

# In[10]:

df_csv.to_json('temp/converted_json.json')

# ## 3 Importing JSON files

# In[11]:

# read_json accepts a file path directly; no need to read and re-encode
# the file by hand.
dfjson = pd.read_json('temp/converted_json.json')
dfjson.head()

# Since the import reordered columns in alphabetical order, we can choose a
# preferred column order:

# In[12]:

# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc is the
# label-based replacement.
dfjson = dfjson.loc[:, ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']]
dfjson.head()

# ## 4 Importing HTML
# Note: `read_html` returns a **list** of DataFrame objects, even if there is
# only a single table contained in the `HTML` content. Infer_types avoids the
# function trying to automatically detect numeric and date types (this
# generated an error with coordinates)

# In[13]:

dfhtml = pd.read_html('example_data/generated.html', header=0)
dfhtml[0].head()

# ## 5 Importing Excel

# In[14]:

dfxl = pd.read_excel('example_data/generated2.xls', 'foglio')
dfxl.head()

# ## 6 Working with SQL and databases
# ### 6.1 Write SQL
# Let's store the DataFrame opened from excel in a database. We use SQLite, a
# database engine library suitable for storing data in a single-file database.
# 'Names' is the name we chose for the database table we are creating:

# In[15]:

con = sqlite3.connect("temp.sql")
# The 'flavor' argument was removed from to_sql (pandas >= 0.23); sqlite is
# implied by passing a DBAPI connection. to_sql returns None, so the original
# 'sqlfile =' assignment was dropped.
dfxl.to_sql('Names', con)
# Close the writer connection (the original leaked it) before reopening the
# database for reading.
con.close()

# ### 6.2 Import SQL

# In[16]:

con = sqlite3.connect('temp.sql')
with con:
    sql = "SELECT * FROM Names;"
    df = psql.read_sql(sql, con)
    print(df.shape)

# In[17]:

df.head()

# In[18]:

con.close()
os.remove("temp.sql")

# ## 7 Working with HDF5
# ### 7.1 Storer format
# **HDFStore** is a dict-like object used by pandas to store datasets as
# **HDF5** files using the **PyTables** library. **HDF5** is a scientific
# hierarchical data format suitable for storing in a file very large and
# multi-dimensional data arrays.
# The **Storer** format stores fixed arrays, which are queryable and must be
# retrieved in their entirety.

# Add DataFrames to the HDFStore object:

# In[19]:

samples_01 = int(3e5)
samples_02 = int(1e5)
idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01,
                     freq='50ms', tz='Europe/Rome')
idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02,
                     freq='100ms', tz='Europe/Rome')
randn = np.random.randn

# In[20]:

df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])
df2 = pd.DataFrame(randn(samples_02, 4), index=idx2, columns=['A', 'B', 'C', 'D'])

# In[21]:

# BUG FIX: the original summed df1.values.nbytes with itself; the intent is
# clearly the total size of both frames (df1 + df2), converted to MiB.
print('Size of the Dataset: ',
      (df1.values.nbytes + df2.values.nbytes) / 2**20, ' MB')

# In[22]:

# pd.get_store was removed in pandas 0.25; pd.HDFStore is the supported API
# (same context-manager behavior).
get_ipython().run_cell_magic('timeit', '', "with pd.HDFStore('temp/store53.h5') as store:\n    store.put('storer/df1', df1)\n    store.put('storer/df2', df2)\n    store.put('to_remove', df2)\n")

# Retrieve stored objects:

# In[23]:

with pd.HDFStore('temp/store53.h5') as store:
    print(store.keys())
    df1_retrieved = store.get('storer/df1')
    print(df1_retrieved[1:3])
    print(df1[1:3])
    print('Check retrieved data equal to original data: ')
    print(df1_retrieved[1:3].equals(df1[1:3]))

# Delete objects:

# In[24]:

with pd.HDFStore('temp/store53.h5') as store:
    try:
        store.remove('to_remove')
    except KeyError:
        # Only swallow "key not present" -- the bare except of the original
        # would also have hidden genuine I/O errors.
        pass
    print(store)

# ### 7.2 Table format
# The table format conceptually is shaped very much like a DataFrame and may
# be appended to in the same or other sessions. In addition, delete & query
# type operations are supported.
# In[25]:

# pd.get_store was removed in pandas 0.25 -- pd.HDFStore is the replacement.
# .ix was removed in pandas 1.0; since df1 has a DatetimeIndex, these integer
# slices were resolved positionally, so .iloc is the direct equivalent
# (the gaps at rows 10000 and 20000 are preserved from the original).
with pd.HDFStore('temp/store53.h5') as store:
    # store.append creates a table automatically:
    store.append('table/df1_appended', df1.iloc[:10000])

# In[26]:

with pd.HDFStore('temp/store53.h5') as store:
    store.append('table/df1_appended', df1.iloc[10001:20000])
    store.append('table/df1_appended', df1.iloc[20001:50000])

# ### 7.3 Querying a Table
# Query the table using boolean expression with in-line function evaluation:

# In[27]:

with pd.HDFStore('temp/store53.h5') as store:
    # NOTE(review): the upper bound of this query was lost when the source
    # was mangled ("...& index" was truncated mid-expression); reconstructed
    # here as a closed time interval -- confirm against the original notebook.
    query01 = store.select('table/df1_appended',
                           "index>=Timestamp('2000-01-01 12:00:00.20+01:00') "
                           "& index<=Timestamp('2000-01-01 12:00:00.40+01:00')")
    print(query01)

# Visit addutils for more tutorials and updates.
# (NOTE(review): the markdown link target was lost in the source mangling.)
#
# This work is licensed under a Creative Commons Attribution-ShareAlike 4.0
# International License.