import addutils.toc ; addutils.toc.js(ipy_notebook=True)
from addutils import css_notebook
css_notebook()
import re
import os
import scipy.io
import numpy as np
import pandas as pd
import sqlite3
import pandas.io.sql as psql
from time import time
from IPython.display import display
from addutils import css_notebook
css_notebook()
import sys
Importing data from MATLAB '.mat' files
# Load a MATLAB workspace file; loadmat returns a dict keyed by the
# MATLAB variable names (plus some '__header__'-style metadata keys).
mat_contents = scipy.io.loadmat('example_data/matlab_variable.mat')
pyA = mat_contents['a']
pyA
The Matlab variable is passed to a pandas DataFrame:
# Wrap the MATLAB array in a DataFrame for convenient inspection.
df = pd.DataFrame(data=pyA)
df
The following example shows how to import directly a compressed csv file, in this case with multiple separators:
# Compressed CSV with several literal separator characters (",", ";", ".");
# a regex separator requires the python engine — the C engine cannot use it.
df_csv = pd.read_csv(
    'example_data/pd12_peopl.csv.gz',
    sep=r'\,\;\.',
    skipinitialspace=True,
    compression='gzip',
    engine='python',
    encoding='iso8859_15',
)
df_csv.head()
TODO: utf-8 has problems decoding euro sign. "\u00e2\u0082\u00ac" are the three chars you get when the UTF-8 encoded \u20ac (EURO SIGN) is mistakenly interpreted as ISO-8859-1.
# Split the 'Coordinates' column (expected "lat,lon" text) into two new
# string columns, 'lat' and 'lon'.
coor = df_csv['Coordinates']
df_csv['lat'] = ''
df_csv['lon'] = ''
# Iterate by index *label*: the original used the enumerate position as a
# .loc label, which silently writes wrong rows whenever the index is not
# 0..n-1.  A plain str.split suffices (no regex needed for a literal comma),
# and values without a comma no longer raise IndexError — 'lon' stays ''.
for label, coo in coor.items():
    parts = str(coo).split(',')
    df_csv.loc[label, 'lat'] = parts[0]
    if len(parts) > 1:
        df_csv.loc[label, 'lon'] = parts[1]
df_csv.head()
# Round-trip the DataFrame through JSON on disk.
df_csv.to_json('temp/converted_json.json')
# Read the JSON back by path: pandas opens and decodes the file itself.
# (The original read the file and handed raw bytes to read_json; passing
# literal JSON content instead of a path is deprecated in modern pandas.)
dfjson = pd.read_json('temp/converted_json.json')
dfjson.head()
Since the import reordered columns in alphabetical order, we can choose a preferred column order:
# Restore a deliberate column order (the JSON import sorted columns
# alphabetically); plain column selection is equivalent to .loc[:, cols].
preferred_order = ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']
dfjson = dfjson[preferred_order]
dfjson.head()
Note: read_html
returns a list of DataFrame objects, even when there is only a single table contained in the HTML
content. (The old infer_types argument — which disabled the automatic detection of numeric and date types that generated an error with the coordinates — has since been removed from pandas.)
# read_html always returns a *list* of DataFrames, one per <table> found;
# header=0 takes the first table row as the column names.
dfhtml = pd.read_html('example_data/generated.html', header=0)
first_table = dfhtml[0]
first_table.head()
# Load one worksheet from an Excel workbook; the explicit keyword makes
# the sheet selection obvious (second positional arg is sheet_name).
dfxl = pd.read_excel('example_data/generated2.xls', sheet_name='foglio')
dfxl.head()
Let's store the DataFrame opened from excel in a database. We use SQLite, a database engine library suitable for storing data in a single-file database. 'Names' is the name we chose for the database table we are creating:
# Store the Excel DataFrame in a single-file SQLite database, read it back,
# then delete the file.  One connection is used throughout: the original
# opened a second connection at the read step, leaking the first one.
con = sqlite3.connect("temp.sql")
try:
    # 'Names' is the table created in the database:
    sqlfile = dfxl.to_sql('Names', con)
    sql = "SELECT * FROM Names;"
    df = psql.read_sql(sql, con)
    print(df.shape)
finally:
    con.close()
os.remove("temp.sql")
df.head()
HDFStore is a dict-like object used by pandas to store datasets as HDF5 files using the PyTables library. HDF5 is a scientific hierarchical data format suitable for storing very large, multi-dimensional data arrays in a single file. The Storer format stores fixed arrays, which are not queryable and must be retrieved in their entirety.
Add DataFrames to the HDFStore object:
# Two tz-aware, high-frequency random time series of different lengths,
# used below to exercise the HDFStore examples.
samples_01 = int(3e5)
samples_02 = int(1e5)
idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01, freq='50ms', tz='Europe/Rome')
idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02, freq='100ms', tz='Europe/Rome')
randn = np.random.randn
df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])
df2 = pd.DataFrame(randn(samples_02, 4), index=idx2, columns=['A', 'B', 'C', 'D'])
# Bug fix: the original added df1.values.nbytes to itself, so df2 was
# never counted and the reported size was wrong.
print ('Size of the Dataset: ', (df1.values.nbytes + df2.values.nbytes)/2**20, ' MB')
%%timeit
# NOTE(review): '%%timeit' is an IPython *cell* magic (this file is a
# flattened notebook export, not plain Python); it re-runs the cell body
# several times to time it, so the store below is written repeatedly.
# put() with the default fixed ("storer") format: fast writes, but the
# stored objects can only be retrieved whole (no partial queries).
with pd.HDFStore('temp/store53.h5') as store:
    store.put('storer/df1', df1)
    store.put('storer/df2', df2)
    # extra object stored only to demonstrate remove() further below
    store.put('to_remove', df2)
Retrieve stored objects:
# Re-open the store, list its contents, and check that a retrieved
# DataFrame matches the in-memory original.
with pd.HDFStore('temp/store53.h5') as hdf:
    print (hdf.keys())
    # dict-style indexing is equivalent to .get()
    df1_retrieved = hdf['storer/df1']
    print (df1_retrieved[1:3])
    print (df1[1:3])
    print ('Check retrieved data equal to original data: ')
    print (df1_retrieved[1:3].equals(df1[1:3]))
Delete objects:
# Delete the throwaway object from the store.  Catch only KeyError
# (raised when the key is already gone, e.g. on a re-run); the original
# bare 'except:' would also have hidden real I/O errors.
with pd.HDFStore('temp/store53.h5') as store:
    try:
        store.remove('to_remove')
    except KeyError:
        # already removed on a previous run — nothing to do
        pass
    print (store)
The table format conceptually is shaped very much like a DataFrame and may be appended to in the same or other sessions. In addition, delete & query type operations are supported.
# Table format: append-able across sessions, supports delete and query.
with pd.HDFStore('temp/store53.h5') as store:
    # store.append creates a table automatically:
    store.append('table/df1_appended', df1.iloc[:10000])
# A separate session appends the remaining slices.  Slice starts are
# contiguous: the original used 10001: and 20001:, silently dropping
# rows 10000 and 20000 (iloc stops are exclusive).
with pd.HDFStore('temp/store53.h5') as store:
    store.append('table/df1_appended', df1.iloc[10000:20000])
    store.append('table/df1_appended', df1.iloc[20000:50000])
Query the table using boolean expression with in-line function evaluation:
# Query the table with a boolean expression evaluated by PyTables on the
# server side (only matching rows/columns are read from disk).
with pd.HDFStore('temp/store53.h5') as store:
    # The backslashes continue the *string literal*, so the query text
    # also contains the following lines' leading spaces — harmless to
    # the expression parser.  Timestamps carry the +01:00 offset to
    # match the tz-aware ('Europe/Rome') index built earlier.
    query01 = store.select('table/df1_appended',
                           "index>=Timestamp('2000-01-01 12:00:00.20+01:00') \
                            & index<Timestamp('2000-01-01 12:00:00.40+01:00') \
                            & columns=['A', 'B']")
    query01
Visit www.add-for.com for more tutorials and updates.
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.