#!/usr/bin/env python
# coding: utf-8

# # pandas I/O tools and examples

# In[1]:

import addutils.toc
addutils.toc.js(ipy_notebook=True)

# In[2]:

from addutils import css_notebook
css_notebook()

# ## 1 Matlab Variables
# ### 1.1 Import a Matlab variable from file

# In[3]:

import re
import os
import scipy.io
import numpy as np
import pandas as pd
import sqlite3
#from pandas.io.data import DataFrame, read_csv
import pandas.io.sql as psql
from time import time
from IPython.display import display
from addutils import css_notebook
css_notebook()

# In[4]:

import sys

# Import from '.mat' files

# In[5]:

# scipy.io.loadmat returns a dict mapping variable names to numpy arrays.
mat_contents = scipy.io.loadmat('example_data/matlab_variable.mat')

# In[6]:

matlab_array = mat_contents['a']
matlab_array

# The Matlab variable is passed to a pandas DataFrame:

# In[7]:

df_matlab = pd.DataFrame(matlab_array)
df_matlab

# ## 2 Importing a compressed CSV
# The following example shows how to import directly a compressed csv file,
# in this case with multiple separators:

# In[8]:

# The multi-character separator is a regex, which requires the python engine.
df_csv = pd.read_csv('example_data/pd12_peopl.csv.gz',
                     sep=r'\,\;\.',
                     skipinitialspace=True,
                     compression='gzip',
                     engine='python',
                     encoding='iso8859_15')
df_csv.head()

# **TODO**: utf-8 has problems decoding euro sign. "\u00e2\u0082\u00ac" are the
# three chars you get when the UTF-8 encoded \u20ac (EURO SIGN) is mistakenly
# interpreted as ISO-8859-1.
# In[9]:

# Split the 'Coordinates' column ("lat,lon") into separate lat/lon columns.
# A vectorized str.split replaces the original row-by-row loop: it is O(n)
# in pandas C code and yields NaN (instead of raising IndexError) for rows
# that lack a comma.
coord_parts = df_csv['Coordinates'].astype(str).str.split(',', expand=True)
df_csv['lat'] = coord_parts[0]
df_csv['lon'] = coord_parts[1]
df_csv.head()

# In[10]:

df_csv.to_json('temp/converted_json.json')

# ## 3 Importing JSON files

# In[11]:

# read_json accepts a file path directly; no need to read and re-encode
# the file by hand.
dfjson = pd.read_json('temp/converted_json.json')
dfjson.head()

# Since the import reordered columns in alphabetical order, we can choose a
# preferred column order:

# In[12]:

# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc is the
# label-based replacement.
dfjson = dfjson.loc[:, ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']]
dfjson.head()

# ## 4 Importing HTML
# Note: `read_html` returns a **list** of DataFrame objects, even if there is
# only a single table contained in the `HTML` content. Infer_types avoids the
# function trying to automatically detect numeric and date types (this
# generated an error with coordinates)

# In[13]:

dfhtml = pd.read_html('example_data/generated.html', header=0)
dfhtml[0].head()

# ## 5 Importing Excel

# In[14]:

dfxl = pd.read_excel('example_data/generated2.xls', 'foglio')
dfxl.head()

# ## 6 Working with SQL and databases
# ### 6.1 Write SQL
# Let's store the DataFrame opened from excel in a database. We use SQLite, a
# database engine library suitable for storing data in a single-file database.
# 'Names' is the name we chose for the database table we are creating:

# In[15]:

con = sqlite3.connect("temp.sql")
# The 'flavor' argument was removed from to_sql (pandas >= 0.23); sqlite is
# implied by passing a DBAPI connection. to_sql returns None, so the original
# 'sqlfile =' assignment was dropped.
dfxl.to_sql('Names', con)
# Close the writer connection (the original leaked it) before reopening the
# database for reading.
con.close()

# ### 6.2 Import SQL

# In[16]:

con = sqlite3.connect('temp.sql')
with con:
    sql = "SELECT * FROM Names;"
    df = psql.read_sql(sql, con)
    print(df.shape)

# In[17]:

df.head()

# In[18]:

con.close()
os.remove("temp.sql")

# ## 7 Working with HDF5
# ### 7.1 Storer format
# **HDFStore** is a dict-like object used by pandas to store datasets as
# **HDF5** files using the **PyTables** library. **HDF5** is a scientific
# hierarchical data format suitable for storing in a file very large and
# multi-dimensional data arrays.
# The **Storer** format stores fixed arrays, which are queryable and must be
# retrieved in their entirety.

# Add DataFrames to the HDFStore object:

# In[19]:

samples_01 = int(3e5)
samples_02 = int(1e5)
idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01,
                     freq='50ms', tz='Europe/Rome')
idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02,
                     freq='100ms', tz='Europe/Rome')
randn = np.random.randn

# In[20]:

df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])
df2 = pd.DataFrame(randn(samples_02, 4), index=idx2, columns=['A', 'B', 'C', 'D'])

# In[21]:

# BUG FIX: the original summed df1.values.nbytes with itself; the intent is
# clearly the total size of both frames (df1 + df2), converted to MiB.
print('Size of the Dataset: ',
      (df1.values.nbytes + df2.values.nbytes) / 2**20, ' MB')

# In[22]:

# pd.get_store was removed in pandas 0.25; pd.HDFStore is the supported API
# (same context-manager behavior).
get_ipython().run_cell_magic('timeit', '', "with pd.HDFStore('temp/store53.h5') as store:\n    store.put('storer/df1', df1)\n    store.put('storer/df2', df2)\n    store.put('to_remove', df2)\n")

# Retrieve stored objects:

# In[23]:

with pd.HDFStore('temp/store53.h5') as store:
    print(store.keys())
    df1_retrieved = store.get('storer/df1')
    print(df1_retrieved[1:3])
    print(df1[1:3])
    print('Check retrieved data equal to original data: ')
    print(df1_retrieved[1:3].equals(df1[1:3]))

# Delete objects:

# In[24]:

with pd.HDFStore('temp/store53.h5') as store:
    try:
        store.remove('to_remove')
    except KeyError:
        # Only swallow "key not present" -- the bare except of the original
        # would also have hidden genuine I/O errors.
        pass
    print(store)

# ### 7.2 Table format
# The table format conceptually is shaped very much like a DataFrame and may
# be appended to in the same or other sessions. In addition, delete & query
# type operations are supported.
# In[25]:

# pd.get_store was removed in pandas 0.25 -- pd.HDFStore is the replacement.
# .ix was removed in pandas 1.0; since df1 has a DatetimeIndex, these integer
# slices were resolved positionally, so .iloc is the direct equivalent
# (the gaps at rows 10000 and 20000 are preserved from the original).
with pd.HDFStore('temp/store53.h5') as store:
    # store.append creates a table automatically:
    store.append('table/df1_appended', df1.iloc[:10000])

# In[26]:

with pd.HDFStore('temp/store53.h5') as store:
    store.append('table/df1_appended', df1.iloc[10001:20000])
    store.append('table/df1_appended', df1.iloc[20001:50000])

# ### 7.3 Querying a Table
# Query the table using boolean expression with in-line function evaluation:

# In[27]:

with pd.HDFStore('temp/store53.h5') as store:
    # NOTE(review): the upper bound of this query was lost when the source
    # was mangled ("...& index" was truncated mid-expression); reconstructed
    # here as a closed time interval -- confirm against the original notebook.
    query01 = store.select('table/df1_appended',
                           "index>=Timestamp('2000-01-01 12:00:00.20+01:00') "
                           "& index<=Timestamp('2000-01-01 12:00:00.40+01:00')")
    print(query01)

# Visit addutils for more tutorials and updates.
# (NOTE(review): the markdown link target was lost in the source mangling.)
#
# This work is licensed under a Creative Commons Attribution-ShareAlike 4.0
# International License.