CHAPTER 6¶

Date Loading, Storage, and File Formats¶

Reading and writing data in text format¶

In [1]:

import pandas as pd
from pandas import Series
from pandas import DataFrame
import sys
import numpy as np

In [2]:

!cat ch06/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [3]:

df = pd.read_csv('ch06/ex1.csv') 

In [4]:

df

Out[4]:

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [5]:

pd.read_table('ch06/ex1.csv', sep = ',')

Out[5]:

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [6]:

!cat ch06/ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [7]:

pd.read_csv('ch06/ex2.csv', header = None)

Out[7]:

	0	1	2	3	4
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [8]:

pd.read_csv('ch06/ex2.csv', names = ['a', 'b', 'c', 'd', 'message'])

Out[8]:

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [9]:

names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ch06/ex2.csv', names = names, index_col = 'message')

Out[9]:

	a	b	c	d
message
hello	1	2	3	4
world	5	6	7	8
foo	9	10	11	12

In [10]:

!cat ch06/csv_mindex.csv

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16

In [11]:

parsed = pd.read_csv('ch06/csv_mindex.csv', index_col = ['key1', 'key2'])
parsed

Out[11]:

		value1	value2
key1	key2
one	a	1	2
	b	3	4
	c	5	6
	d	7	8
two	a	9	10
	b	11	12
	c	13	14
	d	15	16

In [12]:

list(open('ch06/ex3.txt'))

Out[12]:

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

In [13]:

result = pd.read_csv('ch06/ex3.txt', sep = '\s+')
result

Out[13]:

	A	B	C
aaa	-0.264438	-1.026059	-0.619500
bbb	0.927272	0.302904	-0.032399
ccc	-0.264273	-0.386314	-0.217601
ddd	-0.871858	-0.348382	1.100491

In [14]:

!cat ch06/ex4.csv

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [15]:

pd.read_csv('ch06/ex4.csv', skiprows = [0, 2, 3])

Out[15]:

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [16]:

!cat ch06/ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

In [17]:

result = pd.read_csv('ch06/ex5.csv')
result

Out[17]:

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

In [18]:

pd.isnull(result)

Out[18]:

	something	a	b	c	d	message
0	False	False	False	False	False	True
1	False	False	False	True	False	False
2	False	False	False	False	False	False

In [19]:

result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])
result

Out[19]:

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

In [20]:

sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
result = pd.read_csv('ch06/ex5.csv', na_values=sentinels)
result

Out[20]:

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	NaN	5	6	NaN	8	world
2	three	9	10	11.0	12	NaN

Reading text file in pieces¶

In [21]:

result = pd.read_csv('ch06/ex6.csv')
result.head()

Out[21]:

	one	two	three	four	key
0	0.467976	-0.038649	-0.295344	-1.824726	L
1	-0.358893	1.404453	0.704965	-0.200638	B
2	-0.501840	0.659254	-0.421691	-0.057688	G
3	0.204886	1.074134	1.388361	-0.982404	R
4	0.354628	-0.133116	0.283763	-0.837063	Q

In [22]:

pd.read_csv('ch06/ex6.csv', nrows = 5)

Out[22]:

	one	two	three	four	key
0	0.467976	-0.038649	-0.295344	-1.824726	L
1	-0.358893	1.404453	0.704965	-0.200638	B
2	-0.501840	0.659254	-0.421691	-0.057688	G
3	0.204886	1.074134	1.388361	-0.982404	R
4	0.354628	-0.133116	0.283763	-0.837063	Q

In [36]:

chunker = pd.read_csv('ch06/ex6.csv', chunksize = 1000)

tot = Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value = 0)
tot = tot.sort_values(ascending = False)

In [37]:

tot[:10]

Out[37]:

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

Writing data out to data format¶

In [38]:

data = pd.read_csv('ch06/ex5.csv')
data

Out[38]:

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

In [39]:

data.to_csv('ch06/out.csv')

In [40]:

!cat ch06/out.csv

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo

In [43]:

data.to_csv(sys.stdout, sep = '|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo

In [44]:

data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo

In [45]:

data.to_csv(sys.stdout, header = False, index = False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo

In [47]:

data.to_csv(sys.stdout, index = False, cols = ['a', 'b', 'c'])

something,a,b,c,d,message
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo

In [48]:

dates = pd.date_range('1/1/2000', periods=7)
dates

Out[48]:

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')

In [51]:

ts = Series(np.arange(7), index = dates)
ts

Out[51]:

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
Freq: D, dtype: int64

In [52]:

ts.to_csv('ch06/tseries.csv')
!cat ch06/tseries.csv

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6

In [53]:

Series.from_csv('ch06/tseries.csv', parse_dates=True)

Out[53]:

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64

Manually Working with Dilimiter Formats¶

In [54]:

!cat ch06/ex7.csv

"a","b","c"
"1","2","3"
"1","2","3","4"

In [55]:

import csv
f = open('ch06/ex7.csv')
reader = csv.reader(f)

In [57]:

for line in reader:
    print (line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']

In [58]:

lines = list(csv.reader(open('ch06/ex7.csv')))
lines

Out[58]:

[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3', '4']]

In [59]:

lines[0]

Out[59]:

['a', 'b', 'c']

In [60]:

lines[1:]

Out[60]:

[['1', '2', '3'], ['1', '2', '3', '4']]

In [61]:

header, values = lines[0], lines[1:]

In [62]:

data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict

Out[62]:

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [64]:

class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL
reader = csv.reader(f, dialect = my_dialect)
reader

Out[64]:

<_csv.reader at 0x1083a42e8>

In [65]:

reader = csv.reader(f, delimiter = '|')
reader

Out[65]:

<_csv.reader at 0x1083a4198>

JSON Data¶

In [69]:

obj = """ 
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"], 
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
{"name": "Katie", "age": 33, "pet": "Cisco"}]
} """

In [70]:

import json
result = json.loads(obj)

In [71]:

result

Out[71]:

{'name': 'Wes',
 'pet': None,
 'places_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},
  {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}

In [72]:

asjson = json.dumps(result)

In [75]:

siblings = DataFrame(result['siblings'], columns = ['name', 'age'])

In [76]:

siblings

Out[76]:

	name	age
0	Scott	25
1	Katie	33

XML and HTML: Web Scraping¶

In [80]:

from lxml.html import parse
from urllib.request import urlopen

In [81]:

parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))

In [82]:

doc = parsed.getroot()

In [83]:

links = doc.findall('.//a')

In [84]:

links[15: 20]

Out[84]:

[<Element a at 0x10aa3d8b8>,
 <Element a at 0x10aa3d908>,
 <Element a at 0x10aa3d958>,
 <Element a at 0x10aa3d9a8>,
 <Element a at 0x10aa3d9f8>]

In [85]:

link = links[18]

In [86]:

link.get('href')

Out[86]:

'https://www.yahoo.com/beauty'

In [87]:

link.text_content()

Out[87]:

'Beauty'

In [88]:

urls = [link.get('href') for link in doc.findall('.//a')]

In [91]:

urls[0: 10]

Out[91]:

['https://www.yahoo.com/',
 'https://mail.yahoo.com/?.intl=us&.lang=en-US&.src=ym',
 'https://search.yahoo.com/search',
 'http://news.yahoo.com/',
 'http://sports.yahoo.com/',
 'http://finance.yahoo.com/',
 'https://celebrity.yahoo.com/',
 'https://weather.yahoo.com/',
 'https://answers.yahoo.com/',
 'https://www.flickr.com/']

In [92]:

tables = doc.findall('.//table')

In [99]:

calls = tables[1]

In [100]:

puts = tables[2]

In [101]:

calls

Out[101]:

<Element table at 0x108093a98>

In [102]:

rows = calls.findall('.//tr')

In [106]:

def _unpack(row, kind = 'td'):
    elts = row.findall('.//%s' %kind)
    return [val.text_content() for val in elts]

In [107]:

_unpack(rows[0], kind='th')

Out[107]:

['\n                \n                     Strike\n                    \n                        \ue004\n                        \ue002\n                    \n                \n                ∵ Filter\n            ',
 'Contract Name',
 '\n                \n                    Last\n                    \n                        \ue004\n                        \ue002\n                    \n                \n            ',
 '\n                \n                    Bid\n                    \n                        \ue004\n                        \ue002\n                    \n                \n            ',
 '\n                \n                    Ask\n                    \n                        \ue004\n                        \ue002\n                    \n                \n            ',
 '\n                \n                    Change\n                    \n                        \ue004\n                        \ue002\n                    \n                \n            ',
 '\n                \n                    %Change\n                    \n                        \ue004\n                        \ue002\n                    \n                \n            ',
 '\n                \n                    Volume\n                    \n                        \ue004\n                        \ue002\n                    \n                \n            ',
 '\n                \n                    Open Interest\n                    \n                        \ue004\n                        \ue002\n                    \n                \n            ',
 '\n                \n                    Implied Volatility\n                    \n                        \ue004\n                        \ue002\n                    \n                \n            ']

In [108]:

_unpack(rows[1], kind = 'td')

Out[108]:

['\n                        \n                            \n                            ✕\n                            [modify]\n                        \n                    ']

Binary Data Frame¶

In [109]:

frame = pd.read_csv('ch06/ex1.csv')
frame

Out[109]:

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [110]:

frame.to_pickle('ch06/frame_pickle')

In [111]:

pd.read_pickle('ch06/frame_pickle')

Out[111]:

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

CHAPTER 6¶

Date Loading, Storage, and File Formats¶

Reading and writing data in text format¶

Reading text file in pieces¶

Writing data out to data format¶

Manually Working with Dilimiter Formats¶

JSON Data¶

XML and HTML: Web Scraping¶

Binary Data Frame¶

Using HDF5 Format¶

Reading Microsoft Excel Files¶

Interacting with HTML and Web APIs¶

Interacting with Databases¶

Storing and Loading Data in MongoDB¶