#!/usr/bin/env python
# coding: utf-8

# ## Data Wrangling: Clean, Transform, Merge, Reshape

# In[ ]:

from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas
import pandas as pd
np.set_printoptions(precision=4, threshold=500)
pd.options.display.max_rows = 100

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')

# ## Combining and merging data sets

# ### Database-style DataFrame merges

# In[ ]:

df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],
                 'data2': range(3)})
df1

# In[ ]:

df2

# In[ ]:

pd.merge(df1, df2)

# In[ ]:

pd.merge(df1, df2, on='key')

# In[ ]:

df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

# In[ ]:

pd.merge(df1, df2, how='outer')

# In[ ]:

df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                 'data1': range(6)})
df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                 'data2': range(5)})

# In[ ]:

df1

# In[ ]:

df2

# In[ ]:

pd.merge(df1, df2, on='key', how='left')

# In[ ]:

pd.merge(df1, df2, how='inner')

# In[ ]:

left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')

# In[ ]:

pd.merge(left, right, on='key1')

# In[ ]:

pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

# ### Merging on index

# In[ ]:

left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                   'value': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

# In[ ]:

left1

# In[ ]:

right1

# In[ ]:

pd.merge(left1, right1, left_on='key', right_index=True)

# In[ ]:

pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

# In[ ]:

lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                   'key2': [2000, 2001, 2002, 2001, 2002],
                   'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])
lefth

# In[ ]:

righth

# In[ ]:

pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

# In[ ]:

pd.merge(lefth, righth, left_on=['key1', 'key2'],
         right_index=True, how='outer')

# In[ ]:

left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                  index=['a', 'c', 'e'], columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                   index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])

# In[ ]:

left2

# In[ ]:

right2

# In[ ]:

pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

# In[ ]:

left2.join(right2, how='outer')

# In[ ]:

left1.join(right1, on='key')

# In[ ]:

another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                    index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])

# In[ ]:

left2.join([right2, another])

# In[ ]:

left2.join([right2, another], how='outer')

# ### Concatenating along an axis

# In[ ]:

arr = np.arange(12).reshape((3, 4))

# In[ ]:

arr

# In[ ]:

np.concatenate([arr, arr], axis=1)

# In[ ]:

s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])

# In[ ]:

pd.concat([s1, s2, s3])
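# In[ ]:

# A small aside (not in the original notebook): concat performs no alignment
# along the concatenation axis, so stacking a Series with itself simply
# repeats its index labels rather than merging them.
pd.concat([s1, s1])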
# In[ ]:

pd.concat([s1, s2, s3], axis=1)

# In[ ]:

s4 = pd.concat([s1 * 5, s3])

# In[ ]:

pd.concat([s1, s4], axis=1)

# In[ ]:

pd.concat([s1, s4], axis=1, join='inner')

# In[ ]:

# join_axes was removed from concat in pandas 1.0; reindexing the result
# selects the same labels
pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e'])

# In[ ]:

result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])

# In[ ]:

result

# In[ ]:

# Much more on the unstack function later
result.unstack()

# In[ ]:

pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])

# In[ ]:

df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                columns=['three', 'four'])
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])

# In[ ]:

pd.concat({'level1': df1, 'level2': df2}, axis=1)

# In[ ]:

pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
          names=['upper', 'lower'])

# In[ ]:

df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])

# In[ ]:

df1

# In[ ]:

df2

# In[ ]:

pd.concat([df1, df2], ignore_index=True)

# ### Combining data with overlap

# In[ ]:

a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b.iloc[-1] = np.nan  # positional assignment via b[-1] is deprecated

# In[ ]:

a

# In[ ]:

b

# In[ ]:

np.where(pd.isnull(a), b, a)

# In[ ]:

b[:-2].combine_first(a[2:])

# In[ ]:

df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2)

# ## Reshaping and pivoting

# ### Reshaping with hierarchical indexing

# In[ ]:

data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=pd.Index(['Ohio', 'Colorado'], name='state'),
                 columns=pd.Index(['one', 'two', 'three'], name='number'))
data

# In[ ]:

result = data.stack()
result

# In[ ]:

result.unstack()

# In[ ]:

result.unstack(0)

# In[ ]:

result.unstack('state')

# In[ ]:

s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2.unstack()

# In[ ]:

data2.unstack().stack()

# In[ ]:

data2.unstack().stack(dropna=False)

# In[ ]:

df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
df

# In[ ]:

df.unstack('state')

# In[ ]:

df.unstack('state').stack('side')

# ### Pivoting "long" to "wide" format

# In[ ]:

data = pd.read_csv('ch07/macrodata.csv')
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
data = DataFrame(data.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))

ldata = data.stack().reset_index().rename(columns={0: 'value'})
# pivot's arguments are keyword-only in recent pandas
wdata = ldata.pivot(index='date', columns='item', values='value')

# In[ ]:

ldata[:10]

# In[ ]:

pivoted = ldata.pivot(index='date', columns='item', values='value')
pivoted.head()

# In[ ]:

ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]

# In[ ]:

pivoted = ldata.pivot(index='date', columns='item')
pivoted[:5]

# In[ ]:

pivoted['value'][:5]

# In[ ]:

unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]

# ## Data transformation

# ### Removing duplicates

# In[ ]:

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data

# In[ ]:

data.duplicated()

# In[ ]:

data.drop_duplicates()

# In[ ]:

data['v1'] = range(7)
data.drop_duplicates(['k1'])

# In[ ]:

# take_last=True was renamed to keep='last'
data.drop_duplicates(['k1', 'k2'], keep='last')
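# In[ ]:

# An extra sketch (not in the book): keep=False marks every member of each
# duplicate group, which is handy for inspecting the offending rows before
# deciding which to drop.
data[data.duplicated(['k1', 'k2'], keep=False)]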
# ### Transforming data using a function or mapping

# In[ ]:

data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

# In[ ]:

meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

# In[ ]:

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

# In[ ]:

data['food'].map(lambda x: meat_to_animal[x.lower()])

# ### Replacing values

# In[ ]:

data = Series([1., -999., 2., -999., -1000., 3.])
data

# In[ ]:

data.replace(-999, np.nan)

# In[ ]:

data.replace([-999, -1000], np.nan)

# In[ ]:

data.replace([-999, -1000], [np.nan, 0])

# In[ ]:

data.replace({-999: np.nan, -1000: 0})

# ### Renaming axis indexes

# In[ ]:

data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])

# In[ ]:

data.index.map(str.upper)

# In[ ]:

data.index = data.index.map(str.upper)
data

# In[ ]:

data.rename(index=str.title, columns=str.upper)

# In[ ]:

data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

# In[ ]:

# With inplace=True, rename modifies data directly (and returns None in
# modern pandas)
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

# ### Discretization and binning

# In[ ]:

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# In[ ]:

bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

# In[ ]:

# .labels and .levels were renamed .codes and .categories
cats.codes

# In[ ]:

cats.categories

# In[ ]:

pd.value_counts(cats)

# In[ ]:

pd.cut(ages, [18, 26, 36, 61, 100], right=False)

# In[ ]:

group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

# In[ ]:

data = np.random.rand(20)
pd.cut(data, 4, precision=2)

# In[ ]:

data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
cats

# In[ ]:

pd.value_counts(cats)

# In[ ]:

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

# ### Detecting and filtering outliers

# In[ ]:

np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()

# In[ ]:

col = data[3]
col[np.abs(col) > 3]

# In[ ]:

data[(np.abs(data) > 3).any(axis=1)]

# In[ ]:

data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

# ### Permutation and random sampling

# In[ ]:

df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler

# In[ ]:

df

# In[ ]:

df.take(sampler)

# In[ ]:

df.take(np.random.permutation(len(df))[:3])

# In[ ]:

bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)

# In[ ]:

sampler

# In[ ]:

draws = bag.take(sampler)
draws

# ### Computing indicator / dummy variables

# In[ ]:

df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                'data1': range(6)})
pd.get_dummies(df['key'])

# In[ ]:

dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

# In[ ]:

mnames = ['movie_id', 'title', 'genres']
# the multi-character separator requires the python parser engine
movies = pd.read_table('ch02/movielens/movies.dat', sep='::', header=None,
                       names=mnames, engine='python')
movies[:10]

# In[ ]:

genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))

# In[ ]:

dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)

# In[ ]:

# .ix was removed; .loc selects rows and columns by label
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1

# In[ ]:

movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

# In[ ]:

np.random.seed(12345)

# In[ ]:

values = np.random.rand(10)
values

# In[ ]:

bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))
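# In[ ]:

# An alternative sketch for the MovieLens genres above (not in the original):
# Series.str.get_dummies builds the same indicator matrix in one vectorized
# call, splitting each string on the '|' separator.
movies.genres.str.get_dummies('|')[:5]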
# ## String manipulation

# ### String object methods

# In[ ]:

val = 'a,b, guido'
val.split(',')

# In[ ]:

pieces = [x.strip() for x in val.split(',')]
pieces

# In[ ]:

first, second, third = pieces
first + '::' + second + '::' + third

# In[ ]:

'::'.join(pieces)

# In[ ]:

'guido' in val

# In[ ]:

val.index(',')

# In[ ]:

val.find(':')

# In[ ]:

# index raises ValueError when the substring is not found; find returns -1
val.index(':')

# In[ ]:

val.count(',')

# In[ ]:

val.replace(',', '::')

# In[ ]:

val.replace(',', '')

# ### Regular expressions

# In[ ]:

import re
text = "foo bar\t baz \tqux"
re.split(r'\s+', text)

# In[ ]:

regex = re.compile(r'\s+')
regex.split(text)

# In[ ]:

regex.findall(text)

# In[ ]:

text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

# In[ ]:

regex.findall(text)

# In[ ]:

m = regex.search(text)
m

# In[ ]:

text[m.start():m.end()]

# In[ ]:

print(regex.match(text))

# In[ ]:

print(regex.sub('REDACTED', text))

# In[ ]:

pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

# In[ ]:

m = regex.match('wesm@bright.net')
m.groups()

# In[ ]:

regex.findall(text)

# In[ ]:

print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

# In[ ]:

regex = re.compile(r"""
    (?P<username>[A-Z0-9._%+-]+)
    @
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE | re.VERBOSE)

# In[ ]:

m = regex.match('wesm@bright.net')
m.groupdict()

# ### Vectorized string functions in pandas

# In[ ]:

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = Series(data)

# In[ ]:

data

# In[ ]:

data.isnull()

# In[ ]:

data.str.contains('gmail')

# In[ ]:

pattern

# In[ ]:

data.str.findall(pattern, flags=re.IGNORECASE)

# In[ ]:

# str.match now returns booleans; use findall to pull out the match groups
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
matches

# In[ ]:

matches.str.get(1)

# In[ ]:

matches.str[0]

# In[ ]:

data.str[:5]
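# In[ ]:

# A short sketch (not in the original): str.extract applies a regex with
# capture groups and returns one column per group, with NaN where the pattern
# does not match. Here it reuses the grouped email pattern defined above.
data.str.extract(pattern, flags=re.IGNORECASE)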
# ## Example: USDA Food Database

# {
#   "id": 21441,
#   "description": "KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY, Wing, meat and skin with breading",
#   "tags": ["KFC"],
#   "manufacturer": "Kentucky Fried Chicken",
#   "group": "Fast Foods",
#   "portions": [
#     {
#       "amount": 1,
#       "unit": "wing, with skin",
#       "grams": 68.0
#     },
#     ...
#   ],
#   "nutrients": [
#     {
#       "value": 20.8,
#       "units": "g",
#       "description": "Protein",
#       "group": "Composition"
#     },
#     ...
#   ]
# }

# In[ ]:

import json
db = json.load(open('ch07/foods-2011-10-03.json'))
len(db)

# In[ ]:

db[0].keys()

# In[ ]:

db[0]['nutrients'][0]

# In[ ]:

nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]

# In[ ]:

info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)

# In[ ]:

info[:5]

# In[ ]:

info

# In[ ]:

pd.value_counts(info.group)[:10]

# In[ ]:

# Assemble one DataFrame of nutrients for all foods, tagged with the food id
nutrients = []

for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)

# In[ ]:

nutrients

# In[ ]:

nutrients.duplicated().sum()

# In[ ]:

nutrients = nutrients.drop_duplicates()

# In[ ]:

col_mapping = {'description': 'food',
               'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
info

# In[ ]:

col_mapping = {'description': 'nutrient',
               'group': 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
nutrients

# In[ ]:

ndata = pd.merge(nutrients, info, on='id', how='outer')

# In[ ]:

ndata

# In[ ]:

ndata.iloc[30000]

# In[ ]:

result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
# Series.order was renamed sort_values
result['Zinc, Zn'].sort_values().plot(kind='barh')

# In[ ]:

by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])

get_maximum = lambda x: x.xs(x.value.idxmax())
get_minimum = lambda x: x.xs(x.value.idxmin())

max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

# make the food names a little shorter
max_foods.food = max_foods.food.str[:50]

# In[ ]:

max_foods.loc['Amino Acids']['food']
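# In[ ]:

# A follow-up sketch (not in the original): unstacking the grouped medians
# computed above yields a nutrient-by-food-group table, convenient for
# comparing food groups side by side.
result.unstack('fgroup')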