Introduction¶

In [1]:

import pandas as pd
pd.options.display.max_columns = 99
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

Compute each column's missing value counts

In [2]:

mv_list = []
for chunk in chunk_iter:
    mv_list.append(chunk.isnull().sum())
    
combined_mv_vc = pd.concat(mv_list)
unique_combined_mv_vc = combined_mv_vc.groupby(combined_mv_vc.index).sum()
unique_combined_mv_vc.sort_values()

Out[2]:

company_country_code          1
company_name                  1
company_permalink             1
company_region                1
investor_region               2
investor_permalink            2
investor_name                 2
funded_quarter                3
funded_at                     3
funded_month                  3
funded_year                   3
funding_round_type            3
company_state_code          492
company_city                533
company_category_code       643
raised_amount_usd          3599
investor_country_code     12001
investor_city             12480
investor_state_code       16809
investor_category_code    50427
dtype: int64

Total memory footprint for each column

In [3]:

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')
counter = 0
series_memory_fp = pd.Series(dtype='float64')
for chunk in chunk_iter:
    if counter == 0:
        series_memory_fp = chunk.memory_usage(deep=True)
    else:
        series_memory_fp += chunk.memory_usage(deep=True)
    counter += 1

# Drop memory footprint calculation for the index.
series_memory_fp = series_memory_fp.drop('Index')
series_memory_fp

Out[3]:

company_permalink         4057788
company_name              3591326
company_category_code     3421104
company_country_code      3172176
company_state_code        3106051
company_region            3411585
company_city              3505926
investor_permalink        4980548
investor_name             3915666
investor_category_code     622424
investor_country_code     2647292
investor_state_code       2476607
investor_region           3396281
investor_city             2885083
funding_round_type        3410707
funded_at                 3542185
funded_month              3383584
funded_quarter            3383584
funded_year                422960
raised_amount_usd          422960
dtype: int64

Total memory footprint of the data (in megabytes)

In [4]:

series_memory_fp.sum() / (1024 * 1024)

Out[4]:

56.9876070022583

In [5]:

unique_combined_mv_vc.sort_values()

Out[5]:

company_country_code          1
company_name                  1
company_permalink             1
company_region                1
investor_region               2
investor_permalink            2
investor_name                 2
funded_quarter                3
funded_at                     3
funded_month                  3
funded_year                   3
funding_round_type            3
company_state_code          492
company_city                533
company_category_code       643
raised_amount_usd          3599
investor_country_code     12001
investor_city             12480
investor_state_code       16809
investor_category_code    50427
dtype: int64

In [6]:

# Drop columns representing URL's or containing way too many missing values (>90% missing)
drop_cols = ['investor_permalink', 'company_permalink', 'investor_category_code']
keep_cols = chunk.columns.drop(drop_cols)

In [7]:

keep_cols.tolist

Out[7]:

<bound method IndexOpsMixin.tolist of Index(['company_name', 'company_category_code', 'company_country_code',
       'company_state_code', 'company_region', 'company_city', 'investor_name',
       'investor_country_code', 'investor_state_code', 'investor_region',
       'investor_city', 'funding_round_type', 'funded_at', 'funded_month',
       'funded_quarter', 'funded_year', 'raised_amount_usd'],
      dtype='object')>

Selecting Data Types¶

Let's first determine which columns shift types across chunks. Note that we only lay the groundwork for this step.

In [8]:

# Key: Column name, Value: List of types
col_types = {}
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=keep_cols)

for chunk in chunk_iter:
    for col in chunk.columns:
        if col not in col_types:
            col_types[col] = [str(chunk.dtypes[col])]
        else:
            col_types[col].append(str(chunk.dtypes[col]))

In [9]:

uniq_col_types = {}
for k,v in col_types.items():
    uniq_col_types[k] = set(col_types[k])
uniq_col_types

Out[9]:

{'company_name': {'object'},
 'company_category_code': {'object'},
 'company_country_code': {'object'},
 'company_state_code': {'object'},
 'company_region': {'object'},
 'company_city': {'object'},
 'investor_name': {'object'},
 'investor_country_code': {'float64', 'object'},
 'investor_state_code': {'float64', 'object'},
 'investor_region': {'object'},
 'investor_city': {'float64', 'object'},
 'funding_round_type': {'object'},
 'funded_at': {'object'},
 'funded_month': {'object'},
 'funded_quarter': {'object'},
 'funded_year': {'float64', 'int64'},
 'raised_amount_usd': {'float64'}}

In [10]:

chunk

Out[10]:

	company_name	company_category_code	company_country_code	company_state_code	company_region	company_city	investor_name	investor_country_code	investor_state_code	investor_region	investor_city	funding_round_type	funded_at	funded_month	funded_quarter	funded_year	raised_amount_usd
50000	NuORDER	fashion	USA	CA	Los Angeles	West Hollywood	Mortimer Singer	NaN	NaN	unknown	NaN	series-a	2012-10-01	2012-10	2012-Q4	2012	3060000.0
50001	ChaCha	advertising	USA	IN	Indianapolis	Carmel	Morton Meyerson	NaN	NaN	unknown	NaN	series-b	2007-10-01	2007-10	2007-Q4	2007	12000000.0
50002	Binfire	software	USA	FL	Bocat Raton	Bocat Raton	Moshe Ariel	NaN	NaN	unknown	NaN	angel	2008-04-18	2008-04	2008-Q2	2008	500000.0
50003	Binfire	software	USA	FL	Bocat Raton	Bocat Raton	Moshe Ariel	NaN	NaN	unknown	NaN	angel	2010-01-01	2010-01	2010-Q1	2010	750000.0
50004	Unified Color	software	USA	CA	SF Bay	South San Frnacisco	Mr. Andrew Oung	NaN	NaN	unknown	NaN	angel	2010-01-01	2010-01	2010-Q1	2010	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
52865	Garantia Data	enterprise	USA	CA	SF Bay	Santa Clara	Zohar Gilon	NaN	NaN	unknown	NaN	series-a	2012-08-08	2012-08	2012-Q3	2012	3800000.0
52866	DudaMobile	mobile	USA	CA	SF Bay	Palo Alto	Zohar Gilon	NaN	NaN	unknown	NaN	series-c+	2013-04-08	2013-04	2013-Q2	2013	10300000.0
52867	SiteBrains	software	USA	CA	SF Bay	San Francisco	zohar israel	NaN	NaN	unknown	NaN	angel	2010-08-01	2010-08	2010-Q3	2010	350000.0
52868	Comprehend Systems	enterprise	USA	CA	SF Bay	Palo Alto	Zorba Lieberman	NaN	NaN	unknown	NaN	series-a	2013-07-11	2013-07	2013-Q3	2013	8400000.0
52869	SmartThings	mobile	USA	DC	unknown	Minneapolis	Zorik Gordon	NaN	NaN	unknown	NaN	series-a	2012-12-04	2012-12	2012-Q4	2012	3000000.0

2870 rows × 17 columns

Loading Chunks Into SQLite¶

In [11]:

import sqlite3
conn = sqlite3.connect('crunchbase.db')
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

for chunk in chunk_iter:
    chunk.to_sql("investments", conn, if_exists='append', index=False)