Instructions:
Study the columns for memory optimization.
Actions:
- term: remove the "months" string
- int_rate / revol_util: remove the % sign
- grade / sub_grade / home_ownership / verification_status / loan_status / pymnt_plan / purpose / addr_state / initial_list_status / application_type: candidates for the category dtype (see the sketch below)
- emp_length: strip the string parts, keep the number of years
- issue_d / earliest_cr_line / last_pymnt_d / last_credit_pull_d: convert to dates
- zip_code: check if it can be converted to integer
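Before touching the file, it is worth recalling why the category dtype helps: an object column stores a full Python string per row, while category stores each distinct value once plus small integer codes. A minimal sketch on toy data (not from the dataset):

import pandas as pd

# Toy column: few distinct values repeated many times
s = pd.Series(["A", "B", "C"] * 10_000)

object_mb = s.memory_usage(deep=True) / (1024 * 1024)
category_mb = s.astype("category").memory_usage(deep=True) / (1024 * 1024)
print(f"object: {object_mb:.2f} MB, category: {category_mb:.2f} MB")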
import pandas as pd

# Preview the first rows to see what each column contains
df = pd.read_csv(r"/Users/matteo/Desktop/Python Projects & Notes/loans_2007.csv", nrows=5)
for n, row in df.iterrows():
    print(row)
df = pd.read_csv(r"/Users/matteo/Desktop/Python Projects & Notes/loans_2007.csv", nrows=3250)
df.memory_usage(deep=True).sum() / (1024 * 1024)
# Reading 3,250 rows already consumes about 5 MB of memory
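To decide whether chunking is needed at all, the sample's footprint can be scaled linearly to the full file. This is only a rough estimate (string lengths vary across rows), using the 3,250-row sample just read:

# Project the full-file footprint from the sample
sample_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
with open(r"/Users/matteo/Desktop/Python Projects & Notes/loans_2007.csv") as f:
    total_rows = sum(1 for _ in f) - 1  # subtract the header line
print(f"Projected memory for all {total_rows} rows: {sample_mb * total_rows / 3250:.1f} MB")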
cols_digits = ["term", "int_rate", "revol_util", "emp_length"]
cols_date = ["issue_d", "earliest_cr_line", "last_pymnt_d", "last_credit_pull_d"]

def extract_digits(col):
    # Keep only the first run of digits, e.g. " 36 months" -> "36"
    return col.str.extract(r"(\d+)", expand=False)

renaming = {"term": "term months", "int_rate": "int_rate %", "revol_util": "revol_util %", "emp_length": "emp_length years"}
df[cols_date] = df[cols_date].apply(pd.to_datetime)
df[cols_digits] = df[cols_digits].apply(extract_digits).apply(pd.to_numeric)
df = df.rename(columns=renaming)
df.head()
df.memory_usage(deep=True).sum() / (1024 * 1024)
Converting to datetime and numeric alone cut about 1 MB. Next, we convert a column to the category dtype when it has fewer than 20 unique values.
check_cat = ["grade", "sub_grade", "home_ownership", "verification_status", "loan_status",
             "pymnt_plan", "purpose", "addr_state", "initial_list_status", "application_type"]
num_unique = {}
path = r"/Users/matteo/Desktop/Python Projects & Notes/loans_2007.csv"
for cat in check_cat:
    # Read one column at a time so the check itself stays cheap on memory
    col = pd.read_csv(path, usecols=[cat])
    num_unique[cat] = col[cat].nunique()
test = pd.DataFrame(num_unique.items(), columns=["Col_name", "Number Unique"])
test
Except for addr_state, which exceeds the 20-unique-value threshold, the rest will be converted to the category dtype for memory optimization.
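Before converting everything, the saving can be checked on a single column; for example on grade, which is still an object column at this point:

# How much does the category dtype save for one column?
before = df["grade"].memory_usage(deep=True)
after = df["grade"].astype("category").memory_usage(deep=True)
print(f"grade: {before / 1024:.1f} KB as object -> {after / 1024:.1f} KB as category")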
check_cat = ["grade", "sub_grade", "home_ownership", "verification_status", "loan_status",
             "pymnt_plan", "purpose", "initial_list_status", "application_type"]
for cat_col in check_cat:
    df[cat_col] = df[cat_col].astype("category")

def memory_usage(df):
    # Deep memory usage of a dataframe as a human-readable string
    return str(round(df.memory_usage(deep=True).sum() / (1024 * 1024), 2)) + " MB"

memory_usage(df)
df.select_dtypes(include=["integer", "float"]).columns
to_integer = ['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
'term months', 'installment', 'emp_length years',
'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
'pub_rec', 'revol_bal', 'total_acc', 'out_prncp',
'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq',
'chargeoff_within_12_mths', 'delinq_amnt', 'pub_rec_bankruptcies',
'tax_liens']
# downcast="integer" only takes effect where values are whole numbers with no NaNs;
# columns containing missing values stay float
df[to_integer] = df[to_integer].apply(pd.to_numeric, downcast="integer")
df.select_dtypes(include="integer")
types = df.dtypes.to_frame()
types = types[types[0] != "datetime64[ns]"]
types
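The remaining float columns can be downcast as well. A sketch of an optional extra step: float32 halves their footprint at the cost of precision, so check that this is acceptable for the analysis before applying it.

# Optional: downcast remaining float64 columns to float32 where possible
float_cols = df.select_dtypes(include="float").columns
df[float_cols] = df[float_cols].apply(pd.to_numeric, downcast="float")
memory_usage(df)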
We reduced the sample's memory usage from 5 MB to 1.83 MB.
Now read the whole file in chunks, cleaning each chunk and applying the optimal data types as it is processed.
def extract_digits(col):
    # Keep only the first run of digits in each string value
    return col.str.extract(r"(\d+)", expand=False)

def chunk_processing(path, chunksize=5000):
    # Columns to strip down to their digits
    cols_digits = ["term", "int_rate", "revol_util", "emp_length"]
    # Columns to parse as dates
    cols_date = ["issue_d", "earliest_cr_line", "last_pymnt_d", "last_credit_pull_d"]
    # Columns to convert to integers where possible
    numerical = ['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
                 'term months', 'installment', 'emp_length years',
                 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
                 'pub_rec', 'revol_bal', 'total_acc', 'out_prncp',
                 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
                 'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq',
                 'chargeoff_within_12_mths', 'delinq_amnt', 'pub_rec_bankruptcies',
                 'tax_liens']
    # Columns to load directly as categorical
    check_cat = ["grade", "sub_grade", "home_ownership", "verification_status", "loan_status",
                 "pymnt_plan", "purpose", "initial_list_status", "application_type"]
    # Column renames applied after cleaning
    renaming = {"term": "term months", "int_rate": "int_rate %", "revol_util": "revol_util %", "emp_length": "emp_length years"}
    categorical = {cat: "category" for cat in check_cat}
    chunk_iter = pd.read_csv(path, chunksize=chunksize, parse_dates=cols_date, dtype=categorical)
    chunks = []
    for chunk in chunk_iter:
        chunk[cols_digits] = chunk[cols_digits].apply(extract_digits).apply(pd.to_numeric)
        chunk = chunk.rename(columns=renaming)
        chunk[numerical] = chunk[numerical].apply(pd.to_numeric, errors="coerce", downcast="integer")
        chunks.append(chunk)
    final = pd.concat(chunks)
    # Chunks can carry different category sets, in which case concat falls back
    # to object; re-cast to restore the category dtype on the full frame
    final[check_cat] = final[check_cat].astype("category")
    return final
optimized = memory_usage(chunk_processing(r"/Users/matteo/Desktop/Python Projects & Notes/loans_2007.csv", 10000))
non_optimized = memory_usage(pd.read_csv(r"/Users/matteo/Desktop/Python Projects & Notes/loans_2007.csv"))
print("Optimized:", optimized, "//", "Non-optimized", non_optimized)