import pandas as pd
import numpy as np
loan_data = pd.read_csv('./02-loan_data.csv')
loan_data.head()
customer_id | credit_lines_outstanding | loan_amt_outstanding | total_debt_outstanding | income | years_employed | fico_score | default | |
---|---|---|---|---|---|---|---|---|
0 | 8153374 | 0 | 5221.545193 | 3915.471226 | 78039.38546 | 5 | 605 | 0 |
1 | 7442532 | 5 | 1958.928726 | 8228.752520 | 26648.43525 | 2 | 572 | 1 |
2 | 2256073 | 0 | 3363.009259 | 2027.830850 | 65866.71246 | 4 | 602 | 0 |
3 | 4885975 | 0 | 4766.648001 | 2501.730397 | 74356.88347 | 5 | 612 | 0 |
4 | 4700614 | 1 | 1345.827718 | 1768.826187 | 23448.32631 | 6 | 631 | 0 |
data = loan_data[['customer_id', 'fico_score', 'default']]
data.head()
customer_id | fico_score | default | |
---|---|---|---|
0 | 8153374 | 605 | 0 |
1 | 7442532 | 572 | 1 |
2 | 2256073 | 602 | 0 |
3 | 4885975 | 612 | 0 |
4 | 4700614 | 631 | 0 |
from scipy.optimize import minimize
# Set desired number of buckets
num_buckets = 10
min_score, max_score = data.fico_score.min(), data.fico_score.max()
def log_likelihood(boundaries, scores, defaults):
n = len(scores)
ni = np.zeros_like(boundaries)
ki = np.zeros_like(boundaries)
for i, score in enumerate(scores):
bucket = np.digitize(score, boundaries) - 1
ni[bucket] += 1
ki[bucket] += defaults[i]
pi = ki / ni
LL = np.sum(ni * np.log(pi) + (ni - ki) * np.log(1 - pi))
return -LL
initial_boundaries = np.linspace(min_score, max_score, num_buckets + 1)
memo = {} # Top-down dynamic programming
threshold = 100
def optimize_boundaries(scores, defaults, min_score, max_score):
if max_score - min_score <= threshold:
return simple_bucketing(scores, defaults, min_score, max_score)
if (min_score, max_score) in memo:
return memo[(min_score, max_score)]
mid = (min_score + max_score) // 2
left_boundaries = optimize_boundaries(scores, defaults, min_score, mid)
right_boundaries = optimize_boundaries(scores, defaults, mid, max_score)
combined_boundaries = combine_boundaries(left_boundaries, right_boundaries)
optimized_boundaries = minimize(log_likelihood, combined_boundaries, args=(scores, defaults)).x
memo[(min_score, max_score)] = optimized_boundaries
return optimized_boundaries
def simple_bucketing(scores, defaults, min_score, max_score):
return np.linspace(min_score, max_score, 3)
def combine_boundaries(left_boundaries, right_boundaries):
return np.concatenate((left_boundaries[:-1], right_boundaries[1:]))
def create_rating_map(optimal_boundaries):
rating_map = {}
for i in range(len(optimal_boundaries) - 1):
rating_map[f"Rating {i + 1}"] = (optimal_boundaries[i], optimal_boundaries[i + 1])
return rating_map
optimal_boundaries = optimize_boundaries(data['fico_score'], data['default'], min(data['fico_score']), max(data['fico_score']))
rating_map = create_rating_map(optimal_boundaries)
print("Optimized Boundaries:", optimal_boundaries)
print("Rating Map:", rating_map)
/var/folders/h0/722z94dd3fb0pfv4wg3qmdkh0000gn/T/ipykernel_25445/2800205351.py:14: RuntimeWarning: divide by zero encountered in log LL = np.sum(ni * np.log(pi) + (ni - ki) * np.log(1 - pi)) /var/folders/h0/722z94dd3fb0pfv4wg3qmdkh0000gn/T/ipykernel_25445/2800205351.py:14: RuntimeWarning: invalid value encountered in multiply LL = np.sum(ni * np.log(pi) + (ni - ki) * np.log(1 - pi)) /Users/brhank/miniconda3/lib/python3.11/site-packages/scipy/optimize/_numdiff.py:576: RuntimeWarning: invalid value encountered in subtract df = fun(x) - f0
Optimized Boundaries: [408. 435.5 490.5 545.5 600.9999951 656.5 711.5 766.5 822. 850. ] Rating Map: {'Rating 1': (408.0, 435.5), 'Rating 2': (435.5, 490.5), 'Rating 3': (490.5, 545.5), 'Rating 4': (545.5, 600.9999951015242), 'Rating 5': (600.9999951015242, 656.5), 'Rating 6': (656.5, 711.5), 'Rating 7': (711.5, 766.5), 'Rating 8': (766.5, 822.0), 'Rating 9': (822.0, 850.0)}
/var/folders/h0/722z94dd3fb0pfv4wg3qmdkh0000gn/T/ipykernel_25445/2800205351.py:13: RuntimeWarning: invalid value encountered in divide pi = ki / ni
# We are told to create a rating map that maps the FICO score of the borrowers to a rating where a lower rating signifies a better credit score.
def create_rating_map(optimal_boundaries):
rating_map = {}
l = len(optimal_boundaries)
for i in range(l - 1,0,-1):
rating_map[f"Rating {l-i}"] = (optimal_boundaries[i-1], optimal_boundaries[i])
return rating_map
optimal_boundaries = optimize_boundaries(data['fico_score'], data['default'], min(data['fico_score']), max(data['fico_score']))
rating_map = create_rating_map(optimal_boundaries)
print("Optimized Boundaries:", optimal_boundaries)
print("Rating Map:", rating_map)
Optimized Boundaries: [408. 435.5 490.5 545.5 600.9999951 656.5 711.5 766.5 822. 850. ] Rating Map: {'Rating 1': (822.0, 850.0), 'Rating 2': (766.5, 822.0), 'Rating 3': (711.5, 766.5), 'Rating 4': (656.5, 711.5), 'Rating 5': (600.9999951015242, 656.5), 'Rating 6': (545.5, 600.9999951015242), 'Rating 7': (490.5, 545.5), 'Rating 8': (435.5, 490.5), 'Rating 9': (408.0, 435.5)}