#!/usr/bin/env python
# coding: utf-8
"""Bucket borrower FICO scores into ratings using a default log-likelihood.

Run as a script, this reads ``./02-loan_data.csv``, keeps the
``customer_id``, ``fico_score`` and ``default`` columns, derives bucket
boundaries over the observed FICO range, and prints the boundaries together
with a rating map in which a lower rating number denotes a higher score band.
"""

import numpy as np
import pandas as pd
from scipy.optimize import minimize

# Score ranges no wider than this are bucketed directly instead of being split.
threshold = 100


def _weighted_log(counts, probs):
    """Return ``counts * log(probs)`` element-wise, taking ``0 * log(0)`` as 0."""
    out = np.zeros_like(counts, dtype=float)
    positive = counts > 0
    out[positive] = counts[positive] * np.log(probs[positive])
    return out


def log_likelihood(boundaries, scores, defaults):
    """Return the negative binomial log-likelihood of a bucketing.

    Args:
        boundaries: Ascending bucket edges; ``len(boundaries) - 1`` buckets.
        scores: One score per observation.
        defaults: One default flag per observation, aligned with ``scores``
            by position (presumably 0/1 -- confirm against the data).

    Returns:
        ``-sum_i [k_i * log(p_i) + (n_i - k_i) * log(1 - p_i)]`` as a float,
        where ``n_i`` and ``k_i`` are the observation and default counts of
        bucket ``i`` and ``p_i = k_i / n_i``. Empty buckets contribute 0, and
        0.0 is returned when there are no buckets or no observations.

    Raises:
        ValueError: If ``boundaries`` is not monotonic (from ``np.digitize``).
    """
    edges = np.asarray(boundaries, dtype=float)
    score_arr = np.asarray(scores, dtype=float)
    default_arr = np.asarray(defaults, dtype=float)

    n_buckets = edges.size - 1
    if n_buckets < 1 or score_arr.size == 0:
        return 0.0

    # Buckets are half-open [edge_i, edge_i+1); clipping folds scores at or
    # beyond the outer edges into the first/last bucket, so a score equal to
    # the top edge is counted in the last real bucket.
    bucket = np.clip(np.digitize(score_arr, edges) - 1, 0, n_buckets - 1)
    ni = np.bincount(bucket, minlength=n_buckets).astype(float)
    ki = np.bincount(bucket, weights=default_arr, minlength=n_buckets)

    occupied = ni > 0
    p_default = np.zeros(n_buckets)
    p_survive = np.zeros(n_buckets)
    p_default[occupied] = ki[occupied] / ni[occupied]
    p_survive[occupied] = (ni[occupied] - ki[occupied]) / ni[occupied]

    total = _weighted_log(ki, p_default) + _weighted_log(ni - ki, p_survive)
    return -float(total.sum())


def simple_bucketing(scores, defaults, min_score, max_score):
    """Return three equally spaced edges (two buckets) over the score range.

    ``scores`` and ``defaults`` are accepted for signature parity with
    ``optimize_boundaries`` but are not used.
    """
    return np.linspace(min_score, max_score, 3)


def combine_boundaries(left_boundaries, right_boundaries):
    """Join two adjacent edge arrays, keeping a shared edge exactly once."""
    left = np.asarray(left_boundaries, dtype=float)
    right = np.asarray(right_boundaries, dtype=float)
    if left.size and right.size and left[-1] == right[0]:
        right = right[1:]
    return np.concatenate((left, right))


def _refine_boundaries(boundaries, scores, defaults):
    """Move the interior edges to lower the negative log-likelihood.

    The objective is piecewise constant in the edges, so a derivative-free
    search (Nelder-Mead) is used. The outer edges stay fixed, candidates are
    clipped to the range and sorted, and the refined edges are returned only
    when they strictly improve on the input.
    """
    if boundaries.size <= 2 or scores.size == 0:
        return boundaries

    low, high = boundaries[0], boundaries[-1]

    def assemble(interior):
        return np.concatenate(([low], np.sort(np.clip(interior, low, high)), [high]))

    def objective(interior):
        return log_likelihood(assemble(interior), scores, defaults)

    result = minimize(objective, boundaries[1:-1], method="Nelder-Mead")
    candidate = assemble(result.x)
    if log_likelihood(candidate, scores, defaults) < log_likelihood(boundaries, scores, defaults):
        return candidate
    return boundaries


def optimize_boundaries(scores, defaults, min_score, max_score):
    """Derive bucket edges for ``[min_score, max_score]`` by recursive halving.

    Ranges no wider than the module-level ``threshold`` are bucketed with
    ``simple_bucketing``. Wider ranges are halved, solved recursively, merged,
    and then refined against the observations that fall inside the range.

    Args:
        scores: One score per observation.
        defaults: One default flag per observation, aligned by position.
        min_score: Lower end of the range to bucket.
        max_score: Upper end of the range to bucket.

    Returns:
        Ascending array of edges starting at ``min_score`` and ending at
        ``max_score``.
    """
    score_arr = np.asarray(scores, dtype=float)
    default_arr = np.asarray(defaults, dtype=float)

    if max_score - min_score <= threshold:
        return simple_bucketing(score_arr, default_arr, min_score, max_score)

    mid = (min_score + max_score) // 2
    if mid <= min_score or mid >= max_score:
        # The midpoint no longer splits the range; recursing would not terminate.
        return simple_bucketing(score_arr, default_arr, min_score, max_score)

    # No cross-call cache: sub-ranges never repeat within one call, and a
    # cache keyed only by the score range would hand back stale edges when
    # the function is called again with different data.
    left = optimize_boundaries(score_arr, default_arr, min_score, mid)
    right = optimize_boundaries(score_arr, default_arr, mid, max_score)
    combined = combine_boundaries(left, right)

    in_range = (score_arr >= min_score) & (score_arr <= max_score)
    return _refine_boundaries(combined, score_arr[in_range], default_arr[in_range])


def create_rating_map(optimal_boundaries):
    """Map rating labels to score bands, with a lower rating meaning a better score.

    ``"Rating 1"`` is the highest band ``(edges[-2], edges[-1])`` and the last
    rating is the lowest band. Fewer than two edges yield an empty dict.
    """
    edges = list(optimal_boundaries)
    bands = list(zip(edges[:-1], edges[1:]))
    return {
        f"Rating {rank}": band
        for rank, band in enumerate(reversed(bands), start=1)
    }


if __name__ == "__main__":
    # In[2]:
    loan_data = pd.read_csv('./02-loan_data.csv')

    # In[6]:
    data = loan_data[['customer_id', 'fico_score', 'default']]

    # In[9]:
    # Set desired number of buckets
    # NOTE(review): num_buckets / initial_boundaries are an equal-width
    # reference only; optimize_boundaries derives its bucket count from
    # `threshold`, not from num_buckets.
    num_buckets = 10
    min_score, max_score = data.fico_score.min(), data.fico_score.max()
    initial_boundaries = np.linspace(min_score, max_score, num_buckets + 1)

    # In[15]:
    optimal_boundaries = optimize_boundaries(
        data['fico_score'], data['default'], min_score, max_score
    )
    rating_map = create_rating_map(optimal_boundaries)
    print("Optimized Boundaries:", optimal_boundaries)
    print("Rating Map:", rating_map)