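You'll analyze real California house prices to understand wealth distribution. Calculate the mean, the median, the quartiles, and the percentile of a given house price, then draw a text histogram of the distribution.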
This will show you why median home price is more meaningful than average!
First, you'll need to access our shared datasets folder and add it to your Drive.
You should now see "datasets" in your Google Drive!
In your Colab notebook, run this code to access your Google Drive:
# Connect to Google Drive
# Mounting at /content/gdrive is what makes the dataset path used in the
# next step (/content/gdrive/MyDrive/...) readable from this notebook.
from google.colab import drive
drive.mount("/content/gdrive")
When prompted, follow the on-screen steps to give Colab permission to access your Google Drive.
# Open and read the file; the `with` block closes it as soon as reading finishes
with open("/content/gdrive/MyDrive/datasets/house_prices.txt", "r") as price_file:
    file_content = price_file.read()
# Convert file lines to numbers, skipping blank lines that float() cannot parse
prices = []
for line in file_content.strip().split('\n'):
    if line.strip():
        prices.append(float(line))
print(f"Loaded {len(prices)} house prices")
print(f"First 5 prices: {prices[:5]}")
# Calculate the mean
# TODO: Sum all prices and divide by count
# Calculate the median
# TODO: Sort the prices first
# TODO: Find the middle value (remember even vs odd count!)
# Print your results
# NOTE: the prints below read `mean_price` and `median_price`, so the TODO
# steps above must store their results under exactly those names.
print(f"Mean house price: ${mean_price:,.2f}")
print(f"Median house price: ${median_price:,.2f}")
What do you notice about the difference?
Quartiles divide your data into four equal parts:
# Sort the data first!
sorted_prices = sorted(prices)
n = len(sorted_prices)
# Calculate quartile positions
# (each position is an index into sorted_prices)
# TODO: Q1 is at position n//4
# TODO: Q2 is at position n//2
# TODO: Q3 is at position 3*n//4
# NOTE: the prints below read `q1`, `q2` and `q3`, so the TODO steps above
# must store the three quartile values under exactly those names.
print(f"Q1 (25th percentile): ${q1:,.2f}")
print(f"Q2 (50th percentile): ${q2:,.2f}")
print(f"Q3 (75th percentile): ${q3:,.2f}")
Write a function that tells you what percentile a given house price is at:
# Exercise scaffold: takes one house price and the full list of prices.
# The function returns `percentile`, so the TODO steps must assign that name.
def find_percentile(house_price, all_prices):
sorted_prices = sorted(all_prices)
# Count how many prices are below this price
# TODO: Loop through sorted_prices and count
# Calculate the percentile
# TODO: (count_below / total_count) * 100
return percentile
# Test your function
# `prices` is the list built when the data file was loaded.
my_house = 450000
result = find_percentile(my_house, prices)
print(f"A ${my_house:,} house is at the {result:.1f} percentile")
Visualize the distribution using print statements where each █
represents 20 houses.
# Define price ranges (bins)
# Each counter starts at zero and is named after the price range it counts.
bin_0_50k = 0
bin_50_100k = 0
bin_100_150k = 0
# Etc.
# Loop through the prices and, for each one,
# use an if/elif/else statement to count the house
# in the correct bin
# This helper is provided for you: it builds a bar whose length
# matches a bin's count, using one block character per 20 houses.
def make_bar(size):
    """Return a string of "█" characters, one for every full 20 in `size`."""
    blocks = size // 20
    return "█" * blocks
# Print the histogram
# Each row is a price-range label followed by the bar make_bar builds
# from that bin's count.
print("California House Price Distribution:")
print(f"$0-50k: {make_bar(bin_0_50k)}")
print(f"$50-100k: {make_bar(bin_50_100k)}")
print(f"$100-150k: {make_bar(bin_100_150k)}")
# TODO: Print the other bins
# Connect to Google Drive
from google.colab import drive
drive.mount("/content/gdrive")

# Open and read the file; the `with` block closes it as soon as reading finishes
with open("/content/gdrive/MyDrive/datasets/house_prices.txt", "r") as price_file:
    file_content = price_file.read()

# Convert file lines to numbers, skipping blank lines that float() cannot parse
prices = []
for line in file_content.strip().split('\n'):
    if line.strip():
        prices.append(float(line))
print(f"Loaded {len(prices)} house prices")
print(f"First 5 prices: {prices[:5]}")
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True). Loaded 3000 house prices First 5 prices: [344700.0, 176500.0, 270500.0, 330000.0, 81700.0]
from math import floor
def get_mean(prices):
    """Return the arithmetic mean of `prices`: their sum divided by their count."""
    return sum(prices) / len(prices)
def get_median(prices):
    """Return the median of `prices`.

    The values are sorted first. With an odd count the single middle value
    is returned; with an even count the two middle values are averaged.
    """
    ordered = sorted(prices)
    mid, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[mid]
    # Even count: there is no single middle element, so average the pair
    # that straddles the centre.
    return (ordered[mid] + ordered[mid - 1]) / 2
def get_dumb_median(prices):
    """Return the sorted value at index ``len // 2``.

    For an even count this is the upper of the two middle values rather
    than their average, so it can differ from get_median.
    """
    ordered = sorted(prices)
    return ordered[len(ordered) // 2]
def get_dumb_median_short_code(prices):
    """Return the sorted value at index ``len(prices) // 2``."""
    ordered = sorted(prices)
    middle = len(prices) // 2
    return ordered[middle]
# Same shape as the simple median, but indexes a quarter of the way in
def get_quartile_q1(prices):
    """Return the sorted value at index ``len // 4`` (the first quartile)."""
    ordered = sorted(prices)
    q1_index = len(ordered) // 4
    return ordered[q1_index]
def get_percentile(prices, percent):
    """Return the value `percent` percent of the way through the sorted prices.

    The index used is ``floor(percent * count / 100)``, capped at the last
    element so that ``percent=100`` returns the maximum.

    Args:
        prices: Non-empty collection of numbers.
        percent: Percentile to look up, from 0 to 100 inclusive.

    Returns:
        The element of the sorted prices at the computed index.

    Raises:
        ValueError: If `percent` is outside 0-100 or `prices` is empty.
    """
    if not 0 <= percent <= 100:
        # A negative percent would otherwise produce a negative index and
        # silently read from the top end of the list.
        raise ValueError(f"percent must be between 0 and 100, got {percent}")
    sorted_prices = sorted(prices)
    count = len(sorted_prices)
    if count == 0:
        raise ValueError("prices must not be empty")
    # Multiply before dividing: (29 / 100) * 100 evaluates to 28.999...,
    # which floor() would turn into index 28 instead of 29.
    idx = floor(percent * count / 100)
    # percent == 100 gives idx == count, one past the end; use the last item.
    idx = min(idx, count - 1)
    return sorted_prices[idx]
# Summary statistics for the loaded prices.
mean = get_mean(prices)
print(f"Mean is: {mean}")

median = get_median(prices)
print(f"Median is: {median}")

# Store the shortcut result under its own name so that `median` keeps
# holding the value returned by get_median.
dumb_median = get_dumb_median(prices)
print(f"Dumb Median is: {dumb_median}")

q1 = get_quartile_q1(prices)
print(f"Q1 is: {q1}")

# Q1 and the 25th percentile use the same index, as do the dumb median and
# the 50th percentile, so those pairs should print matching values.
p25 = get_percentile(prices, 25)
print(f"25th percentile is: {p25}")
p50 = get_percentile(prices, 50)
print(f"50th percentile is: {p50}")
p95 = get_percentile(prices, 95)
print(f"95th percentile is: {p95}")
p5 = get_percentile(prices, 5)
print(f"5th percentile is: {p5}")
p0 = get_percentile(prices, 0)
print(f"0th percentile is: {p0}")
p99 = get_percentile(prices, 99)
print(f"99th percentile is: {p99}")
Mean is: 205846.275 Median is: 177650.0 Dumb Median is: 177700.0 Q1 is: 121200.0 25th percentile is: 121200.0 50th percentile is: 177700.0 95th percentile is: 466400.0 5th percentile is: 67800.0 0th percentile is: 22500.0 99th percentile is: 500001.0
def find_percentile(my_house, prices):
    """Return the percentage of `prices` that are at or below `my_house`.

    Prices equal to `my_house` are included in the count.
    """
    at_or_below = sum(1 for price in prices if price <= my_house)
    return (at_or_below / len(prices)) * 100
# Rank a $450,000 house against every price in the dataset.
my_house = 450_000
my_house_percentile = find_percentile(my_house, prices)
print(f"My house is in the {my_house_percentile:.2f}th percentile")
My house is in the 94.40th percentile
def make_bar(size, per_block=20):
    """Return a bar of "█" characters representing `size` items.

    Args:
        size: Number of items in the bin.
        per_block: How many items one "█" stands for. Defaults to 20.

    Returns:
        One "█" for every full `per_block` in `size`; an empty string when
        `size` is smaller than `per_block`.
    """
    # String repetition builds the bar in one step instead of appending
    # one character per loop iteration.
    return "█" * (size // per_block)
# One counter per $50k-wide price range, all starting at zero.
bin_0_50k = bin_50_100k = bin_100_150k = 0
bin_150_200k = bin_200_250k = bin_250_300k = 0
bin_300_350k = bin_350_400k = bin_400_450k = 0
bin_450_500k = bin_500_550k = 0

# Tally each price into the range that contains it. Lower bounds are
# inclusive and upper bounds exclusive, except that the first range also
# excludes 0 itself. A price that matches no branch (0 or below, or
# 550000 and above) is not counted in any bin.
for price in prices:
    if 0 < price < 50000:
        bin_0_50k += 1
    elif 50000 <= price < 100000:
        bin_50_100k += 1
    elif 100000 <= price < 150000:
        bin_100_150k += 1
    elif 150000 <= price < 200000:
        bin_150_200k += 1
    elif 200000 <= price < 250000:
        bin_200_250k += 1
    elif 250000 <= price < 300000:
        bin_250_300k += 1
    elif 300000 <= price < 350000:
        bin_300_350k += 1
    elif 350000 <= price < 400000:
        bin_350_400k += 1
    elif 400000 <= price < 450000:
        bin_400_450k += 1
    elif 450000 <= price < 500000:
        bin_450_500k += 1
    elif 500000 <= price < 550000:
        bin_500_550k += 1

# Print the histogram: one labelled bar per price range, lowest range first.
print("California House Price Distribution:")
print(f"$0-50k: {make_bar(bin_0_50k)}")
print(f"$50-100k: {make_bar(bin_50_100k)}")
print(f"$100-150k: {make_bar(bin_100_150k)}")
print(f"$150-200k: {make_bar(bin_150_200k)}")
print(f"$200-250k: {make_bar(bin_200_250k)}")
print(f"$250-300k: {make_bar(bin_250_300k)}")
print(f"$300-350k: {make_bar(bin_300_350k)}")
print(f"$350-400k: {make_bar(bin_350_400k)}")
print(f"$400-450k: {make_bar(bin_400_450k)}")
print(f"$450-500k: {make_bar(bin_450_500k)}")
print(f"$500-550k: {make_bar(bin_500_550k)}")
California House Price Distribution: $0-50k: █ $50-100k: ████████████████████████ $100-150k: █████████████████████████████ $150-200k: ████████████████████████████████ $200-250k: █████████████████████ $250-300k: ██████████████ $300-350k: ████████ $350-400k: ██████ $400-450k: ███ $450-500k: ██ $500-550k: ██████