In this assignment, you'll analyze real NYC Airbnb listing data using dictionaries and group-by operations. You'll discover pricing patterns across neighborhoods and room types.
First, mount your Google Drive to access the dataset:
Included in starter code
# Connect to Google Drive so the notebook can read the dataset file.
# (Colab-only: prompts for an authorization flow on first run.)
from google.colab import drive
drive.mount("/content/gdrive")
The dataset is located at: /content/gdrive/MyDrive/datasets/NYC-Airbnb-2023.csv
The dataset contains information about Airbnb listings in NYC. Here are all the columns:
Use csv.DictReader
to load the data into a list of dictionaries. This function is provided for you:
Included in starter code
import csv
def load_airbnb_data(filename):
    """Read an Airbnb CSV export into a list of per-listing dictionaries.

    Every column named in ``float_columns`` is coerced to float. A blank or
    missing numeric cell becomes 0.0 (the key is added if absent), and any
    row containing an unparsable numeric cell is dropped entirely.

    Parameters:
        filename: path to the CSV file to read.

    Returns:
        list of dicts, one per surviving CSV row.
    """
    # Columns that should hold floats after loading.
    float_columns = (
        'id', 'host_id', 'latitude', 'longitude', 'price',
        'minimum_nights', 'number_of_reviews', 'reviews_per_month',
        'calculated_host_listings_count', 'availability_365',
        'number_of_reviews_ltm',
    )
    cleaned = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            keep = True
            for column in float_columns:
                cell = record.get(column)
                if cell:
                    try:
                        record[column] = float(cell)
                    except ValueError:
                        # One bad numeric cell invalidates the whole row.
                        keep = False
                        break
                else:
                    # Blank or absent numeric cells default to 0.0.
                    record[column] = 0.0
            if keep:
                cleaned.append(record)
    return cleaned
# Load the data from the mounted Google Drive.
# (Path assumes the Colab mount above; adjust if the file lives elsewhere.)
listings = load_airbnb_data('/content/gdrive/MyDrive/datasets/NYC-Airbnb-2023.csv')
print(f"Loaded {len(listings)} listings")
Calculate the overall average and median rental price. You'll need to:
Question: Why might the average and median be different? What does this tell us about the price distribution?
Use the pattern above to group listings by borough (neighbourhood_group) and calculate average price for each:
def group_by_borough(listings):
    """Group listings by neighbourhood_group and calculate average price.

    Parameters:
        listings: list of listing dicts as returned by load_airbnb_data().

    Returns:
        dict mapping borough name (str) -> average price (float).
    """
    # TODO: Use the grouping pattern
    # TODO: Calculate average for each group
    # TODO: Return dictionary of borough -> average price
    pass
Create a function that groups by room_type and calculates multiple statistics:
def group_by_room_type(listings):
    """Group listings by room_type and calculate statistics.

    Parameters:
        listings: list of listing dicts as returned by load_airbnb_data().

    Returns:
        nested dict mapping room_type (str) -> dict of statistics
        (average price, median price, count of listings).
    """
    # TODO: Group by room_type
    # TODO: For each room type, calculate:
    #       - average price
    #       - median price
    #       - count of listings
    # TODO: Return nested dictionary
    pass
Group by BOTH neighborhood AND room type:
def group_by_neighborhood_and_room_type(listings):
    """Group by BOTH neighbourhood and room_type using a compound key.

    Parameters:
        listings: list of listing dicts as returned by load_airbnb_data().

    Returns:
        dict mapping compound key -> statistics, restricted to groups
        that contain at least 5 listings.
    """
    # TODO: Create a compound key (hint: combine strings)
    # TODO: Group prices by compound key
    # TODO: Calculate statistics for each group
    # TODO: Only include groups with 5+ listings
    pass
Find listings that are unusually expensive for their neighborhood:
def find_outliers_by_neighborhood(listings, threshold_multiplier=3):
    """Find listings priced way above their neighborhood's average.

    Parameters:
        listings: list of listing dicts as returned by load_airbnb_data().
        threshold_multiplier: a listing is an outlier when its price exceeds
            (neighborhood average * threshold_multiplier).

    Returns:
        list describing each outlier listing.
    """
    # TODO: First calculate average price per neighborhood
    # TODO: Find listings where price > neighborhood_avg * threshold_multiplier
    # TODO: Return list of outlier information
    pass
Investigate what factors correlate with price. Consider grouping by:
def analyze_price_factors(listings):
    """Analyze which factors seem to affect price.

    Open-ended: pick one or more grouping criteria (e.g. minimum_nights,
    availability, review counts) and compare prices between the groups.

    Parameters:
        listings: list of listing dicts as returned by load_airbnb_data().

    Returns:
        your findings, in a structure of your choosing.
    """
    # TODO: Define your own grouping criteria
    # TODO: Compare average prices between groups
    # TODO: Return your findings
    pass
Create an interactive tool to find the best listings based on user criteria:
def explore_listings(listings):
    """Interactive tool to explore listings.

    Prompts the user for optional filter criteria, filters the dataset,
    and prints the top matches.

    Parameters:
        listings: list of listing dicts as returned by load_airbnb_data().
    """
    # TODO: Get user input for:
    #       - neighborhood (optional)
    #       - room_type (optional)
    #       - max_price (optional)
    # TODO: Filter listings based on criteria
    # TODO: Sort by some measure of "quality" (you decide!)
    # TODO: Show top 5 results
    pass
To extract a single "column" from the dataset:
# Collect the "price" column into a flat list.
# (Fix: the loop must append to the same list it initialized — the original
# appended to an undefined name `values`.)
prices = []
for listing in listings:
    prices.append(listing["price"])
Don't forget about Python's built-in sorted
function
Here's the group by pattern you'll use throughout this assignment:
# Pattern for collecting values by group.
# (Pseudocode — fill in the two "extract" lines before running.)
grouped = {}
for item in data:
    key = # extract grouping key from item, e.g. item["neighbourhood_group"]
    value = # extract value to collect from item, e.g. item["price"]
    if key not in grouped:
        grouped[key] = []
    grouped[key].append(value)

# Then analyze each group.
results = {}
for key, values in grouped.items():
    # Calculate statistics on values, e.g. sum(values) / len(values)
    results[key] = # your calculation here
# Connect to Google Drive so the notebook can read the dataset file.
# (Colab-only: prompts for an authorization flow on first run.)
from google.colab import drive
drive.mount("/content/gdrive")
import csv
def load_airbnb_data(filename):
    """Read an Airbnb CSV export into a list of per-listing dictionaries.

    Every column named in ``float_columns`` is coerced to float. A blank or
    missing numeric cell becomes 0.0 (the key is added if absent), and any
    row containing an unparsable numeric cell is dropped entirely.

    Parameters:
        filename: path to the CSV file to read.

    Returns:
        list of dicts, one per surviving CSV row.
    """
    # Columns that should hold floats after loading.
    float_columns = (
        'id', 'host_id', 'latitude', 'longitude', 'price',
        'minimum_nights', 'number_of_reviews', 'reviews_per_month',
        'calculated_host_listings_count', 'availability_365',
        'number_of_reviews_ltm',
    )
    cleaned = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            keep = True
            for column in float_columns:
                cell = record.get(column)
                if cell:
                    try:
                        record[column] = float(cell)
                    except ValueError:
                        # One bad numeric cell invalidates the whole row.
                        keep = False
                        break
                else:
                    # Blank or absent numeric cells default to 0.0.
                    record[column] = 0.0
            if keep:
                cleaned.append(record)
    return cleaned
# Load the data from the mounted Google Drive.
listings = load_airbnb_data('/content/gdrive/MyDrive/datasets/NYC-Airbnb-2023.csv')
print(f"Loaded {len(listings)} listings")
# Preview a few rows to sanity-check the parse (types and columns).
print("First 5 listings:")
for listing in listings[:5]:
    print(listing)