In this assignment, you'll analyze real NYC Airbnb listing data using pandas and group-by operations. You'll discover pricing patterns across neighborhoods and room types.
First, mount your Google Drive to access the dataset:
Included in starter code
# Connect to Google Drive so the notebook can read the dataset CSV.
# Colab-only: prompts for an authorization code the first time it runs.
from google.colab import drive
drive.mount("/content/gdrive")
The dataset is located at: /content/gdrive/MyDrive/datasets/NYC-Airbnb-2023.csv
The dataset contains information about Airbnb listings in NYC, including columns such as neighbourhood_group (borough), neighbourhood, room_type, price, minimum_nights, number_of_reviews, and availability_365.
Use pandas to load the data. Pandas will automatically handle type conversion for us:
Included in starter code
import pandas as pd
import numpy as np
# Load the data (read_csv infers each column's dtype from the file contents)
df = pd.read_csv('/content/gdrive/MyDrive/datasets/NYC-Airbnb-2023.csv')
print(f"Loaded {len(df)} listings")
print(f"Columns: {list(df.columns)}")
# Display basic info about the dataset: dtypes, non-null counts, memory usage
print("\nDataset Info:")
df.info()
# Display first few rows (rendered automatically as the cell's last expression)
print("\nFirst 5 listings:")
df.head()
Calculate the overall average and median rental price:
# Central tendency of the nightly price
prices = df['price']
avg_price = prices.mean()
median_price = prices.median()
print(f"Average price: ${avg_price:.2f}")
print(f"Median price: ${median_price:.2f}")
# Bonus: full summary of the price distribution
prices.describe()
Question: Why might the average and median be different? What does this tell us about the price distribution?
Use pandas groupby to analyze prices by borough (neighbourhood_group):
def analyze_by_borough(df):
    """Average nightly price per borough.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table with 'neighbourhood_group' and 'price' columns.

    Returns
    -------
    pandas.Series
        Borough names as index, mean price for each borough as values.
    """
    return df.groupby('neighbourhood_group')['price'].mean()


# Example usage:
# borough_prices = analyze_by_borough(df)
# print(borough_prices)
Create a function that groups by room_type and calculates multiple statistics:
def analyze_by_room_type(df):
    """Price statistics per room type.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table with 'room_type' and 'price' columns.

    Returns
    -------
    pandas.DataFrame
        One row per room type with 'mean', 'median', and 'count' columns,
        computed in a single pass via .agg().
    """
    return df.groupby('room_type')['price'].agg(['mean', 'median', 'count'])


# Example usage:
# room_stats = analyze_by_room_type(df)
# print(room_stats)
Group by BOTH neighborhood AND room type:
def analyze_by_neighborhood_and_room_type(df):
    """Mean price for each (neighbourhood, room_type) combination.

    Only combinations backed by at least 5 listings are kept, so a single
    unusual listing cannot masquerade as a neighbourhood-wide pattern.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table with 'neighbourhood', 'room_type', and 'price' columns.

    Returns
    -------
    pandas.Series
        MultiIndex (neighbourhood, room_type) -> mean price, for groups
        with 5 or more listings.
    """
    grouped = df.groupby(['neighbourhood', 'room_type'])['price']
    mean_price = grouped.mean()
    group_size = grouped.size()
    # Boolean-mask the aggregated Series instead of .filter()-ing the raw
    # rows: both Series share the same MultiIndex, so this lines up exactly.
    return mean_price[group_size >= 5]
Find listings that are unusually expensive for their neighborhood:
def find_price_outliers(df, threshold_multiplier=3):
    """Find listings priced far above their neighbourhood's average.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table with 'neighbourhood' and 'price' columns.
    threshold_multiplier : float, default 3
        A listing is an outlier when price > neighbourhood mean * this factor.

    Returns
    -------
    pandas.DataFrame
        The outlier rows of `df` (original columns, original index).
    """
    # transform('mean') broadcasts each neighbourhood's mean back onto its
    # own rows, so no merge back to the original frame is needed.
    neighborhood_avg = df.groupby('neighbourhood')['price'].transform('mean')
    return df[df['price'] > neighborhood_avg * threshold_multiplier]
Investigate what factors correlate with price:
def analyze_price_factors(df):
    """Compare average prices across derived listing factors.

    Derives three categorical columns —
      - 'stay_length': 'short' if minimum_nights < 7, else 'long'
      - 'availability': 'high' if availability_365 >= 180, else 'low'
      - 'popularity': quartile of number_of_reviews (low .. high)
    — then reports the mean price per level of each factor.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table with 'minimum_nights', 'availability_365',
        'number_of_reviews', and 'price' columns.

    Returns
    -------
    dict[str, pandas.Series]
        Factor name -> Series of mean price per factor level.
    """
    work = df.copy()  # never mutate the caller's frame
    work['stay_length'] = np.where(work['minimum_nights'] < 7, 'short', 'long')
    work['availability'] = np.where(work['availability_365'] >= 180, 'high', 'low')
    # Rank first so qcut always yields four non-empty bins with unique
    # edges, even when review counts are heavily tied (many 0-review rows).
    review_rank = work['number_of_reviews'].rank(method='first')
    work['popularity'] = pd.qcut(
        review_rank, 4, labels=['low', 'medium-low', 'medium-high', 'high'])
    return {
        factor: work.groupby(factor, observed=True)['price'].mean()
        for factor in ('stay_length', 'availability', 'popularity')
    }
Use pandas' powerful features for deeper insights:
def advanced_analysis(df):
    """Deeper summaries: pivot table, top neighbourhoods, correlations.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table with 'neighbourhood_group', 'neighbourhood',
        'room_type', 'price', and other numeric columns.

    Returns
    -------
    dict with keys:
        'pivot'              : avg price by borough (rows) x room_type (cols)
        'top_neighborhoods'  : top 5 neighbourhoods by average price
        'price_correlations' : correlation of each numeric column with price
    """
    pivot = pd.pivot_table(df, values='price',
                           index='neighbourhood_group',
                           columns='room_type',
                           aggfunc='mean')
    top_neighborhoods = df.groupby('neighbourhood')['price'].mean().nlargest(5)
    # numeric_only keeps .corr() from choking on string/id columns
    price_correlations = df.corr(numeric_only=True)['price'].drop('price')
    return {'pivot': pivot,
            'top_neighborhoods': top_neighborhoods,
            'price_correlations': price_correlations}
Create an interactive tool to find the best listings:
def explore_listings(df):
    """Interactively filter listings and show the best-value results.

    Prompts the user for an optional maximum price and an optional
    neighbourhood, filters with boolean indexing, scores the survivors by
    reviews-per-dollar, and prints the top 10.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table with 'price', 'neighbourhood', and
        'number_of_reviews' columns.

    Returns
    -------
    pandas.DataFrame
        All matching listings (with a 'quality_score' column), sorted from
        best to worst value.
    """
    raw_price = input("Maximum price (blank for no limit): ").strip()
    neighborhood = input("Neighbourhood (blank for any): ").strip()

    filtered = df
    if raw_price:
        filtered = filtered[filtered['price'] <= float(raw_price)]
    if neighborhood:
        filtered = filtered[filtered['neighbourhood'] == neighborhood]

    result = filtered.copy()
    # More reviews per dollar reads as "popular AND affordable";
    # clip(lower=1) guards against $0 listings dividing by zero.
    result['quality_score'] = (result['number_of_reviews']
                               / result['price'].clip(lower=1))
    result = result.sort_values('quality_score', ascending=False)
    print(result.head(10))
    return result
Pandas quick reference:
- df.groupby('column')['price'].mean() — simple grouping
- df.groupby('column')['price'].agg(['mean', 'median', 'count']) — multiple stats at once
- df.groupby(['col1', 'col2']) — multi-level grouping
- .reset_index() — convert a grouped result back to a regular DataFrame
- df[df['column'] > value] — filters rows
- df['new_col'] = df['old_col'].apply(some_function) — derive a new column
- df.describe() — gives you quick statistics
- df['column'].value_counts() — counts unique values
- df.sort_values() — sorts by column(s)
- df.merge() — joins dataframes
- df.pivot_table() — creates summary tables
- df.corr() — shows correlations between numeric columns
# Connect to Google Drive
# NOTE(review): this cell duplicates the starter setup earlier in the file;
# re-running it is harmless (drive.mount is a no-op once already mounted).
from google.colab import drive
drive.mount("/content/gdrive")
import pandas as pd
import numpy as np
# Load the data (read_csv infers column dtypes from the file contents)
df = pd.read_csv('/content/gdrive/MyDrive/datasets/NYC-Airbnb-2023.csv')
print(f"Loaded {len(df)} listings")
print(f"Columns: {list(df.columns)}")
# Display basic info about the dataset: dtypes, non-null counts, memory usage
print("\nDataset Info:")
df.info()
# Display first few rows (rendered automatically as the cell's last expression)
print("\nFirst 5 listings:")
df.head()