import requests download_url = "https://raw.githubusercontent.com/fivethirtyeight/data/master/nba-elo/nbaallelo.csv" target_csv_path = "nba_all_elo.csv" response = requests.get(download_url) response.raise_for_status() with open(target_csv_path, "wb") as f: f.write(response.content) print("Download ready.") import pandas as pd nba = pd.read_csv("nba_all_elo.csv") type(nba) # Expected: # len(nba) # Expected: # 126314 nba.shape # Expected: # (126314, 23) nba.head() pd.set_option("display.max.columns", None) pd.set_option("display.precision", 2) nba.tail() nba.info() nba.describe() import numpy as np nba.describe(include=np.object) nba["team_id"].value_counts() # Expected: # BOS 5997 # NYK 5769 # LAL 5078 # SDS 11 nba["fran_id"].value_counts() # Expected: # Name: team_id, Length: 104, dtype: int64 # Lakers 6024 # Celtics 5997 # Knicks 5769 # Huskies 60 # Name: fran_id, dtype: int64 nba.loc[nba["fran_id"] == "Lakers", "team_id"].value_counts() # Expected: # LAL 5078 # MNL 946 # Name: team_id, dtype: int64 nba.loc[nba["team_id"] == "MNL", "date_game"].min() # Expected: # '1/1/1949' nba.loc[nba["team_id"] == "MNL", "date_game"].max() # Expected: # '4/9/1959' nba.loc[nba["team_id"] == "MNL", "date_game"].agg(("min", "max")) # Expected: # min 1/1/1949 # max 4/9/1959 # Name: date_game, dtype: object revenues = pd.Series([5555, 7000, 1980]) revenues # Expected: # 0 5555 # 1 7000 # 2 1980 # dtype: int64 revenues.values # Expected: # array([5555, 7000, 1980]) revenues.index # Expected: # RangeIndex(start=0, stop=3, step=1) type(revenues.values) # Expected: # city_revenues = pd.Series( [4200, 8000, 6500], index=["Amsterdam", "Toronto", "Tokyo"] ) city_revenues # Expected: # Amsterdam 4200 # Toronto 8000 # Tokyo 6500 # dtype: int64 city_employee_count = pd.Series({"Amsterdam": 5, "Tokyo": 8}) city_employee_count # Expected: # Amsterdam 5 # Tokyo 8 # dtype: int64 city_employee_count.keys() # Expected: # Index(['Amsterdam', 'Tokyo'], dtype='object') "Tokyo" in city_employee_count # Expected: # True "New York" in city_employee_count # Expected: # False city_data = pd.DataFrame({ "revenue": city_revenues, "employee_count": city_employee_count }) city_data # Expected: # revenue employee_count # Amsterdam 4200 5.0 # Tokyo 6500 8.0 # Toronto 8000 NaN city_data.index # Expected: # Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object') city_data.values # Expected: # array([[4.2e+03, 5.0e+00], # [6.5e+03, 8.0e+00], # [8.0e+03, nan]]) city_data.axes # Expected: # [Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object'), # Index(['revenue', 'employee_count'], dtype='object')] city_data.axes[0] # Expected: # Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object') city_data.axes[1] # Expected: # Index(['revenue', 'employee_count'], dtype='object') city_data.keys() # Expected: # Index(['revenue', 'employee_count'], dtype='object') "Amsterdam" in city_data # Expected: # False "revenue" in city_data # Expected: # True city_revenues # Expected: # Amsterdam 4200 # Toronto 8000 # Tokyo 6500 # dtype: int64 city_revenues["Toronto"] # Expected: # 8000 city_revenues[1] # Expected: # 8000 city_revenues[-1] # Expected: # 6500 city_revenues[1:] # Expected: # Toronto 8000 # Tokyo 6500 # dtype: int64 city_revenues["Toronto":] # Expected: # Toronto 8000 # Tokyo 6500 # dtype: int64 colors = pd.Series( ["red", "purple", "blue", "green", "yellow"], index=[1, 2, 3, 5, 8] ) colors # Expected: # 1 red # 2 purple # 3 blue # 5 green # 8 yellow # dtype: object colors.loc[1] # Expected: # 'red' colors.iloc[1] # Expected: # 'purple' # Return the elements with the implicit index: 1, 2 colors.iloc[1:3] # Expected: # 2 purple # 3 blue # dtype: object # Return the elements with the explicit index between 3 and 8 colors.loc[3:8] # Expected: # 3 blue # 5 green # 8 yellow # dtype: object colors.iloc[-2] # Expected: # 'green' city_data["revenue"] # Expected: # Amsterdam 4200 # Tokyo 6500 # Toronto 8000 # Name: revenue, dtype: int64 type(city_data["revenue"]) # Expected: # pandas.core.series.Series city_data.revenue # Expected: # Amsterdam 4200 # Tokyo 6500 # Toronto 8000 # Name: revenue, dtype: int64 toys = pd.DataFrame([ {"name": "ball", "shape": "sphere"}, {"name": "Rubik's cube", "shape": "cube"} ]) toys["shape"] # Expected: # 0 sphere # 1 cube # Name: shape, dtype: object toys.shape # Expected: # (2, 2) city_data.loc["Amsterdam"] # Expected: # revenue 4200.0 # employee_count 5.0 # Name: Amsterdam, dtype: float64 city_data.loc["Tokyo": "Toronto"] # Expected: # revenue employee_count # Tokyo 6500 8.0 # Toronto 8000 NaN city_data.iloc[1] # Expected: # revenue 6500.0 # employee_count 8.0 # Name: Tokyo, dtype: float64 city_data.loc["Amsterdam": "Tokyo", "revenue"] # Expected: # Amsterdam 4200 # Tokyo 6500 # Name: revenue, dtype: int64 current_decade = nba[nba["year_id"] > 2010] current_decade.shape # Expected: # (12658, 23) games_with_notes = nba[nba["notes"].notnull()] games_with_notes.shape # Expected: # (5424, 23) ers = nba[nba["fran_id"].str.endswith("ers")] ers.shape # Expected: # (27797, 23) nba[ (nba["_iscopy"] == 0) & (nba["pts"] > 100) & (nba["opp_pts"] > 100) & (nba["team_id"] == "BLB") ] city_revenues.sum() # Expected: # 18700 city_revenues.max() # Expected: # 8000 points = nba["pts"] type(points) # Expected: # points.sum() # Expected: # 12976235 nba.groupby("fran_id", sort=False)["pts"].sum() # Expected: # fran_id # Huskies 3995 # Knicks 582497 # Stags 20398 # Falcons 3797 # Capitols 22387 nba[ (nba["fran_id"] == "Spurs") & (nba["year_id"] > 2010) ].groupby(["year_id", "game_result"])["game_id"].count() # Expected: # year_id game_result # 2011 L 25 # W 63 # 2012 L 20 # W 60 # 2013 L 30 # W 73 # 2014 L 27 # W 78 # 2015 L 31 # W 58 # Name: game_id, dtype: int64 df = nba.copy() df.shape # Expected: # (126314, 23) df["difference"] = df.pts - df.opp_pts df.shape # Expected: # (126314, 24) df["difference"].max() # Expected: # 68 renamed_df = df.rename( columns={"game_result": "result", "game_location": "location"} ) renamed_df.info() # Expected: # # RangeIndex: 126314 entries, 0 to 126313 # Data columns (total 24 columns): # gameorder 126314 non-null int64 # location 126314 non-null object # result 126314 non-null object # forecast 126314 non-null float64 # notes 5424 non-null object # difference 126314 non-null int64 # dtypes: float64(6), int64(8), object(10) # memory usage: 23.1+ MB df.shape # Expected: # (126314, 24) elo_columns = ["elo_i", "elo_n", "opp_elo_i", "opp_elo_n"] df.drop(elo_columns, inplace=True, axis=1) df.shape # Expected: # (126314, 20) df.info() df["date_game"] = pd.to_datetime(df["date_game"]) df["game_location"].nunique() # Expected: # 3 df["game_location"].value_counts() # Expected: # A 63138 # H 63138 # N 38 # Name: game_location, dtype: int64 df["game_location"] = pd.Categorical(df["game_location"]) df["game_location"].dtype # Expected: # CategoricalDtype(categories=['A', 'H', 'N'], ordered=False) nba.info() rows_without_missing_data = nba.dropna() rows_without_missing_data.shape # Expected: # (5424, 23) data_without_missing_columns = nba.dropna(axis=1) data_without_missing_columns.shape # Expected: # (126314, 22) data_with_default_notes = nba.copy() data_with_default_notes["notes"].fillna( value="no notes at all", inplace=True ) data_with_default_notes["notes"].describe() # Expected: # count 126314 # unique 232 # top no notes at all # freq 120890 # Name: notes, dtype: object nba[nba["pts"] == 0] nba[(nba["pts"] > nba["opp_pts"]) & (nba["game_result"] != 'W')].empty # Expected: # True nba[(nba["pts"] < nba["opp_pts"]) & (nba["game_result"] != 'L')].empty # Expected: # True further_city_data = pd.DataFrame( {"revenue": [7000, 3400], "employee_count":[2, 2]}, index=["New York", "Barcelona"] ) all_city_data = pd.concat([city_data, further_city_data], sort=False) all_city_data # Expected: # Amsterdam 4200 5.0 # Tokyo 6500 8.0 # Toronto 8000 NaN # New York 7000 2.0 # Barcelona 3400 2.0 city_countries = pd.DataFrame({ "country": ["Holland", "Japan", "Holland", "Canada", "Spain"], "capital": [1, 1, 0, 0, 0]}, index=["Amsterdam", "Tokyo", "Rotterdam", "Toronto", "Barcelona"] ) cities = pd.concat([all_city_data, city_countries], axis=1, sort=False) cities # Expected: # revenue employee_count country capital # Amsterdam 4200.0 5.0 Holland 1.0 # Tokyo 6500.0 8.0 Japan 1.0 # Toronto 8000.0 NaN Canada 0.0 # New York 7000.0 2.0 NaN NaN # Barcelona 3400.0 2.0 Spain 0.0 # Rotterdam NaN NaN Holland 0.0 pd.concat([all_city_data, city_countries], axis=1, join="inner") # Expected: # revenue employee_count country capital # Amsterdam 4200 5.0 Holland 1 # Tokyo 6500 8.0 Japan 1 # Toronto 8000 NaN Canada 0 # Barcelona 3400 2.0 Spain 0 countries = pd.DataFrame({ "population_millions": [17, 127, 37], "continent": ["Europe", "Asia", "North America"] }, index= ["Holland", "Japan", "Canada"]) pd.merge(cities, countries, left_on="country", right_index=True) pd.merge( cities, countries, left_on="country", right_index=True, how="left" ) %matplotlib inline nba[nba["fran_id"] == "Knicks"].groupby("year_id")["pts"].sum().plot() nba["fran_id"].value_counts().head(10).plot(kind="bar")