$ python -m pip install pandas
$ conda install pandas
import requests
download_url = "https://raw.githubusercontent.com/fivethirtyeight/data/master/nba-elo/nbaallelo.csv"
target_csv_path = "nba_all_elo.csv"
response = requests.get(download_url)
response.raise_for_status()
with open(target_csv_path, "wb") as f:
f.write(response.content)
print("Download ready.")
import pandas as pd
nba = pd.read_csv("nba_all_elo.csv")
type(nba)
# Expected:
# <class 'pandas.core.frame.DataFrame'>
len(nba)
# Expected:
# 126314
nba.shape
# Expected:
# (126314, 23)
nba.head()
pd.set_option("display.max.columns", None)
pd.set_option("display.precision", 2)
nba.tail()
Can you print the last 3 lines of our DataFrame
?
.info()
¶nba.info()
.describe()
¶nba.describe()
import numpy as np
nba.describe(include=np.object)
nba["team_id"].value_counts()
# Expected:
# BOS 5997
# NYK 5769
# LAL 5078
# SDS 11
nba["fran_id"].value_counts()
# Expected:
# Name: team_id, Length: 104, dtype: int64
# Lakers 6024
# Celtics 5997
# Knicks 5769
# Huskies 60
# Name: fran_id, dtype: int64
nba.loc[nba["fran_id"] == "Lakers", "team_id"].value_counts()
# Expected:
# LAL 5078
# MNL 946
# Name: team_id, dtype: int64
nba.loc[nba["team_id"] == "MNL", "date_game"].min()
# Expected:
# '1/1/1949'
nba.loc[nba["team_id"] == "MNL", "date_game"].max()
# Expected:
# '4/9/1959'
nba.loc[nba["team_id"] == "MNL", "date_game"].agg(("min", "max"))
# Expected:
# min 1/1/1949
# max 4/9/1959
# Name: date_game, dtype: object
Find out how many points the Boston Celtics have scored during all its matches contained in this dataset.
Series
: The Basic Building Block in Pandas¶revenues = pd.Series([5555, 7000, 1980])
revenues
# Expected:
# 0 5555
# 1 7000
# 2 1980
# dtype: int64
revenues.values
# Expected:
# array([5555, 7000, 1980])
revenues.index
# Expected:
# RangeIndex(start=0, stop=3, step=1)
type(revenues.values)
# Expected:
# <class 'numpy.ndarray'>
city_revenues = pd.Series(
[4200, 8000, 6500],
index=["Amsterdam", "Toronto", "Tokyo"]
)
city_revenues
# Expected:
# Amsterdam 4200
# Toronto 8000
# Tokyo 6500
# dtype: int64
Pandas Series
vs Built-In Python Data Structures¶city_employee_count = pd.Series({"Amsterdam": 5, "Tokyo": 8})
city_employee_count
# Expected:
# Amsterdam 5
# Tokyo 8
# dtype: int64
city_employee_count.keys()
# Expected:
# Index(['Amsterdam', 'Tokyo'], dtype='object')
"Tokyo" in city_employee_count
# Expected:
# True
"New York" in city_employee_count
# Expected:
# False
DataFrame
: Pandas' Most Popular Data Structure¶city_data = pd.DataFrame({
"revenue": city_revenues,
"employee_count": city_employee_count
})
city_data
# Expected:
# revenue employee_count
# Amsterdam 4200 5.0
# Tokyo 6500 8.0
# Toronto 8000 NaN
city_data.index
# Expected:
# Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')
city_data.values
# Expected:
# array([[4.2e+03, 5.0e+00],
# [6.5e+03, 8.0e+00],
# [8.0e+03, nan]])
city_data.axes
# Expected:
# [Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object'),
# Index(['revenue', 'employee_count'], dtype='object')]
city_data.axes[0]
# Expected:
# Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')
city_data.axes[1]
# Expected:
# Index(['revenue', 'employee_count'], dtype='object')
city_data.keys()
# Expected:
# Index(['revenue', 'employee_count'], dtype='object')
"Amsterdam" in city_data
# Expected:
# False
"revenue" in city_data
# Expected:
# True
Display the index and the axes of the nba
dataset.
Check whether it has a column "points". Or was it called "pts"?
list
and Pandas Series
: Similarities and Differences¶[]
¶city_revenues
# Expected:
# Amsterdam 4200
# Toronto 8000
# Tokyo 6500
# dtype: int64
city_revenues["Toronto"]
# Expected:
# 8000
city_revenues[1]
# Expected:
# 8000
city_revenues[-1]
# Expected:
# 6500
city_revenues[1:]
# Expected:
# Toronto 8000
# Tokyo 6500
# dtype: int64
city_revenues["Toronto":]
# Expected:
# Toronto 8000
# Tokyo 6500
# dtype: int64
Series
' Elements via .loc
and .iloc
¶colors = pd.Series(
["red", "purple", "blue", "green", "yellow"],
index=[1, 2, 3, 5, 8]
)
colors
# Expected:
# 1 red
# 2 purple
# 3 blue
# 5 green
# 8 yellow
# dtype: object
colors.loc[1]
# Expected:
# 'red'
colors.iloc[1]
# Expected:
# 'purple'
# Return the elements with the implicit index: 1, 2
colors.iloc[1:3]
# Expected:
# 2 purple
# 3 blue
# dtype: object
# Return the elements with the explicit index between 3 and 8
colors.loc[3:8]
# Expected:
# 3 blue
# 5 green
# 8 yellow
# dtype: object
colors.iloc[-2]
# Expected:
# 'green'
DataFrame
¶DataFrame
¶city_data["revenue"]
# Expected:
# Amsterdam 4200
# Tokyo 6500
# Toronto 8000
# Name: revenue, dtype: int64
type(city_data["revenue"])
# Expected:
# pandas.core.series.Series
city_data.revenue
# Expected:
# Amsterdam 4200
# Tokyo 6500
# Toronto 8000
# Name: revenue, dtype: int64
toys = pd.DataFrame([
{"name": "ball", "shape": "sphere"},
{"name": "Rubik's cube", "shape": "cube"}
])
toys["shape"]
# Expected:
# 0 sphere
# 1 cube
# Name: shape, dtype: object
toys.shape
# Expected:
# (2, 2)
DataFrame
With .loc
and .iloc
¶city_data.loc["Amsterdam"]
# Expected:
# revenue 4200.0
# employee_count 5.0
# Name: Amsterdam, dtype: float64
city_data.loc["Tokyo": "Toronto"]
# Expected:
# revenue employee_count
# Tokyo 6500 8.0
# Toronto 8000 NaN
city_data.iloc[1]
# Expected:
# revenue 6500.0
# employee_count 8.0
# Name: Tokyo, dtype: float64
Display the 2nd last row of the nba
dataset.
DataFrame
With .loc
and .iloc
¶city_data.loc["Amsterdam": "Tokyo", "revenue"]
# Expected:
# Amsterdam 4200
# Tokyo 6500
# Name: revenue, dtype: int64
Have a look at the games between the labels 5555 and 5559. We're interested only in the names of the teams and the scores.
current_decade = nba[nba["year_id"] > 2010]
current_decade.shape
# Expected:
# (12658, 23)
games_with_notes = nba[nba["notes"].notnull()]
games_with_notes.shape
# Expected:
# (5424, 23)
ers = nba[nba["fran_id"].str.endswith("ers")]
ers.shape
# Expected:
# (27797, 23)
nba[
(nba["_iscopy"] == 0) &
(nba["pts"] > 100) &
(nba["opp_pts"] > 100) &
(nba["team_id"] == "BLB")
]
In the spring of 1992, both teams from Los Angeles had to play a home game at another court. Find those 2 games. (Both teams have an ID starting with "LA".)
Series
¶city_revenues.sum()
# Expected:
# 18700
city_revenues.max()
# Expected:
# 8000
DataFrame
¶points = nba["pts"]
type(points)
# Expected:
# <class 'pandas.core.series.Series'>
points.sum()
# Expected:
# 12976235
nba.groupby("fran_id", sort=False)["pts"].sum()
# Expected:
# fran_id
# Huskies 3995
# Knicks 582497
# Stags 20398
# Falcons 3797
# Capitols 22387
nba[
(nba["fran_id"] == "Spurs") &
(nba["year_id"] > 2010)
].groupby(["year_id", "game_result"])["game_id"].count()
# Expected:
# year_id game_result
# 2011 L 25
# W 63
# 2012 L 20
# W 60
# 2013 L 30
# W 73
# 2014 L 27
# W 78
# 2015 L 31
# W 58
# Name: game_id, dtype: int64
Let's have a look at the Golden State Warriors' season in 2014-15 (year_id: 2015). How many wins and losses did they score during the regular season and the playoffs?
df = nba.copy()
df.shape
# Expected:
# (126314, 23)
df["difference"] = df.pts - df.opp_pts
df.shape
# Expected:
# (126314, 24)
df["difference"].max()
# Expected:
# 68
renamed_df = df.rename(
columns={"game_result": "result", "game_location": "location"}
)
renamed_df.info()
# Expected:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 126314 entries, 0 to 126313
# Data columns (total 24 columns):
# gameorder 126314 non-null int64
# location 126314 non-null object
# result 126314 non-null object
# forecast 126314 non-null float64
# notes 5424 non-null object
# difference 126314 non-null int64
# dtypes: float64(6), int64(8), object(10)
# memory usage: 23.1+ MB
df.shape
# Expected:
# (126314, 24)
elo_columns = ["elo_i", "elo_n", "opp_elo_i", "opp_elo_n"]
df.drop(elo_columns, inplace=True, axis=1)
df.shape
# Expected:
# (126314, 20)
df.info()
df["date_game"] = pd.to_datetime(df["date_game"])
df["game_location"].nunique()
# Expected:
# 3
df["game_location"].value_counts()
# Expected:
# A 63138
# H 63138
# N 38
# Name: game_location, dtype: int64
df["game_location"] = pd.Categorical(df["game_location"])
df["game_location"].dtype
# Expected:
# CategoricalDtype(categories=['A', 'H', 'N'], ordered=False)
Find another column which has a too generic data type.
nba.info()
rows_without_missing_data = nba.dropna()
rows_without_missing_data.shape
# Expected:
# (5424, 23)
data_without_missing_columns = nba.dropna(axis=1)
data_without_missing_columns.shape
# Expected:
# (126314, 22)
data_with_default_notes = nba.copy()
data_with_default_notes["notes"].fillna(
value="no notes at all",
inplace=True
)
data_with_default_notes["notes"].describe()
# Expected:
# count 126314
# unique 232
# top no notes at all
# freq 120890
# Name: notes, dtype: object
nba[nba["pts"] == 0]
nba[(nba["pts"] > nba["opp_pts"]) & (nba["game_result"] != 'W')].empty
# Expected:
# True
nba[(nba["pts"] < nba["opp_pts"]) & (nba["game_result"] != 'L')].empty
# Expected:
# True
concat()
¶further_city_data = pd.DataFrame(
{"revenue": [7000, 3400], "employee_count":[2, 2]},
index=["New York", "Barcelona"]
)
all_city_data = pd.concat([city_data, further_city_data], sort=False)
all_city_data
# Expected:
# Amsterdam 4200 5.0
# Tokyo 6500 8.0
# Toronto 8000 NaN
# New York 7000 2.0
# Barcelona 3400 2.0
city_countries = pd.DataFrame({
"country": ["Holland", "Japan", "Holland", "Canada", "Spain"],
"capital": [1, 1, 0, 0, 0]},
index=["Amsterdam", "Tokyo", "Rotterdam", "Toronto", "Barcelona"]
)
cities = pd.concat([all_city_data, city_countries], axis=1, sort=False)
cities
# Expected:
# revenue employee_count country capital
# Amsterdam 4200.0 5.0 Holland 1.0
# Tokyo 6500.0 8.0 Japan 1.0
# Toronto 8000.0 NaN Canada 0.0
# New York 7000.0 2.0 NaN NaN
# Barcelona 3400.0 2.0 Spain 0.0
# Rotterdam NaN NaN Holland 0.0
pd.concat([all_city_data, city_countries], axis=1, join="inner")
# Expected:
# revenue employee_count country capital
# Amsterdam 4200 5.0 Holland 1
# Tokyo 6500 8.0 Japan 1
# Toronto 8000 NaN Canada 0
# Barcelona 3400 2.0 Spain 0
merge()
¶countries = pd.DataFrame({
"population_millions": [17, 127, 37],
"continent": ["Europe", "Asia", "North America"]
}, index= ["Holland", "Japan", "Canada"])
pd.merge(cities, countries, left_on="country", right_index=True)
pd.merge(
cities,
countries,
left_on="country",
right_index=True,
how="left"
)
%matplotlib inline
nba[nba["fran_id"] == "Knicks"].groupby("year_id")["pts"].sum().plot()
nba["fran_id"].value_counts().head(10).plot(kind="bar")
In 2013, the Miami Heat won the championship. Create a pie plot showing the count of their wins and losses during that season.