import numpy as np
import pandas as pd
# Print NumPy floating-point values with 2 digits of precision.
np.set_printoptions(precision=2)
Считываем данные из файла
# Load the semicolon-separated dataset into a DataFrame and check its type.
data = pd.read_csv(
    filepath_or_buffer="../../data/beauty.csv",
    sep=";",
)
type(data)
Смотрим на первые 5 строк
# Preview the first five rows, then look at the (rows, columns) dimensions.
data.head(n=5)
data.shape
Краткая статистика – info и describe
# info() prints a concise summary of the DataFrame.
data.info()
# describe() returns descriptive statistics of the columns.
data.describe()
Индексация
# Select the "exper" column by name and preview its first rows.
data["exper"].head()
loc и iloc
# loc selects by label: rows labelled 0 through 5 (end label included),
# restricted to the "wage" and "female" columns.
data.loc[0:5, ["wage", "female"]]
# iloc selects by integer position: all rows, columns at positions 2 and 3
# (end position excluded).
data.iloc[:, 2:4].head()
Логическая индексация
# Mean wage for rows with female == 1 versus female == 0.
(
    data.loc[data["female"] == 1, "wage"].mean(),
    data.loc[data["female"] == 0, "wage"].mean(),
)
# Median wage for female == 0 rows, split by married == 1 versus married == 0.
(
    data.loc[(data["female"] == 0) & (data["married"] == 1), "wage"].median(),
    data.loc[(data["female"] == 0) & (data["married"] == 0), "wage"].median(),
)
Groupby
# Iterating over a groupby yields (group key, sub-DataFrame) pairs.
for look, sub_df in data.groupby("looks"):
    print(look)
    # Any per-group computation can go here.
    print(sub_df["goodhlth"].mean())
# Per-group medians of "wage" and "exper". The aggregation is named by string
# because passing the NumPy callable (np.median) to agg() is deprecated in
# recent pandas and emits a FutureWarning; the result is the same.
data.groupby("looks")[["wage", "exper"]].agg("median")
Сводная таблица
# Frequency tables: "female" against "married", then "female" against "looks".
pd.crosstab(index=data["female"], columns=data["married"])
pd.crosstab(index=data["female"], columns=data["looks"])
Добавление столбцов (построение признаков)
# Flag rows whose wage is strictly above the 75th-percentile wage (1/0 as int64).
data["is_rich"] = data["wage"].gt(data["wage"].quantile(0.75)).astype("int64")
data.head()
# An arbitrary linear combination of two columns, stored as a new column.
data["rubbish"] = data["wage"] * 0.56 + data["exper"] * 0.32
map и apply
def string_gender(female):
    """Return "female" when *female* is truthy, otherwise "male"."""
    return "female" if female else "male"
# map() with a dict replaces each value of "union" by its label.
d = {
    1: "union",
    0: "non-union",
}
data["union"].map(d).head()
# apply() calls the function on every value of "female".
data["female"].apply(lambda flag: "male" if not flag else "female").head()