In [1]:

from __future__ import annotations

from typing import Literal
from itertools import cycle

from PIL import ImageColor
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.core.display_functions import display

In [2]:

# set PLOTLY_PLOT_SHOW_TYPE to `None` for interactive, but NOTE: interactive plots can't be visualized in GitHub
PLOTLY_PLOT_SHOW_TYPE: Literal['svg', 'png'] | None = None
# If None - then use nbviewer to render results after the `.ipynb` will be uploaded to the repo. 
# See rendered results here:
# https://nbviewer.org/github/MistyDi/data_analytics-math_python_pandas/blob/main/data_analytics_task.ipynb

Helper functions:¶

In [3]:

def computer_readable_columns(*dfs: pd.DataFrame) -> None:
    """
    In-place updating of the columns:
    - replace " " -> "_" 
    - replace "-" -> "_"
    - do names lower-case (like "Col" -> "col").
    
    NOTE: "  " -> "__". Multiple spaces will be multiple underscores.
    """
    for df in dfs:
        df.columns = list(map(lambda col: col.lower().replace(' ', '_').replace('-', '_'), df.columns))


def get_df_name(df: pd.DataFrame, silent_on_err: bool = True, raise_on_err: bool = False) -> str:
    try:
        df_name = df.name
    except AttributeError as e:
        if raise_on_err:
            raise
        if not silent_on_err:
            print(e)
        return ""  # empy string

    return df_name


def get_cols(df: pd.DataFrame, skip_cols: list[str] | None = None) -> list[str]:
    cols = list(df.columns)
    if skip_cols is not None:
        cols =  list(set(cols) - set(skip_cols))
    return cols


def display_dtypes(*dfs: pd.DataFrame) -> None:
    for i, df in enumerate(dfs):
        df_name = get_df_name(df)
        if df_name:
            i_str = ''
            df_name = f'[{df_name}] '
        else:
            i_str = f'[{i}] '
        print(f'{i_str}{df_name}Displaying data-types:')
        display(pd.DataFrame(df.dtypes).T)


def check_not_null_columns(*dfs: pd.DataFrame) -> None:
    for i, df in enumerate(dfs):
        # Number of the null values in each column:
        na_count_cols = df.isna().sum()
        all_not_na = na_count_cols.sum() == 0

        df_name = get_df_name(df)
        if df_name:
            i_str = ''
            df_name = f'[{df_name}] '
        else:
            i_str = f'[{i}] '
            
        if all_not_na:
            continue
        
        print(f"{i_str}{df_name}Warning, table contain null-values")
        display(na_count_cols.to_frame('Number of null values'))


def display_unique_values(df: pd.DataFrame, *cols, skip_cols: list[str] | None = None,
                          skip_fillna: bool = False, to_display: bool = True) -> pd.DataFrame | None:
    if len(cols) == 0:
        cols = get_cols(df, skip_cols)

    df_name = get_df_name(df)
    if df_name:
        df_name = f'[{df_name}] '

    concatenated_unique_per_cols: pd.DataFrame | None = None
    for col in cols:
        if concatenated_unique_per_cols is None:
            concatenated_unique_per_cols = pd.DataFrame(df[col].unique(), columns=[col])
            continue
        
        concatenated_unique_per_cols = pd.concat(
            [concatenated_unique_per_cols, pd.Series(df[col].unique(), name=col)], axis=1)

    if concatenated_unique_per_cols is None:
        return

    if not skip_fillna:
        concatenated_unique_per_cols.fillna("", inplace=True)

    if to_display:
        print(f'{df_name}Unique values per columns:')
        display(concatenated_unique_per_cols)

    return concatenated_unique_per_cols


def multi_display_unique_values(*dfs: pd.DataFrame, skip_cols: list[str] | None = None,
                                skip_fillna: bool = False):
    for df in dfs:
        display_unique_values(df, skip_cols=skip_cols, skip_fillna=skip_fillna)


def display_duplicated_values(df: pd.DataFrame, *cols, skip_cols: list[str] | None = None,
                              skip_fillna: bool = False):
    if len(cols) == 0:
        cols = get_cols(df, skip_cols)

    df_name = get_df_name(df)
    if df_name:
        df_name = f'[{df_name}] '

    concatenated_duplicates_per_cols: pd.DataFrame | None = None
    for col in cols:
        if concatenated_duplicates_per_cols is None:
            vals_are_unique = df[col].is_unique  # check to all values be unique
            if vals_are_unique:
                continue

            print(f'{df_name}[col. {col!r}] contain duplicated values.')
            if concatenated_duplicates_per_cols is None:
                concatenated_duplicates_per_cols = df[df[col].duplicated(), col]
                continue

            concatenated_duplicates_per_cols = pd.concat(
                [concatenated_duplicates_per_cols, df[df[col].duplicated(), col]], axis=1)

    if concatenated_duplicates_per_cols is None:
        return

    if not skip_fillna:
        concatenated_duplicates_per_cols.fillna("", inplace=True)

    print(f'{df_name} Warn! Duplicated values per columns found. Duplicated:')
    display(concatenated_duplicates_per_cols)


def multi_display_duplicated_values(*dfs: pd.DataFrame, skip_cols: list[str] | None = None,
                                    skip_fillna: bool = False):
    for df in dfs:
        display_duplicated_values(df, skip_cols=skip_cols, skip_fillna=skip_fillna)

Задача 1 - Сотрудники и зарплата¶

Вам дано два файла. Файл 1 содержит данные о сотрудниках(Emp Id, Name, ZIP code, Salary). Файл 2 содержит налоговую ставку, соответствующую данному ZIP Code(Zip Code, Tax rate).

(1) Рассчитайте зарплату каждого сотрудника, скорректированную с учетом налогов,
(2) и определите среднюю зарплату среди всех сотрудников,
(3) медианную зарплату среди всех сотрудников.

Файл 1 - emp.csv Файл 2 - zip_tax.csv

In [4]:

emp_subpath = 'data/emp.csv'

emp_df = pd.read_csv(emp_subpath)
emp_df.name = 'Employee'  # name of the table
display(emp_df.head(3))

zip_tax = 'data/zip_tax.csv'
zip_tax_df = pd.read_csv(zip_tax)
zip_tax_df.name = 'Zip taxes'  # name of the table
display(zip_tax_df.head(3))

	emp id	emp name	ZIP	Salary
0	1	dldalka	10025	14000
1	2	dslbgdl	10029	16000
2	3	sdlbgl	10044	10000

	ZIP	Tax rate
0	10001	18
1	10002	7
2	10003	0

Подготовка и проверка данных¶

In [5]:

computer_readable_columns(emp_df, zip_tax_df)  # fixing column names

print(f'[{emp_df.name}] Column names after fixing:', list(emp_df.columns))
print(f'[{zip_tax_df.name}] Column names after fixing:', list(zip_tax_df.columns))

[Employee] Column names after fixing: ['emp_id', 'emp_name', 'zip', 'salary']
[Zip taxes] Column names after fixing: ['zip', 'tax_rate']

In [6]:

# Display data-types:
display_dtypes(emp_df, zip_tax_df)
# Check null values in columns:
check_not_null_columns(emp_df, zip_tax_df)
# Display duplicated values if existed:
multi_display_duplicated_values(emp_df, zip_tax_df, skip_cols=['zip', 'salary', 'tax_rate'])
# Display unique values if exists:
multi_display_unique_values(emp_df, zip_tax_df, skip_cols=['emp_id'])

# Number of the unique values in each column:
display(emp_df.nunique().to_frame('Number of unique values').T)

[Employee] Displaying data-types:

	emp_id	emp_name	zip	salary
0	int64	object	int64	int64

[Zip taxes] Displaying data-types:

	zip	tax_rate
0	int64	int64

[Employee] Unique values per columns:

	zip	salary	emp_name
0	10025.0	14000.0	dldalka
1	10029.0	16000.0	dslbgdl
2	10044.0	10000.0	sdlbgl
3	10009.0	6000.0	dsjgge
4	10005.0	17000.0	dkgds
5	10024.0	20000.0	dskjgwe
6	10045.0	9000.0	sidgkjsbf
7	10048.0	11000.0	hsvvd
8	10035.0	5000.0	dfvd
9	10026.0	19000.0	skdugew
10	10014.0	2000.0	kdsf
11	10049.0	15000.0	dskjfkj
12	10016.0	1000.0	sdljbfgd
13	10020.0	13000.0	skdsffk
14	10030.0	18000.0	akja
15	10040.0	8000.0	dflh
16	10033.0	7000.0	dso
17	10050.0	12000.0	sdloiy
18	10039.0	3000.0	xzkb
19	10034.0		xloc
20	10031.0		lsay
21	10032.0		aslfoiiy
22	10037.0		woiuy
23	10041.0		pqweou
24	10036.0		dsaloh
25	10046.0		adlh
26	10051.0		aslb
27	10043.0		alb
28	10047.0		wpouj
29			sdaoihy
30			dlfh
31			dlh
32			zxfd
33			dpiifh
34			dslihsdlh
35			dsk
36			asdoiy
37			xclk
38			cxzpoo
39			sxl
40			xzh
41			zxcoihy
42			ozxcugh
43			ugh
44			zsdg
45			zxv
46			erpcvih
47			oaiy
48			zxoiy
49			aspoiy

[Zip taxes] Unique values per columns:

	zip	tax_rate
0	10001	18.0
1	10002	7.0
2	10003	0.0
3	10004	20.0
4	10005	11.0
5	10006	9.0
6	10007	5.0
7	10008	14.0
8	10009	19.0
9	10010	3.0
10	10011	6.0
11	10012	17.0
12	10013	12.0
13	10014	2.0
14	10015	15.0
15	10016	4.0
16	10017	16.0
17	10018	1.0
18	10019	13.0
19	10020
20	10021
21	10022
22	10023
23	10024
24	10025
25	10026
26	10027
27	10028
28	10029
29	10030
30	10031
31	10032
32	10033
33	10034
34	10035
35	10036
36	10037
37	10038
38	10039
39	10040
40	10041
41	10042
42	10043
43	10044
44	10045
45	10046
46	10047
47	10048
48	10049
49	10050
50	10051

	emp_id	emp_name	zip	salary
Number of unique values	50	50	29	19

Joining tables by zip-code¶

In [7]:

emp_with_taxes_df = pd.merge(emp_df, zip_tax_df, how='left', left_on='zip', right_on='zip')
emp_with_taxes_df.head()

Out[7]:

	emp_id	emp_name	zip	salary	tax_rate
0	1	dldalka	10025	14000	9
1	2	dslbgdl	10029	16000	15
2	3	sdlbgl	10044	10000	2
3	4	dsjgge	10009	6000	19
4	5	dkgds	10005	17000	11

(1) Рассчитайте зарплату каждого сотрудника, скорректированную с учетом налогов¶

In [8]:

# Gross pay - what employees earn before taxes
# Net pay - what employees earn after taxes
emp_with_taxes_df['net_salary'] = emp_with_taxes_df['salary'] / 100 * (100 - emp_with_taxes_df['tax_rate'])
display(emp_with_taxes_df)

	emp_id	emp_name	zip	salary	tax_rate	net_salary
0	1	dldalka	10025	14000	9	12740.0
1	2	dslbgdl	10029	16000	15	13600.0
2	3	sdlbgl	10044	10000	2	9800.0
3	4	dsjgge	10009	6000	19	4860.0
4	5	dkgds	10005	17000	11	15130.0
5	6	dskjgwe	10024	16000	18	13120.0
6	7	sidgkjsbf	10045	20000	16	16800.0
7	8	hsvvd	10048	16000	1	15840.0
8	9	dfvd	10035	9000	20	7200.0
9	10	skdugew	10026	11000	11	9790.0
10	11	kdsf	10014	17000	6	15980.0
11	12	dskjfkj	10045	20000	16	16800.0
12	13	sdljbfgd	10049	5000	5	4750.0
13	14	skdsffk	10016	19000	20	15200.0
14	15	akja	10020	2000	2	1960.0
15	16	dflh	10030	2000	4	1920.0
16	17	dso	10040	20000	19	16200.0
17	18	sdloiy	10026	9000	11	8010.0
18	19	xzkb	10048	15000	1	14850.0
19	20	xloc	10033	1000	18	820.0
20	21	lsay	10050	13000	4	12480.0
21	22	aslfoiiy	10049	6000	5	5700.0
22	23	woiuy	10039	13000	19	10530.0
23	24	pqweou	10040	18000	19	14580.0
24	25	dsaloh	10034	9000	16	7560.0
25	26	adlh	10031	18000	15	15300.0
26	27	aslb	10032	6000	11	5340.0
27	28	alb	10040	17000	19	13770.0
28	29	wpouj	10050	11000	4	10560.0
29	30	sdaoihy	10037	9000	1	8910.0
30	31	dlfh	10048	20000	1	19800.0
31	32	dlh	10041	9000	0	9000.0
32	33	zxfd	10034	5000	16	4200.0
33	34	dpiifh	10036	10000	17	8300.0
34	35	dslihsdlh	10037	8000	1	7920.0
35	36	dsk	10046	17000	7	15810.0
36	37	asdoiy	10051	6000	4	5760.0
37	38	xclk	10050	7000	4	6720.0
38	39	cxzpoo	10041	2000	0	2000.0
39	40	sxl	10039	6000	19	4860.0
40	41	xzh	10050	19000	4	18240.0
41	42	zxcoihy	10049	11000	5	10450.0
42	43	ozxcugh	10043	19000	13	16530.0
43	44	ugh	10050	1000	4	960.0
44	45	zsdg	10047	19000	13	16530.0
45	46	zxv	10051	8000	4	7680.0
46	47	erpcvih	10050	12000	4	11520.0
47	48	oaiy	10050	12000	4	11520.0
48	49	zxoiy	10048	10000	1	9900.0
49	50	aspoiy	10049	3000	5	2850.0

(2) и определите среднюю зарплату среди всех сотрудников¶

In [9]:

print("Mean Net salary among employees:", emp_with_taxes_df['net_salary'].mean())

Mean Net salary among employees: 10213.0

(3) и медианную зарплату среди всех сотрудников.¶

In [10]:

print("Median Net salary among employees:", emp_with_taxes_df['net_salary'].median())

Median Net salary among employees: 10175.0

Задача 2 - Ниже приведены цифры продаж из магазина за 2009, 2010, 2011, 2012 годы.¶

Определите следующее:

(1) Средние продажи Computer Peripherals за 2009, 2010 гг.
(2) Общий объем продаж Furniture and Technology за каждый 2009, 2010 и 2011 годы.
(3) Медианный объем продаж Paper за четыре года.
(4) Стандартное отклонение продаж Office Supplies в 2012 году.
(5) Корреляцию между продажами Paper и Computer Peripherals.
(6) Стандартное отклонение количества заказов Office Supplies в 2012 году.

Имя файла: store_sales.csv

In [11]:

sales_subpath = 'data/store_sales.csv'

sales_df = pd.read_csv(sales_subpath)
sales_df.name = 'Sales'

Подготовка и проверка данных¶

In [12]:

computer_readable_columns(sales_df)  # renaming of the columns

display(sales_df.head())

# Display data-types:
display_dtypes(sales_df)
# Check null values in columns (nothing will be printed if no null values):
check_not_null_columns(sales_df)

	order_id	order_date	order_priority	order_quantity	sales	ship_mode	shipping_cost	province	customer_segment	product_category	product_sub_category	product_container	ship_date
0	928	3/1/2011	Low	26	390.2000	Express Air	7.40	British Columbia	Consumer	Furniture	Office Furnishings	Small Box	3/3/2011
1	32323	7/23/2010	High	38	259.7175	Regular Air	5.03	Ontario	Small Business	Technology	Telephones and Communication	Medium Box	7/25/2010
2	48353	12/15/2012	Not Specified	18	71.2200	Regular Air	0.70	British Columbia	Corporate	Office Supplies	Pens & Art Supplies	Wrap Bag	12/17/2012
3	10144	1/2/2011	Critical	1	192.4900	Delivery Truck	30.00	British Columbia	Corporate	Furniture	Chairs & Chairmats	Jumbo Drum	1/4/2011
4	26756	5/10/2012	Medium	25	767.2600	Regular Air	4.00	British Columbia	Home Office	Technology	Computer Peripherals	Small Box	5/10/2012

[Sales] Displaying data-types:

	order_id	order_date	order_priority	order_quantity	sales	ship_mode	shipping_cost	province	customer_segment	product_category	product_sub_category	product_container	ship_date
0	int64	object	object	int64	float64	object	float64	object	object	object	object	object	object

Столбцы времени order_date и ship_date нужно конвертировать из строки в дату.

In [13]:

# Date columns needed to be converted `object` -> `date` (will be converted to `datetime64[ns]`)
date_fmt: str = '%m/%d/%Y'  # date format
sales_df['order_date'] = pd.to_datetime(sales_df['order_date'], format=date_fmt)
sales_df['ship_date'] = pd.to_datetime(sales_df['ship_date'], format=date_fmt)

# Number of the unique values in each column:
display(sales_df.nunique().to_frame('Number of unique values').T)

# Display unique values if exists:
_ = display_unique_values(
    sales_df, 'product_category', 'product_sub_category', 'product_container',
    'customer_segment', 'ship_mode', 'province', 'order_priority')

	order_id	order_date	order_priority	order_quantity	sales	ship_mode	shipping_cost	province	customer_segment	product_category	product_sub_category	product_container	ship_date
Number of unique values	5496	1418	5	50	8153	3	652	13	4	3	17	7	1450

[Sales] Unique values per columns:

	product_category	product_sub_category	product_container	customer_segment	ship_mode	province	order_priority
0	Furniture	Office Furnishings	Small Box	Consumer	Express Air	British Columbia	Low
1	Technology	Telephones and Communication	Medium Box	Small Business	Regular Air	Ontario	High
2	Office Supplies	Pens & Art Supplies	Wrap Bag	Corporate	Delivery Truck	Northwest Territories	Not Specified
3		Chairs & Chairmats	Jumbo Drum	Home Office		Yukon	Critical
4		Computer Peripherals	Small Pack			Quebec	Medium
5		Paper	Large Box			Alberta
6		Binders and Binder Accessories	Jumbo Box			Saskachewan
7		Scissors, Rulers and Trimmers				Prince Edward Island
8		Copiers and Fax				Manitoba
9		Tables				Nova Scotia
10		Storage & Organization				New Brunswick
11		Office Machines				Nunavut
12		Envelopes				Newfoundland
13		Labels
14		Rubber Bands
15		Bookcases
16		Appliances

(1) Средние продажи `Computer Peripherals` за 2009, 2010 гг.¶

In [14]:

filtering = sales_df['product_sub_category'] == 'Computer Peripherals'
filtering = filtering & (sales_df['order_date'].dt.year.isin([2009, 2010]))

res_name = 'Mean Sales of `Computer Peripherals`'
sales_df[filtering].groupby(sales_df['order_date'].dt.year)['sales'].mean().to_frame(res_name).reset_index()

Out[14]:

	order_date	Mean Sales of `Computer Peripherals`
0	2009	1116.917461
1	2010	872.272338

(2) Общий объем продаж `Furniture` и `Technology` за каждый 2009, 2010 и 2011 годы.¶

In [15]:

# Filtering only needed values:
filtering = sales_df['product_category'].isin(['Furniture', 'Technology'])
filtering = filtering & (sales_df['order_date'].dt.year.isin([2009, 2010, 2011]))

# Grouping by `product_category` and year of the `order_date` field:
sales_grouped = sales_df[filtering].groupby([sales_df['product_category'], sales_df['order_date'].dt.year])

# Summary sales of the categories `Furniture` and `Technology`:
summary_sales_df = sales_grouped['sales'].sum().to_frame('Summary Sales').reset_index()

# Summary number of the orders of the categories `Furniture` and `Technology`:
summary_order_counts_df = sales_grouped['order_quantity'].sum().to_frame('Number of orders').reset_index()

# Outer-Joining for pretty showing in single table:
summary_sales_df = pd.merge(summary_sales_df, summary_order_counts_df,  how='outer', left_on=['product_category', 'order_date'], right_on = ['product_category', 'order_date'])
display(summary_sales_df)

	product_category	order_date	Summary Sales	Number of orders
0	Furniture	2009	1.472672e+06	11693
1	Furniture	2010	1.252665e+06	11557
2	Furniture	2011	1.269661e+06	10664
3	Technology	2009	1.701825e+06	13287
4	Technology	2010	1.397142e+06	13186
5	Technology	2011	1.366807e+06	12152

(3) Медианный объем продаж `Paper` за четыре года.¶

In [16]:

# Filtering only needed values:
filtering = sales_df['product_sub_category'] == 'Paper'

# Grouping by year of the `order_date` field. Calculate mean of sales per each year:
display(sales_df[filtering].groupby([sales_df['order_date'].dt.year]).median()['sales'].to_frame('Median sales').reset_index())

print("Summary median of sales per all years:", sales_df[filtering]['sales'].median())

	order_date	Median sales
0	2009	203.565
1	2010	192.380
2	2011	192.540
3	2012	217.870

Summary median of sales per all years: 200.08

(4) Стандартное отклонение продаж `Office Supplies` в 2012 году.¶

In [17]:

# Filtering only needed values:
filtering = sales_df['product_category'] == 'Office Supplies'
filtering = filtering & (sales_df['order_date'].dt.year == 2012)

print('Standard deviation of the product category `Office Supplies` sales at 2012:',
      sales_df[filtering]['sales'].std())

Standard deviation of the product category `Office Supplies` sales at 2012: 2151.485834542503

(5) Корреляция между продажами `Paper` и `Computer Peripherals`.¶

In [18]:

filtering = sales_df['product_sub_category'].isin(['Paper', 'Computer Peripherals'])
res_df = sales_df[filtering].copy()  # temp DataFrame to not overwrite the original data

# Pearson’s correlation: covariance(X, Y) / (std(X) * std(Y))
#  Where covariance(X, Y) = (sum(x - mean(X)) * (y - mean(Y)) ) * 1/(n-1)

# Note: `.reset_index(drop=True)` is needed because pandas don't allow you to calculate
#  the correlation between rows with different indexes values:
paper_sales = res_df[res_df['product_sub_category'] == 'Paper']['sales'].reset_index(drop=True)
computer_peripherals_sales = res_df[res_df['product_sub_category'] == 'Computer Peripherals']['sales'].reset_index(
    drop=True)
corr = paper_sales.corr(computer_peripherals_sales)
print("Correlation between sales of the product sub-category `Paper` "
      "and another sub-category `Computer Peripherals`: ", corr)

Correlation between sales of the product sub-category `Paper` and another sub-category `Computer Peripherals`:  0.015039019066899894

BONUS #1: let see sales change by timeline on the plotly-plot:¶

In [19]:

res_df.sort_values(by=['order_date'], inplace=True)
fig = px.histogram(
    res_df, x='order_date', y='sales', color='product_sub_category', marginal='rug',
    nbins=100, hover_data=res_df.columns, title='Sales',
)
fig.update_layout(
    bargap=0.2
)
fig.show(PLOTLY_PLOT_SHOW_TYPE)

(6) Стандартное отклонение количества заказов `Office Supplies` в 2012 году.¶

In [20]:

# Filtering only needed values:
filtering = sales_df['product_category'] == 'Office Supplies'
filtering = filtering & (sales_df['order_date'].dt.year == 2012)

print('Standard deviation of the order quantity for product category `Office Supplies` at 2012:',
      sales_df[filtering]['order_quantity'].std())

Standard deviation of the order quantity for product category `Office Supplies` at 2012: 14.500141861634301

Задача 3 - Вам представлены данные на конец торгового дня для различных акций.¶

Рассчитайте:

(1) 30-дневное, 60-дневное, 90-дневное скользящее среднее и скользящее стандартное отклонение.
(2) общее стандартное отклонение
(3) количество дней, в течение которых цена акций увеличивалась и уменьшалась.

Имя файла: stock_data.csv

In [21]:

stock_subpath = 'data/stock_data.csv'

stock_df = pd.read_csv(stock_subpath)
stock_df.name = 'Stock'

Подготовка и проверка данных¶

In [22]:

computer_readable_columns(stock_df)  # fixing column names
display(stock_df.head())

	date	aa	aapl	ge	ibm	jnj	msft	pep	spx	xom
0	10/29/07	37.41	185.09	34.46	106.78	57.13	31.78	65.67	1540.98	85.51
1	10/30/07	36.43	187.00	34.39	106.15	56.99	32.70	65.80	1531.02	83.25
2	10/31/07	36.79	189.95	34.97	108.01	57.30	33.84	65.69	1549.38	84.03
3	11/01/07	35.22	187.44	34.27	105.72	56.85	34.07	64.51	1508.44	80.84
4	11/02/07	35.83	187.87	34.27	106.59	56.95	34.07	65.03	1509.65	80.32

In [23]:

# Display data-types:
display_dtypes(stock_df)
# Check null values in columns:
check_not_null_columns(stock_df)
# Display duplicated values if existed:
display_duplicated_values(stock_df, 'date')

# Number of the unique values in each column:
display(stock_df.nunique().to_frame('Number of unique values').T)

[Stock] Displaying data-types:

	date	aa	aapl	ge	ibm	jnj	msft	pep	spx	xom
0	object	float64	float64	float64	float64	float64	float64	float64	float64	float64

	date	aa	aapl	ge	ibm	jnj	msft	pep	spx	xom
Number of unique values	1000	752	978	742	929	723	657	757	991	848

Необходимо конвертировать date из строки в дату. Попробуем понять формат даты, путём вывода некоторых примеров значений в данном столбце:

In [24]:

def min_max(x):
    return pd.Series(index=['min','max'],data=[x.min(),x.max()])


unique_date = stock_df['date'].unique()
print('Min value of the `date` string-column:', unique_date.min())
print('Max value of the `date` string-column:', unique_date.max())
split_date = stock_df['date'].str.split('/', expand=True)
split_date.columns = [f'{col}_part' for col in split_date.columns]
display(split_date.apply(min_max))
split_date_unique = display_unique_values(split_date, to_display=False)
display(split_date_unique.apply(lambda x: x.sort_values()))

# Displaying random sample of data:
display(stock_df['date'].sample(n=10))

Min value of the `date` string-column: 01/02/08
Max value of the `date` string-column: 12/31/10

	0_part	1_part	2_part
min	01	01	07
max	12	31	11

	0_part	1_part	2_part
0	10	29	07
1	11	30	08
2	12	31	09
3	01	01	10
4	02	02	11
5	03	05
6	04	06
7	05	07
8	06	08
9	07	09
10	08	12
11	09	13
12		14
13		15
14		16
15		19
16		20
17		21
18		23
19		26
20		27
21		28
22		03
23		04
24		10
25		11
26		17
27		18
28		24
29		22
30		25

205    08/21/08
53     01/15/08
443    08/03/09
873    04/15/11
59     01/24/08
667    06/23/10
880    04/27/11
77     02/20/08
537    12/15/09
457    08/21/09
Name: date, dtype: object

0_part - скорее всего месяц, т.к. уникальных значений из этого столбца больше, чем уникальных значений из 2_part. 1_part - это день. 2_part - скорее всего год.

In [25]:

date_fmt: str = '%m/%d/%Y'  # date format

stock_df['date'] = pd.to_datetime(sales_df['order_date'], format=date_fmt)

(1) 30-дневное, 60-дневное и 90-дневное скользящие средние и стандартные отклонения¶

Говоря об использовании оконной функции в pandas, обратимся к документации .rolling(...) метода. [pandas-docs of the .rolling(...)]:

DataFrame.rolling(window, min_periods=None, center=False, win_type=None, on=None, axis=0, closed=None, method='single')

Provide rolling window calculations.

Parameters:

window: int, offset, or BaseIndexer subclass
- If an integer, the fixed number of observations used for each window. If an offset, the time period of each window. Each window will be a variable sized based on the observations included in the time-period. This is only valid for datetime-like indexes. ...
...

*То есть если хотим отсчитывать по периоду в N-дней, то нужно иметь индекс DataFrame'а на основе datetime.

Причем также нужно использовать размер окна, задаваемый через offset, например, 30 дней = "30D", 5 секунд = "5S". *

In [26]:

stock_dtm_sorted_df = stock_df.set_index('date')
# NOTE: don't forget to sort your data by time:
stock_dtm_sorted_df.sort_index(inplace=True)
companies: set[str] = set(stock_dtm_sorted_df.columns)

stock_dtm_sorted_df.head()

Out[26]:

	aa	aapl	ge	ibm	jnj	msft	pep	spx	xom
date
2009-01-02	10.83	259.62	14.37	125.16	57.91	23.55	61.14	1077.96	56.99
2009-01-06	9.71	369.80	15.50	182.39	63.13	26.25	61.02	1155.46	73.56
2009-01-10	14.26	392.57	17.30	178.05	62.85	26.75	63.97	1260.34	77.21
2009-01-10	12.10	292.32	16.13	131.82	60.00	24.17	64.15	1148.67	60.30
2009-01-11	10.99	274.07	15.33	127.30	56.61	25.65	61.70	1117.51	61.18

In [27]:

rolling_range2dtm_stock = dict()
days = [30, 60, 90]
for d in days:
    offset_str = f'{d}D'
    # NOTE: `.dropna()` will drop first `d`-rows, because they can't be calculated
    rolling_range2dtm_stock[offset_str] = stock_dtm_sorted_df.rolling(offset_str).agg(
        [pd.Series.mean, pd.Series.std]).dropna()

    print(f'\nMoving Mean and Std by {d} days period:')
    display(rolling_range2dtm_stock[offset_str].head())

Moving Mean and Std by 30 days period:

	aa		aapl		ge		ibm		jnj		msft		pep		spx		xom
	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std
date
2009-01-06	10.270000	0.791960	314.710000	77.909025	14.935000	0.799031	153.775000	40.467721	60.520000	3.691097	24.900000	1.909188	61.080000	0.084853	1116.710000	54.800776	65.275000	11.716759
2009-01-10	11.600000	2.370717	340.663333	71.102986	15.723333	1.477712	161.866667	31.862885	61.296667	2.936279	25.516667	1.721434	62.043333	1.669621	1164.586667	91.531897	69.253333	10.776021
2009-01-10	11.725000	1.951760	328.577500	62.886346	15.825000	1.223560	154.355000	30.042129	60.972500	2.483578	25.180000	1.558504	62.570000	1.722769	1160.607500	75.158015	67.015000	9.871962
2009-01-11	11.578000	1.721938	317.676000	59.667687	15.726000	1.082511	148.944000	28.693061	60.100000	2.903860	25.274000	1.365972	62.396000	1.541859	1151.988000	67.882430	65.848000	8.938745
2009-01-12	14.938333	8.373953	292.255000	82.009449	17.185000	3.702641	144.488333	27.888191	60.220000	2.613871	25.066667	1.323112	61.873333	1.881740	1170.103333	75.202468	67.445000	8.900752

Moving Mean and Std by 60 days period:

	aa		aapl		ge		ibm		jnj		msft		pep		spx		xom
	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std
date
2009-01-06	10.270000	0.791960	314.710000	77.909025	14.935000	0.799031	153.775000	40.467721	60.520000	3.691097	24.900000	1.909188	61.080000	0.084853	1116.710000	54.800776	65.275000	11.716759
2009-01-10	11.600000	2.370717	340.663333	71.102986	15.723333	1.477712	161.866667	31.862885	61.296667	2.936279	25.516667	1.721434	62.043333	1.669621	1164.586667	91.531897	69.253333	10.776021
2009-01-10	11.725000	1.951760	328.577500	62.886346	15.825000	1.223560	154.355000	30.042129	60.972500	2.483578	25.180000	1.558504	62.570000	1.722769	1160.607500	75.158015	67.015000	9.871962
2009-01-11	11.578000	1.721938	317.676000	59.667687	15.726000	1.082511	148.944000	28.693061	60.100000	2.903860	25.274000	1.365972	62.396000	1.541859	1151.988000	67.882430	65.848000	8.938745
2009-01-12	14.938333	8.373953	292.255000	82.009449	17.185000	3.702641	144.488333	27.888191	60.220000	2.613871	25.066667	1.323112	61.873333	1.881740	1170.103333	75.202468	67.445000	8.900752

Moving Mean and Std by 90 days period:

	aa		aapl		ge		ibm		jnj		msft		pep		spx		xom
	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std	mean	std
date
2009-01-06	10.270000	0.791960	314.710000	77.909025	14.935000	0.799031	153.775000	40.467721	60.520000	3.691097	24.900000	1.909188	61.080000	0.084853	1116.710000	54.800776	65.275000	11.716759
2009-01-10	11.600000	2.370717	340.663333	71.102986	15.723333	1.477712	161.866667	31.862885	61.296667	2.936279	25.516667	1.721434	62.043333	1.669621	1164.586667	91.531897	69.253333	10.776021
2009-01-10	11.725000	1.951760	328.577500	62.886346	15.825000	1.223560	154.355000	30.042129	60.972500	2.483578	25.180000	1.558504	62.570000	1.722769	1160.607500	75.158015	67.015000	9.871962
2009-01-11	11.578000	1.721938	317.676000	59.667687	15.726000	1.082511	148.944000	28.693061	60.100000	2.903860	25.274000	1.365972	62.396000	1.541859	1151.988000	67.882430	65.848000	8.938745
2009-01-12	14.938333	8.373953	292.255000	82.009449	17.185000	3.702641	144.488333	27.888191	60.220000	2.613871	25.066667	1.323112	61.873333	1.881740	1170.103333	75.202468	67.445000	8.900752

BONUS #2: timeline-plot of 30/60/90 days moving average and moving std changes:¶

In [28]:

def plotly_hex2rgba_color(hex_color: str, opacity: float):
    rgb_clr = ImageColor.getcolor(hex_color, "RGB")
    return f'rgba({rgb_clr[0]},{rgb_clr[1]},{rgb_clr[2]},{opacity})'

In [29]:

clr_opacity = 0.2  # color opacity for the standard deviation band-plot
line_modes = cycle(['solid', 'dashdot', 'dot', 'dash'])

fig = go.Figure()
for offset_str, rolling_stock_df in rolling_range2dtm_stock.items():
    line_mode = next(line_modes)
    companies: set[str] = set([col_subcol[0] for col_subcol in rolling_stock_df.columns])
    _df = rolling_stock_df.reset_index()  # `date` column will be moved from index to column
    _df = _df.sort_values(by='date')

    # Reset of the colors:
    # colors = cycle(px.colors.qualitative.Light24)  # wrapping with `cycle` for cycled colors getting
    colors = cycle(px.colors.qualitative.Dark24)  # wrapping with `cycle` for cycled colors getting
    # More colors: https://plotly.com/python/discrete-color/#color-sequences-in-plotly-express

    for company, color in zip(companies, colors):

        # Create upper bound trace
        fig.add_trace(go.Scatter(
            # name=f'{company}: Upper Bound',
            name=f'{offset_str} Upper Bound',
            legendgroup=company,
            showlegend=False,
            x=_df['date'],
            y=_df[company]['mean'] + _df[company]['std'],
            mode='lines',
            fillcolor=plotly_hex2rgba_color(color, clr_opacity),
            # line_color='rgba(255,255,255,0)',
            line_color=plotly_hex2rgba_color(color, clr_opacity),
            line=dict(width=0, dash=line_mode),
        ))

        # Create lower bound trace
        fig.add_trace(go.Scatter(
            # name=f'{company}: Lower Bound',
            name=f'{offset_str} Lower Bound',
            legendgroup=company,
            showlegend=False,
            x=_df['date'],
            y=_df[company]['mean'] - _df[company]['std'],
            mode='lines',
            fillcolor=plotly_hex2rgba_color(color, clr_opacity),
            # line_color='rgba(255,255,255,0)',
            line_color=plotly_hex2rgba_color(color, clr_opacity),
            fill='tonexty',
            line=dict(width=0, dash=line_mode),
        ))

        # Adding mean measure trace
        fig.add_trace(go.Scatter(
            x=_df['date'],
            y=_df[company]['mean'],
            mode='lines+markers',
            name=f'{company}: {offset_str} mean',
            legendgroup=company,
            line=dict(color=color, dash=line_mode),
            marker=dict(size=2),
            # error_y=dict(array=df[company]['std'], visible=False),
        ))

fig.update_layout(
    yaxis_title='Stock Price',
    xaxis_title='Date',
    title='Mean and Std of stock market',
)
fig.show(PLOTLY_PLOT_SHOW_TYPE)

# FIXME legend day-buttons: add buttons to filter by "90D", "60D", "30D" and see all companies at stock

(2) Общее стандартное отклонение¶

In [30]:

std_stock_df = stock_dtm_sorted_df.std().to_frame('Standard deviation').sort_values(by='Standard deviation')
display(std_stock_df.T)

	msft	jnj	ge	pep	xom	aa	ibm	aapl	spx
Standard deviation	3.746426	4.515169	6.168905	6.531849	8.131698	9.108915	26.797665	90.627523	183.562134

BONUS #3: let see Standard Deviation on the bar-plot:¶

In [31]:

fig = px.bar(std_stock_df)
fig.update_layout(
    yaxis_title='Standard deviation',
    xaxis_title='Company',
    title='Std of stock market per all time',
)
fig.show(PLOTLY_PLOT_SHOW_TYPE)

(3) Количество дней, в течение которых цена акций увеличивалась и уменьшалась¶

In [32]:

# NOTE 1: `.dropna()` will drop first row, because it can't be calculated with diff
# NOTE 2: don't forget to sort your data by time. Actually - already sorted by time.
# stock_dtm_sorted_df.sort_index(inplace=True)
stock_diff_df = stock_dtm_sorted_df.diff().dropna()

display(stock_diff_df.head())  # stock_dtm_sorted_df is already sorted


def positive(col):
    return (col > 0).sum()


def negative(col):
    return (col < 0).sum()


pos_neg_stock = stock_diff_df.agg([positive, negative]).rename(index={
    positive.__name__: 'Number of days stock raising',
    negative.__name__: 'Number of days stock decreasing'})

display(pos_neg_stock)

	aa	aapl	ge	ibm	jnj	msft	pep	spx	xom
date
2009-01-06	-1.12	110.18	1.13	57.23	5.22	2.70	-0.12	77.50	16.57
2009-01-10	4.55	22.77	1.80	-4.34	-0.28	0.50	2.95	104.88	3.65
2009-01-10	-2.16	-100.25	-1.17	-46.23	-2.85	-2.58	0.18	-111.67	-16.91
2009-01-11	-1.11	-18.25	-0.80	-4.52	-3.39	1.48	-2.45	-31.16	0.88
2009-01-12	20.75	-108.92	9.15	-5.09	4.21	-1.62	-2.44	143.17	14.25

	aa	aapl	ge	ibm	jnj	msft	pep	spx	xom
Number of days stock raising	505	498	498	488	487	499	489	510	519
Number of days stock decreasing	492	501	500	511	512	498	509	489	480

In [33]:

# NOTE 1: `.dropna()` will drop first row, because it can't be calculated with diff
# NOTE 2: don't forget to sort your data by time. Actually - already sorted by time.
# stock_dtm_sorted_df.sort_index(inplace=True)
stock_diff_df = stock_dtm_sorted_df.diff().dropna()

display(stock_diff_df.head())  # stock_dtm_sorted_df is already sorted


def positive(col):
    return (col > 0).sum()


def negative(col):
    return (col < 0).sum()


pos_neg_stock = stock_diff_df.agg([positive, negative]).rename(index={
    positive.__name__: 'Number of days stock raising',
    negative.__name__: 'Number of days stock decreasing'})

display(pos_neg_stock)

	aa	aapl	ge	ibm	jnj	msft	pep	spx	xom
date
2009-01-06	-1.12	110.18	1.13	57.23	5.22	2.70	-0.12	77.50	16.57
2009-01-10	4.55	22.77	1.80	-4.34	-0.28	0.50	2.95	104.88	3.65
2009-01-10	-2.16	-100.25	-1.17	-46.23	-2.85	-2.58	0.18	-111.67	-16.91
2009-01-11	-1.11	-18.25	-0.80	-4.52	-3.39	1.48	-2.45	-31.16	0.88
2009-01-12	20.75	-108.92	9.15	-5.09	4.21	-1.62	-2.44	143.17	14.25

	aa	aapl	ge	ibm	jnj	msft	pep	spx	xom
Number of days stock raising	505	498	498	488	487	499	489	510	519
Number of days stock decreasing	492	501	500	511	512	498	509	489	480

	0_part	1_part	2_part
0	10	29	07
1	11	30	08
2	12	31	09
3	01	01	10
4	02	02	11
5	03	05
6	04	06
7	05	07
8	06	08
9	07	09
10	08	12
11	09	13
12		14
13		15
14		16
15		19
16		20
17		21
18		23
19		26
20		27
21		28
22		03
23		04
24		10
25		11
26		17
27		18
28		24
29		22
30		25

	0_part	1_part	2_part
0	10	29	07
1	11	30	08
2	12	31	09
3	01	01	10
4	02	02	11
5	03	05
6	04	06
7	05	07
8	06	08
9	07	09
10	08	12
11	09	13
12		14
13		15
14		16
15		19
16		20
17		21
18		23
19		26
20		27
21		28
22		03
23		04
24		10
25		11
26		17
27		18
28		24
29		22
30		25

Helper functions:¶

Задача 1 - Сотрудники и зарплата¶

Подготовка и проверка данных¶

Joining tables by zip-code¶

(1) Рассчитайте зарплату каждого сотрудника, скорректированную с учетом налогов¶

(2) и определите среднюю зарплату среди всех сотрудников¶

(3) и медианную зарплату среди всех сотрудников.¶

Задача 2 - Ниже приведены цифры продаж из магазина за 2009, 2010, 2011, 2012 годы.¶

Подготовка и проверка данных¶

(1) Средние продажи Computer Peripherals за 2009, 2010 гг.¶

(2) Общий объем продаж Furniture и Technology за каждый 2009, 2010 и 2011 годы.¶

(3) Медианный объем продаж Paper за четыре года.¶

(4) Стандартное отклонение продаж Office Supplies в 2012 году.¶

(5) Корреляция между продажами Paper и Computer Peripherals.¶

BONUS #1: let see sales change by timeline on the plotly-plot:¶

(6) Стандартное отклонение количества заказов Office Supplies в 2012 году.¶

Задача 3 - Вам представлены данные на конец торгового дня для различных акций.¶

Подготовка и проверка данных¶

(1) 30-дневное, 60-дневное и 90-дневное скользящие средние и стандартные отклонения¶

BONUS #2: timeline-plot of 30/60/90 days moving average and moving std changes:¶

(2) Общее стандартное отклонение¶

BONUS #3: let see Standard Deviation on the bar-plot:¶

(3) Количество дней, в течение которых цена акций увеличивалась и уменьшалась¶

(1) Средние продажи `Computer Peripherals` за 2009, 2010 гг.¶

(2) Общий объем продаж `Furniture` и `Technology` за каждый 2009, 2010 и 2011 годы.¶

(3) Медианный объем продаж `Paper` за четыре года.¶

(4) Стандартное отклонение продаж `Office Supplies` в 2012 году.¶

(5) Корреляция между продажами `Paper` и `Computer Peripherals`.¶

(6) Стандартное отклонение количества заказов `Office Supplies` в 2012 году.¶

	0_part	1_part	2_part
0	10	29	07
1	11	30	08
2	12	31	09
3	01	01	10
4	02	02	11
5	03	05
6	04	06
7	05	07
8	06	08
9	07	09
10	08	12
11	09	13
12		14
13		15
14		16
15		19
16		20
17		21
18		23
19		26
20		27
21		28
22		03
23		04
24		10
25		11
26		17
27		18
28		24
29		22
30		25