from __future__ import annotations
from typing import Literal
from itertools import cycle
from PIL import ImageColor
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.core.display_functions import display
# set PLOTLY_PLOT_SHOW_TYPE to `None` for interactive, but NOTE: interactive plots can't be visualized in GitHub
PLOTLY_PLOT_SHOW_TYPE: Literal['svg', 'png'] | None = None
# If None - then use nbviewer to render results after the `.ipynb` will be uploaded to the repo.
# See rendered results here:
# https://nbviewer.org/github/MistyDi/data_analytics-math_python_pandas/blob/main/data_analytics_task.ipynb
def computer_readable_columns(*dfs: pd.DataFrame) -> None:
"""
In-place updating of the columns:
- replace " " -> "_"
- replace "-" -> "_"
- do names lower-case (like "Col" -> "col").
NOTE: " " -> "__". Multiple spaces will be multiple underscores.
"""
for df in dfs:
df.columns = list(map(lambda col: col.lower().replace(' ', '_').replace('-', '_'), df.columns))
def get_df_name(df: pd.DataFrame, silent_on_err: bool = True, raise_on_err: bool = False) -> str:
try:
df_name = df.name
except AttributeError as e:
if raise_on_err:
raise
if not silent_on_err:
print(e)
return "" # empy string
return df_name
def get_cols(df: pd.DataFrame, skip_cols: list[str] | None = None) -> list[str]:
cols = list(df.columns)
if skip_cols is not None:
cols = list(set(cols) - set(skip_cols))
return cols
def display_dtypes(*dfs: pd.DataFrame) -> None:
for i, df in enumerate(dfs):
df_name = get_df_name(df)
if df_name:
i_str = ''
df_name = f'[{df_name}] '
else:
i_str = f'[{i}] '
print(f'{i_str}{df_name}Displaying data-types:')
display(pd.DataFrame(df.dtypes).T)
def check_not_null_columns(*dfs: pd.DataFrame) -> None:
for i, df in enumerate(dfs):
# Number of the null values in each column:
na_count_cols = df.isna().sum()
all_not_na = na_count_cols.sum() == 0
df_name = get_df_name(df)
if df_name:
i_str = ''
df_name = f'[{df_name}] '
else:
i_str = f'[{i}] '
if all_not_na:
continue
print(f"{i_str}{df_name}Warning, table contain null-values")
display(na_count_cols.to_frame('Number of null values'))
def display_unique_values(df: pd.DataFrame, *cols, skip_cols: list[str] | None = None,
skip_fillna: bool = False, to_display: bool = True) -> pd.DataFrame | None:
if len(cols) == 0:
cols = get_cols(df, skip_cols)
df_name = get_df_name(df)
if df_name:
df_name = f'[{df_name}] '
concatenated_unique_per_cols: pd.DataFrame | None = None
for col in cols:
if concatenated_unique_per_cols is None:
concatenated_unique_per_cols = pd.DataFrame(df[col].unique(), columns=[col])
continue
concatenated_unique_per_cols = pd.concat(
[concatenated_unique_per_cols, pd.Series(df[col].unique(), name=col)], axis=1)
if concatenated_unique_per_cols is None:
return
if not skip_fillna:
concatenated_unique_per_cols.fillna("", inplace=True)
if to_display:
print(f'{df_name}Unique values per columns:')
display(concatenated_unique_per_cols)
return concatenated_unique_per_cols
def multi_display_unique_values(*dfs: pd.DataFrame, skip_cols: list[str] | None = None,
skip_fillna: bool = False):
for df in dfs:
display_unique_values(df, skip_cols=skip_cols, skip_fillna=skip_fillna)
def display_duplicated_values(df: pd.DataFrame, *cols, skip_cols: list[str] | None = None,
skip_fillna: bool = False):
if len(cols) == 0:
cols = get_cols(df, skip_cols)
df_name = get_df_name(df)
if df_name:
df_name = f'[{df_name}] '
concatenated_duplicates_per_cols: pd.DataFrame | None = None
for col in cols:
if concatenated_duplicates_per_cols is None:
vals_are_unique = df[col].is_unique # check to all values be unique
if vals_are_unique:
continue
print(f'{df_name}[col. {col!r}] contain duplicated values.')
if concatenated_duplicates_per_cols is None:
concatenated_duplicates_per_cols = df[df[col].duplicated(), col]
continue
concatenated_duplicates_per_cols = pd.concat(
[concatenated_duplicates_per_cols, df[df[col].duplicated(), col]], axis=1)
if concatenated_duplicates_per_cols is None:
return
if not skip_fillna:
concatenated_duplicates_per_cols.fillna("", inplace=True)
print(f'{df_name} Warn! Duplicated values per columns found. Duplicated:')
display(concatenated_duplicates_per_cols)
def multi_display_duplicated_values(*dfs: pd.DataFrame, skip_cols: list[str] | None = None,
skip_fillna: bool = False):
for df in dfs:
display_duplicated_values(df, skip_cols=skip_cols, skip_fillna=skip_fillna)
Вам дано два файла. Файл 1 содержит данные о сотрудниках(Emp Id, Name, ZIP code, Salary). Файл 2 содержит налоговую ставку, соответствующую данному ZIP Code(Zip Code, Tax rate).
Файл 1 - emp.csv
Файл 2 - zip_tax.csv
emp_subpath = 'data/emp.csv'
emp_df = pd.read_csv(emp_subpath)
emp_df.name = 'Employee' # name of the table
display(emp_df.head(3))
zip_tax = 'data/zip_tax.csv'
zip_tax_df = pd.read_csv(zip_tax)
zip_tax_df.name = 'Zip taxes' # name of the table
display(zip_tax_df.head(3))
emp id | emp name | ZIP | Salary | |
---|---|---|---|---|
0 | 1 | dldalka | 10025 | 14000 |
1 | 2 | dslbgdl | 10029 | 16000 |
2 | 3 | sdlbgl | 10044 | 10000 |
ZIP | Tax rate | |
---|---|---|
0 | 10001 | 18 |
1 | 10002 | 7 |
2 | 10003 | 0 |
computer_readable_columns(emp_df, zip_tax_df) # fixing column names
print(f'[{emp_df.name}] Column names after fixing:', list(emp_df.columns))
print(f'[{zip_tax_df.name}] Column names after fixing:', list(zip_tax_df.columns))
[Employee] Column names after fixing: ['emp_id', 'emp_name', 'zip', 'salary'] [Zip taxes] Column names after fixing: ['zip', 'tax_rate']
# Display data-types:
display_dtypes(emp_df, zip_tax_df)
# Check null values in columns:
check_not_null_columns(emp_df, zip_tax_df)
# Display duplicated values if existed:
multi_display_duplicated_values(emp_df, zip_tax_df, skip_cols=['zip', 'salary', 'tax_rate'])
# Display unique values if exists:
multi_display_unique_values(emp_df, zip_tax_df, skip_cols=['emp_id'])
# Number of the unique values in each column:
display(emp_df.nunique().to_frame('Number of unique values').T)
[Employee] Displaying data-types:
emp_id | emp_name | zip | salary | |
---|---|---|---|---|
0 | int64 | object | int64 | int64 |
[Zip taxes] Displaying data-types:
zip | tax_rate | |
---|---|---|
0 | int64 | int64 |
[Employee] Unique values per columns:
zip | salary | emp_name | |
---|---|---|---|
0 | 10025.0 | 14000.0 | dldalka |
1 | 10029.0 | 16000.0 | dslbgdl |
2 | 10044.0 | 10000.0 | sdlbgl |
3 | 10009.0 | 6000.0 | dsjgge |
4 | 10005.0 | 17000.0 | dkgds |
5 | 10024.0 | 20000.0 | dskjgwe |
6 | 10045.0 | 9000.0 | sidgkjsbf |
7 | 10048.0 | 11000.0 | hsvvd |
8 | 10035.0 | 5000.0 | dfvd |
9 | 10026.0 | 19000.0 | skdugew |
10 | 10014.0 | 2000.0 | kdsf |
11 | 10049.0 | 15000.0 | dskjfkj |
12 | 10016.0 | 1000.0 | sdljbfgd |
13 | 10020.0 | 13000.0 | skdsffk |
14 | 10030.0 | 18000.0 | akja |
15 | 10040.0 | 8000.0 | dflh |
16 | 10033.0 | 7000.0 | dso |
17 | 10050.0 | 12000.0 | sdloiy |
18 | 10039.0 | 3000.0 | xzkb |
19 | 10034.0 | xloc | |
20 | 10031.0 | lsay | |
21 | 10032.0 | aslfoiiy | |
22 | 10037.0 | woiuy | |
23 | 10041.0 | pqweou | |
24 | 10036.0 | dsaloh | |
25 | 10046.0 | adlh | |
26 | 10051.0 | aslb | |
27 | 10043.0 | alb | |
28 | 10047.0 | wpouj | |
29 | sdaoihy | ||
30 | dlfh | ||
31 | dlh | ||
32 | zxfd | ||
33 | dpiifh | ||
34 | dslihsdlh | ||
35 | dsk | ||
36 | asdoiy | ||
37 | xclk | ||
38 | cxzpoo | ||
39 | sxl | ||
40 | xzh | ||
41 | zxcoihy | ||
42 | ozxcugh | ||
43 | ugh | ||
44 | zsdg | ||
45 | zxv | ||
46 | erpcvih | ||
47 | oaiy | ||
48 | zxoiy | ||
49 | aspoiy |
[Zip taxes] Unique values per columns:
zip | tax_rate | |
---|---|---|
0 | 10001 | 18.0 |
1 | 10002 | 7.0 |
2 | 10003 | 0.0 |
3 | 10004 | 20.0 |
4 | 10005 | 11.0 |
5 | 10006 | 9.0 |
6 | 10007 | 5.0 |
7 | 10008 | 14.0 |
8 | 10009 | 19.0 |
9 | 10010 | 3.0 |
10 | 10011 | 6.0 |
11 | 10012 | 17.0 |
12 | 10013 | 12.0 |
13 | 10014 | 2.0 |
14 | 10015 | 15.0 |
15 | 10016 | 4.0 |
16 | 10017 | 16.0 |
17 | 10018 | 1.0 |
18 | 10019 | 13.0 |
19 | 10020 | |
20 | 10021 | |
21 | 10022 | |
22 | 10023 | |
23 | 10024 | |
24 | 10025 | |
25 | 10026 | |
26 | 10027 | |
27 | 10028 | |
28 | 10029 | |
29 | 10030 | |
30 | 10031 | |
31 | 10032 | |
32 | 10033 | |
33 | 10034 | |
34 | 10035 | |
35 | 10036 | |
36 | 10037 | |
37 | 10038 | |
38 | 10039 | |
39 | 10040 | |
40 | 10041 | |
41 | 10042 | |
42 | 10043 | |
43 | 10044 | |
44 | 10045 | |
45 | 10046 | |
46 | 10047 | |
47 | 10048 | |
48 | 10049 | |
49 | 10050 | |
50 | 10051 |
emp_id | emp_name | zip | salary | |
---|---|---|---|---|
Number of unique values | 50 | 50 | 29 | 19 |
emp_with_taxes_df = pd.merge(emp_df, zip_tax_df, how='left', left_on='zip', right_on='zip')
emp_with_taxes_df.head()
emp_id | emp_name | zip | salary | tax_rate | |
---|---|---|---|---|---|
0 | 1 | dldalka | 10025 | 14000 | 9 |
1 | 2 | dslbgdl | 10029 | 16000 | 15 |
2 | 3 | sdlbgl | 10044 | 10000 | 2 |
3 | 4 | dsjgge | 10009 | 6000 | 19 |
4 | 5 | dkgds | 10005 | 17000 | 11 |
# Gross pay - what employees earn before taxes
# Net pay - what employees earn after taxes
emp_with_taxes_df['net_salary'] = emp_with_taxes_df['salary'] / 100 * (100 - emp_with_taxes_df['tax_rate'])
display(emp_with_taxes_df)
emp_id | emp_name | zip | salary | tax_rate | net_salary | |
---|---|---|---|---|---|---|
0 | 1 | dldalka | 10025 | 14000 | 9 | 12740.0 |
1 | 2 | dslbgdl | 10029 | 16000 | 15 | 13600.0 |
2 | 3 | sdlbgl | 10044 | 10000 | 2 | 9800.0 |
3 | 4 | dsjgge | 10009 | 6000 | 19 | 4860.0 |
4 | 5 | dkgds | 10005 | 17000 | 11 | 15130.0 |
5 | 6 | dskjgwe | 10024 | 16000 | 18 | 13120.0 |
6 | 7 | sidgkjsbf | 10045 | 20000 | 16 | 16800.0 |
7 | 8 | hsvvd | 10048 | 16000 | 1 | 15840.0 |
8 | 9 | dfvd | 10035 | 9000 | 20 | 7200.0 |
9 | 10 | skdugew | 10026 | 11000 | 11 | 9790.0 |
10 | 11 | kdsf | 10014 | 17000 | 6 | 15980.0 |
11 | 12 | dskjfkj | 10045 | 20000 | 16 | 16800.0 |
12 | 13 | sdljbfgd | 10049 | 5000 | 5 | 4750.0 |
13 | 14 | skdsffk | 10016 | 19000 | 20 | 15200.0 |
14 | 15 | akja | 10020 | 2000 | 2 | 1960.0 |
15 | 16 | dflh | 10030 | 2000 | 4 | 1920.0 |
16 | 17 | dso | 10040 | 20000 | 19 | 16200.0 |
17 | 18 | sdloiy | 10026 | 9000 | 11 | 8010.0 |
18 | 19 | xzkb | 10048 | 15000 | 1 | 14850.0 |
19 | 20 | xloc | 10033 | 1000 | 18 | 820.0 |
20 | 21 | lsay | 10050 | 13000 | 4 | 12480.0 |
21 | 22 | aslfoiiy | 10049 | 6000 | 5 | 5700.0 |
22 | 23 | woiuy | 10039 | 13000 | 19 | 10530.0 |
23 | 24 | pqweou | 10040 | 18000 | 19 | 14580.0 |
24 | 25 | dsaloh | 10034 | 9000 | 16 | 7560.0 |
25 | 26 | adlh | 10031 | 18000 | 15 | 15300.0 |
26 | 27 | aslb | 10032 | 6000 | 11 | 5340.0 |
27 | 28 | alb | 10040 | 17000 | 19 | 13770.0 |
28 | 29 | wpouj | 10050 | 11000 | 4 | 10560.0 |
29 | 30 | sdaoihy | 10037 | 9000 | 1 | 8910.0 |
30 | 31 | dlfh | 10048 | 20000 | 1 | 19800.0 |
31 | 32 | dlh | 10041 | 9000 | 0 | 9000.0 |
32 | 33 | zxfd | 10034 | 5000 | 16 | 4200.0 |
33 | 34 | dpiifh | 10036 | 10000 | 17 | 8300.0 |
34 | 35 | dslihsdlh | 10037 | 8000 | 1 | 7920.0 |
35 | 36 | dsk | 10046 | 17000 | 7 | 15810.0 |
36 | 37 | asdoiy | 10051 | 6000 | 4 | 5760.0 |
37 | 38 | xclk | 10050 | 7000 | 4 | 6720.0 |
38 | 39 | cxzpoo | 10041 | 2000 | 0 | 2000.0 |
39 | 40 | sxl | 10039 | 6000 | 19 | 4860.0 |
40 | 41 | xzh | 10050 | 19000 | 4 | 18240.0 |
41 | 42 | zxcoihy | 10049 | 11000 | 5 | 10450.0 |
42 | 43 | ozxcugh | 10043 | 19000 | 13 | 16530.0 |
43 | 44 | ugh | 10050 | 1000 | 4 | 960.0 |
44 | 45 | zsdg | 10047 | 19000 | 13 | 16530.0 |
45 | 46 | zxv | 10051 | 8000 | 4 | 7680.0 |
46 | 47 | erpcvih | 10050 | 12000 | 4 | 11520.0 |
47 | 48 | oaiy | 10050 | 12000 | 4 | 11520.0 |
48 | 49 | zxoiy | 10048 | 10000 | 1 | 9900.0 |
49 | 50 | aspoiy | 10049 | 3000 | 5 | 2850.0 |
print("Mean Net salary among employees:", emp_with_taxes_df['net_salary'].mean())
Mean Net salary among employees: 10213.0
print("Median Net salary among employees:", emp_with_taxes_df['net_salary'].median())
Median Net salary among employees: 10175.0
Определите следующее:
Computer Peripherals
за 2009, 2010 гг.Furniture and Technology
за каждый 2009, 2010 и 2011 годы.Paper
за четыре года.Office Supplies
в 2012 году.Paper
и Computer Peripherals
.Office Supplies
в 2012 году.Имя файла: store_sales.csv
sales_subpath = 'data/store_sales.csv'
sales_df = pd.read_csv(sales_subpath)
sales_df.name = 'Sales'
computer_readable_columns(sales_df) # renaming of the columns
display(sales_df.head())
# Display data-types:
display_dtypes(sales_df)
# Check null values in columns (nothing will be printed if no null values):
check_not_null_columns(sales_df)
order_id | order_date | order_priority | order_quantity | sales | ship_mode | shipping_cost | province | customer_segment | product_category | product_sub_category | product_container | ship_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 928 | 3/1/2011 | Low | 26 | 390.2000 | Express Air | 7.40 | British Columbia | Consumer | Furniture | Office Furnishings | Small Box | 3/3/2011 |
1 | 32323 | 7/23/2010 | High | 38 | 259.7175 | Regular Air | 5.03 | Ontario | Small Business | Technology | Telephones and Communication | Medium Box | 7/25/2010 |
2 | 48353 | 12/15/2012 | Not Specified | 18 | 71.2200 | Regular Air | 0.70 | British Columbia | Corporate | Office Supplies | Pens & Art Supplies | Wrap Bag | 12/17/2012 |
3 | 10144 | 1/2/2011 | Critical | 1 | 192.4900 | Delivery Truck | 30.00 | British Columbia | Corporate | Furniture | Chairs & Chairmats | Jumbo Drum | 1/4/2011 |
4 | 26756 | 5/10/2012 | Medium | 25 | 767.2600 | Regular Air | 4.00 | British Columbia | Home Office | Technology | Computer Peripherals | Small Box | 5/10/2012 |
[Sales] Displaying data-types:
order_id | order_date | order_priority | order_quantity | sales | ship_mode | shipping_cost | province | customer_segment | product_category | product_sub_category | product_container | ship_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | int64 | object | object | int64 | float64 | object | float64 | object | object | object | object | object | object |
Столбцы времени order_date
и ship_date
нужно конвертировать из строки в дату.
# Date columns needed to be converted `object` -> `date` (will be converted to `datetime64[ns]`)
date_fmt: str = '%m/%d/%Y' # date format
sales_df['order_date'] = pd.to_datetime(sales_df['order_date'], format=date_fmt)
sales_df['ship_date'] = pd.to_datetime(sales_df['ship_date'], format=date_fmt)
# Number of the unique values in each column:
display(sales_df.nunique().to_frame('Number of unique values').T)
# Display unique values if exists:
_ = display_unique_values(
sales_df, 'product_category', 'product_sub_category', 'product_container',
'customer_segment', 'ship_mode', 'province', 'order_priority')
order_id | order_date | order_priority | order_quantity | sales | ship_mode | shipping_cost | province | customer_segment | product_category | product_sub_category | product_container | ship_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Number of unique values | 5496 | 1418 | 5 | 50 | 8153 | 3 | 652 | 13 | 4 | 3 | 17 | 7 | 1450 |
[Sales] Unique values per columns:
product_category | product_sub_category | product_container | customer_segment | ship_mode | province | order_priority | |
---|---|---|---|---|---|---|---|
0 | Furniture | Office Furnishings | Small Box | Consumer | Express Air | British Columbia | Low |
1 | Technology | Telephones and Communication | Medium Box | Small Business | Regular Air | Ontario | High |
2 | Office Supplies | Pens & Art Supplies | Wrap Bag | Corporate | Delivery Truck | Northwest Territories | Not Specified |
3 | Chairs & Chairmats | Jumbo Drum | Home Office | Yukon | Critical | ||
4 | Computer Peripherals | Small Pack | Quebec | Medium | |||
5 | Paper | Large Box | Alberta | ||||
6 | Binders and Binder Accessories | Jumbo Box | Saskachewan | ||||
7 | Scissors, Rulers and Trimmers | Prince Edward Island | |||||
8 | Copiers and Fax | Manitoba | |||||
9 | Tables | Nova Scotia | |||||
10 | Storage & Organization | New Brunswick | |||||
11 | Office Machines | Nunavut | |||||
12 | Envelopes | Newfoundland | |||||
13 | Labels | ||||||
14 | Rubber Bands | ||||||
15 | Bookcases | ||||||
16 | Appliances |
Computer Peripherals
за 2009, 2010 гг.¶filtering = sales_df['product_sub_category'] == 'Computer Peripherals'
filtering = filtering & (sales_df['order_date'].dt.year.isin([2009, 2010]))
res_name = 'Mean Sales of `Computer Peripherals`'
sales_df[filtering].groupby(sales_df['order_date'].dt.year)['sales'].mean().to_frame(res_name).reset_index()
order_date | Mean Sales of `Computer Peripherals` | |
---|---|---|
0 | 2009 | 1116.917461 |
1 | 2010 | 872.272338 |
Furniture
и Technology
за каждый 2009, 2010 и 2011 годы.¶# Filtering only needed values:
filtering = sales_df['product_category'].isin(['Furniture', 'Technology'])
filtering = filtering & (sales_df['order_date'].dt.year.isin([2009, 2010, 2011]))
# Grouping by `product_category` and year of the `order_date` field:
sales_grouped = sales_df[filtering].groupby([sales_df['product_category'], sales_df['order_date'].dt.year])
# Summary sales of the categories `Furniture` and `Technology`:
summary_sales_df = sales_grouped['sales'].sum().to_frame('Summary Sales').reset_index()
# Summary number of the orders of the categories `Furniture` and `Technology`:
summary_order_counts_df = sales_grouped['order_quantity'].sum().to_frame('Number of orders').reset_index()
# Outer-Joining for pretty showing in single table:
summary_sales_df = pd.merge(summary_sales_df, summary_order_counts_df, how='outer', left_on=['product_category', 'order_date'], right_on = ['product_category', 'order_date'])
display(summary_sales_df)
product_category | order_date | Summary Sales | Number of orders | |
---|---|---|---|---|
0 | Furniture | 2009 | 1.472672e+06 | 11693 |
1 | Furniture | 2010 | 1.252665e+06 | 11557 |
2 | Furniture | 2011 | 1.269661e+06 | 10664 |
3 | Technology | 2009 | 1.701825e+06 | 13287 |
4 | Technology | 2010 | 1.397142e+06 | 13186 |
5 | Technology | 2011 | 1.366807e+06 | 12152 |
Paper
за четыре года.¶# Filtering only needed values:
filtering = sales_df['product_sub_category'] == 'Paper'
# Grouping by year of the `order_date` field. Calculate mean of sales per each year:
display(sales_df[filtering].groupby([sales_df['order_date'].dt.year]).median()['sales'].to_frame('Median sales').reset_index())
print("Summary median of sales per all years:", sales_df[filtering]['sales'].median())
order_date | Median sales | |
---|---|---|
0 | 2009 | 203.565 |
1 | 2010 | 192.380 |
2 | 2011 | 192.540 |
3 | 2012 | 217.870 |
Summary median of sales per all years: 200.08
Office Supplies
в 2012 году.¶# Filtering only needed values:
filtering = sales_df['product_category'] == 'Office Supplies'
filtering = filtering & (sales_df['order_date'].dt.year == 2012)
print('Standard deviation of the product category `Office Supplies` sales at 2012:',
sales_df[filtering]['sales'].std())
Standard deviation of the product category `Office Supplies` sales at 2012: 2151.485834542503
Paper
и Computer Peripherals
.¶filtering = sales_df['product_sub_category'].isin(['Paper', 'Computer Peripherals'])
res_df = sales_df[filtering].copy() # temp DataFrame to not overwrite the original data
# Pearson’s correlation: covariance(X, Y) / (std(X) * std(Y))
# Where covariance(X, Y) = (sum(x - mean(X)) * (y - mean(Y)) ) * 1/(n-1)
# Note: `.reset_index(drop=True)` is needed because pandas don't allow you to calculate
# the correlation between rows with different indexes values:
paper_sales = res_df[res_df['product_sub_category'] == 'Paper']['sales'].reset_index(drop=True)
computer_peripherals_sales = res_df[res_df['product_sub_category'] == 'Computer Peripherals']['sales'].reset_index(
drop=True)
corr = paper_sales.corr(computer_peripherals_sales)
print("Correlation between sales of the product sub-category `Paper` "
"and another sub-category `Computer Peripherals`: ", corr)
Correlation between sales of the product sub-category `Paper` and another sub-category `Computer Peripherals`: 0.015039019066899894
res_df.sort_values(by=['order_date'], inplace=True)
fig = px.histogram(
res_df, x='order_date', y='sales', color='product_sub_category', marginal='rug',
nbins=100, hover_data=res_df.columns, title='Sales',
)
fig.update_layout(
bargap=0.2
)
fig.show(PLOTLY_PLOT_SHOW_TYPE)
Office Supplies
в 2012 году.¶# Filtering only needed values:
filtering = sales_df['product_category'] == 'Office Supplies'
filtering = filtering & (sales_df['order_date'].dt.year == 2012)
print('Standard deviation of the order quantity for product category `Office Supplies` at 2012:',
sales_df[filtering]['order_quantity'].std())
Standard deviation of the order quantity for product category `Office Supplies` at 2012: 14.500141861634301
Рассчитайте:
Имя файла: stock_data.csv
stock_subpath = 'data/stock_data.csv'
stock_df = pd.read_csv(stock_subpath)
stock_df.name = 'Stock'
computer_readable_columns(stock_df) # fixing column names
display(stock_df.head())
date | aa | aapl | ge | ibm | jnj | msft | pep | spx | xom | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 10/29/07 | 37.41 | 185.09 | 34.46 | 106.78 | 57.13 | 31.78 | 65.67 | 1540.98 | 85.51 |
1 | 10/30/07 | 36.43 | 187.00 | 34.39 | 106.15 | 56.99 | 32.70 | 65.80 | 1531.02 | 83.25 |
2 | 10/31/07 | 36.79 | 189.95 | 34.97 | 108.01 | 57.30 | 33.84 | 65.69 | 1549.38 | 84.03 |
3 | 11/01/07 | 35.22 | 187.44 | 34.27 | 105.72 | 56.85 | 34.07 | 64.51 | 1508.44 | 80.84 |
4 | 11/02/07 | 35.83 | 187.87 | 34.27 | 106.59 | 56.95 | 34.07 | 65.03 | 1509.65 | 80.32 |
# Display data-types:
display_dtypes(stock_df)
# Check null values in columns:
check_not_null_columns(stock_df)
# Display duplicated values if existed:
display_duplicated_values(stock_df, 'date')
# Number of the unique values in each column:
display(stock_df.nunique().to_frame('Number of unique values').T)
[Stock] Displaying data-types:
date | aa | aapl | ge | ibm | jnj | msft | pep | spx | xom | |
---|---|---|---|---|---|---|---|---|---|---|
0 | object | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 |
date | aa | aapl | ge | ibm | jnj | msft | pep | spx | xom | |
---|---|---|---|---|---|---|---|---|---|---|
Number of unique values | 1000 | 752 | 978 | 742 | 929 | 723 | 657 | 757 | 991 | 848 |
Необходимо конвертировать date
из строки в дату.
Попробуем понять формат даты, путём вывода некоторых примеров значений в данном столбце:
def min_max(x):
return pd.Series(index=['min','max'],data=[x.min(),x.max()])
unique_date = stock_df['date'].unique()
print('Min value of the `date` string-column:', unique_date.min())
print('Max value of the `date` string-column:', unique_date.max())
split_date = stock_df['date'].str.split('/', expand=True)
split_date.columns = [f'{col}_part' for col in split_date.columns]
display(split_date.apply(min_max))
split_date_unique = display_unique_values(split_date, to_display=False)
display(split_date_unique.apply(lambda x: x.sort_values()))
# Displaying random sample of data:
display(stock_df['date'].sample(n=10))
Min value of the `date` string-column: 01/02/08 Max value of the `date` string-column: 12/31/10
0_part | 1_part | 2_part | |
---|---|---|---|
min | 01 | 01 | 07 |
max | 12 | 31 | 11 |
0_part | 1_part | 2_part | |
---|---|---|---|
0 | 10 | 29 | 07 |
1 | 11 | 30 | 08 |
2 | 12 | 31 | 09 |
3 | 01 | 01 | 10 |
4 | 02 | 02 | 11 |
5 | 03 | 05 | |
6 | 04 | 06 | |
7 | 05 | 07 | |
8 | 06 | 08 | |
9 | 07 | 09 | |
10 | 08 | 12 | |
11 | 09 | 13 | |
12 | 14 | ||
13 | 15 | ||
14 | 16 | ||
15 | 19 | ||
16 | 20 | ||
17 | 21 | ||
18 | 23 | ||
19 | 26 | ||
20 | 27 | ||
21 | 28 | ||
22 | 03 | ||
23 | 04 | ||
24 | 10 | ||
25 | 11 | ||
26 | 17 | ||
27 | 18 | ||
28 | 24 | ||
29 | 22 | ||
30 | 25 |
205 08/21/08 53 01/15/08 443 08/03/09 873 04/15/11 59 01/24/08 667 06/23/10 880 04/27/11 77 02/20/08 537 12/15/09 457 08/21/09 Name: date, dtype: object
0_part
- скорее всего месяц, т.к. уникальных значений из этого столбца больше, чем уникальных значений из 2_part
.
1_part
- это день.
2_part
- скорее всего год.
date_fmt: str = '%m/%d/%Y' # date format
stock_df['date'] = pd.to_datetime(sales_df['order_date'], format=date_fmt)
Говоря об использовании оконной функции в pandas, обратимся к документации .rolling(...)
метода. [pandas-docs of the
.rolling(...)
]:
DataFrame.rolling(window, min_periods=None, center=False, win_type=None, on=None, axis=0, closed=None, method='single')
Provide rolling window calculations.
Parameters:
window
: int
, offset
, or BaseIndexer
subclass
datetime
-like indexes. ......
*То есть если хотим отсчитывать по периоду в N-дней, то нужно иметь индекс
DataFrame
'а на основеdatetime
.
Причем также нужно использовать размер окна, задаваемый через offset
, например, 30 дней = "30D", 5 секунд = "5S". *
stock_dtm_sorted_df = stock_df.set_index('date')
# NOTE: don't forget to sort your data by time:
stock_dtm_sorted_df.sort_index(inplace=True)
companies: set[str] = set(stock_dtm_sorted_df.columns)
stock_dtm_sorted_df.head()
aa | aapl | ge | ibm | jnj | msft | pep | spx | xom | |
---|---|---|---|---|---|---|---|---|---|
date | |||||||||
2009-01-02 | 10.83 | 259.62 | 14.37 | 125.16 | 57.91 | 23.55 | 61.14 | 1077.96 | 56.99 |
2009-01-06 | 9.71 | 369.80 | 15.50 | 182.39 | 63.13 | 26.25 | 61.02 | 1155.46 | 73.56 |
2009-01-10 | 14.26 | 392.57 | 17.30 | 178.05 | 62.85 | 26.75 | 63.97 | 1260.34 | 77.21 |
2009-01-10 | 12.10 | 292.32 | 16.13 | 131.82 | 60.00 | 24.17 | 64.15 | 1148.67 | 60.30 |
2009-01-11 | 10.99 | 274.07 | 15.33 | 127.30 | 56.61 | 25.65 | 61.70 | 1117.51 | 61.18 |
rolling_range2dtm_stock = dict()
days = [30, 60, 90]
for d in days:
offset_str = f'{d}D'
# NOTE: `.dropna()` will drop first `d`-rows, because they can't be calculated
rolling_range2dtm_stock[offset_str] = stock_dtm_sorted_df.rolling(offset_str).agg(
[pd.Series.mean, pd.Series.std]).dropna()
print(f'\nMoving Mean and Std by {d} days period:')
display(rolling_range2dtm_stock[offset_str].head())
Moving Mean and Std by 30 days period:
aa | aapl | ge | ibm | jnj | msft | pep | spx | xom | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | |
date | ||||||||||||||||||
2009-01-06 | 10.270000 | 0.791960 | 314.710000 | 77.909025 | 14.935000 | 0.799031 | 153.775000 | 40.467721 | 60.520000 | 3.691097 | 24.900000 | 1.909188 | 61.080000 | 0.084853 | 1116.710000 | 54.800776 | 65.275000 | 11.716759 |
2009-01-10 | 11.600000 | 2.370717 | 340.663333 | 71.102986 | 15.723333 | 1.477712 | 161.866667 | 31.862885 | 61.296667 | 2.936279 | 25.516667 | 1.721434 | 62.043333 | 1.669621 | 1164.586667 | 91.531897 | 69.253333 | 10.776021 |
2009-01-10 | 11.725000 | 1.951760 | 328.577500 | 62.886346 | 15.825000 | 1.223560 | 154.355000 | 30.042129 | 60.972500 | 2.483578 | 25.180000 | 1.558504 | 62.570000 | 1.722769 | 1160.607500 | 75.158015 | 67.015000 | 9.871962 |
2009-01-11 | 11.578000 | 1.721938 | 317.676000 | 59.667687 | 15.726000 | 1.082511 | 148.944000 | 28.693061 | 60.100000 | 2.903860 | 25.274000 | 1.365972 | 62.396000 | 1.541859 | 1151.988000 | 67.882430 | 65.848000 | 8.938745 |
2009-01-12 | 14.938333 | 8.373953 | 292.255000 | 82.009449 | 17.185000 | 3.702641 | 144.488333 | 27.888191 | 60.220000 | 2.613871 | 25.066667 | 1.323112 | 61.873333 | 1.881740 | 1170.103333 | 75.202468 | 67.445000 | 8.900752 |
Moving Mean and Std by 60 days period:
aa | aapl | ge | ibm | jnj | msft | pep | spx | xom | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | |
date | ||||||||||||||||||
2009-01-06 | 10.270000 | 0.791960 | 314.710000 | 77.909025 | 14.935000 | 0.799031 | 153.775000 | 40.467721 | 60.520000 | 3.691097 | 24.900000 | 1.909188 | 61.080000 | 0.084853 | 1116.710000 | 54.800776 | 65.275000 | 11.716759 |
2009-01-10 | 11.600000 | 2.370717 | 340.663333 | 71.102986 | 15.723333 | 1.477712 | 161.866667 | 31.862885 | 61.296667 | 2.936279 | 25.516667 | 1.721434 | 62.043333 | 1.669621 | 1164.586667 | 91.531897 | 69.253333 | 10.776021 |
2009-01-10 | 11.725000 | 1.951760 | 328.577500 | 62.886346 | 15.825000 | 1.223560 | 154.355000 | 30.042129 | 60.972500 | 2.483578 | 25.180000 | 1.558504 | 62.570000 | 1.722769 | 1160.607500 | 75.158015 | 67.015000 | 9.871962 |
2009-01-11 | 11.578000 | 1.721938 | 317.676000 | 59.667687 | 15.726000 | 1.082511 | 148.944000 | 28.693061 | 60.100000 | 2.903860 | 25.274000 | 1.365972 | 62.396000 | 1.541859 | 1151.988000 | 67.882430 | 65.848000 | 8.938745 |
2009-01-12 | 14.938333 | 8.373953 | 292.255000 | 82.009449 | 17.185000 | 3.702641 | 144.488333 | 27.888191 | 60.220000 | 2.613871 | 25.066667 | 1.323112 | 61.873333 | 1.881740 | 1170.103333 | 75.202468 | 67.445000 | 8.900752 |
Moving Mean and Std by 90 days period:
aa | aapl | ge | ibm | jnj | msft | pep | spx | xom | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | mean | std | |
date | ||||||||||||||||||
2009-01-06 | 10.270000 | 0.791960 | 314.710000 | 77.909025 | 14.935000 | 0.799031 | 153.775000 | 40.467721 | 60.520000 | 3.691097 | 24.900000 | 1.909188 | 61.080000 | 0.084853 | 1116.710000 | 54.800776 | 65.275000 | 11.716759 |
2009-01-10 | 11.600000 | 2.370717 | 340.663333 | 71.102986 | 15.723333 | 1.477712 | 161.866667 | 31.862885 | 61.296667 | 2.936279 | 25.516667 | 1.721434 | 62.043333 | 1.669621 | 1164.586667 | 91.531897 | 69.253333 | 10.776021 |
2009-01-10 | 11.725000 | 1.951760 | 328.577500 | 62.886346 | 15.825000 | 1.223560 | 154.355000 | 30.042129 | 60.972500 | 2.483578 | 25.180000 | 1.558504 | 62.570000 | 1.722769 | 1160.607500 | 75.158015 | 67.015000 | 9.871962 |
2009-01-11 | 11.578000 | 1.721938 | 317.676000 | 59.667687 | 15.726000 | 1.082511 | 148.944000 | 28.693061 | 60.100000 | 2.903860 | 25.274000 | 1.365972 | 62.396000 | 1.541859 | 1151.988000 | 67.882430 | 65.848000 | 8.938745 |
2009-01-12 | 14.938333 | 8.373953 | 292.255000 | 82.009449 | 17.185000 | 3.702641 | 144.488333 | 27.888191 | 60.220000 | 2.613871 | 25.066667 | 1.323112 | 61.873333 | 1.881740 | 1170.103333 | 75.202468 | 67.445000 | 8.900752 |
def plotly_hex2rgba_color(hex_color: str, opacity: float):
rgb_clr = ImageColor.getcolor(hex_color, "RGB")
return f'rgba({rgb_clr[0]},{rgb_clr[1]},{rgb_clr[2]},{opacity})'
clr_opacity = 0.2 # color opacity for the standard deviation band-plot
line_modes = cycle(['solid', 'dashdot', 'dot', 'dash'])
fig = go.Figure()
for offset_str, rolling_stock_df in rolling_range2dtm_stock.items():
line_mode = next(line_modes)
companies: set[str] = set([col_subcol[0] for col_subcol in rolling_stock_df.columns])
_df = rolling_stock_df.reset_index() # `date` column will be moved from index to column
_df = _df.sort_values(by='date')
# Reset of the colors:
# colors = cycle(px.colors.qualitative.Light24) # wrapping with `cycle` for cycled colors getting
colors = cycle(px.colors.qualitative.Dark24) # wrapping with `cycle` for cycled colors getting
# More colors: https://plotly.com/python/discrete-color/#color-sequences-in-plotly-express
for company, color in zip(companies, colors):
# Create upper bound trace
fig.add_trace(go.Scatter(
# name=f'{company}: Upper Bound',
name=f'{offset_str} Upper Bound',
legendgroup=company,
showlegend=False,
x=_df['date'],
y=_df[company]['mean'] + _df[company]['std'],
mode='lines',
fillcolor=plotly_hex2rgba_color(color, clr_opacity),
# line_color='rgba(255,255,255,0)',
line_color=plotly_hex2rgba_color(color, clr_opacity),
line=dict(width=0, dash=line_mode),
))
# Create lower bound trace
fig.add_trace(go.Scatter(
# name=f'{company}: Lower Bound',
name=f'{offset_str} Lower Bound',
legendgroup=company,
showlegend=False,
x=_df['date'],
y=_df[company]['mean'] - _df[company]['std'],
mode='lines',
fillcolor=plotly_hex2rgba_color(color, clr_opacity),
# line_color='rgba(255,255,255,0)',
line_color=plotly_hex2rgba_color(color, clr_opacity),
fill='tonexty',
line=dict(width=0, dash=line_mode),
))
# Adding mean measure trace
fig.add_trace(go.Scatter(
x=_df['date'],
y=_df[company]['mean'],
mode='lines+markers',
name=f'{company}: {offset_str} mean',
legendgroup=company,
line=dict(color=color, dash=line_mode),
marker=dict(size=2),
# error_y=dict(array=df[company]['std'], visible=False),
))
fig.update_layout(
yaxis_title='Stock Price',
xaxis_title='Date',
title='Mean and Std of stock market',
)
fig.show(PLOTLY_PLOT_SHOW_TYPE)
# FIXME legend day-buttons: add buttons to filter by "90D", "60D", "30D" and see all companies at stock
std_stock_df = stock_dtm_sorted_df.std().to_frame('Standard deviation').sort_values(by='Standard deviation')
display(std_stock_df.T)
msft | jnj | ge | pep | xom | aa | ibm | aapl | spx | |
---|---|---|---|---|---|---|---|---|---|
Standard deviation | 3.746426 | 4.515169 | 6.168905 | 6.531849 | 8.131698 | 9.108915 | 26.797665 | 90.627523 | 183.562134 |
fig = px.bar(std_stock_df)
fig.update_layout(
yaxis_title='Standard deviation',
xaxis_title='Company',
title='Std of stock market per all time',
)
fig.show(PLOTLY_PLOT_SHOW_TYPE)
# NOTE 1: `.dropna()` will drop first row, because it can't be calculated with diff
# NOTE 2: don't forget to sort your data by time. Actually - already sorted by time.
# stock_dtm_sorted_df.sort_index(inplace=True)
stock_diff_df = stock_dtm_sorted_df.diff().dropna()
display(stock_diff_df.head()) # stock_dtm_sorted_df is already sorted
def positive(col):
return (col > 0).sum()
def negative(col):
return (col < 0).sum()
pos_neg_stock = stock_diff_df.agg([positive, negative]).rename(index={
positive.__name__: 'Number of days stock raising',
negative.__name__: 'Number of days stock decreasing'})
display(pos_neg_stock)
aa | aapl | ge | ibm | jnj | msft | pep | spx | xom | |
---|---|---|---|---|---|---|---|---|---|
date | |||||||||
2009-01-06 | -1.12 | 110.18 | 1.13 | 57.23 | 5.22 | 2.70 | -0.12 | 77.50 | 16.57 |
2009-01-10 | 4.55 | 22.77 | 1.80 | -4.34 | -0.28 | 0.50 | 2.95 | 104.88 | 3.65 |
2009-01-10 | -2.16 | -100.25 | -1.17 | -46.23 | -2.85 | -2.58 | 0.18 | -111.67 | -16.91 |
2009-01-11 | -1.11 | -18.25 | -0.80 | -4.52 | -3.39 | 1.48 | -2.45 | -31.16 | 0.88 |
2009-01-12 | 20.75 | -108.92 | 9.15 | -5.09 | 4.21 | -1.62 | -2.44 | 143.17 | 14.25 |
aa | aapl | ge | ibm | jnj | msft | pep | spx | xom | |
---|---|---|---|---|---|---|---|---|---|
Number of days stock raising | 505 | 498 | 498 | 488 | 487 | 499 | 489 | 510 | 519 |
Number of days stock decreasing | 492 | 501 | 500 | 511 | 512 | 498 | 509 | 489 | 480 |
# NOTE 1: `.dropna()` will drop first row, because it can't be calculated with diff
# NOTE 2: don't forget to sort your data by time. Actually - already sorted by time.
# stock_dtm_sorted_df.sort_index(inplace=True)
stock_diff_df = stock_dtm_sorted_df.diff().dropna()
display(stock_diff_df.head()) # stock_dtm_sorted_df is already sorted
def positive(col):
return (col > 0).sum()
def negative(col):
return (col < 0).sum()
pos_neg_stock = stock_diff_df.agg([positive, negative]).rename(index={
positive.__name__: 'Number of days stock raising',
negative.__name__: 'Number of days stock decreasing'})
display(pos_neg_stock)
aa | aapl | ge | ibm | jnj | msft | pep | spx | xom | |
---|---|---|---|---|---|---|---|---|---|
date | |||||||||
2009-01-06 | -1.12 | 110.18 | 1.13 | 57.23 | 5.22 | 2.70 | -0.12 | 77.50 | 16.57 |
2009-01-10 | 4.55 | 22.77 | 1.80 | -4.34 | -0.28 | 0.50 | 2.95 | 104.88 | 3.65 |
2009-01-10 | -2.16 | -100.25 | -1.17 | -46.23 | -2.85 | -2.58 | 0.18 | -111.67 | -16.91 |
2009-01-11 | -1.11 | -18.25 | -0.80 | -4.52 | -3.39 | 1.48 | -2.45 | -31.16 | 0.88 |
2009-01-12 | 20.75 | -108.92 | 9.15 | -5.09 | 4.21 | -1.62 | -2.44 | 143.17 | 14.25 |
aa | aapl | ge | ibm | jnj | msft | pep | spx | xom | |
---|---|---|---|---|---|---|---|---|---|
Number of days stock raising | 505 | 498 | 498 | 488 | 487 | 499 | 489 | 510 | 519 |
Number of days stock decreasing | 492 | 501 | 500 | 511 | 512 | 498 | 509 | 489 | 480 |