import pandas as pd
# ensure that all columns are shown and that colum content is not cut
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width',1000)
pd.set_option('display.max_rows', 500) # ensure that all rows are shown
BalanceSheetStandardizer
¶==========================================================
If you find this tool useful, a sponsorship would be greatly appreciated!
https://github.com/sponsors/HansjoergW
How to get in touch
==========================================================
In the 07_00_stanardizer_basics.ipynb
we looked at the basic principles of the standardizer. And now we are going to explore the details of the BalanceSheetStandardizer
.
The main Goal of the BalanceSheetStandardizer
is to provide a consilidated, standardized view that contains the main positions of a balance sheet.
The current implementation tries to find/calculate the values for the following positions:
As input, we are going to use the dataset which was created with the 06_bulk_data_processing_deep_dive.ipynb
. That dataset contains all available data for balance sheets. The path to this dataset - on my machine - is either set/parallel/BS/joined
or set/serial/BS/joined
depending whether it was produced with the faster parallel or slower serial processing approach.
The data is already filtered for 10-K and 10-Q reports. Moreover, the following filters were applied as well: ReportPeriodRawFilter
, MainCoregRawFilter
, OfficialTagsOnlyRawFilter
, USDOnlyRawFilter
. The dataset is already joined, so we can use it directly with the BalanceSheetStandardizer
.
Of course, if you prefer another dataset, for instance all data of a few companies, or all data of a single year, feel free to do so.
# As an alternative, using the data of a single year
from secfsdstools.d_container.databagmodel import JoinedDataBag
from secfsdstools.e_collector.zipcollecting import ZipCollector
from secfsdstools.u_usecases.bulk_loading import default_postloadfilter
collector = ZipCollector.get_zip_by_names(names=["2022q1.zip", "2022q2.zip", "2022q3.zip", "2022q4.zip"],
forms_filter=["10-K", "10-Q"],
stmt_filter=["BS"], post_load_filter=default_postloadfilter)
all_bs_joinedbag: JoinedDataBag = collector.collect().join()
from secfsdstools.d_container.databagmodel import JoinedDataBag
from secfsdstools.f_standardize.bs_standardize import BalanceSheetStandardizer
bs_standardizer = BalanceSheetStandardizer()
# standardize the data
all_bs_joinedbag.present(bs_standardizer)
from secfsdstools.d_container.databagmodel import JoinedDataBag
from secfsdstools.f_standardize.bs_standardize import BalanceSheetStandardizer
print("loading data...")
all_bs_joinedbag:JoinedDataBag = JoinedDataBag.load(target_path="set/parallel/BS/joined")
bs_standardizer = BalanceSheetStandardizer()
# standardize the data
all_bs_joinedbag.present(bs_standardizer)
loading data...
2025-02-04 06:22:36,001 [INFO] standardizing start PRE processing ... 2025-02-04 06:22:46,701 [INFO] standardizing start MAIN processing ... 2025-02-04 06:22:47,979 [INFO] standardizing start POST processing ... 2025-02-04 06:22:48,085 [INFO] standardizing start FINALIZE ...
adsh | cik | name | form | fye | fy | fp | date | filed | coreg | report | ddate | qtrs | Assets | AssetsCurrent | Cash | AssetsNoncurrent | Liabilities | LiabilitiesCurrent | LiabilitiesNoncurrent | Equity | HolderEquity | RetainedEarnings | AdditionalPaidInCapital | TreasuryStockValue | TemporaryEquity | RedeemableEquity | LiabilitiesAndEquity | AssetsCheck_error | AssetsCheck_cat | LiabilitiesCheck_error | LiabilitiesCheck_cat | EquityCheck_error | EquityCheck_cat | AssetsLiaEquCheck_error | AssetsLiaEquCheck_cat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
200445 | 0001096906-21-001168 | 1089297 | NOVAGANT CORP | 10-K | 1231 | 2004.0 | FY | 2004-12-31 | 20210517 | 2 | 20041231 | 0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 9.145700e+04 | 9.145700e+04 | 0.000000e+00 | -9.145700e+04 | -9.145700e+04 | -1.832780e+07 | 1.821434e+07 | 0.000000e+00 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
200446 | 0001096906-21-001172 | 1089297 | NOVAGANT CORP | 10-K | 1231 | 2005.0 | FY | 2005-12-31 | 20210517 | 2 | 20051231 | 0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 9.145700e+04 | 9.145700e+04 | 0.000000e+00 | -9.145700e+04 | -9.145700e+04 | -1.832780e+07 | 1.821434e+07 | 0.000000e+00 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
200449 | 0001096906-21-001180 | 1089297 | NOVAGANT CORP | 10-K | 1231 | 2006.0 | FY | 2006-12-31 | 20210517 | 2 | 20061231 | 0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 9.145700e+04 | 9.145700e+04 | 0.000000e+00 | -9.145700e+04 | -9.145700e+04 | -1.832780e+07 | 1.821434e+07 | 0.000000e+00 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
200450 | 0001096906-21-001182 | 1089297 | NOVAGANT CORP | 10-K | 1231 | 2007.0 | FY | 2007-12-31 | 20210517 | 2 | 20071231 | 0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 9.145700e+04 | 9.145700e+04 | 0.000000e+00 | -9.145700e+04 | -9.145700e+04 | -1.832780e+07 | 1.821434e+07 | 0.000000e+00 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
200451 | 0001096906-21-001184 | 1089297 | NOVAGANT CORP | 10-K | 1231 | 2008.0 | FY | 2008-12-31 | 20210517 | 2 | 20081231 | 0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 9.145700e+04 | 9.145700e+04 | 0.000000e+00 | -9.145700e+04 | -9.145700e+04 | -1.832780e+07 | 1.821434e+07 | 0.000000e+00 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
161593 | 0000723125-24-000047 | 723125 | MICRON TECHNOLOGY INC | 10-Q | 0831 | 2025.0 | Q1 | 2024-11-30 | 20241219 | 4 | 20241130 | 0 | 7.146100e+10 | 2.449300e+10 | 6.693000e+09 | 4.696800e+10 | 2.466400e+10 | 9.015000e+09 | 1.564900e+10 | 4.679700e+10 | 4.679700e+10 | 4.242700e+10 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 7.146100e+10 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
163923 | 0000909832-24-000079 | 909832 | COSTCO WHOLESALE CORP /NEW | 10-Q | 0831 | 2025.0 | Q1 | 2024-11-30 | 20241219 | 4 | 20241130 | 0 | 7.338600e+10 | 3.752300e+10 | 1.090700e+10 | 3.586300e+10 | 4.893500e+10 | 3.828900e+10 | 1.064600e+10 | 2.445100e+10 | 2.445100e+10 | 1.870000e+10 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 7.338600e+10 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
161657 | 0001193125-24-281288 | 40704 | GENERAL MILLS INC | 10-Q | 0531 | 2025.0 | Q2 | 2024-11-30 | 20241218 | 4 | 20241130 | 0 | 3.339610e+10 | 7.381400e+09 | 2.292800e+09 | 2.601470e+10 | 2.394690e+10 | 8.024300e+09 | 1.592260e+10 | 9.449200e+09 | 9.449200e+09 | 2.134030e+10 | 0.000000e+00 | -1.087330e+10 | 0.0 | 0.0 | 3.339610e+10 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
162289 | 0001640334-24-001969 | 1584480 | STARTECH LABS, INC. | 10-Q | 0531 | 2025.0 | Q2 | 2024-11-30 | 20241231 | 2 | 20241130 | 0 | 0.000000e+00 | 0.000000e+00 | NaN | 0.000000e+00 | 4.106260e+05 | 4.106260e+05 | 0.000000e+00 | -4.106260e+05 | -4.106260e+05 | -3.912590e+07 | 3.865462e+07 | 0.000000e+00 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
238421 | 0001835681-24-000069 | 1835681 | POWERSCHOOL HOLDINGS, INC. | 10-Q | 1231 | 2024.0 | Q1 | 2024-12-31 | 20240507 | 2 | 20241231 | 0 | 3.766867e+09 | 1.301550e+08 | 1.742500e+07 | 3.636712e+09 | 2.022553e+09 | 5.362210e+08 | 1.486332e+09 | 1.744314e+09 | 1.744314e+09 | -2.379450e+08 | 1.532371e+09 | 0.000000e+00 | 0.0 | 0.0 | 3.766867e+09 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
346570 rows × 36 columns
At first, we will save the results, including all the logs, so that we can use it directly in the future, without the need to process it again.
Note: you need to create the target directory before storing the data
import os
target_path = "standardized/BS"
os.makedirs(target_path, exist_ok=True)
bs_standardizer.get_standardize_bag().save(target_path)
Once the data has been standardized and saved, you can load it directly.
from secfsdstools.f_standardize.standardizing import StandardizedBag
bs_standardizer_result_bag = StandardizedBag.load("standardized/BS")
Before we dive into what the BalanceSheetStandardizer
does, lets get a first impression of the the produced data. First, let us see how many rows we have.
len(bs_standardizer_result_bag.result_df)
346570
Next, a good idea is to look at the validation_overview_df
. This table gives an idea about the "quality" of the dateset based on the summary of the results of the applied validation rules.
bs_standardizer_result_bag.validation_overview_df
AssetsCheck_cat | LiabilitiesCheck_cat | EquityCheck_cat | AssetsLiaEquCheck_cat | AssetsCheck_cat_pct | LiabilitiesCheck_cat_pct | EquityCheck_cat_pct | AssetsLiaEquCheck_cat_pct | |
---|---|---|---|---|---|---|---|---|
0 | 341277 | 329215 | 325014 | 324751 | 98.47 | 94.99 | 93.78 | 93.70 |
1 | 333 | 2226 | 6409 | 6549 | 0.10 | 0.64 | 1.85 | 1.89 |
5 | 614 | 3499 | 3578 | 3616 | 0.18 | 1.01 | 1.03 | 1.04 |
10 | 487 | 2841 | 1406 | 1425 | 0.14 | 0.82 | 0.41 | 0.41 |
100 | 3859 | 8789 | 9892 | 9958 | 1.11 | 2.54 | 2.85 | 2.87 |
This seems to be quite ok, since we have around 95% of the data in the first two categories. As a reminder, Category 0 means it is an exact match, catagory 1 means that it is less than 1 percent off the expected value (see notebook 07_00_standardizer_basics.ipynb
for details).
The following examples are just some ideas to show, what we could do with the standardized balance sheet dataset.
First let us have a look at the distribution of Equity using a box plot.
import matplotlib.pyplot as plt
data = bs_standardizer_result_bag.result_df.Equity[:1000]
plt.boxplot(data, vert=False)
plt.xscale('log') # using a logarithmic scale, we will lose negativ values though
Let's figure out, which report has the most Equity and then try to show the history of the Equity for that company.
bs_standardizer_result_bag.result_df[bs_standardizer_result_bag.result_df.Equity == bs_standardizer_result_bag.result_df.Equity.max()]
adsh | cik | name | form | fye | fy | fp | date | filed | coreg | report | ddate | qtrs | Assets | AssetsCurrent | Cash | AssetsNoncurrent | Liabilities | LiabilitiesCurrent | LiabilitiesNoncurrent | Equity | HolderEquity | RetainedEarnings | AdditionalPaidInCapital | TreasuryStockValue | TemporaryEquity | RedeemableEquity | LiabilitiesAndEquity | AssetsCheck_error | AssetsCheck_cat | LiabilitiesCheck_error | LiabilitiesCheck_cat | EquityCheck_error | EquityCheck_cat | AssetsLiaEquCheck_error | AssetsLiaEquCheck_cat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
277912 | 0001140361-11-038595 | 1115055 | PINNACLE FINANCIAL PARTNERS INC | 10-Q | 1231 | 2011.0 | Q2 | 2011-06-30 | 20110729 | 2 | 20110630 | 0 | 4.831333e+12 | 4.831333e+12 | 2.785681e+11 | 0.0 | 4.132105e+12 | 4.132105e+12 | 0.0 | 6.992280e+11 | 6.992280e+11 | 1.986414e+10 | 5.335573e+11 | 0.0 | 0.0 | 0.0 | 4.831333e+12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Since we used the present
method of the standardizer, the cik, form, fye, fy, and fp attributes from the sub_df were directly merged in the result. Also a date
column with a date datatype was added and the data is already sorted by date.
Next, get reports for this company and filter our standardized balance sheet data for it.
reports_of_1115055 = bs_standardizer_result_bag.result_df[bs_standardizer_result_bag.result_df.cik==1115055]
# Plotting
plt.plot(reports_of_1115055['date'], reports_of_1115055['Equity'], linestyle='-')
plt.yscale('log')
The first data point is obviously faulty.
Let us repeat the steps for apple (cik=320193) and display Equity, Assets, and Liabilities.
apple_reports_df = bs_standardizer_result_bag.result_df[bs_standardizer_result_bag.result_df.cik==320193]
# Plotting
plt.plot(apple_reports_df['date'], apple_reports_df['Equity'], label='Equity', linestyle='-')
plt.plot(apple_reports_df['date'], apple_reports_df['Assets'], label='Assets', linestyle='-')
plt.plot(apple_reports_df['date'], apple_reports_df['Liabilities'], label='Liabilities', linestyle='-')
plt.legend()
<matplotlib.legend.Legend at 0x14a01bba830>
Let's visualize and compare the history of euqity for a few companies:
ciks_to_consider = [320193, 789019, 1652044, 1045810, 1018724, 2488, 50863] # Apple, Microsoft, Alphabet, nvidia, Amazon, AMD, intel
df = bs_standardizer_result_bag.result_df[bs_standardizer_result_bag.result_df.cik.isin(ciks_to_consider)].copy()
# Group by 'name' and plot equity for each group
# Note: using the `present` method ensured that the same cik has always the same name even if the company name did change in the past
for name, group in df.groupby('name'):
plt.plot(group['date'], group['Equity'], label=name, linestyle='-')
# Add labels and title
plt.xlabel('Date')
plt.ylabel('Equity')
plt.title('Equity Over Time for Different Companies (CIKs)')
# Display legend
plt.legend()
<matplotlib.legend.Legend at 0x14a0132cd90>
Cloud providers seem to do exceptionally well these days: alphabet, amazon, and microsoft almost have an exponential grow in equity.
What was going with AMD? They had a massive increase in Equity in 2022. AssetsNoncurrent did increase 10 fold ...
amd_reports_df = bs_standardizer_result_bag.result_df[bs_standardizer_result_bag.result_df.cik==2488]
plt.plot(amd_reports_df['date'], amd_reports_df['Equity'], label='Equity', linestyle='-')
plt.plot(amd_reports_df['date'], amd_reports_df['Assets'], label='Assets', linestyle='-')
plt.plot(amd_reports_df['date'], amd_reports_df['AssetsCurrent'], label='AssetsCurrent', linestyle='-')
plt.plot(amd_reports_df['date'], amd_reports_df['AssetsNoncurrent'], label='AssetsNoncurrent', linestyle='-')
plt.plot(amd_reports_df['date'], amd_reports_df['Liabilities'], label='Liabilities', linestyle='-')
plt.legend()
<matplotlib.legend.Legend at 0x14a01bb87c0>
With the Balance Sheet Standardizer, we have the possibility to actually compare data between companies and also to create input for ML models.
The great thing is, that we can do this with official and free data of over 300'000 reports filed by about 14'000 companies since 2010.
Thanks to secfsdstools package, we have the possibility to gather and filter the data in a simple and efficient way, which otherwise would only be possible if you pay for the data. And you have all the data on your computer, no need to for slow api calls.
The Standardizer framework is simple and can be extended with additional rules to make other data points available. With the validation rules we also have a way to assess the quality of single rows in the dataset.
Of course, calculating financial ratios based on the standardized dataset is really simple now.
Also the size of the standardized dataset (about 30MB) is really easy to handle.
Note:
The following section tries to explain how the results are calculated and what kind of rules are applied. It isn't really necessary to understand this section in detail, but it gives you an idea what happens under the hood.
Next, let us see how often which rule was applied. This gives an idea about how much "calculation" had to be done in order to create a standardized dataset. We can to this by looking at the applied_rules_sum_s
pandas Series object.
bs_standardizer_result_bag.applied_rules_sum_s
0 NaN 0 PREPIVOT_BS_PREPIV_#1_DeDup 1450 PRE_BS_PRE_#1_Assets/AssetsNoncurrent 122 PRE_BS_PRE_#2_Assets/AssetsCurrent 0 MAIN_1_BS_#1_BR_#1_Assets<-AssetsNet 160 MAIN_1_BS_#1_BR_#2_Cash<-CashAndCashEquivalentsAtCarryingValue 274082 MAIN_1_BS_#1_BR_#3_LiabilitiesAndEquity<-LiabilitiesAndStockholdersEquity 341667 MAIN_1_BS_#1_BR_#4_RetainedEarnings<-RetainedEarningsAccumulatedDeficit 308581 MAIN_1_BS_#2_EQ_#1_HolderEquity<-StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest 93291 MAIN_1_BS_#2_EQ_#2_HolderEquity<-PartnersCapital 9738 MAIN_1_BS_#2_EQ_#3_HolderEquity<-StockholdersEquity 236497 MAIN_1_BS_#2_EQ_#4_TemporaryEquity 12292 MAIN_1_BS_#2_EQ_#5_RedeemableEquity 9618 MAIN_1_BS_#2_EQ_#6_Equity 340030 MAIN_1_BS_#3_SC_#1_Assets 89 MAIN_1_BS_#3_SC_#2_AssetsCurrent 103 MAIN_1_BS_#3_SC_#3_AssetsNoncurrent 244615 MAIN_1_BS_#3_SC_#4_Liabilities 16745 MAIN_1_BS_#3_SC_#5_LiabilitiesCurrent 74 MAIN_1_BS_#3_SC_#6_LiabilitiesNoncurrent 173265 MAIN_1_BS_#3_SC_#7_Assets 1896 MAIN_1_BS_#3_SC_#8_Liabilities 51532 MAIN_1_BS_#3_SC_#9_Equity 4942 MAIN_1_BS_#3_SC_#10_LiabilitiesAndEquity 3960 MAIN_1_BS_#3_SC_#11_Liabilities 1389 MAIN_1_BS_#3_SC_#12_Equity 14 MAIN_1_BS_#4_SU_#1_Cash 13913 MAIN_1_BS_#4_SU_#2_RetainedEarnings 2537 MAIN_1_BS_#4_SU_#3_LongTermDebt 109187 MAIN_1_BS_#4_SU_#4_LiabilitiesNoncurrent 50063 MAIN_1_BS_#5_SetSum_#1_Assets/AssetsNoncurrent 1561 MAIN_1_BS_#5_SetSum_#2_Assets/AssetsCurrent 4 MAIN_1_BS_#5_SetSum_#3_Liabilities/LiabilitiesNoncurrent 1015 MAIN_1_BS_#5_SetSum_#4_Liabilities/LiabilitiesCurrent 38 MAIN_2_BS_#1_BR_#1_Assets<-AssetsNet 0 MAIN_2_BS_#1_BR_#2_Cash<-CashAndCashEquivalentsAtCarryingValue 0 MAIN_2_BS_#1_BR_#3_LiabilitiesAndEquity<-LiabilitiesAndStockholdersEquity 0 MAIN_2_BS_#1_BR_#4_RetainedEarnings<-RetainedEarningsAccumulatedDeficit 0 MAIN_2_BS_#2_EQ_#1_HolderEquity<-StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest 0 MAIN_2_BS_#2_EQ_#2_HolderEquity<-PartnersCapital 0 MAIN_2_BS_#2_EQ_#3_HolderEquity<-StockholdersEquity 0 MAIN_2_BS_#2_EQ_#4_TemporaryEquity 0 MAIN_2_BS_#2_EQ_#5_RedeemableEquity 0 MAIN_2_BS_#2_EQ_#6_Equity 0 MAIN_2_BS_#3_SC_#1_Assets 0 MAIN_2_BS_#3_SC_#2_AssetsCurrent 1 MAIN_2_BS_#3_SC_#3_AssetsNoncurrent 437 MAIN_2_BS_#3_SC_#4_Liabilities 938 MAIN_2_BS_#3_SC_#5_LiabilitiesCurrent 18373 MAIN_2_BS_#3_SC_#6_LiabilitiesNoncurrent 20969 MAIN_2_BS_#3_SC_#7_Assets 714 MAIN_2_BS_#3_SC_#8_Liabilities 7 MAIN_2_BS_#3_SC_#9_Equity 1309 MAIN_2_BS_#3_SC_#10_LiabilitiesAndEquity 821 MAIN_2_BS_#3_SC_#11_Liabilities 0 MAIN_2_BS_#3_SC_#12_Equity 4 MAIN_2_BS_#4_SU_#1_Cash 0 MAIN_2_BS_#4_SU_#2_RetainedEarnings 0 MAIN_2_BS_#4_SU_#3_LongTermDebt 0 MAIN_2_BS_#4_SU_#4_LiabilitiesNoncurrent 0 MAIN_2_BS_#5_SetSum_#1_Assets/AssetsNoncurrent 0 MAIN_2_BS_#5_SetSum_#2_Assets/AssetsCurrent 0 MAIN_2_BS_#5_SetSum_#3_Liabilities/LiabilitiesNoncurrent 0 MAIN_2_BS_#5_SetSum_#4_Liabilities/LiabilitiesCurrent 0 MAIN_3_BS_#1_BR_#1_Assets<-AssetsNet 0 MAIN_3_BS_#1_BR_#2_Cash<-CashAndCashEquivalentsAtCarryingValue 0 MAIN_3_BS_#1_BR_#3_LiabilitiesAndEquity<-LiabilitiesAndStockholdersEquity 0 MAIN_3_BS_#1_BR_#4_RetainedEarnings<-RetainedEarningsAccumulatedDeficit 0 MAIN_3_BS_#2_EQ_#1_HolderEquity<-StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest 0 MAIN_3_BS_#2_EQ_#2_HolderEquity<-PartnersCapital 0 MAIN_3_BS_#2_EQ_#3_HolderEquity<-StockholdersEquity 0 MAIN_3_BS_#2_EQ_#4_TemporaryEquity 0 MAIN_3_BS_#2_EQ_#5_RedeemableEquity 0 MAIN_3_BS_#2_EQ_#6_Equity 0 MAIN_3_BS_#3_SC_#1_Assets 0 MAIN_3_BS_#3_SC_#2_AssetsCurrent 0 MAIN_3_BS_#3_SC_#3_AssetsNoncurrent 0 MAIN_3_BS_#3_SC_#4_Liabilities 0 MAIN_3_BS_#3_SC_#5_LiabilitiesCurrent 0 MAIN_3_BS_#3_SC_#6_LiabilitiesNoncurrent 0 MAIN_3_BS_#3_SC_#7_Assets 4 MAIN_3_BS_#3_SC_#8_Liabilities 0 MAIN_3_BS_#3_SC_#9_Equity 0 MAIN_3_BS_#3_SC_#10_LiabilitiesAndEquity 0 MAIN_3_BS_#3_SC_#11_Liabilities 0 MAIN_3_BS_#3_SC_#12_Equity 0 MAIN_3_BS_#4_SU_#1_Cash 0 MAIN_3_BS_#4_SU_#2_RetainedEarnings 0 MAIN_3_BS_#4_SU_#3_LongTermDebt 0 MAIN_3_BS_#4_SU_#4_LiabilitiesNoncurrent 0 MAIN_3_BS_#5_SetSum_#1_Assets/AssetsNoncurrent 0 MAIN_3_BS_#5_SetSum_#2_Assets/AssetsCurrent 0 MAIN_3_BS_#5_SetSum_#3_Liabilities/LiabilitiesNoncurrent 0 MAIN_3_BS_#5_SetSum_#4_Liabilities/LiabilitiesCurrent 0 POST_BS_POST_#1_AssetsCurrent/AssetsNoncurrent 78505 POST_BS_POST_#2_LiabilitiesCurrent/LiabilitiesNoncurrent 59897 POST_BS_POST_#3_Assets/AssetsCurrent/AssetsNoncurrent 23 POST_BS_POST_#4_Liabilities/LiabilitiesCurrent/LiabilitiesNoncurrent 248 POST_BS_POST_#5_TemporaryEquity 334278 POST_BS_POST_#6_RedeemableEquity 336952 POST_BS_POST_#7_AdditionalPaidInCapital 173878 POST_BS_POST_#8_TreasuryStockValue 254898 Name: 1, dtype: int64
To be able to assess the content of applied_rules_sum_s
we need to understand the rules that are applied. The simplest way to do this is to print the description of them:
bs_standardizer_result_bag.process_description_df
part | type | ruleclass | identifier | description | |
---|---|---|---|---|---|
0 | PREPIVOT | Group | PREPIVOT_BS_PREPIV | ||
1 | PREPIVOT | Rule | PrePivotDeduplicate | PREPIVOT_BS_PREPIV_#1_DeDup | Deduplicates the dataframe based on the columns ['adsh', 'coreg', 'report', 'ddate', 'qtrs', 'tag', 'version', 'value'] |
2 | PRE | Group | PRE_BS_PRE | ||
3 | PRE | Rule | PreSumUpCorrection | PRE_BS_PRE_#1_Assets/AssetsNoncurrent | Swaps the values between the tag 'Assets' and 'AssetsNoncurrent' if the following equation is True "'AssetsNoncurrent' = 'Assets' + 'AssetsCurrent" and 'AssetsCurrent' > 0 |
4 | PRE | Rule | PreSumUpCorrection | PRE_BS_PRE_#2_Assets/AssetsCurrent | Swaps the values between the tag 'Assets' and 'AssetsCurrent' if the following equation is True "'AssetsCurrent' = 'Assets' + 'AssetsNoncurrent" and 'AssetsNoncurrent' > 0 |
5 | MAIN | Group | MAIN_BS | ||
6 | MAIN | Group | MAIN_BS_#1_BR | ||
7 | MAIN | Rule | CopyTagRule | MAIN_BS_#1_BR_#1_Assets<-AssetsNet | Copies the values from AssetsNet to Assets if AssetsNet is not null and Assets is nan |
8 | MAIN | Rule | CopyTagRule | MAIN_BS_#1_BR_#2_Cash<-CashAndCashEquivalentsAtCarryingValue | Copies the values from CashAndCashEquivalentsAtCarryingValue to Cash if CashAndCashEquivalentsAtCarryingValue is not null and Cash is nan |
9 | MAIN | Rule | CopyTagRule | MAIN_BS_#1_BR_#3_LiabilitiesAndEquity<-LiabilitiesAndStockholdersEquity | Copies the values from LiabilitiesAndStockholdersEquity to LiabilitiesAndEquity if LiabilitiesAndStockholdersEquity is not null and LiabilitiesAndEquity is nan |
10 | MAIN | Rule | CopyTagRule | MAIN_BS_#1_BR_#4_RetainedEarnings<-RetainedEarningsAccumulatedDeficit | Copies the values from RetainedEarningsAccumulatedDeficit to RetainedEarnings if RetainedEarningsAccumulatedDeficit is not null and RetainedEarnings is nan |
11 | MAIN | Group | MAIN_BS_#2_EQ | ||
12 | MAIN | Rule | CopyTagRule | MAIN_BS_#2_EQ_#1_HolderEquity<-StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest | Copies the values from StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest to HolderEquity if StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest is not null and HolderEquity is nan |
13 | MAIN | Rule | CopyTagRule | MAIN_BS_#2_EQ_#2_HolderEquity<-PartnersCapital | Copies the values from PartnersCapital to HolderEquity if PartnersCapital is not null and HolderEquity is nan |
14 | MAIN | Rule | CopyTagRule | MAIN_BS_#2_EQ_#3_HolderEquity<-StockholdersEquity | Copies the values from StockholdersEquity to HolderEquity if StockholdersEquity is not null and HolderEquity is nan |
15 | MAIN | Rule | SumUpRule | MAIN_BS_#2_EQ_#4_TemporaryEquity | Sums up the availalbe values in the columns ['TemporaryEquityAggregateAmountOfRedemptionRequirement', 'TemporaryEquityCarryingAmountAttributableToParent', 'TemporaryEquityRedemptionAmountAttributableToParent', 'TemporaryEquityRedemptionAmountAttributableToNoncontrollingInterest'] into the column 'TemporaryEquity', if the column 'TemporaryEquity' is nan |
16 | MAIN | Rule | SumUpRule | MAIN_BS_#2_EQ_#5_RedeemableEquity | Sums up the availalbe values in the columns ['RedeemableNoncontrollingInterestEquityCarryingAmount', 'RedeemableNoncontrollingInterestEquityRedemptionAmount', 'RedeemableNoncontrollingInterestEquityOtherCarryingAmount', 'RedeemableNoncontrollingInterestEquityOtherRedemptionAmount', 'RedeemablePreferredStockEquityOtherCarryingAmount', 'RedeemablePreferredStockEquityOtherRedemptionAmount'] into the column 'RedeemableEquity', if the column 'RedeemableEquity' is nan |
17 | MAIN | Rule | SumUpRule | MAIN_BS_#2_EQ_#6_Equity | Sums up the availalbe values in the columns ['HolderEquity', 'TemporaryEquity', 'RedeemableEquity'] into the column 'Equity', if the column 'Equity' is nan |
18 | MAIN | Group | MAIN_BS_#3_SC | ||
19 | MAIN | Rule | MissingSumRule | MAIN_BS_#3_SC_#1_Assets | Sums up the values in the columns ['AssetsCurrent', 'AssetsNoncurrent'] into the column 'Assets', if the column 'Assets' is nan and if all columns ['AssetsCurrent', 'AssetsNoncurrent'] have a value |
20 | MAIN | Rule | MissingSummandRule | MAIN_BS_#3_SC_#2_AssetsCurrent | Calculates the value for the missing column 'AssetsCurrent' by subtracting the values of the columns '['AssetsNoncurrent']' from the column 'Assets' if all of the columns ['Assets', 'AssetsNoncurrent'] are set. |
21 | MAIN | Rule | MissingSummandRule | MAIN_BS_#3_SC_#3_AssetsNoncurrent | Calculates the value for the missing column 'AssetsNoncurrent' by subtracting the values of the columns '['AssetsCurrent']' from the column 'Assets' if all of the columns ['Assets', 'AssetsCurrent'] are set. |
22 | MAIN | Rule | MissingSumRule | MAIN_BS_#3_SC_#4_Liabilities | Sums up the values in the columns ['LiabilitiesCurrent', 'LiabilitiesNoncurrent'] into the column 'Liabilities', if the column 'Liabilities' is nan and if all columns ['LiabilitiesCurrent', 'LiabilitiesNoncurrent'] have a value |
23 | MAIN | Rule | MissingSummandRule | MAIN_BS_#3_SC_#5_LiabilitiesCurrent | Calculates the value for the missing column 'LiabilitiesCurrent' by subtracting the values of the columns '['LiabilitiesNoncurrent']' from the column 'Liabilities' if all of the columns ['Liabilities', 'LiabilitiesNoncurrent'] are set. |
24 | MAIN | Rule | MissingSummandRule | MAIN_BS_#3_SC_#6_LiabilitiesNoncurrent | Calculates the value for the missing column 'LiabilitiesNoncurrent' by subtracting the values of the columns '['LiabilitiesCurrent']' from the column 'Liabilities' if all of the columns ['Liabilities', 'LiabilitiesCurrent'] are set. |
25 | MAIN | Rule | MissingSumRule | MAIN_BS_#3_SC_#7_Assets | Sums up the values in the columns ['Liabilities', 'Equity'] into the column 'Assets', if the column 'Assets' is nan and if all columns ['Liabilities', 'Equity'] have a value |
26 | MAIN | Rule | MissingSummandRule | MAIN_BS_#3_SC_#8_Liabilities | Calculates the value for the missing column 'Liabilities' by subtracting the values of the columns '['Equity']' from the column 'Assets' if all of the columns ['Assets', 'Equity'] are set. |
27 | MAIN | Rule | MissingSummandRule | MAIN_BS_#3_SC_#9_Equity | Calculates the value for the missing column 'Equity' by subtracting the values of the columns '['Liabilities']' from the column 'Assets' if all of the columns ['Assets', 'Liabilities'] are set. |
28 | MAIN | Rule | MissingSumRule | MAIN_BS_#3_SC_#10_LiabilitiesAndEquity | Sums up the values in the columns ['Liabilities', 'Equity'] into the column 'LiabilitiesAndEquity', if the column 'LiabilitiesAndEquity' is nan and if all columns ['Liabilities', 'Equity'] have a value |
29 | MAIN | Rule | MissingSummandRule | MAIN_BS_#3_SC_#11_Liabilities | Calculates the value for the missing column 'Liabilities' by subtracting the values of the columns '['Equity']' from the column 'LiabilitiesAndEquity' if all of the columns ['LiabilitiesAndEquity', 'Equity'] are set. |
30 | MAIN | Rule | MissingSummandRule | MAIN_BS_#3_SC_#12_Equity | Calculates the value for the missing column 'Equity' by subtracting the values of the columns '['Liabilities']' from the column 'LiabilitiesAndEquity' if all of the columns ['LiabilitiesAndEquity', 'Liabilities'] are set. |
31 | MAIN | Group | MAIN_BS_#4_SU | ||
32 | MAIN | Rule | SumUpRule | MAIN_BS_#4_SU_#1_Cash | Sums up the availalbe values in the columns ['CashAndCashEquivalentsAtFairValue', 'CashAndDueFromBanks', 'CashCashEquivalentsAndFederalFundsSold', 'RestrictedCashAndCashEquivalentsAtCarryingValue', 'CashAndCashEquivalentsInForeignCurrencyAtCarryingValue'] into the column 'Cash', if the column 'Cash' is nan |
33 | MAIN | Rule | SumUpRule | MAIN_BS_#4_SU_#2_RetainedEarnings | Sums up the availalbe values in the columns ['RetainedEarningsUnappropriated', 'RetainedEarningsAppropriated'] into the column 'RetainedEarnings', if the column 'RetainedEarnings' is nan |
34 | MAIN | Rule | SumUpRule | MAIN_BS_#4_SU_#3_LongTermDebt | Sums up the availalbe values in the columns ['LongTermDebtNoncurrent', 'LongTermDebtAndCapitalLeaseObligations'] into the column 'LongTermDebt', if the column 'LongTermDebt' is nan |
35 | MAIN | Rule | SumUpRule | MAIN_BS_#4_SU_#4_LiabilitiesNoncurrent | Sums up the availalbe values in the columns ['AccruedIncomeTaxesNoncurrent', 'DeferredAndPayableIncomeTaxes', 'DeferredIncomeTaxesAndOtherLiabilitiesNoncurrent', 'DeferredIncomeTaxLiabilitiesNet', 'DeferredTaxLiabilitiesNoncurrent', 'DefinedBenefitPensionPlanLiabilitiesNoncurrent', 'DerivativeLiabilitiesNoncurrent', 'FinanceLeaseLiabilityNoncurrent', 'LiabilitiesOtherThanLongtermDebtNoncurrent', 'LiabilitiesSubjectToCompromise', 'LiabilityForUncertainTaxPositionsNoncurrent', 'LongTermDebt', 'LongTermRetirementBenefitsAndOtherLiabilities', 'OperatingLeaseLiabilityNoncurrent', 'OtherLiabilitiesNoncurrent', 'OtherPostretirementDefinedBenefitPlanLiabilitiesNoncurrent', 'PensionAndOtherPostretirementDefinedBenefitPlansLiabilitiesNoncurrent', 'RegulatoryLiabilityNoncurrent', 'SelfInsuranceReserveNoncurrent'] into the column 'LiabilitiesNoncurrent', if the column 'LiabilitiesNoncurrent' is nan |
36 | MAIN | Group | MAIN_BS_#5_SetSum | ||
37 | MAIN | Rule | SetSumIfOnlyOneSummand | MAIN_BS_#5_SetSum_#1_Assets/AssetsNoncurrent | Copies the value of the column 'AssetsCurrent' into the column 'Assets' and sets the columns ['AssetsNoncurrent'] to 0.0 if the column 'AssetsCurrent is set and the columns ['Assets', 'AssetsNoncurrent'] are nan. |
38 | MAIN | Rule | SetSumIfOnlyOneSummand | MAIN_BS_#5_SetSum_#2_Assets/AssetsCurrent | Copies the value of the column 'AssetsNoncurrent' into the column 'Assets' and sets the columns ['AssetsCurrent'] to 0.0 if the column 'AssetsNoncurrent is set and the columns ['Assets', 'AssetsCurrent'] are nan. |
39 | MAIN | Rule | SetSumIfOnlyOneSummand | MAIN_BS_#5_SetSum_#3_Liabilities/LiabilitiesNoncurrent | Copies the value of the column 'LiabilitiesCurrent' into the column 'Liabilities' and sets the columns ['LiabilitiesNoncurrent'] to 0.0 if the column 'LiabilitiesCurrent is set and the columns ['Liabilities', 'LiabilitiesNoncurrent'] are nan. |
40 | MAIN | Rule | SetSumIfOnlyOneSummand | MAIN_BS_#5_SetSum_#4_Liabilities/LiabilitiesCurrent | Copies the value of the column 'LiabilitiesNoncurrent' into the column 'Liabilities' and sets the columns ['LiabilitiesCurrent'] to 0.0 if the column 'LiabilitiesNoncurrent is set and the columns ['Liabilities', 'LiabilitiesCurrent'] are nan. |
41 | POST | Group | POST_BS_POST | ||
42 | POST | Rule | PostCopyToFirstSummand | POST_BS_POST_#1_AssetsCurrent/AssetsNoncurrent | Copies the value of the 'Assets' to the first summand 'AssetsCurrent' and set the other summands ['AssetsNoncurrent'] to 0.0 if 'Assets is set and the summands ['AssetsCurrent', 'AssetsNoncurrent'] are nan. |
43 | POST | Rule | PostCopyToFirstSummand | POST_BS_POST_#2_LiabilitiesCurrent/LiabilitiesNoncurrent | Copies the value of the 'Liabilities' to the first summand 'LiabilitiesCurrent' and set the other summands ['LiabilitiesNoncurrent'] to 0.0 if 'Liabilities is set and the summands ['LiabilitiesCurrent', 'LiabilitiesNoncurrent'] are nan. |
44 | POST | Rule | PostSetToZero | POST_BS_POST_#3_Assets/AssetsCurrent/AssetsNoncurrent | Set the value of the ['Assets', 'AssetsCurrent', 'AssetsNoncurrent'] to 0.0 if all ['Assets', 'AssetsCurrent', 'AssetsNoncurrent'] are nan. |
45 | POST | Rule | PostSetToZero | POST_BS_POST_#4_Liabilities/LiabilitiesCurrent/LiabilitiesNoncurrent | Set the value of the ['Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent'] to 0.0 if all ['Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent'] are nan. |
46 | POST | Rule | PostSetToZero | POST_BS_POST_#5_TemporaryEquity | Set the value of the ['TemporaryEquity'] to 0.0 if all ['TemporaryEquity'] are nan. |
47 | POST | Rule | PostSetToZero | POST_BS_POST_#6_RedeemableEquity | Set the value of the ['RedeemableEquity'] to 0.0 if all ['RedeemableEquity'] are nan. |
48 | POST | Rule | PostSetToZero | POST_BS_POST_#7_AdditionalPaidInCapital | Set the value of the ['AdditionalPaidInCapital'] to 0.0 if all ['AdditionalPaidInCapital'] are nan. |
49 | POST | Rule | PostSetToZero | POST_BS_POST_#8_TreasuryStockValue | Set the value of the ['TreasuryStockValue'] to 0.0 if all ['TreasuryStockValue'] are nan. |
50 | VALID | Validation | SumValidationRule | AssetsCheck | Checks whether the sum of ['AssetsCurrent', 'AssetsNoncurrent'] equals the value in 'Assets' |
51 | VALID | Validation | SumValidationRule | LiabilitiesCheck | Checks whether the sum of ['LiabilitiesCurrent', 'LiabilitiesNoncurrent'] equals the value in 'Liabilities' |
52 | VALID | Validation | SumValidationRule | EquityCheck | Checks whether the sum of ['Equity', 'Liabilities'] equals the value in 'LiabilitiesAndEquity' |
53 | VALID | Validation | SumValidationRule | AssetsLiaEquCheck | Checks whether the sum of ['Equity', 'Liabilities'] equals the value in 'Assets' |
Let's discuss a few of the rules in detail:
appliedprepivot_rules_log
dataframeapplied_rules_sum_s
data, two thirds of the entries have only StockholdersEquity present, one quarter has StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest present, and a few thousands have PartnerCapital set.applied_rules_sum_s
many reports do not have an entry for LiabilitiesNoncurrent (in about 50% of the reports, it was possible to calculate it with rule MAIN_1_BS_#3_SC_#6_LiabilitiesNoncurrent, based on Liabilities and LiabilitiesCurrent and in about 15% of the cases it was possible to calculate it with these rules.It might be interesting to know how many rules are applied in general per report. In this example, we just look at the MAIN and PRE rules:
# just use a shorter variable name
df=bs_standardizer_result_bag.applied_rules_log_df
# we are just interested in the MAIN and PRE rules
filtered_columns = df.columns[df.columns.str.contains('MAIN|PRE')]
# count how many True values are in each row
df['count_true_values'] = df[filtered_columns].sum(axis='columns')
df.count_true_values.value_counts().sort_index()
0 70 1 47 2 1067 3 4827 4 13464 5 51296 6 49549 7 114253 8 87029 9 23416 10 1514 11 38 Name: count_true_values, dtype: int64
import matplotlib.pyplot as plt
plt.hist(df.count_true_values)
# Show the plot
plt.show()
If we analys a single report and want to know which rules were applied, we can do that with the following code:
apple_10k_2022 = "0000320193-22-000108"
apple_10k_2022_applied_rules_log_df = bs_standardizer_result_bag.applied_rules_log_df[bs_standardizer_result_bag.applied_rules_log_df.adsh==apple_10k_2022]
# filter for the applied MAIN,PRE, and POST rules
main_rule_cols = df.columns[df.columns.str.contains('MAIN|PRE|POST')]
main_rule_df = apple_10k_2022_applied_rules_log_df[main_rule_cols]
# get the applied rules, by using the True and False values of main_rule_df.iloc[0] as a mask on the columns index
main_rule_df.columns[main_rule_df.iloc[0]].tolist()
['MAIN_1_BS_#1_BR_#2_Cash<-CashAndCashEquivalentsAtCarryingValue', 'MAIN_1_BS_#1_BR_#3_LiabilitiesAndEquity<-LiabilitiesAndStockholdersEquity', 'MAIN_1_BS_#1_BR_#4_RetainedEarnings<-RetainedEarningsAccumulatedDeficit', 'MAIN_1_BS_#2_EQ_#3_HolderEquity<-StockholdersEquity', 'MAIN_1_BS_#2_EQ_#6_Equity', 'MAIN_1_BS_#4_SU_#3_LongTermDebt', 'POST_BS_POST_#5_TemporaryEquity', 'POST_BS_POST_#6_RedeemableEquity', 'POST_BS_POST_#7_AdditionalPaidInCapital', 'POST_BS_POST_#8_TreasuryStockValue']