This code is benefited from the publicly available code by QINGYI (FREDA) SONG DRECHSLER.
Reference: Drechsler, Qingyi (Freda) S., 2022, Python Programs for Empirical Finance, https://www.fredasongdrechsler.com
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
!pip install wrds
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting wrds Downloading wrds-3.1.1-py3-none-any.whl (12 kB) Requirement already satisfied: sqlalchemy in /usr/local/lib/python3.7/dist-packages (from wrds) (1.4.37) Collecting psycopg2-binary Downloading psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB) |████████████████████████████████| 3.0 MB 11.8 MB/s Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from wrds) (1.3.5) Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from wrds) (1.21.6) Collecting mock Downloading mock-4.0.3-py3-none-any.whl (28 kB) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->wrds) (2022.1) Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->wrds) (2.8.2) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->wrds) (1.15.0) Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from sqlalchemy->wrds) (4.11.4) Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.7/dist-packages (from sqlalchemy->wrds) (1.1.2) Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->sqlalchemy->wrds) (4.1.1) Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->sqlalchemy->wrds) (3.8.0) Installing collected packages: psycopg2-binary, mock, wrds Successfully installed mock-4.0.3 psycopg2-binary-2.9.3 wrds-3.1.1
import wrds
# Now I am connecting to WRDS site - you must have a WRDS account user id and password
conn=wrds.Connection()
help(wrds)
Help on package wrds: NAME wrds DESCRIPTION WRDS Python Data Access Library ============================== WRDS-Py is a library for extracting data from WRDS data sources and getting it into Pandas. >>> import wrds >>> db = wrds.Connection() >>> db.list_libraries() ['aha', 'aha_sample', 'ahasamp', 'audit', 'audit_audit_comp', ...] >>> db.list_tables(library='crsp') ['acti', 'asia', 'asib', 'asic', 'asio', 'asix', 'bmdebt', 'bmheader', ...] >>> data = db.raw_sql('SELECT * FROM crsp.stocknames', index_col='permno') >>> data.head() permco namedt nameenddt cusip ncusip ticker permno 10000.0 7952.0 1986-01-07 1987-06-11 68391610 68391610 OMFGA 10001.0 7953.0 1986-01-09 1993-11-21 36720410 39040610 GFGC 10001.0 7953.0 1993-11-22 2008-02-04 36720410 29274A10 EWST 10001.0 7953.0 2008-02-05 2009-08-03 36720410 29274A20 EWST 10001.0 7953.0 2009-08-04 2009-12-17 36720410 29269V10 EGAS ... PACKAGE CONTENTS sql test DATA __copyright__ = '2017-2021 Wharton Research Data Services' __title__ = 'wrds-py' VERSION 3.1.1 AUTHOR Wharton Research Data Services FILE /usr/local/lib/python3.7/dist-packages/wrds/__init__.py
#listing all dtabases or libraries in WRDS
conn.list_libraries()
['aha', 'aha_sample', 'ahasamp', 'audit', 'audit_audit_comp', 'audit_common', 'audit_corp_legal', 'auditsmp', 'auditsmp_all', 'bank', 'blab', 'block', 'block_all', 'boardex', 'boardex_trial', 'boardsmp', 'bvd', 'bvdsamp', 'calcbench_trial', 'calcbnch', 'cboe', 'cboe_all', 'centris', 'ciq', 'ciqsamp', 'ciqsamp_capstrct', 'ciqsamp_common', 'ciqsamp_keydev', 'ciqsamp_pplintel', 'ciqsamp_transcripts', 'cisdm', 'cisdmsmp', 'clrvt', 'clrvtsmp', 'columnar', 'comp', 'comp_bank', 'comp_bank_daily', 'comp_execucomp', 'comp_na_annual_all', 'comp_na_daily_all', 'comp_na_monthly_all', 'comp_segments_hist', 'comp_segments_hist_daily', 'compa', 'compb', 'compbd', 'compdcur', 'compg', 'compgd', 'comph', 'compm', 'compmcur', 'compnad', 'compsamp', 'compsamp_snapshot', 'compseg', 'compsegd', 'compsnap', 'comscore', 'contrib', 'contrib_ceo_turnover', 'contrib_char_returns', 'contrib_general', 'contrib_intangible_value', 'contrib_kpss', 'crsp', 'crsp_a_ccm', 'crsp_a_indexes', 'crsp_a_stock', 'crsp_q_indexhist', 'crspa', 'crspm', 'crspq', 'crspsamp', 'csmar', 'cusipm', 'custom_jl', 'dealscan', 'djones', 'djones_all', 'dmef', 'dmef_all', 'doe', 'doe_all', 'emdb', 'estimize', 'etfg', 'etfg_samp', 'etfgsamp', 'eureka', 'eurekahedge_sample', 'eursamp', 'eventus', 'evts', 'execcomp', 'factsamp_all', 'factsamp_revere', 'factset', 'ff', 'ff_all', 'fisd', 'fisdsamp', 'fjc', 'fjc_linking', 'fjc_litigation', 'frb', 'frb_all', 'fssamp', 'ftse', 'ftsesamp', 'ginsight', 'gmi', 'govpx', 'govpxsmp', 'hbase', 'hbase_sample', 'hbsamp', 'hfr', 'hfrsamp', 'hfrsamp_hfrdb', 'ibes', 'ibescorp', 'ibeskpi', 'ifgr', 'ifgrsamp', 'ims', 'ims_obp_trial', 'imssamp', 'infogroupsamp_business', 'infogroupsamp_residential', 'iri', 'iss', 'issm', 'kld', 'ktmine', 'ktmine_patents_samp', 'ktsamp', 'levin', 'lspd', 'lspdsamp', 'lvnsamp', 'lvnsamp_all', 'macrofin', 'macrofin_comm_trade', 'markit', 'mfl', 'midas', 'mrktsamp', 'mrktsamp_cds', 'mrktsamp_cdx', 'mrktsamp_msf', 'msfanly', 'msfinst', 'msrb', 'msrb_all', 'msrbsamp', 'nastraq', 'omtrial', 'optionm', 'optionmsamp_europe', 'optionmsamp_us', 'otc', 'otc_endofday', 'pacap', 'phlx', 'phlx_all', 'pitchbk', 'pitchsmp', 'ppublica', 'ppubsamp', 'ppubsamp_d4d', 'preqin', 'preqsamp', 'preqsamp_all', 'public', 'public_all', 'pwt', 'pwt_all', 'ravenpack_trial', 'rent', 'reprisk', 'reprisk_sample', 'repsamp', 'revere', 'risk', 'risksamp', 'risksamp_all', 'rpa', 'rpna', 'rpnasamp', 'rq_all', 'sdc', 'sdcsamp', 'secsamp', 'secsamp_all', 'snapsamp', 'snl', 'snlsamp', 'snlsamp_fig', 'sprat', 'sustain', 'sustainalyticssamp_all', 'sustsamp', 'taq', 'taqmsamp', 'taqmsamp_all', 'taqmsec', 'taqsamp', 'tass', 'tfn', 'totalq', 'totalq_all', 'toyo', 'toyosamp', 'toyosamp_all', 'tr_dealscan', 'tr_ds', 'trace', 'trace_enhanced', 'trace_standard', 'trcstsmp', 'trdssamp', 'trdstrm', 'tresg', 'tresgsmp', 'trown', 'trsamp', 'trsamp_ds_eq', 'trsamp_dscom', 'trsamp_dsecon', 'trsamp_dsfut', 'trsdcgs', 'trucost', 'trws', 'twoiq', 'twoiq_samp', 'twoiqsmp', 'wappsamp', 'wcai', 'wind', 'windsmp', 'wqa', 'wrds_lib_internal', 'wrdsapps', 'wrdsapps_backtest_basic', 'wrdsapps_backtest_plus', 'wrdsapps_eushort', 'wrdsapps_evtstudy_int_ginsight', 'wrdsapps_evtstudy_lr', 'wrdsapps_finratio', 'wrdsapps_finratio_ccm', 'wrdsapps_link_comp_eushort', 'wrdsapps_link_crsp_bond', 'wrdsapps_link_crsp_factset', 'wrdsapps_link_crsp_taq', 'wrdsapps_link_datastream_wscope', 'wrdsapps_link_supplychain', 'wrdsapps_patents', 'wrdsapps_subsidiary', 'wrdsappssamp_all', 'wrdsrpts', 'wrdsrpts_rep_usage', 'wrdssec', 'wrdssec_midas', 'zacks', 'zacksamp', 'zacksamp_all']
help(conn.list_libraries)
Help on method list_libraries in module wrds.sql: list_libraries() method of wrds.sql.Connection instance Return all the libraries (schemas) the user can access. :rtype: list Usage:: >>> db.list_libraries() ['aha', 'audit', 'block', 'boardex', ...]
help(conn.list_tables)
Help on method list_tables in module wrds.sql: list_tables(library) method of wrds.sql.Connection instance Returns a list of all the views/tables/foreign tables within a schema. :param library: Postgres schema name. :rtype: list Usage:: >>> db.list_tables('wrdssec') ['wciklink_gvkey', 'dforms', 'wciklink_cusip', 'wrds_forms', ...]
conn.list_tables(library='comp')
['aco_amda', 'aco_imda', 'aco_indfnta', 'aco_indfntq', 'aco_indfntytd', 'aco_indsta', 'aco_indstq', 'aco_indstytd', 'aco_notesa', 'aco_notesq', 'aco_notessa', 'aco_notesytd', 'aco_pnfnda', 'aco_pnfndq', 'aco_pnfndytd', 'aco_pnfnta', 'aco_pnfntq', 'aco_pnfntytd', 'aco_transa', 'aco_transq', 'aco_transsa', 'aco_transytd', 'adsprate', 'asec_amda', 'asec_imda', 'asec_notesa', 'asec_notesq', 'asec_transa', 'asec_transq', 'bank_aacctchg', 'bank_adesind', 'bank_afnd1', 'bank_afnd2', 'bank_afnddc1', 'bank_afnddc2', 'bank_afntind', 'bank_funda', 'bank_funda_fncd', 'bank_fundq', 'bank_fundq_fncd', 'bank_iacctchg', 'bank_idesind', 'bank_ifndq', 'bank_ifndytd', 'bank_ifntq', 'bank_ifntytd', 'bank_names', 'bank_namesq', 'chars', 'co_aacctchg', 'co_aaudit', 'co_adesind', 'co_adjfact', 'co_afnd1', 'co_afnd2', 'co_afnddc1', 'co_afnddc2', 'co_afntind1', 'co_afntind2', 'co_ainvval', 'co_amkt', 'co_busdescl', 'co_cotype', 'co_filedate', 'co_fortune', 'co_hgic', 'co_iacctchg', 'co_iaudit', 'co_idesind', 'co_ifndq', 'co_ifndsa', 'co_ifndytd', 'co_ifntq', 'co_ifntsa', 'co_ifntytd', 'co_imkt', 'co_industry', 'co_ipcd', 'co_mthly', 'co_offtitl', 'company', 'currency', 'dd_group', 'dd_group_xref', 'dd_item', 'dd_package', 'ecind_desc', 'ecind_mth', 'exrt_dly', 'exrt_mth', 'filings', 'funda', 'funda_fncd', 'fundq', 'fundq_fncd', 'g_chars', 'g_co_aaudit', 'g_co_adesind', 'g_co_afnd1', 'g_co_afnd2', 'g_co_afnddc1', 'g_co_afnddc2', 'g_co_afntind1', 'g_co_afntind2', 'g_co_ainvval', 'g_co_gsuppl', 'g_co_hgic', 'g_co_iaudit', 'g_co_idesind', 'g_co_ifndq', 'g_co_ifndsa', 'g_co_ifndytd', 'g_co_ifntq', 'g_co_ifntsa', 'g_co_ifntytd', 'g_co_industry', 'g_co_ipcd', 'g_co_offtitl', 'g_company', 'g_currency', 'g_ecind_desc', 'g_ecind_mth', 'g_exrt_dly', 'g_exrt_mth', 'g_funda', 'g_funda_fncd', 'g_fundq', 'g_fundq_fncd', 'g_idx_daily', 'g_idx_index', 'g_idx_mth', 'g_idxcst_his', 'g_names', 'g_names_ix', 'g_names_ix_cst', 'g_namesq', 'g_sec_adesind', 'g_sec_adjfact', 'g_sec_afnd', 'g_sec_afnddc', 'g_sec_afnt', 'g_sec_divid', 'g_sec_dprc', 'g_sec_dtrt', 'g_sec_history', 'g_sec_idesind', 'g_sec_ifnd', 'g_sec_ifnt', 'g_sec_split', 'g_secd', 'g_secnamesd', 'g_security', 'g_sedolgvkey', 'g_tmptable_pkg6153_tbl4023', 'idx_ann', 'idx_anndes', 'idx_daily', 'idx_index', 'idx_mth', 'idx_qrt', 'idx_qrtdes', 'idxcst_his', 'io_qaggregate', 'io_qbuysell', 'io_qchanges', 'io_qfloatadj', 'io_qholders', 'it_mbuysell', 'it_msummary', 'it_r_rltn', 'names', 'names_aco_indsta', 'names_aco_indstq', 'names_aco_pnfnda', 'names_aco_pnfndq', 'names_adsprate', 'names_ix', 'names_ix_cst', 'names_seg', 'namesd', 'namesm', 'namesq', 'r_accstd', 'r_acqmeth', 'r_auditors', 'r_auopic', 'r_balpres', 'r_cf_formt', 'r_co_status', 'r_coindpre', 'r_compstat', 'r_consol', 'r_country', 'r_cstclscd', 'r_datacode', 'r_datafmt', 'r_divtaxmarker', 'r_docsrce', 'r_ex_codes', 'r_exchgtier', 'r_exrt_typ', 'r_fndfntcd', 'r_footnts', 'r_foricd', 'r_giccd', 'r_hcalendr', 'r_idxclscd', 'r_inactvcd', 'r_incstats', 'r_indfmt', 'r_indsec', 'r_invval', 'r_issuetyp', 'r_majidxcl', 'r_mic_codes', 'r_naiccd', 'r_notetype', 'r_ntsubtype', 'r_offcrso', 'r_ogmethod', 'r_opinions', 'r_prc_stat', 'r_qsrcdoc', 'r_sec_stat', 'r_secannfn', 'r_sectors', 'r_siccd', 'r_spiicd', 'r_spmicd', 'r_statalrt', 'r_states', 'r_stko', 'r_titles', 'r_updates', 'sec_adesind', 'sec_adjfact', 'sec_afnd', 'sec_afnddc', 'sec_afnt', 'sec_divid', 'sec_dprc', 'sec_dtrt', 'sec_history', 'sec_idesind', 'sec_ifnd', 'sec_ifnt', 'sec_mdivfn', 'sec_mshare', 'sec_msptfn', 'sec_mth', 'sec_mthdiv', 'sec_mthprc', 'sec_mthspt', 'sec_mthtrt', 'sec_shortint', 'sec_spind', 'sec_split', 'secd', 'secm', 'security', 'sedolgvkey', 'seg_ann', 'seg_annfund', 'seg_customer', 'seg_geo', 'seg_naics', 'seg_product', 'seg_type', 'spidx_cst', 'spind', 'spind_dly', 'spind_mth', 'tmptable_pkg6153_tbl4023', 'wrds_seg_customer', 'wrds_seg_geo', 'wrds_seg_product', 'wrds_segmerged', 'xfl_column', 'xfl_table']
help(conn.describe_table)
Help on method describe_table in module wrds.sql: describe_table(library, table) method of wrds.sql.Connection instance Takes the library and the table and describes all the columns in that table. Includes Column Name, Column Type, Nullable?. :param library: Postgres schema name. :param table: Postgres table name. :rtype: pandas.DataFrame Usage:: >>> db.describe_table('wrdssec_all', 'dforms') name nullable type 0 cik True VARCHAR 1 fdate True DATE 2 secdate True DATE 3 form True VARCHAR 4 coname True VARCHAR 5 fname True VARCHAR
Note that there are 948 columns or Compustat variables and there are 858066 firm-years. However, we will see that there are two lines for each fimr-year in some cases, so we need to do mroe investigation.
compustat_variables=conn.describe_table('comp', 'funda')
print(45 * "-")
print(type(compustat_variables))
print(45 * "-")
print(compustat_variables)
print(45 * "-")
print(compustat_variables.shape)
print(45 * "-")
print(compustat_variables.info())
print(45 * "-")
compustat_variables.head(20)
Approximately 858066 rows in comp.funda. --------------------------------------------- <class 'pandas.core.frame.DataFrame'> --------------------------------------------- name nullable type 0 gvkey True VARCHAR(6) 1 datadate True DATE 2 fyear True DOUBLE_PRECISION 3 indfmt True VARCHAR(12) 4 consol True VARCHAR(2) .. ... ... ... 943 au True VARCHAR(8) 944 auop True VARCHAR(8) 945 auopic True VARCHAR(1) 946 ceoso True VARCHAR(1) 947 cfoso True VARCHAR(1) [948 rows x 3 columns] --------------------------------------------- (948, 3) --------------------------------------------- <class 'pandas.core.frame.DataFrame'> RangeIndex: 948 entries, 0 to 947 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 948 non-null object 1 nullable 948 non-null bool 2 type 948 non-null object dtypes: bool(1), object(2) memory usage: 15.9+ KB None ---------------------------------------------
name | nullable | type | |
---|---|---|---|
0 | gvkey | True | VARCHAR(6) |
1 | datadate | True | DATE |
2 | fyear | True | DOUBLE_PRECISION |
3 | indfmt | True | VARCHAR(12) |
4 | consol | True | VARCHAR(2) |
5 | popsrc | True | VARCHAR(1) |
6 | datafmt | True | VARCHAR(12) |
7 | tic | True | VARCHAR(8) |
8 | cusip | True | VARCHAR(9) |
9 | conm | True | VARCHAR(70) |
10 | acctchg | True | VARCHAR(8) |
11 | acctstd | True | VARCHAR(8) |
12 | acqmeth | True | VARCHAR(2) |
13 | adrr | True | DOUBLE_PRECISION |
14 | ajex | True | DOUBLE_PRECISION |
15 | ajp | True | DOUBLE_PRECISION |
16 | bspr | True | VARCHAR(8) |
17 | compst | True | VARCHAR(8) |
18 | curcd | True | VARCHAR(3) |
19 | curncd | True | VARCHAR(3) |
help(conn.get_row_count)
Help on method get_row_count in module wrds.sql: get_row_count(library, table) method of wrds.sql.Connection instance Uses the library and table to get the approximate row count for the table. :param library: Postgres schema name. :param table: Postgres table name. :rtype: int Usage:: >>> db.get_row_count('wrdssec', 'dforms') 16378400
conn.get_row_count('comp', 'funda')
858066
help(conn.get_table)
Help on method get_table in module wrds.sql: get_table(library, table, obs=-1, offset=0, columns=None, coerce_float=None, index_col=None, date_cols=None) method of wrds.sql.Connection instance Creates a data frame from an entire table in the database. :param sql: SQL code in string object. :param library: Postgres schema name. :param obs: (optional) int, default: -1 Specifies the number of observations to pull from the table. An integer less than 0 will return the entire table. :param offset: (optional) int, default: 0 Specifies the starting point for the query. An offset of 0 will start selecting from the beginning. :param columns: (optional) list or tuple, default: None Specifies the columns to be included in the output data frame. :param coerce_float: (optional) boolean, default: True Attempt to convert values to non-string, non-numeric objects to floating point. Can result in loss of precision. :param date_cols: (optional) list or dict, default: None - List of column names to parse as date - Dict of ``{column_name: format string}`` where format string is strftime compatible in case of parsing string times or is one of (D, s, ns, ms, us) in case of parsing integer timestamps - Dict of ``{column_name: arg dict}``, where the arg dict corresponds to the keyword arguments of :func:`pandas.to_datetime` :param index_col: (optional) string or list of strings, default: None Column(s) to set as index(MultiIndex) :rtype: pandas.DataFrame Usage :: >>> data = db.get_table('wrdssec_all', 'dforms', obs=1000, columns=['cik', 'fdate', 'coname']) >>> data.head() cik fdate coname 0000000003 1995-02-15 DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW Y... 0000000003 1996-02-14 DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW Y... 0000000003 1997-02-19 DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW Y... 0000000003 1998-03-02 DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW Y... 0000000003 1998-03-10 DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW Y.. ...
conn.get_table(library='comp', table='funda',
columns = ['conm', 'gvkey', 'cusip','cik', 'fyear', 'consol', 'tic', 'at', 'ni', 'prcc_f'], obs=20)
conm | gvkey | cusip | cik | fyear | consol | tic | at | ni | prcc_f | |
---|---|---|---|---|---|---|---|---|---|---|
0 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1961.0 | C | AE.2 | NaN | NaN | NaN |
1 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1962.0 | C | AE.2 | NaN | NaN | NaN |
2 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1963.0 | C | AE.2 | NaN | 0.003 | NaN |
3 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1964.0 | C | AE.2 | 1.416 | 0.052 | NaN |
4 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1965.0 | C | AE.2 | 2.310 | -0.197 | NaN |
5 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1966.0 | C | AE.2 | 2.430 | 0.164 | NaN |
6 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1967.0 | C | AE.2 | 2.456 | -0.090 | NaN |
7 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1968.0 | C | AE.2 | 5.922 | 0.463 | NaN |
8 | A & M FOOD SERVICES INC | 001001 | 000165100 | 0000723576 | 1982.0 | C | AMFD. | 8.590 | 0.965 | NaN |
9 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1969.0 | C | AE.2 | 28.712 | 1.766 | NaN |
10 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1970.0 | C | AE.2 | 33.450 | 0.558 | 10.000 |
11 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1971.0 | C | AE.2 | 29.330 | -2.318 | 5.750 |
12 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1971.0 | C | AE.2 | NaN | -2.317 | 5.750 |
13 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1972.0 | C | AE.2 | 19.907 | 2.225 | 5.125 |
14 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1972.0 | C | AE.2 | NaN | 2.225 | 5.125 |
15 | AAI CORP | 001002 | 000352104 | 0001306124 | 1960.0 | C | AAIC.1 | NaN | NaN | NaN |
16 | AGOIL INC-OLD | 001188 | 008484107 | None | 1978.0 | C | AGOLC | NaN | NaN | NaN |
17 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1973.0 | C | AE.2 | 21.771 | 1.863 | 1.750 |
18 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1973.0 | C | AE.2 | NaN | 1.863 | 1.750 |
19 | A & E PLASTIK PAK INC | 001000 | 000032102 | None | 1974.0 | C | AE.2 | 25.638 | 2.240 | 2.125 |
I googled and found this. https://www.wiwi.uni-muenster.de/uf/sites/uf/files/2017_10_12_wrds_data_items.pdf
Also TSLA were ounded and incorporated on July 1, 2003, and issued IPOs on Tesla went on to launch its first initial public offering (IPO) on NASDAQ on June 29, 2010. They issued 13.3 million shares of common stock for the public at a price of $17.00 per share.
Note: I am not sure why there are two lines for each year. Need to do more investigation.
subset = conn.get_table(library='comp', table='funda',
columns = ['conm', 'gvkey', 'cusip','cik', 'fyear', 'consol', 'tic', 'at', 'ni', 'prcc_f', 'che', 'ebit'])
subset2=subset[subset["tic"] == 'TSLA']
subset2
conm | gvkey | cusip | cik | fyear | consol | tic | at | ni | prcc_f | che | ebit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
330222 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2005.0 | C | TSLA | NaN | NaN | NaN | NaN | NaN |
330223 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2006.0 | C | TSLA | NaN | NaN | NaN | NaN | NaN |
330224 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2007.0 | C | TSLA | NaN | NaN | NaN | NaN | NaN |
333193 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2008.0 | C | TSLA | 51.699 | -82.782 | NaN | 9.277 | -78.504 |
333194 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2008.0 | C | TSLA | NaN | NaN | NaN | NaN | NaN |
333195 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2009.0 | C | TSLA | 130.424 | -55.740 | NaN | 69.627 | -51.897 |
333196 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2009.0 | C | TSLA | NaN | NaN | NaN | NaN | NaN |
333198 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2010.0 | C | TSLA | 386.082 | -154.328 | 26.630 | 173.155 | -146.838 |
333199 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2010.0 | C | TSLA | NaN | NaN | 26.630 | NaN | NaN |
333200 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2011.0 | C | TSLA | 713.448 | -254.411 | 28.560 | 303.803 | -251.488 |
333201 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2011.0 | C | TSLA | NaN | NaN | 28.560 | NaN | NaN |
333205 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2012.0 | C | TSLA | 1114.190 | -396.213 | 33.870 | 220.984 | -394.283 |
333206 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2012.0 | C | TSLA | 1114.190 | -396.213 | 33.870 | NaN | NaN |
333207 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2013.0 | C | TSLA | 2416.930 | -74.014 | 150.429 | 848.901 | -61.283 |
333208 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2013.0 | C | TSLA | 2411.816 | -74.014 | 150.429 | NaN | NaN |
333211 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2014.0 | C | TSLA | 5849.251 | -294.040 | 222.410 | 1923.660 | -186.689 |
333212 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2014.0 | C | TSLA | 5830.667 | -294.040 | 222.410 | NaN | NaN |
333213 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2015.0 | C | TSLA | 8092.460 | -888.663 | 240.010 | 1219.536 | -716.629 |
333214 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2015.0 | C | TSLA | 8067.939 | -888.663 | 240.010 | NaN | NaN |
333217 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2016.0 | C | TSLA | 22664.076 | -674.914 | 213.690 | 3498.735 | -645.640 |
333218 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2016.0 | C | TSLA | 22664.076 | -674.914 | 213.690 | NaN | NaN |
333219 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2017.0 | C | TSLA | 28655.372 | -1961.400 | 311.350 | 3523.237 | -1565.086 |
333220 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2017.0 | C | TSLA | 28655.372 | -1961.400 | 311.350 | NaN | NaN |
333223 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2018.0 | C | TSLA | 29739.614 | -976.091 | 332.800 | 3878.169 | -252.840 |
333224 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2018.0 | C | TSLA | 29739.614 | -976.091 | 332.800 | NaN | NaN |
333225 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2019.0 | C | TSLA | 34309.000 | -862.000 | 418.330 | 6514.000 | 80.000 |
333226 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2019.0 | C | TSLA | 34309.000 | -862.000 | 418.330 | NaN | NaN |
333229 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2020.0 | C | TSLA | 52148.000 | 721.000 | 705.670 | 19622.000 | 2184.000 |
333230 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2020.0 | C | TSLA | 52148.000 | 721.000 | 705.670 | NaN | NaN |
333231 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2021.0 | C | TSLA | 62131.000 | 5519.000 | 1056.780 | 18052.000 | 6496.000 |
333232 | TESLA INC | 184996 | 88160R101 | 0001318605 | 2021.0 | C | TSLA | 62131.000 | 5519.000 | 1056.780 | NaN | NaN |
Let's look at comp/company
table.
print(conn.describe_table('comp', 'company'))
Approximately 51778 rows in comp.company. name nullable type 0 conm True VARCHAR(70) 1 gvkey True VARCHAR(6) 2 add1 True VARCHAR(65) 3 add2 True VARCHAR(65) 4 add3 True VARCHAR(65) 5 add4 True VARCHAR(65) 6 addzip True VARCHAR(20) 7 busdesc True VARCHAR(2000) 8 cik True VARCHAR(10) 9 city True VARCHAR(100) 10 conml True VARCHAR(100) 11 costat True VARCHAR(1) 12 county True VARCHAR(100) 13 dlrsn True VARCHAR(8) 14 ein True VARCHAR(10) 15 fax True VARCHAR(20) 16 fic True VARCHAR(3) 17 fyrc True DOUBLE_PRECISION 18 ggroup True VARCHAR(4) 19 gind True VARCHAR(6) 20 gsector True VARCHAR(2) 21 gsubind True VARCHAR(8) 22 idbflag True VARCHAR(1) 23 incorp True VARCHAR(8) 24 loc True VARCHAR(3) 25 naics True VARCHAR(6) 26 phone True VARCHAR(20) 27 prican True VARCHAR(8) 28 prirow True VARCHAR(8) 29 priusa True VARCHAR(8) 30 sic True VARCHAR(4) 31 spcindcd True DOUBLE_PRECISION 32 spcseccd True DOUBLE_PRECISION 33 spcsrc True VARCHAR(3) 34 state True VARCHAR(8) 35 stko True DOUBLE_PRECISION 36 weburl True VARCHAR(60) 37 dldte True DATE 38 ipodate True DATE
print(type(conn.get_table(library='comp', table='company',
columns = ['conm', 'gvkey', 'cik'], obs=10)))
print(45 * "-")
conn.get_table(library='comp', table='company',
columns = ['conm', 'gvkey', 'cik', 'busdesc', 'dlrsn', 'ipodate'], obs=10)
<class 'pandas.core.frame.DataFrame'> ---------------------------------------------
conm | gvkey | cik | busdesc | dlrsn | ipodate | |
---|---|---|---|---|---|---|
0 | A & E PLASTIK PAK INC | 001000 | None | None | 09 | None |
1 | A & M FOOD SERVICES INC | 001001 | 0000723576 | None | 01 | None |
2 | AAI CORP | 001002 | 0001306124 | AAI Corporation, together with its subsidiarie... | 01 | None |
3 | A.A. IMPORTING CO INC | 001003 | 0000730052 | A.A. Importing Company, Inc. designs, manufact... | 07 | None |
4 | AAR CORP | 001004 | 0000001750 | AAR Corp. provides products and services to co... | None | 1988-01-01 |
5 | A.B.A. INDUSTRIES INC | 001005 | None | A.B.A. Industries Inc. was acquired by McSwain... | 01 | None |
6 | ABC INDS INC | 001006 | None | ABC Industries, Inc. manufactures and supplies... | 01 | None |
7 | ABKCO INDUSTRIES INC | 001007 | 0000001882 | ABKCO Music & Records, Inc. operates as an ent... | 10 | None |
8 | ABM COMPUTER SYSTEMS INC | 001008 | None | None | 04 | None |
9 | ABS INDUSTRIES INC | 001009 | 0000313368 | Makes cold and warm forgings, including transm... | 07 | None |
help(conn.raw_sql)
Help on method raw_sql in module wrds.sql: raw_sql(sql, coerce_float=True, date_cols=None, index_col=None, params=None, chunksize=500000, return_iter=False) method of wrds.sql.Connection instance Queries the database using a raw SQL string. :param sql: SQL code in string object. :param coerce_float: (optional) boolean, default: True Attempt to convert values to non-string, non-numeric objects to floating point. Can result in loss of precision. :param date_cols: (optional) list or dict, default: None - List of column names to parse as date - Dict of ``{column_name: format string}`` where format string is: strftime compatible in case of parsing string times or is one of (D, s, ns, ms, us) in case of parsing integer timestamps - Dict of ``{column_name: arg dict}``, where the arg dict corresponds to the keyword arguments of :func:`pandas.to_datetime` :param index_col: (optional) string or list of strings, default: None Column(s) to set as index(MultiIndex) :param params: parameters to SQL query, if parameterized. :param chunksize: (optional) integer or None default: 500000 Process query in chunks of this size. Smaller chunksizes can save a considerable amount of memory while query is being processed. Set to None run query w/o chunking. :param return_iter: (optional) boolean, default:False When chunksize is not None, return an iterator where chunksize number of rows is included in each chunk. :rtype: pandas.DataFrame or or Iterator[pandas.DataFrame] Usage :: # Basic Usage >>> data = db.raw_sql('select cik, fdate, coname from wrdssec_all.dforms;', date_cols=['fdate'], index_col='cik') >>> data.head() cik fdate coname 0000000003 1995-02-15 DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW Y... 0000000003 1996-02-14 DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW Y... 0000000003 1997-02-19 DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW Y... 0000000003 1998-03-02 DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW Y... 0000000003 1998-03-10 DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW Y.. ... # Parameterized SQL query >>> parm = {'syms': ('A', 'AA', 'AAPL'), 'num_shares': 50000} >>> data = db.raw_sql('select * from taqmsec.ctm_20030910 where sym_root in %(syms)s and size > %(num_shares)s', params=parm) >>> data.head() date time_m ex sym_root sym_suffix tr_scond size price tr_stopind tr_corr tr_seqnum tr_source tr_rf 2003-09-10 11:02:09.485000 T A None None 211400.0 25.350 N 00 1.929952e+15 C None 2003-09-10 11:04:29.508000 N A None None 55500.0 25.180 N 00 1.929952e+15 C None 2003-09-10 15:08:21.155000 N A None None 50500.0 24.470 N 00 1.929967e+15 C None 2003-09-10 16:10:35.522000 T A None B 71900.0 24.918 N 00 1.929970e+15 C None 2003-09-10 09:35:20.709000 N AA None None 108100.0 28.200 N 00 1.929947e+15 C None
apple = conn.raw_sql("""select permno, date, prc, ret, shrout
from crsp.msf
where permno = 14593
and date>='01/01/2020'""",
date_cols=['date'])
apple
permno | date | prc | ret | shrout | |
---|---|---|---|---|---|
0 | 14593.0 | 2020-01-31 | 309.510010 | 0.054010 | 4375480.0 |
1 | 14593.0 | 2020-02-28 | 273.359985 | -0.114310 | 4375480.0 |
2 | 14593.0 | 2020-03-31 | 254.289993 | -0.069761 | 4323987.0 |
3 | 14593.0 | 2020-04-30 | 293.799988 | 0.155374 | 4334335.0 |
4 | 14593.0 | 2020-05-29 | 317.940002 | 0.084956 | 4334335.0 |
5 | 14593.0 | 2020-06-30 | 364.799988 | 0.147386 | 4283939.0 |
6 | 14593.0 | 2020-07-31 | 425.040009 | 0.165132 | 4275634.0 |
7 | 14593.0 | 2020-08-31 | 129.039993 | 0.216309 | 17102536.0 |
8 | 14593.0 | 2020-09-30 | 115.809998 | -0.102526 | 16976763.0 |
9 | 14593.0 | 2020-10-30 | 108.860001 | -0.060012 | 17001802.0 |
10 | 14593.0 | 2020-11-30 | 119.050003 | 0.095490 | 17001802.0 |
11 | 14593.0 | 2020-12-31 | 132.690002 | 0.114574 | 16823263.0 |
12 | 14593.0 | 2021-01-29 | 131.960007 | -0.005502 | 16788096.0 |
13 | 14593.0 | 2021-02-26 | 121.260002 | -0.079532 | 16788096.0 |
14 | 14593.0 | 2021-03-31 | 122.150002 | 0.007340 | 16686305.0 |
15 | 14593.0 | 2021-04-30 | 131.460007 | 0.076218 | 16687631.0 |
16 | 14593.0 | 2021-05-28 | 124.610001 | -0.050434 | 16687631.0 |
17 | 14593.0 | 2021-06-30 | 136.960007 | 0.099109 | 16556942.0 |
18 | 14593.0 | 2021-07-30 | 145.860001 | 0.064982 | 16530166.0 |
19 | 14593.0 | 2021-08-31 | 151.830002 | 0.042438 | 16530166.0 |
20 | 14593.0 | 2021-09-30 | 141.500000 | -0.068037 | 16426786.0 |
21 | 14593.0 | 2021-10-29 | 149.800003 | 0.058657 | 16406397.0 |
22 | 14593.0 | 2021-11-30 | 165.300003 | 0.104940 | 16406397.0 |
23 | 14593.0 | 2021-12-31 | 177.570007 | 0.074229 | 16344923.0 |
24 | 14593.0 | 2022-01-31 | 174.779999 | -0.015712 | 16319441.0 |
25 | 14593.0 | 2022-02-28 | 165.119995 | -0.054011 | 16319441.0 |
26 | 14593.0 | 2022-03-31 | 174.610001 | 0.057473 | 16319441.0 |
apple_fund = conn.raw_sql("""select a.gvkey, a.iid, a.datadate, a.tic, a.conm,
a.at, b.prccm, b.cshoq
from comp.funda a
inner join comp.secm b
on a.gvkey = b.gvkey
and a.iid = b.iid
and a.datadate = b.datadate
where a.tic = 'AAPL'
and a.datadate>='01/01/1960'
and a.datafmt = 'STD'
and a.consol = 'C'
and a.indfmt = 'INDL'
""", date_cols=['datadate'])
print(apple_fund)
print(apple_fund.info())
gvkey iid datadate tic conm at prccm cshoq 0 001690 01 1981-09-30 AAPL APPLE INC 254.838 15.249996 55.309 1 001690 01 1982-09-30 AAPL APPLE INC 357.787 18.249985 57.123 2 001690 01 1983-09-30 AAPL APPLE INC 556.579 23.125000 59.198 3 001690 01 1984-09-30 AAPL APPLE INC 788.786 25.125000 60.535 4 001690 01 1985-09-30 AAPL APPLE INC 936.177 15.750000 61.850 5 001690 01 1986-09-30 AAPL APPLE INC 1160.128 33.500000 62.628 6 001690 01 1987-09-30 AAPL APPLE INC 1477.931 56.500000 126.088 7 001690 01 1988-09-30 AAPL APPLE INC 2082.086 43.250000 122.768 8 001690 01 1989-09-30 AAPL APPLE INC 2743.899 44.500000 126.270 9 001690 01 1990-09-30 AAPL APPLE INC 2975.707 29.000000 115.359 10 001690 01 1991-09-30 AAPL APPLE INC 3493.597 49.500000 118.386 11 001690 01 1992-09-30 AAPL APPLE INC 4223.693 45.125000 118.479 12 001690 01 1993-09-30 AAPL APPLE INC 5171.412 23.375000 116.147 13 001690 01 1994-09-30 AAPL APPLE INC 5302.746 33.686996 119.543 14 001690 01 1995-09-30 AAPL APPLE INC 6231.000 37.250000 122.922 15 001690 01 1996-09-30 AAPL APPLE INC 5364.000 22.186996 124.497 16 001690 01 1997-09-30 AAPL APPLE INC 4233.000 21.686996 127.949 17 001690 01 1998-09-30 AAPL APPLE INC 4289.000 38.125000 135.193 18 001690 01 1999-09-30 AAPL APPLE INC 5161.000 63.312500 160.799 19 001690 01 2000-09-30 AAPL APPLE INC 6803.000 25.750000 335.677 20 001690 01 2001-09-30 AAPL APPLE INC 6021.000 15.510000 350.922 21 001690 01 2002-09-30 AAPL APPLE INC 6298.000 14.500000 358.959 22 001690 01 2003-09-30 AAPL APPLE INC 6815.000 20.720000 366.727 23 001690 01 2004-09-30 AAPL APPLE INC 8050.000 38.750000 391.444 24 001690 01 2005-09-30 AAPL APPLE INC 11551.000 53.610000 835.019 25 001690 01 2006-09-30 AAPL APPLE INC 17205.000 76.980000 855.263 26 001690 01 2007-09-30 AAPL APPLE INC 25347.000 153.470000 872.329 27 001690 01 2008-09-30 AAPL APPLE INC 39572.000 113.660000 888.326 28 001690 01 2009-09-30 AAPL APPLE INC 47501.000 185.350000 899.806 29 001690 01 2010-09-30 AAPL APPLE INC 75183.000 283.750000 915.970 30 001690 01 2011-09-30 AAPL APPLE INC 116371.000 381.320000 929.277 31 001690 01 2012-09-30 AAPL APPLE INC 176064.000 667.105000 939.208 32 001690 01 2013-09-30 AAPL APPLE INC 207000.000 476.750000 899.213 33 001690 01 2014-09-30 AAPL APPLE INC 231839.000 100.750000 5866.161 34 001690 01 2015-09-30 AAPL APPLE INC 290479.000 110.300000 5578.753 35 001690 01 2016-09-30 AAPL APPLE INC 321686.000 113.050000 5336.166 36 001690 01 2017-09-30 AAPL APPLE INC 375319.000 154.120000 5126.201 37 001690 01 2018-09-30 AAPL APPLE INC 365725.000 225.740000 4754.986 38 001690 01 2019-09-30 AAPL APPLE INC 338516.000 223.970000 4443.236 39 001690 01 2020-09-30 AAPL APPLE INC 323888.000 115.810000 16976.763 40 001690 01 2021-09-30 AAPL APPLE INC 351002.000 141.500000 16426.786 <class 'pandas.core.frame.DataFrame'> RangeIndex: 41 entries, 0 to 40 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gvkey 41 non-null object 1 iid 41 non-null object 2 datadate 41 non-null datetime64[ns] 3 tic 41 non-null object 4 conm 41 non-null object 5 at 41 non-null float64 6 prccm 41 non-null float64 7 cshoq 41 non-null float64 dtypes: datetime64[ns](1), float64(3), object(4) memory usage: 2.7+ KB None
data_directory = '/content/drive/My Drive/Colab Notebooks/data/'
!pwd
/content
apple_fund.to_pickle(data_directory + "apple_fund.pkl")