#!/usr/bin/env python # coding: utf-8 # Open In Colab # In[1]: get_ipython().system('pip install wrds') # In[2]: import wrds # In[5]: # Now I am connecting to WRDS site - you must ahve a WRDS account user id and password conn=wrds.Connection() # In[6]: import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns plt.style.use('fivethirtyeight') get_ipython().run_line_magic('matplotlib', 'inline') get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'") # In[7]: help(wrds) # In[8]: #listing all dtabases or libraries in WRDS conn.list_libraries() # In[9]: #listing CRSP files conn.list_tables(library="crspa") # In[10]: #listing CRSP dsf file or table conn.describe_table('crsp', 'dsf') # In[11]: #listing CRSP dsf file or table conn.describe_table('crspm', 'dsfhdr') # In[ ]: #listing CRSP dsf file or table conn.describe_table('crsp', 'dsfhdr') # In[ ]: conn.describe_table('crsp', 'dse') # In[ ]: conn.describe_table('crspm', 'dsf') # In[ ]: conn.describe_table('crsp', 'stocknames') # In[ ]: conn.describe_table('crsp', 'dsenames') # In[ ]: conn.describe_table('crsp', 'crsp_header') # **CRSP** is a **library**, and the monthly stock file **CRSP.MSF** is a **table**. # In[13]: stock_names = conn.get_table(library = 'crsp', table ='stocknames', columns=['permno', 'comnam','ticker', 'siccd','shrcd', 'namedt','nameenddt']) stock_names.info() # In[17]: conn.list_tables(library="comp") # In[18]: conn.describe_table('comp', 'funda') # In[ ]: # In[20]: firm_names = conn.get_table(library = 'comp', table ='funda', columns=['gvkey', 'fyear','datadate', 'tic','conm']) firm_names.info() # In[22]: help(conn.raw_sql) # In[29]: crsp = conn.raw_sql(""" SELECT permno, date, cusip, permco, ret, vol, shrout, prc, cfacpr, cfacshr FROM crsp.msf WHERE date BETWEEN '01/01/2020' AND '12/31/2021' """, date_cols = ['date']) print(crsp.head(10)) print(crsp.info()) # In[33]: #checking data intergrity check = {"tickers": ("MSFT", "TSLA", "COP", "AAPL", "WMT")} checking = conn.raw_sql('select * from crsp.dse WHERE ticker in %(tickers)s', params=check) checking # In[34]: checking = conn.raw_sql('select * from crsp.stocknames WHERE ticker in %(tickers)s', params=check) checking # In[35]: checking = conn.raw_sql('select * from crsp.dsenames WHERE ticker in %(tickers)s', params=check) checking # In[36]: check = {'tickers': ('JNJ','COP')} check # In[37]: crsp = conn.raw_sql(""" select a.permno, a.date, a.ret, b.rf, b.mktrf, b.smb, b.hml from crsp.dsf as a left join ff.factors_daily as b on a.date=b.date where a.date > '01/01/2020' """) # In[38]: # this example shows that tickers are not a good screening id checking = conn.raw_sql('select a.permno, a.ticker, a.comnam, b.date, b.prc, b.ret, b.retx, b.cfacpr from crsp.stocknames a join crsp.dsf b on a.permno = b.permno WHERE a.ticker in %(tickers)s and a.st_date <= b.date and b.date <= a.end_date', params=check) checking # In[ ]: # # It looks that **COP** was assigned to two different companies, so I used `dsenames` file to find unique ticker using "tsymbol", instead of "ticker". # Now let's extract price data from CRSP using `sql` statement # In[39]: stocks = ['AAPL','TSLA','AMZN','FB', 'MSFT', 'WMT', 'HD', 'JNJ', 'JPM', 't'] # In[40]: # CRSP command takes tuples, not lists, so let's create tuples. stocks = {'tickers': ('AAPL','TSLA','AMZN','FB', 'MSFT', 'WMT', 'HD', 'JNJ', 'JPM', 'COP')} print(stocks) type(stocks) # In[41]: raw_data_from_crsp = conn.raw_sql('select a.permno, a.ticker, a.comnam, a.tsymbol, b.date, b.prc, b.ret, b.retx, b.cfacpr from crsp.dsenames a join crsp.dsf b on a.permno = b.permno WHERE a.tsymbol in %(tickers)s and a.namedt <= b.date and b.date <= a.nameendt', params=stocks) pd_data = pd.DataFrame(raw_data_from_crsp) pd_data.info() # In[42]: # note that the most recent date for which data is available is the March 31 2022 pd_data.tail() # In[43]: # convert date to datetime pd_data['date'] = pd.to_datetime(pd_data['date'], format='%Y-%m-%d') pd_data.info() # In[44]: # setting index pd_data_index= pd_data.set_index('date') pd_data_index.head() # In[45]: # important: you need to adjust prices for splits pd_data_index['adj_price'] = pd_data_index['prc'] / pd_data_index['cfacpr'] # In[46]: pd_data_index.info() # In[47]: #slicing ticker and price column pd_data_index_tic_prc = pd_data_index[['ticker', 'prc']] pd_data_index_tic_prc.info() # In[48]: b = pd_data_index_tic_prc['2016-01-01' <= pd_data_index_tic_prc.index] b.head() # In[49]: c = b[b.index <= '2022-03-30'] c.tail() # In[50]: # 564 obs for each stock times 10 = 5640 c.info() # In[51]: # I am transposing the table table = c.pivot(columns='ticker') # By specifying col[1] in below list comprehension # You can select the stock names under multi-level column table.columns = [col[1] for col in table.columns] table.head() # In[52]: table.describe() # In[53]: table.describe([.01,.1, .9, .99]) # Let's first look at how the price of each stock has evolved within give time frame. # In[ ]: plt.figure(figsize=(14, 7)) for c in table.columns.values: plt.plot(table.index, table[c], lw=3, alpha=0.8,label=c) plt.legend(loc='upper left', fontsize=12) plt.ylabel('price in $') # Another way to plot this is plotting daily returns (percent change compared to the day before). By plotting daily returns instead of actual prices, we can see the stocks' volatility. # In[ ]: returns = table.pct_change() plt.figure(figsize=(14, 7)) for c in returns.columns.values: plt.plot(returns.index, returns[c], lw=3, alpha=0.8,label=c) plt.legend(loc='upper right', fontsize=12) plt.ylabel('daily returns') # In[ ]: returns.describe() # ### Source: https://www.fredasongdrechsler.com/intro-to-python-for-fnce/sp500-constituents # Reference: Drechsler, Qingyi (Freda) S., 2022, Python Programs for Empirical Finance, https://www.fredasongdrechsler.com # ##### Extending her code, I modified ..... # In[ ]: sp500 = conn.raw_sql(""" select a.*, b.date, b.ret from crsp.msp500list as a, crsp.msf as b where a.permno=b.permno and b.date >= a.start and b.date<= a.ending and b.date>='01/01/2000' order by date; """, date_cols=['start', 'ending', 'date']) # In[ ]: # Add Other Descriptive Variables mse = conn.raw_sql(""" select comnam, ncusip, namedt, nameendt, permno, shrcd, exchcd, hsiccd, ticker from crsp.msenames """, date_cols=['namedt', 'nameendt']) # if nameendt is missing then set to today date mse['nameendt']=mse['nameendt'].fillna(pd.to_datetime('today')) # Merge with SP500 data sp500_full = pd.merge(sp500, mse, how = 'left', on = 'permno') # Impose the date range restrictions sp500_full = sp500_full.loc[(sp500_full.date>=sp500_full.namedt) \ & (sp500_full.date<=sp500_full.nameendt)] # In[ ]: # Linking with Compustat through CCM ccm=conn.raw_sql(""" select gvkey, liid as iid, lpermno as permno, linktype, linkprim, linkdt, linkenddt from crsp.ccmxpf_linktable where substr(linktype,1,1)='L' and (linkprim ='C' or linkprim='P') """, date_cols=['linkdt', 'linkenddt']) # if linkenddt is missing then set to today date ccm['linkenddt']=ccm['linkenddt'].fillna(pd.to_datetime('today')) # Merge the CCM data with S&P500 data # First just link by matching PERMNO sp500ccm = pd.merge(sp500_full, ccm, how='left', on=['permno']) # Then set link date bounds sp500ccm = sp500ccm.loc[(sp500ccm['date']>=sp500ccm['linkdt'])\ &(sp500ccm['date']<=sp500ccm['linkenddt'])] # Rearrange columns for final output sp500ccm = sp500ccm.drop(columns=['namedt', 'nameendt', \ 'linktype', 'linkprim', 'linkdt', 'linkenddt']) sp500ccm = sp500ccm[['date', 'permno', 'comnam', 'ncusip', 'shrcd', 'exchcd', 'hsiccd', 'ticker', \ 'gvkey', 'iid', 'start', 'ending', 'ret']] # In[ ]: sp500ccm # In[30]: conn.describe_table('crsp','msi') # In[32]: crsp_msi = conn.raw_sql(""" SELECT * FROM crsp.msi WHERE date BETWEEN '01/01/2020' AND '12/31/2021' """, date_cols = ['date']) crsp_msi # In[ ]: # In[ ]: # In[ ]: