# Preliminaries
import pandas as pd
import numpy as np
import sys
# Remember the original stdout, then rebind sys.stdout to codebook.txt so
# that everything printed from here on is written to that file instead of
# the console.
old_stdout = sys.stdout
sys.stdout = open('/home/brian/Documents/econ_data/bd_CPS/codebook.txt', 'w')
import os
# Change into the project directory before importing bd_CPS_details
# (presumably so the local module can be found -- confirm), then into the
# directory from which the cps{year}.ft files are read below.
os.chdir('/home/brian/Documents/econ_data/bd_CPS/')
from bd_CPS_details import CodebookNotes, ValueLabels
os.chdir('/home/brian/Documents/CPS/data/clean/')
# Go through data and store requested details

# Variables whose name contains one of these substrings are always
# summarized by a min/max range, never by a list of their values.
RANGE_ONLY_NAME_PARTS = ('HHID2', 'OTC', 'PRICE')
# A variable with this many (or more) distinct values in a month is
# summarized by a min/max range for that month.
MAX_LISTED_VALUES = 100


def _is_listed(var, values_list):
    """Return True when the month's values of `var` should be enumerated."""
    return (len(values_list) < MAX_LISTED_VALUES
            and not any(part in var for part in RANGE_ONLY_NAME_PARTS))


def _widen_range(entry, values_list):
    """Extend entry['min_val'] / entry['max_val'] to cover values_list."""
    loc_max = max(values_list)
    loc_min = min(values_list)
    if entry['max_val'] is None or loc_max >= entry['max_val']:
        entry['max_val'] = loc_max
    if entry['min_val'] is None or loc_min <= entry['min_val']:
        entry['min_val'] = loc_min


# d maps each variable name to a dict with the keys:
#   'avail'   - first-of-month dates in which the variable has more than
#               one distinct non-missing value
#   'dtype'   - dtype names seen for the variable, in order of appearance
#   'breaks'  - dates at which a dtype not seen before first appears
#   'values'  - distinct values collected from months where the variable
#               is enumerated (see _is_listed)
#   'min_val' / 'max_val' - range collected from the remaining months
d = {}
for year in range(1989, 2023):
    data = pd.read_feather(f'cps{year}.ft')
    for month, df in data.groupby('MONTH'):
        date = pd.to_datetime(f'{year}-{month}-01')
        for var in list(df.keys()):
            column = df[var]
            values_list = list(column.dropna().unique())
            # A variable that is empty or constant in a month is not
            # counted as available for that month.
            if len(values_list) <= 1:
                continue
            dtype_name = column.dtype.name
            entry = d.get(var)
            if entry is None:
                # First month in which the variable appears.
                entry = d[var] = {
                    'breaks': [],
                    'max_val': None,
                    'min_val': None,
                    'avail': [date],
                    'dtype': [dtype_name],
                    'values': [],
                }
            else:
                entry['avail'].append(date)
                if dtype_name not in entry['dtype']:
                    # The dtype list is never empty for an existing entry,
                    # so a new dtype always marks a break.
                    entry['breaks'].append(date)
                    entry['dtype'].append(dtype_name)
            # The first month goes through the same path as every later
            # month, so its values also feed the range and the
            # name-based exclusions apply from the start.
            if _is_listed(var, values_list):
                known = entry['values']
                for value in values_list:
                    if value not in known:
                        known.append(value)
            else:
                _widen_range(entry, values_list)
# Record the last and first month (formatted YYYY-MM) in which each
# variable is available.
for details in d.values():
    months = sorted(details['avail'])
    details['date_max'] = months[-1].strftime('%Y-%m')
    details['date_min'] = months[0].strftime('%Y-%m')
# Print out the bd CPS codebook
print('\n========================\n\n bd CPS Codebook'
      '\n\n========================\n')
today = pd.to_datetime('today').strftime('%B %d, %Y')
print(f'updated: {today}\n\nvariables:\n')
for var_name, info in d.items():
    # Heading: the variable name, plus its descriptive name and notes when
    # the codebook notes have an entry for it.
    if key_has_notes := (var_name in CodebookNotes):
        print(f'{var_name} - {CodebookNotes[var_name]["Name"]}')
        print(f' Notes: {CodebookNotes[var_name]["Notes"]}')
    if not key_has_notes:
        print(f'{var_name}')
    print(f' Data types: {info["dtype"]}')
    print(f' Available from: {info["date_min"]} to: {info["date_max"]}')
    if info['breaks']:
        print(f' Breaks in dtype: {info["breaks"]}')
    print(' Value range: ')
    # Enumerated values take precedence; fall back to the min/max range
    # when no values were collected.
    listed = sorted(info['values'])
    if listed:
        print(listed)
    else:
        print(f'{info["min_val"]} to {info["max_val"]}')
    print('\n\n')
# Print the value labels: one section per labelled variable, each a
# heading followed by "code label" lines.
print('\n==============================\n\n Variable value labels'
      '\n\n==============================\n\n')
# (heading text, key into ValueLabels), in output order. The headings carry
# their own leading newlines, which set the spacing between sections.
label_sections = [
    ('2018 Occupation codes (OCC18):\n', 'OCC18'),
    ('\n\n2010 Occupation detailed recodes (OCC03D):\n', 'OCC03D'),
    ('\n\n2010 Occupation major recodes (OCC03M):\n', 'OCC03M'),
    ('\n\n\n2017 Industry codes (IND17):\n', 'IND17'),
    ('\n\n2012 Industry detailed recodes (IND03D):\n', 'IND03D'),
    ('\n\n2012 Industry major recodes (IND03M):\n', 'IND03M'),
    ('\n\nCore-based statistical areas (CBSA):\n', 'CBSA'),
    ('\n\nConsolidated statistical areas (CSA):\n', 'CSA'),
    ('\n\nCounties (COUNTY):\n', 'COUNTY'),
]
for heading, label_key in label_sections:
    print(heading)
    for key, value in ValueLabels[label_key].items():
        print(f'{key} {value}')

# Put the original stdout back and close the codebook file so its contents
# are flushed to disk and later output is no longer written into it.
codebook_file = sys.stdout
sys.stdout = old_stdout
codebook_file.close()

# To echo the finished codebook to the console, uncomment:
#codebook = open('/home/brian/Documents/econ_data/bd_CPS/codebook.txt', 'r').read()
#print(codebook)