import pandas as pd
import numpy as np
import os
data_dir='/Users/davecash/Data/IDEAS/sample'
xls_demo=os.path.join(data_dir,'GENFI_DEMOGRAPHICS_DF3_FINAL_BLINDED.xlsx')
df_demo=pd.read_excel(xls_demo)
xls_img=os.path.join(data_dir,'GENFI_IMAGING_DF3_FINAL_BLINDED.xlsx')
df_img=pd.read_excel(xls_img)
Join the data, get rid of unneeded variables and keep first visit that has both demographics and imaging
df_combined=pd.merge(df_img,df_demo,on=['Blinded Code','Visit'])
Categorize key variables and get rid of any rare mutations
df_combined['MutationType']=pd.Categorical(df_combined['Genetic Group'],categories=['C9orf72','GRN','MAPT'])
df_combined = df_combined.dropna(subset=['MutationType','Age at visit','DRC_QC','Scanner'])
Now it's time to get rid of some missing values. Start by assuming TIV constant over time and just keeping first value
df_combined['TIV'] = df_combined.groupby(['Blinded Code'])["TIV mm3"].fillna(method="ffill")
Remove a bunch of columns that we don't need.
df_combined = df_combined.drop(columns=['Date of scan','Blinded Site_x','TIV mm3','1 Orbitofrontal LEFT',
'2. DLPFC LEFT', '3. VMPFC LEFT', '4. Motor LEFT', '5. Opercular LEFT',
'6. FRP LEFT', '7. Medial Temp LEFT', '8. Lateral Temp LEFT',
'9. Temporal Pole LEFT', '10. Supra Temp LEFT', '11. Medial Parietal LEFT',
'12. Lateral Parietal LEFT', '13. Sensory LEFT', '14. Medial occ LEFT',
'15. Lateral Occ LEFT', '16. Anterior Cing LEFT', '17. Middle Cing LEFT',
'18. Posterior Cing LEFT', '19. Ant Insula LEFT', '20. Post Insula LEFT',
'1 Orbitofrontal RIGHT', '2. DLPFC RIGHT', '3. VMPFC RIGHT', '4. Motor RIGHT',
'5. Opercular RIGHT', '6. FRP RIGHT', '7. Medial Temp RIGHT',
'8. Lateral Temp RIGHT', '9. Temporal Pole RIGHT', '10. Supra Temp RIGHT',
'11. Medial Parietal RIGHT', '12. Lateral Parietal RIGHT', '13. Sensory RIGHT',
'14. Medial occ RIGHT', '15. Lateral Occ RIGHT', '16. Anterior Cing RIGHT',
'17. Middle Cing RIGHT', '18. Posterior Cing RIGHT', '19. Ant Insula RIGHT',
'20. Post Insula RIGHT','Genetic Group', 'Blinded Family', 'Date of assessment','Handedness', 'Employment', 'Ethnicity'])
Keep only images that passed QC for GIF
df_combined=df_combined[df_combined['QC_include in GIF']==1]
Keep only one timepoint
df_xsec=df_combined.drop_duplicates(subset='Blinded Code')
Find all of the scanners, sort and assign a new identifying number to use instead of the official GENFI one.
site_list=df_xsec['Blinded Site_y'].drop_duplicates()
site_list=site_list.sort_values(ignore_index=True)
site_list = site_list.reset_index(level=0)
site_list.set_index('Blinded Site_y',inplace=True)
df_xsec = pd.merge(df_xsec,site_list,how='left',left_on='Blinded Site_y',right_index=True)
Rename some markdown columns for easier use later.
df_xsec.rename(columns={'index':'Site',
'Gender':'Sex',
'Genetic status 2': 'Group',
'Age at visit': 'Age'},inplace=True)
df_xsec.columns = df_xsec.columns.str.replace(' ', '_')
Finally blind the blinded code and re-sort on the randomcode
new_id=1+np.arange(len(df_xsec.index))
np.random.shuffle(new_id)
df_xsec['New Code']=new_id
df_xsec['ID']=df_xsec['New Code'].map('GENFI{:03d}'.format)
df_xsec=df_xsec[['ID','Site','Scanner', 'Age', 'Sex',
'Education', 'EYO', 'Group', 'MutationType', 'TIV',
'Right_Accumbens_Area','Left_Accumbens_Area', 'Right_Amygdala',
'Left_Amygdala', 'Pons','Brain_Stem', 'Right_Caudate', 'Left_Caudate',
'Right_Hippocampus','Left_Hippocampus', 'Right_Pallidum', 'Left_Pallidum',
'Right_Putamen','Left_Putamen', 'Right_Thalamus_Proper', 'Left_Thalamus_Proper',
'Total_Brain', 'Frontal_lobe_volume', 'Temporal_lobe_volume',
'Parietal_lobe_volume', 'Occipital_lobe_volume', 'Cingulate_volume',
'Insula_volume', 'Left_Frontal_lobe_volume', 'Right_Frontal_lobe_volume',
'Left_Temporal_lobe_volume','Right_Temporal_lobe_volume', 'Left_Parietal_lobe_volume',
'Right_Parietal_lobe_volume', 'Left_Occipital_lobe_volume',
'Right_Occipital_lobe_volume', 'Left_Cingulate_volume', 'Right_Cingulate_volume',
'Left_Insula_volume', 'Right_Insula_volume','Total_Cerebellum']]
df_xsec=df_xsec.set_index('ID',drop=True)
df_xsec=df_xsec.sort_index()
df_xsec['TIV']=df_xsec['TIV']/1000
df_xsec=df_xsec.round(1)
df_xsec.to_excel(os.path.join(data_dir,'GENFI_DEMON_SPREADSHEET.xlsx'))