%matplotlib inline
import datetime as dt
import itertools as it
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, File, Schema, Table, as_table_columns
from tqdm import tqdm
# show the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'
# create the Synapse client and log in (no credentials are passed explicitly here)
syn = synapseclient.Synapse()
syn.login()
# register tqdm's progress_apply on pandas objects (used later for the week computation)
tqdm.pandas()
def isnum(x):
    """Return True when ``x`` can be converted to a float, False otherwise.

    ``None`` is never numeric. ``float('nan')`` converts successfully, so NaN
    placeholders for missing cells count as numeric.

    Objects that ``float()`` rejects with a TypeError (lists, dicts, ...) are
    reported as non-numeric instead of letting the exception escape.
    """
    if x is None:
        return False
    try:
        float(x)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: unsupported object type
        return False
    return True
Welcome, Abhishek Pratap!
# Synapse ids of the raw "Why did you download this app?" exports
# for study 1 (V1) and study 2 (V2)
v1sid = 'syn10250489'
v2sid = 'syn17023091'

# fetch each entity and read the downloaded file
v1_path = syn.get(v1sid).path
v1r = pd.read_excel(v1_path)

# createdAt is parsed into datetimes while reading
v2_path = syn.get(v2sid).path
v2r = pd.read_csv(v2_path, parse_dates=['createdAt'])

# preview both raw frames
v1r.head()
v2r.head()
brightenid | start | week | user_id | sent_time_local | sent_time_utc | response_local | response_utc | response_id | Why did you download this app? | For "other", please type in box | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | BLUE-00048 | 2014-08-01 | 0 | 10431 | NaT | NaT | 2014-08-01 07:00:09 | 2014-08-01 11:00:09 | 166331 | for fun|for mental health reasons [e.g. depres... | NaN |
1 | BLUE-00049 | 2014-08-01 | 0 | 10470 | NaT | NaT | 2014-08-01 12:31:58 | 2014-08-01 16:31:58 | 166824 | for brain health [e.g. better memory]|to impro... | NaN |
2 | BLUE-00050 | 2014-08-09 | 0 | 10519 | NaT | NaT | 2014-08-08 19:31:06 | 2014-08-09 02:31:06 | 173157 | for management of daily problems|for brain hea... | NaN |
3 | BLUE-00050 | 2014-08-09 | 4 | 10519 | 2014-09-06 09:00:01 | 2014-09-06 16:00:01 | 2014-09-08 18:21:29 | 2014-09-09 01:21:29 | 198454 | for management of daily problems|for brain hea... | NaN |
4 | BLUE-00050 | 2014-08-09 | 12 | 10519 | 2014-11-01 08:00:05 | 2014-11-01 16:00:05 | 2014-11-02 09:16:22 | 2014-11-02 17:16:22 | 256858 | for mood [e.g. sadness]|for brain health [e.g.... | NaN |
username | Why did you download this app? | createdAt | |
---|---|---|---|
0 | EN05039 | Fun,My mental health,My mood,Managing daily is... | 2016-09-03 17:13:21 |
1 | EN05331 | My mental health,Brain health,Improve work | 2017-01-22 23:04:11 |
2 | EN00387 | My mental health,My mood,Brain health,Fun | 2016-10-28 08:37:06 |
3 | EN00322 | My mental health | 2016-09-07 21:32:32 |
4 | EN00478 | My mental health,Managing daily issues,My mood... | 2016-11-11 07:21:42 |
# drop the unneeded columns and switch to the shared column names
v1_unused_columns = [
    'week', 'user_id', 'sent_time_local', 'sent_time_utc', 'response_utc', 'response_id'
]
v1_column_names = {
    'brightenid': 'participant_id',
    'response_local': 'dt_response',
    'Why did you download this app?': 'apps',
    'For "other", please type in box': 'other_description',
}
v1 = v1r.drop(columns=v1_unused_columns).rename(columns=v1_column_names)

# convert to lowercase for the substring lookups below; anything isnum()
# accepts (e.g. a NaN for a missing answer) becomes the placeholder 'none'
v1['apps'] = v1['apps'].apply(lambda x: 'none' if isnum(x) else x.lower())

# one 0/1 indicator column per reason, set when its phrase occurs in the answer
v1_reason_phrases = (
    ('fun', 'fun'),
    ('mental_health', 'mental health'),
    ('mood', 'mood'),
    ('managing_daily_issues', 'management of daily '),
    ('improve_work', 'improve work'),
    ('brain_health', 'brain health'),
    ('improve_relationships', 'improve relationships'),
    ('other', 'other'),
)
for indicator, phrase in v1_reason_phrases:
    # bind phrase as a default so each lambda keeps its own value
    v1[indicator] = v1['apps'].apply(lambda x, phrase=phrase: int(phrase in x))

v1.head()
# print every distinct reason once; a V1 answer is a '|'-separated list
v1_reasons = pd.unique(
    list(it.chain.from_iterable(answer.split('|') for answer in v1.apps))
)
for reason in v1_reasons:
    print(reason)

# histogram of the lengths of the free-text "other" answers
# (entries isnum() accepts, such as NaN, are skipped)
plt.hist([len(text) for text in v1.other_description if not isnum(text)])

# 1-based number of days between the start date and the response
v1['day'] = [(row.dt_response - row.start).days + 1 for row in v1.itertuples()]
v1.day.hist()
# Lowercase substrings that mark a free-text answer as mentioning payment
# or some other incentive.
incentive_words = [
    'paid', 'pay', 'mone', 'compens', 'gift', 'incentive', '$', 'finan', 'incom', 'reimb', 'craigs', 'pd'
]


def is_incentive(s):
    """Return 1 if ``s`` mentions an incentive keyword, otherwise 0.

    Non-string input (for example the NaN of a missing answer) yields 0.
    The keywords are lowercase, so the text is lowercased before the
    substring search; without this, answers such as "Paid" or "Craigslist"
    would be missed.
    """
    if not isinstance(s, str):
        return 0
    text = s.lower()
    return int(any(word in text for word in incentive_words))
# flag the free-text answers that mention an incentive, then let pandas
# downcast the 0/1 flags to the smallest integer dtype that holds them
incentive_flags = v1.other_description.apply(is_incentive)
v1['happ_inc'] = pd.to_numeric(incentive_flags, downcast='integer')
# Lowercase substrings that mark a free-text answer as "downloaded because
# the study asked for it".
for_the_study_words = [
    'part', 'require', 'for study', 'told to', 'asked to', 'request', 'to be', 'to do', 'brighten', 'assignment', 'ucsf', 'for a', 'study'
]


def is_for_the_study(s):
    """Return 1 if ``s`` mentions a for-the-study keyword, otherwise 0.

    Non-string input (for example the NaN of a missing answer) yields 0.
    The keywords are lowercase, so the text is lowercased before the
    substring search; otherwise spellings such as "UCSF" or "Brighten"
    could never match the 'ucsf' and 'brighten' keywords.
    """
    if not isinstance(s, str):
        return 0
    text = s.lower()
    return int(any(word in text for word in for_the_study_words))
# flag the free-text answers saying the app was downloaded for the study
study_flags = v1.other_description.apply(is_for_the_study)
v1['happ_fts'] = study_flags

# 'start' and the derived 'day' column are not part of the released table
v1 = v1.drop(['start', 'day'], axis='columns')
# switch to the shared column names (nothing is dropped for V2)
v2_column_names = {
    'username': 'participant_id',
    'createdAt': 'dt_response',
    'Why did you download this app?': 'apps',
}
v2 = v2r.rename(columns=v2_column_names)

# convert to lowercase for the substring lookups below; anything isnum()
# accepts (e.g. a NaN for a missing answer) becomes the placeholder 'none'
v2['apps'] = v2['apps'].apply(lambda x: 'none' if isnum(x) else x.lower())

# one 0/1 indicator column per reason, set when its phrase occurs in the answer
v2_reason_phrases = (
    ('fun', 'fun'),
    ('mental_health', 'my mental health'),
    ('mood', 'my mood'),
    ('managing_daily_issues', 'managing daily issues'),
    ('improve_work', 'improve work'),
    ('brain_health', 'brain health'),
    ('improve_relationships', 'improve relationships'),
    ('other', 'other'),
)
for indicator, phrase in v2_reason_phrases:
    # bind phrase as a default so each lambda keeps its own value
    v2[indicator] = v2['apps'].apply(lambda x, phrase=phrase: int(phrase in x))

# NOTE(review): a comment here used to announce "add the study flag", but no
# such column is created for V2 - confirm whether a step is missing
v2.head()

# print every distinct reason once; a V2 answer is a ','-separated list
v2_reasons = pd.unique(
    list(it.chain.from_iterable(answer.split(',') for answer in v2.apps))
)
for reason in v2_reasons:
    print(reason)
# stack the two studies; V2 has no other_description / happ_inc / happ_fts
# columns, so those cells are NaN for the V2 rows
combined = pd.concat([v1, v2], sort=False)

# released column names and their order
happ_names = {
    'brain_health': 'happ_bh',
    'fun': 'happ_f',
    'improve_relationships': 'happ_ir',
    'improve_work': 'happ_iw',
    'managing_daily_issues': 'happ_mdi',
    'mental_health': 'happ_mh',
    'mood': 'happ_m',
    'other': 'happ_o',
    'other_description': 'happ_o_description',
}
happ_order = [
    'participant_id', 'dt_response',
    'happ_bh', 'happ_f', 'happ_fts', 'happ_inc', 'happ_ir', 'happ_iw', 'happ_m', 'happ_mdi', 'happ_mh', 'happ_o',
    'happ_o_description',
]
combined = combined.drop(columns=['apps']).rename(columns=happ_names)
combined = combined.loc[:, happ_order]

# NOTE(review): this fills every column, so rows without a free-text answer
# end up with 0 in happ_o_description as well - confirm that is intended
combined = combined.fillna(0)

# the NaN cells introduced by the concat turned happ_fts and happ_inc into
# floats; cast every happ_* indicator (but not the description text) to int
# so the flags share one dtype
for col in combined.columns:
    if 'app' in col and 'description' not in col:
        combined[col] = combined[col].astype(int)

combined.head()
# pull each participant's start date from the metadata table
metasid = 'syn27082597'
metadata_query = f'SELECT participant_id, startdate FROM {metasid}'
metadata = syn.tableQuery(metadata_query).asDataFrame(convert_to_datetime=True)
metadata.startdate = pd.to_datetime(metadata.startdate)

# add the participant's start date as a new column; the left join keeps
# every response row
combined = pd.merge(combined, metadata, on='participant_id', how='left')

# strip the time of day from each response, then express the time since the
# start date in weeks as a float
response_dates = combined.dt_response.apply(
    lambda x: dt.datetime(year=x.year, month=x.month, day=x.day)
)
elapsed = response_dates - combined.startdate
combined['week'] = [delta.days / 7 for delta in elapsed]

# floor to whole weeks and add 1 so the first seven days count as week 1
# NOTE(review): rows whose participant has no start date in the metadata are
# not special-cased here - confirm every participant_id has a match
combined.week = combined.week.progress_apply(lambda x: np.int16(np.floor(x)) + 1)

# the start date was only needed for the week number
combined = combined.drop(columns=['startdate'], errors='ignore')

# move 'week' from the last position to right after dt_response
cols = list(combined.columns)
cols = cols[:2] + ['week'] + cols[2:-1]
combined = combined.reindex(columns=cols)
combined.head()
# store every timestamp as a string carrying a UTC offset; tz_localize only
# attaches the zone to the value, it does not shift the clock time
# NOTE(review): the V1 rows were built from the `response_local` column -
# confirm that labelling those values as UTC is intended
combined['dt_response'] = list(
    map(lambda stamp: str(stamp.tz_localize('UTC')), combined['dt_response'])
)
combined.head()
# CAUTION: destructive step - the rows returned by the query are deleted from
# the existing table before the fresh data is uploaded
existing_rows = syn.tableQuery('select * from syn17022426')
t = syn.delete(existing_rows)

# upload the combined frame with a schema derived from its columns
health_apps_schema = Schema(
    name='Health Applications',
    columns=as_table_columns(combined),
    parent='syn10848316',
)
final = syn.store(Table(health_apps_schema, combined))

# record which entities and which notebook produced the table
# NOTE(review): 'syn17023091' is the same id as v2sid, so it appears twice in
# `used`; 'syn12181332' is not referenced elsewhere in this notebook - confirm
provenance = Activity(
    name='Combine V1 and V2 data',
    description='Process and combine the data collected during study 1 and study 2',
    used=['syn17023091', v1sid, v2sid, 'syn12181332', metasid],
    executed=[
        {
            'name': 'Curate_StudyAppDownloadReason_Data',
            'url': 'https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Curate_StudyAppDownloadReason_Data.ipynb',
        }
    ],
)
final = syn.setProvenance('syn17022426', activity=provenance)