Curate_StudyAppDownloadReason_Data¶

In [1]:

%matplotlib inline

import datetime as dt
import itertools as it
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, File, Schema, Table, as_table_columns
from tqdm import tqdm

# display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'
# create the Synapse client and log in; no credentials are passed explicitly here
syn = synapseclient.Synapse()
syn.login()

# register tqdm's progress_* methods on pandas objects
tqdm.pandas()

def isnum(x):
    """Return True when ``x`` can be converted to a float, False otherwise.

    ``None`` is never numeric. A float NaN *is* reported as numeric because
    ``float(nan)`` succeeds; the notebook relies on that to tell missing
    cells apart from free-text answers.

    Parameters
    ----------
    x : object
        Any value, typically a cell read from a DataFrame.

    Returns
    -------
    bool
    """
    if x is None:
        return False
    try:
        float(x)
    except (TypeError, ValueError):
        # ValueError: a string that does not parse as a number
        # TypeError: an object float() cannot accept at all (list, dict, ...)
        return False
    return True

Welcome, Abhishek Pratap!

In [2]:

# Synapse IDs of the raw exports: V1 is read as Excel, V2 as CSV
v1sid = 'syn10250489'
v2sid = 'syn17023091'

# fetch each file through the Synapse client, then load it from its local path
v1_entity = syn.get(v1sid)
v1r = pd.read_excel(v1_entity.path)

v2_entity = syn.get(v2sid)
v2r = pd.read_csv(v2_entity.path, parse_dates=['createdAt'])

v1r.head()
v2r.head()

Out[2]:

	brightenid	start	week	user_id	sent_time_local	sent_time_utc	response_local	response_utc	response_id	Why did you download this app?	For "other", please type in box
0	BLUE-00048	2014-08-01	0	10431	NaT	NaT	2014-08-01 07:00:09	2014-08-01 11:00:09	166331	for fun\|for mental health reasons [e.g. depres...	NaN
1	BLUE-00049	2014-08-01	0	10470	NaT	NaT	2014-08-01 12:31:58	2014-08-01 16:31:58	166824	for brain health [e.g. better memory]\|to impro...	NaN
2	BLUE-00050	2014-08-09	0	10519	NaT	NaT	2014-08-08 19:31:06	2014-08-09 02:31:06	173157	for management of daily problems\|for brain hea...	NaN
3	BLUE-00050	2014-08-09	4	10519	2014-09-06 09:00:01	2014-09-06 16:00:01	2014-09-08 18:21:29	2014-09-09 01:21:29	198454	for management of daily problems\|for brain hea...	NaN
4	BLUE-00050	2014-08-09	12	10519	2014-11-01 08:00:05	2014-11-01 16:00:05	2014-11-02 09:16:22	2014-11-02 17:16:22	256858	for mood [e.g. sadness]\|for brain health [e.g....	NaN

Out[2]:

	username	Why did you download this app?	createdAt
0	EN05039	Fun,My mental health,My mood,Managing daily is...	2016-09-03 17:13:21
1	EN05331	My mental health,Brain health,Improve work	2017-01-22 23:04:11
2	EN00387	My mental health,My mood,Brain health,Fun	2016-10-28 08:37:06
3	EN00322	My mental health	2016-09-07 21:32:32
4	EN00478	My mental health,Managing daily issues,My mood...	2016-11-11 07:21:42

V1 Data Prep¶

In [ ]:

# drop the unneeded columns and give the remaining ones their curated names
v1_unused_columns = [
    'week', 'user_id', 'sent_time_local', 'sent_time_utc', 'response_utc', 'response_id'
]
v1_column_names = {
    'brightenid': 'participant_id',
    'response_local': 'dt_response',
    'Why did you download this app?': 'apps',
    'For "other", please type in box': 'other_description'
}
v1 = v1r.drop(columns=v1_unused_columns).rename(columns=v1_column_names)

# convert to lowercase for lookups; cells that isnum() accepts (e.g. NaN)
# are replaced by the placeholder 'none'
v1.apps = v1.apps.apply(lambda answer: 'none' if isnum(answer) else answer.lower())

# indicator column -> substring that identifies that reason in the V1 answers
v1_reason_markers = {
    'fun': 'fun',
    'mental_health': 'mental health',
    'mood': 'mood',
    'managing_daily_issues': 'management of daily ',
    'improve_work': 'improve work',
    'brain_health': 'brain health',
    'improve_relationships': 'improve relationships',
    'other': 'other',
}

# add a 0/1 indicator for each download reason
for column, marker in v1_reason_markers.items():
    v1[column] = v1.apps.apply(lambda answer: int(marker in answer))

v1.head()

In [ ]:

# print the unique application reasons
# V1 answers are '|'-separated; flatten them and print each distinct reason once
v1_reasons = pd.unique(list(it.chain.from_iterable(answer.split('|') for answer in v1.apps)))
for reason in v1_reasons:
    print(reason)

In [ ]:

# histogram of the lengths of the free-text "other" answers;
# cells that isnum() accepts (e.g. NaN) are skipped
other_lengths = [len(text) for text in v1.other_description if not isnum(text)]
plt.hist(other_lengths)

In [ ]:

# whole days between the recorded start and the response, with the start day counted as day 1
v1['day'] = [
    (responded - started).days + 1
    for responded, started in zip(v1.dt_response, v1.start)
]
v1.day.hist()

Extract two common topics I saw in a quick read through the other descriptions¶

In [ ]:

# lowercase substrings that mark a free-text answer as mentioning payment or compensation
incentive_words = [
    'paid', 'pay', 'mone', 'compens', 'gift', 'incentive', '$', 'finan', 'incom', 'reimb', 'craigs', 'pd'
]

def is_incentive(s):
    """Flag a free-text answer that mentions a financial incentive.

    Parameters
    ----------
    s : object
        The free-text answer. Anything that is not a ``str`` (for example a
        NaN from an empty cell) is treated as "no mention".

    Returns
    -------
    int
        1 if any entry of ``incentive_words`` occurs in ``s``, else 0.
    """
    if not isinstance(s, str):
        return 0
    # the keywords are lowercase, so fold the answer too; otherwise
    # "Paid" or "Money" would be missed
    text = s.lower()
    return int(any(word in text for word in incentive_words))
    
# flag each free-text answer, then store the flags in the smallest integer dtype that fits
incentive_flags = v1.other_description.apply(is_incentive)
v1['happ_inc'] = pd.to_numeric(incentive_flags, downcast='integer')

In [ ]:

# lowercase substrings that mark a free-text answer as "downloaded because of the study"
for_the_study_words = [
    'part', 'require', 'for study', 'told to', 'asked to', 'request', 'to be', 'to do', 'brighten', 'assignment', 'ucsf', 'for a', 'study'
]

def is_for_the_study(s):
    """Flag a free-text answer saying the app was downloaded for the study.

    Parameters
    ----------
    s : object
        The free-text answer. Anything that is not a ``str`` (for example a
        NaN from an empty cell) is treated as "no mention".

    Returns
    -------
    int
        1 if any entry of ``for_the_study_words`` occurs in ``s``, else 0.
    """
    if not isinstance(s, str):
        return 0
    # the keywords are lowercase, so fold the answer too; otherwise
    # "UCSF" or "Brighten" would be missed
    text = s.lower()
    return int(any(word in text for word in for_the_study_words))
    
# 0/1 flag per row: does the free-text answer say the app was downloaded for the study?
study_flags = v1.other_description.map(is_for_the_study)
v1['happ_fts'] = study_flags

In [ ]:

# 'start' and 'day' were only needed for the exploratory plots. 'day' exists
# only if the histogram cell was run, so tolerate missing columns rather than
# raising KeyError when that cell was skipped or this cell is re-run.
v1 = v1.drop(columns=['start', 'day'], errors='ignore')

V2 Data Prep¶

In [ ]:

# give the V2 columns the same names as the curated V1 frame
v2_column_names = {
    'username': 'participant_id',
    'createdAt': 'dt_response',
    'Why did you download this app?': 'apps'
}
v2 = v2r.rename(columns=v2_column_names)

# convert to lowercase for lookups; cells that isnum() accepts (e.g. NaN)
# are replaced by the placeholder 'none'
v2.apps = v2.apps.apply(lambda answer: 'none' if isnum(answer) else answer.lower())

# indicator column -> substring that identifies that reason in the V2 answers
v2_reason_markers = {
    'fun': 'fun',
    'mental_health': 'my mental health',
    'mood': 'my mood',
    'managing_daily_issues': 'managing daily issues',
    'improve_work': 'improve work',
    'brain_health': 'brain health',
    'improve_relationships': 'improve relationships',
    'other': 'other',
}

# add a 0/1 indicator for each download reason
for column, marker in v2_reason_markers.items():
    v2[column] = v2.apps.apply(lambda answer: int(marker in answer))

v2.head()

In [ ]:

# V2 answers are ','-separated; flatten them and print each distinct reason once
v2_reasons = pd.unique(list(it.chain.from_iterable(answer.split(',') for answer in v2.apps)))
for reason in v2_reasons:
    print(reason)

Combine the DataFrames¶

In [ ]:

# Stack the two studies. ignore_index gives the result one clean 0..n-1 index
# instead of repeating the row labels of v1 and v2.
combined = pd.concat([v1, v2], sort=False, ignore_index=True)

# drop the raw answer text, switch to the published 'happ_*' names and fix the column order
combined = combined.drop(columns=['apps']).rename(columns={
    'brain_health': 'happ_bh',
    'fun': 'happ_f',
    'improve_relationships': 'happ_ir',
    'improve_work': 'happ_iw',
    'managing_daily_issues': 'happ_mdi',
    'mental_health': 'happ_mh',
    'mood':'happ_m',
    'other': 'happ_o',
    'other_description':'happ_o_description'
}).loc[:, [
    'participant_id', 'dt_response',
    'happ_bh', 'happ_f', 'happ_fts', 'happ_inc', 'happ_ir', 'happ_iw', 'happ_m', 'happ_mdi', 'happ_mh', 'happ_o',
    'happ_o_description'
]]

# The V2 frame has no happ_fts / happ_inc columns, so the concat leaves NaN
# there (and turns those columns into floats). Fill and cast ONLY the 0/1
# indicator columns: a frame-wide fillna(0) would also write the integer 0
# into the free-text description column, which should stay missing.
indicator_cols = [
    c for c in combined.columns
    if c.startswith('happ_') and not c.endswith('description')
]
combined[indicator_cols] = combined[indicator_cols].fillna(0).astype(int)

combined.head()

Add week into study¶

In [3]:

# Synapse ID of the table queried for each participant's participant_id and startdate
metasid = 'syn27082597'

In [ ]:

# pull each participant's start date from the metadata table
metadata = syn.tableQuery(
    f'SELECT participant_id, startdate FROM {metasid}'
).asDataFrame(convert_to_datetime=True)
metadata.startdate = pd.to_datetime(metadata.startdate)

# add in the participant's start date as a new column; the left merge keeps
# responses whose participant has no metadata row (their startdate is NaT)
combined = pd.merge(combined, metadata, on='participant_id', how='left')

# whole days from the start date to the calendar day of the response
response_day = pd.to_datetime(combined.dt_response).dt.normalize()
elapsed_days = (response_day - combined.startdate).dt.days

# 1-based week number: days 0-6 -> week 1, days 7-13 -> week 2, ...
week = np.floor(elapsed_days / 7) + 1

# Rows without a start date have no week. Casting NaN to an integer raises,
# so the column is stored as int16 only when every row has a value and is
# left as float (with NaN) otherwise.
combined['week'] = week.astype(np.int16) if week.notna().all() else week

# remove the start date
combined = combined.drop(columns=['startdate'], errors='ignore')

# move 'week' to the third position. Building the order from "every column
# except week" keeps this correct when the cell is re-run and 'week' is no
# longer the last column.
cols = [c for c in combined.columns if c != 'week']
cols.insert(2, 'week')
combined = combined.reindex(columns=cols)

combined.head()

Localize timestamps¶

In [ ]:

# localize timestamps
combined['dt_response'] = [
    str(t.tz_localize('UTC'))
    for t in combined.dt_response
]

combined.head()

Set provenance and upload to Synapse¶

In [ ]:

# DESTRUCTIVE: deletes every row returned by the query, i.e. all rows currently
# in syn17022426.
# NOTE(review): presumably done so the upload in the next cell does not add to
# stale rows — confirm that the Schema stored there resolves to syn17022426.
t = syn.delete(
    syn.tableQuery('select * from syn17022426')
)

In [ ]:

# describe the table: one Synapse column per DataFrame column, under the given parent
health_apps_schema = Schema(
    name='Health Applications',
    columns=as_table_columns(combined),
    parent='syn10848316'
)

# upload the curated rows together with that schema
final = syn.store(Table(health_apps_schema, combined))

In [4]:

# Record how syn17022426 was produced: the inputs it was derived from and the
# notebook that generated it.
final = syn.setProvenance(
    'syn17022426',
    activity=Activity(
        name='Combine V1 and V2 data',
        description='Process and combine the data collected during study 1 and study 2',
        # each input is listed once: v2sid already is 'syn17023091', so that
        # ID is not repeated as a literal.
        # NOTE(review): 'syn12181332' is not read anywhere in the visible
        # notebook — confirm it belongs in this list.
        used=[v1sid, v2sid, 'syn12181332', metasid],
        executed=[
            dict(
                name='Curate_StudyAppDownloadReason_Data',
                url='https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Curate_StudyAppDownloadReason_Data.ipynb'
            )
        ]
    )
)

In [ ]: