import pandas as pd, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Model data: an existing file of simulated days, loaded so its layout can be
# inspected. NOTE: `df` is reassigned further down when the survey CSVs are
# loaded, so this frame is only used for the preview below.
df=pd.read_csv('ex/data/days-simulated-v2.tsv')
# Target structure: one 'day' column whose value is a comma-separated sequence
# "activity, duration, activity, duration, ..."
df.head()
day | |
---|---|
0 | 0,270,5,32,10,73,16,25,5,165,2,35,4,300,1,53,1... |
1 | 4,150,16,7,4,623,16,5,8,35,16,20,5,30,2,10,8,1... |
2 | 0,270,1,75,16,30,9,15,16,5,8,40,16,5,10,10,16,... |
3 | 0,240,1,60,10,50,16,5,5,55,2,32,16,8,10,10,16,... |
4 | 0,170,2,20,16,5,4,285,2,15,3,205,16,5,6,6,16,1... |
# Load the three survey exports and stack them into one frame.
# The first data row of every file is (per the original author) a second
# header row with identical content, so it is kept from the first file only.
df1 = pd.read_csv('ex/1.csv')
df2 = pd.read_csv('ex/2.csv')
df3 = pd.read_csv('ex/3.csv')
df = pd.concat([df1, df2.iloc[1:], df3.iloc[1:]])

# Slice the stacked frame into thematic sub-frames by column position.
# reset_index() gives each sub-frame a fresh 0..n-1 row index and keeps the
# old row labels as an extra leading column.
hkoz = df.iloc[:, 9:489].reset_index()    # weekday answers
hetv = df.iloc[:, 489:969].reset_index()  # weekend answers
desc = df.iloc[:, 969:].reset_index()     # descriptive statistics
time = df.iloc[:, 2:4].reset_index()      # survey metadata
# Activity groups: the position of a sub-list is the numeric group id (0-16),
# and every label inside a sub-list is binned into that group.
# Non-ASCII labels are written as escaped UTF-8 byte sequences.
activities = [
    ['Alv\xc3\xa1s'],
    ['Zuhany / Mosd\xc3\xb3'],
    ['\xc3\x89tkez\xc3\xa9s', '\xc3\x89tterem/Vend\xc3\xa9gl\xc5\x91'],
    [u'Munka (irodai)', 'Munka (k\xc3\xa9tkezi)'],
    [u'Internet', u'Telefon/Chat/Facebook'],
    ['V\xc3\xa1s\xc3\xa1rl\xc3\xa1s'],
    ['Vall\xc3\xa1sgyakorl\xc3\xa1s', '\xc3\x96nk\xc3\xa9ntess\xc3\xa9g'],
    [u'TV/Film', u'Mozi'],
    ['Olvas\xc3\xa1s', '\xc3\x9ajs\xc3\xa1g/Keresztrejtv\xc3\xa9ny'],
    ['H\xc3\xa1zimunka/Gyerekfel\xc3\xbcgyelet'],
    ['Hivatalos elint\xc3\xa9znival\xc3\xb3k'],
    [u'Sport', 'Edz\xc5\x91terem/Sz\xc3\xa9ps\xc3\xa9gszalon'],
    ['Egy\xc3\xa9b Hobby', 'PC j\xc3\xa1t\xc3\xa9k', 'Kert\xc3\xa9szked\xc3\xa9s/Bark\xc3\xa1csol\xc3\xa1s', 'Rokonl\xc3\xa1togat\xc3\xa1s', 'M\xc3\xa1s'],
    ['Tanul\xc3\xa1s', 'Mag\xc3\xa1n\xc3\xb3ra'],
    ['Sz\xc3\xb3rakoz\xc3\xb3hely/K\xc3\xa1v\xc3\xa9z\xc3\xb3/Pub'],
    ['S\xc3\xa9ta/Kutyas\xc3\xa9t\xc3\xa1ltat\xc3\xa1s', 'Term\xc3\xa9szet/Kir\xc3\xa1ndul\xc3\xa1s'],
    ['Utaz\xc3\xa1s/Vezet\xc3\xa9s'],
]

# Flatten into a lookup table: activity label -> id of the group it belongs to.
actidict = {
    label: group_id
    for group_id, group in enumerate(activities)
    for label in group
}

# Show the resulting label -> group id mapping.
actidict
{'Olvas\xc3\xa1s': 8, 'Rokonl\xc3\xa1togat\xc3\xa1s': 12, u'Mozi': 7, 'M\xc3\xa1s': 12, u'Internet': 4, '\xc3\x89tterem/Vend\xc3\xa9gl\xc5\x91': 2, 'H\xc3\xa1zimunka/Gyerekfel\xc3\xbcgyelet': 9, 'S\xc3\xa9ta/Kutyas\xc3\xa9t\xc3\xa1ltat\xc3\xa1s': 15, '\xc3\x89tkez\xc3\xa9s': 2, 'Vall\xc3\xa1sgyakorl\xc3\xa1s': 6, '\xc3\x96nk\xc3\xa9ntess\xc3\xa9g': 6, '\xc3\x9ajs\xc3\xa1g/Keresztrejtv\xc3\xa9ny': 8, 'Edz\xc5\x91terem/Sz\xc3\xa9ps\xc3\xa9gszalon': 11, 'Egy\xc3\xa9b Hobby': 12, u'TV/Film': 7, 'Alv\xc3\xa1s': 0, 'Utaz\xc3\xa1s/Vezet\xc3\xa9s': 16, 'Sz\xc3\xb3rakoz\xc3\xb3hely/K\xc3\xa1v\xc3\xa9z\xc3\xb3/Pub': 14, 'Mag\xc3\xa1n\xc3\xb3ra': 13, 'PC j\xc3\xa1t\xc3\xa9k': 12, u'Sport': 11, 'Hivatalos elint\xc3\xa9znival\xc3\xb3k': 10, 'Kert\xc3\xa9szked\xc3\xa9s/Bark\xc3\xa1csol\xc3\xa1s': 12, u'Telefon/Chat/Facebook': 4, 'V\xc3\xa1s\xc3\xa1rl\xc3\xa1s': 5, u'Munka (irodai)': 3, 'Munka (k\xc3\xa9tkezi)': 3, 'Term\xc3\xa9szet/Kir\xc3\xa1ndul\xc3\xa1s': 15, 'Tanul\xc3\xa1s': 13, 'Zuhany / Mosd\xc3\xb3': 1}
# Run only once: this cell is not idempotent. It removes the row labelled 0,
# so evaluating `hetv.loc[0]` a second time fails.
#
# Promote the first data row (the second header row of the survey export) to
# be the column labels of the weekend frame.
hetv.columns=hetv.loc[0].values
# Drop the promoted row, then drop the column that is now labelled 0.
# NOTE(review): that column is presumably the old-row-label column added by
# reset_index(), which picked up row 0's own label (0) as its name - confirm.
hetv=hetv[1:].drop(0,axis=1)
# Extract and linearize the weekend answers: for every respondent (row label
# of hetv) collect the labels of the columns that hold a value.
# Missing answers are detected with pd.notnull instead of comparing the
# string form of each cell against 'nan'.
hvegdata = {}
for i in hetv.index:
    row = hetv.loc[i]
    hvegdata[i] = [
        column for column, value in zip(row.index, row.values)
        if pd.notnull(value)
    ]
# Create the timematrix (timeslice -> list of activity group ids) for one
# respondent, as a worked example of the per-respondent processing.
j = 1  # row label in hvegdata of the respondent inspected here
timematrix = {}
for i in hvegdata[j]:
    # Column labels are cut at the first '-': everything up to one character
    # before it is the activity, everything from two characters after it is
    # the timeslice (i.e. one separator character on each side is skipped).
    dash = i.find('-')
    activity = i[:dash - 1]
    timeslice = i[dash + 2:]
    # Unknown activity labels raise KeyError here, exactly as before.
    timematrix.setdefault(timeslice, []).append(actidict[activity])

# Order the timeslices so the day starts at 04:00: sort the labels, then move
# the first two to the end.
# NOTE(review): this assumes the two smallest labels are the slices before
# 04:00 and that the respondent has answers in both - confirm, otherwise the
# rotation is off by one or two slots.
# list(...) keeps this working where dict.keys() is a view rather than a list.
parseorder = np.roll(np.sort(list(timematrix)), -2)

# Create the output list, one entry per timeslice in day order.
output = []
for slot in parseorder:
    helper = timematrix[slot]
    # Shuffles in place, so the lists stored in timematrix are reordered too.
    np.random.shuffle(helper)
    # Keep at most 3 activities per 90 minute slice (the slice is a copy).
    output.append(helper[:3])
print(output)
[[0], [0], [0], [2, 1, 0], [13], [13], [12, 4, 7], [2], [2], [0], [0], [0], [0]]
# Create the flat output list: activity, duration, activity, duration, ...
# Activities sharing a timeslice split its length evenly.
output2 = []
fixed = 90  # survey timeslices are a fixed 90 minutes long
for shared in output:
    for act in shared:
        output2.append(act)
        # Floor division keeps durations integral regardless of how '/'
        # behaves in the running interpreter.
        output2.append(fixed // len(shared))
print(output2)
[0, 90, 0, 90, 0, 90, 2, 30, 1, 30, 0, 30, 13, 90, 13, 90, 12, 30, 4, 30, 7, 30, 2, 90, 2, 90, 0, 90, 0, 90, 0, 90, 0, 90]
# Minutes covered by the generated day: durations sit at the odd positions
# of the flat activity/duration list.
sum(output2[1::2])
1170
# Create the flat output list (activity, duration, ...) with pruning:
# consecutive occurrences of the same activity are merged into one entry.
output2 = []
fixed = 90  # survey timeslices are a fixed 90 minutes long
current = 999  # sentinel: never equal to a real activity group id
for shared in output:
    for act in shared:
        if act != current:
            current = act
            output2.append(act)
            # Even share of the slice plus a random offset, to randomize the
            # movement a bit. np.random.uniform(30) binds 30 to `low` and
            # leaves `high` at its default of 1.0, so the draw lies between
            # 1 and 30 and the offset after "- 15" is centred near zero.
            output2.append(fixed // len(shared) - 15 + int(np.random.uniform(30)))
        else:
            # Same activity as the previous entry: extend its duration by
            # this slice's share (no extra random offset).
            output2[-1] += fixed // len(shared)
print(output2)
[0, 264, 2, 31, 1, 21, 0, 25, 13, 191, 12, 20, 4, 22, 7, 21, 2, 166, 0, 350]
# Minutes covered by the pruned day (durations are the odd positions).
sum(output2[1::2])
1111
Parse all data
# Generate three randomized days for every respondent in hvegdata.
# Each day is serialized as "activity,duration,activity,duration,..." and
# appended to output4.
output4 = []
fixed = 90  # survey timeslices are a fixed 90 minutes long
for j in hvegdata:
    # Create the timematrix: timeslice -> list of activity group ids.
    timematrix = {}
    for i in hvegdata[j]:
        # Cut the column label at the first '-', skipping one character on
        # each side of it: activity before, timeslice after.
        dash = i.find('-')
        activity = i[:dash - 1]
        timeslice = i[dash + 2:]
        timematrix.setdefault(timeslice, []).append(actidict[activity])

    # Order the timeslices so the day starts at 04:00: sort the labels and
    # move the first two to the end.
    # NOTE(review): assumes the two smallest labels are the slices before
    # 04:00 and that this respondent answered both - confirm.
    parseorder = np.roll(np.sort(list(timematrix)), -2)

    for x in range(3):  # create 3 randomized person-instances
        # Pick up to 3 activities per timeslice, in random order. The shuffle
        # is in place, so timematrix is reordered between instances.
        output = []
        for slot in parseorder:
            helper = timematrix[slot]
            np.random.shuffle(helper)
            output.append(helper[:3])

        # Flatten to activity, duration, ... merging consecutive repeats.
        output2 = []
        current = 999  # sentinel: never equal to a real activity group id
        for shared in output:
            for act in shared:
                if act != current:
                    current = act
                    output2.append(act)
                    # NOTE(review): the single-respondent cell above also
                    # subtracts 15 here; without it every new entry is
                    # lengthened by the random draw (between 1 and 30, since
                    # 30 is bound to `low` and `high` defaults to 1.0).
                    # Confirm which variant is intended.
                    output2.append(fixed // len(shared) + int(np.random.uniform(30)))
                else:
                    output2[-1] += fixed // len(shared)

        # A respondent without any answers yields an empty string here.
        output4.append(','.join(map(str, output2)))
# One generated day per row, in a single 'day' column. Naming the column in
# the constructor also works when output4 is empty, where assigning
# `.columns` afterwards would raise on the length mismatch.
savedata = pd.DataFrame(output4, columns=['day'])
# NOTE(review): the days above are derived from `hetv` (the weekend frame),
# while this file name matches `hkoz` (the weekday frame) - confirm the
# intended output name before relying on this file.
savedata.to_csv('hkoz.csv', index=False)
# Number of generated days (3 per respondent).
len(savedata)
798