import pandas as pd, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from jupyterthemes import jtplot
import matplotlib as mpl
# onedork | grade3 | oceans16 | chesterish | monokai | solarizedl | solarizedd
# Load the reference ("model") data set. Judging by the parsing below, the
# first field of a row holds one day encoded as a comma-separated sequence:
# activity, duration, activity, duration, ...
df = pd.read_csv('ex/data/days-simulated-v2.tsv')
df.head()

# Pick one sample row and split its encoding into individual tokens.
a = df.loc[430].values[0].split(',')

# Total duration of that day in minutes: durations sit at the odd positions.
sum(int(minutes) for minutes in a[1::2])
# Read the three UTF-8 encoded survey export parts.
df1 = pd.read_csv('ex/1.csv', encoding='utf8')
df2 = pd.read_csv('ex/2.csv', encoding='utf8')
df3 = pd.read_csv('ex/3.csv', encoding='utf8')

# Stack the parts into one frame. The column headers of the three files are
# identical, so the first data row of parts 2 and 3 (a repeated header line)
# is skipped and only the copy from part 1 is kept.
survey_parts = [df1, df2.iloc[1:], df3.iloc[1:]]
df = pd.concat(survey_parts)
# Cut the wide survey frame into four column ranges. reset_index() keeps the
# previous row labels as an extra leading column named 'index'.

# weekday answers
hkoz = df.iloc[:, 9:489].reset_index()
# weekend answers
hetv = df.iloc[:, 489:969].reset_index()
# descriptive statistics about the respondents
desc = df.iloc[:, 969:].reset_index()
# survey metadata (the start / end date columns are read from here later on)
time = df.iloc[:, 2:4].reset_index()
# Activity groups: 17 groups, numbered 0..16 by their position in this list.
# Every survey activity name belongs to exactly one group.
# (A byte-string variant of this list used to precede it; it was overwritten
# immediately and has been removed.)
# NOTE(review): some names are spelled with 'õ' rather than 'ő'. Presumably this
# mirrors the spelling of the survey export's column headers - confirm before
# normalising, because the names are used as lookup keys.
activities = [
    [u'Alvás'],
    [u'Zuhany / Mosdó'],
    [u'Étkezés', u'Étterem/Vendéglõ'],
    [u'Munka (irodai)', u'Munka (kétkezi)'],
    [u'Internet', u'Telefon/Chat/Facebook'],
    [u'Vásárlás'],
    [u'Vallásgyakorlás', u'Önkéntesség'],
    [u'TV/Film', u'Mozi'],
    [u'Olvasás', u'Újság/Keresztrejtvény'],
    [u'Házimunka/Gyerekfelügyelet'],
    [u'Hivatalos elintéznivalók'],
    [u'Sport', u'Edzõterem/Szépségszalon'],
    [u'Egyéb Hobby', u'PC játék', u'Kertészkedés/Barkácsolás', u'Rokonlátogatás', u'Más'],
    [u'Tanulás', u'Magánóra'],
    [u'Szórakozóhely/Kávézó/Pub'],
    [u'Séta/Kutyasétáltatás', u'Természet/Kirándulás'],
    [u'Utazás/Vezetés'],
]

# Bin activities into activity groups: activity name -> index of its group.
actidict = {}
for group_id, group in enumerate(activities):
    for activity_name in group:
        actidict[activity_name] = group_id
# Show the binning: one output line per activity group, with the member
# activity names separated by single spaces.
for group in activities:
    print(u' '.join(group))
# Survey time slots: sixteen consecutive 90-minute windows listed in clock
# order starting at 01:00 (the last window wraps past midnight).
timekeys = (
    '01:00-02:30 02:30-04:00 04:00-05:30 05:30-07:00 '
    '07:00-08:30 08:30-10:00 10:00-11:30 11:30-13:00 '
    '13:00-14:30 14:30-16:00 16:00-17:30 17:30-19:00 '
    '19:00-20:30 20:30-22:00 22:00-23:30 23:30-01:00'
).split()
# run only once
# This step is not idempotent: each run takes the current first row as the new
# column labels and then discards that row, so a second run would consume a
# respondent row.
# Use the values of the row labelled 0 as column labels.
# (presumably that row is the survey export's second header line - confirm)
hkoz.columns=hkoz.loc[0].values
# Drop the first row and the column now labelled 0.
# NOTE(review): the column labelled 0 looks like the old-index column added by
# reset_index(), whose value in row 0 is 0 - confirm.
hkoz=hkoz[1:].drop(0,axis=1)
# Same treatment for the weekend frame.
hetv.columns=hetv.loc[0].values
hetv=hetv[1:].drop(0,axis=1)
def _answered_columns(frame):
    """Map every row label of *frame* to the column labels that hold a value.

    A cell counts as empty when its string form equals ``nan`` in any letter
    case; all other cells count as answered. The column labels are returned
    in the frame's column order.
    """
    answered = {}
    for row_label, row in frame.iterrows():
        answered[row_label] = [
            column
            for column, value in zip(row.index, row.values)
            if str(value).lower() != 'nan'
        ]
    return answered


# Extract and linearize the data from the pandas dataframes:
# respondent row label -> list of answered column labels.
hkozdata = _answered_columns(hkoz)  # weekday
hetvdata = _answered_columns(hetv)  # weekend
# Create the timematrix (timeslice -> list of activity group ids) for a
# single respondent, the one with row label j.
j = 1
timematrix = {}
for column in hkozdata[j]:
    # The text before the first '-' (minus one separator character) is the
    # activity name, the text after it (skipping one character) the timeslice.
    dash = column.find('-')
    activity = column[:dash - 1]
    timeslice = column[dash + 2:]
    timematrix.setdefault(timeslice, []).append(actidict[activity])

# Visit the slots in clock order with the day starting at 04:00. The order is
# taken from timekeys (rotated by two slots) and restricted to the slots this
# respondent actually filled in. Sorting the available keys and rolling them by
# two only starts at 04:00 when all sixteen slots are present.
parseorder = [slot for slot in timekeys[2:] + timekeys[:2] if slot in timematrix]

# Create the output list: per slot, a random selection of at most 3 of the
# activities reported for it (the slot's list is shuffled in place first).
output = []
for slot in parseorder:
    helper = timematrix[slot]
    np.random.shuffle(helper)
    output.append(helper[:3])
print(output)
# Flatten the per-slot selections into the target encoding:
# activity, duration, activity, duration, ...
# Each 90-minute slot is shared equally between the activities chosen for it.
output2 = []
fixed = 90  # survey 90 min timeslices are fixed
for slot_activities in output:
    for activity_id in slot_activities:
        output2.extend([activity_id, fixed // len(slot_activities)])
print(output2)

# Total number of minutes in the encoded day (durations are the odd positions).
sum(output2[1::2])
# Create the output CSV list (activity, duration, activity, duration, ...)
# including pruning: when an activity continues from the previous entry, its
# share is added to that entry instead of starting a new one.
output2 = []
fixed = 90  # survey 90 min timeslices are fixed
current = 999  # sentinel that matches no activity group id
for slot_activities in output:
    for activity_id in slot_activities:
        share = fixed // len(slot_activities)
        if activity_id != current:
            current = activity_id
            output2.append(activity_id)
            # Randomize the duration a little: add an integer jitter in -10..+9.
            # The bounds are passed explicitly; a single positional argument
            # would set `low` and leave `high` at 1.0, i.e. a reversed interval.
            output2.append(share - 10 + int(np.random.uniform(0, 20)))
        else:
            output2[-1] += share
print(output2)

# Total number of minutes in the encoded day (durations are the odd positions).
sum(output2[1::2])
# Parse all data
# Number of randomized day instances generated per respondent
# (used as range(instances) in the parsing loops below).
instances=2
# Bound, in minutes, handed to the random draw that shifts time between
# neighbouring activities; 2*rrange is also used as the rolling-mean window
# in the plotting section.
rrange=60
#fill method 1:
#assume sleeping if nothing clicked in
#fill method 2:
#disregard incomplete data
def _simulate_days(daydata, n_instances):
    """Turn answered survey columns into randomized day encodings.

    daydata maps a respondent label to the list of answered column labels.
    For every respondent, n_instances days are produced. Each day is returned
    as a string "activity,duration,activity,duration,..." without spaces.
    """
    fixed = 90  # survey 90 min timeslices are fixed
    days = []
    for respondent in daydata:
        # timematrix - timeslice: list of activity group ids
        timematrix = {}
        for column in daydata[respondent]:
            dash = column.find('-')
            activity = column[:dash - 1]
            timeslice = column[dash + 2:]
            timematrix.setdefault(timeslice, []).append(actidict[activity])
        # Fill method 1: a slot without any answer counts as sleep (group 0).
        if len(timematrix) < 16:
            for slot in timekeys:
                timematrix.setdefault(slot, [0])
        # Correct timeslice order to start the day at 04:00: sorted slot
        # labels rotated by two positions.
        parseorder = np.roll(sorted(timematrix), -2)
        for _ in range(n_instances):
            day = []
            current = 999  # sentinel that matches no activity group id
            for slot in parseorder:
                candidates = timematrix[slot]
                np.random.shuffle(candidates)
                chosen = candidates[:3]  # max 3 activities within 90 minutes
                share = fixed // len(chosen)
                for activity_id in chosen:
                    if activity_id != current:
                        current = activity_id
                        day.extend([activity_id, share])
                    else:
                        # Same activity as the previous entry: extend it.
                        day[-1] += share
            days.append(','.join(str(value) for value in day))
    return days


def _jitter_days(days, max_shift):
    """Randomly move minutes across the boundaries of consecutive entries.

    For each pair of neighbouring entries an integer amount is moved from the
    later entry to the earlier one. The amount is a random draw from
    [0, max_shift), capped at the smaller of the two durations divided by 1.6,
    so the total duration of a day is unchanged.
    """
    jittered = []
    for encoded in days:
        a = [int(token) for token in encoded.split(',')]
        # Bounds are passed explicitly; a single positional argument would set
        # `low` and leave `high` at 1.0, i.e. a reversed interval.
        shifts = [int(np.random.uniform(0, max_shift)) for _ in range(len(a) // 2 - 1)]
        for g, shift in enumerate(shifts):
            toshift = int(min(min(a[g * 2 + 1], a[(g + 1) * 2 + 1]) / 1.6, shift))
            a[g * 2 + 1] += toshift
            a[(g + 1) * 2 + 1] -= toshift
        jittered.append(','.join(str(value) for value in a))
    return jittered


# Weekdays: simulate, jitter and save one encoded day per row (column 'day').
output4 = _simulate_days(hkozdata, instances)
output4b = _jitter_days(output4, rrange)
savedata = pd.DataFrame(output4b)
savedata.columns = ['day']
savedata.to_csv('hkoz.csv', index=False)

# Weekends: same pipeline on the weekend answers.
output5 = _simulate_days(hetvdata, instances)
output5b = _jitter_days(output5, rrange)
savedata = pd.DataFrame(output5b)
savedata.columns = ['day']
savedata.to_csv('hetv.csv', index=False)
len(savedata)
# Plots
# Plot colour per activity group; the list position is the group id.
colors = [
    "#e0d400",
    "#1c8af9",
    "#51BC05",
    "#FF7F00",
    "#DB32A4",
    "#00CDF8",
    "#E63B60",
    "#8E5649",
    "#68c99e",
    "#a477c8",
    "#5C76EC",
    "#E773C3",
    "#799fd2",
    "#038a6c",
    "#cc87fa",
    "#ee8e76",
    "#bbbbbb",
]
# The same palette keyed by the group id written as a string.
colorsdict = dict((str(code), shade) for code, shade in enumerate(colors))

# Display names per activity group: "index" is the group id as a string,
# "short" the legend label and "desc" a longer description.
act_codes = list(
    {"index": code, "short": short_name, "desc": long_name}
    for code, short_name, long_name in (
        ("15", u"Kutya", u"Sétáltatás | Természet"),
        ("3", u"Meló", u"Munka (irodai | kétkezi)"),
        ("7", u"TV", u"TV | Film | Mozi"),
        ("5", u"Vásárlás", u"Vásárlás | Mall"),
        ("6", u"Áhítat", u"Vallásgyakorlás | Önkéntesség"),
        ("4", u"Net", u"Internet | Telefon | Facebook"),
        ("1", u"Zuhany", u"Zuhany | Mosdó"),
        ("8", u"Olvasás", u"Könyv | Újság | Keresztrejtvény"),
        ("9", u"Otthon", u"Házimunka | Gyerekfelügyelet"),
        ("0", u"Szundi", u"Alvás"),
        ("11", u"Sport", u"Edzés | Edzőterem | Szaladás"),
        ("12", u"Hobby", u"Kertészkedés | Barkácsolás | Rokonok"),
        ("2", u"Kaja", u"Étkezés | Étterem"),
        ("14", u"Sör", u"Kávézó | Pub | Szórakozóhely"),
        ("13", u"Suli", u"Tanulás | Magánóra"),
        ("10", u"Hivatal", u"Hivatalos elintéznivalók"),
        ("16", u"Úton", u"Utazás | Vezetés"),
    )
)
# Group id (string) -> short legend label.
labels = dict((entry['index'], entry['short']) for entry in act_codes)
# Duration: how long respondents spent filling in the survey.
time['sd'] = pd.to_datetime(time['Start Date'])
time['ed'] = pd.to_datetime(time['End Date'])

# Completion time per respondent in minutes. The first row is skipped, and the
# end-start subtraction is evaluated once for the whole column rather than
# once per respondent.
elapsed = (time['ed'] - time['sd'])[1:]
minutes_spent = (elapsed.dt.total_seconds() / 60.0).tolist()


def mjrFormatter(x, pos):
    """Format an axis fraction as a whole-number percentage string."""
    return str(int(x * 100))


jtplot.style(theme='onedork', fscale=1.1, spines=False, grid=False)
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
# One-minute bins from 0 to 19 minutes, normalised to a density.
# NOTE(review): `normed` was replaced by `density` in newer matplotlib
# releases; switch once the installed version is confirmed to support it.
plt.hist(minutes_spent, range=[0, 20], bins=range(20), normed=True,
         color='#ffcc00', alpha=0.8)
ax.set_xlabel(u"Kérdõív kitöltési ideje (perc)")
# NOTE(review): "száazléka" in the label below looks like a typo for
# "százaléka"; left untouched here because it is user-facing output.
ax.set_ylabel(u"Válaszadók száazléka (%)")
ax.set_title(u"A kérdõívre 7-8 percet szántatok")
ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(mjrFormatter))
fig.tight_layout()
plt.savefig('1.png')
plt.show()
# Expand every encoded weekday into a per-minute list of activity group ids.
mdata = []
mlen = []
for encoded in output4b:
    z = np.array(encoded.split(',')).astype(int)
    data = []
    for code, minutes in zip(z[0::2], z[1::2]):
        data.extend([code] * int(minutes))
    mdata.append(data)
    mlen.append(len(data))
# Average number of minutes per simulated day.
print(np.average(mlen))

# One row per simulated day, one column per minute.
df = pd.DataFrame(mdata)

# For each minute: share of the 17 activity groups across all days.
# `density=True` replaces the deprecated `normed=True`; with 17 unit-width
# bins the two give the same result.
histdata = []
for minute in df.columns:
    histdata.append(
        np.histogram(df[minute], bins=17, range=[-0.5, 16.5], density=True)[0])

# Smooth over a centred window of 2*rrange minutes, using the rolling-window
# method that replaces the removed pd.rolling_mean function.
df = pd.DataFrame(histdata).rolling(window=2 * rrange, center=True).mean()

# Overwrite the first and the last minute with the unsmoothed distribution of
# minute 0, then interpolate so the gaps the rolling window leaves at both ends
# are filled in.
df1 = df.T
df1[0] = pd.DataFrame(pd.DataFrame(histdata).loc[0])
df1[df1.columns[-1]] = pd.DataFrame(pd.DataFrame(histdata).loc[0])
df2 = df1.T.interpolate()
# Line chart of the smoothed weekday distribution, one line per activity group.
def mjrFormatter(x, pos):
    """Format an axis fraction as a whole-number percentage string."""
    return str(int(x * 100))


jtplot.style(theme='onedork', fscale=1.1, spines=False, grid=False)
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
for group_id in df2.columns:
    ax.plot(df2[group_id], label=labels[str(group_id)], color=colors[group_id],
            alpha=0.9, lw=2)

# The x axis counts minutes; tick 60 is labelled 05:00, so the day starts at 04:00.
ax.set_xlim(0, 1440)
ax.set_xticks([60, 240, 420, 600, 780, 960, 1140, 1320])
ax.set_xticklabels(["05:00", "08:00", "11:00", "14:00", "17:00", "20:00", "23:00", "02:00"],
                   fontsize=11)
ax.set_ylabel(u"Tevékenységek eloszlása (%)")
ax.set_title(u"Egy átlagos erdélyi hétköznap rutinja")
ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(mjrFormatter))
ax.legend(bbox_to_anchor=(1.25, 1.05), fontsize=9)
fig.tight_layout()
plt.savefig('2a.png')
plt.show()
# Same weekday line chart, but without groups 0 and 3 (sleep and work) so the
# smaller activities become readable.
def mjrFormatter(x, pos):
    """Format an axis fraction as a whole-number percentage string."""
    return str(int(x * 100))


fig, ax = plt.subplots(1, 1, figsize=(6, 4))
hidden_groups = [0, 3]
for group_id in df2.columns:
    if group_id in hidden_groups:
        continue
    ax.plot(df2[group_id], label=labels[str(group_id)], color=colors[group_id],
            alpha=0.9, lw=2)

# The x axis counts minutes; tick 60 is labelled 05:00, so the day starts at 04:00.
ax.set_xlim(0, 1440)
ax.set_xticks([60, 240, 420, 600, 780, 960, 1140, 1320])
ax.set_xticklabels(["05:00", "08:00", "11:00", "14:00", "17:00", "20:00", "23:00", "02:00"],
                   fontsize=11)
ax.set_ylabel(u"Tevékenységek eloszlása (%)")
ax.set_title(u"Egy átlagos erdélyi hétköznap rutinja\n(munka és szundi kivételével)")
ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(mjrFormatter))
ax.legend(bbox_to_anchor=(1.25, 1.05), fontsize=9)
fig.tight_layout()
plt.savefig('2b.png')
plt.show()

# Persist the smoothed weekday distribution.
df2.to_csv('hkoz2.csv')
# Stacked area chart of the smoothed weekday distribution.
def mjrFormatter(x, pos):
    """Format an axis fraction as a whole-number percentage string."""
    return str(int(x * 100))


jtplot.style(theme='onedork', fscale=1.1, spines=False, grid=False)
fig, ax = plt.subplots(1, 1, figsize=(6, 4))

# Stack the groups in reverse id order; colours are reversed to match.
reversed_groups = df2.columns[::-1]
toplot = [df2[group_id] for group_id in reversed_groups]
tolabel = [labels[str(group_id)] for group_id in reversed_groups]
ax.stackplot(df.index, toplot, colors=colors[::-1], alpha=0.9, labels=tolabel)

# The x axis counts minutes; tick 60 is labelled 05:00, so the day starts at 04:00.
ax.set_xlim(0, 1440)
ax.set_ylim(0, 1)
ax.set_xticks([60, 240, 420, 600, 780, 960, 1140, 1320])
ax.set_xticklabels(["05:00", "08:00", "11:00", "14:00", "17:00", "20:00", "23:00", "02:00"],
                   fontsize=11)
ax.set_ylabel(u"Tevékenységek eloszlása (%)")
ax.set_title(u"Egy átlagos erdélyi hétköznap rutinja")
ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(mjrFormatter))
ax.legend()
# Reverse the legend order so it lists the groups in ascending id order.
handles, labls = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labls[::-1], bbox_to_anchor=(1.25, 1.05), fontsize=9)
fig.tight_layout()
plt.savefig('3.png')
plt.show()
# Expand every encoded weekend day into a per-minute list of activity group ids.
mdata = []
mlen = []
for encoded in output5b:
    z = np.array(encoded.split(',')).astype(int)
    data = []
    for code, minutes in zip(z[0::2], z[1::2]):
        data.extend([code] * int(minutes))
    mdata.append(data)
    mlen.append(len(data))
# Average number of minutes per simulated day.
print(np.average(mlen))

# One row per simulated day, one column per minute.
df = pd.DataFrame(mdata)

# For each minute: share of the 17 activity groups across all days.
# `density=True` replaces the deprecated `normed=True`; with 17 unit-width
# bins the two give the same result.
histdata = []
for minute in df.columns:
    histdata.append(
        np.histogram(df[minute], bins=17, range=[-0.5, 16.5], density=True)[0])

# Smooth over a centred window of 2*rrange minutes, using the rolling-window
# method that replaces the removed pd.rolling_mean function.
df = pd.DataFrame(histdata).rolling(window=2 * rrange, center=True).mean()

# Overwrite the first and the last minute with the unsmoothed distribution of
# minute 0, then interpolate so the gaps the rolling window leaves at both ends
# are filled in.
df1 = df.T
df1[0] = pd.DataFrame(pd.DataFrame(histdata).loc[0])
df1[df1.columns[-1]] = pd.DataFrame(pd.DataFrame(histdata).loc[0])
df2 = df1.T.interpolate()
# Line chart of the smoothed weekend distribution, one line per activity group.
def mjrFormatter(x, pos):
    """Format an axis fraction as a whole-number percentage string."""
    return str(int(x * 100))


jtplot.style(theme='onedork', fscale=1.1, spines=False, grid=False)
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
for group_id in df2.columns:
    ax.plot(df2[group_id], label=labels[str(group_id)], color=colors[group_id],
            alpha=0.9, lw=2)

# The x axis counts minutes; tick 60 is labelled 05:00, so the day starts at 04:00.
ax.set_xlim(0, 1440)
ax.set_xticks([60, 240, 420, 600, 780, 960, 1140, 1320])
ax.set_xticklabels(["05:00", "08:00", "11:00", "14:00", "17:00", "20:00", "23:00", "02:00"],
                   fontsize=11)
ax.set_ylabel(u"Tevékenységek eloszlása (%)")
ax.set_title(u"Egy átlagos erdélyi hétvége rutinja")
ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(mjrFormatter))
ax.legend(bbox_to_anchor=(1.25, 1.05), fontsize=9)
fig.tight_layout()
plt.savefig('4a.png')
plt.show()
# Same weekend line chart, but without group 0 (sleep).
def mjrFormatter(x, pos):
    """Format an axis fraction as a whole-number percentage string."""
    return str(int(x * 100))


jtplot.style(theme='onedork', fscale=1.1, spines=False, grid=False)
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
for group_id in df2.columns:
    if group_id == 0:
        continue
    ax.plot(df2[group_id], label=labels[str(group_id)], color=colors[group_id],
            alpha=0.9, lw=2)

# The x axis counts minutes; tick 60 is labelled 05:00, so the day starts at 04:00.
ax.set_xlim(0, 1440)
ax.set_xticks([60, 240, 420, 600, 780, 960, 1140, 1320])
ax.set_xticklabels(["05:00", "08:00", "11:00", "14:00", "17:00", "20:00", "23:00", "02:00"],
                   fontsize=11)
ax.set_ylabel(u"Tevékenységek eloszlása (%)")
ax.set_title(u"Egy átlagos erdélyi hétvége rutinja\n(szundi kivételével)")
ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(mjrFormatter))
ax.legend(bbox_to_anchor=(1.25, 1.05), fontsize=9)
fig.tight_layout()
plt.savefig('4b.png')
plt.show()

# Persist the smoothed weekend distribution.
df2.to_csv('hetv2.csv')
# Stacked area chart of the smoothed weekend distribution.
def mjrFormatter(x, pos):
    """Format an axis fraction as a whole-number percentage string."""
    return str(int(x * 100))


jtplot.style(theme='onedork', fscale=1.1, spines=False, grid=False)
fig, ax = plt.subplots(1, 1, figsize=(6, 4))

# Stack the groups in reverse id order; colours are reversed to match.
reversed_groups = df2.columns[::-1]
toplot = [df2[group_id] for group_id in reversed_groups]
tolabel = [labels[str(group_id)] for group_id in reversed_groups]
ax.stackplot(df2.index, toplot, colors=colors[::-1], alpha=0.9, labels=tolabel)

# The x axis counts minutes; tick 60 is labelled 05:00, so the day starts at 04:00.
ax.set_xlim(0, 1440)
ax.set_ylim(0, 1)
ax.set_xticks([60, 240, 420, 600, 780, 960, 1140, 1320])
ax.set_xticklabels(["05:00", "08:00", "11:00", "14:00", "17:00", "20:00", "23:00", "02:00"],
                   fontsize=11)
ax.set_ylabel(u"Tevékenységek eloszlása (%)")
ax.set_title(u"Egy átlagos erdélyi hétvége rutinja")
ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(mjrFormatter))
ax.legend()
# Reverse the legend order so it lists the groups in ascending id order.
handles, labls = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labls[::-1], bbox_to_anchor=(1.25, 1.05), fontsize=9)
fig.tight_layout()
plt.savefig('5.png')
plt.show()
desc['one'] = 1
desc.head()

# Respondent counts grouped by the descriptive column at position 2
# (the first row of desc is skipped).
df = desc[1:].groupby(desc.columns[2]).count()

# Throw-away pie: drawn only so matplotlib formats the percentage texts, which
# are then merged into the wedge labels; the drawing itself is cleared again.
pie = plt.pie(df['index'], labels=df.index, autopct='%1.0f%%')
# A separate name is used so the activity `labels` lookup used by the earlier
# plots is not overwritten.
pie_labels = [wedge_name.get_text() + '\n' + wedge_pct.get_text()
              for wedge_name, wedge_pct in zip(pie[1], pie[2])]
# The last two labels are written on a single line: "name (pct)".
pie_labels[-1] = pie_labels[-1].replace('\n', ' (') + ')'
pie_labels[-2] = pie_labels[-2].replace('\n', ' (') + ')'
plt.clf()

cmap = plt.cm.viridis
colors1 = cmap(np.linspace(0.2, 0.9, len(pie[0])))
pie = plt.pie(df['index'], labels=pie_labels, colors=colors1, startangle=5)
ax = plt.gca()
ax.set_aspect('equal')
plt.title(u'Válaszadók életkor szerinti eloszlása')
# Lay out the current (pie) figure; `fig` still refers to the figure created
# for an earlier plot.
plt.tight_layout()
plt.savefig('6.png')
plt.show()
# Respondent counts grouped by the descriptive column at position 3
# (the first row of desc is skipped).
df = desc[1:].groupby(desc.columns[3]).count()

# Throw-away pie: drawn only so matplotlib formats the percentage texts, which
# are then merged into the wedge labels; the drawing itself is cleared again.
pie = plt.pie(df['index'], labels=df.index, autopct='%1.0f%%')
# A separate name is used so the activity `labels` lookup used by the earlier
# plots is not overwritten.
pie_labels = [wedge_name.get_text() + '\n' + wedge_pct.get_text()
              for wedge_name, wedge_pct in zip(pie[1], pie[2])]
# The last two labels are written on a single line: "name (pct)".
pie_labels[-1] = pie_labels[-1].replace('\n', ' (') + ')'
pie_labels[-2] = pie_labels[-2].replace('\n', ' (') + ')'
plt.clf()

cmap = plt.cm.viridis
colors1 = cmap(np.linspace(0.2, 0.9, len(pie[0])))
pie = plt.pie(df['index'], labels=pie_labels, colors=colors1, startangle=25)
ax = plt.gca()
ax.set_aspect('equal')
plt.title(u'Válaszadók nem szerinti eloszlása')
# Lay out the current (pie) figure; `fig` still refers to the figure created
# for an earlier plot.
plt.tight_layout()
plt.savefig('7.png')
plt.show()