Exploring and initial data cleaning¶

This is my working note for the process of initial data cleaning. For a short version, go to cleaning_short.ipynb which contains functions for cleaning this dataset

In [1]:

%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:

import numpy as np
import pandas as pd
from pathlib import Path
import json
import matplotlib.pyplot as plt

In [5]:

PATH = Path('data')
list(PATH.iterdir())

Out[5]:

[PosixPath('data/houston.csv'), PosixPath('data/location_history.json')]

In [7]:

j_file = json.load((PATH/'location_history.json').open())

In [5]:

len(j_file)

Out[5]:

In [8]:

df = pd.DataFrame.from_dict(j_file['locations'])

In [6]:

df.tail(10) 

Out[6]:

	accuracy	activity	altitude	heading	latitudeE7	longitudeE7	timestampMs	velocity	verticalAccuracy
647887	30	NaN	NaN	NaN	296678426	-952766618	1386536707631	NaN	NaN
647888	30	[{'timestampMs': '1386536651324', 'activity': ...	NaN	NaN	296678240	-952766606	1386536647690	NaN	NaN
647889	30	NaN	NaN	NaN	296678308	-952766592	1386536587653	NaN	NaN
647890	30	NaN	NaN	NaN	296678200	-952766444	1386536527693	NaN	NaN
647891	11	NaN	NaN	NaN	296678076	-952766669	1386536468150	NaN	NaN
647892	12	NaN	NaN	NaN	296678120	-952766469	1386536423014	NaN	NaN
647893	5	[{'timestampMs': '1386536388113', 'activity': ...	31.0	320.0	296677802	-952765843	1386536377898	0.0	NaN
647894	6	NaN	31.0	320.0	296677783	-952765824	1386536372859	0.0	NaN
647895	3	NaN	30.0	307.0	296677688	-952765729	1386536352800	0.0	NaN
647896	8	[{'timestampMs': '1386536307870', 'activity': ...	30.0	161.0	296676529	-952764549	1386536307163	0.0	NaN

Note: data is already sorted by timestamp (newest to oldest)

In [9]:

df.timestampMs = df.timestampMs.astype(np.int64)

In [7]:

# df['date_time'] = pd.to_datetime(df.timestampMs,unit='ms').dt.tz_localize('utc').dt.tz_convert('US/Central')

In [8]:

df.head()

Out[8]:

	accuracy	activity	altitude	heading	latitudeE7	longitudeE7	timestampMs	velocity	verticalAccuracy	date_time
0	16	NaN	118.0	NaN	390846488	-771527455	1531424546373	NaN	2.0	2018-07-12 14:42:26.373000-05:00
1	16	NaN	118.0	NaN	390846471	-771527444	1531423345272	NaN	2.0	2018-07-12 14:22:25.272000-05:00
2	16	NaN	118.0	NaN	390846486	-771527475	1531422368498	NaN	2.0	2018-07-12 14:06:08.498000-05:00
3	16	NaN	118.0	NaN	390846477	-771527482	1531421692554	NaN	2.0	2018-07-12 13:54:52.554000-05:00
4	26	NaN	NaN	NaN	390845015	-771527064	1531421092427	NaN	NaN	2018-07-12 13:44:52.427000-05:00

In [10]:

# Extract data in Houston/TX area: from 2013 to 1/30/2017 6:18 am
df_houston = df[df.timestampMs <= 1485778729042].reset_index(drop=True)

In [12]:

df_houston.shape

Out[12]:

(270054, 9)

In [8]:

# note that when saving to csv, data structure type (dictionary or list) will be converted into string
# df_houston.to_csv(PATH/'houston.csv',index=False)

In [9]:

# df_houston = pd.read_csv(PATH/'houston.csv')

Activity (list) - exclude long list of activity¶

In [12]:

df_act = df_houston[~df_houston.activity.isnull()].copy()

In [18]:

# % of non-null activity records
len(df_act)/ len(df_houston)

Out[18]:

0.3827827027187155

In [21]:

df_act.head(10)

Out[21]:

	accuracy	activity	altitude	heading	latitudeE7	longitudeE7	timestampMs	velocity	verticalAccuracy
0	545	[{'timestampMs': '1485778729815', 'activity': ...	23.0	NaN	299853940	-953481015	1485778729042	NaN	NaN
1	585	[{'timestampMs': '1485778608624', 'activity': ...	19.0	NaN	299853159	-953482313	1485778672000	NaN	NaN
2	8	[{'timestampMs': '1485778474389', 'activity': ...	4.0	224.0	299850980	-953489148	1485778297913	NaN	NaN
4	8	[{'timestampMs': '1485778148328', 'activity': ...	4.0	224.0	299850980	-953489148	1485778148040	NaN	NaN
5	8	[{'timestampMs': '1485778027987', 'activity': ...	4.0	224.0	299850980	-953489148	1485778028950	NaN	NaN
6	292	[{'timestampMs': '1485777980551', 'activity': ...	16.0	186.0	299849212	-953487105	1485777980208	NaN	NaN
7	292	[{'timestampMs': '1485777850168', 'activity': ...	16.0	186.0	299849212	-953487105	1485777812675	NaN	NaN
8	145	[{'timestampMs': '1485777709913', 'activity': ...	15.0	77.0	299853950	-953490222	1485777721949	NaN	NaN
9	43	[{'timestampMs': '1485777678185', 'activity': ...	16.0	NaN	299849213	-953489881	1485777656000	NaN	NaN
10	106	[{'timestampMs': '1485777555400', 'activity': ...	-45.0	6.0	299852839	-953485393	1485777566000	NaN	NaN

Note: activity feature contains a list of smaller activities, stored as dictionary

In [13]:

# how many smaller activities are there in 'activity' feature
df_act_expand = df_act.activity.apply(pd.Series)

In [20]:

df_act_expand.head()

Out[20]:

	0	1	2	3	4	5	6	7	8	9	...	89	90	91	92	93	94	95	96	97	98
0	{'timestampMs': '1485778729815', 'activity': [...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	{'timestampMs': '1485778608624', 'activity': [...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	{'timestampMs': '1485778474389', 'activity': [...	{'timestampMs': '1485778410435', 'activity': [...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	{'timestampMs': '1485778148328', 'activity': [...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
5	{'timestampMs': '1485778027987', 'activity': [...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 99 columns

In [22]:

df_act_expand.shape 

Out[22]:

(103372, 99)

In [89]:

# null values percentages for each col
((df_act_expand.isnull().sum()) / len(df_act_expand))*100

Out[89]:

0      0.000000
1     68.148715
2     82.064344
3     88.444479
4     92.835422
5     95.221678
6     97.024928
7     98.041798
8     98.710027
9     99.117744
10    99.385036
11    99.557419
12    99.681381
13    99.751109
14    99.819869
15    99.859575
16    99.879912
17    99.897345
18    99.913808
19    99.929303
20    99.940925
21    99.944798
22    99.954483
23    99.960294
24    99.964167
25    99.965136
26    99.969010
27    99.969978
28    99.972883
29    99.973852
        ...    
69    99.998063
70    99.998063
71    99.998063
72    99.998063
73    99.998063
74    99.998063
75    99.998063
76    99.998063
77    99.999032
78    99.999032
79    99.999032
80    99.999032
81    99.999032
82    99.999032
83    99.999032
84    99.999032
85    99.999032
86    99.999032
87    99.999032
88    99.999032
89    99.999032
90    99.999032
91    99.999032
92    99.999032
93    99.999032
94    99.999032
95    99.999032
96    99.999032
97    99.999032
98    99.999032
Length: 99, dtype: float64

In [1]:

100-99.117744

Out[1]:

0.8822559999999982

Note: A record can have maximum of 99 small activites. However, very few records (< 1%) have >= 9 small activities. We will treat these records as outliers and drop them

In [14]:

null_sum = df_act_expand.isnull().sum()
null_sum[(null_sum / len(df_act_expand))*100 <= 99.0].index
# cols 0 to 8 have <= 99.0% null values

Out[14]:

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int64')

In [15]:

col_del_start = null_sum[(null_sum / len(df_act_expand))*100 > 99.0].index[0]
col_del_end = df_act_expand.shape[1]

print(col_del_start)
print(col_del_end)

9
99

In [17]:

# get records (by indices) that have non-null values starting at col_del_start
row_drops = df_act_expand[(~df_act_expand.loc[:,range(col_del_start,col_del_end)].isnull()).sum(axis=1) >=1].index
row_drops

Out[17]:

Int64Index([    52,    751,    754,    756,    757,    856,   1354,   2155,
              2191,   2199,
            ...
            258650, 258655, 258669, 258705, 258724, 258960, 258963, 258968,
            258991, 258994],
           dtype='int64', length=912)

In [18]:

df_houston2 = df_houston.drop(row_drops)

In [19]:

df_houston2.shape

Out[19]:

(269142, 9)

In [20]:

# sanity check
len(df_houston) - len(df_houston2) == len(row_drops)

Out[20]:

True

In [31]:

df_houston2.reset_index(drop=True,inplace=True)

In [21]:

# df_houston2.to_csv(PATH/'houston.csv',index=False)

In [24]:

df_houston2.head()

Out[24]:

	accuracy	activity	altitude	heading	latitudeE7	longitudeE7	timestampMs	velocity	verticalAccuracy	date_time
0	545	[{'timestampMs': '1485778729815', 'activity': ...	23.0	NaN	299853940	-953481015	1485778729042	NaN	NaN	2017-01-30 06:18:49.042000-06:00
1	585	[{'timestampMs': '1485778608624', 'activity': ...	19.0	NaN	299853159	-953482313	1485778672000	NaN	NaN	2017-01-30 06:17:52-06:00
2	8	[{'timestampMs': '1485778474389', 'activity': ...	4.0	224.0	299850980	-953489148	1485778297913	NaN	NaN	2017-01-30 06:11:37.913000-06:00
3	8	NaN	4.0	224.0	299850980	-953489148	1485778233440	NaN	NaN	2017-01-30 06:10:33.440000-06:00
4	8	[{'timestampMs': '1485778148328', 'activity': ...	4.0	224.0	299850980	-953489148	1485778148040	NaN	NaN	2017-01-30 06:09:08.040000-06:00

Activity (dictionary) - Extract small activites from dictionary¶

In [20]:

df_act = df_houston2[~df_houston2.activity.isnull()].copy()

In [25]:

df_act.shape

Out[25]:

(102460, 10)

In [120]:

df_act.iloc[3].activity[0].keys()

Out[120]:

dict_keys(['timestampMs', 'activity'])

In [126]:

# check if # of dict keys are consistent
(df_act.activity.apply(lambda x: len(x[0].keys())) <2).sum()

Out[126]:

In [127]:

(df_act.activity.apply(lambda x: len(x[0].keys())) >2).sum()

Out[127]:

In [130]:

df_act[(df_act.activity.apply(lambda x: len(x[0].keys())) >2)].activity.apply(lambda x: x[0].keys())

Out[130]:

19233    (timestampMs, activity, extra)
19234    (timestampMs, activity, extra)
19236    (timestampMs, activity, extra)
19237    (timestampMs, activity, extra)
19238    (timestampMs, activity, extra)
19242    (timestampMs, activity, extra)
19243    (timestampMs, activity, extra)
19244    (timestampMs, activity, extra)
19246    (timestampMs, activity, extra)
19250    (timestampMs, activity, extra)
19301    (timestampMs, activity, extra)
19303    (timestampMs, activity, extra)
19304    (timestampMs, activity, extra)
19305    (timestampMs, activity, extra)
19307    (timestampMs, activity, extra)
19309    (timestampMs, activity, extra)
19310    (timestampMs, activity, extra)
19315    (timestampMs, activity, extra)
19318    (timestampMs, activity, extra)
19352    (timestampMs, activity, extra)
19353    (timestampMs, activity, extra)
19354    (timestampMs, activity, extra)
19356    (timestampMs, activity, extra)
19357    (timestampMs, activity, extra)
19358    (timestampMs, activity, extra)
19360    (timestampMs, activity, extra)
19363    (timestampMs, activity, extra)
19365    (timestampMs, activity, extra)
19366    (timestampMs, activity, extra)
Name: activity, dtype: object

Note: Almost all records have 'activity' dictionary containing 2 keys: timestampMs and activity. Only 29 (out of 300k+) of them have an extra key: extra. Let's see what these keys contain

In [136]:

df_act[(df_act.activity.apply(lambda x: len(x[0].keys())) >2)].loc[19233,'activity']
# look at few records that have 3 keys

Out[136]:

[{'activity': [{'confidence': 82, 'type': 'IN_VEHICLE'},
   {'confidence': 15, 'type': 'STILL'},
   {'confidence': 13, 'type': 'UNKNOWN'},
   {'confidence': 2, 'type': 'ON_FOOT'},
   {'confidence': 2, 'type': 'WALKING'}],
  'extra': [{'intVal': 100,
    'name': 'vehicle_personal_confidence',
    'type': 'VALUE'}],
  'timestampMs': '1483476045607'},
 {'activity': [{'confidence': 100, 'type': 'TILTING'}],
  'timestampMs': '1483476045545'}]

In [140]:

df_act[(df_act.activity.apply(lambda x: len(x[0].keys())) >2)].loc[19242,'activity']

Out[140]:

[{'activity': [{'confidence': 74, 'type': 'IN_VEHICLE'},
   {'confidence': 26, 'type': 'UNKNOWN'}],
  'extra': [{'intVal': 100,
    'name': 'vehicle_personal_confidence',
    'type': 'VALUE'}],
  'timestampMs': '1483475547194'},
 {'activity': [{'confidence': 100, 'type': 'TILTING'}],
  'timestampMs': '1483475547028'},
 {'activity': [{'confidence': 100, 'type': 'IN_VEHICLE'},
   {'confidence': 23, 'type': 'STILL'}],
  'extra': [{'intVal': 100,
    'name': 'vehicle_personal_confidence',
    'type': 'VALUE'}],
  'timestampMs': '1483475523282'}]

'Extra' key¶

In [151]:

df_act_2act = df_act[(df_act.activity.apply(lambda x: len(x[0].keys())) >2)]

In [166]:

df_act_2act[df_act_2act.activity.apply(lambda x: len(x[0]['extra'][0].keys()) == 3)].activity.apply(lambda x: x[0]['extra'][0])

Out[166]:

19233    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19234    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19236    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19237    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19238    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19242    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19243    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19244    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19246    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19250    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19301    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19303    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19304    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19305    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19307    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19309    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19310    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19315    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19318    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19352    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19353    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19354    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19356    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19357    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19358    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19360    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19363    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19365    {'type': 'VALUE', 'name': 'vehicle_personal_co...
19366    {'type': 'VALUE', 'name': 'vehicle_personal_co...
Name: activity, dtype: object

Note: extra list only contains 1 value (a list size 1) and they are all the same: personal vehicle related info

Create new data record for each non-null activity¶

Convert this (an example of Activity feature)

[{'activity': [{'confidence': 74, 'type': 'IN_VEHICLE'},
   {'confidence': 26, 'type': 'UNKNOWN'}],
  'extra': [{'intVal': 100,
    'name': 'vehicle_personal_confidence',
    'type': 'VALUE'}],
  'timestampMs': '1483475547194'}]

to this

activity_conf1	activity_type1	activity_conf2	activity_type2	extra_intVal	extra_name	extra_type	timestampMs
74	IN_VEHICLE	26	UNKNOWN	100	vehicle_personal_confidence	VALUE	1483475547194

(Google already sorted 'confidence' from high to low, so we will record only the first 2 confidence. We will replace the record timestamp with this timestamp)

In [22]:

df_houston2.columns

Out[22]:

Index(['accuracy', 'activity', 'altitude', 'heading', 'latitudeE7',
       'longitudeE7', 'timestampMs', 'velocity', 'verticalAccuracy'],
      dtype='object')

In [24]:

df_act = df_houston2[~df_houston2.activity.isnull()].copy() # get all the non-null activity record

In [25]:

df_act.timestampMs.value_counts().shape

Out[25]:

(102460,)

In [28]:

df_act.head(3)

Out[28]:

	accuracy	activity	altitude	heading	latitudeE7	longitudeE7	timestampMs	velocity	verticalAccuracy
0	545	[{'timestampMs': '1485778729815', 'activity': ...	23.0	NaN	299853940	-953481015	1485778729042	NaN	NaN
1	585	[{'timestampMs': '1485778608624', 'activity': ...	19.0	NaN	299853159	-953482313	1485778672000	NaN	NaN
2	8	[{'timestampMs': '1485778474389', 'activity': ...	4.0	224.0	299850980	-953489148	1485778297913	NaN	NaN

In [66]:

temp_act = df_act.iloc[0:10]

In [69]:

temp_act.head()

Out[69]:

	accuracy	activity	altitude	heading	latitudeE7	longitudeE7	timestampMs	velocity	verticalAccuracy
0	545	[{'timestampMs': '1485778729815', 'activity': ...	23.0	NaN	299853940	-953481015	1485778729042	NaN	NaN
1	585	[{'timestampMs': '1485778608624', 'activity': ...	19.0	NaN	299853159	-953482313	1485778672000	NaN	NaN
2	8	[{'timestampMs': '1485778474389', 'activity': ...	4.0	224.0	299850980	-953489148	1485778297913	NaN	NaN
4	8	[{'timestampMs': '1485778148328', 'activity': ...	4.0	224.0	299850980	-953489148	1485778148040	NaN	NaN
5	8	[{'timestampMs': '1485778027987', 'activity': ...	4.0	224.0	299850980	-953489148	1485778028950	NaN	NaN

Recap: Activity List can contain >=1 dictionary

Dictionary has keys: timestampms, activity and extra

Value of 'activity' key in dictionary is a list that can have >=1 dict, each dict has keys: confidence, type. Only pick 2 highest confidences

Value of 'extra' key in dictionary is a list that can have >=1 dict, each dict has keys: intval, name and type

In [70]:

temp_act.index.values

Out[70]:

array([ 0,  1,  2,  4,  5,  6,  7,  8,  9, 10])

In [72]:

col_to_drop = ['timestampMs','activity']
org_cols=[col for col in temp_act.columns.values.tolist() if col not in col_to_drop]
org_cols

Out[72]:

['accuracy',
 'altitude',
 'heading',
 'latitudeE7',
 'longitudeE7',
 'velocity',
 'verticalAccuracy']

In [63]:

new_cols = ['timestampMs','act_conf1','act_type1','act_cont2','act_type2',
                                    'extra_intVal','extra_name','extra_type']

In [73]:

temp_combined_df = pd.DataFrame(columns=org_cols + new_cols)

for idx,acts in zip(temp_act.index.values,temp_act.activity.values):
    temp_new_df = pd.DataFrame(columns=new_cols)
    c=0
    for act in acts:
        row=[]
        row.append(act['timestampMs'])
        # get activity max confidence and type, which is the first dict in activity list
        cof = act['activity'][0]
        row+=[cof['confidence'],cof['type']]
        # get secondary activity conf and type
        if len(act['activity']) > 1:
            cof = act['activity'][1]
            row+=[cof['confidence'],cof['type']]
        else:
            row+=[np.NaN,np.NaN]
        # extra
        if 'extra' in act:
            ex = act['extra'][0] # take only 1 item in extra list
            row+=[ex['intVal'],ex['name'],ex['type']]
        else:
            row+=[np.NaN,np.NaN,np.NaN]
        temp_new_df.loc[c]=row
        c+=1
        
        
    temp_org_df = pd.DataFrame([temp_act.loc[idx]]*c,index=range(0,c)) # copy the rest of features and save it to another df
    temp_org_df.drop(col_to_drop,axis=1,inplace=True)
    temp_combined_df=temp_combined_df.append(pd.concat([temp_org_df,temp_new_df],axis=1),ignore_index=True)

In [74]:

temp_combined_df

Out[74]:

	accuracy	altitude	heading	latitudeE7	longitudeE7	velocity	verticalAccuracy	timestampMs	act_conf1	act_type1	act_cont2	act_type2	extra_intVal	extra_name	extra_type
0	545	23.0	NaN	299853940	-953481015	NaN	NaN	1485778729815	100	STILL	NaN	NaN	NaN	NaN	NaN
1	585	19.0	NaN	299853159	-953482313	NaN	NaN	1485778608624	100	STILL	NaN	NaN	NaN	NaN	NaN
2	8	4.0	224.0	299850980	-953489148	NaN	NaN	1485778474389	100	STILL	NaN	NaN	NaN	NaN	NaN
3	8	4.0	224.0	299850980	-953489148	NaN	NaN	1485778410435	100	STILL	NaN	NaN	NaN	NaN	NaN
4	8	4.0	224.0	299850980	-953489148	NaN	NaN	1485778148328	100	STILL	NaN	NaN	NaN	NaN	NaN
5	8	4.0	224.0	299850980	-953489148	NaN	NaN	1485778027987	71	STILL	19	IN_VEHICLE	NaN	NaN	NaN
6	292	16.0	186.0	299849212	-953487105	NaN	NaN	1485777980551	87	STILL	13	IN_VEHICLE	NaN	NaN	NaN
7	292	16.0	186.0	299849212	-953487105	NaN	NaN	1485777920344	100	TILTING	NaN	NaN	NaN	NaN	NaN
8	292	16.0	186.0	299849212	-953487105	NaN	NaN	1485777850168	100	TILTING	NaN	NaN	NaN	NaN	NaN
9	292	16.0	186.0	299849212	-953487105	NaN	NaN	1485777840198	90	STILL	10	IN_VEHICLE	NaN	NaN	NaN
10	145	15.0	77.0	299853950	-953490222	NaN	NaN	1485777709913	44	STILL	42	UNKNOWN	NaN	NaN	NaN
11	145	15.0	77.0	299853950	-953490222	NaN	NaN	1485777694049	100	TILTING	NaN	NaN	NaN	NaN	NaN
12	43	16.0	NaN	299849213	-953489881	NaN	NaN	1485777678185	75	STILL	16	UNKNOWN	NaN	NaN	NaN
13	43	16.0	NaN	299849213	-953489881	NaN	NaN	1485777676309	100	TILTING	NaN	NaN	NaN	NaN	NaN
14	43	16.0	NaN	299849213	-953489881	NaN	NaN	1485777674433	69	STILL	13	IN_VEHICLE	NaN	NaN	NaN
15	43	16.0	NaN	299849213	-953489881	NaN	NaN	1485777614917	100	TILTING	NaN	NaN	NaN	NaN	NaN
16	106	-45.0	6.0	299852839	-953485393	NaN	NaN	1485777555400	85	STILL	15	IN_VEHICLE	NaN	NaN	NaN

Combined non-null-activity dataframe to null-activity dataframe¶

In [90]:

df_non_act = df_houston2[df_houston2.activity.isnull()].copy() # get all the null activity record

In [93]:

temp_non_act=df_non_act.iloc[:10].copy()

In [94]:

temp_non_act.drop('activity',axis=1,inplace=True)

In [96]:

temp_non_act

Out[96]:

	accuracy	altitude	heading	latitudeE7	longitudeE7	timestampMs	velocity	verticalAccuracy
3	8	4.0	224.0	299850980	-953489148	1485778233440	NaN	NaN
13	395	-18.0	NaN	299857978	-953493753	1485777399000	NaN	NaN
14	53	-3.0	335.0	299855823	-953490404	1485777360000	NaN	NaN
15	138	54.0	30.0	299850044	-953490081	1485777320000	NaN	NaN
16	1700	NaN	NaN	299819691	-953542128	1485777244889	NaN	NaN
19	1700	NaN	NaN	299819691	-953542128	1485776979169	NaN	NaN
21	23	60.0	111.0	299851691	-953488097	1485776857000	NaN	NaN
23	125	356.0	248.0	299846068	-953487162	1485776770000	NaN	NaN
29	23	NaN	NaN	299852733	-953493493	1485776105741	NaN	NaN
31	22	NaN	NaN	299856137	-953498397	1485775997117	NaN	NaN

In [99]:

temp_combined_df.head()

Out[99]:

	accuracy	altitude	heading	latitudeE7	longitudeE7	velocity	verticalAccuracy	timestampMs	act_conf1	act_type1	act_cont2	act_type2	extra_intVal	extra_name	extra_type
0	545	23.0	NaN	299853940	-953481015	NaN	NaN	1485778729815	100	STILL	NaN	NaN	NaN	NaN	NaN
1	585	19.0	NaN	299853159	-953482313	NaN	NaN	1485778608624	100	STILL	NaN	NaN	NaN	NaN	NaN
2	8	4.0	224.0	299850980	-953489148	NaN	NaN	1485778474389	100	STILL	NaN	NaN	NaN	NaN	NaN
3	8	4.0	224.0	299850980	-953489148	NaN	NaN	1485778410435	100	STILL	NaN	NaN	NaN	NaN	NaN
4	8	4.0	224.0	299850980	-953489148	NaN	NaN	1485778148328	100	STILL	NaN	NaN	NaN	NaN	NaN

In [100]:

temp_non_act.append(temp_combined_df,ignore_index=True,sort=False) # combine all the null and non-null activity records

Out[100]:

	accuracy	altitude	heading	latitudeE7	longitudeE7	timestampMs	velocity	verticalAccuracy	act_conf1	act_type1	act_cont2	act_type2	extra_intVal	extra_name	extra_type
0	8	4.0	224.0	299850980	-953489148	1485778233440	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	395	-18.0	NaN	299857978	-953493753	1485777399000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	53	-3.0	335.0	299855823	-953490404	1485777360000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	138	54.0	30.0	299850044	-953490081	1485777320000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	1700	NaN	NaN	299819691	-953542128	1485777244889	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
5	1700	NaN	NaN	299819691	-953542128	1485776979169	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
6	23	60.0	111.0	299851691	-953488097	1485776857000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
7	125	356.0	248.0	299846068	-953487162	1485776770000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
8	23	NaN	NaN	299852733	-953493493	1485776105741	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
9	22	NaN	NaN	299856137	-953498397	1485775997117	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10	545	23.0	NaN	299853940	-953481015	1485778729815	NaN	NaN	100	STILL	NaN	NaN	NaN	NaN	NaN
11	585	19.0	NaN	299853159	-953482313	1485778608624	NaN	NaN	100	STILL	NaN	NaN	NaN	NaN	NaN
12	8	4.0	224.0	299850980	-953489148	1485778474389	NaN	NaN	100	STILL	NaN	NaN	NaN	NaN	NaN
13	8	4.0	224.0	299850980	-953489148	1485778410435	NaN	NaN	100	STILL	NaN	NaN	NaN	NaN	NaN
14	8	4.0	224.0	299850980	-953489148	1485778148328	NaN	NaN	100	STILL	NaN	NaN	NaN	NaN	NaN
15	8	4.0	224.0	299850980	-953489148	1485778027987	NaN	NaN	71	STILL	19	IN_VEHICLE	NaN	NaN	NaN
16	292	16.0	186.0	299849212	-953487105	1485777980551	NaN	NaN	87	STILL	13	IN_VEHICLE	NaN	NaN	NaN
17	292	16.0	186.0	299849212	-953487105	1485777920344	NaN	NaN	100	TILTING	NaN	NaN	NaN	NaN	NaN
18	292	16.0	186.0	299849212	-953487105	1485777850168	NaN	NaN	100	TILTING	NaN	NaN	NaN	NaN	NaN
19	292	16.0	186.0	299849212	-953487105	1485777840198	NaN	NaN	90	STILL	10	IN_VEHICLE	NaN	NaN	NaN
20	145	15.0	77.0	299853950	-953490222	1485777709913	NaN	NaN	44	STILL	42	UNKNOWN	NaN	NaN	NaN
21	145	15.0	77.0	299853950	-953490222	1485777694049	NaN	NaN	100	TILTING	NaN	NaN	NaN	NaN	NaN
22	43	16.0	NaN	299849213	-953489881	1485777678185	NaN	NaN	75	STILL	16	UNKNOWN	NaN	NaN	NaN
23	43	16.0	NaN	299849213	-953489881	1485777676309	NaN	NaN	100	TILTING	NaN	NaN	NaN	NaN	NaN
24	43	16.0	NaN	299849213	-953489881	1485777674433	NaN	NaN	69	STILL	13	IN_VEHICLE	NaN	NaN	NaN
25	43	16.0	NaN	299849213	-953489881	1485777614917	NaN	NaN	100	TILTING	NaN	NaN	NaN	NaN	NaN
26	106	-45.0	6.0	299852839	-953485393	1485777555400	NaN	NaN	85	STILL	15	IN_VEHICLE	NaN	NaN	NaN

In [ ]: