%matplotlib inline import flickrapi import pandas as pd import json api_key = '' flickr = flickrapi.FlickrAPI(api_key, format='json') fav = flickr.photos_getFavorites(photo_id='14368887810'); print 'head of fav: ',fav[:50] print 'tail of fav: ',fav[-50:],'\n' # to return a valid json string, just remove 14 first characters and the last one print 'head of fav (valid): ',fav[14:50] print 'tail of fav (valid): ',fav[-50:-1] s_fav = json.loads(fav[14:-1]) print str(s_fav)[:200] # printing only the beginning print str(s_fav['photo']['person'])[:200], '\n' # display the fav list print s_fav['photo']['person'][5], '\n' # display info about one fav print s_fav['photo']['person'][5]['favedate'], '\n' # display fav date (UNIX timestamp) # read only valid json sub string into data frame df_fav = pd.read_json(json.dumps(s_fav['photo']['person'])); df_fav df_fav['favedate'] df2 = pd.DataFrame(pd.to_datetime(df_fav['favedate'], unit='s')) df2['weekday'] = pd.DatetimeIndex(df2['favedate']).weekday df2 # picture that is 1st on explore page (date Jul 29th 2014) pp = 50; p_id = '14778520992' str_tmp = flickr.photos_getFavorites(photo_id=p_id, per_page=pp); json_tmp = json.loads(str_tmp[14:-1]) df_tmp = pd.read_json(json.dumps(json_tmp['photo']['person'])); df_fav = pd.DataFrame(pd.to_datetime(df_tmp['favedate'], unit='s')) num_p = json_tmp['photo']['pages'] num_p str_tmp = flickr.photos_getInfo(photo_id=p_id); tt = int(json.loads(str_tmp[14:-1])['photo']['dates']['posted']) posted = pd.to_datetime(tt, unit='s') posted for i in arange(2, num_p+1): str_tmp = \ flickr.photos_getFavorites(photo_id=p_id, page=i, per_page=pp) json_tmp = json.loads(str_tmp[14:-1]); df_tmp = pd.read_json(json.dumps(json_tmp['photo']['person'])) df_fav = df_fav.append(pd.DataFrame(pd.to_datetime(df_tmp['favedate'],\ unit='s')), ignore_index=True) #print "data frame\n", df_fav df_fav['diff'] = df_fav['favedate'] - posted ts=(df_fav['diff'].astype('timedelta64[s]').astype(float)/3600); figsize(17,5) 
# Plot the distribution of favourite times built in the previous cell.
# NOTE(review): figsize/xlabel/ylabel/hist/plot/gca/arange/zeros/histogram
# are assumed to come from a pylab-style notebook environment; they are
# not imported explicitly in this file.
ts.hist(bins=50, color='slategrey', histtype='stepfilled')
xlabel('hours')
ylabel('Number of favorites')

# Re-draw the same histogram with seaborn's styling applied on import.
import seaborn as sb
figsize(17, 5)
ts.hist(bins=50, color='slategrey', histtype='stepfilled')
xlabel('hours')
ylabel('Number of favorites')


def getFlickrActivity(df_list):
    """Extract, for each photo id in `df_list`, the times and types of
    events: favourites ('f') and comments ('c').

    Returns a DataFrame with columns ['time', 'event_type', 'photo_id'].
    Uses the module-level `flickr` client; responses are JSONP-wrapped,
    hence the `[14:-1]` slicing before `json.loads`.
    NOTE(review): `DataFrame.append` was removed in pandas >= 2.0
    (`pd.concat` is the replacement); kept here to match the notebook's
    pandas version.
    """
    pp = 50  # favourites per API page
    df = pd.DataFrame(columns=['time', 'event_type', 'photo_id'])
    for p_id in df_list:
        # First page parsed separately: it also carries the total number
        # of pages of favourites.
        str_tmp = flickr.photos_getFavorites(photo_id=p_id, per_page=pp)
        json_tmp = json.loads(str_tmp[14:-1])
        num_p = json_tmp['photo']['pages']
        df_tmp = pd.read_json(json.dumps(json_tmp['photo']['person']))
        dt = pd.Series(pd.to_datetime(df_tmp['favedate'], unit='s'))
        df = df.append(pd.DataFrame({
            'time': dt, 'event_type': 'f', 'photo_id': p_id}))
        # Remaining pages of favourites.
        for i in arange(2, num_p + 1):
            str_tmp = flickr.photos_getFavorites(photo_id=p_id, page=i,
                                                 per_page=pp)
            json_tmp = json.loads(str_tmp[14:-1])
            df_tmp = pd.read_json(json.dumps(json_tmp['photo']['person']))
            dt = pd.Series(pd.to_datetime(df_tmp['favedate'], unit='s'))
            df = df.append(pd.DataFrame({
                'time': dt, 'event_type': 'f', 'photo_id': p_id}),
                ignore_index=True)
        # Comments (no page parsing required for this endpoint).
        str_tmp = json.loads(
            flickr.photos_comments_getList(photo_id=p_id)[14:-1])
        df_tmp = pd.read_json(json.dumps(str_tmp['comments']['comment']))
        dt = pd.Series(pd.to_datetime(df_tmp['datecreate'], unit='s'))
        df = df.append(pd.DataFrame({
            'time': dt, 'event_type': 'c', 'photo_id': p_id}),
            ignore_index=True)
    return df


def getFlickrInfo(df_list):
    """For each photo id in `df_list`, fetch per-photo metadata:
    time posted and number of views (to be completed in future versions).

    Returns a DataFrame with columns ['photo_id', 'posted', 'views'].
    """
    df_out = pd.DataFrame(index=arange(df_list.size),
                          columns=['photo_id', 'posted', 'views'])
    for idx, p_id in enumerate(df_list):
        str_tmp = flickr.photos_getInfo(photo_id=p_id)
        # Parse the wrapped JSON once (it was previously parsed twice,
        # separately for 'posted' and 'views').
        info = json.loads(str_tmp[14:-1])['photo']
        posted = pd.to_datetime(int(info['dates']['posted']), unit='s')
        views = int(info['views'])
        # .loc replaces the long-removed .ix; with the integer range
        # index used here the two are equivalent, and this form also
        # avoids chained assignment.
        df_out.loc[idx, 'posted'] = posted
        df_out.loc[idx, 'views'] = views
        df_out.loc[idx, 'photo_id'] = p_id
    return df_out


def getFlickrExploreList():
    """Return the photo ids of the current Flickr Explore
    (interestingness) front page, as a pandas Series."""
    str_tmp = flickr.interestingness_getList()
    json_tmp = json.loads(str_tmp[14:-1])
    df_tmp = pd.read_json(json.dumps(json_tmp['photos']['photo']))
    return df_tmp['id']


# Build the data set: one metadata row per Explore photo, plus one event
# row per favourite/comment.
df_list = getFlickrExploreList()
df_info = getFlickrInfo(df_list)
df_event = getFlickrActivity(df_list)
df_info.tail()
df_event.tail()

# Histogram of the hour of day at which events (favs/comments) happen.
figsize(8, 6)
tt = pd.DatetimeIndex(df_event['time'])
hist(tt.hour, bins=24, histtype='stepfilled', alpha=0.7)

# Histogram of the hour of day at which the photos were posted.
figsize(8, 6)
tt = pd.DatetimeIndex(df_info['posted'])
hist(tt.hour, bins=24, histtype='stepfilled', alpha=0.7)

# Per-photo hourly event profile (red dots) and the average over all
# photos (line); x axis is hour of day.
grouped = df_event.groupby('photo_id')
figsize(15, 7)
b = arange(0, 24)
acc = zeros(len(b) - 1)
for k, grp in grouped:
    tt = pd.DatetimeIndex(grp['time'])
    h, hh = histogram(tt.hour, bins=b)
    acc += h
    plot(b[:-1], h, '.r', alpha=0.25, markersize=10)
acc /= len(grouped)
plot(b[:-1], acc)
gca().invert_yaxis()
gca().xaxis.tick_top()

# Same idea, but x axis is hours elapsed since the photo was posted.
figsize(15, 7)
b = arange(0, 48)
acc = zeros(len(b) - 1)
for k, grp in grouped:
    tt = grp['time'] - df_info.posted[df_info.photo_id == k].iloc[0]
    ttt = tt.astype('timedelta64[s]').astype(float) / 3600
    h, hh = histogram(ttt, bins=b)
    acc += h
    plot(b[:-1], h, '.r', alpha=0.25, markersize=10)
acc /= len(grouped)
plot(b[:-1], acc)
gca().invert_yaxis()
gca().xaxis.tick_top()