import numpy as np import matplotlib.pyplot as plt %pylab inline from MERhelpers import * matplotlib.pyplot.xkcd(scale=0.5, length=100, randomness=5) mainColour = '#2F4F4F'; secondColour1 = '#666666'; secondColour2 = '#7491A3'; dict_all,date_clicks,num_clicks = data_to_dict_clickdates_clickscount('Analytics_Winter2014.csv') wiki_courses = ['MATH100','MATH101','MATH102','MATH103','MATH104','MATH105','MATH110', 'MATH152','MATH200','MATH215','MATH220','MATH221','MATH257','MATH437'] clean_dict(dict_all,wiki_courses) courses = list_courses(dict_all) page_views = []; total_time = []; pages_per_visit = []; for course in courses: pv = 0; pgs = []; t = []; exams = list_exams(dict_all,course) for exam in exams: questions = list_questions(dict_all,course,exam) for q in questions: pv = pv + dict_all[course][exam][q]['pageviews'] pgs.append(dict_all[course][exam][q]['pages_per_visit']) t.append(dict_all[course][exam][q]['avg_time']) pages_per_visit.append(mean(pgs)) page_views.append(pv) total_time.append(mean(t)) y_pos = np.arange(len(courses)) total_time plt.figure(figsize(7,5)) plt.subplots_adjust(right = 2) plt.subplot(131) plt.barh(y_pos, page_views, align='center', alpha=0.9,color=mainColour) plt.yticks(y_pos, courses) plt.grid() plt.xlabel('number of page views') plt.title('Absolute page views by course') plt.subplot(132) plt.barh(y_pos, total_time, align='center', alpha=0.9,color=secondColour1) plt.yticks(y_pos, courses) plt.grid() plt.xlabel('time (s)') plt.title('Average question page viewing time by course') plt.subplot(133) plt.barh(y_pos, pages_per_visit, align='center', alpha=0.9,color=secondColour2) plt.yticks(y_pos, courses) plt.grid() plt.xlabel('number of pages per session') plt.title('Average number of page views per session by course') plt.show() def get_years(course_list): years = [] for course in course_list: years.append(int(course[4])) return years years = get_years(courses) x = page_views y = pages_per_visit area = total_time colors = years 
# Scatter plot of the per-course aggregates prepared above.
plt.figure(figsize=(6, 4))
c = plt.scatter(x, y, s=area, c=colors)
c.set_alpha(0.3)
plt.grid()
#for i, txt in enumerate(courses):
#    plt.annotate(txt, (x[i],y[i]),rotation=-5)
#plt.xlim([-15000,190000])
#plt.ylim([0,35])
plt.xlabel('Total page views')
plt.ylabel('Avg. pages per session')
plt.title('Page views vs. Pages/Session')
plt.show()


def _question_number(title):
    """Return the int formed by all digits in *title*, or None if there are none."""
    digits = ''.join(ch for ch in title if ch.isdigit())
    return int(digits) if digits else None


# Take two questions from the same exam and determine from their titles if
# they are the same question.
def isSameQuestion(q1, q2):
    """Return True when two question titles denote the same question.

    Titles match when the number formed by their digits is equal (so '01'
    and '1' agree) and their lowercase letters, in order, are identical.
    Titles without any digit are compared on their lowercase letters only
    instead of raising ValueError.
    """
    if _question_number(q1) != _question_number(q2):
        return False
    letters1 = ''.join(ch for ch in q1 if ch.islower())
    letters2 = ''.join(ch for ch in q2 if ch.islower())
    return letters1 == letters2


# Will return the combined data of two question pages.
## WARNING: THIS IS NOT FINISHED YET AND MORE WORK MAY BE NEEDED TO PROPERLY
## BLEND THE QUESTION DATA.
def combine_two_questions_data(q1, q2):
    """Merge the view statistics of two question records.

    q1, q2 -- dicts with at least the keys 'pageviews' and 'avg_time'.

    Returns (pgviews, avgtime): the summed page views and the average time
    weighted by page views.  Only these two metrics are blended; any other
    field of the records is ignored.  When neither record has any views the
    average time is reported as 0.0 rather than dividing by zero.
    """
    pgviews = q1['pageviews'] + q2['pageviews']
    if pgviews == 0:
        return pgviews, 0.0
    weighted_time = (q1['avg_time'] * q1['pageviews']
                     + q2['avg_time'] * q2['pageviews'])
    # float() keeps this a true division under Python 2 as well.
    avgtime = float(weighted_time) / pgviews
    return pgviews, avgtime


def clean_exam(my_dict, course, exam, min_views=30):
    """Clean the question records of one exam in place.

    1. Delete question pages with fewer than *min_views* page views.
    2. Merge pages whose titles denote the same question (see
       isSameQuestion) into the first of them: 'pageviews' and 'avg_time'
       are combined, the duplicate entry is deleted.

    my_dict   -- nested dict indexed as my_dict[course][exam][question]
    min_views -- minimum number of page views a page needs to be kept
    """
    exam_data = my_dict[course][exam]

    # Remove pages that have less than a certain number of views.  Iterate
    # over a copy so that deleting entries cannot disturb the iteration.
    for question in list(list_questions(my_dict, course, exam)):
        if exam_data[question]['pageviews'] < min_views:
            del exam_data[question]

    # Merge questions that are the same, but were labeled using a different
    # scheme previously.
    questions = list(list_questions(my_dict, course, exam))
    merged = set()
    for index, q1 in enumerate(questions):
        if q1 in merged:
            # Already folded into an earlier title and deleted; looking it
            # up again would raise KeyError.
            continue
        for q2 in questions[index + 1:]:
            if not isSameQuestion(q1, q2):
                continue
            pgv, avt = combine_two_questions_data(exam_data[q1], exam_data[q2])
            exam_data[q1]['pageviews'] = pgv
            exam_data[q1]['avg_time'] = avt
            del exam_data[q2]
            merged.add(q2)


def get_question_data_array(mydict, course, exam, question_num, min_views=30):
    """Collect the metrics of every page belonging to one question number.

    A page belongs to *question_num* when the digits of its title form that
    number; pages with fewer than *min_views* page views, or without any
    digit in the title, are skipped.

    Returns five parallel lists, one entry per matching page:
    (avg_time, visit_duration, pageviews, unique_pageviews, pages_per_visit).
    """
    fields = ('avg_time', 'visit_duration', 'pageviews',
              'unique_pageviews', 'pages_per_visit')
    columns = dict((field, []) for field in fields)
    for question in list_questions(mydict, course, exam):
        record = mydict[course][exam][question]
        if record['pageviews'] < min_views:
            continue
        if _question_number(question) != question_num:
            continue
        for field in fields:
            columns[field].append(record[field])
    return tuple(columns[field] for field in fields)


def get_question_data_array_for_exam(mydict, course, exam, min_views=30):
    """Collect per-question metrics for a whole exam.

    Question numbers are probed upwards from 1; collection stops at the
    first number that has no page with at least *min_views* views.
    NOTE: questions numbered after such a gap are therefore not returned.

    Returns five lists (avg_time, visit_duration, pageviews,
    unique_pageviews, pages_per_visit); entry i of each list is itself the
    list of values for the pages of question i + 1.
    """
    per_field = ([], [], [], [], [])
    count = 1
    while True:
        data = get_question_data_array(mydict, course, exam, count, min_views)
        if not data[0]:
            break
        for bucket, values in zip(per_field, data):
            bucket.append(values)
        count = count + 1
    return per_field


def plot_question_info(course, exam):
    """Draw a variable-width bar chart of one exam into the current figure.

    Each page is one bar: its width is the page's average viewing time and
    its height the page's view count.  Bars are laid end to end along the
    x axis, pages of the same question share a colour, and one tick per
    question is placed at the centre of that question's bars.  The data is
    read from the module-level dict_all.
    """
    avt, vdr, pvs, upv, ppv = get_question_data_array_for_exam(
        dict_all, course, exam)
    palette = [mainColour, secondColour1, secondColour2]
    pos_final = 0  # x coordinate where the next question starts
    xt = []        # tick position (centre) of each question
    for count, (pv, at) in enumerate(zip(pvs, avt)):
        question_width = sum(at)
        # Left edge of each bar: running total of the preceding widths.
        ends = np.cumsum(at)
        lefts = [pos_final] + [pos_final + end for end in ends[:-1]]
        # align='edge' because the x values are left edges (matplotlib >= 2.0
        # centres bars by default); the palette repeats for long exams.
        plt.bar(lefts, pv, at, align='edge',
                color=palette[count % len(palette)], alpha=0.9)
        xt.append(pos_final + question_width / 2.0)
        pos_final = pos_final + question_width
    # Exactly one label per tick.
    plt.xticks(xt, ['Q' + str(number) for number in range(1, len(xt) + 1)])
    if pos_final > 0:
        plt.xlim([0, pos_final])
    plt.title(course + ', ' + exam + ' (average total viewing time = '
              + str(round(float(pos_final) / 60, 1)) + ' mins)')
    plt.show()


# One figure per (course, exam) pair; the exam is cleaned before plotting.
course_list = ['MATH101', 'MATH103', 'MATH105', 'MATH257', 'MATH220']
exam_list = ['April_2012', 'April_2012', 'April_2012', 'December_2011',
             'April_2011']
for cour, exam in zip(course_list, exam_list):
    plt.figure(figsize=(13, 4))
    clean_exam(dict_all, cour, exam)
    plot_question_info(cour, exam)