import pandas as pd import json import itertools import collections import re import arrow rcParams['figure.figsize'] = (15, 3) pd.set_option('display.mpl_style', 'default') !git2json --git-dir=/home/tavish/code/ipython/.git > ipython-log.json log = json.load(open('ipython-log.json')) print log[0] print "Number of commits = ", len(log) file_changes = lambda: itertools.chain.from_iterable( [change[2] for change in commit['changes'] if re.match(r'^.*\.py$', change[2])] for commit in log ) rcParams['figure.figsize'] = (15, 6) fchanges = file_changes() fchange_count = collections.Counter(fchanges) a = average(fchange_count.values()) most_common = fchange_count.most_common(20) df = pd.DataFrame(most_common) df.head() df.index = df[0] df = df[[1]] df.head() p = df.plot(kind='bar', legend=False) p.set_title('Most-changed Files in IPython') p.set_ylabel('Commits') # Draw a red line at the average hlines(a, 0, len(df), colors='r') def weekly_date_resolution(ts): ar = arrow.arrow(ts) day_of_month = ar.timetuple().tm_mday week = int(day_of_month) / 7 new_day = (week*7)+1 assert new_day > 0 assert new_day < 30 try: day_adjusted = ar.replace(day=new_day) except ValueError: new_day = day_of_month # just keep the original day_adjusted = ar.replace(day=new_day) return day_adjusted.date() commit_times = lambda: ( (weekly_date_resolution(commit['committer']['date']), commit['commit']) for commit in log ) dfct = pd.DataFrame(list(commit_times()), columns=['date', 'id']) dfct = dfct.groupby('date').aggregate(len) dfct.head() p = dfct.plot(legend=False) p.set_title('Weekly commits on IPython') p.set_ylabel('Commits') def is_pull_request_merge(message): '''Match commit messages that start with "Merge pull request #"''' return re.match(r'^Merge pull request #', message) def pull_request_number(message): '''Extract the PR # from the commit message.''' return int(re.match(r'^Merge pull request #(\d+)', message).groups()[0]) # Create a table for the merge commits. 
merge_commits = [
    [
        commit['parents'][1],  # the second parent seems to be the source branch
        commit['committer']['date'],
        pull_request_number(commit['message']),
    ]
    for commit in log
    if is_pull_request_merge(commit['message'])
    # Skip commits that carry a merge message but have fewer than two
    # parents; indexing parents[1] would raise IndexError for them.
    and len(commit['parents']) > 1
]

# Ordered list of the merged-in parent ids, used as the DataFrame index.
merge_commit_parent_ids = [mc[0] for mc in merge_commits]
# Also create a set of merge commits for use in the next cell, so each
# membership test is a hash lookup instead of a scan of the whole list.
merge_commit_parent_id_set = set(merge_commit_parent_ids)

merge_df = pd.DataFrame(
    merge_commits,
    columns=['merge_commit', 'merge_timestamp', 'pr'],
    index=merge_commit_parent_ids,
)
merge_df.head()

# Create a table for pull request commits.
pr_commits = [
    [
        commit['commit'],
        commit['committer']['date'],
        len(commit['changes']),  # let's sneak in another basic churn metric
    ]
    for commit in log
    # Do we have a merge commit for this commit?
    if commit['commit'] in merge_commit_parent_id_set
]
commit_ids = [prc[0] for prc in pr_commits]

pr_df = pd.DataFrame(
    pr_commits,
    columns=['commit', 'commit_timestamp', 'churn'],
    index=commit_ids,
)
pr_df.head()

# Both frames are indexed by commit id, so join() lines each PR commit up
# with the merge that brought it in.
both_df = pr_df.join(merge_df)
both_df.head()

rcParams['figure.figsize'] = (5, 10)

# Time between the PR commit and its merge. The column names below treat the
# difference as a number of seconds.
both_df['delta'] = both_df['merge_timestamp'] - both_df['commit_timestamp']
delta = both_df['delta']

both_df['delta_weeks'] = delta/60.0/60.0/24.0/7.0
both_df['delta_days'] = delta/60.0/60.0/24.0
both_df['delta_hours'] = delta/60.0/60.0
both_df['delta_mins'] = delta/60.0
both_df['delta_secs'] = delta
both_df.head()

rcParams['figure.figsize'] = (15, 3)
both_df[['delta_days']].boxplot(vert=False)

# Same plot restricted to pull requests merged in under 30 days.
rcParams['figure.figsize'] = (15, 3)
both_df[both_df['delta_days'] < 30][['delta_days']].boxplot(vert=False)

# The data is skewed, so I'll use the Spearman correlation
both_df[['churn', 'delta']].corr(method='spearman')