#!/usr/bin/env python
# coding: utf-8

# # Identifying Modularization Options based on Code Changes

# ## Part 1: Analysing existing Modularization

# ### Stating Question
# _"How suitable is the domain modularization for the actual development activities?"_

# ### Idea
# Heuristic: "Are changes in one module cohesive?"
# * Changes => commits from version control system
# * Module => Part of file path
#

# ### Data import and preparation

# #### Import Git log data

# In[1]:


import pandas as pd

# Only the commit id and the touched file path are needed for the analysis.
git_log = pd.read_csv("../datasets/git_log_numstat_dropover.csv")[['sha', 'file']]
git_log.head()


# #### Just keep Java production code

# In[2]:


# The path fragment is a literal, so regex matching is switched off explicitly.
prod_code = git_log[git_log["file"].str.contains("src/main/java", regex=False)]
prod_code = prod_code[~prod_code["file"].str.endswith("package-info.java")]
# Copy after filtering so the marker column added below lands in an
# independent frame and only the rows that are kept get duplicated.
prod_code = prod_code.copy()
prod_code.head()


# ### Analysis

# #### Set marker for Commit

# In[3]:


# Every remaining row is one "file was touched by commit" occurrence.
prod_code['commit'] = 1
prod_code.head()


# #### Rotate the data table ("pivoting")

# In[4]:


# One row per file, one column per commit sha; cells without a matching
# (file, sha) row are filled with 0.
commit_matrix = prod_code.reset_index().pivot_table(
    index='file',
    columns='sha',
    values='commit',
    fill_value=0)
commit_matrix.iloc[0:5, 50:55]


# #### Calculate distances between vectors

# In[5]:


from sklearn.metrics.pairwise import cosine_distances

# Pairwise cosine distance between the per-file commit vectors (rows).
dissimilarity_matrix = cosine_distances(commit_matrix)
dissimilarity_matrix[:5, :5]


# #### (Pretty print results)

# In[6]:


# Label both axes of the distance matrix with the file paths.
dissimilarity_df = pd.DataFrame(
    dissimilarity_matrix,
    index=commit_matrix.index,
    columns=commit_matrix.index)
dissimilarity_df.iloc[:5, :2]


# ### Interpretation

# #### Reduce dimensions

# In[7]:


from sklearn.manifold import MDS

# uses a fixed seed for random_state for reproducibility
model = MDS(dissimilarity='precomputed', random_state=0)
dissimilarity_2d = model.fit_transform(dissimilarity_df)
dissimilarity_2d[:5]


# #### (Pretty print results)

# In[8]:


dissimilarity_2d_df = pd.DataFrame(
    dissimilarity_2d,
    index=commit_matrix.index,
    columns=["x", "y"])
dissimilarity_2d_df.head()


# #### Extract module information

# In[9]:


# The module name is taken from the 7th path segment (index 6) of each file.
# NOTE(review): this hard-codes the directory depth of the analysed
# repository -- confirm against the actual path layout.
dissimilarity_2d_df['module'] = dissimilarity_2d_df.index.str.split("/").str[6].values
dissimilarity_2d_df.head()


# ### Visualization

# #### Create an interactive graphic

# In[10]:


from ausi import pygal

xy = pygal.create_xy_chart(dissimilarity_2d_df, "module")
xy.render_in_browser()


# ## Part 2: Alternative modularization

# ### Stating Question
#
# _Does an alternative modularization based on the change behavior exist?_

# ### Idea
#
# Cluster source code files according to their distances

# In[11]:


commit_matrix.head()


# ### Modeling

# #### Cluster distance matrix

# In[12]:


from sklearn.cluster import AgglomerativeClustering

# NOTE(review): the heading says "distance matrix", but the model is fitted on
# the raw commit matrix with the estimator's default settings, not on the
# cosine distances computed in Part 1 -- confirm this is intended.
clustering = AgglomerativeClustering()
model = clustering.fit(commit_matrix)
model


# ### Visualization

# #### Plot dendrogram of clustering results

# In[13]:


from ausi.scipy import plot_dendrogram

plot_dendrogram(model, labels=commit_matrix.index)


# ## End