#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# Fetch the tutorial material, install its requirements, and clone the
# repository that is analysed throughout this tutorial.
# NOTE: `get_ipython()` only exists inside an IPython/Jupyter session.
get_ipython().system('git clone https://github.com/gotec/git2net-tutorials')
import os
os.chdir('git2net-tutorials')
get_ipython().system('pip install -r requirements.txt')
os.chdir('..')
get_ipython().system('git clone https://github.com/gotec/git2net git2net4analysis')


# In[ ]:


import git2net
import os
from collections import defaultdict
import pandas as pd
import pathpy as pp


# ## Network Analysis and Visualisation
#
# This tutorial discusses the functions of `git2net` that allow us to generate various network projections of a git repository.
# However, before we do so, let's mine the repository and disambiguate the author aliases again to have a clean starting point to work with.
# For the sake of exposition, we start with a repository only including commits that touched five or fewer files.

# In[ ]:


# We assume a clone of git2net's repository exists in the folder below following the first tutorial.
git_repo_dir = 'git2net4analysis'

# Here, we specify the database in which we will store the results of the mining process.
sqlite_db_file = 'git2net4analysis.db'

# Remove the database if it already exists so that we start from a clean state.
if os.path.exists(sqlite_db_file):
    os.remove(sqlite_db_file)

git2net.mine_git_repo(git_repo_dir, sqlite_db_file, max_modifications=5)
git2net.disambiguate_aliases_db(sqlite_db_file)


# Great, with these few lines, we're already all set.
#
# ## Network projections
#
# We now provide a brief overview of all network projections included in `git2net`.
#
# ### Co-editing networks
#
# To start our exploration, let's try to obtain a co-editing network for our project.
# We can do this by simply calling the `get_coediting_network()` function and providing the database we just mined.
#
# Note that by default, all network visualisations use the `author_id` that we obtained from the author disambiguation in the previous tutorial to plot the networks.
# However, if you want to use the `author_name` or `author_email` instead, you can provide it as an optional argument, e.g., `author_identifier='author_email'`.

# In[ ]:


t, node_info, edge_info = git2net.get_coediting_network(sqlite_db_file)
print(t)
t


# The function returns a `pathpy` temporal network object and two dictionaries containing properties of nodes and edges.
# As of writing this tutorial, not all of them are used.
# However, they are set as placeholders for future versions of `git2net`.
#
# As shown above, a `pathpy` temporal network object can be visualised by itself.
# In addition, we can also aggregate the network by dropping the order of events, yielding a standard network object.
# Let's do this next.

# In[ ]:


pp.Network.from_temporal_network(t)


# In both the temporal and aggregated network, a node represents an author, whereas edges point from the person changing a line of code to the person who was the original author.
#
# ### Bipartite author-file networks
#
# Next, we could ask the question of which files different authors collaborated on. Therefore, we can plot a bipartite network containing both files and authors as nodes.

# In[ ]:


t, node_info, edge_info = git2net.get_bipartite_network(sqlite_db_file)
n = pp.Network.from_temporal_network(t)
n


# For this network, `node_info` contains the class of each node in the network (author or file). These can e.g. be used to colour nodes as shown below.

# In[ ]:


colour_map = {'author': '#73D2DE', 'file': '#2E5EAA'}
node_color = {node: colour_map[node_info['class'][node]] for node in n.nodes}
pp.visualisation.plot(n, node_color=node_color)


# If we are interested in, e.g., more recently edited files, we can filter the database by providing the `time_from` and `time_to` options. Let's check the files edited since May 2019.

# In[ ]:


from datetime import datetime
time_from = datetime(2019, 5, 1)
t, node_info, edge_info = git2net.get_bipartite_network(sqlite_db_file, time_from=time_from)
n = pp.Network.from_temporal_network(t)

colour_map = {'author': '#73D2DE', 'file': '#2E5EAA'}
node_color = {node: colour_map[node_info['class'][node]] for node in n.nodes}
pp.visualisation.plot(n, node_color=node_color)


# ### Co-authorship networks
#
# The projection of this network that links authors editing the same file is the co-authorship network.

# In[ ]:


n, node_info, edge_info = git2net.get_coauthorship_network(sqlite_db_file)
n


# Note that it looks similar because, at its core, the co-authorship network is a projection of the bipartite network to authors.

# ### Line editing paths
#
# `git2net` allows the extraction of editing paths on the level of individual lines. I.e. we can track consecutive changes made to a single line over time—even if these lines move within a file or even across files. This is very powerful, as it allows us to determine editing sequences and find lines that require more editing than others.
# These could either be lines that are tough to implement, or they could contain essential information, such as the version number in an `__init__.py` file.
#
# To extract these paths, we can use the `get_line_editing_paths` function. As these networks tend to be very large, we limit the analysis to a small file for this tutorial.
# To only look at a specific set of file paths, we can use the `file_paths` option.

# In[ ]:


file_paths = ['setup.py']
paths, dag, node_info, edge_info = git2net.get_line_editing_paths(sqlite_db_file, git_repo_dir,
                                                                  file_paths=file_paths)
pp.visualisation.plot(dag, node_color=node_info['colors'])


# Notice that despite only looking at a single file, the network shown above is not connected. This is due to our database not being complete. Let's fix this now and try again.

# In[ ]:


git2net.mine_git_repo(git_repo_dir, sqlite_db_file)


# As we now have more commits in our database, we also need to rerun the disambiguation.

# In[ ]:


git2net.disambiguate_aliases_db(sqlite_db_file)


# Let's colour the nodes by type and look at the visualisation again.

# In[ ]:


paths, dag, node_info, edge_info = git2net.get_line_editing_paths(sqlite_db_file, git_repo_dir,
                                                                  file_paths=file_paths)

# The loops run in this order on purpose: every node first receives the
# default colour, which is then overridden for roots and finally for leafs.
colors = {}
for x in dag.nodes:
    colors[x] = '#FBB13C'
for x in dag.roots:
    colors[x] = '#A83236'
for x in dag.leafs:
    # Must be a full six-digit hex value; a five-digit string is not a valid colour.
    colors[x] = '#218380'

pp.visualisation.plot(dag, node_color=colors, width=1000, height=1000)


# As mentioned before, these networks get very large very quickly. Therefore, it is often more helpful to work with the `pathpy` path object that is also returned by the function.
# It contains all paths and subpaths contained in the network shown above. More information regarding this object can be found in the documentation on [pathpy.net](http://www.pathpy.net/).

# ### Commit editing paths
#
# Finally, let's look at a projection in which nodes represent commits and a directed edge $c_1 \rightarrow c_2$ between two commits $c_1, c_2$ exists if commit $c_2$ modified lines written in $c_1$.

# In[ ]:


dag, node_info, edge_info = git2net.get_commit_editing_dag(sqlite_db_file)
dag


# ## Comparison of co-editing and co-authorship networks
#
# In our [original publication](https://arxiv.org/abs/1903.10180) of `git2net`, we compared the co-editing and co-authorship networks of an Open Source and proprietary software development project.
# Let's see how a similar comparison would work for the repository behind `git2net`.

# In[ ]:


# Get the co-authorship network
n_coauthorship, node_info, edge_info = git2net.get_coauthorship_network(sqlite_db_file)

# Get the (aggregated) co-editing network
n_coediting_t, node_info, edge_info = git2net.get_coediting_network(sqlite_db_file)
n_coediting = pp.Network.from_temporal_network(n_coediting_t)


# In[ ]:


print('==============================================\n')
print('# co-authorship')
print(n_coauthorship)
print('# co-editing')
print(n_coediting)
print('==============================================')


# We find that both networks have seven nodes representing the seven individuals that worked on `git2net`.
# In the co-authorship network, the nodes are connected by eight undirected links indicating the two people edited the same file.
# In the co-editing network, we have nine directed links that show who edited lines from whom.
#
# Finally, let's see how the co-editing network evolves over time!
# For this example, we will use a rolling time window spanning 365 days, which we then shift in 30-day increments.
# We can use `RollingTimeWindow` in `pathpy` to compute the networks over time.
# Then, we only need to add the statistics of interest to a dictionary.

# In[ ]:


# Both sizes are expressed in seconds.
WINDOW_SIZE = 365*24*60*60
STEP_SIZE = 30*24*60*60

data = defaultdict(list)
for network, window in pp.RollingTimeWindow(n_coediting_t, window_size=WINDOW_SIZE,
                                            step_size=STEP_SIZE, directed=True,
                                            return_window=True):
    data['number of developers'].append(network.ncount())
    data['unique relations directed'].append(network.ecount())
    data['mean outdegree'].append(pp.algorithms.statistics.mean_degree(network, degree='outdegree'))
    data['time'].append(window[1])  # append window end time


# Now that we have the data, let's plot them and see what we get!

# In[ ]:


# Plot time-variable network measures
df = pd.DataFrame(data, columns=list(data.keys()))
# Index the frame by the window end time, converted from seconds to datetimes.
df.set_index(pd.to_datetime(df.time, unit='s'), inplace=True)
df.drop('time', axis=1, inplace=True)

df.plot(y='number of developers')
df.plot(y='unique relations directed')
df.plot(y='mean outdegree')


# With this, we conclude this part of the tutorial.
# The following part will look at the remaining information mined by `git2net` and see how we can efficiently handle the resulting SQLite database.