#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# NOTE: get_ipython() is only defined when running under IPython/Jupyter.
# Fetch the tutorial material, which ships the requirements.txt installed below.
get_ipython().system('git clone https://github.com/gotec/git2net-tutorials')
import os
# Step into the checkout so the relative path to requirements.txt resolves,
# install the listed packages, then return to the original working directory.
os.chdir('git2net-tutorials')
get_ipython().system('pip install -r requirements.txt')
os.chdir('..')
# Clone git2net itself into the folder 'git2net4analysis'.
get_ipython().system('git clone https://github.com/gotec/git2net git2net4analysis')


# In[ ]:


import git2net
import os
from collections import defaultdict
import pandas as pd
import pathpy as pp


# ## Network Analysis and Visualisation
# 
# This tutorial discusses the functions of `git2net` that allow us to generate various network projections of a git repository.
# However, before we do so, let's mine the repository and disambiguate the author aliases again to have a clean starting point to work with.
# For the sake of exposition, we start with a database that only includes commits that touched five or fewer files.

# In[ ]:


# Folder holding the clone of git2net's repository (created by the `git clone`
# command at the top of this file).
git_repo_dir = 'git2net4analysis'

# Here, we specify the database in which we will store the results of the mining process.
sqlite_db_file = 'git2net4analysis.db'

# Start from a clean database: delete any file left over from a previous run.
# Attempting the removal directly, rather than testing for existence first,
# avoids the window between the check and the removal.
try:
    os.remove(sqlite_db_file)
except FileNotFoundError:
    pass  # nothing to clean up

# Mine the repository with `max_modifications=5` (the restriction to commits
# touching five or fewer files described above), then merge author aliases.
git2net.mine_git_repo(git_repo_dir, sqlite_db_file, max_modifications=5)
git2net.disambiguate_aliases_db(sqlite_db_file)


# Great, with these few lines, we're already all set.
# 
# ## Network projections
# 
# We now provide a brief overview of all network projections included in `git2net`.
# 
# ### Co-editing networks
# 
# To start our exploration, let's try to obtain a co-editing network for our project.
# We can do this by simply calling the `get_coediting_network()` function and providing the database we just mined.
# 
# Note that by default, all network visualisations use the `author_id` that we obtained from the author disambiguation in the previous tutorial to plot the networks.
# However, if you want to use the `author_name` or `author_email` instead, you can provide it as an optional argument, e.g., `author_identifier='author_email'`.

# In[ ]:


# Build the co-editing network from the mined database. The call returns the
# network together with two dictionaries of node and edge properties.
t, node_info, edge_info = git2net.get_coediting_network(sqlite_db_file)
# Print the textual summary, then leave `t` as the cell's last expression so
# the notebook renders it.
print(t)
t


# The function returns a `pathpy` temporal network object and two dictionaries containing properties of nodes and edges.
# As of writing this tutorial, not all of them are used. 
# However, they are set as placeholders for future versions of `git2net`.
# 
# As shown above, a `pathpy` temporal network object can be visualised by itself.
# In addition, we can also aggregate the network by dropping the order of events, yielding a standard network object.
# Let's do this next.

# In[ ]:


# Aggregate the temporal network `t` into a standard network; as the cell's
# last expression, the result is rendered by the notebook.
pp.Network.from_temporal_network(t)


# In both the temporal and aggregated network, a node represents an author, whereas edges point from the person changing a line of code to the person who was the original author.
# 
# ### Bipartite author-file networks
# 
# Next, we could ask the question of which files different authors collaborated on. Therefore, we can plot a bipartite network containing both files and authors as nodes.

# In[ ]:


# Extract the bipartite author-file network and aggregate it into a standard
# network `n`, which is displayed as the cell's last expression.
# Note: this rebinds `t`, `node_info` and `edge_info` from the previous cells.
t, node_info, edge_info = git2net.get_bipartite_network(sqlite_db_file)
n = pp.Network.from_temporal_network(t)
n


# For this network, `node_info` contains the class of each node in the network, i.e. whether it is an author or a file. These can e.g. be used to colour nodes as shown below.

# In[ ]:


# Hex colour assigned to each node class.
colour_map = dict(author='#73D2DE', file='#2E5EAA')

# Look up the class of every node in the network and translate it to a colour.
node_classes = node_info['class']
node_color = {}
for node in n.nodes:
    node_color[node] = colour_map[node_classes[node]]

pp.visualisation.plot(n, node_color=node_color)


# If we are interested in, e.g. more recently edited files, we can filter the database by providing the `time_from` and `time_to` options. Let's check the files edited since May 2019.

# In[ ]:


from datetime import datetime
# Lower bound of the time filter: 1 May 2019.
time_from = datetime(2019, 5, 1)
# Same bipartite extraction as before, restricted via `time_from`.
t, node_info, edge_info = git2net.get_bipartite_network(sqlite_db_file, time_from=time_from)
n = pp.Network.from_temporal_network(t)
# Colour each node according to its class ('author' or 'file') from `node_info`.
colour_map = {'author': '#73D2DE', 'file': '#2E5EAA'}
node_color = {node: colour_map[node_info['class'][node]] for node in n.nodes}
pp.visualisation.plot(n, node_color=node_color)


# ### Co-authorship networks
# 
# The projection of this network that links authors editing the same file is the co-authorship network.

# In[ ]:


# Extract the co-authorship network. The result is bound directly to `n`
# without an aggregation step and displayed as the cell's last expression.
n, node_info, edge_info = git2net.get_coauthorship_network(sqlite_db_file)
n


# Note that it looks similar to the bipartite network above because, at its core, the co-authorship network is a projection of the bipartite network onto the authors.

# ### Line editing paths
# 
# `git2net` allows the extraction of editing paths on the level of individual lines. I.e. we can track consecutive changes made to a single line over time&mdash;even if these lines move within a file or even across files. This is very powerful, as it allows us to determine editing sequences and find lines that require more editing than others.
# These could either be lines that are tough to implement, or they could contain essential information, such as the version number in an `__init__.py` file.
# 
# To extract these paths, we can use the `get_line_editing_paths` function. As these networks tend to be very large, we limit the analysis to a small file for this tutorial.
# To only look at a specific set of file paths, we can use the `file_paths` option.

# In[ ]:


# Restrict the extraction to this list of file paths (here a single file).
file_paths = ['setup.py']
# The call needs both the database and the repository folder; it returns the
# paths object, the DAG, and the node/edge property dictionaries.
paths, dag, node_info, edge_info = git2net.get_line_editing_paths(sqlite_db_file, git_repo_dir,
                                                                  file_paths=file_paths)
# Plot the DAG using the node colours supplied in `node_info['colors']`.
pp.visualisation.plot(dag, node_color=node_info['colors'])


# Notice that despite only looking at a single file, the network shown above is not connected. This is due to our database not being complete: we initially mined the repository with `max_modifications=5`, so some commits are missing. Let's fix this now and try again.

# In[ ]:


# Mine again into the same database, this time without the `max_modifications`
# limit used in the first mining run.
# NOTE(review): presumably git2net reuses the existing database and only adds
# the commits that are still missing - confirm against the git2net docs.
git2net.mine_git_repo(git_repo_dir, sqlite_db_file)


# As we now have more commits in our database, we also need to rerun the disambiguation.

# In[ ]:


# Re-run the alias disambiguation on the database after the additional mining run.
git2net.disambiguate_aliases_db(sqlite_db_file)


# Let's colour the nodes by type and look at the visualisation again.

# In[ ]:


# Extract the line editing paths for the same `file_paths` again, now that the
# database has been mined a second time.
paths, dag, node_info, edge_info = git2net.get_line_editing_paths(sqlite_db_file, git_repo_dir,
                                                                  file_paths=file_paths)

# Colour the nodes by their position in the DAG. Every node starts with the
# default colour; roots and then leafs overwrite it, so a node that is both a
# root and a leaf ends up with the leaf colour.
colors = dict.fromkeys(dag.nodes, '#FBB13C')
colors.update(dict.fromkeys(dag.roots, '#A83236'))
# Fixed: the leaf colour was '#21830', a five-digit string that is not a valid
# hex colour. '#218380' matches the palette of the other colours in this file.
colors.update(dict.fromkeys(dag.leafs, '#218380'))

pp.visualisation.plot(dag, node_color=colors, width=1000, height=1000)


# As mentioned before, these networks get very large very quickly. Therefore, it is often more helpful to work with the `pathpy` path object that is also returned by the function.
# It contains all paths and subpaths contained in the network shown above. More information regarding this object can be found in the documentation on [pathpy.net](http://www.pathpy.net/).

# ### Commit editing paths
# 
# Finally, let's look at a projection in which nodes represent commits and a directed edge $c_1 \rightarrow c_2$ between two commits $c_1, c_2$ exists if commit $c_2$ modified lines written in $c_1$.

# In[ ]:


# Extract the commit editing DAG from the database. This rebinds `dag`,
# `node_info` and `edge_info` from the line editing cell above.
dag, node_info, edge_info = git2net.get_commit_editing_dag(sqlite_db_file)

# Last expression of the cell: the notebook renders the DAG.
dag


# ## Comparison of co-editing and co-authorship networks
# 
# In our [original publication](https://arxiv.org/abs/1903.10180) of `git2net`, we compared the co-editing and co-authorship networks of an Open Source and proprietary software development project.
# Let's see how a similar comparison would work for the repository behind `git2net`.

# In[ ]:


# Get the co-authorship network
n_coauthorship, node_info, edge_info = git2net.get_coauthorship_network(sqlite_db_file)

# Get the (aggregated) co-editing network
# Note: `node_info` and `edge_info` are overwritten by this second call; only
# the two network objects are used in the cells that follow.
# `n_coediting_t` keeps the temporal network for the rolling-window analysis below.
n_coediting_t, node_info, edge_info = git2net.get_coediting_network(sqlite_db_file)
n_coediting = pp.Network.from_temporal_network(n_coediting_t)


# In[ ]:


# Print the summaries of both networks between two separator rules. The items
# are joined with newlines, so the output matches one print call per item
# (the opening rule carries its own trailing newline, giving a blank line).
print(
    '==============================================\n',
    '# co-authorship',
    n_coauthorship,
    '# co-editing',
    n_coediting,
    '==============================================',
    sep='\n',
)


# We find that both networks have seven nodes representing the seven individuals that worked on `git2net`.
# In the co-authorship network, the nodes are connected by eight undirected links, each indicating that two people edited the same file.
# In the co-editing network, we have nine directed links that show who edited lines from whom.
# 
# Finally, let's see how the co-editing network evolves over time!
# For this example, we will use a rolling time window spanning 365 days, which we then shift in 30-day increments.
# We can use the `RollingTimeWindow()` function in `pathpy` to compute the networks over time.
# Then, we only need to add the statistics of interest to a dictionary.

# In[ ]:


# Window length (365 days) and shift (30 days), both expressed in seconds.
WINDOW_SIZE = 365 * 24 * 60 * 60
STEP_SIZE = 30 * 24 * 60 * 60

# One list per measure; each rolling window contributes one entry to every list.
data = defaultdict(list)
for network, window in pp.RollingTimeWindow(
    n_coediting_t,
    window_size=WINDOW_SIZE,
    step_size=STEP_SIZE,
    directed=True,
    return_window=True,
):
    # Measures of the network observed within the current window.
    snapshot = {
        'number of developers': network.ncount(),
        'unique relations directed': network.ecount(),
        'mean outdegree': pp.algorithms.statistics.mean_degree(network, degree='outdegree'),
        'time': window[1],  # window end time
    }
    for measure, value in snapshot.items():
        data[measure].append(value)


# Now that we have the data let's plot them and see what we get!

# In[ ]:


# Turn the collected measures into a table with one column per measure.
df = pd.DataFrame(data, columns=list(data))

# Use the window end times, interpreted as seconds, as a datetime index and
# drop the now redundant 'time' column.
timestamps = pd.to_datetime(df.time, unit='s')
df = df.set_index(timestamps).drop('time', axis=1)

# One figure per time-variable network measure.
df.plot(y='number of developers')
df.plot(y='unique relations directed')
df.plot(y='mean outdegree')


# With this, we conclude this part of the tutorial.
# The following part will look at the remaining information mined by `git2net` and see how we can efficiently handle the resulting SQLite database.