#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# NOTE: `get_ipython()` is only defined when this file is executed by
# IPython/Jupyter; the three shell commands below clone the tutorial material,
# install its requirements, and clone the repository that is analysed later.
get_ipython().system('git clone https://github.com/gotec/git2net-tutorials')
import os
os.chdir('git2net-tutorials')
get_ipython().system('pip install -r requirements.txt')
os.chdir('..')
get_ipython().system('git clone https://github.com/gotec/git2net git2net4analysis')


# In[ ]:


import git2net
import os
import pandas as pd
import sqlite3
from collections import Counter
from contextlib import closing


# # Author disambiguation
#
# In the previous tutorial we discussed the options of `git2net`'s `mine_git_repo()` function.
# Let's now call this function with its default options and have a closer look at the results we get.

# In[ ]:


# We assume a clone of git2net's repository exists in the folder below following the first tutorial.
git_repo_dir = 'git2net4analysis'

# Here, we specify the database in which we will store the results of the mining process.
sqlite_db_file = 'git2net4analysis.db'

# Remove database if exists
if os.path.exists(sqlite_db_file):
    os.remove(sqlite_db_file)

git2net.mine_git_repo(git_repo_dir, sqlite_db_file)


# Specifically, we are interested in who worked on the project.
# Using the `commits` table in the resulting database, we can find this out.

# In[ ]:


# `sqlite3.connect()` used directly as a context manager only commits or rolls
# back the transaction; `closing()` makes sure the connection itself is closed.
with closing(sqlite3.connect(sqlite_db_file)) as con:
    authors = pd.read_sql("""SELECT author_name, author_email FROM commits""", con)

# Count how many commits were made under each name-email combination.
# `itertuples()` gives attribute access per row without building a Series for
# every row as `iterrows()` does.
Counter(['{}, <{}>'.format(row.author_name, row.author_email)
         for row in authors.itertuples(index=False)])


# The results show that both *Ingo Scholtes* and *Christoph Gote* made commits with multiple different name-email combinations, i.e., using multiple different aliases.
# Let's try to understand why this could be an issue based on an example from one of our [recent publications](https://arxiv.org/abs/2201.04588).
# In this project, we use `git2net` to study the relationship between team size and productivity using over 200 repositories from GitHub.
# If we used the identities above, we would overestimate the team size (assuming the team size is the number of name-email combinations).
# Simultaneously, we would underestimate the productivity of individual team members (assuming the productivity is the number of commits).
# As a consequence, we would potentially come to vastly different conclusions.
# Therefore, we need a method to disambiguate which aliases belong to the same author.
#
# One might assume from the results above that we could simply use the names instead of name-email information.
# However, similar challenges can arise here too.
# Just think of middle names or apostrophes/umlauts, which might be stored differently on one computer compared to another.
# To deal with these challenges, `git2net` uses the Open Source name disambiguation tool `gambit` that we have developed specifically with the application in `git2net` in mind.
# We provide links to its [development repository](https://github.com/gotec/gambit) and the [original publication](https://arxiv.org/abs/2103.05666) for those of you who are interested.
#
# Let's now apply `gambit` to the database from above and look at the results.
# To do so, we can simply call the function `disambiguate_aliases_db()` on the `sqlite_db_file` resulting from `mine_git_repo()`.

# In[ ]:


from contextlib import closing

git2net.disambiguate_aliases_db(sqlite_db_file)

# `sqlite3.connect()` used directly as a context manager only commits or rolls
# back the transaction; `closing()` makes sure the connection itself is closed.
with closing(sqlite3.connect(sqlite_db_file)) as con:
    authors = pd.read_sql("""SELECT author_name, author_email, author_id FROM commits""", con)

# Count commits per (author_id, name, email) so that aliases mapped to the
# same `author_id` can be compared side by side.
Counter(['{} --- {}, <{}>'.format(row.author_id, row.author_name, row.author_email)
         for row in authors.itertuples(index=False)])


# We find that—as we were hoping for—`gambit` assigns both aliases of Ingo Scholtes and Christoph Gote the same `author_id`s, respectively.
# Hence, we can use `author_id` in our subsequent analyses and visualisations as a unique identifier for the different authors active in the repository.
# With this, we conclude our tutorial on author disambiguation.
# We have now covered the cloning of repositories and the different options you can select during mining.
# Finally, we showed how to disambiguate author identities, providing clean data for our subsequent analyses.
# In the following tutorial, it is finally time to look at the `net`work part in `git2net`!