#!/usr/bin/env python
# coding: utf-8

# In[ ]:

import os

# Fetch the tutorial material, install its requirements from inside the
# checkout, and step back up to the parent directory afterwards.
_shell = get_ipython().system
_shell('git clone https://github.com/gotec/git2net-tutorials')
os.chdir('git2net-tutorials')
_shell('pip install -r requirements.txt')
os.chdir('..')


# # Cloning a git repository for analysis with `git2net`
#
# In this notebook, we go through the first steps necessary to analyse your repository. Specifically, we show how to create a local copy (a clone) of an existing git repository. In sections 1 and 2 of this tutorial, we will solely consider the default branch (typically the `main` or `master` branch) of repositories. In section 3, we will then show how different or even multiple branches of repositories can be analysed.

# ## 1 - Manually cloning git repositories
#
# The easiest way to clone a git repository and immediately be ready for analysis with `git2net` is to clone the repository manually.
# To do this, you need the URL to the repository you want to analyse. For repositories on GitHub, [this](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) short manual shows you how to obtain the URL.
# Once you have the URL, you can clone the repository using the command `git clone <url>` from a terminal, where you replace `<url>` with the URL to the repository of interest.
# Subsequently, when starting to analyse the repository with `git2net`, you need to provide the path to the repository you just cloned.
#
# Cloning repositories in this way works great for both public and private repositories as you will be asked for your access credentials during the cloning process.

# ## 2 - Cloning git repositories with Python
#
# Manually cloning repositories is excellent when the number of repositories you aim to analyse is low.
# But what if you are faced with analysing many hundreds of repositories for a study?
# In this case, you will likely be looking to write a script that can clone the repositories for you.
# Below, we show you how you can achieve this directly from Python.
#
# As the process will be completely automated, you will not be asked for your access credentials while cloning the repository.
# Therefore, the process for cloning public and private repositories is slightly different, as we will explain in the following sections.
#
# ### Public repositories
#
# Let's start with public repositories.
# To start, you will need to select and clone a git repository that you are interested in analysing. For the purpose of this tutorial, we will explore the repository behind `git2net`—aiming to finally find a solution to the well-known chicken and egg problem.
#
# The following lines will clone the `git2net` repository to your current working directory. You can edit the path to the local directory stored in `git_repo_dir` to change this location.

# In[ ]:

import pygit2 as git2
import os
import shutil

git_repo_url = 'https://github.com/gotec/git2net.git'
git_repo_dir = 'git2net4analysis'

# A clone left over from a previous run would block the new clone,
# so delete it first.
clone_already_present = os.path.exists(git_repo_dir)
if clone_already_present:
    shutil.rmtree(git_repo_dir)

# Clone with a working tree (i.e. a non-bare repository).
repo = git2.clone_repository(git_repo_url, git_repo_dir, bare=False)


# #### Public repositories on GitHub
#
# For public repositories hosted on GitHub, `git2net` also provides the function `mine_github()` that allows you to clone and analyse a repository in a single step.
# The URL to the repository can be provided either as a full HTTPS URL (e.g. `https://github.com/gotec/git2net`) or simply as a combination `<owner>/<repository>` (e.g. `gotec/git2net`).
# Further, you need to specify the path where the repository is cloned to, `git_repo_dir`, and the path to the `sqlite_db_file` to which the results are written.
# In[ ]:

import git2net
import os
import shutil

github_url = 'gotec/git2net'
git_repo_dir = 'git2net4analysis'
sqlite_db_file = 'git2net4analysis.db'

# Clear leftovers from a previous run: first the clone of the repository,
# then the resulting sqlite database.
for stale_path, delete in ((git_repo_dir, shutil.rmtree),
                           (sqlite_db_file, os.remove)):
    if os.path.exists(stale_path):
        delete(stale_path)

# Clone and mine the repository in a single step.
git2net.mine_github(github_url, git_repo_dir, sqlite_db_file)


# From here, you can immediately start analysing the repository, e.g., by looking at network projections or performing time series analysis.
# We will provide more details on how to do so in the subsequent tutorials.

# ### Private repositories
#
# Private repositories require some more effort. Firstly, you have to generate a personal token. The procedure on the GitHub side is explained [here](https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token). Make sure to copy your new access token to a file (`secret.txt` for instance). You won't be able to see it again! Please add `secret.txt` directly to your `.gitignore` file! You wouldn't believe how many access tokens are freely available on GitHub :-)
#
# Now, we can pass the token as a third parameter embedded in a callback method to `clone_repository()`.
#
# *The code below is commented out to allow for the execution of the entire notebook.
# Uncomment it to use your own private repository for the tutorial.*

# In[ ]:

# import pygit2 as git2
# import os
# import shutil

# git_repo_url = 'https://github.com/user/SecretRepository.git' # does not exist :-)
# git_repo_dir = 'secretRepository'

# # Read the token inside a `with` block so the file is closed again, and
# # strip surrounding whitespace: a trailing newline in `secret.txt` would
# # otherwise become part of the credential.
# with open("secret.txt", "r") as f:
#     token = f.read().strip()

# # Remove the clone of the repository if it already exists from a previous run
# if os.path.exists(git_repo_dir):
#     shutil.rmtree(git_repo_dir)

# # The token is passed as the user name via the clone callbacks
# callbacks = git2.RemoteCallbacks(git2.UserPass(token, 'x-oauth-basic'))
# repo = git2.clone_repository(git_repo_url, git_repo_dir, callbacks=callbacks)


# ## 3 - Analysing different or multiple branches of a repository
#
# So far, we have focused on looking at the default branch (typically the `main` or `master` branch) of a repository.
# For most analyses, this is sufficient as this is (usually) the branch to which other branches are merged once their content is sufficiently developed.
# That said, there are repositories where this does not occur.
# Further, you might be interested in analysing the development in a specific branch or all branches of a repository.
#
# ### Tracking multiple branches of a repository
#
# Again, we first look at the general approach to achieve this.
# Here, we need to first clone the repository as before.
# Then, we need to track all branches in which we are interested.
# Below, we show you how you can do this using `GitPython`, another library next to `pygit2` that we can use to interact with git repositories.
# In[ ]:

import git
import os
import shutil

# Step 1: Clone repository as before
git_repo_url = 'https://github.com/gotec/git2net.git'
git_repo_dir = 'git2net4analysis'

if os.path.exists(git_repo_dir):
    shutil.rmtree(git_repo_dir)

repo = git.Repo.clone_from(git_repo_url, git_repo_dir)

# Step 2: Track remote branches
existing_branches = [b.name for b in repo.branches]
for ref in repo.remote().refs:
    # `remote_head` is the branch name without the remote prefix. Unlike
    # splitting on '/' and keeping the last part, it preserves branch names
    # that themselves contain slashes (e.g. `feature/x`).
    branch_name = ref.remote_head
    if branch_name != 'HEAD' and branch_name not in existing_branches:
        repo.git.branch('--track', branch_name, 'remotes/origin/' + branch_name)


# Subsequently, in step 3, we can start mining the repository using the option `all_branches` set to `True`.
# As stated earlier, we will provide more details on the options you can use with `git2net` in the subsequent tutorials.

# In[ ]:

import git2net
import sqlite3
import pandas as pd
from collections import Counter
from contextlib import closing

# Step 3: Crawl the local repository
git_repo_dir = 'git2net4analysis'
sqlite_db_file = 'git2net4analysis.db'

if os.path.exists(sqlite_db_file):
    os.remove(sqlite_db_file)

git2net.mine_git_repo(git_repo_dir, sqlite_db_file, all_branches=True)

# Step 4: Check the covered branches
# A sqlite3 connection used directly as a context manager only commits or
# rolls back; it is not closed. `closing` releases the database file so that
# it can be removed again in a later run.
with closing(sqlite3.connect(sqlite_db_file)) as con:
    branches = pd.read_sql_query("SELECT branches FROM commits", con).branches
    print(Counter([b for b_list in branches for b in b_list.split(',')]))


# If we check the resulting database, we can see that we have processed the commits from multiple branches.

# ### Analysing a specific branch from a public GitHub repository
#
# When analysing a repository using `git2net`'s `mine_github()` function introduced above, you can also specify the branch you want to analyse.
# We show an example of this below.
# In[ ]:

import git2net
import os
import shutil
import sqlite3
import pandas as pd
from collections import Counter
from contextlib import closing

github_url = 'gotec/git2net'
git_repo_dir = 'git2net4analysis'
sqlite_db_file = 'git2net4analysis.db'
branch = 'object-oriented'

# Remove the clone of the repository if it already exists from a previous run
if os.path.exists(git_repo_dir):
    shutil.rmtree(git_repo_dir)

# Remove resulting sqlite database if it already exists from a previous run
if os.path.exists(sqlite_db_file):
    os.remove(sqlite_db_file)

# Mine a specific branch using the branch option
git2net.mine_github(github_url, git_repo_dir, sqlite_db_file, branch=branch)

# Check the resulting database
# `closing` makes sure the connection is closed afterwards; a sqlite3
# connection used directly as a context manager only handles the transaction.
with closing(sqlite3.connect(sqlite_db_file)) as con:
    branches = pd.read_sql_query("SELECT branches FROM commits", con).branches
    print(Counter([b for b_list in branches for b in b_list.split(',')]))


# Looking at the resulting database, we can see that only commits from the selected branch are in the database.
#
# We can extend the database with commits from another branch by rerunning the command with another active branch.
# Note, however, that to do so, you first need to remove the already existing clone of the repository as `git2net` will not overwrite your existing data.

# In[ ]:

branch = 'main'

# Remove the clone of the repository if it already exists from a previous run
if os.path.exists(git_repo_dir):
    shutil.rmtree(git_repo_dir)

# The database is kept on purpose so that it is extended with the new branch
git2net.mine_github(github_url, git_repo_dir, sqlite_db_file, branch=branch)

with closing(sqlite3.connect(sqlite_db_file)) as con:
    branches = pd.read_sql_query("SELECT branches FROM commits", con).branches
    print(Counter([b for b_list in branches for b in b_list.split(',')]))


# As you can see, if a commit from the new branch is already present in the database, `git2net` will recognise this and not mine the commit again.
# Instead, only the information on which branches these commits appear in is updated.

# With this, we conclude the first tutorial in which we showed you how to clone a repository.
# With some examples, we also took a first look at how you can mine the cloned repositories using `git2net`.
# In these examples, we have used `git2net`'s default settings.
# However, depending on your research or application, you might be interested in extracting additional data, such as the content of modified lines or information on the cyclomatic complexity of files.
# As we will cover in the following tutorial, `git2net` comes with various options that allow you to achieve this.