#!/usr/bin/env python
# coding: utf-8

# # Collect timing of GitHub status checks
#
# Used to compute things like [this](https://github.com/conda-forge/petsc4py-feedstock/pull/112#issuecomment-2586817022)

# In[241]:

from subprocess import run

# use the gh CLI's token if available; fall back to anonymous access
try:
    token = run(["gh", "auth", "token"], check=True, text=True, capture_output=True).stdout.strip()
except Exception:
    token = None

from github import Github as GitHub

gh = GitHub(token)

# In[248]:

repo = gh.get_repo("conda-forge/fenics-dolfinx-feedstock")

# In[243]:

import pandas as pd

# In[244]:

import re

name_pat = re.compile(r"\(Build\s+\w+\s+([\w\.]+)\)?")


def extract_name(name):
    """Extract the build variant from a check run's display name."""
    m = name_pat.search(name)
    if m:
        return m.group(1)
    else:
        return name


def get_checks_df(repo, pr):
    """Return a DataFrame of build check runs for a PR, sorted by start time."""
    records = []
    if isinstance(pr, int):
        pr = repo.get_pull(pr)
    for check in repo.get_commit(pr.head.sha).get_check_runs():
        # exclude meta tasks, like the overall job and Check Skip
        if "Build" not in check.name:
            continue
        name = extract_name(check.name)
        records.append(
            {
                "name": name,
                "started": check.started_at,
                "completed": check.completed_at,
                "duration": (check.completed_at - check.started_at),
                "status": check.conclusion,
            }
        )
    return pd.DataFrame.from_records(records).sort_values("started").reset_index(drop=True)

# In[245]:

flat_df = get_checks_df(repo, 99)
split_df = get_checks_df(repo, 100)

# In[246]:

before_df = get_checks_df(repo, 98)

# In[247]:

before_df.duration.sum()

# In[249]:

petsc4py = gh.get_repo("conda-forge/petsc4py-feedstock")
before_df = get_checks_df(petsc4py, 111)

# In[260]:

# wall-clock time from first job start to last job finish
before_df.completed.max() - before_df.started.min()

# In[261]:

# total CI time across all jobs
before_df.duration.sum()

# In[262]:

after_df = get_checks_df(petsc4py, 112)

# In[293]:

# artificial: after_df had one retry, which skews things
# set start of last job to start of second to last job
last_split_df = after_df.iloc[-1]
next_to_last = after_df.iloc[-2]
offset = last_split_df.started - next_to_last.started
last_split_df.started -= offset
last_split_df.completed -= offset
after_df.iloc[-1] = last_split_df

# In[294]:

after_df["tool"] = "rattler-build"
before_df["tool"] = "conda-build"
df = pd.concat([after_df, before_df])
df["duration"] = df["duration"].dt.total_seconds()

# In[295]:

len(before_df)

# In[296]:

len(after_df)

# In[308]:

before_df.duration.sum().total_seconds() / 3600

# In[309]:

after_df.duration.sum().total_seconds() / 3600

# In[299]:

before_df.duration.min(), before_df.duration.median(), before_df.duration.max()

# In[300]:

after_df.duration.min(), after_df.duration.median(), after_df.duration.max()

# In[305]:

before_df.completed.max() - before_df.started.min()

# In[306]:

after_df.completed.max() - after_df.started.min()

# In[280]:

import altair as alt

# before_df["seconds"] = before_df.duration.dt.total_seconds()
alt.Chart(df).mark_bar().encode(
    alt.X("duration:Q", bin=True),
    alt.Y("count()"),
    alt.Color("tool"),
    alt.Facet("tool"),
)

# In[ ]:

import altair as alt

before_df["seconds"] = before_df.duration.dt.total_seconds()
alt.Chart(before_df[["seconds"]]).mark_bar().encode(
    alt.X("seconds:Q", bin=True),
    y="count()",
)

# In[256]:

minutes = before_df.duration.dt.total_seconds() / 60
minutes.plot.hist()

# In[196]:

split_df.name

# In[197]:

# artificial: split_df had one retry, which skews things
# set start of last job to start of second to last job
last_split_df = split_df.iloc[-1]
next_to_last = split_df.iloc[-2]
offset = last_split_df.started - next_to_last.started
last_split_df.started -= offset
last_split_df.completed -= offset
split_df.iloc[-1] = last_split_df

# In[199]:

flat_df.completed.max() - flat_df.started.min()
# In[200]:

split_df.completed.max() - split_df.started.min()

# In[201]:

flat_df['label'] = 'flat'
split_df['label'] = 'split'
merged = pd.concat([flat_df, split_df])
merged.groupby('label').duration.sum()

# In[202]:

gb = merged.groupby('label')
gb.duration.min()

# In[203]:

gb.duration.max()

# In[228]:

flat_df.name.str.split("_", n=2).str[:2].str.join("_")

# In[229]:

# platform prefix: first two underscore-separated fields of the job name, e.g. linux_64
flat_df['plat'] = flat_df.name.str.split("_", n=2).str[:2].str.join("_")
split_df['plat'] = split_df.name.str.split("_", n=2).str[:2].str.join("_")

# In[230]:

flat_df.groupby('plat').duration.sum()

# In[231]:

split_df.groupby('plat').duration.sum()

# In[232]:

5.25 / 18

# In[233]:

10 / 32

# In[208]:

flat_df.duration.sum()

# In[209]:

len(split_df)

# In[210]:

from matplotlib import pyplot as plt

# In[211]:

split_df.started

# In[212]:

flat_df.started.astype(int)

# In[213]:

(flat_df.started.astype(int).astype(float) - flat_df.started.astype(int).astype(float).min()) * 1e-9,

# In[214]:

# timeline view: bar bottom = start offset (seconds), bar height = duration (seconds)
plt.bar(
    x=flat_df.name,
    height=flat_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(flat_df.started.astype(int).astype(float) - flat_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(False)
plt.tick_params(labelbottom=False)

# In[215]:

plt.bar(
    x=split_df.name,
    height=split_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(split_df.started.astype(int).astype(float) - split_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(True, axis='y')
plt.grid(False, axis='x')
plt.tick_params(labelbottom=False)

# In[301]:

plt.bar(
    x=after_df.name,
    height=after_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(after_df.started.astype(int).astype(float) - after_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(False)
plt.tick_params(labelbottom=False)

# In[303]:

plt.bar(
    x=before_df.name,
    height=before_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(before_df.started.astype(int).astype(float) - before_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(False)
plt.tick_params(labelbottom=False)

# In[ ]:
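# A minimal sketch of a helper that collects the summary numbers computed cell by cell
# above (total job time, wall-clock span, min/median/max duration) for any DataFrame
# produced by get_checks_df. The name `summarize_checks` is a hypothetical placeholder,
# not something used elsewhere in this notebook.

# In[ ]:

def summarize_checks(checks_df):
    """Hypothetical helper: summarize one PR's check runs (durations are Timedeltas)."""
    return pd.Series(
        {
            # total CI time consumed across all build jobs
            "total_job_time": checks_df.duration.sum(),
            # elapsed time from the first job starting to the last job finishing
            "wall_clock": checks_df.completed.max() - checks_df.started.min(),
            "min_duration": checks_df.duration.min(),
            "median_duration": checks_df.duration.median(),
            "max_duration": checks_df.duration.max(),
            "n_jobs": len(checks_df),
        }
    )

# e.g. summarize_checks(get_checks_df(petsc4py, 111))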