collect timing of github status checks¶

Used to compute things like this

In [241]:

from subprocess import run
try:
    token = run(["gh", "auth", "token"], check=True, text=True, capture_output=True).stdout.strip()
except Exception:
    token = None
    
from github import Github as GitHub
gh = GitHub(token)

In [248]:

repo = gh.get_repo("conda-forge/fenics-dolfinx-feedstock")

In [243]:

import pandas as pd

In [244]:

import re

name_pat = re.compile(r"\(Build\s+\w+\s+([\w\.]+)\)?")

def extract_name(name):
    m = name_pat.search(name)
    if m:
        return m.group(1)
    else:
        return name

def get_checks_df(repo, pr):
    records = []
    if isinstance(pr, int):
        pr = repo.get_pull(pr)
    for check in repo.get_commit(pr.head.sha).get_check_runs():
        # exclude meta tasks, like the overall job and Check Skip
        if "Build" not in check.name:
            continue
        name = extract_name(check.name)
        records.append(
            {
                "name": name,
                "started": check.started_at,
                "completed": check.completed_at,
                "duration": (check.completed_at - check.started_at),
                "status": check.conclusion,
            }
        )
    return pd.DataFrame.from_records(records).sort_values("started").reset_index(drop=True)

In [245]:

flat_df = get_checks_df(repo, 99)
split_df = get_checks_df(repo, 100)

In [246]:

before_df = get_checks_df(repo, 98)

In [247]:

before_df.duration.sum()

Out[247]:

Timedelta('1 days 23:11:17')

In [249]:

petsc4py = gh.get_repo("conda-forge/petsc4py-feedstock")
before_df = get_checks_df(petsc4py, 111)

In [260]:

before_df.completed.max() - before_df.started.min()

Out[260]:

Timedelta('0 days 00:51:03')

In [261]:

before_df.duration.sum()

Out[261]:

Timedelta('1 days 12:24:58')

In [262]:

after_df = get_checks_df(petsc4py, 112)

In [293]:

# artificial: split_df had one retry, which skews things
# set start of last job to start of second to last job
last_split_df = after_df.iloc[-1]
next_to_last = after_df.iloc[-2]
offset = last_split_df.started - next_to_last.started
last_split_df.started -= offset
last_split_df.completed -= offset
after_df.iloc[-1] = last_split_df

/var/folders/qr/3vxfnp1x2t1fw55dr288mphc0000gn/T/ipykernel_76070/2980723383.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_split_df.started -= offset
/var/folders/qr/3vxfnp1x2t1fw55dr288mphc0000gn/T/ipykernel_76070/2980723383.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_split_df.completed -= offset

In [294]:

after_df["tool"] = "rattler-build"
before_df["tool"] = "conda-build"
df = pd.concat([after_df, before_df])
df["duration"] = df["duration"].dt.total_seconds()

In [295]:

len(before_df)

Out[295]:

In [296]:

len(after_df)

Out[296]:

In [308]:

before_df.duration.sum().total_seconds() / 3600

Out[308]:

36.416111111111114

In [309]:

after_df.duration.sum().total_seconds() / 3600

Out[309]:

24.595

In [299]:

before_df.duration.min(), before_df.duration.median(), before_df.duration.max()

Out[299]:

(Timedelta('0 days 00:06:13'),
 Timedelta('0 days 00:09:53'),
 Timedelta('0 days 00:16:13'))

In [300]:

after_df.duration.min(), after_df.duration.median(), after_df.duration.max()

Out[300]:

(Timedelta('0 days 00:03:42'),
 Timedelta('0 days 00:06:58'),
 Timedelta('0 days 00:10:59'))

In [305]:

before_df.completed.max() - before_df.started.min()

Out[305]:

Timedelta('0 days 00:51:03')

In [306]:

after_df.completed.max() - after_df.started.min()

Out[306]:

Timedelta('0 days 00:36:40')

In [280]:

import altair as alt

# before_df["seconds"] = before_df.duration.dt.total_seconds()

alt.Chart(df).mark_bar().encode(
    alt.X("duration:Q", bin=True),
    alt.Y("count()"),
    alt.Color("tool"),
    alt.Facet("tool")
)

Out[280]:

In [ ]:

import altair as alt
before_df["seconds"] = before_df.duration.dt.total_seconds()

alt.Chart(before_df[["seconds"]]).mark_bar().encode(
    alt.X("seconds:Q", bin=True),
    y="count()",
)

In [256]:

minutes = before_df.duration.dt.total_seconds() / 60
minutes.plot.hist()

Out[256]:

<Axes: ylabel='Frequency'>

In [196]:

split_df.name

Out[196]:

0        osx_64_mpimpichpython3.9.____cpythonscalarreal
1     osx_64_mpiopenmpipython3.11.____cpythonscalarreal
2     osx_64_mpiopenmpipython3.10.____cpythonscalarc...
3     osx_64_mpiopenmpipython3.12.____cpythonscalarc...
4     osx_64_mpiopenmpipython3.11.____cpythonscalarc...
                            ...                        
95    linux_ppc64le_mpiopenmpipython3.13.____cp313sc...
96    linux_ppc64le_mpiopenmpipython3.13.____cp313sc...
97    linux_ppc64le_mpiopenmpipython3.9.____cpythons...
98    linux_ppc64le_mpiopenmpipython3.9.____cpythons...
99    osx_arm64_mpiopenmpipython3.10.____cpythonscal...
Name: name, Length: 100, dtype: object

In [197]:

# artificial: split_df had one retry, which skews things
# set start of last job to start of second to last job
last_split_df = split_df.iloc[-1]
next_to_last = split_df.iloc[-2]
offset = last_split_df.started - next_to_last.started
last_split_df.started -= offset
last_split_df.completed -= offset
split_df.iloc[-1] = last_split_df

In [199]:

flat_df.completed.max() - flat_df.started.min()

Out[199]:

Timedelta('0 days 01:33:20')

In [200]:

split_df.completed.max() - split_df.started.min()

Out[200]:

Timedelta('0 days 01:03:48')

In [201]:

flat_df['label'] = 'flat'
split_df['label'] = 'split'
merged = pd.concat([flat_df, split_df])
merged.groupby('label').duration.sum()

Out[201]:

label
flat    0 days 18:14:53
split   1 days 08:19:22
Name: duration, dtype: timedelta64[ns]

In [202]:

gb = merged.groupby('label')
gb.duration.min()

Out[202]:

label
flat    0 days 00:18:39
split   0 days 00:06:32
Name: duration, dtype: timedelta64[ns]

In [203]:

gb.duration.max()

Out[203]:

label
flat    0 days 01:33:13
split   0 days 00:41:30
Name: duration, dtype: timedelta64[ns]

In [228]:

flat_df.name.str.split("_", n=2).str[:2].str.join("_")

Out[228]:

0            osx_64
1         osx_arm64
2            osx_64
3            osx_64
4          linux_64
5          linux_64
6         osx_arm64
7         osx_arm64
8     linux_aarch64
9     linux_aarch64
10    linux_aarch64
11         linux_64
12    linux_aarch64
13    linux_ppc64le
14           osx_64
15    linux_ppc64le
16    linux_ppc64le
17        osx_arm64
18    linux_ppc64le
19         linux_64
Name: name, dtype: object

In [229]:

flat_df['plat'] = flat_df.name.str.split("_", n=2).str[:2].str.join("_")
split_df['plat'] = split_df.name.str.split("_", n=2).str[:2].str.join("_")

In [230]:

flat_df.groupby('plat').duration.sum()

Out[230]:

plat
linux_64        0 days 03:04:59
linux_aarch64   0 days 05:05:18
linux_ppc64le   0 days 05:14:42
osx_64          0 days 03:27:21
osx_arm64       0 days 01:22:33
Name: duration, dtype: timedelta64[ns]

In [231]:

split_df.groupby('plat').duration.sum()

Out[231]:

plat
linux_64        0 days 04:24:43
linux_aarch64   0 days 09:33:57
linux_ppc64le   0 days 10:01:46
osx_64          0 days 05:16:08
osx_arm64       0 days 03:02:48
Name: duration, dtype: timedelta64[ns]

In [232]:

5.25 / 18

Out[232]:

0.2916666666666667

In [233]:

10 / 32

Out[233]:

0.3125

In [208]:

flat_df.duration.sum()

Out[208]:

Timedelta('0 days 18:14:53')

In [209]:

len(split_df)

Out[209]:

In [210]:

from matplotlib import pyplot as plt

In [211]:

split_df.started

Out[211]:

0    2025-01-09 14:18:33+00:00
1    2025-01-09 14:18:34+00:00
2    2025-01-09 14:18:34+00:00
3    2025-01-09 14:18:34+00:00
4    2025-01-09 14:18:34+00:00
                ...           
95   2025-01-09 14:46:07+00:00
96   2025-01-09 14:46:09+00:00
97   2025-01-09 14:49:11+00:00
98   2025-01-09 14:54:42+00:00
99   2025-01-09 14:54:42+00:00
Name: started, Length: 100, dtype: datetime64[ns, UTC]

In [212]:

flat_df.started.astype(int)

Out[212]:

0     1736441406000000000
1     1736441407000000000
2     1736441408000000000
3     1736441409000000000
4     1736441410000000000
5     1736441410000000000
6     1736441410000000000
7     1736441410000000000
8     1736441410000000000
9     1736441410000000000
10    1736441411000000000
11    1736441411000000000
12    1736441411000000000
13    1736441411000000000
14    1736441412000000000
15    1736441412000000000
16    1736441412000000000
17    1736441413000000000
18    1736441413000000000
19    1736441422000000000
Name: started, dtype: int64

In [213]:

(flat_df.started.astype(int).astype(float) - flat_df.started.astype(int).astype(float).min()) * 1e-9,

Out[213]:

(0      0.0
 1      1.0
 2      2.0
 3      3.0
 4      4.0
 5      4.0
 6      4.0
 7      4.0
 8      4.0
 9      4.0
 10     5.0
 11     5.0
 12     5.0
 13     5.0
 14     6.0
 15     6.0
 16     6.0
 17     7.0
 18     7.0
 19    16.0
 Name: started, dtype: float64,)

In [214]:

plt.bar(
    x=flat_df.name,
    height=flat_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(flat_df.started.astype(int).astype(float) - flat_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(False)
plt.tick_params(labelbottom=False)

In [215]:

plt.bar(
    x=split_df.name,
    height=split_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(split_df.started.astype(int).astype(float) - split_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(True, axis='y')
plt.grid(False, axis='x')
plt.tick_params(labelbottom=False)

In [301]:

plt.bar(
    x=after_df.name,
    height=after_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(after_df.started.astype(int).astype(float) - after_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(False)
plt.tick_params(labelbottom=False)

In [303]:

plt.bar(
    x=before_df.name,
    height=before_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(before_df.started.astype(int).astype(float) - before_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(False)
plt.tick_params(labelbottom=False)

In [ ]: