#!/usr/bin/env python
# coding: utf-8

# # Collect timing of GitHub status checks
#
# Used to compute things like [this](https://github.com/conda-forge/petsc4py-feedstock/pull/112#issuecomment-2586817022)

# In[241]:

from subprocess import run

# use the gh CLI's token if available; fall back to anonymous access
try:
    token = run(["gh", "auth", "token"], check=True, text=True, capture_output=True).stdout.strip()
except Exception:
    token = None

from github import Github as GitHub

gh = GitHub(token)

# In[248]:

repo = gh.get_repo("conda-forge/fenics-dolfinx-feedstock")

# In[243]:

import pandas as pd

# In[244]:

import re

name_pat = re.compile(r"\(Build\s+\w+\s+([\w\.]+)\)?")


def extract_name(name):
    """Extract the build variant from a check run's display name."""
    m = name_pat.search(name)
    if m:
        return m.group(1)
    else:
        return name


def get_checks_df(repo, pr):
    """Return a DataFrame of build check runs for a PR, sorted by start time."""
    records = []
    if isinstance(pr, int):
        pr = repo.get_pull(pr)
    for check in repo.get_commit(pr.head.sha).get_check_runs():
        # exclude meta tasks, like the overall job and Check Skip
        if "Build" not in check.name:
            continue
        name = extract_name(check.name)
        records.append(
            {
                "name": name,
                "started": check.started_at,
                "completed": check.completed_at,
                "duration": (check.completed_at - check.started_at),
                "status": check.conclusion,
            }
        )
    return pd.DataFrame.from_records(records).sort_values("started").reset_index(drop=True)

# In[245]:

flat_df = get_checks_df(repo, 99)
split_df = get_checks_df(repo, 100)

# In[246]:

before_df = get_checks_df(repo, 98)

# In[247]:

before_df.duration.sum()

# In[249]:

petsc4py = gh.get_repo("conda-forge/petsc4py-feedstock")
before_df = get_checks_df(petsc4py, 111)

# In[260]:

# wall-clock time from first job start to last job finish
before_df.completed.max() - before_df.started.min()

# In[261]:

# total CI time across all jobs
before_df.duration.sum()

# In[262]:

after_df = get_checks_df(petsc4py, 112)

# In[293]:

# artificial: after_df had one retry, which skews things
# set start of last job to start of second to last job
last_split_df = after_df.iloc[-1]
next_to_last = after_df.iloc[-2]
offset = last_split_df.started - next_to_last.started
last_split_df.started -= offset
last_split_df.completed -= offset
after_df.iloc[-1] = last_split_df

# In[294]:

after_df["tool"] = "rattler-build"
before_df["tool"] = "conda-build"
df = pd.concat([after_df, before_df])
df["duration"] = df["duration"].dt.total_seconds()

# In[295]:

len(before_df)

# In[296]:

len(after_df)

# In[308]:

before_df.duration.sum().total_seconds() / 3600

# In[309]:

after_df.duration.sum().total_seconds() / 3600

# In[299]:

before_df.duration.min(), before_df.duration.median(), before_df.duration.max()

# In[300]:

after_df.duration.min(), after_df.duration.median(), after_df.duration.max()

# In[305]:

before_df.completed.max() - before_df.started.min()

# In[306]:

after_df.completed.max() - after_df.started.min()

# In[280]:

import altair as alt

# before_df["seconds"] = before_df.duration.dt.total_seconds()
alt.Chart(df).mark_bar().encode(
    alt.X("duration:Q", bin=True),
    alt.Y("count()"),
    alt.Color("tool"),
    alt.Facet("tool"),
)

# In[ ]:

import altair as alt

before_df["seconds"] = before_df.duration.dt.total_seconds()
alt.Chart(before_df[["seconds"]]).mark_bar().encode(
    alt.X("seconds:Q", bin=True),
    y="count()",
)

# In[256]:

minutes = before_df.duration.dt.total_seconds() / 60
minutes.plot.hist()

# In[196]:

split_df.name

# In[197]:

# artificial: split_df had one retry, which skews things
# set start of last job to start of second to last job
last_split_df = split_df.iloc[-1]
next_to_last = split_df.iloc[-2]
offset = last_split_df.started - next_to_last.started
last_split_df.started -= offset
last_split_df.completed -= offset
split_df.iloc[-1] = last_split_df

# In[199]:

flat_df.completed.max() - flat_df.started.min()
# In[200]:

split_df.completed.max() - split_df.started.min()

# In[201]:

flat_df['label'] = 'flat'
split_df['label'] = 'split'
merged = pd.concat([flat_df, split_df])
merged.groupby('label').duration.sum()

# In[202]:

gb = merged.groupby('label')
gb.duration.min()

# In[203]:

gb.duration.max()

# In[228]:

flat_df.name.str.split("_", n=2).str[:2].str.join("_")

# In[229]:

# platform prefix: first two underscore-separated fields of the job name, e.g. linux_64
flat_df['plat'] = flat_df.name.str.split("_", n=2).str[:2].str.join("_")
split_df['plat'] = split_df.name.str.split("_", n=2).str[:2].str.join("_")

# In[230]:

flat_df.groupby('plat').duration.sum()

# In[231]:

split_df.groupby('plat').duration.sum()

# In[232]:

5.25 / 18

# In[233]:

10 / 32

# In[208]:

flat_df.duration.sum()

# In[209]:

len(split_df)

# In[210]:

from matplotlib import pyplot as plt

# In[211]:

split_df.started

# In[212]:

flat_df.started.astype(int)

# In[213]:

(flat_df.started.astype(int).astype(float) - flat_df.started.astype(int).astype(float).min()) * 1e-9,

# In[214]:

# timeline view: bar bottom = start offset (seconds), bar height = duration (seconds)
plt.bar(
    x=flat_df.name,
    height=flat_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(flat_df.started.astype(int).astype(float) - flat_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(False)
plt.tick_params(labelbottom=False)

# In[215]:

plt.bar(
    x=split_df.name,
    height=split_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(split_df.started.astype(int).astype(float) - split_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(True, axis='y')
plt.grid(False, axis='x')
plt.tick_params(labelbottom=False)

# In[301]:

plt.bar(
    x=after_df.name,
    height=after_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(after_df.started.astype(int).astype(float) - after_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(False)
plt.tick_params(labelbottom=False)

# In[303]:

plt.bar(
    x=before_df.name,
    height=before_df.duration.astype(int).astype(float) * 1e-9,
    bottom=(before_df.started.astype(int).astype(float) - before_df.started.astype(int).astype(float).min()) * 1e-9,
)
plt.grid(False)
plt.tick_params(labelbottom=False)

# In[ ]:
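# A minimal sketch of a helper that collects the summary numbers computed cell by cell
# above (total job time, wall-clock span, min/median/max duration) for any DataFrame
# produced by get_checks_df. The name `summarize_checks` is a hypothetical placeholder,
# not something used elsewhere in this notebook.

# In[ ]:

def summarize_checks(checks_df):
    """Hypothetical helper: summarize one PR's check runs (durations are Timedeltas)."""
    return pd.Series(
        {
            # total CI time consumed across all build jobs
            "total_job_time": checks_df.duration.sum(),
            # elapsed time from the first job starting to the last job finishing
            "wall_clock": checks_df.completed.max() - checks_df.started.min(),
            "min_duration": checks_df.duration.min(),
            "median_duration": checks_df.duration.median(),
            "max_duration": checks_df.duration.max(),
            "n_jobs": len(checks_df),
        }
    )

# e.g. summarize_checks(get_checks_df(petsc4py, 111))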