#!/usr/bin/env python
# coding: utf-8

# # Collect info on jupyterhub packages
#
# Looks for repos that publish Python packages.
#
# GitHub:
#
# - List all repos with GraphQL
# - Get the package name from pyproject.toml
# - For a few older packages, look in setup.cfg and setup.py if there is no pyproject.toml
#
# For each found package, look it up on PyPI:
#
# - Check if the package exists on PyPI

# In[1]:


import json
import netrc
import re
import configparser
from configparser import ConfigParser

from jinja2 import Template
import requests
import requests_cache
import toml

# GraphQL query template: list all repos in an organization,
# fetching the contents of one file (`path`) from each repo's HEAD
file_gql = Template(
    """
{
  organization(login: "{{ organization }}") {
    repositories(first: 100{% if after -%}, after: "{{ after }}" {%- endif %}) {
      pageInfo {
        endCursor
        hasNextPage
      }
      edges {
        node {
          isArchived
          nameWithOwner
          object(expression: "HEAD:{{ path }}") {
            ... on Blob {
              text
            }
          }
        }
      }
    }
  }
}
"""
)

# cache responses to avoid repeating API requests
# (POST must be allowed so GraphQL queries are cached)
requests_cache.install_cache("github", allowable_methods=["GET", "POST"])

s = requests.Session()
auth = netrc.netrc().authenticators("api.github.com")
if auth:
    print("Using netrc auth token")
    s.headers["Authorization"] = f"bearer {auth[2]}"
else:
    print("No auth")

github_graphql = "https://api.github.com/graphql"


# In[2]:


def fetch_files(path, organization: str = "jupyterhub", after=None):
    """Yield an info dict for each non-archived repo in an organization.

    Each dict has the repo's `name` and the text of `path` at HEAD (or None).
    """
    query = file_gql.render(
        organization=organization,
        after=after,
        path=path,
    )
    resp = s.post(github_graphql, data=json.dumps(dict(query=query)))
    result = resp.json()
    if resp.status_code >= 400:
        print(result)
    resp.raise_for_status()
    repos = result["data"]["organization"]["repositories"]
    for repo_edge in repos["edges"]:
        repo = repo_edge["node"]
        if repo["isArchived"]:
            # ignore archived repos
            continue
        repo_info = {
            "name": repo["nameWithOwner"],
            path: None,
        }
        if repo["object"]:
            repo_info[path] = repo["object"]["text"]
        yield repo_info

    # pagination
    if repos["pageInfo"]["hasNextPage"]:
        yield from fetch_files(
            path,
            organization=organization,
            after=repos["pageInfo"]["endCursor"],
        )


def name_from_pyproject_toml(pyproject_toml):
    """get package name from pyproject.toml"""
    pyproject = toml.loads(pyproject_toml)
    return pyproject.get("project", {}).get("name")


def name_from_setup_cfg(setup_cfg):
    """get package name from setup.cfg"""
    cfg = ConfigParser()
    cfg.read_string(setup_cfg)
    try:
        return cfg.get("metadata", "name")
    except configparser.Error:
        return None


def name_from_setup_py(setup_py):
    """get package name from setup.py"""
    # this is a regex, but probably good enough for a few old setup.pys
    for name in re.findall(r"""name\s*=\s*['"]([^'"]+)['"]""", setup_py):
        return name
    return None


def collect_repo_packages(organizations, verbose=False):
    repos = {}
    for organization in organizations:
        for repo in fetch_files("pyproject.toml", organization):
            name = repo["name"]
            repo["package"] = None
            repos[name] = repo
            pyproject_toml = repo["pyproject.toml"]
            if pyproject_toml:
                repo["package"] = name_from_pyproject_toml(pyproject_toml)
            elif verbose:
                print(f"{name}: no pyproject.toml")

        # fall back on setup.cfg for repos without a name from pyproject.toml
        for new_info in fetch_files("setup.cfg", organization):
            name = new_info["name"]
            repo = repos[name]
            repo.update(new_info)
            if repo["setup.cfg"] and not repo["package"]:
                repo["package"] = pkg = name_from_setup_cfg(repo["setup.cfg"])
                if pkg and verbose:
                    print(f"{name}: got name from setup.cfg")

        # last resort: setup.py
        for new_info in fetch_files("setup.py", organization):
            name = new_info["name"]
            repo = repos[name]
            repo.update(new_info)
            if repo["setup.py"] and not repo["package"]:
                repo["package"] = pkg = name_from_setup_py(repo["setup.py"])
                if pkg and verbose:
                    print(f"{name}: got name from setup.py")

    if verbose:
        print("----")
        for repo in repos.values():
            print(f"{repo['name']}: {repo['package']}")
    return repos


repos = collect_repo_packages(["jupyterhub", "jupyterhealth"])


# Scrape the packages belonging to the Jupyter org on PyPI. There doesn't appear to be an API for this.

# In[3]:


from bs4 import BeautifulSoup

jupyter_org_url = "https://pypi.org/org/jupyter/"
r = requests.get(jupyter_org_url)
r.raise_for_status()
page = BeautifulSoup(r.text, "html.parser")
jupyter_org_packages = {
    el.text.strip() for el in page.find_all(class_="package-snippet__title")
}


# Get information from PyPI:

# In[4]:


def get_pypi_info(repo):
    """Look up a repo's package on PyPI, adding pypi_* fields in place"""
    package = repo["package"]
    if not package:
        return repo
    repo["pypi_url"] = None
    repo["pypi_version"] = None
    repo["trusted_publisher"] = None
    pypi_url = f"https://pypi.org/pypi/{package}/json"
    r = requests.get(pypi_url)
    if r.status_code == 404:
        # not a published package
        return repo
    if not r.ok:
        print(f"{r.status_code}: {pypi_url} (not a published package?)")
        return repo
    pypi_info = r.json()
    # apply PyPI normalization: use the name as PyPI knows it
    repo["package"] = package = pypi_info["info"]["name"]
    repo["jupyter_org"] = package in jupyter_org_packages
    repo["pypi_url"] = pypi_info["info"]["package_url"]
    repo["pypi_version"] = version = pypi_info["info"]["version"]
    # a provenance attestation for the latest release means the package
    # was published with a trusted publisher
    filename = pypi_info["releases"][version][0]["filename"]
    provenance_url = (
        f"https://pypi.org/integrity/{package}/{version}/{filename}/provenance"
    )
    r = requests.get(provenance_url)
    if r.ok:
        repo["trusted_publisher"] = True
    else:
        try:
            r.json()
        except ValueError:
            # not JSON? something other than a missing attestation went wrong
            print(provenance_url)
            print(r.text)
            raise
        else:
            repo["trusted_publisher"] = False
    return repo


for repo in repos.values():
    get_pypi_info(repo)


# In[5]:


import pandas as pd

df = pd.DataFrame.from_records(list(repos.values()))
df


# Has package info, but not on PyPI:

# In[6]:


for repo in df.name[df.package.notna() & df.pypi_url.isna()].sort_values():
    print(repo)


# No Python package found:

# In[7]:


for repo in df.name[df.package.isna()].sort_values():
    print(repo)


# In[8]:


packages = df.loc[df.pypi_url.notna()]
len(packages)


# Repos with a package, but no pyproject.toml (should probably update or archive):

# In[9]:


for name in packages.name[packages["pyproject.toml"].isna()].sort_values():
    print(name)


# ## Package summary:

# In[10]:


packages[["name", "package", "jupyter_org", "trusted_publisher"]].sort_values(
    ["jupyter_org", "trusted_publisher", "package"]
).reset_index(drop=True)
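# A note on name matching: the membership check in `get_pypi_info` compares
# display names directly, which works because both sides come from PyPI itself.
# Comparing names taken from repo metadata would be less reliable, since PyPI
# treats names case-insensitively with `-`, `_`, and `.` interchangeable
# (PEP 503). A minimal sketch of a more defensive check using
# `packaging.utils.canonicalize_name`; `in_jupyter_org` is a hypothetical
# helper, not used in the analysis above:

# In[11]:


from packaging.utils import canonicalize_name

# canonicalize the scraped org packages once, then compare canonical forms
jupyter_org_canonical = {canonicalize_name(name) for name in jupyter_org_packages}


def in_jupyter_org(package):
    """Case- and separator-insensitive membership test (hypothetical helper)"""
    return canonicalize_name(package) in jupyter_org_canonical


# "JupyterHub", "jupyterhub", and "jupyter_hub" all give the same answer
in_jupyter_org("JupyterHub")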
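# Optionally, the summary can be written out for sharing, e.g. as a Markdown
# table to paste into an issue. A minimal sketch: the filename is arbitrary,
# and `DataFrame.to_markdown` requires the `tabulate` package to be installed.

# In[12]:


summary = packages[["name", "package", "jupyter_org", "trusted_publisher"]].sort_values(
    ["jupyter_org", "trusted_publisher", "package"]
)
# write the summary as a Markdown table
with open("package-summary.md", "w") as f:
    f.write(summary.to_markdown(index=False))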