Looks for repos that publish Python packages
GitHub:
For each found package, look it up on PyPI:
import json
import netrc
import re
import configparser
from configparser import ConfigParser
from jinja2 import Template
import requests
import requests_cache
import toml
# jinja2 template for a GitHub GraphQL query that lists an organization's
# repositories, 100 per request.
# Template variables:
#   organization: login of the organization to list
#   after: pagination cursor; the `after:` argument is emitted only when it is set
#   path: file path, looked up with the expression "HEAD:<path>"
# For every repository the query selects isArchived, nameWithOwner and, when
# the object at that path is a Blob, its text.
file_gql = Template(
"""
{
organization(login: "{{ organization }}") {
repositories(first: 100{% if after -%}, after: "{{ after }}" {%- endif %}) {
pageInfo {
endCursor
hasNextPage
}
edges {
node {
isArchived
nameWithOwner
object(expression: "HEAD:{{ path }}") {
... on Blob {
text
}
}
}
}
}
}
}
"""
)
# Install a requests cache named "github". POST is made cacheable as well as
# GET because the GraphQL queries in this file are sent with POST.
requests_cache.install_cache("github", allowable_methods=["GET", "POST"])
s = requests.Session()
try:
    auth = netrc.netrc().authenticators("api.github.com")
except FileNotFoundError:
    # No ~/.netrc at all: continue unauthenticated instead of crashing, so the
    # "No auth" branch below is actually reachable.
    auth = None
if auth:
    print("Using netrc auth token")
    # authenticators() returns (login, account, password); the password slot
    # is used as the bearer token.
    s.headers["Authorization"] = f"bearer {auth[2]}"
else:
    print("No auth")
github_graphql = "https://api.github.com/graphql"
Using netrc auth token
def fetch_files(path, organization: str = "jupyterhub", after=None):
    """Yield one dict per non-archived repository of a GitHub organization.

    Each yielded dict has two keys: ``"name"`` (the repository's
    ``nameWithOwner``) and ``path`` itself, whose value is the text of the
    blob at ``HEAD:<path>`` or ``None`` when the query returned no object
    for that path.

    Args:
        path: file path to look up in every repository.
        organization: login of the GitHub organization to list.
        after: pagination cursor to start from; ``None`` starts at the
            first page.

    Raises:
        requests.HTTPError: the API answered with a 4xx/5xx status.
        RuntimeError: the response carried no organization data (for
            example a GraphQL error delivered with a 200 status).
    """
    # Follow the cursor in a loop instead of recursing once per page.
    while True:
        query = file_gql.render(
            organization=organization,
            after=after,
            path=path,
        )
        resp = s.post(github_graphql, data=json.dumps(dict(query=query)))
        if resp.status_code >= 400:
            # Print the raw body: an error response is not guaranteed to be
            # JSON, so it must not be parsed before the status is checked.
            print(resp.text)
            resp.raise_for_status()
        result = resp.json()

        org_data = (result.get("data") or {}).get("organization")
        if org_data is None:
            raise RuntimeError(
                f"GraphQL query for {organization!r} returned no data: "
                f"{result.get('errors')}"
            )
        repos = org_data["repositories"]

        for repo_edge in repos["edges"]:
            repo = repo_edge["node"]
            if repo["isArchived"]:
                # ignore archived repos
                continue
            repo_info = {
                "name": repo["nameWithOwner"],
                path: None,
            }
            if repo["object"]:
                repo_info[path] = repo["object"]["text"]
            yield repo_info

        # pagination
        page_info = repos["pageInfo"]
        if not page_info["hasNextPage"]:
            return
        after = page_info["endCursor"]
def name_from_pyproject_toml(pyproject_toml):
    """Return the ``name`` entry of the ``[project]`` table, or None.

    ``pyproject_toml`` is the text of a pyproject.toml file. None is returned
    when there is no ``project`` table or it has no ``name`` key.
    """
    parsed = toml.loads(pyproject_toml)
    project_table = parsed.get("project", {})
    return project_table.get("name")
def name_from_setup_cfg(setup_cfg):
    """get package name from setup.cfg

    ``setup_cfg`` is the text of a setup.cfg file. Returns the ``name``
    option of the ``[metadata]`` section, or None when the section or option
    is missing or the text cannot be parsed as an INI file.
    """
    cfg = ConfigParser()
    try:
        # Parsing is inside the try as well: a malformed file (no section
        # header, duplicate sections, ...) raises a configparser.Error
        # subclass and should mean "no name here", not abort the whole scan.
        cfg.read_string(setup_cfg)
        return cfg.get("metadata", "name")
    except configparser.Error:
        return None
def name_from_setup_py(setup_py):
    """Return the first quoted ``name = ...`` value in setup.py text, or None.

    ``setup_py`` is the source text of a setup.py file; it is only searched,
    never executed.
    """
    # a regex is only a heuristic, but probably good enough for a few old
    # setup.pys
    first_match = re.search(r"""name\s*=\s*['"]([^'"]+)['"]""", setup_py)
    if first_match is None:
        return None
    return first_match.group(1)
def collect_repo_packages(organizations, verbose=False):
    """Find the Python package name published by each repo of some GitHub orgs.

    For every organization the repos are listed three times, once per
    metadata file. pyproject.toml is consulted first; setup.cfg and then
    setup.py are used only for repos that still have no package name.

    Args:
        organizations: iterable of GitHub organization logins.
        verbose: when true, print which file supplied each name and a final
            ``repo: package`` summary.

    Returns:
        dict mapping ``owner/repo`` to a dict holding ``"name"``,
        ``"package"`` (None when no name was found) and the text of each
        metadata file (None when absent).
    """
    # Fallback metadata files, tried in this order when no name is known yet.
    fallback_parsers = (
        ("setup.cfg", name_from_setup_cfg),
        ("setup.py", name_from_setup_py),
    )
    repos = {}
    for organization in organizations:
        for repo in fetch_files("pyproject.toml", organization):
            name = repo["name"]
            repo["package"] = None
            repos[name] = repo
            pyproject_toml = repo["pyproject.toml"]
            if pyproject_toml:
                repo["package"] = name_from_pyproject_toml(pyproject_toml)
            elif verbose:
                print(f"{name}: no pyproject.toml")

        for path, parse_name in fallback_parsers:
            for new_info in fetch_files(path, organization):
                name = new_info["name"]
                # setdefault rather than indexing: a repo that shows up only
                # in a later listing (e.g. created between the queries) must
                # not raise KeyError.
                repo = repos.setdefault(name, {"package": None})
                repo.update(new_info)
                if repo[path] and not repo["package"]:
                    repo["package"] = pkg = parse_name(repo[path])
                    if pkg and verbose:
                        print(f"{name}: got name from {path}")

    if verbose:
        print("----")
        for repo in repos.values():
            print(f"{repo['name']}: {repo['package']}")
    return repos
# Scan both organizations; the result maps "owner/repo" to that repo's info dict.
repos = collect_repo_packages(["jupyterhub", "jupyterhealth"])
Scrape members of Jupyter org. No API for this?
from bs4 import BeautifulSoup
# Public PyPI page of the "jupyter" organization; it is scraped as HTML.
jupyter_org_url = "https://pypi.org/org/jupyter/"
r = requests.get(jupyter_org_url)
r.raise_for_status()
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed and warns, so results could differ between environments.
page = BeautifulSoup(r.text, "html.parser")
# Set of the stripped text of every element with the package title CSS class.
jupyter_org_packages = {
    el.text.strip() for el in page.find_all(class_="package-snippet__title")
}
Get information from PyPI:
def get_pypi_info(repo):
    """Look up ``repo["package"]`` on PyPI and record the result in ``repo``.

    The dict is updated in place and also returned. When the repo has no
    package name it is returned untouched. Otherwise these keys are set:

    - ``pypi_url``, ``pypi_version``: taken from the project's JSON metadata,
      None when the lookup fails.
    - ``package``: replaced by the name as PyPI reports it.
    - ``jupyter_org``: whether that name is in ``jupyter_org_packages``,
      None when the lookup fails.
    - ``trusted_publisher``: True when the provenance endpoint answers
      successfully for the first file of the latest version, False when it
      answers with a JSON error body, None when this could not be checked.

    Raises:
        ValueError: the provenance endpoint returned an error response whose
            body is not JSON (the URL and body are printed first).
    """
    package = repo["package"]
    if not package:
        return repo
    repo["pypi_url"] = None
    repo["pypi_version"] = None
    repo["trusted_publisher"] = None
    # defaulted like the keys above so every looked-up repo has the same keys
    repo["jupyter_org"] = None

    pypi_url = f"https://pypi.org/pypi/{package}/json"
    r = requests.get(pypi_url)
    if r.status_code == 404:
        return repo
    if not r.ok:
        print(f"{r.status_code}: {pypi_url} (not a published package?)")
        return repo

    pypi_info = r.json()
    info = pypi_info["info"]
    repo["package"] = package = info["name"]  # apply PyPI normalization
    repo["jupyter_org"] = package in jupyter_org_packages
    repo["pypi_url"] = info["package_url"]
    repo["pypi_version"] = version = info["version"]

    # Files of the latest version. A version can be listed without any
    # uploaded file, so fall back to "urls" and tolerate an empty list
    # instead of raising KeyError/IndexError.
    files = (
        pypi_info.get("releases", {}).get(version)
        or pypi_info.get("urls")
        or []
    )
    if not files:
        # nothing to check provenance for; trusted_publisher stays None
        return repo
    filename = files[0]["filename"]

    provenance_url = (
        f"https://pypi.org/integrity/{package}/{version}/{filename}/provenance"
    )
    r = requests.get(provenance_url)
    if r.ok:
        repo["trusted_publisher"] = True
    else:
        # A JSON error body is taken to mean "no provenance"; anything else
        # is unexpected and is surfaced.
        try:
            r.json()
        except ValueError:
            # not JSON?
            print(provenance_url)
            print(r.text)
            raise
        else:
            repo["trusted_publisher"] = False
    return repo
# Query PyPI for every repo; get_pypi_info updates each repo dict in place.
for repo in repos.values():
    get_pypi_info(repo)
import pandas as pd
# One row per repo dict; the bare `df` expression displays the table in the notebook.
df = pd.DataFrame.from_records(list(repos.values()))
df
name | pyproject.toml | package | setup.cfg | setup.py | pypi_url | pypi_version | trusted_publisher | jupyter_org | |
---|---|---|---|---|---|---|---|---|---|
0 | jupyterhub/jupyterhub | # PEP 621 build info\n[build-system]\nrequires... | jupyterhub | None | #!/usr/bin/env python3\n# Copyright (c) Jupyte... | https://pypi.org/project/jupyterhub/ | 5.3.0 | False | False |
1 | jupyterhub/configurable-http-proxy | None | None | None | None | NaN | NaN | NaN | NaN |
2 | jupyterhub/oauthenticator | # autoflake is used for autoformatting Python ... | oauthenticator | None | #!/usr/bin/env python\n# Copyright (c) Jupyter... | https://pypi.org/project/oauthenticator/ | 17.3.0 | True | False |
3 | jupyterhub/dockerspawner | # autoflake is used for autoformatting Python ... | dockerspawner | None | #!/usr/bin/env python\n# Copyright (c) Jupyter... | https://pypi.org/project/dockerspawner/ | 14.0.0 | True | False |
4 | jupyterhub/sudospawner | None | sudospawner | None | #!/usr/bin/env python\n# coding: utf-8\n\n# Co... | https://pypi.org/project/sudospawner/ | 0.5.2 | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
74 | jupyterhealth/myst-site-article | None | None | None | None | NaN | NaN | NaN | NaN |
75 | jupyterhealth/myst-site-book | None | None | None | None | NaN | NaN | NaN | NaN |
76 | jupyterhealth/jupyterhealth-deploy | None | None | None | None | NaN | NaN | NaN | NaN |
77 | jupyterhealth/jupyter-smart-on-fhir | [build-system]\nrequires = ["setuptools>=77", ... | jupyter-smart-on-fhir | None | None | https://pypi.org/project/jupyter-smart-on-fhir/ | 0.1.0a3 | True | False |
78 | jupyterhealth/jupyterhealth-client | [project]\nname = "jupyterhealth-client"\ndesc... | jupyterhealth-client | None | None | https://pypi.org/project/jupyterhealth-client/ | 0.0.1a4 | True | False |
79 rows × 9 columns
Has package info, not on PyPI:
# Repos where a package name was detected but no PyPI URL was recorded.
for repo in df.loc[df.package.notna() & df.pypi_url.isna(), "name"].sort_values():
    print(repo)
jupyterhub/jupyterhub-python-repo-template jupyterhub/nbgitpuller-downloader-plugins jupyterhub/the-littlest-jupyterhub
No Python package found:
# Repos for which no package name was found in any metadata file.
for repo in df.loc[df.package.isna(), "name"].sort_values():
    print(repo)
jupyterhealth/demos jupyterhealth/jupyter-health-software jupyterhealth/jupyterhealth-deploy jupyterhealth/jupyterhealth.github.io jupyterhealth/myst-site-article jupyterhealth/myst-site-book jupyterhealth/partner-client-docker jupyterhealth/singleuser-image jupyterhealth/software-documentation jupyterhub/.github jupyterhub/action-get-quayio-tags jupyterhub/action-k3s-helm jupyterhub/action-k8s-await-workloads jupyterhub/action-k8s-namespace-report jupyterhub/action-major-minor-tag-calculator jupyterhub/binder-billing jupyterhub/binder-data jupyterhub/configurable-http-proxy jupyterhub/design jupyterhub/grafana-dashboards jupyterhub/helm-chart jupyterhub/jupyterhub-container-images jupyterhub/jupyterhub-deploy-docker jupyterhub/jupyterhub-deploy-hpc jupyterhub/jupyterhub-deploy-teaching jupyterhub/jupyterhub-on-hadoop jupyterhub/jupyterhub-the-hard-way jupyterhub/jupyterhub-tutorial jupyterhub/jupyterhub.github.io jupyterhub/katacoda-scenarios jupyterhub/mybinder-tools jupyterhub/mybinder.org-deploy jupyterhub/mybinder.org-user-guide jupyterhub/outreachy jupyterhub/pebble-helm-chart jupyterhub/repo2docker-action jupyterhub/team-compass jupyterhub/zero-to-jupyterhub-k8s
# Keep only the rows that have a PyPI URL, then show how many there are.
packages = df[df.pypi_url.notna()]
len(packages)
38
Repos with a package, but no pyproject.toml (should probably update or archive):
# Published packages whose repo has no pyproject.toml text recorded.
for name in packages.loc[packages["pyproject.toml"].isna(), "name"].sort_values():
    print(name)
jupyterhub/firstuseauthenticator jupyterhub/gh-scoped-creds jupyterhub/kerberosauthenticator jupyterhub/nullauthenticator jupyterhub/repo2docker jupyterhub/sudospawner jupyterhub/wrapspawner jupyterhub/yarnspawner
# Summary table ordered by org membership, then provenance status, then
# package name, with a fresh 0..n-1 index.
packages.sort_values(["jupyter_org", "trusted_publisher", "package"])[
    ["name", "package", "jupyter_org", "trusted_publisher"]
].reset_index(drop=True)
name | package | jupyter_org | trusted_publisher | |
---|---|---|---|---|
0 | jupyterhub/batchspawner | batchspawner | False | False |
1 | jupyterhub/binderhub | binderhub | False | False |
2 | jupyterhub/chartpress | chartpress | False | False |
3 | jupyterhub/docker-image-cleaner | docker-image-cleaner | False | False |
4 | jupyterhub/gh-scoped-creds | gh-scoped-creds | False | False |
5 | jupyterhub/repo2docker | jupyter-repo2docker | False | False |
6 | jupyterhub/jupyter-server-proxy | jupyter-server-proxy | False | False |
7 | jupyterhub/jupyterhub | jupyterhub | False | False |
8 | jupyterhub/firstuseauthenticator | jupyterhub-firstuseauthenticator | False | False |
9 | jupyterhub/jupyterhub-idle-culler | jupyterhub-idle-culler | False | False |
10 | jupyterhub/kerberosauthenticator | jupyterhub-kerberosauthenticator | False | False |
11 | jupyterhub/kubespawner | jupyterhub-kubespawner | False | False |
12 | jupyterhub/ldapauthenticator | jupyterhub-ldapauthenticator | False | False |
13 | jupyterhub/ltiauthenticator | jupyterhub-ltiauthenticator | False | False |
14 | jupyterhub/nativeauthenticator | jupyterhub-nativeauthenticator | False | False |
15 | jupyterhub/jupyterhub-sphinx-theme | jupyterhub-sphinx-theme | False | False |
16 | jupyterhub/systemdspawner | jupyterhub-systemdspawner | False | False |
17 | jupyterhub/tmpauthenticator | jupyterhub-tmpauthenticator | False | False |
18 | jupyterhub/traefik-proxy | jupyterhub-traefik-proxy | False | False |
19 | jupyterhub/yarnspawner | jupyterhub-yarnspawner | False | False |
20 | jupyterhub/nbgitpuller-downloader-dropbox | nbgitpuller-downloader-dropbox | False | False |
21 | jupyterhub/nbgitpuller-downloader-generic-web | nbgitpuller-downloader-generic-web | False | False |
22 | jupyterhub/nbgitpuller-downloader-googledrive | nbgitpuller-downloader-googledrive | False | False |
23 | jupyterhub/nullauthenticator | nullauthenticator | False | False |
24 | jupyterhub/pytest-jupyterhub | pytest-jupyterhub | False | False |
25 | jupyterhub/simpervisor | simpervisor | False | False |
26 | jupyterhub/sudospawner | sudospawner | False | False |
27 | jupyterhub/wrapspawner | wrapspawner | False | False |
28 | jupyterhub/dockerspawner | dockerspawner | False | True |
29 | jupyterhub/jupyter-remote-desktop-proxy | jupyter-remote-desktop-proxy | False | True |
30 | jupyterhub/jupyter-rsession-proxy | jupyter-rsession-proxy | False | True |
31 | jupyterhealth/jupyter-smart-on-fhir | jupyter-smart-on-fhir | False | True |
32 | jupyterhealth/jupyterhealth-client | jupyterhealth-client | False | True |
33 | jupyterhub/nbgitpuller | nbgitpuller | False | True |
34 | jupyterhub/oauthenticator | oauthenticator | False | True |
35 | jupyterhub/autodoc-traits | autodoc-traits | True | False |
36 | jupyterhub/escapism | escapism | True | False |
37 | jupyterhub/pamela | pamela | True | False |