Skip to content

Instantly share code, notes, and snippets.

@ddelange
Last active November 19, 2024 13:27
Show Gist options
  • Save ddelange/1f55d7a4ba8e10ba881235e70a778ee4 to your computer and use it in GitHub Desktop.
Save ddelange/1f55d7a4ba8e10ba881235e70a778ee4 to your computer and use it in GitHub Desktop.
Get licenses and attempt to find source links to licenses for a lockfile
# pip install requests pandas orjson
import io
import os
import subprocess
import requests
import pandas as pd
import orjson
# Both locations can be overridden through the environment.
input_path = os.environ.get("INPUT_PATH", "requirements/requirements.lock")
output_path = os.environ.get("OUTPUT_PATH", "requirements/licenses.jsonl")

# jq program applied to pip's JSON installation report. For every package it
# emits one compact JSON array holding: name, license field, the classifiers
# containing "License", the PyPI project link, and the first github.com URL
# found among the project URLs / home page.
# NOTE(review): when `match` finds no GitHub URL the last element is presumably
# omitted from the array (ragged rows) -- confirm against real jq output.
_JQ_FILTER = (
    '.install[].metadata | ['
    '.name, '
    '.license, '
    '(try .classifier | map(select(contains("License")))), '
    '"https://pypi.org/project/" + .name, '
    '( ( if has("project_url") then .project_url else [] end ) + [.home_page?]'
    ' | join(" ") | match("(https?://)?github.com/[^/]+/[^ /]+").string)'
    ']'
)

print("Running pip for", input_path)
# Run pip and jq as two argv-list subprocesses instead of one shell pipeline:
# - input_path comes from the environment, so it must never be spliced into a
#   shell command line (spaces / metacharacters would break or inject);
# - in a shell pipeline only jq's exit status is observed, so a failing pip
#   went unnoticed. check_output raises CalledProcessError for either step.
pip_report = subprocess.check_output(
    [
        "pip",
        "install",
        "-r",
        input_path,
        "-qq",
        "--no-deps",
        "--use-feature=fast-deps",
        "--ignore-installed",
        "--disable-pip-version-check",
        "--dry-run",
        "--report",
        "-",
    ]
)
output = subprocess.check_output(["jq", "-c", _JQ_FILTER], input=pip_report)

# One JSON array per line -> one row per package, columns labelled 0..N.
df = pd.read_json(io.BytesIO(output), lines=True)
def try_license_url(url):
    """Probe a GitHub repository for a raw license file and return its URL.

    The repository URL is rewritten to its ``raw.githubusercontent.com``
    counterpart and a fixed set of branch / file name / extension
    combinations is requested until one answers with HTTP 200.

    Args:
        url: GitHub repository URL, with or without a scheme. Anything that
            is not a non-empty string (``None``, NaN from pandas, ``""``)
            is treated as "no repository known".

    Returns:
        The first candidate URL that responded with status 200, or ``None``
        when there is no usable input or no candidate was found.
    """
    # pandas represents missing cells as NaN, which is a truthy float, so a
    # plain truthiness test is not enough to guard the string methods below.
    if not isinstance(url, str) or not url:
        return None
    original_url = url

    repo_url = url.replace("http://", "https://")
    # The upstream pattern makes the scheme optional; requests refuses
    # scheme-less URLs, so add one.
    if not repo_url.startswith("https://"):
        repo_url = "https://" + repo_url
    # Only drop a trailing ".git"; removing it anywhere would corrupt names
    # such as "user.github.io".
    if repo_url.endswith(".git"):
        repo_url = repo_url[: -len(".git")]
    # Swap the host only (first occurrence), not a repo that happens to
    # contain "github.com" in its name.
    raw_base = repo_url.replace("github.com", "raw.githubusercontent.com", 1)

    for branch in ("main", "master", "develop"):
        for license_name in ("LICENSE", "COPYING"):
            for extension in ("", ".md", ".txt", ".rst"):
                final = f"{raw_base}/{branch}/{license_name}{extension}"
                try:
                    # Without a timeout a stalled connection blocks forever.
                    resp = requests.get(final, timeout=10)
                except requests.RequestException as exc:
                    # Best-effort lookup: report and move on to the next
                    # candidate instead of aborting the whole run.
                    print("REQUEST FAILED", final, exc)
                    continue
                if resp.status_code == 200:
                    print(final)
                    return final
    print("NO LICENSE FOUND", original_url)
    return None
# Columns are positional:
#   0 name, 1 license_attr, 2 license_trove_classifiers,
#   3 pypi_link, 4 github_link, 5 license_link (added here)
print("Getting license urls")
# NOTE(review): if no package yielded a GitHub link, column 4 is presumably
# missing from the frame altogether; create it empty so the lookup below does
# not fail with a KeyError.
if 4 not in df.columns:
    df[4] = None
df[5] = df[4].apply(try_license_url)

print("Writing jsonl to", output_path)
with open(output_path, "wb") as fp:
    # One JSON array per line, values in column order.
    for record in df.to_dict(orient="records"):
        fp.write(orjson.dumps(list(record.values()), option=orjson.OPT_APPEND_NEWLINE))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment