Last active
November 19, 2024 13:27
-
-
Save ddelange/1f55d7a4ba8e10ba881235e70a778ee4 to your computer and use it in GitHub Desktop.
Get licenses and attempt to find source links to licenses for a lockfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install requests pandas orjson  (the `jq` command-line tool must also be on PATH)
import io
import os
import subprocess

import orjson
import pandas as pd
import requests
# Lockfile to inspect and JSONL file to produce; both can be overridden
# through the environment.
input_path = os.environ.get("INPUT_PATH", "requirements/requirements.lock")
output_path = os.environ.get("OUTPUT_PATH", "requirements/licenses.jsonl")

# jq program applied to pip's installation report. For every package it emits
# one compact JSON array:
#   [name, license, license classifiers, PyPI link, first GitHub link]
# The GitHub link is searched in the project URLs and the home page.
_JQ_FILTER = (
    '.install[].metadata | [.name, .license, '
    '(try .classifier | map(select(contains("License")))), '
    '"https://pypi.org/project/" + .name, '
    '( ( if has("project_url") then .project_url else [] end ) + [.home_page?] '
    '| join(" ") | match("(https?://)?github.com/[^/]+/[^ /]+").string)]'
)

print("Running pip for", input_path)
# Run pip and jq as two argv-based processes instead of one shell pipeline:
# the path is passed verbatim (no quoting or injection issues) and a failing
# pip is reported instead of being hidden behind jq's exit status.
pip_report = subprocess.run(
    [
        "pip",
        "install",
        "-r",
        input_path,
        "-qq",
        "--no-deps",
        "--use-feature=fast-deps",
        "--ignore-installed",
        "--disable-pip-version-check",
        "--dry-run",
        "--report",
        "-",
    ],
    check=True,
    stdout=subprocess.PIPE,
).stdout
output = subprocess.run(
    ["jq", "-c", _JQ_FILTER],
    input=pip_report,
    check=True,
    stdout=subprocess.PIPE,
).stdout

# One JSON array per line -> one row per package, columns labelled 0, 1, 2, ...
df = pd.read_json(io.BytesIO(output), lines=True)
def _raw_github_base(url):
    """Turn a GitHub repository link into its raw.githubusercontent.com base URL."""
    repo = url.strip().rstrip("/")
    # The link may come without a scheme (e.g. "github.com/owner/repo") or
    # with plain http; always query over https.
    if repo.startswith("http://"):
        repo = "https://" + repo[len("http://"):]
    elif not repo.startswith("https://"):
        repo = "https://" + repo
    # Only drop a trailing ".git"; a blanket replace would also mangle
    # repository names such as "owner.github.io".
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]
    return repo.replace("github.com", "raw.githubusercontent.com", 1)


def try_license_url(url):
    """Guess the raw URL of a repository's license file.

    Probes the ``main``, ``master`` and ``develop`` branches for ``LICENSE``
    and ``COPYING`` with the extensions "", ``.md``, ``.txt`` and ``.rst``,
    in that order, and returns the first URL answering with HTTP 200.

    Args:
        url: GitHub repository link, with or without scheme. Anything that is
            not a non-blank string (``None``, NaN, ``""``) means "no link".

    Returns:
        The first candidate URL that exists, or ``None`` when there is no
        link or no candidate could be fetched.
    """
    # Missing links may show up as None or as a float NaN; NaN is truthy, so a
    # plain `not url` check is not enough.
    if not isinstance(url, str) or not url.strip():
        return None

    base = _raw_github_base(url)
    # A session reuses the connection across the many probes of one repository.
    with requests.Session() as session:
        for branch in ["main", "master", "develop"]:
            for filename in ["LICENSE", "COPYING"]:
                for extension in ["", ".md", ".txt", ".rst"]:
                    candidate = f"{base}/{branch}/{filename}{extension}"
                    try:
                        resp = session.get(candidate, timeout=10)
                    except requests.RequestException:
                        # Best effort: a network hiccup on one candidate must
                        # not abort the whole run.
                        continue
                    if resp.status_code == 200:
                        print(candidate)
                        return candidate
    print("NO LICENSE FOUND", url)
    return None
print("Getting license urls")
# jq's `match` emits nothing for a package without a GitHub link, leaving that
# row one element short; if no package has a link, column 4 does not exist at
# all. Make sure columns 0-4 are present before reading column 4.
df = df.reindex(columns=range(5))
# Only real strings are looked up; padded None/NaN cells become None.
df[5] = [
    try_license_url(link) if isinstance(link, str) else None for link in df[4]
]
# Row layout, by position:
#   name, license_attr, license_trove_classifiers, pypi_link, github_link, license_link
print("Writing jsonl to", output_path)
with open(output_path, "wb") as fp:
    for record in df.to_dict(orient="records"):
        # Each package is written as one JSON array per line.
        fp.write(orjson.dumps(list(record.values()), option=orjson.OPT_APPEND_NEWLINE))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment