Created
August 25, 2022 18:29
-
-
Save timfel/3fc3b1f339917ff48b69289e79a39f8d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import namedtuple | |
import os | |
import pprint | |
import re | |
import requests | |
import shutil | |
import subprocess | |
import tarfile | |
import traceback | |
import zipfile | |
# Shared HTTP session so repeated requests to PyPI/GitHub reuse connections.
session = requests.Session()

# Manual fallback table used by find_url(); keys are the project names used
# in the package lists below, values are source-archive URLs.
EXTRA_URLS = {
    # source urls for some projects that have no sdist in PyPI
    "torch": "https://github.com/pytorch/pytorch/archive/refs/tags/v1.10.2.zip",
    "tensorflow": "https://github.com/tensorflow/tensorflow/archive/refs/tags/v2.8.0.zip",
    "caffe": "https://github.com/BVLC/caffe/archive/refs/heads/master.zip",
    "torchvision": "https://github.com/pytorch/vision/archive/refs/tags/v0.11.3.zip",
    "keras": "https://github.com/keras-team/keras/archive/refs/tags/v2.8.0.zip",
    "cv2": "https://github.com/opencv/opencv-python/archive/refs/heads/3.4.zip",
    "nimbusml": "https://github.com/microsoft/NimbusML/archive/refs/heads/master.zip",
    "mxnet": "https://github.com/apache/incubator-mxnet/archive/refs/tags/v2.0.0.beta0.rc1.zip",
    "onnxruntime": "https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.10.0.zip",
    "python-util": "https://github.com/MisterL2/python-util/archive/refs/heads/master.zip",
}
# Union of several package lists from the paper "Data Science Through the
# Looking Glass" plus a few extras.  dict.fromkeys() de-duplicates while
# preserving first-seen order; the duplicate entries below are intentional
# and document which categories each package appears in.
DS_IMPORTANT_PACKAGES = list(
    dict.fromkeys(
        [
            # top 10 in Data Science Through the Looking Glass
            "numpy",
            "matplotlib",
            "pandas",
            "scikit-learn",
            "scipy",
            "seaborn",
            "tensorflow",
            # "pylab", # pylab is just matplotlib.pylab
            "requests",
            "statsmodels",
            # top 10 rank change in Data Science Through the Looking Glass
            "torch",
            "keras",
            "xgboost",
            "Pillow",  # called PIL in the paper
            "python-util",
            "cv2",  # really python-opencv
            "tqdm",
            "sqlalchemy",
            "gensim",
            "tensorflow",
            # top 10 pct change in Data Science Through the Looking Glass
            "pandas",
            "matplotlib",
            "scikit-learn",
            "seaborn",
            "keras",
            "torch",
            "numpy",
            "tensorflow",
            "Pillow",  # called PIL in the paper
            "cv2",  # really python-opencv
            # top 5 deep learning imports in Data Science Through the Looking Glass
            "tensorflow",
            "keras",
            "theano",
            "caffe",
            "torch",
            # top 10 imports in Data Science Through the Looking Glass
            "scikit-learn",
            "numpy",
            "matplotlib",
            "pandas",
            "scipy",
            "keras",
            "seaborn",
            "tensorflow",
            "nltk",
            "statsmodels",
            # extras from correlation
            "bs4",
            "torchvision",
            # "selenium", # pure Python
            # release analysis packages from Data Science Through the Looking Glass
            "keras",
            "lasagne",
            "matplotlib",
            "nolearn",
            "numpy",
            "pandas",
            "scikit-learn",
            "scipy",
            "seaborn",
            "nimbusml",
            "mxnet",
            # extra packages
            "category-encoders",
            "dask",
            "imbalanced-learn",
            "lightgbm",
            "onnx",
            "onnxmltools",
            "onnxruntime",
            "orbit-ml",
            "psutil",
            "pyod",
            "skl2onnx",
            "sktime",
            "plotly",
            # needed
            "cython",
        ]
    )
)
# Parses "language=count" pairs from sloccount's "SLOC-by-Language" output line.
SLOCCOUNT_LANGUAGE_PATTERN = re.compile(r"([a-z]+)=(\d+)")
# Archive-name endings that tarfile.open() handles (zip is handled separately).
TARFILE_PATTERN = re.compile(r"\.(tar\.gz|tgz|bz2)$")
# File extensions of native (C/C++/Cython/Rust) sources worth extracting.
NATIVE_FILE_PATTERN = re.compile(r"\.(cc|cxx|cpp|C|CC|c\+\+|c|h|hpp|pyx|rs)$")
# Heuristic alternation of "interesting" CPython C API usages.  The inline
# "#" annotations are regex comments, enabled by re.VERBOSE.  The (?!__pyx)
# lookaheads exclude Cython-generated assignments.
NATIVE_API_PATTERN = re.compile(
    r"""
    pyo3 # Rust package
    |
    \#include\s+[<"]cppy # cppy
    |
    \#include\s+[<"]pybind11 # pybind11
    |
    \#include\s+[<"]Python\.h[>"] # includes Python API
    |
    \#include\s+[<"]numpy # dependency on numpy API
    |
    Py_VISIT # macro used in tp_traverse
    |
    PyStructSequence_Desc # tuple subclass that uses a hack to hide some fields from Python code
    |
    PyCapsule_Destructor # destructor for capsules
    |
    PyObject_GC_U?n?Track # potentially (?) problematic GC access
    |
    PyUnicode_(?:DATA|WRITE|READ) # direct access to unicode void*
    |
    Py_TYPE\([^\)]+\)\s*=\s*(?!__pyx)[_\(\*_a-zA-Z0-9]+ # assignment to type
    |
    tp_traverse\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+ # own code that runs on GC
    |
    ob_type\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+ # assignment to type
    |
    tp_bases?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+ # assignment to base and bases
    |
    tp_finalize?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+
    |
    tp_del?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+
    |
    PyTypeObject\s+(?!__pyx)[ \(\*_a-zA-Z0-9_]+\s+=\s+{ # static type definition
    """,
    re.VERBOSE,
)
# Weak signal: any occurrence of the CPython API prefix at all.
PYTHON_API_PATTERN = re.compile("Py_")
def projects():
    """Fetch the names of the top PyPI packages by 30-day downloads."""
    top_packages_url = (
        "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"
    )
    response = session.get(top_packages_url)
    response.raise_for_status()
    data = response.json()
    return [row["project"] for row in data["rows"]]
def find_url(proj):
    """Return a source-archive URL for *proj*, or None.

    Prefers the sdist URL from the PyPI JSON API and falls back to the
    hand-maintained EXTRA_URLS table.  Returns None when neither yields a
    URL (e.g. a project that only publishes universal wheels).
    """
    resp = session.get(f"https://pypi.org/pypi/{proj}/json")
    try:
        for u in resp.json()["urls"]:
            if u["packagetype"] == "sdist":
                return u["url"]
    except (ValueError, KeyError, TypeError):
        # Narrowed from a bare ``except:`` (which also swallowed
        # KeyboardInterrupt/SystemExit): a non-JSON body (e.g. a 404 page)
        # raises ValueError, a missing/odd structure raises KeyError or
        # TypeError.  Fall through to the manual table either way.
        pass
    return EXTRA_URLS.get(proj, None)
def download_sdist(idx, proj):
    """Download the sdist archive for *proj* into a generically named file.

    The archive is saved as ``<idx>_<proj>`` next to an empty stamp file
    named ``<idx>_<proj>_<original archive name>``; the stamp records the
    real archive name (and thus its type).  If the download already exists,
    nothing is fetched again — not even the URL — so a previous download
    must be deleted manually to get the most recent package.

    :param idx: zero-padded index string, used to order files on disk
    :param proj: PyPI project name
    :return: ``(filename, stamp)``, or None if no source URL was found
    """
    filename = f"{idx:>05}_{proj}"
    if os.path.exists(filename):
        print(f"Exists: {filename}")
        # Recover the matching stamp file from a previous run.
        for entry in os.scandir():
            if entry.is_file() and entry.name.startswith(filename) and entry.name != filename:
                return filename, entry.name
    url = find_url(proj)
    if not url:
        # Universal wheel only, maybe.
        print(f"Cannot find url for {proj}")
        return None
    # Stamp name = generic download name + the archive's original basename.
    stamp = f'{filename}_{url[url.rfind("/") + 1 :]}'
    print(f"Saving {stamp} to {filename}")
    resp = session.get(url)
    resp.raise_for_status()
    with open(filename, "wb") as f:
        f.write(resp.content)
    with open(stamp, "wb") as f:
        f.write(b"")
    return filename, stamp
class PackageInfo(
    namedtuple("PackageInfo", "name interesting_files native_apis sloccount")
):
    """Per-package analysis record.

    Fields:
      name              -- PyPI project name
      interesting_files -- set of extracted file paths that matched an API pattern
      native_apis       -- dict mapping matched API snippet -> set of file paths
      sloccount         -- mutable 3-slot list holding, in order, the API-files
                           sloccount lines, the 'Py'-line count, and the total
                           sloccount lines (see the properties below)
    """

    @classmethod
    def new(cls, name, interesting_files, native_apis):
        """Build a record with all three sloccount slots empty."""
        return cls(name, interesting_files, native_apis, [[], 0, []])

    # The namedtuple itself is immutable, so the three counters live inside
    # a mutable list and are read/written through these properties.

    @property
    def api_sloccount(self):
        return self.sloccount[0]

    @api_sloccount.setter
    def api_sloccount(self, lines):
        self.sloccount[0] = lines

    @property
    def py_sloccount(self):
        return self.sloccount[1]

    @py_sloccount.setter
    def py_sloccount(self, count):
        self.sloccount[1] = count

    @property
    def total_sloccount(self):
        return self.sloccount[2]

    @total_sloccount.setter
    def total_sloccount(self, lines):
        self.sloccount[2] = lines

    def __repr__(self):
        suffixes = {os.path.splitext(path)[1] for path in self.interesting_files}
        extension_text = ",".join(suffixes)
        api_text = pprint.pformat(list(self.native_apis.keys()), width=120, indent=1)
        if "\n" in api_text:
            # indent a multi-line API listing under the header line
            api_text = ("\n" + api_text).replace("\n", "\n ")
        separator = "\n "
        report = [separator, *self.total_sloccount, separator]
        if self.api_sloccount:
            report += ["In files using C API", separator, *self.api_sloccount, separator]
        if self.py_sloccount:
            report += [
                f"Counting only lines with 'Py' in them: {self.py_sloccount}",
                separator,
            ]
        return f"{self.name}: {extension_text} with {api_text}\n{''.join(report)}"
def _sloccount_summary(directory):
    """Run the external ``sloccount`` tool on *directory* and return the
    'SLOC-by-Language' header line plus the line following it ([] if the
    header is absent, e.g. when sloccount is not installed)."""
    summary = []
    for line in subprocess.getoutput(f"sloccount {directory}").split("\n"):
        if summary:
            summary.append(line)
            break
        elif "SLOC-by-Language" in line:
            summary.append(line)
    return summary


def main(projs, prefix=""):
    """Download, extract, and analyse projects for CPython C API usage.

    For every project in *projs* the sdist is downloaded (cached on disk by
    download_sdist), native source files are extracted, scanned with
    NATIVE_API_PATTERN, and measured with sloccount.  Per-package reports
    and aggregate SLOC totals are printed.

    :param projs: iterable of PyPI project names
    :param prefix: prefix for the numbered download filenames
                   (e.g. "DS_" for the data-science list)
    """
    projs = list(projs)  # len() is needed for the summary line below
    infos = []
    for idx, p in enumerate(projs):
        try:
            downloaded = download_sdist(f"{prefix}{idx}", p)
        except Exception:
            traceback.print_exc()
            print(f"Failed to download {p}")
            continue
        if downloaded is None:
            # No source URL could be determined; download_sdist reported it.
            # (Previously the None return crashed the tuple-unpack and was
            # misreported as a download failure.)
            continue
        filename, stamp = downloaded
        # The stamp name carries the original archive name, which tells us
        # how to open the generically named download.
        if stamp.endswith(".zip"):
            archive = zipfile.ZipFile(filename)
            names = archive.namelist()
            members = names
        elif TARFILE_PATTERN.search(stamp):
            archive = tarfile.open(filename)
            members = archive.getmembers()
            names = [m.name for m in members]
        else:
            print(f"Don't know how to extract {stamp}")
            continue
        # Keep only native-source members whose paths are safe to extract
        # (no absolute paths, no ".." traversal).
        interesting_files = [
            member
            for name, member in zip(names, members)
            if not name.startswith("/")
            and ".." not in name
            and NATIVE_FILE_PATTERN.search(name)
        ]
        if interesting_files:
            info = PackageInfo.new(name=p, interesting_files=set(), native_apis={})
            dirname = f"{filename}.dir"
            apidirname = f"{filename}.dir.with_api_usage"
            if not os.path.exists(dirname):
                # Extraction is cached between runs, like the download.
                os.makedirs(dirname, exist_ok=True)
                archive.extractall(path=dirname, members=interesting_files)
            chars = list(r"/-\|")  # spinner characters for progress feedback
            for path in interesting_files:
                print("\033[1D", chars[0], sep="", end="", flush=True)
                chars = chars[1:] + chars[:1]
                # zip members are plain names, tar members are TarInfo objects
                name = os.path.join(dirname, getattr(path, "name", path))
                with open(name, "rb") as f:
                    content = f.read().decode("utf-8", errors="replace")
                uses_c_api = False
                for m in NATIVE_API_PATTERN.finditer(content):
                    print("\033[1DX\033[1C", end="", flush=True)
                    info.interesting_files.add(name)
                    info.native_apis.setdefault(m.group(0), set()).add(name)
                    uses_c_api = True
                # Any "Py_" occurrence still makes the file worth measuring,
                # even if no specific API pattern matched.
                uses_c_api = uses_c_api or bool(PYTHON_API_PATTERN.search(content))
                if uses_c_api:
                    # Copy API-using files into a parallel tree so sloccount
                    # can measure them separately.
                    sloccountname = os.path.join(apidirname, getattr(path, "name", path))
                    sloccountnamedir = os.path.dirname(sloccountname)
                    if not os.path.exists(sloccountnamedir):
                        os.makedirs(sloccountnamedir, exist_ok=True)
                    # modify some file endings for sloccount to consider them
                    if sloccountname.endswith(".pyx"):
                        sloccountname += ".py"
                    elif sloccountname.endswith(".rs"):
                        sloccountname += ".cpp"
                    shutil.copy(name, sloccountname)
            info.total_sloccount = _sloccount_summary(dirname)
            api_sloccount_sb = []
            if os.path.exists(apidirname):
                api_sloccount_sb = _sloccount_summary(apidirname)
                for dirpath, dirnames, filenames in os.walk(apidirname):
                    print("\033[1D", chars[0], sep="", end="", flush=True)
                    chars = chars[1:] + chars[:1]
                    for fname in filenames:
                        # Skip Cython sources.  They were renamed to
                        # "*.pyx.py" above, so the original
                        # splitext(...)[1] == ".pyx" check never matched
                        # them (fixed here with endswith).
                        if fname.endswith((".pyx", ".pyx.py")):
                            continue
                        with open(
                            os.path.join(dirpath, fname),
                            "r",
                            encoding="utf-8",
                            errors="replace",  # native sources may not be valid UTF-8
                        ) as file:
                            for line in file:
                                if "Py" in line:
                                    info.py_sloccount += 1
                shutil.rmtree(apidirname)
            info.api_sloccount = api_sloccount_sb
            print()
            if info.interesting_files:
                infos.append(info)
            else:
                shutil.rmtree(dirname)
        archive.close()  # was leaked in the original loop
    total_sloc = {}
    api_sloc = {}
    py_sloc = 0
    # len(projs) instead of the last loop index: avoids a NameError for an
    # empty project list and an off-by-one in the reported total.
    print(f"\n{len(infos)} of top {len(projs)} packages found to have C API usage.\n")
    for info in infos:
        print(info, "\n")
        for el in info.total_sloccount:
            for m in SLOCCOUNT_LANGUAGE_PATTERN.finditer(el):
                total_sloc[m.group(1)] = total_sloc.get(m.group(1), 0) + int(m.group(2))
        for el in info.api_sloccount:
            for m in SLOCCOUNT_LANGUAGE_PATTERN.finditer(el):
                api_sloc[m.group(1)] = api_sloc.get(m.group(1), 0) + int(m.group(2))
        py_sloc += info.py_sloccount
    print()
    print("Totals")
    print("Package SLOC:", total_sloc)
    print("API using files SLOC:", api_sloc)
    print("Lines with 'Py':", py_sloc)
if __name__ == "__main__":
    import sys
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Analyse the top packages either of PyPI or for data science. Must choose either or!"
    )
    parser.add_argument("--top5000", action="store_true")
    parser.add_argument("--topDS", action="store_true")
    args = parser.parse_args(sys.argv[1:])

    # Both flags are booleans, so equality means "both set" or "neither
    # set" — exactly one mode must be chosen.
    if args.top5000 == args.topDS:
        parser.print_help()
        sys.exit(1)

    if args.top5000:
        main(projects())
    else:
        main(DS_IMPORTANT_PACKAGES, prefix="DS_")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment