Created
May 22, 2025 14:07
-
-
Save Erotemic/3abd223c55b761b39fc7746ea4307faf to your computer and use it in GitHub Desktop.
autoprofile_webdataset_example.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def main():
    """
    Write a small sharded webdataset to disk, then read it back with wids.

    The script is meant to be run under kernprof so the ``webdataset`` and
    ``wids`` packages can be auto-profiled while it executes.

    Usage:
        cd ~/code/line_profiler/dev/mwe

        # Try different invocations

        # It would be nice if this worked, but it doesn't include submodules.
        kernprof -l -z -p webdataset -p wids ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"

        # This won't really work because it only includes things directly imported in the package init.
        kernprof -l -z -p webdataset -p wids --prof-imports ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"

        # This gives us roughly what I want, but I had to manually trace which
        # modules were used in the specific code I was interested in.
        kernprof -l -z -p wids -p webdataset -p wids.wids_lru,wids.wids_dl,wids.wids_decode,wids.wids_tar,wids.wids_mmtar autoprofile_webdataset_example.py
        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"
    """
    import os

    import numpy as np
    import ubelt as ub
    import webdataset as wds

    # The writer's shard capacity and the per-shard sample count declared to
    # the reader must agree, so both come from the same constant.
    num_samples = 103
    samples_per_shard = 10

    # Write a new webdataset (start from an empty output directory)
    out_dpath = ub.Path.appdir('line_profiler/tests/mwe/wds-example')
    out_dpath.delete().ensuredir()

    # Test webdataset writer
    # The context manager closes the writer even if a write raises.
    shard_pattern = str(out_dpath / "%06d.tar")
    with wds.ShardWriter(shard_pattern, maxcount=samples_per_shard) as writer:
        for idx in range(num_samples):
            sample = {
                '__key__': f'index_{idx:04d}',
                'data.pyd': {
                    'imdata': np.random.rand(3, 3),
                    'label': np.random.randint(0, 3),
                },
            }
            print(f'write sample = {ub.urepr(sample, nl=1)}')
            writer.write(sample)

    # Gather shards that we wrote: one list of files per directory that
    # contains files but no subdirectories.
    leaf_shards = [
        [root / fname for fname in fnames]
        for root, dnames, fnames in out_dpath.walk()
        if not dnames and fnames
    ]
    print(f'leaf_shards = {ub.urepr(leaf_shards, nl=2)}')

    # test wids reader
    # Imported here so the write phase above does not depend on wids being
    # importable.
    import wids

    for shards in leaf_shards:
        paths = sorted(str(s) for s in shards)
        # NOTE(review): 103 samples at 10 per shard presumably leaves a short
        # final shard, yet every shard is declared with the full count —
        # confirm wids tolerates the mismatch.
        wids_paths = [
            {'url': p, 'nsamples': samples_per_shard} for p in paths
        ]
        print(f'wids_paths = {ub.urepr(wids_paths, nl=1)}')

        # Start from an empty cache directory on every pass.
        cache_dir = ub.Path('./tmp-cache').delete().ensuredir()
        os.environ['WIDS_VERBOSE'] = '1'
        wids_bucket = wids.ShardListDataset(
            wids_paths,
            cache_dir=str(cache_dir),
            transformations=lambda x: x,
        )

        # Fetch one sample by index before the full pass; the result is
        # intentionally discarded.
        wids_bucket[0]

        # Track the count explicitly so an empty read reports 0 instead of
        # reusing the stale ``idx`` left over from the writer loop.
        max_idx = 0
        prog = ub.ProgIter(
            enumerate(wids_bucket), verbose=3, desc='reading from shards')
        for idx, item in prog:
            print(f'read sample (idx={idx}) = {ub.urepr(item, nl=1)}')
            max_idx = idx + 1
        print(f'max_idx={max_idx}')
if __name__ == '__main__':
    """
    CommandLine:
        python ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
    """
    # Script entry point: run the example when executed directly.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment