Skip to content

Instantly share code, notes, and snippets.

@Erotemic
Created May 22, 2025 14:07
Show Gist options
  • Save Erotemic/3abd223c55b761b39fc7746ea4307faf to your computer and use it in GitHub Desktop.
Save Erotemic/3abd223c55b761b39fc7746ea4307faf to your computer and use it in GitHub Desktop.
autoprofile_webdataset_example.py
def main():
    """
    Write a small sharded webdataset, then read it back through ``wids``.

    This is a minimal working example used to try out different
    ``kernprof`` auto-profiling invocations.

    Usage:
        cd ~/code/line_profiler/dev/mwe

        # Try different invocations

        # It would be nice if this worked, but it doesn't include submodules.
        kernprof -l -z -p webdataset -p wids ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"

        # This won't really work because it only includes things directly imported in the package init.
        kernprof -l -z -p webdataset -p wids --prof-imports ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"

        # This gives us roughly what I want, but I had to manually trace which
        # modules were used in the specific code I was interested in.
        kernprof -l -z -p wids -p webdataset -p wids.wids_lru,wids.wids_dl,wids.wids_decode,wids.wids_tar,wids.wids_mmtar autoprofile_webdataset_example.py
        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"
    """
    import numpy as np
    import ubelt as ub
    import webdataset as wds

    # One source of truth for the sizes used by both the writer and the reader.
    num_samples = 103
    samples_per_shard = 10

    # Write a new webdataset (start from an empty output directory)
    out_dpath = ub.Path.appdir('line_profiler/tests/mwe/wds-example')
    out_dpath.delete().ensuredir()

    # Test webdataset writer
    writer = wds.ShardWriter(str(out_dpath / "%06d.tar"), maxcount=samples_per_shard)
    try:
        for idx in range(num_samples):
            sample = {
                '__key__': f'index_{idx:04d}',
                'data.pyd': {
                    'imdata': np.random.rand(3, 3),
                    'label': np.random.randint(0, 3),
                },
            }
            print(f'write sample = {ub.urepr(sample, nl=1)}')
            writer.write(sample)
    finally:
        # Close the writer even when a write raises part-way through.
        writer.close()

    # Gather shards that we wrote: one list of files per leaf directory
    # (a directory that has files but no sub-directories).
    leaf_shards = [
        [r / f for f in fs]
        for r, ds, fs in out_dpath.walk()
        if not ds and fs
    ]
    print(f'leaf_shards = {ub.urepr(leaf_shards, nl=2)}')

    # test wids reader
    import wids
    import os
    os.environ['WIDS_VERBOSE'] = '1'

    for shards in leaf_shards:
        paths = sorted(str(s) for s in shards)
        # NOTE(review): every shard is declared with ``samples_per_shard``
        # samples, but 103 % 10 == 3, so the last shard written above holds
        # fewer. Confirm wids tolerates the over-estimate.
        wids_paths = [{'url': p, 'nsamples': samples_per_shard} for p in paths]
        print(f'wids_paths = {ub.urepr(wids_paths, nl=1)}')

        # Use a fresh cache directory for each group of shards.
        cache_dir = ub.Path('./tmp-cache').delete().ensuredir()

        wids_bucket = wids.ShardListDataset(
            wids_paths,
            cache_dir=str(cache_dir),
            transformations=lambda x: x,
        )

        # Exercise direct indexing once before iterating; the result is unused.
        wids_bucket[0]

        # Start from zero so an empty read reports 0 rather than a stale
        # ``idx`` left over from the writer loop above.
        max_idx = 0
        for idx, item in ub.ProgIter(enumerate(wids_bucket), verbose=3, desc='reading from shards'):
            print(f'read sample (idx={idx}) = {ub.urepr(item, nl=1)}')
            max_idx = idx + 1
        print(f'max_idx={max_idx}')
# Script entry point.
#
# CommandLine:
#     python ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment