Erotemic · May 22, 2025 14:07
diff --git a/autoprofile_webdataset_example.py b/autoprofile_webdataset_example.py

 def main():
    """
    Write a small webdataset with ``wds.ShardWriter`` and read it back with ``wids``.

    103 samples are written in shards of at most 10 samples, the resulting
    shard files are collected per leaf directory, and each group of shards is
    then read back through ``wids.ShardListDataset``. The script is meant to be
    run under ``kernprof`` as shown below.

    Usage:
        cd ~/code/line_profiler/dev/mwe

        # Try different invocations

        # It would be nice if this worked, but it doesn't include submodules.
        kernprof -l -z -p webdataset -p wids ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"

        # This won't really work because it only includes things directly imported in the package init.
        kernprof -l -z -p webdataset -p wids --prof-imports ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"

        # This gives us roughly what I want, but I had to manually trace which
        # modules were used in the specific code I was interested in.
        kernprof -l -z -p wids -p webdataset -p wids.wids_lru,wids.wids_dl,wids.wids_decode,wids.wids_tar,wids.wids_mmtar autoprofile_webdataset_example.py
        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"
    """
    import numpy as np
    import ubelt as ub
    import webdataset as wds

    # Write a new webdataset (start from an empty output directory)
    out_dpath = ub.Path.appdir('line_profiler/tests/mwe/wds-example')
    out_dpath.delete().ensuredir()

    # Test webdataset writer. The context manager closes the writer even when
    # writing a sample raises, so a partially written shard is not left open.
    with wds.ShardWriter(str(out_dpath / "%06d.tar"), maxcount=10) as writer:
        for write_idx in range(103):
            sample = {
                '__key__': f'index_{write_idx:04d}',
                'data.pyd': {
                    'imdata': np.random.rand(3, 3),
                    'label': np.random.randint(0, 3),
                },
            }
            print(f'write sample = {ub.urepr(sample, nl=1)}')
            writer.write(sample)

    # Gather shards that we wrote: one list of files per directory that has
    # files but no sub-directories.
    leaf_shards = [
        [root / fname for fname in fnames]
        for root, dnames, fnames in out_dpath.walk()
        if not dnames and fnames
    ]
    print(f'leaf_shards = {ub.urepr(leaf_shards, nl=2)}')

    # test wids reader
    import wids
    import os
    for shards in leaf_shards:

        paths = sorted(str(s) for s in shards)
        # NOTE(review): every shard is declared with 10 samples, but 103
        # samples at maxcount=10 presumably leaves only 3 in the last shard --
        # confirm that the over-count is intended.
        wids_paths = [{'url': p, 'nsamples': 10} for p in paths]
        print(f'wids_paths = {ub.urepr(wids_paths, nl=1)}')

        # Start every group of shards with a fresh, empty cache directory.
        cache_dir = ub.Path('./tmp-cache').delete().ensuredir()

        os.environ['WIDS_VERBOSE'] = '1'
        wids_bucket = wids.ShardListDataset(
            wids_paths,
            cache_dir=str(cache_dir),
            transformations=lambda x: x,
        )
        # Index once before looping (presumably a smoke test of random access
        # -- TODO confirm).
        wids_bucket[0]

        # Count with a dedicated variable: reusing the write-loop index would
        # report 103 samples even when the reader yields nothing.
        max_idx = 0
        for read_idx, item in ub.ProgIter(enumerate(wids_bucket), verbose=3, desc='reading from shards'):
            print(f'read sample (idx={read_idx}) = {ub.urepr(item, nl=1)}')
            max_idx = read_idx + 1
        print(f'max_idx={max_idx}')

 if __name__ == '__main__':
    # Script entry point: run the example only when executed directly.
    #
    # CommandLine:
    #     python ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
    main()

	def main():
	    """
	    Write a small webdataset with ``wds.ShardWriter`` and read it back with ``wids``.

	    103 samples are written in shards of at most 10 samples, the resulting
	    shard files are collected per leaf directory, and each group of shards is
	    then read back through ``wids.ShardListDataset``. The script is meant to be
	    run under ``kernprof`` as shown below.

	    Usage:
	        cd ~/code/line_profiler/dev/mwe

	        # Try different invocations

	        # It would be nice if this worked, but it doesn't include submodules.
	        kernprof -l -z -p webdataset -p wids ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
	        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"

	        # This won't really work because it only includes things directly imported in the package init.
	        kernprof -l -z -p webdataset -p wids --prof-imports ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
	        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"

	        # This gives us roughly what I want, but I had to manually trace which
	        # modules were used in the specific code I was interested in.
	        kernprof -l -z -p wids -p webdataset -p wids.wids_lru,wids.wids_dl,wids.wids_decode,wids.wids_tar,wids.wids_mmtar autoprofile_webdataset_example.py
	        python -m line_profiler -rmtz "autoprofile_webdataset_example.py.lprof"
	    """
	    import numpy as np
	    import ubelt as ub
	    import webdataset as wds

	    # Write a new webdataset (start from an empty output directory)
	    out_dpath = ub.Path.appdir('line_profiler/tests/mwe/wds-example')
	    out_dpath.delete().ensuredir()

	    # Test webdataset writer. The context manager closes the writer even when
	    # writing a sample raises, so a partially written shard is not left open.
	    with wds.ShardWriter(str(out_dpath / "%06d.tar"), maxcount=10) as writer:
	        for write_idx in range(103):
	            sample = {
	                '__key__': f'index_{write_idx:04d}',
	                'data.pyd': {
	                    'imdata': np.random.rand(3, 3),
	                    'label': np.random.randint(0, 3),
	                },
	            }
	            print(f'write sample = {ub.urepr(sample, nl=1)}')
	            writer.write(sample)

	    # Gather shards that we wrote: one list of files per directory that has
	    # files but no sub-directories.
	    leaf_shards = [
	        [root / fname for fname in fnames]
	        for root, dnames, fnames in out_dpath.walk()
	        if not dnames and fnames
	    ]
	    print(f'leaf_shards = {ub.urepr(leaf_shards, nl=2)}')

	    # test wids reader
	    import wids
	    import os
	    for shards in leaf_shards:

	        paths = sorted(str(s) for s in shards)
	        # NOTE(review): every shard is declared with 10 samples, but 103
	        # samples at maxcount=10 presumably leaves only 3 in the last shard --
	        # confirm that the over-count is intended.
	        wids_paths = [{'url': p, 'nsamples': 10} for p in paths]
	        print(f'wids_paths = {ub.urepr(wids_paths, nl=1)}')

	        # Start every group of shards with a fresh, empty cache directory.
	        cache_dir = ub.Path('./tmp-cache').delete().ensuredir()

	        os.environ['WIDS_VERBOSE'] = '1'
	        wids_bucket = wids.ShardListDataset(
	            wids_paths,
	            cache_dir=str(cache_dir),
	            transformations=lambda x: x,
	        )
	        # Index once before looping (presumably a smoke test of random access
	        # -- TODO confirm).
	        wids_bucket[0]

	        # Count with a dedicated variable: reusing the write-loop index would
	        # report 103 samples even when the reader yields nothing.
	        max_idx = 0
	        for read_idx, item in ub.ProgIter(enumerate(wids_bucket), verbose=3, desc='reading from shards'):
	            print(f'read sample (idx={read_idx}) = {ub.urepr(item, nl=1)}')
	            max_idx = read_idx + 1
	        print(f'max_idx={max_idx}')

	if __name__ == '__main__':
	    # Script entry point: run the example only when executed directly.
	    """
	    CommandLine:
	        python ~/code/line_profiler/dev/mwe/autoprofile_webdataset_example.py
	    """
	    main()