✅✅✅ My working code: create a WebDataset from local data files and write it to a local .tar file
```python
## example code for webdataset
import io
import json

import webdataset as wds

print("👉 WebDataset version:", wds.__version__)

tar_stream = io.BytesIO()
base_name = "100313"

with wds.TarWriter(tar_stream) as sink:
    with open("../data/bin-images/100313.jpg", "rb") as f:
        image_data = f.read()
    with open("../data/metadata/100313.json", "rb") as f:
        label = json.load(f)['EXPECTED_QUANTITY']
    ## load json binary
    with open("../data/metadata/100313.json", "rb") as f:
        metadata_data = f.read()
    # Save as a WebDataset sample
    sink.write({
        "__key__": f"{base_name}",
        "image": image_data,
        "label": str(label),
        "metadata": metadata_data,
    })

# Once the tar file is in memory, save it to a local file
tar_stream.seek(0)
with open("../data/test/test.tar", "wb") as f:
    f.write(tar_stream.getvalue())

!tar -tf ../data/test/test.tar
```
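Beyond `tar -tf`, the archive can be sanity-checked by reading it back with `wds.WebDataset`. A minimal sketch; note that without a decoder stage the values come back as raw bytes:

```python
## quick round-trip check of the tar written above
import webdataset as wds

check_ds = wds.WebDataset("../data/test/test.tar", shardshuffle=False)
for sample in check_ds:
    print(sample["__key__"])       # "100313"
    print(type(sample["image"]))   # raw bytes -- no decoder applied
    print(sample["label"])         # e.g. b"3"; decode before use
    break
```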
✅✅✅ My working code: stream WebDataset .tar shards from S3 and transform the data for training
```python
## test code streaming data from s3. pay attention to the object types.
import io

import matplotlib.pyplot as plt
import webdataset as wds
from PIL import Image

try:
    s3_uri = "s3://p5-amazon-bin-images/webdataset/train/train-shard-{000000..000001}.tar"
    path = f"pipe:aws s3 cp {s3_uri} -"  ## write to standard output (stdout)
    train_dataset = (
        wds.WebDataset(
            path,
            shardshuffle=True,
            # nodesplitter=wds.split_by_worker,  ## distributed training
        )
        .shuffle(1000)  # Shuffle dataset
        ## The tuple names have to be the same as the WebDataset keys;
        ## check the "scripts_process/*convert_to_webdataset*.py" files
        .to_tuple("image", "label")  ## Tuple of image and label
        .map_tuple(
            lambda x: Image.open(io.BytesIO(x)),  # Apply the train transforms to the image
            lambda x: x.decode(),
        )
    )

    for image, label in iter(train_dataset):
        print(type(label), label)
        # img = Image.open(io.BytesIO(image))
        print(type(image))
        plt.imshow(image)
        plt.show()
        break
except Exception as e:
    print(e)
```
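The `train-shard-{000000..000001}.tar` shards referenced above can be produced with `wds.ShardWriter`, which rolls over to a new tar file after `maxcount` samples. A hedged sketch; the `my_samples` iterator and the `maxcount` value are assumptions, not part of my pipeline:

```python
## write sharded tar files locally, then upload them to S3
import webdataset as wds

with wds.ShardWriter("train-shard-%06d.tar", maxcount=1000) as sink:
    for base_name, image_data, label, metadata_data in my_samples:  # your own sample iterator
        sink.write({
            "__key__": base_name,
            "image": image_data,        # raw jpg bytes
            "label": str(label),
            "metadata": metadata_data,  # raw json bytes
        })
## then e.g.: aws s3 cp train-shard-000000.tar s3://p5-amazon-bin-images/webdataset/train/
```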
⚠️ WebDataset pipeline that can work with the Torch DataLoader
Data ingestion and transformation work fine. However, splitting the iteration across GPUs (ranks, nodes, and workers) still has some issues (a possible fix is sketched after the class definition below).
```python
import io

import torch
from PIL import Image
from torchvision import transforms

def key_transform(x):
    return int(x)

## Use a callable class instead of a lambda: lambdas can't be pickled,
## which breaks DataLoader workers when num_workers > 0
class image_transform:
    def __call__(self, x):
        return Image.open(io.BytesIO(x))

train_transform = transforms.Compose([
    image_transform(),
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
])

def label_transform(x):
    ## Original labels are (1, 2, 3, 4, 5)
    ## Convert to (0, 1, 2, 3, 4)
    return torch.tensor(int(x.decode()) - 1, dtype=torch.int64)
```
```python
import webdataset as wds
from torch.utils.data import IterableDataset

## The WebDataset class inherits from the IterableDataset class
class WebDatasetDDP(IterableDataset):
    def __init__(self,
        path,
        num_samples=0,
        world_size=1,
        rank=0,
        no_shuffle=False,
        shuffle_shard_size=100,
        split_by_node=False,
        split_by_worker=False,
        # shardshuffle=True,
        # empty_check=False,
        key_transform=None,
        train_transform=None,
        label_transform=None,
        shuffle_sample_size=1000,
        # batch_size=64,
    ):
        super().__init__()
        self.dataset = (
            ## WebDataset
            ## https://github.com/webdataset/webdataset?tab=readme-ov-file#the-webdataset-library
            # wds.WebDataset(
            #     path,
            #     shardshuffle=shardshuffle,
            #     ## Official doc: add wds.split_by_node here if you are using multiple nodes
            #     # nodesplitter=wds.split_by_node,
            #     ## Or: "ValueError: you need to add an explicit nodesplitter
            #     ## to your input pipeline for multi-node training"
            #     nodesplitter=wds.split_by_worker,
            #     empty_check=empty_check,
            # )
            # .shuffle(shuffle_buffer_size)  # Shuffle dataset
            # ## The tuple names have to be the same as the WebDataset keys;
            # ## check the "scripts_process/*convert_to_webdataset*.py" files
            # .to_tuple("__key__", "image", "label")  ## Tuple of image and label
            # .map_tuple(
            #     key_transform,
            #     train_transform,  # Apply the train transforms to the image
            #     ## lambda functions can't be pickled, hence cause errors when num_workers > 1
            #     label_transform,
            # )
            ## WebDataset pipeline
            ## https://github.com/webdataset/webdataset?tab=readme-ov-file#the-webdataset-pipeline-api
            wds.DataPipeline(
                wds.SimpleShardList(path),
                # at this point we have an iterator over all the shards
                wds.shuffle(shuffle_shard_size) if not no_shuffle else None,
                # add wds.split_by_node here if you are using multiple nodes
                wds.split_by_node if split_by_node else None,
                wds.split_by_worker if split_by_worker else None,
                # at this point, we have an iterator over the shards assigned to each worker
                wds.tarfile_to_samples(),
                # this shuffles the samples in memory
                wds.shuffle(shuffle_sample_size) if not no_shuffle else None,
                # this decodes the images and json
                # wds.decode("pil"),
                wds.to_tuple("__key__", "image", "label"),
                # wds.map(preprocess),
                wds.map_tuple(
                    key_transform,
                    train_transform,
                    label_transform,
                ),
                wds.shuffle(shuffle_sample_size) if not no_shuffle else None,
                # wds.batched(batch_size),
            )
        )
        self.world_size = world_size
        self.rank = rank
        self.num_samples = num_samples
        self.split_by_node = split_by_node
        self.split_by_worker = split_by_worker

    def __len__(self):
        return self.num_samples

    def __iter__(self):
        ## Use the dataset keys to distribute data across GPUs
        ## ⚠️ needs a fix
        for key, image, label in self.dataset:
            if key % self.world_size == self.rank:  ## Ensure each GPU gets different data
                yield (image, label)
```
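One candidate fix for the flagged `__iter__` (an assumption, not my final solution): split the stream positionally with `itertools.islice` instead of by key value, so every rank gets an equal share even when the keys are not uniformly distributed. Note this would double-split the data if `split_by_node` is also enabled, so only one mechanism should be active:

```python
import itertools

## hypothetical drop-in replacement for WebDatasetDDP.__iter__
def __iter__(self):
    ## round-robin by position rather than by key
    samples = iter(self.dataset)
    for key, image, label in itertools.islice(samples, self.rank, None, self.world_size):
        yield (image, label)
```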
```python
import torch.distributed as dist

def collate_fn(batch):
    images, labels = zip(*batch)
    # Stack the images into a single tensor (this assumes the images have the same size)
    images = torch.stack(images)
    labels = torch.stack(labels)
    return images, labels

## For distributed data training: use torch.utils.data.DistributedSampler or WebDataset?
## "task" holds the training config and loaders in the surrounding script
path = f"pipe:aws s3 cp {task.config.train_data_path} -"
train_dataset = WebDatasetDDP(
    path,
    num_samples=task.config.train_data_size,
    world_size=dist.get_world_size(),
    rank=dist.get_rank(),
    split_by_node=True,
    split_by_worker=True,
    shuffle_sample_size=1000,
    key_transform=key_transform,
    train_transform=train_transform,
    label_transform=label_transform,
)
```
path=f"pipe:aws s3 cp {task.config.val_data_path} -"val_dataset= (
WebDatasetDDP(
path,
num_samples=task.config.val_data_size,
no_shuffle=True,
key_transform=key_transform,
train_transform=train_transform,
label_transform=label_transform,
)
)
path=f"pipe:aws s3 cp {task.config.test_data_path} -"test_dataset= (
WebDatasetDDP(
path,
num_samples=task.config.test_data_size,
no_shuffle=True,
key_transform=key_transform,
train_transform=train_transform,
label_transform=label_transform,
)
)
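With an `IterableDataset`, `__len__` is informational only; nothing forces each rank to see the same number of samples per epoch. One common workaround (a hedged sketch, not what this code currently does) is webdataset's `with_epoch`, which caps the number of samples drawn per epoch:

```python
## hypothetical: pin an identical epoch length on every rank
samples_per_rank = task.config.train_data_size // dist.get_world_size()
train_dataset.dataset = train_dataset.dataset.with_epoch(samples_per_rank)
```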
```python
import numpy as np

## Handle class imbalance; the class weights will be used in the loss functions.
## If train_dataset were an instance of torchvision.datasets.ImageFolder(),
## the weights could be computed directly (class_weights would be a numpy.ndarray):
# class_weights = compute_class_weight(
#     class_weight='balanced',
#     classes=np.unique(train_dataset.cls),
#     y=train_dataset.cls)
## Use pre-calculated class weights if the dataset is very large.
classes = np.unique(list(task.config.class_weights_dict.keys()))  ## It has to be sorted.
task.config.num_classes = len(classes)  ## number of total classes, needed for net creation
class_weights = [task.config.class_weights_dict[k] for k in classes]
class_weights = torch.tensor(
    class_weights,
    dtype=torch.float32).to(task.config.device)
```
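These weights are then typically handed to the loss function, e.g.:

```python
import torch.nn as nn

## weighted loss; class_weights already lives on task.config.device
criterion = nn.CrossEntropyLoss(weight=class_weights)
```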
```python
from torch.utils.data import DataLoader

# ## SMDDP: set num_replicas and rank in the Torch DistributedSampler
# train_sampler = DistributedSampler(  ## ⚠️ doesn't work with WebDataset
#     train_dataset,
#     num_replicas=dist.get_world_size(),
#     rank=dist.get_rank(),
#     shuffle=False,
# )

## Torch dataloaders
task.train_loader = DataLoader(
    train_dataset,
    batch_size=task.config.batch_size_ddp,
    shuffle=False,  ## Don't shuffle for Distributed Data Parallel (DDP)
    # sampler=train_sampler,  ## ⚠️ DistributedSampler + WebDataset causes an error
    num_workers=task.config.num_cpu,
    persistent_workers=True,
    pin_memory=True,
    collate_fn=collate_fn,
)

task.val_loader = DataLoader(
    val_dataset,
    batch_size=task.config.batch_size,
    shuffle=False,  ## Don't shuffle for eval anyway; no DDP sampler
    num_workers=task.config.num_cpu,
    persistent_workers=True,
    pin_memory=True,
    collate_fn=collate_fn,
)

task.test_loader = DataLoader(
    test_dataset,
    batch_size=task.config.batch_size,
    shuffle=False,  ## Don't shuffle for eval anyway; no DDP sampler
    num_workers=task.config.num_cpu,
    persistent_workers=True,
    pin_memory=True,
    collate_fn=collate_fn,
)
```
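A minimal sketch of consuming these loaders in the training loop; `task.config.epochs`, `task.model`, and `task.optimizer` are assumed handles from the surrounding script:

```python
for epoch in range(task.config.epochs):      # assumed config field
    for images, labels in task.train_loader:
        ## non_blocking=True pairs with pin_memory=True in the DataLoader
        images = images.to(task.config.device, non_blocking=True)
        labels = labels.to(task.config.device, non_blocking=True)
        outputs = task.model(images)         # assumed model handle
        loss = criterion(outputs, labels)    # class-weighted loss from above
        task.optimizer.zero_grad()           # assumed optimizer handle
        loss.backward()
        task.optimizer.step()
```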