Created
May 30, 2025 12:23
-
-
Save kenenbek/9c7d8ecf1280938f16609d8e4b83386a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import shutil | |
from math import ceil | |
# Configuration
SOURCE_DIR = 'my_source_files_test'  # Path to source files
OUTPUT_DIR = 'output'  # Path to output worker folders
NUM_LABELERS = 5  # Number of workers
NUM_COPIES = 3  # Number of workers each file should be copied to


def distribute_files(
    source_dir=SOURCE_DIR,
    output_dir=OUTPUT_DIR,
    num_labelers=NUM_LABELERS,
    num_copies=NUM_COPIES
):
    """Copy every regular file in ``source_dir`` into several worker folders.

    Folders ``worker_0`` .. ``worker_{num_labelers - 1}`` are created under
    ``output_dir``. Files are taken in sorted name order and assigned
    round-robin, so consecutive copies of one file land in consecutive
    worker folders (wrapping around).

    Args:
        source_dir: Directory whose direct children are distributed;
            sub-directories are skipped.
        output_dir: Directory under which the worker folders are created.
        num_labelers: Number of worker folders; must be at least 1.
        num_copies: Number of distinct worker folders each file is copied to.
            Values above ``num_labelers`` are capped (a file cannot be placed
            in more folders than exist); values below 0 are treated as 0.

    Raises:
        ValueError: If ``num_labelers`` is smaller than 1.
    """
    if num_labelers < 1:
        raise ValueError(f"num_labelers must be at least 1, got {num_labelers}")

    # A file can reach at most one copy per worker. Capping here keeps the
    # slots of a single file distinct, so no de-duplication pass is needed
    # and the summary below reports what was actually done.
    copies_per_file = max(0, min(num_copies, num_labelers))

    files = sorted(
        name for name in os.listdir(source_dir)
        if os.path.isfile(os.path.join(source_dir, name))
    )
    num_files = len(files)

    # Create output directories for each worker
    worker_dirs = [
        os.path.join(output_dir, f'worker_{worker_id}')
        for worker_id in range(num_labelers)
    ]
    for worker_dir in worker_dirs:
        os.makedirs(worker_dir, exist_ok=True)

    # Round-robin: copy number `p` of file number `idx` occupies global slot
    # idx * copies_per_file + p, which is mapped onto the workers modulo
    # num_labelers.
    for idx, filename in enumerate(files):
        src = os.path.join(source_dir, filename)
        for p in range(copies_per_file):
            worker_id = (idx * copies_per_file + p) % num_labelers
            shutil.copy2(src, os.path.join(worker_dirs[worker_id], filename))

    print(f"Distributed {num_files} files to {num_labelers} workers, each file copied to {copies_per_file} workers.")
# Script entry point: run the distribution with the module-level defaults
# only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    distribute_files()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment