Skip to content

Instantly share code, notes, and snippets.

@pierrelouisbescond
Last active September 5, 2021 17:59
Show Gist options
  • Save pierrelouisbescond/c23a832407ea919be283462522602f7a to your computer and use it in GitHub Desktop.
Save pierrelouisbescond/c23a832407ea919be283462522602f7a to your computer and use it in GitHub Desktop.
# Remove from the initial dataset folders every picture that the labeling
# workbook lists either as a duplicate or as flagged for removal.
labeling_workbook = "./roman-numerals-labeling-plb-20210830.xlsx"

# Files listed in the "file" column of the "duplicates" sheet.
duplicates = pd.read_excel(labeling_workbook, sheet_name="duplicates")
duplicates_list = duplicates["file"].tolist()

# Files of the "analysis" sheet (columns B:L) whose "to_be_removed" flag is 1
# (the unreadable pictures).
files_analysis = pd.read_excel(labeling_workbook, sheet_name="analysis", usecols="B:L")
erroneous_list = files_analysis["file"][files_analysis["to_be_removed"] == 1].tolist()

removal_list = duplicates_list + erroneous_list
# Set copy of the list: the membership test below runs once per file on disk,
# so a constant-time lookup avoids rescanning the whole list each time.
removal_set = set(removal_list)

print(len(duplicates_list), "duplicates +", len(erroneous_list), "errouneous =", len(removal_list), "pictures to remove.")

nb_files_removed_total = 0

# Walk every <folder>/<label> directory and delete the PNG files that are
# listed for removal.
for folder in INITIAL_FOLDERS:
    for label in LABELS:
        nb_files_removed_label = 0
        nb_files, files = how_many_files_in_folder(DATA_FOLDER + folder + "/" + label + "/*.png")
        for file in files:
            # Entries are matched against the last 40 characters of the path.
            # NOTE(review): presumably the workbook stores 40-character file
            # names (e.g. a 36-character id plus ".png") -- confirm, since a
            # shorter or longer name would never match.
            if file[-40:] in removal_set:
                nb_files_removed_label += 1
                nb_files_removed_total += 1
                os.remove(file)

print("Total Number of files removed:", nb_files_removed_total, "\n")
initial_folder.summary()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment