Skip to content

Instantly share code, notes, and snippets.

@kenenbek
Created May 30, 2025 11:55
Show Gist options
  • Save kenenbek/0be21e057d419e2037e778513f6d7036 to your computer and use it in GitHub Desktop.
Save kenenbek/0be21e057d419e2037e778513f6d7036 to your computer and use it in GitHub Desktop.
import os
import pandas as pd
import librosa
import csv
import re
# Table of rows to process; each row is read below for its 'path',
# 'segment' and 'start' columns.
input_file = "deperson_13_files.csv"  # Replace with the actual file path
input_df = pd.read_csv(input_file)

# Root directory searched for the audio files and their segment folders.
output_dir = "./downloaded_audio"

# Number of chunks to remove
chunks_to_remove = 5

# Column-oriented accumulator for the result table: one independent list
# per output column ('segment' tracks the segment number of each row).
output = {column: [] for column in ('path', 'segment', 'start', 'end')}
def natural_sort_key(s):
    """
    Key function for natural sorting (sorts numbers in strings numerically).

    Example: "file2" < "file10" (unlike lexicographic sorting).

    The string is split on runs of ASCII digits. The returned list
    alternates lowercased text chunks (even positions, possibly empty)
    and integers (odd positions), so keys built from any two strings
    always compare text with text and number with number.
    """
    # Because the pattern has a capturing group, re.split() puts every
    # matched digit run at an odd index and the text between runs at an
    # even index. Deciding by position instead of str.isdigit() avoids
    # mis-classifying text chunks made of non-ASCII digit characters
    # (e.g. a superscript two), which int() rejects or which would put
    # an int where other keys hold a str.
    return [
        int(token) if position % 2 else token.lower()
        for position, token in enumerate(re.split(r'([0-9]+)', s))
    ]
def listdir_lsv(directory):
    """
    List the entries of `directory` in natural order, like `ls -1v`.

    Entry names come from os.listdir() and are ordered with
    natural_sort_key, so embedded numbers sort numerically.
    """
    return sorted(os.listdir(directory), key=natural_sort_key)
def get_audio_duration(file_path):
    """
    Return the duration of the audio file at `file_path`.

    Thin wrapper around librosa.get_duration(path=...).
    """
    return librosa.get_duration(path=file_path)
# For every input row, work out where the removed span ends: the span
# starts at the row's 'start' value and covers `chunks_to_remove` segments
# after the row's segment, or runs to the end of the audio when fewer
# segments remain.

# Sorted .wav listing per segments directory. Rows that refer to the same
# audio file share one directory, so it is listed and sorted only once.
_segment_names_by_dir = {}

for _, row in input_df.iterrows():
    audio_path = row['path']
    current_segment_num = int(row['segment'])
    start_time = float(row['start'])

    # Segments of an audio file live in a folder named after the file
    # (without extension) inside output_dir.
    basename = os.path.splitext(os.path.basename(audio_path))[0]
    segments_dir = os.path.join(output_dir, basename)

    segment_names = _segment_names_by_dir.get(segments_dir)
    if segment_names is None:
        segment_names = [
            f for f in listdir_lsv(segments_dir) if f.lower().endswith('.wav')
        ]
        _segment_names_by_dir[segments_dir] = segment_names
    total_chunks_num = len(segment_names)

    if current_segment_num + chunks_to_remove >= total_chunks_num:
        # Not enough segments left after the current one: the span runs
        # to the end of the whole audio file.
        end_time = get_audio_duration(os.path.join(output_dir, audio_path))
    else:
        final_segment_name = segment_names[current_segment_num + chunks_to_remove]
        final_segment_list = os.path.splitext(final_segment_name)[0].split("_")
        # The fourth "_"-separated field of the segment file name is used
        # as the end time. NOTE(review): presumably a timestamp in seconds
        # encoded in the name; confirm against the segment naming scheme.
        end_time = float(final_segment_list[3])

    output['path'].append(audio_path)
    output['segment'].append(current_segment_num)
    output['start'].append(start_time)
    output['end'].append(end_time)

# Write the collected rows once, after all input rows are processed.
out_df = pd.DataFrame(output)
out_df.to_csv(f'test_deperson_13_files_{chunks_to_remove}.csv', index=False,
              quoting=csv.QUOTE_MINIMAL, sep=',')
# if current_segment_num + chunks_to_remove >= total_chunks_num:
# end_time = get_audio_duration(audio_file)
# print(audio_file, start_time, end_time, sep=", ")
# break
#
# final_segment_name = segment_names[current_segment_num + chunks_to_remove]
# final_segment_list = os.path.splitext(final_segment_name)[0].split("_")
# end_time = float(final_segment_list[3])
#
# audio = AudioSegment.from_file(audio_file)
#
# # Convert to milliseconds
# start_ms = int(start_time * 1000)
# end_ms = int(end_time * 1000)
#
# # Trim the audio
# trimmed = audio[:start_ms] + audio[end_ms:]
#
# print(audio_file, start_time, end_time, sep=", ")
# output['path'].append(base_name)
# output['segment'].append(current_segment_num)
# output['start'].append(start_time)
# output['end'].append(end_time)
# # Export trimmed audio
# # trimmed.export(os.path.join(out_dir, base_name + "_trimmed.wav"), format="wav")
# # print("Trimmed audio file to:", os.path.join(out_dir, base_name + "_trimmed.wav"))
# # sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment