Created
April 30, 2023 08:19
-
-
Save avelican/4a161089b7391415a092833e7aca6989 to your computer and use it in GitHub Desktop.
Get YouTube transcript (from subtitles / caption file)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get a YouTube transcript from the video's subtitle / caption track.
#
# Step 1 runs yt-dlp to fetch the English subtitle files (no video download).
# Step 2 converts one downloaded *.vtt file in the current directory into a
# plain-text transcript saved next to it as "<name>.vtt.txt".
#
# Usage: python <this script> <youtube-video-url>
import glob
import os
import re
import subprocess
import sys

# Inline WebVTT markup: <c>, </c>, <v Speaker>, and the <00:00:01.234>
# word-timing tags that auto-generated captions embed inside cue text.
_TAG_RE = re.compile(r"<[^>]*>")

# Stripped lines that are never written to the transcript.
_IGNORED_LINES = frozenset({"[Music]", "WEBVTT"})


def build_command(video_url):
    """Return the yt-dlp argument list that fetches subtitles for *video_url*."""
    return [
        "yt-dlp",
        "--write-sub",
        "--write-auto-sub",
        "--sub-lang",
        "en.*",
        "--skip-download",
        video_url,
    ]


def download_subtitles(video_url):
    """Run yt-dlp for *video_url* and report the outcome on stdout.

    Returns True when yt-dlp exits with status 0 and False when it exits
    with a non-zero status.

    Raises:
        FileNotFoundError: if the yt-dlp executable cannot be started.
    """
    try:
        subprocess.run(build_command(video_url), check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error: {e}")
        return False
    print("Command executed successfully.")
    return True


def extract_caption_text(lines):
    """Yield the unique caption text lines from an iterable of WebVTT lines.

    Everything before the first cue timing line (the one containing "-->")
    is file header and is skipped, so header fields such as "Kind: captions"
    are dropped without discarding spoken lines that merely contain a colon.
    Inline tags are removed rather than dropping the whole line, and each
    distinct text line is emitted only once because rolling captions repeat
    the previous line in the following cue.
    """
    seen = set()
    in_cues = False
    for raw in lines:
        if "-->" in raw:
            # Cue timing line: marks the end of the header, carries no text.
            in_cues = True
            continue
        if not in_cues:
            continue
        text = _TAG_RE.sub("", raw).strip()
        if not text or text in _IGNORED_LINES or text in seen:
            continue
        seen.add(text)
        yield text


def find_latest_vtt(pattern="*.vtt"):
    """Return the most recently modified file matching *pattern*, or None.

    glob order is unspecified, so taking the newest file avoids converting
    a stale caption file left behind by an earlier run.
    """
    candidates = glob.glob(pattern)
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)


def convert_vtt_to_txt(vtt_path):
    """Write the transcript of *vtt_path* to "<vtt_path>.txt" and return that path.

    Both files are handled as UTF-8 (the encoding WebVTT mandates) so the
    result does not depend on the platform's default locale encoding.
    """
    output_path = vtt_path + '.txt'
    with open(vtt_path, 'r', encoding='utf-8') as infile, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        print('Saving output to ' + output_path)
        outfile.writelines(text + "\n" for text in extract_caption_text(infile))
    return output_path


def main(argv=None):
    """Download subtitles for the URL in *argv* and convert them to text.

    *argv* defaults to the command-line arguments after the script name.
    Returns the process exit status: 1 when no URL is given or yt-dlp is
    not available, otherwise 0.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print("Please provide a YouTube video URL as the first argument.")
        return 1

    try:
        # A failed download is reported but not fatal: a caption file may
        # already be present in the directory and still be convertible.
        download_subtitles(args[0])
    except FileNotFoundError:
        print(
            "Error: yt-dlp was not found; install it and make sure it is on PATH.",
            file=sys.stderr,
        )
        return 1

    vtt_path = find_latest_vtt()
    if vtt_path is None:
        print("No *.vtt files found in the current directory.")
        return 0

    # The downloaded *.vtt files are deliberately left on disk; remove them
    # manually if they are not wanted.
    convert_vtt_to_txt(vtt_path)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment