Skip to content

Instantly share code, notes, and snippets.

@hippietrail
Last active February 26, 2025 11:52
Show Gist options
  • Save hippietrail/af26d363d7f6ac37565c5074f04e4f5e to your computer and use it in GitHub Desktop.
Save hippietrail/af26d363d7f6ac37565c5074f04e4f5e to your computer and use it in GitHub Desktop.
TypeScript code to fetch one or more YouTube transcripts as plain text without an API key
import url from 'url';
import parse from 'html-dom-parser';
import { Element, Text } from 'domhandler';
import { decodeXML } from 'entities';
/**
 * Downloads the YouTube watch page for one video.
 *
 * @param videoID - value sent as the `v` query parameter of the watch URL.
 * @returns the response body as text; the HTTP status is not inspected.
 */
async function getHtmlByVideoID(videoID: string): Promise<string> {
  const watchPageUrl = url.format({
    protocol: 'https',
    hostname: 'www.youtube.com',
    pathname: 'watch',
    query: { v: videoID },
  });
  const response = await fetch(watchPageUrl);
  return response.text();
}
/**
 * Returns `text` with its first code point upper-cased and the rest untouched.
 * An empty string is returned unchanged.
 */
function capitalizeFirstLetter<T extends string>(text: T): string {
  // Array.from iterates by code point, so a leading surrogate pair stays intact.
  const codePoints = Array.from(text);
  const head = codePoints[0] ?? '';
  const tail = codePoints.slice(1).join('');
  return head.toUpperCase() + tail;
}
/** Single-character punctuation marks that `main` accepts as an existing line ending. */
const puncs = Array.from(',.?!;:');
/**
 * Upper-cases the first character of a caption line and, when the line has at
 * least two characters and its last one is not in `puncs`, appends a '.'.
 * Operates on code points (string spread), not UTF-16 units.
 */
function toSentence(line: string): string {
  const [first = '', ...rest] = [...line];
  const last = rest[rest.length - 1];
  // A one-character line has no "rest" and therefore gets no trailing full stop.
  const fullStop = last !== undefined && !puncs.includes(last) ? '.' : '';
  return first.toUpperCase() + rest.join('') + fullStop;
}

/**
 * Removes repeated space-separated words from a line produced by `toSentence`.
 *
 * Exact duplicates are dropped (first occurrence wins). Two near-duplicates that
 * sentence formatting creates are then handled: a second word that equals the
 * first once capitalised, and a second-to-last word that equals the last word
 * minus its trailing punctuation. If at most one distinct word remains, the
 * input is returned unchanged.
 */
function dedupeSentenceWords(sentence: string): string {
  // Set keeps insertion order, so this retains the first occurrence of each word.
  const words = [...new Set(sentence.split(' '))];
  if (words.length <= 1) return sentence;

  if (words[0] === capitalizeFirstLetter(words[1])) words.splice(1, 1);
  if (words.length > 1) {
    const last = words[words.length - 1];
    const beforeLast = words[words.length - 2];
    if (puncs.includes(last[last.length - 1]) && last.slice(0, -1) === beforeLast) {
      words.splice(words.length - 2, 1);
    }
  }
  return words.join(' ');
}

/**
 * Extracts the first caption track referenced by a watch page and prints it to
 * stdout, one caption per line.
 *
 * @param videoID - used only in diagnostics written to stderr.
 * @param html - body of the watch page as returned by `getHtmlByVideoID`.
 * @param sentences - capitalise each line and terminate it with punctuation.
 * @param dedupe - drop repeated words; only applied together with `sentences`.
 * @throws if the page or caption document does not have the expected structure,
 *         if the embedded JSON is malformed, or if the caption fetch fails.
 */
async function printTranscript(videoID: string, html: string, sentences: boolean, dedupe: boolean): Promise<void> {
  if (!html) {
    console.error('empty page', videoID);
    return;
  }

  const pageDom = parse(html);
  // NOTE(review): fixed positional path into the parsed page (presumably
  // html > body > first element > its text) — confirm against the current page layout.
  const data = ((((pageDom[1] as Element).children[1] as Element).children[0] as Element).children[0] as Text).data;

  const prefix = 'var ytInitialPlayerResponse = ';
  // Slice from where the prefix actually is rather than assuming offset 0.
  const prefixAt = data.indexOf(prefix);
  if (prefixAt === -1 || !data.endsWith(';')) {
    console.error('no player response', videoID);
    return;
  }
  const json = JSON.parse(data.substring(prefixAt + prefix.length, data.length - 1));

  // Any missing link in this chain (no captions, no tracks, empty track list) means "no captions".
  const baseUrl: unknown = json?.captions?.playerCaptionsTracklistRenderer?.captionTracks?.[0]?.baseUrl;
  if (typeof baseUrl !== 'string') {
    console.error('no captions', videoID);
    return;
  }

  const xml = await (await fetch(baseUrl)).text();
  const captionDom = parse(xml);
  for (const node of (captionDom[1] as Element).children) {
    // Skip empty caption elements and any child without a `children` array.
    const textNode = (node as Element).children?.[0];
    if (textNode === undefined) continue;

    let line = decodeXML((textNode as Text).data);
    if (sentences && line.length > 0) {
      line = toSentence(line);
      if (dedupe) line = dedupeSentenceWords(line);
    }
    console.log(line);
  }
}

/**
 * Command-line entry point: prints the transcript of every video ID given on
 * the command line.
 *
 * Flags: `-s` formats each caption as a sentence; `-d` additionally removes
 * repeated words (it has no effect without `-s`). Every other argument is
 * treated as a video ID.
 *
 * All watch pages of a round are fetched concurrently. IDs whose fetch rejects
 * are retried in the next round, for at most `maxFetchRounds` rounds, so a
 * permanent network failure cannot loop forever. A failure while processing
 * one video is reported on stderr and does not stop the remaining videos.
 */
async function main() {
  const maxFetchRounds = 5;

  // Work on a copy so the global process.argv is left untouched.
  const args = process.argv.slice(2);
  const dedupe = args.includes('-d');
  const sentences = args.includes('-s');
  let videoIDs = args.filter(arg => arg !== '-d' && arg !== '-s');

  for (let round = 1; videoIDs.length > 0; round++) {
    const settledPromises = await Promise.allSettled(videoIDs.map(id => getHtmlByVideoID(id)));
    const videoIDsToRetry: string[] = [];

    for (const [settledNum, settled] of settledPromises.entries()) {
      const videoID = `${videoIDs[settledNum]}`;
      if (settled.status === 'rejected') {
        console.error(`rejected ${videoID}`, settled.reason);
        videoIDsToRetry.push(videoID);
        continue;
      }
      try {
        await printTranscript(videoID, settled.value, sentences, dedupe);
      } catch (err: unknown) {
        // Keep going: one malformed page must not discard the other videos.
        console.error(`failed ${videoID}`, err);
      }
    }

    if (videoIDsToRetry.length > 0 && round >= maxFetchRounds) {
      console.error(`giving up after ${maxFetchRounds} rounds`, ...videoIDsToRetry);
      break;
    }
    videoIDs = videoIDsToRetry;
  }
}
// Run the CLI; report any uncaught failure and exit with a non-zero status
// instead of leaving the returned promise unhandled.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment