hippietrail · June 1, 2025 07:31
diff --git a/fetch-youtube-transcript.ts b/fetch-youtube-transcript.ts
 import url from 'url';

 import parse from 'html-dom-parser';
 import { Element, Text } from 'domhandler';
 import { decodeXML } from 'entities';

 /**
  * Downloads the YouTube watch page for one video.
  *
  * The request goes to `https://www.youtube.com/watch` with `videoID` as the
  * `v` query parameter. The HTTP status is not inspected: whatever body the
  * server sends back is returned.
  *
  * @param videoID - identifier placed in the `v` query parameter.
  * @returns the response body decoded as text.
  */
 async function getHtmlByVideoID(videoID: string): Promise<string> {
    const watchPageUrl = url.format({
        protocol: 'https',
        hostname: 'www.youtube.com',
        pathname: 'watch',
        query: { v: videoID },
    });

    const response = await fetch(watchPageUrl);
    const html = await response.text();
    return html;
 }

 /**
  * Upper-cases the first character of `word` and leaves the rest untouched.
  *
  * The string is split with the string iterator, so the "first character" is
  * the first code point rather than the first UTF-16 unit. An empty string
  * yields an empty string.
  */
 function capitalizeFirstLetter<T extends string>(word: T): string {
    const [head = '', ...tail] = Array.from<string>(word);
    return head.toUpperCase() + tail.join('');
 }

 /**
  * Single characters that count as punctuation at the end of a caption line or
  * word; `main` uses the list when deciding whether to append a period and when
  * comparing a punctuated last word with the word before it.
  */
 const puncs: string[] = Array.from(',.?!;:');

 /**
  * Turns a caption line into a "sentence": upper-cases its first character and
  * appends a period unless the line already ends with one of `puncs`.
  * An empty line is returned unchanged.
  */
 function toSentence(line: string): string {
    // Spread by code point so a leading astral character is handled as a unit.
    const chars = [...line];
    if (chars.length === 0) return line;

    chars[0] = chars[0].toUpperCase();
    // Look at the last character of the whole line so that one-character
    // lines get their period too.
    if (!puncs.includes(chars[chars.length - 1])) chars.push('.');
    return chars.join('');
 }

 /**
  * Removes repeated words from a sentence produced by `toSentence`.
  *
  * Every later exact repeat of a space-separated word is dropped first. Then
  * two repeats that differ only because of sentence formatting are collapsed:
  * a second word that equals the first once capitalized, and a second-to-last
  * word that equals the last one without its trailing punctuation.
  * If fewer than two distinct words remain, `line` is returned as it came in.
  */
 function dedupeWords(line: string): string {
    const words = line.split(' ').filter((word, index, all) => all.indexOf(word) === index);
    if (words.length <= 1) return line;

    // "Hello hello ..." -> "Hello ..."
    if (words[0] === capitalizeFirstLetter(words[1])) words.splice(1, 1);

    if (words.length > 1) {
        const last = words[words.length - 1];
        const lastHasPunctuation = puncs.includes(last[last.length - 1]);
        // "... world world." -> "... world."
        if (lastHasPunctuation && last.slice(0, -1) === words[words.length - 2]) {
            words.splice(words.length - 2, 1);
        }
    }

    return words.join(' ');
 }

 /**
  * Extracts the caption track referenced by one watch page and prints it to
  * stdout, one caption per line.
  *
  * Problems with the page itself (empty body, no player response, no caption
  * track) are reported on stderr and the video is skipped.
  *
  * @param videoID - used only to label diagnostics.
  * @param html - body of the watch page.
  * @param options - `sentences` applies `toSentence` to each line; `dedupe`
  *   additionally applies `dedupeWords`, but only when `sentences` is set.
  */
 async function printTranscript(
    videoID: string,
    html: string,
    options: { sentences: boolean; dedupe: boolean },
 ): Promise<void> {
    if (!html) {
        console.error('empty page', videoID);
        return;
    }

    const dom = parse(html);

    // NOTE(review): fixed positional walk to the text node expected to hold the
    // player response script; it throws if the page is laid out differently.
    const data = ((((dom[1] as Element).children[1] as Element).children[0] as Element).children[0] as Text).data;

    const prefix = 'var ytInitialPlayerResponse = ';
    const prefixAt = data.indexOf(prefix);
    if (prefixAt === -1 || !data.endsWith(';')) {
        console.error('no player response', videoID);
        return;
    }

    // Slice from where the prefix really is (not from index 0) up to the final ';'.
    const json = JSON.parse(data.substring(prefixAt + prefix.length, data.length - 1));

    // A missing `captions` object and an empty track list are both "no captions".
    const captionsUrl = json?.captions?.playerCaptionsTracklistRenderer?.captionTracks?.[0]?.baseUrl;
    if (!captionsUrl) {
        console.error('no captions', videoID);
        return;
    }

    const xml = await (await fetch(captionsUrl)).text();
    const captionDom = parse(xml);

    for (const dn of (captionDom[1] as Element).children) {
        const textNode = (dn as Element).children[0];
        if (textNode === undefined) continue; // entry without any text

        let line = decodeXML((textNode as Text).data);
        if (options.sentences) {
            line = toSentence(line);
            if (options.dedupe) line = dedupeWords(line);
        }
        console.log(line);
    }
 }

 /**
  * Command-line entry point: prints the captions of every video ID given on
  * the command line.
  *
  * Flags (may appear anywhere among the arguments):
  *   -s  format each caption line as a sentence (capital first letter, final period)
  *   -d  remove repeated words; only has an effect together with -s
  *
  * All watch pages are requested concurrently. IDs whose page request rejects
  * are requested again in the next round.
  * NOTE(review): the retry has no limit or delay, so a permanently failing
  * request is retried forever.
  */
 async function main(): Promise<void> {
    // Work on a copy of the arguments instead of rewriting the global process.argv.
    const args = process.argv.slice(2);
    const options = {
        sentences: args.includes('-s'),
        dedupe: args.includes('-d'),
    };
    let videoIDs = args.filter(arg => arg !== '-d' && arg !== '-s');

    while (videoIDs.length > 0) {
        const settledPromises = await Promise.allSettled(videoIDs.map(getHtmlByVideoID));
        const videoIDsToRetry: string[] = [];

        for (const [settledNum, settled] of settledPromises.entries()) {
            const videoID = `${videoIDs[settledNum]}`;

            if (settled.status === 'rejected') {
                console.error(`rejected ${videoID}`, settled.reason);
                videoIDsToRetry.push(videoID);
                continue;
            }

            await printTranscript(videoID, settled.value, options);
        }

        videoIDs = videoIDsToRetry;
    }
 }

 // Start the CLI. A rejection from main() is reported here and turned into a
 // non-zero exit code instead of being left as an unhandled promise rejection.
 main().catch((error: unknown) => {
    console.error(error);
    process.exitCode = 1;
 });
	import url from 'url';

	import parse from 'html-dom-parser';
	import { Element, Text } from 'domhandler';
	import { decodeXML } from 'entities';

	/**
	 * Downloads the YouTube watch page for one video.
	 *
	 * The request goes to `https://www.youtube.com/watch` with `videoID` as the
	 * `v` query parameter. The HTTP status is not inspected: whatever body the
	 * server sends back is returned.
	 *
	 * @param videoID - identifier placed in the `v` query parameter.
	 * @returns the response body decoded as text.
	 */
	async function getHtmlByVideoID(videoID: string): Promise<string> {
	    const watchPageUrl = url.format({
	        protocol: 'https',
	        hostname: 'www.youtube.com',
	        pathname: 'watch',
	        query: { v: videoID },
	    });

	    const response = await fetch(watchPageUrl);
	    const html = await response.text();
	    return html;
	}

	/**
	 * Upper-cases the first character of `word` and leaves the rest untouched.
	 *
	 * The string is split with the string iterator, so the "first character" is
	 * the first code point rather than the first UTF-16 unit. An empty string
	 * yields an empty string.
	 */
	function capitalizeFirstLetter<T extends string>(word: T): string {
	    const [head = '', ...tail] = Array.from<string>(word);
	    return head.toUpperCase() + tail.join('');
	}

	/**
	 * Single characters that count as punctuation at the end of a caption line or
	 * word; `main` uses the list when deciding whether to append a period and when
	 * comparing a punctuated last word with the word before it.
	 */
	const puncs: string[] = Array.from(',.?!;:');

	/**
	 * Turns a caption line into a "sentence": upper-cases its first character and
	 * appends a period unless the line already ends with one of `puncs`.
	 * An empty line is returned unchanged.
	 */
	function toSentence(line: string): string {
	    // Spread by code point so a leading astral character is handled as a unit.
	    const chars = [...line];
	    if (chars.length === 0) return line;

	    chars[0] = chars[0].toUpperCase();
	    // Look at the last character of the whole line so that one-character
	    // lines get their period too.
	    if (!puncs.includes(chars[chars.length - 1])) chars.push('.');
	    return chars.join('');
	}

	/**
	 * Removes repeated words from a sentence produced by `toSentence`.
	 *
	 * Every later exact repeat of a space-separated word is dropped first. Then
	 * two repeats that differ only because of sentence formatting are collapsed:
	 * a second word that equals the first once capitalized, and a second-to-last
	 * word that equals the last one without its trailing punctuation.
	 * If fewer than two distinct words remain, `line` is returned as it came in.
	 */
	function dedupeWords(line: string): string {
	    const words = line.split(' ').filter((word, index, all) => all.indexOf(word) === index);
	    if (words.length <= 1) return line;

	    // "Hello hello ..." -> "Hello ..."
	    if (words[0] === capitalizeFirstLetter(words[1])) words.splice(1, 1);

	    if (words.length > 1) {
	        const last = words[words.length - 1];
	        const lastHasPunctuation = puncs.includes(last[last.length - 1]);
	        // "... world world." -> "... world."
	        if (lastHasPunctuation && last.slice(0, -1) === words[words.length - 2]) {
	            words.splice(words.length - 2, 1);
	        }
	    }

	    return words.join(' ');
	}

	/**
	 * Extracts the caption track referenced by one watch page and prints it to
	 * stdout, one caption per line.
	 *
	 * Problems with the page itself (empty body, no player response, no caption
	 * track) are reported on stderr and the video is skipped.
	 *
	 * @param videoID - used only to label diagnostics.
	 * @param html - body of the watch page.
	 * @param options - `sentences` applies `toSentence` to each line; `dedupe`
	 *   additionally applies `dedupeWords`, but only when `sentences` is set.
	 */
	async function printTranscript(
	    videoID: string,
	    html: string,
	    options: { sentences: boolean; dedupe: boolean },
	): Promise<void> {
	    if (!html) {
	        console.error('empty page', videoID);
	        return;
	    }

	    const dom = parse(html);

	    // NOTE(review): fixed positional walk to the text node expected to hold the
	    // player response script; it throws if the page is laid out differently.
	    const data = ((((dom[1] as Element).children[1] as Element).children[0] as Element).children[0] as Text).data;

	    const prefix = 'var ytInitialPlayerResponse = ';
	    const prefixAt = data.indexOf(prefix);
	    if (prefixAt === -1 || !data.endsWith(';')) {
	        console.error('no player response', videoID);
	        return;
	    }

	    // Slice from where the prefix really is (not from index 0) up to the final ';'.
	    const json = JSON.parse(data.substring(prefixAt + prefix.length, data.length - 1));

	    // A missing `captions` object and an empty track list are both "no captions".
	    const captionsUrl = json?.captions?.playerCaptionsTracklistRenderer?.captionTracks?.[0]?.baseUrl;
	    if (!captionsUrl) {
	        console.error('no captions', videoID);
	        return;
	    }

	    const xml = await (await fetch(captionsUrl)).text();
	    const captionDom = parse(xml);

	    for (const dn of (captionDom[1] as Element).children) {
	        const textNode = (dn as Element).children[0];
	        if (textNode === undefined) continue; // entry without any text

	        let line = decodeXML((textNode as Text).data);
	        if (options.sentences) {
	            line = toSentence(line);
	            if (options.dedupe) line = dedupeWords(line);
	        }
	        console.log(line);
	    }
	}

	/**
	 * Command-line entry point: prints the captions of every video ID given on
	 * the command line.
	 *
	 * Flags (may appear anywhere among the arguments):
	 *   -s  format each caption line as a sentence (capital first letter, final period)
	 *   -d  remove repeated words; only has an effect together with -s
	 *
	 * All watch pages are requested concurrently. IDs whose page request rejects
	 * are requested again in the next round.
	 * NOTE(review): the retry has no limit or delay, so a permanently failing
	 * request is retried forever.
	 */
	async function main(): Promise<void> {
	    // Work on a copy of the arguments instead of rewriting the global process.argv.
	    const args = process.argv.slice(2);
	    const options = {
	        sentences: args.includes('-s'),
	        dedupe: args.includes('-d'),
	    };
	    let videoIDs = args.filter(arg => arg !== '-d' && arg !== '-s');

	    while (videoIDs.length > 0) {
	        const settledPromises = await Promise.allSettled(videoIDs.map(getHtmlByVideoID));
	        const videoIDsToRetry: string[] = [];

	        for (const [settledNum, settled] of settledPromises.entries()) {
	            const videoID = `${videoIDs[settledNum]}`;

	            if (settled.status === 'rejected') {
	                console.error(`rejected ${videoID}`, settled.reason);
	                videoIDsToRetry.push(videoID);
	                continue;
	            }

	            await printTranscript(videoID, settled.value, options);
	        }

	        videoIDs = videoIDsToRetry;
	    }
	}

	// Start the CLI. A rejection from main() is reported here and turned into a
	// non-zero exit code instead of being left as an unhandled promise rejection.
	main().catch((error: unknown) => {
	    console.error(error);
	    process.exitCode = 1;
	});