Last active
February 26, 2025 11:52
-
-
Save hippietrail/af26d363d7f6ac37565c5074f04e4f5e to your computer and use it in GitHub Desktop.
TypeScript code to fetch one or more YouTube transcripts as plain text without an API key
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import url from 'url'; | |
import parse from 'html-dom-parser'; | |
import { Element, Text } from 'domhandler'; | |
import { decodeXML } from 'entities'; | |
/**
 * Downloads the YouTube watch page for a single video.
 *
 * @param videoID - value placed in the `v` query parameter of the watch URL.
 * @returns the body of the HTTP response, decoded as text.
 */
async function getHtmlByVideoID(videoID: string): Promise<string> {
  // Assemble https://www.youtube.com/watch?v=<videoID>.
  const watchPageUrl = url.format({
    protocol: 'https',
    hostname: 'www.youtube.com',
    pathname: 'watch',
    query: { v: videoID },
  });
  const response = await fetch(watchPageUrl);
  const html = await response.text();
  return html;
}
/**
 * Upper-cases the first character of a string and leaves the rest untouched.
 *
 * The string is split by iteration (code points), so a leading surrogate pair
 * is treated as one character. An empty string is returned unchanged.
 *
 * @param text - the string to capitalize.
 * @returns `text` with its first character upper-cased.
 */
function capitalizeFirstLetter<T extends string>(text: T): string {
  const codePoints = Array.from(text);
  const head = codePoints.shift() ?? '';
  return head.toUpperCase() + codePoints.join('');
}
// Characters that already terminate or punctuate a line: , . ? ! ; :
const puncs = ',.?!;:'.split('');
/**
 * Extracts the player-response object embedded in an inline script.
 *
 * The JSON is taken from just after the `var ytInitialPlayerResponse = ` marker
 * up to (not including) the script's trailing `;`. The marker's real offset is
 * used, so text preceding the marker does not corrupt the JSON.
 *
 * @param scriptText - text content of the inline script.
 * @returns the parsed JSON value, or `undefined` when the marker or the
 *   trailing `;` is missing.
 * @throws SyntaxError when the extracted text is not valid JSON.
 */
function parsePlayerResponse(scriptText: string): unknown {
  const marker = 'var ytInitialPlayerResponse = ';
  const markerAt = scriptText.indexOf(marker);
  if (markerAt === -1 || !scriptText.endsWith(';')) return undefined;
  return JSON.parse(scriptText.slice(markerAt + marker.length, -1));
}

/**
 * Reads `captions.playerCaptionsTracklistRenderer.captionTracks[0].baseUrl`
 * from a parsed player response.
 *
 * @param playerResponse - value returned by `parsePlayerResponse`.
 * @returns the URL of the first caption track, or `undefined` when any part of
 *   that path is missing or the URL is not a string.
 */
function firstCaptionTrackUrl(playerResponse: unknown): string | undefined {
  // JSON.parse gives no static shape; describe only the path that is read.
  const shaped = playerResponse as
    | {
        captions?: {
          playerCaptionsTracklistRenderer?: { captionTracks?: { baseUrl?: unknown }[] };
        };
      }
    | null
    | undefined;
  const baseUrl = shaped?.captions?.playerCaptionsTracklistRenderer?.captionTracks?.[0]?.baseUrl;
  return typeof baseUrl === 'string' ? baseUrl : undefined;
}

/**
 * Formats a caption line as a sentence: upper-cases the first character and
 * appends `.` unless the line already ends with one of `puncs`.
 *
 * Single-character lines get the trailing `.` as well.
 *
 * @param line - caption text; returned unchanged when empty.
 */
function toSentence(line: string): string {
  const chars = [...line];
  const first = chars[0];
  if (first === undefined) return line;
  const last = chars[chars.length - 1];
  const capitalized = first.toUpperCase() + chars.slice(1).join('');
  return last !== undefined && puncs.includes(last) ? capitalized : `${capitalized}.`;
}

/**
 * Removes repeated words from a sentence-formatted line.
 *
 * Three passes over the space-separated words:
 *  1. every later exact repeat of a word is dropped;
 *  2. the second word is dropped when it equals the first apart from the
 *     capital letter;
 *  3. the second-to-last word is dropped when the last word is that same word
 *     followed by one punctuation character.
 *
 * @param line - line already processed by `toSentence`.
 * @returns the de-duplicated line; the input is returned untouched when one
 *   word or fewer remains after the first pass.
 */
function dropRepeatedWords(line: string): string {
  const words = [...new Set(line.split(' '))];
  if (words.length <= 1) return line;
  if (words[0] === capitalizeFirstLetter(words[1])) words.splice(1, 1);
  if (words.length > 1) {
    const last = words[words.length - 1];
    const beforeLast = words[words.length - 2];
    if (puncs.includes(last[last.length - 1]) && last.slice(0, -1) === beforeLast) {
      words.splice(words.length - 2, 1);
    }
  }
  return words.join(' ');
}

/**
 * Prints the first caption track of one video to stdout, one caption per line.
 *
 * @param videoID - used only in diagnostic messages.
 * @param html - watch-page HTML for the video.
 * @param options - `sentences` applies `toSentence` to each line; `dedupe`
 *   additionally applies `dropRepeatedWords` (only when `sentences` is set,
 *   because its heuristics rely on the sentence formatting).
 * @throws when the page or caption document does not have the expected
 *   structure, the player response is not valid JSON, or the caption fetch fails.
 */
async function printTranscript(
  videoID: string,
  html: string,
  options: { sentences: boolean; dedupe: boolean },
): Promise<void> {
  const pageDom = parse(html);
  // NOTE(review): position-based lookup (2nd top-level node -> 2nd child ->
  // 1st child -> 1st child); presumably the first script in <body> — confirm
  // against the current watch-page markup.
  const scriptText = ((((pageDom[1] as Element).children[1] as Element).children[0] as Element).children[0] as Text).data;
  const playerResponse = typeof scriptText === 'string' ? parsePlayerResponse(scriptText) : undefined;
  if (playerResponse === undefined) {
    console.error('no player response', videoID);
    return;
  }

  const captionsUrl = firstCaptionTrackUrl(playerResponse);
  if (captionsUrl === undefined) {
    console.error('no captions', videoID);
    return;
  }

  const xml = await (await fetch(captionsUrl)).text();
  const captionsDom = parse(xml);
  for (const node of (captionsDom[1] as Element).children) {
    // Skip empty elements and nodes that have no children at all.
    const textNode = (node as Element).children?.[0];
    if (textNode === undefined) continue;
    let line = decodeXML((textNode as Text).data);
    if (options.sentences && line.length > 0) {
      line = toSentence(line);
      if (options.dedupe) line = dropRepeatedWords(line);
    }
    console.log(line);
  }
}

/**
 * Command-line entry point.
 *
 * Arguments after the script name are video IDs, plus two optional flags:
 * `-s` formats each caption line as a sentence and `-d` (effective only
 * together with `-s`) removes repeated words. All watch pages are fetched
 * concurrently; transcripts are then printed in argument order. A failure
 * while processing one video is logged to stderr and does not stop the others.
 */
async function main(): Promise<void> {
  // Read the flags from a copy so the global process.argv is left untouched.
  const args = process.argv.slice(2);
  const options = { sentences: args.includes('-s'), dedupe: args.includes('-d') };
  let videoIDs = args.filter(arg => arg !== '-s' && arg !== '-d');

  // NOTE(review): page fetches that reject are retried without limit or delay.
  while (videoIDs.length > 0) {
    const settledPromises = await Promise.allSettled(videoIDs.map(getHtmlByVideoID));
    const videoIDsToRetry: string[] = [];
    for (const [settledNum, settled] of settledPromises.entries()) {
      const videoID = `${videoIDs[settledNum]}`;
      if (settled.status === 'rejected') {
        console.error(`rejected ${videoID}`, settled.reason);
        videoIDsToRetry.push(videoID);
        continue;
      }
      if (!settled.value) continue;
      try {
        await printTranscript(videoID, settled.value, options);
      } catch (error: unknown) {
        // Parsing problems are deterministic, so report them instead of retrying.
        console.error(`failed ${videoID}`, error);
      }
    }
    videoIDs = videoIDsToRetry;
  }
}
// Start the CLI. Any rejection is reported and turned into a non-zero exit
// code instead of being left as an unhandled promise rejection.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment