Skip to content

Instantly share code, notes, and snippets.

@byteface
Created September 20, 2021 10:22
Show Gist options
  • Save byteface/7198108926b58a5a378b1c7c5e203d70 to your computer and use it in GitHub Desktop.
Save byteface/7198108926b58a5a378b1c7c5e203d70 to your computer and use it in GitHub Desktop.
deepspeech test
import datetime
import os
import requests
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from xml.etree import ElementTree
# Flask application backed by a local SQLite database of harvested words.
app = Flask(__name__)
app.debug = True  # NOTE(review): debug server enabled — not for production
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///saywhat.db'
db = SQLAlchemy(app)
class Words(db.Model):
    """One harvested word: who said it, where (video URL) and when (ms)."""
    id = db.Column(db.Integer, primary_key=True)
    person = db.Column(db.String(80))    # speaker, e.g. "Queen Elizabeth"
    word = db.Column(db.String(120))     # the spoken word itself
    url = db.Column(db.String(200))      # source YouTube video URL
    # captions = db.Column(db.String(1000))
    timecode = db.Column(db.String(120)) # start time of the word, in milliseconds
    filename = db.Column(db.String(120)) # base name used for generated clip files
    def __repr__(self):
        return '<Words %r>' % self.word
# TODO - need a full second each side.
# Clip-search window: STEPS clips are cut per word, each shifted by GAP ms.
STEPS = 10 # how many bits of video to chop
GAP = 25 # how many milliseconds difference per chop
# Shift so the STEPS*GAP window is centred on the caption timecode.
OFFSET = (-((STEPS*GAP)/2))
# /test route: populates the database with words and timecodes parsed from
# the caption (timedtext) file of a sample video
@app.route('/test')
def test():
    """Harvest words/timecodes for a sample video into the database,
    then download a source video for later clipping."""
    sample_video = "https://www.youtube.com/watch?v=Xk_pi_nq838"
    sample_captions = "https://www.youtube.com/api/timedtext?v=Xk_pi_nq838&asr_langs=de%2Cen%2Ces%2Cfr%2Cit%2Cja%2Cko%2Cnl%2Cpt%2Cru&caps=asr&xorp=true&hl=en-GB&ip=0.0.0.0&ipbits=0&expire=1569128570&sparams=ip%2Cipbits%2Cexpire%2Cv%2Casr_langs%2Ccaps%2Cxorp&signature=1942F9D6A196D1E22096F131C86289D972349AFA.58D31D71E4B5AE71FAE9EC8AFE2AF621937A923B&key=yt8&kind=asr&lang=en&fmt=srv3&xorb=2&xobt=3&xovt=3"
    get_words(0, 0, sample_video, sample_captions, "Queen Elizabeth")
    # Other speakers (barack obama, Queen Elizabeth, Boris Johnson) were
    # trialled here with the same pattern; add further get_words(...) calls
    # with their video/timedtext URLs as needed.
    # TODO - always download as specific format or format with ffmpeg afterwards
    # NOTE(review): this downloads video w19Yu_1kHAk while the get_words call
    # above harvested Xk_pi_nq838 — looks inconsistent; confirm intent.
    os.system('youtube-dl https://www.youtube.com/watch?v=w19Yu_1kHAk')
    return 'get_words!'
# generates audio clips of the words
@app.route('/compose')
def compose():
    """Cut short wav clips around every stored word for one person.

    For each word, STEPS+1 clips are extracted, each shifted by GAP ms
    around the caption timecode, so that /clean can later keep only the
    clip whose audio actually contains the word.
    """
    import syllables  # third-party; hoisted out of the per-word loop
    words = Words.query.filter_by(person="Queen Elizabeth")
    for word in words:
        print("===============================================================")
        print(word)
        print(word.url)
        print(word.timecode)
        # NOTE(review): the -3600000 (1 h) offset appears to compensate for
        # datetime.fromtimestamp() running in a UTC+1 timezone — confirm.
        tc = milliseconds_to_timestamp(int(word.timecode) - 3600000)
        print(tc)
        # Estimated clip length: 275 ms base plus a per-syllable cost that
        # grows with word length.
        per_syllable_ms = 80 + 2 * len(word.word)
        clip_duration_mills = 275 + syllables.estimate(word.word) * per_syllable_ms
        # BUGFIX: the original embedded the FULL millisecond count after the
        # decimal point ("00:00:01.1375" once mills >= 1000, which ffmpeg
        # misreads); split into whole seconds and a zero-padded remainder.
        secs, frac = divmod(int(clip_duration_mills), 1000)
        duration = f'00:00:{secs:02d}.{frac:03d}'
        i = 0
        while i <= STEPS:
            print(i)
            i = i + 1
            tc3 = milliseconds_to_timestamp(int(word.timecode) - 3600000 + (OFFSET + (i * GAP)))
            # Extract mono 16 kHz PCM wav — the format DeepSpeech expects.
            os.system(f'ffmpeg -y -ss {tc3} -i test.mp4 -map 0 -t {duration} -vn -acodec pcm_s16le -ar 16000 -ac 1 tmp/{word.filename}{i}.wav')
    return 'compose!'
# verifies the words using a neural network
@app.route('/clean')
def clean():
    """Transcribe every generated clip with DeepSpeech and delete mismatches.

    Each tmp/<filename><i>.wav is transcribed; clips whose transcript is not
    exactly the stored word are removed, leaving only clips that actually
    contain the word.
    """
    from deepspeech import Model
    import scipy.io.wavfile as wav
    # BUGFIX: load the acoustic model ONCE — the original constructed it
    # inside the inner loop, reloading the whole network for every clip.
    ds = Model("deepspeech-0.5.1-models/output_graph.pbmm", 26, 9, "deepspeech-0.5.1-models/alphabet.txt", 500)
    words = Words.query.filter_by(person="Queen Elizabeth")
    for word in words:
        i = 0
        while i <= STEPS:
            i = i + 1
            clip = f"tmp/{word.filename}{i}.wav"
            try:
                fs, audio = wav.read(clip)
                processed_data = ds.stt(audio, fs)
                print("processed_data::")
                print(processed_data)
                # TODO - check if its 90% the same?
                win = (processed_data == word.word)
                print("result::", word.word, win)
                # TODO - last one not being deleted
                if not win:
                    os.remove(clip)  # stdlib delete instead of shelling out to rm
                # TODO - if win update the offsets?
            except Exception as e:
                # Best effort: a missing/corrupt clip must not stop the scan.
                print("fail", word.word)
                print(e)
    return 'clean!'
def milliseconds_to_timestamp(milliseconds):
return datetime.datetime.fromtimestamp(int(milliseconds)/1000).strftime('%H:%M:%S.%f')[:-3]
def clean_text(text):
    """Trim surrounding whitespace and flatten embedded newlines to spaces."""
    trimmed = text.strip()
    return trimmed.replace("\n", " ")
# start_time - in milliseconds - or take a variety of forms??
# end_time - in milliseconds
# url_video - youtube video url
# url_caption - link to the timecodes
# person - who it is. GLOBAL. can be everyone.
def get_words(start_time, end_time, url_video, url_caption, person):
    """Download a YouTube timedtext (srv3) caption file and store one Words
    row per caption word with its absolute timecode in milliseconds.

    start_time/end_time are currently unused beyond logging (see TODOs).
    """
    import uuid  # hoisted out of the per-word loop below
    print("get_words::", start_time, end_time, url_video, url_caption, person)
    start = milliseconds_to_timestamp(start_time)
    end = milliseconds_to_timestamp(end_time)
    # TODO - only search between these points
    # TODO - need to know what to do if end is 0
    # TODO - download the timecode files?
    # download and parse captions
    response = requests.get(url_caption)
    tree = ElementTree.fromstring(response.content)
    # Caption format, e.g.:
    # <p t="1433610" d="7590" w="1">
    #   <s ac="252">safeguards</s>
    #   <s t="1250" ac="166"> agriculture</s>
    #   <s t="2250" ac="201"> and</s>
    #   ...
    # </p>
    # <p> carries the paragraph start time in 't'; each <s> carries its
    # word's offset from that start in 't' (often absent on the first word)
    # and a confidence score in 'ac'.
    timecodes = []
    words = []
    body = tree.find("body")
    for p in body.findall("p"):
        if p.text is None or p.text == "\n":
            base = int(p.attrib['t'])
            for s in p.findall("s"):
                words.append(clean_text(s.text or ""))
                # BUGFIX: the original accumulated the 'ac' confidence value
                # as if it were a time delta; the word offset is in 't'.
                timecodes.append(base + int(s.attrib.get('t', 0)))
        else:
            # p.text is an actual word/string!
            print("sentences")
            print(p.text)
            # TODO - can at least do 1st word
    for count, text in enumerate(words):
        print(text, timecodes[count])
        uid = uuid.uuid4().hex[0:5]  # short suffix keeps clip filenames unique
        entry = Words(person=person, word=text, url=url_video,
                      timecode=timecodes[count], filename=text + uid)
        db.session.add(entry)
    db.session.commit()
def fetch_word(person, word):
    """Download a short clip of `person` saying `word` into tmp/.

    Best effort: any failure (no matching row, download error) is printed
    and swallowed so one missing word does not abort a whole sentence.
    """
    import subprocess
    try:
        row = Words.query.filter_by(person=person).filter_by(word=word).first()
        # NOTE(review): -3600000 compensates for the local-timezone behaviour
        # of milliseconds_to_timestamp on a UTC+1 machine — confirm.
        tc = milliseconds_to_timestamp(int(row.timecode) - 3600000)
        # SECURITY: build an argument list (no shell) so user-supplied words
        # cannot inject shell commands; the original interpolated `word` into
        # an os.system() string. Without a shell, the output template needs
        # no backslash-escaping of the parentheses.
        subprocess.run([
            'youtube-dl', '-x',
            '--postprocessor-args', f'-ss {tc} -t 00:00:00.500',
            row.url,
            '-o', f'tmp/{row.word}.%(ext)s',
        ])
    except Exception as e:
        print(e)
# person - who you want to use
# sentence - what you want to generate
def composite_video(person, sentence):
    """Fetch a clip for every space-separated word of `sentence`
    as spoken by `person`."""
    print("composite_video", sentence, person)
    for token in sentence.split(" "):
        fetch_word(person, token)
# Database
# words
# person | word | url | timecode | duration
# TODO - see about generating phonemes from words
# phonemes
# person | sound | url | timecode | duration
# UI
# 1: page where you can choose an actor and type a sentence.
# 2: result page where you can change the word, as we may have multiples of a word
# 3: also let users give feedback per word, to kill off bad words.
# TODO -
# think of potential validation methods
# consider downloading the videos to make composition faster - but its lots of data to store. do for 1. see how much?
# tooling - tool to generate new models. i.e. a preview results and save to master set.
# Ideas/pages -
# search by word or piece of dialogue to jump into a video
if __name__ == "__main__":
    # Guard so importing this module does not create tables or start the
    # (debug) development server; behaviour when run as a script is unchanged.
    db.create_all()
    app.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment