Skip to content

Instantly share code, notes, and snippets.

@byteface
Created September 20, 2021 10:22
Show Gist options
  • Save byteface/7198108926b58a5a378b1c7c5e203d70 to your computer and use it in GitHub Desktop.
Save byteface/7198108926b58a5a378b1c7c5e203d70 to your computer and use it in GitHub Desktop.
deepspeech test
import datetime
import os
import requests
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from xml.etree import ElementTree
# Flask application backed by a local SQLite database of harvested words.
app = Flask(__name__)
app.debug = True  # NOTE(review): debug server enabled — not for production
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///saywhat.db'
db = SQLAlchemy(app)
class Words(db.Model):
    """One harvested word: who said it, where (video URL) and when (ms)."""
    id = db.Column(db.Integer, primary_key=True)
    person = db.Column(db.String(80))    # speaker, e.g. "Queen Elizabeth"
    word = db.Column(db.String(120))     # the spoken word itself
    url = db.Column(db.String(200))      # source YouTube video URL
    # captions = db.Column(db.String(1000))
    timecode = db.Column(db.String(120)) # start time of the word, in milliseconds
    filename = db.Column(db.String(120)) # base name used for generated clip files
    def __repr__(self):
        return '<Words %r>' % self.word
# TODO - need a full second each side.
# Clip-search window: STEPS clips are cut per word, each shifted by GAP ms.
STEPS = 10 # how many bits of video to chop
GAP = 25 # how many milliseconds difference per chop
# Shift so the STEPS*GAP window is centred on the caption timecode.
OFFSET = (-((STEPS*GAP)/2))
# /test route: populates the database with words and timecodes parsed from
# the caption (timedtext) file of a sample video
@app.route('/test')
def test():
    """Harvest words/timecodes for a sample video into the database,
    then download a source video for later clipping."""
    sample_video = "https://www.youtube.com/watch?v=Xk_pi_nq838"
    sample_captions = "https://www.youtube.com/api/timedtext?v=Xk_pi_nq838&asr_langs=de%2Cen%2Ces%2Cfr%2Cit%2Cja%2Cko%2Cnl%2Cpt%2Cru&caps=asr&xorp=true&hl=en-GB&ip=0.0.0.0&ipbits=0&expire=1569128570&sparams=ip%2Cipbits%2Cexpire%2Cv%2Casr_langs%2Ccaps%2Cxorp&signature=1942F9D6A196D1E22096F131C86289D972349AFA.58D31D71E4B5AE71FAE9EC8AFE2AF621937A923B&key=yt8&kind=asr&lang=en&fmt=srv3&xorb=2&xobt=3&xovt=3"
    get_words(0, 0, sample_video, sample_captions, "Queen Elizabeth")
    # Other speakers (barack obama, Queen Elizabeth, Boris Johnson) were
    # trialled here with the same pattern; add further get_words(...) calls
    # with their video/timedtext URLs as needed.
    # TODO - always download as specific format or format with ffmpeg afterwards
    # NOTE(review): this downloads video w19Yu_1kHAk while the get_words call
    # above harvested Xk_pi_nq838 — looks inconsistent; confirm intent.
    os.system('youtube-dl https://www.youtube.com/watch?v=w19Yu_1kHAk')
    return 'get_words!'
# generates audio clips of the words
@app.route('/compose')
def compose():
    """Cut short wav clips around every stored word for one person.

    For each word, STEPS+1 clips are extracted, each shifted by GAP ms
    around the caption timecode, so that /clean can later keep only the
    clip whose audio actually contains the word.
    """
    import syllables  # third-party; hoisted out of the per-word loop
    words = Words.query.filter_by(person="Queen Elizabeth")
    for word in words:
        print("===============================================================")
        print(word)
        print(word.url)
        print(word.timecode)
        # NOTE(review): the -3600000 (1 h) offset appears to compensate for
        # datetime.fromtimestamp() running in a UTC+1 timezone — confirm.
        tc = milliseconds_to_timestamp(int(word.timecode) - 3600000)
        print(tc)
        # Estimated clip length: 275 ms base plus a per-syllable cost that
        # grows with word length.
        per_syllable_ms = 80 + 2 * len(word.word)
        clip_duration_mills = 275 + syllables.estimate(word.word) * per_syllable_ms
        # BUGFIX: the original embedded the FULL millisecond count after the
        # decimal point ("00:00:01.1375" once mills >= 1000, which ffmpeg
        # misreads); split into whole seconds and a zero-padded remainder.
        secs, frac = divmod(int(clip_duration_mills), 1000)
        duration = f'00:00:{secs:02d}.{frac:03d}'
        i = 0
        while i <= STEPS:
            print(i)
            i = i + 1
            tc3 = milliseconds_to_timestamp(int(word.timecode) - 3600000 + (OFFSET + (i * GAP)))
            # Extract mono 16 kHz PCM wav — the format DeepSpeech expects.
            os.system(f'ffmpeg -y -ss {tc3} -i test.mp4 -map 0 -t {duration} -vn -acodec pcm_s16le -ar 16000 -ac 1 tmp/{word.filename}{i}.wav')
    return 'compose!'
# verifies the words using a neural network
@app.route('/clean')
def clean():
    """Transcribe every generated clip with DeepSpeech and delete mismatches.

    Each tmp/<filename><i>.wav is transcribed; clips whose transcript is not
    exactly the stored word are removed, leaving only clips that actually
    contain the word.
    """
    from deepspeech import Model
    import scipy.io.wavfile as wav
    # BUGFIX: load the acoustic model ONCE — the original constructed it
    # inside the inner loop, reloading the whole network for every clip.
    ds = Model("deepspeech-0.5.1-models/output_graph.pbmm", 26, 9, "deepspeech-0.5.1-models/alphabet.txt", 500)
    words = Words.query.filter_by(person="Queen Elizabeth")
    for word in words:
        i = 0
        while i <= STEPS:
            i = i + 1
            clip = f"tmp/{word.filename}{i}.wav"
            try:
                fs, audio = wav.read(clip)
                processed_data = ds.stt(audio, fs)
                print("processed_data::")
                print(processed_data)
                # TODO - check if its 90% the same?
                win = (processed_data == word.word)
                print("result::", word.word, win)
                # TODO - last one not being deleted
                if not win:
                    os.remove(clip)  # stdlib delete instead of shelling out to rm
                # TODO - if win update the offsets?
            except Exception as e:
                # Best effort: a missing/corrupt clip must not stop the scan.
                print("fail", word.word)
                print(e)
    return 'clean!'
def milliseconds_to_timestamp(milliseconds):
return datetime.datetime.fromtimestamp(int(milliseconds)/1000).strftime('%H:%M:%S.%f')[:-3]
def clean_text(text):
    """Trim surrounding whitespace and flatten embedded newlines to spaces."""
    trimmed = text.strip()
    return trimmed.replace("\n", " ")
# start_time - in milliseconds - or take a variety of forms??
# end_time - in milliseconds
# url_video - youtube video url
# url_caption - link to the timecodes
# person - who it is. GLOBAL. can be everyone.
def get_words(start_time, end_time, url_video, url_caption, person):
    """Download a YouTube timedtext (srv3) caption file and store one Words
    row per caption word with its absolute timecode in milliseconds.

    start_time/end_time are currently unused beyond logging (see TODOs).
    """
    import uuid  # hoisted out of the per-word loop below
    print("get_words::", start_time, end_time, url_video, url_caption, person)
    start = milliseconds_to_timestamp(start_time)
    end = milliseconds_to_timestamp(end_time)
    # TODO - only search between these points
    # TODO - need to know what to do if end is 0
    # TODO - download the timecode files?
    # download and parse captions
    response = requests.get(url_caption)
    tree = ElementTree.fromstring(response.content)
    # Caption format, e.g.:
    # <p t="1433610" d="7590" w="1">
    #   <s ac="252">safeguards</s>
    #   <s t="1250" ac="166"> agriculture</s>
    #   <s t="2250" ac="201"> and</s>
    #   ...
    # </p>
    # <p> carries the paragraph start time in 't'; each <s> carries its
    # word's offset from that start in 't' (often absent on the first word)
    # and a confidence score in 'ac'.
    timecodes = []
    words = []
    body = tree.find("body")
    for p in body.findall("p"):
        if p.text is None or p.text == "\n":
            base = int(p.attrib['t'])
            for s in p.findall("s"):
                words.append(clean_text(s.text or ""))
                # BUGFIX: the original accumulated the 'ac' confidence value
                # as if it were a time delta; the word offset is in 't'.
                timecodes.append(base + int(s.attrib.get('t', 0)))
        else:
            # p.text is an actual word/string!
            print("sentences")
            print(p.text)
            # TODO - can at least do 1st word
    for count, text in enumerate(words):
        print(text, timecodes[count])
        uid = uuid.uuid4().hex[0:5]  # short suffix keeps clip filenames unique
        entry = Words(person=person, word=text, url=url_video,
                      timecode=timecodes[count], filename=text + uid)
        db.session.add(entry)
    db.session.commit()
def fetch_word(person, word):
    """Download a short clip of `person` saying `word` into tmp/.

    Best effort: any failure (no matching row, download error) is printed
    and swallowed so one missing word does not abort a whole sentence.
    """
    import subprocess
    try:
        row = Words.query.filter_by(person=person).filter_by(word=word).first()
        # NOTE(review): -3600000 compensates for the local-timezone behaviour
        # of milliseconds_to_timestamp on a UTC+1 machine — confirm.
        tc = milliseconds_to_timestamp(int(row.timecode) - 3600000)
        # SECURITY: build an argument list (no shell) so user-supplied words
        # cannot inject shell commands; the original interpolated `word` into
        # an os.system() string. Without a shell, the output template needs
        # no backslash-escaping of the parentheses.
        subprocess.run([
            'youtube-dl', '-x',
            '--postprocessor-args', f'-ss {tc} -t 00:00:00.500',
            row.url,
            '-o', f'tmp/{row.word}.%(ext)s',
        ])
    except Exception as e:
        print(e)
# person - who you want to use
# sentence - what you want to generate
def composite_video(person, sentence):
    """Fetch a clip for every space-separated word of `sentence`
    as spoken by `person`."""
    print("composite_video", sentence, person)
    for token in sentence.split(" "):
        fetch_word(person, token)
# Database
# words
# person | word | url | timecode | duration
# TODO - see about generating phonemes from words
# phonemes
# person | sound | url | timecode | duration
# UI
# 1: page where you can choose an actor and type a sentence.
# 2: result page where you can change the word, as we may have multiples of a word
# 3: also let users give feedback per word, to kill off bad words.
# TODO -
# think of potential validation methods
# consider downloading the videos to make composition faster - but its lots of data to store. do for 1. see how much?
# tooling - tool to generate new models. i.e. a preview results and save to master set.
# Ideas/pages -
# search by word or piece of dialogue to jump into a video
if __name__ == "__main__":
    # Guard so importing this module does not create tables or start the
    # (debug) development server; behaviour when run as a script is unchanged.
    db.create_all()
    app.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment