Created
September 20, 2021 10:22
-
-
Save byteface/7198108926b58a5a378b1c7c5e203d70 to your computer and use it in GitHub Desktop.
deepspeech test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime | |
import os | |
import requests | |
from flask import Flask | |
from flask_sqlalchemy import SQLAlchemy | |
from xml.etree import ElementTree | |
# Flask application wired to a local SQLite database through SQLAlchemy.
app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///saywhat.db'
app.debug = True  # dev mode: auto-reload and interactive tracebacks
db = SQLAlchemy(app)
class Words(db.Model):
    """One spoken word: who said it, where it came from, and which clip file holds it."""

    id = db.Column(db.Integer, primary_key=True)
    # speaker label, e.g. "Queen Elizabeth"
    person = db.Column(db.String(80))
    # the word itself, as parsed from the captions
    word = db.Column(db.String(120))
    # source YouTube watch URL
    url = db.Column(db.String(200))
    # captions = db.Column(db.String(1000))
    # start time in milliseconds (stored as text)
    timecode = db.Column(db.String(120))
    # basename used for the extracted clip(s) under tmp/
    filename = db.Column(db.String(120))

    def __repr__(self):
        return f'<Words {self.word!r}>'
# TODO - need a full second each side.
STEPS = 10  # how many candidate clips to chop per word
GAP = 25    # milliseconds of shift between consecutive chops
# centre the fan of chops around the stored timecode
OFFSET = -(STEPS * GAP) / 2
# /test route: populates the database with words and timecodes parsed from the caption (timedtext) file
@app.route('/test')
def test():
    """Scrape one video's captions into the DB, then download a video with youtube-dl."""
    video_url = "https://www.youtube.com/watch?v=Xk_pi_nq838"
    caption_url = "https://www.youtube.com/api/timedtext?v=Xk_pi_nq838&asr_langs=de%2Cen%2Ces%2Cfr%2Cit%2Cja%2Cko%2Cnl%2Cpt%2Cru&caps=asr&xorp=true&hl=en-GB&ip=0.0.0.0&ipbits=0&expire=1569128570&sparams=ip%2Cipbits%2Cexpire%2Cv%2Casr_langs%2Ccaps%2Cxorp&signature=1942F9D6A196D1E22096F131C86289D972349AFA.58D31D71E4B5AE71FAE9EC8AFE2AF621937A923B&key=yt8&kind=asr&lang=en&fmt=srv3&xorb=2&xobt=3&xovt=3"
    get_words(0, 0, video_url, caption_url, "Queen Elizabeth")
    # Other sources scraped previously (their timedtext URLs are signed and
    # expire, so they must be regenerated before reuse):
    #   k0jJL_YFyIU - Barack Obama
    #   E0UcjSHd7mo - Queen Elizabeth
    #   w19Yu_1kHAk - Boris Johnson
    # TODO - always download as specific format or format with ffmpeg afterwards
    # NOTE(review): this downloads w19Yu_1kHAk while get_words scraped
    # Xk_pi_nq838 above -- confirm which video tmp/test.mp4 should be.
    os.system('youtube-dl https://www.youtube.com/watch?v=w19Yu_1kHAk')  # ' -o tmp/{word.person}.%\(ext\)s'
    return 'get_words!'
# generates audio/video clips of the stored words
@app.route('/compose')
def compose():
    """Cut candidate .wav clips out of test.mp4 for every stored word.

    For each word, STEPS+1 clips are written, each shifted by GAP milliseconds
    around the stored timecode, so that /clean can later keep the one whose
    transcription matches the word.
    """
    import syllables  # local import: only this route needs it

    # words = Words.query.all()
    words = Words.query.filter_by(person="Queen Elizabeth")
    for word in words:
        print("===============================================================")
        print(word)
        print(word.url)
        print(word.timecode)
        # NOTE(review): -3600000 presumably compensates for the local-timezone
        # (+1h) shift introduced by milliseconds_to_timestamp -- confirm.
        tc = milliseconds_to_timestamp(int(word.timecode) - 3600000)
        print(tc)
        # Rough clip duration: a fixed lead-in plus a per-syllable allowance
        # that grows with word length. (Was `len(word.word*2)`, which equals
        # `len(word.word) * 2` but read as a typo.)
        per_syllable_mills = 80 + len(word.word) * 2
        clip_duration_mills = 275 + syllables.estimate(word.word) * per_syllable_mills
        # Build the ffmpeg -t duration. The previous code interpolated the
        # raw millisecond total after the decimal point, which produced a
        # malformed timestamp (e.g. "00:00:01.1075") whenever the total
        # reached 1000 ms.
        secs, rem = divmod(clip_duration_mills, 1000)
        duration = f'00:00:{secs:02d}.{rem:03d}'
        i = 0
        while i <= STEPS:
            print(i)
            i = i + 1
            # slide the cut point by GAP ms per step, centred via OFFSET
            tc3 = milliseconds_to_timestamp(int(word.timecode) - 3600000 + (OFFSET + (i * GAP)))
            # extract mono 16 kHz PCM audio -- the format DeepSpeech expects
            os.system(f'ffmpeg -y -ss {tc3} -i test.mp4 -map 0 -t {duration} -vn -acodec pcm_s16le -ar 16000 -ac 1 tmp/{word.filename}{i}.wav')
    return 'compose!'
# verifies the words using a neural network | |
@app.route('/clean')
def clean():
    """Transcribe every candidate clip with DeepSpeech; delete the mismatches.

    Keeps tmp/<filename><i>.wav only when the transcription equals the stored
    word exactly.
    """
    from deepspeech import Model
    import scipy.io.wavfile as wav

    # Load the model once up front -- it was previously re-created for every
    # single clip inside the inner loop, which dominated this route's runtime.
    # deepspeech --model deepspeech-0.5.1-models/output_graph.pbmm --alphabet deepspeech-0.5.1-models/alphabet.txt --lm deepspeech-0.5.1-models/lm.binary --trie deepspeech-0.5.1-models/trie --audio tmp/future.wav
    ds = Model("deepspeech-0.5.1-models/output_graph.pbmm", 26, 9,
               "deepspeech-0.5.1-models/alphabet.txt", 500)
    # words = Words.query.all()
    words = Words.query.filter_by(person="Queen Elizabeth")
    for word in words:
        i = 0
        while i <= STEPS:
            i = i + 1
            try:
                fs, audio = wav.read(f"tmp/{word.filename}{i}.wav")
                processed_data = ds.stt(audio, fs)
                print("processed_data::")
                print(processed_data)
                # TODO - check if it's 90% the same?
                win = (processed_data == word.word)
                print("result::", word.word, win)
                # TODO - last one not being deleted
                if not win:
                    os.remove(f"tmp/{word.filename}{i}.wav")  # was: os.system("rm ...")
                # TODO - if win update the offsets?
            except Exception as e:
                # a clip may be missing or unreadable; log and move on
                print("fail", word.word)
                print(e)
    return 'clean!'
def milliseconds_to_timestamp(milliseconds): | |
return datetime.datetime.fromtimestamp(int(milliseconds)/1000).strftime('%H:%M:%S.%f')[:-3] | |
def clean_text(text):
    """Trim surrounding whitespace and flatten embedded newlines to spaces."""
    stripped = text.strip()
    return stripped.replace("\n", " ")
def get_words(start_time, end_time, url_video, url_caption, person):
    """Download a YouTube timedtext (srv3) caption file and store one DB row per word.

    start_time / end_time -- bounds in milliseconds (end_time == 0 seems to
        mean "to the end"); currently only logged. TODO: restrict parsing to
        this window.
    url_video -- the YouTube watch URL the words belong to.
    url_caption -- the timedtext API URL for that video's captions.
    person -- speaker label stored with every word. GLOBAL; can be anyone.
    """
    import uuid  # was imported inside the per-word loop

    print("get_words::", start_time, end_time, url_video, url_caption, person)
    start = milliseconds_to_timestamp(start_time)
    end = milliseconds_to_timestamp(end_time)
    # TODO - only search between these points
    # TODO - need to know what to do if end is 0
    # TODO - download the timecode files?
    response = requests.get(url_caption)
    tree = ElementTree.fromstring(response.content)
    words, timecodes = _parse_timedtext(tree)
    for count, text in enumerate(words):
        print(text, timecodes[count])
        # short random suffix keeps clip filenames unique per word instance
        uid = uuid.uuid4().hex[0:5]
        row = Words(person=person, word=text, url=url_video,
                    timecode=timecodes[count], filename=text + uid)
        db.session.add(row)
    db.session.commit()


def _parse_timedtext(tree):
    """Extract parallel (words, timecodes) lists from a parsed srv3 document.

    Expected element shape (from a real response):
        <p t="1433610" d="7590" w="1">
          <s ac="252">safeguards</s>
          <s t="1250" ac="166"> agriculture</s>
          <s t="2250" ac="201"> and</s>
          ...
        </p>
    <p t="..."> is the paragraph start in milliseconds; each <s t="..."> is
    that word's offset WITHIN the paragraph (the first <s> carries none).
    The "ac" attribute looks like a per-word confidence score, not a time --
    the previous code summed it into the timecodes, which produced wrong,
    non-monotonic timestamps.
    """
    words = []
    timecodes = []
    body = tree.find("body")
    for p in body.findall("p"):
        if p.text is not None and p.text != "\n":
            # a <p> with direct text is a whole sentence; only per-word
            # <s> children are handled for now
            print("sentences")
            print(p.text)
            # TODO - can at least do 1st word
            continue
        base = int(p.attrib['t'])
        for s in p.findall("s"):
            words.append(clean_text(s.text))
            try:
                # per-word start = paragraph start + the <s> "t" offset
                timecodes.append(base + int(s.attrib['t']))
            except KeyError:
                # first word of a paragraph carries no offset
                timecodes.append(base)
    return words, timecodes
def fetch_word(person, word):
    """Look up one stored word for *person* and rip its clip via youtube-dl."""
    try:
        word = Words.query.filter_by(person=person).filter_by(word=word).first()
        # NOTE(review): -3600000 presumably undoes the local-timezone (+1h)
        # shift from milliseconds_to_timestamp -- confirm.
        tc = milliseconds_to_timestamp(int(word.timecode) - 3600000)
        command = f'youtube-dl -x --postprocessor-args "-ss {tc} -t 00:00:00.500" {word.url} -o tmp/{word.word}.%\(ext\)s'
        os.system(command)
    except Exception as e:
        print(e)
# person - the actor whose stored words to use
# sentence - the sentence you want to generate
def composite_video(person, sentence):
    """Fetch a clip for every space-separated word of *sentence* spoken by *person*."""
    print("composite_video", sentence, person)
    for token in sentence.split(" "):
        fetch_word(person, token)
# Database | |
# words | |
# person | word | url | timecode | duration | |
# TODO - see about generating phonemes from words | |
# phonemes | |
# person | sound | url | timecode | duration | |
# UI | |
# 1: page where you can choose an actor and type a sentence.
# 2: result page where you can change the word, as we may have multiple takes of a word
# 3: also let users give feedback per word, to kill off bad words.
# TODO - | |
# think of potential validation methods | |
# consider downloading the videos to make composition faster - but its lots of data to store. do for 1. see how much? | |
# tooling - tool to generate new models. i.e. a preview results and save to master set. | |
# Ideas/pages - | |
# search by word or piece of dialogue to jump into a video | |
# Run only when executed as a script (e.g. `python app.py`); the previous
# module-level calls would also fire on import.
if __name__ == "__main__":
    # create the SQLite tables on first run, then start the dev server
    db.create_all()
    app.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment