# Source: GitHub gist by @rumbu13 (rumbu13/ef85148016853d5f63250e4ccd6b0353),
# created April 22, 2023.
from flask import Flask, Response, request, jsonify
from tempfile import NamedTemporaryFile
from faster_whisper import WhisperModel
import re
import time
import os
from statistics import mean
import logging
from unidecode import unidecode
import multiprocessing
# Flask application object; the route defined below is registered on it.
app = Flask(__name__)
# Cache of loaded WhisperModel instances keyed by model name; filled lazily
# by the request handler the first time a given model is requested.
models = {}
# Let INFO-level messages through the application logger, then record that
# the module has been loaded (this runs at import time).
app.logger.setLevel(logging.INFO)
app.logger.info("Started")
def str2bool(v):
    """Interpret a textual flag as a boolean.

    The comparison is case-insensitive and ignores surrounding whitespace.

    Args:
        v: The value to interpret, normally a query-string value. ``None`` is
            treated as "not set"; any other non-string value is converted
            with ``str()`` before being compared.

    Returns:
        ``True`` when the value is one of ``yes``, ``true``, ``on``, ``y`` or
        ``1``; ``False`` for everything else, including ``None`` and the
        empty string.
    """
    if v is None:
        return False
    return str(v).strip().lower() in ("yes", "true", "on", "y", "1")
@app.route('/api/speech-to-text', methods=['POST'])
def handler():
    """Transcribe the audio sent in the POST body and return the text as JSON.

    Query parameters:
        model: Model name handed to ``WhisperModel`` (default ``tiny``).
        beamSize: Positive integer beam size for decoding (default ``5``).
        language: Language code passed to the transcriber (default ``ro``).
        stripAccents: Boolean flag parsed by ``str2bool`` (default ``no``);
            when set, the cleaned text is passed through ``unidecode``.

    Returns:
        A JSON object with the cleaned ``text``, the ``original_text``, the
        ``transcribe_seconds`` spent decoding, a ``likelihood`` score and an
        echo of the effective parameters. A JSON object with an ``error``
        message and HTTP status 400 is returned when ``beamSize`` is not a
        positive integer or the request body is empty.
    """
    # remote_addr is the peer's address; request.host would be the host name
    # this server was addressed by, not where the connection came from.
    app.logger.info("Incoming connection from %s", request.remote_addr)

    model_name = request.args.get('model', 'tiny')
    language = request.args.get('language', 'ro')
    strip_accents = str2bool(request.args.get('stripAccents', "no"))

    # Reject a malformed beam size up front instead of failing with a 500.
    try:
        beam_size = int(request.args.get('beamSize', 5))
    except ValueError:
        beam_size = 0
    if beam_size < 1:
        return jsonify(error="beamSize must be a positive integer"), 400

    data = request.get_data()
    if not data:
        return jsonify(error="Request body must contain audio data"), 400

    # NOTE(security): model_name comes straight from the query string and is
    # passed unchanged to WhisperModel. Presumably it may be resolved as a
    # local path or trigger a download -- consider restricting it to an
    # allow-list of known model names (verify against faster-whisper docs).
    if model_name not in models:
        app.logger.warning("Model %s was not cached", model_name)
        models[model_name] = WhisperModel(
            model_name,
            device="cpu",
            compute_type="int8",
            cpu_threads=multiprocessing.cpu_count(),
        )
    model = models[model_name]

    # delete=False because the transcriber reopens the file by name; the
    # finally clause guarantees it is removed even when decoding fails.
    temp = NamedTemporaryFile(suffix='.wav', delete=False)
    try:
        app.logger.info("Saving audio stream to %s", temp.name)
        with temp:
            temp.write(data)

        start_time = time.perf_counter()
        iterable, _ = model.transcribe(
            temp.name,
            language=language,
            beam_size=beam_size,
            temperature=0,
            without_timestamps=True,
        )
        app.logger.info("Processing audio stream")
        # Consuming the iterable is included in the timed section.
        segments = list(iterable)
        end_time = time.perf_counter()
    finally:
        try:
            os.remove(temp.name)
        except OSError:
            # Do not let a cleanup failure mask the real outcome.
            app.logger.warning("Could not remove temporary file %s", temp.name)

    elapsed = end_time - start_time
    app.logger.info("Audio stream converted to text in %f seconds", elapsed)

    # NOTE(review): newer faster-whisper releases appear to name this field
    # ``avg_logprob`` -- confirm against the installed version.
    if segments:
        likelihood = 1 + mean(segment.avg_log_prob for segment in segments)
    else:
        # No segments (e.g. nothing recognised): mean() would raise on an
        # empty sequence, so report an empty transcript with a zero score.
        likelihood = 0.0

    text = " ".join(segment.text for segment in segments)
    # Replace runs of punctuation (plus trailing spaces) with a single space.
    clean_text = re.sub(r"[,.;@#?!&$]+\ *", " ", text).strip().lower()
    if strip_accents:
        clean_text = unidecode(clean_text)

    return jsonify(
        text=clean_text,
        original_text=text,
        transcribe_seconds=elapsed,
        likelihood=likelihood,
        model=model_name,
        beamSize=beam_size,
        language=language,
        stripAccents=strip_accents,
    )
# (End of gist; the text that followed was GitHub's comment sign-in prompt, not part of the program.)