Created
March 27, 2019 02:37
-
-
Save deivguerrero/2811a841809312737ab7c8bbac5e5590 to your computer and use it in GitHub Desktop.
Script que obtiene la transcripción de un audio alojado en Cloud Storage y busca coincidencias con expresiones regulares
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import audioread
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
# Nominal audio properties. main() reads the real channel count from the
# local copy of the audio file; AUDIO_RATE is kept for reference only, since
# the script has never passed a sample rate to RecognitionConfig.
AUDIO_CHANNELS = 2
AUDIO_RATE = 16000
BLOB_PATH = "audio.flac"
BUCKET_NAME = "audio-devlife"
LANG_CODE = 'es-MX'
ORIGINAL_PHRASE = r'(?P<phrase>(vi|bi){1}da de programador)'
POSSIBLE_PHRASE = r'(?P<phrase>(vi|bi){1}.{0,3}(de)? programador)'
DEV_PHRASE = r'programador'

# Flags must be combined with "|". The previous "&" of two distinct flag bits
# evaluated to 0, so every search silently ran case-sensitively.
MATCH_FLAGS = re.MULTILINE | re.IGNORECASE


def count_matches(pattern, text):
    """Return how many non-overlapping matches of *pattern* occur in *text*.

    The search uses MATCH_FLAGS, so it is case-insensitive.
    """
    return len(re.findall(pattern, text, MATCH_FLAGS))


def main():
    """Transcribe the audio stored in Cloud Storage and count phrase matches.

    Prints every transcript alternative returned by the service, followed by
    the totals for the exact phrase, the approximate phrase and the single
    word "programador".
    """
    # The channel count comes from the local file named BLOB_PATH; the same
    # name is used as the object path inside the bucket.
    with audioread.audio_open(BLOB_PATH) as f:
        channels = int(f.channels)

    gcs_uri = "gs://{}/{}".format(BUCKET_NAME, BLOB_PATH)
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code=LANG_CODE,
        audio_channel_count=channels,
        enable_separate_recognition_per_channel=True)

    # Blocks until the long-running recognition operation has finished.
    operation = client.long_running_recognize(config, audio)
    response = operation.result()

    contador_phx = 0
    contador_php = 0
    contador_dev = 0
    for result in response.results:
        for alternative in result.alternatives:
            print('=' * 20)
            text_block = alternative.transcript
            print(text_block)
            contador_phx += count_matches(ORIGINAL_PHRASE, text_block)
            contador_php += count_matches(POSSIBLE_PHRASE, text_block)
            contador_dev += count_matches(DEV_PHRASE, text_block)

    print("\nFRASE ORIGINAL: {}\tFRASE POSIBLE: {}\t PALABRA PROGRAMADOR:{}".
          format(contador_phx, contador_php, contador_dev))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment