Created
July 17, 2017 16:01
-
-
Save jfsantos/93dd14829a2d6f6a8e14a4cde0ad697a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import logging | |
import os | |
import numpy as np | |
import re | |
import soundfile | |
from numpy.lib.stride_tricks import as_strided | |
from maracas.maracas import asl_meter | |
from audio_tools import iterate_invert_spectrogram | |
def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128, pad=0): | |
""" | |
Compute the spectrogram for a real signal. | |
The parameters follow the naming convention of | |
matplotlib.mlab.specgram | |
Args: | |
samples (1D array): input audio signal | |
fft_length (int): number of elements in fft window | |
sample_rate (scalar): sample rate | |
hop_length (int): hop length (relative offset between neighboring | |
fft windows). | |
Returns: | |
x (2D array): spectrogram [frequency x time] | |
freq (1D array): frequency of each row in x | |
Note: | |
This is a truncating computation e.g. if fft_length=10, | |
hop_length=5 and the signal has 23 elements, then the | |
last 3 elements will be truncated. | |
""" | |
assert not np.iscomplexobj(samples), "Must not pass in complex numbers" | |
window = np.hanning(fft_length)[:, None] | |
window_norm = np.sum(window**2) | |
# The scaling below follows the convention of | |
# matplotlib.mlab.specgram which is the same as | |
# matlabs specgram. | |
scale = window_norm * sample_rate | |
trunc = (len(samples) - fft_length) % hop_length | |
x = samples[:len(samples) - trunc] | |
# "stride trick" reshape to include overlap | |
nshape = (fft_length, (len(x) - fft_length) // hop_length + 1) | |
nstrides = (x.strides[0], x.strides[0] * hop_length) | |
x = as_strided(x, shape=nshape, strides=nstrides) | |
# window stride sanity check | |
assert np.all(x[:, 1] == samples[hop_length:(hop_length + fft_length)]) | |
# broadcast window, compute fft over columns and square mod | |
x = np.fft.rfft(x * window, axis=0) | |
phase = np.angle(x) | |
x = np.absolute(x)**2 | |
# scale, 2.0 for everything except dc and fft_length/2 | |
x[1:-1, :] *= (2.0 / scale) | |
x[(0, -1), :] /= scale | |
freqs = float(sample_rate) / fft_length * np.arange(x.shape[0]) | |
if pad > 0: | |
x = np.pad(x, ((0, 0), (pad, pad)), 'constant') | |
phase = np.pad(phase, ((0, 0), (pad, pad)), 'constant') | |
return x, phase, freqs | |
def spectrogram_from_file(filename, step=10, window=20, max_freq=None, | |
eps=1e-14, log=True, pad=0): | |
""" Calculate the log of linear spectrogram from FFT energy | |
Params: | |
filename (str): Path to the audio file | |
step (int): Step size in milliseconds between windows | |
window (int): FFT window size in milliseconds | |
max_freq (int): Only FFT bins corresponding to frequencies between | |
[0, max_freq] are returned | |
eps (float): Small value to ensure numerical stability (for ln(x)) | |
""" | |
with soundfile.SoundFile(filename) as sound_file: | |
audio = sound_file.read(dtype='float32') | |
sample_rate = sound_file.samplerate | |
if audio.ndim >= 2: | |
audio = np.mean(audio, 1) | |
if max_freq is None: | |
max_freq = sample_rate / 2 | |
if max_freq > sample_rate / 2: | |
raise ValueError("max_freq must not be greater than half of " | |
" sample rate") | |
if step > window: | |
raise ValueError("step size must not be greater than window size") | |
hop_length = int(0.001 * step * sample_rate) | |
fft_length = int(0.001 * window * sample_rate) | |
pxx, phase, freqs = spectrogram( | |
audio, fft_length=fft_length, sample_rate=sample_rate, | |
hop_length=hop_length, pad=pad) | |
ind = np.where(freqs <= max_freq)[0][-1] + 1 | |
if log: | |
return np.transpose(np.log(pxx[:ind, :] + eps)), np.transpose(phase[:ind, :]) | |
else: | |
return np.transpose(pxx[:ind, :] + eps), np.transpose(phase[:ind, :]) | |
def normalize(y_hat, fs, level=-26.0): | |
# Normalize energy | |
y_hat = y_hat/10**(asl_meter(y_hat, fs)/20) * 10**(level/20) | |
return y_hat |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment