|
@@ -0,0 +1,205 @@
|
|
|
|
+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
|
|
|
|
+#
|
|
|
|
+# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
+# you may not use this file except in compliance with the License.
|
|
|
|
+# You may obtain a copy of the License at
|
|
|
|
+#
|
|
|
|
+# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
+#
|
|
|
|
+# Unless required by applicable law or agreed to in writing, software
|
|
|
|
+# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
+# See the License for the specific language governing permissions and
|
|
|
|
+# limitations under the License.
|
|
|
|
+# ==============================================================================
|
|
|
|
+
|
|
|
|
+"""Defines routines to compute mel spectrogram features from audio waveform."""
|
|
|
|
+
|
|
|
|
+import numpy as np
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def frame(data, window_length, hop_length):
|
|
|
|
+ """Convert array into a sequence of successive possibly overlapping frames.
|
|
|
|
+ An n-dimensional array of shape (num_samples, ...) is converted into an
|
|
|
|
+ (n+1)-D array of shape (num_frames, window_length, ...), where each frame
|
|
|
|
+ starts hop_length points after the preceding one.
|
|
|
|
+ This is accomplished using stride_tricks, so the original data is not
|
|
|
|
+ copied. However, there is no zero-padding, so any incomplete frames at the
|
|
|
|
+ end are not included.
|
|
|
|
+ Args:
|
|
|
|
+ data: np.array of dimension N >= 1.
|
|
|
|
+ window_length: Number of samples in each frame.
|
|
|
|
+ hop_length: Advance (in samples) between each window.
|
|
|
|
+ Returns:
|
|
|
|
+ (N+1)-D np.array with as many rows as there are complete frames that can be
|
|
|
|
+ extracted.
|
|
|
|
+ """
|
|
|
|
+ num_samples = data.shape[0]
|
|
|
|
+ num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))
|
|
|
|
+ shape = (num_frames, window_length) + data.shape[1:]
|
|
|
|
+ strides = (data.strides[0] * hop_length,) + data.strides
|
|
|
|
+ return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def periodic_hann(window_length):
|
|
|
|
+ """Calculate a "periodic" Hann window.
|
|
|
|
+ The classic Hann window is defined as a raised cosine that starts and
|
|
|
|
+ ends on zero, and where every value appears twice, except the middle
|
|
|
|
+ point for an odd-length window. Matlab calls this a "symmetric" window
|
|
|
|
+ and np.hanning() returns it. However, for Fourier analysis, this
|
|
|
|
+ actually represents just over one cycle of a period N-1 cosine, and
|
|
|
|
+ thus is not compactly expressed on a length-N Fourier basis. Instead,
|
|
|
|
+ it's better to use a raised cosine that ends just before the final
|
|
|
|
+ zero value - i.e. a complete cycle of a period-N cosine. Matlab
|
|
|
|
+ calls this a "periodic" window. This routine calculates it.
|
|
|
|
+ Args:
|
|
|
|
+ window_length: The number of points in the returned window.
|
|
|
|
+ Returns:
|
|
|
|
+ A 1D np.array containing the periodic hann window.
|
|
|
|
+ """
|
|
|
|
+ return 0.5 - (0.5 * np.cos(2 * np.pi / window_length *
|
|
|
|
+ np.arange(window_length)))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def stft_magnitude(signal, fft_length,
|
|
|
|
+ hop_length=None,
|
|
|
|
+ window_length=None):
|
|
|
|
+ """Calculate the short-time Fourier transform magnitude.
|
|
|
|
+ Args:
|
|
|
|
+ signal: 1D np.array of the input time-domain signal.
|
|
|
|
+ fft_length: Size of the FFT to apply.
|
|
|
|
+ hop_length: Advance (in samples) between each frame passed to FFT.
|
|
|
|
+ window_length: Length of each block of samples to pass to FFT.
|
|
|
|
+ Returns:
|
|
|
|
+ 2D np.array where each row contains the magnitudes of the fft_length/2+1
|
|
|
|
+ unique values of the FFT for the corresponding frame of input samples.
|
|
|
|
+ """
|
|
|
|
+ frames = frame(signal, window_length, hop_length)
|
|
|
|
+ # Apply frame window to each frame. We use a periodic Hann (cosine of period
|
|
|
|
+ # window_length) instead of the symmetric Hann of np.hanning (period
|
|
|
|
+ # window_length-1).
|
|
|
|
+ window = periodic_hann(window_length)
|
|
|
|
+ windowed_frames = frames * window
|
|
|
|
+ return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+# Mel spectrum constants and functions.
|
|
|
|
+_MEL_BREAK_FREQUENCY_HERTZ = 700.0
|
|
|
|
+_MEL_HIGH_FREQUENCY_Q = 1127.0
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def hertz_to_mel(frequencies_hertz):
|
|
|
|
+ """Convert frequencies to mel scale using HTK formula.
|
|
|
|
+ Args:
|
|
|
|
+ frequencies_hertz: Scalar or np.array of frequencies in hertz.
|
|
|
|
+ Returns:
|
|
|
|
+ Object of same size as frequencies_hertz containing corresponding values
|
|
|
|
+ on the mel scale.
|
|
|
|
+ """
|
|
|
|
+ return _MEL_HIGH_FREQUENCY_Q * np.log(
|
|
|
|
+ 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def spectrogram_to_mel_matrix(num_mel_bins=20,
|
|
|
|
+ num_spectrogram_bins=129,
|
|
|
|
+ audio_sample_rate=8000,
|
|
|
|
+ lower_edge_hertz=125.0,
|
|
|
|
+ upper_edge_hertz=3800.0):
|
|
|
|
+ """Return a matrix that can post-multiply spectrogram rows to make mel.
|
|
|
|
+ Returns a np.array matrix A that can be used to post-multiply a matrix S of
|
|
|
|
+ spectrogram values (STFT magnitudes) arranged as frames x bins to generate a
|
|
|
|
+ "mel spectrogram" M of frames x num_mel_bins. M = S A.
|
|
|
|
+ The classic HTK algorithm exploits the complementarity of adjacent mel bands
|
|
|
|
+ to multiply each FFT bin by only one mel weight, then add it, with positive
|
|
|
|
+ and negative signs, to the two adjacent mel bands to which that bin
|
|
|
|
+ contributes. Here, by expressing this operation as a matrix multiply, we go
|
|
|
|
+ from num_fft multiplies per frame (plus around 2*num_fft adds) to around
|
|
|
|
+ num_fft^2 multiplies and adds. However, because these are all presumably
|
|
|
|
+ accomplished in a single call to np.dot(), it's not clear which approach is
|
|
|
|
+ faster in Python. The matrix multiplication has the attraction of being more
|
|
|
|
+ general and flexible, and much easier to read.
|
|
|
|
+ Args:
|
|
|
|
+ num_mel_bins: How many bands in the resulting mel spectrum. This is
|
|
|
|
+ the number of columns in the output matrix.
|
|
|
|
+ num_spectrogram_bins: How many bins there are in the source spectrogram
|
|
|
|
+ data, which is understood to be fft_size/2 + 1, i.e. the spectrogram
|
|
|
|
+ only contains the nonredundant FFT bins.
|
|
|
|
+ audio_sample_rate: Samples per second of the audio at the input to the
|
|
|
|
+ spectrogram. We need this to figure out the actual frequencies for
|
|
|
|
+ each spectrogram bin, which dictates how they are mapped into mel.
|
|
|
|
+ lower_edge_hertz: Lower bound on the frequencies to be included in the mel
|
|
|
|
+ spectrum. This corresponds to the lower edge of the lowest triangular
|
|
|
|
+ band.
|
|
|
|
+ upper_edge_hertz: The desired top edge of the highest frequency band.
|
|
|
|
+ Returns:
|
|
|
|
+ An np.array with shape (num_spectrogram_bins, num_mel_bins).
|
|
|
|
+ Raises:
|
|
|
|
+ ValueError: if frequency edges are incorrectly ordered or out of range.
|
|
|
|
+ """
|
|
|
|
+ nyquist_hertz = audio_sample_rate / 2.
|
|
|
|
+ if lower_edge_hertz < 0.0:
|
|
|
|
+ raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz)
|
|
|
|
+ if lower_edge_hertz >= upper_edge_hertz:
|
|
|
|
+ raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
|
|
|
|
+ (lower_edge_hertz, upper_edge_hertz))
|
|
|
|
+ if upper_edge_hertz > nyquist_hertz:
|
|
|
|
+ raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
|
|
|
|
+ (upper_edge_hertz, nyquist_hertz))
|
|
|
|
+ spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins)
|
|
|
|
+ spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)
|
|
|
|
+ # The i'th mel band (starting from i=1) has center frequency
|
|
|
|
+ # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge
|
|
|
|
+ # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in
|
|
|
|
+ # the band_edges_mel arrays.
|
|
|
|
+ band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
|
|
|
|
+ hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
|
|
|
|
+ # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins
|
|
|
|
+ # of spectrogram values.
|
|
|
|
+ mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))
|
|
|
|
+ for i in range(num_mel_bins):
|
|
|
|
+ lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]
|
|
|
|
+ # Calculate lower and upper slopes for every spectrogram bin.
|
|
|
|
+ # Line segments are linear in the *mel* domain, not hertz.
|
|
|
|
+ lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /
|
|
|
|
+ (center_mel - lower_edge_mel))
|
|
|
|
+ upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /
|
|
|
|
+ (upper_edge_mel - center_mel))
|
|
|
|
+ # .. then intersect them with each other and zero.
|
|
|
|
+ mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope,
|
|
|
|
+ upper_slope))
|
|
|
|
+ # HTK excludes the spectrogram DC bin; make sure it always gets a zero
|
|
|
|
+ # coefficient.
|
|
|
|
+ mel_weights_matrix[0, :] = 0.0
|
|
|
|
+ return mel_weights_matrix
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def log_mel_spectrogram(data,
|
|
|
|
+ audio_sample_rate=8000,
|
|
|
|
+ log_offset=0.0,
|
|
|
|
+ window_length_secs=0.025,
|
|
|
|
+ hop_length_secs=0.010,
|
|
|
|
+ **kwargs):
|
|
|
|
+ """Convert waveform to a log magnitude mel-frequency spectrogram.
|
|
|
|
+ Args:
|
|
|
|
+ data: 1D np.array of waveform data.
|
|
|
|
+ audio_sample_rate: The sampling rate of data.
|
|
|
|
+ log_offset: Add this to values when taking log to avoid -Infs.
|
|
|
|
+ window_length_secs: Duration of each window to analyze.
|
|
|
|
+ hop_length_secs: Advance between successive analysis windows.
|
|
|
|
+ **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix.
|
|
|
|
+ Returns:
|
|
|
|
+ 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank
|
|
|
|
+ magnitudes for successive frames.
|
|
|
|
+ """
|
|
|
|
+ window_length_samples = int(round(audio_sample_rate * window_length_secs))
|
|
|
|
+ hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
|
|
|
|
+ fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
|
|
|
|
+ spectrogram = stft_magnitude(
|
|
|
|
+ data,
|
|
|
|
+ fft_length=fft_length,
|
|
|
|
+ hop_length=hop_length_samples,
|
|
|
|
+ window_length=window_length_samples)
|
|
|
|
+ mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix(
|
|
|
|
+ num_spectrogram_bins=spectrogram.shape[1],
|
|
|
|
+ audio_sample_rate=audio_sample_rate, **kwargs))
|
|
|
|
+ return np.log(mel_spectrogram + log_offset)
|