123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205 |
- # Copyright 2017 The TensorFlow Authors All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ==============================================================================
- """Defines routines to compute mel spectrogram features from audio waveform."""
- import numpy as np
- def frame(data, window_length, hop_length):
- """Convert array into a sequence of successive possibly overlapping frames.
- An n-dimensional array of shape (num_samples, ...) is converted into an
- (n+1)-D array of shape (num_frames, window_length, ...), where each frame
- starts hop_length points after the preceding one.
- This is accomplished using stride_tricks, so the original data is not
- copied. However, there is no zero-padding, so any incomplete frames at the
- end are not included.
- Args:
- data: np.array of dimension N >= 1.
- window_length: Number of samples in each frame.
- hop_length: Advance (in samples) between each window.
- Returns:
- (N+1)-D np.array with as many rows as there are complete frames that can be
- extracted.
- """
- num_samples = data.shape[0]
- num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))
- shape = (num_frames, window_length) + data.shape[1:]
- strides = (data.strides[0] * hop_length,) + data.strides
- return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)
- def periodic_hann(window_length):
- """Calculate a "periodic" Hann window.
- The classic Hann window is defined as a raised cosine that starts and
- ends on zero, and where every value appears twice, except the middle
- point for an odd-length window. Matlab calls this a "symmetric" window
- and np.hanning() returns it. However, for Fourier analysis, this
- actually represents just over one cycle of a period N-1 cosine, and
- thus is not compactly expressed on a length-N Fourier basis. Instead,
- it's better to use a raised cosine that ends just before the final
- zero value - i.e. a complete cycle of a period-N cosine. Matlab
- calls this a "periodic" window. This routine calculates it.
- Args:
- window_length: The number of points in the returned window.
- Returns:
- A 1D np.array containing the periodic hann window.
- """
- return 0.5 - (0.5 * np.cos(2 * np.pi / window_length *
- np.arange(window_length)))
- def stft_magnitude(signal, fft_length,
- hop_length=None,
- window_length=None):
- """Calculate the short-time Fourier transform magnitude.
- Args:
- signal: 1D np.array of the input time-domain signal.
- fft_length: Size of the FFT to apply.
- hop_length: Advance (in samples) between each frame passed to FFT.
- window_length: Length of each block of samples to pass to FFT.
- Returns:
- 2D np.array where each row contains the magnitudes of the fft_length/2+1
- unique values of the FFT for the corresponding frame of input samples.
- """
- frames = frame(signal, window_length, hop_length)
- # Apply frame window to each frame. We use a periodic Hann (cosine of period
- # window_length) instead of the symmetric Hann of np.hanning (period
- # window_length-1).
- window = periodic_hann(window_length)
- windowed_frames = frames * window
- return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))
- # Mel spectrum constants and functions.
- _MEL_BREAK_FREQUENCY_HERTZ = 700.0
- _MEL_HIGH_FREQUENCY_Q = 1127.0
- def hertz_to_mel(frequencies_hertz):
- """Convert frequencies to mel scale using HTK formula.
- Args:
- frequencies_hertz: Scalar or np.array of frequencies in hertz.
- Returns:
- Object of same size as frequencies_hertz containing corresponding values
- on the mel scale.
- """
- return _MEL_HIGH_FREQUENCY_Q * np.log(
- 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
- def spectrogram_to_mel_matrix(num_mel_bins=20,
- num_spectrogram_bins=129,
- audio_sample_rate=8000,
- lower_edge_hertz=125.0,
- upper_edge_hertz=3800.0):
- """Return a matrix that can post-multiply spectrogram rows to make mel.
- Returns a np.array matrix A that can be used to post-multiply a matrix S of
- spectrogram values (STFT magnitudes) arranged as frames x bins to generate a
- "mel spectrogram" M of frames x num_mel_bins. M = S A.
- The classic HTK algorithm exploits the complementarity of adjacent mel bands
- to multiply each FFT bin by only one mel weight, then add it, with positive
- and negative signs, to the two adjacent mel bands to which that bin
- contributes. Here, by expressing this operation as a matrix multiply, we go
- from num_fft multiplies per frame (plus around 2*num_fft adds) to around
- num_fft^2 multiplies and adds. However, because these are all presumably
- accomplished in a single call to np.dot(), it's not clear which approach is
- faster in Python. The matrix multiplication has the attraction of being more
- general and flexible, and much easier to read.
- Args:
- num_mel_bins: How many bands in the resulting mel spectrum. This is
- the number of columns in the output matrix.
- num_spectrogram_bins: How many bins there are in the source spectrogram
- data, which is understood to be fft_size/2 + 1, i.e. the spectrogram
- only contains the nonredundant FFT bins.
- audio_sample_rate: Samples per second of the audio at the input to the
- spectrogram. We need this to figure out the actual frequencies for
- each spectrogram bin, which dictates how they are mapped into mel.
- lower_edge_hertz: Lower bound on the frequencies to be included in the mel
- spectrum. This corresponds to the lower edge of the lowest triangular
- band.
- upper_edge_hertz: The desired top edge of the highest frequency band.
- Returns:
- An np.array with shape (num_spectrogram_bins, num_mel_bins).
- Raises:
- ValueError: if frequency edges are incorrectly ordered or out of range.
- """
- nyquist_hertz = audio_sample_rate / 2.
- if lower_edge_hertz < 0.0:
- raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz)
- if lower_edge_hertz >= upper_edge_hertz:
- raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
- (lower_edge_hertz, upper_edge_hertz))
- if upper_edge_hertz > nyquist_hertz:
- raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
- (upper_edge_hertz, nyquist_hertz))
- spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins)
- spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)
- # The i'th mel band (starting from i=1) has center frequency
- # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge
- # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in
- # the band_edges_mel arrays.
- band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
- hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
- # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins
- # of spectrogram values.
- mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))
- for i in range(num_mel_bins):
- lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]
- # Calculate lower and upper slopes for every spectrogram bin.
- # Line segments are linear in the *mel* domain, not hertz.
- lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /
- (center_mel - lower_edge_mel))
- upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /
- (upper_edge_mel - center_mel))
- # .. then intersect them with each other and zero.
- mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope,
- upper_slope))
- # HTK excludes the spectrogram DC bin; make sure it always gets a zero
- # coefficient.
- mel_weights_matrix[0, :] = 0.0
- return mel_weights_matrix
- def log_mel_spectrogram(data,
- audio_sample_rate=8000,
- log_offset=0.0,
- window_length_secs=0.025,
- hop_length_secs=0.010,
- **kwargs):
- """Convert waveform to a log magnitude mel-frequency spectrogram.
- Args:
- data: 1D np.array of waveform data.
- audio_sample_rate: The sampling rate of data.
- log_offset: Add this to values when taking log to avoid -Infs.
- window_length_secs: Duration of each window to analyze.
- hop_length_secs: Advance between successive analysis windows.
- **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix.
- Returns:
- 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank
- magnitudes for successive frames.
- """
- window_length_samples = int(round(audio_sample_rate * window_length_secs))
- hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
- fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
- spectrogram = stft_magnitude(
- data,
- fft_length=fft_length,
- hop_length=hop_length_samples,
- window_length=window_length_samples)
- mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix(
- num_spectrogram_bins=spectrogram.shape[1],
- audio_sample_rate=audio_sample_rate, **kwargs))
- return np.log(mel_spectrogram + log_offset)
|