Noise RIR

To create a audio with reverberation:

  • get the RIR, resample it to the audio frame rate
  • convolve the audio with the rir

Sample code:

import numpy as np 
import scipy.signal as s
import torchaudio
import torch
import torchaudio.transforms as T
def reverb_rir(frames,rir):
        """
        frames is the clean audio numpy with shape [1, T]
        rir is the rir numpy with shape [1, T']
        returns: reverberated audio with shape [T'] (numpy)
        """
        orig_frames_shape = frames.shape
        frames,filter = np.squeeze(frames),np.squeeze(rir)
        frames = s.convolve(frames,filter)
        actlev = np.max(np.abs(frames))
        if(actlev > 0.99):
            frames = (frames / actlev) * 0.98
        frames = frames[:orig_frames_shape[1]]
        # print(frames.shape, orig_frames_shape)
        return frames

rir_impulse = "/home/bltang/work/data/impulse/datasets_fullband/impulse_responses/SLR26/simulated_rirs_48k/largeroom/Room002/Room002-00001.wav"
## 48khz

frame_path = "/home/bltang/work/voicefixer_main/test/clean/SSB00050001.wav"
## 44.1khz 
frame, frame_rate = torchaudio.load(frame_path)

rir, rir_rate = torchaudio.load(rir_impulse)

print(f"loaded audio frame: {frame.shape}, sample rate: {frame_rate}")
print(f"loaded rir: {rir.shape}, sample rate: {rir_rate}")

## downsample the rir to be 44.1khz
resampler = T.Resample(rir_rate, frame_rate, dtype=frame.dtype)
rir = resampler(rir)

frame = frame.numpy()
rir = rir.numpy()

## doing the convolution
output = reverb_rir(frame,rir)

output = torch.from_numpy(output).unsqueeze(0)
torchaudio.save("output.wav",output, frame_rate)

perform clipping:

### perform clipping
clip_factor = 0.1
z = torch.clamp(output,min = output.min() * clip_factor, max = output.max() * clip_factor)
print(z.min())
torchaudio.save("clamp.wav",z, frame_rate)