#collapse
# setting things up for pretty visualization
from rich import print
from pyannote.core import notebook, Segment

SAMPLE_EXTENT = Segment(0, 30)
notebook.crop = SAMPLE_EXTENT
SAMPLE_CHUNK = Segment(15, 20)
SAMPLE_URI = "sample"
SAMPLE_WAV = f"{SAMPLE_URI}.wav"
SAMPLE_REF = f"{SAMPLE_URI}.rttm"

# load the pretrained speaker segmentation model
from pyannote.audio import Model
model = Model.from_pretrained("pyannote/segmentation")

print(model.specifications)

model.introspection.frames.step

#collapse
# load the reference annotation (RTTM file) for visual comparison
from pyannote.database.util import load_rttm
reference = load_rttm(SAMPLE_REF)[SAMPLE_URI]
reference

#collapse
from IPython.display import Audio as AudioPlayer
AudioPlayer(SAMPLE_WAV)

#collapse
SAMPLE_CHUNK

# extract the 5s chunk, resampled to the sample rate expected by the model
from pyannote.audio import Audio
audio_reader = Audio(sample_rate=model.hparams.sample_rate)
waveform, sample_rate = audio_reader.crop(SAMPLE_WAV, SAMPLE_CHUNK)

#collapse
# visualize the chunk within the whole file (everything else is masked with NaNs)
import numpy as np
from pyannote.core import SlidingWindowFeature, SlidingWindow

_waveform, sample_rate = Audio()(SAMPLE_WAV)
_waveform = _waveform.numpy().T
_waveform[:round(SAMPLE_CHUNK.start * sample_rate)] = np.nan
_waveform[round(SAMPLE_CHUNK.end * sample_rate):] = np.nan
SlidingWindowFeature(_waveform, SlidingWindow(1. / sample_rate, 1. / sample_rate))

# apply the model on the chunk (adding a batch dimension first)
output = model(waveform[None])

#collapse
# wrap the raw output in a SlidingWindowFeature aligned with the chunk start time
_output = output.detach()[0].numpy()
shifted_frames = SlidingWindow(start=SAMPLE_CHUNK.start,
                               duration=model.introspection.frames.duration,
                               step=model.introspection.frames.step)
_output = SlidingWindowFeature(_output, shifted_frames)
_output

#collapse
reference.crop(SAMPLE_CHUNK)

# apply the model on the whole file, using a 5s window sliding every 2.5s
from pyannote.audio import Inference
inference = Inference(model, duration=5.0, step=2.5)
output = inference(SAMPLE_WAV)

#collapse
output

# chunk-level outputs: (num_chunks, num_frames, num_speakers)
output.data.shape

BATCH_AXIS = 0
TIME_AXIS = 1
SPEAKER_AXIS = 2

# voice activity detection: keep the highest speaker activation at each frame
to_vad = lambda o: np.max(o, axis=SPEAKER_AXIS, keepdims=True)
to_vad(output)

vad = Inference("pyannote/segmentation", pre_aggregation_hook=to_vad)
vad_prob = vad(SAMPLE_WAV)

#collapse
vad_prob.labels = ['SPEECH']
vad_prob

# binarize the speech probability with a 0.5 threshold
from pyannote.audio.utils.signal import Binarize
binarize = Binarize(onset=0.5)
speech = binarize(vad_prob)

#collapse
speech

#collapse
reference

# overlapped speech detection: keep the second highest speaker activation
to_osd = lambda o: np.partition(o, -2, axis=SPEAKER_AXIS)[:, :, -2, np.newaxis]
osd = Inference("pyannote/segmentation", pre_aggregation_hook=to_osd)
osd_prob = osd(SAMPLE_WAV)

#collapse
osd_prob.labels = ['OVERLAP']
osd_prob

binarize(osd_prob)

#collapse
reference

# instantaneous speaker count: sum of speaker activations at each frame
to_cnt = lambda probability: np.sum(probability, axis=SPEAKER_AXIS, keepdims=True)
cnt = Inference("pyannote/segmentation", pre_aggregation_hook=to_cnt)
cnt(SAMPLE_WAV)

# speaker change detection: largest frame-to-frame variation across speakers
to_scd = lambda probability: np.max(
    np.abs(np.diff(probability, n=1, axis=TIME_AXIS)),
    axis=SPEAKER_AXIS, keepdims=True)
scd = Inference("pyannote/segmentation", pre_aggregation_hook=to_scd)
scd_prob = scd(SAMPLE_WAV)

#collapse
scd_prob.labels = ['SPEAKER_CHANGE']
scd_prob

# turn peaks of the speaker change curve into boundaries, within speech regions only
from pyannote.audio.utils.signal import Peak
peak = Peak(alpha=0.05)
peak(scd_prob).crop(speech.get_timeline())

#collapse
reference

#collapse
output
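As a possible sanity check (not part of the original notebook), one could also quantify how well the binarized speech regions match the reference annotation with pyannote.metrics. The sketch below assumes pyannote.metrics is installed and reuses the `reference` and `speech` variables defined above.

#collapse
# hypothetical sanity check: detection error rate of the binarized VAD output
# against the reference annotation (requires pyannote.metrics)
from pyannote.metrics.detection import DetectionErrorRate

detection_error_rate = DetectionErrorRate()
# detailed=True returns the individual components (false alarm, missed detection)
print(detection_error_rate(reference, speech, detailed=True))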