#collapse
# setting things up for pretty visualization
from rich import print
from pyannote.core import notebook, Segment

SAMPLE_EXTENT = Segment(0, 30)
notebook.crop = SAMPLE_EXTENT
SAMPLE_CHUNK = Segment(15, 20)
SAMPLE_URI = "sample"
SAMPLE_WAV = f"{SAMPLE_URI}.wav"
SAMPLE_REF = f"{SAMPLE_URI}.rttm"

# load the pretrained speaker segmentation model
from pyannote.audio import Model
model = Model.from_pretrained("pyannote/segmentation")

print(model.specifications)

model.introspection.frames.step

#collapse
# load the reference annotation (RTTM file) for visual comparison
from pyannote.database.util import load_rttm
reference = load_rttm(SAMPLE_REF)[SAMPLE_URI]
reference

#collapse
from IPython.display import Audio as AudioPlayer
AudioPlayer(SAMPLE_WAV)

#collapse
SAMPLE_CHUNK

# extract the 5s chunk, resampled to the sample rate expected by the model
from pyannote.audio import Audio
audio_reader = Audio(sample_rate=model.hparams.sample_rate)
waveform, sample_rate = audio_reader.crop(SAMPLE_WAV, SAMPLE_CHUNK)

#collapse
# visualize the chunk within the whole file (everything else is masked with NaNs)
import numpy as np
from pyannote.core import SlidingWindowFeature, SlidingWindow

_waveform, sample_rate = Audio()(SAMPLE_WAV)
_waveform = _waveform.numpy().T
_waveform[:round(SAMPLE_CHUNK.start * sample_rate)] = np.nan
_waveform[round(SAMPLE_CHUNK.end * sample_rate):] = np.nan
SlidingWindowFeature(_waveform, SlidingWindow(1. / sample_rate, 1. / sample_rate))

# apply the model on the chunk (adding a batch dimension first)
output = model(waveform[None])

#collapse
# wrap the raw output in a SlidingWindowFeature aligned with the chunk start time
_output = output.detach()[0].numpy()
shifted_frames = SlidingWindow(start=SAMPLE_CHUNK.start,
                               duration=model.introspection.frames.duration,
                               step=model.introspection.frames.step)
_output = SlidingWindowFeature(_output, shifted_frames)
_output

#collapse
reference.crop(SAMPLE_CHUNK)

# apply the model on the whole file, using a 5s window sliding every 2.5s
from pyannote.audio import Inference
inference = Inference(model, duration=5.0, step=2.5)
output = inference(SAMPLE_WAV)

#collapse
output

# chunk-level outputs: (num_chunks, num_frames, num_speakers)
output.data.shape

BATCH_AXIS = 0
TIME_AXIS = 1
SPEAKER_AXIS = 2

# voice activity detection: keep the highest speaker activation at each frame
to_vad = lambda o: np.max(o, axis=SPEAKER_AXIS, keepdims=True)
to_vad(output)

vad = Inference("pyannote/segmentation", pre_aggregation_hook=to_vad)
vad_prob = vad(SAMPLE_WAV)

#collapse
vad_prob.labels = ['SPEECH']
vad_prob

# binarize the speech probability with a 0.5 threshold
from pyannote.audio.utils.signal import Binarize
binarize = Binarize(onset=0.5)
speech = binarize(vad_prob)

#collapse
speech

#collapse
reference

# overlapped speech detection: keep the second highest speaker activation
to_osd = lambda o: np.partition(o, -2, axis=SPEAKER_AXIS)[:, :, -2, np.newaxis]
osd = Inference("pyannote/segmentation", pre_aggregation_hook=to_osd)
osd_prob = osd(SAMPLE_WAV)

#collapse
osd_prob.labels = ['OVERLAP']
osd_prob

binarize(osd_prob)

#collapse
reference

# instantaneous speaker count: sum of speaker activations at each frame
to_cnt = lambda probability: np.sum(probability, axis=SPEAKER_AXIS, keepdims=True)
cnt = Inference("pyannote/segmentation", pre_aggregation_hook=to_cnt)
cnt(SAMPLE_WAV)

# speaker change detection: largest frame-to-frame variation across speakers
to_scd = lambda probability: np.max(
    np.abs(np.diff(probability, n=1, axis=TIME_AXIS)),
    axis=SPEAKER_AXIS, keepdims=True)
scd = Inference("pyannote/segmentation", pre_aggregation_hook=to_scd)
scd_prob = scd(SAMPLE_WAV)

#collapse
scd_prob.labels = ['SPEAKER_CHANGE']
scd_prob

# turn peaks of the speaker change curve into boundaries, within speech regions only
from pyannote.audio.utils.signal import Peak
peak = Peak(alpha=0.05)
peak(scd_prob).crop(speech.get_timeline())

#collapse
reference

#collapse
output
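As a possible sanity check (not part of the original notebook), one could also quantify how well the binarized speech regions match the reference annotation with pyannote.metrics. The sketch below assumes pyannote.metrics is installed and reuses the `reference` and `speech` variables defined above.

#collapse
# hypothetical sanity check: detection error rate of the binarized VAD output
# against the reference annotation (requires pyannote.metrics)
from pyannote.metrics.detection import DetectionErrorRate

detection_error_rate = DetectionErrorRate()
# detailed=True returns the individual components (false alarm, missed detection)
print(detection_error_rate(reference, speech, detailed=True))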