This notebook demonstrates ART's PyTorchDeepSpeech estimator, a wrapper around the DeepSpeech model from deepspeech.pytorch, and the Imperceptible ASR attack of Qin et al. (2019). The attack runs in two stages: the first stage searches for a targeted adversarial perturbation by minimizing the model's CTC loss with respect to a target transcription, and the second stage shrinks the perturbation below a psychoacoustic masking threshold so that it becomes hard to hear.
import os
import torch
import numpy as np
import IPython.display as ipd
import matplotlib.pyplot as plt
from deepspeech_pytorch.loader.data_loader import load_audio
from art.estimators.speech_recognition import PyTorchDeepSpeech
from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch
from art import config
from art.utils import get_file
# Set seed
np.random.seed(1234)
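As an optional extra (not in the original notebook), PyTorch can be seeded as well, since the attack below may draw additional randomness through torch.
# Seed PyTorch for more repeatable attack runs
torch.manual_seed(1234)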
# Prepare to download data
data_dir = os.path.join(config.ART_DATA_PATH, "deepspeech_audio")
current_dir = %pwd
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
# Download audio data
get_file('librispeech.py', 'https://raw.githubusercontent.com/SeanNaren/deepspeech.pytorch/master/data/librispeech.py', path=data_dir)
%cd $data_dir
!python librispeech.py --files-to-use test-clean.tar.gz
%cd $current_dir
/home/minhtn/.art/data/deepspeech_audio
Skipping url: http://www.openslr.org/resources/12/train-clean-100.tar.gz
Skipping url: http://www.openslr.org/resources/12/train-clean-360.tar.gz
Skipping url: http://www.openslr.org/resources/12/train-other-500.tar.gz
Sorting manifests...
Pruning manifests between 1 and 15 seconds
0it [00:00, ?it/s]
Skipping url: http://www.openslr.org/resources/12/dev-clean.tar.gz
Skipping url: http://www.openslr.org/resources/12/dev-other.tar.gz
Sorting manifests...
0it [00:00, ?it/s]
100% [..................................................] 346663984 / 346663984
Unpacking test-clean.tar.gz...
Converting flac files to wav and extracting transcripts...
129it [00:29, 4.38it/s]
Finished http://www.openslr.org/resources/12/test-clean.tar.gz
Sorting manifests...
100%|████████████████████████████████████| 2620/2620 [00:00<00:00, 69321.65it/s]
Skipping url: http://www.openslr.org/resources/12/test-other.tar.gz
Sorting manifests...
0it [00:00, ?it/s]
/home/minhtn/ibm/projects/adversarial-robustness-toolbox/notebooks
# Create a DeepSpeech estimator
speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech")
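As a quick sanity check (an addition, not in the original notebook), the wrapped DeepSpeech model exposes its character alphabet, which the transcript encoding below relies on; judging by the encodings printed later, it consists of the CTC blank, an apostrophe, A-Z, and the space character.
# Inspect the model's character alphabet
print(speech_recognizer.model.labels)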
def display_waveform(waveform, title="", sample_rate=16000):
    """
    Display waveform plot and audio play UI.
    """
    plt.figure()
    plt.title(title)
    plt.plot(waveform)
    ipd.display(ipd.Audio(waveform, rate=sample_rate))
# Map each character in the model's alphabet to its label index
labels_map = {label: i for i, label in enumerate(speech_recognizer.model.labels)}
def parse_transcript(path):
    """Read a transcript file and encode it with the model's alphabet."""
    with open(path, 'r', encoding='utf8') as f:
        transcript = f.read().replace('\n', '')
    result = list(filter(None, [labels_map.get(x) for x in list(transcript)]))
    return transcript, result
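parse_transcript keeps only characters present in the model's alphabet. As an illustration (not part of the original notebook), the same mapping can be inverted to decode an encoded transcript back into text:
# Hypothetical helper: invert labels_map to decode index sequences back to text
index_to_char = {i: c for c, i in labels_map.items()}

def decode_transcript(encoded):
    """Map a sequence of label indices back to a string."""
    return "".join(index_to_char[i] for i in encoded)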
# A long audio sample
x1 = load_audio(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/wav/1089-134686-0000.wav"))
label1, encoded_label1 = parse_transcript(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/txt/1089-134686-0000.txt"))
print("Encoded label: ", encoded_label1)
print("Groundtrue label: ", label1)
display_waveform(x1, title="Long Sample")
Encoded label:  [9, 6, 28, 9, 16, 17, 6, 5, 28, 21, 9, 6, 19, 6, 28, 24, 16, 22, 13, 5, 28, 3, 6, 28, 20, 21, 6, 24, 28, 7, 16, 19, 28, 5, 10, 15, 15, 6, 19, 28, 21, 22, 19, 15, 10, 17, 20, 28, 2, 15, 5, 28, 4, 2, 19, 19, 16, 21, 20, 28, 2, 15, 5, 28, 3, 19, 22, 10, 20, 6, 5, 28, 17, 16, 21, 2, 21, 16, 6, 20, 28, 2, 15, 5, 28, 7, 2, 21, 28, 14, 22, 21, 21, 16, 15, 28, 17, 10, 6, 4, 6, 20, 28, 21, 16, 28, 3, 6, 28, 13, 2, 5, 13, 6, 5, 28, 16, 22, 21, 28, 10, 15, 28, 21, 9, 10, 4, 12, 28, 17, 6, 17, 17, 6, 19, 6, 5, 28, 7, 13, 16, 22, 19, 28, 7, 2, 21, 21, 6, 15, 6, 5, 28, 20, 2, 22, 4, 6]
Groundtruth label:  HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS AND BRUISED POTATOES AND FAT MUTTON PIECES TO BE LADLED OUT IN THICK PEPPERED FLOUR FATTENED SAUCE
# A short audio sample
x2 = load_audio(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/wav/1089-134691-0003.wav"))
label2, encoded_label2 = parse_transcript(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/txt/1089-134691-0003.txt"))
print("Encoded label: ", encoded_label2)
print("Groundtrue label: ", label2)
display_waveform(x2, title="Short Sample")
Encoded label:  [21, 9, 6, 28, 22, 15, 10, 23, 6, 19, 20, 10, 21, 26]
Groundtruth label:  THE UNIVERSITY
# Another short audio sample
x3 = load_audio(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/wav/1089-134691-0018.wav"))
label3, encoded_label3 = parse_transcript(os.path.join(data_dir, "LibriSpeech_dataset/test_clean/txt/1089-134691-0018.txt"))
print("Encoded label: ", encoded_label3)
print("Groundtrue label: ", label3)
display_waveform(x3, title="Short Sample")
Encoded label:  [2, 8, 2, 10, 15, 28, 2, 8, 2, 10, 15]
Groundtruth label:  AGAIN AGAIN
pred1 = speech_recognizer.predict(np.array([x1]), transcription_output=True)
print("Groundtruth label: ", label1)
print("Predicted label: ", pred1[0])
Groundtruth label:  HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS AND BRUISED POTATOES AND FAT MUTTON PIECES TO BE LADLED OUT IN THICK PEPPERED FLOUR FATTENED SAUCE
Predicted label:  HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS AND BRUISED POTATOES AND FAT MUTTON PIECES TO BE LADLED OUT IN THICK PEPPERD FLOUR FAT AND SAUCE
pred2 = speech_recognizer.predict(np.array([x2]), transcription_output=True)
print("Groundtruth label: ", label2)
print("Predicted label: ", pred2[0])
Groundtruth label:  THE UNIVERSITY
Predicted label:  THE UNIVERSITY
pred3 = speech_recognizer.predict(np.array([x3]), transcription_output=True)
print("Groundtruth label: ", label3)
print("Predicted label: ", pred3[0])
Groundtruth label:  AGAIN AGAIN
Predicted label:  AGAIN AGAIN
x = np.array([x1, x2, x3])
pred_all = speech_recognizer.predict(x, transcription_output=True)
print("Predicted labels: ", pred_all)
Predicted labels:  ['HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS AND BRUISED POTATOES AND FAT MUTTON PIECES TO BE LADLED OUT IN THICK PEPPERD FLOUR FAT AND SAUCE'
 'THE UNIVERSITY' 'AGAIN AGAIN']
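The long sample is transcribed almost, but not exactly, correctly. To put a number on the gap (an add-on sketch, not part of the original notebook), a character error rate based on edit distance works:
# Illustrative helper: character error rate via Levenshtein distance
def levenshtein(a, b):
    """Edit distance between two strings."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(prev[j] + 1,                # deletion
                            curr[j - 1] + 1,            # insertion
                            prev[j - 1] + (ca != cb)))  # substitution
        prev = curr
    return prev[-1]

for truth, pred in zip([label1, label2, label3], pred_all):
    print(f"CER: {levenshtein(truth, pred) / len(truth):.3f}")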
# The attack pads all inputs to a fixed length, so use the longest sample to be attacked
global_max_length = int(np.max([len(x2), len(x3)]))
# Define an Imperceptible ASR attack
asr_attack = ImperceptibleASRPyTorch(
    estimator=speech_recognizer,
    eps=0.05,                    # initial bound on the perturbation
    max_iter_1=100,              # stage-1 (targeted attack) iterations
    max_iter_2=500,              # stage-2 (imperceptibility) iterations
    learning_rate_1=0.00002,
    learning_rate_2=0.00002,
    optimizer_1=torch.optim.Adam,
    optimizer_2=torch.optim.Adam,
    global_max_length=global_max_length,
    initial_rescale=1.0,
    decrease_factor_eps=0.8,     # shrink eps when the attack succeeds,
    num_iter_decrease_eps=20,    # checked every num_iter_decrease_eps steps
    alpha=1.2,                   # weight of the imperceptibility loss in stage 2
    increase_factor_alpha=1.2,
    num_iter_increase_alpha=20,
    decrease_factor_alpha=0.8,
    num_iter_decrease_alpha=20,
    win_length=2048,             # STFT settings for the psychoacoustic
    hop_length=512,              # masking threshold
    n_fft=2048,
    batch_size=2,
    use_amp=False,
)
# Target labels
y = np.array(['THE UNIVERSAL', 'GAIN GAIN'])
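A small optional check (not in the original notebook): the attack encodes the targets with the same alphabet as the ground-truth labels, so every target character should exist in labels_map.
# Verify that all target characters are in the model's alphabet
assert all(c in labels_map for t in y for c in t)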
# Generate adversarial examples
x_adv = asr_attack.generate(np.array([x2, x3]), y)
First stage, step 0, loss 69.741150
First stage, step 5, loss 58.773037
First stage, step 10, loss 51.136898
First stage, step 15, loss 48.239479
First stage, step 20, loss 40.845539
First stage, step 25, loss 36.802544
First stage, step 30, loss 32.711205
First stage, step 35, loss 29.145208
First stage, step 40, loss 26.394424
First stage, step 45, loss 44.487350
First stage, step 50, loss 42.173428
First stage, step 55, loss 39.783951
First stage, step 60, loss 37.151741
First stage, step 65, loss 34.456093
First stage, step 70, loss 30.969639
First stage, step 75, loss 29.559433
First stage, step 80, loss 27.740614
First stage, step 85, loss 26.211428
First stage, step 90, loss 24.573879
First stage, step 95, loss 22.899168
/home/minhtn/ibm/installation/miniconda3/lib/python3.6/site-packages/torch/functional.py:581: UserWarning: stft will soon require the return_complex parameter be given for real inputs, and will further require that return_complex=True in a future PyTorch release. (Triggered internally at /pytorch/aten/src/ATen/native/SpectralOps.cpp:639.)
  normalized, onesided, return_complex)
Second stage, step 0, loss 887.815700
Second stage, step 5, loss 543.191106
Second stage, step 10, loss 395.780205
Second stage, step 15, loss 291.439771
Second stage, step 20, loss 216.803986
Second stage, step 25, loss 132.597252
Second stage, step 30, loss 102.339386
Second stage, step 35, loss 79.895121
Second stage, step 40, loss 62.953748
Second stage, step 45, loss 42.079926
Second stage, step 50, loss 34.634669
Second stage, step 55, loss 28.961950
Second stage, step 60, loss 24.520864
Second stage, step 65, loss 18.534713
Second stage, step 70, loss 16.328669
Second stage, step 75, loss 14.545206
Second stage, step 80, loss 13.060859
Second stage, step 85, loss 10.806125
Second stage, step 90, loss 9.900532
Second stage, step 95, loss 9.133247
Second stage, step 100, loss 8.474490
Second stage, step 105, loss 7.365411
Second stage, step 110, loss 6.920150
Second stage, step 115, loss 6.530938
Second stage, step 120, loss 6.185102
Second stage, step 125, loss 5.526468
Second stage, step 130, loss 5.271661
Second stage, step 135, loss 5.045556
Second stage, step 140, loss 4.838332
Second stage, step 145, loss 4.390867
Second stage, step 150, loss 4.219669
Second stage, step 155, loss 4.059496
Second stage, step 160, loss 3.907564
Second stage, step 165, loss 3.558051
Second stage, step 170, loss 3.421292
Second stage, step 175, loss 3.290129
Second stage, step 180, loss 3.162481
Second stage, step 185, loss 2.858444
Second stage, step 190, loss 2.736934
Second stage, step 195, loss 2.617957
Second stage, step 200, loss 2.500414
Second stage, step 205, loss 2.229840
Second stage, step 210, loss 2.124433
Second stage, step 215, loss 2.024690
Second stage, step 220, loss 1.932897
Second stage, step 225, loss 1.704043
Second stage, step 230, loss 1.623649
Second stage, step 235, loss 1.550399
Second stage, step 240, loss 1.483242
Second stage, step 245, loss 1.556927
Second stage, step 250, loss 1.504139
Second stage, step 255, loss 1.453637
Second stage, step 260, loss 1.400224
Second stage, step 265, loss 1.508619
Second stage, step 270, loss 1.453266
Second stage, step 275, loss 1.413941
Second stage, step 280, loss 1.369114
Second stage, step 285, loss 1.469223
Second stage, step 290, loss 1.431475
Second stage, step 295, loss 1.389646
Second stage, step 300, loss 1.353665
Second stage, step 305, loss 1.458392
Second stage, step 310, loss 1.424629
Second stage, step 315, loss 1.391908
Second stage, step 320, loss 1.348441
Second stage, step 325, loss 1.456609
Second stage, step 330, loss 1.423668
Second stage, step 335, loss 1.381091
Second stage, step 340, loss 1.349383
Second stage, step 345, loss 1.442660
Second stage, step 350, loss 1.427358
Second stage, step 355, loss 1.400758
Second stage, step 360, loss 1.349413
Second stage, step 365, loss 1.455723
Second stage, step 370, loss 1.457263
Second stage, step 375, loss 1.430490
Second stage, step 380, loss 1.408006
Second stage, step 385, loss 1.471980
Second stage, step 390, loss 1.503218
Second stage, step 395, loss 1.448039
Second stage, step 400, loss 1.399196
Second stage, step 405, loss 1.478893
Second stage, step 410, loss 1.549467
Second stage, step 415, loss 1.458022
Second stage, step 420, loss 1.429721
Second stage, step 425, loss 1.547260
Second stage, step 430, loss 1.545492
Second stage, step 435, loss 1.543032
Second stage, step 440, loss 1.577160
Second stage, step 445, loss 1.664575
Second stage, step 450, loss 1.583724
Second stage, step 455, loss 1.655049
Second stage, step 460, loss 1.600740
Second stage, step 465, loss 1.613063
Second stage, step 470, loss 1.850034
Second stage, step 475, loss 1.667144
Second stage, step 480, loss 1.567518
Second stage, step 485, loss 1.633470
Second stage, step 490, loss 1.770336
Second stage, step 495, loss 1.781754
adv_transcriptions = speech_recognizer.predict(x_adv, batch_size=2, transcription_output=True)
print("Groundtruth transcriptions: ", np.array([label2, label3]))
print("Target transcriptions: ", y)
print("Adversarial transcriptions: ", adv_transcriptions)
Groundtruth transcriptions:  ['THE UNIVERSITY' 'AGAIN AGAIN']
Target transcriptions:  ['THE UNIVERSAL' 'GAIN GAIN']
Adversarial transcriptions:  ['THE UNIVERSAL' 'GAIN GAIN']
# Trim the padding added by the attack before plotting and listening
display_waveform(x_adv[0][:len(x2)], title="THE UNIVERSITY perturbed to THE UNIVERSAL")
display_waveform(x_adv[1][:len(x3)], title="AGAIN AGAIN perturbed to GAIN GAIN")
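The attack succeeds on both samples: each adversarial waveform is transcribed as its target. To quantify how small the perturbation is (an illustrative check, not part of the original notebook), one can compute the signal-to-noise ratio of the perturbation after trimming the padding:
# Illustrative check: SNR of the adversarial perturbation, in dB
def snr_db(clean, adv):
    """10 * log10(signal power / perturbation power)."""
    noise = adv - clean
    return 10 * np.log10(np.sum(clean ** 2) / np.sum(noise ** 2))

for clean, adv in [(x2, x_adv[0][:len(x2)]), (x3, x_adv[1][:len(x3)])]:
    print(f"SNR: {snr_db(clean, adv):.1f} dB")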