#@title Check available memory of GPU
# Check that we are using 100% of GPU
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip -q install gputil
!pip -q install psutil
!pip -q install humanize

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and it isn't guaranteed
gpu = GPUs[0]


def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available),
          " | Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(
        gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil * 100, gpu.memoryTotal))


printm()

# If GPU RAM Util > 0% => crash notebook on purpose
# !kill -9 -1

# install transformers from source
!pip uninstall -y transformers
!pip install -q git+https://github.com/huggingface/transformers.git

# install py3nvml to track GPU memory usage
!pip install -q py3nvml

# download the benchmark and plotting scripts
!rm -f run_benchmark.py
!rm -f run_benchmark_tf.py
!rm -f plot_csv_file.py
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/benchmarking/run_benchmark.py -qq
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/benchmarking/run_benchmark_tf.py -qq
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/benchmarking/plot_csv_file.py -qq

# import pandas to pretty print csv files
import pandas as pd

!python run_benchmark.py --help

# create plots folder in /content
!mkdir -p plots_pt

# run benchmark
!python run_benchmark.py --no_speed --save_to_csv \
    --models a-ware/roberta-large-squad-classification \
             a-ware/xlmroberta-squadv2 \
             aodiniz/bert_uncased_L-10_H-512_A-8_cord19-200616_squad2 \
             deepset/roberta-base-squad2 \
             mrm8488/longformer-base-4096-finetuned-squadv2 \
    --sequence_lengths 32 128 512 1024 \
    --batch_sizes 32 \
    --inference_memory_csv_file plots_pt/required_memory.csv \
    --env_info_csv_file plots_pt/env.csv >/dev/null 2>&1  # redirect all prints

df = pd.read_csv('plots_pt/required_memory.csv')
df

df = pd.read_csv('plots_pt/env.csv')
df

# plot graph and save as image
!python plot_csv_file.py --csv_file plots_pt/required_memory.csv \
    --figure_png_file=plots_pt/required_memory_plot.png \
    --no_log_scale \
    --short_model_names a-ware-roberta a-aware-xlm aodiniz-bert deepset-roberta mrm8488-long

# show image
from IPython.display import Image
Image('plots_pt/required_memory_plot.png')

!python run_benchmark.py --no_speed --save_to_csv \
    --inference_memory_csv_file plots_pt/required_memory_2.csv \
    --env_info_csv_file plots_pt/env.csv \
    --models aodiniz/bert_uncased_L-10_H-512_A-8_cord19-200616_squad2 \
             deepset/roberta-base-squad2 \
    --sequence_lengths 512 \
    --batch_sizes 64 128 256 512 \
    --no_env_print

# plot graph and save as image
!python plot_csv_file.py --csv_file plots_pt/required_memory_2.csv \
    --figure_png_file=plots_pt/required_memory_plot_2.png \
    --no_log_scale \
    --short_model_names aodiniz-bert deepset-roberta \
    --plot_along_batch

# show image
from IPython.display import Image
Image('plots_pt/required_memory_plot_2.png')

# create plots folder in /content
!mkdir -p plots_tf

!TF_CPP_MIN_LOG_LEVEL=3 python run_benchmark_tf.py --no_speed --save_to_csv \
    --inference_memory_csv_file plots_tf/required_memory_2.csv \
    --env_info_csv_file plots_tf/env.csv \
    --models aodiniz/bert_uncased_L-10_H-512_A-8_cord19-200616_squad2 \
             deepset/roberta-base-squad2 \
    --sequence_lengths 512 \
    --batch_sizes 64 128 256 512 \
    --no_env_print
# plot graph and save as image
!python plot_csv_file.py --csv_file plots_tf/required_memory_2.csv \
    --figure_png_file=plots_tf/required_memory_plot_2.png \
    --no_log_scale \
    --short_model_names aodiniz-bert deepset-roberta \
    --plot_along_batch

# show image
from IPython.display import Image
Image('plots_tf/required_memory_plot_2.png')

!TF_CPP_MIN_LOG_LEVEL=3 python run_benchmark_tf.py --no_memory --save_to_csv \
    --inference_time_csv_file plots_tf/time_2.csv \
    --env_info_csv_file plots_tf/env.csv \
    --models aodiniz/bert_uncased_L-10_H-512_A-8_cord19-200616_squad2 \
             deepset/roberta-base-squad2 \
    --sequence_lengths 8 32 128 512 \
    --batch_sizes 256 \
    --no_env_print

# plot graph and save as image
!python plot_csv_file.py --csv_file plots_tf/time_2.csv \
    --figure_png_file=plots_tf/time_plot_2.png \
    --no_log_scale \
    --short_model_names aodiniz-bert deepset-roberta \
    --is_time

# show image
from IPython.display import Image
Image('plots_tf/time_plot_2.png')

!TF_CPP_MIN_LOG_LEVEL=3 python run_benchmark_tf.py --no_memory --save_to_csv \
    --inference_time_csv_file plots_tf/time_xla_1.csv \
    --env_info_csv_file plots_tf/env.csv \
    --models aodiniz/bert_uncased_L-10_H-512_A-8_cord19-200616_squad2 \
    --sequence_lengths 512 \
    --batch_sizes 8 64 256 \
    --no_env_print \
    --use_xla

# Imports
from transformers import BartConfig, PyTorchBenchmark, PyTorchBenchmarkArguments

BartConfig.from_pretrained("facebook/bart-large-mnli").to_diff_dict()

config_baseline = BartConfig.from_pretrained("facebook/bart-large-mnli")

config_768_hidden = BartConfig.from_pretrained("facebook/bart-large-mnli", d_model=768)

config_8_heads = BartConfig.from_pretrained("facebook/bart-large-mnli", decoder_attention_heads=8, encoder_attention_heads=8)

config_10000_vocab = BartConfig.from_pretrained("facebook/bart-large-mnli", vocab_size=10000)

config_8_layers = BartConfig.from_pretrained("facebook/bart-large-mnli", encoder_layers=8, decoder_layers=8)

# define args
args = PyTorchBenchmarkArguments(
    models=["bart-base", "bart-768-hid", "bart-8-head", "bart-10000-voc", "bart-8-lay"],
    no_speed=True,
    no_inference=True,
    training=True,
    train_memory_csv_file="plots_pt/training_mem_fp16.csv",
    save_to_csv=True,
    env_info_csv_file="plots_pt/env.csv",
    sequence_lengths=[64, 128, 256, 512],
    batch_sizes=[8],
    no_env_print=True,
    fp16=True,  # let's train on fp16
)

# create benchmark
benchmark = PyTorchBenchmark(
    configs=[config_baseline, config_768_hidden, config_8_heads, config_10000_vocab, config_8_layers],
    args=args,
)

# run benchmark
result = benchmark.run()

# plot graph and save as image
!python plot_csv_file.py --csv_file plots_pt/training_mem_fp16.csv \
    --figure_png_file=plots_pt/training_mem_fp16.png \
    --no_log_scale

# show image
from IPython.display import Image
Image('plots_pt/training_mem_fp16.png')

# define args
args = PyTorchBenchmarkArguments(
    models=["bart-8-head", "bart-8-lay"],
    no_inference=True,
    training=True,
    no_memory=True,
    train_time_csv_file="plots_pt/training_speed_fp16.csv",
    save_to_csv=True,
    env_info_csv_file="plots_pt/env.csv",
    sequence_lengths=[32, 128, 512],
    batch_sizes=[8],
    no_env_print=True,
    repeat=1,  # to make speed measurement faster but less accurate
    no_multi_process=True,  # google colab has problems with multi processing
    fp16=True,
)

# create benchmark
benchmark = PyTorchBenchmark(configs=[config_8_heads, config_8_layers], args=args)

# run benchmark
result = benchmark.run()

# plot graph and save as image
!python plot_csv_file.py --csv_file plots_pt/training_speed_fp16.csv \
    --figure_png_file=plots_pt/training_speed_fp16.png \
    --no_log_scale \
    --is_time

# show image
from IPython.display import Image
Image('plots_pt/training_speed_fp16.png')
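
# Not part of the original notebook: a minimal sketch of inspecting the saved
# training-memory results directly with pandas instead of plot_csv_file.py.
# The column names ("model", "batch_size", "sequence_length", "result") are an
# assumption about the CSV layout written by the benchmark scripts and may
# differ between transformers versions; check df.columns first.
import pandas as pd

df = pd.read_csv("plots_pt/training_mem_fp16.csv")
print(df.columns)  # verify the assumed column names before pivoting

# one row per model, one column per sequence length, a memory value in each cell
pivot = df.pivot_table(index="model", columns="sequence_length", values="result")
print(pivot)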