#@title Check available memory of GPU
# Check that we are using 100% of GPU
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip -q install gputil
!pip -q install psutil
!pip -q install humanize

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and it isn't guaranteed
gpu = GPUs[0]


def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available),
          " | Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(
        gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil * 100, gpu.memoryTotal))


printm()

# If GPU RAM Util > 0% => crash notebook on purpose
# !kill -9 -1

# install transformers from source
!pip uninstall -y transformers
!pip install -q git+https://github.com/huggingface/transformers.git

# install py3nvml to track GPU memory usage
!pip install -q py3nvml

# download the benchmark and plotting scripts
!rm -f run_benchmark.py
!rm -f run_benchmark_tf.py
!rm -f plot_csv_file.py
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/benchmarking/run_benchmark.py -qq
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/benchmarking/run_benchmark_tf.py -qq
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/benchmarking/plot_csv_file.py -qq

# import pandas to pretty print csv files
import pandas as pd

!python run_benchmark.py --help

# create plots folder in /content
!mkdir -p plots_pt

# run benchmark
!python run_benchmark.py --no_speed --save_to_csv \
    --models a-ware/roberta-large-squad-classification \
             a-ware/xlmroberta-squadv2 \
             aodiniz/bert_uncased_L-10_H-512_A-8_cord19-200616_squad2 \
             deepset/roberta-base-squad2 \
             mrm8488/longformer-base-4096-finetuned-squadv2 \
    --sequence_lengths 32 128 512 1024 \
    --batch_sizes 32 \
    --inference_memory_csv_file plots_pt/required_memory.csv \
    --env_info_csv_file plots_pt/env.csv >/dev/null 2>&1  # redirect all prints

df = pd.read_csv('plots_pt/required_memory.csv')
df

df = pd.read_csv('plots_pt/env.csv')
df

# plot graph and save as image
!python plot_csv_file.py --csv_file plots_pt/required_memory.csv \
    --figure_png_file=plots_pt/required_memory_plot.png \
    --no_log_scale \
    --short_model_names a-ware-roberta a-aware-xlm aodiniz-bert deepset-roberta mrm8488-long

# show image
from IPython.display import Image
Image('plots_pt/required_memory_plot.png')

!python run_benchmark.py --no_speed --save_to_csv \
    --inference_memory_csv_file plots_pt/required_memory_2.csv \
    --env_info_csv_file plots_pt/env.csv \
    --models aodiniz/bert_uncased_L-10_H-512_A-8_cord19-200616_squad2 \
             deepset/roberta-base-squad2 \
    --sequence_lengths 512 \
    --batch_sizes 64 128 256 512 \
    --no_env_print

# plot graph and save as image
!python plot_csv_file.py --csv_file plots_pt/required_memory_2.csv \
    --figure_png_file=plots_pt/required_memory_plot_2.png \
    --no_log_scale \
    --short_model_names aodiniz-bert deepset-roberta \
    --plot_along_batch

# show image
from IPython.display import Image
Image('plots_pt/required_memory_plot_2.png')

# create plots folder in /content
!mkdir -p plots_tf

!TF_CPP_MIN_LOG_LEVEL=3 python run_benchmark_tf.py --no_speed --save_to_csv \
    --inference_memory_csv_file plots_tf/required_memory_2.csv \
    --env_info_csv_file plots_tf/env.csv \
    --models aodiniz/bert_uncased_L-10_H-512_A-8_cord19-200616_squad2 \
             deepset/roberta-base-squad2 \
    --sequence_lengths 512 \
    --batch_sizes 64 128 256 512 \
    --no_env_print
# plot graph and save as image
!python plot_csv_file.py --csv_file plots_tf/required_memory_2.csv \
    --figure_png_file=plots_tf/required_memory_plot_2.png \
    --no_log_scale \
    --short_model_names aodiniz-bert deepset-roberta \
    --plot_along_batch

# show image
from IPython.display import Image
Image('plots_tf/required_memory_plot_2.png')

!TF_CPP_MIN_LOG_LEVEL=3 python run_benchmark_tf.py --no_memory --save_to_csv \
    --inference_time_csv_file plots_tf/time_2.csv \
    --env_info_csv_file plots_tf/env.csv \
    --models aodiniz/bert_uncased_L-10_H-512_A-8_cord19-200616_squad2 \
             deepset/roberta-base-squad2 \
    --sequence_lengths 8 32 128 512 \
    --batch_sizes 256 \
    --no_env_print

# plot graph and save as image
!python plot_csv_file.py --csv_file plots_tf/time_2.csv \
    --figure_png_file=plots_tf/time_plot_2.png \
    --no_log_scale \
    --short_model_names aodiniz-bert deepset-roberta \
    --is_time

# show image
from IPython.display import Image
Image('plots_tf/time_plot_2.png')

!TF_CPP_MIN_LOG_LEVEL=3 python run_benchmark_tf.py --no_memory --save_to_csv \
    --inference_time_csv_file plots_tf/time_xla_1.csv \
    --env_info_csv_file plots_tf/env.csv \
    --models aodiniz/bert_uncased_L-10_H-512_A-8_cord19-200616_squad2 \
    --sequence_lengths 512 \
    --batch_sizes 8 64 256 \
    --no_env_print \
    --use_xla

# Imports
from transformers import BartConfig, PyTorchBenchmark, PyTorchBenchmarkArguments

BartConfig.from_pretrained("facebook/bart-large-mnli").to_diff_dict()

config_baseline = BartConfig.from_pretrained("facebook/bart-large-mnli")

config_768_hidden = BartConfig.from_pretrained("facebook/bart-large-mnli", d_model=768)

config_8_heads = BartConfig.from_pretrained("facebook/bart-large-mnli", decoder_attention_heads=8, encoder_attention_heads=8)

config_10000_vocab = BartConfig.from_pretrained("facebook/bart-large-mnli", vocab_size=10000)

config_8_layers = BartConfig.from_pretrained("facebook/bart-large-mnli", encoder_layers=8, decoder_layers=8)

# define args
args = PyTorchBenchmarkArguments(
    models=["bart-base", "bart-768-hid", "bart-8-head", "bart-10000-voc", "bart-8-lay"],
    no_speed=True,
    no_inference=True,
    training=True,
    train_memory_csv_file="plots_pt/training_mem_fp16.csv",
    save_to_csv=True,
    env_info_csv_file="plots_pt/env.csv",
    sequence_lengths=[64, 128, 256, 512],
    batch_sizes=[8],
    no_env_print=True,
    fp16=True,  # let's train on fp16
)

# create benchmark
benchmark = PyTorchBenchmark(
    configs=[config_baseline, config_768_hidden, config_8_heads, config_10000_vocab, config_8_layers],
    args=args,
)

# run benchmark
result = benchmark.run()

# plot graph and save as image
!python plot_csv_file.py --csv_file plots_pt/training_mem_fp16.csv \
    --figure_png_file=plots_pt/training_mem_fp16.png \
    --no_log_scale

# show image
from IPython.display import Image
Image('plots_pt/training_mem_fp16.png')

# define args
args = PyTorchBenchmarkArguments(
    models=["bart-8-head", "bart-8-lay"],
    no_inference=True,
    training=True,
    no_memory=True,
    train_time_csv_file="plots_pt/training_speed_fp16.csv",
    save_to_csv=True,
    env_info_csv_file="plots_pt/env.csv",
    sequence_lengths=[32, 128, 512],
    batch_sizes=[8],
    no_env_print=True,
    repeat=1,  # to make speed measurement faster but less accurate
    no_multi_process=True,  # google colab has problems with multi processing
    fp16=True,
)

# create benchmark
benchmark = PyTorchBenchmark(configs=[config_8_heads, config_8_layers], args=args)

# run benchmark
result = benchmark.run()

# plot graph and save as image
!python plot_csv_file.py --csv_file plots_pt/training_speed_fp16.csv \
    --figure_png_file=plots_pt/training_speed_fp16.png \
    --no_log_scale \
    --is_time

# show image
from IPython.display import Image
Image('plots_pt/training_speed_fp16.png')
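
# Not part of the original notebook: a minimal sketch of inspecting the saved
# training-memory results directly with pandas instead of plot_csv_file.py.
# The column names ("model", "batch_size", "sequence_length", "result") are an
# assumption about the CSV layout written by the benchmark scripts and may
# differ between transformers versions; check df.columns first.
import pandas as pd

df = pd.read_csv("plots_pt/training_mem_fp16.csv")
print(df.columns)  # verify the assumed column names before pivoting

# one row per model, one column per sequence length, a memory value in each cell
pivot = df.pivot_table(index="model", columns="sequence_length", values="result")
print(pivot)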