# Switch the notebook's working directory to /workspace/axolotl so that the
# relative paths used by later cells (examples/, scripts/) resolve.
%cd /workspace/axolotl
/workspace/axolotl
# Write accelerate's default configuration to its standard location
# (~/.cache/huggingface/accelerate/default_config.yaml). The launch cell does
# not pass --config_file, so this is the file it relies on.
# NOTE: a --config_file placed before the `default` subcommand does not change
# where the file is saved, so it is omitted here.
!accelerate config default
Setting ds_accelerator to cuda (auto detect) accelerate configuration saved at /root/.cache/huggingface/accelerate/default_config.yaml
# Print the QLoRA training config that is passed to scripts/finetune.py, so the
# exact settings used for the run are recorded in the notebook.
!cat examples/openllama/qlora.yml
# Based on https://gist.github.com/fearnworks/723709806cebc67bafe1eb8138e7efbd base_model: openlm-research/open_llama_3b_600bt_preview base_model_config: openlm-research/open_llama_3b_600bt_preview model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer load_in_8bit: false load_in_4bit: true strict: false push_dataset_to_hub: datasets: # - path: AtlasUnified/Code-Instruct-Sets # data_files: # - unmasked-set-1.jsonl # - unmasked-set-2.jsonl # - unmasked-set-3.jsonl # - unmasked-set-4.jsonl # type: alpaca_code_instruct # - path: winglian/pygmalion-cleaned # data_files: # - v13_no_ai.cleaned.jsonl # type: pygmalion # shards: 4 # - path: winglian/evals # data_files: # - hf/ARC-Challenge.jsonl # - hf/ARC-Easy.jsonl # - hf/riddle_sense.jsonl # type: explainchoice:chat # - path: winglian/evals # data_files: # - hf/gsm8k.jsonl # - custom/logic_inference_oa.jsonl # type: alpaca_chat.load_qa # - path: winglian/evals # data_files: # - custom/in_context_qa.jsonl # type: context_qa # - path: winglian/evals # data_files: # - custom/in_context_qa.jsonl # type: context_qa.load_404 # - path: winglian/evals # data_files: # - custom/jokes_explained_500up.jsonl # type: sharegpt_jokes # - path: winglian/evals # data_files: # - custom/classify-self-chat.sharegpt.jsonl # - custom/coding-self-chat.sharegpt.jsonl # - custom/prose-gpt4.sharegpt.jsonl # - custom/prose-rewrite-gpt4.sharegpt.jsonl # type: sharegpt_simple # - path: winglian/evals # data_files: # - custom/guanaco-cleaned.en.jsonl # type: sharegpt_simple.load_guanaco # - path: winglian/evals # data_files: # - openai/tldr.jsonl # type: summarizetldr:chat # - path: winglian/evals # data_files: # - hellaswag/hellaswag.jsonl # type: explainchoice:chat # shards: 60 # - path: metaeval/ScienceQA_text_only # type: concisechoice:chat # shards: 13 # - path: teknium/GPTeacher-General-Instruct # data_files: # - gpt4-instruct-similarity-0.6-dataset.json # type: gpteacher:chat - path: QingyiSi/Alpaca-CoT data_files: # - 
chain-of-thought/formatted_cot_data/aqua_train.jsonl # - Chain-of-Thought/formatted_cot_data/creak_train.json # - Chain-of-Thought/formatted_cot_data/ecqa_train.json # - Chain-of-Thought/formatted_cot_data/esnli_train.json - Chain-of-Thought/formatted_cot_data/gsm8k_train.json # - Chain-of-Thought/formatted_cot_data/qasc_train.json # - Chain-of-Thought/formatted_cot_data/qed_train.json # - Chain-of-Thought/formatted_cot_data/sensemaking_train.json # - Chain-of-Thought/formatted_cot_data/strategyqa_train.json # - GPTeacher/Roleplay/formatted_roleplay-similarity_0.6-instruct-dataset.json type: "alpaca:chat" dataset_prepared_path: last_run_prepared val_set_size: 0.01 adapter: qlora lora_model_dir: sequence_len: 2048 max_packed_sequence_len: 2048 lora_r: 64 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: lora_target_linear: true lora_fan_in_fan_out: wandb_project: openllama-7b-qlora-gsm8k wandb_watch: wandb_run_id: wandb_log_model: checkpoint output_dir: ./qlora-out batch_size: 36 micro_batch_size: 9 num_epochs: 3 optimizer: paged_adamw_32bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false group_by_length: false bf16: true fp16: false tf32: true gradient_checkpointing: true # stop training after this many evaluation losses have increased in a row # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback early_stopping_patience: 3 resume_from_checkpoint: auto_resume_from_checkpoints: true local_rank: logging_steps: 1 xformers_attention: false flash_attention: gptq_groupsize: gptq_model_v1: warmup_steps: 10 eval_steps: 5 save_steps: 10 debug: deepspeed: weight_decay: 0.000001 fsdp: fsdp_config: special_tokens: bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>"
# Start fine-tuning: accelerate runs scripts/finetune.py with
# examples/openllama/qlora.yml as its config argument.
# The config sets auto_resume_from_checkpoints: true, so re-running this cell
# continues from the latest checkpoint found in output_dir (./qlora-out).
!accelerate launch scripts/finetune.py examples/openllama/qlora.yml
Setting ds_accelerator to cuda (auto detect) ===================================BUG REPORT=================================== Welcome to bitsandbytes. For bug reports, please run python -m bitsandbytes and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues ================================================================================ bin /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/libbitsandbytes_cuda118.so /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib64'), PosixPath('/usr/local/nvidia/lib')} warn(msg) /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths... warn(msg) /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')} warn(msg) /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIK1tFOFrWbmoa2ckCJYhzgBHKTSMeR/AeuScCCzugqlI utensilcandel@gmail.com')} warn(msg) CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths... 
/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: Found duplicate ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] files: {PosixPath('/usr/local/cuda/lib64/libcudart.so'), PosixPath('/usr/local/cuda/lib64/libcudart.so.11.0')}.. We'll flip a coin and try one of these, in order to fail forward. Either way, this might cause trouble in the future: If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env. warn(msg) CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so CUDA SETUP: Highest compute capability among GPUs detected: 8.6 CUDA SETUP: Detected CUDA version 118 CUDA SETUP: Loading binary /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/libbitsandbytes_cuda118.so... Setting ds_accelerator to cuda (auto detect) INFO:root:loading tokenizer... Using pad_token, but it is not set yet. INFO:root:Loading prepared packed dataset from disk at last_run_prepared/21a0611c6c2b67b31f00097fa2a91c26... INFO:root:Prepared packed dataset loaded from disk... INFO:root:loading model and peft_config... INFO:root:converting PEFT model w/ prepare_model_for_int8_training /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/peft/utils/other.py:76: FutureWarning: prepare_model_for_int8_training is deprecated and will be removed in a future version. Use prepare_model_for_kbit_training instead. warnings.warn( INFO:root:found linear modules: ['k_proj', 'gate_proj', 'q_proj', 'v_proj', 'o_proj', 'up_proj', 'down_proj'] trainable params: 101703680 || all params: 1917425280 || trainable%: 5.304179571472011 INFO:root:Compiling torch model INFO:root:Pre-saving adapter config to ./qlora-out INFO:root:Starting trainer... 
INFO:root:Using Auto-resume functionality to start with checkpoint at qlora-out/checkpoint-40 wandb: Currently logged in as: utensil. Use `wandb login --relogin` to force relogin wandb: Tracking run with wandb version 0.15.3 wandb: Run data is saved locally in /workspace/axolotl/wandb/run-20230531_043745-ggfx5q40 wandb: Run `wandb offline` to turn off syncing. wandb: Syncing run peach-feather-14 wandb: ⭐️ View project at https://wandb.ai/utensil/openllama-7b-qlora-gsm8k wandb: 🚀 View run at https://wandb.ai/utensil/openllama-7b-qlora-gsm8k/runs/ggfx5q40 {'loss': 0.7336, 'learning_rate': 0.0001, 'epoch': 1.71} {'loss': 0.7318, 'learning_rate': 9.493508311612874e-05, 'epoch': 1.75} {'loss': 0.7294, 'learning_rate': 8.98831678012568e-05, 'epoch': 1.79} {'loss': 0.7361, 'learning_rate': 8.485722224954237e-05, 'epoch': 1.83} {'loss': 0.692, 'learning_rate': 7.987014799113397e-05, 'epoch': 1.88} 62%|██████████████████████████▉ | 45/72 [04:57<06:00, 13.33s/it] {'eval_loss': 0.7622343897819519, 'eval_runtime': 4.0149, 'eval_samples_per_second': 1.993, 'eval_steps_per_second': 0.249, 'epoch': 1.88} 62%|██████████████████████████▉ | 45/72 [05:01<06:00, 13.33s/it] 100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 7.40it/s] {'loss': 0.7289, 'learning_rate': 7.493474677412794e-05, 'epoch': 1.92} {'loss': 0.7027, 'learning_rate': 7.006368770266421e-05, 'epoch': 1.96} {'loss': 0.7396, 'learning_rate': 6.526947471551798e-05, 'epoch': 2.0} 67%|████████████████████████████▋ | 48/72 [07:49<11:07, 27.80s/it]
The cells below are ad hoc commands for handling issues that came up during training.
# Install lsof, used to find out which processes have the NVIDIA device files
# open. apt-get with -y keeps the install non-interactive, since a notebook
# cell cannot answer a confirmation prompt.
!apt-get install -y lsof
Reading package lists... Done Building dependency tree... Done Reading state information... Done The following NEW packages will be installed: lsof 0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded. Need to get 253 kB of archives. After this operation, 458 kB of additional disk space will be used. Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 lsof amd64 4.93.2+dfsg-1.1build2 [253 kB] Fetched 253 kB in 1s (364 kB/s)0m debconf: delaying package configuration, since apt-utils is not installed 78Selecting previously unselected package lsof. (Reading database ... 21634 files and directories currently installed.) Preparing to unpack .../lsof_4.93.2+dfsg-1.1build2_amd64.deb ... 7Progress: [ 0%] [..........................................................] 87Progress: [ 20%] [###########...............................................] 8Unpacking lsof (4.93.2+dfsg-1.1build2) ... 7Progress: [ 40%] [#######################...................................] 8Setting up lsof (4.93.2+dfsg-1.1build2) ... 7Progress: [ 60%] [##################################........................] 87Progress: [ 80%] [##############################################............] 8 78
# List the processes that have NVIDIA device nodes open; the glob is expanded
# by the shell to every /dev/nvidia* entry.
# NOTE(review): the recorded output also contains /dev/null, /dev/urandom and
# /dev/tty rows — presumably lsof is reporting on the whole /dev filesystem
# here; filter on the NAME column when reading the result.
!lsof /dev/nvidia*
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME docker-in 1 root 0u CHR 1,3 0t0 6 /dev/null bash 7 root 0u CHR 1,3 0t0 6 /dev/null sshd 19 root 0u CHR 1,3 0t0 6 /dev/null sshd 19 root 1u CHR 1,3 0t0 6 /dev/null sshd 19 root 2u CHR 1,3 0t0 6 /dev/null jupyter-l 2308 root 0r CHR 1,3 0t0 6 /dev/null jupyter-l 2308 root 12r CHR 1,9 0t0 11 /dev/urandom python3 2541 root 4r CHR 1,9 0t0 11 /dev/urandom python3 2947 root mem CHR 195,255 472 /dev/nvidiactl python3 2947 root mem CHR 195,0 473 /dev/nvidia0 python3 2947 root mem CHR 234,0 481 /dev/nvidia-uvm python3 2947 root 3r CHR 1,9 0t0 11 /dev/urandom python3 2947 root 132u CHR 195,255 0t0 472 /dev/nvidiactl python3 2947 root 133u CHR 234,0 0t0 481 /dev/nvidia-uvm python3 2947 root 134u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 135u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 136u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 139u CHR 195,255 0t0 472 /dev/nvidiactl python3 2947 root 140u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 141u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 142u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 145u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 147u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 148u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 149u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 151u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 152u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 153u CHR 195,0 0t0 473 /dev/nvidia0 python3 2947 root 154u CHR 195,0 0t0 473 /dev/nvidia0 python3 3545 root 4r CHR 1,9 0t0 11 /dev/urandom python3 4493 root mem CHR 195,255 472 /dev/nvidiactl python3 4493 root mem CHR 195,0 473 /dev/nvidia0 python3 4493 root mem CHR 234,0 481 /dev/nvidia-uvm python3 4493 root 3r CHR 1,9 0t0 11 /dev/urandom python3 4493 root 132u CHR 195,255 0t0 472 /dev/nvidiactl python3 4493 root 133u CHR 234,0 0t0 481 /dev/nvidia-uvm python3 4493 root 134u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 135u CHR 195,0 0t0 473 
/dev/nvidia0 python3 4493 root 136u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 139u CHR 195,255 0t0 472 /dev/nvidiactl python3 4493 root 140u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 141u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 142u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 145u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 146u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 147u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 148u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 150u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 151u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 152u CHR 195,0 0t0 473 /dev/nvidia0 python3 4493 root 153u CHR 195,0 0t0 473 /dev/nvidia0 sh 4950 root 10u CHR 5,0 0t0 13 /dev/tty python3 5051 root mem CHR 195,255 472 /dev/nvidiactl python3 5051 root mem CHR 195,0 473 /dev/nvidia0 python3 5051 root mem CHR 234,0 481 /dev/nvidia-uvm python3 5051 root 3r CHR 1,9 0t0 11 /dev/urandom python3 5051 root 132u CHR 195,255 0t0 472 /dev/nvidiactl python3 5051 root 133u CHR 234,0 0t0 481 /dev/nvidia-uvm python3 5051 root 134u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 135u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 136u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 139u CHR 195,255 0t0 472 /dev/nvidiactl python3 5051 root 140u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 141u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 142u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 145u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 146u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 147u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 148u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 150u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 151u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 152u CHR 195,0 0t0 473 /dev/nvidia0 python3 5051 root 153u CHR 195,0 0t0 473 /dev/nvidia0 tmux:\x20 5801 root 0u CHR 1,3 0t0 6 /dev/null tmux:\x20 5801 root 1u CHR 1,3 0t0 6 /dev/null tmux:\x20 
5801 root 2u CHR 1,3 0t0 6 /dev/null nvitop 5817 root 3u CHR 195,255 0t0 472 /dev/nvidiactl nvitop 5817 root 4u CHR 195,0 0t0 473 /dev/nvidia0 nvitop 5817 root 5u CHR 195,0 0t0 473 /dev/nvidia0 nvitop 5817 root 6u CHR 195,0 0t0 473 /dev/nvidia0
# Stop leftover fine-tuning processes.
# pkill -f matches against the full command line; the [p] bracket keeps the
# pattern from matching the shell that runs this very cell. SIGTERM goes first
# so the trainer can shut down cleanly; SIGKILL follows after a short grace
# period for anything that did not exit.
!pkill -TERM -f '[p]ython.*finetune' && sleep 5 && pkill -KILL -f '[p]ython.*finetune'
# Force-kill a single process by PID.
# NOTE(review): 2960 is a hard-coded PID from the session in which this was
# run; look up the current PID (e.g. with ps or lsof) before re-running.
!kill -9 2960
# List the running Python processes to confirm that no training process is
# left. The grep command and the wrapping `sh -c` show up in the list as well,
# because their own command lines contain the word "python".
!ps aux|grep python
root 2353 0.7 0.0 576260 110108 ? Sl 12:51 0:05 /root/miniconda3/envs/py3.9/bin/python3 /root/miniconda3/envs/py3.9/bin/jupyter-lab --allow-root --no-browser --port=8888 --ip=* --ServerApp.token=sc --ServerApp.allow_origin=* --ServerApp.preferred_dir=/workspace/ root 2636 1.6 0.0 770824 63020 ? Ssl 12:52 0:12 /root/miniconda3/envs/py3.9/bin/python3 -m ipykernel_launcher -f /root/.local/share/jupyter/runtime/kernel-b2638c7c-467b-4866-a969-c97f1b037796.json root 3776 3.5 0.0 316080 90152 pts/2 Sl+ 12:55 0:19 /root/miniconda3/envs/py3.9/bin/python3 /root/miniconda3/envs/py3.9/bin/nvitop -m full root 5019 0.0 0.0 2880 952 pts/3 Ss+ 13:04 0:00 /usr/bin/sh -c ps aux|grep python root 5022 0.0 0.0 3836 1968 pts/3 S+ 13:04 0:00 grep python
# Print the current working directory that shell commands run in.
!pwd
# Install the zip utility used to archive the prepared dataset.
# apt-get with -y keeps the install non-interactive, since a notebook cell
# cannot answer a confirmation prompt.
!apt-get install -y zip
# Archive the prepared (packed) dataset cache directory recursively.
# Input paths go directly after the archive name; -x and -i are only for
# exclude/include patterns and must not precede the directory to be archived.
!zip -r last_run_prepared.zip last_run_prepared
# Install nvitop, the GPU process monitor started in the next cell.
!pip install nvitop
# Show GPU utilisation and per-process usage with nvitop in "full" display mode.
# NOTE(review): nvitop is an interactive monitor, so this cell presumably keeps
# running until interrupted — consider running it in a terminal, or using
# nvitop's one-shot mode, when a single snapshot is enough.
!nvitop -m full