-
Slurm job script example — 카테고리 없음 — 2024. 11. 8. 21:26
job_script.sh
#!/bin/bash
#
# Slurm batch script: activates a Conda environment, verifies that PyTorch
# can be imported and reports CUDA availability, logs in to wandb, and then
# runs the stage-1 training script from a checkpoint.
#
# NOTE: all #SBATCH directives must stay above the first executable command;
# each one needs its own line starting with "#SBATCH".

# Job name:
#SBATCH --job-name=hilcodec_training
#
# Project:
#SBATCH --account=nn11068k
#
# Wall time limit:
#SBATCH --time=00-00:05:00
#
# Other parameters:
#SBATCH --partition=accel
#SBATCH --gres=gpu:1
#SBATCH --mem=50G
#SBATCH --cpus-per-task=16

# Set up job environment:
set -o errexit # Exit the script on any error

# Initialize Conda without sourcing .bashrc
MINIFORGE_INSTALL_PATH="/cluster/projects/nn11068k/miniforge3"
eval "$("${MINIFORGE_INSTALL_PATH}/bin/conda" shell.bash hook)"

# Load conda environment
ENV_NAME="hilcodec_inductive_bias"
conda activate "$ENV_NAME"
echo "Current Conda environment: $(conda info --envs | grep '*' | awk '{print $1}')"

# Check if PyTorch and CUDA can be loaded
echo "Checking if PyTorch and CUDA are available..."
# The Python snippet needs real newlines and an indented "if" body to parse.
python -c "
import torch
print(f'PyTorch loaded successfully. Version: {torch.__version__}')
cuda_available = torch.cuda.is_available()
print(f'CUDA available: {cuda_available}')
if cuda_available:
    print(f'CUDA device count: {torch.cuda.device_count()}')
    print(f'CUDA device name: {torch.cuda.get_device_name(0)}')
"

# log in wandb (MY_API_KEY is a placeholder and must be replaced)
wandb login MY_API_KEY

# Run Python script
echo "Run the script..."
SCRIPT_FNAME="/cluster/projects/nn11068k/daesoo/hilcodec_inductive_bias/stage1.py"
CKPT_PATH="/cluster/projects/nn11068k/daesoo/hilcodec_inductive_bias/ckpts/epoch=2-step=23320.ckpt"
python "$SCRIPT_FNAME" --ckpt_path "$CKPT_PATH"

# The job can be submitted by running "$ sbatch job_script.sh"
NB! Replace MY_API_KEY in the script with your own wandb API key before submitting the job.