ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • slurm job script example
    카테고리 없음 2024. 11. 8. 21:26

    job_script.sh

    #!/bin/bash
    #
    # Slurm batch script, submitted with `sbatch job_script.sh`.
    # The "#SBATCH" lines below are comments to bash but directives to sbatch;
    # sbatch only reads them while they appear before the first executable
    # command, so keep them at the top of the file.

    # Job name (shown in the queue listing):
    #SBATCH --job-name=hilcodec_training
    #
    # Project (account the job is charged to):
    #SBATCH --account=nn11068k
    #
    # Wall time limit, format days-hours:minutes:seconds (here: 5 minutes):
    #SBATCH --time=00-00:05:00
    #
    # Other parameters: "accel" partition, one GPU, 50 GB of memory,
    # 16 CPU cores for the task.
    #SBATCH --partition=accel
    #SBATCH --gres=gpu:1
    #SBATCH --mem=50G
    #SBATCH --cpus-per-task=16



    # Set up job environment:
    # Abort the job script as soon as a command exits with a non-zero status
    # (commands tested in a condition such as `if` or `||` are exempt).
    set -o errexit # Exit the script on any error

    # Initialize Conda without sourcing .bashrc.
    # The shell hook defines the `conda` shell function that `conda activate`
    # relies on in a non-interactive batch shell.
    MINIFORGE_INSTALL_PATH="/cluster/projects/nn11068k/miniforge3"
    conda_bin="${MINIFORGE_INSTALL_PATH}/bin/conda"
    # Fail with a clear message here: `eval "$(missing-command)"` would
    # otherwise succeed silently and the error would only surface later.
    if [[ ! -x "${conda_bin}" ]]; then
      printf 'error: conda executable not found at %s\n' "${conda_bin}" >&2
      exit 1
    fi
    eval "$("${conda_bin}" shell.bash hook)"

    # Load conda environment
    ENV_NAME="hilcodec_inductive_bias"
    conda activate "$ENV_NAME"
    # `conda activate` exports CONDA_DEFAULT_ENV, so report that instead of
    # spawning `conda info --envs` and scraping its table for the '*' marker.
    echo "Current Conda environment: ${CONDA_DEFAULT_ENV:-<none>}"

    # Check if PyTorch and CUDA can be loaded.
    # Prints the PyTorch version and whether CUDA is usable; when it is, also
    # prints the device count and the name of device 0. A failing import makes
    # python exit non-zero.
    # The Python source is passed inside double quotes, so it must not contain
    # `$`, backticks or double quotes. The body of the `if` has to be indented
    # relative to the other statements, otherwise Python raises
    # IndentationError.
    echo "Checking if PyTorch and CUDA are available..."
    python -c "
    import torch
    print(f'PyTorch loaded successfully. Version: {torch.__version__}')
    cuda_available = torch.cuda.is_available()
    print(f'CUDA available: {cuda_available}')
    if cuda_available:
        print(f'CUDA device count: {torch.cuda.device_count()}')
        print(f'CUDA device name: {torch.cuda.get_device_name(0)}')
    "
     
    # Authenticate with Weights & Biases (wandb).
    # The key is handed over through the WANDB_API_KEY environment variable
    # instead of as a `wandb login` argument, so it does not appear in the
    # process list of the shared node and is not written to the user's
    # credentials file. A WANDB_API_KEY already present in the submitting
    # environment takes precedence; otherwise replace the MY_API_KEY
    # placeholder below with your own key.
    WANDB_API_KEY="${WANDB_API_KEY:-MY_API_KEY}"
    export WANDB_API_KEY

    # Launch stage1.py from the project directory, handing it the checkpoint
    # file through --ckpt_path.
    printf '%s\n' "Run the script..."
    PROJECT_DIR="/cluster/projects/nn11068k/daesoo/hilcodec_inductive_bias"
    SCRIPT_FNAME="${PROJECT_DIR}/stage1.py"
    CKPT_PATH="${PROJECT_DIR}/ckpts/epoch=2-step=23320.ckpt"
    python "${SCRIPT_FNAME}" --ckpt_path "${CKPT_PATH}"

    The job can be submitted by running "$ sbatch job_script.sh"

    NB! Replace MY_API_KEY with your own wandb API key before submitting.

    Comments