SLURM Job Submit Guide#

āđāļ™āļ°āļ™āļģāđƒāļŦāđ‰āđ€āļ‚āļĩāļĒāļ™ SLURM script āļ›āļĢāļ°āļĄāļēāļ“āļ™āļĩāđ‰āļ„āļĢāļąāļš

Basic SLURM Script#

#!/bin/bash
#SBATCH --job-name=my_job
#SBATCH --partition=defq
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --gpus=1
#SBATCH --mem=16G
#SBATCH --time=01:00:00
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err

SCRATCHDIR=/scratch/$USER

mkdir -p $SCRATCHDIR
cp input.DAT $SCRATCHDIR/

srun python mycode.py $SCRATCHDIR/input.DAT

SBATCH Options Explained#

Option             Description
--job-name         Job name shown in the queue
--partition        Partition to use (this cluster has a single partition, defq)
--nodes            Number of nodes requested
--ntasks           Total number of tasks (processes)
--cpus-per-task    Number of CPU cores per task
--gpus             Number of GPUs requested
--mem              Amount of memory requested
--time             Maximum wall time for the job (format: HH:MM:SS)
--output           File for stdout (%j = job ID)
--error            File for stderr
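
The same options can also be given on the sbatch command line, where they take precedence over the matching #SBATCH lines in the script. For example, to resubmit the script above with a longer time limit and two GPUs (the values here are only illustrative):

sbatch --time=02:00:00 --gpus=2 myjob.sh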

āļāļēāļĢāđƒāļŠāđ‰ /scratch#

āđāļ™āļ°āļ™āļģāđƒāļŦāđ‰ copy āļ‚āđ‰āļ­āļĄāļđāļĨāđ„āļ›āļ—āļĩāđˆ /scratch āļāđˆāļ­āļ™āļĢāļąāļ™ job āđ€āļžāļ·āđˆāļ­āļ›āļĢāļ°āļŠāļīāļ—āļ˜āļīāļ āļēāļž I/O āļ—āļĩāđˆāļ”āļĩāļāļ§āđˆāļē āđāļĨāļ° copy āļœāļĨāļĨāļąāļžāļ˜āđŒāļāļĨāļąāļšāļĄāļēāđ€āļĄāļ·āđˆāļ­āđ€āļŠāļĢāđ‡āļˆ

SCRATCHDIR=/scratch/$USER/$SLURM_JOB_ID

# create the scratch directory
mkdir -p $SCRATCHDIR

# copy the input to scratch
cp input.DAT $SCRATCHDIR/

# run the job
srun python mycode.py $SCRATCHDIR/input.DAT

# copy the results back
cp $SCRATCHDIR/output.* $SLURM_SUBMIT_DIR/

# cleanup scratch
rm -rf $SCRATCHDIR
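
The cleanup at the end only runs if the script reaches it. One common variant (a minimal sketch, not something the cluster requires) combines set -e with a bash trap, so the scratch directory is removed even when an earlier step fails and the script exits early:

#!/bin/bash
# (same #SBATCH directives as in the basic script above)

set -e                                    # stop at the first failing command

SCRATCHDIR=/scratch/$USER/$SLURM_JOB_ID
mkdir -p $SCRATCHDIR

# remove scratch on every exit path, including an early exit caused by set -e
trap 'rm -rf "$SCRATCHDIR"' EXIT

cp input.DAT $SCRATCHDIR/
srun python mycode.py $SCRATCHDIR/input.DAT

# copy the results back; the EXIT trap then removes the scratch directory
cp $SCRATCHDIR/output.* $SLURM_SUBMIT_DIR/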

Multi-GPU Job#

#!/bin/bash
#SBATCH --job-name=multi_gpu
#SBATCH --partition=defq
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --gpus=4
#SBATCH --mem=64G
#SBATCH --time=04:00:00
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err

SCRATCHDIR=/scratch/$USER/$SLURM_JOB_ID
mkdir -p $SCRATCHDIR
cp -r data/ $SCRATCHDIR/

srun python train.py --data_dir $SCRATCHDIR/data --gpus 4

mkdir -p $SLURM_SUBMIT_DIR/checkpoints
cp $SCRATCHDIR/checkpoints/* $SLURM_SUBMIT_DIR/checkpoints/
rm -rf $SCRATCHDIR
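
Before the training step, it can be worth confirming that the job really received the GPUs it asked for. A minimal sketch, assuming nvidia-smi is installed on the compute nodes and that the cluster's GPU plugin exports CUDA_VISIBLE_DEVICES (both are assumptions, not guarantees of this cluster):

# print which GPUs SLURM made visible to this job
echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
srun nvidia-smi --query-gpu=index,name,memory.total --format=csv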

Multi-Node Job#

#!/bin/bash
#SBATCH --job-name=multi_node
#SBATCH --partition=defq
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --gpus-per-node=4
#SBATCH --mem=64G
#SBATCH --time=08:00:00
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err

SCRATCHDIR=/scratch/$USER/$SLURM_JOB_ID
mkdir -p $SCRATCHDIR
cp -r data/ $SCRATCHDIR/

# resolve the first node in the allocation; it acts as the master for the other nodes
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)

# \$SLURM_NODEID is escaped so that each srun task expands its own node rank;
# the unescaped variables are expanded once by the batch shell and are identical on every node
srun bash -c "python -m torch.distributed.run \
    --nproc_per_node=4 \
    --nnodes=$SLURM_NNODES \
    --node_rank=\$SLURM_NODEID \
    --master_addr=$MASTER_ADDR \
    --master_port=29500 \
    train.py --data_dir $SCRATCHDIR/data"

mkdir -p $SLURM_SUBMIT_DIR/checkpoints
cp $SCRATCHDIR/checkpoints/* $SLURM_SUBMIT_DIR/checkpoints/
rm -rf $SCRATCHDIR
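
An alternative launch pattern (a sketch, assuming the installed PyTorch supports the c10d rendezvous backend) lets torch.distributed.run assign node ranks itself through a shared rendezvous endpoint, so nothing node-specific has to appear on the command line:

MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)

srun python -m torch.distributed.run \
    --nnodes=$SLURM_NNODES \
    --nproc_per_node=4 \
    --rdzv_backend=c10d \
    --rdzv_endpoint=$MASTER_ADDR:29500 \
    --rdzv_id=$SLURM_JOB_ID \
    train.py --data_dir $SCRATCHDIR/data

Because every node runs an identical command, no bash -c quoting is needed with this pattern.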

Commonly Used Commands#

# submit job
sbatch myjob.sh

# view the queue
squeue -u $USER

# āļĒāļāđ€āļĨāļīāļ job
scancel <job_id>

# show job details
scontrol show job <job_id>

# show available partitions
sinfo

# show resource usage after the job finishes
sacct -j <job_id> --format=JobID,Elapsed,MaxRSS,MaxVMSize,State
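
Putting these together, a typical session might look like the following (the job ID 12345 is only an example):

sbatch myjob.sh                  # prints: Submitted batch job 12345
squeue -u $USER                  # wait for the state to change from PD (pending) to R (running)
tail -f slurm-12345.out          # follow the job's stdout while it runs
sacct -j 12345 --format=JobID,Elapsed,MaxRSS,State    # check usage once it has finished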