Comprehensive guide to using the Ruqola server’s custom GPU queue management system for efficient resource sharing.
Our custom GPU queue system (gpuq) manages fair access to the server's 3 H200 GPUs.
# Check current GPU status and queue
gpuq status
# Submit a simple training job
gpuq submit --command "python train.py"
# Submit with specific requirements
gpuq submit --command "python large_model.py" --gpus 2 --memory 40 --time 12
# Check your jobs
gpuq status | grep $USER
# Kill a specific job
gpuq kill --job-id 12345
gpuq status
nvidia-smi
gpuq submit --command "python -c 'print(\"Hello GPU!\")'" --time 1
watch -n 5 gpuq status
# Minimal submission (uses defaults)
gpuq submit --command "python train.py"
# Specify all parameters
gpuq submit \
  --command "python train.py --epochs 100 --batch-size 32" \
  --gpus 1 \
  --memory 40 \
  --time 8 \
  --email "your-email@example.com"
# Single GPU (default)
gpuq submit --command "python train.py" --gpus 1
# Multi-GPU training (torchrun replaces the deprecated torch.distributed.launch)
gpuq submit --command "torchrun --nproc_per_node=2 train.py" --gpus 2
# All available GPUs
gpuq submit --command "python multi_gpu_train.py" --gpus 3
# Specify memory per GPU (in GB)
gpuq submit --command "python big_model.py" --memory 60
# For memory-intensive models
gpuq submit --command "python huge_model.py" --memory 75
# Conservative memory usage
gpuq submit --command "python small_model.py" --memory 20
# Short experiments (1 hour)
gpuq submit --command "python quick_test.py" --time 1
# Medium training (8 hours)
gpuq submit --command "python train.py" --time 8
# Long training (24 hours - maximum)
gpuq submit --command "python long_train.py" --time 24
# PyTorch training
gpuq submit --command "python train.py --model resnet50 --epochs 100" --gpus 1 --memory 30 --time 12
# TensorFlow training
gpuq submit --command "python tf_train.py --model_dir ./models" --gpus 1 --memory 25 --time 8
# Distributed training
gpuq submit --command "torchrun --nproc_per_node=2 distributed_train.py" --gpus 2 --memory 40 --time 16
# Start Jupyter on port 8888
gpuq submit --command "jupyter notebook --ip=0.0.0.0 --port=8888 --no-browser" --gpus 1 --time 8
# JupyterLab with custom port
gpuq submit --command "jupyter lab --ip=0.0.0.0 --port=9999 --no-browser" --gpus 1 --memory 30 --time 4
# Jupyter with specific working directory
gpuq submit --command "cd /path/to/project && jupyter notebook --ip=0.0.0.0 --port=8888" --gpus 1 --time 6
# Large dataset preprocessing
gpuq submit --command "python preprocess_data.py --dataset imagenet" --gpus 1 --memory 50 --time 4
# Feature extraction
gpuq submit --command "python extract_features.py --model vit_large" --gpus 1 --memory 35 --time 3
# Overall system status
gpuq status
# Detailed status with job information
gpuq status --detailed
# Monitor in real-time
watch -n 5 gpuq status
# Check only your jobs
gpuq status | grep $USER
# Real-time GPU monitoring
nvidia-smi -l 1
# GPU utilization and memory
nvidia-smi --query-gpu=index,name,temperature.gpu,utilization.gpu,memory.used,memory.total --format=csv
# Continuous monitoring with better formatting
watch -n 2 'nvidia-smi --query-gpu=index,name,temperature.gpu,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits'
# View job output logs
tail -f /tmp/gpu_queue/logs/job_12345_stdout.log
# View error logs
tail -f /tmp/gpu_queue/logs/job_12345_stderr.log
# Search for specific patterns in logs
grep -i "error\|warning" /tmp/gpu_queue/logs/job_12345_stderr.log
# Kill a specific job
gpuq kill --job-id 12345
# Kill all your jobs (be careful!)
gpuq status | grep $USER | awk '{print $1}' | xargs -I {} gpuq kill --job-id {}
# Check whether a job completed successfully by reviewing its stdout/stderr logs;
# `echo $?` only reports the exit status of the last command run directly in
# your own shell (0 = success), not of a queued job
echo $?
# Set CUDA devices within job
gpuq submit --command "CUDA_VISIBLE_DEVICES=0 python train.py"
# Use a specific conda environment (conda activate can fail in non-interactive
# shells; `conda run -n myenv python train.py` is a more robust alternative)
gpuq submit --command "conda activate myenv && python train.py"
# Set multiple environment variables
gpuq submit --command "export PYTHONPATH=/path/to/modules && python train.py"
# Chain multiple commands
gpuq submit --command "cd /path/to/project && python preprocess.py && python train.py"
# Conditional execution
gpuq submit --command "python train.py && python evaluate.py || echo 'Training failed'"
# Background processes within a job (the trailing `wait` keeps the job's shell
# alive until the backgrounded process finishes)
gpuq submit --command "python train.py > output.log 2>&1 & wait"
# Memory-efficient training with gradient checkpointing
gpuq submit --command "python train.py --gradient-checkpointing --batch-size 16" --memory 35
# Mixed precision training
gpuq submit --command "python train.py --fp16 --batch-size 64" --memory 25
# Model parallelism
gpuq submit --command "python train.py --model-parallel" --gpus 2 --memory 60
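Flags such as --fp16 and --gradient-checkpointing only have an effect if train.py implements them. As a reference, a minimal mixed-precision training step with PyTorch AMP might look like the sketch below; the model, optimizer, and data are placeholders.

import torch

model = torch.nn.Linear(512, 10).cuda()          # placeholder model
optimizer = torch.optim.AdamW(model.parameters())
scaler = torch.cuda.amp.GradScaler()             # scales the loss to avoid fp16 underflow

for step in range(100):                          # placeholder training loop
    x = torch.randn(64, 512, device="cuda")
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():              # forward pass runs in mixed precision
        loss = model(x).sum()
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()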
# Interactive Python session
gpuq submit --command "python -i" --gpus 1 --time 2
# Interactive shell with GPU access
gpuq submit --command "bash" --gpus 1 --time 1
# Remote development session
gpuq submit --command "code-server --bind-addr 0.0.0.0:8080" --gpus 1 --time 8
# Good: Specific requirements
gpuq submit --command "python train.py" --gpus 1 --memory 30 --time 8
# Bad: Excessive resources
gpuq submit --command "python train.py" --gpus 3 --memory 75 --time 24
# Check if you're using allocated resources efficiently
nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv -l 10
# Test with small dataset/short training first
python train.py --epochs 1 --batch-size 8
# Good
gpuq submit --command "cd /home/user/project && python train.py"
# Bad (relative paths may not work)
gpuq submit --command "python ../train.py"
gpuq submit --command "python train.py --output-dir /home/user/results/exp1"
# config.yaml
model:
  name: "resnet50"
  batch_size: 32
training:
  epochs: 100
  lr: 0.001
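The training script is then responsible for reading this file. A minimal sketch using PyYAML, with keys matching the example config above:

import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

batch_size = cfg["model"]["batch_size"]   # 32 in the example above
epochs = cfg["training"]["epochs"]        # 100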
# Save checkpoints regularly
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}, f'checkpoint_epoch_{epoch}.pth')
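To survive crashes or time-limit kills, the script should also be able to resume from the latest checkpoint. A sketch that mirrors the keys used in the save call above; model and optimizer are the same hypothetical objects being trained.

import torch

def load_checkpoint(model, optimizer, path):
    """Restore training state saved by the torch.save call above."""
    checkpoint = torch.load(path, map_location="cuda")
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    return checkpoint["epoch"] + 1   # epoch to resume from

# usage: start_epoch = load_checkpoint(model, optimizer, "checkpoint_epoch_42.pth")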
import wandb  # or tensorboard, mlflow
wandb.init(project="my-project")  # call once at startup; project name is an example
wandb.log({"loss": loss, "accuracy": acc, "epoch": epoch})
# 1. Prepare data and code
cd /home/user/myproject
ls -la # check files are ready
# 2. Test locally with small dataset
python train.py --epochs 1 --batch-size 4 --debug
# 3. Submit full training job
gpuq submit \
  --command "python train.py --epochs 100 --batch-size 32 --save-dir ./models" \
  --gpus 1 \
  --memory 40 \
  --time 12 \
  --email "user@example.com"
# 4. Monitor progress
watch -n 10 gpuq status
tail -f /tmp/gpu_queue/logs/job_XXXXX_stdout.log
# Submit multiple jobs with different hyperparameters
for lr in 0.001 0.01 0.1; do
  for bs in 16 32 64; do
    gpuq submit \
      --command "python train.py --lr $lr --batch-size $bs --name lr${lr}_bs${bs}" \
      --gpus 1 --memory 30 --time 8
  done
done
# Batch inference on large dataset
gpuq submit \
  --command "python inference.py --model-path ./best_model.pth --data-dir ./test_data" \
  --gpus 1 \
  --memory 25 \
  --time 4
# Start Jupyter for development
gpuq submit \
  --command "jupyter lab --ip=0.0.0.0 --port=8888 --no-browser" \
  --gpus 1 \
  --memory 30 \
  --time 8
# Connect via SSH tunnel (from your local machine)
ssh -L 8888:localhost:8888 user@server.com
# Open http://localhost:8888 in your browser
# Check queue status
gpuq status
# Common causes:
# 1. All GPUs busy - wait or reduce resource requirements
# 2. Requesting more memory than available (max ~75GB per H200)
# 3. Syntax error in command
# Debugging
gpuq submit --command "echo 'Test job'" --gpus 1 --time 1
# Check job logs for errors
tail -100 /tmp/gpu_queue/logs/job_XXXXX_stderr.log
# Common causes:
# 1. Out of memory - reduce batch size or model size
# 2. Time limit exceeded - increase time limit
# 3. Code error - check stderr logs
# Check current GPU memory usage
nvidia-smi
# Solutions:
# 1. Reduce batch size
# 2. Use gradient accumulation
# 3. Enable gradient checkpointing
# 4. Use mixed precision (fp16)
# Example with memory optimization
gpuq submit \
  --command "python train.py --batch-size 16 --gradient-checkpointing --fp16" \
  --gpus 1 --memory 30
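Of the solutions listed above, gradient accumulation is the least obvious to wire up: it keeps the effective batch size while cutting per-step memory. A minimal sketch assuming a standard PyTorch loop; the model, data, and step count are placeholders.

import torch

accumulation_steps = 4                    # effective batch = micro-batch size * 4
model = torch.nn.Linear(512, 10).cuda()   # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

optimizer.zero_grad()
for i in range(100):                      # placeholder loop over micro-batches
    x = torch.randn(16, 512, device="cuda")
    loss = model(x).sum()
    (loss / accumulation_steps).backward()  # scale so accumulated gradients average out
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()                    # update weights once per accumulated batch
        optimizer.zero_grad()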
# Check if job is still running
gpuq status | grep 12345
# Find log files
ls -la /tmp/gpu_queue/logs/job_*
# Check permissions
ls -la /tmp/gpu_queue/logs/job_XXXXX_*.log
# Check GPU utilization
nvidia-smi -l 1
# If utilization is low:
# 1. Increase batch size
# 2. Check data loading (use more workers)
# 3. Profile your code
# Example with optimized data loading
gpuq submit \
  --command "python train.py --num-workers 8 --batch-size 64" \
  --gpus 1
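The --num-workers flag only helps if the training script forwards it to its DataLoader. The relevant PyTorch settings look roughly like this; the dataset here is a placeholder.

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(10_000, 128))   # placeholder dataset

loader = DataLoader(
    dataset,
    batch_size=64,
    num_workers=8,        # parallel data-loading worker processes keep the GPU fed
    pin_memory=True,      # faster host-to-GPU transfers
    shuffle=True,
)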
# Monitor memory usage over time
nvidia-smi --query-gpu=memory.used --format=csv -l 60 > memory_usage.log
# In Python code, use memory profiling
pip install memory-profiler
python -m memory_profiler train.py
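Note that memory-profiler reports host (CPU) RAM rather than GPU memory, and it only reports on functions decorated with @profile. A minimal example; train_step is a hypothetical function standing in for your own code.

from memory_profiler import profile

@profile                          # line-by-line memory report printed when the function runs
def train_step():
    data = [0] * 10_000_000       # placeholder allocation
    return sum(data)

if __name__ == "__main__":
    train_step()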
# Check queue and GPU status
gpuq status
# Job stdout/stderr logs are written here
/tmp/gpu_queue/logs/
# Quick test that a GPU job runs
gpuq submit --command "nvidia-smi"
Next Steps: Once you’re comfortable with the queue system, check out the framework-specific guides: