forked from SamsungSAILMontreal/TinyRecursiveModels
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlaunch_rloo_pretrain_array.sh
More file actions
72 lines (62 loc) · 3.08 KB
/
Copy pathlaunch_rloo_pretrain_array.sh
File metadata and controls
72 lines (62 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Stop the launcher on the first unhandled non-zero exit status.
set -o errexit

# Banner line followed by one blank line.
printf '%s\n\n' "Setting up SLURM array job..."
# ============================================================================
# CHECKPOINT LIST CONFIGURATION
# ============================================================================
# One entry per SLURM array task:
#   - ""            -> the task starts from scratch (no checkpoint)
#   - "<some/path>" -> the task uses the checkpoint at that path
#
# SLURM_ARRAY_TASK_ID is 1-indexed while Bash arrays are 0-indexed, so
# checkpoint_list[0] goes to task 1, checkpoint_list[1] goes to task 2, etc.
#
# Example for 7 tasks (array 1-7):
#   checkpoint_list=(
#     ""                                  # Task 1: no checkpoint
#     "path/to/checkpoint1/step_65100"    # Task 2: use checkpoint1
#     "path/to/checkpoint2/step_65100"    # Task 3: use checkpoint2
#     ""                                  # Task 4: no checkpoint
#     "path/to/checkpoint3/step_65100"    # Task 5: use checkpoint3
#     "path/to/checkpoint4/step_65100"    # Task 6: use checkpoint4
#     ""                                  # Task 7: no checkpoint
#   )
#
# NOTE(review): the list below holds 8 entries, whereas this comment used to
# describe the array as "currently 1-7 = 7 tasks". Confirm the entry count
# matches the array range requested by rloo_pretrain_array.sh (presumably its
# --array setting).

# Every task currently resumes from the same checkpoint; replace individual
# entries (or use "") to give a task a different starting point.
resume_ckpt="checkpoints/Tiny Recursion Policy/sn_NTE_cont/step_32540"
checkpoint_list=(
  "$resume_ckpt"  # Task 1
  "$resume_ckpt"  # Task 2
  "$resume_ckpt"  # Task 3
  "$resume_ckpt"  # Task 4
  "$resume_ckpt"  # Task 5
  "$resume_ckpt"  # Task 6
  "$resume_ckpt"  # Task 7
  "$resume_ckpt"  # Task 8
)
# ============================================================================
# Make sure the SLURM script is executable
chmod +x rloo_pretrain_array.sh

#######################################
# Write a Bash-sourceable file that defines the `checkpoint_list` array.
# Each entry is serialised with printf %q, so paths containing spaces, quotes,
# `$`, backticks or backslashes are restored verbatim when the file is sourced
# (wrapping entries in literal double quotes would let such characters corrupt
# the file or be expanded at source time).
# Arguments:
#   $1   - path of the file to write (overwritten if it exists)
#   $2.. - checkpoint paths, one per array task; "" means "no checkpoint"
# Outputs:
#   Writes the generated file; prints nothing.
# Returns:
#   0 on success, non-zero if the file cannot be written.
#######################################
write_checkpoint_file() {
  local out_path=$1
  shift
  local quoted=''
  # With no entries, keep `quoted` empty so the result is an empty array
  # rather than an array holding a single empty string.
  if (( $# > 0 )); then
    printf -v quoted '%q ' "$@"
  fi
  {
    printf '%s\n' '# Auto-generated checkpoint list (do not edit manually)'
    printf '%s\n' '# Generated by launch_rloo_pretrain_array.sh'
    printf 'checkpoint_list=(%s)\n' "${quoted% }"
  } > "$out_path"
}

# Write checkpoint list to a file that rloo_pretrain_array.sh will source
checkpoint_file=".checkpoint_list.sh"
write_checkpoint_file "$checkpoint_file" "${checkpoint_list[@]}"
echo "Checkpoint list written to $checkpoint_file"
# Ensure the log and output directories exist (no-op when already present).
for dir in logs out; do
  mkdir -p "$dir"
done

# Load conda's Bash integration into the current shell, then switch to the
# "trp" environment.
eval "$(conda shell.bash hook)"
conda activate trp
# Submit the array job
echo "Submitting SLURM array job..."

# Capture sbatch's output on its own instead of piping it straight into awk:
# in a pipeline only awk's exit status is visible, so a failed submission
# would go unnoticed and leave job_id empty.
if ! sbatch_output=$(sbatch rloo_pretrain_array.sh); then
  echo "Error: sbatch failed; no array job was submitted." >&2
  exit 1
fi

# sbatch reports success as "Submitted batch job <id>", so the ID is field 4
# of that line. Anything that is not a plain number is treated as a failure.
job_id=$(awk '/^Submitted batch job/ {print $4; exit}' <<< "$sbatch_output")
if [[ ! "$job_id" =~ ^[0-9]+$ ]]; then
  printf 'Error: could not parse a job ID from sbatch output: %s\n' \
    "$sbatch_output" >&2
  exit 1
fi

echo "Note: Checkpoint file $checkpoint_file will be used by the array jobs."
echo " You can remove it after all jobs complete if desired."

# Create output directory for this job
# NOTE(review): this directory only exists after submission. If the batch
# script writes its output under out/<job_id>/ (as the tail hint below
# suggests), a task that starts before this mkdir may be unable to open its
# output file — confirm, and consider submitting held and releasing afterwards.
mkdir -p "out/${job_id}"

echo "Array job submitted with ID: $job_id"
echo ""
echo "To monitor the jobs:"
echo " squeue -u \$USER"
echo " squeue -j $job_id"
echo ""
echo "To cancel all jobs in the array:"
echo " scancel $job_id"
echo ""
echo "To check job status:"
echo " sacct -j $job_id --format=JobID,JobName,State,ExitCode,Start,End"
echo ""
echo "To view real-time output of a specific task (e.g., task 1):"
echo " tail -f out/${job_id}/rlooarr_${job_id}_1.out"