... | ... | @@ -75,7 +75,7 @@ Your job will get a number which you need in order to track the progress, errors |
|
|
job-ID prior name user state submit/start at queue slots ja-task-ID
|
|
|
|
|
|
------------------------------------------------------------------------------------------------------
|
|
|
517 0.00000 my_first_j chiel qw 06/09/2010 13:24:15 1
|
|
|
517 0.00000 my_first_j username qw 06/09/2010 13:24:15 1
|
|
|
````
|
|
|
`qstat -ext`
|
|
|
|
... | ... | @@ -200,48 +200,3 @@ If your job needs more memory then the default limit you need to specify that as |
|
|
|
|
|
`qsub -l h_vmem=12G test.sh`
|
|
|
|
|
|
|
|
|
### Checkpointing example
|
|
|
checkpoint_program.R:
|
|
|
|
|
|
args<-commandArgs(TRUE)
|
|
|
outfile<-args[1] #"test_levels1.out"
|
|
|
end<-10000
|
|
|
start<-1+as.integer(args[2])
|
|
|
if (start==1) write(c("i", "root_i"), file=outfile,ncol=2)
|
|
|
for (idx in start:end) {
|
|
|
res<-c(i=idx,root=sqrt(idx))
|
|
|
write(res, file=outfile,ncol=2,append=TRUE)
|
|
|
}
|
|
|
}}}
|
|
|
qsub example file: save this example as checkpoint_test.sh
|
|
|
|
|
|
#!/bin/bash
|
|
|
#$ -cwd
|
|
|
#$ -V
|
|
|
#$ -S /bin/bash
|
|
|
|
|
|
outfile=checkpoint_test.out
|
|
|
if [ -e $outfile ]
|
|
|
then
|
|
|
wc_out=`wc -l $outfile`
|
|
|
wcount=${wc_out%% *}
|
|
|
#echo $wcount
|
|
|
count=$(($wcount - 1)) # subtract 1 for the header line; the R script adds 1 to get the next index to compute
|
|
|
else
|
|
|
count=0
|
|
|
fi
|
|
|
R --vanilla --slave -f checkpoint_program.R --args $outfile $count
|
|
|
}}}
|
|
|
We can submit this job as:
|
|
|
|
|
|
#!div class=important style="border: 2pt solid; text-align: left"
|
|
|
qsub -q subordinate.q -ckpt check_userdefined ./checkpoint_test.sh
|
|
|
|
|
|
Try to suspend this job with:
|
|
|
|
|
|
#!div class=important style="border: 2pt solid; text-align: left"
|
|
|
qmod -s <job id>
|
|
|
|
|
|
You will then see the job state change to Rq (rerunning and queued) and then to Rr (rerunning). Your job will resume from the last output it wrote.
|
|
|
|