From 2dc3bba80b73584926a47c0f206c46eb63afbca2 Mon Sep 17 00:00:00 2001 From: Mehdi Dogguy <mehdi@debian.org> Date: Mon, 8 Sep 2014 22:39:01 +0200 Subject: [PATCH] Imported Upstream version 14.03.2 --- META | 6 +- NEWS | 36 +++ contribs/perlapi/libslurm/perl/typemap | 7 +- doc/html/accounting.shtml | 4 +- doc/html/cray.shtml | 11 +- doc/html/download.shtml | 4 +- doc/html/faq.shtml | 93 +++--- doc/html/high_throughput.shtml | 7 +- doc/html/job_array.shtml | 4 +- doc/html/quickstart_admin.shtml | 8 +- doc/html/team.shtml | 3 +- doc/man/man1/salloc.1 | 29 +- doc/man/man1/sbatch.1 | 28 +- doc/man/man1/scontrol.1 | 14 +- doc/man/man1/srun.1 | 30 +- doc/man/man5/slurm.conf.5 | 9 +- etc/cgroup.release_common.example | 12 +- src/api/job_info.c | 16 +- src/common/read_config.c | 7 +- src/common/slurm_acct_gather.c | 45 +++ src/common/slurm_acct_gather.h | 2 + src/common/slurm_jobcomp.c | 1 + src/common/slurm_jobcomp.h | 2 + src/common/util-net.c | 43 ++- src/common/util-net.h | 19 +- .../jobcomp/filetxt/filetxt_jobcomp_process.c | 6 + src/plugins/sched/backfill/backfill.c | 23 +- .../select/bluegene/bg_record_functions.c | 20 +- src/plugins/select/cray/select_cray.c | 85 +++--- src/plugins/switch/nrt/nrt.c | 19 +- src/sacct/print.c | 2 +- src/salloc/opt.c | 27 +- src/salloc/opt.h | 1 + src/salloc/salloc.c | 2 + src/sbatch/opt.c | 57 ++-- src/sbatch/opt.h | 1 + src/sbatch/sbatch.c | 3 + src/scontrol/scontrol.c | 12 +- src/scontrol/update_job.c | 32 ++- src/slurmctld/acct_policy.c | 270 ++++++++---------- src/slurmctld/acct_policy.h | 16 ++ src/slurmctld/agent.c | 28 +- src/slurmctld/controller.c | 4 +- src/slurmctld/groups.c | 9 +- src/slurmctld/job_mgr.c | 51 ++-- src/slurmctld/job_scheduler.c | 166 +++++------ src/slurmctld/job_scheduler.h | 9 +- src/slurmctld/node_scheduler.c | 2 + src/slurmctld/slurmctld.h | 1 + src/slurmd/slurmd/req.c | 54 ++-- src/slurmd/slurmstepd/fname.c | 21 +- src/slurmd/slurmstepd/slurmstepd_job.c | 30 +- src/slurmd/slurmstepd/slurmstepd_job.h | 2 +- src/squeue/print.c | 19 +- src/srun/libsrun/allocate.c | 2 + src/srun/libsrun/opt.c | 26 +- src/srun/libsrun/opt.h | 1 + src/srun/libsrun/srun_job.c | 7 + testsuite/expect/globals | 33 ++- testsuite/expect/globals_accounting | 10 +- testsuite/expect/inc21.21.4 | 2 +- testsuite/expect/inc21.30.2 | 4 +- testsuite/expect/inc21.30.5 | 2 +- testsuite/expect/inc21.30.6 | 2 +- testsuite/expect/test1.89 | 30 +- testsuite/expect/test1.90 | 4 +- testsuite/expect/test21.21 | 4 +- testsuite/expect/test3.4 | 22 +- 68 files changed, 965 insertions(+), 596 deletions(-) diff --git a/META b/META index 42af05136..3424f9729 100644 --- a/META +++ b/META @@ -9,9 +9,9 @@ Name: slurm Major: 14 Minor: 03 - Micro: 1 - Version: 14.03.1 - Release: 2 + Micro: 2 + Version: 14.03.2 + Release: 1 ## # When changing API_CURRENT update src/common/slurm_protocol_common.h diff --git a/NEWS b/NEWS index 07324e530..57af14e2f 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,38 @@ documents those changes that are of interest to users and admins. * Changes in Slurm 14.03.2 ========================== + -- Fix race condition if PrologFlags=Alloc,NoHold is used. + -- Cray - Make NPC only limit running other NPC jobs on shared blades instead + of limited non NPC jobs. + -- Fix for sbatch #PBS -m (mail) option parsing. + -- Fix job dependency bug. Jobs dependent upon multiple other jobs may start + prematurely. + -- Set "Reason" field for all elements of a job array on short-circuited + scheduling for job arrays. 
+ -- Allow -D option of salloc/srun/sbatch to specify relative path. + -- Added SchedulerParameter of batch_sched_delay to permit many batch jobs + to be submitted between each scheduling attempt to reduce overhead of + scheduling logic. + -- Added job reason of "SchedTimeout" if the scheduler was not able to reach + the job to attempt scheduling it. + -- Add job's exit state and exit code to email message. + -- scontrol hold/release accepts job name option (in addition to job ID). + -- Handle when trying to cancel a step that hasn't started yet better. + -- Handle Max/GrpCPU limits better + -- Add --priority option to salloc, sbatch and srun commands. + -- Honor partition priorities over job priorities. + -- Fix sacct -c when using jobcomp/filetxt to read newer variables + -- Fix segfault of sacct -c if spaces are in the variables. + -- Release held job only with "scontrol release <jobid>" and not by resetting + the job's priority. This is needed to support job arrays better. + -- Correct squeue command not to merge jobs with state pending and completing + together. + -- Fix issue where user is requesting --acctg-freq=0 and no memory limits. + -- Fix issue with GrpCPURunMins if a job's timelimit is altered while the job + is running. + -- Temporary fix for handling our typemap for the perl api with newer perl. + -- Fix allowgroup on bad group seg fault with the controller. + -- Handle node ranges better when dealing with accounting max node limits. * Changes in Slurm 14.03.1-2 ========================== @@ -389,6 +421,10 @@ documents those changes that are of interest to users and admins. * Changes in Slurm 2.6.10 ========================= -- Switch/nrt - On switch resource allocation failure, free partial allocation. + -- Switch/nrt - Properly track usage of CAU and RDMA resources with multiple + tasks per compute node. + -- Fix issue where user is requesting --acctg-freq=0 and no memory limits. + -- BGQ - Temp fix issue where job could be left on job_list after it finished. * Changes in Slurm 2.6.9 ======================== diff --git a/contribs/perlapi/libslurm/perl/typemap b/contribs/perlapi/libslurm/perl/typemap index 3d66a594d..d42cad717 100644 --- a/contribs/perlapi/libslurm/perl/typemap +++ b/contribs/perlapi/libslurm/perl/typemap @@ -33,8 +33,7 @@ T_SLURM T_PTROBJ_SLURM - sv_setref_pv( $arg, \"${eval(`cat classmap`);\$class_map->{$ntype}}\", (void*)$var ); - + sv_setref_pv( $arg, \"$ntype\", (void*)$var ); ##################################### INPUT @@ -49,12 +48,12 @@ T_SLURM } T_PTROBJ_SLURM - if (sv_isobject($arg) && (SvTYPE(SvRV($arg)) == SVt_PVMG) && sv_derived_from($arg, \"${eval(`cat classmap`);\$class_map->{$ntype}}\")) { + if (sv_isobject($arg) && (SvTYPE(SvRV($arg)) == SVt_PVMG)) { IV tmp = SvIV((SV*)SvRV($arg)); $var = INT2PTR($type,tmp); } else { Perl_croak(aTHX_ \"%s: %s is not of type %s\", ${$ALIAS?\q[GvNAME(CvGV(cv))]:\qq[\"$pname\"]}, - \"$var\", \"${eval(`cat classmap`);\$class_map->{$ntype}}\"); + \"$var\", \"$ntype\"); } diff --git a/doc/html/accounting.shtml b/doc/html/accounting.shtml index 442788a8f..25c6b7d8a 100644 --- a/doc/html/accounting.shtml +++ b/doc/html/accounting.shtml @@ -117,7 +117,7 @@ users should have consistent names and IDs.</p> <p>The best way to insure security of the data is by authenticating communications to the SlurmDBD and we recommend -<a href="http://munge.googlecode.com/">MUNGE</a> for that purpose. +<a href="https://code.google.com/p/munge/">MUNGE</a> for that purpose. 
If you have one cluster managed by SLURM and execute the SlurmDBD on that one cluster, the normal MUNGE configuration will suffice. Otherwise MUNGE should then be installed on all nodes of all @@ -770,7 +770,7 @@ as deleted. If an entity has existed for less than 1 day, the entity will be removed completely. This is meant to clean up after typographic errors.</p> -<p style="text-align: center;">Last modified 3 April 2014</p> +<p style="text-align:center;">Last modified 30 April 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/cray.shtml b/doc/html/cray.shtml index a1162fce6..8804d6319 100644 --- a/doc/html/cray.shtml +++ b/doc/html/cray.shtml @@ -44,15 +44,18 @@ There are 2 different types of counters, system and blade. <p> For the system option (--network=system) only one job can use system at - a time. This option will allocate every node in the cluster. If - steps inside the allocation wish to run on less than the full system - they must specify the node count for each step. + a time. Only nodes requested will be marked in use for the job + allocation. If the job does not fill up the entire system the rest + of the nodes are not able to be used by other jobs using NPC, if + idle their state will appear as PerfCnts. These nodes are still + available for other jobs not using NPC. </p> <p> For the blade option (--network=blade) Only nodes requested will be marked in use for the job allocation. If the job does not fill up the entire blade(s) allocated to the job those blade(s) are not - able to be used by other jobs, but will appear in the state PerfCnts. + able to be used by other jobs using NPC, if idle their state will appear as + PerfCnts. These nodes are still available for other jobs not using NPC. </p> <li>Core Specialization</li> <p> diff --git a/doc/html/download.shtml b/doc/html/download.shtml index beabd07e3..1ffead136 100644 --- a/doc/html/download.shtml +++ b/doc/html/download.shtml @@ -33,7 +33,7 @@ a message.</li> <li><b>MUNGE</b> (recommended)<br> In order to compile the "auth/munge" authentication plugin for SLURM, you will need to build and install MUNGE, available from -<a href="http://munge.googlecode.com/">http://munge.googlecode.com/</a> and +<a href="https://code.google.com/p/munge/">https://code.google.com/p/munge/</a> and <a href="http://packages.debian.org/src:munge">Debian</a> and <a href="http://fedoraproject.org/">Fedora</a> and <a href="http://packages.ubuntu.com/src:munge">Ubuntu</a>.</li> @@ -272,6 +272,6 @@ easy and elegantly manner. </ul> -<p style="text-align:center;">Last modified 26 March 2014</p> +<p style="text-align:center;">Last modified 30 April 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index 580777d0b..00a7a3cce 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -178,13 +178,18 @@ launch a shell on a node in the job's allocation?</a></li> Free Open Source Software (FOSS) does not mean that it is without cost. It does mean that the you have access to the code so that you are free to use it, study it, and/or enhance it. +These reasons contribute to Slurm (and FOSS in general) being subject to +active research and development worldwide, displacing proprietary software +in many environments. If the software is large and complex, like Slurm or the Linux kernel, -then its use is not without cost. 
-If your work is important, you'll want the leading Slurm experts at your +then while there is no license fee, its use is not without cost.</p> +<p>If your work is important, you'll want the leading Slurm experts at your disposal to keep your systems operating at peak efficiency. While Slurm has a global development community incorporating leading edge technology, <a href="http://www.schedmd.com">SchedMD</a> personnel have developed most of the code and can provide competitively priced commercial support. +SchedMD works with various organizations to provide a range of support +options ranging from remote level-3 support to 24x7 on-site personnel. Customers switching from commercial workload mangers to Slurm typically report higher scalability, better performance and lower costs.</p> @@ -337,34 +342,12 @@ the same time limit (for example the partition's time limit), then backfill will not be effective. Note that partitions can have both default and maximum time limits, which can be helpful in configuring a system for effective backfill scheduling.</p> -<p>In addition, there are significant limitations in the current backfill -scheduler plugin. -It was designed to perform backfill node scheduling for a homogeneous cluster. -It does not manage scheduling on individual processors (or other consumable -resources). It does not update the required or excluded node list of -individual jobs. It does support job's with constraints/features unless -the exclusive OR operator is used in the constraint expression. -You can use the scontrol show command to check if these conditions apply.</p> -<ul> -<li>Partition: State=UP</li> -<li>Partition: RootOnly=NO</li> -<li>Partition: Shared=NO</li> -<li>Job: ReqNodeList=NULL</li> -<li>Job: ExcNodeList=NULL</li> -<li>Job: Contiguous=0</li> -<li>Job: Features=NULL</li> -<li>Job: MinCPUs, MinMemory, and MinTmpDisk satisfied by all nodes in -the partition</li> -<li>Job: MinCPUs or MinNodes not to exceed partition's MaxNodes</li> -</ul> -<p>If the partitions specifications differ from those listed above, -no jobs in that partition will be scheduled by the backfills scheduler. -Their jobs will only be scheduled on a First-In-First-Out (FIFO) basis.</p> -<p>Jobs failing to satisfy the requirements above (i.e. with specific -node requirements) will not be considered candidates for backfill -scheduling and other jobs may be scheduled ahead of these jobs. -These jobs are subject to starvation, but will not block other -jobs from running when sufficient resources are available for them.</p> + +<p>In addition, there are a multitude of backfill scheduling parameters +which can impact which jobs are considered for backfill scheduling, such +as the maximum number of jobs tested per user. For more information see +the slurm.conf man page and check the configuration of SchedulingParameters +on your system.</p> <p><a name="steps"><b>8. How can I run multiple jobs from within a single script?</b></a><br> @@ -651,13 +634,13 @@ or <b>--distribution</b>' is 'arbitrary'. This means you can tell slurm to layout your tasks in any fashion you want. 
For instance if I had an allocation of 2 nodes and wanted to run 4 tasks on the first node and 1 task on the second and my nodes allocated from SLURM_NODELIST -where tux[0-1] my srun line would look like this.<p> -<i>srun -n5 -m arbitrary -w tux[0,0,0,0,1] hostname</i><p> +where tux[0-1] my srun line would look like this:<br><br> +<i>srun -n5 -m arbitrary -w tux[0,0,0,0,1] hostname</i><br><br> If I wanted something similar but wanted the third task to be on tux 1 -I could run this...<p> -<i>srun -n5 -m arbitrary -w tux[0,0,1,0,0] hostname</i><p> +I could run this:<br><br> +<i>srun -n5 -m arbitrary -w tux[0,0,1,0,0] hostname</i><br><br> Here is a simple perl script named arbitrary.pl that can be ran to easily lay -out tasks on nodes as they are in SLURM_NODELIST<p> +out tasks on nodes as they are in SLURM_NODELIST.</p> <pre> #!/usr/bin/perl my @tasks = split(',', $ARGV[0]); @@ -685,9 +668,9 @@ foreach my $task (@tasks) { print $layout; </pre> -We can now use this script in our srun line in this fashion.<p> -<i>srun -m arbitrary -n5 -w `arbitrary.pl 4,1` -l hostname</i><p> -This will layout 4 tasks on the first node in the allocation and 1 +<p>We can now use this script in our srun line in this fashion.<br><br> +<i>srun -m arbitrary -n5 -w `arbitrary.pl 4,1` -l hostname</i><br><br> +<p>This will layout 4 tasks on the first node in the allocation and 1 task on the second node.</p> <p><a name="hold"><b>21. How can I temporarily prevent a job from running @@ -948,11 +931,10 @@ $ srun -p mic ./hello.mic <br> <p> Slurm supports requeue jobs in done or failed state. Use the -command: +command:</p> <p align=left><b>scontrol requeue job_id</b></p> </head> -</p> -The job will be requeued back in PENDING state and scheduled again. +<p>The job will be requeued back in PENDING state and scheduled again. See man(1) scontrol. </p> <p>Consider a simple job like this:</p> @@ -979,12 +961,10 @@ $->squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 10 mira zoppo david R 0:03 1 alanz1 </pre> -<p> -Slurm supports requeuing jobs in hold state with the command: +<p>Slurm supports requeuing jobs in hold state with the command:</p> <p align=left><b>'scontrol requeuehold job_id'</b></p> -The job can be in state RUNNING, SUSPENDED, COMPLETED or FAILED -before being requeued. -</p> +<p>The job can be in state RUNNING, SUSPENDED, COMPLETED or FAILED +before being requeued.</p> <pre> $->scontrol requeuehold 10 $->squeue @@ -1801,10 +1781,19 @@ qstat, and qsub commands (see contribs/torque in the distribution and the "slurm-torque" RPM). There is also a wrapper for the showq command at <a href="https://github.com/pedmon/slurm_showq"> -https://github.com/pedmon/slurm_showq</a>. -Slurm recognizes and translates the "#PBS" options in batch scripts. -Most, but not all options are supported. -Please share any enhancements that you make.</p> +https://github.com/pedmon/slurm_showq</a>.</p> + +<p>Slurm recognizes and translates the "#PBS" options in batch scripts. +Most, but not all options are supported.</p> + +<p>Slurm also includes a SPANK plugin that will set all of the PBS environment +variables based upon the Slurm environment (e.g. PBS_JOBID, PBS_JOBNAME, +PBS_WORKDIR, etc.). +One environment not set by PBS_ENVIRONMENT, which if set would result in the +failure of some MPI implementations. +The plugin will be installed in<br> +<install_directory>/lib/slurm/spank_pbs.so<br> +See the SPANK man page for configuration details.</p> <p><a name="sssd"><b>53. 
I am having trouble using SSSD with Slurm.</b></a></br> SSSD or System Security Services Deamon does not allow enumeration of group @@ -1854,7 +1843,7 @@ If you which to prevent this condition from setting the node into a DOWN state then configure ReturnToService to 2. See the slurm.conf man page for details. Otherwise use the scontrol or sview to manually return the node to service.</p> -<p><a name="reqspec"><b>57.How can a job which has exited with a specific exit +<p><a name="reqspec"><b>57. How can a job which has exited with a specific exit code be requeued?</b></a></br> Slurm supports requeue in hold with a <b>SPECIAL_EXIT</b> state using the command:</p> @@ -1942,6 +1931,6 @@ sacctmgr delete user name=adam cluster=tux account=chemistry <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 20 February 2014</p> +<p style="text-align:center;">Last modified 30 April 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/high_throughput.shtml b/doc/html/high_throughput.shtml index 35a3fbce1..e280d76f2 100644 --- a/doc/html/high_throughput.shtml +++ b/doc/html/high_throughput.shtml @@ -104,6 +104,11 @@ schedules jobs only on a First In First Out (FIFO) basis.</li> <li><b>SchedulerParameters</b>: Several scheduling parameters are available. <ul> +<li>Setting option <b>batch_sched_delay</b> will control how long the +scheduling of batch jobs can be delayed. This effects only batch jobs. +For example, if many jobs are submitted each second, the overhead of +trying to schedule each one will adversely impact the rate at which jobs +can be submitted. The default value is 3 seconds.</li> <li>Setting option <b>defer</b> will avoid attempting to schedule each job individually at job submit time, but defer it until a later time when scheduling multiple jobs simultaneously may be possible. @@ -164,6 +169,6 @@ not appear to add any measurable overhead.</li> appropriate for your environment.</li> </ul> -<p style="text-align:center;">Last modified 7 March 2014</p> +<p style="text-align:center;">Last modified 25 April 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/job_array.shtml b/doc/html/job_array.shtml index 728a31783..63af0d6fe 100644 --- a/doc/html/job_array.shtml +++ b/doc/html/job_array.shtml @@ -83,7 +83,7 @@ output files names of this sort "slurm-36_1.out", "slurm-36_2.out" and "slurm-36_3.out". If these file name options are used without being part of a job array then "%A" will be replaced by the current job ID and "%a" will be replaced by -65534 (NO_VAL).</p> +4,294,967,294 (equivalent to 0xfffffffe or NO_VAL).</p> <h2>Scancel Command Use</h2> @@ -300,6 +300,6 @@ array data structure is added rather than the current logic that only adds a new field to the existing job data structure. It is not certain when that work will occur.</p> -<p style="text-align:center;">Last modified 24 January 2014</p> +<p style="text-align:center;">Last modified 28 April 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/quickstart_admin.shtml b/doc/html/quickstart_admin.shtml index 4a17516a7..2f57de506 100644 --- a/doc/html/quickstart_admin.shtml +++ b/doc/html/quickstart_admin.shtml @@ -12,7 +12,7 @@ computer platforms.</p> <ol> <li>Make sure that you have synchronized clocks plus consistent users and groups (UIDs and GIDs) across the cluster.</li> -<li>Install <a href="http://munge.googlecode.com/">MUNGE</a> for +<li>Install <a href="https://code.google.com/p/munge/">MUNGE</a> for authentication. 
Make sure that all nodes in your cluster have the same <i>munge.key</i>. Make sure the MUNGE daemon, <i>munged</i> is started before you start the SLURM daemons.</li> @@ -211,7 +211,7 @@ authentication infrastructure is provided by a dynamically loaded plugin chosen at runtime via the <b>AuthType</b> keyword in the SLURM configuration file. Currently available authentication types include <a href="http://www.theether.org/authd/">authd</a>, -<a href="http://munge.googlecode.com/">munge</a>, and none. +<a href="https://code.google.com/p/munge/">munge</a>, and none. The default authentication infrastructure is "munge", but this does require the installation of the MUNGE package. An authentication type of "none" requires no infrastructure, but permits @@ -498,7 +498,7 @@ Currently, only three authentication plugins are supported: <b>auth/none</b>, <b>auth/authd</b>, and <b>auth/munge</b>. The auth/none plugin is built by default, but either Brent Chun's <a href="http://www.theether.org/authd/">authd</a>, -or LLNL's <a href="http://munge.googlecode.com/">MUNGE</a> +or LLNL's <a href="https://code.google.com/p/munge/">MUNGE</a> should be installed in order to get properly authenticated communications. Unless you are experience with authd, we recommend the use of MUNGE. The configure script in the top-level directory of this distribution will @@ -737,6 +737,6 @@ options such as mysql and gui tools via a configuration menu.</p> </pre> <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 7 April 2014</p> +<p style="text-align:center;">Last modified 30 April 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/team.shtml b/doc/html/team.shtml index 2a0cf1e42..640e0dd86 100644 --- a/doc/html/team.shtml +++ b/doc/html/team.shtml @@ -125,6 +125,7 @@ Lead Slurm developers are: <li>Steven McDougall (SiCortex)</li> <li>Donna Mecozzi (Lawrence Livermore National Laboratory)</li> <li>Bjørn-Helge Mevik (University of Oslo, Norway)</li> +<li>Stuart Midgley (Down Under GeoSolutions)</li> <li>Chris Morrone (Lawrence Livermore National Laboratory)</li> <li>Pere Munt (Barcelona Supercomputing Center, Spain)</li> <br> @@ -195,6 +196,6 @@ Lead Slurm developers are: <!-- INDIVIDUALS, PLEASE KEEP IN ALPHABETICAL ORDER --> </ul> -<p style="text-align:center;">Last modified 11 April 2014</p> +<p style="text-align:center;">Last modified 25 April 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index f1b65ee90..a0d95b54d 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -62,6 +62,9 @@ may be specified. Supported datatypes are as follows: where \fI<interval>\fR is the task sampling interval in seconds for the jobacct_gather plugins and for task profiling by the acct_gather_profile plugin. +NOTE: This frequency is used to monitor memory usage. If memory limits +are enforced the highest frequency a user can request is what is configured in +the slurm.conf file. They can not turn it off (=0) either. .TP \fBenergy=\fI<interval>\fR where \fI<interval>\fR is the sampling interval in seconds @@ -406,7 +409,9 @@ sharing the same job name and user have terminated. .TP \fB\-D\fR, \fB\-\-chdir\fR=<\fIpath\fR> -change directory to \fIpath\fR before beginning execution. +Change directory to \fIpath\fR before beginning execution. The path +can be specified as full path or relative path to the directory where +the command is executed. 
.TP \fB\-\-exclusive\fR @@ -807,16 +812,18 @@ In this configuration supported values include: .RS .TP 6 \fBsystem\fR -Use the system\-wide network performance counters. Because only one -job can use this at a time this option will allocate every node in the -cluster. In steps inside the allocation wish to run on less than the -full system they must specify the node count for each step. +Use the system\-wide network performance counters. Only nodes requested +will be marked in use for the job allocation. If the job does not +fill up the entire system the rest of the nodes are not +able to be used by other jobs using NPC, if idle their state will appear as +PerfCnts. These nodes are still available for other jobs not using NPC. .TP \fBblade\fR Use the blade network performance counters. Only nodes requested will be marked in use for the job allocation. If the job does not fill up the entire blade(s) allocated to the job those blade(s) are not -able to be used by other jobs, but will appear in the state PerfCnts. +able to be used by other jobs using NPC, if idle their state will appear as +PerfCnts. These nodes are still available for other jobs not using NPC. .TP .RE @@ -1019,6 +1026,11 @@ permitted to execute per node. NOTE: \fBMAX_TASKS_PER_NODE\fR is defined in the file \fIslurm.h\fR and is not a variable, it is set at SLURM build time. +.TP +\fB\-\-priority\fR=<value> +Request a specific job priority. +May be subject to configuration specific constraints. + .TP \fB\-\-profile\fR=<all|none|[energy[,|task[,|lustre[,|network]]]]> enables detailed data collection by the acct_gather_profile plugin. @@ -1057,7 +1069,10 @@ Request a specific partition for the resource allocation. If not specified, the default behavior is to allow the slurm controller to select the default partition as designated by the system administrator. If the job can use more than one partition, specify their names in a comma separate list and the one -offering earliest initiation will be used. +offering earliest initiation will be used with no regard given to the partition +name ordering (although higher priority partitions will be considered first). +When the job is initiated, the name of the partition used will be placed first +in the job record partition string. .TP \fB\-Q\fR, \fB\-\-quiet\fR diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index d56df6437..fa7651c60 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -77,6 +77,9 @@ may be specified. Supported datatypes are as follows: where \fI<interval>\fR is the task sampling interval in seconds for the jobacct_gather plugins and for task profiling by the acct_gather_profile plugin. +NOTE: This frequency is used to monitor memory usage. If memory limits +are enforced the highest frequency a user can request is what is configured in +the slurm.conf file. They can not turn it off (=0) either. .TP \fBenergy=\fI<interval>\fR where \fI<interval>\fR is the sampling interval in seconds @@ -432,7 +435,8 @@ sharing the same job name and user have terminated. .TP \fB\-D\fR, \fB\-\-workdir\fR=<\fIdirectory\fR> Set the working directory of the batch script to \fIdirectory\fR before -it is executed. +it is executed. The path can be specified as full path or relative path +to the directory where the command is executed. .TP \fB\-e\fR, \fB\-\-error\fR=<\fIfilename pattern\fR> @@ -895,16 +899,18 @@ In this configuration supported values include: .RS .TP 6 \fBsystem\fR -Use the system\-wide network performance counters. 
Because only one -job can use this at a time this option will allocate every node in the -cluster. If steps inside the allocation wish to run on less than the -full system they must specify the node count for each step. +Use the system\-wide network performance counters. Only nodes requested +will be marked in use for the job allocation. If the job does not +fill up the entire system the rest of the nodes are not +able to be used by other jobs using NPC, if idle their state will appear as +PerfCnts. These nodes are still available for other jobs not using NPC. .TP \fBblade\fR Use the blade network performance counters. Only nodes requested will be marked in use for the job allocation. If the job does not fill up the entire blade(s) allocated to the job those blade(s) are not -able to be used by other jobs, but will appear in the state PerfCnts. +able to be used by other jobs using NPC, if idle their state will appear as +PerfCnts. These nodes are still available for other jobs not using NPC. .TP .RE @@ -1121,7 +1127,15 @@ Request a specific partition for the resource allocation. If not specified, the default behavior is to allow the slurm controller to select the default partition as designated by the system administrator. If the job can use more than one partition, specify their names in a comma separate list and the one -offering earliest initiation will be used. +offering earliest initiation will be used with no regard given to the partition +name ordering (although higher priority partitions will be considered first). +When the job is initiated, the name of the partition used will be placed first +in the job record partition string. + +.TP +\fB\-\-priority\fR=<value> +Request a specific job priority. +May be subject to configuration specific constraints. .TP \fB\-\-profile\fR=<all|none|[energy[,|task[,|lustre[,|network]]]]> diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index d51bd4a2e..062a06c2b 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -1,4 +1,4 @@ -.TH SCONTROL "1" "March 2014" "scontrol 14.03" "Slurm components" +.TH SCONTROL "1" "April 2014" "scontrol 14.03" "Slurm components" .SH "NAME" scontrol \- Used view and modify Slurm configuration and state. @@ -184,10 +184,10 @@ configured as hidden or partitions that are unavailable to the user's group. This is the default behavior. .TP -\fBhold\fP \fIjob_id_list\fP +\fBhold\fP \fIjob_list\fP Prevent a pending job from beginning started (sets it's priority to 0). Use the \fIrelease\fP command to permit the job to be scheduled. -Multiple job ID values may be specified separated by spaces. +The job_list argument is a space separated list of job IDs or job names. Note that when a job is held by a system administrator using the \fBhold\fP command, only a system administrator may release the job for execution (also see the \fBuhold\fP command). When the job is held by its owner, it may also @@ -267,9 +267,9 @@ or SlurmdPort. The slurmctld daemon must be restarted if nodes are added to or removed from the cluster. .TP -\fBrelease\fP \fIjob_id_list\fP +\fBrelease\fP \fIjob_list\fP Release a previously held job to begin execution. -Multiple job ID values may be specified separated by spaces. +The job_list argument is a space separated list of job IDs or job names. Also see \fBhold\fR. .TP @@ -410,9 +410,9 @@ primary SLURM controller is scheduled down. (Note: SLURM's primary controller will take the control back at startup.) 
.TP -\fBuhold\fP \fIjob_id_list\fP +\fBuhold\fP \fIjob_list\fP Prevent a pending job from being started (sets it's priority to 0). -Multiple job ID values may be specified separated by spaces. +The job_list argument is a space separated list of job IDs or job names. Use the \fIrelease\fP command to permit the job to be scheduled. This command is designed for a system administrator to hold a job so that the job owner may release it rather than requiring the intervention of a diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index c1de00caa..d198bf4de 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -46,6 +46,9 @@ may be specified. Supported datatypes are as follows: where \fI<interval>\fR is the task sampling interval in seconds for the jobacct_gather plugins and for task profiling by the acct_gather_profile plugin. +NOTE: This frequency is used to monitor memory usage. If memory limits +are enforced the highest frequency a user can request is what is configured in +the slurm.conf file. They can not turn it off (=0) either. .TP \fBenergy=\fI<interval>\fR where \fI<interval>\fR is the sampling interval in seconds @@ -480,9 +483,10 @@ sharing the same job name and user have terminated. .TP \fB\-D\fR, \fB\-\-chdir\fR=<\fIpath\fR> -have the remote processes do a chdir to \fIpath\fR before beginning +Have the remote processes do a chdir to \fIpath\fR before beginning execution. The default is to chdir to the current working directory -of the \fBsrun\fR process. +of the \fBsrun\fR process. The path can be specified as full path or +relative path to the directory where the command is executed. .TP \fB\-e\fR, \fB\-\-error\fR=<\fImode\fR> @@ -997,16 +1001,18 @@ In this configuration supported values include: .RS .TP 6 \fBsystem\fR -Use the system\-wide network performance counters. Because only one -job can use this at a time this option will allocate every node in the -cluster. In steps inside the allocation wish to run on less than the -full system they must specify the node count for each step. +Use the system\-wide network performance counters. Only nodes requested +will be marked in use for the job allocation. If the job does not +fill up the entire system the rest of the nodes are not +able to be used by other jobs using NPC, if idle their state will appear as +PerfCnts. These nodes are still available for other jobs not using NPC. .TP \fBblade\fR Use the blade network performance counters. Only nodes requested will be marked in use for the job allocation. If the job does not fill up the entire blade(s) allocated to the job those blade(s) are not -able to be used by other jobs, but will appear in the state PerfCnts. +able to be used by other jobs using NPC, if idle their state will appear as +PerfCnts. These nodes are still available for other jobs not using NPC. .TP .RE @@ -1222,7 +1228,15 @@ Request a specific partition for the resource allocation. If not specified, the default behavior is to allow the slurm controller to select the default partition as designated by the system administrator. If the job can use more than one partition, specify their names in a comma separate list and the one -offering earliest initiation will be used. +offering earliest initiation will be used with no regard given to the partition +name ordering (although higher priority partitions will be considered first). +When the job is initiated, the name of the partition used will be placed first +in the job record partition string. + +.TP +\fB\-\-priority\fR=<value> +Request a specific job priority. 
+May be subject to configuration specific constraints. .TP \fB\-\-profile\fR=<all|none|[energy[,|task[,|lustre[,|network]]]]> diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 4c6e810c4..4cb660c45 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1844,6 +1844,13 @@ The interpretation of this parameter varies by \fBSchedulerType\fR. Multiple options may be comma separated. .RS .TP +\fBbatch_sched_delay=#\fR +How long, in seconds, the scheduling of batch jobs can be delayed. +This can be useful in a high\-throughput environment in which batch jobs are +submitted at a very high rate (i.e. using the sbatch command) and one wishes +to reduce the overhead of attempting to schedule each job at submit time. +The default value is 3 seconds. +.TP \fBbf_continue\fR The backfill scheduler periodically releases locks in order to permit other operations to proceed rather than blocking all activity for what could be an @@ -1926,7 +1933,7 @@ Specifying a large value (say 1000 or higher) can be expected to result in poor system responsiveness since this scheduling logic will not release locks for other events to occur. It would be better to let the backfill scheduler process a larger number of jobs -(see \fBmax_job_bf\fR, \fBbf_continue\fR and other options here for more +(see \fBbf_max_job_test\fR, \fBbf_continue\fR and other options here for more information). .TP \fBdefer\fR diff --git a/etc/cgroup.release_common.example b/etc/cgroup.release_common.example index 94acd9f8d..ae4aedbd6 100644 --- a/etc/cgroup.release_common.example +++ b/etc/cgroup.release_common.example @@ -44,7 +44,7 @@ fi slurmcg=${rmcg%/uid_*} if [[ ${slurmcg} == ${rmcg} ]] then - # not a slurm job pattern, perhaps the slurmcg, just remove + # not a slurm job pattern, perhaps the slurmcg, just remove # the dir with a lock and exit flock -x ${mountdir} -c "rmdir ${rmcg}" exit $? @@ -55,7 +55,7 @@ orphancg=${slurmcg}/orphan if [[ ! -d ${orphancg} ]] then mkdir ${orphancg} - case ${subsystem} in + case ${subsystem} in cpuset) cat ${mountdir}/cpuset.cpus > ${orphancg}/cpuset.cpus cat ${mountdir}/cpuset.mems > ${orphancg}/cpuset.mems @@ -64,7 +64,7 @@ then ;; esac fi - + # kernel call if [[ $# -eq 1 ]] then @@ -76,7 +76,7 @@ then uidcg=${rmcg%/job_*} if [[ ${uidcg} == ${rmcg} ]] then - # not a slurm job pattern, perhaps the uidcg, just remove + # not a slurm job pattern, perhaps the uidcg, just remove # the dir with a lock and exit flock -x ${mountdir} -c "rmdir ${rmcg}" exit $? 
@@ -122,7 +122,7 @@ then # and the remaining job if [[ -d ${uidcg} ]] then - case ${subsystem} in + case ${subsystem} in cpuset) cpus=$(cat ${uidcg}/job_*/cpuset.cpus 2>/dev/null) if [[ -n ${cpus} ]] @@ -131,7 +131,7 @@ then cpus=$(echo ${cpus} | tr ' ' ',') echo ${cpus} > ${uidcg}/cpuset.cpus else - # first move the remaining processes to + # first move the remaining processes to # a cgroup reserved for orphaned processes for t in $(cat ${uidcg}/tasks) do diff --git a/src/api/job_info.c b/src/api/job_info.c index dca7b0b79..c5d8d383d 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -75,28 +75,34 @@ static void _fname_format(char *buf, int buf_size, job_info_t * job_ptr, char *ptr, *tmp, *tmp2 = NULL, *user; tmp = xstrdup(fname); - while ((ptr = strstr(tmp, "%A"))) { + while ((ptr = strstr(tmp, "%A"))) { /* Array job ID */ ptr[0] = '\0'; - xstrfmtcat(tmp2, "%s%u%s", tmp, job_ptr->array_job_id, ptr+2); + if (job_ptr->array_task_id == NO_VAL) { + /* Not a job array */ + xstrfmtcat(tmp2, "%s%u%s", tmp, job_ptr->job_id, ptr+2); + } else { + xstrfmtcat(tmp2, "%s%u%s", tmp, job_ptr->array_job_id, + ptr+2); + } xfree(tmp); /* transfer the results */ tmp = tmp2; tmp2 = NULL; } - while ((ptr = strstr(tmp, "%a"))) { + while ((ptr = strstr(tmp, "%a"))) { /* Array task ID */ ptr[0] = '\0'; xstrfmtcat(tmp2, "%s%u%s", tmp, job_ptr->array_task_id, ptr+2); xfree(tmp); /* transfer the results */ tmp = tmp2; tmp2 = NULL; } - while ((ptr = strstr(tmp, "%j"))) { + while ((ptr = strstr(tmp, "%j"))) { /* Job ID */ ptr[0] = '\0'; xstrfmtcat(tmp2, "%s%u%s", tmp, job_ptr->job_id, ptr+2); xfree(tmp); /* transfer the results */ tmp = tmp2; tmp2 = NULL; } - while ((ptr = strstr(tmp, "%u"))) { + while ((ptr = strstr(tmp, "%u"))) { /* User name */ ptr[0] = '\0'; user = uid_to_string((uid_t) job_ptr->user_id); xstrfmtcat(tmp2, "%s%s%s", tmp, user, ptr+2); diff --git a/src/common/read_config.c b/src/common/read_config.c index 48ce72875..7b935d963 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -344,7 +344,7 @@ static bool _is_valid_path (char *path, char *msg) * Allocate temporary space for walking the list of dirs: */ int pathlen; - char *buf, *p, *entry; + char *buf = NULL, *p, *entry; if (path == NULL) { error ("is_valid_path: path is NULL!"); @@ -2388,8 +2388,9 @@ static int _config_is_storage(s_p_hashtbl_t *hashtbl, char *name) port = strrchr(&host[1], ':'); if (port == NULL) return (-1); - conf_ptr->accounting_storage_type = xstrdup_printf("accounting_storage/%.*s", - (int)(cluster - name), name); + conf_ptr->accounting_storage_type = + xstrdup_printf("accounting_storage/%.*s", + (int)(cluster - name), name); cluster++; cluster = xstrndup(cluster, host - cluster); host++; diff --git a/src/common/slurm_acct_gather.c b/src/common/slurm_acct_gather.c index ad8799c28..e15cd4142 100644 --- a/src/common/slurm_acct_gather.c +++ b/src/common/slurm_acct_gather.c @@ -197,6 +197,51 @@ extern int acct_gather_parse_freq(int type, char *freq) return freq_int; } +extern int acct_gather_check_acct_freq_task( + uint32_t job_mem_lim, char *acctg_freq) +{ + int task_freq; + static uint32_t acct_freq_task = NO_VAL; + + if (acct_freq_task == NO_VAL) { + char *acct_freq = slurm_get_jobacct_gather_freq(); + int i = acct_gather_parse_freq(PROFILE_TASK, acct_freq); + xfree(acct_freq); + + /* If the value is -1 lets set the freq to something + really high so we don't check this again. 
+ */ + if (i == -1) + acct_freq_task = (uint16_t)NO_VAL; + else + acct_freq_task = i; + } + + if (!job_mem_lim || !acct_freq_task) + return 0; + + task_freq = acct_gather_parse_freq(PROFILE_TASK, acctg_freq); + + if (task_freq == -1) + return 0; + + if (task_freq == 0) { + error("Can't turn accounting frequency off. " + "We need it to monitor memory usage."); + slurm_seterrno(ESLURMD_INVALID_ACCT_FREQ); + return 1; + } else if (task_freq > acct_freq_task) { + error("Can't set frequency to %d, it is higher than %u. " + "We need it to be at least at this level to " + "monitor memory usage.", + task_freq, acct_freq_task); + slurm_seterrno(ESLURMD_INVALID_ACCT_FREQ); + return 1; + } + + return 0; +} + extern void acct_gather_suspend_poll(void) { acct_gather_suspended = true; diff --git a/src/common/slurm_acct_gather.h b/src/common/slurm_acct_gather.h index e17d84cb2..e6533a92a 100644 --- a/src/common/slurm_acct_gather.h +++ b/src/common/slurm_acct_gather.h @@ -65,6 +65,8 @@ extern int acct_gather_conf_destroy(void); /* don't forget to free this */ extern List acct_gather_conf_values(void); extern int acct_gather_parse_freq(int type, char *freq); +extern int acct_gather_check_acct_freq_task( + uint32_t job_mem_lim, char *acctg_freq); extern void acct_gather_suspend_poll(void); extern void acct_gather_resume_poll(void); diff --git a/src/common/slurm_jobcomp.c b/src/common/slurm_jobcomp.c index 3af67004c..678f4dfbb 100644 --- a/src/common/slurm_jobcomp.c +++ b/src/common/slurm_jobcomp.c @@ -109,6 +109,7 @@ jobcomp_destroy_job(void *object) xfree(job->rotate); xfree(job->geo); xfree(job->bg_start_point); + xfree(job->work_dir); xfree(job); } } diff --git a/src/common/slurm_jobcomp.h b/src/common/slurm_jobcomp.h index 41824ce97..0f62f452c 100644 --- a/src/common/slurm_jobcomp.h +++ b/src/common/slurm_jobcomp.h @@ -66,6 +66,7 @@ typedef struct { uint32_t gid; char *gid_name; uint32_t node_cnt; + uint32_t proc_cnt; char *nodelist; char *jobname; char *state; @@ -77,6 +78,7 @@ typedef struct { uint32_t max_procs; char *geo; char *bg_start_point; + char *work_dir; } jobcomp_job_rec_t; typedef struct slurm_jobcomp_context * slurm_jobcomp_context_t; diff --git a/src/common/util-net.c b/src/common/util-net.c index 344d01b48..6cfea3e9c 100644 --- a/src/common/util-net.c +++ b/src/common/util-net.c @@ -1,6 +1,4 @@ /*****************************************************************************\ - * $Id$ - ***************************************************************************** * Copyright (C) 2001-2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Chris Dunlap <cdunlap@llnl.gov>. @@ -42,6 +40,10 @@ # include "config.h" #endif /* HAVE_CONFIG_H */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif + #include <arpa/inet.h> #include <assert.h> #include <errno.h> @@ -50,6 +52,7 @@ #include <pthread.h> #include <stdio.h> #include <string.h> +#include <stdlib.h> #include <sys/socket.h> #include "src/common/strlcpy.h" @@ -366,3 +369,39 @@ const char * inet_ntop(int family, const void *addr, char *str, size_t len) return(str); } #endif /* !HAVE_INET_NTOP */ + +/* is_full_path() + * + * Test if the given path is a full or relative one. + */ +extern +bool is_full_path(const char *path) +{ + if (path[0] == '/') + return true; + + return false; +} + +/* make_full_path() + * + * Given a relative path in input make it full relative + * to the current working directory. 
+ */ +extern char *make_full_path(char *rpath) +{ + char *cwd; + char *cwd2; + int len; + + cwd = get_current_dir_name(); + /* 2 = / + 0 + */ + len = strlen(cwd) + strlen(rpath) + 2; + cwd2 = xmalloc(len); + sprintf(cwd2, "%s/%s", cwd, rpath); + free(cwd); + + return cwd2; +} + diff --git a/src/common/util-net.h b/src/common/util-net.h index 39b93f8a6..2e949be88 100644 --- a/src/common/util-net.h +++ b/src/common/util-net.h @@ -44,10 +44,15 @@ # include "config.h" #endif /* HAVE_CONFIG_H */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif + #include <netdb.h> #include <netinet/in.h> #include <unistd.h> - +#include "src/common/macros.h" +#include "src/common/xmalloc.h" #define HOSTENT_SIZE 8192 /* cf. Stevens UNPv1 11.15 p304 */ @@ -123,5 +128,17 @@ const char * inet_ntop(int family, const void *addr, char *str, size_t len); */ #endif /* !HAVE_INET_NTOP */ +/* is_full_path() + * + * Test if the given path is a full or relative one. + */ +extern bool is_full_path(const char *); + +/* make_full_path() + * + * Given a relative path in input make it full relative + * to the current working directory. + */ +extern char *make_full_path(char *); #endif /* !_UTIL_NET_H */ diff --git a/src/plugins/jobcomp/filetxt/filetxt_jobcomp_process.c b/src/plugins/jobcomp/filetxt/filetxt_jobcomp_process.c index 6cc65681a..43560ba6b 100644 --- a/src/plugins/jobcomp/filetxt/filetxt_jobcomp_process.c +++ b/src/plugins/jobcomp/filetxt/filetxt_jobcomp_process.c @@ -129,10 +129,14 @@ static jobcomp_job_rec_t *_parse_line(List job_info_list) job->nodelist = xstrdup(jobcomp_info->val); } else if (!strcasecmp("NodeCnt", jobcomp_info->name)) { job->node_cnt = atoi(jobcomp_info->val); + } else if (!strcasecmp("ProcCnt", jobcomp_info->name)) { + job->proc_cnt = atoi(jobcomp_info->val); } else if (!strcasecmp("JobState", jobcomp_info->name)) { job->state = xstrdup(jobcomp_info->val); } else if (!strcasecmp("Timelimit", jobcomp_info->name)) { job->timelimit = xstrdup(jobcomp_info->val); + } else if (!strcasecmp("Workdir", jobcomp_info->name)) { + job->work_dir = xstrdup(jobcomp_info->val); } #ifdef HAVE_BG else if (!strcasecmp("MaxProcs", jobcomp_info->name)) { @@ -195,6 +199,8 @@ extern List filetxt_jobcomp_process_get_jobs(slurmdb_job_cond_t *job_cond) list_append(job_info_list, jobcomp_info); jobcomp_info->name = fptr; fptr = strstr(fptr, "="); + if (!fptr) + break; *fptr++ = 0; jobcomp_info->val = fptr; fptr = strstr(fptr, " "); diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 1f9f845c6..bcdca8e3f 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -1021,22 +1021,36 @@ static int _attempt_backfill(void) uint32_t save_time_limit = job_ptr->time_limit; int rc = _start_job(job_ptr, resv_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) { - if (orig_time_limit == NO_VAL) + if (orig_time_limit == NO_VAL) { + acct_policy_alter_job( + job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; - else + } else { + acct_policy_alter_job( + job_ptr, orig_time_limit); job_ptr->time_limit = orig_time_limit; + } job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60); } else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { /* Set time limit as high as possible */ + acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; job_ptr->end_time = job_ptr->start_time + (comp_time_limit * 60); _reset_job_time_limit(job_ptr, now, node_space); time_limit = job_ptr->time_limit; + } else 
if (orig_time_limit == NO_VAL) { + acct_policy_alter_job(job_ptr, comp_time_limit); + job_ptr->time_limit = comp_time_limit; + job_ptr->end_time = job_ptr->start_time + + (job_ptr->time_limit * 60); } else { + acct_policy_alter_job(job_ptr, orig_time_limit); job_ptr->time_limit = orig_time_limit; + job_ptr->end_time = job_ptr->start_time + + (job_ptr->time_limit * 60); } if (rc == ESLURM_ACCOUNTING_POLICY) { /* Unknown future start time, just skip job */ @@ -1197,6 +1211,7 @@ static void _reset_job_time_limit(struct job_record *job_ptr, time_t now, { int32_t j, resv_delay; uint32_t orig_time_limit = job_ptr->time_limit; + uint32_t new_time_limit; for (j=0; ; ) { if ((node_space[j].begin_time != now) && @@ -1212,7 +1227,9 @@ static void _reset_job_time_limit(struct job_record *job_ptr, time_t now, if ((j = node_space[j].next) == 0) break; } - job_ptr->time_limit = MAX(job_ptr->time_min, job_ptr->time_limit); + new_time_limit = MAX(job_ptr->time_min, job_ptr->time_limit); + acct_policy_alter_job(job_ptr, new_time_limit); + job_ptr->time_limit = new_time_limit; job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60); job_time_adj_resv(job_ptr); diff --git a/src/plugins/select/bluegene/bg_record_functions.c b/src/plugins/select/bluegene/bg_record_functions.c index f46375d6c..4156d27e9 100644 --- a/src/plugins/select/bluegene/bg_record_functions.c +++ b/src/plugins/select/bluegene/bg_record_functions.c @@ -409,8 +409,16 @@ extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record) struct job_record *job_ptr; sec_record->job_list = list_create(NULL); itr = list_iterator_create(fir_record->job_list); - while ((job_ptr = list_next(itr))) + while ((job_ptr = list_next(itr))) { + if (job_ptr->magic != JOB_MAGIC) { + error("copy_bg_record: bad job magic, " + "this should never happen"); + list_delete_item(itr); + continue; + } + list_append(sec_record->job_list, job_ptr); + } list_iterator_destroy(itr); } sec_record->job_ptr = fir_record->job_ptr; @@ -2056,8 +2064,14 @@ static void _set_block_avail(bg_record_t *bg_record) bg_record->avail_cnode_cnt = bg_record->cnode_cnt; while ((job_ptr = list_next(itr))) { - select_jobinfo_t *jobinfo = - job_ptr->select_jobinfo->data; + select_jobinfo_t *jobinfo; + if (job_ptr->magic != JOB_MAGIC) { + error("_set_block_avail: bad job magic, " + "this should never happen"); + list_delete_item(itr); + continue; + } + jobinfo = job_ptr->select_jobinfo->data; if (job_ptr->end_time > bg_record->avail_job_end) bg_record->avail_job_end = job_ptr->end_time; diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c index 01b061821..28d66bc91 100644 --- a/src/plugins/select/cray/select_cray.c +++ b/src/plugins/select/cray/select_cray.c @@ -146,7 +146,6 @@ time_t last_node_update; #endif static blade_info_t *blade_array = NULL; -static bitstr_t *blade_nodes_running_jobs = NULL; static bitstr_t *blade_nodes_running_npc = NULL; static uint32_t blade_cnt = 0; static pthread_mutex_t blade_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -730,19 +729,15 @@ static void _remove_job_from_blades(select_jobinfo_t *jobinfo) blade_array[i].job_cnt = 0; } - if (jobinfo->npc) { + if (jobinfo->npc == NPC_SYS) { + bit_nclear(blade_nodes_running_npc, 0, + node_record_count-1); + } else if (jobinfo->npc) { bit_not(blade_nodes_running_npc); bit_or(blade_nodes_running_npc, blade_array[i].node_bitmap); bit_not(blade_nodes_running_npc); } - - if (!blade_array[i].job_cnt) { - bit_not(blade_nodes_running_jobs); - bit_or(blade_nodes_running_jobs, - 
blade_array[i].node_bitmap); - bit_not(blade_nodes_running_jobs); - } } if (jobinfo->npc) @@ -804,16 +799,14 @@ static void _set_job_running(struct job_record *job_ptr) nodeinfo = node_record_table_ptr[i].select_nodeinfo->data; if (!bit_test(jobinfo->blade_map, nodeinfo->blade_id)) { - bit_set(jobinfo->blade_map, - nodeinfo->blade_id); - if (!blade_array[nodeinfo->blade_id].job_cnt) - bit_or(blade_nodes_running_jobs, - blade_array[nodeinfo->blade_id]. - node_bitmap); + bit_set(jobinfo->blade_map, nodeinfo->blade_id); blade_array[nodeinfo->blade_id].job_cnt++; - if (jobinfo->npc) + if (jobinfo->npc == NPC_SYS) { + bit_nset(blade_nodes_running_npc, 0, + node_record_count-1); + } else if (jobinfo->npc) bit_or(blade_nodes_running_npc, blade_array[nodeinfo->blade_id]. node_bitmap); @@ -836,13 +829,14 @@ static void _set_job_running_restore(select_jobinfo_t *jobinfo) if (!bit_test(jobinfo->blade_map, i)) continue; - if (!blade_array[i].job_cnt) - bit_or(blade_nodes_running_jobs, blade_array[i].node_bitmap); - blade_array[i].job_cnt++; - if (jobinfo->npc) - bit_or(blade_nodes_running_npc, blade_array[i].node_bitmap); + if (jobinfo->npc == NPC_SYS) { + bit_nset(blade_nodes_running_npc, 0, + node_record_count-1); + } else if (jobinfo->npc) + bit_or(blade_nodes_running_npc, + blade_array[i].node_bitmap); } if (jobinfo->npc) @@ -1071,7 +1065,6 @@ extern int fini ( void ) slurm_mutex_lock(&blade_mutex); - FREE_NULL_BITMAP(blade_nodes_running_jobs); FREE_NULL_BITMAP(blade_nodes_running_npc); for (i=0; i<blade_cnt; i++) @@ -1414,9 +1407,6 @@ extern int select_p_node_init(struct node_record *node_ptr, int node_cnt) if (!blade_array) blade_array = xmalloc(sizeof(blade_info_t) * node_cnt); - if (!blade_nodes_running_jobs) - blade_nodes_running_jobs = bit_alloc(node_cnt); - if (!blade_nodes_running_npc) blade_nodes_running_npc = bit_alloc(node_cnt); @@ -1553,30 +1543,33 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap, { select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data; slurm_mutex_lock(&blade_mutex); - if (jobinfo->npc == NPC_NONE) { - if (mode != SELECT_MODE_TEST_ONLY) { - bit_not(blade_nodes_running_npc); - bit_and(bitmap, blade_nodes_running_npc); - bit_not(blade_nodes_running_npc); - } - } else { + + if (jobinfo->npc != NPC_NONE) { /* If looking for network performance counters unmark all the nodes that are in use since they cannot be used. */ if (mode != SELECT_MODE_TEST_ONLY) { - bit_not(blade_nodes_running_jobs); - bit_and(bitmap, blade_nodes_running_jobs); - bit_not(blade_nodes_running_jobs); + if (jobinfo->npc == NPC_SYS) { + /* All the nodes have to be free of + * network performance counters to run + * NPC_SYS. 
+ */ + if (bit_ffs(blade_nodes_running_npc) != -1) + bit_nclear(bitmap, 0, + bit_size(bitmap) - 1); + } else { + bit_not(blade_nodes_running_npc); + bit_and(bitmap, blade_nodes_running_npc); + bit_not(blade_nodes_running_npc); + } } } /* char *tmp = bitmap2node_name(bitmap); */ - /* char *tmp2 = bitmap2node_name(blade_nodes_running_jobs); */ /* char *tmp3 = bitmap2node_name(blade_nodes_running_npc); */ - /* info("trying %u on %s '%s' '%s'", job_ptr->job_id, tmp, tmp2, tmp3); */ + /* info("trying %u on %s '%s'", job_ptr->job_id, tmp, tmp3); */ /* xfree(tmp); */ - /* xfree(tmp2); */ /* xfree(tmp3); */ slurm_mutex_unlock(&blade_mutex); @@ -1602,11 +1595,9 @@ extern int select_p_job_begin(struct job_record *job_ptr) _set_job_running(job_ptr); - /* char *tmp2 = bitmap2node_name(blade_nodes_running_jobs); */ /* char *tmp3 = bitmap2node_name(blade_nodes_running_npc); */ - /* info("adding %u '%s' '%s'", job_ptr->job_id, tmp2, tmp3); */ - /* xfree(tmp2); */ + /* info("adding %u '%s'", job_ptr->job_id, tmp3); */ /* xfree(tmp3); */ slurm_mutex_unlock(&blade_mutex); @@ -2239,17 +2230,7 @@ extern int select_p_update_node_state(struct node_record *node_ptr) extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data) { - job_desc_msg_t *job_desc = (job_desc_msg_t *)data; - switch (type) { - case SELECT_SET_NODE_CNT: - if (job_desc->network && !strcmp(job_desc->network, "system")) - job_desc->min_cpus = job_desc->min_nodes = - job_desc->max_nodes = node_record_count; - return SLURM_SUCCESS; - break; - default: - return other_alter_node_cnt(type, data); - } + return other_alter_node_cnt(type, data); } extern int select_p_reconfigure(void) diff --git a/src/plugins/switch/nrt/nrt.c b/src/plugins/switch/nrt/nrt.c index 769b0665b..cfb4518db 100644 --- a/src/plugins/switch/nrt/nrt.c +++ b/src/plugins/switch/nrt/nrt.c @@ -267,12 +267,14 @@ static int _add_immed_use(char *hostname, slurm_nrt_jobinfo_t *jp, static int _allocate_windows_all(slurm_nrt_jobinfo_t *jp, char *hostname, uint32_t node_id, nrt_task_id_t task_id, nrt_adapter_t adapter_type, int network_id, - nrt_protocol_table_t *protocol_table, int instances); + nrt_protocol_table_t *protocol_table, int instances, + int task_inx); static int _allocate_window_single(char *adapter_name, slurm_nrt_jobinfo_t *jp, char *hostname, uint32_t node_id, nrt_task_id_t task_id, nrt_adapter_t adapter_type, int network_id, - nrt_protocol_table_t *protocol_table, int instances); + nrt_protocol_table_t *protocol_table, int instances, + int task_inx); static slurm_nrt_libstate_t *_alloc_libstate(void); static slurm_nrt_nodeinfo_t *_alloc_node(slurm_nrt_libstate_t *lp, char *name); static int _copy_node(slurm_nrt_nodeinfo_t *dest, @@ -1125,7 +1127,8 @@ static int _allocate_windows_all(slurm_nrt_jobinfo_t *jp, char *hostname, uint32_t node_id, nrt_task_id_t task_id, nrt_adapter_t adapter_type, int network_id, - nrt_protocol_table_t *protocol_table, int instances) + nrt_protocol_table_t *protocol_table, int instances, + int task_inx) { nrt_tableinfo_t *tableinfo = jp->tableinfo; nrt_job_key_t job_key = jp->job_key; @@ -1170,7 +1173,7 @@ _allocate_windows_all(slurm_nrt_jobinfo_t *jp, char *hostname, if (user_space && (adapter->adapter_type == NRT_IPONLY)) continue; - if ((context_id == 0) && + if ((context_id == 0) && (task_inx == 0) && (_add_block_use(jp, adapter))) { goto alloc_fail; } @@ -1311,7 +1314,7 @@ _allocate_window_single(char *adapter_name, slurm_nrt_jobinfo_t *jp, char *hostname, uint32_t node_id, nrt_task_id_t task_id, nrt_adapter_t adapter_type, int 
network_id, nrt_protocol_table_t *protocol_table, - int instances) + int instances, int task_inx) { nrt_tableinfo_t *tableinfo = jp->tableinfo; nrt_job_key_t job_key = jp->job_key; @@ -1370,7 +1373,7 @@ _allocate_window_single(char *adapter_name, slurm_nrt_jobinfo_t *jp, table_inx = -1; for (context_id = 0; context_id < protocol_table->protocol_table_cnt; context_id++) { - if ((context_id == 0) && + if ((context_id == 0) && (task_inx == 0) && (_add_block_use(jp, adapter))) { goto alloc_fail; } @@ -3209,7 +3212,7 @@ nrt_build_jobinfo(slurm_nrt_jobinfo_t *jp, hostlist_t hl, adapter_type, network_id, protocol_table, - instances); + instances, j); } else { rc = _allocate_window_single( adapter_name, @@ -3218,7 +3221,7 @@ nrt_build_jobinfo(slurm_nrt_jobinfo_t *jp, hostlist_t hl, adapter_type, network_id, protocol_table, - instances); + instances, j); } if (rc != SLURM_SUCCESS) { _unlock(); diff --git a/src/sacct/print.c b/src/sacct/print.c index c5dc227b7..d45c6a373 100644 --- a/src/sacct/print.c +++ b/src/sacct/print.c @@ -168,7 +168,7 @@ void print_fields(type_t type, void *object) break; case JOBCOMP: default: - tmp_int = NO_VAL; + tmp_int = job_comp->proc_cnt; break; } field->print_routine(field, diff --git a/src/salloc/opt.c b/src/salloc/opt.c index 43740d6c7..78428fcc0 100644 --- a/src/salloc/opt.c +++ b/src/salloc/opt.c @@ -87,7 +87,7 @@ #include "src/common/uid.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" - +#include "src/common/util-net.h" #include "src/salloc/salloc.h" #include "src/salloc/opt.h" @@ -167,6 +167,8 @@ #define LONG_OPT_WAIT_ALL_NODES 0x142 #define LONG_OPT_REQ_SWITCH 0x143 #define LONG_OPT_PROFILE 0x144 +#define LONG_OPT_PRIORITY 0x160 + /*---- global variables, defined in opt.h ----*/ opt_t opt; @@ -368,6 +370,9 @@ static void _opt_default() opt.wckey = NULL; opt.req_switch = -1; opt.wait4switch = -1; + + opt.nice = 0; + opt.priority = 0; } /*---[ env var processing ]-----------------------------------------------*/ @@ -673,6 +678,7 @@ void set_options(const int argc, char **argv) {"mloader-image", required_argument, 0, LONG_OPT_MLOADER_IMAGE}, {"network", required_argument, 0, LONG_OPT_NETWORK}, {"nice", optional_argument, 0, LONG_OPT_NICE}, + {"priority", required_argument, 0, LONG_OPT_PRIORITY}, {"no-bell", no_argument, 0, LONG_OPT_NO_BELL}, {"no-shell", no_argument, 0, LONG_OPT_NOSHELL}, {"ntasks-per-core", required_argument, 0, LONG_OPT_NTASKSPERCORE}, @@ -750,7 +756,10 @@ void set_options(const int argc, char **argv) break; case 'D': xfree(opt.cwd); - opt.cwd = xstrdup(optarg); + if (is_full_path(optarg)) + opt.cwd = xstrdup(optarg); + else + opt.cwd = make_full_path(optarg); break; case 'F': xfree(opt.nodelist); @@ -1015,6 +1024,19 @@ void set_options(const int argc, char **argv) } } break; + case LONG_OPT_PRIORITY: { + long long priority = strtoll(optarg, NULL, 10); + if (priority < 0) { + error("Priority must be >= 0"); + exit(error_exit); + } + if (priority >= NO_VAL) { + error("Priority must be < %i", NO_VAL); + exit(error_exit); + } + opt.priority = priority; + break; + } case LONG_OPT_BELL: opt.bell = BELL_ALWAYS; break; @@ -1920,6 +1942,7 @@ static void _help(void) " --ntasks-per-node=n number of tasks to invoke on each node\n" " -N, --nodes=N number of nodes on which to run (N = min[-max])\n" " -O, --overcommit overcommit resources\n" +" --priority=value set the priority of the job to value\n" " --profile=value enable acct_gather_profile for detailed data\n" " value is all or none or any combination of\n" " energy, lustre, network or 
task\n" diff --git a/src/salloc/opt.h b/src/salloc/opt.h index 94993e60c..f1d947e2a 100644 --- a/src/salloc/opt.h +++ b/src/salloc/opt.h @@ -104,6 +104,7 @@ typedef struct salloc_options { unsigned int jobid; /* --jobid=jobid */ char *dependency; /* --dependency, -P type:jobid */ int nice; /* --nice */ + uint32_t priority; /* --priority */ char *account; /* --account, -U acct_name */ char *comment; /* --comment */ char *qos; /* --qos */ diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 190819950..77ac614d8 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -654,6 +654,8 @@ static int _fill_job_desc_from_opts(job_desc_msg_t *desc) desc->network = opt.network; if (opt.nice) desc->nice = NICE_OFFSET + opt.nice; + if (opt.priority) + desc->priority = opt.priority; desc->mail_type = opt.mail_type; if (opt.mail_user) desc->mail_user = xstrdup(opt.mail_user); diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index f8a491f35..d56b2c4d8 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -86,6 +86,7 @@ #include "src/common/uid.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" +#include "src/common/util-net.h" #include "src/sbatch/opt.h" @@ -178,6 +179,7 @@ #define LONG_OPT_IGNORE_PBS 0x155 #define LONG_OPT_TEST_ONLY 0x156 #define LONG_OPT_PARSABLE 0x157 +#define LONG_OPT_PRIORITY 0x160 /*---- global variables, defined in opt.h ----*/ opt_t opt; @@ -394,6 +396,9 @@ static void _opt_default() opt.ckpt_interval_str = NULL; opt.ckpt_dir = xstrdup(opt.cwd); + opt.nice = 0; + opt.priority = 0; + opt.test_only = false; } @@ -724,6 +729,7 @@ static struct option long_options[] = { {"parsable", optional_argument, 0, LONG_OPT_PARSABLE}, {"propagate", optional_argument, 0, LONG_OPT_PROPAGATE}, {"profile", required_argument, 0, LONG_OPT_PROFILE}, + {"priority", required_argument, 0, LONG_OPT_PRIORITY}, {"qos", required_argument, 0, LONG_OPT_QOS}, {"ramdisk-image", required_argument, 0, LONG_OPT_RAMDISK_IMAGE}, {"reboot", no_argument, 0, LONG_OPT_REBOOT}, @@ -1178,7 +1184,10 @@ static void _set_options(int argc, char **argv) break; case 'D': xfree(opt.cwd); - opt.cwd = xstrdup(optarg); + if (is_full_path(optarg)) + opt.cwd = xstrdup(optarg); + else + opt.cwd = make_full_path(optarg); break; case 'e': xfree(opt.efname); @@ -1461,6 +1470,19 @@ static void _set_options(int argc, char **argv) } } break; + case LONG_OPT_PRIORITY: { + long long priority = strtoll(optarg, NULL, 10); + if (priority < 0) { + error("Priority must be >= 0"); + exit(error_exit); + } + if (priority >= NO_VAL) { + error("Priority must be < %i", NO_VAL); + exit(error_exit); + } + opt.priority = priority; + break; + } case LONG_OPT_NO_REQUEUE: opt.requeue = 0; break; @@ -2580,30 +2602,14 @@ static bool _opt_verify(void) static uint16_t _parse_pbs_mail_type(const char *arg) { - uint16_t rc; - - if (strcasecmp(arg, "b") == 0) - rc = MAIL_JOB_BEGIN; - else if (strcasecmp(arg, "e") == 0) - rc = MAIL_JOB_END; - else if (strcasecmp(arg, "a") == 0) - rc = MAIL_JOB_FAIL; - else if (strcasecmp(arg, "bea") == 0 - || strcasecmp(arg, "eba") == 0 - || strcasecmp(arg, "eab") == 0 - || strcasecmp(arg, "bae") == 0) - rc = MAIL_JOB_BEGIN | MAIL_JOB_END | MAIL_JOB_FAIL; - else if (strcasecmp(arg, "be") == 0 - || strcasecmp(arg, "eb") == 0) - rc = MAIL_JOB_BEGIN | MAIL_JOB_END; - else if (strcasecmp(arg, "ba") == 0 - || strcasecmp(arg, "ab") == 0) - rc = MAIL_JOB_BEGIN | MAIL_JOB_FAIL; - else if (strcasecmp(arg, "ea") == 0 - || strcasecmp(arg, "ae") == 0) - rc = MAIL_JOB_END | MAIL_JOB_FAIL; - else - rc = 0; /* arg="n" 
or failure */ + uint16_t rc = 0; + + if (strchr(arg, 'b') || strchr(arg, 'B')) + rc |= MAIL_JOB_BEGIN; + if (strchr(arg, 'e') || strchr(arg, 'E')) + rc |= MAIL_JOB_END; + if (strchr(arg, 'a') || strchr(arg, 'A')) + rc |= MAIL_JOB_FAIL; return rc; } @@ -2992,6 +2998,7 @@ static void _help(void) " -p, --partition=partition partition requested\n" " --parsable outputs only the jobid and cluster name (if present),\n" " separated by semicolon, only on successful submission.\n" +" --priority=value set the priority of the job to value\n" " --profile=value enable acct_gather_profile for detailed data\n" " value is all or none or any combination of\n" " energy, lustre, network or task\n" diff --git a/src/sbatch/opt.h b/src/sbatch/opt.h index d6f4a82a8..5c1ef3194 100644 --- a/src/sbatch/opt.h +++ b/src/sbatch/opt.h @@ -108,6 +108,7 @@ typedef struct sbatch_options { char *mpi_type; /* --mpi=type */ char *dependency; /* --dependency, -P type:jobid */ int nice; /* --nice */ + uint32_t priority; /* --priority */ char *account; /* --account, -U acct_name */ char *comment; /* --comment */ char *propagate; /* --propagate[=RLIMIT_CORE,...]*/ diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c index 35823a630..a4c85279c 100644 --- a/src/sbatch/sbatch.c +++ b/src/sbatch/sbatch.c @@ -384,6 +384,9 @@ static int _fill_job_desc_from_opts(job_desc_msg_t *desc) desc->network = opt.network; if (opt.nice) desc->nice = NICE_OFFSET + opt.nice; + if (opt.priority) + desc->priority = opt.priority; + desc->mail_type = opt.mail_type; if (opt.mail_user) desc->mail_user = xstrdup(opt.mail_user); diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index e87e06190..facbcaf29 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -1804,8 +1804,10 @@ scontrol [<OPTION>] [<COMMAND>] \n\ descriptive string. \n\ exit terminate scontrol \n\ help print this description of use. \n\ - hold <jobid_list> prevent specified job from starting (see release)\n\ - holdu <jobid_list> place user hold on specified job (see release)\n\ + hold <job_list> prevent specified job from starting. <job_list>\n\ + is either a space separate list of job IDs or\n\ + job names \n\ + holdu <job_list> place user hold on specified job (see hold) \n\ hide do not display information about hidden \n\ partitions \n\ listpids <job_id<.step>> List pids associated with the given jobid, or\n\ @@ -1823,7 +1825,7 @@ scontrol [<OPTION>] [<COMMAND>] \n\ reboot_nodes [<nodelist>] reboot the nodes when they become idle. \n\ By default all nodes are rebooted. \n\ reconfigure re-read configuration files. \n\ - release <jobid_list> permit specified job to start (see hold) \n\ + release <job_list> permit specified job to start (see hold) \n\ requeue <job_id> re-queue a batch job \n\ requeuehold <job_id> re-queue and hold a batch \n\ resume <jobid_list> resume previously suspended job (see suspend)\n\ @@ -1834,9 +1836,9 @@ scontrol [<OPTION>] [<COMMAND>] \n\ is all records. \n\ shutdown <OPTS> shutdown slurm daemons \n\ (the primary controller will be stopped) \n\ - suspend <jobid_list> susend specified job (see resume) \n\ + suspend <job_list> susend specified job (see resume) \n\ takeover ask slurm backup controller to take over \n\ - uhold <jobid_list> place user hold on specified job (see release)\n\ + uhold <jobid_list> place user hold on specified job (see hold)\n\ update <SPECIFICATIONS> update job, node, partition, reservation, \n\ step or bluegene block/submp configuration \n\ verbose enable detailed logging. 
\n\ diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index 997119a01..56618a5c9 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -237,34 +237,45 @@ static uint32_t _get_job_time(uint32_t job_id) /* * scontrol_hold - perform some job hold/release operation - * IN op - suspend/resume operation - * IN job_id_str - a job id + * IN op - suspend/resume operation + * IN job_str - a job ID or job name * RET 0 if no slurm error, errno otherwise. parsing error prints * error message and returns 0 */ extern int -scontrol_hold(char *op, char *job_id_str) +scontrol_hold(char *op, char *job_str) { static uint32_t last_job_id = NO_VAL; static job_info_msg_t *resp = NULL; int i, rc = SLURM_SUCCESS; char *next_str; job_desc_msg_t job_msg; - uint32_t job_id; + uint32_t job_id = 0; uint32_t array_id; + char *job_name = NULL; slurm_job_info_t *job_ptr; - if (job_id_str) { - job_id = (uint32_t) strtol(job_id_str, &next_str, 10); + if (job_str && !strncasecmp(job_str, "JobID=", 6)) + job_str += 6; + if (job_str && !strncasecmp(job_str, "Name=", 5)) + job_str += 5; + + if (job_str && (job_str[0] >= '0') && (job_str[0] <= '9')) { + job_id = (uint32_t) strtol(job_str, &next_str, 10); if (next_str[0] == '_') array_id = strtol(next_str+1, &next_str, 10); else array_id = NO_VAL; if ((job_id == 0) || (next_str[0] != '\0')) { fprintf(stderr, "Invalid job id specified (%s)\n", - job_id_str); + job_str); return 1; } + } else if (job_str) { + array_id = NO_VAL; + job_id = 0; + job_name = job_str; + last_job_id = NO_VAL; } else { last_job_id = NO_VAL; /* Refresh cache on next call */ return 0; @@ -280,7 +291,6 @@ scontrol_hold(char *op, char *job_id_str) } slurm_init_job_desc_msg (&job_msg); - job_msg.job_id = job_id; /* set current user, needed e.g., for AllowGroups checks */ job_msg.user_id = getuid(); if ((strncasecmp(op, "holdu", 5) == 0) || @@ -297,11 +307,17 @@ scontrol_hold(char *op, char *job_id_str) if ((array_id != NO_VAL) && (job_ptr->array_task_id != array_id)) continue; + if (job_name && + ((job_ptr->name == NULL) || + strcmp(job_name, job_ptr->name))) + continue; if (!IS_JOB_PENDING(job_ptr)) { if ((array_id == NO_VAL) && (job_ptr->array_task_id != NO_VAL)) continue; + if (job_name) + continue; slurm_seterrno(ESLURM_JOB_NOT_PENDING); return ESLURM_JOB_NOT_PENDING; } diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c index 8bb8785ce..1465601b2 100644 --- a/src/slurmctld/acct_policy.c +++ b/src/slurmctld/acct_policy.c @@ -272,7 +272,7 @@ static void _adjust_limit_usage(int type, struct job_record *job_ptr) } assoc_ptr = (slurmdb_association_rec_t *)job_ptr->assoc_ptr; - while(assoc_ptr) { + while (assoc_ptr) { switch(type) { case ACCT_POLICY_ADD_SUBMIT: assoc_ptr->usage->used_submit_jobs++; @@ -378,6 +378,63 @@ extern void acct_policy_job_fini(struct job_record *job_ptr) _adjust_limit_usage(ACCT_POLICY_JOB_FINI, job_ptr); } +extern void acct_policy_alter_job(struct job_record *job_ptr, + uint32_t new_time_limit) +{ + slurmdb_association_rec_t *assoc_ptr = NULL; + assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, + WRITE_LOCK, NO_LOCK, NO_LOCK }; + uint64_t used_cpu_run_secs, new_used_cpu_run_secs; + + if (!IS_JOB_RUNNING(job_ptr) || (job_ptr->time_limit == new_time_limit)) + return; + + if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) + || !_valid_job_assoc(job_ptr)) + return; + + used_cpu_run_secs = (uint64_t)job_ptr->total_cpus + * (uint64_t)job_ptr->time_limit * 60; + new_used_cpu_run_secs = (uint64_t)job_ptr->total_cpus + * 
(uint64_t)new_time_limit * 60; + + assoc_mgr_lock(&locks); + if (job_ptr->qos_ptr) { + slurmdb_qos_rec_t *qos_ptr = + (slurmdb_qos_rec_t *)job_ptr->qos_ptr; + + qos_ptr->usage->grp_used_cpu_run_secs -= + used_cpu_run_secs; + qos_ptr->usage->grp_used_cpu_run_secs += + new_used_cpu_run_secs; + debug2("altering %u QOS %s got %"PRIu64" " + "just removed %"PRIu64" and added %"PRIu64"", + job_ptr->job_id, + qos_ptr->name, + qos_ptr->usage->grp_used_cpu_run_secs, + used_cpu_run_secs, + new_used_cpu_run_secs); + } + + assoc_ptr = (slurmdb_association_rec_t *)job_ptr->assoc_ptr; + while (assoc_ptr) { + assoc_ptr->usage->grp_used_cpu_run_secs -= + used_cpu_run_secs; + assoc_ptr->usage->grp_used_cpu_run_secs += + new_used_cpu_run_secs; + debug2("altering %u acct %s got %"PRIu64" " + "just removed %"PRIu64" and added %"PRIu64"", + job_ptr->job_id, + assoc_ptr->acct, + assoc_ptr->usage->grp_used_cpu_run_secs, + used_cpu_run_secs, + new_used_cpu_run_secs); + /* now handle all the group limits of the parents */ + assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; + } + assoc_mgr_unlock(&locks); +} + extern bool acct_policy_validate(job_desc_msg_t *job_desc, struct part_record *part_ptr, slurmdb_association_rec_t *assoc_in, @@ -480,25 +537,6 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc, qos_ptr->name); rc = false; goto end_it; - } else if ((job_desc->max_cpus == NO_VAL) - || (acct_policy_limit_set->max_cpus - && (job_desc->max_cpus > qos_max_cpus_limit))) { - job_desc->max_cpus = qos_max_cpus_limit; - acct_policy_limit_set->max_cpus = 1; - } else if (strict_checking - && (job_desc->max_cpus > qos_max_cpus_limit)) { - if (reason) - *reason = WAIT_QOS_RESOURCE_LIMIT; - info("job submit for user %s(%u): " - "max cpu changed %u -> %u because " - "of qos limit", - user_name, - job_desc->user_id, - job_desc->max_cpus, - qos_max_cpus_limit); - if (job_desc->max_cpus == NO_VAL) - acct_policy_limit_set->max_cpus = 1; - job_desc->max_cpus = qos_max_cpus_limit; } /* for validation we don't need to look at @@ -555,26 +593,6 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc, qos_ptr->name); rc = false; goto end_it; - } else if ((job_desc->max_nodes == 0) - || (acct_policy_limit_set->max_nodes - && (job_desc->max_nodes - > qos_max_nodes_limit))) { - job_desc->max_nodes = qos_max_nodes_limit; - acct_policy_limit_set->max_nodes = 1; - } else if (strict_checking - && job_desc->max_nodes > qos_max_nodes_limit) { - if (reason) - *reason = WAIT_QOS_JOB_LIMIT; - info("job submit for user %s(%u): " - "max node changed %u -> %u because " - "of qos limit", - user_name, - job_desc->user_id, - job_desc->max_nodes, - qos_max_nodes_limit); - if (job_desc->max_nodes == NO_VAL) - acct_policy_limit_set->max_nodes = 1; - job_desc->max_nodes = qos_max_nodes_limit; } if ((qos_ptr->grp_submit_jobs != INFINITE) && @@ -628,25 +646,6 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc, qos_ptr->max_cpus_pj); rc = false; goto end_it; - } else if ((job_desc->max_cpus == NO_VAL) - || (acct_policy_limit_set->max_cpus - && (job_desc->max_cpus - > qos_ptr->max_cpus_pj))) { - job_desc->max_cpus = qos_ptr->max_cpus_pj; - acct_policy_limit_set->max_cpus = 1; - } else if (reason - && job_desc->max_cpus > qos_ptr->max_cpus_pj) { - *reason = WAIT_QOS_JOB_LIMIT; - info("job submit for user %s(%u): " - "max cpu changed %u -> %u because " - "of qos limit", - user_name, - job_desc->user_id, - job_desc->max_cpus, - qos_ptr->max_cpus_pj); - if (job_desc->max_cpus == NO_VAL) - acct_policy_limit_set->max_cpus = 1; - 
job_desc->max_cpus = qos_ptr->max_cpus_pj; } /* for validation we don't need to look at @@ -670,26 +669,6 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc, qos_ptr->max_nodes_pj); rc = false; goto end_it; - } else if ((job_desc->max_nodes == 0) - || (acct_policy_limit_set->max_nodes - && (job_desc->max_nodes - > qos_ptr->max_nodes_pj))) { - job_desc->max_nodes = qos_ptr->max_nodes_pj; - acct_policy_limit_set->max_nodes = 1; - } else if (strict_checking - && job_desc->max_nodes > qos_ptr->max_nodes_pj) { - if (reason) - *reason = WAIT_QOS_JOB_LIMIT; - info("job submit for user %s(%u): " - "max node changed %u -> %u because " - "of qos limit", - user_name, - job_desc->user_id, - job_desc->max_nodes, - qos_ptr->max_nodes_pj); - if (job_desc->max_nodes == NO_VAL) - acct_policy_limit_set->max_nodes = 1; - job_desc->max_nodes = qos_ptr->max_nodes_pj; } if (qos_ptr->max_submit_jobs_pu != INFINITE) { @@ -770,22 +749,6 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc, assoc_ptr->acct); rc = false; break; - } else if ((job_desc->max_cpus == NO_VAL) - || (acct_policy_limit_set->max_cpus - && (job_desc->max_cpus > assoc_ptr->grp_cpus))) { - job_desc->max_cpus = assoc_ptr->grp_cpus; - acct_policy_limit_set->max_cpus = 1; - } else if (job_desc->max_cpus > assoc_ptr->grp_cpus) { - info("job submit for user %s(%u): " - "max cpu changed %u -> %u because " - "of account limit", - user_name, - job_desc->user_id, - job_desc->max_cpus, - assoc_ptr->grp_cpus); - if (job_desc->max_cpus == NO_VAL) - acct_policy_limit_set->max_cpus = 1; - job_desc->max_cpus = assoc_ptr->grp_cpus; } /* for validation we don't need to look at @@ -824,23 +787,6 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc, assoc_ptr->acct); rc = false; break; - } else if ((job_desc->max_nodes == 0) - || (acct_policy_limit_set->max_nodes - && (job_desc->max_nodes - > assoc_ptr->grp_nodes))) { - job_desc->max_nodes = assoc_ptr->grp_nodes; - acct_policy_limit_set->max_nodes = 1; - } else if (job_desc->max_nodes > assoc_ptr->grp_nodes) { - info("job submit for user %s(%u): " - "max node changed %u -> %u because " - "of account limit", - user_name, - job_desc->user_id, - job_desc->max_nodes, - assoc_ptr->grp_nodes); - if (job_desc->max_nodes == NO_VAL) - acct_policy_limit_set->max_nodes = 1; - job_desc->max_nodes = assoc_ptr->grp_nodes; } if ((!qos_ptr || @@ -893,23 +839,6 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc, assoc_ptr->max_cpus_pj); rc = false; break; - } else if (job_desc->max_cpus == NO_VAL - || (acct_policy_limit_set->max_cpus - && (job_desc->max_cpus - > assoc_ptr->max_cpus_pj))) { - job_desc->max_cpus = assoc_ptr->max_cpus_pj; - acct_policy_limit_set->max_cpus = 1; - } else if (job_desc->max_cpus > assoc_ptr->max_cpus_pj) { - info("job submit for user %s(%u): " - "max cpu changed %u -> %u because " - "of account limit", - user_name, - job_desc->user_id, - job_desc->max_cpus, - assoc_ptr->max_cpus_pj); - if (job_desc->max_cpus == NO_VAL) - acct_policy_limit_set->max_cpus = 1; - job_desc->max_cpus = assoc_ptr->max_cpus_pj; } /* for validation we don't need to look at @@ -932,24 +861,6 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc, assoc_ptr->max_nodes_pj); rc = false; break; - } else if (((job_desc->max_nodes == NO_VAL) - || (job_desc->max_nodes == 0)) - || (acct_policy_limit_set->max_nodes - && (job_desc->max_nodes - > assoc_ptr->max_nodes_pj))) { - job_desc->max_nodes = assoc_ptr->max_nodes_pj; - acct_policy_limit_set->max_nodes = 1; - } else if (job_desc->max_nodes > 
assoc_ptr->max_nodes_pj) { - info("job submit for user %s(%u): " - "max node changed %u -> %u because " - "of account limit", - user_name, - job_desc->user_id, - job_desc->max_nodes, - assoc_ptr->max_nodes_pj); - if (job_desc->max_nodes == NO_VAL) - acct_policy_limit_set->max_nodes = 1; - job_desc->max_nodes = assoc_ptr->max_nodes_pj; } if ((!qos_ptr || @@ -1049,6 +960,7 @@ extern bool acct_policy_job_runnable_pre_select(struct job_record *job_ptr) return true; if (!_valid_job_assoc(job_ptr)) { + xfree(job_ptr->state_desc); job_ptr->state_reason = FAIL_ACCOUNT; return false; } @@ -1058,8 +970,10 @@ extern bool acct_policy_job_runnable_pre_select(struct job_record *job_ptr) return true; /* clear old state reason */ - if (!acct_policy_job_runnable_state(job_ptr)) + if (!acct_policy_job_runnable_state(job_ptr)) { + xfree(job_ptr->state_desc); job_ptr->state_reason = WAIT_NO_REASON; + } assoc_mgr_lock(&locks); qos_ptr = job_ptr->qos_ptr; @@ -1173,7 +1087,7 @@ extern bool acct_policy_job_runnable_pre_select(struct job_record *job_ptr) } assoc_ptr = job_ptr->assoc_ptr; - while(assoc_ptr) { + while (assoc_ptr) { wall_mins = assoc_ptr->usage->grp_used_wall / 60; #if _DEBUG @@ -1340,8 +1254,10 @@ extern bool acct_policy_job_runnable_post_select( safe_limits = true; /* clear old state reason */ - if (!acct_policy_job_runnable_state(job_ptr)) + if (!acct_policy_job_runnable_state(job_ptr)) { + xfree(job_ptr->state_desc); job_ptr->state_reason = WAIT_NO_REASON; + } job_cpu_time_limit = (uint64_t)job_ptr->time_limit * (uint64_t)cpu_cnt; @@ -1414,13 +1330,15 @@ extern bool acct_policy_job_runnable_post_select( "the job is at or exceeds QOS %s's " "group max cpu minutes of %"PRIu64" " "of which %"PRIu64" are still available " - "but request is for %"PRIu64" cpu " + "but request is for %"PRIu64" " + "(%"PRIu64" already used) cpu " "minutes (%u cpus)", job_ptr->job_id, qos_ptr->name, qos_ptr->grp_cpu_mins, qos_ptr->grp_cpu_mins - usage_mins, job_cpu_time_limit + cpu_run_mins, + cpu_run_mins, cpu_cnt); rc = false; @@ -2010,6 +1928,56 @@ end_it: return rc; } +extern uint32_t acct_policy_get_max_nodes(struct job_record *job_ptr) +{ + uint32_t max_nodes_limit = INFINITE; + assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, + READ_LOCK, NO_LOCK, NO_LOCK }; + + /* check to see if we are enforcing associations */ + if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) + return max_nodes_limit; + + assoc_mgr_lock(&locks); + if (job_ptr->qos_ptr) { + slurmdb_qos_rec_t *qos_ptr = job_ptr->qos_ptr; + max_nodes_limit = + MIN(qos_ptr->grp_nodes, qos_ptr->max_nodes_pu); + max_nodes_limit = + MIN(max_nodes_limit, qos_ptr->max_nodes_pj); + } + + if (max_nodes_limit == INFINITE) { + slurmdb_association_rec_t *assoc_ptr = job_ptr->assoc_ptr; + bool parent = 0; /*flag to tell us if we are looking at the + * parent or not + */ + bool grp_set = 0; + + while (assoc_ptr) { + if (assoc_ptr->grp_nodes != INFINITE) { + max_nodes_limit = MIN(max_nodes_limit, + assoc_ptr->grp_nodes); + grp_set = 1; + } + + if (!parent && (assoc_ptr->max_nodes_pj != INFINITE)) + max_nodes_limit = MIN(max_nodes_limit, + assoc_ptr->max_nodes_pj); + + /* only check the first grp set */ + if (grp_set) + break; + + assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; + parent = 1; + continue; + } + + } + assoc_mgr_unlock(&locks); + return max_nodes_limit; +} /* * acct_policy_update_pending_job - Make sure the limits imposed on a job on * submission are correct after an update to a qos or association. 
If diff --git a/src/slurmctld/acct_policy.h b/src/slurmctld/acct_policy.h index d492b86d1..65c6a2d07 100644 --- a/src/slurmctld/acct_policy.h +++ b/src/slurmctld/acct_policy.h @@ -76,6 +76,17 @@ extern void acct_policy_job_begin(struct job_record *job_ptr); */ extern void acct_policy_job_fini(struct job_record *job_ptr); +/* + * acct_policy_alter_job - if resources change on a job this needs to + * be called after they have been validated, but before they actually + * do. Each of the resources can be changed one at a time. If no + * change happens on a resouce just put old values in for the new. + * At the time of writing this function any node or cpu size change + * while running was already handled in the job_pre|post_resize_acctg functions. + */ +extern void acct_policy_alter_job(struct job_record *job_ptr, + uint32_t new_time_limit); + extern bool acct_policy_validate(job_desc_msg_t *job_desc, struct part_record *part_ptr, slurmdb_association_rec_t *assoc_in, @@ -107,6 +118,11 @@ extern bool acct_policy_job_runnable_post_select( */ extern bool acct_policy_job_runnable_state(struct job_record *job_ptr); +/* + * Using the limits on the job get the max nodes possible. + */ +extern uint32_t acct_policy_get_max_nodes(struct job_record *job_ptr); + /* * acct_policy_update_pending_job - Make sure the limits imposed on a * job on submission are correct after an update to a qos or diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index 9c5433138..52e7b7881 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -1517,6 +1517,26 @@ static void _set_job_time(struct job_record *job_ptr, uint16_t mail_type, } } +static void _set_job_term_info(struct job_record *job_ptr, uint16_t mail_type, + char *buf, int buf_len) +{ + uint16_t base_state = job_ptr->job_state & JOB_STATE_BASE; + + buf[0] = '\0'; + if ((mail_type == MAIL_JOB_END) || (mail_type == MAIL_JOB_FAIL)) { + if (WIFEXITED(job_ptr->exit_code)) { + int exit_code = WEXITSTATUS(job_ptr->exit_code); + snprintf(buf, buf_len, ", %s, ExitCode %d", + job_state_string(base_state), exit_code); + } else { + snprintf(buf, buf_len, ", %s", + job_state_string(base_state)); + } + } else if (buf_len > 0) { + buf[0] = '\0'; + } +} + /* * mail_job_info - Send e-mail notice of job state change * IN job_ptr - job identification @@ -1524,7 +1544,7 @@ static void _set_job_time(struct job_record *job_ptr, uint16_t mail_type, */ extern void mail_job_info (struct job_record *job_ptr, uint16_t mail_type) { - char job_time[128]; + char job_time[128], term_msg[128]; mail_info_t *mi = _mail_alloc(); if (!job_ptr->mail_user) @@ -1533,9 +1553,11 @@ extern void mail_job_info (struct job_record *job_ptr, uint16_t mail_type) mi->user_name = xstrdup(job_ptr->mail_user); _set_job_time(job_ptr, mail_type, job_time, sizeof(job_time)); - mi->message = xstrdup_printf("SLURM Job_id=%u Name=%s %s%s", + _set_job_term_info(job_ptr, mail_type, term_msg, sizeof(term_msg)); + mi->message = xstrdup_printf("SLURM Job_id=%u Name=%s %s%s%s", job_ptr->job_id, job_ptr->name, - _mail_type_str(mail_type), job_time); + _mail_type_str(mail_type), job_time, + term_msg); debug("email msg to %s: %s", mi->user_name, mi->message); diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 3ac636ed6..8c270cc70 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -172,6 +172,7 @@ bool ping_nodes_now = false; uint32_t cluster_cpus = 0; int with_slurmdbd = 0; bool want_nodes_reboot = true; +int batch_sched_delay = 3; int sched_interval = 60; /* Next 
used for stats/diagnostics */ @@ -1562,7 +1563,8 @@ static void *_slurmctld_background(void *no_data) slurm_mutex_unlock(&sched_cnt_mutex); last_full_sched_time = now; } else if (job_sched_cnt && - (difftime(now, last_sched_time) >= 3)) { + (difftime(now, last_sched_time) >= + batch_sched_delay)) { slurm_mutex_lock(&sched_cnt_mutex); job_limit = 0; /* Default depth */ job_sched_cnt = 0; diff --git a/src/slurmctld/groups.c b/src/slurmctld/groups.c index 7caf0d6ac..ad3a237d9 100644 --- a/src/slurmctld/groups.c +++ b/src/slurmctld/groups.c @@ -112,13 +112,16 @@ extern uid_t *get_group_members(char *group_name) buflen = MAX(buflen, i); #endif grp_buffer = xmalloc(buflen); - /* We need to check for !grp_result, since it appears some - * versions of this function do not return an error on failure. */ while (1) { slurm_seterrno(0); res = getgrnam_r(group_name, &grp, grp_buffer, buflen, &grp_result); - if (res != 0) { + + /* We need to check for !grp_result, since it appears some + * versions of this function do not return an error on + * failure. + */ + if (res != 0 || !grp_result) { if (errno == ERANGE) { buflen *= 2; xrealloc(grp_buffer, buflen); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 31e024730..35d7c25c4 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -4380,7 +4380,6 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, char **err_msg) { static int launch_type_poe = -1; - static uint32_t acct_freq_task = NO_VAL; int error_code = SLURM_SUCCESS, i, qos_error; struct part_record *part_ptr = NULL; List part_ptr_list = NULL; @@ -4394,7 +4393,6 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, static uint32_t node_scaling = 1; static uint32_t cpus_per_mp = 1; acct_policy_limit_set_t acct_policy_limit_set; - int acctg_freq; #ifdef HAVE_BG uint16_t geo[SYSTEM_DIMENSIONS]; @@ -4440,25 +4438,6 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, if (error_code != SLURM_SUCCESS) return error_code; - /* Validate a job's accounting frequency, if specified */ - if (acct_freq_task == NO_VAL) { - char *acct_freq = slurm_get_jobacct_gather_freq(); - int i = acct_gather_parse_freq(PROFILE_TASK, acct_freq); - xfree(acct_freq); - if (i != -1) - acct_freq_task = i; - else - acct_freq_task = (uint16_t) NO_VAL; - } - acctg_freq = acct_gather_parse_freq(PROFILE_TASK, job_desc->acctg_freq); - if ((acctg_freq != -1) && - ((acctg_freq == 0) || (acctg_freq > acct_freq_task))) { - error("Invalid accounting frequency (%d > %u)", - acctg_freq, acct_freq_task); - error_code = ESLURMD_INVALID_ACCT_FREQ; - goto cleanup_fail; - } - /* insure that selected nodes are in this partition */ if (job_desc->req_nodes) { error_code = node_name2bitmap(job_desc->req_nodes, false, @@ -5989,6 +5968,11 @@ static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate, } else if (!_validate_min_mem_partition(job_desc_msg, part_ptr, part_list)) return ESLURM_INVALID_TASK_MEMORY; + /* Validate a job's accounting frequency, if specified */ + if (acct_gather_check_acct_freq_task( + job_desc_msg->pn_min_memory, job_desc_msg->acctg_freq)) + return ESLURMD_INVALID_ACCT_FREQ; + if (job_desc_msg->min_nodes == NO_VAL) job_desc_msg->min_nodes = 1; /* default node count of 1 */ if (job_desc_msg->min_cpus == NO_VAL) @@ -8259,6 +8243,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) time_t old_time = job_ptr->time_limit; if (old_time == INFINITE) /* one year in mins */ old_time = (365 * 24 * 60); + 
acct_policy_alter_job(job_ptr, job_specs->time_limit); job_ptr->time_limit = job_specs->time_limit; if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) { @@ -8432,14 +8417,20 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) job_ptr->state_reason = WAIT_HELD; } } else if ((job_ptr->priority == 0) && - (job_ptr->state_reason == WAIT_HELD_USER)) { + (job_specs->priority == INFINITE) && + (authorized || + (job_ptr->state_reason == WAIT_HELD_USER))) { job_ptr->direct_set_prio = 0; set_job_prio(job_ptr); - info("sched: update_job: releasing user hold " - "for job_id %u", job_specs->job_id); + info("sched: update_job: releasing hold for job_id %u", + job_specs->job_id); job_ptr->state_reason = WAIT_NO_REASON; job_ptr->job_state &= ~JOB_SPECIAL_EXIT; xfree(job_ptr->state_desc); + } else if ((job_ptr->priority == 0) && + (job_specs->priority != INFINITE)) { + info("ignore priority reset request on held job %u", + job_specs->job_id); } else if (authorized || (job_ptr->priority > job_specs->priority)) { if (job_specs->priority != 0) @@ -8463,11 +8454,6 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) } else job_ptr->state_reason = WAIT_HELD; xfree(job_ptr->state_desc); - } else if ((job_ptr->state_reason == WAIT_HELD) || - (job_ptr->state_reason == WAIT_HELD_USER)) { - job_ptr->state_reason = WAIT_NO_REASON; - job_ptr->job_state &= ~JOB_SPECIAL_EXIT; - xfree(job_ptr->state_desc); } } else if (job_specs->priority == INFINITE && job_ptr->state_reason != WAIT_HELD_USER) { @@ -9009,8 +8995,11 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) error_code = SLURM_SUCCESS; else error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; - job_ptr->state_reason = fail_reason; - xfree(job_ptr->state_desc); + if ((job_ptr->state_reason != WAIT_HELD) && + (job_ptr->state_reason != WAIT_HELD_USER)) { + job_ptr->state_reason = fail_reason; + xfree(job_ptr->state_desc); + } return error_code; } else if ((job_ptr->state_reason != WAIT_HELD) && (job_ptr->state_reason != WAIT_HELD_USER)) { diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index a0023425f..f4d2224a5 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -282,7 +282,7 @@ extern List build_job_queue(bool clear_start, bool backfill) xfree(job_ptr->state_desc); } /* priority_array index matches part_ptr_list - * position: increment inx*/ + * position: increment inx */ inx++; if (reason != WAIT_NO_REASON) continue; @@ -750,6 +750,7 @@ extern int schedule(uint32_t job_limit) time_t now, sched_start; uint32_t reject_array_job_id = 0; struct part_record *reject_array_part = NULL; + uint16_t reject_state_reason = WAIT_NO_REASON; DEF_TIMERS; if (sched_update != slurmctld_conf.last_update) { @@ -775,6 +776,18 @@ extern int schedule(uint32_t job_limit) xfree(prio_type); sched_params = slurm_get_sched_params(); + + + if (sched_params && + (tmp_ptr=strstr(sched_params, "batch_sched_delay="))) + /* 012345678901234567 */ + batch_sched_delay = atoi(tmp_ptr + 18); + if (batch_sched_delay < 0) { + error("Invalid batch_sched_delay: %d", + batch_sched_delay); + batch_sched_delay = 3; + } + if (sched_params && (tmp_ptr = strstr(sched_params, "default_queue_depth="))) { /* 01234567890123456789 */ @@ -988,8 +1001,11 @@ next_part: part_ptr = (struct part_record *) if (job_ptr->array_task_id != NO_VAL) { if ((reject_array_job_id == job_ptr->array_job_id) && - (reject_array_part == job_ptr->part_ptr)) + (reject_array_part == job_ptr->part_ptr)) { + xfree(job_ptr->state_desc); + job_ptr->state_reason = 
reject_state_reason; continue; /* already rejected array element */ + } /* assume reject whole array for now, clear if OK */ reject_array_job_id = job_ptr->array_job_id; @@ -1010,6 +1026,10 @@ next_part: part_ptr = (struct part_record *) continue; debug2("sched: reached partition %s job limit", job_ptr->part_ptr->name); + if (job_ptr->state_reason == WAIT_NO_REASON) { + xfree(job_ptr->state_desc); + job_ptr->state_reason = WAIT_PRIORITY; + } skip_part_ptr = job_ptr->part_ptr; continue; } @@ -1050,8 +1070,8 @@ next_part: part_ptr = (struct part_record *) } } else if (_failed_partition(job_ptr->part_ptr, failed_parts, failed_part_cnt)) { - if ((job_ptr->state_reason == WAIT_NODE_NOT_AVAIL) - || (job_ptr->state_reason == WAIT_NO_REASON)) { + if ((job_ptr->state_reason == WAIT_NODE_NOT_AVAIL) || + (job_ptr->state_reason == WAIT_NO_REASON)) { job_ptr->state_reason = WAIT_PRIORITY; xfree(job_ptr->state_desc); last_job_update = now; @@ -1085,6 +1105,9 @@ next_part: part_ptr = (struct part_record *) } else { debug("sched: JobId=%u has invalid association", job_ptr->job_id); + xfree(job_ptr->state_desc); + job_ptr->state_reason = + WAIT_ASSOC_RESOURCE_LIMIT; continue; } } @@ -1301,6 +1324,13 @@ next_part: part_ptr = (struct part_record *) delete_job_details(job_ptr); } } + + if ((reject_array_job_id == job_ptr->array_job_id) && + (reject_array_part == job_ptr->part_ptr)) { + /* All other elements of this job array get the + * same reason */ + reject_state_reason = job_ptr->state_reason; + } } save_last_part_update = last_part_update; @@ -1313,7 +1343,7 @@ next_part: part_ptr = (struct part_record *) list_iterator_destroy(job_iterator); if (part_iterator) list_iterator_destroy(part_iterator); - } else { + } else if (job_queue) { FREE_NULL_LIST(job_queue); } xfree(sched_part_ptr); @@ -1366,6 +1396,15 @@ extern int sort_job_queue2(void *x, void *y) if (!has_resv1 && has_resv2) return 1; + if (job_rec1->part_ptr && job_rec2->part_ptr) { + p1 = job_rec1->part_ptr->priority; + p2 = job_rec2->part_ptr->priority; + if (p1 < p2) + return 1; + if (p1 > p2) + return -1; + } + if (job_rec1->job_ptr->part_ptr_list && job_rec1->job_ptr->priority_array) p1 = job_rec1->priority; @@ -1694,52 +1733,6 @@ static void _depend_list2str(struct job_record *job_ptr) list_iterator_destroy(depend_iter); } -/* - * Remove a dependency from within the job dependency string. - */ -static void _rm_dependency(struct job_record *job_ptr, - struct depend_spec *dep_ptr) -{ - int rmv_len; - char *base_off, *rmv_dep, *rmv_off; - - if (dep_ptr->array_task_id == INFINITE) { - rmv_dep = xstrdup_printf(":%u_*", dep_ptr->job_id); - } else if (dep_ptr->array_task_id != NO_VAL) { - rmv_dep = xstrdup_printf(":%u_%u", - dep_ptr->job_id, - dep_ptr->array_task_id); - } else { - rmv_dep = xstrdup_printf(":%u", dep_ptr->job_id); - } - rmv_len = strlen(rmv_dep); - base_off = job_ptr->details->dependency; - while ((rmv_off = strstr(base_off, rmv_dep))) { - if (isdigit(rmv_off[rmv_len])) { - /* Partial job ID match (e.g. "123" rather than "12") */ - base_off += rmv_len; - continue; - } - memmove(rmv_off, rmv_off + rmv_len, - strlen(rmv_off + rmv_len) + 1); - if (rmv_off[0] == ':') - continue; - if ((rmv_off == job_ptr->details->dependency) || - ! isalpha(rmv_off[-1])) - continue; - /* Remove dependency type also (e.g. 
"afterany"); */ - for (base_off = rmv_off - 1; - base_off > job_ptr->details->dependency; base_off--) { - if (!isalpha(base_off[0])) { - base_off++; - break; - } - } - memmove(base_off, rmv_off, strlen(rmv_off) + 1); - } - xfree(rmv_dep); -} - /* * Determine if a job's dependencies are met * RET: 0 = no dependencies @@ -1750,10 +1743,10 @@ extern int test_job_dependency(struct job_record *job_ptr) { ListIterator depend_iter, job_iterator; struct depend_spec *dep_ptr; - bool failure = false, depends = false, expands = false; + bool failure = false, depends = false, rebuild_str = false; List job_queue = NULL; bool run_now; - int count = 0, results = 0; + int results = 0; struct job_record *qjob_ptr, *djob_ptr; time_t now = time(NULL); /* For performance reasons with job arrays, we cache dependency @@ -1765,7 +1758,7 @@ extern int test_job_dependency(struct job_record *job_ptr) if ((job_ptr->details == NULL) || (job_ptr->details->depend_list == NULL) || - ((count = list_count(job_ptr->details->depend_list)) == 0)) + (list_count(job_ptr->details->depend_list) == 0)) return 0; if ((job_ptr->array_task_id != NO_VAL) && @@ -1774,6 +1767,8 @@ extern int test_job_dependency(struct job_record *job_ptr) (cache_job_ptr->job_id == cache_job_id) && (cache_job_ptr->array_job_id == job_ptr->array_job_id) && (cache_job_ptr->details) && + (cache_job_ptr->details->orig_dependency) && + (job_ptr->details->orig_dependency) && (!strcmp(cache_job_ptr->details->orig_dependency, job_ptr->details->orig_dependency))) { return cache_results; @@ -1782,7 +1777,6 @@ extern int test_job_dependency(struct job_record *job_ptr) depend_iter = list_iterator_create(job_ptr->details->depend_list); while ((dep_ptr = list_next(depend_iter))) { bool clear_dep = false; - count--; if (dep_ptr->array_task_id == INFINITE) { /* Advance to latest element of this job array */ dep_ptr->job_ptr = find_job_array_rec(dep_ptr->job_id, @@ -1853,7 +1847,6 @@ extern int test_job_dependency(struct job_record *job_ptr) } } else if (dep_ptr->depend_type == SLURM_DEPEND_EXPAND) { time_t now = time(NULL); - expands = true; if (IS_JOB_PENDING(dep_ptr->job_ptr)) { depends = true; } else if (IS_JOB_FINISHED(dep_ptr->job_ptr)) { @@ -1874,12 +1867,14 @@ extern int test_job_dependency(struct job_record *job_ptr) } else failure = true; if (clear_dep) { - _rm_dependency(job_ptr, dep_ptr); list_delete_item(depend_iter); + rebuild_str = true; } } list_iterator_destroy(depend_iter); - if (!depends && !expands && (count == 0)) + if (rebuild_str) + _depend_list2str(job_ptr); + if (list_count(job_ptr->details->depend_list) == 0) xfree(job_ptr->details->dependency); if (failure) @@ -1946,33 +1941,36 @@ extern int update_job_dependency(struct job_record *job_ptr, char *new_depend) /* dep_ptr->job_id = 0; set by xmalloc */ /* dep_ptr->job_ptr = NULL; set by xmalloc */ (void) list_append(new_depend_list, dep_ptr); - if ( *(tok + 9 ) == ',' ) { + if (tok[9] == ',') { tok += 10; continue; } - else - break; + if (tok[9] != '\0') + rc = ESLURM_DEPENDENCY; + break; } + /* Test for old format, just a job ID */ sep_ptr = strchr(tok, ':'); - if ((sep_ptr == NULL) && (job_id == 0)) { + if ((sep_ptr == NULL) && (tok[0] >= '0') && (tok[0] <= '9')) { job_id = strtol(tok, &sep_ptr, 10); if ((sep_ptr != NULL) && (sep_ptr[0] == '_')) { if (sep_ptr[1] == '*') { array_task_id = INFINITE; - sep_ptr++; + sep_ptr += 2; /* Past "_*" */ } else { array_task_id = strtol(sep_ptr+1, &sep_ptr, 10); } - } else + } else { array_task_id = NO_VAL; - if ((sep_ptr == NULL) || (sep_ptr[0] != '\0') || 
- (job_id == 0) || (job_id == job_ptr->job_id)) { + } + if ((sep_ptr == NULL) || + (job_id == 0) || (job_id == job_ptr->job_id) || + ((sep_ptr[0] != '\0') && (sep_ptr[0] != ','))) { rc = ESLURM_DEPENDENCY; break; } - /* old format, just a single job_id */ if (array_task_id == NO_VAL) { dep_job_ptr = find_job_record(job_id); if (!dep_job_ptr) { @@ -1988,23 +1986,31 @@ extern int update_job_dependency(struct job_record *job_ptr, char *new_depend) dep_job_ptr = find_job_array_rec(job_id, array_task_id); } - if (!dep_job_ptr) /* assume already done */ + if (dep_job_ptr) { + dep_ptr = xmalloc(sizeof(struct depend_spec)); + dep_ptr->array_task_id = array_task_id; + dep_ptr->depend_type = SLURM_DEPEND_AFTER_ANY; + if (array_task_id == NO_VAL) { + dep_ptr->job_id = dep_job_ptr->job_id; + } else { + dep_ptr->job_id = + dep_job_ptr->array_job_id; + } + dep_ptr->job_ptr = dep_job_ptr; + (void) list_append(new_depend_list, dep_ptr); + } + if (sep_ptr && (sep_ptr[0] == ',')) { + tok = sep_ptr + 1; + continue; + } else { break; - dep_ptr = xmalloc(sizeof(struct depend_spec)); - dep_ptr->array_task_id = array_task_id; - dep_ptr->depend_type = SLURM_DEPEND_AFTER_ANY; - if (array_task_id == NO_VAL) - dep_ptr->job_id = dep_job_ptr->job_id; - else - dep_ptr->job_id = dep_job_ptr->array_job_id; - dep_ptr->job_ptr = dep_job_ptr; - (void) list_append(new_depend_list, dep_ptr); - break; + } } else if (sep_ptr == NULL) { rc = ESLURM_DEPENDENCY; break; } + /* New format, <test>:job_ID */ if (strncasecmp(tok, "afternotok", 10) == 0) depend_type = SLURM_DEPEND_AFTER_NOT_OK; else if (strncasecmp(tok, "afterany", 8) == 0) @@ -2029,7 +2035,7 @@ extern int update_job_dependency(struct job_record *job_ptr, char *new_depend) if ((sep_ptr2 != NULL) && (sep_ptr2[0] == '_')) { if (sep_ptr2[1] == '*') { array_task_id = INFINITE; - sep_ptr++; + sep_ptr2 += 2; /* Past "_*" */ } else { array_task_id = strtol(sep_ptr2+1, &sep_ptr2, 10); @@ -2039,7 +2045,7 @@ extern int update_job_dependency(struct job_record *job_ptr, char *new_depend) if ((sep_ptr2 == NULL) || (job_id == 0) || (job_id == job_ptr->job_id) || ((sep_ptr2[0] != '\0') && (sep_ptr2[0] != ',') && - (sep_ptr2[0] != ':') && (sep_ptr2[0] != '_'))) { + (sep_ptr2[0] != ':'))) { rc = ESLURM_DEPENDENCY; break; } diff --git a/src/slurmctld/job_scheduler.h b/src/slurmctld/job_scheduler.h index aae313599..39a58370b 100644 --- a/src/slurmctld/job_scheduler.h +++ b/src/slurmctld/job_scheduler.h @@ -45,10 +45,11 @@ #include "src/slurmctld/slurmctld.h" typedef struct job_queue_rec { - uint32_t job_id; - struct job_record *job_ptr; - struct part_record *part_ptr; - uint32_t priority; + uint32_t job_id; /* Job ID */ + struct job_record *job_ptr; /* Pointer to job record */ + struct part_record *part_ptr; /* Pointer to partition record. Each + * job may have multiple partitions. 
*/ + uint32_t priority; /* Job priority in THIS partition */ } job_queue_rec_t; /* diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 632cc2756..e84dc1ebc 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -1605,6 +1605,8 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, max_nodes = MIN(job_ptr->details->max_nodes, part_ptr->max_nodes); + max_nodes = MIN(max_nodes, acct_policy_get_max_nodes(job_ptr)); + if (job_ptr->details->req_node_bitmap && job_ptr->details->max_nodes) { i = bit_set_count(job_ptr->details->req_node_bitmap); if (i > job_ptr->details->max_nodes) { diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 4662a576e..c6fad7684 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -211,6 +211,7 @@ extern int association_based_accounting; extern uint32_t cluster_cpus; extern int with_slurmdbd; extern bool load_2_4_state; +extern int batch_sched_delay; extern int sched_interval; extern bool slurmctld_init_db; diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index e2273b6cc..be7cb6b55 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -241,6 +241,8 @@ static pthread_mutex_t suspend_mutex = PTHREAD_MUTEX_INITIALIZER; static uint32_t job_suspend_array[NUM_PARALLEL_SUSPEND]; static int job_suspend_size = 0; +static pthread_mutex_t prolog_mutex = PTHREAD_MUTEX_INITIALIZER; + void slurmd_req(slurm_msg_t *msg) { @@ -1114,6 +1116,7 @@ _rpc_launch_tasks(slurm_msg_t *msg) req->envc = envcount(req->env); #ifndef HAVE_FRONT_END + slurm_mutex_lock(&prolog_mutex); first_job_run = !slurm_cred_jobid_cached(conf->vctx, req->job_id); #endif if (_check_job_credential(req, req_uid, nodeid, &step_hset, @@ -1121,6 +1124,9 @@ _rpc_launch_tasks(slurm_msg_t *msg) errnum = errno; error("Invalid job credential from %ld@%s: %m", (long) req_uid, host); +#ifndef HAVE_FRONT_END + slurm_mutex_unlock(&prolog_mutex); +#endif goto done; } @@ -1130,6 +1136,8 @@ _rpc_launch_tasks(slurm_msg_t *msg) job_env_t job_env; slurm_cred_insert_jobid(conf->vctx, req->job_id); + _add_job_running_prolog(req->job_id); + slurm_mutex_unlock(&prolog_mutex); if (container_g_create(req->job_id)) error("container_g_create(%u): %m", req->job_id); @@ -1159,9 +1167,10 @@ _rpc_launch_tasks(slurm_msg_t *msg) errnum = ESLURMD_PROLOG_FAILED; goto done; } - } else + } else { + slurm_mutex_unlock(&prolog_mutex); _wait_for_job_running_prolog(req->job_id); - + } #endif if (req->job_mem_lim || req->step_mem_lim) { @@ -1464,10 +1473,13 @@ static void _rpc_prolog(slurm_msg_t *msg) if (container_g_create(req->job_id)) error("container_g_create(%u): %m", req->job_id); + slurm_mutex_lock(&prolog_mutex); first_job_run = !slurm_cred_jobid_cached(conf->vctx, req->job_id); if (first_job_run) { slurm_cred_insert_jobid(conf->vctx, req->job_id); + _add_job_running_prolog(req->job_id); + slurm_mutex_unlock(&prolog_mutex); memset(&job_env, 0, sizeof(job_env_t)); @@ -1501,7 +1513,8 @@ static void _rpc_prolog(slurm_msg_t *msg) req->job_id, exit_status, term_sig); rc = ESLURMD_PROLOG_FAILED; } - } + } else + slurm_mutex_unlock(&prolog_mutex); if (!(slurmctld_conf.prolog_flags & PROLOG_FLAG_NOHOLD)) _notify_slurmctld_prolog_fini(req->job_id, rc); @@ -1535,6 +1548,7 @@ _rpc_batch_job(slurm_msg_t *msg, bool new_msg) task_g_slurmd_batch_request(req->job_id, req); /* determine task affinity */ + slurm_mutex_lock(&prolog_mutex); first_job_run = !slurm_cred_jobid_cached(conf->vctx, req->job_id); /* BlueGene prolog 
waits for partition boot and is very slow. @@ -1551,6 +1565,7 @@ _rpc_batch_job(slurm_msg_t *msg, bool new_msg) error("Could not confirm batch launch for job %u, " "aborting request", req->job_id); rc = SLURM_COMMUNICATIONS_SEND_ERROR; + slurm_mutex_unlock(&prolog_mutex); goto done; } @@ -1560,8 +1575,9 @@ _rpc_batch_job(slurm_msg_t *msg, bool new_msg) */ if (first_job_run) { job_env_t job_env; - slurm_cred_insert_jobid(conf->vctx, req->job_id); + _add_job_running_prolog(req->job_id); + slurm_mutex_unlock(&prolog_mutex); memset(&job_env, 0, sizeof(job_env_t)); @@ -1603,8 +1619,10 @@ _rpc_batch_job(slurm_msg_t *msg, bool new_msg) rc = ESLURMD_PROLOG_FAILED; goto done; } - } else + } else { + slurm_mutex_unlock(&prolog_mutex); _wait_for_job_running_prolog(req->job_id); + } _get_user_env(req); _set_batch_job_limits(msg); @@ -4060,10 +4078,11 @@ _rpc_terminate_job(slurm_msg_t *msg) */ if (_waiter_init(req->job_id) == SLURM_ERROR) { if (msg->conn_fd >= 0) { - if (_step_is_starting(req->job_id, NO_VAL)) - slurm_send_rc_msg (msg, EAGAIN); - else - slurm_send_rc_msg (msg, SLURM_SUCCESS); + /* No matter if the step hasn't started yet or + * not just send a success to let the + * controller know we got this request. + */ + slurm_send_rc_msg (msg, SLURM_SUCCESS); } return; } @@ -4087,8 +4106,12 @@ _rpc_terminate_job(slurm_msg_t *msg) */ if (_step_is_starting(req->job_id, NO_VAL)) { if (msg->conn_fd >= 0) { - debug4("sent EAGAIN"); - slurm_send_rc_msg (msg, EAGAIN); + /* If the step hasn't started yet just send a + * success to let the controller know we got + * this request. + */ + debug("sent SUCCESS, waiting for step to start"); + slurm_send_rc_msg (msg, SLURM_SUCCESS); if (slurm_close_accepted_conn(msg->conn_fd) < 0) error ( "rpc_kill_job: close(%d): %m", msg->conn_fd); @@ -4624,7 +4647,6 @@ _run_prolog(job_env_t *job_env) slurm_mutex_lock(&conf->config_mutex); my_prolog = xstrdup(conf->prolog); slurm_mutex_unlock(&conf->config_mutex); - _add_job_running_prolog(job_env->jobid); rc = _run_job_script("prolog", my_prolog, job_env->jobid, -1, my_env, job_env->uid); @@ -4694,7 +4716,6 @@ _run_prolog(job_env_t *job_env) slurm_mutex_lock(&conf->config_mutex); my_prolog = xstrdup(conf->prolog); slurm_mutex_unlock(&conf->config_mutex); - _add_job_running_prolog(job_env->jobid); slurm_attr_init(&timer_attr); timer_struct.job_id = job_env->jobid; @@ -4709,9 +4730,6 @@ _run_prolog(job_env_t *job_env) prolog_fini = true; pthread_cond_broadcast(&timer_cond); slurm_mutex_unlock(&timer_mutex); - _remove_job_running_prolog(job_env->jobid); - xfree(my_prolog); - _destroy_env(my_env); diff_time = difftime(time(NULL), start_time); if (diff_time >= (msg_timeout / 2)) { @@ -4719,6 +4737,10 @@ _run_prolog(job_env_t *job_env) job_env->jobid, diff_time); } + _remove_job_running_prolog(job_env->jobid); + xfree(my_prolog); + _destroy_env(my_env); + pthread_join(timer_id, NULL); return rc; } diff --git a/src/slurmd/slurmstepd/fname.c b/src/slurmd/slurmstepd/fname.c index c606c2b4c..402f010e1 100644 --- a/src/slurmd/slurmstepd/fname.c +++ b/src/slurmd/slurmstepd/fname.c @@ -101,29 +101,34 @@ fname_create(stepd_step_rec_t *job, const char *format, int taskid) switch (*p) { case 'a': /* '%a' => array task id */ xmemcat(name, q, p - 1); - xstrfmtcat(name, "%0*d", wid, + xstrfmtcat(name, "%0*u", wid, job->array_task_id); q = ++p; break; case 'A': /* '%A' => array master job id */ xmemcat(name, q, p - 1); - xstrfmtcat(name, "%0*d", wid, - job->array_job_id); + if (job->array_task_id == NO_VAL) { + xstrfmtcat(name, "%0*u", wid, + 
job->jobid); + } else { + xstrfmtcat(name, "%0*u", wid, + job->array_job_id); + } q = ++p; break; case 's': /* '%s' => step id */ xmemcat(name, q, p - 1); - xstrfmtcat(name, "%0*d", wid, job->stepid); + xstrfmtcat(name, "%0*u", wid, job->stepid); q = ++p; break; case 't': /* '%t' => taskid */ xmemcat(name, q, p - 1); - xstrfmtcat(name, "%0*d", wid, taskid); + xstrfmtcat(name, "%0*u", wid, taskid); q = ++p; break; case 'n': /* '%n' => nodeid */ xmemcat(name, q, p - 1); - xstrfmtcat(name, "%0*d", wid, job->nodeid); + xstrfmtcat(name, "%0*u", wid, job->nodeid); q = ++p; break; case 'N': /* '%N' => node name */ @@ -142,10 +147,10 @@ fname_create(stepd_step_rec_t *job, const char *format, int taskid) case 'J': /* '%J' => jobid.stepid */ case 'j': /* '%j' => jobid */ xmemcat(name, q, p - 1); - xstrfmtcat(name, "%0*d", wid, job->jobid); + xstrfmtcat(name, "%0*u", wid, job->jobid); if ((*p == 'J') && (job->stepid != NO_VAL)) - xstrfmtcat(name, ".%d", job->stepid); + xstrfmtcat(name, ".%u", job->stepid); q = ++p; break; diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c index cc2ebd0c3..2fb824fea 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.c +++ b/src/slurmd/slurmstepd/slurmstepd_job.c @@ -77,30 +77,6 @@ static void _job_init_task_info(stepd_step_rec_t *job, uint32_t **gtid, char *ifname, char *ofname, char *efname); static void _task_info_destroy(stepd_step_task_info_t *t, uint16_t multi_prog); -static int _check_acct_freq_task(uint32_t job_mem_lim, char *acctg_freq) -{ - int task_freq; - - if (!job_mem_lim || !conf->acct_freq_task) - return 0; - - task_freq = acct_gather_parse_freq(PROFILE_TASK, acctg_freq); - - if (task_freq == -1) - return 0; - - if ((task_freq == 0) || (task_freq > conf->acct_freq_task)) { - error("Can't set frequency to %d, it is higher than %u. " - "We need it to be at least at this level to " - "monitor memory usage.", - task_freq, conf->acct_freq_task); - slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ); - return 1; - } - - return 0; -} - /* returns 0 if invalid gid, otherwise returns 1. Set gid with * correct gid if root launched job. Also set user_name * if not already set. 
*/ @@ -318,7 +294,7 @@ stepd_step_rec_create(launch_tasks_request_msg_t *msg) if (!_valid_uid_gid((uid_t)msg->uid, &(msg->gid), &(msg->user_name))) return NULL; - if (_check_acct_freq_task(msg->job_mem_lim, msg->acctg_freq)) + if (acct_gather_check_acct_freq_task(msg->job_mem_lim, msg->acctg_freq)) return NULL; job = xmalloc(sizeof(stepd_step_rec_t)); @@ -363,7 +339,7 @@ stepd_step_rec_create(launch_tasks_request_msg_t *msg) job->env = _array_copy(msg->envc, msg->env); job->array_job_id = msg->job_id; - job->array_task_id = (uint16_t) NO_VAL; + job->array_task_id = NO_VAL; for (i = 0; i < msg->envc; i++) { /* 1234567890123456789 */ if (!strncmp(msg->env[i], "SLURM_ARRAY_JOB_ID=", 19)) @@ -491,7 +467,7 @@ batch_stepd_step_rec_create(batch_job_launch_msg_t *msg) if (!_valid_uid_gid((uid_t)msg->uid, &(msg->gid), &(msg->user_name))) return NULL; - if (_check_acct_freq_task(msg->job_mem, msg->acctg_freq)) + if (acct_gather_check_acct_freq_task(msg->job_mem, msg->acctg_freq)) return NULL; job = xmalloc(sizeof(stepd_step_rec_t)); diff --git a/src/slurmd/slurmstepd/slurmstepd_job.h b/src/slurmd/slurmstepd/slurmstepd_job.h index a0fdfb6e5..78ea976e3 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.h +++ b/src/slurmd/slurmstepd/slurmstepd_job.h @@ -130,7 +130,7 @@ typedef struct { uint32_t jobid; /* Current SLURM job id */ uint32_t stepid; /* Current step id (or NO_VAL) */ uint32_t array_job_id; /* job array master job ID */ - uint16_t array_task_id; /* job array ID */ + uint32_t array_task_id; /* job array ID */ uint32_t nnodes; /* number of nodes in current job */ uint32_t ntasks; /* total number of tasks in current job */ uint32_t nodeid; /* relative position of this node in job */ diff --git a/src/squeue/print.c b/src/squeue/print.c index 868dd474c..1efc20fcc 100644 --- a/src/squeue/print.c +++ b/src/squeue/print.c @@ -157,16 +157,21 @@ static bool _merge_job_array(List l, job_info_t * job_ptr) return merge; if (!IS_JOB_PENDING(job_ptr)) return merge; + if (IS_JOB_COMPLETING(job_ptr)) + return merge; xfree(job_ptr->node_inx); if (!l) return merge; iter = list_iterator_create(l); while ((list_job_ptr = list_next(iter))) { - if ((list_job_ptr->array_task_id == NO_VAL) || - (job_ptr->array_job_id != list_job_ptr->array_job_id) || - (!IS_JOB_PENDING(list_job_ptr))) + + if ((list_job_ptr->array_task_id == NO_VAL) + || (job_ptr->array_job_id != list_job_ptr->array_job_id) + || (!IS_JOB_PENDING(list_job_ptr)) + || (IS_JOB_COMPLETING(list_job_ptr))) continue; + /* We re-purpose the job's node_inx array to store the * array_task_id values */ if (!list_job_ptr->node_inx) { @@ -396,9 +401,11 @@ int _print_job_job_id(job_info_t * job, int width, bool right, char* suffix) { if (job == NULL) { /* Print the Header instead */ _print_str("JOBID", width, right, true); - } else if ((job->array_task_id != NO_VAL) && - !params.array_flag && IS_JOB_PENDING(job) && - job->node_inx) { + } else if ((job->array_task_id != NO_VAL) + && !params.array_flag + && IS_JOB_PENDING(job) + && job->node_inx + && (!IS_JOB_COMPLETING(job))) { uint32_t i, local_width = width, max_task_id = 0; char *id, *task_str; bitstr_t *task_bits; diff --git a/src/srun/libsrun/allocate.c b/src/srun/libsrun/allocate.c index 9c2bef3c4..20f193892 100644 --- a/src/srun/libsrun/allocate.c +++ b/src/srun/libsrun/allocate.c @@ -675,6 +675,8 @@ job_desc_msg_create_from_opts (void) j->dependency = opt.dependency; if (opt.nice) j->nice = NICE_OFFSET + opt.nice; + if (opt.priority) + j->priority = opt.priority; if (opt.cpu_bind) j->cpu_bind = opt.cpu_bind; 
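[Editor's note] The hunks above and below add the same --priority plumbing to salloc, sbatch and srun: a LONG_OPT_PRIORITY case that range-checks the argument, an opt.priority field, and a copy into the job descriptor only when the value is non-zero. A minimal, self-contained sketch of that pattern follows; the struct names, the parse_priority/fill_job_desc helpers and the explicit end-pointer check are illustrative simplifications rather than code from this patch, and NO_VAL is assumed to be Slurm's usual 0xfffffffe "unset" sentinel.

/* Sketch of the --priority validation/propagation pattern (assumptions noted above). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NO_VAL (0xfffffffe)       /* assumed Slurm sentinel for "unset" */

/* Hypothetical stand-ins for the opt_t and job_desc_msg_t fields touched here. */
struct opts     { uint32_t priority; };
struct job_desc { uint32_t priority; };

/* Parse and range-check a --priority argument; returns 0 on success, -1 on error.
 * The end-pointer check is stricter than the patch, which passes NULL to strtoll(). */
static int parse_priority(const char *arg, struct opts *opt)
{
	char *end = NULL;
	long long priority = strtoll(arg, &end, 10);

	if (end == arg || *end != '\0') {
		fprintf(stderr, "Invalid priority: %s\n", arg);
		return -1;
	}
	if (priority < 0) {
		fprintf(stderr, "Priority must be >= 0\n");
		return -1;
	}
	if (priority >= NO_VAL) {
		fprintf(stderr, "Priority must be < %u\n", NO_VAL);
		return -1;
	}
	opt->priority = (uint32_t) priority;
	return 0;
}

/* Mirrors the _fill_job_desc_from_opts() hunks: 0 means "not requested",
 * so the descriptor is only overwritten for an explicit non-zero value. */
static void fill_job_desc(const struct opts *opt, struct job_desc *desc)
{
	if (opt->priority)
		desc->priority = opt->priority;
}

int main(int argc, char **argv)
{
	struct opts opt = { 0 };
	struct job_desc desc = { NO_VAL };

	if (argc > 1 && parse_priority(argv[1], &opt) != 0)
		return EXIT_FAILURE;
	fill_job_desc(&opt, &desc);
	printf("requested priority: %u\n", desc.priority);
	return EXIT_SUCCESS;
}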
diff --git a/src/srun/libsrun/opt.c b/src/srun/libsrun/opt.c index c3b22deba..cfbf5e109 100644 --- a/src/srun/libsrun/opt.c +++ b/src/srun/libsrun/opt.c @@ -87,6 +87,7 @@ #include "src/common/uid.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" +#include "src/common/util-net.h" #include "src/api/pmi_server.h" @@ -195,6 +196,8 @@ #define LONG_OPT_LAUNCH_CMD 0x156 #define LONG_OPT_PROFILE 0x157 #define LONG_OPT_EXPORT 0x158 +#define LONG_OPT_PRIORITY 0x160 + extern char **environ; @@ -527,6 +530,9 @@ static void _opt_default() opt.wait4switch = -1; opt.launcher_opts = NULL; opt.launch_cmd = false; + + opt.nice = 0; + opt.priority = 0; } /*---[ env var processing ]-----------------------------------------------*/ @@ -909,6 +915,7 @@ static void _set_options(const int argc, char **argv) {"ntasks-per-node", required_argument, 0, LONG_OPT_NTASKSPERNODE}, {"ntasks-per-socket",required_argument, 0, LONG_OPT_NTASKSPERSOCKET}, {"open-mode", required_argument, 0, LONG_OPT_OPEN_MODE}, + {"priority", required_argument, 0, LONG_OPT_PRIORITY}, {"profile", required_argument, 0, LONG_OPT_PROFILE}, {"prolog", required_argument, 0, LONG_OPT_PROLOG}, {"propagate", optional_argument, 0, LONG_OPT_PROPAGATE}, @@ -1003,7 +1010,10 @@ static void _set_options(const int argc, char **argv) case (int)'D': opt.cwd_set = true; xfree(opt.cwd); - opt.cwd = xstrdup(optarg); + if (is_full_path(optarg)) + opt.cwd = xstrdup(optarg); + else + opt.cwd = make_full_path(optarg); break; case (int)'e': if (opt.pty) { @@ -1416,6 +1426,19 @@ static void _set_options(const int argc, char **argv) } } break; + case LONG_OPT_PRIORITY: { + long long priority = strtoll(optarg, NULL, 10); + if (priority < 0) { + error("Priority must be >= 0"); + exit(error_exit); + } + if (priority >= NO_VAL) { + error("Priority must be < %i", NO_VAL); + exit(error_exit); + } + opt.priority = priority; + break; + } case LONG_OPT_MULTI: opt.multi_prog = true; break; @@ -2544,6 +2567,7 @@ static void _help(void) " -o, --output=out location of stdout redirection\n" " -O, --overcommit overcommit resources\n" " -p, --partition=partition partition requested\n" +" --priority=value set the priority of the job to value\n" " --prolog=program run \"program\" before launching job step\n" " --profile=value enable acct_gather_profile for detailed data\n" " value is all or none or any combination of\n" diff --git a/src/srun/libsrun/opt.h b/src/srun/libsrun/opt.h index a81fbef9f..f0a1cd218 100644 --- a/src/srun/libsrun/opt.h +++ b/src/srun/libsrun/opt.h @@ -138,6 +138,7 @@ typedef struct srun_options { char *mpi_type; /* --mpi=type */ char *dependency; /* --dependency, -P type:jobid */ int nice; /* --nice */ + uint32_t priority; /* --priority */ char *account; /* --account, -U acct_name */ char *comment; /* --comment */ char *qos; /* --qos */ diff --git a/src/srun/libsrun/srun_job.c b/src/srun/libsrun/srun_job.c index d1e8ad5ff..ca33725d6 100644 --- a/src/srun/libsrun/srun_job.c +++ b/src/srun/libsrun/srun_job.c @@ -523,6 +523,13 @@ extern void create_srun_job(srun_job_t **p_job, bool *got_alloc, "within an existing job. Set specialized cores " "at job allocation time."); } +#ifdef HAVE_NATIVE_CRAY + if (opt.network) { + error("Ignoring --network value for a job step " + "within an existing job. 
Set network options " + "at job allocation time."); + } +#endif if (opt.alloc_nodelist == NULL) opt.alloc_nodelist = xstrdup(resp->node_list); if (opt.exclusive) diff --git a/testsuite/expect/globals b/testsuite/expect/globals index b68e4dbbd..ec588fad1 100755 --- a/testsuite/expect/globals +++ b/testsuite/expect/globals @@ -1761,11 +1761,11 @@ proc test_super_user { } { ################################################################ # -# Proc: dec2hex16 +# Proc: dec2hex # -# Purpose: Create a 16 bit hex number from a signed decimal number +# Purpose: Create a 32 bit hex number from a signed decimal number # -# Returns: 16 bit hex version of input 'value' +# Returns: 32 bit hex version of input 'value' # # Input: value -- decimal number to convert # @@ -1773,43 +1773,42 @@ proc test_super_user { } { # http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982 ################################################################ # Replace all non-decimal characters -proc dec2hex16 {value} { +proc dec2hex {value} { regsub -all {[^0-x\.-]} $value {} newtemp set value [string trim $newtemp] - if {$value < 32767 && $value > -32768} { + if {$value < 2147483647 && $value > -2147483648} { set tempvalue [format "%#010X" [expr $value]] - return [string range $tempvalue 6 9] - } elseif {$value < 32768} { - return "8000" + return [string range $tempvalue 2 9] + } elseif {$value < -2147483647} { + return "80000000" } else { - return "7FFF" + return "7FFFFFFF" } } ################################################################ # -# Proc: dec2hex32 +# Proc: uint2hex # -# Purpose: Create a 32 bit hex number from a signed decimal number +# Purpose: Create a 32 bit hex number from an unsigned decimal +# number. # # Returns: 32 bit hex version of input 'value' # -# Input: value -- decimal number to convert +# Input: value -- unsigneddecimal number to convert # # Courtesy of Chris Cornish # http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982 ################################################################ # Replace all non-decimal characters -proc dec2hex {value} { +proc uint2hex {value} { regsub -all {[^0-x\.-]} $value {} newtemp set value [string trim $newtemp] - if {$value < 2147483647 && $value > -2147483648} { + if {$value <= 4294967295 && $value >= 0} { set tempvalue [format "%#010X" [expr $value]] return [string range $tempvalue 2 9] - } elseif {$value < -2147483647} { - return "80000000" } else { - return "7FFFFFFF" + return "FFFFFFFF" } } diff --git a/testsuite/expect/globals_accounting b/testsuite/expect/globals_accounting index 9535f0dc7..20c458a67 100644 --- a/testsuite/expect/globals_accounting +++ b/testsuite/expect/globals_accounting @@ -152,7 +152,7 @@ maxcpumin maxcpu maxjob maxnode maxsubmit maxwall} { timeout { send_user "\nFAILURE: sacctmgr add not responding\n" slow_kill $my_pid - exit_code 1 + incr exit_code 1 } eof { wait @@ -256,7 +256,7 @@ proc add_cluster2 { name clus_req_in } { timeout { send_user "\nFAILURE: sacctmgr add not responding\n" slow_kill $my_pid - exit_code 1 + incr exit_code 1 } eof { wait @@ -1264,7 +1264,7 @@ proc add_qos {name} { timeout { send_user "\nFAILURE: sacctmgr add not responding\n" slow_kill $my_pid - exit_code 1 + incr exit_code 1 } eof { wait @@ -1590,7 +1590,7 @@ proc add_res {name count manager server desc} { timeout { send_user "\nFAILURE: sacctmgr add not responding\n" slow_kill $my_pid - exit_code 1 + incr exit_code 1 } eof { wait @@ -1647,7 +1647,7 @@ proc add_clus_res {name allowed} { timeout { send_user "\nFAILURE: sacctmgr add not 
responding\n" slow_kill $my_pid - exit_code 1 + incr exit_code 1 } eof { wait diff --git a/testsuite/expect/inc21.21.4 b/testsuite/expect/inc21.21.4 index cf907574a..58858482d 100644 --- a/testsuite/expect/inc21.21.4 +++ b/testsuite/expect/inc21.21.4 @@ -38,7 +38,7 @@ proc inc21_21_4 {maxwall } { # spawn $srun -t[expr $maxwall + 1] --account=$ta $bin_id expect { - -re "Job violates accounting policy" { + -re "Job violates accounting/QOS policy" { send_user "\nThis error is expected, not a problem (Within: inc21.21.4)\n" exp_continue } diff --git a/testsuite/expect/inc21.30.2 b/testsuite/expect/inc21.30.2 index ffd830a3e..ef1461f57 100644 --- a/testsuite/expect/inc21.30.2 +++ b/testsuite/expect/inc21.30.2 @@ -43,7 +43,7 @@ proc inc21_30_2 {} { incr jobmatch } -re "Pending job allocation ($number)" { - send_user "\nFAILURE: Job should be running, but is not. (Within: inc21.30.2)\n" + send_user "\nFAILURE: Job should be running, but is not. If you have CR_CORE_* and have ThreadsPerCore > 1 this could happen. (Within: inc21.30.2)\n" set job_id1 $expect_out(1,string) set exit_code 1 } @@ -69,7 +69,7 @@ proc inc21_30_2 {} { incr jobmatch } timeout { - send_user "\nFAILURE: salloc is not reponding (Within: inc21.30.2)\n" + send_user "\nFAILURE: salloc is not reponding (Within: inc21.30.2) If you have CR_CORE_* and have ThreadsPerCore > 1 this could happen. \n" set exit_code 1 } eof { diff --git a/testsuite/expect/inc21.30.5 b/testsuite/expect/inc21.30.5 index 2a4187c6a..ffc0ec3ad 100644 --- a/testsuite/expect/inc21.30.5 +++ b/testsuite/expect/inc21.30.5 @@ -28,7 +28,7 @@ # Test MaxCpus limits proc inc21_30_5 {} { - global salloc acct number srun bin_sleep maxcpu_num + global salloc acct number srun bin_sleep maxcpu_num exit_code send_user "\nStarting MaxCPUs limit test (Within: inc21.30.5)\n\n" set job_id1 0 diff --git a/testsuite/expect/inc21.30.6 b/testsuite/expect/inc21.30.6 index 1fb093182..2bb4438a0 100644 --- a/testsuite/expect/inc21.30.6 +++ b/testsuite/expect/inc21.30.6 @@ -28,7 +28,7 @@ # Test MaxNode limit proc inc21_30_6 {} { - global salloc acct number srun job_id1 bin_sleep maxnode_num + global salloc acct number srun job_id1 bin_sleep maxnode_num exit_code send_user "\nStarting MaxNode limit test (Within: inc21.30.6)\n\n" set job_id1 0 diff --git a/testsuite/expect/test1.89 b/testsuite/expect/test1.89 index aafd52f58..1ac192f44 100755 --- a/testsuite/expect/test1.89 +++ b/testsuite/expect/test1.89 @@ -149,6 +149,31 @@ expect { } } +if {$task_cnt > 32} { + send "exit\r" + expect { + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + } + timeout { + send_user "\nFAILURE: salloc not responding " + send_user "or failure to recognize prompt\n" + slow_kill $salloc_pid + set exit_code 1 + } + eof { + wait + } + } + + if {$exit_code == 0} { + exec $bin_rm -f $file_prog + send_user "\nWARNING: Expect unable to work with more than 32-bit numbers\n" + } + exit $exit_code +} + # # Run a job step with affinity # @@ -285,7 +310,7 @@ set cpu_cnt 0 while {$cpu_cnt < $task_cnt} { set mask_sum 0 set mask [ expr 1 << $cpu_cnt ] - set mstr [ dec2hex $mask] + set mstr [ uint2hex $mask ] send "$srun -c1 --cpu_bind=mask_cpu:$mstr ./$file_prog\r" expect { -re "TASK_ID:($number),MASK:($number)" { @@ -326,7 +351,7 @@ set full_mask [ expr (1 << $task_cnt) - 1 ] while {$cpu_cnt < $task_cnt} { set mask_sum 0 set mask [ expr 1 << $cpu_cnt ] - set mstr [ dec2hex $mask] + set mstr [ uint2hex $mask ] set fwd_mask "$fwd_mask,$mstr" set fwd_map "$fwd_map,$cpu_cnt" set rev_mask 
"$mstr,$rev_mask" @@ -555,4 +580,3 @@ if {$exit_code == 0} { send_user " or if Shared=FORCE for the default partition.\n" } exit $exit_code - diff --git a/testsuite/expect/test1.90 b/testsuite/expect/test1.90 index 35f9ec428..c0dd3c325 100755 --- a/testsuite/expect/test1.90 +++ b/testsuite/expect/test1.90 @@ -336,7 +336,7 @@ set cpu_cnt 0 while {$cpu_cnt < $task_cnt} { set mask_sum 0 set mask [ expr 1 << $cpu_cnt ] - set mstr [ dec2hex $mask] + set mstr [ uint2hex $mask ] send "$srun -n $task_cnt --mem_bind=mask_mem:$mstr ./$file_prog\r" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { @@ -377,7 +377,7 @@ set full_mask [ expr (1 << $task_cnt) - 1 ] while {$cpu_cnt < $task_cnt} { set mask_sum 0 set mask [ expr 1 << $cpu_cnt ] - set mstr [ dec2hex $mask] + set mstr [ uint2hex $mask ] set fwd_mask "$fwd_mask,$mstr" set fwd_map "$fwd_map,$cpu_cnt" set rev_mask "$mstr,$rev_mask" diff --git a/testsuite/expect/test21.21 b/testsuite/expect/test21.21 index 6d99b14fd..eec3aa709 100755 --- a/testsuite/expect/test21.21 +++ b/testsuite/expect/test21.21 @@ -135,7 +135,7 @@ proc _test_limits { } { # then submit one more over the limit and it should fail set mypid [spawn $sbatch -N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in] expect { - -re "Job violates accounting policy" { + -re "Job violates accounting/QOS policy" { send_user "\nThis error is expected, not a problem\n" exp_continue } @@ -162,7 +162,7 @@ proc _test_limits { } { if { $exit_code } { return $exit_code } - + sleep 2 set matches 0 set mypid [spawn $squeue -o "\%i \%t \%r"] expect { diff --git a/testsuite/expect/test3.4 b/testsuite/expect/test3.4 index 367771ca4..3c1fd7f50 100755 --- a/testsuite/expect/test3.4 +++ b/testsuite/expect/test3.4 @@ -97,7 +97,27 @@ if {$read_priority != 0} { # # Change that job's priority # -spawn $scontrol update JobId=$job_id Priority=$new_prio +spawn $scontrol release $job_id +expect { + -re "slurm_update error: Access.*denied" { + send_user "\nWARNING: User not authorized to modify jobs\n" + cancel_job $job_id + exit $exit_code + exp_continue + } + timeout { + send_user "\nFAILURE: scontrol not responding\n" + set exit_code 1 + } + eof { + wait + } +} + +# +# Change that job's priority +# +spawn $scontrol update Jobid=$job_id Priority=$new_prio expect { -re "slurm_update error: Access.*denied" { send_user "\nWARNING: User not authorized to modify jobs\n" -- GitLab