diff --git a/META b/META index d0ca0a54fd18bfe62bed71bd1dd8db2333a69f76..8a6458fac4f59fc1bd59612d9862544939a04f6f 100644 --- a/META +++ b/META @@ -9,8 +9,8 @@ Name: slurm Major: 14 Minor: 11 - Micro: 1 - Version: 14.11.1 + Micro: 2 + Version: 14.11.2 Release: 1 ## diff --git a/NEWS b/NEWS index 30cef962b996419a2fd5b35b84b2a7c51c871ff9..905ceb9b719ef0202f878cddf13933dacfe76e61 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,44 @@ This file describes changes in recent versions of Slurm. It primarily documents those changes that are of interest to users and administrators. +* Changes in Slurm 14.11.2 +========================== + -- Fix Centos5 compile errors. + -- Fix issue with association hash not getting the correct index which + could result in a seg fault. + -- Fix salloc/sbatch -B segfault. + -- Avoid huge malloc if GRES configured with "Type" and huge "Count". + -- Fix jobs from starting in overlapping reservations that won't finish before + a "maint" reservation begins. + -- When a node gets drained while in state mixed, display its status as draining + in sinfo output. + -- Allow priority/multifactor to work with sched/wiki(2) if all priorities + have no weight. This allows for association and QOS decay limits to work. + -- Fix "squeue --start" to override SQUEUE_FORMAT env variable. + -- Fix scancel to be able to cancel multiple jobs that are space delimited. + -- Log Cray MPI job calling exit() without mpi_fini(), but do not treat it as + a fatal error. This partially reverts logic added in version 14.03.9. + -- sview - Fix displaying of suspended steps elapsed times. + -- Increase number of messages that get cached before throwing them away + when the DBD is down. + -- Restore GRES functionality with select/linear plugin. It was broken in + version 14.03.10. + -- Fix bug with GRES having multiple types that can cause slurmctld abort. + -- Fix squeue issue with not recognizing "localhost" in --nodelist option. + -- Make sure the bitstrings for a partition's Allow/DenyQOS are up to date + when running from cache. + -- Add smap support for job arrays and larger job ID values. + -- Fix possible race condition when attempting to use QOS on a system running + accounting_storage/filetxt. + -- Fix issue with accounting_storage/filetxt and job arrays not being printed + correctly. + -- In proctrack/linuxproc and proctrack/pgid, check the result of strtol() + for error condition rather than errno, which might have a vestigial error + code. + -- Improve information recording for jobs deferred due to advanced reservation. + -- Export eio_new_initial_obj to the plugins and initialize kvs_seq on mpi/pmi2 + setup to support launching. + * Changes in Slurm 14.11.1 ========================== -- Get libs correct when doing the xtree/xhash make check. @@ -324,9 +364,13 @@ documents those changes that are of interest to users and administrators. -- When a job dependency can never be satisfied do not cancel the job but keep pending with reason WAIT_DEP_INVALID (DependencyNeverSatisfied). +* Changes in Slurm 14.03.12 +=========================== + * Changes in Slurm 14.03.11 =========================== - -- ALPS - Fix depth for Memory items in BASIL with CLE 5.2. + -- ALPS - Fix depth for Memory items in BASIL with CLE 5.2 + (changed starting in 5.2.3). -- ALPS - Fix issue when tracking memory on a PerNode basis instead of PerCPU.
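The proctrack entry above (implemented later in this patch in kill_tree.c and proctrack_pgid.c) drops the errno test because errno can hold a leftover value from an earlier library call. As a general illustration of the pattern being described, and not code taken from either plugin, a strtol() caller can rely on the return value and end pointer instead; the parse_pid() helper below is hypothetical:

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helper: trust strtol()'s return value and end pointer
 * rather than an errno value that may be vestigial. */
static int parse_pid(const char *str, long *pid)
{
	char *endptr = NULL;
	long val;

	errno = 0;                                   /* clear any stale error code */
	val = strtol(str, &endptr, 10);
	if ((val == LONG_MIN) || (val == LONG_MAX))  /* out of range */
		return -1;
	if ((endptr == str) || (*endptr != '\0'))    /* no digits or trailing junk */
		return -1;
	*pid = val;
	return 0;
}

int main(void)
{
	long pid = 0;

	if (parse_pid("12345", &pid) == 0)
		printf("parsed pid %ld\n", pid);
	return 0;
}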
-- Modify assoc_mgr_fill_in_qos() to allow for a flag to know if the QOS read @@ -350,6 +394,20 @@ documents those changes that are of interest to users and administrators. -- ALPS - Fix --ntasks-per-core option on multiple nodes. -- Double max string that Slurm can pack from 16MB to 32MB to support larger MPI2 configurations. + -- Fix Centos5 compile issues. + -- Log Cray MPI job calling exit() without mpi_fini(), but do not treat it as + a fatal error. This partially reverts logic added in version 14.03.9. + -- sview - Fix displaying of suspended steps elapsed times. + -- Increase number of messages that get cached before throwing them away + when the DBD is down. + -- Fix jobs from starting in overlapping reservations that won't finish before + a "maint" reservation begins. + -- Fix "squeue --start" to override SQUEUE_FORMAT env variable. + -- Restore GRES functionality with select/linear plugin. It was broken in + version 14.03.10. + -- Fix possible race condition when attempting to use QOS on a system running + accounting_storage/filetxt. + -- Sanity check for Correct QOS on startup. * Changes in Slurm 14.03.10 =========================== diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 3da931d8c9c4d040dda9c277bfa8b2dd07dd0de9..f848dd22650d6477a589de7ba74e2180f15faa5d 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -22,6 +22,8 @@ Slurm can be upgraded from version 2.6 or 14.03 to version 14.11 without loss of jobs or other state information. Upgrading directly from an earlier version of Slurm will result in loss of state information. +If using SPANK plugins, they should be recompiled against this version. + HIGHLIGHTS ========== diff --git a/doc/html/download.shtml b/doc/html/download.shtml index 8684c52dff15cd63633e2c67aaf5402231069702..ce74cd6c20add919b88fba904ba19fe53e4c25c9 100644 --- a/doc/html/download.shtml +++ b/doc/html/download.shtml @@ -227,7 +227,12 @@ As assortment of SPANK plugins are available from<br> http://code.google.com/p/slurm-spank-plugins/</a>.<br> The current source for the plugins can be checked out of the subversion repository with the following command:<br> -<i>svn checkout http://slurm-spank-plugins.googlecode.com/svn/trunk/ slurm-plugins</i></li><br> +<i>svn checkout http://slurm-spank-plugins.googlecode.com/svn/trunk/slurm-plugins</i><br><br> + +A SPANK plugin called "spunnel" to support ssh port forwarding is available +from Harvard University. +It can be downloaded from the +<a href="https://github.com/harvardinformatics/spunnel">spunnel repository</a>.</li><br> <li><b>Sqlog</b><br> A set of scripts that leverages Slurm's job completion logging facility @@ -343,6 +348,6 @@ easy and elegantly manner. </ul> -<p style="text-align:center;">Last modified 9 October 2014</p> +<p style="text-align:center;">Last modified 11 December 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/gres.shtml b/doc/html/gres.shtml index 43d73c026e6694160f4001df3692265fbb913d1d..8dfd6900572e18d47f16463919fdb64749506025 100644 --- a/doc/html/gres.shtml +++ b/doc/html/gres.shtml @@ -24,7 +24,7 @@ The first field is the resource name, which matches the GresType configuration parameter name. The optional type field might be used to identify a model of that generic resource. -A generic resource can also be specified as non\-consumable (i.e. multiple +A generic resource can also be specified as non-consumable (i.e. multiple jobs can use the same generic resource) with the optional field ":no_consume". The final field must specify a generic resource count. 
A suffix of "K", "M" or "G" may be used to multiply the count by 1024, @@ -69,8 +69,8 @@ use this resource. For example, it may be strongly preferable to use specific CPUs with specific devices (e.g. on a NUMA architecture). Multiple CPUs may be specified using a comma -delimited list or a range may be specified using a "\-" separator -(e.g. "0,1,2,3" or "0\-3"). +delimited list or a range may be specified using a "-" separator +(e.g. "0,1,2,3" or "0-3"). <B>If specified, then only the identified CPUs can be allocated with each generic resource; an attempt to use other CPUs will not be honored.</B> If not specified, then any CPU can be used with the resources. @@ -101,6 +101,7 @@ file.</LI> <LI><B>Type</B> Optionally specify the device type. For example, this might be used to identify a specific model of GPU, which users can then specify in their job request. +If <B>Type</B> is specified, then <B>Count</B> is limited in size (currently 1024). NOTE: This is a new capability added in Slurm version 14.11.</LI> </UL> @@ -206,6 +207,6 @@ to a physical device</pre> explicitly defined in the offload pragmas.</P> <!--------------------------------------------------------------------------> -<p style="text-align: center;">Last modified 10 April 2014</p> +<p style="text-align: center;">Last modified 4 December 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/hdf5_profile_user_guide.shtml b/doc/html/hdf5_profile_user_guide.shtml index ddb5fa79d16fef2aa8da279345b291f84f1fa6a0..782a08075f6f8b9ddda48fb8a2dd3ce9183bf597 100644 --- a/doc/html/hdf5_profile_user_guide.shtml +++ b/doc/html/hdf5_profile_user_guide.shtml @@ -31,7 +31,7 @@ cpu consumption, and memory use from a jobacct_gather plugin. Data from other sources may be added in the future.</p> <p>The data is collected into a file on a shared file system for each step on -each allocated node of a job and then merged into a HDF5 file. +each allocated node of a job and then merged into an HDF5 file. Individual files on a shared file system was chosen because it is possible that the data is voluminous so solutions that pass data to the Slurm control daemon via RPC may not scale to very large clusters or jobs with @@ -39,7 +39,7 @@ many allocated nodes.</p> <p>A separate <a href="acct_gather_profile_plugins.html"> SLURM Profile Accounting Plugin API (AcctGatherProfileType)</a> documents how -write other Profile Accounting plugins.</P> +to write other Profile Accounting plugins.</P> <a id="Administration"></a> <h2>Administration</h2> @@ -57,13 +57,13 @@ option in the acct_gather.conf file. The directory will be created by Slurm if it doesn't exist. Each user will have their own directory created in the ProfileHDF5Dir which contains the HDF5 files. All the directories and files are created by the -SlurmdUser which is usually root. The user specific directories as well -as the files inside are chowned to the user running the job so they +SlurmdUser which is usually root. The user specific directories, as well +as the files inside, are chowned to the user running the job so they can access the files. 
Since user root is usually creating these files/directories a root squashed file system will not work for the ProfileHDF5Dir.</p> -<p>Each user that creates a profile will have a subdirector to the profile +<p>Each user that creates a profile will have a subdirectory in the profile directory that has read/write permission only for the user.</p> </span> </div> @@ -85,14 +85,14 @@ This sets the sampling frequency for data types: </div> </div> <div style="margin-left: 20px;"> -<h4>act_gather.conf parameters</h4> +<h4>acct_gather.conf parameters</h4> <div style="margin-left: 20px;"> <p>These parameters are directly used by the HDF5 Profile Plugin.</p> <dl> <dt><b>ProfileHDF5Dir</b> = <path></dt> <p> This parameter is the path to the shared folder into which the -acct_gather_profile plugin will write detailed data as a HDF5 file. +acct_gather_profile plugin will write detailed data as an HDF5 file. The directory is assumed to be on a file system shared by the controller and all compute nodes. This is a required parameter.<p> @@ -207,7 +207,7 @@ to be attached to groups to store application defined properties.</p> <p>There are commodity programs, notably <a href="http://www.hdfgroup.org/hdf-java-html/hdfview/index.html"> -HDFView</a> for viewing and manipulating these files. +HDFView</a>, for viewing and manipulating these files. <p>Below is a screen shot from HDFView expanding the job tree and showing the attributes for a specific task.</p> diff --git a/doc/html/mpi_guide.shtml b/doc/html/mpi_guide.shtml index 2c388db7351570dc0bedffd40a032392a3c816b1..aa11bed90339cafc0ee8525fac7d5ff0920f373e 100644 --- a/doc/html/mpi_guide.shtml +++ b/doc/html/mpi_guide.shtml @@ -373,19 +373,37 @@ documentation for "CQ or QP Creation failure".</p> <h2><a name="mvapich2" href="http://nowlab.cse.ohio-state.edu/projects/mpi-iba"><b>MVAPICH2</b></a></h2> -<p>MVAPICH2 jobs can be launched directly by <b>srun</b> command. -SLURM's <i>none</i> MPI plugin must be used to establish communications -between the launched tasks. This can be accomplished either using the SLURM -configuration parameter <i>MpiDefault=none</i> in <b>slurm.conf</b> -or srun's <i>--mpi=none</i> option. The program must also be linked with -SLURM's implementation of the PMI library so that tasks can communicate +<p>MVAPICH2 supports launching multithreaded programs with Slurm as well as +mpirun_rsh. +Please note that if you intend to use srun, you need to build MVAPICH2 +with Slurm support. +Please refer to the +<a href="http://mvapich.cse.ohio-state.edu/static/media/mvapich/mvapich2-2.0-userguide.html#x1-100004.3.2"> +MVAPICH2 User Guide</a> for details. +You can also use Slurm for only resource allocation (using the salloc or sbatch +command) and launch the jobs using mpirun_rsh. +However, please note that mpirun_rsh does not accept the "-env" parameter, so +the command would be something like this:</p> +<pre> +$ mpirun_rsh -np 2 -hostfile <path_to_hostfile> \ + MV2_USE_CUDA=1 MV2_ENABLE_AFFINITY=0 ./mpi <application> +</pre> + +<p>Slurm must be configured to use the <i>none</i> MPI plugin to establish +communications between the launched tasks. +This can be accomplished either using the Slurm configuration parameter +<i>MpiDefault=none</i> in <b>slurm.conf</b> or srun's <i>--mpi=none</i> option. +<b>Do not use Slurm's MVAPICH plugin for MVAPICH2.</b> +The program must also be linked with +Slurm's implementation of the PMI library so that tasks can communicate host and port information at startup.
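To make the preceding paragraph concrete, here is a rough, hypothetical sketch of a program linked against Slurm's PMI-1 library; the header install path and the build command are assumptions that vary by site, not part of the documentation being patched:

#include <stdio.h>
#include <slurm/pmi.h>	/* assumed install location of Slurm's PMI-1 header */

int main(void)
{
	int spawned = 0, rank = 0, size = 0;

	/* PMI_Init() uses the host/port information that srun places in the
	 * task environment, which is how tasks bootstrap communication. */
	if (PMI_Init(&spawned) != PMI_SUCCESS)
		return 1;
	PMI_Get_rank(&rank);
	PMI_Get_size(&size);
	printf("task %d of %d\n", rank, size);
	PMI_Finalize();
	return 0;
}

Such a test program might be built with something like "cc pmi_hello.c -L<path_to_slurm_lib> -lpmi" and launched with "srun -n2 --mpi=none ./a.out", mirroring the mpicc and srun example that follows.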
(The system administrator can add these option to the mpicc and mpif77 commands directly, so the user will not -need to bother). <b>Do not use Slurm's MVAPICH plugin for MVAPICH2.</b> +need to bother). <pre> $ mpicc -L<path_to_slurm_lib> -lpmi ... $ srun -n16 --mpi=none a.out </pre> + <hr size=4 width="100%"> @@ -508,6 +526,6 @@ $ srun -N4 -n16 a.out <hr size=4 width="100%"> -<p style="text-align:center;">Last modified 13 September 2013</p> +<p style="text-align:center;">Last modified 28 November 2013</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/programmer_guide.shtml b/doc/html/programmer_guide.shtml index fbd0fedd8b2fa803889c668ab294c54d2f91172c..4cf3f81aa360bfdfc5b6f71db91e5f0e955fc084 100644 --- a/doc/html/programmer_guide.shtml +++ b/doc/html/programmer_guide.shtml @@ -217,7 +217,7 @@ Update <b>Makefile.am</b> files as needed then execute <h3>HAVE_FRONT_END</h3> <p>You can make a single node appear to Slurm as a Linux cluster by running <i>configure</i> with the <i>--enable-front-end</i> option. This -defines b>HAVE_FRONT_END</b> with a non-zero value in the file <b>config.h</b>. +defines <b>HAVE_FRONT_END</b> with a non-zero value in the file <b>config.h</b>. All (fake) nodes should be defined in the <b>slurm.conf</b> file. These nodes should be configured with a single <b>NodeAddr</b> value indicating the node on which single <span class="commandline">slurmd</span> daemon diff --git a/doc/html/qos.shtml b/doc/html/qos.shtml index f787592473066c926742191039a4e48f2366cd58..38c4c3e2aa0e18cb1c4be98b57162d8eb91ef6ab 100644 --- a/doc/html/qos.shtml +++ b/doc/html/qos.shtml @@ -264,7 +264,7 @@ By default when a cluster is added to the database a default qos named normal is created. <pre> -$sacctmgr show qos format=name,priority +$ sacctmgr show qos format=name,priority Name Priority ---------- ---------- normal 0 @@ -274,13 +274,13 @@ $sacctmgr show qos format=name,priority Add a new qos <pre> -$sacctmgr add qos zebra +$ sacctmgr add qos zebra Adding QOS(s) zebra Settings Description = QOS Name -$sacctmgr show qos format=name,priority +$ sacctmgr show qos format=name,priority Name Priority ---------- ---------- normal 0 @@ -291,11 +291,11 @@ $sacctmgr show qos format=name,priority Set QOS priority <pre> -$sacctmgr modify qos zebra set priority=10 +$ sacctmgr modify qos zebra set priority=10 Modified qos... zebra -$sacctmgr show qos format=name,priority +$ sacctmgr show qos format=name,priority Name Priority ---------- ---------- normal 0 @@ -306,12 +306,11 @@ $sacctmgr show qos format=name,priority Set some other limits: <pre> -$sacctmgr modify qos zebra set GrpCPUs=24 +$ sacctmgr modify qos zebra set GrpCPUs=24 Modified qos... 
zebra -$sacctmgr show qos format=name,priority,GrpCPUs -format=name,priority,GrpCPUs +$ sacctmgr show qos format=name,priority,GrpCPUs Name Priority GrpCPUs ---------- ---------- -------- normal 0 @@ -322,9 +321,9 @@ format=name,priority,GrpCPUs Add a qos to a user account <pre> -$sacctmgr modify user crock set qos=zebra +$ sacctmgr modify user crock set qos=zebra -$sacctmgr show assoc format=cluster,user,qos +$ sacctmgr show assoc format=cluster,user,qos Cluster User QOS ---------- ---------- -------------------- canis_major normal @@ -337,8 +336,8 @@ canis_major crock zebra Users can belong to multiple qos <pre> -$sacctmgr modify user crock set qos+=alligator -$sacctmgr show assoc format=cluster,user,qos +$ sacctmgr modify user crock set qos+=alligator +$ sacctmgr show assoc format=cluster,user,qos Cluster User QOS ---------- ---------- -------------------- canis_major normal @@ -352,11 +351,11 @@ canis_major crock alligator,zebra Finally delete a qos <pre> -$sacctmgr delete qos alligator +$ sacctmgr delete qos alligator Deleting QOS(s)... alligator </pre> -<p style="text-align: center;">Last modified 24 November 2009</p> +<p style="text-align: center;">Last modified 26 November 2009</p> </ul></body></html> diff --git a/doc/html/team.shtml b/doc/html/team.shtml index eb393b5e9ae766423a38e6b766d9d8d3c9213577..3c59135b3839d0e4604a347ebb2cca8925f82bd1 100644 --- a/doc/html/team.shtml +++ b/doc/html/team.shtml @@ -57,12 +57,14 @@ Lead Slurm developers are: <li>Hongjia Cao (National University of Defense Technology, China)</li> <li>Jimmy Cao (Greenplum/EMC)</li> <li>Ralph Castain (Intel, Greenplum/EMC, Los Alamos National Laboratory)</li> +<li>Sourav Chakraborty (The Ohio State University)</li> <li>François Chevallier (CEA)</li> <li>Daniel Christians (HP)</li> <li>Brian Christiansen (SchedMD)</li> <li>Gilles Civario (Bull)</li> <li>Chuck Clouston (Bull)</li> <li>J.T. Conklin</li> +<li>Trevor Cooper (San Diego Supercomputer Center)</li> <li>Ryan Cox (Brigham Young University)</li> <br> <li>Yuri D'Elia (Center for Biomedicine, EURAC Research, Italy)</li> diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 3eae7d03538e9998ac95776bddafeec84e449112..bf32ae3f6c405186cf2fff9e08c1f681658e56bc 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -288,7 +288,7 @@ The job_list argument is a comma separated list of job IDs. Requeue a running, suspended or finished SLURM batch job into pending state, moreover the job is put in held state (priority zero). The job_list argument is a comma separated list of job IDs. -A held job can be release using scontrol to reset its priority (e.g. +A held job can be released using scontrol to reset its priority (e.g. "scontrol release <job_id>"). The command accepts the following option: .RS .TP 12 diff --git a/doc/man/man1/slurm.1 b/doc/man/man1/slurm.1 index 65d9a97c5c0b96daa736292bb88b60447032abcc..c98f0536c816e66f740608a9e0f40315c43d3c10 100644 --- a/doc/man/man1/slurm.1 +++ b/doc/man/man1/slurm.1 @@ -64,7 +64,7 @@ details. 
\fBsacct\fR(1), \fBsacctmgr\fR(1), \fBsalloc\fR(1), \fBsattach\fR(1), \fBsbatch\fR(1), \fBsbcast\fR(1), \fBscancel\fR(1), \fBscontrol\fR(1), \fBsinfo\fR(1), \fBsmap\fR(1), \fBsqueue\fR(1), \fBsreport\fR(1), -\fBsrun\fR(1), \fBsshare\fR(1), \fBsstate\fR(1), \fBstrigger\fR(1), +\fBsrun\fR(1), \fBsshare\fR(1), \fBsstat\fR(1), \fBstrigger\fR(1), \fBsview\fR(1), \fBbluegene.conf\fR(5), \fBslurm.conf\fR(5), \fBslurmdbd.conf\fR(5), \fBwiki.conf\fR(5), diff --git a/doc/man/man5/gres.conf.5 b/doc/man/man5/gres.conf.5 index d928f8543daaf87bf7a4859a2d5f0de6f5cb5fb1..f8b2a0002f86201bc0c0518fdb0d2b0bb039ffb7 100644 --- a/doc/man/man5/gres.conf.5 +++ b/doc/man/man5/gres.conf.5 @@ -104,6 +104,7 @@ the example below. \fBType\fR An arbitrary string identifying the type of device. For example, a particular model of GPU. +If \fBType\fR is specified, then \fBCount\fR is limited in size (currently 1024). .SH "EXAMPLES" .LP diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index cacf2d3004dd1c8c6716d3fd0a862d08ffdefc3f..98af0cbb0a270c6cd3f7d3d611324e9d21726a34 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -804,7 +804,7 @@ configured resources will be set to DRAIN. Consider the configuration of each node to be that specified in the slurm.conf configuration file and any node with less than the configured resources will \fBnot\fR be set DRAIN. -This can be useful for testing purposes. +This option is generally only useful for testing purposes. .RE .TP @@ -1686,13 +1686,13 @@ enable user login, etc. By default there is no prolog. Any configured script is expected to complete execution quickly (in less time than \fBMessageTimeout\fR). If the prolog fails (returns a non\-zero exit code), this will result in the -node being set to a DRAIN state and the job requeued to executed on another node. +node being set to a DRAIN state and the job being requeued in a held state. See \fBProlog and Epilog Scripts\fR for more information. .TP \fBPrologFlags\fR Flags to control the Prolog behavior. By default no flags are set. -Currently the only option defined is: +Currently the options are: .RS .TP 6 \fBAlloc\fR @@ -1839,7 +1839,7 @@ NOTE: This configuration option does not apply to IBM BlueGene systems. .TP \fBReconfigFlags\fR Flags to control various actions that may be taken when an "scontrol -reconfig" command is issued. Currently the only option defined is: +reconfig" command is issued. Currently the options are: .RS .TP 17 \fBKeepPartInfo\fR @@ -2384,6 +2384,9 @@ The following options are supported for \fBSelectType=select/cons_res\fR: \fBCR_ALLOCATE_FULL_SOCKET\fR Jobs are allocated whole sockets rather than individual cores. Must be used with \fBCR_Socket\fR or \fBCR_Socket_Memory\fR option. +NOTE: This is also needed for more accurate accounting. Without it the +number of CPUs allocated the job will not be rounded up to include CPUs +not used by the allocation even though they are allocated. .TP \fBCR_CPU\fR CPUs are consumable resources. @@ -2457,17 +2460,21 @@ Sockets are consumable resources. On nodes with multiple cores, each core or thread is counted as a CPU to satisfy a job's resource requirement, but multiple jobs are not allocated resources on the same socket. -The count of CPUs allocated to a job may be rounded up to account for every -CPU on an allocated socket. 
+NOTE: With this option even though the entire socket will be allocated +to the job the count of CPUs allocated to a job will not round up to +account for every CPU on an allocated socket without the +\fBCR_ALLOCATE_FULL_SOCKET\fR option. .TP \fBCR_Socket_Memory\fR Memory and sockets are consumable resources. On nodes with multiple cores, each core or thread is counted as a CPU to satisfy a job's resource requirement, but multiple jobs are not allocated resources on the same socket. -The count of CPUs allocated to a job may be rounded up to account for every -CPU on an allocated socket. Setting a value for \fBDefMemPerCPU\fR is strongly recommended. +NOTE: With this option even though the entire socket will be allocated +to the job the count of CPUs allocated to a job will not round up to +account for every CPU on an allocated socket without the +\fBCR_ALLOCATE_FULL_SOCKET\fR option. .TP \fBCR_Memory\fR Memory is a consumable resource. @@ -4023,7 +4030,7 @@ node being set to a DRAIN state. If the EpilogSlurmctld fails (returns a non\-zero exit code), this will only be logged. If the Prolog fails (returns a non\-zero exit code), this will result in the -node being set to a DRAIN state and the job requeued to executed on another node. +node being set to a DRAIN state and the job being requeued in a held state. If the PrologSlurmctld fails (returns a non\-zero exit code), this will result in the job requeued to executed on another node if possible. Only batch jobs can be requeued. Interactive jobs (salloc and srun) will be cancelled if the diff --git a/doc/man/man8/spank.8 b/doc/man/man8/spank.8 index c34b16f3ab7c35afae31d169391042e109d867ef..e3b0896c5cbac970cfd812ed447cb366f3d7ff44 100644 --- a/doc/man/man8/spank.8 +++ b/doc/man/man8/spank.8 @@ -1,4 +1,4 @@ -.TH "SPANK" "8" "June 2009" "SPANK" "SLURM plug\-in architecture for Node and job (K)control" +.TH "SPANK" "8" "December 2014" "SPANK" "SLURM plug\-in architecture for Node and job (K)control" .SH "NAME" \fBSPANK\fR \- SLURM Plug\-in Architecture for Node and job (K)control @@ -18,6 +18,8 @@ the \fBSPANK\fR infrastructure provides administrators and other developers a low cost, low effort ability to dynamically modify the runtime behavior of SLURM job launch. .LP +\fBNote\fR: \fBSPANK\fR plugins must be recompiled for each new version. 
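The note just added to spank.8, together with the RELEASE_NOTES line earlier in this patch, is why administrators must rebuild any local SPANK plugins when upgrading. For readers who have not seen one, the following is a minimal, hypothetical plugin skeleton using entry points this man page documents; it is a sketch only, not code from this patch:

/* hello_spank.c - minimal, hypothetical SPANK plugin skeleton.
 * Build as a shared object, list it in plugstack.conf, and rebuild it
 * whenever Slurm itself is upgraded, as noted above. */
#include <stdint.h>
#include <slurm/spank.h>

/* Every SPANK plugin must identify itself with this macro. */
SPANK_PLUGIN(hello_spank, 1);

/* Called when the plugin is loaded in each context. */
int slurm_spank_init(spank_t sp, int ac, char **av)
{
	if (spank_context() == S_CTX_REMOTE)
		slurm_info("hello_spank: loaded in slurmstepd (remote) context");
	return ESPANK_SUCCESS;
}

/* Called for each task on the compute node before it is launched. */
int slurm_spank_task_init(spank_t sp, int ac, char **av)
{
	uint32_t taskid = 0;

	if (spank_get_item(sp, S_TASK_GLOBAL_ID, &taskid) == ESPANK_SUCCESS)
		slurm_info("hello_spank: starting global task %u", taskid);
	return ESPANK_SUCCESS;
}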
+.LP .SH "SPANK PLUGINS" \fBSPANK\fR plugins are loaded in up to five separate contexts during a diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index f85c0db4d13f31cd4ecbe0a4e38169d1aa9b5909..0822564a61ed9959e749070d890d6b2fc8cd22bf 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -121,7 +121,7 @@ static void _add_assoc_hash(slurmdb_association_rec_t *assoc) if (!assoc_hash_id) assoc_hash_id = xmalloc(ASSOC_HASH_SIZE * - sizeof(slurmdb_association_rec_t *)); + sizeof(slurmdb_association_rec_t *)); if (!assoc_hash) assoc_hash = xmalloc(ASSOC_HASH_SIZE * sizeof(slurmdb_association_rec_t *)); @@ -249,19 +249,21 @@ static slurmdb_association_rec_t *_find_assoc_rec( * assoc_count - count of assoc list entries * assoc_hash - hash table into assoc records */ -static void _delete_assoc_hash(void *assoc) +static void _delete_assoc_hash(slurmdb_association_rec_t *assoc) { - slurmdb_association_rec_t *assoc_ptr = - (slurmdb_association_rec_t *) assoc; + slurmdb_association_rec_t *assoc_ptr = assoc; slurmdb_association_rec_t **assoc_pptr; xassert(assoc); /* Remove the record from assoc hash table */ assoc_pptr = &assoc_hash_id[ASSOC_HASH_ID_INX(assoc_ptr->id)]; - while (assoc_pptr && ((assoc_ptr = *assoc_pptr) != - (slurmdb_association_rec_t *) assoc)) - assoc_pptr = &assoc_ptr->assoc_next_id; + while (assoc_pptr && ((assoc_ptr = *assoc_pptr) != assoc)) { + if (!assoc_ptr->assoc_next_id) + assoc_pptr = NULL; + else + assoc_pptr = &assoc_ptr->assoc_next_id; + } if (!assoc_pptr) { fatal("assoc id hash error"); @@ -269,11 +271,14 @@ static void _delete_assoc_hash(void *assoc) } else *assoc_pptr = assoc_ptr->assoc_next_id; - assoc_ptr = (slurmdb_association_rec_t *) assoc; + assoc_ptr = assoc; assoc_pptr = &assoc_hash[_assoc_hash_index(assoc_ptr)]; - while (assoc_pptr && ((assoc_ptr = *assoc_pptr) != - (slurmdb_association_rec_t *) assoc)) - assoc_pptr = &assoc_ptr->assoc_next; + while (assoc_pptr && ((assoc_ptr = *assoc_pptr) != assoc)) { + if (!assoc_ptr->assoc_next) + assoc_pptr = NULL; + else + assoc_pptr = &assoc_ptr->assoc_next; + } if (!assoc_pptr) { fatal("assoc hash error"); @@ -456,13 +461,16 @@ static int _change_user_name(slurmdb_user_rec_t *user) if (!assoc->user) continue; if (!strcmp(user->old_name, assoc->user)) { - xfree(assoc->user); - assoc->user = xstrdup(user->name); - assoc->uid = user->uid; /* Since the uid changed the - hash as well will change. + hash as well will change. Remove + the assoc from the hash before the + change or you won't find it. 
*/ _delete_assoc_hash(assoc); + + xfree(assoc->user); + assoc->user = xstrdup(user->name); + assoc->uid = user->uid; _add_assoc_hash(assoc); debug3("changing assoc %d", assoc->id); } @@ -1127,16 +1135,13 @@ static int _get_assoc_mgr_res_list(void *db_conn, int enforce) static int _get_assoc_mgr_qos_list(void *db_conn, int enforce) { uid_t uid = getuid(); + List new_list = NULL; assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; - assoc_mgr_lock(&locks); - if (assoc_mgr_qos_list) - list_destroy(assoc_mgr_qos_list); - assoc_mgr_qos_list = acct_storage_g_get_qos(db_conn, uid, NULL); + new_list = acct_storage_g_get_qos(db_conn, uid, NULL); - if (!assoc_mgr_qos_list) { - assoc_mgr_unlock(&locks); + if (!new_list) { if (enforce & ACCOUNTING_ENFORCE_ASSOCS) { error("_get_assoc_mgr_qos_list: no list was made."); return SLURM_ERROR; @@ -1145,9 +1150,16 @@ static int _get_assoc_mgr_qos_list(void *db_conn, int enforce) } } + assoc_mgr_lock(&locks); + + FREE_NULL_LIST(assoc_mgr_qos_list); + assoc_mgr_qos_list = new_list; + new_list = NULL; + _post_qos_list(assoc_mgr_qos_list); assoc_mgr_unlock(&locks); + return SLURM_SUCCESS; } @@ -1835,13 +1847,21 @@ extern int assoc_mgr_fill_in_assoc(void *db_conn, if (assoc_pptr) *assoc_pptr = NULL; - /* Call assoc_mgr_refresh_lists instead of just getting the - association list because we need qos and user lists before - the association list can be made. - */ - if (!assoc_mgr_association_list) - if (assoc_mgr_refresh_lists(db_conn) == SLURM_ERROR) - return SLURM_ERROR; + /* Since we might be locked we can't come in here and try to + * get the list since we would need the WRITE_LOCK to do that, + * so just return as this would only happen on a system not + * talking to the database. + */ + if (!assoc_mgr_association_list) { + int rc = SLURM_SUCCESS; + + if (enforce & ACCOUNTING_ENFORCE_QOS) { + error("No Association list available, " + "this should never happen"); + rc = SLURM_ERROR; + } + return rc; + } if ((!assoc_mgr_association_list || !list_count(assoc_mgr_association_list)) @@ -2092,14 +2112,28 @@ extern int assoc_mgr_fill_in_qos(void *db_conn, slurmdb_qos_rec_t *qos, if (qos_pptr) *qos_pptr = NULL; - if (!assoc_mgr_qos_list) - if (_get_assoc_mgr_qos_list(db_conn, enforce) == SLURM_ERROR) - return SLURM_ERROR; if (!locked) assoc_mgr_lock(&locks); - if ((!assoc_mgr_qos_list || !list_count(assoc_mgr_qos_list)) - && !(enforce & ACCOUNTING_ENFORCE_QOS)) { + + /* Since we might be locked we can't come in here and try to + * get the list since we would need the WRITE_LOCK to do that, + * so just return as this would only happen on a system not + * talking to the database. + */ + if (!assoc_mgr_qos_list) { + int rc = SLURM_SUCCESS; + + if (enforce & ACCOUNTING_ENFORCE_QOS) { + error("No QOS list available, " + "this should never happen"); + rc = SLURM_ERROR; + } + if (!locked) + assoc_mgr_unlock(&locks); + return rc; + } else if (!list_count(assoc_mgr_qos_list) + && !(enforce & ACCOUNTING_ENFORCE_QOS)) { if (!locked) assoc_mgr_unlock(&locks); return SLURM_SUCCESS; @@ -4576,11 +4610,14 @@ extern int assoc_mgr_set_missing_uids() "couldn't get a uid for user %s", object->user); } else { - object->uid = pw_uid; /* Since the uid changed the - hash as well will change. + hash as well will change. Remove + the assoc from the hash before the + change or you won't find it. 
*/ _delete_assoc_hash(object); + + object->uid = pw_uid; _add_assoc_hash(object); } } diff --git a/src/common/eio.c b/src/common/eio.c index 2c83bd696ccae4c63598e5522ee2050b5187f803..76a9a684ba262aa562d6d6cb40a7389c20fafaab 100644 --- a/src/common/eio.c +++ b/src/common/eio.c @@ -64,6 +64,7 @@ strong_alias(eio_handle_mainloop, slurm_eio_handle_mainloop); strong_alias(eio_message_socket_readable, slurm_eio_message_socket_readable); strong_alias(eio_message_socket_accept, slurm_eio_message_socket_accept); strong_alias(eio_new_obj, slurm_eio_new_obj); +strong_alias(eio_new_initial_obj, slurm_eio_new_initial_obj); strong_alias(eio_obj_create, slurm_eio_obj_create); strong_alias(eio_obj_destroy, slurm_eio_obj_destroy); strong_alias(eio_remove_obj, slurm_eio_remove_obj); diff --git a/src/common/gres.c b/src/common/gres.c index c28918bbbe003558218a1d1fddee051bf6197db1..ff87fbd1f857c0d778292795bba6a3ee34b7a09c 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -86,6 +86,7 @@ #include "src/common/xstring.h" #define GRES_MAGIC 0x438a34d4 +#define MAX_GRES_BITMAP 1024 /* Gres symbols provided by the plugin */ typedef struct slurm_gres_ops { @@ -1098,15 +1099,15 @@ extern int gres_plugin_node_config_unpack(Buf buffer, char* node_name) tmp_name, node_name); has_file = 1; } - if (has_file && (count > 1024)) { + if (has_file && (count > MAX_GRES_BITMAP)) { /* Avoid over-subscribing memory with * huge bitmaps */ - error("gres_plugin_node_config_unpack: " - "gres/%s has File plus very " + error("%s: gres/%s has File plus very " "large Count (%u) for node %s, " - "resetting value to 1024", - tmp_name, count, node_name); - count = 1024; + "resetting value to %d", + __func__, tmp_name, count, + node_name, MAX_GRES_BITMAP); + count = MAX_GRES_BITMAP; } if (has_file) /* Don't clear if already set */ gres_context[j].has_file = has_file; @@ -1702,7 +1703,7 @@ extern int _node_config_validate(char *node_name, char *orig_config, } else if (cpus_config) { error("%s: has CPUs configured for only" " some of the records on node %s", - context_ptr->gres_type,node_name); + context_ptr->gres_type, node_name); } gres_data->topo_gres_bitmap[i] = bit_alloc(gres_cnt); for (j = 0; j < gres_slurmd_conf->count; j++) { @@ -1734,6 +1735,14 @@ extern int _node_config_validate(char *node_name, char *orig_config, gres_data->gres_cnt_avail = 0; if (context_ptr->has_file) { + if (gres_data->gres_cnt_avail > MAX_GRES_BITMAP) { + error("%s: gres/%s has File plus very large Count (%u) " + "for node %s, resetting value to %u", + __func__, context_ptr->gres_type, + gres_data->gres_cnt_avail, node_name, + MAX_GRES_BITMAP); + gres_data->gres_cnt_avail = MAX_GRES_BITMAP; + } if (gres_data->gres_bit_alloc == NULL) { gres_data->gres_bit_alloc = bit_alloc(gres_data->gres_cnt_avail); @@ -3102,7 +3111,9 @@ static uint32_t _job_test(void *job_gres_data, void *node_gres_data, int cpu_start_bit, int cpu_end_bit, bool *topo_set, uint32_t job_id, char *node_name, char *gres_name) { - int i, j, cpu_size, cpus_ctld, gres_avail = 0, top_inx; +// int i, j, cpu_size, cpus_ctld, gres_avail = 0, top_inx; +int i, j, cpu_size, cpus_ctld, gres_avail = 0; +static int top_inx; gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data; gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data; uint32_t *cpus_addnt = NULL; /* Additional CPUs avail from this GRES */ @@ -3266,7 +3277,7 @@ static uint32_t _job_test(void *job_gres_data, void *node_gres_data, continue; } /* update counts of allocated CPUs and GRES */ - if 
(!node_gres_ptr->topo_cpus_bitmap[i]) { + if (!node_gres_ptr->topo_cpus_bitmap[top_inx]) { bit_nset(alloc_cpu_bitmap, 0, cpus_ctld - 1); } else if (gres_avail) { bit_or(alloc_cpu_bitmap, diff --git a/src/common/parse_value.c b/src/common/parse_value.c index 9ba15f73c9f51161d6097ffcdf8e08a12567c255..4243af2b324037156bd7bbd3e94902cef9c41f4c 100644 --- a/src/common/parse_value.c +++ b/src/common/parse_value.c @@ -38,9 +38,14 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \*****************************************************************************/ +#ifndef _ISOC99_SOURCE +# define _ISOC99_SOURCE /* strtof() */ +#endif + #include <stdint.h> #include <stdlib.h> #include <string.h> +#include <strings.h> #include <unistd.h> #include <math.h> diff --git a/src/common/proc_args.c b/src/common/proc_args.c index 0305a9a4ca1d0c38576fe33faa21166138446af7..14b91b64cd2b3dddb8fda5034b603cafcb96e9ba 100644 --- a/src/common/proc_args.c +++ b/src/common/proc_args.c @@ -608,7 +608,8 @@ bool verify_socket_core_thread_count(const char *arg, int *min_sockets, /* if cpu_bind_type doesn't already have a auto preference, choose * the level based on the level of the -E specification */ - if (!(*cpu_bind_type & (CPU_BIND_TO_SOCKETS | + if (cpu_bind_type && + !(*cpu_bind_type & (CPU_BIND_TO_SOCKETS | CPU_BIND_TO_CORES | CPU_BIND_TO_THREADS))) { if (j == 0) { diff --git a/src/common/read_config.c b/src/common/read_config.c index 5f82e688cc8eaac091c0151f0392de7f00d88f57..5b92f08b798ad3ba9e11cfdcb7229a624423acde 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -2851,7 +2851,7 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) char *default_storage_loc = NULL; uint32_t default_storage_port = 0; uint16_t uint16_tmp; - uint64_t tmp64; + uint64_t tot_prio_weight; if (s_p_get_string(&conf->backup_controller, "BackupController", hashtbl) @@ -3651,14 +3651,15 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) if (!s_p_get_uint32(&conf->priority_weight_qos, "PriorityWeightQOS", hashtbl)) conf->priority_weight_qos = 0; + /* Check for possible overflow of priority. * We also check when doing the computation for each job. 
*/ - tmp64 = (uint64_t) conf->priority_weight_age + + tot_prio_weight = (uint64_t) conf->priority_weight_age + (uint64_t) conf->priority_weight_fs + (uint64_t) conf->priority_weight_js + (uint64_t) conf->priority_weight_part + (uint64_t) conf->priority_weight_qos; - if (tmp64 > 0xffffffff) { + if (tot_prio_weight > 0xffffffff) { error("PriorityWeight values too high, job priority value may " "overflow"); } @@ -3832,14 +3833,16 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) conf->schedtype = xstrdup(DEFAULT_SCHEDTYPE); if (strcmp(conf->priority_type, "priority/multifactor") == 0) { - if ((strcmp(conf->schedtype, "sched/wiki") == 0) || - (strcmp(conf->schedtype, "sched/wiki2") == 0)) { + if (tot_prio_weight && + (!strcmp(conf->schedtype, "sched/wiki") || + !strcmp(conf->schedtype, "sched/wiki2"))) { error("PriorityType=priority/multifactor is " "incompatible with SchedulerType=%s", conf->schedtype); return SLURM_ERROR; } } + if (conf->preempt_mode) { if ((strcmp(conf->schedtype, "sched/wiki") == 0) || (strcmp(conf->schedtype, "sched/wiki2") == 0)) { diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 28f338a9fba1328e4829a107880a82419d9ce481..a3038f17d0174dc9e2c979996f2ec1c4cfd705eb 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -261,7 +261,7 @@ static slurm_errtab_t slurm_errtab[] = { { ESLURM_QOS_PREEMPTION_LOOP, "QOS Preemption loop detected" }, { ESLURM_NODE_NOT_AVAIL, - "Required node not available (down or drained)" }, + "Required node not available (down, drained or reserved)"}, { ESLURM_INVALID_CPU_COUNT, "CPU count specification invalid" }, { ESLURM_PARTITION_NOT_AVAIL, diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index d839764afe0270aed9803855165958d8e958e7c6..6d6237cf4170de1e88b898ab14d8ec8af940cdd1 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1699,7 +1699,9 @@ extern char *node_state_string(uint32_t inx) return "MAINT"; } if (drain_flag) { - if (comp_flag || (base == NODE_STATE_ALLOCATED)) { + if (comp_flag + || (base == NODE_STATE_ALLOCATED) + || (base == NODE_STATE_MIXED)) { if (no_resp_flag) return "DRAINING*"; return "DRAINING"; @@ -1707,10 +1709,6 @@ extern char *node_state_string(uint32_t inx) if (no_resp_flag) return "ERROR*"; return "ERROR"; - } else if (base == NODE_STATE_MIXED) { - if (no_resp_flag) - return "MIXED*"; - return "MIXED"; } else { if (no_resp_flag) return "DRAINED*"; @@ -1833,7 +1831,9 @@ extern char *node_state_string_compact(uint32_t inx) return "MAINT"; } if (drain_flag) { - if (comp_flag || (inx == NODE_STATE_ALLOCATED)) { + if (comp_flag + || (inx == NODE_STATE_ALLOCATED) + || (inx == NODE_STATE_MIXED)) { if (no_resp_flag) return "DRNG*"; return "DRNG"; @@ -1841,10 +1841,6 @@ extern char *node_state_string_compact(uint32_t inx) if (no_resp_flag) return "ERROR*"; return "ERROR"; - } else if (inx == NODE_STATE_MIXED) { - if (no_resp_flag) - return "MIXED*"; - return "MIXED"; } else { if (no_resp_flag) return "DRAIN*"; diff --git a/src/common/slurm_xlator.h b/src/common/slurm_xlator.h index 8a837e2371557c3ba2e5b2fb2ccde2ef728fa17c..45c33d3be578f046af424a239c5279d3667447f9 100644 --- a/src/common/slurm_xlator.h +++ b/src/common/slurm_xlator.h @@ -399,6 +399,7 @@ #define eio_message_socket_accept slurm_eio_message_socket_accept #define eio_message_socket_readable slurm_eio_message_socket_readable #define eio_new_obj slurm_eio_new_obj +#define eio_new_initial_obj slurm_eio_new_initial_obj #define 
eio_obj_create slurm_eio_obj_create #define eio_obj_destroy slurm_eio_obj_destroy #define eio_remove_obj slurm_eio_remove_obj diff --git a/src/common/slurmdb_defs.c b/src/common/slurmdb_defs.c index f2d80be5b6c44288088bc96f4d255e641b56cded..4c4171b18f11c1e34a33321abeccd912b5e2c3d8 100644 --- a/src/common/slurmdb_defs.c +++ b/src/common/slurmdb_defs.c @@ -384,6 +384,7 @@ extern slurmdb_job_rec_t *slurmdb_create_job_rec() { slurmdb_job_rec_t *job = xmalloc(sizeof(slurmdb_job_rec_t)); memset(&job->stats, 0, sizeof(slurmdb_stats_t)); + job->array_task_id = NO_VAL; job->derived_ec = NO_VAL; job->stats.cpu_min = NO_VAL; job->state = JOB_PENDING; diff --git a/src/common/slurmdbd_defs.c b/src/common/slurmdbd_defs.c index 086c1674d2a97b1c29b5840dccee846aca4060ee..ec8b3d5ae3564a18fd907710a614b44fc794eb07 100644 --- a/src/common/slurmdbd_defs.c +++ b/src/common/slurmdbd_defs.c @@ -376,7 +376,9 @@ extern int slurm_send_slurmdbd_msg(uint16_t rpc_version, slurmdbd_msg_t *req) * MAX_AGENT_QUEUE which ever is bigger */ if (!max_agent_queue) max_agent_queue = - MAX(MAX_AGENT_QUEUE, slurmctld_conf.max_job_cnt * 2); + MAX(MAX_AGENT_QUEUE, + ((slurmctld_conf.max_job_cnt * 2) + + (node_record_count * 4))); buffer = pack_slurmdbd_msg(req, rpc_version); diff --git a/src/plugins/accounting_storage/filetxt/filetxt_jobacct_process.c b/src/plugins/accounting_storage/filetxt/filetxt_jobacct_process.c index bcc1b20d6f476bb301a9e293cd13ad3672dc3843..4802d1896debb222e09ec0910b587e41ad664a68 100644 --- a/src/plugins/accounting_storage/filetxt/filetxt_jobacct_process.c +++ b/src/plugins/accounting_storage/filetxt/filetxt_jobacct_process.c @@ -1109,8 +1109,9 @@ extern List filetxt_jobacct_process_get_jobs(slurmdb_job_cond_t *job_cond) if (curr_job->jobid == slurmdb_job->jobid) { list_delete_item(itr2); - info("removing job %d", - slurmdb_job->jobid); + debug3("removing duplicate " + "of job %d", + slurmdb_job->jobid); break; } } diff --git a/src/plugins/mpi/pmi2/client.c b/src/plugins/mpi/pmi2/client.c index cf309d6264c74119292353d447d6b95bacc594d0..f4ba6feadb9e46e47ebd44e2e9e262b70269ce99 100644 --- a/src/plugins/mpi/pmi2/client.c +++ b/src/plugins/mpi/pmi2/client.c @@ -117,10 +117,10 @@ _parse_cmd(client_req_t *req) len = strlen (MCMD_KEY"="); if (! 
strncmp(req->buf, MCMD_KEY"=", len)) { - req->cmd = MCMD_KEY; /* XXX: mcmd=spawn */ - req->sep = '\n'; - req->term = '\n'; - return SLURM_SUCCESS; + req->cmd = MCMD_KEY; /* XXX: mcmd=spawn */ + req->sep = '\n'; + req->term = '\n'; + return SLURM_SUCCESS; } len = strlen (CMD_KEY"="); @@ -587,7 +587,7 @@ send_kvs_fence_resp_to_clients(int rc, char *errmsg) int i = 0; client_resp_t *resp; char *msg; - + resp = client_resp_new(); if ( is_pmi11() ) { if (rc != 0 && errmsg != NULL) { diff --git a/src/plugins/mpi/pmi2/setup.c b/src/plugins/mpi/pmi2/setup.c index da6ee423486e0555a3321ad86fba20718dc14308..68a6a8e8c6ed515522b151dc55a8fe1c07146437 100644 --- a/src/plugins/mpi/pmi2/setup.c +++ b/src/plugins/mpi/pmi2/setup.c @@ -305,6 +305,7 @@ _setup_stepd_kvs(const stepd_step_rec_t *job, char ***env) int rc = SLURM_SUCCESS, i = 0, pp_cnt = 0; char *p, env_key[32], *ppkey, *ppval; + kvs_seq = 1; rc = temp_kvs_init(); if (rc != SLURM_SUCCESS) return rc; @@ -643,6 +644,7 @@ _setup_srun_kvs(const mpi_plugin_client_info_t *job) { int rc; + kvs_seq = 1; rc = temp_kvs_init(); return rc; } diff --git a/src/plugins/priority/multifactor/fair_tree.c b/src/plugins/priority/multifactor/fair_tree.c index 153a9acc05080931891da7f92ba366831b87012b..8a66404c7cc5b0b6f1f781c9fc13064afbd06520 100644 --- a/src/plugins/priority/multifactor/fair_tree.c +++ b/src/plugins/priority/multifactor/fair_tree.c @@ -35,6 +35,10 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \*****************************************************************************/ +#ifndef _ISOC99_SOURCE +# define _ISOC99_SOURCE /* INFINITY */ +#endif + #include <math.h> #include <stdlib.h> @@ -204,34 +208,52 @@ static void _calc_assoc_fs(slurmdb_association_rec_t *assoc) } -static slurmdb_association_rec_t** _append_children_to_array( +/* Append list of associations to array + * IN list - list of associations + * IN merged - array of associations to append to + * IN/OUT merged_size - number of associations in merged array + * RET - New array. Must be freed. + */ +static slurmdb_association_rec_t** _append_list_to_array( List list, slurmdb_association_rec_t** merged, - size_t *child_count) + size_t *merged_size) { ListIterator itr; slurmdb_association_rec_t *next; - size_t i = *child_count; - *child_count += list_count(list); + size_t bytes; + size_t i = *merged_size; + *merged_size += list_count(list); - merged = xrealloc(merged, sizeof(slurmdb_association_rec_t*) - * (*child_count + 1)); + /* must be null-terminated, so add one extra slot */ + bytes = sizeof(slurmdb_association_rec_t*) * (*merged_size + 1); + merged = xrealloc(merged, bytes); itr = list_iterator_create(list); while ((next = list_next(itr))) merged[i++] = next; list_iterator_destroy(itr); + /* null terminate the array */ + merged[*merged_size] = NULL; return merged; } +/* Returns number of tied sibling accounts. 
+ * IN assocs - array of siblings, sorted by level_fs + * IN begin_ndx - begin looking for ties at this index + * RET - number of sibling accounts with equal level_fs values + */ static size_t _count_tied_accounts(slurmdb_association_rec_t** assocs, - size_t i) + size_t begin_ndx) { slurmdb_association_rec_t* next_assoc; - slurmdb_association_rec_t* assoc = assocs[i]; + slurmdb_association_rec_t* assoc = assocs[begin_ndx]; + size_t i = begin_ndx; size_t tied_accounts = 0; while ((next_assoc = assocs[++i])) { + /* Users are sorted to the left of accounts, so no user we + * encounter here will be equal to this account */ if (!next_assoc->user) break; if (assoc->usage->level_fs != next_assoc->usage->level_fs) @@ -242,12 +264,20 @@ static size_t _count_tied_accounts(slurmdb_association_rec_t** assocs, } +/* Copy the children of accounts [begin, end] into a single array. + * IN siblings - array of siblings, sorted by level_fs + * IN begin - index of first account to merge + * IN end - index of last account to merge + * IN assoc_level - depth in the tree (root is 0) + * RET - Array of the children. Must be freed. + */ static slurmdb_association_rec_t** _merge_accounts( slurmdb_association_rec_t** siblings, size_t begin, size_t end, uint16_t assoc_level) { size_t i; - size_t child_count = 0; + /* number of associations in merged array */ + size_t merged_size = 0; /* merged is a null terminated array */ slurmdb_association_rec_t** merged = (slurmdb_association_rec_t **) xmalloc(sizeof(slurmdb_association_rec_t *)); @@ -256,6 +286,7 @@ static slurmdb_association_rec_t** _merge_accounts( for (i = begin; i <= end; i++) { List children = siblings[i]->usage->children_list; + /* the first account's debug was already printed */ if (priority_debug && i > begin) _ft_debug(siblings[i], assoc_level, true); @@ -263,8 +294,7 @@ static slurmdb_association_rec_t** _merge_accounts( continue; } - merged = _append_children_to_array(children, merged, - &child_count); + merged = _append_list_to_array(children, merged, &merged_size); } return merged; } @@ -275,61 +305,77 @@ static slurmdb_association_rec_t** _merge_accounts( * This portion of the tree is now sorted and users are given a fairshare value * based on the order they are operated on. The basic equation is * (rank / g_user_assoc_count), though ties are allowed. The rank is decremented - * for each user that is encountered. + * for each user that is encountered except when ties occur. 
+ * + * Tie Handling Rules: + * 1) Sibling users with the same level_fs receive the same rank + * 2) Sibling accounts with the same level_fs have their children lists + * merged before sorting + * 3) A user with the same level_fs as a sibling account will receive + * the same rank as the account's highest ranked user + * + * IN siblings - array of siblings + * IN assoc_level - depth in the tree (root is 0) + * IN/OUT rank - current user ranking, starting at g_user_assoc_count + * IN/OUT rnt - rank, no ties (what rank would be if no tie exists) + * IN account_tied - is this account tied with the previous user */ static void _calc_tree_fs(slurmdb_association_rec_t** siblings, - uint16_t assoc_level, uint32_t *rank, uint32_t *i, - bool account_tied) + uint16_t assoc_level, uint32_t *rank, + uint32_t *rnt, bool account_tied) { slurmdb_association_rec_t *assoc = NULL; long double prev_level_fs = (long double) NO_VAL; bool tied = false; - size_t ndx; + size_t i; /* Calculate level_fs for each child */ - for (ndx = 0; (assoc = siblings[ndx]); ndx++) + for (i = 0; (assoc = siblings[i]); i++) _calc_assoc_fs(assoc); /* Sort children by level_fs */ - qsort(siblings, ndx, sizeof(slurmdb_association_rec_t *), - _cmp_level_fs); + qsort(siblings, i, sizeof(slurmdb_association_rec_t *), _cmp_level_fs); /* Iterate through children in sorted order. If it's a user, calculate * fs_factor, otherwise recurse. */ - for (ndx = 0; (assoc = siblings[ndx]); ndx++) { - if (account_tied) { + for (i = 0; (assoc = siblings[i]); i++) { + /* tied is used while iterating across siblings. + * account_tied preserves ties while recursing */ + if (i == 0 && account_tied) { + /* The parent was tied so this level starts out tied */ tied = true; - account_tied = false; } else { tied = prev_level_fs == assoc->usage->level_fs; } if (priority_debug) _ft_debug(assoc, assoc_level, tied); + + /* If user, set their final fairshare factor and handle ranking. + * If account, merge any tied accounts then recurse with the + * merged children array. 
*/ if (assoc->user) { if (!tied) - *rank = *i; + *rank = *rnt; - /* Set the final fairshare factor for this user */ assoc->usage->fs_factor = *rank / (double) g_user_assoc_count; - (*i)--; + + (*rnt)--; } else { slurmdb_association_rec_t** children; - size_t merge_count = - _count_tied_accounts(siblings, ndx); + size_t merge_count = _count_tied_accounts(siblings, i); /* Merging does not affect child level_fs calculations * since the necessary information is stored on each * assoc's usage struct */ - children = _merge_accounts(siblings, ndx, - ndx + merge_count, + children = _merge_accounts(siblings, i, i + merge_count, assoc_level); - _calc_tree_fs(children, assoc_level + 1, rank, i, tied); + _calc_tree_fs(children, assoc_level+1, rank, rnt, tied); /* Skip over any merged accounts */ - ndx += merge_count; + i += merge_count; xfree(children); } @@ -344,21 +390,21 @@ static void _apply_priority_fs(void) { slurmdb_association_rec_t** children = NULL; uint32_t rank = g_user_assoc_count; - uint32_t i = rank; + uint32_t rnt = rank; size_t child_count = 0; if (priority_debug) info("Fair Tree fairshare algorithm, starting at root:"); - assoc_mgr_root_assoc->usage->level_fs = 1L; + assoc_mgr_root_assoc->usage->level_fs = (long double) NO_VAL; /* _calc_tree_fs requires an array instead of List */ - children = _append_children_to_array( + children = _append_list_to_array( assoc_mgr_root_assoc->usage->children_list, children, &child_count); - _calc_tree_fs(children, 0, &rank, &i, false); + _calc_tree_fs(children, 0, &rank, &rnt, false); xfree(children); } diff --git a/src/plugins/proctrack/linuxproc/kill_tree.c b/src/plugins/proctrack/linuxproc/kill_tree.c index e45f13ea6241cc852ebee40dd1812fbd38e37325..ea6f12dbfd8d42ed2fcb1ff157d58b9d3c662396 100644 --- a/src/plugins/proctrack/linuxproc/kill_tree.c +++ b/src/plugins/proctrack/linuxproc/kill_tree.c @@ -171,8 +171,7 @@ static xppid_t **_build_hashtbl(void) if ((num[0] < '0') || (num[0] > '9')) continue; ret_l = strtol(num, &endptr, 10); - if ((ret_l == LONG_MIN) || (ret_l == LONG_MAX) || - (errno == ERANGE)) { + if ((ret_l == LONG_MIN) || (ret_l == LONG_MAX)) { error("couldn't do a strtol on str %s(%ld): %m", num, ret_l); continue; diff --git a/src/plugins/proctrack/pgid/proctrack_pgid.c b/src/plugins/proctrack/pgid/proctrack_pgid.c index 28270be69cde7794152f8ad9dae9f6d20b32836c..f5c334bcbe1813899de0bc16a8e1463ae66d5445 100644 --- a/src/plugins/proctrack/pgid/proctrack_pgid.c +++ b/src/plugins/proctrack/pgid/proctrack_pgid.c @@ -218,8 +218,7 @@ proctrack_p_get_pids(uint64_t cont_id, pid_t **pids, int *npids) if ((num[0] < '0') || (num[0] > '9')) continue; ret_l = strtol(num, &endptr, 10); - if ((ret_l == LONG_MIN) || (ret_l == LONG_MAX) || - (errno == ERANGE)) { + if ((ret_l == LONG_MIN) || (ret_l == LONG_MAX)) { error("couldn't do a strtol on str %s(%ld): %m", num, ret_l); continue; diff --git a/src/plugins/select/alps/basil_alps.h b/src/plugins/select/alps/basil_alps.h index ad6f29bb988a54e5768fe8df3135bee426b023a1..703a3cd1df43ab8299b5594be76b024395173604 100644 --- a/src/plugins/select/alps/basil_alps.h +++ b/src/plugins/select/alps/basil_alps.h @@ -60,6 +60,7 @@ enum basil_version { BV_5_0, /* Basil 1.2 CLE 5.x unconfirmed simulator version */ BV_5_1, /* Basil 1.3 CLE 5.x unconfirmed simulator version */ BV_5_2, /* Basil 1.3 CLE 5.2 */ + BV_5_2_3, /* Basil 1.3 CLE 5.2.46+ */ BV_MAX }; diff --git a/src/plugins/select/alps/libalps/do_query.c b/src/plugins/select/alps/libalps/do_query.c index 
2e72248c91f028a5f52738adb517a3d6e5a4538f..02f16f7f1557af94fa0613ce2af6dee7a306082e 100644 --- a/src/plugins/select/alps/libalps/do_query.c +++ b/src/plugins/select/alps/libalps/do_query.c @@ -5,6 +5,7 @@ * Licensed under the GPLv2. */ #include "../basil_alps.h" +#include "parser_internal.h" /** * _get_alps_engine - run QUERY of type ENGINE @@ -80,10 +81,15 @@ extern enum basil_version get_basil_version(void) if (_get_alps_engine(engine_version, sizeof(engine_version)) == NULL) fatal("can not determine ALPS Engine version"); - else if ((strncmp(engine_version, "latest", 6) == 0) || - (strncmp(engine_version, "5.2", 3) == 0)) - bv = BV_5_2; - else if (strncmp(engine_version, "5.1", 3) == 0) + else if (strncmp(engine_version, "latest", 6) == 0) { + bv = BV_5_2_3; + } else if (strncmp(engine_version, "5.2", 3) == 0) { + int macro = atoi(engine_version+4); + if (macro >= 3) /* means 5.2.44+ */ + bv = BV_5_2_3; + else + bv = BV_5_2; + } else if (strncmp(engine_version, "5.1", 3) == 0) bv = BV_5_1; else if (strncmp(engine_version, "5.0", 3) == 0) bv = BV_5_0; @@ -112,6 +118,15 @@ extern enum basil_version get_basil_version(void) "src/plugins/select/cray/libalps/do_query.c " "for this version", engine_version); + + if (bv == BV_5_2_3) { + /* Starting in 5.2.UP03 (5.2.44) things changed, so + make it that way */ + basil_5_2_elements[BT_MEMARRAY].depth = 9; + basil_5_2_elements[BT_MEMORY].depth = 10; + basil_5_2_elements[BT_MEMALLOC].depth = 8; + } + return bv; } diff --git a/src/plugins/select/alps/libalps/parser_basil_5.2.c b/src/plugins/select/alps/libalps/parser_basil_5.2.c index 2a69c3ec37514ba7341b198085a1cd08f1c24bac..c08f71f2cbd1cb83b2f4fa626e3221c240c12832 100644 --- a/src/plugins/select/alps/libalps/parser_basil_5.2.c +++ b/src/plugins/select/alps/libalps/parser_basil_5.2.c @@ -6,7 +6,7 @@ */ #include "parser_internal.h" -const struct element_handler basil_5_2_elements[] = { +struct element_handler basil_5_2_elements[] = { [BT_MESSAGE] = { .tag = "Message", .depth = 0xff, /* unused, can appear at any depth */ @@ -135,19 +135,19 @@ const struct element_handler basil_5_2_elements[] = { }, [BT_MEMARRAY] = { .tag = "MemoryArray", - .depth = 9, + .depth = 5, .uniq = true, .hnd = NULL }, [BT_MEMORY] = { .tag = "Memory", - .depth = 10, + .depth = 6, .uniq = false, .hnd = eh_mem }, [BT_MEMALLOC] = { .tag = "MemoryAllocation", - .depth = 8, + .depth = 7, .uniq = false, .hnd = eh_mem_alloc }, diff --git a/src/plugins/select/alps/libalps/parser_common.c b/src/plugins/select/alps/libalps/parser_common.c index 332fe1c565bb66b7355963b9701213cc0a4db020..c12276f16093c647211f813727c2adb992a70305 100644 --- a/src/plugins/select/alps/libalps/parser_common.c +++ b/src/plugins/select/alps/libalps/parser_common.c @@ -515,7 +515,8 @@ static const struct element_handler *basil_tables[BV_MAX] = { [BV_4_1] = basil_4_0_elements, [BV_5_0] = basil_4_0_elements, [BV_5_1] = basil_5_1_elements, - [BV_5_2] = basil_5_2_elements + [BV_5_2] = basil_5_2_elements, + [BV_5_2_3] = basil_5_2_elements }; /** diff --git a/src/plugins/select/alps/libalps/parser_internal.h b/src/plugins/select/alps/libalps/parser_internal.h index 84bf28ea76e7abbdef95ff31f4d8a2e6fd744d24..9df224e996c7091fa2507cd75224ea2c399b6c43 100644 --- a/src/plugins/select/alps/libalps/parser_internal.h +++ b/src/plugins/select/alps/libalps/parser_internal.h @@ -59,7 +59,7 @@ extern const struct element_handler basil_1_1_elements[]; extern const struct element_handler basil_3_1_elements[]; extern const struct element_handler basil_4_0_elements[]; extern const 
struct element_handler basil_5_1_elements[]; -extern const struct element_handler basil_5_2_elements[]; +extern struct element_handler basil_5_2_elements[]; /* atoul.c */ extern int atou64(const char *str, uint64_t *value); extern int atou32(const char *str, uint32_t *value); diff --git a/src/plugins/select/alps/parser_common.h b/src/plugins/select/alps/parser_common.h index 00dd1b63bd60490c34bfc633954682251f002188..85cd0c6832dd3a8759a62e26a210aefb5eb4cf13 100644 --- a/src/plugins/select/alps/parser_common.h +++ b/src/plugins/select/alps/parser_common.h @@ -23,7 +23,8 @@ const char *bv_names[BV_MAX] = { /* Basil Protocol version */ [BV_4_1] = "1.2", [BV_5_0] = "1.2", [BV_5_1] = "1.3", - [BV_5_2] = "1.3" + [BV_5_2] = "1.3", + [BV_5_2_3] = "1.3" }; const char *bv_names_long[BV_MAX] = { /* Actual version name */ @@ -35,7 +36,8 @@ const char *bv_names_long[BV_MAX] = { /* Actual version name */ [BV_4_1] = "4.1", [BV_5_0] = "5.0", [BV_5_1] = "5.1", - [BV_5_2] = "5.2" + [BV_5_2] = "5.2", + [BV_5_2_3] = "5.2" }; /* Basil methods */ diff --git a/src/plugins/select/cons_res/dist_tasks.c b/src/plugins/select/cons_res/dist_tasks.c index 4274f5a6964e9c492977b25bad2ab4443386bf09..56c265b852132d93e02a6a7ef0e35454c8649fbd 100644 --- a/src/plugins/select/cons_res/dist_tasks.c +++ b/src/plugins/select/cons_res/dist_tasks.c @@ -41,14 +41,6 @@ #include "select_cons_res.h" #include "dist_tasks.h" -#if (0) -/* Using CR_SOCKET or CR_SOCKET_MEMORY will not allocate a socket to more - * than one job at a time, but it also will not grant a job access to more - * CPUs on the socket than requested. If ALLOCATE_FULL_SOCKET is defined, - * then a job will be given access to every cores on each allocated socket. - */ -#define ALLOCATE_FULL_SOCKET 1 -#endif /* Max boards supported for best-fit across boards */ /* Larger board configurations may require new algorithm */ diff --git a/src/plugins/task/affinity/affinity.c b/src/plugins/task/affinity/affinity.c index 49a04759d542310544938efd166ec1f79135f504..43730e54bc1a7b0dd0156a93bd86e13e8f67349f 100644 --- a/src/plugins/task/affinity/affinity.c +++ b/src/plugins/task/affinity/affinity.c @@ -36,6 +36,26 @@ #include "affinity.h" +/* Older versions of sched.h (ie. Centos5) don't include CPU_OR. */ +#ifndef CPU_OR + +#ifndef CPU_OP_S +# define __CPU_OP_S(setsize, destset, srcset1, srcset2, op) \ + (__extension__ \ + ({ cpu_set_t *__dest = (destset); \ + const __cpu_mask *__arr1 = (srcset1)->__bits; \ + const __cpu_mask *__arr2 = (srcset2)->__bits; \ + size_t __imax = (setsize) / sizeof (__cpu_mask); \ + size_t __i; \ + for (__i = 0; __i < __imax; ++__i) \ + ((__cpu_mask *) __dest->__bits)[__i] = __arr1[__i] op __arr2[__i]; \ + __dest; })) +#endif + +# define CPU_OR(destset, srcset1, srcset2) \ + __CPU_OP_S (sizeof (cpu_set_t), destset, srcset1, srcset2, |) +#endif + static int is_power = -1; void slurm_chkaffinity(cpu_set_t *mask, stepd_step_rec_t *job, int statval) diff --git a/src/plugins/task/cray/task_cray.c b/src/plugins/task/cray/task_cray.c index 3a2270897944c011e5d251e3e128f36d2a1f0839..5ea375e11873eff8b50ecef73e257c7e77d1bd87 100644 --- a/src/plugins/task/cray/task_cray.c +++ b/src/plugins/task/cray/task_cray.c @@ -100,12 +100,12 @@ const char plugin_name[] = "task CRAY plugin"; const char plugin_type[] = "task/cray"; const uint32_t plugin_version = 100; +#ifdef HAVE_NATIVE_CRAY #ifdef HAVE_NUMA // TODO: Remove this prototype once the prototype appears in numa.h. 
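
The affinity.c hunk above supplies a CPU_OR fallback for glibc versions (e.g. CentOS 5) whose sched.h lacks the macro. As a reminder of what the macro computes, here is a small standalone program that ORs two cpu_set_t affinity masks; it needs _GNU_SOURCE and a reasonably current glibc, and it is not part of the patch.

/* Standalone sketch of what the CPU_OR fallback above provides: combine
 * two cpu_set_t masks into a third one. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
    cpu_set_t a, b, both;
    int i;

    CPU_ZERO(&a);
    CPU_ZERO(&b);
    CPU_ZERO(&both);
    CPU_SET(0, &a);              /* mask a: CPU 0 */
    CPU_SET(2, &b);              /* mask b: CPU 2 */

    CPU_OR(&both, &a, &b);       /* both = a | b */

    for (i = 0; i < 4; i++)
        printf("cpu %d: %s\n", i,
               CPU_ISSET(i, &both) ? "set" : "clear");
    return 0;
}
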
unsigned int numa_bitmask_weight(const struct bitmask *bmp); #endif -#ifdef HAVE_NATIVE_CRAY static void _alpsc_debug(const char *file, int line, const char *func, int rc, int expected_rc, const char *alpsc_func, char *err_msg); @@ -120,7 +120,6 @@ static int _update_num_steps(int val); static int _step_prologue(void); static int _step_epilogue(void); static int track_status = 1; -static int terminated = 0; // A directory on the compute node where temporary files will be kept #define TASK_CRAY_RUN_DIR "/var/run/task_cray" @@ -603,27 +602,15 @@ static int _check_status_file(stepd_step_rec_t *job, } // Check the result - if (status == 0 && !terminated) { + if (status == 0) { if (task->killed_by_cmd) { // We've been killed by request. User already knows return SLURM_SUCCESS; } - // Cancel the job step, since we didn't find the mpi_fini msg - // srun only gets the error() messages by default, send one - // per compute node, but log all other events with info(). - if (terminated) { - info("step %u.%u task %u exited without calling " - "PMI_Finalize()", - job->jobid, job->stepid, task->gtid); - } else { - error("step %u.%u task %u exited without calling " - "PMI_Finalize()", - job->jobid, job->stepid, task->gtid); - terminated = 1; - } - info("reset estatus from %d to %d", task->estatus, SIGKILL); - task->estatus = SIGKILL; + verbose("step %u.%u task %u exited without calling " + "PMI_Finalize()", + job->jobid, job->stepid, task->gtid); } return SLURM_SUCCESS; } diff --git a/src/sacctmgr/sacctmgr.c b/src/sacctmgr/sacctmgr.c index d5b52ebb287706d541dd1d98b36e80df87916eae..bfd1e413a7e50f5b3c189ef540a7b80d85a2e6aa 100644 --- a/src/sacctmgr/sacctmgr.c +++ b/src/sacctmgr/sacctmgr.c @@ -972,8 +972,7 @@ sacctmgr [<OPTION>] [<COMMAND>] \n\ One can get an number of characters by following the field option with \n\ a %%NUMBER option. i.e. 
format=name%%30 will print 30 chars of field name.\n\ \n\ - Account - Account, CoordinatorList, Description, \n\ - Organization \n\ + Account - Account, Coordinators, Description, Organization\n\ \n\ Association - Account, Cluster, DefaultQOS, Fairshare, \n\ GrpCPUMins, GrpCPURunMins, GrpCPUs, GrpJobs, \n\ @@ -1009,7 +1008,7 @@ sacctmgr [<OPTION>] [<COMMAND>] \n\ \n\ Transactions - Action, Actor, Info, TimeStamp, Where \n\ \n\ - User - AdminLevel, CoordinatorList, DefaultAccount, \n\ + User - AdminLevel, Coordinators, DefaultAccount, \n\ DefaultWCKey, User \n\ \n\ WCKey - Cluster, ID, Name, User \n\ diff --git a/src/scancel/opt.c b/src/scancel/opt.c index 4a29a5891fbed6d8df374d3709c84c3043b13564..a5f858738c13aaa962855fc0ca4a8ad6a1c6b4da 100644 --- a/src/scancel/opt.c +++ b/src/scancel/opt.c @@ -457,7 +457,7 @@ static void _opt_args(int argc, char **argv) if (optind < argc) { char **rest = argv + optind; - opt.job_list = xstrdup(*rest); + opt.job_list = rest; _xlate_job_step_ids(rest); } diff --git a/src/scancel/scancel.c b/src/scancel/scancel.c index bc21909ac493210978dc0083ae95535110dfb422..165e8cda6060e605f74a637417b873afd6667b89 100644 --- a/src/scancel/scancel.c +++ b/src/scancel/scancel.c @@ -749,17 +749,20 @@ _confirmation (int i, uint32_t step_id) static int _signal_job_by_str(void) { - int cc; + int cc, i; + int rc = 0; if (opt.signal == (uint16_t) - 1) opt.signal = SIGKILL; - verbose("Terminating job %s", opt.job_list); + for (i = 0; opt.job_list[i]; i++) { + verbose("Terminating job %s", opt.job_list[i]); - cc = slurm_kill_job2(opt.job_list, opt.signal, 0); - if ((cc != SLURM_SUCCESS) && (opt.verbose != -1)) { - error("slurm_kill_job2() failed %s", slurm_strerror(errno)); - return -1; + cc = slurm_kill_job2(opt.job_list[i], opt.signal, 0); + if ((cc != SLURM_SUCCESS) && (opt.verbose != -1)) { + error("slurm_kill_job2() failed %s", slurm_strerror(errno)); + rc = -1; + } } - return 0; + return rc; } diff --git a/src/scancel/scancel.h b/src/scancel/scancel.h index 82e925940d19b7e59a54e8075666fea5744c1b1f..e2f8883ca9f26a1da2ac4d275348392ae2196c2c 100644 --- a/src/scancel/scancel.h +++ b/src/scancel/scancel.h @@ -67,7 +67,7 @@ typedef struct scancel_options { uint32_t *step_id; /* list of job step id's */ char *wckey; /* --wckey */ char *nodelist; /* --nodelist, -w */ - char *job_list; /* list of job ids as char * */ + char **job_list; /* list of job ids as char * */ } opt_t; opt_t opt; diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 745c07f154eee037fa4fb655c05ebc77e008dd68..e7a788a936d5e16df74af54332dddb6f52951dfa 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -2255,16 +2255,17 @@ static void *_assoc_cache_mgr(void *no_data) { ListIterator itr = NULL; struct job_record *job_ptr = NULL; + struct part_record *part_ptr = NULL; slurmdb_qos_rec_t qos_rec; slurmdb_association_rec_t assoc_rec; /* Write lock on jobs, read lock on nodes and partitions */ slurmctld_lock_t job_write_lock = - { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; + { NO_LOCK, WRITE_LOCK, READ_LOCK, WRITE_LOCK }; if (!running_cache) lock_slurmctld(job_write_lock); - while(running_cache == 1) { + while (running_cache == 1) { slurm_mutex_lock(&assoc_cache_mutex); pthread_cond_wait(&assoc_cache_cond, &assoc_cache_mutex); /* This is here to see if we are exiting. If we get @@ -2287,7 +2288,7 @@ static void *_assoc_cache_mgr(void *no_data) * will be in sync. 
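
The scancel change above turns opt.job_list into a char ** pointing directly at the remaining argv entries, so _signal_job_by_str() can walk a NULL-terminated vector and signal every space-delimited job id instead of only the first. A standalone sketch of that pattern follows; send_signal() is a made-up stand-in for slurm_kill_job2(), and the aggregated return code mirrors the hunk.

/* Standalone sketch: walk a NULL-terminated job-id vector (argv + optind
 * already has this shape) and remember any failure without stopping. */
#include <stdio.h>

static int send_signal(const char *job_id, int sig)
{
    printf("would signal job %s with %d\n", job_id, sig);
    return 0;                    /* 0 = success */
}

static int signal_all(char **job_list, int sig)
{
    int i, rc = 0;

    for (i = 0; job_list && job_list[i]; i++) {
        if (send_signal(job_list[i], sig) != 0)
            rc = -1;             /* remember failure, keep going */
    }
    return rc;
}

int main(int argc, char **argv)
{
    (void) argc;
    /* e.g. ./a.out 123 456 789 -- everything after the options */
    return signal_all(argv + 1, 9) ? 1 : 0;
}

Keeping a pointer into argv also removes the need to xstrdup and later free a copied list.
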
*/ debug2("No job list yet"); - goto end_it; + goto handle_parts; } debug2("got real data from the database " @@ -2336,6 +2337,29 @@ static void *_assoc_cache_mgr(void *no_data) } } list_iterator_destroy(itr); + +handle_parts: + if (!part_list) { + /* This could happen in rare occations, it doesn't + * matter since when the job_list is populated things + * will be in sync. + */ + debug2("No part list yet"); + goto end_it; + } + + itr = list_iterator_create(part_list); + while ((part_ptr = list_next(itr))) { + if (part_ptr->allow_qos) + qos_list_build(part_ptr->allow_qos, + &part_ptr->allow_qos_bitstr); + + if (part_ptr->deny_qos) + qos_list_build(part_ptr->deny_qos, + &part_ptr->deny_qos_bitstr); + } + list_iterator_destroy(itr); + end_it: unlock_slurmctld(job_write_lock); /* This needs to be after the lock and after we update the diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 16c78f6d73f526a4e303f7308ae45c7784113040..7f6088cfaf7ce19652f9d11b6e6d0c9d9620f827 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2021,8 +2021,9 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) info("Holding job %u with invalid qos", job_id); xfree(job_ptr->state_desc); job_ptr->state_reason = FAIL_QOS; - } - job_ptr->qos_id = qos_rec.id; + job_ptr->qos_id = 0; + } else + job_ptr->qos_id = qos_rec.id; } build_node_details(job_ptr, false); /* set node_addr */ return SLURM_SUCCESS; diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 73d7f4340b89ff1d38484264ca39433bfb9ffc5d..7b3f3923633efcca7be182c6dd5a4989b2a7c875 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -772,9 +772,17 @@ _get_req_features(struct node_set *node_set_ptr, int node_set_size, return ESLURM_NODES_BUSY; /* reserved */ } else if (resv_bitmap && (!bit_equal(resv_bitmap, avail_node_bitmap))) { + int cnt_in, cnt_out; + cnt_in = bit_set_count(avail_node_bitmap); bit_and(resv_bitmap, avail_node_bitmap); save_avail_node_bitmap = avail_node_bitmap; avail_node_bitmap = resv_bitmap; + cnt_out = bit_set_count(avail_node_bitmap); + if (cnt_in != cnt_out) { + debug2("Advanced reservation removed %d nodes " + "from consideration for job %u", + (cnt_in - cnt_out), job_ptr->job_id); + } resv_bitmap = NULL; } else { FREE_NULL_BITMAP(resv_bitmap); @@ -1823,6 +1831,8 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, } job_end_time_reset(job_ptr); + /* Clear any vestigial GRES in case job was requeued */ + gres_plugin_job_clear(job_ptr->gres_list); job_array_post_sched(job_ptr); if (select_g_job_begin(job_ptr) != SLURM_SUCCESS) { @@ -1862,8 +1872,6 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, if (configuring || bit_overlap(job_ptr->node_bitmap, power_node_bitmap)) job_ptr->job_state |= JOB_CONFIGURING; - /* Clear any vestigial GRES in case job was requeued */ - gres_plugin_job_clear(job_ptr->gres_list); if (select_g_select_nodeinfo_set(job_ptr) != SLURM_SUCCESS) { error("select_g_select_nodeinfo_set(%u): %m", job_ptr->job_id); /* not critical ... 
by now */ diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index b4db063d155de83f4bd9774e553b63320c592e2e..86c47f6bd1a072d00e19507a080fafb5fc399b63 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -595,6 +595,16 @@ extern void qos_list_build(char *qos, bitstr_t **qos_bits) /* Lock here to avoid g_qos_count changing under us */ assoc_mgr_lock(&locks); + if (!g_qos_count) { + error("We have no QOS on the system Ignoring invalid " + "Allow/DenyQOS value(s) %s", + qos); + assoc_mgr_unlock(&locks); + FREE_NULL_BITMAP(*qos_bits); + *qos_bits = NULL; + return; + } + tmp_qos_bitstr = bit_alloc(g_qos_count); tmp_qos = xstrdup(qos); one_qos_name = strtok_r(tmp_qos, ",", &name_ptr); diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 806fd7900891d2baa76cf4e5055b6517c0336402..888d59153eecebd1150ccae9cbe46f9dcaf2f832 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -4087,7 +4087,8 @@ extern int job_test_resv(struct job_record *job_ptr, time_t *when, iter = list_iterator_create(resv_list); while ((res2_ptr = (slurmctld_resv_t *) list_next(iter))) { if ((resv_ptr->flags & RESERVE_FLAG_MAINT) || - (resv_ptr->flags & RESERVE_FLAG_OVERLAP) || + ((resv_ptr->flags & RESERVE_FLAG_OVERLAP) && + !(res2_ptr->flags & RESERVE_FLAG_MAINT)) || (res2_ptr == resv_ptr) || (res2_ptr->node_bitmap == NULL) || (res2_ptr->start_time >= job_end_time) || diff --git a/src/slurmdbd/read_config.c b/src/slurmdbd/read_config.c index e6f011e6aaf48d9916e1c41879f983a5428f1932..624b9218f36c705ff5f07671d1f41c5a2c2c1ee4 100644 --- a/src/slurmdbd/read_config.c +++ b/src/slurmdbd/read_config.c @@ -567,42 +567,26 @@ extern void log_config(void) private_data_string(slurmdbd_conf->private_data, tmp_str, sizeof(tmp_str)); - debug2("PrivateData = %s", tmp_str); - if (slurmdbd_conf->purge_event != NO_VAL) - slurmdb_purge_string(slurmdbd_conf->purge_event, - tmp_str, sizeof(tmp_str), 1); - else - sprintf(tmp_str, "NONE"); + slurmdb_purge_string(slurmdbd_conf->purge_event, + tmp_str, sizeof(tmp_str), 1); debug2("PurgeEventAfter = %s", tmp_str); - if (slurmdbd_conf->purge_job != NO_VAL) - slurmdb_purge_string(slurmdbd_conf->purge_job, - tmp_str, sizeof(tmp_str), 1); - else - sprintf(tmp_str, "NONE"); + slurmdb_purge_string(slurmdbd_conf->purge_job, + tmp_str, sizeof(tmp_str), 1); debug2("PurgeJobAfter = %s", tmp_str); - if (slurmdbd_conf->purge_resv != NO_VAL) - slurmdb_purge_string(slurmdbd_conf->purge_resv, - tmp_str, sizeof(tmp_str), 1); - else - sprintf(tmp_str, "NONE"); + slurmdb_purge_string(slurmdbd_conf->purge_resv, + tmp_str, sizeof(tmp_str), 1); debug2("PurgeResvAfter = %s", tmp_str); - if (slurmdbd_conf->purge_step != NO_VAL) - slurmdb_purge_string(slurmdbd_conf->purge_step, - tmp_str, sizeof(tmp_str), 1); - else - sprintf(tmp_str, "NONE"); + slurmdb_purge_string(slurmdbd_conf->purge_step, + tmp_str, sizeof(tmp_str), 1); debug2("PurgeStepAfter = %s", tmp_str); - if (slurmdbd_conf->purge_suspend != NO_VAL) - slurmdb_purge_string(slurmdbd_conf->purge_suspend, - tmp_str, sizeof(tmp_str), 1); - else - sprintf(tmp_str, "NONE"); + slurmdb_purge_string(slurmdbd_conf->purge_suspend, + tmp_str, sizeof(tmp_str), 1); debug2("PurgeSuspendAfter = %s", tmp_str); debug2("SlurmUser = %s(%u)", diff --git a/src/smap/job_functions.c b/src/smap/job_functions.c index 4e6cb2979f961d8bb0a5f6954022ee6e91ea1e84..d8672429a9cdc2ed853ef8c1417247d3a5587528 100644 --- a/src/smap/job_functions.c +++ b/src/smap/job_functions.c @@ -208,7 +208,7 @@ static void 
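
The qos_list_build() hunk above adds an early return when g_qos_count is zero, so the function never allocates a zero-length bitmap or tries to resolve Allow/DenyQOS names that cannot exist. A rough standalone sketch of that flow, in which a plain array stands in for Slurm's bitstr_t and a hard-coded known_qos table stands in for the assoc_mgr lookup; qos_bits_build() is an invented name.

/* Standalone sketch: bail out when no QOS are defined, otherwise split a
 * comma-separated Allow/DenyQOS string and mark the matching entries. */
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char *known_qos[] = { "normal", "high", "debug" };
static const size_t g_qos_count = sizeof(known_qos) / sizeof(known_qos[0]);

static unsigned char *qos_bits_build(const char *qos_str)
{
    unsigned char *bits;
    char *tmp, *tok, *save = NULL;
    size_t i;

    if (!g_qos_count) {
        fprintf(stderr, "no QOS defined, ignoring \"%s\"\n", qos_str);
        return NULL;
    }

    bits = calloc(g_qos_count, 1);
    tmp = strdup(qos_str);
    for (tok = strtok_r(tmp, ",", &save); tok;
         tok = strtok_r(NULL, ",", &save)) {
        for (i = 0; i < g_qos_count; i++)
            if (!strcmp(tok, known_qos[i]))
                bits[i] = 1;
    }
    free(tmp);
    return bits;
}

int main(void)
{
    unsigned char *bits = qos_bits_build("normal,debug");
    size_t i;

    for (i = 0; bits && i < g_qos_count; i++)
        printf("%s: %d\n", known_qos[i], bits[i]);
    free(bits);
    return 0;
}
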
_print_header_job(void) main_xcord += 3; mvwprintw(text_win, main_ycord, main_xcord, "JOBID"); - main_xcord += 8; + main_xcord += 19; mvwprintw(text_win, main_ycord, main_xcord, "PARTITION"); main_xcord += 10; @@ -320,9 +320,21 @@ static int _print_text_job(job_info_t * job_ptr) mvwprintw(text_win, main_ycord, main_xcord, "%c", job_ptr->num_cpus); main_xcord += 3; - mvwprintw(text_win, main_ycord, - main_xcord, "%d", job_ptr->job_id); - main_xcord += 8; + if (job_ptr->array_task_str) { + mvwprintw(text_win, main_ycord, + main_xcord, "%u_[%s]", + job_ptr->array_job_id, + job_ptr->array_task_str); + } else if (job_ptr->array_task_id != NO_VAL) { + mvwprintw(text_win, main_ycord, + main_xcord, "%u_%u (%u)", + job_ptr->array_job_id, + job_ptr->array_task_id, job_ptr->job_id); + } else { + mvwprintw(text_win, main_ycord, + main_xcord, "%u", job_ptr->job_id); + } + main_xcord += 19; mvwprintw(text_win, main_ycord, main_xcord, "%.10s", job_ptr->partition); main_xcord += 10; diff --git a/src/smap/smap.c b/src/smap/smap.c index 1af2fcdc1edfb0c9d2bc3fbaf03d33a00eee569e..1501acb96df077add0dca4f31eeb1816449f9a60 100644 --- a/src/smap/smap.c +++ b/src/smap/smap.c @@ -45,7 +45,7 @@ #include <signal.h> #include "src/smap/smap.h" -static int min_screen_width = 72; +static int min_screen_width = 80; /******************** * Global Variables * diff --git a/src/squeue/opts.c b/src/squeue/opts.c index cdf673a688ee618bdd52ad9a94f8b66f4c779d45..b453e5b4d82c0420072979fe0a57a2099c8f4a84 100644 --- a/src/squeue/opts.c +++ b/src/squeue/opts.c @@ -87,7 +87,7 @@ static void _parse_long_token( char *token, char *sep, int *field_size, bool *right_justify, char **suffix); static void _print_options( void ); static void _usage( void ); -static bool _check_node_names(char *); +static bool _check_node_names(hostset_t); static bool _find_a_host(char *, node_info_msg_t *); /* @@ -97,7 +97,6 @@ extern void parse_command_line( int argc, char* argv[] ) { char *env_val = NULL; - char *nodes; bool override_format_env = false; int opt_char; int option_index; @@ -151,7 +150,6 @@ parse_command_line( int argc, char* argv[] ) } if (getenv("SQUEUE_PRIORITY")) params.priority_flag = true; - nodes = NULL; while ((opt_char = getopt_long(argc, argv, "A:ahi:j::lL:n:M:O:o:p:Pq:R:rs::S:t:u:U:vVw:", long_options, &option_index)) != -1) { @@ -299,7 +297,6 @@ parse_command_line( int argc, char* argv[] ) optarg); exit(1); } - nodes = xstrdup(optarg); break; case OPT_LONG_HELP: _help(); @@ -309,6 +306,7 @@ parse_command_line( int argc, char* argv[] ) break; case OPT_LONG_START: params.start_flag = true; + override_format_env = true; break; case OPT_LONG_USAGE: _usage(); @@ -374,17 +372,15 @@ parse_command_line( int argc, char* argv[] ) xfree(name2); } - /* Replace params.nodename with the new one */ + /* Replace params.nodes with the new one */ hostset_destroy(params.nodes); params.nodes = nodenames; /* Check if all node names specified * with -w are known to the controller. 
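
The smap hunk above widens the JOBID column to 19 characters and prints job arrays either as arrayjobid_[taskspec] while tasks are still pending or as arrayjobid_taskid (jobid) once a task has its own id. A cut-down standalone sketch of that formatting; struct job and format_job_id() are invented for the example, and NO_VAL here merely plays the role of Slurm's unset sentinel.

/* Standalone sketch: render a job id string the way the hunk above does,
 * sized for the wider 19-character column. */
#include <stdio.h>
#include <stdint.h>

#define NO_VAL (0xfffffffe)

struct job {
    uint32_t job_id;
    uint32_t array_job_id;
    uint32_t array_task_id;
    const char *array_task_str;      /* e.g. "1-100", or NULL */
};

static void format_job_id(const struct job *j, char *buf, size_t len)
{
    if (j->array_task_str)
        snprintf(buf, len, "%u_[%s]",
                 j->array_job_id, j->array_task_str);
    else if (j->array_task_id != NO_VAL)
        snprintf(buf, len, "%u_%u (%u)",
                 j->array_job_id, j->array_task_id, j->job_id);
    else
        snprintf(buf, len, "%u", j->job_id);
}

int main(void)
{
    char buf[32];
    struct job pending = { 1000, 1000, NO_VAL, "1-100" };
    struct job running = { 1005, 1000, 5, NULL };
    struct job plain   = { 42, 0, NO_VAL, NULL };

    format_job_id(&pending, buf, sizeof(buf)); printf("%s\n", buf);
    format_job_id(&running, buf, sizeof(buf)); printf("%s\n", buf);
    format_job_id(&plain, buf, sizeof(buf));   printf("%s\n", buf);
    return 0;
}
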
*/ - if (!_check_node_names(nodes)) { - xfree(nodes); + if (!_check_node_names(params.nodes)) { exit(1); } - xfree(nodes); } if ( ( params.accounts == NULL ) && @@ -1936,11 +1932,10 @@ Usage: squeue [OPTIONS]\n\ /* _check_node_names() */ static bool -_check_node_names(char *names) +_check_node_names(hostset_t names) { int cc; node_info_msg_t *node_info; - hostlist_t l; char *host; hostlist_iterator_t itr; @@ -1955,8 +1950,7 @@ _check_node_names(char *names) return false; } - l = slurm_hostlist_create(names); - itr = hostlist_iterator_create(l); + itr = hostset_iterator_create(names); while ((host = hostlist_next(itr))) { if (!_find_a_host(host, node_info)) { error("Invalid node name %s", host); diff --git a/src/squeue/print.c b/src/squeue/print.c index b2fa51d3d4089ad6bd95d639fc2a2cf5386f91cb..a18ded03ee72050f5d7d03eef2b75a6fa77ebe3e 100644 --- a/src/squeue/print.c +++ b/src/squeue/print.c @@ -351,7 +351,7 @@ int print_job_from_format(squeue_job_rec_t *job_rec_ptr, List list) xfree(job_rec_ptr->job_ptr->partition); job_rec_ptr->job_ptr->partition = xstrdup(job_rec_ptr-> part_name); - + } if (job_rec_ptr->job_ptr->array_task_str && params.array_flag) { if (max_array_size == -1) @@ -846,6 +846,8 @@ int _print_job_schednodes(job_info_t * job, int width, bool right, char* suffix) int _print_job_reason_list(job_info_t * job, int width, bool right, char* suffix) { + int l; + if (job == NULL) { /* Print the Header instead */ char *title = "NODELIST(REASON)"; if (params.cluster_flags & CLUSTER_FLAG_BG) @@ -860,7 +862,8 @@ int _print_job_reason_list(job_info_t * job, int width, bool right, reason = job->state_desc; else reason = job_reason_string(job->state_reason); - snprintf(id, FORMAT_STRING_SIZE, "(%s)", reason); + l = strlen(reason) + 3; /* 3 = () + "" */ + snprintf(id, l, "(%s)", reason); _print_str(id, width, right, true); } else { char *nodes = xstrdup(job->nodes); diff --git a/src/sview/job_info.c b/src/sview/job_info.c index 1838df6cdaae9ac302a5a7331241d1d8e1ac9494..7ffe07c3c1a16e85c1a10cd31d9b2eb5c3307051 100644 --- a/src/sview/job_info.c +++ b/src/sview/job_info.c @@ -2502,11 +2502,10 @@ static void _get_step_nodelist(job_step_info_t *step_ptr, char *buf, static void _layout_step_record(GtkTreeView *treeview, job_step_info_t *step_ptr, - int update) + int update, bool suspended) { char *uname; char tmp_char[50], tmp_nodes[50], tmp_time[50]; - time_t now_time = time(NULL); GtkTreeIter iter; enum job_states state; GtkTreeStore *treestore = @@ -2551,14 +2550,19 @@ static void _layout_step_record(GtkTreeView *treeview, SORTID_NAME), step_ptr->name); + + if (suspended) + state = JOB_SUSPENDED; + else + state = step_ptr->state; + if (!step_ptr->nodes || !strcasecmp(step_ptr->nodes, "waiting...")) { sprintf(tmp_time,"00:00:00"); snprintf(tmp_nodes, sizeof(tmp_nodes), "waiting..."); state = JOB_PENDING; } else { - now_time -= step_ptr->start_time; - secs2time_str(now_time, tmp_time, sizeof(tmp_time)); + secs2time_str(step_ptr->run_time, tmp_time, sizeof(tmp_time)); _get_step_nodelist(step_ptr, tmp_nodes, sizeof(tmp_nodes)); if (cluster_flags & CLUSTER_FLAG_BGQ) { uint32_t nodes = 0; @@ -2580,7 +2584,6 @@ static void _layout_step_record(GtkTreeView *treeview, find_col_name(display_data_job, SORTID_NODES), tmp_char); - state = JOB_RUNNING; } add_display_treestore_line(update, treestore, &iter, @@ -2625,20 +2628,24 @@ static void _layout_step_record(GtkTreeView *treeview, static void _update_step_record(job_step_info_t *step_ptr, GtkTreeStore *treestore, - GtkTreeIter *iter) + GtkTreeIter *iter, 
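
In the squeue print.c hunk above, the reason string is wrapped in parentheses with a length of strlen(reason) + 3, i.e. the two parentheses plus the terminating NUL. A tiny sketch of that arithmetic; the clamp against the destination size is the sketch's own defensive addition and is not part of the patch.

/* Standalone sketch: size the snprintf() limit for "(reason)". */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char id[64];
    const char *reason = "ReqNodeNotAvail";
    size_t need = strlen(reason) + 3;     /* '(' + reason + ')' + NUL */
    size_t limit = need < sizeof(id) ? need : sizeof(id);

    snprintf(id, limit, "(%s)", reason);
    printf("%s\n", id);
    return 0;
}
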
bool suspended) { char *tmp_uname; char tmp_nodes[50]; char tmp_cpu_min[40], tmp_time_run[40], tmp_time_limit[40]; char tmp_node_cnt[40], tmp_time_start[40], tmp_task_cnt[40]; char tmp_step_id[40], tmp_job_id[400]; - time_t now_time = time(NULL); enum job_states state; int color_inx = step_ptr->step_id % sview_colors_cnt; convert_num_unit((float)step_ptr->num_cpus, tmp_cpu_min, sizeof(tmp_cpu_min), UNIT_NONE); + if (suspended) + state = JOB_SUSPENDED; + else + state = step_ptr->state; + if (!step_ptr->nodes || !strcasecmp(step_ptr->nodes,"waiting...")) { sprintf(tmp_time_run, "00:00:00"); @@ -2646,8 +2653,8 @@ static void _update_step_record(job_step_info_t *step_ptr, tmp_node_cnt[0] = '\0'; state = JOB_PENDING; } else { - now_time -= step_ptr->start_time; - secs2time_str(now_time, tmp_time_run, sizeof(tmp_time_run)); + secs2time_str(step_ptr->run_time, + tmp_time_run, sizeof(tmp_time_run)); _get_step_nodelist(step_ptr, tmp_nodes, sizeof(tmp_nodes)); if (cluster_flags & CLUSTER_FLAG_BGQ) { uint32_t nodes = 0; @@ -2665,7 +2672,6 @@ static void _update_step_record(job_step_info_t *step_ptr, tmp_node_cnt, sizeof(tmp_node_cnt), UNIT_NONE); } - state = JOB_RUNNING; } convert_num_unit((float)step_ptr->num_tasks, tmp_task_cnt, @@ -2746,13 +2752,13 @@ static void _append_task_record(sview_job_info_t *sview_job_info_ptr, static void _append_step_record(job_step_info_t *step_ptr, GtkTreeStore *treestore, GtkTreeIter *iter, - int jobid) + int jobid, bool suspended) { GtkTreeIter step_iter; gtk_tree_store_append(treestore, &step_iter, iter); gtk_tree_store_set(treestore, &step_iter, SORTID_POS, jobid, -1); - _update_step_record(step_ptr, treestore, &step_iter); + _update_step_record(step_ptr, treestore, &step_iter, suspended); } static void _handle_task_check(sview_job_info_t *task_ptr, @@ -2919,9 +2925,10 @@ static void _update_info_step(sview_job_info_t *sview_job_info_ptr, g_free(tmp_stepid); if (stepid == (int)step_ptr->step_id) { /* update with new info */ - _update_step_record(step_ptr, - GTK_TREE_STORE(model), - step_iter); + _update_step_record( + step_ptr, GTK_TREE_STORE(model), + step_iter, IS_JOB_SUSPENDED( + sview_job_info_ptr->job_ptr)); goto found; } @@ -2932,7 +2939,9 @@ static void _update_info_step(sview_job_info_t *sview_job_info_ptr, } adding: _append_step_record(step_ptr, GTK_TREE_STORE(model), - iter, sview_job_info_ptr->job_ptr->job_id); + iter, sview_job_info_ptr->job_ptr->job_id, + IS_JOB_SUSPENDED( + sview_job_info_ptr->job_ptr)); found: ; } @@ -3375,8 +3384,11 @@ need_refresh: color_set_flag, false, 0); xfree(color_inx); xfree(color_set_flag); - _layout_step_record(treeview, - step_ptr, update); + + _layout_step_record( + treeview, step_ptr, update, + IS_JOB_SUSPENDED( + sview_job_info->job_ptr)); found = 1; break; } @@ -3551,7 +3563,9 @@ extern int get_new_info_job_step(job_step_info_response_msg_t **info_ptr, */ show_flags |= SHOW_ALL; if (g_step_info_ptr) { - error_code = slurm_get_job_steps(g_step_info_ptr->last_update, + /* Use a last_update time of NULL so that we can get an updated + * run_time for jobs rather than just its start_time */ + error_code = slurm_get_job_steps((time_t) NULL, NO_VAL, NO_VAL, &new_step_ptr, show_flags); if (error_code == SLURM_SUCCESS) { @@ -3839,7 +3853,7 @@ extern void get_info_job(GtkTable *table, display_data_t *display_data) force_refresh)) == SLURM_NO_CHANGE_IN_DATA){ if ((!display_widget || view == ERROR_VIEW) - || (job_error_code != SLURM_NO_CHANGE_IN_DATA)) + || (step_error_code != SLURM_NO_CHANGE_IN_DATA)) goto display_it; } else if 
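
The sview hunks above switch the elapsed column from now() minus start_time to the step's run_time, so suspended steps stop accumulating wall-clock time, and they derive the displayed state from the parent job's suspended flag. The helper below only illustrates the kind of seconds-to-D-HH:MM:SS conversion that secs2time_str() performs; it is not Slurm's implementation.

/* Standalone sketch: format a run_time in seconds as D-HH:MM:SS. */
#include <stdio.h>

static void secs_to_str(long secs, char *buf, size_t len)
{
    long days = secs / 86400;
    long hours = (secs / 3600) % 24;
    long mins = (secs / 60) % 60;

    if (days)
        snprintf(buf, len, "%ld-%02ld:%02ld:%02ld",
                 days, hours, mins, secs % 60);
    else
        snprintf(buf, len, "%02ld:%02ld:%02ld",
                 hours, mins, secs % 60);
}

int main(void)
{
    char buf[32];
    secs_to_str(93784, buf, sizeof(buf));     /* prints 1-02:03:04 */
    printf("%s\n", buf);
    return 0;
}
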
(step_error_code != SLURM_SUCCESS) { if (view == ERROR_VIEW) @@ -4013,7 +4027,7 @@ extern void specific_info_job(popup_info_t *popup_win) == SLURM_NO_CHANGE_IN_DATA) { if ((!spec_info->display_widget || spec_info->view == ERROR_VIEW) - || (job_error_code != SLURM_NO_CHANGE_IN_DATA)) + || (step_error_code != SLURM_NO_CHANGE_IN_DATA)) goto display_it; } else if (step_error_code != SLURM_SUCCESS) { if (spec_info->view == ERROR_VIEW) diff --git a/testsuite/expect/test2.24 b/testsuite/expect/test2.24 index ab112efbd399f2d15344a96fb08f3f589b8c4096..5acaf7a78cc3b31565acc33e18ade01a03d574ce 100755 --- a/testsuite/expect/test2.24 +++ b/testsuite/expect/test2.24 @@ -56,7 +56,10 @@ set val "" set option "" spawn $scontrol show config expect { - + -re "^$" { + # break here since other config files are listed here + # scontrol write config only prints out the normal slurm.conf + } -re "($alpha_numeric_under) *= ($format_time)" { set option $expect_out(1,string) set val $expect_out(2,string) diff --git a/testsuite/expect/test20.7 b/testsuite/expect/test20.7 index 2c5e8b5ce82770551e6f40e86b3d5c6d89f4d5ad..7aa738f7444518c8e8ced819bae7274334bba26f 100755 --- a/testsuite/expect/test20.7 +++ b/testsuite/expect/test20.7 @@ -56,10 +56,13 @@ make_bash_script $file_in "sleep 200" proc check_output { path job_id } { global scontrol exit_code +# Support leading options (i.e. original "stdout=...") + set alpha_numeric_comma_slash "\[a-zA-Z0-9=_,\-\/\]*" + set check_out 0 spawn $scontrol show job $job_id expect { - -re "Comment=stdout=$path" { + -re "Comment=($alpha_numeric_comma_slash)stdout=($path)" { set check_out 1 exp_continue } diff --git a/testsuite/expect/test21.16 b/testsuite/expect/test21.16 index 8a4b9d368e73d8fd6614647f5a14775942d80c4e..9ef63ad79be4fa7f2e611dfb797d466dc264fd75 100755 --- a/testsuite/expect/test21.16 +++ b/testsuite/expect/test21.16 @@ -189,21 +189,16 @@ if { $exit_code } { # Add user with all limits set # incr exit_code [add_user "$tu4,$tu5" [array get user_req2]] -if { $exit_code } { - remove_user "" "" "$tu1,$tu2,$tu3,$tu4,$tu5" - remove_acct "" "$ta1,$ta2,$ta3" - remove_cluster "$tc1,$tc2,$tc3" - remove_qos "$dqos1" - exit $exit_code +if { $exit_code == 0} { + # Check Limits of both users + check_assoc_limit 3 user $tu4 [array get user_req2] + check_assoc_limit 3 user $tu5 [array get user_req2] } -# Check Limits of both users -check_assoc_limit 3 user $tu4 [array get user_req2] -check_assoc_limit 3 user $tu5 [array get user_req2] - incr exit_code [remove_user "" "" "$tu4,$tu5"] incr exit_code [remove_acct "" "$ta1,$ta2,$ta3"] incr exit_code [remove_cluster "$tc1,$tc2,$tc3"] +incr exit_code [remove_qos "$dqos1"] if {$exit_code == 0} { print_success $test_id diff --git a/testsuite/expect/test24.4 b/testsuite/expect/test24.4 index 3ea66e729724cfa4d2495321165ff989de93f9de..7fb9c7ea466ee5b5540602efcb06f522dbb7a86c 100755 --- a/testsuite/expect/test24.4 +++ b/testsuite/expect/test24.4 @@ -78,7 +78,7 @@ expect { exp_continue } - "root|||0.000000|240||1.000000||1.000000|0|0|" { + "root|||0.000000|240||1.000000|||0|0|" { incr matches exp_continue } diff --git a/testsuite/expect/test28.2 b/testsuite/expect/test28.2 index 21224f7dcb911c1035bfca4453053944c99f9922..1bc44dbc67407848222b88cef5097698f7a93456 100755 --- a/testsuite/expect/test28.2 +++ b/testsuite/expect/test28.2 @@ -44,6 +44,8 @@ set array_in "" set array_var "" set exit_code 0 +print_header $test_id + if {[get_array_config] < [expr $array_end + 1]} { send_user "\nWARNING: MaxArraySize is too small\n" exit 0 diff --git 
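
test20.7 above loosens its match so that option=value pairs may precede "stdout=" in the job Comment field. The expect script does this with a Tcl regexp; purely as an illustration of the same character class and capture, the idea is shown below with POSIX regcomp()/regexec() in C. The sample Comment line is made up.

/* Standalone sketch: allow leading option=value pairs before "stdout="
 * in a Comment field and capture the path that follows it. */
#include <regex.h>
#include <stdio.h>

int main(void)
{
    const char *line =
        "Comment=cpu_bind=cores,stdout=/tmp/test20.7.output";
    const char *pattern =
        "Comment=[A-Za-z0-9=_,/-]*stdout=(/[^ ,]*)";
    regex_t re;
    regmatch_t m[2];

    if (regcomp(&re, pattern, REG_EXTENDED) != 0)
        return 1;
    if (regexec(&re, line, 2, m, 0) == 0)
        printf("stdout path: %.*s\n",
               (int)(m[1].rm_eo - m[1].rm_so), line + m[1].rm_so);
    regfree(&re);
    return 0;
}
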
a/testsuite/expect/test5.6 b/testsuite/expect/test5.6 index f0f7c0505fd0edc2ddd4f56d860e3d36b717f51b..4fe514ddc56c0c01504e91bc69d82bb66b489641 100755 --- a/testsuite/expect/test5.6 +++ b/testsuite/expect/test5.6 @@ -223,11 +223,14 @@ expect { # # Check the squeue node filter option # +set match 0 spawn $squeue --format=%u --noheader --user=$this_uid --nodelist=dummy_name expect { - -re "($alpha_numeric_under)" { - send_user "\nFAILURE: squeue node filter failure\n" - set exit_code 1 + + -re "Invalid node name dummy_name" { + send_user "\nNo worries. This error is expected\n" + incr match + exp_continue } timeout { send_user "\nFAILURE: squeue not responding\n" @@ -237,6 +240,11 @@ expect { wait } } +if {$match != 1} { + send_user "\nFAILURE: squeue node filter failure\n" + set exit_code 1 +} + # The node filtering really only works if the job has begun execution set node_name_set 0 spawn $squeue --format=%N --noheader --jobs=$job_id1 --states=RUNNING
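
The test5.6 change above now expects squeue to reject the bogus --nodelist entry with an "Invalid node name dummy_name" error, exercising the stricter _check_node_names() path. A toy sketch of that kind of validation, with a fixed array standing in for the node table returned by the controller.

/* Toy sketch: reject a --nodelist entry that is not a known node.
 * The known_nodes table stands in for the controller's node records. */
#include <stdio.h>
#include <string.h>

static const char *known_nodes[] = { "tux0", "tux1", "tux2", NULL };

static int find_a_host(const char *name)
{
    int i;
    for (i = 0; known_nodes[i]; i++)
        if (!strcmp(name, known_nodes[i]))
            return 1;
    return 0;
}

int main(void)
{
    const char *requested = "dummy_name";

    if (!find_a_host(requested)) {
        fprintf(stderr, "Invalid node name %s\n", requested);
        return 1;
    }
    return 0;
}
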