From 5a6c72897913bc85c705ce47521e7bd5608c7374 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= <remi@rezib.org> Date: Mon, 11 May 2015 23:05:52 +0200 Subject: [PATCH] Imported Upstream version 14.11.3 --- META | 4 +- NEWS | 54 ++++++++++++- doc/html/cgroups.shtml | 8 +- doc/html/faq.shtml | 35 +++++++-- doc/html/gres.shtml | 2 +- doc/html/header.txt | 2 +- doc/html/high_throughput.shtml | 24 +++++- doc/html/mpi_guide.shtml | 60 +++++++------- doc/html/team.shtml | 1 + doc/man/man1/salloc.1 | 4 + doc/man/man1/sbatch.1 | 4 + doc/man/man1/srun.1 | 7 ++ doc/man/man5/slurm.conf.5 | 20 ++--- doc/man/man8/slurmctld.8 | 9 ++- src/api/cancel.c | 3 + src/api/slurm_pmi.c | 3 + src/common/slurm_jobacct_gather.c | 1 + src/common/slurm_protocol_defs.c | 1 + src/common/slurm_protocol_util.c | 15 ++-- .../accounting_storage/mysql/as_mysql_acct.c | 15 ++-- .../accounting_storage/mysql/as_mysql_assoc.c | 41 ++++++++++ .../mysql/as_mysql_jobacct_process.c | 22 +++--- .../accounting_storage/mysql/as_mysql_user.c | 51 +++++++++--- src/plugins/auth/munge/auth_munge.c | 2 +- src/plugins/mpi/pmi2/tree.c | 9 ++- .../select/bluegene/ba_bgq/block_allocator.c | 4 +- src/plugins/slurmctld/nonstop/do_work.c | 17 ++-- src/plugins/task/affinity/dist_tasks.c | 15 ++++ src/plugins/task/cgroup/task_cgroup_cpuset.c | 7 ++ src/sbatch/opt.c | 2 +- src/slurmctld/gang.c | 47 ++++++----- src/slurmctld/job_mgr.c | 71 +++++++++-------- src/slurmctld/job_scheduler.c | 11 +-- src/slurmctld/node_scheduler.c | 2 +- src/slurmctld/proc_req.c | 23 ++++-- src/slurmctld/reservation.c | 1 + src/slurmctld/slurmctld.h | 12 ++- src/slurmctld/step_mgr.c | 12 ++- src/slurmd/slurmd/req.c | 1 + src/squeue/print.c | 7 +- src/srun/libsrun/fname.c | 11 +-- src/srun/libsrun/opt.c | 4 +- src/srun/libsrun/srun_job.c | 41 ++++------ testsuite/expect/globals | 78 ++++++++++++------- testsuite/expect/test1.75 | 13 ++-- testsuite/expect/test17.36 | 2 +- testsuite/expect/test21.26 | 33 ++++---- testsuite/expect/test28.7 | 2 +- 48 files changed, 544 insertions(+), 269 deletions(-) diff --git a/META b/META index 8a6458fac..2432d01b6 100644 --- a/META +++ b/META @@ -9,8 +9,8 @@ Name: slurm Major: 14 Minor: 11 - Micro: 2 - Version: 14.11.2 + Micro: 3 + Version: 14.11.3 Release: 1 ## diff --git a/NEWS b/NEWS index 905ceb9b7..f4c3977a9 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,47 @@ This file describes changes in recent versions of Slurm. It primarily documents those changes that are of interest to users and administrators. +* Changes in Slurm 14.11.3 +========================== + -- Prevent vestigial job record when cancelling a pending job array record. + -- Fixed squeue core dump. + -- Fix job array hash table bug, could result in slurmctld infinite loop or + invalid memory reference. + -- In srun honor ntasks_per_node before looking at cpu count when the user + doesn't request a number of tasks. + -- Fix ghost job when submitting job after all jobids are exhausted. + -- MySQL - Enhanced coordinator security checks. + -- Fix for task/affinity if an admin configures a node for having threads + but then sets CPUs to only represent the number of cores on the node. + -- Make it so previous versions of salloc/srun work with newer versions + of Slurm daemons. + -- Avoid delay on commit for PMI rank 0 to improve performance with some + MPI implementations. + -- auth/munge - Correct logic to read old format AccountingStoragePass. + -- Reset node "RESERVED" state as appropriate when deleting a maintenance + reservation. 
+ -- Prevent a job manually suspended from being resumed by gang scheduler once + free resources are available. + -- Prevent invalid job array task ID value if a task is started using gang + scheduling. + -- Fixes for clean build on FreeBSD. + -- Fix documentation bugs in slurm.conf.5. DenyAccount should be DenyAccounts. + -- For backward compatibility with older versions of OMPI not compiled + with --with-pmi restore the SLURM_STEP_RESV_PORTS in the job environment. + -- Update the html documentation describing the integration with openmpi. + -- Fix sacct when searching by nodelist. + -- Fix cosmetic info statements when dealing with a job array task instead of + a normal job. + -- Fix segfault with job arrays. + -- Correct the sbatch pbs parser to process -j. + -- BGQ - Put print statement under a DebugFlag. This was just an oversight. + -- BLUEGENE - Remove check that would erroneously remove the CONFIGURING + flag from a job while the job is waiting for a block to boot. + -- Fix segfault in slurmstepd when job exceeded memory limit. + -- Fix race condition that could start a job that is dependent upon a job array + before all tasks of that job array complete. + -- PMI2 race condition fix. + * Changes in Slurm 14.11.2 ========================== -- Fix Centos5 compile errors. @@ -37,9 +78,10 @@ documents those changes that are of interest to users and administrators. -- In proctrack/linuxproc and proctrack/pgid, check the result of strtol() for error condition rather than errno, which might have a vestigial error code. - -- Improve information recording for jobs deferred due to advanced reservation. - -- Exports eio_new_initial_obj to the plugins and initialize kvs_seq on mpi/pmi2 - setup to support launching. + -- Improve information recording for jobs deferred due to advanced + reservation. + -- Exports eio_new_initial_obj to the plugins and initialize kvs_seq on + mpi/pmi2 setup to support launching. * Changes in Slurm 14.11.1 ========================== @@ -366,6 +408,12 @@ documents those changes that are of interest to users and administrators. * Changes in Slurm 14.03.12 =========================== + -- Make it so previous versions of salloc/srun work with newer versions + of Slurm daemons. + -- PMI2 race condition fix. + -- Avoid delay on commit for PMI rank 0 to improve performance with some + MPI implementations. + -- Correct the sbatch pbs parser to process -j. * Changes in Slurm 14.03.11 =========================== diff --git a/doc/html/cgroups.shtml b/doc/html/cgroups.shtml index 56fb9bdf1..4df1cf2ee 100644 --- a/doc/html/cgroups.shtml +++ b/doc/html/cgroups.shtml @@ -109,15 +109,15 @@ options also apply. See the <a href="cgroup.conf.html">cgroup.conf</a> man page for details.</p> <h3>jobacct_gather/cgroup plugin</h3> -<b>At present, jobacct_gather/cgroup should be considered experimental.</b> <p> The jobacct_gather/cgroup plugin is an alternative to the jobacct_gather/linux plugin for the collection of accounting statistics for jobs, steps and tasks. -The cgroup plugin may provide improved performance over jobacct_gather/linux. jobacct_gather/cgroup uses the cpuacct, memory and blkio subsystems. Note: the cpu and memory statistics collected by this plugin do not represent the same resources as the cpu and memory statistics collected by the -jobacct_gather/linux plugin (sourced from /proc stat). +jobacct_gather/linux plugin (sourced from /proc stat). While originally +thought to be faster, in practice it has been proven to be slower than the +jobacct_gather/linux plugin. 
<p>To enable this plugin, configure the following option in slurm.conf: <pre>JobacctGatherType=jobacct_gather/cgroup</pre> </p> @@ -205,6 +205,6 @@ the following example.</li> </ul> <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 30 May 2014</p> +<p style="text-align:center;">Last modified 7 Jan 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index 466319c4f..501f1abc0 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -175,6 +175,7 @@ launch a shell on a node in the job's allocation?</a></li> <li><a href="#state_info">How could some jobs submitted immediately before the slurmctld daemon crashed be lost?</a></li> <li><a href="#delete_partition">How do I safely remove partitions?</a></li> +<li><a href="#cpu_freq">Why is Slurm unable to set the CPU frequency for jobs?</a></li> </ol> <h2>For Management</h2> @@ -222,7 +223,7 @@ Note that setting the node DOWN will terminate all running or suspended jobs associated with that node. An alternative is to set the node's state to DRAIN until all jobs associated with it terminate before setting it DOWN and re-booting.</p> -<p>Note tha. Slurm has two configuration parameters that may be used to +<p>Note that Slurm has two configuration parameters that may be used to automate some of this process. <i>UnkillableStepProgram</i> specifies a program to execute when non-killable processes are identified. @@ -1195,13 +1196,13 @@ pathnames and port numbers to avoid conflicts. The only problem is if more than one version of Slurm is configured with <i>switch/elan</i> or <i>switch/federation</i>. In that case, there can be conflicting switch window requests from -the differen. Slurm systems. +the different Slurm systems. This can be avoided by configuring the test system with <i>switch/none</i>. MPI jobs started on an Elan or Federation switch system without the switch windows configured will not execute properly, but other jobs will run fine. Another option for testing on Elan or Federation systems is to use -a different set of nodes for the differen. Slurm systems. +a different set of nodes for the different Slurm systems. That will permit both systems to allocate switch windows without conflicts. @@ -1738,7 +1739,7 @@ It has integration with Slurm as well as Torque resource managers.</p> <p><a name="add_nodes"><b>47. What process should I follow to add nodes to Slurm?</b></a></br> The slurmctld daemon has a multitude of bitmaps to track state of nodes and cores in the system. Adding nodes to a running system would require the slurmctld daemon -rebuild all of those bitmaps, which the developers feel would be safer to do by +to rebuild all of those bitmaps, which the developers feel would be safer to do by restarting the daemon. Communications from the slurmd daemons on the compute nodes to the slurmctld daemon include a configuration file checksum, so you probably also want to maintain a common slurm.conf file on all nodes. The @@ -1752,13 +1753,17 @@ following procedure is recommended: "scontrol reconfig", no need to restart the daemons)</li> </ol> +NOTE: Jobs submitted with srun, and that are waiting for an allocation, +prior to new nodes being added to the slurm.conf can fail if the job is +allocated one of the new nodes. + <p><a name="licenses"><b>48. Can Slurm be configured to manage licenses?</b></a></br> Slurm is not currently integrated with FlexLM, but it does provide for the allocation of global resources called licenses. 
Use the Licenses configuration parameter in your slurm.conf file (e.g. "Licenses=foo:10,bar:20"). Jobs can request licenses and be granted exclusive use of those resources (e.g. "sbatch --licenses=foo:2,bar:1 ..."). -It is not currently possible to change the total number of lincenses on a system +It is not currently possible to change the total number of licenses on a system without restarting the slurmctld daemon, but it is possible to dynamically reserve licenses and remove them from being available to jobs on the system (e.g. "scontrol update reservation=licenses_held licenses=foo:5,bar:2").</p> @@ -1961,8 +1966,26 @@ Removing a partition from the slurm.conf and restarting will cancel any existing jobs that reference the removed partitions. </p> +<p><a name="cpu_freq"><b>61. Why is Slurm unable to set the CPU frequency for + jobs?</b></a><br> +First check that Slurm is configured to bind jobs to specific CPUs by +making sure that TaskPlugin is configured to either affinity or cgroup. +Next check that your processor is configured to permit frequency +control by examining the values in the file +<i>/sys/devices/system/cpu/cpu0/cpufreq</i> where "cpu0" represents CPU ID 0. +Of particular interest is the file <i>scaling_available_governors</i>, +which identifies the CPU governors available. +If "userspace" is not an available CPU governor, this may well be due to the +<i>intel_pstate</i> driver being installed. +Information about disabling the <i>intel_pstate</i> driver is available +from<br> +<a href="https://bugzilla.kernel.org/show_bug.cgi?id=57141"> +https://bugzilla.kernel.org/show_bug.cgi?id=57141</a> and<br> +<a href="http://unix.stackexchange.com/questions/121410/setting-cpu-governor-to-on-demand-or-conservative"> +http://unix.stackexchange.com/questions/121410/setting-cpu-governor-to-on-demand-or-conservative</a>.</p> + <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 15 October 2014</p> +<p style="text-align:center;">Last modified 30 December 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/gres.shtml b/doc/html/gres.shtml index 8dfd69005..37a373fb3 100644 --- a/doc/html/gres.shtml +++ b/doc/html/gres.shtml @@ -43,7 +43,7 @@ than configured will be placed in a DOWN state.</P> <P>Sample slurm.conf file:</P> <PRE> # Configure support for our four GPUs -GresType=gpu,bandwidth +GresTypes=gpu,bandwidth NodeName=tux[0-7] Gres=gpu:tesla:2,gpu:kepler:2,bandwidth:lustre:no_consume:4G </PRE> diff --git a/doc/html/header.txt b/doc/html/header.txt index 78e6e7b0b..f43f71f67 100644 --- a/doc/html/header.txt +++ b/doc/html/header.txt @@ -51,7 +51,7 @@ window.onresize = window_check; <ul> <li><a href="overview.shtml" class="nav">Overview</a></li> <li><a href="news.shtml" class="nav">What's New</a></li> - <li><a href="team.shtml" class="nav">SLURM Team</a></li> + <li><a href="team.shtml" class="nav">Slurm Team</a></li> <li><a href="meetings.shtml" class="nav">Meetings</a></li> <li><a href="testimonials.shtml" class="nav">Testimonials</a></li> <li><a href="disclaimer.shtml" class="nav">Legal Notices</a></li> diff --git a/doc/html/high_throughput.shtml b/doc/html/high_throughput.shtml index 1dd4581b2..2e41d57cc 100644 --- a/doc/html/high_throughput.shtml +++ b/doc/html/high_throughput.shtml @@ -18,7 +18,7 @@ configuration used.</p> <h2>System configuration</h2> -<p>Three system configuration parameters must be set to support a large number +<p>Several system configuration parameters may require modification to support a large
number of open files and TCP connections with large bursts of messages. Changes can be made using the <b>/etc/rc.d/rc.local</b> or <b>/etc/sysctl.conf</b> script to preserve changes after reboot. In either case, you can write values directly into these files @@ -29,11 +29,27 @@ directly into these files The maximum number of concurrently open files. We recommend a limit of at least 32,832.</li> <li><b>/proc/sys/net/ipv4/tcp_max_syn_backlog</b>: -Maximum number of remembered connection requests, which are still did not -receive an acknowledgment from connecting client. +The maximum number of SYN requests to keep in memory that we have yet to get +the third packet in a 3-way handshake from. +The tcp_max_syn_backlog variable is overridden by the tcp_syncookies variable, +which needs to be turned on for this variable to have any effect. The default value is 1024 for systems with more than 128Mb of memory, and 128 for low memory machines. If server suffers of overload, try to increase this number.</li> +<li><b>/proc/sys/net/ipv4/tcp_syncookies</b>: +Used to send out <i>syncookies</i> to hosts when the kernel's syn backlog queue +for a specific socket is overflowed. +The default value is 0, which disables this functionality. +Set the value to 1. +<li><b>/proc/sys/net/ipv4/tcp_synack_retries</b>: +How many times to retransmit the SYN,ACK reply to a SYN request. +In other words, this tells the system how many times to try to establish a +passive TCP connection that was started by another host. +This variable takes an integer value, but should under no circumstances be +larger than 255. +Each retransmission will take approximately 30 to 40 seconds. +The default value is 5, which results in a timeout of passive TCP connections +of approximately 180 seconds and is generally satisfactory. <li><b>/proc/sys/net/core/somaxconn</b>: Limit of socket listen() backlog, known in userspace as SOMAXCONN. Defaults to 128. The value should be raised substantially to support bursts of request. @@ -193,6 +209,6 @@ speedup can be achieved by setting the CommitDelay option in the <li><b>PurgeSuspendAfer</b>=1month</li> </ul> -<p style="text-align:center;">Last modified 21 July 2014</p> +<p style="text-align:center;">Last modified 23 December 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/mpi_guide.shtml b/doc/html/mpi_guide.shtml index aa11bed90..581eacd47 100644 --- a/doc/html/mpi_guide.shtml +++ b/doc/html/mpi_guide.shtml @@ -14,7 +14,7 @@ mpirun launches tasks using Slurm's infrastructure (LAM/MPI and HP-MPI).</li> <li>Slurm creates a resource allocation for the job and then mpirun launches tasks using some mechanism other than SLURM, such as SSH or RSH (BlueGene MPI and some MPICH1 modes). -These tasks initiated outside of Slurm's monitoring +These tasks are initiated outside of Slurm's monitoring or control. Slurm's epilog should be configured to purge these tasks when the job's allocation is relinquished. </li> </ol> @@ -58,40 +58,48 @@ with Slurm are provided below. <span class="commandline">srun</span> command. It relies upon Slurm version 2.0 (or higher) managing reservations of communication ports for use by the Open MPI version 1.5 (or higher). -The system administrator must specify the range of ports to be reserved -in the <i>slurm.conf</i> file using the <i>MpiParams</i> parameter.
-For example: <br> -<i>MpiParams=ports=12000-12999</i></p> -<p>OpenMPI must also be configured with <i>--with-pmi</i> to support the port -reservation method.</p> -<p>Launch tasks using the <span class="commandline">srun</span> command -plus the option <i>--resv-ports</i>. Alternately define the environment +<p>If OpenMPI is configured with <i>--with-pmi</i>, either pmi or pmi2, +the OMPI jobs can be launched directly using the srun command. This is +the preferred way. If pmi2 support is enabled then the command line +option '--mpi=pmi2' has to be specified on the srun command line. + +<p> +For older versions of OMPI not compiled with pmi support, +the system administrator must specify the range of ports to be reserved +in the <i>slurm.conf</i> file using the <i>MpiParams</i> parameter. +For example: MpiParams=ports=12000-12999 + +<p> +Alternatively, tasks can be launched using the srun command +plus the option <i>--resv-ports</i> or using the environment variable <i>SLURM_RESV_PORT</i>, which is equivalent to always including <i>--resv-ports</i> on srun's execute line. + The ports reserved on every allocated node will be identified in an -environment variable available to the tasks as shown here: <br> -<i>SLURM_STEP_RESV_PORTS=12000-12015</i><br> -<b>NOTE:</b> In Slurm version 2.5, the <i>--resv-ports</i> option is -automatically set when the MPI type is set to OpenMPI, either explicitly with -the <i>--mpi</i> option or by the <i>MpiDefault</i> configuration parameter.</p> +environment variable available to the tasks as shown here: +SLURM_STEP_RESV_PORTS=12000-12015<p> <pre> $ salloc -n4 sh # allocates 4 processors and spawns shell for job -> srun --resv-port a.out -> exit # exits shell spawned by initial salloc command +> srun a.out +> exit # exits shell spawned by initial salloc command </pre> -<p>Or</p> +<p>or</p> <pre> -$ export SLURM_RESV_PORT=1 -$ salloc -n4 sh # allocates 4 processors and spawns shell for job -> srun a.out -> exit # exits shell spawned by initial salloc command +> srun -n 4 a.out </pre> +<p>or using the pmi2 support</p> + +<pre> +> srun --mpi=pmi2 -n 4 a.out +</pre> + + <p>If the ports reserved for a job step are found by the Open MPI library to be in use, a message of this form will be printed and the job step will be re-launched:<br> @@ -101,16 +109,6 @@ Repeated failures should be reported to your system administrator in order to rectify the problem by cancelling the processes holding those ports.</p> -<h3>OpenMPI Version 1.4 or earlier</h3> -<p>Older versions of Open MPI and Slurm rely upon Slurm to allocate resources -for the job and then mpirun to initiate the tasks.
-For example:</p> -<pre> -$ salloc -n4 sh # allocates 4 processors and spawns shell for job -> mpirun a.out -> exit # exits shell spawned by initial salloc command -</pre> - <hr size=4 width="100%"> <h2><a name="intel_mpi"><b>Intel MPI</b></a></h2> diff --git a/doc/html/team.shtml b/doc/html/team.shtml index 3c59135b3..96c09d540 100644 --- a/doc/html/team.shtml +++ b/doc/html/team.shtml @@ -27,6 +27,7 @@ Lead Slurm developers are: <li><a href="http://www.schedmd.com">SchedMD</a></li> <li><a href="http://www.cscs.ch">Swiss National Supercomputing Centre</a></li> <br><!-- INDIVIDUALS, PLEASE KEEP IN ALPHABETICAL ORDER --><br> +<li>Daniel Ahlin (KTH, Sweden)</li> <li>Ramiro Alba (Centre Tecnològic de Tranferència de Calor, Spain)</li> <li>Amjad Majid Ali (Colorado State University)</li> <li>Pär Andersson (National Supercomputer Centre, Sweden)</li> diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index 0f5dc850a..c0e7e775b 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -1330,6 +1330,10 @@ The reservation ID on Cray systems running ALPS/BASIL only. \fBSLURM_CLUSTER_NAME\fR Name of the cluster on which the job is executing. .TP +\fBSLURM_CPUS_PER_TASK\fR +Number of cpus requested per task. +Only set if the \fB\-\-cpus\-per\-task\fR option is specified. +.TP \fBSLURM_DISTRIBUTION\fR Same as \fB\-m, \-\-distribution\fR .TP diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index c5042e3d1..443292b15 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -1571,6 +1571,10 @@ Name of the cluster on which the job is executing. \fBSLURM_CPUS_ON_NODE\fR Number of CPUS on the allocated node. .TP +\fBSLURM_CPUS_PER_TASK\fR +Number of cpus requested per task. +Only set if the \fB\-\-cpus\-per\-task\fR option is specified. +.TP \fBSLURM_DISTRIBUTION\fR Same as \fB\-m, \-\-distribution\fR .TP diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index d76efaa46..cfbb4d2c2 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -2021,6 +2021,9 @@ Also see \fBSLURM_EXIT_ERROR\fR. \fBSLURM_GEOMETRY\fR Same as \fB\-g, \-\-geometry\fR .TP +\fBSLURM_HINT\fR +Same as \fB\-\-hint\fR +.TP \fBSLURM_GRES\fR Same as \fB\-\-gres\fR. Also see \fBSLURM_STEP_GRES\fR .TP @@ -2224,6 +2227,10 @@ jobs, so the value indicates the total count of CPUs on the node. For the select/cons_res plugin, this number indicates the number of cores on this node allocated to the job. .TP +\fBSLURM_CPUS_PER_TASK\fR +Number of cpus requested per task. +Only set if the \fB\-\-cpus\-per\-task\fR option is specified. +.TP \fBSLURM_DISTRIBUTION\fR Distribution type for the allocated jobs. Set the distribution with \-m, \-\-distribution. diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 98af0cbb0..6a4037ce7 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -2065,9 +2065,8 @@ started immediately, only those tasks expected to start at some future time. The default value is 20 tasks. .TP \fBbf_max_job_part=#\fR -The maximum number of jobs per partition to attempt backfill scheduling for, -not counting jobs which cannot be started due to an association resource -limit. This can be especially helpful for systems with large numbers of +The maximum number of jobs per partition to attempt starting with the backfill +scheduler. This can be especially helpful for systems with large numbers of partitions and jobs. The default value is 0, which means no limit. This option applies only to \fBSchedulerType=sched/backfill\fR. @@ -2091,9 +2090,8 @@ desirable. 
This option applies only to \fBSchedulerType=sched/backfill\fR. .TP \fBbf_max_job_user=#\fR -The maximum number of jobs per user to attempt backfill scheduling for, -not counting jobs which cannot be started due to an association resource -limit. One can set this limit to prevent users from flooding the backfill +The maximum number of jobs per user to attempt starting with the backfill +scheduler. One can set this limit to prevent users from flooding the backfill queue with jobs that cannot start and that prevent jobs from other users to start. This is similar to the MAXIJOB limit in Maui. The default value is 0, which means no limit. @@ -3261,9 +3259,11 @@ The default value is 1. .TP \fBCPUs\fR Number of logical processors on the node (e.g. "2"). -If \fBCPUs\fR is omitted, it will set equal to the product of +CPUs and Boards are mutually exclusive. It can be set to the total +number of sockets, cores or threads. This can be useful when you +want to schedule only the cores on a hyper-threaded node. +If \fBCPUs\fR is omitted, it will be set equal to the product of \fBSockets\fR, \fBCoresPerSocket\fR, and \fBThreadsPerCore\fR. -CPUs and Boards are mutually exclusive. The default value is 1. .TP @@ -3730,11 +3730,11 @@ accounting, which samples memory use on a periodic basis (data need not be stored, just collected). .TP -\fBDenyAccount\fR +\fBDenyAccounts\fR Comma separated list of accounts which may not execute jobs in the partition. By default, no accounts are denied access \fBNOTE:\fR If AllowAccounts is used then DenyAccounts will not be enforced. -Also refer to AllowAccount. +Also refer to AllowAccounts. .TP \fBDenyQos\fR diff --git a/doc/man/man8/slurmctld.8 b/doc/man/man8/slurmctld.8 index b54530eb1..85a5e58f2 100644 --- a/doc/man/man8/slurmctld.8 +++ b/doc/man/man8/slurmctld.8 @@ -20,9 +20,12 @@ system. .TP \fB\-c\fR Clear all previous \fBslurmctld\fR state from its last checkpoint. -Without this option, previously running jobs will be preserved along -with node \fIState\fR of DOWN, DRAINED and DRAINING nodes and the associated -\fIReason\fR field for those nodes. +With this option, all jobs, including both running and queued, and all +node states, will be deleted. Without this option, previously running +jobs will be preserved along with node \fIState\fR of DOWN, DRAINED +and DRAINING nodes and the associated \fIReason\fR field for those nodes. +NOTE: It is rare you would ever want to use this in production as all +jobs will be killed. 
.TP \fB\-D\fR diff --git a/src/api/cancel.c b/src/api/cancel.c index 9a71403ab..85795e4bf 100644 --- a/src/api/cancel.c +++ b/src/api/cancel.c @@ -68,6 +68,7 @@ slurm_kill_job (uint32_t job_id, uint16_t signal, uint16_t flags) /* * Request message: */ + memset(&req, 0, sizeof(job_step_kill_msg_t)); req.job_id = job_id; req.sjob_id = NULL; req.job_step_id = NO_VAL; @@ -104,6 +105,7 @@ slurm_kill_job_step (uint32_t job_id, uint32_t step_id, uint16_t signal) /* * Request message: */ + memset(&req, 0, sizeof(job_step_kill_msg_t)); req.job_id = job_id; req.sjob_id = NULL; req.job_step_id = step_id; @@ -137,6 +139,7 @@ slurm_kill_job2(const char *job_id, uint16_t signal, uint16_t batch_flag) slurm_msg_t_init(&msg); + memset(&req, 0, sizeof(job_step_kill_msg_t)); req.job_id = NO_VAL; req.sjob_id = xstrdup(job_id); req.job_step_id = NO_VAL; diff --git a/src/api/slurm_pmi.c b/src/api/slurm_pmi.c index 490cb5aa0..faeaf35b5 100644 --- a/src/api/slurm_pmi.c +++ b/src/api/slurm_pmi.c @@ -77,6 +77,9 @@ static void _delay_rpc(int pmi_rank, int pmi_size) uint32_t delta_time, error_time; int retries = 0; + if (pmi_rank == 0) /* Rank 0 has extra communications with no */ + return; /* risk of induced packet storm */ + _set_pmi_time(); again: if (gettimeofday(&tv1, NULL)) { diff --git a/src/common/slurm_jobacct_gather.c b/src/common/slurm_jobacct_gather.c index 887911002..0c89d9317 100644 --- a/src/common/slurm_jobacct_gather.c +++ b/src/common/slurm_jobacct_gather.c @@ -134,6 +134,7 @@ static void _acct_kill_step(void) /* * Request message: */ + memset(&req, 0, sizeof(job_step_kill_msg_t)); req.job_id = jobacct_job_id; req.job_step_id = jobacct_step_id; req.signal = SIGKILL; diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 6d6237cf4..001031596 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -128,6 +128,7 @@ extern void slurm_msg_t_init(slurm_msg_t *msg) extern void slurm_msg_t_copy(slurm_msg_t *dest, slurm_msg_t *src) { slurm_msg_t_init(dest); + dest->protocol_version = src->protocol_version; dest->forward = src->forward; dest->ret_list = src->ret_list; dest->forward_struct = src->forward_struct; diff --git a/src/common/slurm_protocol_util.c b/src/common/slurm_protocol_util.c index f9987e8d8..8991808bc 100644 --- a/src/common/slurm_protocol_util.c +++ b/src/common/slurm_protocol_util.c @@ -88,11 +88,16 @@ int check_header_version(header_t * header) case REQUEST_RUN_JOB_STEP: case RESPONSE_LAUNCH_TASKS: case RESPONSE_RUN_JOB_STEP: - /* Disable job step creation/launch between major - * releases. Other RPCs should all be supported. */ - debug("unsupported RPC type %hu", header->msg_type); - slurm_seterrno_ret(SLURM_PROTOCOL_VERSION_ERROR); - break; + if (working_cluster_rec) { + /* Disable job step creation/launch + * between major releases. Other RPCs + * should all be supported. 
*/ + debug("unsupported RPC type %hu", + header->msg_type); + slurm_seterrno_ret( + SLURM_PROTOCOL_VERSION_ERROR); + break; + } default: if ((header->version != SLURM_PROTOCOL_VERSION) && (header->version != SLURM_14_03_PROTOCOL_VERSION) && diff --git a/src/plugins/accounting_storage/mysql/as_mysql_acct.c b/src/plugins/accounting_storage/mysql/as_mysql_acct.c index 88b5345ac..56a304bf2 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_acct.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_acct.c @@ -206,6 +206,9 @@ extern int as_mysql_add_accts(mysql_conn_t *mysql_conn, uint32_t uid, if (!object->assoc_list) continue; + if (!assoc_list) + assoc_list = + list_create(slurmdb_destroy_association_rec); list_transfer(assoc_list, object->assoc_list); } list_iterator_destroy(itr); @@ -225,14 +228,12 @@ extern int as_mysql_add_accts(mysql_conn_t *mysql_conn, uint32_t uid, } else xfree(txn_query); - if (list_count(assoc_list)) { - if (as_mysql_add_assocs(mysql_conn, uid, assoc_list) - == SLURM_ERROR) { - error("Problem adding user associations"); - rc = SLURM_ERROR; - } + if (assoc_list && list_count(assoc_list)) { + if ((rc = as_mysql_add_assocs(mysql_conn, uid, assoc_list)) + != SLURM_SUCCESS) + error("Problem adding accounts associations"); } - list_destroy(assoc_list); + FREE_NULL_LIST(assoc_list); return rc; } diff --git a/src/plugins/accounting_storage/mysql/as_mysql_assoc.c b/src/plugins/accounting_storage/mysql/as_mysql_assoc.c index 14e9a9125..4bfe6b47b 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_assoc.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_assoc.c @@ -2449,6 +2449,47 @@ extern int as_mysql_add_assocs(mysql_conn_t *mysql_conn, uint32_t uid, if (check_connection(mysql_conn) != SLURM_SUCCESS) return ESLURM_DB_CONNECTION; + if (!is_user_min_admin_level(mysql_conn, uid, SLURMDB_ADMIN_OPERATOR)) { + ListIterator itr2 = NULL; + slurmdb_user_rec_t user; + slurmdb_coord_rec_t *coord = NULL; + slurmdb_association_rec_t *object = NULL; + + memset(&user, 0, sizeof(slurmdb_user_rec_t)); + user.uid = uid; + + if (!is_user_any_coord(mysql_conn, &user)) { + error("Only admins/operators/coordinators " + "can add associations"); + return ESLURM_ACCESS_DENIED; + } + + itr = list_iterator_create(association_list); + itr2 = list_iterator_create(user.coord_accts); + while ((object = list_next(itr))) { + char *account = "root"; + if (object->user) + account = object->acct; + else if (object->parent_acct) + account = object->parent_acct; + list_iterator_reset(itr2); + while ((coord = list_next(itr2))) { + if (!strcasecmp(coord->name, account)) + break; + } + if (!coord) + break; + } + list_iterator_destroy(itr2); + list_iterator_destroy(itr); + if (!coord) { + error("Coordinator %s(%d) tried to add associations " + "where they were not allowed", + user.name, user.uid); + return ESLURM_ACCESS_DENIED; + } + } + local_cluster_list = list_create(NULL); user_name = uid_to_string((uid_t) uid); /* these need to be in a specific order */ diff --git a/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c b/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c index 067121101..bc29e509f 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c @@ -487,7 +487,7 @@ static int _cluster_get_jobs(mysql_conn_t *mysql_conn, while ((row = mysql_fetch_row(result))) { char *id = row[JOB_REQ_ID]; bool job_ended = 0; - int submit = slurm_atoul(row[JOB_REQ_SUBMIT]); + int start = 
slurm_atoul(row[JOB_REQ_START]); curr_id = slurm_atoul(row[JOB_REQ_JOBID]); @@ -498,9 +498,11 @@ static int _cluster_get_jobs(mysql_conn_t *mysql_conn, /* check the bitmap to see if this is one of the jobs we are looking for */ + /* Use start time instead of submit time because node + * indexes are determined at start time and not submit. */ if (!good_nodes_from_inx(local_cluster_list, (void **)&curr_cluster, - row[JOB_REQ_NODE_INX], submit)) { + row[JOB_REQ_NODE_INX], start)) { last_id = curr_id; continue; } @@ -563,8 +565,8 @@ static int _cluster_get_jobs(mysql_conn_t *mysql_conn, job->blockid = xstrdup(row[JOB_REQ_BLOCKID]); job->eligible = slurm_atoul(row[JOB_REQ_ELIGIBLE]); - job->submit = submit; - job->start = slurm_atoul(row[JOB_REQ_START]); + job->submit = slurm_atoul(row[JOB_REQ_SUBMIT]); + job->start = start; job->end = slurm_atoul(row[JOB_REQ_END]); job->timelimit = slurm_atoul(row[JOB_REQ_TIMELIMIT]); @@ -758,7 +760,7 @@ static int _cluster_get_jobs(mysql_conn_t *mysql_conn, if (!good_nodes_from_inx(local_cluster_list, (void **)&curr_cluster, step_row[STEP_REQ_NODE_INX], - submit)) + start)) continue; step = slurmdb_create_step_rec(); @@ -1060,7 +1062,7 @@ no_hosts: extern int good_nodes_from_inx(List local_cluster_list, void **object, char *node_inx, - int submit) + int start) { local_cluster_t **curr_cluster = (local_cluster_t **)object; @@ -1070,15 +1072,15 @@ extern int good_nodes_from_inx(List local_cluster_list, bitstr_t *job_bitmap = NULL; if (!node_inx || !node_inx[0]) return 0; - if ((submit < (*curr_cluster)->start) - || (submit > (*curr_cluster)->end)) { + if ((start < (*curr_cluster)->start) + || (start > (*curr_cluster)->end)) { local_cluster_t *local_cluster = NULL; ListIterator itr = list_iterator_create(local_cluster_list); while ((local_cluster = list_next(itr))) { - if ((submit >= local_cluster->start) - && (submit <= local_cluster->end)) { + if ((start >= local_cluster->start) + && (start <= local_cluster->end)) { *curr_cluster = local_cluster; break; } diff --git a/src/plugins/accounting_storage/mysql/as_mysql_user.c b/src/plugins/accounting_storage/mysql/as_mysql_user.c index a87abe2e9..9bed9a4f6 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_user.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_user.c @@ -409,20 +409,16 @@ extern int as_mysql_add_users(mysql_conn_t *mysql_conn, uint32_t uid, xfree(txn_query); if (list_count(assoc_list)) { - if (as_mysql_add_assocs(mysql_conn, uid, assoc_list) - == SLURM_ERROR) { + if ((rc = as_mysql_add_assocs(mysql_conn, uid, assoc_list)) + != SLURM_SUCCESS) error("Problem adding user associations"); - rc = SLURM_ERROR; - } } list_destroy(assoc_list); - if (list_count(wckey_list)) { - if (as_mysql_add_wckeys(mysql_conn, uid, wckey_list) - == SLURM_ERROR) { + if (rc == SLURM_SUCCESS && list_count(wckey_list)) { + if ((rc = as_mysql_add_wckeys(mysql_conn, uid, wckey_list)) + != SLURM_SUCCESS) error("Problem adding user wckeys"); - rc = SLURM_ERROR; - } } list_destroy(wckey_list); return rc; @@ -449,6 +445,43 @@ extern int as_mysql_add_coord(mysql_conn_t *mysql_conn, uint32_t uid, if (check_connection(mysql_conn) != SLURM_SUCCESS) return ESLURM_DB_CONNECTION; + if (!is_user_min_admin_level(mysql_conn, uid, SLURMDB_ADMIN_OPERATOR)) { + slurmdb_user_rec_t user; + slurmdb_coord_rec_t *coord = NULL; + char *acct = NULL; + + memset(&user, 0, sizeof(slurmdb_user_rec_t)); + user.uid = uid; + + if (!is_user_any_coord(mysql_conn, &user)) { + error("Only admins/operators/coordinators " + "can add account 
coordinators"); + return ESLURM_ACCESS_DENIED; + } + + itr = list_iterator_create(acct_list); + itr2 = list_iterator_create(user.coord_accts); + while ((acct = list_next(itr))) { + while ((coord = list_next(itr2))) { + if (!strcasecmp(coord->name, acct)) + break; + } + if (!coord) + break; + list_iterator_reset(itr2); + } + list_iterator_destroy(itr2); + list_iterator_destroy(itr); + + if (!coord) { + error("Coordinator %s(%d) tried to add another " + "coordinator to an account they aren't " + "coordinator over.", + user.name, user.uid); + return ESLURM_ACCESS_DENIED; + } + } + user_name = uid_to_string((uid_t) uid); itr = list_iterator_create(user_cond->assoc_cond->user_list); itr2 = list_iterator_create(acct_list); diff --git a/src/plugins/auth/munge/auth_munge.c b/src/plugins/auth/munge/auth_munge.c index 5817a1a8d..fe3550708 100644 --- a/src/plugins/auth/munge/auth_munge.c +++ b/src/plugins/auth/munge/auth_munge.c @@ -709,7 +709,7 @@ static char *_auth_opts_to_socket(char *opts) } else if (strchr(opts, '=')) { ; /* New format, but socket not specified */ } else { - socket = xstrdup(tmp); /* Old format */ + socket = xstrdup(opts); /* Old format */ } return socket; diff --git a/src/plugins/mpi/pmi2/tree.c b/src/plugins/mpi/pmi2/tree.c index d9bccd44f..88e41476c 100644 --- a/src/plugins/mpi/pmi2/tree.c +++ b/src/plugins/mpi/pmi2/tree.c @@ -4,6 +4,10 @@ * Copyright (C) 2011-2012 National University of Defense Technology. * Written by Hongjia Cao <hjcao@nudt.edu.cn>. * All rights reserved. + * Portions copyright (C) 2014 Institute of Semiconductor Physics + * Siberian Branch of Russian Academy of Science + * Written by Artem Polyakov <artpol84@gmail.com>. + * All rights reserved. * * This file is part of SLURM, a resource management program. * For details, see <http://slurm.schedmd.com/>. 
@@ -168,7 +172,10 @@ _handle_kvs_fence_resp(int fd, Buf buf) debug3("mpi/pmi2: in _handle_kvs_fence_resp"); safe_unpack32(&seq, buf); - if (seq != kvs_seq - 1) { + if( seq == kvs_seq - 2) { + debug("mpi/pmi2: duplicate KVS_FENCE_RESP from srun ignored"); + return rc; + } else if (seq != kvs_seq - 1) { error("mpi/pmi2: invalid kvs seq from srun, expect %u" " got %u", kvs_seq - 1, seq); rc = SLURM_ERROR;; diff --git a/src/plugins/select/bluegene/ba_bgq/block_allocator.c b/src/plugins/select/bluegene/ba_bgq/block_allocator.c index 662e3380f..ee8b08648 100644 --- a/src/plugins/select/bluegene/ba_bgq/block_allocator.c +++ b/src/plugins/select/bluegene/ba_bgq/block_allocator.c @@ -2121,7 +2121,9 @@ static int _fill_in_wires(List mps, ba_mp_t *start_mp, int dim, } } else { /* we can't use this so return with a nice 0 */ - info("_fill_in_wires: we can't use this so return"); + if (ba_debug_flags & DEBUG_FLAG_BG_ALGO_DEEP) + info("_fill_in_wires: we can't use this " + "so return"); return 0; } diff --git a/src/plugins/slurmctld/nonstop/do_work.c b/src/plugins/slurmctld/nonstop/do_work.c index b3826c57d..6a59c0ed3 100644 --- a/src/plugins/slurmctld/nonstop/do_work.c +++ b/src/plugins/slurmctld/nonstop/do_work.c @@ -1285,11 +1285,13 @@ extern char *replace_node(char *cmd_ptr, uid_t cmd_uid, 1, /* allocate */ cmd_uid, /* submit UID */ &new_job_ptr, /* pointer to new job */ - NULL); /* error message */ + NULL, /* error message */ + SLURM_PROTOCOL_VERSION); if (rc != SLURM_SUCCESS) { /* Determine expected start time */ i = job_allocate(&job_alloc_req, 1, 1, &will_run, 1, - cmd_uid, &new_job_ptr, NULL); + cmd_uid, &new_job_ptr, NULL, + SLURM_PROTOCOL_VERSION); if (i == SLURM_SUCCESS) { will_run_idle = will_run->start_time; slurm_free_will_run_response_msg(will_run); @@ -1309,12 +1311,14 @@ extern char *replace_node(char *cmd_ptr, uid_t cmd_uid, (void) update_resv(&resv_desc); xfree(resv_desc.users); rc = job_allocate(&job_alloc_req, 1, 0, NULL, 1, - cmd_uid, &new_job_ptr, NULL); + cmd_uid, &new_job_ptr, NULL, + SLURM_PROTOCOL_VERSION); if (rc != SLURM_SUCCESS) { /* Determine expected start time */ i = job_allocate(&job_alloc_req, 1, 1, &will_run, 1, cmd_uid, - &new_job_ptr, NULL); + &new_job_ptr, NULL, + SLURM_PROTOCOL_VERSION); if (i == SLURM_SUCCESS) { will_run_resv = will_run->start_time; slurm_free_will_run_response_msg( @@ -1324,7 +1328,8 @@ extern char *replace_node(char *cmd_ptr, uid_t cmd_uid, /* Submit job in resv for later use */ i = job_allocate(&job_alloc_req, 0, 0, NULL, 1, cmd_uid, - &new_job_ptr, NULL); + &new_job_ptr, NULL, + SLURM_PROTOCOL_VERSION); if (i == SLURM_SUCCESS) will_run_time = will_run_resv; } @@ -1339,7 +1344,7 @@ extern char *replace_node(char *cmd_ptr, uid_t cmd_uid, if ((rc != SLURM_SUCCESS) && (will_run_time == 0) && will_run_idle) { /* Submit job for later use without using reservation */ i = job_allocate(&job_alloc_req, 0, 0, NULL, 1, cmd_uid, - &new_job_ptr, NULL); + &new_job_ptr, NULL, SLURM_PROTOCOL_VERSION); if (i == SLURM_SUCCESS) will_run_time = will_run_idle; } diff --git a/src/plugins/task/affinity/dist_tasks.c b/src/plugins/task/affinity/dist_tasks.c index 0bc4b3a7b..5fbda2b42 100644 --- a/src/plugins/task/affinity/dist_tasks.c +++ b/src/plugins/task/affinity/dist_tasks.c @@ -377,6 +377,21 @@ void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id) CPU_BIND_RANK | CPU_BIND_MAP | CPU_BIND_LDMASK | CPU_BIND_LDRANK | CPU_BIND_LDMAP; + static int only_one_thread_per_core = -1; + + if (only_one_thread_per_core == -1) { + if (conf->cpus == 
(conf->sockets * conf->cores)) + only_one_thread_per_core = 1; + else + only_one_thread_per_core = 0; + } + + /* If we are telling the system we only want to use 1 thread + * per core with the CPUs node option this is the easiest way + * to portray that to the affinity plugin. + */ + if (only_one_thread_per_core) + req->cpu_bind_type |= CPU_BIND_ONE_THREAD_PER_CORE; if (req->cpu_bind_type & bind_mode) { /* Explicit step binding specified by user */ diff --git a/src/plugins/task/cgroup/task_cgroup_cpuset.c b/src/plugins/task/cgroup/task_cgroup_cpuset.c index 8e3c59ad9..e04f83244 100644 --- a/src/plugins/task/cgroup/task_cgroup_cpuset.c +++ b/src/plugins/task/cgroup/task_cgroup_cpuset.c @@ -61,7 +61,14 @@ #ifdef HAVE_HWLOC #include <hwloc.h> + +#if !defined(__FreeBSD__) #include <hwloc/glibc-sched.h> +#else +// For cpuset +#include <pthread_np.h> +#define cpu_set_t cpuset_t +#endif # if HWLOC_API_VERSION <= 0x00010000 /* After this version the cpuset structure and all it's functions diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index 02ddd73b5..2444e746b 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -1725,7 +1725,7 @@ static void _set_pbs_options(int argc, char **argv) { int opt_char, option_index = 0; char *sep = ""; - char *pbs_opt_string = "+a:A:c:C:e:hIj::J:k:l:m:M:N:o:p:q:r:S:t:u:v:VW:z"; + char *pbs_opt_string = "+a:A:c:C:e:hIj:J:k:l:m:M:N:o:p:q:r:S:t:u:v:VW:z"; struct option pbs_long_options[] = { {"start_time", required_argument, 0, 'a'}, diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c index e6f6f7e58..4a9c50c34 100644 --- a/src/slurmctld/gang.c +++ b/src/slurmctld/gang.c @@ -844,7 +844,8 @@ static void _update_active_row(struct gs_part *p_ptr, int add_new_jobs) /* attempt to add any new jobs */ for (i = 0; i < p_ptr->num_jobs; i++) { j_ptr = p_ptr->job_list[i]; - if (j_ptr->row_state != GS_NO_ACTIVE) + if ((j_ptr->row_state != GS_NO_ACTIVE) || + (j_ptr->job_ptr->priority == 0)) continue; if (_job_fits_in_active_row(j_ptr->job_ptr, p_ptr)) { _add_job_to_active(j_ptr->job_ptr, p_ptr); @@ -918,8 +919,9 @@ static void _remove_job_from_part(uint32_t job_id, struct gs_part *p_ptr, } p_ptr->job_list[i] = NULL; - /* make sure the job is not suspended, and then delete it */ - if (!fini && (j_ptr->sig_state == GS_SUSPEND)) { + /* make sure the job is not suspended by gang, and then delete it */ + if (!fini && (j_ptr->sig_state == GS_SUSPEND) && + j_ptr->job_ptr->priority) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) { info("gang: _remove_job_from_part: resuming " "suspended job %u", j_ptr->job_id); @@ -1077,7 +1079,7 @@ static void _scan_slurm_job_list(void) /* We're not tracking this job. Resume it if it's * suspended, and then add it to the job list. */ - if (IS_JOB_SUSPENDED(job_ptr)) { + if (IS_JOB_SUSPENDED(job_ptr) && job_ptr->priority) { /* The likely scenario here is that the * failed over, and this is a job that gang * had previously suspended. 
It's not possible @@ -1349,6 +1351,7 @@ extern int gs_reconfig(void) struct gs_part *p_ptr, *newp_ptr; List old_part_list; struct job_record *job_ptr; + struct gs_job *j_ptr; if (!timeslicer_thread_id) { /* gs_init() will be called later from read_slurm_conf() @@ -1377,16 +1380,15 @@ extern int gs_reconfig(void) p_ptr->part_name); if (!newp_ptr) { /* this partition was removed, so resume - * any suspended jobs and continue */ + * any jobs suspended by gang and continue */ for (i = 0; i < p_ptr->num_jobs; i++) { - if (p_ptr->job_list[i]->sig_state == - GS_SUSPEND) { + j_ptr = p_ptr->job_list[i]; + if ((j_ptr->sig_state == GS_SUSPEND) && + (j_ptr->job_ptr->priority != 0)) { info("resuming job in missing part %s", p_ptr->part_name); - _resume_job(p_ptr->job_list[i]-> - job_id); - p_ptr->job_list[i]->sig_state = - GS_RESUME; + _resume_job(j_ptr->job_id); + j_ptr->sig_state = GS_RESUME; } } continue; @@ -1411,9 +1413,8 @@ extern int gs_reconfig(void) continue; } /* resume any job that is suspended by us */ - if (IS_JOB_SUSPENDED(job_ptr) && - (job_ptr->priority != 0)) { - if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) { + if (IS_JOB_SUSPENDED(job_ptr) && job_ptr->priority) { + if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG){ info("resuming job %u apparently " "suspended by gang", job_ptr->job_id); @@ -1453,6 +1454,7 @@ extern int gs_reconfig(void) static void _build_active_row(struct gs_part *p_ptr) { int i; + struct gs_job *j_ptr; if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) info("gang: entering _build_active_row"); @@ -1467,10 +1469,12 @@ static void _build_active_row(struct gs_part *p_ptr) /* attempt to add jobs from the job_list in the current order */ for (i = 0; i < p_ptr->num_jobs; i++) { - if (_job_fits_in_active_row(p_ptr->job_list[i]->job_ptr, - p_ptr)) { - _add_job_to_active(p_ptr->job_list[i]->job_ptr, p_ptr); - p_ptr->job_list[i]->row_state = GS_ACTIVE; + j_ptr = p_ptr->job_list[i]; + if (j_ptr->job_ptr->priority == 0) + continue; + if (_job_fits_in_active_row(j_ptr->job_ptr, p_ptr)) { + _add_job_to_active(j_ptr->job_ptr, p_ptr); + j_ptr->row_state = GS_ACTIVE; } } if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) @@ -1525,7 +1529,7 @@ static void _cycle_job_list(struct gs_part *p_ptr) for (i = 0; i < p_ptr->num_jobs; i++) { j_ptr = p_ptr->job_list[i]; if ((j_ptr->row_state == GS_NO_ACTIVE) && - (j_ptr->sig_state == GS_RESUME)) { + (j_ptr->sig_state == GS_RESUME)) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) { info("gang: _cycle_job_list: suspending job %u", j_ptr->job_id); @@ -1544,8 +1548,9 @@ static void _cycle_job_list(struct gs_part *p_ptr) /* Resume suspended jobs that are GS_ACTIVE */ for (i = 0; i < p_ptr->num_jobs; i++) { j_ptr = p_ptr->job_list[i]; - if (j_ptr->row_state == GS_ACTIVE && - j_ptr->sig_state == GS_SUSPEND) { + if ((j_ptr->row_state == GS_ACTIVE) && + (j_ptr->sig_state == GS_SUSPEND) && + (j_ptr->job_ptr->priority != 0)) { /* Redundant check */ if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) { info("gang: _cycle_job_list: resuming job %u", j_ptr->job_id); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 7f6088cfa..d6e61833a 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -178,7 +178,7 @@ static struct job_record *_job_rec_copy(struct job_record *job_ptr); static void _job_timed_out(struct job_record *job_ptr); static int _job_create(job_desc_msg_t * job_specs, int allocate, int will_run, struct job_record **job_rec_ptr, uid_t submit_uid, - char **err_msg); + char **err_msg, uint16_t 
protocol_version); static void _list_delete_job(void *job_entry); static int _list_find_job_id(void *job_entry, void *key); static int _list_find_job_old(void *job_entry, void *key); @@ -2422,8 +2422,8 @@ static void _remove_job_hash(struct job_record *job_entry) fatal("job hash error"); return; /* Fix CLANG false positive error */ } - job_entry->job_next = NULL; - + *job_pptr = job_entry->job_next; + job_entry->job_next = NULL; } /* _add_job_array_hash - add a job hash entry for given job record, @@ -2476,8 +2476,6 @@ extern bool test_job_array_complete(uint32_t array_job_id) if (job_ptr) { if (!IS_JOB_COMPLETE(job_ptr)) return false; - if (job_ptr->array_recs && job_ptr->array_recs->task_cnt) - return false; if (job_ptr->array_recs && job_ptr->array_recs->max_exit_code) return false; } @@ -2505,8 +2503,6 @@ extern bool test_job_array_completed(uint32_t array_job_id) if (job_ptr) { if (!IS_JOB_COMPLETED(job_ptr)) return false; - if (job_ptr->array_recs && job_ptr->array_recs->task_cnt) - return false; } /* Need to test individual job array records */ @@ -2568,6 +2564,12 @@ extern struct job_record *find_job_array_rec(uint32_t array_job_id, return find_job_record(array_job_id); if (array_task_id == INFINITE) { /* find by job ID */ + /* Look for job record with all of the pending tasks */ + job_ptr = find_job_record(array_job_id); + if (job_ptr && job_ptr->array_recs && + (job_ptr->array_job_id == array_job_id)) + return job_ptr; + inx = JOB_HASH_INX(array_job_id); job_ptr = job_array_hash_j[inx]; while (job_ptr) { @@ -2579,14 +2581,6 @@ extern struct job_record *find_job_array_rec(uint32_t array_job_id, } job_ptr = job_ptr->job_array_next_j; } - if (match_job_ptr) - return match_job_ptr; - - /* Look for job record with all of the pending tasks */ - job_ptr = find_job_record(array_job_id); - if (job_ptr && job_ptr->array_recs && - (job_ptr->array_job_id == array_job_id)) - return job_ptr; return match_job_ptr; } else { /* Find specific task ID */ inx = JOB_ARRAY_HASH_INX(array_job_id, array_task_id); @@ -3552,6 +3546,7 @@ struct job_record *_job_rec_copy(struct job_record *job_ptr) job_ptr_pend->select_jobinfo = select_g_select_jobinfo_copy(job_ptr->select_jobinfo); } + job_ptr_pend->sched_nodes = NULL; if (job_ptr->spank_job_env_size) { job_ptr_pend->spank_job_env = xmalloc(sizeof(char *) * @@ -3723,6 +3718,7 @@ static int _select_nodes_parts(struct job_record *job_ptr, bool test_only, * IN submit_uid -uid of user issuing the request * OUT job_pptr - set to pointer to job record * OUT err_msg - Custom error message to the user, caller to xfree results + * IN protocol_version - version of the code the caller is using * RET 0 or an error code. 
If the job would only be able to execute with * some change in partition configuration then * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned @@ -3734,7 +3730,8 @@ static int _select_nodes_parts(struct job_record *job_ptr, bool test_only, extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int will_run, will_run_response_msg_t **resp, int allocate, uid_t submit_uid, - struct job_record **job_pptr, char **err_msg) + struct job_record **job_pptr, char **err_msg, + uint16_t protocol_version) { static int defer_sched = -1; int error_code, i; @@ -3757,7 +3754,8 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, } error_code = _job_create(job_specs, allocate, will_run, - &job_ptr, submit_uid, err_msg); + &job_ptr, submit_uid, err_msg, + protocol_version); *job_pptr = job_ptr; if (error_code) { @@ -5158,7 +5156,7 @@ extern int job_limits_check(struct job_record **job_pptr, bool check_min_time) static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, struct job_record **job_pptr, uid_t submit_uid, - char **err_msg) + char **err_msg, uint16_t protocol_version) { static int launch_type_poe = -1; int error_code = SLURM_SUCCESS, i, qos_error; @@ -5462,10 +5460,11 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, &exc_bitmap))) { if (error_code == SLURM_ERROR) error_code = ESLURM_ERROR_ON_DESC_TO_RECORD_COPY; + job_ptr = *job_pptr; goto cleanup_fail; } job_ptr = *job_pptr; - job_ptr->start_protocol_ver = SLURM_PROTOCOL_VERSION; + job_ptr->start_protocol_ver = protocol_version; job_ptr->part_ptr = part_ptr; job_ptr->part_ptr_list = part_ptr_list; @@ -6362,6 +6361,7 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, if (error_code) return error_code; + *job_rec_ptr = job_ptr; job_ptr->partition = xstrdup(job_desc->partition); if (job_desc->profile != ACCT_GATHER_PROFILE_NOT_SET) job_ptr->profile = job_desc->profile; @@ -6548,7 +6548,6 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, */ detail_ptr->mc_ptr = _set_multi_core_data(job_desc); - *job_rec_ptr = job_ptr; return SLURM_SUCCESS; } @@ -6673,16 +6672,25 @@ void job_time_limit(void) while ((job_ptr =(struct job_record *) list_next(job_iterator))) { xassert (job_ptr->magic == JOB_MAGIC); +#ifndef HAVE_BG + /* If the CONFIGURING flag is removed elsewhere like + * on a Bluegene system this check is not needed and + * should be avoided. In the case of BG blocks that + * are booting aren't associated with + * power_node_bitmap so bit_overlap always returns 0 + * and erroneously removes the flag. 
+ */ if (IS_JOB_CONFIGURING(job_ptr)) { if (!IS_JOB_RUNNING(job_ptr) || (bit_overlap(job_ptr->node_bitmap, power_node_bitmap) == 0)) { - debug("%s: Configuration for job %u is complete", + debug("%s: Configuration for job %u is " + "complete", __func__, job_ptr->job_id); job_ptr->job_state &= (~JOB_CONFIGURING); } } - +#endif /* This needs to be near the top of the loop, checks every * running, suspended and pending job */ resv_status = job_resv_check(job_ptr); @@ -7270,7 +7278,6 @@ extern void pack_all_jobs(char **buffer_ptr, int *buffer_size, struct job_record *job_ptr; uint32_t jobs_packed = 0, tmp_offset; Buf buffer; - time_t min_age = 0, now = time(NULL); buffer_ptr[0] = NULL; *buffer_size = 0; @@ -7280,10 +7287,7 @@ extern void pack_all_jobs(char **buffer_ptr, int *buffer_size, /* write message body header : size and time */ /* put in a place holder job record count of 0 for now */ pack32(jobs_packed, buffer); - pack_time(now, buffer); - - if (slurmctld_conf.min_job_age > 0) - min_age = now - slurmctld_conf.min_job_age; + pack_time(time(NULL), buffer); /* write individual job records */ part_filter_set(uid); @@ -7298,10 +7302,6 @@ extern void pack_all_jobs(char **buffer_ptr, int *buffer_size, if (_hide_job(job_ptr, uid)) continue; - if ((min_age > 0) && (job_ptr->end_time < min_age) && - (! IS_JOB_COMPLETING(job_ptr)) && IS_JOB_FINISHED(job_ptr)) - continue; /* job ready for purging, don't dump */ - if ((filter_uid != NO_VAL) && (filter_uid != job_ptr->user_id)) continue; @@ -10347,6 +10347,9 @@ extern int update_job(slurm_msg_t *msg, uid_t uid) struct job_record *job_ptr; int rc; + xfree(job_specs->job_id_str); + xstrfmtcat(job_specs->job_id_str, "%u", job_specs->job_id); + job_ptr = find_job_record(job_specs->job_id); if (job_ptr == NULL) { error("update_job: job_id %u does not exist.", @@ -12087,6 +12090,10 @@ static int _job_suspend(struct job_record *job_ptr, uint16_t op, bool indf_susp) /* perform the operation */ if (op == SUSPEND_JOB) { + if (IS_JOB_SUSPENDED(job_ptr) && indf_susp) { + job_ptr->priority = 0; /* Prevent gang sched resume */ + return SLURM_SUCCESS; + } if (!IS_JOB_RUNNING(job_ptr)) return ESLURM_JOB_NOT_RUNNING; rc = _suspend_job_nodes(job_ptr, indf_susp); @@ -13558,7 +13565,7 @@ extern int job_restart(checkpoint_msg_t *ckpt_ptr, uid_t uid, slurm_fd_t conn_fd NULL, /* resp */ 0, /* allocate */ 0, /* submit_uid. 
set to 0 to set job_id */ - &job_ptr, NULL); + &job_ptr, NULL, SLURM_PROTOCOL_VERSION); /* set restart directory */ if (job_ptr) { diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 7c03a9010..9775b5c48 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -1903,12 +1903,8 @@ extern int test_job_dependency(struct job_record *job_ptr) depend_iter = list_iterator_create(job_ptr->details->depend_list); while ((dep_ptr = list_next(depend_iter))) { bool clear_dep = false; - if (dep_ptr->array_task_id == INFINITE) { - dep_ptr->job_ptr = find_job_record(dep_ptr->job_id); - } else if (dep_ptr->array_task_id != NO_VAL) { - dep_ptr->job_ptr = find_job_array_rec(dep_ptr->job_id, - dep_ptr->array_task_id); - } + dep_ptr->job_ptr = find_job_array_rec(dep_ptr->job_id, + dep_ptr->array_task_id); djob_ptr = dep_ptr->job_ptr; if ((dep_ptr->depend_type == SLURM_DEPEND_SINGLETON) && job_ptr->name) { @@ -1942,8 +1938,7 @@ extern int test_job_dependency(struct job_record *job_ptr) (djob_ptr->array_job_id != dep_ptr->job_id))) { /* job is gone, dependency lifted */ clear_dep = true; - } else if ((djob_ptr->array_task_id == INFINITE) && - (djob_ptr->array_recs != NULL)) { + } else if (dep_ptr->array_task_id == INFINITE) { bool array_complete, array_completed, array_pending; array_complete=test_job_array_complete(dep_ptr->job_id); array_completed=test_job_array_completed(dep_ptr->job_id); diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 7b3f39236..d8c264bd6 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -1910,7 +1910,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, cleanup: if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap && - !IS_JOB_RUNNING(job_ptr)) { + !IS_JOB_STARTED(job_ptr)) { job_ptr->array_task_id = NO_VAL; } if (preemptee_job_list) diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index bca551240..51a57eb99 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1016,7 +1016,8 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) error_code = job_allocate(job_desc_msg, immediate, false, NULL, true, uid, &job_ptr, - &err_msg); + &err_msg, + msg->protocol_version); /* unlock after finished using the job structure data */ END_TIMER2("_slurm_rpc_allocate_resources"); } @@ -1082,6 +1083,7 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) slurm_msg_t_init(&response_msg); response_msg.flags = msg->flags; + response_msg.protocol_version = msg->protocol_version; response_msg.msg_type = RESPONSE_RESOURCE_ALLOCATION; response_msg.data = &alloc_msg; @@ -2321,7 +2323,8 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg) error_code = job_allocate(job_desc_msg, false, true, &resp, true, uid, &job_ptr, - &err_msg); + &err_msg, + msg->protocol_version); } else { /* existing job test */ error_code = job_start_data(job_desc_msg, &resp); @@ -3188,7 +3191,8 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) /* Create new job allocation */ error_code = job_allocate(job_desc_msg, job_desc_msg->immediate, false, - NULL, 0, uid, &job_ptr, &err_msg); + NULL, 0, uid, &job_ptr, &err_msg, + msg->protocol_version); unlock_slurmctld(job_write_lock); _throttle_fini(&active_rpc_cnt); END_TIMER2("_slurm_rpc_submit_batch_job"); @@ -3261,11 +3265,11 @@ static void _slurm_rpc_update_job(slurm_msg_t * msg) /* return result */ if (error_code) { - info("_slurm_rpc_update_job JobId=%u uid=%d: %s", - 
job_desc_msg->job_id, uid, slurm_strerror(error_code)); + info("_slurm_rpc_update_job JobId=%s uid=%d: %s", + job_desc_msg->job_id_str, uid, slurm_strerror(error_code)); } else { - info("_slurm_rpc_update_job complete JobId=%u uid=%d %s", - job_desc_msg->job_id, uid, TIME_STR); + info("_slurm_rpc_update_job complete JobId=%s uid=%d %s", + job_desc_msg->job_id_str, uid, TIME_STR); /* Below functions provide their own locking */ schedule_job_save(); schedule_node_save(); @@ -3938,7 +3942,10 @@ inline static void _slurm_rpc_requeue(slurm_msg_t * msg) END_TIMER2("_slurm_rpc_requeue"); if (error_code) { - info("%s: %u: %s", __func__, req_ptr->job_id, + if (!req_ptr->job_id_str) + xstrfmtcat(req_ptr->job_id_str, "%u", req_ptr->job_id); + + info("%s: %s: %s", __func__, req_ptr->job_id_str, slurm_strerror(error_code)); } diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 888d59153..b8556737a 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -2572,6 +2572,7 @@ extern int delete_resv(reservation_name_msg_t *resv_desc_ptr) return ESLURM_RESERVATION_INVALID; } + (void) set_node_maint_mode(true); last_resv_update = time(NULL); schedule_resv_save(); return rc; diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 10fe6f12b..e5c71d015 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -525,6 +525,10 @@ typedef struct job_array_struct { uint32_t tot_comp_tasks; /* Completed task count */ } job_array_struct_t; +/* + * NOTE: When adding fields to the job_record, or any underlying structures, + * be sure to sync with _rec_job_copy. + */ struct job_record { char *account; /* account number to charge */ char *alias_list; /* node name to address aliases */ @@ -1099,6 +1103,7 @@ extern int job_alloc_info(uint32_t uid, uint32_t job_id, * IN submit_uid -uid of user issuing the request * OUT job_pptr - set to pointer to job record * OUT err_msg - Custom error message to the user, caller to xfree results + * IN protocol_version - version of the code the caller is using * RET 0 or an error code. 
If the job would only be able to execute with * some change in partition configuration then * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned @@ -1111,9 +1116,10 @@ extern int job_alloc_info(uint32_t uid, uint32_t job_id, * NOTE: lock_slurmctld on entry: Read config Write job, Write node, Read part */ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, - int will_run, will_run_response_msg_t **resp, - int allocate, uid_t submit_uid, struct job_record **job_pptr, - char **err_msg); + int will_run, will_run_response_msg_t **resp, + int allocate, uid_t submit_uid, + struct job_record **job_pptr, + char **err_msg, uint16_t protocol_version); /* If this is a job array meta-job, prepare it for being scheduled */ extern void job_array_pre_sched(struct job_record *job_ptr); diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index ceceb76f0..16d714751 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -2399,7 +2399,17 @@ step_create(job_step_create_request_msg_t *step_specs, return ESLURM_INVALID_TASK_MEMORY; return SLURM_ERROR; } - + if ((step_specs->resv_port_cnt != (uint16_t) NO_VAL) + && (step_specs->resv_port_cnt == 0)) { + /* reserved port count set to maximum task count on + * any node plus one */ + for (i = 0; i < step_ptr->step_layout->node_cnt; i++) { + step_specs->resv_port_cnt = + MAX(step_specs->resv_port_cnt, + step_ptr->step_layout->tasks[i]); + } + step_specs->resv_port_cnt++; + } if (step_specs->resv_port_cnt != (uint16_t) NO_VAL && step_specs->resv_port_cnt != 0) { step_ptr->resv_port_cnt = step_specs->resv_port_cnt; diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index ff475e386..7d0afecfb 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -2018,6 +2018,7 @@ _cancel_step_mem_limit(uint32_t job_id, uint32_t step_id) msg.data = ¬ify_req; slurm_send_only_controller_msg(&msg); + memset(&kill_req, 0, sizeof(job_step_kill_msg_t)); kill_req.job_id = job_id; kill_req.job_step_id = step_id; kill_req.signal = SIGKILL; diff --git a/src/squeue/print.c b/src/squeue/print.c index a18ded03e..b2fa51d3d 100644 --- a/src/squeue/print.c +++ b/src/squeue/print.c @@ -351,7 +351,7 @@ int print_job_from_format(squeue_job_rec_t *job_rec_ptr, List list) xfree(job_rec_ptr->job_ptr->partition); job_rec_ptr->job_ptr->partition = xstrdup(job_rec_ptr-> part_name); - + } if (job_rec_ptr->job_ptr->array_task_str && params.array_flag) { if (max_array_size == -1) @@ -846,8 +846,6 @@ int _print_job_schednodes(job_info_t * job, int width, bool right, char* suffix) int _print_job_reason_list(job_info_t * job, int width, bool right, char* suffix) { - int l; - if (job == NULL) { /* Print the Header instead */ char *title = "NODELIST(REASON)"; if (params.cluster_flags & CLUSTER_FLAG_BG) @@ -862,8 +860,7 @@ int _print_job_reason_list(job_info_t * job, int width, bool right, reason = job->state_desc; else reason = job_reason_string(job->state_reason); - l = strlen(reason) + 3; /* 3 = () + "" */ - snprintf(id, l, "(%s)", reason); + snprintf(id, FORMAT_STRING_SIZE, "(%s)", reason); _print_str(id, width, right, true); } else { char *nodes = xstrdup(job->nodes); diff --git a/src/srun/libsrun/fname.c b/src/srun/libsrun/fname.c index 419ccc095..eda688942 100644 --- a/src/srun/libsrun/fname.c +++ b/src/srun/libsrun/fname.c @@ -72,8 +72,9 @@ fname_create(srun_job_t *job, char *format) fname_t *fname = NULL; char *p, *q, *name, *tmp_env; uint32_t array_job_id = job->jobid; - uint16_t array_task_id = (uint16_t) NO_VAL; + uint32_t array_task_id 
= NO_VAL; char *esc; + char *end; fname = xmalloc(sizeof(*fname)); fname->type = IO_ALL; @@ -141,17 +142,17 @@ fname_create(srun_job_t *job, char *format) case 'a': /* '%a' => array task id */ tmp_env = getenv("SLURM_ARRAY_TASK_ID"); if (tmp_env) - array_task_id = atoi(tmp_env); + array_task_id = strtoul(tmp_env, &end, 10); xmemcat(name, q, p - 1); - xstrfmtcat(name, "%0*d", wid, array_task_id); + xstrfmtcat(name, "%0*u", wid, array_task_id); q = ++p; break; case 'A': /* '%A' => array master job id */ tmp_env = getenv("SLURM_ARRAY_JOB_ID"); if (tmp_env) - array_job_id = atoi(tmp_env); + array_job_id = strtoul(tmp_env, &end, 10); xmemcat(name, q, p - 1); - xstrfmtcat(name, "%0*d", wid, array_job_id); + xstrfmtcat(name, "%0*u", wid, array_job_id); q = ++p; break; diff --git a/src/srun/libsrun/opt.c b/src/srun/libsrun/opt.c index d9c4b7e3f..cd7ce06a2 100644 --- a/src/srun/libsrun/opt.c +++ b/src/srun/libsrun/opt.c @@ -2054,8 +2054,10 @@ static bool _opt_verify(void) opt.ntasks *= opt.cores_per_socket; opt.ntasks *= opt.threads_per_core; opt.ntasks_set = true; - } else if (opt.ntasks_per_node != NO_VAL) + } else if (opt.ntasks_per_node != NO_VAL) { opt.ntasks *= opt.ntasks_per_node; + opt.ntasks_set = true; + } /* massage the numbers */ if (opt.nodelist) { diff --git a/src/srun/libsrun/srun_job.c b/src/srun/libsrun/srun_job.c index 8b5ebc160..2238c4624 100644 --- a/src/srun/libsrun/srun_job.c +++ b/src/srun/libsrun/srun_job.c @@ -106,7 +106,6 @@ static int pty_sigarray[] = { SIGWINCH, 0 }; /* * Prototypes: */ -static int _compute_task_count(allocation_info_t *info); static void _set_ntasks(allocation_info_t *info); static srun_job_t *_job_create_structure(allocation_info_t *info); static char * _normalize_hostlist(const char *hostlist); @@ -747,33 +746,27 @@ job_force_termination(srun_job_t *job) kill_sent++; } -static int -_compute_task_count(allocation_info_t *ainfo) -{ - int i, cnt = 0; -#if defined HAVE_BGQ -//#if defined HAVE_BGQ && HAVE_BG_FILES - /* always return the ntasks here for Q */ - return opt.ntasks; -#endif - if (opt.cpus_set) { - for (i = 0; i < ainfo->num_cpu_groups; i++) - cnt += ( ainfo->cpu_count_reps[i] * - (ainfo->cpus_per_node[i]/opt.cpus_per_task)); - } else if (opt.ntasks_per_node != NO_VAL) - cnt = ainfo->nnodes * opt.ntasks_per_node; - - return (cnt < ainfo->nnodes) ? ainfo->nnodes : cnt; -} - static void _set_ntasks(allocation_info_t *ai) { - if (!opt.ntasks_set) { - opt.ntasks = _compute_task_count(ai); - if (opt.cpus_set) - opt.ntasks_set = true; /* implicit */ + int cnt = 0; + + if (opt.ntasks_set) + return; + + if (opt.ntasks_per_node != NO_VAL) { + cnt = ai->nnodes * opt.ntasks_per_node; + opt.ntasks_set = true; /* implicit */ + } else if (opt.cpus_set) { + int i; + + for (i = 0; i < ai->num_cpu_groups; i++) + cnt += (ai->cpu_count_reps[i] * + (ai->cpus_per_node[i] / opt.cpus_per_task)); + opt.ntasks_set = true; /* implicit */ } + + opt.ntasks = (cnt < ai->nnodes) ? 
ai->nnodes : cnt; } /* diff --git a/testsuite/expect/globals b/testsuite/expect/globals index 9471c8fb7..8649f3b1b 100755 --- a/testsuite/expect/globals +++ b/testsuite/expect/globals @@ -216,6 +216,7 @@ cset sleep_error_message "(invalid time interval)|(bad character in argument)" # Other common variables set alpha "\[a-zA-Z\]+" set alpha_cap "\[A-Z\]+" +set alpha_comma_slash "\[a-zA-Z/,\]+" set alpha_numeric "\[a-zA-Z0-9\]+" set alpha_numeric_comma "\[a-zA-Z0-9_,\-\]+" set alpha_numeric_under "\[a-zA-Z0-9_\-\]+" @@ -990,24 +991,28 @@ proc test_topology { } { ################################################################ # -# Proc: test_cpu_affinity +# Proc: get_task_plugins # -# Purpose: Determine if system is using the task/affinity plugin +# Purpose: get the task plugins running with task/ stripped # -# Returns 1 if enforcing, 0 if none +# Returns Returns comma separated list of task plugins running without the task/ # ################################################################ -proc test_cpu_affinity { } { - global scontrol alpha +proc get_affinity_types { } { + global scontrol alpha_comma_slash log_user 0 set affinity 0 spawn $scontrol show config expect { - -re "task/($alpha)" { - if {![string compare $expect_out(1,string) "affinity"]} { - set affinity 1 + -re "TaskPlugin *= ($alpha_comma_slash)" { + set parts [split $expect_out(1,string) ",/"] + while 1 { + set task_found [lsearch $parts "task"] + if { $task_found == -1 } break + set parts [lreplace $parts $task_found $task_found] } + set affinity [join $parts ","] exp_continue } eof { @@ -1019,6 +1024,29 @@ proc test_cpu_affinity { } { return $affinity } +################################################################ +# +# Proc: test_cpu_affinity +# +# Purpose: Determine if system is using the task/affinity plugin +# +# Returns 1 if enforcing, 0 if none +# +################################################################ +proc test_cpu_affinity { } { + log_user 0 + + set affinity 0 + set parts [split [get_affinity_types] ","] + + if { [lsearch $parts "affinity"] != -1 } { + set affinity 1 + } + + log_user 1 + return $affinity +} + ################################################################ # # Proc: test_cpu_affinity_or_cgroup @@ -1033,18 +1061,14 @@ proc test_cpu_affinity_or_cgroup { } { global scontrol alpha log_user 0 + set affinity 0 - spawn $scontrol show config - expect { - -re "task/($alpha)" { - if {![string compare $expect_out(1,string) "affinity"] || ![string compare $expect_out(1,string) "cgroup"]} { - set affinity 1 - } - exp_continue - } - eof { - wait - } + set parts [split [get_affinity_types] ","] + + if { [lsearch $parts "affinity"] != -1 } { + set affinity 1 + } elseif { [lsearch $parts "cgroup"] != -1 } { + set affinity 1 } log_user 1 @@ -1064,16 +1088,12 @@ proc test_mem_affinity { } { global scontrol alpha log_user 0 + set affinity 0 - spawn $scontrol show config - expect { - -re "task/affinity" { - set affinity 1 - exp_continue - } - eof { - wait - } + set parts [split [get_affinity_types] ","] + + if { [lsearch $parts "affinity"] != -1 } { + set affinity 1 } log_user 1 @@ -2257,7 +2277,7 @@ proc check_acct_associations { } { set num2 $expect_out(2,string) set first [info exists found($cluster,$num1)] set sec [info exists found($cluster,$num2)] - #send_user "$first=$num1 $sec=$num2\n"; + #send_user "$first=$num1 $sec=$num2\n" if { $first } { send_user "FAILURE: $cluster found lft $num1 again\n" set rc 0 diff --git a/testsuite/expect/test1.75 b/testsuite/expect/test1.75 index a127997fe..d861cac34 
100755 --- a/testsuite/expect/test1.75 +++ b/testsuite/expect/test1.75 @@ -37,11 +37,10 @@ set threads 0 set job_id 0 set exit_code 0 array set freq_lvl_1 { - low 0 high 0 highm1 0 medium 0 - + low 0 } array set freq_lvl_2 { conservative 0 @@ -195,10 +194,14 @@ cancel_job $job_id array set freq_lvl_1 [sub_job [array get freq_lvl_1]] -if {!(($freq_lvl_1(low) < $freq_lvl_1(medium)) && ($freq_lvl_1(medium) < $freq_lvl_1(high)) - && ($freq_lvl_1(high) <= $freq_lvl_1(highm1)))} { +if { (($freq_lvl_1(low) > $freq_lvl_1(medium)) || + ($freq_lvl_1(medium) > $freq_lvl_1(high)) || + ($freq_lvl_1(highm1) > $freq_lvl_1(high)))} { send_user "\nFAILURE: CPU frequencies are not correct\n" - set exit_code 1 + foreach name [array names freq_lvl_1] { + send_user "$name is $freq_lvl_1($name)\n" + } + exit 1 } array set freq_lvl_2 [sub_job [array get freq_lvl_2]] diff --git a/testsuite/expect/test17.36 b/testsuite/expect/test17.36 index c7a7f6c04..0dff4d425 100755 --- a/testsuite/expect/test17.36 +++ b/testsuite/expect/test17.36 @@ -352,7 +352,7 @@ send_user "\n\nTest partition with shared=YES:2\n" create_part $test_part_2 "YES:$shared_j_cnt" $node_name # Submit a job with shared (expect 2 jobs per core/CPU) -set new_job_limit [expr $num_jobs *2] +set new_job_limit [expr $num_jobs * 2] sub_job "0-$new_job_limit" 1 $test_part_2 check_job $new_job_limit cancel_job $job_id diff --git a/testsuite/expect/test21.26 b/testsuite/expect/test21.26 index 686c5ed9c..97c98a431 100755 --- a/testsuite/expect/test21.26 +++ b/testsuite/expect/test21.26 @@ -34,13 +34,17 @@ source ./globals_accounting set test_id "21.26" set exit_code 0 -set tc1 [get_cluster_name] +set tc1 "test$test_id\_cluster" set ta1 "test$test_id-account.1" set ta2 "test$test_id-account.2" set qos1 qqostest set tu1 "test$test_id-user.1" set access_err 0 +# Cluster +array set clus_req {} +set clus_req(qos) "''" + # Accounts array set acct_req1 {} set acct_req1(cluster) $tc1 @@ -55,19 +59,6 @@ set user_req(cluster) $tc1 set user_req(account) $ta1,$ta2 set user_req(defaultaccount) $ta1 -# Mod Account root 1 -array set mod_acct_desc1 {} -set mod_acct_desc1(cluster) $tc1 -array set mod_acct_vals1 {} -array set mod_assoc_vals1 {} - -# Mod Account root 2 -array set mod_acct_desc2 {} -set mod_assoc_desc2(cluster) $tc1 -array set mod_acct_vals2 {} -array set mod_assoc_vals2 {} -set mod_assoc_vals2(qos) "''" - # Mod Account for ta1 and ta2 array set mod_acct_desc3 {} set mod_acct_desc3(cluster) $tc1 @@ -105,15 +96,19 @@ if { [string compare [check_accounting_admin_level] "Administrator"] } { proc end_test { } { global tu1 ta1 ta2 tc1 qos1 mod_acct_desc1 mod_acct_vals1 mod_assoc_vals1 set exit_code 0 + incr exit_code [remove_cluster $tc1] incr exit_code [remove_user "" "" "$tu1"] incr exit_code [remove_acct "" "$ta1,$ta2"] incr exit_code [remove_qos "$qos1"] - if { [string length $mod_assoc_vals1(qoslevel)] } { - incr exit_code [mod_acct "root" [array get mod_acct_desc1] [array get mod_acct_vals1] [array get mod_assoc_vals1]] - } return $exit_code } +# +# Remove any residual clusters and +# Add a test cluster for testing +# +remove_cluster $tc1 +add_cluster $tc1 "" # # remove test associations to make sure we have a clean system @@ -153,12 +148,14 @@ if { !$match } { } #now set default for cluster to "" -incr exit_code [mod_acct "root" [array get mod_acct_desc2] [array get mod_acct_vals2] [array get mod_assoc_vals2]] +incr exit_code [mod_cluster $tc1 [array get clus_req]] if { $exit_code } { remove_qos "$qos1" exit $exit_code } +# send_user "moded the cluster to qos 
= ''" +# exit #add qos incr exit_code [add_qos "$qos1"] diff --git a/testsuite/expect/test28.7 b/testsuite/expect/test28.7 index 72fb5b03b..91e13c9b3 100755 --- a/testsuite/expect/test28.7 +++ b/testsuite/expect/test28.7 @@ -44,7 +44,7 @@ if {$array_size > [get_array_config]} { } exec $bin_rm -f $script -make_bash_script $script "sleep 10; sleep \$SLURM_ARRAY_TASK_ID" +make_bash_script $script "sleep \$(( ( RANDOM % 10 ) + 1 ))" # # Submit a job array for first dependency test -- GitLab