From 820cf818fbd9518c471ac729f8bba1ced451fe2a Mon Sep 17 00:00:00 2001 From: Mehdi Dogguy <mehdi@debian.org> Date: Mon, 8 Sep 2014 22:40:12 +0200 Subject: [PATCH] Imported Upstream version 14.03.5 --- META | 4 +- NEWS | 98 +- auxdir/slurm.m4 | 4 +- configure | 6 +- doc/html/accounting.shtml | 19 +- doc/html/accounting_storageplugins.shtml | 3 +- doc/html/cray_alps.shtml | 18 +- doc/html/elastic_computing.shtml | 7 - doc/html/faq.shtml | 10 +- doc/html/meetings.shtml | 6 +- doc/html/slurm_ug_agenda.shtml | 1256 ++++++++++------- doc/man/man1/salloc.1 | 5 +- doc/man/man1/sbatch.1 | 10 +- doc/man/man1/scontrol.1 | 5 +- doc/man/man1/sinfo.1 | 2 +- doc/man/man1/srun.1 | 5 +- doc/man/man5/nonstop.conf.5 | 2 +- doc/man/man5/slurm.conf.5 | 13 +- src/api/job_info.c | 7 +- src/common/assoc_mgr.c | 4 +- src/common/env.h | 4 +- src/common/gres.c | 59 +- src/common/gres.h | 3 + src/common/slurm_protocol_defs.c | 4 + src/common/slurm_protocol_defs.h | 2 + src/common/slurm_step_layout.c | 14 +- src/common/xcpuinfo.c | 20 +- .../mysql/accounting_storage_mysql.c | 11 +- .../accounting_storage/mysql/as_mysql_job.c | 36 +- .../accounting_storage/mysql/as_mysql_qos.c | 5 +- .../accounting_storage/mysql/as_mysql_wckey.c | 6 +- .../slurmdbd/accounting_storage_slurmdbd.c | 5 +- .../rapl/acct_gather_energy_rapl.c | 2 +- .../acct_gather_profile/hdf5/hdf5_api.c | 2 + .../hdf5/sh5util/sh5util.c | 4 +- .../proctrack/cgroup/proctrack_cgroup.c | 8 +- .../proctrack/linuxproc/proctrack_linuxproc.c | 15 +- src/plugins/proctrack/pgid/proctrack_pgid.c | 5 +- src/plugins/sched/backfill/backfill.c | 109 +- src/plugins/select/cons_res/job_test.c | 11 +- src/plugins/select/cons_res/select_cons_res.c | 27 +- src/plugins/task/cgroup/task_cgroup_cpuset.c | 86 +- src/sacct/print.c | 6 +- src/sacctmgr/cluster_functions.c | 4 +- src/sacctmgr/file_functions.c | 32 +- src/sbatch/sbatch.c | 3 +- src/scontrol/update_job.c | 11 +- src/sinfo/opts.c | 2 +- src/sinfo/sinfo.c | 21 +- src/slurmctld/acct_policy.c | 65 +- src/slurmctld/controller.c | 24 +- src/slurmctld/job_mgr.c | 100 +- src/slurmctld/job_scheduler.c | 29 +- src/slurmctld/node_mgr.c | 4 +- src/slurmctld/node_scheduler.c | 36 +- src/slurmctld/proc_req.c | 17 +- src/slurmctld/reservation.c | 28 +- src/slurmctld/slurmctld.h | 2 +- src/slurmctld/statistics.c | 2 + src/slurmctld/step_mgr.c | 10 +- src/slurmd/slurmd/req.c | 2 + src/slurmd/slurmstepd/io.c | 4 +- src/slurmd/slurmstepd/mgr.c | 5 + src/slurmd/slurmstepd/req.c | 32 +- src/slurmd/slurmstepd/slurmstepd_job.c | 2 +- src/srun/libsrun/launch.c | 4 + src/srun/libsrun/srun_job.c | 22 +- src/srun/libsrun/srun_job.h | 4 +- src/srun/srun_pty.c | 4 +- testsuite/expect/globals | 34 + testsuite/expect/test17.34 | 11 +- testsuite/expect/test2.18 | 127 +- testsuite/expect/test2.19 | 112 +- testsuite/expect/test2.21 | 6 + testsuite/expect/test2.22 | 16 +- testsuite/expect/test2.23 | 6 + testsuite/expect/test21.30 | 11 +- testsuite/expect/test3.11 | 19 +- testsuite/expect/test4.5 | 10 +- testsuite/slurm_unit/common/bitstring-test.c | 6 + testsuite/slurm_unit/common/pack-test.c | 2 +- 81 files changed, 1694 insertions(+), 1063 deletions(-) diff --git a/META b/META index 32989c2ff..ec84a2650 100644 --- a/META +++ b/META @@ -9,8 +9,8 @@ Name: slurm Major: 14 Minor: 03 - Micro: 4 - Version: 14.03.4 + Micro: 5 + Version: 14.03.5 Release: 1 ## diff --git a/NEWS b/NEWS index 56540290d..be4c259fd 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,89 @@ This file describes changes in recent versions of Slurm. 
It primarily documents those changes that are of interest to users and admins.
+* Changes in Slurm 14.03.5
+==========================
+ -- If srun runs in an exclusive allocation, doesn't use the entire
+    allocation, and CR_PACK_NODES is set, lay out tasks appropriately.
+ -- Correct Shared field in job state information seen by scontrol, sview, etc.
+ -- Print Slurm error string in scontrol update job and reset the Slurm errno
+    before each call to the API.
+ -- Fix task/cgroup to handle -mblock:fcyclic correctly.
+ -- Fix for core-based advanced reservations where the distribution of cores
+    across nodes is not even.
+ -- Fix issue where association maxnodes wouldn't be evaluated correctly if a
+    QOS had a GrpNodes set.
+ -- GRES fix with multiple files defined per line in gres.conf.
+ -- When a job is requeued, make sure accounting marks it as such.
+ -- Print the state of a requeued job as REQUEUED.
+ -- If a job's partition was taken away from it, don't allow a requeue.
+ -- Make sure we lock on the conf when sending slurmd's conf to the slurmstepd.
+ -- Fix issue with sacctmgr 'load' not being able to gracefully handle a badly
+    formatted file.
+ -- sched/backfill: Correct job start time estimate with advanced reservations.
+ -- Add an error message when proctrack/cgroup cannot destroy the step freezer
+    path, to aid debugging.
+ -- Added extra indexes into the database for better performance when
+    deleting users.
+ -- Fix issue where, when tracking wckeys but not enforcing them, you could
+    get multiple '*' wckeys.
+ -- Fix bug which could report to squeue the wrong partition for a running job
+    that is submitted to multiple partitions.
+ -- Report correct CPU count allocated to a job when allocated a whole node,
+    even if not using all CPUs.
+ -- If a job's constraints cannot be satisfied, put it in pending state with
+    reason BadConstraints and don't remove it.
+ -- sched/backfill - If a job started with an infinite time limit, set its
+    end_time one year in the future.
+ -- Clear record of a job's gres when requeued.
+ -- Clear QOS GrpUsedCPUs when resetting raw usage if QOS is not using any CPUs.
+ -- Remove log message left over from debugging.
+ -- When using CR_PACK_NODES, make --ntasks-per-node work correctly.
+ -- Report correct partition associated with a step if the job is submitted to
+    multiple partitions.
+ -- Fix to allow removal of preemption from a QOS.
+ -- If the proctrack plugin fails to destroy the job container, print an error
+    message and avoid looping forever; give up after 120 seconds.
+ -- Make srun obey the POSIX convention and increase the exit code by 128 when
+    the process is terminated by a signal.
+ -- Sanity check for acct_gather_energy/rapl.
+ -- If the sbatch command specifies the option --signal=B:signum, send the
+    signal to the batch script only.
+ -- If we cancel a task and have no other exit code, send the signal and
+    exit code.
+ -- Added note about the InnoDB storage engine being used with MySQL.
+ -- Set the job exit code when the job is signaled and set the log level to
+    debug2() when processing an already completed job.
+ -- Reset diagnostics time stamp when "sdiag --reset" is called.
+ -- squeue and scontrol now report a job's "shared" value based upon partition
+    options rather than reporting "unknown" if the job submission does not use
+    the --exclusive or --shared option.
+ -- task/cgroup - Fix cpuset binding for batch script.
+ -- sched/backfill - Fix anomaly that could result in jobs being scheduled out
+    of order.
+ -- Expand pseudo-terminal size data structure field sizes from 8 to 16 bits.
+ -- Distinguish between two identical error messages.
+ -- If using accounting_storage/mysql directly without a DBD, fix issue with
+    start of requeued jobs.
+ -- If a job fails because of batch node failure and the job is requeued, and
+    an epilog complete message comes from that node, do not process the batch
+    step information, since the job has already been requeued (the epilog
+    script is not guaranteed to run in this situation).
+ -- Change message to note that a NO_VAL return code could have come from node
+    failure as well as from an interactive user.
+ -- Modify test4.5 to only look at one partition instead of all of them.
+ -- Fix sh5util -u to accept a username different from the user that runs the
+    command.
+ -- Corrections to man pages: salloc.1, sbatch.1, srun.1, nonstop.conf.5 and
+    slurm.conf.5.
+ -- Restore srun --pty resize ability.
+ -- Have sacctmgr dump cluster handle situations where users or such have
+    special characters in their names, like ':'.
+
 * Changes in Slurm 14.03.4
 ==========================
  -- Fix issue where not enforcing QOS but a partition either allows or denies
@@ -39,7 +122,7 @@ documents those changes that are of interest to users and admins.
  -- Keep supporting 'srun -N x --pty bash' for historical reasons.
  -- If EnforcePartLimits=Yes and the QOS the job is using can override limits,
    allow it.
- -- Fix issues if partition allows or denys account's or QOS' and either are
+ -- Fix issues if partition allows or denies account's or QOS' and either are
    not set.
 -- If a job requests a partition and it doesn't allow a QOS or account the
    job is requesting, pend unless EnforcePartLimits=Yes. Before it would
@@ -89,8 +172,8 @@ documents those changes that are of interest to users and admins.
    is already running.
 -- Email messages for job array events print now use the job ID using the
    format "#_# (#)" rather than just the internal job ID.
- -- Set the number of free licenses to be 0 if the global license count decreases
-    and total is less than in use.
+ -- Set the number of free licenses to be 0 if the global license count
+    decreases and total is less than in use.
 -- Add DebugFlag of BackfillMap. Previously a DebugFlag value of Backfill
    logged information about what it was doing plus a map of expected resource
    use in the future. Now that very verbose resource use map is only logged
@@ -104,6 +187,13 @@ documents those changes that are of interest to users and admins.
    jobs.
 -- For "scontrol --details show job" report the correct CPU_IDs when there are
    multiple threads per core (we are translating a core bitmap to CPU IDs).
+ -- If DebugFlags=Protocol is configured in slurm.conf, print details of the
+    connection, IP address and port accepted by the controller.
+ -- Fix minor memory leak when reading in an incomplete node data checkpoint
+    file.
+ -- Enlarge the width specifier when printing partition SHARE to display larger
+    sharing values.
+ -- sinfo locks added to prevent possible duplicate record printing for
+    resources in multiple partitions.

 * Changes in Slurm 14.03.3-2
 ============================
@@ -554,6 +644,8 @@ documents those changes that are of interest to users and admins.
-- Properly enforce job --requeue and --norequeue options. -- If a job --mem-per-cpu limit exceeds the partition or system limit, then scale the job's memory limit and CPUs per task to satisfy the limit. + -- Correct logic to support Power7 processor with 1 or 2 threads per core + (CPU IDs are not consecutive). * Changes in Slurm 2.6.9 ======================== diff --git a/auxdir/slurm.m4 b/auxdir/slurm.m4 index 0f9d71f44..62fbce28d 100644 --- a/auxdir/slurm.m4 +++ b/auxdir/slurm.m4 @@ -66,9 +66,9 @@ AC_DEFUN([X_AC_SLURM_PORTS], [Define the default port count for slurmctld]) AC_SUBST(SLURMCTLD_PORT_COUNT) - AC_MSG_CHECKING([for dynamic allocation port to be enabled]) + AC_MSG_CHECKING([for dynamic allocation port to be enabled for Hadoop]) AC_ARG_ENABLE([dynamic-allocation], - AS_HELP_STRING([--enable-dynamic-allocation, enable dynamic allocation requests from user programs ([disabled])])) + AS_HELP_STRING([--enable-dynamic-allocation, enable dynamic allocation requests from user programs for Hadoop ([disabled])])) if test "$enable_dynamic_allocation" = "yes"; then AC_MSG_RESULT([yes]) slurm_enable_dynamic_allocation="yes" diff --git a/configure b/configure index 5aa00cff7..7ecb6a4b2 100755 --- a/configure +++ b/configure @@ -1700,7 +1700,7 @@ Optional Features: --disable-salloc-background disable salloc execution in the background --enable-simulator enable slurm simulator - --enable-dynamic-allocation, enable dynamic allocation requests from user programs (disabled) + --enable-dynamic-allocation, enable dynamic allocation requests from user programs for Hadoop (disabled) --enable-multiple-slurmd enable multiple-slurmd support @@ -22727,8 +22727,8 @@ _ACEOF - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dynamic allocation port to be enabled" >&5 -$as_echo_n "checking for dynamic allocation port to be enabled... " >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dynamic allocation port to be enabled for Hadoop" >&5 +$as_echo_n "checking for dynamic allocation port to be enabled for Hadoop... " >&6; } # Check whether --enable-dynamic-allocation was given. if test "${enable_dynamic_allocation+set}" = set; then : enableval=$enable_dynamic_allocation; diff --git a/doc/html/accounting.shtml b/doc/html/accounting.shtml index 25c6b7d8a..f5389a960 100644 --- a/doc/html/accounting.shtml +++ b/doc/html/accounting.shtml @@ -22,7 +22,7 @@ these plugins include:</p> <li><b>AccountingStorageType</b> controls how detailed job and job step information is recorded. You can store this information in a text file, <a href="http://www.mysql.com/">MySQL</a> -or MariaDB database, optionally using SlurmDBD for added security.</li> +or MariaDB database (using the InnoDB storage engine), optionally using SlurmDBD for added security.</li> <li><b>JobAcctGatherType</b> is operating system dependent and controls what mechanism is used to collect accounting information. Supported values are <i>jobacct_gather/aix</i>, <i>jobacct_gather/linux</i> @@ -188,8 +188,11 @@ a configuration. <p><b>MySQL or MariaDB is the preferred database.</b> To enable this database support one only needs to have the development package for the database they -wish to use on the system. The slurm configure script uses -mysql_config and pg-config to find out the information it needs +wish to use on the system. <b>Slurm uses the InnoDB storage +engine in MySQL to make rollback possible. 
This must be available on your +MySQL installation or rollback will not work.</b> +</p><p>The slurm configure +script uses mysql_config to find out the information it needs about installed libraries and headers. You can specify where your mysql_config script is with the </i>--with-mysql_conf=/path/to/mysql_config</i> option when configuring your @@ -442,6 +445,16 @@ mysql> grant all on slurm_acct_db.* TO 'slurm'@'system0' where 'system0' is the localhost or database storage host. </pre> +<p>Verify you have InnoDB support</p> +<pre> +mysql> SHOW VARIABLES LIKE 'have_innodb'; ++---------------+-------+ +| Variable_name | Value | ++---------------+-------+ +| have_innodb | YES | ++---------------+-------+ +</pre> + <p>Then create the database:</p> <pre> mysql> create database slurm_acct_db; diff --git a/doc/html/accounting_storageplugins.shtml b/doc/html/accounting_storageplugins.shtml index a6180363b..0452e3075 100644 --- a/doc/html/accounting_storageplugins.shtml +++ b/doc/html/accounting_storageplugins.shtml @@ -23,7 +23,8 @@ The minor type can be any suitable name for the type of accounting package. We currently use <ul> <li><b>filetxt</b>—Information written to a text file. -<li><b>mysql</b>— Store information in a mysql database. +<li><b>mysql</b>— Store information in a mysql database (using + the InnoDB storage engine). <li><b>slurmdbd</b>— Send information to the Slurm Database Daemon (SlurmDBD). Extra configuration is needed and described <a href="accounting.html">here</a>. <li><b>none</b>— Information is not stored anywhere. diff --git a/doc/html/cray_alps.shtml b/doc/html/cray_alps.shtml index c071b6fbd..10ac91ccf 100644 --- a/doc/html/cray_alps.shtml +++ b/doc/html/cray_alps.shtml @@ -238,25 +238,9 @@ default: # rpm -qa <ul> <li>expat-2.0.xxx</li> <li>libexpat-devel-2.0.xxx</li> -<li>cray-MySQL-devel-enterprise-5.0.64 (this should be on the Cray ISO)</li> +<li>mysql-devel (this should be on the Cray ISO)</li> </ul> -<p>For example, loading MySQL can be done like this:</p> -<pre> -smw: # mkdir mnt -smw: # mount -o loop, ro xe-sles11sp1-trunk.201107070231a03.iso mnt -smw: # find mnt -name cray-MySQL-devel-enterprise\* -mnt/craydist/xt-packages/cray-MySQL-devel-enterprise-5.0.64.1.0000.2899.19.2.x86_64.rpm -smw: # scp mnt/craydist/xt-packages/cray-MySQL-devel-enterprise-5.0.64.1.0000.2899.19.2.x86_64 -</pre> - -<p>Then switch to boot node and run:</p> -<pre> -boot: # xtopview -default: # rpm -ivh /software/cray-MySQL-devel-enterprise-5.0.64.1.0000.2899.19.2.x86_64.rpm -default: # exit -</pre> - <p>All Cray-specific PrgEnv and compiler modules should be removed and root privileges will be required to install these files.</p> diff --git a/doc/html/elastic_computing.shtml b/doc/html/elastic_computing.shtml index d5a08f727..6291c6a5b 100644 --- a/doc/html/elastic_computing.shtml +++ b/doc/html/elastic_computing.shtml @@ -14,13 +14,6 @@ cluster. Good responsiveness and throughput can be achieved while you only pay for the resources needed.</p> -<p>The -<a href="http://web.mit.edu/star/cluster/docs/latest/index.html">StarCluster</a> -cloud computing toolkit has a -<a href="https://github.com/jlafon/StarCluster">SLURM port available</a>. 
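<p>As a rough sketch of the configuration this elastic mode relies on (the
program paths, node names and timing values below are illustrative
assumptions only, not taken from this release), a cloud partition is
typically wired up through the power-save hooks in slurm.conf:</p>
<pre>
# Illustrative slurm.conf fragment; paths, node names and times are examples
ResumeProgram=/usr/local/sbin/start_cloud_node.sh
SuspendProgram=/usr/local/sbin/stop_cloud_node.sh
# Relinquish a node after 10 idle minutes; allow 5 minutes for it to boot
SuspendTime=600
ResumeTimeout=300
NodeName=cloud[0-9] State=CLOUD
PartitionName=cloud Nodes=cloud[0-9] State=UP
</pre>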
-<a href="https://github.com/jlafon/StarCluster/wiki/Getting-started-with-SLURM-on-Amazon's-EC2"> -Instructions</a> for the SLURM port of StartCLuster are available online.</p> - <p>The rest of this document describes details about SLURM's infrastructure that can be used to support Elastic Computing.</p> diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index 6bd15524a..6986f6af6 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -842,11 +842,11 @@ salloc: Relinquishing job allocation 65542 SLURM? Why does the DAKOTA program not run with SLURM?</b></a><br> The SLURM library used to support MPIHCH2 or MVAPICH2 references a variety of symbols. If those symbols resolve to functions or variables in your program -rather than the appropriate library, the application will fail. In the case of -<a href="http://dakota.sandia.gov">DAKOTA</a>, it contains a function named -<b>regcomp</b>, which will get used rather than the POSIX regex functions. -Rename DAKOTA's function and references from regcomp to something else to make -it work properly.</p> +rather than the appropriate library, the application will fail. For example +<a href="http://dakota.sandia.gov">DAKOTA</a>, versions 5.1 and +older, contains a function named regcomp, which will get used rather +than the POSIX regex functions. Rename DAKOTA's function and +references from regcomp to something else to make it work properly.</p> <p><a name="estimated_start_time"><b>26. Why does squeue (and "scontrol show jobid") sometimes not display a job's estimated start time?</b></a><br> diff --git a/doc/html/meetings.shtml b/doc/html/meetings.shtml index dc690923c..9d0900ec1 100644 --- a/doc/html/meetings.shtml +++ b/doc/html/meetings.shtml @@ -6,8 +6,8 @@ 23-24 September 2014<br> Lugano, Switzerland<br> Host: <a href="http://cscs.ch/">Swiss National Supercomputing Centre</a></p> -<a href="slurm_ug_cfp.html">Call for Abstracts: Due 6 June 2014</a><br> -<!--<a href="slurm_ug_agenda.html">Meeting agenda</a><br>--> +<!--<a href="slurm_ug_cfp.html">Call for Abstracts: Due 6 June 2014</a><br>--> +<a href="slurm_ug_agenda.html">Meeting agenda</a><br> <!--<a href="slurm_ug_registration.html">Registration information</a>--></p> <br> @@ -33,6 +33,6 @@ Host: Bull</p> Paris, France<br> Host: CEA</p> -<p style="text-align:center;">Last modified 31 March 2014</p> +<p style="text-align:center;">Last modified 3 July 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/slurm_ug_agenda.shtml b/doc/html/slurm_ug_agenda.shtml index 2bafc1d7c..1097968b1 100644 --- a/doc/html/slurm_ug_agenda.shtml +++ b/doc/html/slurm_ug_agenda.shtml @@ -1,57 +1,34 @@ <!--#include virtual="header.txt"--> -<h1>Slurm User Group Meeting 2013</h1> +<h1>Slurm User Group Meeting 2014</h1> -<p>Hosted by <a href="http:///www.schedmd.com">SchedMD</a> +<p>Hosted by the <a href="http:///www.cscs.ch">Swiss National Supercomputing Centre</a> <h1>Agenda</h1> -<p>The 2013 SLURM User Group Meeting will be held on September 18 and 19 -in Oakland, California, USA. +<p>The 2014 SLURM User Group Meeting will be held on September 23 and 24 +in Lugano, Switzerland. The meeting will include an assortment of tutorials, technical presentations, and site reports. 
The <a href="#schedule">Schedule</a> amd <a href="#abstracts">Abstracts</a> are shown below.</p> <h2>Meeting Information</h2> -<p>The meeting will be held at -<a href="http://www.ce.csueastbay.edu/businessservices/conference_facilities/index.shtml"> -California State University's Conference Center</a>, -1000 Broadway Avenue, Suite 109, Oakland, California -(Phone 510-208-7001, access from 11th Street). -This state of the art facility is located adjacent to the 12th Street -<a href="http://www.bart.gov">BART</a> (Metro) station, with easy access to -the entire San Francisco area. -There is also frequent and free bus service to -<a href="http://www.jacklondonsquare.com">Jack London Square</a> using the -<a href="http://Bshuttle.com">Broadway Shuttle</a>. +<p>The meeting will be held at the +<a href="http://www.lugano-tourism.ch/en/129/default.aspx"> +Lugano Convention Centre</a>, Lugano, Switzerland. +More information will be made available later.</p> <h2>Hotel Information</h2> -<p>Many hotel options are available in Oakland, San Fransisco, and elsewhere in -the area. Just be sure that your hotel has easy access to BART. -Consider the hotels listed below as suggestions:</p> - -<p><a href="http://www.waterfronthoteloakland.com"><b>Waterfront Hotel</b></a><br> -Like it says in the name, on the waterfront, with several nice restaurants nearby. -About 1 mile (2 km) from the conference center via the -<a href="http://Bshuttle.com">Broadway Shuttle</a>. -Ferry service to San Fransisco adjacent to the hotel.</p> - -<p><a href="http://www.marriott.com/hotels/travel/oakdt-oakland-marriott-city-center/"> -<b>Oakland Marriott City Center</b></a><br> -Across the street from the conference center. -Discounted rooms are available to government employees.</p> +<p>Hotels may be bookded through the Lugano Convention Centre (Palazzo dei Congressi).<br> +<a href="https://www.aec-internet.it/booking_engine/prenota_congresso.htm?graph_be=4&n_tappe=1&headvar=ok&lingua_int=eng&id_stile=7434&id_congresso=54&id_canale=704">Hotel booking</a>. <h2>Registration</h2> -<p>The conference cost is $250 per person for registrations by 29 August and -$300 per person for late registration. 
-This includes presentations, tutorials, lunch and snacks on both days, -plus dinner on Wednesday evening.<br><br> -<a href="http://sug2013.eventbrite.com">Register here.</a></p> +<p>Information will be made available later.</p> <a name="schedule"><h1>Schedule</h1></a> -<h2>September 18, 2013</h2> +<h2>23 September 2014</h2> <table width="100%" border=1 cellspacing=0 cellpadding=0> @@ -63,541 +40,816 @@ plus dinner on Wednesday evening.<br><br> </tr> <tr> - <td width="15%" bgcolor="#F0F1C9">08:00 - 09:00</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Registration / Breakfast</td> + <td width="15%" bgcolor="#F0F1C9">08:00 - 08:30</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Registration </td> </tr> <tr> - <td width="15%">09:00 - 09:15</td> + <td width="15%">08:30 - 08:45</td> <td width="15%"> Welcome</td> - <td width="25%"> Morris Jette (SchedMD)</td> + <td width="25%"> TBD (CSCS)</td> <td width="45%"> Welcome to Slurm User Group Meeting</td> </tr> <tr> - <td width="15%">09:15 - 10:00</td> + <td width="15%">08:45 - 09:30</td> <td width="15%"> Keynote</td> - <td width="25%"> Dona Crawford (LLNL)</td> - <td width="45%"> Future Outlook for Advanced Computing</td> + <td width="25%"> TBD</td> + <td width="45%"> TBD</td> </tr> <tr> - <td width="15%" bgcolor="#F0F1C9">10:00 - 10:30</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Coffee break</td> + <td width="15%" bgcolor="#F0F1C9">09:30 - 09:45</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Break</td> </tr> <tr> - <td width="15%">10:30 - 11:00</td> + <td width="15%">09:45 - 10:15</td> <td width="15%"> Technical</td> - <td width="25%"> Morris Jette, Danny Auble (SchedMD), Yiannis Georgiou (Bull)</td> - <td width="45%"> Overview of Slurm version 2.6</td> + <td width="25%"> Morris Jette (SchedMD), Yiannis Georgiou (Bull)</td> + <td width="45%"> Overview of Slurm Versions 14.03 and 14.11</td> </tr> <tr> - <td width="15%">11:00 - 12:00</td> + <td width="15%">10:15 - 10:45</td> <td width="15%"> Tutorial</td> - <td width="25%"> Yiannis Georgiou, Martin Perry, Thomas Cadeau (Bull), Danny Auble (SchedMD)</td> - <td width="45%"> Energy Accounting and External Sensor Plugins</td> + <td width="25%"> Michael Jennings, Jacqueline Scoggins (LBL)</td> + <td width="45%"> Warewulf Node Health Check</td> </tr> - -<tr> - <td width="15%" bgcolor="#F0F1C9">12:00 - 13:00</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Lunch at conference center</td> -</tr> - - -<tr> - <td width="15%">13:00 - 13:30</td> + <td width="15%">10:45 - 11:15</td> <td width="15%"> Technical</td> - <td width="25%"> Yiannis Georgiou , Thomas Cadeau (Bull), Danny Auble, Moe Jette (SchedMD) Matthieu Hautreux (CEA)</td> - <td width="45%"> Evaluation of Monitoring and Control Features for Power Management</td> + <td width="25%"> Yiannis Georgiou (BULL), David Glesser (BULL), + Matthieu Hautreux (CEA), Denis Trystram (Univ. 
Grenoble-Alpes)</td> + <td width="45%"> SLURM processes isolation</td> </tr> <tr> - <td width="15%">13:30 - 14:00</td> + <td width="15%">11:15 - 11:45</td> <td width="15%"> Technical</td> - <td width="25%"> Matthieu Hautreux (CEA)</td> - <td width="45%"> Debugging Large Machines</td> -<tr> - <td width="15%">14:00 - 14:30</td> - <td width="15%"> Technical</td> - <td width="25%"> Alberto Falzone, Paolo Maggi (Nice)</td> - <td width="45%"> Creating easy to use HPC portals with NICE EnginFrame and Slurm</td> -</tr> - -<tr> - <td width="15%" bgcolor="#F0F1C9">14:30 - 15:00</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Coffee break</td> + <td width="25%"> Rod Schultz (BULL), Martin Perry (BULL), + Yiannis Georgiou (BULL), Danny Auble (SchedMD), Morris Jette (SchedMD), + Matthieu Hautreux (CEA)</td> + <td width="45%"> Improving forwarding logic in SLURM</td> </tr> - -<tr> - <td width="15%">15:00 - 15:30</td> - <td width="15%"> Technical</td> - <td width="25%"> David Glesser, Yiannis Georgiou, Joseph Emeras, Olivier Richard (Bull)</td> - <td width="45%"> Slurm evaluation using emulation and replay of real workload traces</td> + <td width="15%" bgcolor="#F0F1C9">11:45 - 12:45</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Lunch </tr> <tr> - <td width="15%">15:30 - 16:30</td> + <td width="15%">12:45 - 13:45</td> <td width="15%"> Tutorial</td> - <td width="25%"> Rod Schultz, Yiannis Georgiou (Bull) Danny Auble (SchedMD)</td> - <td width="45%"> Usage of new profiling functionalities</td> -</tr> - -<tr> - <td width="15%" bgcolor="#F0F1C9">18:00 - </td> - <td width="15%" bgcolor="#F0F1C9"> Dinner</td> - <td width="70%" colspan="2" bgcolor="#F0F1C9"> Lungomare, 1 Broadway Ave.</td> -</tr> -</table> - -<h2>September 19, 2013</h2> - -<table width="100%" border=1 cellspacing=0 cellpadding=0> - -<tr> - <th width="15%">Time</th> - <th width="15%">Theme</th> - <th width="25%">Speaker</th> - <th width="45%">Title</th> + <td width="25%"> Morris Jette (SchedMD)</td> + <td width="45%"> Tuning Slurm Scheduling for Optimal + Responsiveness and Utilization</td> </tr> - <tr> - <td width="15%" bgcolor="#F0F1C9">08:00 - 08:30</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Registration / Breakfast</td> - </tr> - -<tr> - <td width="15%">08:30 - 09:00</td> + <td width="15%">13:45 - 14:15</td> <td width="15%"> Technical</td> - <td width="25%"> Morris Jette, David Bigagli, Danny Auble (SchedMD)</td> - <td width="45%"> Fault Tolerant Workload Management</td> -</tr> + <td width="25%"> Carles Fenoy (BSC)</td> + <td width="45%"> Improving HPC applications scheduling with + predictions based on automatically-collected historical data</td> <tr> - <td width="15%">09:00 - 09:30</td> + <td width="15%">14:15 - 14:45</td> <td width="15%"> Technical</td> - <td width="25%"> Yiannis Georgiou (Bull) Matthieu Hautreux (CEA)</td> - <td width="45%"> Slurm Layouts Framework</td> + <td width="25%"> Filip Skalski, Krzysztof Rzadca (University of + Warsaw)</td> + <td width="45%"> Fair Scheduler for Burst Submissions of + Parallel Job</td> </tr> <tr> - <td width="15%">09:30 - 10:00</td> - <td width="15%"> Technical</td> - <td width="25%"> Bill Brophy (Bull)</td> - <td width="45%"> License Management</td> + <td width="15%" bgcolor="#F0F1C9">14:45 - 15:00</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Break</td> </tr> - <tr> - <td width="15%" bgcolor="#F0F1C9">10:00 - 10:30</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Coffee break</td> + <td width="15%">15:00 - 15:30</td> + <td width="15%"> Technical</td> + <td 
width="25%"> Yiannis Georgiou (BULL), David Glesser (BULL), + Matthieu Hautreux (CEA), Denis Trystram (Univ. Grenoble-Alpes)</td> + <td width="45%"> Introducing Power-capping in SLURM scheduling</td> </tr> <tr> - <td width="15%">10:30 - 11:00</td> + <td width="15%">15:30 - 16:00</td> <td width="15%"> Technical</td> - <td width="25%"> Juan Pancorbo Armada (IRZ)</td> - <td width="45%"> Multi-Cluster Management</td> + <td width="25%"> David Glesser (BULL), Yiannis Georgiou (BULL), + Denis Trystram (Univ. Grenoble-Alpes)</td> + <td width="45%"> Introducing Energy based fair-share scheduling</td> </tr> - <tr> - <td width="15%">11:00 - 11:30</td> + <td width="15%">16:00 - 16:30</td> <td width="15%"> Technical</td> - <td width="25%"> Francois Daikhate, Matthieu Hautreux (CEA)</td> - <td width="45%"> Depth Oblivious Hierarchical Fairshare Priority Factor</td> + <td width="25%"> Aamir Rashid (Terascala)</td> + <td width="45%"> Data movement between Lustre and Enterprise + storage systems</td> </tr> - <tr> - <td width="15%">11:30 - 12:00</td> + <td width="15%">16:30 - 17:00</td> <td width="15%"> Technical</td> - <td width="25%"> Dave Wallace (Cray)</td> - <td width="45%"> Refactoring ALPS</td> + <td width="25%"> Sergio Iserte, Adrian Castello, Rafael Mayo, + Enrique S. Quintana-Ort (Universitat Jaume I de Castello), + Federico Silla, Jose Duato (Universitat Politecnica de Valencia)</td> + <td width="45%"> Extending SLURM with Support for Remote GPU + Virtualization</td> </tr> <tr> - <td width="15%" bgcolor="#F0F1C9">12:00 - 13:00</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Lunch at conference center</td> + <td width="15%" bgcolor="#F0F1C9">17:00 - </td> + <td width="15%" bgcolor="#F0F1C9"> Dinner</td> + <td width="70%" colspan="2" bgcolor="#F0F1C9"> TBD</td> </tr> +</table> -<tr> - <td width="15%">13:00 - 13:20</td> - <td width="15%"> Site Report</td> - <td width="25%"> Francois Diakhate, Francis Belot, Matthieu Hautreux (CEA)</td> - <td width="45%"> CEA Site Report</td> -</tr> -<tr> - <td width="15%">13:20 - 13:40</td> - <td width="15%"> Site Report</td> - <td width="25%"> Tim Wickberg (George Washington University)</td> - <td width="45%"> George Washington University Site Report</td> -</tr> -<tr> - <td width="15%">13:40 - 14:00</td> - <td width="15%"> Site Report</td> - <td width="25%"> Ryan Cox (BYU)</td> - <td width="45%"> Brigham Young University Site Report</td> -</tr> -<tr> - <td width="15%">14:00 - 14:20</td> - <td width="15%"> Site Report</td> - <td width="25%"> Doug Hughes, Chris Harwell, Eric Radman, Goran Pocina, Michael Fenn (D.E. Shaw Research)</td> - <td width="45%"> D.E. Shaw Research Site Report</td> -</tr> -<tr> - <td width="15%">14:20 - 14:40</td> - <td width="15%"> Site Report</td> - <td width="25%"> Dr. 
Ulf Markwardt (Technische Universitat Dresden)</td> - <td width="45%"> Technische Universitat Dresden Site Report</td> -</tr> +<h2>24 September 2014</h2> -<tr> - <td width="15%" bgcolor="#F0F1C9">14:40 - 15:10</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Coffee break</td> -</tr> +<table width="100%" border=1 cellspacing=0 cellpadding=0> -<tr> - <td width="15%">15:00 - 15:30</td> - <td width="15%"> Technical</td> - <td width="25%"> Morris Jette (SchedMD), Yiannis Georgiou (Bull)</td> - <td width="45%"> Slurm Roadmap</td> -</tr> -<tr> - <td width="15%">15:30 - 16:30</td> - <td width="15%"> Discussion</td> - <td width="25%"> Everyone</td> - <td width="45%"> Open Discussion</td> -</tr> + <tr> + <th width="15%">Time</th> + <th width="15%">Theme</th> + <th width="25%">Speaker</th> + <th width="45%">Title</th> + </tr> + + <tr> + <td width="15%" bgcolor="#F0F1C9">08:00 - 08:30</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Registration</td> + </tr> + + <tr> + <td width="15%">08:30 - 09:00</td> + <td width="15%"> Technical</td> + <td width="25%"> Jacqueline Scoggins (Lawrence Berkeley + National Lab)</td> + <td width="45%"> Complex environment migration from + Moab/Torque to Slurm</td> + </tr> + <tr> + <td width="15%">09:00 - 09:30</td> + <td width="15%"> Technical</td> + <td width="25%"> Huub Stoffers (SURFsara)</td> + <td width="45%"> A budget checking / budget tracking plug-in + for SLURM</td> + </tr> + + <tr> + <td width="15%">09:30 - 10:00</td> + <td width="15%"> Technical</td> + <td width="25%"> Ryan Cox, Levi Morrison (Brigham Young + University)</td> + <td width="45%"> Level-based job prioritization</td> + </tr> + + <tr> + <td width="15%" bgcolor="#F0F1C9">10:00 - 10:15</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Break</td> + </tr> + + <tr> + <td width="15%">10:15 - 10:45</td> + <td width="15%"> Technical</td> + <td width="25%"> Thomas Cadeau (BULL), Yiannis Georgiou + (BULL), Matthieu Hautreux (CEA)</td> + <td width="45%"> Integrating Layouts Framework in SLURM</td> + </tr> + + <tr> + <td width="15%">10:45 - 11:15</td> + <td width="15%"> Technical</td> + <td width="25%"> Emmanuel Jeannot, Guillaume Mercier, Adèle + Villiermet (INRIA)</td> + <td width="45%"> Topology-aware Resource Selection with Slurm</td> + </tr> + + <tr> + <td width="15%">11:15 - 11:45</td> + <td width="15%"> Technical</td> + <td width="25%"> Stephen Trofinoff (CSCS)</td> + <td width="45%"> Exploring the implementation of several key + Slurm Inter-cluster features</td> + </tr> + <tr> + <td width="15%">11:45 - 12:15</td> + <td width="15%"> Technical</td> + <td width="25%"> Danny Auble (SchedMD)</td> + <td width="45%"> Slurm Native Workload Management on Cray Systems</td> + </tr> + <tr> + <td width="15%" bgcolor="#F0F1C9">12:15 - 13:15</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Lunch</td> + </tr> + + <tr> + <td width="15%">13:15 - 13:45</td> + <td width="15%"> Technical</td> + <td width="25%"> Morris Jette (SchedMD), Yiannis Georgiou (Bull)</td> + <td width="45%"> Slurm Roadmap</td> + </tr> + <tr> + <td width="15%">13:45 - 14:05</td> + <td width="15%"> Site Report</td> + <td width="25%"> Magnus Jonsson (Umea University)</td> + <td width="45%"> Umea University Site Report</td> + </tr> + <tr> + <td width="15%">14:05 - 14:25</td> + <td width="15%"> Site Report</td> + <td width="25%"> Marcin Stolarek (Interdisciplinary Centre + for Mathematical and Computational Modelling (ICM), University of + Warsaw, Poland)</td> + <td width="45%"> University of Warsaw Site Report</td> + </tr> + <tr> + 
<td width="15%">14:25 - 14:45</td> + <td width="15%"> Site Report</td> + <td width="25%"> Andrew Elwell (iVEC)</td> + <td width="45%"> iVEC Site Report</td> + </tr> + <tr> + <td width="15%">14:45 - 15:05</td> + <td width="15%"> Site Report</td> + <td width="25%"> Matthieu Hautreux (CEA)</td> + <td width="45%"> CEA Site Report</td> + </tr> + + <tr> + <td width="15%" bgcolor="#F0F1C9">15:05 - 15:20</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Break</td> + </tr> + + <tr> + <td width="15%">15:20 - 15:40</td> + <td width="15%"> Site Report</td> + <td width="25%"> Benini Massimo (CSCS)</td> + <td width="45%"> CSCS Site Report</td> + </tr> + <tr> + <td width="15%">15:40 - 16:00</td> + <td width="15%"> Site Report</td> + <td width="25%"> Janne Blomqvist, Ivan Degtyarenko, Mikko + Hakala (Aalto University)</td> + <td width="45%"> Aalto University Site Report</td> + </tr> + <tr> + <td width="15%">16:00 - 16:20</td> + <td width="15%"> Site Report</td> + <td width="25%"> Tim Wickberg (George Washington University)</td> + <td width="45%"> George Washington University Site Report</td> + </tr> + <tr> + <td width="15%">16:20 - 16:30</td> + <td width="15%"> Closing</td> + <td width="25%"> Tim Wickberg (George Washington University), + Morris Jette (SchedMD)</td> + <td width="45%"> Closing/Invitation to Slurm User Group + Meeting 2015</td> + </tr> </table> <br><br> <a name="abstracts"><h1>Abstracts</h1></a> -<h2>September 18, 2013</h2> +<h2>September 23, 2014</h2> -<h3>Overview of Slurm Version 2.6</h3> -<p>Danny Auble, Morris Jette (SchedMD) -Yiannis Georgiou (Bull)</p> -<p>This presentation will provide an overview of Slurm enhancements in -version 2.6, released in May. Specific development to be described include:</p> +<h3>Overview of Slurm Versions 14.03 and 14.11</h3> +<p> Morris Jette (SchedMD), Yiannis Georgiou (Bull)</p> +<p>This presentation will describe new capabilities provided in Slurm + versions 14.03 (released March 2014) and planned for version 14.11 + (to be released in November 2014). Major enhancements in version 14.03 + include:</p> +<ul> + <li>Access control options for partitions</li> + <li>Load-based scheduling</li> + <li>Reservation of cores for system use</li> + <li>Native support for Cray systems</li> +</ul> +<p>Major enhancements planned for version 14.11 include:</p> <ul> -<li>Support for job arrays, which increases performance and ease of use for -sets of similar jobs.</li> -<li>Support for MapReduce+.</li> -<li>Added prolog and epilog support for advanced reservations.</li> -<li>Much faster throughput for job step execution.</li> -<li>Advanced reservations now supports specific different core count for each node.</li> -<li>Added external sensors plugin to capture temperature and power data.</li> -<li>Added job profiling capability.</li> -<li>CPU count limits by partition.</li> + <li>Support for heterogeneous generic resources</li> + <li>Support for non-consumable generic resources</li> + <li>Automatic job requeue based upon exit code</li> + <li>User control over CPU governor</li> + <li>Communication gateways</li> + <li>New options for job scheduling and task layout</li> + <li>Improved job array support</li> </ul> -<h3>Usage of Energy Accounting and External Sensor Plugins</h3> -<p>Yiannis Georgiou, Martin Perry, Thomas Cadeau (Bull) -Danny Auble (SchedMD)</p> -<p>Power Management has gradually passed from a trend to an important need in -High Performance Computing. 
Slurm version 2.6 provides functionalities for -energy consumption recording and accounting per node and job following both -in-band and out-of-band strategies. The new implementations consist of two new -plugins: One plugin allowing in-band collection of energy consumption data from -the BMC of each node based on freeipmi library; Another plugin allowing -out-of-band collection from a centralized storage based on rrdtool library. -The second plugin allows the integration of external mechanisms like wattmeters -to be taken into account for the energy consumption recording and accounting -per node and job. The data can be used by users and administrators to improve -the energy efficiency of their applications and the whole clusters in general.</p> -<p>The tutorial will provide a brief description of the various power -management features in Slurm and will make a detailed review of the new plugins -introduced in 2.6, with configuration and usage details along with examples of -actual deployment.</p> - -<h3>Evaluation of Monitoring and Control Features for Power Management</h3> -<p>Yiannis Georgiou , Thomas Cadeau(Bull), Danny Auble, Moe Jette(SchedMD), -Matthieu Hautreux (CEA)</p> -<p>High Performance Computing platforms are characterized by their - increasing needs in power consumption. The Resource and Job - Management System (RJMS) is the HPC middleware responsible for - distributing computing resources to user applications. Appearance of - hardware sensors along with their support on the kernel/software side can be - taken into account by the RJMS in order to enhance the monitoring - and control of the executions with energy considerations. This - essentially enables the applications' execution statistics for - online energy profiling and gives the possibility to users to - control the tradeoffs between energy consumption and performance. In - this work we present the design and evaluation of a new framework, - developed upon SLURM Resource and Job Management System, - which allows energy consumption recording and accounting per node - and job along with parameters for job energy control features based on static - frequency scaling of the CPUs. We evaluate the overhead of the design choices - and the precision of the energy consumption results with different - HPC benchmarks (IMB,stream,HPL) on real-scale platforms and - integrated wattmeters. Having as goal the deployment of the - framework on large petaflopic clusters such as Curie, scalability is - an important aspect.</p> - -<h3>Debugging Large Machines</h3> -<p>Matthieu Hautreux (CEA)</p> -<p>This talk will present some cases of particularly interesting bugs - that were studied/worked-around/corrected over the past few years - on the petaflopic machines installed and used at CEA. The goal - is to share with the administrator community some methods and tools - helping to identify and in some cases work-around or correct - unexpected performance issues or bugs.</p> - -<h3>Creating easy to use HPC portals with NICE EnginFrame and Slurm</h3> -<p>Alberto Falzone, Paolo Maggi (Nice)</p> -<p>NICE EnginFrame is a popular framework to easily create HPC portals -that provide user-friendly application-oriented computing and data -services, hiding all the complexity of the underlying IT infrastructure. 
-Designed for technical computing users in a broad range of markets -(Oil&Gas, Automotive, Aerospace, Medical, Finance, Research, and -more), EnginFrame simplifies engineers' and scientists' work -through its intuitive, self-documenting interfaces, increasing -productivity and streamlining data and resource -management. Leveraging all the major HPC job schedulers and remote -visualization technologies, EnginFrame translates user clicks into the -appropriate actions to submit HPC jobs, create remote visualization -sessions, monitor workloads on distributed resources, manage data -and much more. In this work we describe the integration between the -SLURM Workload Manager and EnginFrame. We will then illustrate how -this integration can be leveraged to create easy to use HPC portals -for SLURM-based HPC infrastructures.</p> - -<h3>Slurm evaluation using emulation and replay of real workload traces</h3> -<p>David Glesser, Yiannis Georgiou, Joseph Emeras, Olivier Richard (Bull)</p> -<p>The experimentation and evaluation of Resource and Job Management - Systems in HPC supercomputers are characterized by important - complexities due to the inter-dependency of multiple parameters that - have to be taken into control. In our study we have developed a - methodology based upon emulated controlled experimentation, under - real conditions, with submission of workload traces extracted from a - production system. The methodology is used to perform comparisons of - different Slurm configurations in order to deduce the best - configuration for the typical workload that takes place on the - supercomputer, without disturbing the production. We will present - observations and evaluations results using real workload traces - extracted from Curie supercomputer,Top500 system with 80640, - replayed upon only 128 cores of a machine with similar - architecture. Various interesting results are extracted and important - side effects are discussed along with proposed configurations for - each type of workloads. Ideas for improvements on Slurm are also - proposed.</p> - -<h3>Usage of new profiling functionalities</h3> -<p>Rod Schultz, Yiannis Georgiou (Bull), Danny Auble (SchedMD)</p> -<p>SLURM Version 2.6 includes the ability to gather detailed -performance data on jobs. It has a plugin that stores the detailed -data in an HDF5 file. Other plugin gather data on task performance -such as cpu usage, memory usage, and local disk I/O; I/O to the -Lustre file system; traffic through and Infiniband network -interface; and energy information collected from IPMI. -This tutorial will describe the new capability, show how to configure -the various data sources, show examples of different data streams, -and report on actual usage.</p> - -<h2>September 19, 2013</h2> - -<h3>Fault Tolerant Workload Management</h3> -<p>Morris Jette, David Bigagli, Danny Auble (SchedMD)</p> -<p>One of the major issues facing exascale computing is fault -tolerance; how can a computer be effectively used if the typical job -execution time exceeds its mean time between failure. Part of the -solution is providing users with means to address failures in a -coordinated fashion with a highly adaptable workload manager. Such a -solution would support coordinated recognition of failures, -notification of failing and failed components, replacement -resources, and extended job time limits using negotiated interactive -communications. 
This paper describes fault tolerance issues from the -perspective of a workload manager and the implementation of solution -designed to optimize job fault tolerance based upon the popular open -source workload manager, Slurm.</p> - -<h3>Slurm Layouts Framework</h3> -<p>Yiannis Georgiou (Bull), Matthieu Hautreux (CEA)</p> -<p>This talk will describe the origins and goals of the study -concerning the Layouts Framework as well as first targets, current -developments and results. The layouts framework aims at providing a -uniform and generalized way to describe the hierarchical -relations between resources managed by a RM in order to use that -information in related RM internal logic. Examples of -instantiated layouts could be the description of the network -connectivity of nodes for the Slurm internal communication, the -description of the power supply network and capacities per branch -powering up the nodes, the description of the racking of the nodes, ...<p> - -<h3>License Management</h3> -<p>Bill Brophy (Bull)</p> -<p>License management becomes an increasingly critical issue as the -size of systems increase. These valuable resources deserve the same -careful management as all other resources configured in a -cluster. When licenses are being utilized in both interactive and -batch execution environments with multiple resource managers -involved the complexity of this task increases -significantly. Current license management within SLURM is not -integrated with any external license managers. This approach is -adequate if all jobs requiring licenses are submitted through SLURM -or if SLURM is given a subset of the licenses available on the -system to sub manage. However, the case of sub management can result -in underutilization of valuable license resources. Documentation for -other resource managers describes their interaction with external -license managers. For SLURM to become an active participant in -license management an evolution to its management approach must -occur. This article proposes a two-phased approach for accomplishing -that transformation. In the first phase, enhancements are proposed for -now SLURM internally deals with licenses: restriction of license to -specific accounts or users, provides recommendations for keeping -track of license information and suggestions for how this -information can be displayed for a SLURM users or -administrators. The second phase of this effort, which is -considerably more ambitious, is to define an evolution of SLURM's -approach to license management. This phase introduces an interaction -between SLURM and external license managers. The goal of this effort -is to increase SLURM's effectiveness in another area of resource -management, namely management of software licenses.</p> - -<h3>Multi-Cluster Management</h3> -<p>Juan Pancorbo Armada (IRZ)</p> -<p>As a service provider for scientific high performance computing, -Leibniz Rechen Zentrum (LRZ) operates compute systems for use by -educational institutions in Munich, Bavaria, as well as on the -national level. LRZ provides own computing resources as well as -housing and managing computing resources from other institutions -such as Max Planck Institute, or Ludwig Maximilians University. -The tier 2 Linux cluster operated at LRZ is a heterogeneous system -with different types of compute nodes, divided into 13 different -partitions, each of which is managed by SLURM. 
The various -partitions are configured for the different needs and services -requested, ranging from single node multiple core NUMAlink shared -memory clusters, to a 16-way infiniband- connected cluster for -parallel job execution, or an 8-way Gbit Ethernet cluster for serial -job execution. The management of all partitions is centralized on a -single VM. In this VM one SLURM cluster for each of these Linux -cluster partitions is configured. The required SLURM control daemons -run concurrently on this VM. With the use of a wrapper script called -MSLURM, the SLURM administrator can send SLURM commands to any -cluster in an easy-to use and flexible manner, including starting or -stopping the complete SLURM subsystem. Although such a setup may not -be desirable for large homogeneous supercomputing clusters, on small -heterogeneous clusters it has its own advantages. No separate control -node is required for each cluster for the slurmctld to run, so the -control of small clusters can be grouped in a single control -node. This feature also help to solve the restriction for some -parameters that cannot be set to different values for different -partitions in the same slurm.conf file; in that case it is possible -to move such parameters to partition-specific slurm.conf files.</p> - -<h3>Preparing Slurm for use on the Cray XC30</h3> -<p>Stephen Trofinoff, Colin McMurtrie (CSCS)</p> -<p>In this paper we describe the technical details associated with the -preparation of Slurm for use on a XC30 system installed at the Swiss -National Supercomputing Centre (CSCS). The system comprises external -login nodes, internal login nodes and a new ALPS/BASIL version so a -number of technical details needed to be overcome in order to have -Slurm working, as desired, on the system. Due to the backward -compatibility of ALPS/BASIL and the well-written code of Slurm, -Slurm was able to run, as it had in the past on previous Cray -systems, with little effort. However some problems were encountered -and their identification and resolution is described in -detail. Moreover, we describe the work involved in enhancing Slurm -to utilize the new BASIL protocol. Finally, we provide detail on the -work done to improve the Slurm task affinity bindings on a -general-purpose Linux cluster so that they, as closely as possible, -match the Cray bindings, thereby providing our users with some -degree of consistency in application behavior between these systems.</p> - -<h3>Refactoring ALPS</h3> -<p>Dave Wallace (Cray)</p> -<p>One of the hallmarks of the Cray Linux Environment is the Cray -Application Level Placement Scheduler (ALPS). ALPS is a resource -placement infrastructure used on all Cray systems. Developed by -Cray, ALPS addresses the size, complexity, and unique resource -management challenges presented by Cray systems. It works in -conjunction with workload management tools such as SLURM to -schedule, allocate, and launch applications. ALPS separates policy -from placement, so it launches applications but does not conflict -with batch system policies. The batch system interacts with ALPS via -an XML interface. Over time, the requirement to support more and -varied platform and processor capabilities, dynamic resource -management and new workload manager features has led Cray to -investigate alternatives to provide more flexible methods for -supporting expanding workload manager capabilities on Cray -systems. 
This presentation will highlight Cray's plans to expose low -level hardware interfaces by refactoring ALPS to allow 'native' -workload manager implementations that don't rely on the current ALPS -interface mechanism.</p> +<h3>Warewulf Node Health Check</h3> +<p>Michael Jennings, Jacqueline Scoggins (Lawrence Berkeley + National Lab)</p> +<p>Since its release to the HPC community in 2011, the Warewulf Node + Health Check system has gained wide acceptance across the industry + and has become the de facto standard community solution for compute + node health checking. It provides a complete, optimized framework + for creating and executing node-level checks and already comes with + more than 40 of its own pre-written checks. It fully supports SLURM + (as well as other popular schedulers & resource managers) and can + directly error/drain failed nodes and subsequently return them to + service once fixed. Having been used in production at Lawrence + Berkeley National Laboratory since late-2010, Warewulf NHC has + evolved and matured to become a vital asset in maximizing the + integrity and reliability of high-performance computational + resources.</p> + +<p>In this talk, we'll discuss what makes Warewulf NHC such a unique + and robust solution to the problem of compute node health, look at + the feature set of NHC and its integration with SLURM, examine LBNL's + configuration and utilization of SLURM and NHC with tips on how to + quickly deploy it in your environment, and survey many of the + available checks that are supplied out-of-the-box. Time permitting, a + brief introduction to writing custom or site-specific checks may also + be included.</p> + +<h3>SLURM processes isolation</h3> +<p>Martin Perry (BULL), Bill Brophy (BULL), Yiannis Georgiou (BULL), + Danny Auble (SchedMD), Morris Jette (SchedMD), Matthieu Hautreux (CEA)</p> +<p>On the compute nodes Slurm related processes and threads share the + resources (CPUs, Memory) with the applications. Even if the overhead + of slurm processes and threads is not really important, there could + be interference and de-synchronization in cases where the application + makes heavy usage of resources.</p> +<p>The goal is to automatically confine the slurm related process and + threads (slurmd, slurmstepd, jobacct, etc) on particular cores and + memory of the compute node. This will limit the interference of slurm + on the application execution and may improve the performance of the + applications. We present the design choices along with the developed + code and we provide experiments and observations.</p> + +<h3>Improving forwarding logic in SLURM</h3> +<p>Rod Schultz (BULL), Martin Perry (BULL), Yiannis Georgiou (BULL), + Danny Auble (SchedMD), Morris Jette (SchedMD), Matthieu Hautreux (CEA)</p> +<p>In this presentation we describe the motivations and design of the + communication logic re-factoring in Slurm in order to provide + partially deterministic direct and reverse tree communications. 
The + goals of these developments are to:</p> +<ul> + <li>Better handle the mapping between the trees of communication + used by SLURM and the existing physical network connections in + order to improve performance.</li> + <li> Provide the ability to aggregate messages directed to the + controller in order to limit the amount of RPC that have to be + managed simultaneously so that we can diminish communication + bottlenecks.</li> +</ul> -<h3>CEA Site Report</h3> -<p>Francois Daikhate, Francis Belot, Matthieu Hautreux (CEA)</p> -<p>The site report will detail the evolution of Slurm usage at CEA -as well as recent developments used on production systems. A -modification of the fairshare logic to better handle fair sharing of -resources between unbalanced groups hierarchies will be detailed.</p> - -<h3>George Washington University Site Report</h3> -<p>Tim Wickberg (George Washington University)<p> -<p>The site report will detail the evaluation of Slurm usage at -George Washington University, and the new Colonial One System.</p> - -<h3>Brigham Young University Site Report</h3> -<p>Ryan Cox (BYU)<p> -<p>The site report will detail the evaluation of Slurm at Brigham Young -University.</p> - -<h3>D.E. Shaw Research Site Report</h3> -<p>Doug Hughes, Chris Harwell, Eric Radman, Goran Pocina, Michael Fenn -(D.E. Shaw Research)</p> -<p>DESRES uses SLURM to schedule Anton. Anton is a specialized -supercomputer which executes molecular dynamics (MD) simulations of -proteins and other biological macromolecules orders of magnitude -faster than was previously possible. In this report, we present the -current SLURM configuration for scheduling Anton and launching our -MD application. We take advantage of the ability to run multiple -slurmd programs on a single node and use them as place-holders for -the Anton machines. We combine that with a pool of commodity Linux -nodes which act as frontends to any of the Anton machines where the -application is launched. We run a partition-specific prolog to insure -machine health prior to starting a job and to reset ASICs if -necessary. We also periodically run health checks and set nodes to -drain or resume via scontrol. Recently we have also used the prolog -to set a specific QOS for jobs which run on an early (and slower) -version of the ASIC in order to adjust the fair-share UsageFactor.</p> -<p>DESRES also uses SLURM to schedule a cluster of commodity nodes for -running regressions, our DESMOND MD program and various other -computational chemistry software. The jobs are an interesting mix of -those with MPI required and those without, short (minutes) and long (weeks).</p> -<p>DESRES is also investigating using SLURM to schedule a small -cluster of 8-GPU nodes for a port of the DESMOND MD program to -GPUs. This workload includes both full node 8-GPU jobs and multi-node -full 8-GPU per node jobs, but also jobs with lower GPU requirements -such that multiple jobs would be on a single node. We've made use of -CPU affinity and binding. GRES was not quite flexible enough and we -ended up taking advantage of the 8 CPU to 8 GPU opting to assign -GPUs to specific CPUs.</p> - -<h3>Technische Universitat Dresden Site Report</h3> -<p>Dr. 
Ulf Markwardt (Technische Universitat Dresden)</p>
-<p>This site report will detail the recent introduction of Slurm on a new
-computer at Technische Universitat Dresden.</p>
-
-<h3>Depth Oblivious Hierarchical Fairshare Priority Factor</h3>
-<p>Francois Daikhate, Matthieu Hautreux (CEA)</p>
-<p>As High Performance Computing use becomes prevalent in increasingly varied
-scientific and industrial fields, clusters often need to be shared by a growing
-number of user communities. One aspect of managing these heterogenous groups
-involves being able to schedule their jobs fairly according to their respective
-machine shares. In this talk we look at how slurm hierarchical fairshare
-algorithms handle this task when user groups form complex hierarchies. We
-propose an alternative formula to compute job priorities which improves
-fairness in this situation.</p>
-
-<h3>Slurm Roadmap</h3>
+<h3>Tuning Slurm Scheduling for Optimal Responsiveness and Utilization</h3>
+<p>Morris Jette (SchedMD)</p>
+<p>Slurm supports a multitude of scheduling options to achieve
+  administrative goals for responsiveness, utilization, and service
+  level under a wide assortment of workloads. Many of these options
+  have been added in the past year and are still little known. This
+  tutorial will present an overview of scheduling configuration options
+  for job prioritization, Quality of Service, backfill scheduling, job
+  preemption, and gang scheduling. Advice will be provided on how to
+  analyze the current workload and tune the system.</p>
+
+<h3>Improving HPC applications scheduling with predictions based on
+  automatically-collected historical data</h3>
+<p>Carles Fenoy (BSC)</p>
+<p>This work analyses the benefits of a system which, being able to
+  get real performance data from jobs, uses it for future scheduling in
+  order to improve the performance of the applications with minimal
+  user input. The study is focused on the memory bandwidth usage of
+  applications and its impact on the running time when sharing the same
+  node with other jobs. The data used for scheduling purposes is
+  extracted from the hardware counters during the application execution
+  identified by a tag specified by the user. This information allows
+  the system to predict the resource requirements of a job and allocate
+  it more effectively.</p>
+
+<h3>Fair Scheduler for Burst Submissions of Parallel Job</h3>
+<p>Filip Skalski, Krzysztof Rzadca (Institute of Informatics,
+  University of Warsaw, Poland)</p>
+<p>Large-scale HPC systems are shared by many users. Besides the
+  system's efficiency, the main goal of the scheduler is to serve users
+  according to a scheduling policy. The fair-share algorithm strives
+  to build schedules in which each user achieves her target average
+  utilization rate. This method was fine when each user had just a few
+  jobs. However, modern workloads are often composed of campaigns:
+  many jobs submitted by the same user at roughly the same time (e.g.
+  bag-of-tasks or SLURM's job arrays). For such workloads, fair-share
+  is not optimal because users frequently have similar utilization
+  metrics and, in such situations, the schedule switches between
+  users, executing just a few jobs of each one of them. However, it
+  would be more efficient to assign the maximum number of resources to
+  one user at a time.</p>
+<p>OStrich, our scheduling algorithm, is optimized for campaigns of
+  jobs. OStrich maintains a virtual schedule that partitions resources
+  between users' workloads according to pre-defined shares. The
+  virtual schedule drives the allocation of the real processors.</p>
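As a rough illustration of the virtual-schedule idea just described (a toy C sketch, not the OStrich plugin itself; the shares, work estimates and selection rule below are invented for the example), a dispatcher can serve the user whose pending campaign would finish earliest if the machine were divided strictly according to shares:

/* Toy illustration of a campaign-aware virtual schedule.
 * Hypothetical sketch only -- not the OStrich priority plugin. */
#include <stdio.h>

#define NUSERS 3

struct user {
    const char *name;
    double share;          /* fraction of the machine owned by the user */
    double pending_work;   /* remaining campaign work, in CPU-seconds   */
};

/* Virtual completion time of a user's campaign if the machine were
 * partitioned strictly according to shares. */
static double virtual_completion(const struct user *u, double total_cpus)
{
    if (u->pending_work <= 0.0)
        return -1.0;                      /* nothing queued */
    return u->pending_work / (u->share * total_cpus);
}

/* Pick the user to serve next: the one whose campaign ends earliest in
 * the virtual schedule, so whole campaigns are drained one user at a time. */
static int pick_next_user(const struct user *users, int n, double total_cpus)
{
    int best = -1;
    double best_t = 0.0;
    for (int i = 0; i < n; i++) {
        double t = virtual_completion(&users[i], total_cpus);
        if (t < 0.0)
            continue;
        if (best < 0 || t < best_t) {
            best = i;
            best_t = t;
        }
    }
    return best;
}

int main(void)
{
    struct user users[NUSERS] = {
        { "alice", 0.5, 3600.0 },   /* large campaign, large share */
        { "bob",   0.3,  600.0 },   /* small campaign              */
        { "carol", 0.2, 1800.0 },
    };
    int next = pick_next_user(users, NUSERS, 128.0);
    if (next >= 0)
        printf("serve user %s next\n", users[next].name);
    return 0;
}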
+<p>We implemented OStrich as a priority plugin for SLURM and performed
+  experimental evaluation on an emulated cluster. Compared with
+  fair-share (the multifactor plugin), OStrich schedules have lower
+  slowdowns while maintaining equal system utilization. Moreover,
+  the OStrich plugin uses normalized shares similarly to the multifactor
+  plugin, so it doesn't require any administrative changes other
+  than a simple change to the SLURM configuration file. We think that
+  OStrich is a viable alternative to fair-share in supercomputers with
+  campaign-like workloads.</p>
+
+<h3>Introducing Power-capping in SLURM scheduling</h3>
+<p>Yiannis Georgiou (BULL), David Glesser (BULL), Matthieu Hautreux
+  (CEA), Denis Trystram (Univ. Grenoble-Alpes)</p>
+<p>The last decades have been characterized by an ever growing
+  requirement in terms of computing and storage resources. This
+  tendency has recently put pressure on the ability to efficiently
+  manage the power required to operate the huge amount of electrical
+  components associated with state-of-the-art computing and data
+  centers. The power consumption of a supercomputer needs to be
+  adjusted based on a varying power budget or electricity
+  availability. As a consequence, Resource and Job Management Systems
+  have to be adequately adapted in order to efficiently schedule jobs
+  with optimized performance while limiting power usage whenever
+  needed. Our goal is to introduce a new power consumption adaptive
+  scheduling strategy that provides the capability to autonomously
+  adapt the executed workload to the available or planned power
+  budget. The originality of this approach lies in a combination of
+  DVFS (Dynamic Voltage and Frequency Scaling) and node shut-down
+  techniques.</p>
+
+<h3>Introducing Energy based fair-share scheduling</h3>
+<p>David Glesser (BULL), Yiannis Georgiou (BULL),
+  Denis Trystram (Univ. Grenoble-Alpes)</p>
+<p>Energy consumption has become one of the most important parameters
+  in High Performance Computing platforms. Fair-share scheduling is a
+  widely used technique in job schedulers to prioritize jobs,
+  depending on past user allocations. In practice this technique is
+  mainly based on CPU-time usage. Since power is managed as a new type
+  of resource by SLURM and energy consumption can be charged
+  independently, there is a real need for fairness in terms of energy
+  consumption.</p>
+<p>This presentation will introduce fair-share scheduling based on
+  past energy usage in SLURM. The new technique will allow users that
+  have optimized their codes to be more energy efficient, or make better
+  use of DVFS techniques, to improve the stretch times of their
+  workload.</p>
+
+<h3>Data movement between Lustre and Enterprise storage systems</h3>
+<p>Aamir Rashid (Terascala)</p>
+<p>High Performance Data movement is a requirement and a challenge for
+  HPC (large data sets, high rate of processing, over-provisioning,
+  compliance, etc.). An example is the data movement inherent in HPC
+  workflows like genome sequencing. This problem belongs to application
+  users and is related to HSM. If users are able to effectively manage
+  data movement tasks as part of their workflows then the IT storage
+  management problem is significantly diminished. However, to
However, to + accomplish this, users need tools that they currently do not + have.</p> +<p>Terascala has developed a new product, Intelligent Storage Bridge + (ISB), for effective data movement between a Lustre appliance and + Enterprise storage systems. ISB is a highly available, scalable and a + policy driven engine that is geared towards end users and automated + workflows. This talk will discuss the features of SLURM that are most + important in a user driven data management solution and highlight + lessons learned.</p> + +<h3>Extending SLURM with Support for Remote GPU Virtualization</h3> +<p>Sergio Iserte, Adrian Castello, Rafael Mayo, Enrique + S. Quintana-Ort (Universitat Jaume I de Castello) Federico Silla, Jose Duato + (Universitat Politecnica de Valencia)</p> +<p>Remote GPU virtualization offers an alluring means to increase + utilization of the GPUs installed in a cluster, which can + potentially yield a faster amortization of the total costs of + ownership (TCO). Concretely, GPU virtualization logically decouples + the GPUs in the cluster from the nodes they are located in, opening + a path to share the accelerators among all the applications that + request GPGPU services, independently of whether the node(s) these + applications are mapped to are equipped with a GPU or not. In this + manner the amount of these accelerators can be reduced, and their + utilization rate can be significantly improved.</p> +<p>SLURM can use a generic resource plug-in (GRes) to manage + GPUs. With this solution the hardware accelerators, like the GPUs, + can only be accessed by the job that is in execution on the node to + which the GPU is attached. This is a serious constraint for remote + GPU virtualization technologies, which aim to provide a completely + user-transparent access to all GPUs in cluster, independently of the + specific locations of the application node and the GPU node.</p> +<p>In this work we introduce a new type of resource in SLURM, the + remote GPU (rGPU), in order to gain access from any application node + to any GPU node in the cluster using rCUDA as the remote GPU + virtualization solution. With this new resource, users can access + all GPUs needed for their jobs, as SLURM schedules the task taking + into account all the GPUs available in the whole cluster. In other + words, introducing GPU-virtualization aware mechanism into SLURM + allow applications to execute CUDA kernels in all GPUs, + independently of their location.</p> + +<h2>September 24, 2014</h2> + +<h3>Complex environment migration from Moab/Torque to Slurm</h3> +<p>Jacqueline Scoggins (Lawrence Berkeley National Lab)</p> +<p>In most HPC environments admins are faced with setting up a + scheduling environment based on the individual or institutional + cluster requirements. Sites that have multiple clusters may have + to install the same scheduler on each system but the policies and + functionality might be different between the various installations. + But as the number of clusters grow and the policies and + requirements change this can become very difficult to manage. How + can this be done simpler without the integration nightmares? At + LBNL we merged our distinct resources under a + common infrastructure to leverage a uniform support architecture and + scavenge unused CPU cycles and expand into a condo-cluster model + using one scheduler. We previously did this using Moab/Torque for + several years but recently migrated to SLURM. 
The challenge was
+  how to make SLURM meet the exceedingly arduous needs of our
+  environment - Accounting, backfill, reservations, fairshare, QOS,
+  Partitions, Multifactor job prioritization and the ability to have
+  limits set on a user/group level basis so that the individual and
+  institutional clusters would not affect each other. Considering our
+  extremely complicated environment and the many production resources
+  and users that were impacted by this change, we took a very careful
+  and diligent approach to the migration, and it resulted in minimal
+  adverse effects on our user base and support engineers. This talk
+  will focus on our method and experiences of this
+  migration.</p>
+
+
+<h3>A budget checking / budget tracking plug-in for SLURM</h3>
+<p>Huub Stoffers (SURFsara)</p>
+<p>We propose to design and implement a plug-in for the SLURM control
+  daemon that is capable of calculating "job cost" on the basis of job
+  resource usage and that keeps track of budgets, registered per
+  account, as they are spent by running jobs. SLURM does a good job
+  logging the run time of jobs and their usage of resources during
+  that time interval. It does not, however, know how to translate the
+  usage of resources into the spending of the budget that was granted
+  to projects.</p>
+<p>Traditionally, this is not the responsibility of the batch system
+  but of the site's accounting system, because the decisions of which
+  resource(s) to account for, and at what price, are very site
+  specific. Moreover, translating the resource usage of a job to
+  budget reductions is most conveniently done after job completion,
+  when the resource usage is final and completely known. Then, the
+  "raw" data can simply be handed over to the accounting system for
+  subsequent interpretation. But this division of labor and its
+  associated sequence of events have a serious disadvantage:
+  overspending by projects is only noticed when it has already
+  happened.</p>
+<p>Projects running on our compute facilities generally can do so
+  because they have successfully passed through a review process and
+  were granted a budget to be spent on compute resources on behalf of
+  the project. Sometimes it is possible to get a prolongation for a
+  project or to shift an amount of budget between two projects granted
+  to the same principal investigator. But funding agencies are quite
+  strict. They do not wish to tolerate that any project spends more
+  budget than it was formally granted.</p>
+<p>New jobs likely to cost more than their project's remaining budgets
+  simply should not be dispatched. SLURM already has the concept that
+  a job is run under an account that is associated with one or more
+  users. A budget should be associated with such an account too. "Job
+  cost" is presumably highly dependent on the actual run time of the
+  job. When a job is about to be dispatched, its maximum "job cost"
+  must be calculated, based on its attributes, such as the number of
+  cores or nodes, the partition, and its maximum wall clock time. The
+  maximum job cost must be temporarily claimed, subtracted from the
+  project's budget, for as long as the job runs. When the job is
+  finished, the actual job cost can be calculated and permanently
+  subtracted from the budget while, at the same time, the temporarily
+  claimed maximum "job cost" is given back - i.e. added again.</p>
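The claim-and-settle bookkeeping outlined above might look roughly like the following hypothetical C sketch; the cost formula, the field names and the pricing are invented for illustration and are not part of the proposed plug-in:

/* Hypothetical budget bookkeeping sketch: claim the maximum job cost at
 * dispatch, settle the real cost at completion.  Not Slurm API code. */
#include <stdbool.h>
#include <stdio.h>

struct budget {
    double granted;     /* billable units granted to the account        */
    double spent;       /* units permanently consumed by finished jobs  */
    double claimed;     /* units temporarily claimed by running jobs    */
};

/* Example pricing: billable units per CPU-hour for one partition. */
static double job_cost(unsigned cpus, double hours, double unit_price)
{
    return cpus * hours * unit_price;
}

/* Called when a job is about to start: refuse it if the worst case
 * (full time limit) would overdraw the budget. */
static bool budget_claim(struct budget *b, double max_cost)
{
    if (b->spent + b->claimed + max_cost > b->granted)
        return false;                  /* would exceed the grant */
    b->claimed += max_cost;
    return true;
}

/* Called at job completion: release the claim, charge the real usage. */
static void budget_settle(struct budget *b, double max_cost, double real_cost)
{
    b->claimed -= max_cost;
    b->spent   += real_cost;
}

int main(void)
{
    struct budget acct = { .granted = 10000.0 };
    double max = job_cost(64, 24.0, 1.0);    /* 64 CPUs, 24 h time limit */

    if (!budget_claim(&acct, max)) {
        printf("job held: insufficient budget\n");
        return 0;
    }
    /* ... job runs and finishes after 6 of the 24 allowed hours ... */
    budget_settle(&acct, max, job_cost(64, 6.0, 1.0));
    printf("spent=%.0f claimed=%.0f remaining=%.0f\n",
           acct.spent, acct.claimed, acct.granted - acct.spent);
    return 0;
}

Keeping the temporary claim separate from the permanently spent amount is what allows overspending to be refused before a job starts rather than discovered afterwards.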
+<p>Preventive, "live", budget checking during each job dispatch
+  presently can be implemented, or at least approximated, by a
+  prologue script. But this involves substantial sacct and squeue
+  querying and subsequent calculations based on the returned results
+  that can strain the system much more than directly keeping track of
+  a project's budget. Budgets are typically specified in terms of
+  abstract "system billable units" that can be spent by using
+  discrete quantities of the resources that the compute facility has
+  to offer. The number of core hours is usually an important resource
+  that is accounted, but there may be differences in pricing,
+  e.g. between core hours on nodes with or without GPU support, or
+  with smaller or larger quantities of memory per core. Other
+  consumable resources, such as the time that particular software
+  licenses are checked out by a job, may be accounted too. In SLURM it
+  is customary to use partitions to differentiate between
+  heterogeneously equipped nodes. Clearly, the relative pricing of
+  core hours of different partitions should be configurable in the
+  configuration file of the plug-in. The actual details of "Job cost"
+  calculation will remain site specific and hence should be
+  concentrated in a single jobcost function. Hooks should be added so
+  that it can be called, and its outcome processed, at job dispatch
+  time and - for a job that is dispatched - at job completion time.</p>
+
+<h3>Level-based job prioritization</h3>
+<p>Ryan Cox and Levi Morrison (Brigham Young University)</p>
+<p>We will present our new LEVEL_BASED job prioritization mechanism.
+  The algorithm prioritizes users such that users in an under-served account
+  will always have a higher fair share factor than users in an over-served
+  account. It recurses through the account tree, calculates fair share at
+  each level, then uses bitwise math to ensure that the effect of usage and
+  shares below the current level cannot affect calculations at the current
+  level.</p>
+<p>Basically, if accounts A and B have the same shares but A has higher usage
+  than B, then children of account A will have a lower fair share factor than
+  children of account B. This is not guaranteed in other prioritization methods.
+  LEVEL_BASED was also designed to reduce the likelihood of errors due to
+  floating point precision loss.</p>
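The bitwise idea can be illustrated with a small sketch (an illustration of the general approach only, not the BYU implementation; the 16-bit quantization and four-level limit are arbitrary choices for the example): the fair-share factor computed at each level of the account tree is quantized and packed into one 64-bit key, most significant level first, so that usage differences deeper in the tree can never outweigh a difference higher up.

/* Illustrative packing of per-level fair-share factors into one integer,
 * so higher levels of the account tree always dominate.  Hypothetical
 * sketch, not the LEVEL_BASED plugin source. */
#include <stdint.h>
#include <stdio.h>

#define LEVEL_BITS 16                 /* precision kept per tree level */
#define MAX_LEVELS  4                 /* 4 * 16 = 64 bits              */

/* Quantize a fair-share factor in [0.0, 1.0] to LEVEL_BITS bits. */
static uint64_t quantize(double factor)
{
    if (factor < 0.0) factor = 0.0;
    if (factor > 1.0) factor = 1.0;
    return (uint64_t)(factor * ((1u << LEVEL_BITS) - 1));
}

/* Pack the factors from the root level (index 0) downward.  A difference
 * at level i can never be overcome by levels deeper than i. */
static uint64_t pack_levels(const double *factors, int nlevels)
{
    uint64_t key = 0;
    for (int i = 0; i < nlevels && i < MAX_LEVELS; i++)
        key = (key << LEVEL_BITS) | quantize(factors[i]);
    /* Left-justify so shorter account paths compare consistently. */
    for (int i = nlevels; i < MAX_LEVELS; i++)
        key <<= LEVEL_BITS;
    return key;
}

int main(void)
{
    /* Account A is over-served at the top level; its child's excellent
     * local factor cannot push it above account B's child. */
    double user_in_a[2] = { 0.30, 0.99 };
    double user_in_b[2] = { 0.70, 0.10 };
    printf("A-user key %llu < B-user key %llu -> %d\n",
           (unsigned long long)pack_levels(user_in_a, 2),
           (unsigned long long)pack_levels(user_in_b, 2),
           pack_levels(user_in_a, 2) < pack_levels(user_in_b, 2));
    return 0;
}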
+
+<h3>Integrating Layouts Framework in SLURM</h3>
+<p>Thomas Cadeau (BULL), Yiannis Georgiou (BULL), Matthieu Hautreux (CEA)</p>
+<p>Supercomputers are becoming more powerful but also more complicated to
+  manage. Resources hide information that can be taken into account
+  for more efficient management. Those characteristics may impact the
+  way resources should be used and may provide valuable information
+  (such as power consumption, network details, etc.) that can be used to
+  optimize automatic decisions such as scheduling, energy efficiency,
+  placement, and scalability.</p>
+<p>The layouts framework was introduced at the last SLURM User
+  Group. This presentation will introduce a new API that has been
+  developed to get, update and consolidate information described by
+  layouts so that they can be used wherever needed internally in
+  SLURM. Information such as the placement of each resource in the
+  actual infrastructure can be taken into account for more efficient
+  scheduling of jobs. Information such as the power consumption of
+  resources can be taken into account for power-aware scheduling.</p>
+<p>Furthermore, a new set of scontrol options will be presented to
+  enable users and administrators to dynamically modify and display
+  layouts information.</p>
+
+<h3>Topology-aware Resource Selection with Slurm</h3>
+<p>Emmanuel Jeannot, Guillaume Mercier, Adèle Villiermet (INRIA)</p>
+
+<h3>Exploring the implementation of several key Slurm Inter-cluster
+  features</h3>
+<p>Stephen Trofinoff (CSCS)</p>
+<p>Over the course of several years, both at our site (CSCS) and at
+  others of which we were told, various instances have arisen where
+  there was a need for some inter-cluster Slurm features. These
+  features would simplify or in some cases enable use cases for our
+  various computing facilities and potentially make administering them
+  easier. One prominent such request was for the ability to chain a
+  job to one or more jobs on a remote Slurm cluster. These features,
+  of course, do not currently exist or are limited in their scope.
+  For instance, a job can be submitted to a remote Slurm cluster but
+  cannot be "chained" to a job on another cluster since one Slurm
+  cluster's controller has no knowledge of the jobs of another.
+  Therefore, after various discussions, it was decided to start a
+  small project at our site to explore the potential implementation of
+  some of these features. The project is a work-in-progress.</p>
+<p>This paper and the corresponding presentation will discuss some of
+  the work done thus far.
This includes specifying the particular
+  features chosen for examination and any issues related to their
+  implementation.</p>
+
+<h3>Slurm Native Workload Management on Cray Systems</h3>
+<p>Danny Auble (SchedMD)</p>
+<p>Cray's Application Level Placement Scheduler (ALPS) software has
+  recently been refactored to expose low-level network management
+  interfaces in a new library. Slurm is the first workload manager to
+  utilize this new Cray infrastructure to directly manage network
+  resources and launch applications without ALPS. New capabilities
+  provided by Slurm include the ability to execute multiple jobs per
+  node, the ability to execute many applications within a single job
+  allocation (ALPS reservation), greater flexibility in scheduling,
+  and higher throughput without sacrificing scalability or
+  performance. This presentation includes a description of ALPS
+  refactoring, new Slurm plugins for Cray systems, and the changes in
+  functionality provided by this new architecture.</p>
+
+<h3>Slurm RoadMap</h3>
 <p>Morris Jette (SchedMD), Yiannis Georgiou (Bull)</p>
-<p>Slurm continues to evolve rapidly, with two major releases per
-year. This presentation will outline Slurm development plans in the
-coming years. Particular attention will be given to describing
-anticipated workload management requirements for Exascale
-computing. These requirements include not only scalability issues,
-but a new focus on power management, fault tolerance, topology
-optimized scheduling, and heterogeneous computing.</p>
-
-<p style="text-align:center;">Last modified 16 September 2013</p>
+<p>Slurm long-term development remains focused on the needs of high
+  performance computing. The Slurm roadmap continues to evolve as a
+  greater understanding of unique Exascale computer requirements
+  develops. For example, Exascale computers may well contain tens of
+  thousands of compute nodes, which necessitates changes in Slurm
+  communications infrastructure. Exascale power consumption will need
+  to be carefully regulated with power capping, throttling the rate of
+  change, and managing the workload to maximize system
+  utilization. This presentation will describe upcoming Slurm
+  development plans.</p>
+
+<h3>Umea University Site Report</h3>
+<p>Magnus Jonsson (Umea University)</p>
+<p>Use of SPANK plugins to create a private temporary file system for
+  each job. This eliminates interference between jobs without the need
+  to obey the TMPDIR environment variable. The module uses the private
+  namespace and mount --bind features of Linux (a minimal sketch of
+  this approach appears below).</p>
+
+<h3>University of Warsaw Site Report</h3>
+<p>Marcin Stolarek (Interdisciplinary Centre for Mathematical and
+  Computational Modelling (ICM), University of Warsaw, Poland)</p>
+<ul>
+  <li>Our own SPANK plugins using the unshare system call to limit
+    Lustre availability for the job</li>
+  <li>SPANK plugin + prologue/epilogue preparing a separate /tmp
+    directory</li>
+  <li>Job submit plugin which checks if the job specification is "sane"</li>
+  <li>Our work on integration of Slurm with middleware in European and
+    Polish grid infrastructures.</li>
+</ul>
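A minimal SPANK plugin along the lines described in the two site reports above might look like the sketch below; the plugin name and per-job directory path are invented, and the real Umea and ICM modules differ (and add cleanup and configuration options).

/* Minimal sketch of a SPANK plugin that gives each task a private /tmp
 * through a mount namespace and a bind mount.  Illustrative only. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <slurm/spank.h>

SPANK_PLUGIN(private_tmp_sketch, 1);

int slurm_spank_task_init_privileged(spank_t sp, int ac, char **av)
{
	uint32_t jobid = 0;
	char dir[256];

	(void) ac;
	(void) av;
	if (!spank_remote(sp))		/* only act on the compute node */
		return ESPANK_SUCCESS;
	if (spank_get_item(sp, S_JOB_ID, &jobid) != ESPANK_SUCCESS)
		return -1;

	/* Per-job backing directory on node-local storage (invented path). */
	snprintf(dir, sizeof(dir), "/var/tmp/slurm_job_%u", jobid);
	(void) mkdir(dir, 0700);

	/* Give the task its own mount namespace so the bind mount is
	 * invisible to other jobs and to the rest of the node, then make
	 * the per-job directory appear as /tmp. */
	if ((unshare(CLONE_NEWNS) < 0) ||
	    (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) ||
	    (mount(dir, "/tmp", NULL, MS_BIND, NULL) < 0)) {
		slurm_error("private_tmp_sketch: setup failed: %m");
		return -1;
	}
	return ESPANK_SUCCESS;
}

Such a plugin would be built against Slurm's SPANK headers and enabled through plugstack.conf.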
+
+<h3>iVEC Site Report</h3>
+<p>Andrew Elwell (iVEC)</p>
+<p>iVEC (an unincorporated joint venture between CSIRO, Curtin
+  University, Edith Cowan University, Murdoch University and the
+  University of Western Australia, supported by the Western
+  Australian Government) provides supercomputing facilities and
+  expertise to the research, education and industrial communities. Its
+  new (2013) purpose-built computing centre (the Pawsey Centre) houses
+  several Cray XC30 systems as well as a 6TB SGI UV2000, all connected
+  via InfiniBand to multi-petabyte disk storage systems.</p>
+<p>Although initially deployed with PBS Pro, senior management
+  indicated that moving to SLURM as a unified centre-wide scheduler
+  would be a good idea. This site report describes the issues faced by
+  an operations team new to SLURM and the configuration choices that
+  were made within the site.</p>
+<p>Pawsey infrastructure runs with a single slurmdbd instance on KVM,
+  with five different clusters using this as the accounting
+  repository. The clusters are:</p>
+<ul>
+  <li>Magnus, a Cray XC30 with 208 nodes, 2 external login nodes and 2
+    data mover nodes.</li>
+  <li>Galaxy, a Cray XC30 with 472 nodes, 2 external login nodes, 2 data
+    mover nodes and 16 'ingest' nodes</li>
+  <li>Chaos, a small test and development XC30 but without any external
+    nodes</li>
+  <li>Zythos, the SGI UV2000 with 4 GPU cards</li>
+  <li>Pawsey, used as a generic cluster to support 'copyq'
+    partitions.</li>
+</ul>
+<p>Because of the interaction between SLURM and ALPS/BASIL (the Cray
+  node allocation system) the Cray-aware Slurm binaries were compiled
+  separately from the rest of the site (which uses a mixture of SLES and
+  CentOS), with patched 2.6.6 and 2.6.9 versions being deployed. Linux cgroups
+  were used to control user access within shared nodes.</p>
+<p>The report also covers some of the issues the users faced when
+  migrating from PBS Pro, and the quirks associated with running on
+  external login nodes with interactive jobs. Finally, it describes
+  some of the user-facing reporting that is still under development.</p>
+
+<h3>CEA Site Report</h3>
+<p>Matthieu Hautreux (CEA)</p>
+<p>CEA Site Report</p>
+
+<h3>CSCS Site Report</h3>
+<p>Benini Massimo (CSCS)</p>
+<p>CSCS Site Report</p>
+
+<h3>Aalto University Site Report</h3>
+<p>Janne Blomqvist, Ivan Degtyarenko, Mikko Hakala (Aalto
+  University)</p>
+<p>We will present the computational science done at Aalto University,
+  and the HPC infrastructure supporting this. Our cluster currently
+  has around 550 compute nodes, with a mix of different hardware
+  generations acquired at different points in time. The cluster is
+  part of the Finnish Grid Initiative (FGI), a consortium of
+  universities and the national supercomputing center CSC - IT Center
+  for Science, where FGI clusters are accessible to outside users via
+  grid middleware. FGI also has a common base software stack. The
+  funding of the Aalto HPC infrastructure is through a stakeholder
+  model, where University departments using the cluster provide
+  funding and manpower to run it. Currently there are three major
+  departments that provide the core manpower and are responsible for
+  the majority of the usage, but the cluster is also open to other
+  users in the University without funding/manpower requirements as
+  long as use remains moderate.</p>
+<p>The funding model of the cluster results in pressure to show that
+  resource usage is fair among the different departments, and to
+  improve this we developed the ticket-based fairshare algorithm that
+  has been included in upstream SLURM as of version 2.5 (originally
+  called priority/multifactor2).
We will present the ticket-based + algorithm, and show how it achieves fairness in an account + hierarchy.</p> +<p>We have also developed a wrapper for slurm user commands that some + of our users have found easier to use than the "raw" slurm commands + when investigating the state of the cluster. The wrapper is purely + for read-only commands, so it is always safe to use.</p> + +<h3>George Washington University</h3> +<p>Tim Wickberg (George Washington University)</p> +<p>In particular, I would expect to elaborate and discuss usage of the + fairshare scheduling system, including how it maps to our (slightly + convoluted) internal funding model. Additional discussion may + include our expected use / abuse of the generic resource scheduling + system to dynamically allocate disk space on our test high-IOPS SSD + scratch system.</p> + +<p style="text-align:center;">Last modified 9 July 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index a0d95b54d..f4ae90153 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -693,6 +693,9 @@ value will equal the original \fB\-\-mem\-per\-cpu\fR value specified by the user. This parameter would generally be used if individual processors are allocated to jobs (\fBSelectType=select/cons_res\fR). +If resources are allocated by the core, socket or whole nodes; the number +of CPUs allocated to a job may be higher than the task count and the value +of \fB\-\-mem\-per\-cpu\fR should be adjusted accordingly. Also see \fB\-\-mem\fR. \fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive. @@ -830,7 +833,7 @@ PerfCnts. These nodes are still available for other jobs not using NPC. .br .br In all cases the job allocation request \fBmust specify the ---exclusive option\fR. Otherwise the request will be denied. +\-\-exclusive option\fR. Otherwise the request will be denied. .br .br diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index fa7651c60..ebe6eb953 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -780,6 +780,9 @@ value will equal the original \fB\-\-mem\-per\-cpu\fR value specified by the user. This parameter would generally be used if individual processors are allocated to jobs (\fBSelectType=select/cons_res\fR). +If resources are allocated by the core, socket or whole nodes; the number +of CPUs allocated to a job may be higher than the task count and the value +of \fB\-\-mem\-per\-cpu\fR should be adjusted accordingly. Also see \fB\-\-mem\fR. \fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive. @@ -917,7 +920,7 @@ PerfCnts. These nodes are still available for other jobs not using NPC. .br .br In all cases the job allocation request \fBmust specify the ---exclusive option\fR. Otherwise the request will be denied. +\-\-exclusive option\fR. Otherwise the request will be denied. .br .br @@ -1274,8 +1277,9 @@ be sent up to 60 seconds earlier than specified. By default, no signal is sent before the job's end time. If a \fIsig_num\fR is specified without any \fIsig_time\fR, the default time will be 60 seconds. -Use the "B:" option to signal the batch shell. -By default all job steps will be signalled, but not the batch shell itself. +Use the "B:" option to signal only the batch shell, none of the other +processes will be signaled. By default all job steps will be signalled, +but not the batch shell itself. 
.TP \fB\-\-sockets\-per\-node\fR=<\fIsockets\fR> diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 062a06c2b..3e12b1add 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -656,8 +656,9 @@ Permit the job's geometry to be rotated. Possible values are "YES" and "NO". .TP \fIShared\fP=<yes|no> -Set the job's ability to share nodes with other jobs. Possible values are -"YES" and "NO". Only the Slurm administrator or root can increase job's priority. +Set the job's ability to share nodes with other jobs. +Possible values are "YES" and "NO". +This option can only be changed for pending jobs. .TP \fIStartTime\fP=<time_spec> Set the job's earliest initiation time. diff --git a/doc/man/man1/sinfo.1 b/doc/man/man1/sinfo.1 index 97cc8b68a..60db3b51e 100644 --- a/doc/man/man1/sinfo.1 +++ b/doc/man/man1/sinfo.1 @@ -92,7 +92,7 @@ when running with various options are "%9P %.5a %.10l %.16F %N" .TP .I "\-\-long" -"%9P %.5a %.10l %.10s %.4r %.5h %.10g %.6D %.11T %N" +"%9P %.5a %.10l %.10s %.4r %.8h %.10g %.6D %.11T %N" .TP .I "\-\-Node" "%N %.6D %.9P %6t" diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index d198bf4de..c664cfbc7 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -827,6 +827,9 @@ value will equal the original \fB\-\-mem\-per\-cpu\fR value specified by the user. This parameter would generally be used if individual processors are allocated to jobs (\fBSelectType=select/cons_res\fR). +If resources are allocated by the core, socket or whole nodes; the number +of CPUs allocated to a job may be higher than the task count and the value +of \fB\-\-mem\-per\-cpu\fR should be adjusted accordingly. Specifying a memory limit of zero for a job step will restrict the job step to the amount of memory allocated to the job, but not remove any of the job's memory allocation from being available to other job steps. @@ -1019,7 +1022,7 @@ PerfCnts. These nodes are still available for other jobs not using NPC. .br .br In all cases the job or step allocation request \fBmust specify the ---exclusive option\fR. Otherwise the request will be denied. +\-\-exclusive option\fR. Otherwise the request will be denied. .br .br diff --git a/doc/man/man5/nonstop.conf.5 b/doc/man/man5/nonstop.conf.5 index 8d4a0d243..a9b36c467 100644 --- a/doc/man/man5/nonstop.conf.5 +++ b/doc/man/man5/nonstop.conf.5 @@ -71,7 +71,7 @@ secure replacement resources up to the number of minutes specified by \fBTimeLimitDelay\fR. This option will only take effect if no hot spare resouces are available at the time replacement resources are requested. -This time limit extention is in addition to the value calculated using the +This time limit extension is in addition to the value calculated using the \fBTimeLimitExtend\fR. The default value is zero (no time limit extension). The value may not exceed 65533 seconds. diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index e22c2540d..5a53a99c3 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -892,7 +892,7 @@ Arbitrary parameters for the job account gather plugin Acceptable values at present include: .RS .TP 20 -fB\NoShared\fR +\fBNoShared\fR Exclude shared memory from accounting. .RE @@ -2170,6 +2170,7 @@ The node's \fBBoards\fR, \fBSockets\fR, \fBCoresPerSocket\fR and \fBThreadsPerCore\fR may optionally be configured and result in job allocations which have improved locality; however doing so will prevent more than one job being from being allocated on each core. 
+.RE .TP \fBCR_CPU_Memory\fR @@ -3237,6 +3238,8 @@ The front end configuration specifies the following information: \fBAllowGroups\fR Comma separated list of group names which may execute jobs on this front end node. By default, all groups may use this front end node. +If \fBat least\fR one group associated with the user attempting to execute the +job is in AllowGroups, he will be permitted to use this front end node. May not be used with the \fBDenyGroups\fR option. .TP @@ -3373,7 +3376,7 @@ Also refer to DenyAccounts. .TP \fBAllowGroups\fR Comma separated list of group names which may execute jobs in the partition. -If at least one group associated with the user attempting to execute the +If \fBat least\fR one group associated with the user attempting to execute the job is in AllowGroups, he will be permitted to use this partition. Jobs executed as user root can use any partition without regard to the value of AllowGroups. @@ -3393,7 +3396,7 @@ described above. .TP \fBAllowQos\fR -Comma seperated list of Qos which may execute jobs in the partition. +Comma separated list of Qos which may execute jobs in the partition. Jobs executed as user root can use any partition without regard to the value of AllowQos. The default value is "ALL". @@ -3441,14 +3444,14 @@ not be stored, just collected). .TP \fBDenyAccount\fR -Comma seperated list of accounts which may not execute jobs in the partition. +Comma separated list of accounts which may not execute jobs in the partition. By default, no accounts are denied access \fBNOTE:\fR If AllowAccounts is used then DenyAccounts will not be enforced. Also refer to AllowAccount. .TP \fBDenyQos\fR -Comma seperated list of Qos which may not execute jobs in the partition. +Comma separated list of Qos which may not execute jobs in the partition. By default, no QOS are denied access \fBNOTE:\fR If AllowQos is used then DenyQos will not be enforced. Also refer AllowQos. diff --git a/src/api/job_info.c b/src/api/job_info.c index 2329f8fbe..e739f6377 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -80,12 +80,13 @@ static uint32_t _threads_per_core(char *host) { uint32_t i, threads = 1; - if (!job_node_ptr) + if (!job_node_ptr || !host) return threads; slurm_mutex_lock(&job_node_info_lock); for (i = 0; i < job_node_ptr->record_count; i++) { - if (!strcmp(host, job_node_ptr->node_array[i].name)) { + if (job_node_ptr->node_array[i].name && + !strcmp(host, job_node_ptr->node_array[i].name)) { threads = job_node_ptr->node_array[i].threads; break; } @@ -95,12 +96,14 @@ static uint32_t _threads_per_core(char *host) } static void _free_node_info(void) { +#if 0 slurm_mutex_lock(&job_node_info_lock); if (job_node_ptr) { slurm_free_node_info_msg(job_node_ptr); job_node_ptr = NULL; } slurm_mutex_unlock(&job_node_info_lock); +#endif } /* Perform file name substitutions diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index dc87c5e7f..cdb8879d2 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -1,5 +1,5 @@ /*****************************************************************************\ - * accounting_storage_slurmdbd.c - accounting interface to slurmdbd. + * assoc_mgr.c - File to keep track of associations/QOS used by the daemons ***************************************************************************** * Copyright (C) 2004-2007 The Regents of the University of California. * Copyright (C) 2008-2009 Lawrence Livermore National Security. 
@@ -3630,6 +3630,8 @@ extern void assoc_mgr_remove_qos_usage(slurmdb_qos_rec_t *qos) qos->usage->usage_raw = 0; qos->usage->grp_used_wall = 0; + if (!qos->usage->grp_used_cpus) + qos->usage->grp_used_cpu_run_secs = 0; } extern int dump_assoc_mgr_state(char *state_save_location) diff --git a/src/common/env.h b/src/common/env.h index d61708064..383a68df6 100644 --- a/src/common/env.h +++ b/src/common/env.h @@ -73,8 +73,8 @@ typedef struct env_options { pid_t task_pid; char *sgtids; /* global ranks array of integers */ uint16_t pty_port; /* used to communicate window size changes */ - uint8_t ws_col; /* window size, columns */ - uint8_t ws_row; /* window size, row count */ + uint16_t ws_col; /* window size, columns */ + uint16_t ws_row; /* window size, row count */ char *ckpt_dir; /* --ckpt-dir= */ uint16_t restart_cnt; /* count of job restarts */ uint16_t batch_flag; /* 1 if batch: queued job with script */ diff --git a/src/common/gres.c b/src/common/gres.c index a790e991a..6b3a5f32b 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -148,7 +148,7 @@ static uint32_t _get_gres_cnt(char *orig_config, char *gres_name, static uint32_t _get_tot_gres_cnt(uint32_t plugin_id, uint32_t *set_cnt); static int _gres_find_id(void *x, void *key); static void _gres_job_list_delete(void *list_element); -extern int _job_alloc(void *job_gres_data, void *node_gres_data, +static int _job_alloc(void *job_gres_data, void *node_gres_data, int node_cnt, int node_offset, uint32_t cpu_cnt, char *gres_name, uint32_t job_id, char *node_name, bitstr_t *core_bitmap); @@ -166,7 +166,7 @@ static void * _job_state_dup(void *gres_data); static void * _job_state_dup2(void *gres_data, int node_index); static int _job_state_validate(char *config, void **gres_data, slurm_gres_context_t *gres_name); -extern uint32_t _job_test(void *job_gres_data, void *node_gres_data, +static uint32_t _job_test(void *job_gres_data, void *node_gres_data, bool use_total_gres, bitstr_t *cpu_bitmap, int cpu_start_bit, int cpu_end_bit, bool *topo_set, uint32_t job_id, char *node_name, char *gres_name); @@ -1423,12 +1423,12 @@ extern int _node_config_validate(char *node_name, char *orig_config, gres_data->topo_gres_cnt_avail = xrealloc(gres_data->topo_gres_cnt_avail, set_cnt * sizeof(uint32_t)); - for (i=0; i<gres_data->topo_cnt; i++) + for (i = 0; i < gres_data->topo_cnt; i++) FREE_NULL_BITMAP(gres_data->topo_gres_bitmap[i]); gres_data->topo_gres_bitmap = xrealloc(gres_data->topo_gres_bitmap, set_cnt * sizeof(bitstr_t *)); - for (i=0; i<gres_data->topo_cnt; i++) + for (i = 0; i < gres_data->topo_cnt; i++) FREE_NULL_BITMAP(gres_data->topo_cpus_bitmap[i]); gres_data->topo_cpus_bitmap = xrealloc(gres_data->topo_cpus_bitmap, @@ -2584,7 +2584,7 @@ static void _job_core_filter(void *job_gres_data, void *node_gres_data, FREE_NULL_BITMAP(avail_cpu_bitmap); } -extern uint32_t _job_test(void *job_gres_data, void *node_gres_data, +static uint32_t _job_test(void *job_gres_data, void *node_gres_data, bool use_total_gres, bitstr_t *cpu_bitmap, int cpu_start_bit, int cpu_end_bit, bool *topo_set, uint32_t job_id, char *node_name, char *gres_name) @@ -2904,18 +2904,55 @@ extern uint32_t gres_plugin_job_test(List job_gres_list, List node_gres_list, static bool _cores_on_gres(bitstr_t *core_bitmap, gres_node_state_t *node_gres_ptr, int gres_inx) { + int i; + if ((core_bitmap == NULL) || (node_gres_ptr->topo_cnt == 0)) return true; - if (bit_size(node_gres_ptr->topo_cpus_bitmap[gres_inx]) != - bit_size(core_bitmap)) - return false; - if 
(bit_overlap(node_gres_ptr->topo_cpus_bitmap[gres_inx], core_bitmap)) - return true; + for (i = 0; i < node_gres_ptr->topo_cnt; i++) { + if (bit_size(node_gres_ptr->topo_gres_bitmap[i]) < gres_inx) + continue; + if (!bit_test(node_gres_ptr->topo_gres_bitmap[i], gres_inx)) + continue; + if (bit_size(node_gres_ptr->topo_cpus_bitmap[i]) != + bit_size(core_bitmap)) + break; + if (bit_overlap(node_gres_ptr->topo_cpus_bitmap[i],core_bitmap)) + return true; + } return false; } -extern int _job_alloc(void *job_gres_data, void *node_gres_data, +/* Clear any vestigial job gres state. This may be needed on job requeue. */ +extern void gres_plugin_job_clear(List job_gres_list) +{ + int i; + ListIterator job_gres_iter; + gres_state_t *job_gres_ptr; + gres_job_state_t *job_state_ptr; + + if (job_gres_list == NULL) + return; + + (void) gres_plugin_init(); + job_gres_iter = list_iterator_create(job_gres_list); + while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) { + if (!job_gres_ptr) + continue; + job_state_ptr = (gres_job_state_t *) job_gres_ptr->gres_data; + for (i = 0; i < job_state_ptr->node_cnt; i++) { + FREE_NULL_BITMAP(job_state_ptr->gres_bit_alloc[i]); + FREE_NULL_BITMAP(job_state_ptr->gres_bit_step_alloc[i]); + } + xfree(job_state_ptr->gres_bit_alloc); + xfree(job_state_ptr->gres_bit_step_alloc); + xfree(job_state_ptr->gres_cnt_step_alloc); + } + job_state_ptr->node_cnt = 0; + list_iterator_destroy(job_gres_iter); +} + +static int _job_alloc(void *job_gres_data, void *node_gres_data, int node_cnt, int node_offset, uint32_t cpu_cnt, char *gres_name, uint32_t job_id, char *node_name, bitstr_t *core_bitmap) diff --git a/src/common/gres.h b/src/common/gres.h index e2d20de11..9d364627e 100644 --- a/src/common/gres.h +++ b/src/common/gres.h @@ -449,6 +449,9 @@ extern int gres_plugin_job_alloc(List job_gres_list, List node_gres_list, uint32_t cpu_cnt, uint32_t job_id, char *node_name, bitstr_t *core_bitmap); +/* Clear any vestigial job gres state. This may be needed on job requeue. 
*/ +extern void gres_plugin_job_clear(List job_gres_list); + /* * Deallocate resource from a job and update node and job gres information * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 1d616f752..e543f5bdf 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1246,6 +1246,8 @@ extern char *job_state_string(uint16_t inx) return "RESIZING"; if (inx & JOB_SPECIAL_EXIT) return "SPECIAL_EXIT"; + if (inx & JOB_REQUEUE) + return "REQUEUED"; /* Process JOB_STATE_BASE */ switch (inx & JOB_STATE_BASE) { @@ -1285,6 +1287,8 @@ extern char *job_state_string_compact(uint16_t inx) return "RS"; if (inx & JOB_SPECIAL_EXIT) return "SE"; + if (inx & JOB_REQUEUE) + return "RQ"; /* Process JOB_STATE_BASE */ switch (inx & JOB_STATE_BASE) { diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index cf9450dd1..64d1ef8fd 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -103,6 +103,8 @@ (IS_JOB_FINISHED(_X) && ((_X->job_state & JOB_COMPLETING) == 0)) #define IS_JOB_RESIZING(_X) \ (_X->job_state & JOB_RESIZING) +#define IS_JOB_REQUEUED(_X) \ + (_X->job_state & JOB_REQUEUE) /* Defined node states */ #define IS_NODE_UNKNOWN(_X) \ diff --git a/src/common/slurm_step_layout.c b/src/common/slurm_step_layout.c index fd14e9a8f..4d36390a8 100644 --- a/src/common/slurm_step_layout.c +++ b/src/common/slurm_step_layout.c @@ -437,6 +437,18 @@ static int _init_task_layout(slurm_step_layout_t *step_layout, * cpus_per_task=3) */ cpus[i] = 1; } + + if ((plane_size != (uint16_t)NO_VAL) + && (task_dist != SLURM_DIST_PLANE)) { + /* plane_size when dist != plane is used to + convey ntasks_per_node. Adjust the number + of cpus to reflect that. 
+ */ + uint16_t cpus_per_node = plane_size * cpus_per_task; + if (cpus[i] > cpus_per_node) + cpus[i] = cpus_per_node; + } + //info("got %d cpus", cpus[i]); if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) { /* move to next record */ @@ -568,7 +580,7 @@ static int _task_layout_block(slurm_step_layout_t *step_layout, uint16_t *cpus) } } - /* Pass 3: Spread remainign tasks across all nodes */ + /* Pass 3: Spread remaining tasks across all nodes */ while (task_id < step_layout->task_cnt) { for (i = 0; ((i < step_layout->node_cnt) && (task_id < step_layout->task_cnt)); i++) { diff --git a/src/common/xcpuinfo.c b/src/common/xcpuinfo.c index a10615426..1e145242d 100644 --- a/src/common/xcpuinfo.c +++ b/src/common/xcpuinfo.c @@ -499,15 +499,15 @@ get_cpuinfo(uint16_t *p_cpus, uint16_t *p_boards, while (fgets(buffer, sizeof(buffer), cpu_info_file) != NULL) { uint32_t val; if (_chk_cpuinfo_uint32(buffer, "processor", &val)) { + curcpu = numcpu; numcpu++; - curcpu = val; - if (val >= numproc) { /* out of bounds, ignore */ - debug("cpuid is %u (> %d), ignored", - val, numproc); + if (curcpu >= numproc) { + info("processor limit reached (%u >= %d)", + curcpu, numproc); continue; } - cpuinfo[val].seen = 1; - cpuinfo[val].cpuid = val; + cpuinfo[curcpu].seen = 1; + cpuinfo[curcpu].cpuid = val; maxcpuid = MAX(maxcpuid, val); mincpuid = MIN(mincpuid, val); } else if (_chk_cpuinfo_uint32(buffer, "physical id", &val)) { @@ -629,7 +629,6 @@ get_cpuinfo(uint16_t *p_cpus, uint16_t *p_boards, #if DEBUG_DETAIL /*** Display raw data ***/ - debug3(""); debug3("numcpu: %u", numcpu); debug3("numphys: %u", numphys); debug3("numcores: %u", numcores); @@ -641,19 +640,18 @@ get_cpuinfo(uint16_t *p_cpus, uint16_t *p_boards, debug3("physid: %u->%u", minphysid, maxphysid); debug3("coreid: %u->%u", mincoreid, maxcoreid); - for (i = 0; i <= maxcpuid; i++) { + for (i = 0; i < numproc; i++) { debug3("CPU %d:", i); + debug3(" cpuid: %u", cpuinfo[i].cpuid); debug3(" seen: %u", cpuinfo[i].seen); debug3(" physid: %u", cpuinfo[i].physid); debug3(" physcnt: %u", cpuinfo[i].physcnt); debug3(" siblings: %u", cpuinfo[i].siblings); debug3(" cores: %u", cpuinfo[i].cores); debug3(" coreid: %u", cpuinfo[i].coreid); - debug3(" corecnt: %u", cpuinfo[i].corecnt); - debug3(""); + debug3(" corecnt: %u\n", cpuinfo[i].corecnt); } - debug3(""); debug3("Sockets: %u", sockets); debug3("Cores per socket: %u", cores); debug3("Threads per core: %u", threads); diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index 09b76275f..afec3d385 100644 --- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -687,7 +687,8 @@ static int _as_mysql_acct_check_tables(mysql_conn_t *mysql_conn) if (mysql_db_create_table(mysql_conn, acct_coord_table, acct_coord_table_fields, - ", primary key (acct(20), user(20)))") + ", primary key (acct(20), user(20)), " + "key user (user(20)))") == SLURM_ERROR) return SLURM_ERROR; @@ -1122,8 +1123,9 @@ extern int create_cluster_tables(mysql_conn_t *mysql_conn, char *cluster_name) if (mysql_db_create_table(mysql_conn, table_name, assoc_table_fields, ", primary key (id_assoc), " - " unique index (user(20), acct(20), " - "`partition`(20)))") + "unique index (user(20), acct(20), " + "`partition`(20)), " + "key lft (lft))") == SLURM_ERROR) return SLURM_ERROR; @@ -1219,6 +1221,9 @@ extern int create_cluster_tables(mysql_conn_t *mysql_conn, char *cluster_name) "unique 
index (id_job, " "id_assoc, time_submit), " "key rollup (time_eligible, time_end), " + "key wckey (id_wckey), " + "key qos (id_qos), " + "key association (id_assoc), " "key sacct_def (id_user, time_start, " "time_end))") == SLURM_ERROR) diff --git a/src/plugins/accounting_storage/mysql/as_mysql_job.c b/src/plugins/accounting_storage/mysql/as_mysql_job.c index 803b39c30..0c559879b 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_job.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_job.c @@ -176,6 +176,9 @@ static uint32_t _get_wckeyid(mysql_conn_t *mysql_conn, char **name, NULL) != SLURM_SUCCESS) { List wckey_list = NULL; slurmdb_wckey_rec_t *wckey_ptr = NULL; + /* we have already checked to make + sure this was the slurm user before + calling this */ wckey_list = list_create(slurmdb_destroy_wckey_rec); @@ -187,9 +190,30 @@ static uint32_t _get_wckeyid(mysql_conn_t *mysql_conn, char **name, /* info("adding wckey '%s' '%s' '%s'", */ /* wckey_ptr->name, wckey_ptr->user, */ /* wckey_ptr->cluster); */ - /* we have already checked to make - sure this was the slurm user before - calling this */ + + if (*name[0] == '*') { + /* make sure the non * wckey has been added */ + wckey_rec.name = (*name)+1; + if (assoc_mgr_fill_in_wckey( + mysql_conn, &wckey_rec, + ACCOUNTING_ENFORCE_WCKEYS, + NULL) != SLURM_SUCCESS) { + wckey_ptr = xmalloc( + sizeof(slurmdb_wckey_rec_t)); + wckey_ptr->name = + xstrdup(wckey_rec.name); + wckey_ptr->user = xstrdup(user); + wckey_ptr->cluster = xstrdup(cluster); + list_prepend(wckey_list, wckey_ptr); + /* info("adding wckey '%s' '%s' " */ + /* "'%s'", */ + /* wckey_ptr->name, */ + /* wckey_ptr->user, */ + /* wckey_ptr->cluster); */ + } + wckey_rec.name = (*name); + } + if (as_mysql_add_wckeys(mysql_conn, slurm_get_slurm_user_id(), wckey_list) @@ -734,7 +758,11 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn, return SLURM_SUCCESS; } end_time = job_ptr->end_time; - job_state = job_ptr->job_state & JOB_STATE_BASE; + + if (IS_JOB_REQUEUED(job_ptr)) + job_state = JOB_REQUEUE; + else + job_state = job_ptr->job_state & JOB_STATE_BASE; } slurm_mutex_lock(&rollup_lock); diff --git a/src/plugins/accounting_storage/mysql/as_mysql_qos.c b/src/plugins/accounting_storage/mysql/as_mysql_qos.c index cca982c72..7a8f9b8ea 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_qos.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_qos.c @@ -424,10 +424,13 @@ static int _setup_qos_limits(slurmdb_qos_rec_t *qos, if (adding_straight) { xstrfmtcat(*vals, ", \'%s,\'", preempt_val); xstrfmtcat(*extra, ", preempt=\'%s,\'", preempt_val); - } else { + } else if (preempt_val[0]) { xstrfmtcat(*vals, ", %s", preempt_val); xstrfmtcat(*extra, ", preempt=if(%s=',', '', %s)", preempt_val, preempt_val); + } else { + xstrcat(*vals, ", ''"); + xstrcat(*extra, ", preempt=''"); } xfree(preempt_val); } diff --git a/src/plugins/accounting_storage/mysql/as_mysql_wckey.c b/src/plugins/accounting_storage/mysql/as_mysql_wckey.c index c10aa629b..458f26fd4 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_wckey.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_wckey.c @@ -133,9 +133,11 @@ static int _make_sure_users_have_default( MYSQL_RES *result = NULL; MYSQL_ROW row; char *wckey = NULL; + /* only look at non * and non deleted ones */ query = xstrdup_printf( "select distinct is_def, wckey_name from " - "\"%s_%s\" where user='%s' FOR UPDATE;", + "\"%s_%s\" where user='%s' and wckey_name " + "not like '*%%' and deleted=0 FOR UPDATE;", cluster, wckey_table, user); 
debug4("%d(%s:%d) query\n%s", mysql_conn->conn, THIS_FILE, __LINE__, query); @@ -503,7 +505,7 @@ extern int as_mysql_add_wckeys(mysql_conn_t *mysql_conn, uint32_t uid, while ((object = list_next(itr))) { if (!object->cluster || !object->cluster[0] || !object->user || !object->user[0] - || !object->name || !object->name[0]) { + || !object->name) { error("We need a wckey name, cluster, " "and user to add."); rc = SLURM_ERROR; diff --git a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c index 029229a8c..49a2681b2 100644 --- a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c +++ b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c @@ -2273,7 +2273,10 @@ extern int jobacct_storage_p_job_complete(void *db_conn, req.job_state = JOB_RESIZING; } else { req.end_time = job_ptr->end_time; - req.job_state = job_ptr->job_state & JOB_STATE_BASE; + if (IS_JOB_REQUEUED(job_ptr)) + req.job_state = JOB_REQUEUE; + else + req.job_state = job_ptr->job_state & JOB_STATE_BASE; } req.req_uid = job_ptr->requid; req.nodes = job_ptr->nodes; diff --git a/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c b/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c index 91ed60d36..032da0fc4 100644 --- a/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c +++ b/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c @@ -420,7 +420,7 @@ extern int acct_gather_energy_p_update_node_energy(void) xassert(_run_in_daemon()); - if (local_energy->current_watts == NO_VAL) + if (!local_energy || local_energy->current_watts == NO_VAL) return rc; _get_joules_task(local_energy); diff --git a/src/plugins/acct_gather_profile/hdf5/hdf5_api.c b/src/plugins/acct_gather_profile/hdf5/hdf5_api.c index 25f0580fa..f9beb1385 100644 --- a/src/plugins/acct_gather_profile/hdf5/hdf5_api.c +++ b/src/plugins/acct_gather_profile/hdf5/hdf5_api.c @@ -1703,6 +1703,8 @@ extern void put_string_attribute(hid_t parent, char *name, char *value) hid_t attr, space_attr, typ_attr; hsize_t dim_attr[1] = {1}; // Single dimension array of values + if (!value) + value = ""; typ_attr = H5Tcopy(H5T_C_S1); if (typ_attr < 0) { debug3("PROFILE: failed to copy type for attribute %s", name); diff --git a/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c b/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c index 838b752d8..181dfa219 100644 --- a/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c +++ b/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c @@ -299,7 +299,7 @@ static int _set_options(const int argc, char **argv) _init_opts(); - while ((cc = getopt_long(argc, argv, "d:Ehi:Ij:l:N:o:p:s:S:uUvV", + while ((cc = getopt_long(argc, argv, "d:Ehi:Ij:l:N:o:p:s:S:u:UvV", long_options, &option_index)) != EOF) { switch (cc) { case 'd': @@ -351,11 +351,11 @@ static int _set_options(const int argc, char **argv) params.keepfiles = 1; break; case 'u': - u = atoi(optarg); if (uid_from_string(optarg, &u) < 0) { error("No such user --uid=\"%s\"", optarg); return -1; } + params.user = uid_to_string(u); break; case 'U': _help_msg(); diff --git a/src/plugins/proctrack/cgroup/proctrack_cgroup.c b/src/plugins/proctrack/cgroup/proctrack_cgroup.c index 846b6f9d4..ded28f82d 100644 --- a/src/plugins/proctrack/cgroup/proctrack_cgroup.c +++ b/src/plugins/proctrack/cgroup/proctrack_cgroup.c @@ -285,6 +285,8 @@ int _slurm_cgroup_destroy(void) if (jobstep_cgroup_path[0] != '\0') { if (xcgroup_delete(&step_freezer_cg) 
!= XCGROUP_SUCCESS) { + error("_slurm_cgroup_destroy: problem deleting step " + "cgroup path %s: %m", step_freezer_cg.path); xcgroup_unlock(&freezer_cg); return SLURM_ERROR; } @@ -584,8 +586,10 @@ extern int proctrack_p_wait(uint64_t cont_id) if (delay < 120) { delay *= 2; } else { - error("Unable to destroy container %"PRIu64"", - cont_id); + error("%s: Unable to destroy container %"PRIu64" " + "in cgroup plugin, giving up after %d sec", + __func__, cont_id, delay); + break; } } diff --git a/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c b/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c index 7ea146687..a21af1f59 100644 --- a/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c +++ b/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c @@ -153,25 +153,12 @@ extern bool proctrack_p_has_pid(uint64_t cont_id, pid_t pid) extern int proctrack_p_wait(uint64_t cont_id) { - int delay = 1; - if (cont_id == 0 || cont_id == 1) { errno = EINVAL; return SLURM_ERROR; } - /* Spin until the container is successfully destroyed */ - while (proctrack_p_destroy(cont_id) != SLURM_SUCCESS) { - proctrack_p_signal(cont_id, SIGKILL); - sleep(delay); - if (delay < 120) { - delay *= 2; - } else { - error("Unable to destroy container %"PRIu64"", cont_id); - } - } - - return SLURM_SUCCESS; + return proctrack_p_destroy(cont_id); } extern int diff --git a/src/plugins/proctrack/pgid/proctrack_pgid.c b/src/plugins/proctrack/pgid/proctrack_pgid.c index cf4c68dd2..28270be69 100644 --- a/src/plugins/proctrack/pgid/proctrack_pgid.c +++ b/src/plugins/proctrack/pgid/proctrack_pgid.c @@ -185,7 +185,10 @@ proctrack_p_wait(uint64_t cont_id) if (delay < 120) { delay *= 2; } else { - error("Unable to destroy container %"PRIu64"", cont_id); + error("%s: Unable to destroy container %"PRIu64" " + "in pgid plugin, giving up after %d sec", + __func__, cont_id, delay); + break; } } diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 092e1d8e6..ababedfe3 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -611,8 +611,7 @@ static int _attempt_backfill(void) uint32_t min_nodes, max_nodes, req_nodes; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; bitstr_t *exc_core_bitmap = NULL, *non_cg_bitmap = NULL; - bitstr_t *previous_bitmap = NULL; - time_t now, sched_start, later_start, start_res, resv_end; + time_t now, sched_start, later_start, start_res, resv_end, window_end; node_space_map_t *node_space; struct timeval bf_time1, bf_time2; int sched_timeout = 2, yield_sleep = 1; @@ -684,7 +683,8 @@ static int _attempt_backfill(void) node_space = xmalloc(sizeof(node_space_map_t) * (max_backfill_job_cnt * 2 + 1)); node_space[0].begin_time = sched_start; - node_space[0].end_time = sched_start + backfill_window; + window_end = sched_start + backfill_window; + node_space[0].end_time = window_end; node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); node_space[0].next = 0; node_space_recs = 1; @@ -782,8 +782,11 @@ static int _attempt_backfill(void) } job_ptr->part_ptr = part_ptr; - if (debug_flags & DEBUG_FLAG_BACKFILL) - info("backfill test for job %u", job_ptr->job_id); + if (debug_flags & DEBUG_FLAG_BACKFILL) { + info("backfill test for JobID=%u Prio=%u Partition=%s", + job_ptr->job_id, job_ptr->priority, + job_ptr->part_ptr->name); + } if (max_backfill_job_per_part) { bool skip_job = false; @@ -797,13 +800,13 @@ static int _attempt_backfill(void) } if (skip_job) { if (debug_flags & DEBUG_FLAG_BACKFILL) - debug("backfill: have already " 
- "checked %u jobs for " - "partition %s; skipping " - "job %u", - max_backfill_job_per_part, - job_ptr->part_ptr->name, - job_ptr->job_id); + info("backfill: have already " + "checked %u jobs for " + "partition %s; skipping " + "job %u", + max_backfill_job_per_part, + job_ptr->part_ptr->name, + job_ptr->job_id); continue; } } @@ -838,27 +841,34 @@ static int _attempt_backfill(void) if (njobs[j] >= max_backfill_job_per_user) { /* skip job */ if (debug_flags & DEBUG_FLAG_BACKFILL) - debug("backfill: have already " - "checked %u jobs for " - "user %u; skipping " - "job %u", - max_backfill_job_per_user, - job_ptr->user_id, - job_ptr->job_id); + info("backfill: have already " + "checked %u jobs for " + "user %u; skipping " + "job %u", + max_backfill_job_per_user, + job_ptr->user_id, + job_ptr->job_id); continue; } } } if (((part_ptr->state_up & PARTITION_SCHED) == 0) || - (part_ptr->node_bitmap == NULL)) - continue; - if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root) + (part_ptr->node_bitmap == NULL) || + ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)) { + if (debug_flags & DEBUG_FLAG_BACKFILL) + info("backfill: partition %s not usable", + job_ptr->part_ptr->name); continue; + } if ((!job_independent(job_ptr, 0)) || - (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) + (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) { + if (debug_flags & DEBUG_FLAG_BACKFILL) + info("backfill: job %u not runable now", + job_ptr->job_id); continue; + } /* Determine minimum and maximum node counts */ min_nodes = MAX(job_ptr->details->min_nodes, @@ -874,7 +884,9 @@ static int _attempt_backfill(void) else req_nodes = min_nodes; if (min_nodes > max_nodes) { - /* job's min_nodes exceeds partition's max_nodes */ + if (debug_flags & DEBUG_FLAG_BACKFILL) + info("backfill: job %u node count too high", + job_ptr->job_id); continue; } @@ -902,7 +914,6 @@ static int _attempt_backfill(void) /* Determine impact of any resource reservations */ later_start = now; - FREE_NULL_BITMAP(previous_bitmap); TRY_LATER: if (slurmctld_config.shutdown_time) break; @@ -961,6 +972,9 @@ static int _attempt_backfill(void) j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, &exc_core_bitmap); if (j != SLURM_SUCCESS) { + if (debug_flags & DEBUG_FLAG_BACKFILL) + info("backfill: job %u reservation defer", + job_ptr->job_id); job_ptr->time_limit = orig_time_limit; continue; } @@ -987,7 +1001,7 @@ static int _attempt_backfill(void) if ((j = node_space[j].next) == 0) break; } - if ((resv_end++) && + if (resv_end && (++resv_end < window_end) && ((later_start == 0) || (resv_end < later_start))) { later_start = resv_end; } @@ -1008,22 +1022,18 @@ static int _attempt_backfill(void) ((job_ptr->details->req_node_bitmap) && (!bit_super_set(job_ptr->details->req_node_bitmap, avail_bitmap))) || - (job_req_node_filter(job_ptr, avail_bitmap)) || - (previous_bitmap && - bit_equal(previous_bitmap, avail_bitmap))) { + (job_req_node_filter(job_ptr, avail_bitmap))) { if (later_start) { job_ptr->start_time = 0; goto TRY_LATER; } + /* Job can not start until too far in the future */ job_ptr->time_limit = orig_time_limit; job_ptr->start_time = sched_start + backfill_window; continue; } - FREE_NULL_BITMAP(previous_bitmap); - previous_bitmap = bit_copy(avail_bitmap); - /* Identify nodes which are definitely off limits */ FREE_NULL_BITMAP(resv_bitmap); resv_bitmap = bit_copy(avail_bitmap); @@ -1056,6 +1066,8 @@ static int _attempt_backfill(void) } if (job_ptr->start_time <= now) { /* Can start now */ uint32_t save_time_limit = 
job_ptr->time_limit; + uint32_t hard_limit; + bool reset_time = false; int rc = _start_job(job_ptr, resv_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) { if (orig_time_limit == NO_VAL) { @@ -1067,28 +1079,31 @@ static int _attempt_backfill(void) job_ptr, orig_time_limit); job_ptr->time_limit = orig_time_limit; } - job_ptr->end_time = job_ptr->start_time + - (job_ptr->time_limit * 60); } else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { /* Set time limit as high as possible */ acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; - job_ptr->end_time = job_ptr->start_time + - (comp_time_limit * 60); - _reset_job_time_limit(job_ptr, now, - node_space); - time_limit = job_ptr->time_limit; + reset_time = true; } else if (orig_time_limit == NO_VAL) { acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; - job_ptr->end_time = job_ptr->start_time + - (job_ptr->time_limit * 60); } else { acct_policy_alter_job(job_ptr, orig_time_limit); job_ptr->time_limit = orig_time_limit; - job_ptr->end_time = job_ptr->start_time + - (job_ptr->time_limit * 60); + + } + if (job_ptr->time_limit == INFINITE) + hard_limit = 365 * 24 * 60; /* one year */ + else + hard_limit = job_ptr->time_limit; + job_ptr->end_time = job_ptr->start_time + + (hard_limit * 60); + if (reset_time) { + _reset_job_time_limit(job_ptr, now, + node_space); + time_limit = job_ptr->time_limit; } + if (rc == ESLURM_ACCOUNTING_POLICY) { /* Unknown future start time, just skip job */ job_ptr->start_time = 0; @@ -1146,6 +1161,9 @@ static int _attempt_backfill(void) if (job_ptr->start_time > (sched_start + backfill_window)) { /* Starts too far in the future to worry about */ + if (debug_flags & DEBUG_FLAG_BACKFILL) + _dump_job_sched(job_ptr, end_reserve, + avail_bitmap); continue; } @@ -1171,12 +1189,12 @@ static int _attempt_backfill(void) /* * Add reservation to scheduling table if appropriate */ + if (debug_flags & DEBUG_FLAG_BACKFILL) + _dump_job_sched(job_ptr, end_reserve, avail_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) continue; reject_array_job_id = 0; reject_array_part = NULL; - if (debug_flags & DEBUG_FLAG_BACKFILL) - _dump_job_sched(job_ptr, end_reserve, avail_bitmap); bit_not(avail_bitmap); _add_reservation(start_time, end_reserve, avail_bitmap, node_space, &node_space_recs); @@ -1191,7 +1209,6 @@ static int _attempt_backfill(void) FREE_NULL_BITMAP(exc_core_bitmap); FREE_NULL_BITMAP(resv_bitmap); FREE_NULL_BITMAP(non_cg_bitmap); - FREE_NULL_BITMAP(previous_bitmap); for (i=0; ; ) { FREE_NULL_BITMAP(node_space[i].avail_bitmap); diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c index a555d1c13..cc33112fb 100644 --- a/src/plugins/select/cons_res/job_test.c +++ b/src/plugins/select/cons_res/job_test.c @@ -1598,7 +1598,7 @@ static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *bitmap, * the former one */ if ((best_fit_inx == -1) || - (!switches_required[best_fit_inx] && switches_required[j]) || + (!switches_required[best_fit_inx] && switches_required[j]) || (switch_record_table[j].level < switch_record_table[best_fit_inx].level) || ((switch_record_table[j].level == @@ -1621,13 +1621,14 @@ static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *bitmap, } } if (best_fit_inx == -1) { - debug("job %u: best_fit topology failure : no switch " - "satisfying the request found", job_ptr->job_id); + debug("job %u: best_fit topology failure: no switch currently " + "has sufficient 
resource to satisfy the request", + job_ptr->job_id); rc = SLURM_ERROR; goto fini; } if (!switches_required[best_fit_inx] && req_nodes_bitmap ) { - debug("job %u: best_fit topology failure : no switch " + debug("job %u: best_fit topology failure: no switch " "including requested nodes and satisfying the " "request found", job_ptr->job_id); rc = SLURM_ERROR; @@ -2652,7 +2653,7 @@ alloc_job: /* translate job_res->cpus array into format with rep count */ build_cnt = build_job_resources_cpu_array(job_res); - if (job_ptr->details->core_spec) { + if (job_ptr->details->whole_node) { int first, last = -1; first = bit_ffs(job_res->node_bitmap); if (first != -1) diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 20dde2172..82734b36e 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -1059,9 +1059,12 @@ static int _job_expand(struct job_record *from_job_ptr, } } } - - to_job_ptr->total_cpus += new_job_resrcs_ptr-> - cpus[new_node_offset]; + if (to_job_ptr->details->whole_node) { + to_job_ptr->total_cpus += select_node_record[i].cpus; + } else { + to_job_ptr->total_cpus += new_job_resrcs_ptr-> + cpus[new_node_offset]; + } } build_job_resources_cpu_array(new_job_resrcs_ptr); gres_plugin_job_merge(from_job_ptr->gres_list, @@ -2618,7 +2621,7 @@ bitstr_t *_sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, { bitstr_t *sp_avail_bitmap; char str[300]; - uint32_t cores_per_node = 0; + uint32_t cores_per_node = 0, extra_cores_needed = 0; bitstr_t *tmpcore; int total_core_cnt = 0; @@ -2634,10 +2637,12 @@ bitstr_t *_sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, */ if ((node_cnt) && (core_cnt)) { - debug2("reserving %u cores per node in %d nodes", - cores_per_node, node_cnt); total_core_cnt = core_cnt[0]; cores_per_node = core_cnt[0] / MAX(node_cnt, 1); + debug2("Reserving %u cores across %d nodes", + total_core_cnt, node_cnt); + extra_cores_needed = total_core_cnt - + (cores_per_node * node_cnt); } if ((!node_cnt) && (core_cnt)) { int num_nodes = bit_set_count(avail_bitmap); @@ -2648,7 +2653,8 @@ bitstr_t *_sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, total_core_cnt += core_cnt[i]; } - debug2("Reservations requires %d cores", total_core_cnt); + debug2("Reservations requires %d cores (%u each on %d nodes, plus %u)", + total_core_cnt, cores_per_node, node_cnt, extra_cores_needed); sp_avail_bitmap = bit_alloc(bit_size(avail_bitmap)); bit_fmt(str, (sizeof(str) - 1), avail_bitmap); @@ -2718,8 +2724,11 @@ bitstr_t *_sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, bit_set(*core_bitmap, coff + i); total_core_cnt--; cores_in_node++; - if ((cores_in_node == cores_per_node) || - (total_core_cnt == 0)) + if (cores_in_node > cores_per_node) + extra_cores_needed--; + if ((total_core_cnt == 0) || + ((extra_cores_needed == 0) && + (cores_in_node >= cores_per_node))) break; } } diff --git a/src/plugins/task/cgroup/task_cgroup_cpuset.c b/src/plugins/task/cgroup/task_cgroup_cpuset.c index 03e13d444..6e081eb61 100644 --- a/src/plugins/task/cgroup/task_cgroup_cpuset.c +++ b/src/plugins/task/cgroup/task_cgroup_cpuset.c @@ -53,6 +53,7 @@ #include "src/common/cpu_frequency.h" #include "src/common/slurm_resource_info.h" #include "src/common/bitstring.h" +#include "src/common/proc_args.h" #include "src/common/xstring.h" #include "src/common/xcgroup_read_config.h" #include "src/common/xcgroup.h" @@ -691,63 +692,81 @@ static int _task_cgroup_cpuset_dist_cyclic( { 
hwloc_obj_t obj; uint32_t *obj_idx; - uint32_t i, sock_idx, npskip, npdist, nsockets; + uint32_t i, j, sock_idx, sock_loop, ntskip, npdist, nsockets; uint32_t taskid = job->envtp->localid; if (bind_verbose) - info("task/cgroup: task[%u] using cyclic distribution, " - "task_dist %u", taskid, job->task_dist); + info("task/cgroup: task[%u] using %s distribution " + "(task_dist=%u)", taskid, + format_task_dist_states(job->task_dist), job->task_dist); nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_SOCKET); obj_idx = xmalloc(nsockets * sizeof(uint32_t)); if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) { /* cores or threads granularity */ - npskip = taskid * job->cpus_per_task; + ntskip = taskid; npdist = job->cpus_per_task; } else { /* sockets or ldoms granularity */ - npskip = taskid; + ntskip = taskid; npdist = 1; } - /* skip objs for lower taskids */ - i = 0; + /* skip objs for lower taskids, then add them to the + current task cpuset. To prevent infinite loop, check + that we do not loop more than npdist times around the available + sockets, which is the worst scenario we should afford here. */ + i = 0; j = 0; sock_idx = 0; - while (i < npskip) { - while ((sock_idx < nsockets) && (i < npskip)) { + sock_loop = 0; + while (i < ntskip + 1 && sock_loop < npdist + 1) { + /* fill one or multiple sockets using block mode, unless + otherwise stated in the job->task_dist field */ + while ((sock_idx < nsockets) && (j < npdist)) { obj = hwloc_get_obj_below_by_type( topology, HWLOC_OBJ_SOCKET, sock_idx, hwtype, obj_idx[sock_idx]); if (obj != NULL) { obj_idx[sock_idx]++; - i++; + j++; + if (i == ntskip) + _add_hwloc_cpuset(hwtype, req_hwtype, + obj, taskid, + bind_verbose, cpuset); + if ((j < npdist) && + ((job->task_dist == + SLURM_DIST_CYCLIC_CFULL) || + (job->task_dist == + SLURM_DIST_BLOCK_CFULL))) + sock_idx++; + } else { + sock_idx++; } - sock_idx++; } - if (i < npskip) + /* if it succeed, switch to the next task, starting + with the next available socket, otherwise, loop back + from the first socket trying to find available slots. 
*/ + if (j == npdist) { + i++; j = 0; + sock_idx++; // no validity check, handled by the while + sock_loop = 0; + } else { + sock_loop++; sock_idx = 0; - } - - /* distribute objs cyclically across sockets */ - i = npdist; - while (i > 0) { - while ((sock_idx < nsockets) && (i > 0)) { - obj = hwloc_get_obj_below_by_type( - topology, HWLOC_OBJ_SOCKET, sock_idx, - hwtype, obj_idx[sock_idx]); - if (obj != NULL) { - obj_idx[sock_idx]++; - _add_hwloc_cpuset(hwtype, req_hwtype, obj, - taskid, bind_verbose, cpuset); - i--; - } - sock_idx++; } - sock_idx = 0; } + xfree(obj_idx); - return XCGROUP_SUCCESS; + + /* should never happened in normal scenario */ + if (sock_loop > npdist) { + error("task/cgroup: task[%u] infinite loop broken while trying" + "to provision compute elements using %s", taskid, + format_task_dist_states(job->task_dist)); + return XCGROUP_ERROR; + } else + return XCGROUP_SUCCESS; } static int _task_cgroup_cpuset_dist_block( @@ -1119,8 +1138,11 @@ extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job) uint32_t jntasks = job->node_tasks; uint32_t jnpus; - job->cpus_per_task = MAX(1, job->cpus_per_task); - jnpus = jntasks * job->cpus_per_task; + if (job->batch) { + jnpus = job->cpus; + job->cpus_per_task = job->cpus; + } else + jnpus = jntasks * job->cpus_per_task; bind_type = job->cpu_bind_type; if (conf->task_plugin_param & CPU_BIND_VERBOSE || diff --git a/src/sacct/print.c b/src/sacct/print.c index d45c6a373..44930447e 100644 --- a/src/sacct/print.c +++ b/src/sacct/print.c @@ -579,9 +579,11 @@ void print_fields(type_t type, void *object) } if (WIFSIGNALED(tmp_int)) tmp_int2 = WTERMSIG(tmp_int); - + tmp_int = WEXITSTATUS(tmp_int); + if (tmp_int >= 128) + tmp_int -= 128; snprintf(outbuf, sizeof(outbuf), "%d:%d", - WEXITSTATUS(tmp_int), tmp_int2); + tmp_int, tmp_int2); field->print_routine(field, outbuf, diff --git a/src/sacctmgr/cluster_functions.c b/src/sacctmgr/cluster_functions.c index 9c0a29524..885937987 100644 --- a/src/sacctmgr/cluster_functions.c +++ b/src/sacctmgr/cluster_functions.c @@ -1148,10 +1148,10 @@ extern int sacctmgr_dump_cluster (int argc, char *argv[]) return SLURM_ERROR; } - line = xstrdup_printf("Cluster - %s", cluster_name); + line = xstrdup_printf("Cluster - '%s'", cluster_name); if (class_str) - xstrfmtcat(line, ":Classification=%s", class_str); + xstrfmtcat(line, ":Classification='%s'", class_str); slurmdb_hierarchical_rec = list_peek(slurmdb_hierarchical_rec_list); assoc = slurmdb_hierarchical_rec->assoc; diff --git a/src/sacctmgr/file_functions.c b/src/sacctmgr/file_functions.c index 7359dd193..717e5498c 100644 --- a/src/sacctmgr/file_functions.c +++ b/src/sacctmgr/file_functions.c @@ -287,7 +287,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) fprintf(stderr, " Bad format on %s: " "End your option with " "an '=' sign\n", sub); - _destroy_sacctmgr_file_opts(file_opts); break; } file_opts->name = xstrdup(option); @@ -320,12 +319,12 @@ static sacctmgr_file_opts_t *_parse_options(char *options) g_qos_list, option); if (file_opts->def_qos_id == NO_VAL) { + exit_code=1; fprintf(stderr, "You gave a bad qos '%s'. 
" "Use 'list qos' to get " "complete list.\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "DefaultWCKey", @@ -347,7 +346,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad FairShare value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpCPUMins", @@ -357,7 +355,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpCPUMins value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpCPUs", MAX(command_len, 7))) { @@ -366,7 +363,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpCPUs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpJobs", MAX(command_len, 4))) { @@ -375,7 +371,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpJobs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpMemory", @@ -385,7 +380,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpMemory value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpNodes", @@ -395,7 +389,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpNodes value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpSubmitJobs", @@ -405,7 +398,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpJobs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpWall", MAX(command_len, 4))) { @@ -420,7 +412,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) fprintf(stderr, " Bad GrpWall time format: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxCPUMinsPerJob", @@ -432,7 +423,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad MaxCPUMins value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxCPUsPerJob", @@ -442,7 +432,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad MaxCPUs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxJobs", MAX(command_len, 4))) { @@ -451,7 +440,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad MaxJobs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxNodesPerJob", @@ -461,7 +449,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad MaxNodes value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxSubmitJobs", @@ -471,7 +458,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad MaxJobs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxWallDurationPerJob", @@ -487,7 +473,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) fprintf(stderr, " Bad MaxWall time format: %s\n", option); - 
_destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "Organization", @@ -521,6 +506,7 @@ static sacctmgr_file_opts_t *_parse_options(char *options) } else { exit_code=1; fprintf(stderr, " Unknown option: %s\n", sub); + break; } xfree(sub); @@ -539,9 +525,9 @@ static sacctmgr_file_opts_t *_parse_options(char *options) if (!file_opts->name) { exit_code=1; fprintf(stderr, " No name given\n"); - _destroy_sacctmgr_file_opts(file_opts); - file_opts = NULL; - } else if (exit_code) { + } + + if (exit_code) { _destroy_sacctmgr_file_opts(file_opts); file_opts = NULL; } @@ -1615,7 +1601,7 @@ static int _print_file_slurmdb_hierarchical_rec_children( user_list, slurmdb_hierarchical_rec->assoc->user); line = xstrdup_printf( - "User - %s", + "User - '%s'", slurmdb_hierarchical_rec->sort_name); if (slurmdb_hierarchical_rec->assoc->partition) xstrfmtcat(line, ":Partition='%s'", @@ -1702,7 +1688,7 @@ static int _print_file_slurmdb_hierarchical_rec_children( acct_list, slurmdb_hierarchical_rec->assoc->acct); line = xstrdup_printf( - "Account - %s", + "Account - '%s'", slurmdb_hierarchical_rec->sort_name); if (acct_rec) { xstrfmtcat(line, ":Description='%s'", @@ -1822,12 +1808,12 @@ extern int print_file_slurmdb_hierarchical_rec_list( slurmdb_hierarchical_rec->assoc->user); */ if (!list_count(slurmdb_hierarchical_rec->children)) continue; - if (fprintf(fd, "Parent - %s\n", + if (fprintf(fd, "Parent - '%s'\n", slurmdb_hierarchical_rec->assoc->acct) < 0) { error("Can't write to file"); return SLURM_ERROR; } - info("%s - %s", "Parent", + info("%s - '%s'", "Parent", slurmdb_hierarchical_rec->assoc->acct); /* info("sending %d from %s", */ /* list_count(slurmdb_hierarchical_rec->children), */ diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c index a4c85279c..553138564 100644 --- a/src/sbatch/sbatch.c +++ b/src/sbatch/sbatch.c @@ -466,7 +466,8 @@ static int _fill_job_desc_from_opts(job_desc_msg_t *desc) desc->time_limit = opt.time_limit; if (opt.time_min != NO_VAL) desc->time_min = opt.time_min; - desc->shared = opt.shared; + if (opt.shared != (uint16_t) NO_VAL) + desc->shared = opt.shared; desc->wait_all_nodes = opt.wait_all_nodes; if (opt.warn_flags) diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index 50822f4d2..75b67e45a 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -1002,14 +1002,17 @@ scontrol_update_job (int argc, char *argv[]) } for (i = 0; i < num_ids; i++) { job_msg.job_id = ids[i].job_id; + rc = 0; if (slurm_update_job(&job_msg)) { rc = slurm_get_errno(); if (ids[i].array_task_id == NO_VAL) { - error("Error updating job %u", ids[i].job_id); + error("Error updating job %u: %s", + ids[i].job_id, slurm_strerror(rc)); } else { - error("Error updating job %u_%u (%u)", - ids[i].array_job_id, ids[i].array_task_id, - ids[i].job_id); + error("Error updating job %u_%u (%u): %s", + ids[i].array_job_id, + ids[i].array_task_id, + ids[i].job_id, slurm_strerror(rc)); } } } diff --git a/src/sinfo/opts.c b/src/sinfo/opts.c index cae1e3eda..ab91ad61d 100644 --- a/src/sinfo/opts.c +++ b/src/sinfo/opts.c @@ -287,7 +287,7 @@ extern void parse_command_line(int argc, char *argv[]) } else { params.part_field_flag = true; /* compute size later */ params.format = params.long_output ? 
- "%9P %.5a %.10l %.10s %.4r %.5h %.10g %.6D %.11T %N" : + "%9P %.5a %.10l %.10s %.4r %.8h %.10g %.6D %.11T %N" : "%9P %.5a %.10l %.6D %.6t %N"; } } diff --git a/src/sinfo/sinfo.c b/src/sinfo/sinfo.c index 15a17dc88..366363db1 100644 --- a/src/sinfo/sinfo.c +++ b/src/sinfo/sinfo.c @@ -65,6 +65,7 @@ static int g_node_scaling = 1; static int sinfo_cnt; /* thread count */ static pthread_mutex_t sinfo_cnt_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t sinfo_cnt_cond = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t sinfo_list_mutex = PTHREAD_MUTEX_INITIALIZER; /************ * Funtions * @@ -90,6 +91,7 @@ static int _query_server(partition_info_msg_t ** part_pptr, block_info_msg_t ** block_pptr, reserve_info_msg_t ** reserv_pptr, bool clear_old); static int _reservation_report(reserve_info_msg_t *resv_ptr); +static bool _serial_part_data(void); static void _sort_hostlist(List sinfo_list); static int _strcmp(char *data1, char *data2); static void _update_sinfo(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr, @@ -425,6 +427,8 @@ void *_build_part_info(void *args) uint16_t part_num; int j = 0; + if (_serial_part_data()) + slurm_mutex_lock(&sinfo_list_mutex); build_struct_ptr = (build_part_info_t *) args; sinfo_list = build_struct_ptr->sinfo_list; part_num = build_struct_ptr->part_num; @@ -462,6 +466,8 @@ void *_build_part_info(void *args) } xfree(args); + if (_serial_part_data()) + slurm_mutex_unlock(&sinfo_list_mutex); slurm_mutex_lock(&sinfo_cnt_mutex); if (sinfo_cnt > 0) { sinfo_cnt--; @@ -794,6 +800,18 @@ static bool _match_node_data(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr) return true; } +/* Return true if the processing of partition data must be serialized. In that + * case, multiple partitions can write into the same sinfo data structure + * entries. The logic here is similar to that in _match_part_data() below. 
*/ +static bool _serial_part_data(void) +{ + if (params.list_reasons) /* Don't care about partition */ + return true; + if (params.match_flags.partition_flag) /* Match partition name */ + return false; + return true; +} + static bool _match_part_data(sinfo_data_t *sinfo_ptr, partition_info_t* part_ptr) { @@ -804,7 +822,8 @@ static bool _match_part_data(sinfo_data_t *sinfo_ptr, if ((part_ptr == NULL) || (sinfo_ptr->part_info == NULL)) return false; - if ((_strcmp(part_ptr->name, sinfo_ptr->part_info->name))) + if (params.match_flags.partition_flag + && (_strcmp(part_ptr->name, sinfo_ptr->part_info->name))) return false; if (params.match_flags.avail_flag && diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c index 0202aa71b..34f60155f 100644 --- a/src/slurmctld/acct_policy.c +++ b/src/slurmctld/acct_policy.c @@ -559,6 +559,7 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc, qos_max_nodes_limit = MIN(qos_ptr->grp_nodes, qos_ptr->max_nodes_pu); + if ((acct_policy_limit_set->max_nodes == ADMIN_SET_LIMIT) || (qos_max_nodes_limit == INFINITE) || (update_call && (job_desc->max_nodes == NO_VAL))) { @@ -1928,54 +1929,58 @@ end_it: extern uint32_t acct_policy_get_max_nodes(struct job_record *job_ptr) { - uint32_t max_nodes_limit = INFINITE; + uint32_t max_nodes_limit = INFINITE, qos_max_p_limit = INFINITE; assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; + slurmdb_qos_rec_t *qos_ptr = job_ptr->qos_ptr; + slurmdb_association_rec_t *assoc_ptr = job_ptr->assoc_ptr; + bool parent = 0; /* flag to tell us if we are looking at the + * parent or not + */ + bool grp_set = 0; /* check to see if we are enforcing associations */ if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) return max_nodes_limit; assoc_mgr_lock(&locks); - if (job_ptr->qos_ptr) { - slurmdb_qos_rec_t *qos_ptr = job_ptr->qos_ptr; - max_nodes_limit = - MIN(qos_ptr->grp_nodes, qos_ptr->max_nodes_pu); + if (qos_ptr) { + qos_max_p_limit = max_nodes_limit = + MIN(qos_ptr->max_nodes_pj, qos_ptr->max_nodes_pu); max_nodes_limit = - MIN(max_nodes_limit, qos_ptr->max_nodes_pj); + MIN(max_nodes_limit, qos_ptr->grp_nodes); } - if (max_nodes_limit == INFINITE) { - slurmdb_association_rec_t *assoc_ptr = job_ptr->assoc_ptr; - bool parent = 0; /* flag to tell us if we are looking at the - * parent or not - */ - bool grp_set = 0; - - while (assoc_ptr) { - if (assoc_ptr->grp_nodes != INFINITE) { - max_nodes_limit = MIN(max_nodes_limit, - assoc_ptr->grp_nodes); - grp_set = 1; - } + /* We have to traverse all the associations because QOS might + not override a particular limit. 
+ */ + while (assoc_ptr) { + if ((!qos_ptr || (qos_ptr->grp_nodes == INFINITE)) + && (assoc_ptr->grp_nodes != INFINITE)) { + max_nodes_limit = MIN(max_nodes_limit, + assoc_ptr->grp_nodes); + grp_set = 1; + } - if (!parent && (assoc_ptr->max_nodes_pj != INFINITE)) - max_nodes_limit = MIN(max_nodes_limit, - assoc_ptr->max_nodes_pj); + if (!parent + && (qos_max_p_limit == INFINITE) + && (assoc_ptr->max_nodes_pj != INFINITE)) + max_nodes_limit = MIN(max_nodes_limit, + assoc_ptr->max_nodes_pj); - /* only check the first grp set */ - if (grp_set) - break; - - assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; - parent = 1; - continue; - } + /* only check the first grp set */ + if (grp_set) + break; + assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; + parent = 1; + continue; } + assoc_mgr_unlock(&locks); return max_nodes_limit; } + /* * acct_policy_update_pending_job - Make sure the limits imposed on a job on * submission are correct after an update to a qos or association. If diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index fe0a603e5..1e7f5b424 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -87,6 +87,7 @@ #include "src/common/uid.h" #include "src/common/xsignal.h" #include "src/common/xstring.h" +#include "src/common/slurm_protocol_interface.h" #include "src/slurmctld/acct_policy.h" #include "src/slurmctld/agent.h" @@ -975,6 +976,15 @@ static void *_slurmctld_rpc_mgr(void *no_data) conn_arg->newsockfd = newsockfd; memcpy(&conn_arg->cli_addr, &cli_addr, sizeof(slurm_addr_t)); + if (slurmctld_conf.debug_flags & DEBUG_FLAG_PROTOCOL) { + char inetbuf[64]; + + _slurm_print_slurm_addr(&cli_addr, + inetbuf, + sizeof(inetbuf)); + info("%s: accept() connection from %s", __func__, inetbuf); + } + if (slurmctld_config.shutdown_time) no_thread = 1; else if (pthread_create(&thread_id_rpc_req, @@ -1613,20 +1623,12 @@ static void *_slurmctld_background(void *no_data) _accounting_cluster_ready(); } + /* Stats will reset at midnight (approx) local time. */ if (last_proc_req_start == 0) { - /* Stats will reset at midnight (aprox). - * Uhmmm... UTC time?... It is not so important. - * Just resetting during the night */ - last_proc_req_start = now; - next_stats_reset = last_proc_req_start - - (last_proc_req_start % 86400) + - 86400; - } - - if ((next_stats_reset > 0) && (now > next_stats_reset)) { - /* Resetting stats values */ last_proc_req_start = now; next_stats_reset = now - (now % 86400) + 86400; + } else if (now >= next_stats_reset) { + next_stats_reset = now - (now % 86400) + 86400; reset_stats(0); } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index a40dee506..0b29666f8 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2841,7 +2841,7 @@ void dump_job_desc(job_desc_msg_t * job_specs) { long job_id, time_min; long pn_min_cpus, pn_min_memory, pn_min_tmp_disk, min_cpus; - long time_limit, priority, contiguous; + long time_limit, priority, contiguous, nice; long kill_on_node_fail, shared, immediate, wait_all_nodes; long cpus_per_task, requeue, num_tasks, overcommit; long ntasks_per_node, ntasks_per_socket, ntasks_per_core; @@ -2973,10 +2973,11 @@ void dump_job_desc(job_desc_msg_t * job_specs) (long) job_specs->num_tasks : -1L; overcommit = (job_specs->overcommit != (uint8_t) NO_VAL) ? (long) job_specs->overcommit : -1L; - debug3(" mail_type=%u mail_user=%s nice=%d num_tasks=%ld " + nice = (job_specs->nice != (uint16_t) NO_VAL) ? 
+ (job_specs->nice - NICE_OFFSET) : 0; + debug3(" mail_type=%u mail_user=%s nice=%ld num_tasks=%ld " "open_mode=%u overcommit=%ld acctg_freq=%s", - job_specs->mail_type, job_specs->mail_user, - (int)job_specs->nice - NICE_OFFSET, num_tasks, + job_specs->mail_type, job_specs->mail_user, nice, num_tasks, job_specs->open_mode, overcommit, job_specs->acctg_freq); slurm_make_time_str(&job_specs->begin_time, buf, sizeof(buf)); @@ -3707,6 +3708,7 @@ _signal_batch_job(struct job_record *job_ptr, uint16_t signal) bitoff_t i; kill_tasks_msg_t *kill_tasks_msg = NULL; agent_arg_t *agent_args = NULL; + uint32_t z; xassert(job_ptr); xassert(job_ptr->batch_host); @@ -3734,7 +3736,13 @@ _signal_batch_job(struct job_record *job_ptr, uint16_t signal) kill_tasks_msg = xmalloc(sizeof(kill_tasks_msg_t)); kill_tasks_msg->job_id = job_ptr->job_id; kill_tasks_msg->job_step_id = NO_VAL; - kill_tasks_msg->signal = signal; + /* Encode the KILL_JOB_BATCH flag for + * stepd to know if has to signal only + * the batch script. The job was submitted + * using the --signal=B:sig sbatch option. + */ + z = KILL_JOB_BATCH << 24; + kill_tasks_msg->signal = z|signal; agent_args->msg_args = kill_tasks_msg; agent_args->node_count = 1;/* slurm/477 be sure to update node_count */ @@ -3805,8 +3813,11 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, return ESLURM_INVALID_JOB_ID; } - if (IS_JOB_FINISHED(job_ptr)) + if (IS_JOB_FINISHED(job_ptr)) { + if (job_ptr->exit_code == 0) + job_ptr->exit_code = job_return_code; return ESLURM_ALREADY_DONE; + } if ((job_ptr->user_id != uid) && !validate_slurm_user(uid)) { error("Security violation, JOB_COMPLETE RPC for job %u " @@ -3827,7 +3838,8 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, if ((job_return_code == NO_VAL) && (IS_JOB_RUNNING(job_ptr) || IS_JOB_PENDING(job_ptr))) { - info("Job %u cancelled from interactive user", job_ptr->job_id); + info("Job %u cancelled from interactive user or node failure", + job_ptr->job_id); } if (IS_JOB_SUSPENDED(job_ptr)) { @@ -4061,7 +4073,6 @@ static int _part_access_check(struct part_record *part_ptr, } if (slurmctld_conf.enforce_part_limits) { - info("checking here"); if ((rc = part_policy_valid_acct(part_ptr, acct)) != SLURM_SUCCESS) goto fini; @@ -6998,6 +7009,26 @@ static void _pack_default_job_details(struct job_record *job_ptr, char *cmd_line = NULL; char *tmp = NULL; uint32_t len = 0; + uint16_t shared = 0; + + if (!detail_ptr) + shared = (uint16_t) NO_VAL; + else if (detail_ptr->share_res == 1) /* User --share */ + shared = 1; + else if ((detail_ptr->share_res == 0) || + (detail_ptr->whole_node == 1)) /* User --exclusive */ + shared = 0; + else if (job_ptr->part_ptr) { + /* Report shared status based upon latest partition info */ + if ((job_ptr->part_ptr->max_share & SHARED_FORCE) && + ((job_ptr->part_ptr->max_share & (~SHARED_FORCE)) > 1)) + shared = 1; /* Partition Shared=force */ + else if (job_ptr->part_ptr->max_share == 0) + shared = 0; /* Partition Shared=exclusive */ + else + shared = 0; /* Part Shared=yes or no */ + } else + shared = (uint16_t) NO_VAL; /* No user or partition info */ if (max_cpu_cnt == -1) max_cpu_cnt = _find_node_max_cpu_cnt(); @@ -7068,6 +7099,7 @@ static void _pack_default_job_details(struct job_record *job_ptr, } pack16(detail_ptr->requeue, buffer); pack16(detail_ptr->ntasks_per_node, buffer); + pack16(shared, buffer); } else { packnull(buffer); packnull(buffer); @@ -7084,6 +7116,7 @@ static void _pack_default_job_details(struct job_record *job_ptr, pack32((uint32_t) 0, 
buffer); pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); + pack16((uint16_t) 0, buffer); } } else if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) { if (detail_ptr) { @@ -7139,6 +7172,7 @@ static void _pack_default_job_details(struct job_record *job_ptr, pack32(detail_ptr->max_nodes, buffer); } pack16(detail_ptr->requeue, buffer); + pack16(shared, buffer); } else { packnull(buffer); packnull(buffer); @@ -7154,6 +7188,7 @@ static void _pack_default_job_details(struct job_record *job_ptr, pack32(job_ptr->node_cnt, buffer); pack32((uint32_t) 0, buffer); pack16((uint16_t) 0, buffer); + pack16((uint16_t) 0, buffer); } } else { error("_pack_default_job_details: protocol_version " @@ -7165,20 +7200,8 @@ static void _pack_default_job_details(struct job_record *job_ptr, static void _pack_pending_job_details(struct job_details *detail_ptr, Buf buffer, uint16_t protocol_version) { - uint16_t shared = 0; - - if (!detail_ptr) - shared = (uint16_t) NO_VAL; - else if (detail_ptr->share_res == 1) - shared = 1; - else if (detail_ptr->whole_node == 1) - shared = 0; - else - shared = (uint16_t) NO_VAL; - if (protocol_version >= SLURM_14_03_PROTOCOL_VERSION) { if (detail_ptr) { - pack16(shared, buffer); pack16(detail_ptr->contiguous, buffer); pack16(detail_ptr->core_spec, buffer); pack16(detail_ptr->cpus_per_task, buffer); @@ -7204,7 +7227,6 @@ static void _pack_pending_job_details(struct job_details *detail_ptr, pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); - pack16((uint16_t) 0, buffer); pack32((uint32_t) 0, buffer); pack32((uint32_t) 0, buffer); @@ -7222,7 +7244,6 @@ static void _pack_pending_job_details(struct job_details *detail_ptr, } } else if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) { if (detail_ptr) { - pack16(shared, buffer); pack16(detail_ptr->contiguous, buffer); pack16(detail_ptr->cpus_per_task, buffer); pack16(detail_ptr->pn_min_cpus, buffer); @@ -7242,7 +7263,6 @@ static void _pack_pending_job_details(struct job_details *detail_ptr, pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); - pack16((uint16_t) 0, buffer); pack32((uint32_t) 0, buffer); pack32((uint32_t) 0, buffer); @@ -7716,8 +7736,9 @@ static bool _top_priority(struct job_record *job_ptr) if ((!top) && detail_ptr) { /* not top prio */ if (job_ptr->priority == 0) { /* user/admin hold */ - if ((job_ptr->state_reason != WAIT_HELD) && - (job_ptr->state_reason != WAIT_HELD_USER)) { + if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS + && (job_ptr->state_reason != WAIT_HELD) + && (job_ptr->state_reason != WAIT_HELD_USER)) { job_ptr->state_reason = WAIT_HELD; xfree(job_ptr->state_desc); } @@ -9531,8 +9552,8 @@ static void _send_job_kill(struct job_record *job_ptr) if (agent_args->node_count == 0) { if ((job_ptr->details->expanding_jobid == 0) && (select_serial == 0)) { - error("Job %u allocated no nodes to be killed on", - job_ptr->job_id); + error("%s: job %u allocated no nodes to be killed on", + __func__, job_ptr->job_id); } xfree(kill_job->nodes); xfree(kill_job); @@ -10258,14 +10279,12 @@ extern bool job_epilog_complete(uint32_t job_id, char *node_name, * subsequent jobs appear in a separate accounting record. */ void batch_requeue_fini(struct job_record *job_ptr) { - time_t now; - if (IS_JOB_COMPLETING(job_ptr) || !IS_JOB_PENDING(job_ptr) || !job_ptr->batch_flag) return; info("requeue batch job %u", job_ptr->job_id); - now = time(NULL); + /* Clear everything so this appears to be a new job and then restart * it in accounting. 
*/ job_ptr->start_time = 0; @@ -10289,14 +10308,14 @@ void batch_requeue_fini(struct job_record *job_ptr) FREE_NULL_BITMAP(job_ptr->node_bitmap); FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); if (job_ptr->details) { + time_t now = time(NULL); /* the time stamp on the new batch launch credential must be * larger than the time stamp on the revoke request. Also the * I/O must be all cleared out and the named socket purged, * so delay for at least ten seconds. */ if (job_ptr->details->begin_time <= now) job_ptr->details->begin_time = now + 10; - if (!with_slurmdbd) - jobacct_storage_g_job_start(acct_db_conn, job_ptr); + /* Since this could happen on a launch we need to make sure the * submit isn't the same as the last submit so put now + 1 so * we get different records in the database */ @@ -10308,6 +10327,8 @@ void batch_requeue_fini(struct job_record *job_ptr) /* Reset this after the batch step has finished or the batch step * information will be attributed to the next run of the job. */ job_ptr->db_index = 0; + if (!with_slurmdbd) + jobacct_storage_g_job_start(acct_db_conn, job_ptr); } @@ -10965,7 +10986,12 @@ extern int job_requeue(uid_t uid, goto reply; } - if ((job_ptr->details == NULL) || (job_ptr->details->requeue == 0)) { + /* If the partition was removed don't allow the job to be + * requeued. If it doesn't have details then something is very + * wrong and if the job doesn't want to be requeued don't. + */ + if (!job_ptr->part_ptr || !job_ptr->details + || !job_ptr->details->requeue) { rc = ESLURM_DISABLED; goto reply; } @@ -11000,7 +11026,7 @@ extern int job_requeue(uid_t uid, /* we can't have it as suspended when we call the * accounting stuff. */ - job_ptr->job_state = JOB_CANCELLED; + job_ptr->job_state = JOB_REQUEUE; jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); job_ptr->job_state = suspend_job_state; suspended = true; @@ -11021,10 +11047,10 @@ extern int job_requeue(uid_t uid, || IS_JOB_RUNNING(job_ptr)) is_running = true; - /* We want this job to look like it was cancelled in the + /* We want this job to have the requeued state in the * accounting logs. Set a new submit time so the restarted * job looks like a new job. */ - job_ptr->job_state = JOB_CANCELLED; + job_ptr->job_state = JOB_REQUEUE; build_cg_bitmap(job_ptr); job_completion_logger(job_ptr, true); @@ -12080,7 +12106,7 @@ extern void job_hold_requeue(struct job_record *job_ptr) job_ptr->state_reason, job_ptr->priority); } -/* Reset a job's end-time based upon it's end_time. +/* Reset a job's end_time based upon it's start_time and time_limit. 
* NOTE: Do not reset the end_time if already being preempted */ extern void job_end_time_reset(struct job_record *job_ptr) { diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 6f147b104..350558767 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -198,8 +198,9 @@ static bool _job_runnable_test1(struct job_record *job_ptr, bool clear_start) if (clear_start) job_ptr->start_time = (time_t) 0; if (job_ptr->priority == 0) { /* held */ - if ((job_ptr->state_reason != WAIT_HELD) && - (job_ptr->state_reason != WAIT_HELD_USER)) { + if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS + && (job_ptr->state_reason != WAIT_HELD) + && (job_ptr->state_reason != WAIT_HELD_USER)) { job_ptr->state_reason = WAIT_HELD; xfree(job_ptr->state_desc); last_job_update = time(NULL); @@ -849,8 +850,8 @@ extern int schedule(uint32_t job_limit) xfree(sched_params); sched_update = slurmctld_conf.last_update; - info("SchedulingParameters: default_queue_depth=%d " - "max_rpc_cnt=%d max_sched_time=%d partition_job_depth=%d ", + info("SchedulerParameters=default_queue_depth=%d," + "max_rpc_cnt=%d,max_sched_time=%d,partition_job_depth=%d", def_job_limit, defer_rpc_cnt, sched_timeout, max_jobs_per_part); } @@ -1319,13 +1320,11 @@ next_part: part_ptr = (struct part_record *) job_ptr->job_id, slurm_strerror(error_code)); if (!wiki_sched) { last_job_update = now; - job_ptr->job_state = JOB_FAILED; - job_ptr->exit_code = 1; + job_ptr->job_state = JOB_PENDING; job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; xfree(job_ptr->state_desc); job_ptr->start_time = job_ptr->end_time = now; - job_completion_logger(job_ptr, false); - delete_job_details(job_ptr); + job_ptr->priority = 0; } } @@ -3003,15 +3002,23 @@ static int _valid_node_feature(char *feature) return rc; } -/* If a job can run in multiple partitions, make sure that the one - * actually used is first in the string. Needed for job state save/restore */ +/* If a job can run in multiple partitions, when it is started we want to + * put the name of the partition used _first_ in that list. When slurmctld + * restarts, that will be used to set the job's part_ptr and that will be + * reported to squeue. We leave all of the partitions in the list though, + * so the job can be requeued and have access to them all. */ extern void rebuild_job_part_list(struct job_record *job_ptr) { ListIterator part_iterator; struct part_record *part_ptr; - if ((job_ptr->part_ptr_list == NULL) || (job_ptr->part_ptr == NULL)) + if (!job_ptr->part_ptr_list) return; + if (!job_ptr->part_ptr || !job_ptr->part_ptr->name) { + error("Job %u has NULL part_ptr or the partition name is NULL", + job_ptr->job_id); + return; + } xfree(job_ptr->partition); job_ptr->partition = xstrdup(job_ptr->part_ptr->name); diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 5c583991a..97f80c504 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -605,7 +605,9 @@ unpack_error: list_destroy(gres_list); gres_list = NULL; } - xfree (node_name); + xfree(comm_name); + xfree(node_hostname); + xfree(node_name); xfree(reason); goto fini; } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 4c35a5f71..ec86c166f 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -200,7 +200,7 @@ static int _get_gres_alloc(struct job_record *job_ptr) /* * _get_gres_config - Fill in the gres_alloc string field for a given * job_record with the count of gres on each node (e.g. 
for whole node - * allocations. + * allocations). * IN job_ptr - the job record whose "gres_alloc" field is to be constructed * RET Error number. Currently not used (always set to 0). */ @@ -602,8 +602,8 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, if (agent_args->node_count == 0) { if ((job_ptr->details->expanding_jobid == 0) && (select_serial == 0)) { - error("Job %u allocated no nodes to be killed on", - job_ptr->job_id); + error("%s: job %u allocated no nodes to be killed on", + __func__, job_ptr->job_id); } slurm_free_kill_job_msg(kill_job); hostlist_destroy(agent_args->hostlist); @@ -681,8 +681,8 @@ static int _match_feature(char *seek, struct node_set *node_set_ptr) * 1 = exclusive * * Return values: - * 0 = no sharing - * 1 = share resources + * 0 = requires idle nodes + * 1 = can use non-idle nodes */ static int _resolve_shared_status(struct job_record *job_ptr, uint16_t part_max_share, @@ -691,31 +691,36 @@ _resolve_shared_status(struct job_record *job_ptr, uint16_t part_max_share, /* no sharing if partition Shared=EXCLUSIVE */ if (part_max_share == 0) { job_ptr->details->whole_node = 1; + job_ptr->details->share_res = 0; return 0; } /* sharing if partition Shared=FORCE with count > 1 */ if ((part_max_share & SHARED_FORCE) && - ((part_max_share & (~SHARED_FORCE)) > 1)) + ((part_max_share & (~SHARED_FORCE)) > 1)) { + job_ptr->details->share_res = 1; return 1; + } if (cons_res_flag) { - if (part_max_share == 1) /* partition configured Shared=NO */ - return 0; if ((job_ptr->details->share_res == 0) || - (job_ptr->details->share_res == (uint8_t) NO_VAL) || - (job_ptr->details->whole_node == 1)) + (job_ptr->details->whole_node == 1)) { + job_ptr->details->share_res = 0; return 0; + } return 1; } else { job_ptr->details->whole_node = 1; - if (part_max_share == 1) /* partition configured Shared=NO */ + if (part_max_share == 1) { /* partition configured Shared=NO */ + job_ptr->details->share_res = 0; return 0; + } /* share if the user requested it */ if (job_ptr->details->share_res == 1) return 1; + job_ptr->details->share_res = 0; + return 0; } - return 0; } /* @@ -1078,7 +1083,6 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, shared = _resolve_shared_status(job_ptr, part_ptr->max_share, cr_enabled); - job_ptr->details->share_res = shared; if (cr_enabled) job_ptr->cr_enabled = cr_enabled; /* CR enabled for this job */ @@ -1587,8 +1591,9 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, } if (job_ptr->priority == 0) { /* user/admin hold */ - if ((job_ptr->state_reason != WAIT_HELD) && - (job_ptr->state_reason != WAIT_HELD_USER)) { + if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS + && (job_ptr->state_reason != WAIT_HELD) + && (job_ptr->state_reason != WAIT_HELD_USER)) { job_ptr->state_reason = WAIT_HELD; } return ESLURM_JOB_HELD; @@ -1794,6 +1799,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, select_bitmap = NULL; /* nothing left to free */ allocate_nodes(job_ptr); build_node_details(job_ptr, true); + rebuild_job_part_list(job_ptr); /* This could be set in the select plugin so we want to keep the flag. 
*/ diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 6feb3336a..dfa2be925 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1797,8 +1797,15 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg) return; } - /* Send batch step info to accounting */ - if (association_based_accounting && job_ptr) { + /* Send batch step info to accounting, only if the job is + * still completing. If the job was requeued because of node + * failure (state == pending) an epilog script might not of + * ran so we already finished the last instance of the job so + * this would be put on the requeued instance which is + * incorrect. + */ + if (association_based_accounting && job_ptr + && !IS_JOB_PENDING(job_ptr)) { struct step_record batch_step; memset(&batch_step, 0, sizeof(struct step_record)); batch_step.job_ptr = job_ptr; @@ -1924,9 +1931,9 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg) /* return result */ if (error_code) { - info("_slurm_rpc_complete_batch_script JobId=%u: %s ", - comp_msg->job_id, - slurm_strerror(error_code)); + debug2("_slurm_rpc_complete_batch_script JobId=%u: %s ", + comp_msg->job_id, + slurm_strerror(error_code)); slurm_send_rc_msg(msg, error_code); } else { debug2("_slurm_rpc_complete_batch_script JobId=%u %s", diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 2576d5557..6d58834f0 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -1733,7 +1733,7 @@ extern int create_resv(resv_desc_msg_t *resv_desc_ptr) } } - /* Sort the list of jobs in descending order */ + /* Sort the list of node counts in order descending size */ if (resv_desc_ptr->node_cnt) { for (i = 0; resv_desc_ptr->node_cnt[i]; i++) { int max_inx = i; @@ -1869,6 +1869,13 @@ extern int create_resv(resv_desc_msg_t *resv_desc_ptr) total_node_cnt = bit_set_count(node_bitmap); } + if (resv_desc_ptr->core_cnt && !core_bitmap) { + info("Attempt to reserve cores not possible with current " + "configuration"); + rc = ESLURM_INVALID_CPU_COUNT; + goto bad_parse; + } + _generate_resv_id(); if (resv_desc_ptr->name) { resv_ptr = (slurmctld_resv_t *) list_find_first (resv_list, @@ -3140,13 +3147,14 @@ static int _select_nodes(resv_desc_msg_t *resv_desc_ptr, bit_and(node_bitmap, avail_node_bitmap); } - /* If *resv_bitmap exists we probably don't need to delete it, - when it gets created off of node_bitmap it will be the - same, but just to be safe we do. */ + /* If *resv_bitmap exists we probably don't need to delete it, when it + * gets created off of node_bitmap it will be the same, but just to be + * safe we do. 
*/ FREE_NULL_BITMAP(*resv_bitmap); - if (rc == SLURM_SUCCESS) + if (rc == SLURM_SUCCESS) { *resv_bitmap = _pick_idle_nodes(node_bitmap, resv_desc_ptr, core_bitmap); + } FREE_NULL_BITMAP(node_bitmap); if (*resv_bitmap == NULL) { if (rc == SLURM_SUCCESS) @@ -3154,8 +3162,7 @@ static int _select_nodes(resv_desc_msg_t *resv_desc_ptr, return rc; } - /* Same thing as the *resv_bitmap, might as well keep them in - sync */ + /* Same thing as the *resv_bitmap, might as well keep them in sync */ xfree(resv_desc_ptr->node_list); resv_desc_ptr->node_list = bitmap2node_name(*resv_bitmap); @@ -3859,15 +3866,16 @@ extern int job_test_resv(struct job_record *job_ptr, time_t *when, if ((resv_ptr->full_nodes) || (job_ptr->details->whole_node)) { #if _DEBUG - info("reservation uses full nodes or job will " - "not share nodes"); + info("reservation %s uses full nodes or job %u " + "will not share nodes", + resv_ptr->name, job_ptr->job_id); #endif bit_not(resv_ptr->node_bitmap); bit_and(*node_bitmap, resv_ptr->node_bitmap); bit_not(resv_ptr->node_bitmap); } else { #if _DEBUG - info("job_test_resv: %s reservation uses " + info("job_test_resv: reservation %s uses " "partial nodes", resv_ptr->name); #endif if (*exc_core_bitmap == NULL) { diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 3b85ac769..f26cbc770 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -1063,7 +1063,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int allocate, uid_t submit_uid, struct job_record **job_pptr, char **err_msg); -/* Reset a job's end-time based upon it's end_time. +/* Reset a job's end_time based upon it's start_time and time_limit. * NOTE: Do not reset the end_time if already being preempted */ extern void job_end_time_reset(struct job_record *job_ptr); /* diff --git a/src/slurmctld/statistics.c b/src/slurmctld/statistics.c index 6089e087d..5ce071ba3 100644 --- a/src/slurmctld/statistics.c +++ b/src/slurmctld/statistics.c @@ -157,4 +157,6 @@ extern void reset_stats(int level) slurmctld_diag_stats.bf_last_depth = 0; slurmctld_diag_stats.bf_last_depth_try = 0; slurmctld_diag_stats.bf_active = 0; + + last_proc_req_start = time(NULL); } diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 463b599db..6e91257f5 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -241,9 +241,9 @@ extern void delete_step_records (struct job_record *job_ptr) struct step_record *step_ptr; xassert(job_ptr); - step_iterator = list_iterator_create (job_ptr->step_list); last_job_update = time(NULL); + step_iterator = list_iterator_create(job_ptr->step_list); while ((step_ptr = (struct step_record *) list_next (step_iterator))) { /* Only check if not a pending step */ if (step_ptr->step_id != INFINITE) { @@ -259,8 +259,9 @@ extern void delete_step_records (struct job_record *job_ptr) list_remove (step_iterator); _free_step_rec(step_ptr); } + list_iterator_destroy(step_iterator); + gres_plugin_job_clear(job_ptr->gres_list); - list_iterator_destroy (step_iterator); } /* _free_step_rec - delete a step record's data structures */ @@ -2604,7 +2605,10 @@ static void _pack_ctld_job_step_info(struct step_record *step_ptr, Buf buffer, } pack_time(run_time, buffer); - packstr(step_ptr->job_ptr->partition, buffer); + if (step_ptr->job_ptr->part_ptr) + packstr(step_ptr->job_ptr->part_ptr->name, buffer); + else + packstr(step_ptr->job_ptr->partition, buffer); packstr(step_ptr->resv_ports, buffer); packstr(node_list, buffer); packstr(step_ptr->name, buffer); diff 
--git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index fc2534db7..cea7618fb 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -448,7 +448,9 @@ static int _send_slurmd_conf_lite (int fd, slurmd_conf_t *cf) { int len; Buf buffer = init_buf(0); + slurm_mutex_lock(&cf->config_mutex); pack_slurmd_conf_lite(cf, buffer); + slurm_mutex_unlock(&cf->config_mutex); len = get_buf_offset(buffer); safe_write(fd, &len, sizeof(int)); safe_write(fd, get_buf_data(buffer), len); diff --git a/src/slurmd/slurmstepd/io.c b/src/slurmd/slurmstepd/io.c index 2b0294196..1d72413fd 100644 --- a/src/slurmd/slurmstepd/io.c +++ b/src/slurmd/slurmstepd/io.c @@ -843,7 +843,9 @@ static void *_window_manager(void *arg) break; } len = slurm_read_stream(win_info->pty_fd, buf, 4); - if ((len == -1) && ((errno == EINTR) || (errno == EAGAIN))) + if ((len == -1) && + ((errno == EINTR) || (errno == EAGAIN) || + (errno == SLURM_PROTOCOL_SOCKET_ZERO_BYTES_SENT))) continue; if (len < 4) { error("read window size error: %m"); diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 76abd8cf0..2b6c903e9 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -315,6 +315,11 @@ static uint32_t _get_exit_code(stepd_step_rec_t *job) } step_rc = MAX(step_complete.step_rc, job->task[i]->estatus); } + /* If we killed all the tasks by cmd give at least one return + code. */ + if (step_rc == NO_VAL && job->task[0]) + step_rc = job->task[0]->estatus; + return step_rc; } diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index eae973425..b365ac3fc 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -670,8 +670,13 @@ _handle_signal_container(int fd, stepd_step_rec_t *job, uid_t uid) int target_node_id = 0; stepd_step_task_info_t *task; uint32_t i; + uint32_t flag; + uint32_t signal; + + safe_read(fd, &signal, sizeof(int)); + flag = signal >> 24; + sig = signal & 0xfff; - safe_read(fd, &sig, sizeof(int)); debug("_handle_signal_container for step=%u.%u uid=%d signal=%d", job->jobid, job->stepid, (int) uid, sig); if ((uid != job->uid) && !_slurm_authorized_user(uid)) { @@ -777,6 +782,31 @@ _handle_signal_container(int fd, stepd_step_rec_t *job, uid_t uid) goto done; } + if (flag & KILL_JOB_BATCH + && job->stepid == SLURM_BATCH_SCRIPT) { + /* We should only signal the batch script + * and nothing else, the job pgid is the + * equal to the pid of the batch script. + */ + if (kill(job->pgid, sig) < 0) { + error("%s: failed signal %d container pid" + "%u job %u.%u %m", + __func__, sig, job->pgid, + job->jobid, job->stepid); + rc = SLURM_ERROR; + errnum = errno; + pthread_mutex_unlock(&suspend_mutex); + goto done; + } + rc = SLURM_SUCCESS; + errnum = 0; + verbose("%s: sent signal %d to container pid %u job %u.%u", + __func__, sig, job->pgid, + job->jobid, job->stepid); + pthread_mutex_unlock(&suspend_mutex); + goto done; + } + /* * Signal the container */ diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c index f43d2d770..2f7a5ba66 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.c +++ b/src/slurmd/slurmstepd/slurmstepd_job.c @@ -484,6 +484,7 @@ batch_stepd_step_rec_create(batch_job_launch_msg_t *msg) job->job_core_spec = msg->job_core_spec; job->batch = true; + job->node_name = xstrdup(conf->node_name); /* This needs to happen before acct_gather_profile_startpoll and only really looks at the profile in the job. 
*/ @@ -495,7 +496,6 @@ batch_stepd_step_rec_create(batch_job_launch_msg_t *msg) job->multi_prog = 0; job->open_mode = msg->open_mode; job->overcommit = (bool) msg->overcommit; - job->node_name = xstrdup(conf->node_name); job->uid = (uid_t) msg->uid; job->user_name = xstrdup(msg->user_name); diff --git a/src/srun/libsrun/launch.c b/src/srun/libsrun/launch.c index 45005fc3c..e94f3e568 100644 --- a/src/srun/libsrun/launch.c +++ b/src/srun/libsrun/launch.c @@ -243,6 +243,8 @@ extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus, case SLURM_DIST_CYCLIC_CFULL: case SLURM_DIST_BLOCK_CFULL: job->ctx_params.task_dist = opt.distribution; + if (opt.ntasks_per_node != NO_VAL) + job->ctx_params.plane_size = opt.ntasks_per_node; break; case SLURM_DIST_PLANE: job->ctx_params.task_dist = SLURM_DIST_PLANE; @@ -252,6 +254,8 @@ extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus, job->ctx_params.task_dist = (job->ctx_params.task_count <= job->ctx_params.min_nodes) ? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK; + if (opt.ntasks_per_node != NO_VAL) + job->ctx_params.plane_size = opt.ntasks_per_node; opt.distribution = job->ctx_params.task_dist; break; diff --git a/src/srun/libsrun/srun_job.c b/src/srun/libsrun/srun_job.c index ca33725d6..53b7fcc7d 100644 --- a/src/srun/libsrun/srun_job.c +++ b/src/srun/libsrun/srun_job.c @@ -101,6 +101,7 @@ typedef struct allocation_info { static int shepard_fd = -1; static pthread_t signal_thread = (pthread_t) 0; +static int pty_sigarray[] = { SIGWINCH, 0 }; /* * Prototypes: @@ -414,11 +415,12 @@ extern void init_srun(int ac, char **av, bool handle_signals) { /* This must happen before we spawn any threads - * which are not designed to handle them */ + * which are not designed to handle arbitrary signals */ if (handle_signals) { if (xsignal_block(sig_array) < 0) error("Unable to block signals"); } + xsignal_block(pty_sigarray); /* Initialize plugin stack, read options from plugins, etc. */ @@ -676,6 +678,8 @@ cleanup: if (WIFEXITED(*global_rc)) *global_rc = WEXITSTATUS(*global_rc); + else if (WIFSIGNALED(*global_rc)) + *global_rc = 128 + WTERMSIG(*global_rc); mpir_cleanup(); log_fini(); @@ -867,9 +871,19 @@ _job_create_structure(allocation_info_t *ainfo) job->jobid = ainfo->jobid; job->ntasks = opt.ntasks; - for (i=0; i<ainfo->num_cpu_groups; i++) { - job->cpu_count += ainfo->cpus_per_node[i] * - ainfo->cpu_count_reps[i]; + + /* If cpus_per_task is set then get the exact count of cpus + for the requested step (we might very well use less, + especially if --exclusive is used). Else get the total for the + allocation given. 
+ */ + if (opt.cpus_set) + job->cpu_count = opt.ntasks * opt.cpus_per_task; + else { + for (i=0; i<ainfo->num_cpu_groups; i++) { + job->cpu_count += ainfo->cpus_per_node[i] * + ainfo->cpu_count_reps[i]; + } } job->rc = -1; diff --git a/src/srun/libsrun/srun_job.h b/src/srun/libsrun/srun_job.h index a88982130..c3c6aea5a 100644 --- a/src/srun/libsrun/srun_job.h +++ b/src/srun/libsrun/srun_job.h @@ -110,8 +110,8 @@ typedef struct srun_job { pthread_t pty_id; /* pthread to communicate window size changes */ int pty_fd; /* file to communicate window size changes */ uint16_t pty_port; /* used to communicate window size changes */ - uint8_t ws_col; /* window size, columns */ - uint8_t ws_row; /* window size, row count */ + uint16_t ws_col; /* window size, columns */ + uint16_t ws_row; /* window size, row count */ slurm_step_ctx_t *step_ctx; slurm_step_ctx_params_t ctx_params; } srun_job_t; diff --git a/src/srun/srun_pty.c b/src/srun/srun_pty.c index 7c3602d59..ccc1aee9e 100644 --- a/src/srun/srun_pty.c +++ b/src/srun/srun_pty.c @@ -96,7 +96,7 @@ int set_winsize(srun_job_t *job) return 0; } -/* SIGWINCH should already be blocked by srun/signal.c */ +/* SIGWINCH should already be blocked by srun/libsrun/srun_job.c */ void block_sigwinch(void) { xsignal_block(pty_sigarray); @@ -178,5 +178,3 @@ static void *_pty_thread(void *arg) } return NULL; } - - diff --git a/testsuite/expect/globals b/testsuite/expect/globals index ec588fad1..965ac0d1c 100755 --- a/testsuite/expect/globals +++ b/testsuite/expect/globals @@ -2539,6 +2539,40 @@ proc get_node_cnt_in_part { partition } { return $node_cnt } +################################################################ +# +# Proc: get_idle_node_in_part +# +# Purpose: Get an idle node in a given partition +# +# Returns name of node in a partition or "" if unknown +# +################################################################ + +proc get_idle_node_in_part { partition } { + global sinfo alpha_numeric_under + + log_user 0 + set node_name "" + set scon_pid [spawn -noecho $sinfo -oNAME=%n -h -p$partition --state=idle] + expect { + -re "not found" { + send_user "\nFAILURE: partition $partition doesn't exist\n" + } + -re "NAME=($alpha_numeric_under)" { + set node_name $expect_out(1,string) + } + timeout { + send_user "\nFAILURE: scontrol not responding\n" + } + eof { + } + } + log_user 1 + + return $node_name +} + ################################################################ # diff --git a/testsuite/expect/test17.34 b/testsuite/expect/test17.34 index d2f52f663..20873f0c0 100755 --- a/testsuite/expect/test17.34 +++ b/testsuite/expect/test17.34 @@ -152,6 +152,12 @@ proc core_spec_job {task node core_spec exp_nodes} { print_header $test_id +set select_type [test_select_type] +if {![string compare $select_type "linear"]} { + send_user "\nWARNING: This test is incompatible with select/$select_type\n" + exit 0 +} + # Remove any vestigial files exec $bin_rm -f $file_in $file_out $spec_in @@ -228,7 +234,7 @@ expect { wait } } -set $core_cnt [expr $core_cnt * $socket_cnt] +set core_cnt [expr $core_cnt * $socket_cnt] if {$core_cnt == 0} { send_user "\nFAILURE: sbatch did not find the number of cores\n" exit 1 @@ -241,6 +247,7 @@ if {$core_cnt < 4} { # # Using the core spec within the node limits # +send_user "\n\nRun within the specified node\n" core_spec_job 0 $first_node [expr $core_cnt - 2] 0 core_spec_job -2 $first_node [expr $core_cnt - 2] 0 @@ -248,12 +255,14 @@ core_spec_job -2 $first_node [expr $core_cnt - 2] 0 # Using core spec with more tasks then the node can 
handle. This should # cause the tasks to spread accross mutliple nodes as needed # +send_user "\n\nSpread job across multiple nodes\n" core_spec_job 1 $first_node [expr $core_cnt - 2] 1 core_spec_job 1 $first_node [expr $core_cnt - 1] 1 # # Using core spec with more cores then the specified node has # +send_user "\n\nFail by trying to use more cores than exist\n" core_spec_job 1 $first_node [expr $core_cnt + 5] -1 core_spec_job 1 $first_node [expr $core_cnt + 7] -1 diff --git a/testsuite/expect/test2.18 b/testsuite/expect/test2.18 index 48b63060c..76861c523 100755 --- a/testsuite/expect/test2.18 +++ b/testsuite/expect/test2.18 @@ -34,7 +34,7 @@ source ./globals set test_id "2.18" set user_name "" set node_name "" -set host_name "" +set cluster_name "" set acct_good "test${test_id}_acct_good" set acct_bad "test${test_id}_acct_bad" set part_name "test${test_id}_part" @@ -73,18 +73,15 @@ proc set_part_val {part_type part_val} { } } -proc delete_part { } { - global scontrol sacctmgr part_name acct_good acct_bad exit_code +proc cleanup { } { + global scancel scontrol sacctmgr part_name acct_good acct_bad exit_code set del_part 0 - spawn $sacctmgr -i delete account $acct_good $acct_bad + + spawn $scancel -p $part_name expect { - -re "Deleting accounts" { - set del_part 1 - exp_continue - } timeout { - send_user "\nFAILURE: sacctmgr is not responding\n" + send_user "FAILURE: scancel is not responding\n" set exit_code 1 } eof { @@ -92,6 +89,8 @@ proc delete_part { } { } } + send_user "Any error, except for unresponsiveness, from the previous scancel is expected and should be ignored.\n" + spawn $scontrol delete partition=$part_name expect { -re "error" { @@ -109,16 +108,30 @@ proc delete_part { } { } } - return $del_part + spawn $sacctmgr -i delete account $acct_good $acct_bad + expect { + -re "Deleting accounts" { + set del_part 1 + exp_continue + } + timeout { + send_user "\nFAILURE: sacctmgr is not responding\n" + set exit_code 1 + } + eof { + wait + } + } + return $del_part } proc create_acct { acct } { - global sacctmgr exit_code user_name + global sacctmgr exit_code user_name cluster_name set create_acct 0 - spawn $sacctmgr -i create account $acct + spawn $sacctmgr -i create account $acct cluster=$cluster_name expect { -re "Adding Account" { set create_acct 1 @@ -133,7 +146,7 @@ proc create_acct { acct } { } } - spawn $sacctmgr -i create user $user_name account=$acct + spawn $sacctmgr -i create user $user_name account=$acct cluster=$cluster_name expect { timeout { send_user "\nFAILURE: sacctmgr is not responding\n" @@ -154,19 +167,14 @@ proc create_acct { acct } { proc test_part { acct part acct_con } { - global srun host_name exit_code + global srun exit_code set sub_job 0 - spawn $srun -I -A $acct -p $part hostname + spawn $srun -I -A $acct -p $part true expect { - -re "$host_name" { - set sub_job 1 - exp_continue - } -re "error" { - set sub_job 2 - if { $acct_con == 1 && $sub_job == 2} { + if { $acct_con == 1 } { send_user "\nThis error is expected\n" } else { send_user "\nFAILURE: This error should not have occured\n" @@ -187,7 +195,7 @@ proc test_part { acct part acct_con } { } # Remove any vestigial accounts or partitions -delete_part +cleanup spawn $bin_id -un expect { @@ -204,56 +212,24 @@ expect { } } -spawn hostname -expect { +set node_name [ get_idle_node_in_part $partition ] +set cluster_name [ get_cluster_name ] - -re "($alpha_numeric_under)" { - set host_name $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: hostname is not responding\n" - set 
exit_code 1 - } - eof { - wait - } -} +# NOTE: acct_good should always work and +# acct_bad should always cause an error +# +# Create good account +# +create_acct $acct_good -spawn $scontrol show node -expect { - -re "NodeName=($alpha_numeric_under)" { - set node_name $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol is not responding\n" - set exit_code 1 - } - eof { - wait - } -} +# +# Create bad account +# +create_acct $acct_bad # Create partition -spawn $scontrol create partition=$part_name -expect { - -re "error" { - send_user "\nFAILURE: partition was not created\n" - set exit_code 1 - } - timeout { - send_user "\nFAILURE: scontrol is not reponding\n" - set exit_code 1 - } - eof { - wait - } -} - -# Add nodes to partition -spawn $scontrol update partition=$part_name nodes=$node_name +spawn $scontrol create partition=$part_name nodes=$node_name expect { -re "error" { send_user "\nFAILURE: partition was not created\n" @@ -268,19 +244,6 @@ expect { } } -# NOTE: acct_good should always work and -# acct_bad should always cause an error - -# -# Create good account -# -create_acct $acct_good - -# -# Create bad account -# -create_acct $acct_bad - # # Set Allow Account to good values # @@ -325,9 +288,9 @@ test_part $acct_good $part_name 0 test_part $acct_bad $part_name 1 -sleep 2 +sleep 5 # Delete partition and accounts -if {[delete_part] != 1} { +if {[cleanup] != 1} { send_user "\nFAILURE: Account was not deleted\n" set exit_code 1 } diff --git a/testsuite/expect/test2.19 b/testsuite/expect/test2.19 index 56516a643..174117d82 100755 --- a/testsuite/expect/test2.19 +++ b/testsuite/expect/test2.19 @@ -35,6 +35,7 @@ set test_id "2.19" set user_name "" set node_name "" set host_name "" +set cluster_name "" set acct1 "test${test_id}_acct_1" set acct2 "test${test_id}_acct_2" set qos_good "test${test_id}_qos_good" @@ -75,8 +76,21 @@ proc set_part_val {part_type part_val} { } } -proc delete_part { } { - global scontrol sacctmgr part_name qos_good qos_bad acct1 acct2 exit_code +proc cleanup { } { + global scancel scontrol sacctmgr part_name qos_good qos_bad acct1 acct2 exit_code + + spawn $scancel -p $part_name + expect { + timeout { + send_user "FAILURE: scancel is not responding\n" + set exit_code 1 + } + eof { + wait + } + } + + send_user "Any error, except for unresponsiveness, from the previous scancel is expected and should be ignored.\n" spawn $scontrol delete partition=$part_name expect { @@ -131,7 +145,7 @@ proc delete_part { } { } proc create_qos { acct qos } { - global sacctmgr user_name exit_code + global sacctmgr user_name exit_code cluster_name set create_qos 0 spawn $sacctmgr -i create qos $qos @@ -150,7 +164,7 @@ proc create_qos { acct qos } { } - spawn $sacctmgr -i create account $acct qos=$qos + spawn $sacctmgr -i create account $acct qos=$qos cluster=$cluster_name expect { -re "Adding Account" { incr create_qos @@ -166,7 +180,7 @@ proc create_qos { acct qos } { } set create_acct 0 - spawn $sacctmgr -i create user $user_name account=$acct + spawn $sacctmgr -i create user $user_name account=$acct cluster=$cluster_name expect { timeout { send_user "\nFAILURE: sacctmgr is not responding\n" @@ -185,18 +199,13 @@ proc create_qos { acct qos } { proc test_part {acct qos part qos_con } { - global srun host_name part_name exit_code + global srun part_name exit_code set sub_job 0 - spawn $srun -I -A $acct --qos $qos -p $part hostname + spawn $srun -I -A $acct --qos $qos -p $part true expect { - -re "$host_name" { - set sub_job 1 - exp_continue - } -re 
"error" { - set sub_job 2 - if { $qos_con == 1 && $sub_job == 2} { + if { $qos_con == 1 } { send_user "\nThis error is expected\n" } else { send_user "\nFAILURE: This error should not have occured\n" @@ -215,7 +224,7 @@ proc test_part {acct qos part qos_con } { } # Delete any vestigial qos or accounts -delete_part +cleanup spawn $bin_id -un expect { @@ -232,56 +241,25 @@ expect { } } -spawn hostname -expect { +set node_name [ get_idle_node_in_part $partition ] +set cluster_name [ get_cluster_name ] - -re "($alpha_numeric_under)" { - set host_name $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: hostname is not responding\n" - set exit_code 1 - } - eof { - wait - } -} +# NOTE: qos_good should always work and +# qos_bad should always cause an error +# +# Create good QOS +# +create_qos $acct1 $qos_good -spawn $scontrol show node -expect { - -re "NodeName=($alpha_numeric_under)" { - set node_name $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol is not responding\n" - set exit_code 1 - } - eof { - wait - } -} +# +# Create bad QOS +# +create_qos $acct2 $qos_bad -# Create partition -spawn $scontrol create partition=$part_name -expect { - -re "error" { - send_user "\nFAILURE: partition was not created\n" - set exit_code 1 - } - timeout { - send_user "\nFAILURE: scontrol is not reponding\n" - set exit_code 1 - } - eof { - wait - } -} -# Add nodes to partition -spawn $scontrol update partition=$part_name nodes=$node_name +# Create partition +spawn $scontrol create partition=$part_name nodes=$node_name expect { -re "error" { send_user "\nFAILURE: partition was not created\n" @@ -296,19 +274,6 @@ expect { } } -# NOTE: qos_good should always work and -# qos_bad should always cause an error - -# -# Create good QOS -# -create_qos $acct1 $qos_good - -# -# Create bad QOS -# -create_qos $acct2 $qos_bad - # # Set Allow Qos to good value # @@ -351,7 +316,8 @@ test_part $acct1 $qos_good $part_name 0 # test_part $acct2 $qos_bad $part_name 1 -if {[delete_part] != 2} { +sleep 5 +if {[cleanup] != 2} { send_user "\nFAILURE: Qos/account was not deleted\n" set exit_code 1 } diff --git a/testsuite/expect/test2.21 b/testsuite/expect/test2.21 index 8b9f8b1ef..e5f804285 100755 --- a/testsuite/expect/test2.21 +++ b/testsuite/expect/test2.21 @@ -39,6 +39,12 @@ set exit_code 0 print_header $test_id +set min_age [get_min_job_age] +if {$min_age < 10} { + send_user "\nWARNING: MinJobAge too low for this test ($min_age < 10)\n" + exit 0 +} + # Remove any vestigial scripts exec $bin_rm -f $complete_script $fail_script diff --git a/testsuite/expect/test2.22 b/testsuite/expect/test2.22 index cd77d0897..46e19d147 100755 --- a/testsuite/expect/test2.22 +++ b/testsuite/expect/test2.22 @@ -38,6 +38,12 @@ set exit_code 0 print_header $test_id +set min_age [get_min_job_age] +if {$min_age < 10} { + send_user "\nWARNING: MinJobAge too low for this test ($min_age < 10)\n" + exit 0 +} + # Remove any vestigial scripts exec $bin_rm -f $script @@ -55,7 +61,7 @@ proc check_hold { job } { exp_continue } timeout { - send_user "\nFAILURE scontrol is not responding\n" + send_user "\nFAILURE: scontrol is not responding\n" set exit_code 1 } eof { @@ -64,7 +70,7 @@ proc check_hold { job } { } if { $hold != 1 } { - send_user "\nFAILURE scontrol did not hold job after it was requeued\n" + send_user "\nFAILURE: scontrol did not hold job after it was requeued\n" set exit_code 1 } } @@ -128,8 +134,8 @@ expect { exp_continue } timeout { - send_user "\nFAILURE sbatch is not responding\n" - set 
exit_code 1 + send_user "\nFAILURE: sbatch is not responding\n" + set exit_code 1get_min_job_age } eof { wait @@ -146,7 +152,7 @@ wait_for_job $job_id DONE spawn $scontrol requeuehold $job_id expect { timeout { - send_user "\nFAILURE scontrol is not responding\n" + send_user "\nFAILURE: scontrol is not responding\n" set exit_code 1 } eof { diff --git a/testsuite/expect/test2.23 b/testsuite/expect/test2.23 index 1e79857c3..877d766c1 100755 --- a/testsuite/expect/test2.23 +++ b/testsuite/expect/test2.23 @@ -37,6 +37,12 @@ set exit_code 0 print_header $test_id +set min_age [get_min_job_age] +if {$min_age < 10} { + send_user "\nWARNING: MinJobAge too low for this test ($min_age < 10)\n" + exit 0 +} + # Remove any vestigial files exec $bin_rm -f $script diff --git a/testsuite/expect/test21.30 b/testsuite/expect/test21.30 index 733a85556..333848d80 100755 --- a/testsuite/expect/test21.30 +++ b/testsuite/expect/test21.30 @@ -52,7 +52,7 @@ set exit_code 0 set acct test_acct set user_name "" set qosname name -set qostest [format "%s %s" $test_id "qosTest"] +set qostest [format "%s_%s" $test_id "qosTest"] set grn GrpNodes set grn_num 2 set grcpu GrpCpus @@ -207,6 +207,15 @@ if { [test_limits_enforced] == 0 } { exit 0 } +# +# Some tests will not work properly when allocating whole nodes to jobs +# +set select_type [test_select_type] +if {![string compare $select_type "linear"]} { + send_user "\nWARNING: This test is incompatible with select/$select_type\n" + exit 0 +} + # Remove any vesitgial accounts or qos spawn $sacctmgr -i delete qos $qostest expect { diff --git a/testsuite/expect/test3.11 b/testsuite/expect/test3.11 index 57b916dd2..f82e145f2 100755 --- a/testsuite/expect/test3.11 +++ b/testsuite/expect/test3.11 @@ -359,22 +359,15 @@ expect { set cons_res_actived 0 if {$def_share_force == 0} { - spawn $scontrol show config - expect { - -re "select/cons_res" { - set cons_res_actived 1 - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait - } + set select_type [test_select_type] + if {![string compare $select_type "cons_res"]} { + set cons_res_actived 1 } } -inc3_11_1 +if {$cons_res_actived == 1} { + inc3_11_1 +} inc3_11_2 inc3_11_3 inc3_11_4 diff --git a/testsuite/expect/test4.5 b/testsuite/expect/test4.5 index a663a54a8..33260b78e 100755 --- a/testsuite/expect/test4.5 +++ b/testsuite/expect/test4.5 @@ -42,12 +42,14 @@ set node_name "" set mismatches 0 print_header $test_id - +if (![string compare $partition ""]) { + set partition [default_partition] +} # # Check the sinfo long format looking for filtering options # -spawn $sinfo --Node --long --exact +spawn $sinfo --Node --long --exact -p$partition expect { -re "($end_of_line)($name_string) *($number_with_suffix) *($name_string) *($alpha)" { if (![string compare $node_name ""]) { @@ -75,7 +77,7 @@ expect { # Use sinfo state filter # -spawn $sinfo --Node --long --exact --state=$node_state +spawn $sinfo --Node --long --exact --state=$node_state -p$partition expect { -re "($end_of_line)($name_string) *($number_with_suffix) *($name_string) *($alpha)" { if ([string compare $expect_out(5,string) $node_state]) { @@ -104,7 +106,7 @@ expect { # Use sinfo node name filter # -spawn $sinfo --Node --long --exact --nodes=$node_name +spawn $sinfo --Node --long --exact --nodes=$node_name -p$partition expect { -re "($end_of_line)($name_string) *($number_with_suffix) *($name_string) *($alpha)" { if ([string compare $expect_out(2,string) $node_name]) { diff --git a/testsuite/slurm_unit/common/bitstring-test.c 
b/testsuite/slurm_unit/common/bitstring-test.c index 8cf8e32f9..3b7b921dd 100644 --- a/testsuite/slurm_unit/common/bitstring-test.c +++ b/testsuite/slurm_unit/common/bitstring-test.c @@ -5,6 +5,12 @@ #include <sys/time.h> #include <testsuite/dejagnu.h> +/* Copied from src/common/bitstring.c */ +#define _bitstr_words(nbits) \ + ((((nbits) + BITSTR_MAXPOS) >> BITSTR_SHIFT) + BITSTR_OVERHEAD) +#define bit_decl(name, nbits) \ + (name)[_bitstr_words(nbits)] = { BITSTR_MAGIC_STACK, (nbits) } + /* Test for failure: */ #define TEST(_tst, _msg) do { \ diff --git a/testsuite/slurm_unit/common/pack-test.c b/testsuite/slurm_unit/common/pack-test.c index 82ac5172d..9b6ce7c41 100644 --- a/testsuite/slurm_unit/common/pack-test.c +++ b/testsuite/slurm_unit/common/pack-test.c @@ -69,7 +69,7 @@ int main (int argc, char *argv[]) unpack64(&test64, buffer); test_double2 = (long double)test64; - TEST(test64 != (uint64_t)test_double, "un/pack double as a uint64"); + TEST((uint64_t)test_double2 != (uint64_t)test_double, "un/pack double as a uint64"); /* info("Original\t %Lf", test_double); */ /* info("uint64\t %ld", test64); */ /* info("converted LD\t %Lf", test_double2); */ -- GitLab
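
Note on the srun_job.c hunk above: the added WIFSIGNALED branch follows the usual POSIX/shell convention of reporting 128 plus the signal number when a child process is killed by a signal. A minimal standalone sketch of that convention, not Slurm code — the sample program and the "sleep 30" child command are arbitrary illustrations:

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status = 0, rc = 0;
	pid_t pid = fork();

	if (pid == -1) {
		perror("fork");
		return 1;
	}
	if (pid == 0) {
		/* Child: run an arbitrary command for illustration. */
		execlp("sleep", "sleep", "30", (char *) NULL);
		_exit(127);	/* exec failed */
	}
	if (waitpid(pid, &status, 0) == -1) {
		perror("waitpid");
		return 1;
	}
	if (WIFEXITED(status))
		rc = WEXITSTATUS(status);	/* normal exit status */
	else if (WIFSIGNALED(status))
		rc = 128 + WTERMSIG(status);	/* e.g. 143 for SIGTERM */
	printf("reported exit code: %d\n", rc);
	return rc;
}

If the child above is terminated with "kill -TERM <pid>", the program reports 143 (128 + 15), which is the same value srun now propagates for a step whose tasks die on a signal.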