diff --git a/META b/META index 32989c2ff9077cef6b9fb2d8073579f67b682330..ec84a2650d3503ead47a053f932c5f808ab5a768 100644 --- a/META +++ b/META @@ -9,8 +9,8 @@ Name: slurm Major: 14 Minor: 03 - Micro: 4 - Version: 14.03.4 + Micro: 5 + Version: 14.03.5 Release: 1 ##
diff --git a/NEWS b/NEWS index 56540290d6ea463f5ccb46a488bcf42dd472e04d..be4c259fd5f4d5e8d5826100ec2667bfdec5984f 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,89 @@ This file describes changes in recent versions of Slurm. It primarily documents those changes that are of interest to users and admins.
+* Changes in Slurm 14.03.5 +==========================
+ -- If srun runs in an exclusive allocation, doesn't use the entire + allocation, and CR_PACK_NODES is set, lay out tasks appropriately.
+ -- Correct Shared field in job state information seen by scontrol, sview, etc.
+ -- Print Slurm error string in scontrol update job and reset the Slurm errno + before each call to the API.
+ -- Fix task/cgroup to handle -mblock:fcyclic correctly.
+ -- Fix for core-based advanced reservations where the distribution of cores + across nodes is not even.
+ -- Fix issue where association maxnodes wouldn't be evaluated correctly if a + QOS had a GrpNodes set.
+ -- GRES fix with multiple files defined per line in gres.conf.
+ -- When a job is requeued, make sure accounting marks it as such.
+ -- Print the state of a requeued job as REQUEUED.
+ -- If a job's partition was taken away from it, don't allow a requeue.
+ -- Make sure we lock on the conf when sending slurmd's conf to the slurmstepd.
+ -- Fix issue with sacctmgr 'load' not able to gracefully handle a badly + formatted file.
+ -- sched/backfill: Correct job start time estimate with advanced reservations.
+ -- Added error message for when the proctrack/cgroup step freezer path can't + be destroyed, to aid debugging.
+ -- Added extra indexes to the database for better performance when + deleting users.
+ -- Fix issue where, when tracking wckeys but not enforcing them, + you could get multiple '*' wckeys.
+ -- Fix bug which could report to squeue the wrong partition for a running job + that is submitted to multiple partitions.
+ -- Report correct CPU count allocated to a job when allocated a whole node, + even if not using all CPUs.
+ -- If a job's constraints cannot be satisfied, put it in pending state with + reason BadConstraints and don't remove it.
+ -- sched/backfill - If a job started with an infinite time limit, set its + end_time one year in the future.
+ -- Clear record of a job's GRES when requeued.
+ -- Clear QOS GrpUsedCPUs when resetting raw usage if the QOS is not using any + CPUs.
+ -- Remove log message left over from debugging.
+ -- When using CR_PACK_NODES, make --ntasks-per-node work correctly.
+ -- Report correct partition associated with a step if the job is submitted to + multiple partitions.
+ -- Fix to allow removal of preemption from a QOS.
+ -- If the proctrack plugins fail to destroy the job container, print an error + message and avoid looping forever; give up after 120 seconds.
+ -- Make srun obey POSIX convention and increase the exit code by 128 when the + process is terminated by a signal.
+ -- Sanity check for acct_gather_energy/rapl.
+ -- If the sbatch command specifies the option --signal=B:signum, send the + signal to the batch script only.
+ -- If we cancel a task and we have no other exit code, send the signal and + exit code.
+ -- Added note about the InnoDB storage engine being used with MySQL.
+ -- Set the job exit code when the job is signaled and set the log level to + debug2() when processing an already completed job.
+ -- Reset diagnostics time stamp when "sdiag --reset" is called.
+ -- squeue and scontrol now report a job's "shared" value based upon partition + options rather than reporting "unknown" if job submission does not use the + --exclusive or --shared option.
+ -- task/cgroup - Fix cpuset binding for batch script.
+ -- sched/backfill - Fix anomaly that could result in jobs being scheduled out + of order.
+ -- Expand pseudo-terminal size data structure field sizes from 8 to 16 bits.
+ -- Distinguish between two identical error messages.
+ -- If using accounting_storage/mysql directly without a DBD, fix issue with + the start of requeued jobs.
+ -- If a job fails because of batch node failure, is requeued, and an epilog + complete message comes from that node, do not process the batch step + information, since the job has already been requeued (the epilog script + running isn't guaranteed in this situation).
+ -- Change message to note that a NO_VAL return code could have come from node + failure as well as an interactive user.
+ -- Modify test4.5 to only look at one partition instead of all of them.
+ -- Fix sh5util -u to accept a username different from the user that runs the + command.
+ -- Corrections to man pages: salloc.1 sbatch.1 srun.1 nonstop.conf.5 + slurm.conf.5.
+ -- Restore srun --pty resize ability.
+ -- Have sacctmgr dump cluster handle situations where users or such have + special characters in their names, like ':'.
+ * Changes in Slurm 14.03.4 ========================== -- Fix issue where not enforcing QOS but a partition either allows or denies @@ -39,7 +122,7 @@ documents those changes that are of interest to users and admins. -- Keep supporting 'srun -N x --pty bash' for historical reasons. -- If EnforcePartLimits=Yes and QOS job is using can override limits, allow it. - -- Fix issues if partition allows or denys account's or QOS' and either are + -- Fix issues if partition allows or denies account's or QOS' and either are not set. -- If a job requests a partition and it doesn't allow a QOS or account the job is requesting pend unless EnforcePartLimits=Yes. Before it would @@ -89,8 +172,8 @@ documents those changes that are of interest to users and admins. is already running. -- Email messages for job array events print now use the job ID using the format "#_# (#)" rather than just the internal job ID. - -- Set the number of free licenses to be 0 if the global license count decreases - and total is less than in use. + -- Set the number of free licenses to be 0 if the global license count + decreases and total is less than in use. -- Add DebugFlag of BackfillMap. Previously a DebugFlag value of Backfill logged information about what it was doing plus a map of expected resouce use in the future. Now that very verbose resource use map is only logged @@ -104,6 +187,13 @@ documents those changes that are of interest to users and admins. jobs. -- For "scontrol --details show job" report the correct CPU_IDs when thre are multiple threads per core (we are translating a core bitmap to CPU IDs).
+ -- If DebugFlags=Protocol is configured in slurm.conf, print details of the + connection (IP address and port) accepted by the controller.
+ -- Fix minor memory leak when reading in incomplete node data checkpoint file. + -- Enlarge the width specifier when printing partition SHARE to display larger + sharing values. + -- sinfo locks added to prevent possibly duplicate record printing for + resources in multiple partitions. * Changes in Slurm 14.03.3-2 ============================ @@ -554,6 +644,8 @@ documents those changes that are of interest to users and admins. -- Properly enforce job --requeue and --norequeue options. -- If a job --mem-per-cpu limit exceeds the partition or system limit, then scale the job's memory limit and CPUs per task to satisfy the limit. + -- Correct logic to support Power7 processor with 1 or 2 threads per core + (CPU IDs are not consecutive). * Changes in Slurm 2.6.9 ======================== diff --git a/auxdir/slurm.m4 b/auxdir/slurm.m4 index 0f9d71f4490615650cb6bfc0f1438006d65b5e01..62fbce28d4285396b147bc7e170c0a1798319f79 100644 --- a/auxdir/slurm.m4 +++ b/auxdir/slurm.m4 @@ -66,9 +66,9 @@ AC_DEFUN([X_AC_SLURM_PORTS], [Define the default port count for slurmctld]) AC_SUBST(SLURMCTLD_PORT_COUNT) - AC_MSG_CHECKING([for dynamic allocation port to be enabled]) + AC_MSG_CHECKING([for dynamic allocation port to be enabled for Hadoop]) AC_ARG_ENABLE([dynamic-allocation], - AS_HELP_STRING([--enable-dynamic-allocation, enable dynamic allocation requests from user programs ([disabled])])) + AS_HELP_STRING([--enable-dynamic-allocation, enable dynamic allocation requests from user programs for Hadoop ([disabled])])) if test "$enable_dynamic_allocation" = "yes"; then AC_MSG_RESULT([yes]) slurm_enable_dynamic_allocation="yes" diff --git a/configure b/configure index 5aa00cff7d9f8ae8f51e6b557745d63326c07eca..7ecb6a4b20e031503b28983f0c12833515f7979a 100755 --- a/configure +++ b/configure @@ -1700,7 +1700,7 @@ Optional Features: --disable-salloc-background disable salloc execution in the background --enable-simulator enable slurm simulator - --enable-dynamic-allocation, enable dynamic allocation requests from user programs (disabled) + --enable-dynamic-allocation, enable dynamic allocation requests from user programs for Hadoop (disabled) --enable-multiple-slurmd enable multiple-slurmd support @@ -22727,8 +22727,8 @@ _ACEOF - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dynamic allocation port to be enabled" >&5 -$as_echo_n "checking for dynamic allocation port to be enabled... " >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dynamic allocation port to be enabled for Hadoop" >&5 +$as_echo_n "checking for dynamic allocation port to be enabled for Hadoop... " >&6; } # Check whether --enable-dynamic-allocation was given. if test "${enable_dynamic_allocation+set}" = set; then : enableval=$enable_dynamic_allocation; diff --git a/doc/html/accounting.shtml b/doc/html/accounting.shtml index 25c6b7d8a31437bdabc9713f16da1d1853976387..f5389a960f42293a41cd2b456850dbc29aa0b457 100644 --- a/doc/html/accounting.shtml +++ b/doc/html/accounting.shtml @@ -22,7 +22,7 @@ these plugins include:</p> <li><b>AccountingStorageType</b> controls how detailed job and job step information is recorded. You can store this information in a text file, <a href="http://www.mysql.com/">MySQL</a> -or MariaDB database, optionally using SlurmDBD for added security.</li> +or MariaDB database (using the InnoDB storage engine), optionally using SlurmDBD for added security.</li> <li><b>JobAcctGatherType</b> is operating system dependent and controls what mechanism is used to collect accounting information. 
Supported values are <i>jobacct_gather/aix</i>, <i>jobacct_gather/linux</i> @@ -188,8 +188,11 @@ a configuration. <p><b>MySQL or MariaDB is the preferred database.</b> To enable this database support one only needs to have the development package for the database they -wish to use on the system. The slurm configure script uses -mysql_config and pg-config to find out the information it needs +wish to use on the system. <b>Slurm uses the InnoDB storage +engine in MySQL to make rollback possible. This must be available on your +MySQL installation or rollback will not work.</b> +</p><p>The slurm configure +script uses mysql_config to find out the information it needs about installed libraries and headers. You can specify where your mysql_config script is with the </i>--with-mysql_conf=/path/to/mysql_config</i> option when configuring your @@ -442,6 +445,16 @@ mysql> grant all on slurm_acct_db.* TO 'slurm'@'system0' where 'system0' is the localhost or database storage host. </pre> +<p>Verify you have InnoDB support</p> +<pre> +mysql> SHOW VARIABLES LIKE 'have_innodb'; ++---------------+-------+ +| Variable_name | Value | ++---------------+-------+ +| have_innodb | YES | ++---------------+-------+ +</pre> + <p>Then create the database:</p> <pre> mysql> create database slurm_acct_db; diff --git a/doc/html/accounting_storageplugins.shtml b/doc/html/accounting_storageplugins.shtml index a6180363be4a23d97d1b7db05caf2e16295adaff..0452e3075ec81caef691fbe67eb7f166efa29fe2 100644 --- a/doc/html/accounting_storageplugins.shtml +++ b/doc/html/accounting_storageplugins.shtml @@ -23,7 +23,8 @@ The minor type can be any suitable name for the type of accounting package. We currently use <ul> <li><b>filetxt</b>—Information written to a text file. -<li><b>mysql</b>— Store information in a mysql database. +<li><b>mysql</b>— Store information in a mysql database (using + the InnoDB storage engine). <li><b>slurmdbd</b>— Send information to the Slurm Database Daemon (SlurmDBD). Extra configuration is needed and described <a href="accounting.html">here</a>. <li><b>none</b>— Information is not stored anywhere. 
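A quick way to double-check the storage engine, assuming the database is named slurm_acct_db as in the example above: on some newer MySQL and MariaDB releases the have_innodb variable is no longer present, so SHOW ENGINES is the more portable check, and the engine actually used by the accounting tables can be confirmed afterwards from information_schema. This is a supplemental sketch, not part of the documentation hunks above.
<pre>
mysql> SHOW ENGINES;
mysql> SELECT TABLE_NAME, ENGINE FROM information_schema.TABLES WHERE TABLE_SCHEMA='slurm_acct_db';
</pre>
If any slurm_acct_db table reports an engine other than InnoDB (e.g. MyISAM), the rollback behavior described above will not be available for it.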
diff --git a/doc/html/cray_alps.shtml b/doc/html/cray_alps.shtml index c071b6fbdc8394fa762a80e080b16cacf29546ac..10ac91ccf24cc1b21a5fd17722a096ca242d11f0 100644 --- a/doc/html/cray_alps.shtml +++ b/doc/html/cray_alps.shtml @@ -238,25 +238,9 @@ default: # rpm -qa <ul> <li>expat-2.0.xxx</li> <li>libexpat-devel-2.0.xxx</li> -<li>cray-MySQL-devel-enterprise-5.0.64 (this should be on the Cray ISO)</li> +<li>mysql-devel (this should be on the Cray ISO)</li> </ul> -<p>For example, loading MySQL can be done like this:</p> -<pre> -smw: # mkdir mnt -smw: # mount -o loop, ro xe-sles11sp1-trunk.201107070231a03.iso mnt -smw: # find mnt -name cray-MySQL-devel-enterprise\* -mnt/craydist/xt-packages/cray-MySQL-devel-enterprise-5.0.64.1.0000.2899.19.2.x86_64.rpm -smw: # scp mnt/craydist/xt-packages/cray-MySQL-devel-enterprise-5.0.64.1.0000.2899.19.2.x86_64 -</pre> - -<p>Then switch to boot node and run:</p> -<pre> -boot: # xtopview -default: # rpm -ivh /software/cray-MySQL-devel-enterprise-5.0.64.1.0000.2899.19.2.x86_64.rpm -default: # exit -</pre> - <p>All Cray-specific PrgEnv and compiler modules should be removed and root privileges will be required to install these files.</p> diff --git a/doc/html/elastic_computing.shtml b/doc/html/elastic_computing.shtml index d5a08f727ee9faf1f8bba5bcb3dd837ba2743a39..6291c6a5b351d6c0e4adf062ffd540d2d8edb60a 100644 --- a/doc/html/elastic_computing.shtml +++ b/doc/html/elastic_computing.shtml @@ -14,13 +14,6 @@ cluster. Good responsiveness and throughput can be achieved while you only pay for the resources needed.</p> -<p>The -<a href="http://web.mit.edu/star/cluster/docs/latest/index.html">StarCluster</a> -cloud computing toolkit has a -<a href="https://github.com/jlafon/StarCluster">SLURM port available</a>. -<a href="https://github.com/jlafon/StarCluster/wiki/Getting-started-with-SLURM-on-Amazon's-EC2"> -Instructions</a> for the SLURM port of StartCLuster are available online.</p> - <p>The rest of this document describes details about SLURM's infrastructure that can be used to support Elastic Computing.</p> diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index 6bd15524a1d3b77d81afc8ea405db7e12fe5196a..6986f6af6724ba04a777e4a6a151c47df5b84177 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -842,11 +842,11 @@ salloc: Relinquishing job allocation 65542 SLURM? Why does the DAKOTA program not run with SLURM?</b></a><br> The SLURM library used to support MPIHCH2 or MVAPICH2 references a variety of symbols. If those symbols resolve to functions or variables in your program -rather than the appropriate library, the application will fail. In the case of -<a href="http://dakota.sandia.gov">DAKOTA</a>, it contains a function named -<b>regcomp</b>, which will get used rather than the POSIX regex functions. -Rename DAKOTA's function and references from regcomp to something else to make -it work properly.</p> +rather than the appropriate library, the application will fail. For example +<a href="http://dakota.sandia.gov">DAKOTA</a>, versions 5.1 and +older, contains a function named regcomp, which will get used rather +than the POSIX regex functions. Rename DAKOTA's function and +references from regcomp to something else to make it work properly.</p> <p><a name="estimated_start_time"><b>26. 
Why does squeue (and "scontrol show jobid") sometimes not display a job's estimated start time?</b></a><br> diff --git a/doc/html/meetings.shtml b/doc/html/meetings.shtml index dc690923c9f7915f6f9a8c30c9894eb1c9ed5b7c..9d0900ec1c4ec9c1216ceac4b8b7dbb4cf1ed844 100644 --- a/doc/html/meetings.shtml +++ b/doc/html/meetings.shtml @@ -6,8 +6,8 @@ 23-24 September 2014<br> Lugano, Switzerland<br> Host: <a href="http://cscs.ch/">Swiss National Supercomputing Centre</a></p> -<a href="slurm_ug_cfp.html">Call for Abstracts: Due 6 June 2014</a><br> -<!--<a href="slurm_ug_agenda.html">Meeting agenda</a><br>--> +<!--<a href="slurm_ug_cfp.html">Call for Abstracts: Due 6 June 2014</a><br>--> +<a href="slurm_ug_agenda.html">Meeting agenda</a><br> <!--<a href="slurm_ug_registration.html">Registration information</a>--></p> <br> @@ -33,6 +33,6 @@ Host: Bull</p> Paris, France<br> Host: CEA</p> -<p style="text-align:center;">Last modified 31 March 2014</p> +<p style="text-align:center;">Last modified 3 July 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/slurm_ug_agenda.shtml b/doc/html/slurm_ug_agenda.shtml index 2bafc1d7c674bfa9594b676da83fe322a1982508..1097968b1a118e09f1ed0acc5389d643680af021 100644 --- a/doc/html/slurm_ug_agenda.shtml +++ b/doc/html/slurm_ug_agenda.shtml @@ -1,57 +1,34 @@ <!--#include virtual="header.txt"--> -<h1>Slurm User Group Meeting 2013</h1> +<h1>Slurm User Group Meeting 2014</h1> -<p>Hosted by <a href="http:///www.schedmd.com">SchedMD</a> +<p>Hosted by the <a href="http:///www.cscs.ch">Swiss National Supercomputing Centre</a> <h1>Agenda</h1> -<p>The 2013 SLURM User Group Meeting will be held on September 18 and 19 -in Oakland, California, USA. +<p>The 2014 SLURM User Group Meeting will be held on September 23 and 24 +in Lugano, Switzerland. The meeting will include an assortment of tutorials, technical presentations, and site reports. The <a href="#schedule">Schedule</a> amd <a href="#abstracts">Abstracts</a> are shown below.</p> <h2>Meeting Information</h2> -<p>The meeting will be held at -<a href="http://www.ce.csueastbay.edu/businessservices/conference_facilities/index.shtml"> -California State University's Conference Center</a>, -1000 Broadway Avenue, Suite 109, Oakland, California -(Phone 510-208-7001, access from 11th Street). -This state of the art facility is located adjacent to the 12th Street -<a href="http://www.bart.gov">BART</a> (Metro) station, with easy access to -the entire San Francisco area. -There is also frequent and free bus service to -<a href="http://www.jacklondonsquare.com">Jack London Square</a> using the -<a href="http://Bshuttle.com">Broadway Shuttle</a>. +<p>The meeting will be held at the +<a href="http://www.lugano-tourism.ch/en/129/default.aspx"> +Lugano Convention Centre</a>, Lugano, Switzerland. +More information will be made available later.</p> <h2>Hotel Information</h2> -<p>Many hotel options are available in Oakland, San Fransisco, and elsewhere in -the area. Just be sure that your hotel has easy access to BART. -Consider the hotels listed below as suggestions:</p> - -<p><a href="http://www.waterfronthoteloakland.com"><b>Waterfront Hotel</b></a><br> -Like it says in the name, on the waterfront, with several nice restaurants nearby. -About 1 mile (2 km) from the conference center via the -<a href="http://Bshuttle.com">Broadway Shuttle</a>. 
-Ferry service to San Fransisco adjacent to the hotel.</p> - -<p><a href="http://www.marriott.com/hotels/travel/oakdt-oakland-marriott-city-center/"> -<b>Oakland Marriott City Center</b></a><br> -Across the street from the conference center. -Discounted rooms are available to government employees.</p> +<p>Hotels may be bookded through the Lugano Convention Centre (Palazzo dei Congressi).<br> +<a href="https://www.aec-internet.it/booking_engine/prenota_congresso.htm?graph_be=4&n_tappe=1&headvar=ok&lingua_int=eng&id_stile=7434&id_congresso=54&id_canale=704">Hotel booking</a>. <h2>Registration</h2> -<p>The conference cost is $250 per person for registrations by 29 August and -$300 per person for late registration. -This includes presentations, tutorials, lunch and snacks on both days, -plus dinner on Wednesday evening.<br><br> -<a href="http://sug2013.eventbrite.com">Register here.</a></p> +<p>Information will be made available later.</p> <a name="schedule"><h1>Schedule</h1></a> -<h2>September 18, 2013</h2> +<h2>23 September 2014</h2> <table width="100%" border=1 cellspacing=0 cellpadding=0> @@ -63,541 +40,816 @@ plus dinner on Wednesday evening.<br><br> </tr> <tr> - <td width="15%" bgcolor="#F0F1C9">08:00 - 09:00</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Registration / Breakfast</td> + <td width="15%" bgcolor="#F0F1C9">08:00 - 08:30</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Registration </td> </tr> <tr> - <td width="15%">09:00 - 09:15</td> + <td width="15%">08:30 - 08:45</td> <td width="15%"> Welcome</td> - <td width="25%"> Morris Jette (SchedMD)</td> + <td width="25%"> TBD (CSCS)</td> <td width="45%"> Welcome to Slurm User Group Meeting</td> </tr> <tr> - <td width="15%">09:15 - 10:00</td> + <td width="15%">08:45 - 09:30</td> <td width="15%"> Keynote</td> - <td width="25%"> Dona Crawford (LLNL)</td> - <td width="45%"> Future Outlook for Advanced Computing</td> + <td width="25%"> TBD</td> + <td width="45%"> TBD</td> </tr> <tr> - <td width="15%" bgcolor="#F0F1C9">10:00 - 10:30</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Coffee break</td> + <td width="15%" bgcolor="#F0F1C9">09:30 - 09:45</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Break</td> </tr> <tr> - <td width="15%">10:30 - 11:00</td> + <td width="15%">09:45 - 10:15</td> <td width="15%"> Technical</td> - <td width="25%"> Morris Jette, Danny Auble (SchedMD), Yiannis Georgiou (Bull)</td> - <td width="45%"> Overview of Slurm version 2.6</td> + <td width="25%"> Morris Jette (SchedMD), Yiannis Georgiou (Bull)</td> + <td width="45%"> Overview of Slurm Versions 14.03 and 14.11</td> </tr> <tr> - <td width="15%">11:00 - 12:00</td> + <td width="15%">10:15 - 10:45</td> <td width="15%"> Tutorial</td> - <td width="25%"> Yiannis Georgiou, Martin Perry, Thomas Cadeau (Bull), Danny Auble (SchedMD)</td> - <td width="45%"> Energy Accounting and External Sensor Plugins</td> + <td width="25%"> Michael Jennings, Jacqueline Scoggins (LBL)</td> + <td width="45%"> Warewulf Node Health Check</td> </tr> - -<tr> - <td width="15%" bgcolor="#F0F1C9">12:00 - 13:00</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Lunch at conference center</td> -</tr> - - -<tr> - <td width="15%">13:00 - 13:30</td> + <td width="15%">10:45 - 11:15</td> <td width="15%"> Technical</td> - <td width="25%"> Yiannis Georgiou , Thomas Cadeau (Bull), Danny Auble, Moe Jette (SchedMD) Matthieu Hautreux (CEA)</td> - <td width="45%"> Evaluation of Monitoring and Control Features for Power Management</td> + <td width="25%"> Yiannis Georgiou (BULL), David 
Glesser (BULL), + Matthieu Hautreux (CEA), Denis Trystram (Univ. Grenoble-Alpes)</td> + <td width="45%"> SLURM processes isolation</td> </tr> <tr> - <td width="15%">13:30 - 14:00</td> + <td width="15%">11:15 - 11:45</td> <td width="15%"> Technical</td> - <td width="25%"> Matthieu Hautreux (CEA)</td> - <td width="45%"> Debugging Large Machines</td> -<tr> - <td width="15%">14:00 - 14:30</td> - <td width="15%"> Technical</td> - <td width="25%"> Alberto Falzone, Paolo Maggi (Nice)</td> - <td width="45%"> Creating easy to use HPC portals with NICE EnginFrame and Slurm</td> -</tr> - -<tr> - <td width="15%" bgcolor="#F0F1C9">14:30 - 15:00</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Coffee break</td> + <td width="25%"> Rod Schultz (BULL), Martin Perry (BULL), + Yiannis Georgiou (BULL), Danny Auble (SchedMD), Morris Jette (SchedMD), + Matthieu Hautreux (CEA)</td> + <td width="45%"> Improving forwarding logic in SLURM</td> </tr> - -<tr> - <td width="15%">15:00 - 15:30</td> - <td width="15%"> Technical</td> - <td width="25%"> David Glesser, Yiannis Georgiou, Joseph Emeras, Olivier Richard (Bull)</td> - <td width="45%"> Slurm evaluation using emulation and replay of real workload traces</td> + <td width="15%" bgcolor="#F0F1C9">11:45 - 12:45</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Lunch </tr> <tr> - <td width="15%">15:30 - 16:30</td> + <td width="15%">12:45 - 13:45</td> <td width="15%"> Tutorial</td> - <td width="25%"> Rod Schultz, Yiannis Georgiou (Bull) Danny Auble (SchedMD)</td> - <td width="45%"> Usage of new profiling functionalities</td> -</tr> - -<tr> - <td width="15%" bgcolor="#F0F1C9">18:00 - </td> - <td width="15%" bgcolor="#F0F1C9"> Dinner</td> - <td width="70%" colspan="2" bgcolor="#F0F1C9"> Lungomare, 1 Broadway Ave.</td> -</tr> -</table> - -<h2>September 19, 2013</h2> - -<table width="100%" border=1 cellspacing=0 cellpadding=0> - -<tr> - <th width="15%">Time</th> - <th width="15%">Theme</th> - <th width="25%">Speaker</th> - <th width="45%">Title</th> + <td width="25%"> Morris Jette (SchedMD)</td> + <td width="45%"> Tuning Slurm Scheduling for Optimal + Responsiveness and Utilization</td> </tr> - <tr> - <td width="15%" bgcolor="#F0F1C9">08:00 - 08:30</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Registration / Breakfast</td> - </tr> - -<tr> - <td width="15%">08:30 - 09:00</td> + <td width="15%">13:45 - 14:15</td> <td width="15%"> Technical</td> - <td width="25%"> Morris Jette, David Bigagli, Danny Auble (SchedMD)</td> - <td width="45%"> Fault Tolerant Workload Management</td> -</tr> + <td width="25%"> Carles Fenoy (BSC)</td> + <td width="45%"> Improving HPC applications scheduling with + predictions based on automatically-collected historical data</td> <tr> - <td width="15%">09:00 - 09:30</td> + <td width="15%">14:15 - 14:45</td> <td width="15%"> Technical</td> - <td width="25%"> Yiannis Georgiou (Bull) Matthieu Hautreux (CEA)</td> - <td width="45%"> Slurm Layouts Framework</td> + <td width="25%"> Filip Skalski, Krzysztof Rzadca (University of + Warsaw)</td> + <td width="45%"> Fair Scheduler for Burst Submissions of + Parallel Job</td> </tr> <tr> - <td width="15%">09:30 - 10:00</td> - <td width="15%"> Technical</td> - <td width="25%"> Bill Brophy (Bull)</td> - <td width="45%"> License Management</td> + <td width="15%" bgcolor="#F0F1C9">14:45 - 15:00</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Break</td> </tr> - <tr> - <td width="15%" bgcolor="#F0F1C9">10:00 - 10:30</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Coffee break</td> + <td 
width="15%">15:00 - 15:30</td> + <td width="15%"> Technical</td> + <td width="25%"> Yiannis Georgiou (BULL), David Glesser (BULL), + Matthieu Hautreux (CEA), Denis Trystram (Univ. Grenoble-Alpes)</td> + <td width="45%"> Introducing Power-capping in SLURM scheduling</td> </tr> <tr> - <td width="15%">10:30 - 11:00</td> + <td width="15%">15:30 - 16:00</td> <td width="15%"> Technical</td> - <td width="25%"> Juan Pancorbo Armada (IRZ)</td> - <td width="45%"> Multi-Cluster Management</td> + <td width="25%"> David Glesser (BULL), Yiannis Georgiou (BULL), + Denis Trystram (Univ. Grenoble-Alpes)</td> + <td width="45%"> Introducing Energy based fair-share scheduling</td> </tr> - <tr> - <td width="15%">11:00 - 11:30</td> + <td width="15%">16:00 - 16:30</td> <td width="15%"> Technical</td> - <td width="25%"> Francois Daikhate, Matthieu Hautreux (CEA)</td> - <td width="45%"> Depth Oblivious Hierarchical Fairshare Priority Factor</td> + <td width="25%"> Aamir Rashid (Terascala)</td> + <td width="45%"> Data movement between Lustre and Enterprise + storage systems</td> </tr> - <tr> - <td width="15%">11:30 - 12:00</td> + <td width="15%">16:30 - 17:00</td> <td width="15%"> Technical</td> - <td width="25%"> Dave Wallace (Cray)</td> - <td width="45%"> Refactoring ALPS</td> + <td width="25%"> Sergio Iserte, Adrian Castello, Rafael Mayo, + Enrique S. Quintana-Ort (Universitat Jaume I de Castello), + Federico Silla, Jose Duato (Universitat Politecnica de Valencia)</td> + <td width="45%"> Extending SLURM with Support for Remote GPU + Virtualization</td> </tr> <tr> - <td width="15%" bgcolor="#F0F1C9">12:00 - 13:00</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Lunch at conference center</td> + <td width="15%" bgcolor="#F0F1C9">17:00 - </td> + <td width="15%" bgcolor="#F0F1C9"> Dinner</td> + <td width="70%" colspan="2" bgcolor="#F0F1C9"> TBD</td> </tr> +</table> -<tr> - <td width="15%">13:00 - 13:20</td> - <td width="15%"> Site Report</td> - <td width="25%"> Francois Diakhate, Francis Belot, Matthieu Hautreux (CEA)</td> - <td width="45%"> CEA Site Report</td> -</tr> -<tr> - <td width="15%">13:20 - 13:40</td> - <td width="15%"> Site Report</td> - <td width="25%"> Tim Wickberg (George Washington University)</td> - <td width="45%"> George Washington University Site Report</td> -</tr> -<tr> - <td width="15%">13:40 - 14:00</td> - <td width="15%"> Site Report</td> - <td width="25%"> Ryan Cox (BYU)</td> - <td width="45%"> Brigham Young University Site Report</td> -</tr> -<tr> - <td width="15%">14:00 - 14:20</td> - <td width="15%"> Site Report</td> - <td width="25%"> Doug Hughes, Chris Harwell, Eric Radman, Goran Pocina, Michael Fenn (D.E. Shaw Research)</td> - <td width="45%"> D.E. Shaw Research Site Report</td> -</tr> -<tr> - <td width="15%">14:20 - 14:40</td> - <td width="15%"> Site Report</td> - <td width="25%"> Dr. 
Ulf Markwardt (Technische Universitat Dresden)</td> - <td width="45%"> Technische Universitat Dresden Site Report</td> -</tr> +<h2>24 September 2014</h2> -<tr> - <td width="15%" bgcolor="#F0F1C9">14:40 - 15:10</td> - <td width="85%" colspan="3" bgcolor="#F0F1C9"> Coffee break</td> -</tr> +<table width="100%" border=1 cellspacing=0 cellpadding=0> -<tr> - <td width="15%">15:00 - 15:30</td> - <td width="15%"> Technical</td> - <td width="25%"> Morris Jette (SchedMD), Yiannis Georgiou (Bull)</td> - <td width="45%"> Slurm Roadmap</td> -</tr> -<tr> - <td width="15%">15:30 - 16:30</td> - <td width="15%"> Discussion</td> - <td width="25%"> Everyone</td> - <td width="45%"> Open Discussion</td> -</tr> + <tr> + <th width="15%">Time</th> + <th width="15%">Theme</th> + <th width="25%">Speaker</th> + <th width="45%">Title</th> + </tr> + + <tr> + <td width="15%" bgcolor="#F0F1C9">08:00 - 08:30</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Registration</td> + </tr> + + <tr> + <td width="15%">08:30 - 09:00</td> + <td width="15%"> Technical</td> + <td width="25%"> Jacqueline Scoggins (Lawrence Berkeley + National Lab)</td> + <td width="45%"> Complex environment migration from + Moab/Torque to Slurm</td> + </tr> + <tr> + <td width="15%">09:00 - 09:30</td> + <td width="15%"> Technical</td> + <td width="25%"> Huub Stoffers (SURFsara)</td> + <td width="45%"> A budget checking / budget tracking plug-in + for SLURM</td> + </tr> + + <tr> + <td width="15%">09:30 - 10:00</td> + <td width="15%"> Technical</td> + <td width="25%"> Ryan Cox, Levi Morrison (Brigham Young + University)</td> + <td width="45%"> Level-based job prioritization</td> + </tr> + + <tr> + <td width="15%" bgcolor="#F0F1C9">10:00 - 10:15</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Break</td> + </tr> + + <tr> + <td width="15%">10:15 - 10:45</td> + <td width="15%"> Technical</td> + <td width="25%"> Thomas Cadeau (BULL), Yiannis Georgiou + (BULL), Matthieu Hautreux (CEA)</td> + <td width="45%"> Integrating Layouts Framework in SLURM</td> + </tr> + + <tr> + <td width="15%">10:45 - 11:15</td> + <td width="15%"> Technical</td> + <td width="25%"> Emmanuel Jeannot, Guillaume Mercier, Adèle + Villiermet (INRIA)</td> + <td width="45%"> Topology-aware Resource Selection with Slurm</td> + </tr> + + <tr> + <td width="15%">11:15 - 11:45</td> + <td width="15%"> Technical</td> + <td width="25%"> Stephen Trofinoff (CSCS)</td> + <td width="45%"> Exploring the implementation of several key + Slurm Inter-cluster features</td> + </tr> + <tr> + <td width="15%">11:45 - 12:15</td> + <td width="15%"> Technical</td> + <td width="25%"> Danny Auble (SchedMD)</td> + <td width="45%"> Slurm Native Workload Management on Cray Systems</td> + </tr> + <tr> + <td width="15%" bgcolor="#F0F1C9">12:15 - 13:15</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Lunch</td> + </tr> + + <tr> + <td width="15%">13:15 - 13:45</td> + <td width="15%"> Technical</td> + <td width="25%"> Morris Jette (SchedMD), Yiannis Georgiou (Bull)</td> + <td width="45%"> Slurm Roadmap</td> + </tr> + <tr> + <td width="15%">13:45 - 14:05</td> + <td width="15%"> Site Report</td> + <td width="25%"> Magnus Jonsson (Umea University)</td> + <td width="45%"> Umea University Site Report</td> + </tr> + <tr> + <td width="15%">14:05 - 14:25</td> + <td width="15%"> Site Report</td> + <td width="25%"> Marcin Stolarek (Interdisciplinary Centre + for Mathematical and Computational Modelling (ICM), University of + Warsaw, Poland)</td> + <td width="45%"> University of Warsaw Site Report</td> + </tr> + <tr> + 
<td width="15%">14:25 - 14:45</td> + <td width="15%"> Site Report</td> + <td width="25%"> Andrew Elwell (iVEC)</td> + <td width="45%"> iVEC Site Report</td> + </tr> + <tr> + <td width="15%">14:45 - 15:05</td> + <td width="15%"> Site Report</td> + <td width="25%"> Matthieu Hautreux (CEA)</td> + <td width="45%"> CEA Site Report</td> + </tr> + + <tr> + <td width="15%" bgcolor="#F0F1C9">15:05 - 15:20</td> + <td width="85%" colspan="3" bgcolor="#F0F1C9"> Break</td> + </tr> + + <tr> + <td width="15%">15:20 - 15:40</td> + <td width="15%"> Site Report</td> + <td width="25%"> Benini Massimo (CSCS)</td> + <td width="45%"> CSCS Site Report</td> + </tr> + <tr> + <td width="15%">15:40 - 16:00</td> + <td width="15%"> Site Report</td> + <td width="25%"> Janne Blomqvist, Ivan Degtyarenko, Mikko + Hakala (Aalto University)</td> + <td width="45%"> Aalto University Site Report</td> + </tr> + <tr> + <td width="15%">16:00 - 16:20</td> + <td width="15%"> Site Report</td> + <td width="25%"> Tim Wickberg (George Washington University)</td> + <td width="45%"> George Washington University Site Report</td> + </tr> + <tr> + <td width="15%">16:20 - 16:30</td> + <td width="15%"> Closing</td> + <td width="25%"> Tim Wickberg (George Washington University), + Morris Jette (SchedMD)</td> + <td width="45%"> Closing/Invitation to Slurm User Group + Meeting 2015</td> + </tr> </table> <br><br> <a name="abstracts"><h1>Abstracts</h1></a> -<h2>September 18, 2013</h2> +<h2>September 23, 2014</h2> -<h3>Overview of Slurm Version 2.6</h3> -<p>Danny Auble, Morris Jette (SchedMD) -Yiannis Georgiou (Bull)</p> -<p>This presentation will provide an overview of Slurm enhancements in -version 2.6, released in May. Specific development to be described include:</p> +<h3>Overview of Slurm Versions 14.03 and 14.11</h3> +<p> Morris Jette (SchedMD), Yiannis Georgiou (Bull)</p> +<p>This presentation will describe new capabilities provided in Slurm + versions 14.03 (released March 2014) and planned for version 14.11 + (to be released in November 2014). Major enhancements in version 14.03 + include:</p> +<ul> + <li>Access control options for partitions</li> + <li>Load-based scheduling</li> + <li>Reservation of cores for system use</li> + <li>Native support for Cray systems</li> +</ul> +<p>Major enhancements planned for version 14.11 include:</p> <ul> -<li>Support for job arrays, which increases performance and ease of use for -sets of similar jobs.</li> -<li>Support for MapReduce+.</li> -<li>Added prolog and epilog support for advanced reservations.</li> -<li>Much faster throughput for job step execution.</li> -<li>Advanced reservations now supports specific different core count for each node.</li> -<li>Added external sensors plugin to capture temperature and power data.</li> -<li>Added job profiling capability.</li> -<li>CPU count limits by partition.</li> + <li>Support for heterogeneous generic resources</li> + <li>Support for non-consumable generic resources</li> + <li>Automatic job requeue based upon exit code</li> + <li>User control over CPU governor</li> + <li>Communication gateways</li> + <li>New options for job scheduling and task layout</li> + <li>Improved job array support</li> </ul> -<h3>Usage of Energy Accounting and External Sensor Plugins</h3> -<p>Yiannis Georgiou, Martin Perry, Thomas Cadeau (Bull) -Danny Auble (SchedMD)</p> -<p>Power Management has gradually passed from a trend to an important need in -High Performance Computing. 
Slurm version 2.6 provides functionalities for -energy consumption recording and accounting per node and job following both -in-band and out-of-band strategies. The new implementations consist of two new -plugins: One plugin allowing in-band collection of energy consumption data from -the BMC of each node based on freeipmi library; Another plugin allowing -out-of-band collection from a centralized storage based on rrdtool library. -The second plugin allows the integration of external mechanisms like wattmeters -to be taken into account for the energy consumption recording and accounting -per node and job. The data can be used by users and administrators to improve -the energy efficiency of their applications and the whole clusters in general.</p> -<p>The tutorial will provide a brief description of the various power -management features in Slurm and will make a detailed review of the new plugins -introduced in 2.6, with configuration and usage details along with examples of -actual deployment.</p> - -<h3>Evaluation of Monitoring and Control Features for Power Management</h3> -<p>Yiannis Georgiou , Thomas Cadeau(Bull), Danny Auble, Moe Jette(SchedMD), -Matthieu Hautreux (CEA)</p> -<p>High Performance Computing platforms are characterized by their - increasing needs in power consumption. The Resource and Job - Management System (RJMS) is the HPC middleware responsible for - distributing computing resources to user applications. Appearance of - hardware sensors along with their support on the kernel/software side can be - taken into account by the RJMS in order to enhance the monitoring - and control of the executions with energy considerations. This - essentially enables the applications' execution statistics for - online energy profiling and gives the possibility to users to - control the tradeoffs between energy consumption and performance. In - this work we present the design and evaluation of a new framework, - developed upon SLURM Resource and Job Management System, - which allows energy consumption recording and accounting per node - and job along with parameters for job energy control features based on static - frequency scaling of the CPUs. We evaluate the overhead of the design choices - and the precision of the energy consumption results with different - HPC benchmarks (IMB,stream,HPL) on real-scale platforms and - integrated wattmeters. Having as goal the deployment of the - framework on large petaflopic clusters such as Curie, scalability is - an important aspect.</p> - -<h3>Debugging Large Machines</h3> -<p>Matthieu Hautreux (CEA)</p> -<p>This talk will present some cases of particularly interesting bugs - that were studied/worked-around/corrected over the past few years - on the petaflopic machines installed and used at CEA. The goal - is to share with the administrator community some methods and tools - helping to identify and in some cases work-around or correct - unexpected performance issues or bugs.</p> - -<h3>Creating easy to use HPC portals with NICE EnginFrame and Slurm</h3> -<p>Alberto Falzone, Paolo Maggi (Nice)</p> -<p>NICE EnginFrame is a popular framework to easily create HPC portals -that provide user-friendly application-oriented computing and data -services, hiding all the complexity of the underlying IT infrastructure. 
-Designed for technical computing users in a broad range of markets -(Oil&Gas, Automotive, Aerospace, Medical, Finance, Research, and -more), EnginFrame simplifies engineers' and scientists' work -through its intuitive, self-documenting interfaces, increasing -productivity and streamlining data and resource -management. Leveraging all the major HPC job schedulers and remote -visualization technologies, EnginFrame translates user clicks into the -appropriate actions to submit HPC jobs, create remote visualization -sessions, monitor workloads on distributed resources, manage data -and much more. In this work we describe the integration between the -SLURM Workload Manager and EnginFrame. We will then illustrate how -this integration can be leveraged to create easy to use HPC portals -for SLURM-based HPC infrastructures.</p> - -<h3>Slurm evaluation using emulation and replay of real workload traces</h3> -<p>David Glesser, Yiannis Georgiou, Joseph Emeras, Olivier Richard (Bull)</p> -<p>The experimentation and evaluation of Resource and Job Management - Systems in HPC supercomputers are characterized by important - complexities due to the inter-dependency of multiple parameters that - have to be taken into control. In our study we have developed a - methodology based upon emulated controlled experimentation, under - real conditions, with submission of workload traces extracted from a - production system. The methodology is used to perform comparisons of - different Slurm configurations in order to deduce the best - configuration for the typical workload that takes place on the - supercomputer, without disturbing the production. We will present - observations and evaluations results using real workload traces - extracted from Curie supercomputer,Top500 system with 80640, - replayed upon only 128 cores of a machine with similar - architecture. Various interesting results are extracted and important - side effects are discussed along with proposed configurations for - each type of workloads. Ideas for improvements on Slurm are also - proposed.</p> - -<h3>Usage of new profiling functionalities</h3> -<p>Rod Schultz, Yiannis Georgiou (Bull), Danny Auble (SchedMD)</p> -<p>SLURM Version 2.6 includes the ability to gather detailed -performance data on jobs. It has a plugin that stores the detailed -data in an HDF5 file. Other plugin gather data on task performance -such as cpu usage, memory usage, and local disk I/O; I/O to the -Lustre file system; traffic through and Infiniband network -interface; and energy information collected from IPMI. -This tutorial will describe the new capability, show how to configure -the various data sources, show examples of different data streams, -and report on actual usage.</p> - -<h2>September 19, 2013</h2> - -<h3>Fault Tolerant Workload Management</h3> -<p>Morris Jette, David Bigagli, Danny Auble (SchedMD)</p> -<p>One of the major issues facing exascale computing is fault -tolerance; how can a computer be effectively used if the typical job -execution time exceeds its mean time between failure. Part of the -solution is providing users with means to address failures in a -coordinated fashion with a highly adaptable workload manager. Such a -solution would support coordinated recognition of failures, -notification of failing and failed components, replacement -resources, and extended job time limits using negotiated interactive -communications. 
This paper describes fault tolerance issues from the -perspective of a workload manager and the implementation of solution -designed to optimize job fault tolerance based upon the popular open -source workload manager, Slurm.</p> - -<h3>Slurm Layouts Framework</h3> -<p>Yiannis Georgiou (Bull), Matthieu Hautreux (CEA)</p> -<p>This talk will describe the origins and goals of the study -concerning the Layouts Framework as well as first targets, current -developments and results. The layouts framework aims at providing a -uniform and generalized way to describe the hierarchical -relations between resources managed by a RM in order to use that -information in related RM internal logic. Examples of -instantiated layouts could be the description of the network -connectivity of nodes for the Slurm internal communication, the -description of the power supply network and capacities per branch -powering up the nodes, the description of the racking of the nodes, ...<p> - -<h3>License Management</h3> -<p>Bill Brophy (Bull)</p> -<p>License management becomes an increasingly critical issue as the -size of systems increase. These valuable resources deserve the same -careful management as all other resources configured in a -cluster. When licenses are being utilized in both interactive and -batch execution environments with multiple resource managers -involved the complexity of this task increases -significantly. Current license management within SLURM is not -integrated with any external license managers. This approach is -adequate if all jobs requiring licenses are submitted through SLURM -or if SLURM is given a subset of the licenses available on the -system to sub manage. However, the case of sub management can result -in underutilization of valuable license resources. Documentation for -other resource managers describes their interaction with external -license managers. For SLURM to become an active participant in -license management an evolution to its management approach must -occur. This article proposes a two-phased approach for accomplishing -that transformation. In the first phase, enhancements are proposed for -now SLURM internally deals with licenses: restriction of license to -specific accounts or users, provides recommendations for keeping -track of license information and suggestions for how this -information can be displayed for a SLURM users or -administrators. The second phase of this effort, which is -considerably more ambitious, is to define an evolution of SLURM's -approach to license management. This phase introduces an interaction -between SLURM and external license managers. The goal of this effort -is to increase SLURM's effectiveness in another area of resource -management, namely management of software licenses.</p> - -<h3>Multi-Cluster Management</h3> -<p>Juan Pancorbo Armada (IRZ)</p> -<p>As a service provider for scientific high performance computing, -Leibniz Rechen Zentrum (LRZ) operates compute systems for use by -educational institutions in Munich, Bavaria, as well as on the -national level. LRZ provides own computing resources as well as -housing and managing computing resources from other institutions -such as Max Planck Institute, or Ludwig Maximilians University. -The tier 2 Linux cluster operated at LRZ is a heterogeneous system -with different types of compute nodes, divided into 13 different -partitions, each of which is managed by SLURM. 
The various -partitions are configured for the different needs and services -requested, ranging from single node multiple core NUMAlink shared -memory clusters, to a 16-way infiniband- connected cluster for -parallel job execution, or an 8-way Gbit Ethernet cluster for serial -job execution. The management of all partitions is centralized on a -single VM. In this VM one SLURM cluster for each of these Linux -cluster partitions is configured. The required SLURM control daemons -run concurrently on this VM. With the use of a wrapper script called -MSLURM, the SLURM administrator can send SLURM commands to any -cluster in an easy-to use and flexible manner, including starting or -stopping the complete SLURM subsystem. Although such a setup may not -be desirable for large homogeneous supercomputing clusters, on small -heterogeneous clusters it has its own advantages. No separate control -node is required for each cluster for the slurmctld to run, so the -control of small clusters can be grouped in a single control -node. This feature also help to solve the restriction for some -parameters that cannot be set to different values for different -partitions in the same slurm.conf file; in that case it is possible -to move such parameters to partition-specific slurm.conf files.</p> - -<h3>Preparing Slurm for use on the Cray XC30</h3> -<p>Stephen Trofinoff, Colin McMurtrie (CSCS)</p> -<p>In this paper we describe the technical details associated with the -preparation of Slurm for use on a XC30 system installed at the Swiss -National Supercomputing Centre (CSCS). The system comprises external -login nodes, internal login nodes and a new ALPS/BASIL version so a -number of technical details needed to be overcome in order to have -Slurm working, as desired, on the system. Due to the backward -compatibility of ALPS/BASIL and the well-written code of Slurm, -Slurm was able to run, as it had in the past on previous Cray -systems, with little effort. However some problems were encountered -and their identification and resolution is described in -detail. Moreover, we describe the work involved in enhancing Slurm -to utilize the new BASIL protocol. Finally, we provide detail on the -work done to improve the Slurm task affinity bindings on a -general-purpose Linux cluster so that they, as closely as possible, -match the Cray bindings, thereby providing our users with some -degree of consistency in application behavior between these systems.</p> - -<h3>Refactoring ALPS</h3> -<p>Dave Wallace (Cray)</p> -<p>One of the hallmarks of the Cray Linux Environment is the Cray -Application Level Placement Scheduler (ALPS). ALPS is a resource -placement infrastructure used on all Cray systems. Developed by -Cray, ALPS addresses the size, complexity, and unique resource -management challenges presented by Cray systems. It works in -conjunction with workload management tools such as SLURM to -schedule, allocate, and launch applications. ALPS separates policy -from placement, so it launches applications but does not conflict -with batch system policies. The batch system interacts with ALPS via -an XML interface. Over time, the requirement to support more and -varied platform and processor capabilities, dynamic resource -management and new workload manager features has led Cray to -investigate alternatives to provide more flexible methods for -supporting expanding workload manager capabilities on Cray -systems. 
This presentation will highlight Cray's plans to expose low -level hardware interfaces by refactoring ALPS to allow 'native' -workload manager implementations that don't rely on the current ALPS -interface mechanism.</p> +<h3>Warewulf Node Health Check</h3> +<p>Michael Jennings, Jacqueline Scoggins (Lawrence Berkeley + National Lab)</p> +<p>Since its release to the HPC community in 2011, the Warewulf Node + Health Check system has gained wide acceptance across the industry + and has become the de facto standard community solution for compute + node health checking. It provides a complete, optimized framework + for creating and executing node-level checks and already comes with + more than 40 of its own pre-written checks. It fully supports SLURM + (as well as other popular schedulers & resource managers) and can + directly error/drain failed nodes and subsequently return them to + service once fixed. Having been used in production at Lawrence + Berkeley National Laboratory since late-2010, Warewulf NHC has + evolved and matured to become a vital asset in maximizing the + integrity and reliability of high-performance computational + resources.</p> + +<p>In this talk, we'll discuss what makes Warewulf NHC such a unique + and robust solution to the problem of compute node health, look at + the feature set of NHC and its integration with SLURM, examine LBNL's + configuration and utilization of SLURM and NHC with tips on how to + quickly deploy it in your environment, and survey many of the + available checks that are supplied out-of-the-box. Time permitting, a + brief introduction to writing custom or site-specific checks may also + be included.</p> + +<h3>SLURM processes isolation</h3> +<p>Martin Perry (BULL), Bill Brophy (BULL), Yiannis Georgiou (BULL), + Danny Auble (SchedMD), Morris Jette (SchedMD), Matthieu Hautreux (CEA)</p> +<p>On the compute nodes Slurm related processes and threads share the + resources (CPUs, Memory) with the applications. Even if the overhead + of slurm processes and threads is not really important, there could + be interference and de-synchronization in cases where the application + makes heavy usage of resources.</p> +<p>The goal is to automatically confine the slurm related process and + threads (slurmd, slurmstepd, jobacct, etc) on particular cores and + memory of the compute node. This will limit the interference of slurm + on the application execution and may improve the performance of the + applications. We present the design choices along with the developed + code and we provide experiments and observations.</p> + +<h3>Improving forwarding logic in SLURM</h3> +<p>Rod Schultz (BULL), Martin Perry (BULL), Yiannis Georgiou (BULL), + Danny Auble (SchedMD), Morris Jette (SchedMD), Matthieu Hautreux (CEA)</p> +<p>In this presentation we describe the motivations and design of the + communication logic re-factoring in Slurm in order to provide + partially deterministic direct and reverse tree communications. 
The + goals of these developments are to:</p> +<ul> + <li>Better handle the mapping between the trees of communication + used by SLURM and the existing physical network connections in + order to improve performance.</li> + <li> Provide the ability to aggregate messages directed to the + controller in order to limit the amount of RPC that have to be + managed simultaneously so that we can diminish communication + bottlenecks.</li> +</ul> -<h3>CEA Site Report</h3> -<p>Francois Daikhate, Francis Belot, Matthieu Hautreux (CEA)</p> -<p>The site report will detail the evolution of Slurm usage at CEA -as well as recent developments used on production systems. A -modification of the fairshare logic to better handle fair sharing of -resources between unbalanced groups hierarchies will be detailed.</p> - -<h3>George Washington University Site Report</h3> -<p>Tim Wickberg (George Washington University)<p> -<p>The site report will detail the evaluation of Slurm usage at -George Washington University, and the new Colonial One System.</p> - -<h3>Brigham Young University Site Report</h3> -<p>Ryan Cox (BYU)<p> -<p>The site report will detail the evaluation of Slurm at Brigham Young -University.</p> - -<h3>D.E. Shaw Research Site Report</h3> -<p>Doug Hughes, Chris Harwell, Eric Radman, Goran Pocina, Michael Fenn -(D.E. Shaw Research)</p> -<p>DESRES uses SLURM to schedule Anton. Anton is a specialized -supercomputer which executes molecular dynamics (MD) simulations of -proteins and other biological macromolecules orders of magnitude -faster than was previously possible. In this report, we present the -current SLURM configuration for scheduling Anton and launching our -MD application. We take advantage of the ability to run multiple -slurmd programs on a single node and use them as place-holders for -the Anton machines. We combine that with a pool of commodity Linux -nodes which act as frontends to any of the Anton machines where the -application is launched. We run a partition-specific prolog to insure -machine health prior to starting a job and to reset ASICs if -necessary. We also periodically run health checks and set nodes to -drain or resume via scontrol. Recently we have also used the prolog -to set a specific QOS for jobs which run on an early (and slower) -version of the ASIC in order to adjust the fair-share UsageFactor.</p> -<p>DESRES also uses SLURM to schedule a cluster of commodity nodes for -running regressions, our DESMOND MD program and various other -computational chemistry software. The jobs are an interesting mix of -those with MPI required and those without, short (minutes) and long (weeks).</p> -<p>DESRES is also investigating using SLURM to schedule a small -cluster of 8-GPU nodes for a port of the DESMOND MD program to -GPUs. This workload includes both full node 8-GPU jobs and multi-node -full 8-GPU per node jobs, but also jobs with lower GPU requirements -such that multiple jobs would be on a single node. We've made use of -CPU affinity and binding. GRES was not quite flexible enough and we -ended up taking advantage of the 8 CPU to 8 GPU opting to assign -GPUs to specific CPUs.</p> - -<h3>Technische Universitat Dresden Site Report</h3> -<p>Dr. 
Ulf Markwardt (Technische Universitat Dresden)</p> -<p>This site report will detail the recent introduction of Slurm on a new -computer at Technische Universitat Dresden.</p> - -<h3>Depth Oblivious Hierarchical Fairshare Priority Factor</h3> -<p>Francois Daikhate, Matthieu Hautreux (CEA)</p> -<p>As High Performance Computing use becomes prevalent in increasingly varied -scientific and industrial fields, clusters often need to be shared by a growing -number of user communities. One aspect of managing these heterogenous groups -involves being able to schedule their jobs fairly according to their respective -machine shares. In this talk we look at how slurm hierarchical fairshare -algorithms handle this task when user groups form complex hierarchies. We -propose an alternative formula to compute job priorities which improves -fairness in this situation.</p> - -<h3>Slurm Roadmap</h3>
+<h3>Tuning Slurm Scheduling for Optimal Responsiveness and Utilization</h3> +<p>Morris Jette (SchedMD)</p> +<p>Slurm supports a multitude of scheduling options to achieve + administrative goals for responsiveness, utilization, and service + level under a wide assortment of workloads. Many of these options + have been added in the past year and are still little known. This + tutorial will present an overview of scheduling configuration options + for job prioritization, Quality of Service, backfill scheduling, job + preemption, and gang scheduling. Advice will be provided on how to + analyze the current workload and tune the system.</p> +
+<h3>Improving HPC applications scheduling with predictions based on + automatically-collected historical data</h3> +<p>Carles Fenoy (BSC)</p> +<p>This work analyses the benefits of a system which, being able to + get real performance data from jobs, uses it for future scheduling in + order to improve the performance of the applications with minimal + user input. The study is focused on the memory bandwidth usage of + applications and its impact on the running time when sharing the same + node with other jobs. The data used for scheduling purposes is + extracted from the hardware counters during the application execution, + identified by a tag specified by the user. This information allows + the system to predict the resource requirements of a job and allocate + it more effectively.</p> +
+<h3>Fair Scheduler for Burst Submissions of Parallel Job</h3> +<p>Filip Skalski, Krzysztof Rzadca (Institute of Informatics, + University of Warsaw, Poland)</p> +<p>Large-scale HPC systems are shared by many users. Besides the system's + efficiency, the main goal of the scheduler is to serve users + according to a scheduling policy. The fair-share algorithm strives + to build schedules in which each user achieves her target average + utilization rate. This method was fine when each user had just a few + jobs. However, modern workloads are often composed of campaigns: + many jobs submitted by the same user at roughly the same time (e.g. + bag-of-tasks or SLURM's job arrays). For such workloads, fair-share + is not optimal because users frequently have similar utilization + metrics and, in such situations, the schedule switches between + users, executing just a few jobs of each one of them. However, it + would be more efficient to assign the maximum number of resources to + one user at a time.</p> +<p>OStrich, our scheduling algorithm, is optimized for campaigns of + jobs. OStrich maintains a virtual schedule that partitions resources + between users' workloads according to pre-defined shares.
The + virtual schedule drives the allocation of the real processors.</p> +<p>We implemented OStrich as a priority plugin for SLURM and performed + experimental evaluation on an emulated cluster. Comparing with + fair-share (the multifactor plugin), OStrich schedules have lower + slowdowns while maintaining equal system utilization. Moreover, + OStrich plugin uses normalized shares similarly to the multifactor + plugin, therefore it doesn't require any administrative changes other + than a simple change to the SLURM configuration file. We think that + OStrich is a viable alternative to fair-share in supercomputers with + campaign-like workloads.</p> + +<h3>Introducing Power-capping in SLURM scheduling</h3> +<p>Yiannis Georgiou (BULL), David Glesser (BULL), Matthieu Hautreux + (CEA), Denis Trystram (Univ. Grenoble-Alpes)</p> +<p>The last decades have been characterized by an ever growing + requirement in terms of computing and storage resources. This + tendency has recently put the pressure on the ability to efficiently + manage the power required to operate the huge amount of electrical + components associated with state-of-the-art computing and data + centers. The power consumption of a supercomputer needs to be + adjusted based on varying power budget or electricity + availabilities. As a consequence, Resource and Job Management Systems + have to be adequately adapted in order to efficiently schedule jobs + with optimized performance while limiting power usage whenever + needed. Our goal is to introduce a new power consumption adaptive + scheduling strategy that provides the capability to autonomously + adapt the executed workload to the available or planned power + budget. The originality of this approach relies on a combination of + DVFS (Dynamic Voltage and Frequency Scaling) and node shut-down + techniques.</p> + +<h3>Introducing Energy based fair-share scheduling</h3> +<p>David Glesser (BULL), Yiannis Georgiou (BULL), + Denis Trystram (Univ. Grenoble-Alpes)</p> +<p>Energy consumption has become one of the most important parameters + in High Performance computing platforms. Fair-share scheduling is a + widely used technique in job schedulers to prioritize jobs, + depending to past users allocations. In practice this technique is + mainly based on CPU-Time usage. Since power is managed as a new type + of resources by SLURM and energy consumption can be charged + independently, there is a real need for fairness in terms of energy + consumption.</p> +<p>This presentation will introduce fair-share scheduling based on + past energy usage in SLURM. The new technique will allow users that + have optimized their codes to be more energy efficient or make better + usage of DVFS techniques to improve the stretch times of their + workload.</p> + +<h3>Data movement between Lustre and Enterprise storage systems</h3> +<p>Aamir Rashid (Terascala)</p> +<p>High Performance Data movement is a requirement and a challenge for + HPC (large data sets, high rate of processing, over-provisioning, + compliance, etc.). An example is the data movement inherent in HPC + workflows like genome sequencing. This problem belongs to application + users and is related to HSM. If users are able to effectively manage + data movement tasks as part of their workflows then the IT storage + management problem is significantly diminished. 
However, to + accomplish this, users need tools that they currently do not + have.</p> +<p>Terascala has developed a new product, Intelligent Storage Bridge + (ISB), for effective data movement between a Lustre appliance and + Enterprise storage systems. ISB is a highly available, scalable and a + policy driven engine that is geared towards end users and automated + workflows. This talk will discuss the features of SLURM that are most + important in a user driven data management solution and highlight + lessons learned.</p> + +<h3>Extending SLURM with Support for Remote GPU Virtualization</h3> +<p>Sergio Iserte, Adrian Castello, Rafael Mayo, Enrique + S. Quintana-Ort (Universitat Jaume I de Castello) Federico Silla, Jose Duato + (Universitat Politecnica de Valencia)</p> +<p>Remote GPU virtualization offers an alluring means to increase + utilization of the GPUs installed in a cluster, which can + potentially yield a faster amortization of the total costs of + ownership (TCO). Concretely, GPU virtualization logically decouples + the GPUs in the cluster from the nodes they are located in, opening + a path to share the accelerators among all the applications that + request GPGPU services, independently of whether the node(s) these + applications are mapped to are equipped with a GPU or not. In this + manner the amount of these accelerators can be reduced, and their + utilization rate can be significantly improved.</p> +<p>SLURM can use a generic resource plug-in (GRes) to manage + GPUs. With this solution the hardware accelerators, like the GPUs, + can only be accessed by the job that is in execution on the node to + which the GPU is attached. This is a serious constraint for remote + GPU virtualization technologies, which aim to provide a completely + user-transparent access to all GPUs in cluster, independently of the + specific locations of the application node and the GPU node.</p> +<p>In this work we introduce a new type of resource in SLURM, the + remote GPU (rGPU), in order to gain access from any application node + to any GPU node in the cluster using rCUDA as the remote GPU + virtualization solution. With this new resource, users can access + all GPUs needed for their jobs, as SLURM schedules the task taking + into account all the GPUs available in the whole cluster. In other + words, introducing GPU-virtualization aware mechanism into SLURM + allow applications to execute CUDA kernels in all GPUs, + independently of their location.</p> + +<h2>September 24, 2014</h2> + +<h3>Complex environment migration from Moab/Torque to Slurm</h3> +<p>Jacqueline Scoggins (Lawrence Berkeley National Lab)</p> +<p>In most HPC environments admins are faced with setting up a + scheduling environment based on the individual or institutional + cluster requirements. Sites that have multiple clusters may have + to install the same scheduler on each system but the policies and + functionality might be different between the various installations. + But as the number of clusters grow and the policies and + requirements change this can become very difficult to manage. How + can this be done simpler without the integration nightmares? At + LBNL we merged our distinct resources under a + common infrastructure to leverage a uniform support architecture and + scavenge unused CPU cycles and expand into a condo-cluster model + using one scheduler. We previously did this using Moab/Torque for + several years but recently migrated to SLURM. 
The challenge was + how to make SLURM meet the exceedingly arduous needs of our + environment – Accounting, backfill, reservations, fairshare, QOS, + Partitions, Multifactor job prioritization and the ability to have + limits set on a user/group level basis so that the individual and + institutional clusters would not affect each other. Considering our + extremely complicated environment and the many production resources + and users that were impacted by this change, we took a very careful + and diligent approach to the migration and it resulted in minimal + adverse effects on our user base and support engineers. This talk + will focus on our method and experiences of this + migration.</p> + + +<h3>A budget checking / budget tracking plug-in for SLURM</h3> +<p>Huub Stoffers (SURFsara)</p> +<p>We propose to design and implement a plug-in for the SLURM control + daemon that is capable of calculating "job cost" on the basis of job + resource usage and that keeps track of budgets, registered per + account, as they are spent by running jobs. SLURM does a good job + logging the run time of jobs and their usage of resources during + that time interval. It however does not know how to reduce the usage + of resources to the spending of budget that was granted to + projects.</p> +<p>Traditionally, this is not the responsibility of the batch system + but of the site's accounting system, because the decision which + resource(s) to account, and at what price, are very site + specific. Moreover, translating the resource usage of a job to + budget reductions is most conveniently done after job completion, + when the resource usage is final and completely known. Then, the + "raw" data can simply be handed over to the accounting system for + subsequent interpretation. But this division of labor and its + associated sequence of events have a serious disadvantage: + Overspending by projects is only noticed when it has already + happened.</p> +<p>Projects running on our compute facilities generally can do so + because they have successfully passed through a review process and + were granted a budget to be spent on compute resources on behalf of + the project. Sometimes it is possible to get a prolongation for a + project or to shift an amount of budget between two projects granted + to the same primary investigator. But funding agencies are quite + strict. They do not wish to tolerate that any project spends more + budget than it was formally granted.</p> +<p>New jobs likely to cost more than their project's remaining budgets + simply should not be dispatched. SLURM already has the concept that + a job is run under an account that is associated with one or more + users. A budget should be associated with such an account too. "Job + cost" is presumably highly dependent on the actual run time of the + job. When a job is about to be dispatched, its maximum "job cost" + must be calculated, based on its attributes, such as number of cores + or nodes, the partition, its maximum wall clock time. The maximum + job cost must be temporarily claimed, subtracted from the project's + budget, for as long as the job runs. When the job is finished the + actual job cost can be calculated, permanently subtracted from the + budget while at the same time, the temporarily claimed maximum "job + cost" is given back – i.e. added again.</p> +<p>Preventive, "live", budget checking during each job dispatch + presently can be implemented, or at least approximated, by a + prologue script.
But this involves substantial sacct and squeue + querying and subsequent calculations based on the returned results + that can strain the system much more than directly keeping track of + a project's budget. Budgets are typically specified in terms of + abstract "system billable units" that can be spent by using + discrete quantities of the resources that the compute facility has to + offer. The number of core hours is usually an important resource + that is accounted, but there may be differences in pricing, + e.g. between the core hours on nodes with or without GPU support, or + with lesser or larger quantities of memory per core. Other + consumable resources, such as the time that particular software + licenses are checked out by a job, may be accounted too. In SLURM it + is customary to use partitions to differentiate between + heterogeneously equipped nodes. Clearly, the relative pricing of + core hours of different partitions should be configurable in the + configuration file of the plug-in. The actual details of "Job cost" + calculation will remain site specific and hence should be + concentrated in a single jobcost function. Hooks should be added for + it to be called and process its outcome at job dispatch time and – + for a job that is dispatched – at job completion time.</p> + +<h3>Level-based job prioritization</h3> +<p>Ryan Cox and Levi Morrison (Brigham Young University)</p> +<p>We will present our new LEVEL_BASED job prioritization mechanism. + The algorithm prioritizes users such that users in an under-served account + will always have a higher fair share factor than users in an over-served + account. It recurses through the account tree, calculates fair share at + each level, then uses bitwise math to ensure that the effect of usage and + shares below the current level cannot affect calculations at the current + level.</p> +<p>Basically, if accounts A and B have the same shares but A has higher usage + than B then children of account A will have a lower fair share factor than + children of account B. This is not guaranteed in other prioritization methods. + LEVEL_BASED was also designed to reduce the likelihood of errors due to + floating point precision loss.</p> + +<h3>Integrating Layouts Framework in SLURM</h3> +<p>Thomas Cadeau (BULL), Yiannis Georgiou (BULL), Matthieu Hautreux (CEA)</p> +<p>Supercomputers become more powerful but more complicated to + manage. Resources hide information that can be taken into account + for more efficient management. Those characteristics may impact the + way resources should be used and may provide valuable information + (such as power consumption, network details, etc.) that can be used to + optimize automatic decisions such as Scheduling, Energy Efficiency, + Placement, Scalability.</p> +<p>The layouts framework has been introduced in the last SLURM User + Group. This presentation will introduce a new API that has been + developed to get, update and consolidate information described by + layouts so that they can be used wherever needed internally in + SLURM. Information such as the placement of each resource in the + actual infrastructure can be taken into account for more efficient + scheduling of jobs.
Information such as the power consumption of + resources can be taken into account for power aware scheduling.</p> +<p>Furthermore a new set of scontrol options will be presented to + enable users and administrators to dynamically modify and display + layouts information.</p> + +<h3>Topology-aware Resource Selection with Slurm</h3> +<p>Emmanuel Jeannot, Guillaume Mercier, Adèle Villiermet (INRIA)</p> +<p>Remote GPU virtualization offers an alluring means to increase + utilization of the GPUs installed in a cluster, which can + potentially yield a faster amortization of the total costs of + ownership (TCO). Concretely, GPU virtualization logically decouples + the GPUs in the cluster from the nodes they are located in, opening + a path to share the accelerators among all the applications that + request GPGPU services, independently of whether the node(s) these + applications are mapped to are equipped with a GPU or not. In this + manner the amount of these accelerators can be reduced, and their + utilization rate can be significantly improved.</p> +<p>SLURM can use a generic resource plug-in (GRes) to manage + GPUs. With this solution the hardware accelerators, like the GPUs, + can only be accessed by the job that is in execution on the node to + which the GPU is attached. This is a serious constraint for remote + GPU virtualization technologies, which aim to provide a completely + user-transparent access to all GPUs in cluster, independently of the + specific locations of the application node and the GPU node.</p> +<p>In this work we introduce a new type of resource in SLURM, the + remote GPU (rGPU), in order to gain access from any application node + to any GPU node in the cluster using rCUDA as the remote GPU + virtualization solution. With this new resource, users can access + all GPUs needed for their jobs, as SLURM schedules the task taking + into account all the GPUs available in the whole cluster. In other + words, introducing GPU-virtualization aware mechanism into SLURM + allow applications to execute CUDA kernels in all GPUs, independently of their + location.</p> + +<h3>Exploring the implementation of several key Slurm Inter-cluster + features</h3> +<p>Stephen Trofinoff (CSCS)</p> +<p>Over the course of several years, both at our site (CSCS) and at + others of which we were told, various instances have arisen where + there was a need for some inter-cluster Slurm features. These + features would simplify or in some cases enable use cases for our + various computing facilities and potentially make administering them + easier. One prominent such request, was for the ability to chain a + job to one or more jobs on a remote Slurm cluster. These features, + of course, do not currently exist or are limited in their scope. + For instance, a job can be submitted to a remote Slurm cluster but + can not be "chained" to a job on another cluster since one Slurm + cluster's controller has no knowledge of the jobs of another. + Therefore, after various discussions, it was decided to start a + small project at our site to explore the potential implementation of + some of these features. The project is a work-in-progress.</p> +<p>This paper and the corresponding presentation will discuss some of + the work done thus far. 
This includes specifying the particular + features chosen for examination and any issues related to their + implementation.</p> + +<h3>Slurm Native Workload Management on Cray Systems</h3> +<p>Danny Auble (SchedMD)</p> +<p>Cray’s Application Level Placement Scheduler (ALPS) software has + recently been refactored to expose low level network management + interfaces in a new library. Slurm is the first workload manager to + utilize this new Cray infrastructure to directly manage network + resources and launch applications without ALPS. New capabilities + provided by Slurm include the ability to execute multiple jobs per + node, the ability to execute many applications within a single job + allocation (ALPS reservation), greater flexibility in scheduling, + and higher throughput without sacrificing scalability or + performance. This presentation includes a description of ALPS + refactoring, new Slurm plugins for Cray systems, and the changes in + functionality provided by this new architecture.</p> + +<h3>Slurm Roadmap</h3> <p>Morris Jette (SchedMD), Yiannis Georgiou (Bull)</p> -<p>Slurm continues to evolve rapidly, with two major releases per -year. This presentation will outline Slurm development plans in the -coming years. Particular attention will be given to describing -anticipated workload management requirements for Exascale -computing. These requirements include not only scalability issues, -but a new focus on power management, fault tolerance, topology -optimized scheduling, and heterogeneous computing.</p> - -<p style="text-align:center;">Last modified 16 September 2013</p> +<p>Slurm long-term development remains focused on the needs of high + performance computing. The Slurm roadmap continues to evolve as a + greater understanding of unique Exascale computer requirements + develops. For example, Exascale computers may well contain tens of + thousands of compute nodes, which necessitates changes in Slurm + communications infrastructure. Exascale power consumption will need + to be carefully regulated with power capping, throttling the rate of + change and managing the workload to maximize system + utilization. This presentation will describe upcoming Slurm + development plans.</p> + +<h3>Umea University Site Report</h3> +<p>Magnus Jonsson (Umea University)</p> +<p>Use of SPANK plugins to create a private temporary file system for + each job. This eliminates interference between jobs without the need + to obey the TMPDIR environment variable. The module uses the + features of private namespace/mount --bind in Linux.</p> + +<h3>University of Warsaw Site Report</h3> +<p>Marcin Stolarek (Interdisciplinary Centre for Mathematical and + Computational Modelling (ICM), University of Warsaw, Poland)</p> +<ul> + <li>Our own SPANK plugins using the unshare system call to limit Lustre + availability for a job</li> + <li>SPANK plugin + prologue/epilogue preparing separate /tmp + directory</li> + <li>Job submit plugin which checks if job specification is "sane"</li> + <li>Our work on integration of Slurm with middleware in European and + Polish grid infrastructures.</li> +</ul> + +<h3>iVEC Site Report</h3> +<p>Andrew Elwell (iVEC)</p> +<p>iVEC (an unincorporated joint venture between CSIRO, Curtin + University, Edith Cowan University, Murdoch University and the + University of Western Australia, supported by the Western + Australian Government) provides supercomputing facilities and + expertise to the research, education and industrial communities.
Its + new (2013) purpose-built computing centre (the Pawsey Centre) houses + several Cray XC30 systems as well as a 6TB SGI UV2000, all connected + via InfiniBand to multi-petabyte disk storage systems.</p> +<p>Although initially deployed with PBS Pro, senior management + indicated that moving to SLURM as a unified centre-wide scheduler + would be a good idea. This site report describes the issues faced by + an operations team new to SLURM and the configuration choices that + were made within the site.</p> +<p>Pawsey infrastructure runs with a single slurmdbd instance on KVM, + with five different clusters using this as the accounting + repository. The clusters are:</p> +<ul> + <li>Magnus, a Cray XC30 with 208 nodes, 2 external login nodes and 2 + data mover nodes.</li> + <li>Galaxy, a Cray XC30 with 472 nodes, 2 external login nodes, 2 data + mover nodes and 16 'ingest' nodes</li> + <li>Chaos, a small test and development XC30 but without any external + nodes</li> + <li>Zythos, the SGI UV2000 with 4 GPU cards</li> + <li>Pawsey, used as a generic cluster to support 'copyq' + partitions.</li> +</ul> +<p>Because of the interaction between SLURM and ALPS/BASIL (the Cray + node allocation system) the Cray-aware Slurm binaries were compiled + separately from the rest of the site (which uses a mixture of SLES and + CentOS) with a patched 2.6.6 and 2.6.9 being deployed. Linux cgroups + were used to control user access within shared nodes.</p> +<p>The report also covers some of the issues the users faced when + migrating from PBS Pro, and the quirks associated with running on + external login nodes with interactive jobs. Finally, it describes + some of the user-facing reporting still under development.</p> + +<h3>CEA Site Report</h3> +<p>Matthieu Hautreux (CEA)</p> +<p>CEA Site Report</p> + +<h3>CSCS Site Report</h3> +<p>Benini Massimo (CSCS)</p> +<p>CSCS Site Report</p> + +<h3>Aalto University Site Report</h3> +<p>Janne Blomqvist, Ivan Degtyarenko, Mikko Hakala (Aalto + University)</p> +<p>We will present the computational science done at Aalto University, + and the HPC infrastructure supporting this. Our cluster currently + has around 550 compute nodes, with a mix of different hardware + generations acquired at different points in time. The cluster is + part of the Finnish Grid Initiative (FGI), a consortium of + Universities and the national supercomputing center CSC - IT Center + for Science, where FGI clusters are accessible to outside users via + grid middleware. FGI also has a common base software stack. The + funding of the Aalto HPC infrastructure is through a stakeholder + model, where University departments using the cluster provide + funding and manpower to run it. Currently there are three major + departments that provide the core manpower and are responsible for + the majority of the usage, but the cluster is also open to other + users in the University without funding/manpower requirements as + long as use remains moderate.</p> +<p>The funding model of the cluster results in pressure to show that + resource usage is fair among the different departments, and to + improve this we developed the ticket-based fairshare algorithm that + has been included in upstream SLURM as of version 2.5 (originally + called priority/multifactor2).
We will present the ticket-based + algorithm, and show how it achieves fairness in an account + hierarchy.</p> +<p>We have also developed a wrapper for slurm user commands that some + of our users have found easier to use than the "raw" slurm commands + when investigating the state of the cluster. The wrapper is purely + for read-only commands, so it is always safe to use.</p> + +<h3>George Washington University</h3> +<p>Tim Wickberg (George Washington University)</p> +<p>In particular, I would expect to elaborate and discuss usage of the + fairshare scheduling system, including how it maps to our (slightly + convoluted) internal funding model. Additional discussion may + include our expected use / abuse of the generic resource scheduling + system to dynamically allocate disk space on our test high-IOPS SSD + scratch system.</p> + +<p style="text-align:center;">Last modified 9 July 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index a0d95b54d49a5156631b232670358a93c37ccb6f..f4ae9015323711ca297f5c48c9907f305f12a1a8 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -693,6 +693,9 @@ value will equal the original \fB\-\-mem\-per\-cpu\fR value specified by the user. This parameter would generally be used if individual processors are allocated to jobs (\fBSelectType=select/cons_res\fR). +If resources are allocated by the core, socket or whole nodes; the number +of CPUs allocated to a job may be higher than the task count and the value +of \fB\-\-mem\-per\-cpu\fR should be adjusted accordingly. Also see \fB\-\-mem\fR. \fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive. @@ -830,7 +833,7 @@ PerfCnts. These nodes are still available for other jobs not using NPC. .br .br In all cases the job allocation request \fBmust specify the ---exclusive option\fR. Otherwise the request will be denied. +\-\-exclusive option\fR. Otherwise the request will be denied. .br .br diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index fa7651c60f2c8fbced2ccb45b21138d3c085d067..ebe6eb9536f10f5fdd4398e3c6e41b0b893f28cc 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -780,6 +780,9 @@ value will equal the original \fB\-\-mem\-per\-cpu\fR value specified by the user. This parameter would generally be used if individual processors are allocated to jobs (\fBSelectType=select/cons_res\fR). +If resources are allocated by the core, socket or whole nodes; the number +of CPUs allocated to a job may be higher than the task count and the value +of \fB\-\-mem\-per\-cpu\fR should be adjusted accordingly. Also see \fB\-\-mem\fR. \fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive. @@ -917,7 +920,7 @@ PerfCnts. These nodes are still available for other jobs not using NPC. .br .br In all cases the job allocation request \fBmust specify the ---exclusive option\fR. Otherwise the request will be denied. +\-\-exclusive option\fR. Otherwise the request will be denied. .br .br @@ -1274,8 +1277,9 @@ be sent up to 60 seconds earlier than specified. By default, no signal is sent before the job's end time. If a \fIsig_num\fR is specified without any \fIsig_time\fR, the default time will be 60 seconds. -Use the "B:" option to signal the batch shell. -By default all job steps will be signalled, but not the batch shell itself. +Use the "B:" option to signal only the batch shell, none of the other +processes will be signaled. By default all job steps will be signalled, +but not the batch shell itself. 
.TP \fB\-\-sockets\-per\-node\fR=<\fIsockets\fR> diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 062a06c2ba6276af1a41d6e40d7fc538ba67be8f..3e12b1add175c9319e7c962143420d300a29a9a1 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -656,8 +656,9 @@ Permit the job's geometry to be rotated. Possible values are "YES" and "NO". .TP \fIShared\fP=<yes|no> -Set the job's ability to share nodes with other jobs. Possible values are -"YES" and "NO". Only the Slurm administrator or root can increase job's priority. +Set the job's ability to share nodes with other jobs. +Possible values are "YES" and "NO". +This option can only be changed for pending jobs. .TP \fIStartTime\fP=<time_spec> Set the job's earliest initiation time. diff --git a/doc/man/man1/sinfo.1 b/doc/man/man1/sinfo.1 index 97cc8b68adac97c11c8cc12c453f0daa8bd73416..60db3b51ec3f2b9e6c07e3c46ca1eeb465463c74 100644 --- a/doc/man/man1/sinfo.1 +++ b/doc/man/man1/sinfo.1 @@ -92,7 +92,7 @@ when running with various options are "%9P %.5a %.10l %.16F %N" .TP .I "\-\-long" -"%9P %.5a %.10l %.10s %.4r %.5h %.10g %.6D %.11T %N" +"%9P %.5a %.10l %.10s %.4r %.8h %.10g %.6D %.11T %N" .TP .I "\-\-Node" "%N %.6D %.9P %6t" diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index d198bf4debc6a991414224834e66dfb9e2535c2f..c664cfbc751f96a7c502e4e767b312dad57e7117 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -827,6 +827,9 @@ value will equal the original \fB\-\-mem\-per\-cpu\fR value specified by the user. This parameter would generally be used if individual processors are allocated to jobs (\fBSelectType=select/cons_res\fR). +If resources are allocated by the core, socket or whole nodes; the number +of CPUs allocated to a job may be higher than the task count and the value +of \fB\-\-mem\-per\-cpu\fR should be adjusted accordingly. Specifying a memory limit of zero for a job step will restrict the job step to the amount of memory allocated to the job, but not remove any of the job's memory allocation from being available to other job steps. @@ -1019,7 +1022,7 @@ PerfCnts. These nodes are still available for other jobs not using NPC. .br .br In all cases the job or step allocation request \fBmust specify the ---exclusive option\fR. Otherwise the request will be denied. +\-\-exclusive option\fR. Otherwise the request will be denied. .br .br diff --git a/doc/man/man5/nonstop.conf.5 b/doc/man/man5/nonstop.conf.5 index 8d4a0d2434ba53576064104454822ed5a7d47a89..a9b36c467bc7c4c14e8086661c5c0c075695f4ea 100644 --- a/doc/man/man5/nonstop.conf.5 +++ b/doc/man/man5/nonstop.conf.5 @@ -71,7 +71,7 @@ secure replacement resources up to the number of minutes specified by \fBTimeLimitDelay\fR. This option will only take effect if no hot spare resouces are available at the time replacement resources are requested. -This time limit extention is in addition to the value calculated using the +This time limit extension is in addition to the value calculated using the \fBTimeLimitExtend\fR. The default value is zero (no time limit extension). The value may not exceed 65533 seconds. diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index e22c2540d4a2a3309bc0908c991c1b84e13f4a30..5a53a99c37d152c143b22cf620199d7bc86ccf47 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -892,7 +892,7 @@ Arbitrary parameters for the job account gather plugin Acceptable values at present include: .RS .TP 20 -fB\NoShared\fR +\fBNoShared\fR Exclude shared memory from accounting. 
.RE @@ -2170,6 +2170,7 @@ The node's \fBBoards\fR, \fBSockets\fR, \fBCoresPerSocket\fR and \fBThreadsPerCore\fR may optionally be configured and result in job allocations which have improved locality; however doing so will prevent more than one job being from being allocated on each core. +.RE .TP \fBCR_CPU_Memory\fR @@ -3237,6 +3238,8 @@ The front end configuration specifies the following information: \fBAllowGroups\fR Comma separated list of group names which may execute jobs on this front end node. By default, all groups may use this front end node. +If \fBat least\fR one group associated with the user attempting to execute the +job is in AllowGroups, he will be permitted to use this front end node. May not be used with the \fBDenyGroups\fR option. .TP @@ -3373,7 +3376,7 @@ Also refer to DenyAccounts. .TP \fBAllowGroups\fR Comma separated list of group names which may execute jobs in the partition. -If at least one group associated with the user attempting to execute the +If \fBat least\fR one group associated with the user attempting to execute the job is in AllowGroups, he will be permitted to use this partition. Jobs executed as user root can use any partition without regard to the value of AllowGroups. @@ -3393,7 +3396,7 @@ described above. .TP \fBAllowQos\fR -Comma seperated list of Qos which may execute jobs in the partition. +Comma separated list of Qos which may execute jobs in the partition. Jobs executed as user root can use any partition without regard to the value of AllowQos. The default value is "ALL". @@ -3441,14 +3444,14 @@ not be stored, just collected). .TP \fBDenyAccount\fR -Comma seperated list of accounts which may not execute jobs in the partition. +Comma separated list of accounts which may not execute jobs in the partition. By default, no accounts are denied access \fBNOTE:\fR If AllowAccounts is used then DenyAccounts will not be enforced. Also refer to AllowAccount. .TP \fBDenyQos\fR -Comma seperated list of Qos which may not execute jobs in the partition. +Comma separated list of Qos which may not execute jobs in the partition. By default, no QOS are denied access \fBNOTE:\fR If AllowQos is used then DenyQos will not be enforced. Also refer AllowQos. 
diff --git a/src/api/job_info.c b/src/api/job_info.c index 2329f8fbea4c8d0d08faa43120b7269e2ce476e0..e739f6377b2bc5d047a3f817dae50f0892416b6f 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -80,12 +80,13 @@ static uint32_t _threads_per_core(char *host) { uint32_t i, threads = 1; - if (!job_node_ptr) + if (!job_node_ptr || !host) return threads; slurm_mutex_lock(&job_node_info_lock); for (i = 0; i < job_node_ptr->record_count; i++) { - if (!strcmp(host, job_node_ptr->node_array[i].name)) { + if (job_node_ptr->node_array[i].name && + !strcmp(host, job_node_ptr->node_array[i].name)) { threads = job_node_ptr->node_array[i].threads; break; } @@ -95,12 +96,14 @@ static uint32_t _threads_per_core(char *host) } static void _free_node_info(void) { +#if 0 slurm_mutex_lock(&job_node_info_lock); if (job_node_ptr) { slurm_free_node_info_msg(job_node_ptr); job_node_ptr = NULL; } slurm_mutex_unlock(&job_node_info_lock); +#endif } /* Perform file name substitutions diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index dc87c5e7f5d9448193a3f4fee7afb2dab3444afa..cdb8879d26f27e8db7f1d91389d9469740ba7611 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -1,5 +1,5 @@ /*****************************************************************************\ - * accounting_storage_slurmdbd.c - accounting interface to slurmdbd. + * assoc_mgr.c - File to keep track of associations/QOS used by the daemons ***************************************************************************** * Copyright (C) 2004-2007 The Regents of the University of California. * Copyright (C) 2008-2009 Lawrence Livermore National Security. @@ -3630,6 +3630,8 @@ extern void assoc_mgr_remove_qos_usage(slurmdb_qos_rec_t *qos) qos->usage->usage_raw = 0; qos->usage->grp_used_wall = 0; + if (!qos->usage->grp_used_cpus) + qos->usage->grp_used_cpu_run_secs = 0; } extern int dump_assoc_mgr_state(char *state_save_location) diff --git a/src/common/env.h b/src/common/env.h index d61708064d284a42da73c9e4d824f7d3a8ed5a87..383a68df67851ccf0342551a7d8629f6b05c6e51 100644 --- a/src/common/env.h +++ b/src/common/env.h @@ -73,8 +73,8 @@ typedef struct env_options { pid_t task_pid; char *sgtids; /* global ranks array of integers */ uint16_t pty_port; /* used to communicate window size changes */ - uint8_t ws_col; /* window size, columns */ - uint8_t ws_row; /* window size, row count */ + uint16_t ws_col; /* window size, columns */ + uint16_t ws_row; /* window size, row count */ char *ckpt_dir; /* --ckpt-dir= */ uint16_t restart_cnt; /* count of job restarts */ uint16_t batch_flag; /* 1 if batch: queued job with script */ diff --git a/src/common/gres.c b/src/common/gres.c index a790e991a96555d2c9ca451ee191db4fcb548557..6b3a5f32b1e546f68acaf8bc1bb53b36267a3f8a 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -148,7 +148,7 @@ static uint32_t _get_gres_cnt(char *orig_config, char *gres_name, static uint32_t _get_tot_gres_cnt(uint32_t plugin_id, uint32_t *set_cnt); static int _gres_find_id(void *x, void *key); static void _gres_job_list_delete(void *list_element); -extern int _job_alloc(void *job_gres_data, void *node_gres_data, +static int _job_alloc(void *job_gres_data, void *node_gres_data, int node_cnt, int node_offset, uint32_t cpu_cnt, char *gres_name, uint32_t job_id, char *node_name, bitstr_t *core_bitmap); @@ -166,7 +166,7 @@ static void * _job_state_dup(void *gres_data); static void * _job_state_dup2(void *gres_data, int node_index); static int _job_state_validate(char *config, void **gres_data, slurm_gres_context_t 
*gres_name); -extern uint32_t _job_test(void *job_gres_data, void *node_gres_data, +static uint32_t _job_test(void *job_gres_data, void *node_gres_data, bool use_total_gres, bitstr_t *cpu_bitmap, int cpu_start_bit, int cpu_end_bit, bool *topo_set, uint32_t job_id, char *node_name, char *gres_name); @@ -1423,12 +1423,12 @@ extern int _node_config_validate(char *node_name, char *orig_config, gres_data->topo_gres_cnt_avail = xrealloc(gres_data->topo_gres_cnt_avail, set_cnt * sizeof(uint32_t)); - for (i=0; i<gres_data->topo_cnt; i++) + for (i = 0; i < gres_data->topo_cnt; i++) FREE_NULL_BITMAP(gres_data->topo_gres_bitmap[i]); gres_data->topo_gres_bitmap = xrealloc(gres_data->topo_gres_bitmap, set_cnt * sizeof(bitstr_t *)); - for (i=0; i<gres_data->topo_cnt; i++) + for (i = 0; i < gres_data->topo_cnt; i++) FREE_NULL_BITMAP(gres_data->topo_cpus_bitmap[i]); gres_data->topo_cpus_bitmap = xrealloc(gres_data->topo_cpus_bitmap, @@ -2584,7 +2584,7 @@ static void _job_core_filter(void *job_gres_data, void *node_gres_data, FREE_NULL_BITMAP(avail_cpu_bitmap); } -extern uint32_t _job_test(void *job_gres_data, void *node_gres_data, +static uint32_t _job_test(void *job_gres_data, void *node_gres_data, bool use_total_gres, bitstr_t *cpu_bitmap, int cpu_start_bit, int cpu_end_bit, bool *topo_set, uint32_t job_id, char *node_name, char *gres_name) @@ -2904,18 +2904,55 @@ extern uint32_t gres_plugin_job_test(List job_gres_list, List node_gres_list, static bool _cores_on_gres(bitstr_t *core_bitmap, gres_node_state_t *node_gres_ptr, int gres_inx) { + int i; + if ((core_bitmap == NULL) || (node_gres_ptr->topo_cnt == 0)) return true; - if (bit_size(node_gres_ptr->topo_cpus_bitmap[gres_inx]) != - bit_size(core_bitmap)) - return false; - if (bit_overlap(node_gres_ptr->topo_cpus_bitmap[gres_inx], core_bitmap)) - return true; + for (i = 0; i < node_gres_ptr->topo_cnt; i++) { + if (bit_size(node_gres_ptr->topo_gres_bitmap[i]) < gres_inx) + continue; + if (!bit_test(node_gres_ptr->topo_gres_bitmap[i], gres_inx)) + continue; + if (bit_size(node_gres_ptr->topo_cpus_bitmap[i]) != + bit_size(core_bitmap)) + break; + if (bit_overlap(node_gres_ptr->topo_cpus_bitmap[i],core_bitmap)) + return true; + } return false; } -extern int _job_alloc(void *job_gres_data, void *node_gres_data, +/* Clear any vestigial job gres state. This may be needed on job requeue. 
*/ +extern void gres_plugin_job_clear(List job_gres_list) +{ + int i; + ListIterator job_gres_iter; + gres_state_t *job_gres_ptr; + gres_job_state_t *job_state_ptr; + + if (job_gres_list == NULL) + return; + + (void) gres_plugin_init(); + job_gres_iter = list_iterator_create(job_gres_list); + while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) { + if (!job_gres_ptr) + continue; + job_state_ptr = (gres_job_state_t *) job_gres_ptr->gres_data; + for (i = 0; i < job_state_ptr->node_cnt; i++) { + FREE_NULL_BITMAP(job_state_ptr->gres_bit_alloc[i]); + FREE_NULL_BITMAP(job_state_ptr->gres_bit_step_alloc[i]); + } + xfree(job_state_ptr->gres_bit_alloc); + xfree(job_state_ptr->gres_bit_step_alloc); + xfree(job_state_ptr->gres_cnt_step_alloc); + } + job_state_ptr->node_cnt = 0; + list_iterator_destroy(job_gres_iter); +} + +static int _job_alloc(void *job_gres_data, void *node_gres_data, int node_cnt, int node_offset, uint32_t cpu_cnt, char *gres_name, uint32_t job_id, char *node_name, bitstr_t *core_bitmap) diff --git a/src/common/gres.h b/src/common/gres.h index e2d20de115bc3782e3ca1cc9151ef120bf7d185e..9d364627ea3341e9d775fa628c5a01063847cc2e 100644 --- a/src/common/gres.h +++ b/src/common/gres.h @@ -449,6 +449,9 @@ extern int gres_plugin_job_alloc(List job_gres_list, List node_gres_list, uint32_t cpu_cnt, uint32_t job_id, char *node_name, bitstr_t *core_bitmap); +/* Clear any vestigial job gres state. This may be needed on job requeue. */ +extern void gres_plugin_job_clear(List job_gres_list); + /* * Deallocate resource from a job and update node and job gres information * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 1d616f75239d1fa84fd607cc47050dd918d08d31..e543f5bdfd84e4b4adc18c2d9a8e7ea29e12fc82 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1246,6 +1246,8 @@ extern char *job_state_string(uint16_t inx) return "RESIZING"; if (inx & JOB_SPECIAL_EXIT) return "SPECIAL_EXIT"; + if (inx & JOB_REQUEUE) + return "REQUEUED"; /* Process JOB_STATE_BASE */ switch (inx & JOB_STATE_BASE) { @@ -1285,6 +1287,8 @@ extern char *job_state_string_compact(uint16_t inx) return "RS"; if (inx & JOB_SPECIAL_EXIT) return "SE"; + if (inx & JOB_REQUEUE) + return "RQ"; /* Process JOB_STATE_BASE */ switch (inx & JOB_STATE_BASE) { diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index cf9450dd1c57be59dd99ca8b745073ed3200a8c5..64d1ef8fd60ea978156aa44311d34285879b5568 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -103,6 +103,8 @@ (IS_JOB_FINISHED(_X) && ((_X->job_state & JOB_COMPLETING) == 0)) #define IS_JOB_RESIZING(_X) \ (_X->job_state & JOB_RESIZING) +#define IS_JOB_REQUEUED(_X) \ + (_X->job_state & JOB_REQUEUE) /* Defined node states */ #define IS_NODE_UNKNOWN(_X) \ diff --git a/src/common/slurm_step_layout.c b/src/common/slurm_step_layout.c index fd14e9a8f3f861e45c202871eda1e1454fd4031e..4d36390a879ffedf38b703cb9aee4993b9ef0a6b 100644 --- a/src/common/slurm_step_layout.c +++ b/src/common/slurm_step_layout.c @@ -437,6 +437,18 @@ static int _init_task_layout(slurm_step_layout_t *step_layout, * cpus_per_task=3) */ cpus[i] = 1; } + + if ((plane_size != (uint16_t)NO_VAL) + && (task_dist != SLURM_DIST_PLANE)) { + /* plane_size when dist != plane is used to + convey ntasks_per_node. Adjust the number + of cpus to reflect that. 
+ */ + uint16_t cpus_per_node = plane_size * cpus_per_task; + if (cpus[i] > cpus_per_node) + cpus[i] = cpus_per_node; + } + //info("got %d cpus", cpus[i]); if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) { /* move to next record */ @@ -568,7 +580,7 @@ static int _task_layout_block(slurm_step_layout_t *step_layout, uint16_t *cpus) } } - /* Pass 3: Spread remainign tasks across all nodes */ + /* Pass 3: Spread remaining tasks across all nodes */ while (task_id < step_layout->task_cnt) { for (i = 0; ((i < step_layout->node_cnt) && (task_id < step_layout->task_cnt)); i++) { diff --git a/src/common/xcpuinfo.c b/src/common/xcpuinfo.c index a10615426aaea7629b120a21baa18763cde4e2d2..1e145242de60512d4bba37c015d51ab49ca4f6de 100644 --- a/src/common/xcpuinfo.c +++ b/src/common/xcpuinfo.c @@ -499,15 +499,15 @@ get_cpuinfo(uint16_t *p_cpus, uint16_t *p_boards, while (fgets(buffer, sizeof(buffer), cpu_info_file) != NULL) { uint32_t val; if (_chk_cpuinfo_uint32(buffer, "processor", &val)) { + curcpu = numcpu; numcpu++; - curcpu = val; - if (val >= numproc) { /* out of bounds, ignore */ - debug("cpuid is %u (> %d), ignored", - val, numproc); + if (curcpu >= numproc) { + info("processor limit reached (%u >= %d)", + curcpu, numproc); continue; } - cpuinfo[val].seen = 1; - cpuinfo[val].cpuid = val; + cpuinfo[curcpu].seen = 1; + cpuinfo[curcpu].cpuid = val; maxcpuid = MAX(maxcpuid, val); mincpuid = MIN(mincpuid, val); } else if (_chk_cpuinfo_uint32(buffer, "physical id", &val)) { @@ -629,7 +629,6 @@ get_cpuinfo(uint16_t *p_cpus, uint16_t *p_boards, #if DEBUG_DETAIL /*** Display raw data ***/ - debug3(""); debug3("numcpu: %u", numcpu); debug3("numphys: %u", numphys); debug3("numcores: %u", numcores); @@ -641,19 +640,18 @@ get_cpuinfo(uint16_t *p_cpus, uint16_t *p_boards, debug3("physid: %u->%u", minphysid, maxphysid); debug3("coreid: %u->%u", mincoreid, maxcoreid); - for (i = 0; i <= maxcpuid; i++) { + for (i = 0; i < numproc; i++) { debug3("CPU %d:", i); + debug3(" cpuid: %u", cpuinfo[i].cpuid); debug3(" seen: %u", cpuinfo[i].seen); debug3(" physid: %u", cpuinfo[i].physid); debug3(" physcnt: %u", cpuinfo[i].physcnt); debug3(" siblings: %u", cpuinfo[i].siblings); debug3(" cores: %u", cpuinfo[i].cores); debug3(" coreid: %u", cpuinfo[i].coreid); - debug3(" corecnt: %u", cpuinfo[i].corecnt); - debug3(""); + debug3(" corecnt: %u\n", cpuinfo[i].corecnt); } - debug3(""); debug3("Sockets: %u", sockets); debug3("Cores per socket: %u", cores); debug3("Threads per core: %u", threads); diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index 09b76275fb02dd7f9254acd7bec61ab3191c7362..afec3d3854426bd0d1b8957934060bda065393d6 100644 --- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -687,7 +687,8 @@ static int _as_mysql_acct_check_tables(mysql_conn_t *mysql_conn) if (mysql_db_create_table(mysql_conn, acct_coord_table, acct_coord_table_fields, - ", primary key (acct(20), user(20)))") + ", primary key (acct(20), user(20)), " + "key user (user(20)))") == SLURM_ERROR) return SLURM_ERROR; @@ -1122,8 +1123,9 @@ extern int create_cluster_tables(mysql_conn_t *mysql_conn, char *cluster_name) if (mysql_db_create_table(mysql_conn, table_name, assoc_table_fields, ", primary key (id_assoc), " - " unique index (user(20), acct(20), " - "`partition`(20)))") + "unique index (user(20), acct(20), " + "`partition`(20)), " + "key lft (lft))") == SLURM_ERROR) return 
SLURM_ERROR; @@ -1219,6 +1221,9 @@ extern int create_cluster_tables(mysql_conn_t *mysql_conn, char *cluster_name) "unique index (id_job, " "id_assoc, time_submit), " "key rollup (time_eligible, time_end), " + "key wckey (id_wckey), " + "key qos (id_qos), " + "key association (id_assoc), " "key sacct_def (id_user, time_start, " "time_end))") == SLURM_ERROR) diff --git a/src/plugins/accounting_storage/mysql/as_mysql_job.c b/src/plugins/accounting_storage/mysql/as_mysql_job.c index 803b39c3075c73d4c99daf3a84cda8e061190458..0c559879b57f5f8f174dd84b12c6b8b0fde1b693 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_job.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_job.c @@ -176,6 +176,9 @@ static uint32_t _get_wckeyid(mysql_conn_t *mysql_conn, char **name, NULL) != SLURM_SUCCESS) { List wckey_list = NULL; slurmdb_wckey_rec_t *wckey_ptr = NULL; + /* we have already checked to make + sure this was the slurm user before + calling this */ wckey_list = list_create(slurmdb_destroy_wckey_rec); @@ -187,9 +190,30 @@ static uint32_t _get_wckeyid(mysql_conn_t *mysql_conn, char **name, /* info("adding wckey '%s' '%s' '%s'", */ /* wckey_ptr->name, wckey_ptr->user, */ /* wckey_ptr->cluster); */ - /* we have already checked to make - sure this was the slurm user before - calling this */ + + if (*name[0] == '*') { + /* make sure the non * wckey has been added */ + wckey_rec.name = (*name)+1; + if (assoc_mgr_fill_in_wckey( + mysql_conn, &wckey_rec, + ACCOUNTING_ENFORCE_WCKEYS, + NULL) != SLURM_SUCCESS) { + wckey_ptr = xmalloc( + sizeof(slurmdb_wckey_rec_t)); + wckey_ptr->name = + xstrdup(wckey_rec.name); + wckey_ptr->user = xstrdup(user); + wckey_ptr->cluster = xstrdup(cluster); + list_prepend(wckey_list, wckey_ptr); + /* info("adding wckey '%s' '%s' " */ + /* "'%s'", */ + /* wckey_ptr->name, */ + /* wckey_ptr->user, */ + /* wckey_ptr->cluster); */ + } + wckey_rec.name = (*name); + } + if (as_mysql_add_wckeys(mysql_conn, slurm_get_slurm_user_id(), wckey_list) @@ -734,7 +758,11 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn, return SLURM_SUCCESS; } end_time = job_ptr->end_time; - job_state = job_ptr->job_state & JOB_STATE_BASE; + + if (IS_JOB_REQUEUED(job_ptr)) + job_state = JOB_REQUEUE; + else + job_state = job_ptr->job_state & JOB_STATE_BASE; } slurm_mutex_lock(&rollup_lock); diff --git a/src/plugins/accounting_storage/mysql/as_mysql_qos.c b/src/plugins/accounting_storage/mysql/as_mysql_qos.c index cca982c724cc5f8c140f29a603a94a476f9116fb..7a8f9b8ea046f7fd25a68f7351955d1d1bd05772 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_qos.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_qos.c @@ -424,10 +424,13 @@ static int _setup_qos_limits(slurmdb_qos_rec_t *qos, if (adding_straight) { xstrfmtcat(*vals, ", \'%s,\'", preempt_val); xstrfmtcat(*extra, ", preempt=\'%s,\'", preempt_val); - } else { + } else if (preempt_val[0]) { xstrfmtcat(*vals, ", %s", preempt_val); xstrfmtcat(*extra, ", preempt=if(%s=',', '', %s)", preempt_val, preempt_val); + } else { + xstrcat(*vals, ", ''"); + xstrcat(*extra, ", preempt=''"); } xfree(preempt_val); } diff --git a/src/plugins/accounting_storage/mysql/as_mysql_wckey.c b/src/plugins/accounting_storage/mysql/as_mysql_wckey.c index c10aa629b58024b7cf42068d3ac0223488c7386d..458f26fd4c608f8112bc891d3b040cb77d905ec8 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_wckey.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_wckey.c @@ -133,9 +133,11 @@ static int _make_sure_users_have_default( MYSQL_RES *result = NULL; MYSQL_ROW row; 
char *wckey = NULL; + /* only look at non * and non deleted ones */ query = xstrdup_printf( "select distinct is_def, wckey_name from " - "\"%s_%s\" where user='%s' FOR UPDATE;", + "\"%s_%s\" where user='%s' and wckey_name " + "not like '*%%' and deleted=0 FOR UPDATE;", cluster, wckey_table, user); debug4("%d(%s:%d) query\n%s", mysql_conn->conn, THIS_FILE, __LINE__, query); @@ -503,7 +505,7 @@ extern int as_mysql_add_wckeys(mysql_conn_t *mysql_conn, uint32_t uid, while ((object = list_next(itr))) { if (!object->cluster || !object->cluster[0] || !object->user || !object->user[0] - || !object->name || !object->name[0]) { + || !object->name) { error("We need a wckey name, cluster, " "and user to add."); rc = SLURM_ERROR; diff --git a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c index 029229a8c1a89ba8f0609a2bbf37bf52ab92bd60..49a2681b2aa6b7087be144d6f3eda0338b064d0c 100644 --- a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c +++ b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c @@ -2273,7 +2273,10 @@ extern int jobacct_storage_p_job_complete(void *db_conn, req.job_state = JOB_RESIZING; } else { req.end_time = job_ptr->end_time; - req.job_state = job_ptr->job_state & JOB_STATE_BASE; + if (IS_JOB_REQUEUED(job_ptr)) + req.job_state = JOB_REQUEUE; + else + req.job_state = job_ptr->job_state & JOB_STATE_BASE; } req.req_uid = job_ptr->requid; req.nodes = job_ptr->nodes; diff --git a/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c b/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c index 91ed60d3618b782b0b656648848051e77eb40b03..032da0fc4c57e62ee4ca58983bb3ec80462e44ed 100644 --- a/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c +++ b/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c @@ -420,7 +420,7 @@ extern int acct_gather_energy_p_update_node_energy(void) xassert(_run_in_daemon()); - if (local_energy->current_watts == NO_VAL) + if (!local_energy || local_energy->current_watts == NO_VAL) return rc; _get_joules_task(local_energy); diff --git a/src/plugins/acct_gather_profile/hdf5/hdf5_api.c b/src/plugins/acct_gather_profile/hdf5/hdf5_api.c index 25f0580fa590bffc22ac962ea149920d5fac6d78..f9beb13857105c02f7fda7d66b890beae37909bb 100644 --- a/src/plugins/acct_gather_profile/hdf5/hdf5_api.c +++ b/src/plugins/acct_gather_profile/hdf5/hdf5_api.c @@ -1703,6 +1703,8 @@ extern void put_string_attribute(hid_t parent, char *name, char *value) hid_t attr, space_attr, typ_attr; hsize_t dim_attr[1] = {1}; // Single dimension array of values + if (!value) + value = ""; typ_attr = H5Tcopy(H5T_C_S1); if (typ_attr < 0) { debug3("PROFILE: failed to copy type for attribute %s", name); diff --git a/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c b/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c index 838b752d85f34da0e319772ef6485cb4c34fe385..181dfa21950c9905e38682b3ff64a52138a6e018 100644 --- a/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c +++ b/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c @@ -299,7 +299,7 @@ static int _set_options(const int argc, char **argv) _init_opts(); - while ((cc = getopt_long(argc, argv, "d:Ehi:Ij:l:N:o:p:s:S:uUvV", + while ((cc = getopt_long(argc, argv, "d:Ehi:Ij:l:N:o:p:s:S:u:UvV", long_options, &option_index)) != EOF) { switch (cc) { case 'd': @@ -351,11 +351,11 @@ static int _set_options(const int argc, char **argv) params.keepfiles = 1; break; case 'u': - u = 
atoi(optarg); if (uid_from_string(optarg, &u) < 0) { error("No such user --uid=\"%s\"", optarg); return -1; } + params.user = uid_to_string(u); break; case 'U': _help_msg(); diff --git a/src/plugins/proctrack/cgroup/proctrack_cgroup.c b/src/plugins/proctrack/cgroup/proctrack_cgroup.c index 846b6f9d4830ba923220aed97effa2ce37ff0922..ded28f82d8a7e9850066559d10870c83987ff644 100644 --- a/src/plugins/proctrack/cgroup/proctrack_cgroup.c +++ b/src/plugins/proctrack/cgroup/proctrack_cgroup.c @@ -285,6 +285,8 @@ int _slurm_cgroup_destroy(void) if (jobstep_cgroup_path[0] != '\0') { if (xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS) { + error("_slurm_cgroup_destroy: problem deleting step " + "cgroup path %s: %m", step_freezer_cg.path); xcgroup_unlock(&freezer_cg); return SLURM_ERROR; } @@ -584,8 +586,10 @@ extern int proctrack_p_wait(uint64_t cont_id) if (delay < 120) { delay *= 2; } else { - error("Unable to destroy container %"PRIu64"", - cont_id); + error("%s: Unable to destroy container %"PRIu64" " + "in cgroup plugin, giving up after %d sec", + __func__, cont_id, delay); + break; } } diff --git a/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c b/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c index 7ea146687e87f060ca426306299985131b7a511a..a21af1f5962b071b5a59304d9e733e7378dcea8f 100644 --- a/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c +++ b/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c @@ -153,25 +153,12 @@ extern bool proctrack_p_has_pid(uint64_t cont_id, pid_t pid) extern int proctrack_p_wait(uint64_t cont_id) { - int delay = 1; - if (cont_id == 0 || cont_id == 1) { errno = EINVAL; return SLURM_ERROR; } - /* Spin until the container is successfully destroyed */ - while (proctrack_p_destroy(cont_id) != SLURM_SUCCESS) { - proctrack_p_signal(cont_id, SIGKILL); - sleep(delay); - if (delay < 120) { - delay *= 2; - } else { - error("Unable to destroy container %"PRIu64"", cont_id); - } - } - - return SLURM_SUCCESS; + return proctrack_p_destroy(cont_id); } extern int diff --git a/src/plugins/proctrack/pgid/proctrack_pgid.c b/src/plugins/proctrack/pgid/proctrack_pgid.c index cf4c68dd22f8aef0dea5c8e9b2de13c8bc6ba8d6..28270be69cde7794152f8ad9dae9f6d20b32836c 100644 --- a/src/plugins/proctrack/pgid/proctrack_pgid.c +++ b/src/plugins/proctrack/pgid/proctrack_pgid.c @@ -185,7 +185,10 @@ proctrack_p_wait(uint64_t cont_id) if (delay < 120) { delay *= 2; } else { - error("Unable to destroy container %"PRIu64"", cont_id); + error("%s: Unable to destroy container %"PRIu64" " + "in pgid plugin, giving up after %d sec", + __func__, cont_id, delay); + break; } } diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 092e1d8e6a97770208f8f5ec1190cf0d450bc396..ababedfe358dc0d7fd1f1f00b2a7e32d5113cfad 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -611,8 +611,7 @@ static int _attempt_backfill(void) uint32_t min_nodes, max_nodes, req_nodes; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; bitstr_t *exc_core_bitmap = NULL, *non_cg_bitmap = NULL; - bitstr_t *previous_bitmap = NULL; - time_t now, sched_start, later_start, start_res, resv_end; + time_t now, sched_start, later_start, start_res, resv_end, window_end; node_space_map_t *node_space; struct timeval bf_time1, bf_time2; int sched_timeout = 2, yield_sleep = 1; @@ -684,7 +683,8 @@ static int _attempt_backfill(void) node_space = xmalloc(sizeof(node_space_map_t) * (max_backfill_job_cnt * 2 + 1)); node_space[0].begin_time = sched_start; - 
node_space[0].end_time = sched_start + backfill_window; + window_end = sched_start + backfill_window; + node_space[0].end_time = window_end; node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); node_space[0].next = 0; node_space_recs = 1; @@ -782,8 +782,11 @@ static int _attempt_backfill(void) } job_ptr->part_ptr = part_ptr; - if (debug_flags & DEBUG_FLAG_BACKFILL) - info("backfill test for job %u", job_ptr->job_id); + if (debug_flags & DEBUG_FLAG_BACKFILL) { + info("backfill test for JobID=%u Prio=%u Partition=%s", + job_ptr->job_id, job_ptr->priority, + job_ptr->part_ptr->name); + } if (max_backfill_job_per_part) { bool skip_job = false; @@ -797,13 +800,13 @@ static int _attempt_backfill(void) } if (skip_job) { if (debug_flags & DEBUG_FLAG_BACKFILL) - debug("backfill: have already " - "checked %u jobs for " - "partition %s; skipping " - "job %u", - max_backfill_job_per_part, - job_ptr->part_ptr->name, - job_ptr->job_id); + info("backfill: have already " + "checked %u jobs for " + "partition %s; skipping " + "job %u", + max_backfill_job_per_part, + job_ptr->part_ptr->name, + job_ptr->job_id); continue; } } @@ -838,27 +841,34 @@ static int _attempt_backfill(void) if (njobs[j] >= max_backfill_job_per_user) { /* skip job */ if (debug_flags & DEBUG_FLAG_BACKFILL) - debug("backfill: have already " - "checked %u jobs for " - "user %u; skipping " - "job %u", - max_backfill_job_per_user, - job_ptr->user_id, - job_ptr->job_id); + info("backfill: have already " + "checked %u jobs for " + "user %u; skipping " + "job %u", + max_backfill_job_per_user, + job_ptr->user_id, + job_ptr->job_id); continue; } } } if (((part_ptr->state_up & PARTITION_SCHED) == 0) || - (part_ptr->node_bitmap == NULL)) - continue; - if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root) + (part_ptr->node_bitmap == NULL) || + ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)) { + if (debug_flags & DEBUG_FLAG_BACKFILL) + info("backfill: partition %s not usable", + job_ptr->part_ptr->name); continue; + } if ((!job_independent(job_ptr, 0)) || - (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) + (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) { + if (debug_flags & DEBUG_FLAG_BACKFILL) + info("backfill: job %u not runable now", + job_ptr->job_id); continue; + } /* Determine minimum and maximum node counts */ min_nodes = MAX(job_ptr->details->min_nodes, @@ -874,7 +884,9 @@ static int _attempt_backfill(void) else req_nodes = min_nodes; if (min_nodes > max_nodes) { - /* job's min_nodes exceeds partition's max_nodes */ + if (debug_flags & DEBUG_FLAG_BACKFILL) + info("backfill: job %u node count too high", + job_ptr->job_id); continue; } @@ -902,7 +914,6 @@ static int _attempt_backfill(void) /* Determine impact of any resource reservations */ later_start = now; - FREE_NULL_BITMAP(previous_bitmap); TRY_LATER: if (slurmctld_config.shutdown_time) break; @@ -961,6 +972,9 @@ static int _attempt_backfill(void) j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, &exc_core_bitmap); if (j != SLURM_SUCCESS) { + if (debug_flags & DEBUG_FLAG_BACKFILL) + info("backfill: job %u reservation defer", + job_ptr->job_id); job_ptr->time_limit = orig_time_limit; continue; } @@ -987,7 +1001,7 @@ static int _attempt_backfill(void) if ((j = node_space[j].next) == 0) break; } - if ((resv_end++) && + if (resv_end && (++resv_end < window_end) && ((later_start == 0) || (resv_end < later_start))) { later_start = resv_end; } @@ -1008,22 +1022,18 @@ static int _attempt_backfill(void) ((job_ptr->details->req_node_bitmap) && 
(!bit_super_set(job_ptr->details->req_node_bitmap, avail_bitmap))) || - (job_req_node_filter(job_ptr, avail_bitmap)) || - (previous_bitmap && - bit_equal(previous_bitmap, avail_bitmap))) { + (job_req_node_filter(job_ptr, avail_bitmap))) { if (later_start) { job_ptr->start_time = 0; goto TRY_LATER; } + /* Job can not start until too far in the future */ job_ptr->time_limit = orig_time_limit; job_ptr->start_time = sched_start + backfill_window; continue; } - FREE_NULL_BITMAP(previous_bitmap); - previous_bitmap = bit_copy(avail_bitmap); - /* Identify nodes which are definitely off limits */ FREE_NULL_BITMAP(resv_bitmap); resv_bitmap = bit_copy(avail_bitmap); @@ -1056,6 +1066,8 @@ static int _attempt_backfill(void) } if (job_ptr->start_time <= now) { /* Can start now */ uint32_t save_time_limit = job_ptr->time_limit; + uint32_t hard_limit; + bool reset_time = false; int rc = _start_job(job_ptr, resv_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) { if (orig_time_limit == NO_VAL) { @@ -1067,28 +1079,31 @@ static int _attempt_backfill(void) job_ptr, orig_time_limit); job_ptr->time_limit = orig_time_limit; } - job_ptr->end_time = job_ptr->start_time + - (job_ptr->time_limit * 60); } else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { /* Set time limit as high as possible */ acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; - job_ptr->end_time = job_ptr->start_time + - (comp_time_limit * 60); - _reset_job_time_limit(job_ptr, now, - node_space); - time_limit = job_ptr->time_limit; + reset_time = true; } else if (orig_time_limit == NO_VAL) { acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; - job_ptr->end_time = job_ptr->start_time + - (job_ptr->time_limit * 60); } else { acct_policy_alter_job(job_ptr, orig_time_limit); job_ptr->time_limit = orig_time_limit; - job_ptr->end_time = job_ptr->start_time + - (job_ptr->time_limit * 60); + + } + if (job_ptr->time_limit == INFINITE) + hard_limit = 365 * 24 * 60; /* one year */ + else + hard_limit = job_ptr->time_limit; + job_ptr->end_time = job_ptr->start_time + + (hard_limit * 60); + if (reset_time) { + _reset_job_time_limit(job_ptr, now, + node_space); + time_limit = job_ptr->time_limit; } + if (rc == ESLURM_ACCOUNTING_POLICY) { /* Unknown future start time, just skip job */ job_ptr->start_time = 0; @@ -1146,6 +1161,9 @@ static int _attempt_backfill(void) if (job_ptr->start_time > (sched_start + backfill_window)) { /* Starts too far in the future to worry about */ + if (debug_flags & DEBUG_FLAG_BACKFILL) + _dump_job_sched(job_ptr, end_reserve, + avail_bitmap); continue; } @@ -1171,12 +1189,12 @@ static int _attempt_backfill(void) /* * Add reservation to scheduling table if appropriate */ + if (debug_flags & DEBUG_FLAG_BACKFILL) + _dump_job_sched(job_ptr, end_reserve, avail_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) continue; reject_array_job_id = 0; reject_array_part = NULL; - if (debug_flags & DEBUG_FLAG_BACKFILL) - _dump_job_sched(job_ptr, end_reserve, avail_bitmap); bit_not(avail_bitmap); _add_reservation(start_time, end_reserve, avail_bitmap, node_space, &node_space_recs); @@ -1191,7 +1209,6 @@ static int _attempt_backfill(void) FREE_NULL_BITMAP(exc_core_bitmap); FREE_NULL_BITMAP(resv_bitmap); FREE_NULL_BITMAP(non_cg_bitmap); - FREE_NULL_BITMAP(previous_bitmap); for (i=0; ; ) { FREE_NULL_BITMAP(node_space[i].avail_bitmap); diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c index 
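One detail from the backfill hunk above that is easy to miss: the job end time is now computed from a bounded hard limit, with one year substituted when the time limit is INFINITE. A small sketch of that computation; TIME_LIMIT_INFINITE below is a local placeholder, not the real INFINITE constant:

#include <limits.h>
#include <stdio.h>
#include <time.h>

#define TIME_LIMIT_INFINITE UINT_MAX    /* placeholder for Slurm's INFINITE */

/* Compute a job end time from a start time and a time limit in minutes;
 * an infinite limit is clamped to one year so the scheduler always has a
 * finite horizon to plan against. */
static time_t job_end_time(time_t start_time, unsigned int time_limit_min)
{
    unsigned int hard_limit_min;

    if (time_limit_min == TIME_LIMIT_INFINITE)
        hard_limit_min = 365 * 24 * 60;    /* one year */
    else
        hard_limit_min = time_limit_min;

    return start_time + (time_t) hard_limit_min * 60;
}

int main(void)
{
    time_t now = time(NULL);

    printf("end_time=%ld\n", (long) job_end_time(now, TIME_LIMIT_INFINITE));
    return 0;
}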
a555d1c13518723c4059e3f037ef43336d0f133d..cc33112fbe74b3b5a716b8376c11f4d1509ec4c5 100644 --- a/src/plugins/select/cons_res/job_test.c +++ b/src/plugins/select/cons_res/job_test.c @@ -1598,7 +1598,7 @@ static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *bitmap, * the former one */ if ((best_fit_inx == -1) || - (!switches_required[best_fit_inx] && switches_required[j]) || + (!switches_required[best_fit_inx] && switches_required[j]) || (switch_record_table[j].level < switch_record_table[best_fit_inx].level) || ((switch_record_table[j].level == @@ -1621,13 +1621,14 @@ static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *bitmap, } } if (best_fit_inx == -1) { - debug("job %u: best_fit topology failure : no switch " - "satisfying the request found", job_ptr->job_id); + debug("job %u: best_fit topology failure: no switch currently " + "has sufficient resource to satisfy the request", + job_ptr->job_id); rc = SLURM_ERROR; goto fini; } if (!switches_required[best_fit_inx] && req_nodes_bitmap ) { - debug("job %u: best_fit topology failure : no switch " + debug("job %u: best_fit topology failure: no switch " "including requested nodes and satisfying the " "request found", job_ptr->job_id); rc = SLURM_ERROR; @@ -2652,7 +2653,7 @@ alloc_job: /* translate job_res->cpus array into format with rep count */ build_cnt = build_job_resources_cpu_array(job_res); - if (job_ptr->details->core_spec) { + if (job_ptr->details->whole_node) { int first, last = -1; first = bit_ffs(job_res->node_bitmap); if (first != -1) diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 20dde21729073ca1e749fe588624b67527e7627b..82734b36e5cc5512b343ceb1f9cb6ed48e2a940e 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -1059,9 +1059,12 @@ static int _job_expand(struct job_record *from_job_ptr, } } } - - to_job_ptr->total_cpus += new_job_resrcs_ptr-> - cpus[new_node_offset]; + if (to_job_ptr->details->whole_node) { + to_job_ptr->total_cpus += select_node_record[i].cpus; + } else { + to_job_ptr->total_cpus += new_job_resrcs_ptr-> + cpus[new_node_offset]; + } } build_job_resources_cpu_array(new_job_resrcs_ptr); gres_plugin_job_merge(from_job_ptr->gres_list, @@ -2618,7 +2621,7 @@ bitstr_t *_sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, { bitstr_t *sp_avail_bitmap; char str[300]; - uint32_t cores_per_node = 0; + uint32_t cores_per_node = 0, extra_cores_needed = 0; bitstr_t *tmpcore; int total_core_cnt = 0; @@ -2634,10 +2637,12 @@ bitstr_t *_sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, */ if ((node_cnt) && (core_cnt)) { - debug2("reserving %u cores per node in %d nodes", - cores_per_node, node_cnt); total_core_cnt = core_cnt[0]; cores_per_node = core_cnt[0] / MAX(node_cnt, 1); + debug2("Reserving %u cores across %d nodes", + total_core_cnt, node_cnt); + extra_cores_needed = total_core_cnt - + (cores_per_node * node_cnt); } if ((!node_cnt) && (core_cnt)) { int num_nodes = bit_set_count(avail_bitmap); @@ -2648,7 +2653,8 @@ bitstr_t *_sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, total_core_cnt += core_cnt[i]; } - debug2("Reservations requires %d cores", total_core_cnt); + debug2("Reservations requires %d cores (%u each on %d nodes, plus %u)", + total_core_cnt, cores_per_node, node_cnt, extra_cores_needed); sp_avail_bitmap = bit_alloc(bit_size(avail_bitmap)); bit_fmt(str, (sizeof(str) - 1), avail_bitmap); @@ -2718,8 +2724,11 @@ bitstr_t 
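The _sequential_pick() changes above split a reservation's core count into a per-node base plus an explicit remainder (extra_cores_needed), so that counts which do not divide evenly across the nodes are still fully reserved. A rough sketch of that split, with the first nodes simply absorbing one extra core each; the array and function names here are illustrative, not the plugin's own:

#include <stdio.h>

/* Distribute total_cores across node_cnt nodes: every node gets the base
 * share, and the remainder is spread one extra core at a time over the
 * first nodes, so the whole request is satisfied even when the division
 * is not even. */
static void spread_cores(unsigned total_cores, unsigned node_cnt,
                         unsigned *per_node)
{
    unsigned base = total_cores / node_cnt;
    unsigned extra = total_cores - base * node_cnt;
    unsigned i;

    for (i = 0; i < node_cnt; i++)
        per_node[i] = base + (i < extra ? 1 : 0);
}

int main(void)
{
    unsigned per_node[3];
    unsigned i;

    spread_cores(10, 3, per_node);    /* 4 + 3 + 3 */
    for (i = 0; i < 3; i++)
        printf("node %u: %u cores\n", i, per_node[i]);
    return 0;
}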
*_sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, bit_set(*core_bitmap, coff + i); total_core_cnt--; cores_in_node++; - if ((cores_in_node == cores_per_node) || - (total_core_cnt == 0)) + if (cores_in_node > cores_per_node) + extra_cores_needed--; + if ((total_core_cnt == 0) || + ((extra_cores_needed == 0) && + (cores_in_node >= cores_per_node))) break; } } diff --git a/src/plugins/task/cgroup/task_cgroup_cpuset.c b/src/plugins/task/cgroup/task_cgroup_cpuset.c index 03e13d4440ef8a1bc84cea289354e6d5b28376d4..6e081eb61209e080e7c66c9bada9c0424bce0cb7 100644 --- a/src/plugins/task/cgroup/task_cgroup_cpuset.c +++ b/src/plugins/task/cgroup/task_cgroup_cpuset.c @@ -53,6 +53,7 @@ #include "src/common/cpu_frequency.h" #include "src/common/slurm_resource_info.h" #include "src/common/bitstring.h" +#include "src/common/proc_args.h" #include "src/common/xstring.h" #include "src/common/xcgroup_read_config.h" #include "src/common/xcgroup.h" @@ -691,63 +692,81 @@ static int _task_cgroup_cpuset_dist_cyclic( { hwloc_obj_t obj; uint32_t *obj_idx; - uint32_t i, sock_idx, npskip, npdist, nsockets; + uint32_t i, j, sock_idx, sock_loop, ntskip, npdist, nsockets; uint32_t taskid = job->envtp->localid; if (bind_verbose) - info("task/cgroup: task[%u] using cyclic distribution, " - "task_dist %u", taskid, job->task_dist); + info("task/cgroup: task[%u] using %s distribution " + "(task_dist=%u)", taskid, + format_task_dist_states(job->task_dist), job->task_dist); nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_SOCKET); obj_idx = xmalloc(nsockets * sizeof(uint32_t)); if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) { /* cores or threads granularity */ - npskip = taskid * job->cpus_per_task; + ntskip = taskid; npdist = job->cpus_per_task; } else { /* sockets or ldoms granularity */ - npskip = taskid; + ntskip = taskid; npdist = 1; } - /* skip objs for lower taskids */ - i = 0; + /* skip objs for lower taskids, then add them to the + current task cpuset. To prevent infinite loop, check + that we do not loop more than npdist times around the available + sockets, which is the worst scenario we should afford here. */ + i = 0; j = 0; sock_idx = 0; - while (i < npskip) { - while ((sock_idx < nsockets) && (i < npskip)) { + sock_loop = 0; + while (i < ntskip + 1 && sock_loop < npdist + 1) { + /* fill one or multiple sockets using block mode, unless + otherwise stated in the job->task_dist field */ + while ((sock_idx < nsockets) && (j < npdist)) { obj = hwloc_get_obj_below_by_type( topology, HWLOC_OBJ_SOCKET, sock_idx, hwtype, obj_idx[sock_idx]); if (obj != NULL) { obj_idx[sock_idx]++; - i++; + j++; + if (i == ntskip) + _add_hwloc_cpuset(hwtype, req_hwtype, + obj, taskid, + bind_verbose, cpuset); + if ((j < npdist) && + ((job->task_dist == + SLURM_DIST_CYCLIC_CFULL) || + (job->task_dist == + SLURM_DIST_BLOCK_CFULL))) + sock_idx++; + } else { + sock_idx++; } - sock_idx++; } - if (i < npskip) + /* if it succeed, switch to the next task, starting + with the next available socket, otherwise, loop back + from the first socket trying to find available slots. 
*/ + if (j == npdist) { + i++; j = 0; + sock_idx++; // no validity check, handled by the while + sock_loop = 0; + } else { + sock_loop++; sock_idx = 0; - } - - /* distribute objs cyclically across sockets */ - i = npdist; - while (i > 0) { - while ((sock_idx < nsockets) && (i > 0)) { - obj = hwloc_get_obj_below_by_type( - topology, HWLOC_OBJ_SOCKET, sock_idx, - hwtype, obj_idx[sock_idx]); - if (obj != NULL) { - obj_idx[sock_idx]++; - _add_hwloc_cpuset(hwtype, req_hwtype, obj, - taskid, bind_verbose, cpuset); - i--; - } - sock_idx++; } - sock_idx = 0; } + xfree(obj_idx); - return XCGROUP_SUCCESS; + + /* should never happened in normal scenario */ + if (sock_loop > npdist) { + error("task/cgroup: task[%u] infinite loop broken while trying" + "to provision compute elements using %s", taskid, + format_task_dist_states(job->task_dist)); + return XCGROUP_ERROR; + } else + return XCGROUP_SUCCESS; } static int _task_cgroup_cpuset_dist_block( @@ -1119,8 +1138,11 @@ extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job) uint32_t jntasks = job->node_tasks; uint32_t jnpus; - job->cpus_per_task = MAX(1, job->cpus_per_task); - jnpus = jntasks * job->cpus_per_task; + if (job->batch) { + jnpus = job->cpus; + job->cpus_per_task = job->cpus; + } else + jnpus = jntasks * job->cpus_per_task; bind_type = job->cpu_bind_type; if (conf->task_plugin_param & CPU_BIND_VERBOSE || diff --git a/src/sacct/print.c b/src/sacct/print.c index d45c6a373d73583b799643d06937f9b9333ff8a2..44930447ec70941247b187d7a96fd666599e57a2 100644 --- a/src/sacct/print.c +++ b/src/sacct/print.c @@ -579,9 +579,11 @@ void print_fields(type_t type, void *object) } if (WIFSIGNALED(tmp_int)) tmp_int2 = WTERMSIG(tmp_int); - + tmp_int = WEXITSTATUS(tmp_int); + if (tmp_int >= 128) + tmp_int -= 128; snprintf(outbuf, sizeof(outbuf), "%d:%d", - WEXITSTATUS(tmp_int), tmp_int2); + tmp_int, tmp_int2); field->print_routine(field, outbuf, diff --git a/src/sacctmgr/cluster_functions.c b/src/sacctmgr/cluster_functions.c index 9c0a2952406bb1953c5ace168078fa9de432fda1..8859379872b1f3d8c318bd95598aef8904a1aa41 100644 --- a/src/sacctmgr/cluster_functions.c +++ b/src/sacctmgr/cluster_functions.c @@ -1148,10 +1148,10 @@ extern int sacctmgr_dump_cluster (int argc, char *argv[]) return SLURM_ERROR; } - line = xstrdup_printf("Cluster - %s", cluster_name); + line = xstrdup_printf("Cluster - '%s'", cluster_name); if (class_str) - xstrfmtcat(line, ":Classification=%s", class_str); + xstrfmtcat(line, ":Classification='%s'", class_str); slurmdb_hierarchical_rec = list_peek(slurmdb_hierarchical_rec_list); assoc = slurmdb_hierarchical_rec->assoc; diff --git a/src/sacctmgr/file_functions.c b/src/sacctmgr/file_functions.c index 7359dd193db45dd08f9551d654dcf1e83635f844..717e5498c3c4bedf44aeec14e17d99adc2130fad 100644 --- a/src/sacctmgr/file_functions.c +++ b/src/sacctmgr/file_functions.c @@ -287,7 +287,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) fprintf(stderr, " Bad format on %s: " "End your option with " "an '=' sign\n", sub); - _destroy_sacctmgr_file_opts(file_opts); break; } file_opts->name = xstrdup(option); @@ -320,12 +319,12 @@ static sacctmgr_file_opts_t *_parse_options(char *options) g_qos_list, option); if (file_opts->def_qos_id == NO_VAL) { + exit_code=1; fprintf(stderr, "You gave a bad qos '%s'. 
" "Use 'list qos' to get " "complete list.\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "DefaultWCKey", @@ -347,7 +346,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad FairShare value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpCPUMins", @@ -357,7 +355,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpCPUMins value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpCPUs", MAX(command_len, 7))) { @@ -366,7 +363,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpCPUs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpJobs", MAX(command_len, 4))) { @@ -375,7 +371,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpJobs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpMemory", @@ -385,7 +380,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpMemory value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpNodes", @@ -395,7 +389,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpNodes value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpSubmitJobs", @@ -405,7 +398,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad GrpJobs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "GrpWall", MAX(command_len, 4))) { @@ -420,7 +412,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) fprintf(stderr, " Bad GrpWall time format: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxCPUMinsPerJob", @@ -432,7 +423,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad MaxCPUMins value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxCPUsPerJob", @@ -442,7 +432,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad MaxCPUs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxJobs", MAX(command_len, 4))) { @@ -451,7 +440,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad MaxJobs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxNodesPerJob", @@ -461,7 +449,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad MaxNodes value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxSubmitJobs", @@ -471,7 +458,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) exit_code=1; fprintf(stderr, " Bad MaxJobs value: %s\n", option); - _destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "MaxWallDurationPerJob", @@ -487,7 +473,6 @@ static sacctmgr_file_opts_t *_parse_options(char *options) fprintf(stderr, " Bad MaxWall time format: %s\n", option); - 
_destroy_sacctmgr_file_opts(file_opts); break; } } else if (!strncasecmp (sub, "Organization", @@ -521,6 +506,7 @@ static sacctmgr_file_opts_t *_parse_options(char *options) } else { exit_code=1; fprintf(stderr, " Unknown option: %s\n", sub); + break; } xfree(sub); @@ -539,9 +525,9 @@ static sacctmgr_file_opts_t *_parse_options(char *options) if (!file_opts->name) { exit_code=1; fprintf(stderr, " No name given\n"); - _destroy_sacctmgr_file_opts(file_opts); - file_opts = NULL; - } else if (exit_code) { + } + + if (exit_code) { _destroy_sacctmgr_file_opts(file_opts); file_opts = NULL; } @@ -1615,7 +1601,7 @@ static int _print_file_slurmdb_hierarchical_rec_children( user_list, slurmdb_hierarchical_rec->assoc->user); line = xstrdup_printf( - "User - %s", + "User - '%s'", slurmdb_hierarchical_rec->sort_name); if (slurmdb_hierarchical_rec->assoc->partition) xstrfmtcat(line, ":Partition='%s'", @@ -1702,7 +1688,7 @@ static int _print_file_slurmdb_hierarchical_rec_children( acct_list, slurmdb_hierarchical_rec->assoc->acct); line = xstrdup_printf( - "Account - %s", + "Account - '%s'", slurmdb_hierarchical_rec->sort_name); if (acct_rec) { xstrfmtcat(line, ":Description='%s'", @@ -1822,12 +1808,12 @@ extern int print_file_slurmdb_hierarchical_rec_list( slurmdb_hierarchical_rec->assoc->user); */ if (!list_count(slurmdb_hierarchical_rec->children)) continue; - if (fprintf(fd, "Parent - %s\n", + if (fprintf(fd, "Parent - '%s'\n", slurmdb_hierarchical_rec->assoc->acct) < 0) { error("Can't write to file"); return SLURM_ERROR; } - info("%s - %s", "Parent", + info("%s - '%s'", "Parent", slurmdb_hierarchical_rec->assoc->acct); /* info("sending %d from %s", */ /* list_count(slurmdb_hierarchical_rec->children), */ diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c index a4c85279c945647d6381563f9393145bf3545e8c..5531385640f39867fe2b69b7a13c398b20bec3f9 100644 --- a/src/sbatch/sbatch.c +++ b/src/sbatch/sbatch.c @@ -466,7 +466,8 @@ static int _fill_job_desc_from_opts(job_desc_msg_t *desc) desc->time_limit = opt.time_limit; if (opt.time_min != NO_VAL) desc->time_min = opt.time_min; - desc->shared = opt.shared; + if (opt.shared != (uint16_t) NO_VAL) + desc->shared = opt.shared; desc->wait_all_nodes = opt.wait_all_nodes; if (opt.warn_flags) diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index 50822f4d29ca99985750e999ba5752b81fe63612..75b67e45a86e66b6c05e389cc1a95e47975897e9 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -1002,14 +1002,17 @@ scontrol_update_job (int argc, char *argv[]) } for (i = 0; i < num_ids; i++) { job_msg.job_id = ids[i].job_id; + rc = 0; if (slurm_update_job(&job_msg)) { rc = slurm_get_errno(); if (ids[i].array_task_id == NO_VAL) { - error("Error updating job %u", ids[i].job_id); + error("Error updating job %u: %s", + ids[i].job_id, slurm_strerror(rc)); } else { - error("Error updating job %u_%u (%u)", - ids[i].array_job_id, ids[i].array_task_id, - ids[i].job_id); + error("Error updating job %u_%u (%u): %s", + ids[i].array_job_id, + ids[i].array_task_id, + ids[i].job_id, slurm_strerror(rc)); } } } diff --git a/src/sinfo/opts.c b/src/sinfo/opts.c index cae1e3eda5ddd6a32014f10c30657cb0ee3b3256..ab91ad61d517674c8f6f708d3f0c80e740e3a837 100644 --- a/src/sinfo/opts.c +++ b/src/sinfo/opts.c @@ -287,7 +287,7 @@ extern void parse_command_line(int argc, char *argv[]) } else { params.part_field_flag = true; /* compute size later */ params.format = params.long_output ? 
- "%9P %.5a %.10l %.10s %.4r %.5h %.10g %.6D %.11T %N" : + "%9P %.5a %.10l %.10s %.4r %.8h %.10g %.6D %.11T %N" : "%9P %.5a %.10l %.6D %.6t %N"; } } diff --git a/src/sinfo/sinfo.c b/src/sinfo/sinfo.c index 15a17dc88c794a79c51083aa0bf92ada9818927b..366363db13dc04eb11f8972c3670eb1e0605cdce 100644 --- a/src/sinfo/sinfo.c +++ b/src/sinfo/sinfo.c @@ -65,6 +65,7 @@ static int g_node_scaling = 1; static int sinfo_cnt; /* thread count */ static pthread_mutex_t sinfo_cnt_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t sinfo_cnt_cond = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t sinfo_list_mutex = PTHREAD_MUTEX_INITIALIZER; /************ * Funtions * @@ -90,6 +91,7 @@ static int _query_server(partition_info_msg_t ** part_pptr, block_info_msg_t ** block_pptr, reserve_info_msg_t ** reserv_pptr, bool clear_old); static int _reservation_report(reserve_info_msg_t *resv_ptr); +static bool _serial_part_data(void); static void _sort_hostlist(List sinfo_list); static int _strcmp(char *data1, char *data2); static void _update_sinfo(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr, @@ -425,6 +427,8 @@ void *_build_part_info(void *args) uint16_t part_num; int j = 0; + if (_serial_part_data()) + slurm_mutex_lock(&sinfo_list_mutex); build_struct_ptr = (build_part_info_t *) args; sinfo_list = build_struct_ptr->sinfo_list; part_num = build_struct_ptr->part_num; @@ -462,6 +466,8 @@ void *_build_part_info(void *args) } xfree(args); + if (_serial_part_data()) + slurm_mutex_unlock(&sinfo_list_mutex); slurm_mutex_lock(&sinfo_cnt_mutex); if (sinfo_cnt > 0) { sinfo_cnt--; @@ -794,6 +800,18 @@ static bool _match_node_data(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr) return true; } +/* Return true if the processing of partition data must be serialized. In that + * case, multiple partitions can write into the same sinfo data structure + * entries. The logic here is similar to that in _match_part_data() below. 
*/ +static bool _serial_part_data(void) +{ + if (params.list_reasons) /* Don't care about partition */ + return true; + if (params.match_flags.partition_flag) /* Match partition name */ + return false; + return true; +} + static bool _match_part_data(sinfo_data_t *sinfo_ptr, partition_info_t* part_ptr) { @@ -804,7 +822,8 @@ static bool _match_part_data(sinfo_data_t *sinfo_ptr, if ((part_ptr == NULL) || (sinfo_ptr->part_info == NULL)) return false; - if ((_strcmp(part_ptr->name, sinfo_ptr->part_info->name))) + if (params.match_flags.partition_flag + && (_strcmp(part_ptr->name, sinfo_ptr->part_info->name))) return false; if (params.match_flags.avail_flag && diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c index 0202aa71b4cba8f0bfdf3782d64c5300a32265aa..34f60155fe312d53bcbf7d95ff11d8d5dec498ed 100644 --- a/src/slurmctld/acct_policy.c +++ b/src/slurmctld/acct_policy.c @@ -559,6 +559,7 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc, qos_max_nodes_limit = MIN(qos_ptr->grp_nodes, qos_ptr->max_nodes_pu); + if ((acct_policy_limit_set->max_nodes == ADMIN_SET_LIMIT) || (qos_max_nodes_limit == INFINITE) || (update_call && (job_desc->max_nodes == NO_VAL))) { @@ -1928,54 +1929,58 @@ end_it: extern uint32_t acct_policy_get_max_nodes(struct job_record *job_ptr) { - uint32_t max_nodes_limit = INFINITE; + uint32_t max_nodes_limit = INFINITE, qos_max_p_limit = INFINITE; assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; + slurmdb_qos_rec_t *qos_ptr = job_ptr->qos_ptr; + slurmdb_association_rec_t *assoc_ptr = job_ptr->assoc_ptr; + bool parent = 0; /* flag to tell us if we are looking at the + * parent or not + */ + bool grp_set = 0; /* check to see if we are enforcing associations */ if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) return max_nodes_limit; assoc_mgr_lock(&locks); - if (job_ptr->qos_ptr) { - slurmdb_qos_rec_t *qos_ptr = job_ptr->qos_ptr; - max_nodes_limit = - MIN(qos_ptr->grp_nodes, qos_ptr->max_nodes_pu); + if (qos_ptr) { + qos_max_p_limit = max_nodes_limit = + MIN(qos_ptr->max_nodes_pj, qos_ptr->max_nodes_pu); max_nodes_limit = - MIN(max_nodes_limit, qos_ptr->max_nodes_pj); + MIN(max_nodes_limit, qos_ptr->grp_nodes); } - if (max_nodes_limit == INFINITE) { - slurmdb_association_rec_t *assoc_ptr = job_ptr->assoc_ptr; - bool parent = 0; /* flag to tell us if we are looking at the - * parent or not - */ - bool grp_set = 0; - - while (assoc_ptr) { - if (assoc_ptr->grp_nodes != INFINITE) { - max_nodes_limit = MIN(max_nodes_limit, - assoc_ptr->grp_nodes); - grp_set = 1; - } + /* We have to traverse all the associations because QOS might + not override a particular limit. 
+ */ + while (assoc_ptr) { + if ((!qos_ptr || (qos_ptr->grp_nodes == INFINITE)) + && (assoc_ptr->grp_nodes != INFINITE)) { + max_nodes_limit = MIN(max_nodes_limit, + assoc_ptr->grp_nodes); + grp_set = 1; + } - if (!parent && (assoc_ptr->max_nodes_pj != INFINITE)) - max_nodes_limit = MIN(max_nodes_limit, - assoc_ptr->max_nodes_pj); + if (!parent + && (qos_max_p_limit == INFINITE) + && (assoc_ptr->max_nodes_pj != INFINITE)) + max_nodes_limit = MIN(max_nodes_limit, + assoc_ptr->max_nodes_pj); - /* only check the first grp set */ - if (grp_set) - break; - - assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; - parent = 1; - continue; - } + /* only check the first grp set */ + if (grp_set) + break; + assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; + parent = 1; + continue; } + assoc_mgr_unlock(&locks); return max_nodes_limit; } + /* * acct_policy_update_pending_job - Make sure the limits imposed on a job on * submission are correct after an update to a qos or association. If diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index fe0a603e56c9be2f4441fef13e3bb9370312e0b1..1e7f5b42475a802d5059a56a7784c96314c727f8 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -87,6 +87,7 @@ #include "src/common/uid.h" #include "src/common/xsignal.h" #include "src/common/xstring.h" +#include "src/common/slurm_protocol_interface.h" #include "src/slurmctld/acct_policy.h" #include "src/slurmctld/agent.h" @@ -975,6 +976,15 @@ static void *_slurmctld_rpc_mgr(void *no_data) conn_arg->newsockfd = newsockfd; memcpy(&conn_arg->cli_addr, &cli_addr, sizeof(slurm_addr_t)); + if (slurmctld_conf.debug_flags & DEBUG_FLAG_PROTOCOL) { + char inetbuf[64]; + + _slurm_print_slurm_addr(&cli_addr, + inetbuf, + sizeof(inetbuf)); + info("%s: accept() connection from %s", __func__, inetbuf); + } + if (slurmctld_config.shutdown_time) no_thread = 1; else if (pthread_create(&thread_id_rpc_req, @@ -1613,20 +1623,12 @@ static void *_slurmctld_background(void *no_data) _accounting_cluster_ready(); } + /* Stats will reset at midnight (approx) local time. */ if (last_proc_req_start == 0) { - /* Stats will reset at midnight (aprox). - * Uhmmm... UTC time?... It is not so important. - * Just resetting during the night */ - last_proc_req_start = now; - next_stats_reset = last_proc_req_start - - (last_proc_req_start % 86400) + - 86400; - } - - if ((next_stats_reset > 0) && (now > next_stats_reset)) { - /* Resetting stats values */ last_proc_req_start = now; next_stats_reset = now - (now % 86400) + 86400; + } else if (now >= next_stats_reset) { + next_stats_reset = now - (now % 86400) + 86400; reset_stats(0); } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index a40dee5068cad7ca9caaa61d19a19689491cd053..0b29666f8310b99668dd99e6b81fc41ae73d5bb7 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2841,7 +2841,7 @@ void dump_job_desc(job_desc_msg_t * job_specs) { long job_id, time_min; long pn_min_cpus, pn_min_memory, pn_min_tmp_disk, min_cpus; - long time_limit, priority, contiguous; + long time_limit, priority, contiguous, nice; long kill_on_node_fail, shared, immediate, wait_all_nodes; long cpus_per_task, requeue, num_tasks, overcommit; long ntasks_per_node, ntasks_per_socket, ntasks_per_core; @@ -2973,10 +2973,11 @@ void dump_job_desc(job_desc_msg_t * job_specs) (long) job_specs->num_tasks : -1L; overcommit = (job_specs->overcommit != (uint8_t) NO_VAL) ? 
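The statistics-reset change in _slurmctld_background() above rounds the current time up to the next multiple of 86400 seconds, which is the next midnight in UTC rather than strictly local time (hence the "approx" in the patched comment). A one-function sketch of that rounding:

#include <stdio.h>
#include <time.h>

/* Round a Unix timestamp up to the next multiple of 86400 seconds,
 * i.e. the next midnight in UTC. */
static time_t next_daily_reset(time_t now)
{
    return now - (now % 86400) + 86400;
}

int main(void)
{
    time_t now = time(NULL);

    printf("now=%ld next_reset=%ld\n", (long) now,
           (long) next_daily_reset(now));
    return 0;
}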
(long) job_specs->overcommit : -1L; - debug3(" mail_type=%u mail_user=%s nice=%d num_tasks=%ld " + nice = (job_specs->nice != (uint16_t) NO_VAL) ? + (job_specs->nice - NICE_OFFSET) : 0; + debug3(" mail_type=%u mail_user=%s nice=%ld num_tasks=%ld " "open_mode=%u overcommit=%ld acctg_freq=%s", - job_specs->mail_type, job_specs->mail_user, - (int)job_specs->nice - NICE_OFFSET, num_tasks, + job_specs->mail_type, job_specs->mail_user, nice, num_tasks, job_specs->open_mode, overcommit, job_specs->acctg_freq); slurm_make_time_str(&job_specs->begin_time, buf, sizeof(buf)); @@ -3707,6 +3708,7 @@ _signal_batch_job(struct job_record *job_ptr, uint16_t signal) bitoff_t i; kill_tasks_msg_t *kill_tasks_msg = NULL; agent_arg_t *agent_args = NULL; + uint32_t z; xassert(job_ptr); xassert(job_ptr->batch_host); @@ -3734,7 +3736,13 @@ _signal_batch_job(struct job_record *job_ptr, uint16_t signal) kill_tasks_msg = xmalloc(sizeof(kill_tasks_msg_t)); kill_tasks_msg->job_id = job_ptr->job_id; kill_tasks_msg->job_step_id = NO_VAL; - kill_tasks_msg->signal = signal; + /* Encode the KILL_JOB_BATCH flag for + * stepd to know if has to signal only + * the batch script. The job was submitted + * using the --signal=B:sig sbatch option. + */ + z = KILL_JOB_BATCH << 24; + kill_tasks_msg->signal = z|signal; agent_args->msg_args = kill_tasks_msg; agent_args->node_count = 1;/* slurm/477 be sure to update node_count */ @@ -3805,8 +3813,11 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, return ESLURM_INVALID_JOB_ID; } - if (IS_JOB_FINISHED(job_ptr)) + if (IS_JOB_FINISHED(job_ptr)) { + if (job_ptr->exit_code == 0) + job_ptr->exit_code = job_return_code; return ESLURM_ALREADY_DONE; + } if ((job_ptr->user_id != uid) && !validate_slurm_user(uid)) { error("Security violation, JOB_COMPLETE RPC for job %u " @@ -3827,7 +3838,8 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, if ((job_return_code == NO_VAL) && (IS_JOB_RUNNING(job_ptr) || IS_JOB_PENDING(job_ptr))) { - info("Job %u cancelled from interactive user", job_ptr->job_id); + info("Job %u cancelled from interactive user or node failure", + job_ptr->job_id); } if (IS_JOB_SUSPENDED(job_ptr)) { @@ -4061,7 +4073,6 @@ static int _part_access_check(struct part_record *part_ptr, } if (slurmctld_conf.enforce_part_limits) { - info("checking here"); if ((rc = part_policy_valid_acct(part_ptr, acct)) != SLURM_SUCCESS) goto fini; @@ -6998,6 +7009,26 @@ static void _pack_default_job_details(struct job_record *job_ptr, char *cmd_line = NULL; char *tmp = NULL; uint32_t len = 0; + uint16_t shared = 0; + + if (!detail_ptr) + shared = (uint16_t) NO_VAL; + else if (detail_ptr->share_res == 1) /* User --share */ + shared = 1; + else if ((detail_ptr->share_res == 0) || + (detail_ptr->whole_node == 1)) /* User --exclusive */ + shared = 0; + else if (job_ptr->part_ptr) { + /* Report shared status based upon latest partition info */ + if ((job_ptr->part_ptr->max_share & SHARED_FORCE) && + ((job_ptr->part_ptr->max_share & (~SHARED_FORCE)) > 1)) + shared = 1; /* Partition Shared=force */ + else if (job_ptr->part_ptr->max_share == 0) + shared = 0; /* Partition Shared=exclusive */ + else + shared = 0; /* Part Shared=yes or no */ + } else + shared = (uint16_t) NO_VAL; /* No user or partition info */ if (max_cpu_cnt == -1) max_cpu_cnt = _find_node_max_cpu_cnt(); @@ -7068,6 +7099,7 @@ static void _pack_default_job_details(struct job_record *job_ptr, } pack16(detail_ptr->requeue, buffer); pack16(detail_ptr->ntasks_per_node, buffer); + pack16(shared, buffer); } else { 
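The shared value packed by _pack_default_job_details() above prefers the job's own --shared/--exclusive request and only then falls back to the partition's Shared= configuration. A condensed sketch of that decision; SHARED_FORCE_FLAG and SHARED_UNKNOWN are local stand-ins for the real SHARED_FORCE and NO_VAL constants:

#include <stdint.h>
#include <stdio.h>

#define SHARED_FORCE_FLAG 0x8000    /* stand-in for Slurm's SHARED_FORCE */
#define SHARED_UNKNOWN    0xffff    /* stand-in for (uint16_t) NO_VAL */

/* Report whether a job shares nodes: honor an explicit user request first
 * (user_share_res: 1 = --shared, 0 = --exclusive, -1 = unset), otherwise
 * fall back to the partition's Shared= setting; return SHARED_UNKNOWN if
 * neither is available. */
static uint16_t shared_value(int user_share_res, int whole_node,
                             int have_part, uint16_t part_max_share)
{
    if (user_share_res == 1)                   /* user asked to share */
        return 1;
    if (user_share_res == 0 || whole_node)     /* user asked for exclusive */
        return 0;
    if (!have_part)
        return SHARED_UNKNOWN;
    if ((part_max_share & SHARED_FORCE_FLAG) &&
        ((part_max_share & ~SHARED_FORCE_FLAG) > 1))
        return 1;                              /* Shared=FORCE:n with n > 1 */
    return 0;                                  /* Shared=EXCLUSIVE, YES or NO */
}

int main(void)
{
    /* Unset user request, partition Shared=FORCE:4 -> reported as shared. */
    printf("%u\n", (unsigned) shared_value(-1, 0, 1, SHARED_FORCE_FLAG | 4));
    return 0;
}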
packnull(buffer); packnull(buffer); @@ -7084,6 +7116,7 @@ static void _pack_default_job_details(struct job_record *job_ptr, pack32((uint32_t) 0, buffer); pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); + pack16((uint16_t) 0, buffer); } } else if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) { if (detail_ptr) { @@ -7139,6 +7172,7 @@ static void _pack_default_job_details(struct job_record *job_ptr, pack32(detail_ptr->max_nodes, buffer); } pack16(detail_ptr->requeue, buffer); + pack16(shared, buffer); } else { packnull(buffer); packnull(buffer); @@ -7154,6 +7188,7 @@ static void _pack_default_job_details(struct job_record *job_ptr, pack32(job_ptr->node_cnt, buffer); pack32((uint32_t) 0, buffer); pack16((uint16_t) 0, buffer); + pack16((uint16_t) 0, buffer); } } else { error("_pack_default_job_details: protocol_version " @@ -7165,20 +7200,8 @@ static void _pack_default_job_details(struct job_record *job_ptr, static void _pack_pending_job_details(struct job_details *detail_ptr, Buf buffer, uint16_t protocol_version) { - uint16_t shared = 0; - - if (!detail_ptr) - shared = (uint16_t) NO_VAL; - else if (detail_ptr->share_res == 1) - shared = 1; - else if (detail_ptr->whole_node == 1) - shared = 0; - else - shared = (uint16_t) NO_VAL; - if (protocol_version >= SLURM_14_03_PROTOCOL_VERSION) { if (detail_ptr) { - pack16(shared, buffer); pack16(detail_ptr->contiguous, buffer); pack16(detail_ptr->core_spec, buffer); pack16(detail_ptr->cpus_per_task, buffer); @@ -7204,7 +7227,6 @@ static void _pack_pending_job_details(struct job_details *detail_ptr, pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); - pack16((uint16_t) 0, buffer); pack32((uint32_t) 0, buffer); pack32((uint32_t) 0, buffer); @@ -7222,7 +7244,6 @@ static void _pack_pending_job_details(struct job_details *detail_ptr, } } else if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) { if (detail_ptr) { - pack16(shared, buffer); pack16(detail_ptr->contiguous, buffer); pack16(detail_ptr->cpus_per_task, buffer); pack16(detail_ptr->pn_min_cpus, buffer); @@ -7242,7 +7263,6 @@ static void _pack_pending_job_details(struct job_details *detail_ptr, pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); - pack16((uint16_t) 0, buffer); pack32((uint32_t) 0, buffer); pack32((uint32_t) 0, buffer); @@ -7716,8 +7736,9 @@ static bool _top_priority(struct job_record *job_ptr) if ((!top) && detail_ptr) { /* not top prio */ if (job_ptr->priority == 0) { /* user/admin hold */ - if ((job_ptr->state_reason != WAIT_HELD) && - (job_ptr->state_reason != WAIT_HELD_USER)) { + if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS + && (job_ptr->state_reason != WAIT_HELD) + && (job_ptr->state_reason != WAIT_HELD_USER)) { job_ptr->state_reason = WAIT_HELD; xfree(job_ptr->state_desc); } @@ -9531,8 +9552,8 @@ static void _send_job_kill(struct job_record *job_ptr) if (agent_args->node_count == 0) { if ((job_ptr->details->expanding_jobid == 0) && (select_serial == 0)) { - error("Job %u allocated no nodes to be killed on", - job_ptr->job_id); + error("%s: job %u allocated no nodes to be killed on", + __func__, job_ptr->job_id); } xfree(kill_job->nodes); xfree(kill_job); @@ -10258,14 +10279,12 @@ extern bool job_epilog_complete(uint32_t job_id, char *node_name, * subsequent jobs appear in a separate accounting record. 
*/ void batch_requeue_fini(struct job_record *job_ptr) { - time_t now; - if (IS_JOB_COMPLETING(job_ptr) || !IS_JOB_PENDING(job_ptr) || !job_ptr->batch_flag) return; info("requeue batch job %u", job_ptr->job_id); - now = time(NULL); + /* Clear everything so this appears to be a new job and then restart * it in accounting. */ job_ptr->start_time = 0; @@ -10289,14 +10308,14 @@ void batch_requeue_fini(struct job_record *job_ptr) FREE_NULL_BITMAP(job_ptr->node_bitmap); FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); if (job_ptr->details) { + time_t now = time(NULL); /* the time stamp on the new batch launch credential must be * larger than the time stamp on the revoke request. Also the * I/O must be all cleared out and the named socket purged, * so delay for at least ten seconds. */ if (job_ptr->details->begin_time <= now) job_ptr->details->begin_time = now + 10; - if (!with_slurmdbd) - jobacct_storage_g_job_start(acct_db_conn, job_ptr); + /* Since this could happen on a launch we need to make sure the * submit isn't the same as the last submit so put now + 1 so * we get different records in the database */ @@ -10308,6 +10327,8 @@ void batch_requeue_fini(struct job_record *job_ptr) /* Reset this after the batch step has finished or the batch step * information will be attributed to the next run of the job. */ job_ptr->db_index = 0; + if (!with_slurmdbd) + jobacct_storage_g_job_start(acct_db_conn, job_ptr); } @@ -10965,7 +10986,12 @@ extern int job_requeue(uid_t uid, goto reply; } - if ((job_ptr->details == NULL) || (job_ptr->details->requeue == 0)) { + /* If the partition was removed don't allow the job to be + * requeued. If it doesn't have details then something is very + * wrong and if the job doesn't want to be requeued don't. + */ + if (!job_ptr->part_ptr || !job_ptr->details + || !job_ptr->details->requeue) { rc = ESLURM_DISABLED; goto reply; } @@ -11000,7 +11026,7 @@ extern int job_requeue(uid_t uid, /* we can't have it as suspended when we call the * accounting stuff. */ - job_ptr->job_state = JOB_CANCELLED; + job_ptr->job_state = JOB_REQUEUE; jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); job_ptr->job_state = suspend_job_state; suspended = true; @@ -11021,10 +11047,10 @@ extern int job_requeue(uid_t uid, || IS_JOB_RUNNING(job_ptr)) is_running = true; - /* We want this job to look like it was cancelled in the + /* We want this job to have the requeued state in the * accounting logs. Set a new submit time so the restarted * job looks like a new job. */ - job_ptr->job_state = JOB_CANCELLED; + job_ptr->job_state = JOB_REQUEUE; build_cg_bitmap(job_ptr); job_completion_logger(job_ptr, true); @@ -12080,7 +12106,7 @@ extern void job_hold_requeue(struct job_record *job_ptr) job_ptr->state_reason, job_ptr->priority); } -/* Reset a job's end-time based upon it's end_time. +/* Reset a job's end_time based upon it's start_time and time_limit. 
* NOTE: Do not reset the end_time if already being preempted */ extern void job_end_time_reset(struct job_record *job_ptr) { diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 6f147b1040a185d0ca2f7bd8595f3e825f62afa7..35055876786d8bdd736b692ffa19bdf489cc681c 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -198,8 +198,9 @@ static bool _job_runnable_test1(struct job_record *job_ptr, bool clear_start) if (clear_start) job_ptr->start_time = (time_t) 0; if (job_ptr->priority == 0) { /* held */ - if ((job_ptr->state_reason != WAIT_HELD) && - (job_ptr->state_reason != WAIT_HELD_USER)) { + if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS + && (job_ptr->state_reason != WAIT_HELD) + && (job_ptr->state_reason != WAIT_HELD_USER)) { job_ptr->state_reason = WAIT_HELD; xfree(job_ptr->state_desc); last_job_update = time(NULL); @@ -849,8 +850,8 @@ extern int schedule(uint32_t job_limit) xfree(sched_params); sched_update = slurmctld_conf.last_update; - info("SchedulingParameters: default_queue_depth=%d " - "max_rpc_cnt=%d max_sched_time=%d partition_job_depth=%d ", + info("SchedulerParameters=default_queue_depth=%d," + "max_rpc_cnt=%d,max_sched_time=%d,partition_job_depth=%d", def_job_limit, defer_rpc_cnt, sched_timeout, max_jobs_per_part); } @@ -1319,13 +1320,11 @@ next_part: part_ptr = (struct part_record *) job_ptr->job_id, slurm_strerror(error_code)); if (!wiki_sched) { last_job_update = now; - job_ptr->job_state = JOB_FAILED; - job_ptr->exit_code = 1; + job_ptr->job_state = JOB_PENDING; job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; xfree(job_ptr->state_desc); job_ptr->start_time = job_ptr->end_time = now; - job_completion_logger(job_ptr, false); - delete_job_details(job_ptr); + job_ptr->priority = 0; } } @@ -3003,15 +3002,23 @@ static int _valid_node_feature(char *feature) return rc; } -/* If a job can run in multiple partitions, make sure that the one - * actually used is first in the string. Needed for job state save/restore */ +/* If a job can run in multiple partitions, when it is started we want to + * put the name of the partition used _first_ in that list. When slurmctld + * restarts, that will be used to set the job's part_ptr and that will be + * reported to squeue. We leave all of the partitions in the list though, + * so the job can be requeued and have access to them all. 
*/ extern void rebuild_job_part_list(struct job_record *job_ptr) { ListIterator part_iterator; struct part_record *part_ptr; - if ((job_ptr->part_ptr_list == NULL) || (job_ptr->part_ptr == NULL)) + if (!job_ptr->part_ptr_list) return; + if (!job_ptr->part_ptr || !job_ptr->part_ptr->name) { + error("Job %u has NULL part_ptr or the partition name is NULL", + job_ptr->job_id); + return; + } xfree(job_ptr->partition); job_ptr->partition = xstrdup(job_ptr->part_ptr->name); diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 5c583991ad99b8aef615bc1a58419021519e4727..97f80c504661806c1b29ec203880305671ed4939 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -605,7 +605,9 @@ unpack_error: list_destroy(gres_list); gres_list = NULL; } - xfree (node_name); + xfree(comm_name); + xfree(node_hostname); + xfree(node_name); xfree(reason); goto fini; } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 4c35a5f712da5650f78c1653f80a5093296a7a37..ec86c166fa32ef0a78492e5aea7f3de7acc270d1 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -200,7 +200,7 @@ static int _get_gres_alloc(struct job_record *job_ptr) /* * _get_gres_config - Fill in the gres_alloc string field for a given * job_record with the count of gres on each node (e.g. for whole node - * allocations. + * allocations). * IN job_ptr - the job record whose "gres_alloc" field is to be constructed * RET Error number. Currently not used (always set to 0). */ @@ -602,8 +602,8 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, if (agent_args->node_count == 0) { if ((job_ptr->details->expanding_jobid == 0) && (select_serial == 0)) { - error("Job %u allocated no nodes to be killed on", - job_ptr->job_id); + error("%s: job %u allocated no nodes to be killed on", + __func__, job_ptr->job_id); } slurm_free_kill_job_msg(kill_job); hostlist_destroy(agent_args->hostlist); @@ -681,8 +681,8 @@ static int _match_feature(char *seek, struct node_set *node_set_ptr) * 1 = exclusive * * Return values: - * 0 = no sharing - * 1 = share resources + * 0 = requires idle nodes + * 1 = can use non-idle nodes */ static int _resolve_shared_status(struct job_record *job_ptr, uint16_t part_max_share, @@ -691,31 +691,36 @@ _resolve_shared_status(struct job_record *job_ptr, uint16_t part_max_share, /* no sharing if partition Shared=EXCLUSIVE */ if (part_max_share == 0) { job_ptr->details->whole_node = 1; + job_ptr->details->share_res = 0; return 0; } /* sharing if partition Shared=FORCE with count > 1 */ if ((part_max_share & SHARED_FORCE) && - ((part_max_share & (~SHARED_FORCE)) > 1)) + ((part_max_share & (~SHARED_FORCE)) > 1)) { + job_ptr->details->share_res = 1; return 1; + } if (cons_res_flag) { - if (part_max_share == 1) /* partition configured Shared=NO */ - return 0; if ((job_ptr->details->share_res == 0) || - (job_ptr->details->share_res == (uint8_t) NO_VAL) || - (job_ptr->details->whole_node == 1)) + (job_ptr->details->whole_node == 1)) { + job_ptr->details->share_res = 0; return 0; + } return 1; } else { job_ptr->details->whole_node = 1; - if (part_max_share == 1) /* partition configured Shared=NO */ + if (part_max_share == 1) { /* partition configured Shared=NO */ + job_ptr->details->share_res = 0; return 0; + } /* share if the user requested it */ if (job_ptr->details->share_res == 1) return 1; + job_ptr->details->share_res = 0; + return 0; } - return 0; } /* @@ -1078,7 +1083,6 @@ _pick_best_nodes(struct node_set *node_set_ptr, int 
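As the expanded comment above describes, rebuild_job_part_list() rewrites the job's partition string with the partition actually used first while keeping the other partitions available for a later requeue. A stand-alone sketch of that reordering over plain C strings (the real code iterates a List of part_record pointers instead):

#include <stdio.h>
#include <string.h>

/* Write a comma-separated partition list into buf with 'active' first,
 * followed by every other name in its original order. */
static void build_part_list(char *buf, size_t buflen, const char *active,
                            const char *names[], int n_names)
{
    int i;

    snprintf(buf, buflen, "%s", active);
    for (i = 0; i < n_names; i++) {
        if (strcmp(names[i], active) == 0)
            continue;    /* already placed first */
        strncat(buf, ",", buflen - strlen(buf) - 1);
        strncat(buf, names[i], buflen - strlen(buf) - 1);
    }
}

int main(void)
{
    const char *names[] = { "debug", "batch", "long" };
    char buf[64];

    build_part_list(buf, sizeof(buf), "batch", names, 3);
    printf("%s\n", buf);    /* batch,debug,long */
    return 0;
}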
node_set_size, shared = _resolve_shared_status(job_ptr, part_ptr->max_share, cr_enabled); - job_ptr->details->share_res = shared; if (cr_enabled) job_ptr->cr_enabled = cr_enabled; /* CR enabled for this job */ @@ -1587,8 +1591,9 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, } if (job_ptr->priority == 0) { /* user/admin hold */ - if ((job_ptr->state_reason != WAIT_HELD) && - (job_ptr->state_reason != WAIT_HELD_USER)) { + if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS + && (job_ptr->state_reason != WAIT_HELD) + && (job_ptr->state_reason != WAIT_HELD_USER)) { job_ptr->state_reason = WAIT_HELD; } return ESLURM_JOB_HELD; @@ -1794,6 +1799,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, select_bitmap = NULL; /* nothing left to free */ allocate_nodes(job_ptr); build_node_details(job_ptr, true); + rebuild_job_part_list(job_ptr); /* This could be set in the select plugin so we want to keep the flag. */ diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 6feb3336a7fb3d94a5daa2c912ddd931f860b09d..dfa2be9250399e7a1bf76b17c5f5fe07da337f96 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1797,8 +1797,15 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg) return; } - /* Send batch step info to accounting */ - if (association_based_accounting && job_ptr) { + /* Send batch step info to accounting, only if the job is + * still completing. If the job was requeued because of node + * failure (state == pending) an epilog script might not of + * ran so we already finished the last instance of the job so + * this would be put on the requeued instance which is + * incorrect. + */ + if (association_based_accounting && job_ptr + && !IS_JOB_PENDING(job_ptr)) { struct step_record batch_step; memset(&batch_step, 0, sizeof(struct step_record)); batch_step.job_ptr = job_ptr; @@ -1924,9 +1931,9 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg) /* return result */ if (error_code) { - info("_slurm_rpc_complete_batch_script JobId=%u: %s ", - comp_msg->job_id, - slurm_strerror(error_code)); + debug2("_slurm_rpc_complete_batch_script JobId=%u: %s ", + comp_msg->job_id, + slurm_strerror(error_code)); slurm_send_rc_msg(msg, error_code); } else { debug2("_slurm_rpc_complete_batch_script JobId=%u %s", diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 2576d5557f2ff1fa36926a49d198e91f3e0f05ac..6d58834f0e3388cafbc444ccca8b45185121d967 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -1733,7 +1733,7 @@ extern int create_resv(resv_desc_msg_t *resv_desc_ptr) } } - /* Sort the list of jobs in descending order */ + /* Sort the list of node counts in order descending size */ if (resv_desc_ptr->node_cnt) { for (i = 0; resv_desc_ptr->node_cnt[i]; i++) { int max_inx = i; @@ -1869,6 +1869,13 @@ extern int create_resv(resv_desc_msg_t *resv_desc_ptr) total_node_cnt = bit_set_count(node_bitmap); } + if (resv_desc_ptr->core_cnt && !core_bitmap) { + info("Attempt to reserve cores not possible with current " + "configuration"); + rc = ESLURM_INVALID_CPU_COUNT; + goto bad_parse; + } + _generate_resv_id(); if (resv_desc_ptr->name) { resv_ptr = (slurmctld_resv_t *) list_find_first (resv_list, @@ -3140,13 +3147,14 @@ static int _select_nodes(resv_desc_msg_t *resv_desc_ptr, bit_and(node_bitmap, avail_node_bitmap); } - /* If *resv_bitmap exists we probably don't need to delete it, - when it gets created off of node_bitmap it will be the - same, but just to be safe we do. 
*/ + /* If *resv_bitmap exists we probably don't need to delete it, when it + * gets created off of node_bitmap it will be the same, but just to be + * safe we do. */ FREE_NULL_BITMAP(*resv_bitmap); - if (rc == SLURM_SUCCESS) + if (rc == SLURM_SUCCESS) { *resv_bitmap = _pick_idle_nodes(node_bitmap, resv_desc_ptr, core_bitmap); + } FREE_NULL_BITMAP(node_bitmap); if (*resv_bitmap == NULL) { if (rc == SLURM_SUCCESS) @@ -3154,8 +3162,7 @@ static int _select_nodes(resv_desc_msg_t *resv_desc_ptr, return rc; } - /* Same thing as the *resv_bitmap, might as well keep them in - sync */ + /* Same thing as the *resv_bitmap, might as well keep them in sync */ xfree(resv_desc_ptr->node_list); resv_desc_ptr->node_list = bitmap2node_name(*resv_bitmap); @@ -3859,15 +3866,16 @@ extern int job_test_resv(struct job_record *job_ptr, time_t *when, if ((resv_ptr->full_nodes) || (job_ptr->details->whole_node)) { #if _DEBUG - info("reservation uses full nodes or job will " - "not share nodes"); + info("reservation %s uses full nodes or job %u " + "will not share nodes", + resv_ptr->name, job_ptr->job_id); #endif bit_not(resv_ptr->node_bitmap); bit_and(*node_bitmap, resv_ptr->node_bitmap); bit_not(resv_ptr->node_bitmap); } else { #if _DEBUG - info("job_test_resv: %s reservation uses " + info("job_test_resv: reservation %s uses " "partial nodes", resv_ptr->name); #endif if (*exc_core_bitmap == NULL) { diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 3b85ac7690374f47019b2e5decade8f65b7ff800..f26cbc77055a4d82855e9039b73e0d482f47faf3 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -1063,7 +1063,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int allocate, uid_t submit_uid, struct job_record **job_pptr, char **err_msg); -/* Reset a job's end-time based upon it's end_time. +/* Reset a job's end_time based upon it's start_time and time_limit. 
* NOTE: Do not reset the end_time if already being preempted */ extern void job_end_time_reset(struct job_record *job_ptr); /* diff --git a/src/slurmctld/statistics.c b/src/slurmctld/statistics.c index 6089e087d7c244f0174b81f055d0786e63159df7..5ce071ba34997694d727dc453a9e73561ca86434 100644 --- a/src/slurmctld/statistics.c +++ b/src/slurmctld/statistics.c @@ -157,4 +157,6 @@ extern void reset_stats(int level) slurmctld_diag_stats.bf_last_depth = 0; slurmctld_diag_stats.bf_last_depth_try = 0; slurmctld_diag_stats.bf_active = 0; + + last_proc_req_start = time(NULL); } diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 463b599db78ed787dc1a169b6e24c751fb2c6d74..6e91257f5a5094a730e31f9cd3cff89096c8f2d4 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -241,9 +241,9 @@ extern void delete_step_records (struct job_record *job_ptr) struct step_record *step_ptr; xassert(job_ptr); - step_iterator = list_iterator_create (job_ptr->step_list); last_job_update = time(NULL); + step_iterator = list_iterator_create(job_ptr->step_list); while ((step_ptr = (struct step_record *) list_next (step_iterator))) { /* Only check if not a pending step */ if (step_ptr->step_id != INFINITE) { @@ -259,8 +259,9 @@ extern void delete_step_records (struct job_record *job_ptr) list_remove (step_iterator); _free_step_rec(step_ptr); } + list_iterator_destroy(step_iterator); + gres_plugin_job_clear(job_ptr->gres_list); - list_iterator_destroy (step_iterator); } /* _free_step_rec - delete a step record's data structures */ @@ -2604,7 +2605,10 @@ static void _pack_ctld_job_step_info(struct step_record *step_ptr, Buf buffer, } pack_time(run_time, buffer); - packstr(step_ptr->job_ptr->partition, buffer); + if (step_ptr->job_ptr->part_ptr) + packstr(step_ptr->job_ptr->part_ptr->name, buffer); + else + packstr(step_ptr->job_ptr->partition, buffer); packstr(step_ptr->resv_ports, buffer); packstr(node_list, buffer); packstr(step_ptr->name, buffer); diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index fc2534db70cc331123551fb60b0320e32ef1f01f..cea7618fb8eca99a52efbdfbea9812f4f01273b4 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -448,7 +448,9 @@ static int _send_slurmd_conf_lite (int fd, slurmd_conf_t *cf) { int len; Buf buffer = init_buf(0); + slurm_mutex_lock(&cf->config_mutex); pack_slurmd_conf_lite(cf, buffer); + slurm_mutex_unlock(&cf->config_mutex); len = get_buf_offset(buffer); safe_write(fd, &len, sizeof(int)); safe_write(fd, get_buf_data(buffer), len); diff --git a/src/slurmd/slurmstepd/io.c b/src/slurmd/slurmstepd/io.c index 2b02941960d265ba34a901090993d6976fbf313c..1d72413fdfec01b24dd9e8933d698f658a52d65b 100644 --- a/src/slurmd/slurmstepd/io.c +++ b/src/slurmd/slurmstepd/io.c @@ -843,7 +843,9 @@ static void *_window_manager(void *arg) break; } len = slurm_read_stream(win_info->pty_fd, buf, 4); - if ((len == -1) && ((errno == EINTR) || (errno == EAGAIN))) + if ((len == -1) && + ((errno == EINTR) || (errno == EAGAIN) || + (errno == SLURM_PROTOCOL_SOCKET_ZERO_BYTES_SENT))) continue; if (len < 4) { error("read window size error: %m"); diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 76abd8cf077892215acaecc5fb61482af1fd5736..2b6c903e9515b0c7939e6dfdbae32477bd7102ed 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -315,6 +315,11 @@ static uint32_t _get_exit_code(stepd_step_rec_t *job) } step_rc = MAX(step_complete.step_rc, job->task[i]->estatus); } + /* If we killed all the tasks by cmd give at 
least one return + code. */ + if (step_rc == NO_VAL && job->task[0]) + step_rc = job->task[0]->estatus; + return step_rc; } diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index eae97342555a96c2d0ecad5ffd79f9207e802f0f..b365ac3fc6ab82a4f59cd191eb60b8341ccdb816 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -670,8 +670,13 @@ _handle_signal_container(int fd, stepd_step_rec_t *job, uid_t uid) int target_node_id = 0; stepd_step_task_info_t *task; uint32_t i; + uint32_t flag; + uint32_t signal; + + safe_read(fd, &signal, sizeof(int)); + flag = signal >> 24; + sig = signal & 0xfff; - safe_read(fd, &sig, sizeof(int)); debug("_handle_signal_container for step=%u.%u uid=%d signal=%d", job->jobid, job->stepid, (int) uid, sig); if ((uid != job->uid) && !_slurm_authorized_user(uid)) { @@ -777,6 +782,31 @@ _handle_signal_container(int fd, stepd_step_rec_t *job, uid_t uid) goto done; } + if (flag & KILL_JOB_BATCH + && job->stepid == SLURM_BATCH_SCRIPT) { + /* We should only signal the batch script + * and nothing else, the job pgid is the + * equal to the pid of the batch script. + */ + if (kill(job->pgid, sig) < 0) { + error("%s: failed signal %d container pid" + "%u job %u.%u %m", + __func__, sig, job->pgid, + job->jobid, job->stepid); + rc = SLURM_ERROR; + errnum = errno; + pthread_mutex_unlock(&suspend_mutex); + goto done; + } + rc = SLURM_SUCCESS; + errnum = 0; + verbose("%s: sent signal %d to container pid %u job %u.%u", + __func__, sig, job->pgid, + job->jobid, job->stepid); + pthread_mutex_unlock(&suspend_mutex); + goto done; + } + /* * Signal the container */ diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c index f43d2d770295636b41b5d8b3118fc09081a80b63..2f7a5ba66e5b3dd9cd4d1cac6bf6d145a7571443 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.c +++ b/src/slurmd/slurmstepd/slurmstepd_job.c @@ -484,6 +484,7 @@ batch_stepd_step_rec_create(batch_job_launch_msg_t *msg) job->job_core_spec = msg->job_core_spec; job->batch = true; + job->node_name = xstrdup(conf->node_name); /* This needs to happen before acct_gather_profile_startpoll and only really looks at the profile in the job. */ @@ -495,7 +496,6 @@ batch_stepd_step_rec_create(batch_job_launch_msg_t *msg) job->multi_prog = 0; job->open_mode = msg->open_mode; job->overcommit = (bool) msg->overcommit; - job->node_name = xstrdup(conf->node_name); job->uid = (uid_t) msg->uid; job->user_name = xstrdup(msg->user_name); diff --git a/src/srun/libsrun/launch.c b/src/srun/libsrun/launch.c index 45005fc3c173589ffe76e65ada3e2c20d49fa26a..e94f3e56806ec592bbc8be8fa4542aafa397faba 100644 --- a/src/srun/libsrun/launch.c +++ b/src/srun/libsrun/launch.c @@ -243,6 +243,8 @@ extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus, case SLURM_DIST_CYCLIC_CFULL: case SLURM_DIST_BLOCK_CFULL: job->ctx_params.task_dist = opt.distribution; + if (opt.ntasks_per_node != NO_VAL) + job->ctx_params.plane_size = opt.ntasks_per_node; break; case SLURM_DIST_PLANE: job->ctx_params.task_dist = SLURM_DIST_PLANE; @@ -252,6 +254,8 @@ extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus, job->ctx_params.task_dist = (job->ctx_params.task_count <= job->ctx_params.min_nodes) ? 
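The _signal_batch_job()/_handle_signal_container() pair above passes an extra flag to the stepd by packing it into the upper byte of the 32-bit signal word, leaving the low bits for the signal number itself. A minimal encode/decode sketch of that layout; FLAG_BATCH_ONLY is a placeholder value, not the real KILL_JOB_BATCH constant:

#include <stdint.h>
#include <stdio.h>

#define FLAG_BATCH_ONLY 0x1    /* placeholder for Slurm's KILL_JOB_BATCH */

/* Pack an 8-bit flag into the top byte of the wire word; the low bits
 * carry the signal number. */
static uint32_t pack_signal(uint32_t flag, int sig)
{
    return (flag << 24) | (uint32_t) sig;
}

/* Reverse of pack_signal(): recover the flag and the signal number. */
static void unpack_signal(uint32_t word, uint32_t *flag, int *sig)
{
    *flag = word >> 24;
    *sig  = (int) (word & 0xfff);
}

int main(void)
{
    uint32_t word = pack_signal(FLAG_BATCH_ONLY, 9 /* SIGKILL */);
    uint32_t flag;
    int sig;

    unpack_signal(word, &flag, &sig);
    printf("flag=%u sig=%d\n", (unsigned) flag, sig);    /* flag=1 sig=9 */
    return 0;
}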
SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK; + if (opt.ntasks_per_node != NO_VAL) + job->ctx_params.plane_size = opt.ntasks_per_node; opt.distribution = job->ctx_params.task_dist; break; diff --git a/src/srun/libsrun/srun_job.c b/src/srun/libsrun/srun_job.c index ca33725d6b409e452410aef18650bfe6362b70a1..53b7fcc7d5bbd5f18aac8c69177eade6d5c98771 100644 --- a/src/srun/libsrun/srun_job.c +++ b/src/srun/libsrun/srun_job.c @@ -101,6 +101,7 @@ typedef struct allocation_info { static int shepard_fd = -1; static pthread_t signal_thread = (pthread_t) 0; +static int pty_sigarray[] = { SIGWINCH, 0 }; /* * Prototypes: @@ -414,11 +415,12 @@ extern void init_srun(int ac, char **av, bool handle_signals) { /* This must happen before we spawn any threads - * which are not designed to handle them */ + * which are not designed to handle arbitrary signals */ if (handle_signals) { if (xsignal_block(sig_array) < 0) error("Unable to block signals"); } + xsignal_block(pty_sigarray); /* Initialize plugin stack, read options from plugins, etc. */ @@ -676,6 +678,8 @@ cleanup: if (WIFEXITED(*global_rc)) *global_rc = WEXITSTATUS(*global_rc); + else if (WIFSIGNALED(*global_rc)) + *global_rc = 128 + WTERMSIG(*global_rc); mpir_cleanup(); log_fini(); @@ -867,9 +871,19 @@ _job_create_structure(allocation_info_t *ainfo) job->jobid = ainfo->jobid; job->ntasks = opt.ntasks; - for (i=0; i<ainfo->num_cpu_groups; i++) { - job->cpu_count += ainfo->cpus_per_node[i] * - ainfo->cpu_count_reps[i]; + + /* If cpus_per_task is set then get the exact count of cpus + for the requested step (we might very well use less, + especially if --exclusive is used). Else get the total for the + allocation given. + */ + if (opt.cpus_set) + job->cpu_count = opt.ntasks * opt.cpus_per_task; + else { + for (i=0; i<ainfo->num_cpu_groups; i++) { + job->cpu_count += ainfo->cpus_per_node[i] * + ainfo->cpu_count_reps[i]; + } } job->rc = -1; diff --git a/src/srun/libsrun/srun_job.h b/src/srun/libsrun/srun_job.h index a88982130ad8bf957bbe138ad0cb80335e94f43e..c3c6aea5a2541d889753f818a605f4ac60e6e21a 100644 --- a/src/srun/libsrun/srun_job.h +++ b/src/srun/libsrun/srun_job.h @@ -110,8 +110,8 @@ typedef struct srun_job { pthread_t pty_id; /* pthread to communicate window size changes */ int pty_fd; /* file to communicate window size changes */ uint16_t pty_port; /* used to communicate window size changes */ - uint8_t ws_col; /* window size, columns */ - uint8_t ws_row; /* window size, row count */ + uint16_t ws_col; /* window size, columns */ + uint16_t ws_row; /* window size, row count */ slurm_step_ctx_t *step_ctx; slurm_step_ctx_params_t ctx_params; } srun_job_t; diff --git a/src/srun/srun_pty.c b/src/srun/srun_pty.c index 7c3602d5956a66a44fdbfcd922f41786c7e794f5..ccc1aee9e46fcbadbdf677beb949072b0f3e8361 100644 --- a/src/srun/srun_pty.c +++ b/src/srun/srun_pty.c @@ -96,7 +96,7 @@ int set_winsize(srun_job_t *job) return 0; } -/* SIGWINCH should already be blocked by srun/signal.c */ +/* SIGWINCH should already be blocked by srun/libsrun/srun_job.c */ void block_sigwinch(void) { xsignal_block(pty_sigarray); @@ -178,5 +178,3 @@ static void *_pty_thread(void *arg) } return NULL; } - - diff --git a/testsuite/expect/globals b/testsuite/expect/globals index ec588fad1c23f19cb75a91683f5ef47465a9d29b..965ac0d1c0205150a26af09ab0a1ef4f009dc47f 100755 --- a/testsuite/expect/globals +++ b/testsuite/expect/globals @@ -2539,6 +2539,40 @@ proc get_node_cnt_in_part { partition } { return $node_cnt } +################################################################ +# +# Proc: 
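The srun cleanup above converts the raw wait() status into a shell-style exit code: WEXITSTATUS when the step exited normally, 128 plus the signal number when it was killed by a signal. A small sketch of that mapping, using a hand-built status value for illustration:

#include <stdio.h>
#include <sys/wait.h>

/* Convert a status from wait()/waitpid() into the exit code a shell
 * would report: WEXITSTATUS for a normal exit, 128 + signal number when
 * the process was terminated by a signal. */
static int status_to_exit_code(int status)
{
    if (WIFEXITED(status))
        return WEXITSTATUS(status);
    if (WIFSIGNALED(status))
        return 128 + WTERMSIG(status);
    return 1;    /* stopped/continued: treat as a generic failure */
}

int main(void)
{
    int fake_status = 9;    /* low bits hold signal 9, as wait() reports on
                             * typical Linux encodings for a SIGKILLed child */
    printf("%d\n", status_to_exit_code(fake_status));    /* prints 137 */
    return 0;
}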
get_idle_node_in_part +# +# Purpose: Get an idle node in a given partition +# +# Returns name of node in a partition or "" if unknown +# +################################################################ + +proc get_idle_node_in_part { partition } { + global sinfo alpha_numeric_under + + log_user 0 + set node_name "" + set scon_pid [spawn -noecho $sinfo -oNAME=%n -h -p$partition --state=idle] + expect { + -re "not found" { + send_user "\nFAILURE: partition $partition doesn't exist\n" + } + -re "NAME=($alpha_numeric_under)" { + set node_name $expect_out(1,string) + } + timeout { + send_user "\nFAILURE: sinfo not responding\n" + } + eof { + } + } + log_user 1 + + return $node_name +} + ################################################################ # diff --git a/testsuite/expect/test17.34 b/testsuite/expect/test17.34 index d2f52f66344ff9665cf3b2901766456e65224eed..20873f0c0fddc3522828efb072d24380995c2f09 100755 --- a/testsuite/expect/test17.34 +++ b/testsuite/expect/test17.34 @@ -152,6 +152,12 @@ proc core_spec_job {task node core_spec exp_nodes} { print_header $test_id +set select_type [test_select_type] +if {![string compare $select_type "linear"]} { + send_user "\nWARNING: This test is incompatible with select/$select_type\n" + exit 0 +} + # Remove any vestigial files exec $bin_rm -f $file_in $file_out $spec_in @@ -228,7 +234,7 @@ expect { wait } } -set $core_cnt [expr $core_cnt * $socket_cnt] +set core_cnt [expr $core_cnt * $socket_cnt] if {$core_cnt == 0} { send_user "\nFAILURE: sbatch did not find the number of cores\n" exit 1 @@ -241,6 +247,7 @@ if {$core_cnt < 4} { # # Using the core spec within the node limits # +send_user "\n\nRun within the specified node\n" core_spec_job 0 $first_node [expr $core_cnt - 2] 0 core_spec_job -2 $first_node [expr $core_cnt - 2] 0 @@ -248,12 +255,14 @@ core_spec_job -2 $first_node [expr $core_cnt - 2] 0 # Using core spec with more tasks then the node can handle. 
This should # cause the tasks to spread accross mutliple nodes as needed # +send_user "\n\nSpread job across multiple nodes\n" core_spec_job 1 $first_node [expr $core_cnt - 2] 1 core_spec_job 1 $first_node [expr $core_cnt - 1] 1 # # Using core spec with more cores then the specified node has # +send_user "\n\nFail by trying to use more cores than exist\n" core_spec_job 1 $first_node [expr $core_cnt + 5] -1 core_spec_job 1 $first_node [expr $core_cnt + 7] -1 diff --git a/testsuite/expect/test2.18 b/testsuite/expect/test2.18 index 48b63060ca60e997d559bc579a451aeded5100ff..76861c5239bcf14e661e8550869a386a6af8d013 100755 --- a/testsuite/expect/test2.18 +++ b/testsuite/expect/test2.18 @@ -34,7 +34,7 @@ source ./globals set test_id "2.18" set user_name "" set node_name "" -set host_name "" +set cluster_name "" set acct_good "test${test_id}_acct_good" set acct_bad "test${test_id}_acct_bad" set part_name "test${test_id}_part" @@ -73,18 +73,15 @@ proc set_part_val {part_type part_val} { } } -proc delete_part { } { - global scontrol sacctmgr part_name acct_good acct_bad exit_code +proc cleanup { } { + global scancel scontrol sacctmgr part_name acct_good acct_bad exit_code set del_part 0 - spawn $sacctmgr -i delete account $acct_good $acct_bad + + spawn $scancel -p $part_name expect { - -re "Deleting accounts" { - set del_part 1 - exp_continue - } timeout { - send_user "\nFAILURE: sacctmgr is not responding\n" + send_user "FAILURE: scancel is not responding\n" set exit_code 1 } eof { @@ -92,6 +89,8 @@ proc delete_part { } { } } + send_user "Any error, except for unresponsiveness, from the previous scancel is expected and should be ignored.\n" + spawn $scontrol delete partition=$part_name expect { -re "error" { @@ -109,16 +108,30 @@ proc delete_part { } { } } - return $del_part + spawn $sacctmgr -i delete account $acct_good $acct_bad + expect { + -re "Deleting accounts" { + set del_part 1 + exp_continue + } + timeout { + send_user "\nFAILURE: sacctmgr is not responding\n" + set exit_code 1 + } + eof { + wait + } + } + return $del_part } proc create_acct { acct } { - global sacctmgr exit_code user_name + global sacctmgr exit_code user_name cluster_name set create_acct 0 - spawn $sacctmgr -i create account $acct + spawn $sacctmgr -i create account $acct cluster=$cluster_name expect { -re "Adding Account" { set create_acct 1 @@ -133,7 +146,7 @@ proc create_acct { acct } { } } - spawn $sacctmgr -i create user $user_name account=$acct + spawn $sacctmgr -i create user $user_name account=$acct cluster=$cluster_name expect { timeout { send_user "\nFAILURE: sacctmgr is not responding\n" @@ -154,19 +167,14 @@ proc create_acct { acct } { proc test_part { acct part acct_con } { - global srun host_name exit_code + global srun exit_code set sub_job 0 - spawn $srun -I -A $acct -p $part hostname + spawn $srun -I -A $acct -p $part true expect { - -re "$host_name" { - set sub_job 1 - exp_continue - } -re "error" { - set sub_job 2 - if { $acct_con == 1 && $sub_job == 2} { + if { $acct_con == 1 } { send_user "\nThis error is expected\n" } else { send_user "\nFAILURE: This error should not have occured\n" @@ -187,7 +195,7 @@ proc test_part { acct part acct_con } { } # Remove any vestigial accounts or partitions -delete_part +cleanup spawn $bin_id -un expect { @@ -204,56 +212,24 @@ expect { } } -spawn hostname -expect { +set node_name [ get_idle_node_in_part $partition ] +set cluster_name [ get_cluster_name ] - -re "($alpha_numeric_under)" { - set host_name $expect_out(1,string) - exp_continue - } - timeout { - send_user 
"\nFAILURE: hostname is not responding\n" - set exit_code 1 - } - eof { - wait - } -} +# NOTE: acct_good should always work and +# acct_bad should always cause an error +# +# Create good account +# +create_acct $acct_good -spawn $scontrol show node -expect { - -re "NodeName=($alpha_numeric_under)" { - set node_name $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol is not responding\n" - set exit_code 1 - } - eof { - wait - } -} +# +# Create bad account +# +create_acct $acct_bad # Create partition -spawn $scontrol create partition=$part_name -expect { - -re "error" { - send_user "\nFAILURE: partition was not created\n" - set exit_code 1 - } - timeout { - send_user "\nFAILURE: scontrol is not reponding\n" - set exit_code 1 - } - eof { - wait - } -} - -# Add nodes to partition -spawn $scontrol update partition=$part_name nodes=$node_name +spawn $scontrol create partition=$part_name nodes=$node_name expect { -re "error" { send_user "\nFAILURE: partition was not created\n" @@ -268,19 +244,6 @@ expect { } } -# NOTE: acct_good should always work and -# acct_bad should always cause an error - -# -# Create good account -# -create_acct $acct_good - -# -# Create bad account -# -create_acct $acct_bad - # # Set Allow Account to good values # @@ -325,9 +288,9 @@ test_part $acct_good $part_name 0 test_part $acct_bad $part_name 1 -sleep 2 +sleep 5 # Delete partition and accounts -if {[delete_part] != 1} { +if {[cleanup] != 1} { send_user "\nFAILURE: Account was not deleted\n" set exit_code 1 } diff --git a/testsuite/expect/test2.19 b/testsuite/expect/test2.19 index 56516a6430727dfd3890c13a14ba0bdff11e86e8..174117d82e0392728014ef82cc6c6990bf1031da 100755 --- a/testsuite/expect/test2.19 +++ b/testsuite/expect/test2.19 @@ -35,6 +35,7 @@ set test_id "2.19" set user_name "" set node_name "" set host_name "" +set cluster_name "" set acct1 "test${test_id}_acct_1" set acct2 "test${test_id}_acct_2" set qos_good "test${test_id}_qos_good" @@ -75,8 +76,21 @@ proc set_part_val {part_type part_val} { } } -proc delete_part { } { - global scontrol sacctmgr part_name qos_good qos_bad acct1 acct2 exit_code +proc cleanup { } { + global scancel scontrol sacctmgr part_name qos_good qos_bad acct1 acct2 exit_code + + spawn $scancel -p $part_name + expect { + timeout { + send_user "FAILURE: scancel is not responding\n" + set exit_code 1 + } + eof { + wait + } + } + + send_user "Any error, except for unresponsiveness, from the previous scancel is expected and should be ignored.\n" spawn $scontrol delete partition=$part_name expect { @@ -131,7 +145,7 @@ proc delete_part { } { } proc create_qos { acct qos } { - global sacctmgr user_name exit_code + global sacctmgr user_name exit_code cluster_name set create_qos 0 spawn $sacctmgr -i create qos $qos @@ -150,7 +164,7 @@ proc create_qos { acct qos } { } - spawn $sacctmgr -i create account $acct qos=$qos + spawn $sacctmgr -i create account $acct qos=$qos cluster=$cluster_name expect { -re "Adding Account" { incr create_qos @@ -166,7 +180,7 @@ proc create_qos { acct qos } { } set create_acct 0 - spawn $sacctmgr -i create user $user_name account=$acct + spawn $sacctmgr -i create user $user_name account=$acct cluster=$cluster_name expect { timeout { send_user "\nFAILURE: sacctmgr is not responding\n" @@ -185,18 +199,13 @@ proc create_qos { acct qos } { proc test_part {acct qos part qos_con } { - global srun host_name part_name exit_code + global srun part_name exit_code set sub_job 0 - spawn $srun -I -A $acct --qos $qos -p $part hostname + spawn $srun -I 
-A $acct --qos $qos -p $part true expect { - -re "$host_name" { - set sub_job 1 - exp_continue - } -re "error" { - set sub_job 2 - if { $qos_con == 1 && $sub_job == 2} { + if { $qos_con == 1 } { send_user "\nThis error is expected\n" } else { send_user "\nFAILURE: This error should not have occured\n" @@ -215,7 +224,7 @@ proc test_part {acct qos part qos_con } { } # Delete any vestigial qos or accounts -delete_part +cleanup spawn $bin_id -un expect { @@ -232,56 +241,25 @@ expect { } } -spawn hostname -expect { +set node_name [ get_idle_node_in_part $partition ] +set cluster_name [ get_cluster_name ] - -re "($alpha_numeric_under)" { - set host_name $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: hostname is not responding\n" - set exit_code 1 - } - eof { - wait - } -} +# NOTE: qos_good should always work and +# qos_bad should always cause an error +# +# Create good QOS +# +create_qos $acct1 $qos_good -spawn $scontrol show node -expect { - -re "NodeName=($alpha_numeric_under)" { - set node_name $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol is not responding\n" - set exit_code 1 - } - eof { - wait - } -} +# +# Create bad QOS +# +create_qos $acct2 $qos_bad -# Create partition -spawn $scontrol create partition=$part_name -expect { - -re "error" { - send_user "\nFAILURE: partition was not created\n" - set exit_code 1 - } - timeout { - send_user "\nFAILURE: scontrol is not reponding\n" - set exit_code 1 - } - eof { - wait - } -} -# Add nodes to partition -spawn $scontrol update partition=$part_name nodes=$node_name +# Create partition +spawn $scontrol create partition=$part_name nodes=$node_name expect { -re "error" { send_user "\nFAILURE: partition was not created\n" @@ -296,19 +274,6 @@ expect { } } -# NOTE: qos_good should always work and -# qos_bad should always cause an error - -# -# Create good QOS -# -create_qos $acct1 $qos_good - -# -# Create bad QOS -# -create_qos $acct2 $qos_bad - # # Set Allow Qos to good value # @@ -351,7 +316,8 @@ test_part $acct1 $qos_good $part_name 0 # test_part $acct2 $qos_bad $part_name 1 -if {[delete_part] != 2} { +sleep 5 +if {[cleanup] != 2} { send_user "\nFAILURE: Qos/account was not deleted\n" set exit_code 1 } diff --git a/testsuite/expect/test2.21 b/testsuite/expect/test2.21 index 8b9f8b1efe9d2bb6c9e4f05177cf568a1c38453a..e5f8042859d847f851feff60520c247d5efc28dc 100755 --- a/testsuite/expect/test2.21 +++ b/testsuite/expect/test2.21 @@ -39,6 +39,12 @@ set exit_code 0 print_header $test_id +set min_age [get_min_job_age] +if {$min_age < 10} { + send_user "\nWARNING: MinJobAge too low for this test ($min_age < 10)\n" + exit 0 +} + # Remove any vestigial scripts exec $bin_rm -f $complete_script $fail_script diff --git a/testsuite/expect/test2.22 b/testsuite/expect/test2.22 index cd77d089715117d88aada0edd402aab11edfd4bf..46e19d147196355a491faf8377e55e7b77ba5904 100755 --- a/testsuite/expect/test2.22 +++ b/testsuite/expect/test2.22 @@ -38,6 +38,12 @@ set exit_code 0 print_header $test_id +set min_age [get_min_job_age] +if {$min_age < 10} { + send_user "\nWARNING: MinJobAge too low for this test ($min_age < 10)\n" + exit 0 +} + # Remove any vestigial scripts exec $bin_rm -f $script @@ -55,7 +61,7 @@ proc check_hold { job } { exp_continue } timeout { - send_user "\nFAILURE scontrol is not responding\n" + send_user "\nFAILURE: scontrol is not responding\n" set exit_code 1 } eof { @@ -64,7 +70,7 @@ proc check_hold { job } { } if { $hold != 1 } { - send_user "\nFAILURE scontrol did not hold job 
after it was requeued\n" + send_user "\nFAILURE: scontrol did not hold job after it was requeued\n" set exit_code 1 } } @@ -128,8 +134,8 @@ expect { exp_continue } timeout { - send_user "\nFAILURE sbatch is not responding\n" - set exit_code 1 + send_user "\nFAILURE: sbatch is not responding\n" + set exit_code 1 } eof { wait @@ -146,7 +152,7 @@ wait_for_job $job_id DONE spawn $scontrol requeuehold $job_id expect { timeout { - send_user "\nFAILURE scontrol is not responding\n" + send_user "\nFAILURE: scontrol is not responding\n" set exit_code 1 } eof { diff --git a/testsuite/expect/test2.23 b/testsuite/expect/test2.23 index 1e79857c3e65d3b19079065dff919c0b2464440d..877d766c100df21628970f277e8cb5c749707041 100755 --- a/testsuite/expect/test2.23 +++ b/testsuite/expect/test2.23 @@ -37,6 +37,12 @@ set exit_code 0 print_header $test_id +set min_age [get_min_job_age] +if {$min_age < 10} { + send_user "\nWARNING: MinJobAge too low for this test ($min_age < 10)\n" + exit 0 +} + # Remove any vestigial files exec $bin_rm -f $script diff --git a/testsuite/expect/test21.30 b/testsuite/expect/test21.30 index 733a855567c2daf65ce7115dd594dfe50f80dbc5..333848d80223166d107147a920725fb457d530b1 100755 --- a/testsuite/expect/test21.30 +++ b/testsuite/expect/test21.30 @@ -52,7 +52,7 @@ set exit_code 0 set acct test_acct set user_name "" set qosname name -set qostest [format "%s %s" $test_id "qosTest"] +set qostest [format "%s_%s" $test_id "qosTest"] set grn GrpNodes set grn_num 2 set grcpu GrpCpus @@ -207,6 +207,15 @@ if { [test_limits_enforced] == 0 } { exit 0 } +# +# Some tests will not work properly when allocating whole nodes to jobs +# +set select_type [test_select_type] +if {![string compare $select_type "linear"]} { + send_user "\nWARNING: This test is incompatible with select/$select_type\n" + exit 0 +} + # Remove any vesitgial accounts or qos spawn $sacctmgr -i delete qos $qostest expect { diff --git a/testsuite/expect/test3.11 b/testsuite/expect/test3.11 index 57b916dd25b5134eb3ef6a1eb37cbd7f174f5c71..f82e145f2b4aecb57aac41fce85479c12aef57c8 100755 --- a/testsuite/expect/test3.11 +++ b/testsuite/expect/test3.11 @@ -359,22 +359,15 @@ expect { set cons_res_actived 0 if {$def_share_force == 0} { - spawn $scontrol show config - expect { - -re "select/cons_res" { - set cons_res_actived 1 - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait - } + set select_type [test_select_type] + if {![string compare $select_type "cons_res"]} { + set cons_res_actived 1 } } -inc3_11_1 +if {$cons_res_actived == 1} { + inc3_11_1 +} inc3_11_2 inc3_11_3 inc3_11_4 diff --git a/testsuite/expect/test4.5 b/testsuite/expect/test4.5 index a663a54a831764081e525577fa6f1d0af9f9e356..33260b78e7024083cde8db1821e1ed7e0549b0e1 100755 --- a/testsuite/expect/test4.5 +++ b/testsuite/expect/test4.5 @@ -42,12 +42,14 @@ set node_name "" set mismatches 0 print_header $test_id - +if (![string compare $partition ""]) { + set partition [default_partition] +} # # Check the sinfo long format looking for filtering options # -spawn $sinfo --Node --long --exact +spawn $sinfo --Node --long --exact -p$partition expect { -re "($end_of_line)($name_string) *($number_with_suffix) *($name_string) *($alpha)" { if (![string compare $node_name ""]) { @@ -75,7 +77,7 @@ expect { # Use sinfo state filter # -spawn $sinfo --Node --long --exact --state=$node_state +spawn $sinfo --Node --long --exact --state=$node_state -p$partition expect { -re "($end_of_line)($name_string) *($number_with_suffix) 
*($name_string) *($alpha)" { if ([string compare $expect_out(5,string) $node_state]) { @@ -104,7 +106,7 @@ expect { # Use sinfo node name filter # -spawn $sinfo --Node --long --exact --nodes=$node_name +spawn $sinfo --Node --long --exact --nodes=$node_name -p$partition expect { -re "($end_of_line)($name_string) *($number_with_suffix) *($name_string) *($alpha)" { if ([string compare $expect_out(2,string) $node_name]) { diff --git a/testsuite/slurm_unit/common/bitstring-test.c b/testsuite/slurm_unit/common/bitstring-test.c index 8cf8e32f97df1ccbc4e677b3d72a8fa8faf62ac9..3b7b921dddd5e160fadf23efaa7c8582a72bfe79 100644 --- a/testsuite/slurm_unit/common/bitstring-test.c +++ b/testsuite/slurm_unit/common/bitstring-test.c @@ -5,6 +5,12 @@ #include <sys/time.h> #include <testsuite/dejagnu.h> +/* Copied from src/common/bitstring.c */ +#define _bitstr_words(nbits) \ + ((((nbits) + BITSTR_MAXPOS) >> BITSTR_SHIFT) + BITSTR_OVERHEAD) +#define bit_decl(name, nbits) \ + (name)[_bitstr_words(nbits)] = { BITSTR_MAGIC_STACK, (nbits) } + /* Test for failure: */ #define TEST(_tst, _msg) do { \ diff --git a/testsuite/slurm_unit/common/pack-test.c b/testsuite/slurm_unit/common/pack-test.c index 82ac5172d1e666110cb146295d83bc500c19d846..9b6ce7c4151379cc82e2b82453d6ead8acf4004b 100644 --- a/testsuite/slurm_unit/common/pack-test.c +++ b/testsuite/slurm_unit/common/pack-test.c @@ -69,7 +69,7 @@ int main (int argc, char *argv[]) unpack64(&test64, buffer); test_double2 = (long double)test64; - TEST(test64 != (uint64_t)test_double, "un/pack double as a uint64"); + TEST((uint64_t)test_double2 != (uint64_t)test_double, "un/pack double as a uint64"); /* info("Original\t %Lf", test_double); */ /* info("uint64\t %ld", test64); */ /* info("converted LD\t %Lf", test_double2); */
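
Editor's note on the _handle_signal_container() hunk in src/slurmd/slurmstepd/req.c above: the slurmstepd now reads a single 32-bit word whose upper byte carries a flag and whose lower bits carry the signal number, so a request such as sbatch --signal=B:signum can be delivered to the batch script only. The following is a minimal, self-contained sketch of that encoding; KILL_JOB_BATCH_EXAMPLE and pack_signal() are illustrative names chosen here, not Slurm APIs (the real KILL_JOB_BATCH flag is defined in the Slurm headers).

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative stand-in for the real KILL_JOB_BATCH flag bit. */
    #define KILL_JOB_BATCH_EXAMPLE 0x1

    /* Pack a flag byte and a signal number into one word, mirroring
     * "flag = signal >> 24; sig = signal & 0xfff;" in the hunk above. */
    static uint32_t pack_signal(uint32_t flag, uint32_t sig)
    {
    	return (flag << 24) | (sig & 0xfff);
    }

    int main(void)
    {
    	uint32_t word = pack_signal(KILL_JOB_BATCH_EXAMPLE, 15 /* SIGTERM */);
    	uint32_t flag = word >> 24;
    	uint32_t sig  = word & 0xfff;

    	/* Prints "flag=1 sig=15": the flag and signal survive the round trip. */
    	printf("flag=%u sig=%u\n", (unsigned) flag, (unsigned) sig);
    	return 0;
    }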
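Editor's note on the src/srun/libsrun/srun_job.c exit-code hunk above: srun now follows the POSIX shell convention of reporting 128 plus the signal number when the launched process is terminated by a signal, instead of returning the raw wait status. A minimal sketch of that mapping, independent of srun itself:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    /* Convert a wait(2) status to a shell-style exit code: the exit status
     * on normal termination, 128 + signal number if killed by a signal. */
    static int wait_status_to_exit_code(int status)
    {
    	if (WIFEXITED(status))
    		return WEXITSTATUS(status);
    	if (WIFSIGNALED(status))
    		return 128 + WTERMSIG(status);
    	return 1;	/* fallback for other states */
    }

    int main(void)
    {
    	pid_t pid = fork();
    	if (pid == 0) {
    		raise(SIGTERM);	/* child terminates itself with SIGTERM */
    		_exit(0);
    	}
    	int status = 0;
    	waitpid(pid, &status, 0);
    	/* Prints 143 (128 + 15), the value srun would now report. */
    	printf("exit code = %d\n", wait_status_to_exit_code(status));
    	return 0;
    }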
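Editor's note on the _job_create_structure() hunk above: when --cpus-per-task is given, the step's CPU count is taken as ntasks * cpus_per_task; otherwise it is the whole allocation's total, which Slurm passes as run-length-encoded pairs (cpus_per_node[i] repeated cpu_count_reps[i] times). A minimal sketch of that accounting with made-up sample values (all variable names here are local to the example):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	/* Example allocation: 3 nodes with 16 CPUs plus 1 node with 8 CPUs,
    	 * in run-length-encoded form. */
    	uint16_t cpus_per_node[]  = { 16, 8 };
    	uint32_t cpu_count_reps[] = { 3, 1 };
    	int num_cpu_groups = 2;

    	int cpus_set = 1;	/* pretend --cpus-per-task was supplied */
    	int ntasks = 4, cpus_per_task = 2;

    	int cpu_count = 0;
    	if (cpus_set) {
    		/* Exact count for the requested step. */
    		cpu_count = ntasks * cpus_per_task;
    	} else {
    		/* Whole-allocation total: 3*16 + 1*8 = 56. */
    		for (int i = 0; i < num_cpu_groups; i++)
    			cpu_count += cpus_per_node[i] * cpu_count_reps[i];
    	}
    	printf("cpu_count = %d\n", cpu_count);	/* 8 with the values above */
    	return 0;
    }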
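Editor's note on the srun_job.h hunk above, which widens ws_col/ws_row from uint8_t to uint16_t: the kernel's struct winsize stores rows and columns as unsigned short, so terminals wider or taller than 255 are legal and an 8-bit field silently truncates them. A minimal sketch that reads the current geometry (standard TIOCGWINSZ ioctl; needs a controlling terminal to succeed):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int main(void)
    {
    	struct winsize ws;

    	/* ws_row/ws_col are unsigned short, hence the move to uint16_t. */
    	if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) == 0)
    		printf("rows=%u cols=%u\n",
    		       (unsigned) ws.ws_row, (unsigned) ws.ws_col);
    	else
    		perror("TIOCGWINSZ");
    	return 0;
    }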