From 6e72a8d97fd9d410e924bf0a7ac4a98b6d083bf3 Mon Sep 17 00:00:00 2001 From: Mehdi Dogguy <mehdi@debian.org> Date: Mon, 8 Sep 2014 21:40:34 +0200 Subject: [PATCH] Imported Upstream version 2.3.3 --- META | 4 +- NEWS | 131 +++++++++++------ contribs/pam/pam_slurm.c | 33 ++++- doc/html/configurator.html.in | 3 +- doc/html/cons_res.shtml | 6 +- doc/html/faq.shtml | 48 +++---- doc/html/footer.txt | 2 +- doc/html/team.shtml | 3 + doc/man/man1/salloc.1 | 2 +- doc/man/man1/sbatch.1 | 13 +- doc/man/man1/srun.1 | 6 +- slurm.spec | 6 +- src/common/env.c | 132 ++++++++++++++++-- src/common/env.h | 11 ++ src/common/gres.c | 2 - src/common/jobacct_common.c | 2 +- src/common/print_fields.c | 8 +- src/common/print_fields.h | 4 +- src/common/read_config.c | 1 + src/common/slurm_protocol_api.c | 4 +- src/common/slurm_protocol_api.h | 1 + src/common/slurm_protocol_defs.c | 9 +- src/common/slurm_protocol_defs.h | 1 + src/common/slurmdb_pack.c | 7 + .../accounting_storage/mysql/as_mysql_job.c | 16 ++- .../accounting_storage/mysql/as_mysql_qos.c | 10 +- .../multifactor/priority_multifactor.c | 10 ++ src/plugins/select/bluegene/bg_job_place.c | 2 +- src/plugins/select/cons_res/job_test.c | 6 +- src/plugins/select/linear/select_linear.c | 2 +- src/plugins/task/cgroup/task_cgroup_devices.c | 8 ++ src/sacct/print.c | 25 ++-- src/sacct/sacct.c | 2 +- src/sacctmgr/event_functions.c | 3 +- src/sacctmgr/qos_functions.c | 2 +- src/sbatch/opt.c | 10 +- src/sbatch/opt.h | 1 + src/sbatch/sbatch.c | 12 ++ src/slurmctld/job_mgr.c | 5 +- src/slurmctld/job_scheduler.c | 15 +- src/slurmctld/node_scheduler.c | 1 + src/slurmctld/preempt.c | 9 +- src/slurmctld/proc_req.c | 1 + src/slurmctld/reservation.c | 2 +- src/slurmd/slurmd/req.c | 52 +++++-- src/slurmd/slurmstepd/mgr.c | 30 +++- src/slurmd/slurmstepd/task.c | 3 - src/slurmdbd/read_config.c | 2 + src/sreport/resv_reports.c | 2 +- src/srun/multi_prog.c | 28 ++-- src/sview/grid.c | 20 +-- testsuite/expect/Makefile.am | 2 + testsuite/expect/Makefile.in 
| 2 + testsuite/expect/README | 1 + testsuite/expect/test1.62 | 5 + testsuite/expect/test12.2 | 2 +- testsuite/expect/test7.15 | 85 +++++++++++ testsuite/expect/test7.15.prog.c | 30 ++++ 58 files changed, 649 insertions(+), 196 deletions(-) create mode 100755 testsuite/expect/test7.15 create mode 100644 testsuite/expect/test7.15.prog.c diff --git a/META b/META index 2ec6e12d1..1dece6400 100644 --- a/META +++ b/META @@ -3,9 +3,9 @@ Api_revision: 0 Major: 2 Meta: 1 - Micro: 2 + Micro: 3 Minor: 3 Name: slurm Release: 1 Release_tags: dist - Version: 2.3.2 + Version: 2.3.3 diff --git a/NEWS b/NEWS index a8acda33b..af970a369 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,47 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 2.3.3 +======================== + -- Fix task/cgroup plugin error when used with GRES. Patch by Alexander + Bersenev (Institute of Mathematics and Mechanics, Russia). + -- Permit pending job exceeding a partition limit to run if its QOS flag is + modified to permit the partition limit to be exceeded. Patch from Bill + Brophy, Bull. + -- BLUEGENE - Fixed preemption issue. + -- sacct search for jobs using filtering was ignoring wckey filter. + -- Fixed issue with QOS preemption when adding new QOS. + -- Fixed issue with comment field being used in a job finishing before it + starts in accounting. + -- Add slashes in front of derived exit code when modifying a job. + -- Handle numeric suffix of "T" for terabyte units. Patch from John Thiltges, + University of Nebraska-Lincoln. + -- Prevent resetting a held job's priority when updating other job parameters. + Patch from Alejandro Lucero Palau, BSC. + -- Improve logic to import a user's environment. Needed with --get-user-env + option used with Moab. Patch from Mark Grondona, LLNL. + -- Fix bug in sview layout if node count less than configured grid_x_width. 
+ -- Modify PAM module to prefer to use SLURM library with same major release + number that it was built with. + -- Permit gres count configuration of zero. + -- Fix race condition where sbcast command can result in deadlock of slurmd + daemon. Patch by Don Albert, Bull. + -- Fix bug in srun --multi-prog configuration file to avoid printing duplicate + record error when "*" is used at the end of the file for the task ID. + -- Let operators see reservation data even if "PrivateData=reservations" flag + is set in slurm.conf. Patch from Don Albert, Bull. + -- Added new sbatch option "--export-file" as needed for latest version of + Moab. Patch from Phil Eckert, LLNL. + -- Fix for sacct printing CPUTime(RAW) where the value is greater than a 32 bit + number. + -- Fix bug in --switch option with topology resulting in bad switch count use. + Patch from Alejandro Lucero Palau (Barcelona Supercomputer Center). + -- Fix PrivateFlags bug when using Priority Multifactor plugin. If using sprio + all jobs would be returned even if the flag was set. + Patch from Bill Brophy, Bull. + -- Fix for possible invalid memory reference in slurmctld in job dependency + logic. Patch from Carles Fenoy (Barcelona Supercomputer Center). + * Changes in SLURM 2.3.2 ======================== -- Add configure option of "--without-rpath" which builds SLURM tools without @@ -564,7 +605,7 @@ documents those changes that are of interest to users and admins. different connection types (i.e. one torus, one mesh). -- Fix memory leak if MPI ports are reserved (for OpenMPI) and srun's --resv-ports option is used. - -- Fix some anomalies in select/cons_res task layout when using the + -- Fix some anomalies in select/cons_res task layout when using the --cpus-per-task option. Patch from Martin Perry, Bull. -- Improve backfill scheduling logic when job specifies --ntasks-per-node and --mem-per-cpu options on a heterogeneous cluster. Patch from Bjorn-Helge
Patch from Bjorn-Helge @@ -1024,7 +1065,7 @@ documents those changes that are of interest to users and admins. -- Fix for slurmd restart on completing node with no tasks to get the correct state, completing. Patch from Hongjia Cao (NUDT). -- Prevent scontrol setting a node's Reason="". Patch from Hongjia Cao (NUDT). - -- Add new functions hostlist_ranged_string_malloc, + -- Add new functions hostlist_ranged_string_malloc, hostlist_ranged_string_xmalloc, hostlist_deranged_string_malloc, and hostlist_deranged_string_xmalloc which will allocate memory as needed. -- Make the slurm commands support both the --cluster and --clusters option. @@ -1462,12 +1503,12 @@ documents those changes that are of interest to users and admins. -- In select/cons_res, for jobs that can run on a single node, use a best-fit packing approach. -- Add support for new partition states of DRAIN and INACTIVE and new partition - option of "Alternate" (alternate partition to use for jobs submitted to + option of "Alternate" (alternate partition to use for jobs submitted to partitions that are currently in a state of DRAIN or INACTIVE). -- Add group membership cache. This can substantially speed up slurmctld startup or reconfiguration if many partitions have AllowGroups configured. -- Added slurmdb api for accessing slurm DB information. - -- In select/linear: Modify data structures for better performance and to + -- In select/linear: Modify data structures for better performance and to avoid underflow error messages when slurmctld restarts while jobs are in completing state. -- Added hash for slurm.conf so when nodes check in to the controller it can @@ -1483,28 +1524,28 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 2.2.0.pre2 ============================= - -- Add support for spank_get_item() to get S_STEP_ALLOC_CORES and - S_STEP_ALLOC_MEM. Support will remain for S_JOB_ALLOC_CORES and - S_JOB_ALLOC_MEM. 
- -- Kill individual job steps that exceed their memory limit rather than + -- Add support for spank_get_item() to get S_STEP_ALLOC_CORES and + S_STEP_ALLOC_MEM. Support will remain for S_JOB_ALLOC_CORES and + S_JOB_ALLOC_MEM. + -- Kill individual job steps that exceed their memory limit rather than killing an entire job if one step exceeds its memory limit. - -- Added configuration parameter VSizeFactor to enforce virtual memory limits + -- Added configuration parameter VSizeFactor to enforce virtual memory limits for jobs and job steps as a percentage of their real memory allocation. -- Add scontrol ability to update job step's time limits. -- Add scontrol ability to update job's NumCPUs count. - -- Add --time-min options to salloc, sbatch and srun. The scontrol command + -- Add --time-min options to salloc, sbatch and srun. The scontrol command has been modified to display and modify the new field. sched/backfill - plugin has been changed to alter time limits of jobs with the + plugin has been changed to alter time limits of jobs with the --time-min option if doing so permits earlier job initiation. -- Add support for TotalView symbol MPIR_partial_attach_ok with srun support to release processes which TotalView does not attach to. - -- Add new option for SelectTypeParameters of CR_ONE_TASK_PER_CORE. This - option will allocate one task per core by default. Without this option, - by default one task will be allocated per thread on nodes with more than + -- Add new option for SelectTypeParameters of CR_ONE_TASK_PER_CORE. This + option will allocate one task per core by default. Without this option, + by default one task will be allocated per thread on nodes with more than one ThreadsPerCore configured. -- Avoid accounting separately for a current pid corresponds to a Light Weight Process (Thread POSIX) appearing in the /proc directory. 
Only account for - the original process (pid==tgid) to avoid accounting for memory use more + the original process (pid==tgid) to avoid accounting for memory use more than once. -- Add proctrack/cgroup plugin which uses Linux control groups (aka cgroup) to track processes on Linux systems having this feature enabled (kernel @@ -1533,16 +1574,16 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 2.2.0.pre1 ============================= -- Added RunTime field to scontrol show job report - -- Added SLURM_VERSION_NUMBER and removed SLURM_API_VERSION from + -- Added SLURM_VERSION_NUMBER and removed SLURM_API_VERSION from slurm/slurm.h. -- Added support to handle communication with SLURM 2.1 clusters. Job's - should not be lost in the future when upgrading to higher versions of + should not be lost in the future when upgrading to higher versions of SLURM. -- Added withdeleted options for listing clusters, users, and accounts -- Remove PLPA task affinity functions due to that package being deprecated. - -- Preserve current partition state information and node Feature and Weight - information rather than use contents of slurm.conf file after slurmctld - restart with -R option or SIGHUP. Replace information with contents of + -- Preserve current partition state information and node Feature and Weight + information rather than use contents of slurm.conf file after slurmctld + restart with -R option or SIGHUP. Replace information with contents of slurm.conf after slurmctld restart without -R or "scontrol reconfigure". See RELEASE_NOTES file fore more details. -- Modify SLURM's PMI library (for MPICH2) to properly execute an executable @@ -1551,7 +1592,7 @@ documents those changes that are of interest to users and admins. -- Moved all SQL dependant plugins into a seperate rpm slurm-sql. This should be needed only where a connection to a database is needed (i.e. 
where the slurmdbd is running) - -- Add command line option "no_sys_info" to PAM module to supress system + -- Add command line option "no_sys_info" to PAM module to supress system logging of "access granted for user ...", access denied and other errors will still be logged. -- sinfo -R now has the user and timestamp in separate fields from the reason. @@ -1615,7 +1656,7 @@ documents those changes that are of interest to users and admins. -- Fixed typo in job_mgr.c dealing with qos instead of associations. -- Make sure associations and qos' are initiated when added. -- Fixed wrong initialization for wckeys in the association manager. - -- Added wiki.conf configuration parameter of HidePartitionNodes. See + -- Added wiki.conf configuration parameter of HidePartitionNodes. See "man wiki.conf" for more information. -- Add "JobAggregationTime=#" field SchedulerParameter configuration parameter output. @@ -1905,16 +1946,16 @@ documents those changes that are of interest to users and admins. -- Permit a batch script to reset umask and have that propagate to tasks spawed by subsequent srun. Previously the umask in effect when sbatch was executed was propagated to tasks spawed by srun. - -- Modify slurm_job_cpus_allocated_on_node_id() and - slurm_job_cpus_allocated_on_node() functions to not write explanation of + -- Modify slurm_job_cpus_allocated_on_node_id() and + slurm_job_cpus_allocated_on_node() functions to not write explanation of failures to stderr. Only return -1 and set errno. -- Correction in configurator.html script. Prolog and Epilog were reversed. -- BLUEGENE - Fixed race condition where if a nodecard has an error on an un-booted block when a job comes to use it before the state checking thread notices it which could cause the slurmctld to lock up on a non-dynamic system. 
- -- In select/cons_res with FastSchedule=0 and Procs=# defined for the node, - but no specific socket/core/thread count configured, avoid fatal error if + -- In select/cons_res with FastSchedule=0 and Procs=# defined for the node, + but no specific socket/core/thread count configured, avoid fatal error if the number of cores on a node is less than the number of Procs configured. -- Added ability for the perlapi to utilize opaque data types returned from the C api. @@ -1934,23 +1975,23 @@ documents those changes that are of interest to users and admins. ======================== -- Fix for purge script in accounting to use correct options. -- If SelectType=select/linear and SelectTypeParameters=CR_Memory fix bug that - would fail to release memory reserved for a job if "scontrol reconfigure" + would fail to release memory reserved for a job if "scontrol reconfigure" is executed while the job is in completing state. - -- Fix bug in handling event trigger for job time limit while job is still + -- Fix bug in handling event trigger for job time limit while job is still in pending state. -- Fixed display of Ave/MaxCPU in sacct for jobs. Steps were printed correctly. -- When node current features differs from slurm.conf, log the node names using a hostlist expression rather than listing individual node names. -- Improve ability of srun to abort job step for some task launch failures. - -- Fix mvapich plugin logic to release the created job allocation on - initialization failure (previously the failures would cancel job step, + -- Fix mvapich plugin logic to release the created job allocation on + initialization failure (previously the failures would cancel job step, but retain job allocation). -- Fix bug in srun for task count so large that it overflows int data type. -- Fix important bug in select/cons_res handling of ntasks-per-core parameter that was uncovered by a bug fixed in v2.1.3. Bug produced fatal error for slurmctld: "cons_res: cpus computation error". 
- -- Fix bug in select/cons_res handling of partitions configured with + -- Fix bug in select/cons_res handling of partitions configured with Shared=YES. Prior logic failed to support running multiple jobs per node. * Changes in SLURM 2.1.3-2 @@ -1964,11 +2005,11 @@ documents those changes that are of interest to users and admins. was drained you would not be able to create new blocks on it. -- In sched/wiki2 (for Moab): Add excluded host list to job information using new keyword "EXCLUDE_HOSTLIST". - -- Correct slurmd reporting of incorrect socket/core/thread counts. + -- Correct slurmd reporting of incorrect socket/core/thread counts. -- For sched/wiki2 (Moab): Do not extend a job's end time for suspend/resume or startup delay due to node boot time. A job's end time will always be its start time plus time limit. - -- Added build-time option (to configure program) of --with-pam_dir to + -- Added build-time option (to configure program) of --with-pam_dir to specify the directory into which PAM modules get installed, although it should pick the proper directory by default. "make install" and "rpmbuild" should now put the pam_slurm.so file in the proper directory. @@ -1979,12 +2020,12 @@ documents those changes that are of interest to users and admins. modes). -- For topology/tree, log invalid hostnames in a single hostlist expression rather than one per line. - -- A job step's default time limit will be UNLIMITED rather than partition's + -- A job step's default time limit will be UNLIMITED rather than partition's default time limit. The step will automatically be cancelled as part of the job termination logic when the job's time limit is reached. -- sacct - fixed bug when checking jobs against a reservation -- In select/cons_res, fix support for job allocation with --ntasks_per_node - option. Previously could allocate too few CPUs on some nodes. + option. Previously could allocate too few CPUs on some nodes. 
-- Adjustment made to init message to the slurmdbd to allow backwards compatibility with future 2.2 release. YOU NEED TO UPGRADE SLURMDBD BEFORE ANYTHING ELSE. @@ -1997,16 +2038,16 @@ documents those changes that are of interest to users and admins. under some conditions. -- When a node silently fails which is already drained/down the reason for draining for the node is not changed. - -- Srun will ignore SLURM_NNODES environment variable and use the count of - currently allocated nodes if that count changes during the job's lifetime - (e.g. job allocation uses the --no-kill option and a node goes DOWN, job + -- Srun will ignore SLURM_NNODES environment variable and use the count of + currently allocated nodes if that count changes during the job's lifetime + (e.g. job allocation uses the --no-kill option and a node goes DOWN, job step would previously always fail). -- Made it so sacctmgr can't add blank user or account. The MySQL plugin will also reject such requests. -- Revert libpmi.so version for compatibility with SLURM version 2.0 and earlier to avoid forcing applications using a specific libpmi.so version to rebuild unnecessarily (revert from libpmi.so.21.0.0 to libpmi.so.0.0.0). - -- Restore support for a pending job's constraints (required node features) + -- Restore support for a pending job's constraints (required node features) when slurmctld is restarted (internal structure needed to be rebuilt). -- Removed checkpoint_blcr.so from the plugin rpm in the slurm.spec since it is also in the blcr rpm. @@ -2014,7 +2055,7 @@ documents those changes that are of interest to users and admins. of jobs to share resources. -- BLUEGENE - Fixed issue where tasks on steps weren't being displayed correctly with scontrol and sview. - -- BLUEGENE - fixed wiki2 plugin to report correct task count for pending + -- BLUEGENE - fixed wiki2 plugin to report correct task count for pending jobs. 
-- BLUEGENE - Added /etc/ld.so.conf.d/slurm.conf to point to the directory holding libsched_if64.so when building rpms. @@ -2031,7 +2072,7 @@ documents those changes that are of interest to users and admins. beginning of the day if not set previously. -- Defined slurm_free_reservation_info_msg and slurm_free_topo_info_msg in common/slurm_protocol_defs.h - -- Avoid generating error when a job step includes a memory specification and + -- Avoid generating error when a job step includes a memory specification and memory is not configured as a consumable resource. -- Patch for small memory leak in src/common/plugstack.c -- Fix sview search on node state. @@ -2044,12 +2085,12 @@ documents those changes that are of interest to users and admins. -- Fix spelling issues (from Gennaro Oliva) -- Fix issue when changing parents of an account in accounting all childern weren't always sent to their respected slurmctlds until a restart. - -- Restore support for srun/salloc/sbatch option --hint=nomultithread to + -- Restore support for srun/salloc/sbatch option --hint=nomultithread to bind tasks to cores rather than threads (broken in slurm v2.1.0-pre5). -- Fix issue where a 2.0 sacct could not talk correctly to a 2.1 slurmdbd. -- BLUEGENE - Fix issue where no partitions have any nodes assigned them to alert user no blocks can be created. - -- BLUEGENE - Fix smap to put BGP images when using -Dc on a Blue Gene/P + -- BLUEGENE - Fix smap to put BGP images when using -Dc on a Blue Gene/P system. -- Set SLURM_SUBMIT_DIR environment variable for srun and salloc commands to match behavior of sbatch command. @@ -2060,7 +2101,7 @@ documents those changes that are of interest to users and admins. -- BLUEGENE - critical fix where jobs would be killed incorrectly. -- BLUEGENE - fix for sview putting multiple ionodes on to nodelists when viewing the jobs tab. - + * Changes in SLURM 2.1.0 ======================== -- Improve sview layout of blocks in use. 
@@ -2068,7 +2109,7 @@ documents those changes that are of interest to users and admins. -- BLUEGENE - improved startup speed further for large numbers of defined blocks -- Fix to _get_job_min_nodes() in wiki2/get_jobs.c suggested by Michal Novotny - -- BLUEGENE - fixed issues when updating a pending job when a node + -- BLUEGENE - fixed issues when updating a pending job when a node count was incorrect for the asked for connection type. -- BLUEGENE - fixed issue when combining blocks that are in ready states to make a larger block from those or make multiple smaller blocks by diff --git a/contribs/pam/pam_slurm.c b/contribs/pam/pam_slurm.c index 426f10eae..62b761506 100644 --- a/contribs/pam/pam_slurm.c +++ b/contribs/pam/pam_slurm.c @@ -30,6 +30,9 @@ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. \*****************************************************************************/ +#if HAVE_CONFIG_H +# include "config.h" +#endif #include <ctype.h> #include <errno.h> @@ -357,12 +360,36 @@ _send_denial_msg(pam_handle_t *pamh, struct _options *opts, */ extern void libpam_slurm_init (void) { + char libslurmname[64]; + if (slurm_h) return; - if (!(slurm_h = dlopen("libslurm.so", RTLD_NOW|RTLD_GLOBAL))) - _log_msg (LOG_ERR, "Unable to dlopen libslurm: %s\n", - dlerror ()); + /* First try to use the same libslurm version ("libslurm.so.24.0.0"), + * Second try to match the major version number ("libslurm.so.24"), + * Otherwise use "libslurm.so" */ + if (snprintf(libslurmname, sizeof(libslurmname), + "libslurm.so.%d.%d.%d", SLURM_API_CURRENT, + SLURM_API_REVISION, SLURM_API_AGE) >= + sizeof(libslurmname) ) { + _log_msg (LOG_ERR, "Unable to write libslurmname\n"); + } else if (!(slurm_h = dlopen(libslurmname, RTLD_NOW|RTLD_GLOBAL))) { + _log_msg (LOG_INFO, "Unable to dlopen %s: %s\n", + libslurmname, dlerror ()); + } + + if (snprintf(libslurmname, sizeof(libslurmname), "libslurm.so.%d", + SLURM_API_CURRENT) >= sizeof(libslurmname) ) { + _log_msg (LOG_ERR, "Unable to 
write libslurmname\n"); + } else if (!(slurm_h = dlopen(libslurmname, RTLD_NOW|RTLD_GLOBAL))) { + _log_msg (LOG_INFO, "Unable to dlopen %s: %s\n", + libslurmname, dlerror ()); + } + + if (!(slurm_h = dlopen("libslurm.so", RTLD_NOW|RTLD_GLOBAL))) { + _log_msg (LOG_ERR, "Unable to dlopen libslurm.so: %s\n", + dlerror ()); + } return; } diff --git a/doc/html/configurator.html.in b/doc/html/configurator.html.in index 0296d4af6..a308af681 100644 --- a/doc/html/configurator.html.in +++ b/doc/html/configurator.html.in @@ -129,7 +129,7 @@ function displayfile() "#DisableRootJobs=NO <br>" + "#EnforcePartLimits=NO <br>" + get_field("Epilog",document.config.epilog) + "<br>" + - "#PrologSlurmctld= <br>" + + "#EpilogSlurmctld= <br>" + "#FirstJobId=1 <br>" + "#MaxJobId=999999 <br>" + "#GresTypes= <br>" + @@ -166,6 +166,7 @@ function displayfile() "SlurmdPort=" + document.config.slurmd_port.value + "<br>" + "SlurmdSpoolDir=" + document.config.slurmd_spool_dir.value + "<br>" + "SlurmUser=" + document.config.slurm_user.value + "<br>" + + "#SlurmdUser=root <br>" + get_field("SrunEpilog",document.config.srun_epilog) + "<br>" + get_field("SrunProlog",document.config.srun_prolog) + "<br>" + "StateSaveLocation=" + document.config.state_save_location.value + "<br>" + diff --git a/doc/html/cons_res.shtml b/doc/html/cons_res.shtml index 971f36df0..b253dafd9 100644 --- a/doc/html/cons_res.shtml +++ b/doc/html/cons_res.shtml @@ -45,9 +45,9 @@ this plug-in is described below. as consumable resources.</li> </ul> <li>In the cases where Memory is the consumable resource or one of - the two consumable resources the <b>Memory</b> parameter which - defines a node amount of real memory in slurm.conf must be - set when fastschedule=1. + the two consumable resources the <b>RealMemory</b> parameter, which + defines a node's amount of real memory in slurm.conf, must be + set when FastSchedule=1. 
<li>srun's <i>-E</i> extension for sockets, cores, and threads are ignored within the node allocation mechanism when CR_CPU or CR_CPU_MEMORY is selected. It is considered to compute the total diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index 03ad05bd3..37cd5bb0e 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -40,7 +40,7 @@ (e.g. place it into a <i>hold</i> state)?</a></li> <li><a href="#mem_limit">Why are jobs not getting the appropriate memory limit?</a></li> -<li><a href="#mailing_list">Is an archive available of messages posted to +<li><a href="#mailing_list">Is an archive available of messages posted to the <i>slurm-dev</i> mailing list?</a></li> <li><a href="#job_size">Can I change my job's size after it has started running?</a></li> @@ -95,35 +95,35 @@ SLURM? Why does the DAKOTA program not run with SLURM?</a></li> <li><a href="#cred_replay">Why are "Task launch failed on node ... Job credential replayed" errors generated?</a></li> -<li><a href="#globus">Can SLURM be used with Globus?</li> -<li><a href="#time_format">Can SLURM time output format include the year?</li> +<li><a href="#globus">Can SLURM be used with Globus?</a></li> +<li><a href="#time_format">Can SLURM time output format include the year?</a></li> <li><a href="#file_limit">What causes the error - "Unable to accept new connection: Too many open files"?</li> + "Unable to accept new connection: Too many open files"?</a></li> <li><a href="#slurmd_log">Why does the setting of <i>SlurmdDebug</i> fail - to log job step information at the appropriate level?</li> + to log job step information at the appropriate level?</a></li> <li><a href="#rpm">Why isn't the auth_none.so (or other file) in a - SLURM RPM?</li> + SLURM RPM?</a></li> <li><a href="#slurmdbd">Why should I use the slurmdbd instead of the - regular database plugins?</li> -<li><a href="#debug">How can I build SLURM with debugging symbols?</li> + regular database plugins?</a></li> +<li><a href="#debug">How can I build 
SLURM with debugging symbols?</a></li> <li><a href="#state_preserve">How can I easily preserve drained node - information between major SLURM updates?</li> + information between major SLURM updates?</a></li> <li><a href="#health_check">Why doesn't the <i>HealthCheckProgram</i> - execute on DOWN nodes?</li> + execute on DOWN nodes?</a></li> <li><a href="#batch_lost">What is the meaning of the error "Batch JobId=# missing from master node, killing it"?</a></li> <li><a href="#accept_again">What does the messsage "srun: error: Unable to accept connection: Resources temporarily unavailable" indicate?</a></li> <li><a href="#task_prolog">How could I automatically print a job's - SLURM job ID to its standard output?</li> + SLURM job ID to its standard output?</a></li> <li><a href="#moab_start">I run SLURM with the Moab or Maui scheduler. - How can I start a job under SLURM without the scheduler?</li> + How can I start a job under SLURM without the scheduler?</a></li> <li><a href="#orphan_procs">Why are user processes and <i>srun</i> - running even though the job is supposed to be completed?</li> + running even though the job is supposed to be completed?</a></li> <li><a href="#slurmd_oom">How can I prevent the <i>slurmd</i> and <i>slurmstepd</i> daemons from being killed when a node's memory - is exhausted?</li> + is exhausted?</a></li> <li><a href="#ubuntu">I see my host of my calling node as 127.0.1.1 instead of the correct IP address. Why is that?</a></li> <li><a href="#stop_sched">How can I stop SLURM from scheduling jobs?</a></li> @@ -641,7 +641,7 @@ problem described above. Use the same solution for the AS (Address Space), RSS (Resident Set Size), or other limits as needed.</p> -<p><a name="mailing_list"><b>23. Is an archive available of messages posted to +<p><a name="mailing_list"><b>23. 
Is an archive available of messages posted to the <i>slurm-dev</i> mailing list?</b></a><br> Yes, it is at <a href="http://groups.google.com/group/slurm-devel"> http://groups.google.com/group/slurm-devel</a></p> @@ -665,7 +665,7 @@ job to be expanded.</li> <p>Use the <i>scontrol</i> command to change a job's size either by specifying a new node count (<i>NumNodes=</i>) for the job or identify the specific nodes -(<i>NodeList=</i>) that you want the job to retain. +(<i>NodeList=</i>) that you want the job to retain. Any job steps running on the nodes which are reliquished by the job will be killed unless initiated with the <i>--no-kill</i> option. After the job size is changed, some environment variables created by SLURM @@ -1288,15 +1288,15 @@ Index: src/slurmctld/ping_nodes.c --- src/slurmctld/ping_nodes.c (revision 15166) +++ src/slurmctld/ping_nodes.c (working copy) @@ -283,9 +283,6 @@ - node_ptr = &node_record_table_ptr[i]; - base_state = node_ptr->node_state & NODE_STATE_BASE; + node_ptr = &node_record_table_ptr[i]; + base_state = node_ptr->node_state & NODE_STATE_BASE; - if (base_state == NODE_STATE_DOWN) - continue; - #ifdef HAVE_FRONT_END /* Operate only on front-end */ - if (i > 0) - continue; + if (i > 0) + continue; </pre> <p><a name="batch_lost"><b>32. What is the meaning of the error @@ -1411,10 +1411,10 @@ advantage of its filtering and formatting options. For example: $ squeue -tpd -h -o "scontrol update jobid=%i priority=1000" >my.script </pre></p> -<p><a name="amazon_ec2"><b>41. Can SLURM be used to run jobs on +<p><a name="amazon_ec2"><b>41. 
Can SLURM be used to run jobs on Amazon's EC2?</b></a></br> -<p>Yes, here is a description of use SLURM use with -<a href="http://aws.amazon.com/ec2/">Amazon's EC2</a> courtesy of +<p>Yes, here is a description of SLURM use with +<a href="http://aws.amazon.com/ec2/">Amazon's EC2</a> courtesy of Ashley Pittman:</p> <p>I do this regularly and have no problem with it, the approach I take is to start as many instances as I want and have a wrapper around @@ -1448,7 +1448,7 @@ pathname (starting with "/"). Otherwise it will be found in directory used for saving state (<i>SlurmdSpoolDir</i>).</p> <p>For <i>slurmstepd</i> the core file will depend upon when the failure -occurs. It will either be in spawned job's working directory on the same +occurs. It will either be in spawned job's working directory on the same location as that described above for the <i>slurmd</i> daemon.</p> <p><a name="totalview"><b>43. How can TotalView be configured to operate with diff --git a/doc/html/footer.txt b/doc/html/footer.txt index 35ec10de6..0ea9db512 100644 --- a/doc/html/footer.txt +++ b/doc/html/footer.txt @@ -2,7 +2,7 @@ <div id="footer"> <div id="left"> <a href="disclaimer.html" target="_blank" class="privacy">Legal Notices</a></div> -<div id="right"><span class="ucrlnum">27 June 2011 </span></div> +<div id="right"><span class="ucrlnum"></span></div> </div> <div id="footer2"> diff --git a/doc/html/team.shtml b/doc/html/team.shtml index 4a1b1d07e..eb76ba2bc 100644 --- a/doc/html/team.shtml +++ b/doc/html/team.shtml @@ -17,6 +17,7 @@ organizations. The current SLURM development staff includes: </p> <li>Ernest Artiaga (Barcelona Supercomputer Center, Spain)</li> <li>Susanne Balle (HP)</li> <li>Ralph Bean (Rochester Institute of Technology)</li> +<li>Alexander Bersenev (Institute of Mathematics and Mechanics, Russia)</li> <li>Anton Blanchard (Samba)</li> <li>Janne Blomqvist (Aalto University, Finland)</li> <li>David Bremer (LLNL)</li> @@ -26,6 +27,7 @@ organizations.
The current SLURM development staff includes: </p> <li>Daniel Christians (HP)</li> <li>Gilles Civario (Bull)</li> <li>Chuck Clouston (Bull)</li> +<li>Phil Eckert (Lawrence Livermore National Laboratory)</li> <li>Yuri D'Elia (Center for Biomedicine, EURAC Research, Italy)</li> <li>Carles Fenoy (Barcelona Supercomputer Center, Spain)</li> <li>Joseph Donaghy (LLNL)</li> @@ -87,6 +89,7 @@ organizations. The current SLURM development staff includes: </p> <li>Prashanth Tamraparni (HP, India)</li> <li>Jimmy Tang (Trinity College, Ireland)</li> <li>Kevin Tew (LLNL/Bringham Young University)</li> +<li>John Thiltges (University of Nebraska-Lincoln)</li> <li>Adam Todorski (Rensselaer Polytechnic Institute)</li> <li>Stephen Trofinoff (Swiss National Supercomputing Centre)</li> <li>Nathan Weeks (Iowa State University)</li> diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index 9990122f1..a46a2bbad 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -849,7 +849,7 @@ sockets. See additional information under \fB\-B\fR option above when task/affinity plugin is enabled. .TP -\fB\-\-switch\fR=<\fIcount\fR>[@<\fImax\-time\fR>] +\fB\-\-switches\fR=<\fIcount\fR>[@<\fImax\-time\fR>] When a tree topology is used, this defines the maximum count of switches desired for the job allocation and optionally the maximum time to wait for that number of switches. If SLURM finds an allocation containing more diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index da6709d2b..aa8b91c07 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -375,6 +375,17 @@ specific environment variable names, then the \fB\-\-get\-user\-env\fR option will implicitly be set to load other environment variables based upon the user's configuration on the cluster which executes the job. 
+.TP +\fB\-\-export\-file\fR=<\fIfilename\fR | \fIfd\fR> +If a number between 3 and OPEN_MAX is specified as the argument to +this option, a readable file descriptor will be assumed (STDIN and +STDOUT are not supported as valid arguments). Otherwise a filename is +assumed. Export environment variables defined in <\fIfilename\fR> or +read from <\fIfd\fR> to the job's execution environment. The +content is one or more environment variable definitions of the form +NAME=value, each separated by a null character. This allows the use +of special characters in environment definitions. + .TP \fB\-F\fR, \fB\-\-nodefile\fR=<\fInode file\fR> Much like \-\-nodelist, but the list is contained in a file of name @@ -962,7 +973,7 @@ sockets. See additional information under \fB\-B\fR option above when task/affinity plugin is enabled. .TP -\fB\-\-switch\fR=<\fIcount\fR>[@<\fImax\-time\fR>] +\fB\-\-switches\fR=<\fIcount\fR>[@<\fImax\-time\fR>] When a tree topology is used, this defines the maximum count of switches desired for the job allocation and optionally the maximum time to wait for that number of switches. If SLURM finds an allocation containing more diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index abf7b5311..16d2fa7d1 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -1055,7 +1055,7 @@ sockets. See additional information under \fB\-B\fR option above when task/affinity plugin is enabled. .TP -\fB\-\-switch\fR=<\fIcount\fR>[@<\fImax\-time\fR>] +\fB\-\-switches\fR=<\fIcount\fR>[@<\fImax\-time\fR>] When a tree topology is used, this defines the maximum count of switches desired for the job allocation and optionally the maximum time to wait for that number of switches. If SLURM finds an allocation containing more @@ -1821,8 +1821,8 @@ One or more task ranks to use this configuration. Multiple values may be comma separated. Ranges may be indicated with two numbers separated with a '\-' with the smaller number first (e.g. "0\-4" and not "4\-0"). 
-To indicate all tasks, specify a rank of '*' (in which case you probably -should not be using this option). +To indicate all tasks not otherwise specified, specify a rank of '*' as the +last line of the file. If an attempt is made to initiate a task for which no executable program is defined, the following error message will be produced "No executable program specified for this task". diff --git a/slurm.spec b/slurm.spec index 58e92eae6..4f97caf79 100644 --- a/slurm.spec +++ b/slurm.spec @@ -88,14 +88,14 @@ %endif Name: slurm -Version: 2.3.2 +Version: 2.3.3 Release: 1%{?dist} Summary: Simple Linux Utility for Resource Management License: GPL Group: System Environment/Base -Source: slurm-2.3.2.tar.bz2 +Source: slurm-2.3.3.tar.bz2 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release} URL: http://www.schedmd.com/slurmdocs/ @@ -379,7 +379,7 @@ Gives the ability for SLURM to use Berkeley Lab Checkpoint/Restart ############################################################################# %prep -%setup -n slurm-2.3.2 +%setup -n slurm-2.3.3 %build %configure --program-prefix=%{?_program_prefix:%{_program_prefix}} \ diff --git a/src/common/env.c b/src/common/env.c index 7814062d7..2a80856a0 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -56,6 +56,7 @@ #include "slurm/slurm.h" #include "src/common/log.h" #include "src/common/env.h" +#include "src/common/fd.h" #include "src/common/read_config.h" #include "src/common/xassert.h" #include "src/common/xmalloc.h" @@ -80,6 +81,7 @@ strong_alias(env_array_append, slurm_env_array_append); strong_alias(env_array_append_fmt, slurm_env_array_append_fmt); strong_alias(env_array_overwrite, slurm_env_array_overwrite); strong_alias(env_array_overwrite_fmt, slurm_env_array_overwrite_fmt); +strong_alias(env_unset_environment, slurm_env_unset_environment); #define ENV_BUFSIZE (256 * 1024) @@ -1525,6 +1527,31 @@ void env_array_set_environment(char **env_array) } } +/* + * Unset all of the environment variables in a user's current + * 
environment. + * + * (Note: becuae the environ array is decrementing with each + * unsetenv, only increment the ptr on a failure to unset.) + */ +void env_unset_environment(void) +{ + extern char **environ; + char **ptr; + char name[256], *value; + + value = xmalloc(ENV_BUFSIZE); + for (ptr = (char **)environ; *ptr != NULL; ) { + if ((_env_array_entry_splitter(*ptr, name, sizeof(name), + value, ENV_BUFSIZE)) && + (unsetenv(name) != -1)) + ; + else + ptr++; + } + xfree(value); +} + /* * Merge all of the environment variables in src_array into the * array dest_array. Any variables already found in dest_array @@ -1569,16 +1596,99 @@ static void _strip_cr_nl(char *line) * Special case: return -1 if no open brackets are found */ static int _bracket_cnt(char *value) { - int open_br = 0, close_br = 0, i; + int count = 0, i; for (i=0; value[i]; i++) { if (value[i] == '{') - open_br++; + count++; else if (value[i] == '}') - close_br++; + count--; + } + return count; +} + +/* + * Load user environment from a specified file or file descriptor. + * + * This will read in a user specified file or fd, that is invoked + * via the --export-file option in sbatch. The NAME=value entries must + * be NULL separated to support special characters in the environment + * definitions. + * + * (Note: This is being added to a minor release. For the + * next major release, it might be a consideration to merge + * this funcitonality with that of load_env_cache and update + * env_cache_builder to use the NULL character.) + */ +char **env_array_from_file(const char *fname) +{ + char *buf = NULL, *ptr = NULL, *eptr = NULL; + char *value, *p; + char **env = NULL; + char name[256]; + int buf_size = BUFSIZ, buf_left; + int file_size = 0, tmp_size; + int separator = '\0'; + int fd; + + /* + * If file name is a numeric value, then it is assumed to be a + * file descriptor. 
+ */ + fd = (int)strtol(fname, &p, 10); + if ((*p != '\0') || (fd < 3) || (fd > sysconf(_SC_OPEN_MAX)) || + (fcntl(fd, F_GETFL) < 0)) { + fd = open(fname, O_RDONLY); + if (fd == -1) { + error("Could not open user environment file %s", fname); + return NULL; + } + verbose("Getting environment variables from %s", fname); + } else + verbose("Getting environment variables from fd %d", fd); + + /* + * Read in the user's environment data. + */ + buf = ptr = xmalloc(buf_size); + buf_left = buf_size; + while ((tmp_size = read(fd, ptr, buf_left))) { + if (tmp_size < 0) { + if (errno == EINTR) + continue; + error("read(environment_file): %m"); + break; + } + buf_left -= tmp_size; + file_size += tmp_size; + if (buf_left == 0) { + buf_size += BUFSIZ; + xrealloc(buf, buf_size); + } + ptr = buf + file_size; + buf_left = buf_size - file_size; + } + close(fd); + + /* + * Parse the buffer into individual environment variable names + * and build the environment. + */ + env = env_array_create(); + value = xmalloc(ENV_BUFSIZE); + for (ptr = buf; ; ptr = eptr+1) { + eptr = strchr(ptr, separator); + if ((ptr == eptr) || (eptr == NULL)) + break; + if (_env_array_entry_splitter(ptr, name, sizeof(name), + value, ENV_BUFSIZE) && + (!_discard_env(name, value))) { + env_array_overwrite(&env, name, value); + } } - if (open_br == 0) - return -1; - return (open_br - close_br); + xfree(buf); + xfree(value); + + return env; } /* @@ -1621,13 +1731,12 @@ static char **_load_env_cache(const char *username) if (value[0] == '(') { /* This is a bash function. 
* It may span multiple lines */ - int bracket_cnt; - while ((bracket_cnt = _bracket_cnt(value))) { + while (_bracket_cnt(value) > 0) { if (!fgets(line, ENV_BUFSIZE, fp)) break; _strip_cr_nl(line); if ((strlen(value) + strlen(line)) > - (sizeof(value) - 1)) + (ENV_BUFSIZE - 2)) break; strcat(value, "\n"); strcat(value, line); @@ -1853,13 +1962,12 @@ char **env_array_user_default(const char *username, int timeout, int mode) if (value[0] == '(') { /* This is a bash function. * It may span multiple lines */ - int bracket_cnt; - while ((bracket_cnt = _bracket_cnt(value))) { + while (_bracket_cnt(value) > 0) { line = strtok_r(NULL, "\n", &last); if (!line) break; if ((strlen(value) + strlen(line)) > - (sizeof(value) - 1)) + (ENV_BUFSIZE - 2)) break; strcat(value, "\n"); strcat(value, line); diff --git a/src/common/env.h b/src/common/env.h index 776e01ae3..53531f7fb 100644 --- a/src/common/env.h +++ b/src/common/env.h @@ -180,6 +180,12 @@ env_array_for_step(char ***dest, */ char **env_array_create(void); +/* + * Unset all of the environment variables in a user's current + * environment. + */ +void env_unset_environment(void); + /* * Merge all of the environment variables in src_array into the * array dest_array. Any variables already found in dest_array @@ -251,6 +257,11 @@ int env_array_overwrite_fmt(char ***array_ptr, const char *name, */ void env_array_set_environment(char **env_array); +/* + * load environment from specified file name. + */ +char **env_array_from_file(const char *filename); + /* * Return an array of strings representing the specified user's default * environment variables following a two-prongged approach. 
diff --git a/src/common/gres.c b/src/common/gres.c index a0e4523e3..4837db52a 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -692,8 +692,6 @@ static int _parse_gres_config(void **dest, slurm_parser_enum_t type, fatal("Invalid gres data for %s, Count=%s", p->name, tmp_str); } - if (tmp_long == 0) - fatal("Invalid gres data for %s, Count=0", p->name); if (p->count && (p->count != tmp_long)) { fatal("Invalid gres data for %s, Count does not match " "File value", p->name); diff --git a/src/common/jobacct_common.c b/src/common/jobacct_common.c index 5bac311b5..987d71a77 100644 --- a/src/common/jobacct_common.c +++ b/src/common/jobacct_common.c @@ -252,7 +252,7 @@ extern int jobacct_common_getinfo(struct jobacctinfo *jobacct, *uint32 = jobacct->tot_cpu; break; default: - debug("jobacct_g_set_setinfo data_type %d invalid", type); + debug("jobacct_g_set_getinfo data_type %d invalid", type); } slurm_mutex_unlock(&jobacct_lock); return rc; diff --git a/src/common/print_fields.c b/src/common/print_fields.c index 91e1bf5ff..c4ef4de86 100644 --- a/src/common/print_fields.c +++ b/src/common/print_fields.c @@ -295,11 +295,11 @@ extern void print_fields_long_double( } -extern void print_fields_time(print_field_t *field, uint32_t value, int last) +extern void print_fields_time(print_field_t *field, uint64_t value, int last) { int abs_len = abs(field->len); /* (value == unset) || (value == cleared) */ - if((value == NO_VAL) || (value == INFINITE)) { + if((value == (uint64_t)NO_VAL) || (value == (uint64_t)INFINITE)) { if(print_fields_parsable_print == PRINT_FIELDS_PARSABLE_NO_ENDING && last) @@ -325,11 +325,11 @@ extern void print_fields_time(print_field_t *field, uint32_t value, int last) } extern void print_fields_time_from_secs(print_field_t *field, - uint32_t value, int last) + uint64_t value, int last) { int abs_len = abs(field->len); /* (value == unset) || (value == cleared) */ - if((value == NO_VAL) || (value == INFINITE)) { + if((value == (uint64_t)NO_VAL) || 
(value == (uint64_t)INFINITE)) { if(print_fields_parsable_print == PRINT_FIELDS_PARSABLE_NO_ENDING && last) diff --git a/src/common/print_fields.h b/src/common/print_fields.h index 92153e05c..786af35d4 100644 --- a/src/common/print_fields.h +++ b/src/common/print_fields.h @@ -97,9 +97,9 @@ extern void print_fields_uint32( extern void print_fields_uint64( print_field_t *field, uint64_t value, int last); extern void print_fields_time_from_mins(print_field_t *field, - uint32_t value, int last); + uint64_t value, int last); extern void print_fields_time_from_secs(print_field_t *field, - uint32_t value, int last); + uint64_t value, int last); extern void print_fields_char_list(print_field_t *field, List value, int last); #define print_fields_uint print_fields_uint32 diff --git a/src/common/read_config.c b/src/common/read_config.c index d78e8a5eb..67742952c 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -2779,6 +2779,7 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) && (!strcmp(conf->proctrack_type,"proctrack/linuxproc"))) fatal("proctrack/linuxproc is incompatible with switch/elan"); + conf->private_data = 0; /* Set to default before parsing PrivateData */ if (s_p_get_string(&temp_str, "PrivateData", hashtbl)) { if (strstr(temp_str, "account")) conf->private_data |= PRIVATE_DATA_ACCOUNTS; diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 3593cfee5..d7c679d8c 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -3581,7 +3581,7 @@ extern int nodelist_find(const char *nodelist, const char *name) extern void convert_num_unit(float num, char *buf, int buf_size, int orig_type) { - char *unit = "\0KMGP?"; + char *unit = "\0KMGTP?"; int i = (int)num % 512; if ((int)num == 0) { @@ -3612,7 +3612,7 @@ extern void convert_num_unit(float num, char *buf, int buf_size, int orig_type) extern int revert_num_unit(const char *buf) { - char *unit = "\0KMGP\0"; + char 
*unit = "\0KMGTP\0"; int i = 1, j = 0, number = 0; if (!buf) diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 7968bf939..55d9b9613 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -78,6 +78,7 @@ enum { UNIT_KILO, UNIT_MEGA, UNIT_GIGA, + UNIT_TERA, UNIT_PETA, UNIT_UNKNOWN }; diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 428e38ac7..562f21b4e 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1522,7 +1522,7 @@ extern void private_data_string(uint16_t private_data, char *str, int str_len) { if (str_len > 0) str[0] = '\0'; - if (str_len < 42) { + if (str_len < 55) { error("private_data_string: output buffer too small"); return; } @@ -1554,7 +1554,12 @@ extern void private_data_string(uint16_t private_data, char *str, int str_len) strcat(str, ","); strcat(str, "accounts"); //9 len } - // total len 42 + if (private_data & PRIVATE_DATA_RESERVATIONS) { + if (str[0]) + strcat(str, ","); + strcat(str, "reservations"); //13 len + } + // total len 55 if (str[0] == '\0') strcat(str, "none"); diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index a2404347a..386555504 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -449,6 +449,7 @@ typedef struct priority_factors_object { typedef struct priority_factors_request_msg { List job_id_list; List uid_list; + uid_t uid; /* used as a stop gap to verify auth DO NOT PACK */ } priority_factors_request_msg_t; typedef struct priority_factors_response_msg { diff --git a/src/common/slurmdb_pack.c b/src/common/slurmdb_pack.c index e714aa74e..b6cfd919b 100644 --- a/src/common/slurmdb_pack.c +++ b/src/common/slurmdb_pack.c @@ -3928,6 +3928,7 @@ extern void slurmdb_pack_job_cond(void *in, uint16_t rpc_version, Buf buffer) while((tmp_info = list_next(itr))) { packstr(tmp_info, buffer); } + list_iterator_destroy(itr); } count = 
NO_VAL; @@ -4032,6 +4033,9 @@ extern void slurmdb_pack_job_cond(void *in, uint16_t rpc_version, Buf buffer) } count = NO_VAL; + if(object->wckey_list) + count = list_count(object->wckey_list); + pack32(count, buffer); if(count && count != NO_VAL) { itr = list_iterator_create(object->wckey_list); @@ -4209,6 +4213,9 @@ extern void slurmdb_pack_job_cond(void *in, uint16_t rpc_version, Buf buffer) } count = NO_VAL; + if(object->wckey_list) + count = list_count(object->wckey_list); + pack32(count, buffer); if(count && count != NO_VAL) { itr = list_iterator_create(object->wckey_list); diff --git a/src/plugins/accounting_storage/mysql/as_mysql_job.c b/src/plugins/accounting_storage/mysql/as_mysql_job.c index c5a37deaa..2e03d6bec 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_job.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_job.c @@ -584,9 +584,11 @@ extern List as_mysql_modify_job(mysql_conn_t *mysql_conn, uint32_t uid, if (job->derived_ec != NO_VAL) xstrfmtcat(vals, ", derived_ec=%u", job->derived_ec); - if (job->derived_es) - xstrfmtcat(vals, ", derived_es='%s'", job->derived_es); - + if (job->derived_es) { + char *derived_es = slurm_add_slash_to_quotes(job->derived_es); + xstrfmtcat(vals, ", derived_es='%s'", derived_es); + xfree(derived_es); + } if (!vals) { errno = SLURM_NO_CHANGE_IN_DATA; error("No change specified for job modification"); @@ -713,15 +715,23 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn, submit_time, job_ptr->job_id, job_ptr->assoc_id))) { + /* Comment is overloaded in job_start to be + the block_id, so we will need to store this + for later. 
+ */ + char *comment = job_ptr->comment; + job_ptr->comment = NULL; /* If we get an error with this just fall * through to avoid an infinite loop */ if (as_mysql_job_start( mysql_conn, job_ptr) == SLURM_ERROR) { + job_ptr->comment = comment; error("couldn't add job %u at job completion", job_ptr->job_id); return SLURM_SUCCESS; } + job_ptr->comment = comment; } } diff --git a/src/plugins/accounting_storage/mysql/as_mysql_qos.c b/src/plugins/accounting_storage/mysql/as_mysql_qos.c index 556eb6f70..3102e90b8 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_qos.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_qos.c @@ -92,6 +92,8 @@ static int _setup_qos_limits(slurmdb_qos_rec_t *qos, qos->description = xstrdup(""); if (qos->flags & QOS_FLAG_NOTSET) qos->flags = 0; + if (qos->grace_time == NO_VAL) + qos->grace_time = 0; if (qos->grp_cpu_mins == (uint64_t)NO_VAL) qos->grp_cpu_mins = (uint64_t)INFINITE; if (qos->grp_cpu_run_mins == (uint64_t)NO_VAL) @@ -125,9 +127,11 @@ static int _setup_qos_limits(slurmdb_qos_rec_t *qos, if (qos->max_wall_pj == NO_VAL) qos->max_wall_pj = INFINITE; if (qos->preempt_mode == (uint16_t)NO_VAL) - qos->preempt_mode = (uint16_t)INFINITE; + qos->preempt_mode = 0; + if (qos->priority == NO_VAL) + qos->priority = 0; if (fuzzy_equal(qos->usage_factor, NO_VAL)) - qos->usage_factor = (double)INFINITE; + qos->usage_factor = 1; if (fuzzy_equal(qos->usage_thres, NO_VAL)) qos->usage_thres = (double)INFINITE; } @@ -1113,8 +1117,6 @@ empty: if (row[QOS_REQ_GRACE]) qos->grace_time = slurm_atoul(row[QOS_REQ_GRACE]); - else - qos->grace_time = (uint32_t)NO_VAL; if (row[QOS_REQ_GCM]) qos->grp_cpu_mins = slurm_atoull(row[QOS_REQ_GCM]); diff --git a/src/plugins/priority/multifactor/priority_multifactor.c b/src/plugins/priority/multifactor/priority_multifactor.c index 04eaf2604..9da77c9a3 100644 --- a/src/plugins/priority/multifactor/priority_multifactor.c +++ b/src/plugins/priority/multifactor/priority_multifactor.c @@ -71,10 +71,12 @@ * 
overwritten when linking with the slurmctld. */ #if defined (__APPLE__) +void *acct_db_conn __attribute__((weak_import)) = NULL; uint32_t cluster_cpus __attribute__((weak_import)) = NO_VAL; List job_list __attribute__((weak_import)) = NULL; time_t last_job_update __attribute__((weak_import)); #else +void *acct_db_conn = NULL; uint32_t cluster_cpus = NO_VAL; List job_list = NULL; time_t last_job_update; @@ -1392,6 +1394,14 @@ extern List priority_p_get_priority_factors_list( if (_filter_job(job_ptr, req_job_list, req_user_list)) continue; + if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) + && (job_ptr->user_id != req_msg->uid) + && !validate_operator(req_msg->uid) + && !assoc_mgr_is_user_acct_coord( + acct_db_conn, req_msg->uid, + job_ptr->account)) + continue; + obj = xmalloc(sizeof(priority_factors_object_t)); memcpy(obj, job_ptr->prio_factors, sizeof(priority_factors_object_t)); diff --git a/src/plugins/select/bluegene/bg_job_place.c b/src/plugins/select/bluegene/bg_job_place.c index 74d0b92b4..82142e4b6 100644 --- a/src/plugins/select/bluegene/bg_job_place.c +++ b/src/plugins/select/bluegene/bg_job_place.c @@ -1507,7 +1507,7 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, max_nodes, req_nodes, &bg_record, local_mode, avail_cpus); - if (rc == SLURM_SUCCESS && SELECT_IS_PREEMPT_SET(local_mode)) { + if (rc != SLURM_SUCCESS && SELECT_IS_PREEMPT_SET(local_mode)) { ListIterator itr; ListIterator job_itr; bg_record_t *found_record; diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c index 2fc1a1b35..cb6deda6f 100644 --- a/src/plugins/select/cons_res/job_test.c +++ b/src/plugins/select/cons_res/job_test.c @@ -1628,13 +1628,13 @@ static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *bitmap, best_fit_nodes = switches_node_cnt[j]; best_fit_location = j; best_fit_sufficient = sufficient; - leaf_switch_count++; } } } if (best_fit_nodes == 0) break; + leaf_switch_count++; /* Use 
select nodes from this leaf */ first = bit_ffs(switches_bitmap[best_fit_location]); last = bit_fls(switches_bitmap[best_fit_location]); @@ -2105,7 +2105,7 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); - if (cpu_count) { + if ((cpu_count) && (job_ptr->best_switch)) { /* job fits! We're done. */ if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("cons_res: cr_job_test: test 1 pass - " @@ -2319,7 +2319,7 @@ alloc_job: FREE_NULL_BITMAP(orig_map); FREE_NULL_BITMAP(avail_cores); FREE_NULL_BITMAP(tmpcore); - if (!cpu_count) { + if ((!cpu_count) || (!job_ptr->best_switch)) { /* we were sent here to cleanup and exit */ FREE_NULL_BITMAP(free_cores); if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index 22b57e055..704e7083b 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -1306,7 +1306,6 @@ static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap, best_fit_nodes = switches_node_cnt[j]; best_fit_location = j; best_fit_sufficient = sufficient; - leaf_switch_count++; } } #if SELECT_DEBUG @@ -1344,6 +1343,7 @@ static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap, break; } switches_node_cnt[best_fit_location] = 0; + leaf_switch_count++; if (job_ptr->req_switch > 0) { if (time_waiting > job_ptr->wait4switch) { job_ptr->best_switch = true; diff --git a/src/plugins/task/cgroup/task_cgroup_devices.c b/src/plugins/task/cgroup/task_cgroup_devices.c index 6a39f87ea..303bb4384 100644 --- a/src/plugins/task/cgroup/task_cgroup_devices.c +++ b/src/plugins/task/cgroup/task_cgroup_devices.c @@ -82,6 +82,7 @@ static int read_allowed_devices_file(char *allowed_devices[PATH_MAX]); extern int task_cgroup_devices_init(slurm_cgroup_conf_t 
*slurm_cgroup_conf) { char release_agent_path[PATH_MAX]; + uint16_t cpunum; /* initialize cpuinfo internal data */ if ( xcpuinfo_init() != XCPUINFO_SUCCESS ) @@ -96,6 +97,13 @@ extern int task_cgroup_devices_init(slurm_cgroup_conf_t *slurm_cgroup_conf) /* initialize allowed_devices_filename */ cgroup_allowed_devices_file[0] = '\0'; + if ( get_procs(&cpunum) != 0 ) { + error("task/cgroup: unable to get a number of CPU"); + goto error; + } + + (void) gres_plugin_node_config_load(cpunum); + strcpy(cgroup_allowed_devices_file, slurm_cgroup_conf->allowed_devices_file); if ( snprintf(release_agent_path,PATH_MAX,"%s/release_devices", diff --git a/src/sacct/print.c b/src/sacct/print.c index 09ec1a888..03ac3ae17 100644 --- a/src/sacct/print.c +++ b/src/sacct/print.c @@ -139,6 +139,7 @@ void print_fields(type_t type, void *object) char *tmp_char = NULL; int tmp_int = NO_VAL, tmp_int2 = NO_VAL; double tmp_dub = (double)NO_VAL; + uint64_t tmp_uint64 = (uint64_t)NO_VAL; memset(&outbuf, 0, sizeof(outbuf)); switch(field->type) { @@ -333,10 +334,12 @@ void print_fields(type_t type, void *object) case PRINT_CPU_TIME: switch(type) { case JOB: - tmp_int = job->elapsed * job->alloc_cpus; + tmp_uint64 = (uint64_t)job->elapsed + * (uint64_t)job->alloc_cpus; break; case JOBSTEP: - tmp_int = step->elapsed * step->ncpus; + tmp_uint64 = (uint64_t)step->elapsed + * (uint64_t)step->ncpus; break; case JOBCOMP: break; @@ -344,16 +347,18 @@ void print_fields(type_t type, void *object) break; } field->print_routine(field, - tmp_int, + tmp_uint64, (curr_inx == field_count)); break; case PRINT_CPU_TIME_RAW: switch(type) { case JOB: - tmp_int = job->elapsed * job->alloc_cpus; + tmp_uint64 = (uint64_t)job->elapsed + * (uint64_t)job->alloc_cpus; break; case JOBSTEP: - tmp_int = step->elapsed * step->ncpus; + tmp_uint64 = (uint64_t)step->elapsed + * (uint64_t)step->ncpus; break; case JOBCOMP: break; @@ -361,7 +366,7 @@ void print_fields(type_t type, void *object) break; } field->print_routine(field, - 
tmp_int, + tmp_uint64, (curr_inx == field_count)); break; case PRINT_DERIVED_EC: @@ -406,7 +411,7 @@ void print_fields(type_t type, void *object) break; } field->print_routine(field, - tmp_int, + (uint64_t)tmp_int, (curr_inx == field_count)); break; case PRINT_ELIGIBLE: @@ -1020,7 +1025,7 @@ void print_fields(type_t type, void *object) break; } field->print_routine(field, - tmp_int, + (uint64_t)tmp_int, (curr_inx == field_count)); break; case PRINT_RESV_CPU: @@ -1043,7 +1048,7 @@ void print_fields(type_t type, void *object) break; } field->print_routine(field, - tmp_int, + (uint64_t)tmp_int, (curr_inx == field_count)); break; case PRINT_RESV_CPU_RAW: @@ -1160,7 +1165,7 @@ void print_fields(type_t type, void *object) break; } field->print_routine(field, - tmp_int, + (uint64_t)tmp_int, (curr_inx == field_count)); break; case PRINT_SYSTEMCPU: diff --git a/src/sacct/sacct.c b/src/sacct/sacct.c index 769bae2ed..6d5b9801c 100644 --- a/src/sacct/sacct.c +++ b/src/sacct/sacct.c @@ -55,7 +55,7 @@ print_field_t fields[] = { {10, "Cluster", print_fields_str, PRINT_CLUSTER}, {14, "Comment", print_fields_str, PRINT_COMMENT}, {10, "CPUTime", print_fields_time_from_secs, PRINT_CPU_TIME}, - {10, "CPUTimeRAW", print_fields_int, PRINT_CPU_TIME_RAW}, + {10, "CPUTimeRAW", print_fields_uint64, PRINT_CPU_TIME_RAW}, {15, "DerivedExitCode", print_fields_str, PRINT_DERIVED_EC}, {10, "Elapsed", print_fields_time_from_secs, PRINT_ELAPSED}, {19, "Eligible", print_fields_date, PRINT_ELIGIBLE}, diff --git a/src/sacctmgr/event_functions.c b/src/sacctmgr/event_functions.c index 8936ff809..67f371e8e 100644 --- a/src/sacctmgr/event_functions.c +++ b/src/sacctmgr/event_functions.c @@ -665,7 +665,8 @@ extern int sacctmgr_list_event(int argc, char *argv[]) newend = time(NULL); field->print_routine( field, - (newend - event->period_start), + (uint64_t)(newend + - event->period_start), (curr_inx == field_count)); break; case PRINT_END: diff --git a/src/sacctmgr/qos_functions.c 
b/src/sacctmgr/qos_functions.c index 5ce45a4f9..90cda4d73 100644 --- a/src/sacctmgr/qos_functions.c +++ b/src/sacctmgr/qos_functions.c @@ -783,7 +783,7 @@ extern int sacctmgr_list_qos(int argc, char *argv[]) break; case PRINT_GRACE: field->print_routine( - field, qos->grace_time, + field, (uint64_t)qos->grace_time, (curr_inx == field_count)); break; case PRINT_GRPCM: diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index d60ebe449..f9c55805e 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -168,6 +168,7 @@ #define LONG_OPT_WAIT_ALL_NODES 0x150 #define LONG_OPT_EXPORT 0x151 #define LONG_OPT_REQ_SWITCH 0x152 +#define LONG_OPT_EXPORT_FILE 0x153 /*---- global variables, defined in opt.h ----*/ opt_t opt; @@ -365,6 +366,7 @@ static void _opt_default() opt.efname = NULL; opt.export_env = NULL; + opt.export_file = NULL; opt.get_user_env_time = -1; opt.get_user_env_mode = -1; opt.acctg_freq = -1; @@ -707,6 +709,7 @@ static struct option long_options[] = { {"cpu_bind", required_argument, 0, LONG_OPT_CPU_BIND}, {"exclusive", no_argument, 0, LONG_OPT_EXCLUSIVE}, {"export", required_argument, 0, LONG_OPT_EXPORT}, + {"export-file", required_argument, 0, LONG_OPT_EXPORT_FILE}, {"get-user-env", optional_argument, 0, LONG_OPT_GET_USER_ENV}, {"gres", required_argument, 0, LONG_OPT_GRES}, {"gid", required_argument, 0, LONG_OPT_GID}, @@ -1631,6 +1634,10 @@ static void _set_options(int argc, char **argv) xfree(opt.export_env); opt.export_env = xstrdup(optarg); break; + case LONG_OPT_EXPORT_FILE: + xfree(opt.export_file); + opt.export_file = xstrdup(optarg); + break; case LONG_OPT_REQ_SWITCH: pos_delimit = strstr(optarg,"@"); if (pos_delimit != NULL) { @@ -2796,7 +2803,7 @@ static void _usage(void) " [--network=type] [--mem-per-cpu=MB] [--qos=qos] [--gres=list]\n" " [--cpu_bind=...] [--mem_bind=...] 
[--reservation=name]\n" " [--switch=max-switches{@max-time-to-wait}]\n" -" [--export[=names]] executable [args...]\n"); +" [--export[=names]] [--export-file=file|fd] executable [args...]\n"); } static void _help(void) @@ -2815,6 +2822,7 @@ static void _help(void) " -D, --workdir=directory set working directory for batch script\n" " -e, --error=err file for batch script's standard error\n" " --export[=names] specify environment variables to export\n" +" --export-file=file|fd specify environment variables file or file descriptor to export\n" " --get-user-env load environment from local cluster\n" " --gid=group_id group ID to run job as (user root only)\n" " --gres=list required generic resources\n" diff --git a/src/sbatch/opt.h b/src/sbatch/opt.h index 556cae705..f61dd1adb 100644 --- a/src/sbatch/opt.h +++ b/src/sbatch/opt.h @@ -163,6 +163,7 @@ typedef struct sbatch_options { int get_user_env_time; /* --get-user-env[=timeout] */ int get_user_env_mode; /* --get-user-env=[S|L] */ char *export_env; /* --export */ + char *export_file; /* --export-file=file */ char *wckey; /* --wckey workload characterization key */ char *reservation; /* --reservation */ int ckpt_interval; /* --checkpoint (int minutes) */ diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c index e562f815e..07d23510e 100644 --- a/src/sbatch/sbatch.c +++ b/src/sbatch/sbatch.c @@ -137,6 +137,13 @@ int main(int argc, char *argv[]) (void) _set_rlimit_env(); } + /* + * if the environment is coming from a file, the + * environment at execution startup, must be unset. 
+ */ + if (opt.export_file != NULL) + env_unset_environment(); + _set_prio_process_env(); _set_spank_env(); _set_submit_dir_env(); @@ -389,6 +396,11 @@ static int _fill_job_desc_from_opts(job_desc_msg_t *desc) desc->warn_time = opt.warn_time; desc->environment = NULL; + if (opt.export_file) { + desc->environment = env_array_from_file(opt.export_file); + if (desc->environment == NULL) + exit(1); + } if (opt.export_env == NULL) { env_array_merge(&desc->environment, (const char **)environ); } else if (!strcasecmp(opt.export_env, "ALL")) { diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index b7ef4edbb..653999dcd 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -7916,8 +7916,9 @@ fini: /* If job update is successful and priority is calculated (not only * based upon job submit order), recalculate the job priority, since - * many factors of an update may affect priority considerations. */ - if ((error_code == SLURM_SUCCESS) && + * many factors of an update may affect priority considerations. 
+ * If job has a hold then do nothing */ + if ((error_code == SLURM_SUCCESS) && (job_ptr->priority > 1) && strcmp(slurmctld_conf.priority_type, "priority/basic")) _set_job_prio(job_ptr); diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index d39c3cd9c..fda9faf52 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -200,6 +200,10 @@ extern List build_job_queue(bool clear_start) fatal("list_iterator_create malloc failure"); while ((part_ptr = (struct part_record *) list_next(part_iterator))) { + job_ptr->part_ptr = part_ptr; + if (job_limits_check(&job_ptr) != + WAIT_NO_REASON) + continue; _job_queue_append(job_queue, job_ptr, part_ptr); } list_iterator_destroy(part_iterator); @@ -909,17 +913,14 @@ extern int test_job_dependency(struct job_record *job_ptr) } else if ((dep_ptr->job_ptr->magic != JOB_MAGIC) || (dep_ptr->job_ptr->job_id != dep_ptr->job_id)) { /* job is gone, dependency lifted */ - list_delete_item(depend_iter); clear_dep = true; } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER) { if (!IS_JOB_PENDING(dep_ptr->job_ptr)) { - list_delete_item(depend_iter); clear_dep = true; } else depends = true; } else if (dep_ptr->depend_type == SLURM_DEPEND_AFTER_ANY) { if (IS_JOB_FINISHED(dep_ptr->job_ptr)) { - list_delete_item(depend_iter); clear_dep = true; } else depends = true; @@ -927,7 +928,6 @@ extern int test_job_dependency(struct job_record *job_ptr) if (!IS_JOB_FINISHED(dep_ptr->job_ptr)) depends = true; else if (!IS_JOB_COMPLETE(dep_ptr->job_ptr)) { - list_delete_item(depend_iter); clear_dep = true; } else { failure = true; @@ -937,7 +937,6 @@ extern int test_job_dependency(struct job_record *job_ptr) if (!IS_JOB_FINISHED(dep_ptr->job_ptr)) depends = true; else if (IS_JOB_COMPLETE(dep_ptr->job_ptr)) { - list_delete_item(depend_iter); clear_dep = true; } else { failure = true; @@ -964,12 +963,12 @@ extern int test_job_dependency(struct job_record *job_ptr) } else failure = true; if (clear_dep) { - 
char *rmv_dep; - rmv_dep = xstrdup_printf(":%u", - dep_ptr->job_ptr->job_id); + char *rmv_dep = xstrdup_printf( + ":%u", dep_ptr->job_ptr->job_id); xstrsubstitute(job_ptr->details->dependency, rmv_dep, ""); xfree(rmv_dep); + list_delete_item(depend_iter); } } list_iterator_destroy(depend_iter); diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 62e9625d4..1ce915857 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -1078,6 +1078,7 @@ static void _preempt_jobs(List preemptee_job_list, int *error_code) job_ptr->job_id); } else { error("Invalid preempt_mode: %u", mode); + rc = SLURM_ERROR; } if (rc != SLURM_SUCCESS) { diff --git a/src/slurmctld/preempt.c b/src/slurmctld/preempt.c index 2b536bbfe..fa51a0517 100644 --- a/src/slurmctld/preempt.c +++ b/src/slurmctld/preempt.c @@ -206,7 +206,7 @@ extern int slurm_job_check_grace(struct job_record *job_ptr) { /* Preempt modes: -1 (unset), 0 (none), 1 (partition), 2 (QOS) */ static int preempt_mode = 0; - static time_t last_update_time = 0; + static time_t last_update_time = (time_t) 0; int rc = SLURM_SUCCESS; uint32_t grace_time = 0; @@ -225,6 +225,7 @@ extern int slurm_job_check_grace(struct job_record *job_ptr) else preempt_mode = 0; xfree(preempt_type); + last_update_time = slurmctld_conf.last_update; } if (preempt_mode == 1) @@ -235,9 +236,11 @@ extern int slurm_job_check_grace(struct job_record *job_ptr) grace_time = qos_ptr->grace_time; } - if (grace_time) + if (grace_time) { + debug("setting %u sec preemption grace time for job %u", + grace_time, job_ptr->job_id); _preempt_signal(job_ptr, grace_time); - else + } else rc = SLURM_ERROR; return rc; diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 8ab0ace34..79cd10561 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1045,6 +1045,7 @@ static void _slurm_rpc_get_priority_factors(slurm_msg_t *msg) slurm_msg_t response_msg; uid_t uid = 
g_slurm_auth_get_uid(msg->auth_cred, NULL); + req_msg->uid = uid; START_TIMER; debug2("Processing RPC: REQUEST_PRIORITY_FACTORS from uid=%d", uid); diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 098cecf30..4559fd50a 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -1807,7 +1807,7 @@ extern void show_resv(char **buffer_ptr, int *buffer_size, uid_t uid) fatal("malloc: list_iterator_create"); while ((resv_ptr = (slurmctld_resv_t *) list_next(iter))) { if ((slurmctld_conf.private_data & PRIVATE_DATA_RESERVATIONS) - && !validate_slurm_user(uid)) { + && !validate_operator(uid)) { int i = 0; for (i=0; i<resv_ptr->user_cnt; i++) { if (resv_ptr->user_list[i] == uid) diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index c3f4ac825..add5e0374 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -125,11 +125,11 @@ static char **_build_env(uint32_t jobid, uid_t uid, char *resv_id, char **spank_job_env, uint32_t spank_job_env_size); static void _delay_rpc(int host_inx, int host_cnt, int usec_per_rpc); static void _destroy_env(char **env); -static bool _slurm_authorized_user(uid_t uid); +static int _get_grouplist(uid_t my_uid, gid_t my_gid, int *ngroups, + gid_t **groups); static void _job_limits_free(void *x); static int _job_limits_match(void *x, void *key); static bool _job_still_running(uint32_t job_id); -static int _init_groups(uid_t my_uid, gid_t my_gid); static int _kill_all_active_steps(uint32_t jobid, int sig, bool batch); static int _step_limits_match(void *x, void *key); static int _terminate_all_steps(uint32_t jobid, bool batch); @@ -163,6 +163,7 @@ static int _run_epilog(uint32_t jobid, uid_t uid, char *resv_id, static bool _pause_for_job_completion(uint32_t jobid, char *nodes, int maxtime); +static bool _slurm_authorized_user(uid_t uid); static void _sync_messages_kill(kill_job_msg_t *req); static int _waiter_init (uint32_t jobid); static int _waiter_complete (uint32_t jobid); 
@@ -2362,24 +2363,27 @@ static void _rpc_pid2jid(slurm_msg_t *msg) } } +/* Creates an array of group ids and stores in it the list of groups + * that user my_uid belongs to. The pointer to the list is returned + * in groups and the count of gids in ngroups. The caller must free + * the group list array pointed to by groups */ static int -_init_groups(uid_t my_uid, gid_t my_gid) +_get_grouplist(uid_t my_uid, gid_t my_gid, int *ngroups, gid_t **groups) { char *user_name = uid_to_string(my_uid); - int rc; if (user_name == NULL) { error("sbcast: Could not find uid %ld", (long)my_uid); return -1; } - rc = initgroups(user_name, my_gid); - xfree(user_name); - if (rc) { - error("sbcast: Error in initgroups(%s, %ld): %m", - user_name, (long)my_gid); -// return -1; + *groups = (gid_t *) xmalloc(*ngroups * sizeof(gid_t)); + + if (getgrouplist(user_name, my_gid, *groups, ngroups) < 0) { + *groups = xrealloc(*groups, *ngroups * sizeof(gid_t)); + getgrouplist(user_name, my_gid, *groups, ngroups); } + xfree(user_name); return 0; } @@ -2426,6 +2430,8 @@ _rpc_file_bcast(slurm_msg_t *msg) { file_bcast_msg_t *req = msg->data; int fd, flags, offset, inx, rc; + int ngroups = 16; + gid_t *groups; uid_t req_uid = g_slurm_auth_get_uid(msg->auth_cred, NULL); gid_t req_gid = g_slurm_auth_get_gid(msg->auth_cred, NULL); pid_t child; @@ -2449,21 +2455,41 @@ _rpc_file_bcast(slurm_msg_t *msg) info("sbcast req_uid=%u fname=%s block_no=%u", req_uid, req->fname, req->block_no); + + if ((rc = _get_grouplist(req_uid, req_gid, &ngroups, &groups)) < 0) { + error("sbcast: getgrouplist(%u): %m", req_uid); + return rc; + } + child = fork(); if (child == -1) { error("sbcast: fork failure"); return errno; } else if (child > 0) { waitpid(child, &rc, 0); + xfree(groups); return WEXITSTATUS(rc); } /* The child actually performs the I/O and exits with * a return code, do not return! 
*/ - if (_init_groups(req_uid, req_gid) < 0) { - error("sbcast: initgroups(%u): %m", req_uid); - exit(errno); + + /*********************************************************************\ + * NOTE: It would be best to do an exec() immediately after the fork() + * in order to help prevent a possible deadlock in the child process + * due to locks being set at the time of the fork and being freed by + * the parent process, but not freed by the child process. Performing + * the work inline is done for simplicity. Note that the logging + * performed by error() should be safe due to the use of + * atfork_install_handlers() as defined in src/common/log.c. + * Change the code below with caution. + \*********************************************************************/ + if (setgroups(ngroups, groups) < 0) { + error("sbcast: uid: %u setgroups: %s", req_uid, + strerror(errno)); + exit(errno); } + if (setgid(req_gid) < 0) { error("sbcast: uid:%u setgid(%u): %s", req_uid, req_gid, strerror(errno)); diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index a8c4922f0..54f442b33 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -196,6 +196,7 @@ static void _setargs(slurmd_job_t *job); static void _random_sleep(slurmd_job_t *job); static int _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, int max_wait, char **env); +static void _unblock_signals(void); /* * Batch job management prototypes: @@ -997,7 +998,7 @@ job_manager(slurmd_job_t *job) io_close_task_fds(job); - xsignal_block(mgr_sigarray); + xsignal_block (mgr_sigarray); reattach_job = job; job->state = SLURMSTEPD_STEP_RUNNING; @@ -1192,7 +1193,7 @@ static int exec_wait_signal (struct exec_wait_info *e, slurmd_job_t *job) return (0); } -static void prepare_tty (slurmd_job_t *job, slurmd_task_info_t *task) +static void prepare_stdio (slurmd_job_t *job, slurmd_task_info_t *task) { #ifdef HAVE_PTY_H if (job->pty && (task->gtid == 0)) { @@ -1200,11 +1201,27 @@ 
static void prepare_tty (slurmd_job_t *job, slurmd_task_info_t *task) error("login_tty: %m"); else debug3("login_tty good"); + return; } #endif + io_dup_stdio(task); return; } +static void _unblock_signals (void) +{ + sigset_t set; + int i; + + for (i = 0; mgr_sigarray[i]; i++) { + /* eliminate pending signals, then set to default */ + xsignal(mgr_sigarray[i], SIG_IGN); + xsignal(mgr_sigarray[i], SIG_DFL); + } + sigemptyset(&set); + xsignal_set_mask (&set); +} + /* fork and exec N tasks */ static int @@ -1317,12 +1334,15 @@ _fork_all_tasks(slurmd_job_t *job) /* log_fini(); */ /* note: moved into exec_task() */ - xsignal_unblock(slurmstepd_blocked_signals); + _unblock_signals(); /* - * Setup tty before any setpgid() calls + * Need to setup stdio before setpgid() is called + * in case we are setting up a tty. (login_tty() + * must be called before setpgid() or it is + * effectively disabled). */ - prepare_tty (job, job->task[i]); + prepare_stdio (job, job->task[i]); /* * Block until parent notifies us that it is ok to diff --git a/src/slurmd/slurmstepd/task.c b/src/slurmd/slurmstepd/task.c index 925e67e4a..0137fdc7e 100644 --- a/src/slurmd/slurmstepd/task.c +++ b/src/slurmd/slurmstepd/task.c @@ -83,7 +83,6 @@ #include "src/common/xmalloc.h" #include "src/slurmd/slurmd/slurmd.h" -#include "src/slurmd/slurmstepd/io.h" #include "src/slurmd/slurmstepd/pdebug.h" #include "src/slurmd/slurmstepd/task.h" #include "src/slurmd/slurmstepd/ulimits.h" @@ -392,8 +391,6 @@ exec_task(slurmd_job_t *job, int i) } } - io_dup_stdio(task); - /* task-specific pre-launch activities */ if (spank_user_task (job, i) < 0) { diff --git a/src/slurmdbd/read_config.c b/src/slurmdbd/read_config.c index 388b4a65c..b46baca9c 100644 --- a/src/slurmdbd/read_config.c +++ b/src/slurmdbd/read_config.c @@ -233,6 +233,8 @@ extern int read_slurmdbd_conf(void) } s_p_get_string(&slurmdbd_conf->pid_file, "PidFile", tbl); s_p_get_string(&slurmdbd_conf->plugindir, "PluginDir", tbl); + + 
slurmdbd_conf->private_data = 0; /* default visible to all */ if (s_p_get_string(&temp_str, "PrivateData", tbl)) { if (strstr(temp_str, "account")) slurmdbd_conf->private_data diff --git a/src/sreport/resv_reports.c b/src/sreport/resv_reports.c index 319839d89..39205c641 100644 --- a/src/sreport/resv_reports.c +++ b/src/sreport/resv_reports.c @@ -515,7 +515,7 @@ extern int resv_utilization(int argc, char *argv[]) break; case PRINT_RESV_TIME: field->print_routine(field, - total_time, + (uint64_t)total_time, (curr_inx == field_count)); break; diff --git a/src/srun/multi_prog.c b/src/srun/multi_prog.c index 8965b714b..4c2035db8 100644 --- a/src/srun/multi_prog.c +++ b/src/srun/multi_prog.c @@ -111,18 +111,19 @@ _build_path(char* fname) } static void -_set_range(int low_num, int high_num, char *exec_name) +_set_range(int low_num, int high_num, char *exec_name, bool ignore_duplicates) { int i; for (i=low_num; i<=high_num; i++) { MPIR_PROCDESC *tv; tv = &MPIR_proctable[i]; - if (tv->executable_name) { - error("duplicate configuration for task %d ignored", - i); - } else + if (tv->executable_name == NULL) { tv->executable_name = xstrdup(exec_name); + } else if (!ignore_duplicates) { + error("duplicate configuration for task %d ignored", + i); + } } } @@ -136,7 +137,7 @@ _set_exec_names(char *ranks, char *exec_name, int ntasks) if ((ranks[0] == '*') && (ranks[1] == '\0')) { low_num = 0; high_num = ntasks - 1; - _set_range(low_num, high_num, exec_path); + _set_range(low_num, high_num, exec_path, true); return; } @@ -150,14 +151,14 @@ _set_exec_names(char *ranks, char *exec_name, int ntasks) if ((ptrptr[0] == ',') || (ptrptr[0] == '\0')) { low_num = MAX(0, num); high_num = MIN((ntasks-1), num); - _set_range(low_num, high_num, exec_path); + _set_range(low_num, high_num, exec_path, false); } else if (ptrptr[0] == '-') { low_num = MAX(0, num); num = strtol(ptrptr+1, &ptrptr, 10); if ((ptrptr[0] != ',') && (ptrptr[0] != '\0')) goto invalid; high_num = MIN((ntasks-1), num); - 
_set_range(low_num, high_num, exec_path); + _set_range(low_num, high_num, exec_path, false); } else goto invalid; if (ptrptr[0] == '\0') @@ -276,7 +277,8 @@ mpir_dump_proctable() } static int -_update_task_mask(int low_num, int high_num, int ntasks, bitstr_t *task_mask) +_update_task_mask(int low_num, int high_num, int ntasks, bitstr_t *task_mask, + bool ignore_duplicates) { int i; @@ -294,6 +296,8 @@ _update_task_mask(int low_num, int high_num, int ntasks, bitstr_t *task_mask) } for (i=low_num; i<=high_num; i++) { if (bit_test(task_mask, i)) { + if (ignore_duplicates) + continue; error("Duplicate record for task %d", i); return -1; } @@ -312,7 +316,8 @@ _validate_ranks(char *ranks, int ntasks, bitstr_t *task_mask) if (ranks[0] == '*' && ranks[1] == '\0') { low_num = 0; high_num = ntasks - 1; - return _update_task_mask(low_num, high_num, ntasks, task_mask); + return _update_task_mask(low_num, high_num, ntasks, task_mask, + true); } for (range = strtok_r(ranks, ",", &ptrptr); range != NULL; @@ -340,7 +345,8 @@ _validate_ranks(char *ranks, int ntasks, bitstr_t *task_mask) return -1; } - if (_update_task_mask(low_num, high_num, ntasks, task_mask)) + if (_update_task_mask(low_num, high_num, ntasks, task_mask, + false)) return -1; } return 0; diff --git a/src/sview/grid.c b/src/sview/grid.c index 3b7608c40..9ccb4d2f9 100644 --- a/src/sview/grid.c +++ b/src/sview/grid.c @@ -848,7 +848,8 @@ static int _grid_table_by_switch(button_processor_t *button_processor, /* This is needed to get the correct width of the grid window. * If it is not given then we get a really narrow window. */ gtk_table_set_row_spacing(button_processor->table, - (*button_processor->coord_y)-1, 1); + (*button_processor->coord_y)? 
+ ((*button_processor->coord_y)-1):0, 1); return rc; @@ -878,11 +879,11 @@ static int _grid_table_by_list(button_processor_t *button_processor, list_iterator_destroy(itr); rc = _add_button_to_list(NULL, button_processor); - /* This is needed to get the correct width of the grid - window. If it is not given then we get a really narrow - window. */ + /* This is needed to get the correct width of the grid window. + * If it is not given then we get a really narrow window. */ gtk_table_set_row_spacing(button_processor->table, - (*button_processor->coord_y)-1, 1); + (*button_processor->coord_y)? + ((*button_processor->coord_y)-1):0, 1); return rc; @@ -1502,7 +1503,7 @@ extern void put_buttons_in_table(GtkTable *table, List button_list) if (cluster_dims == 0) { /* This is needed to get the correct width of the grid window. * If it is not given then we get a really narrow window. */ - gtk_table_set_row_spacing(table, coord_y-1, 1); + gtk_table_set_row_spacing(table, coord_y?(coord_y-1):0, 1); } gtk_widget_show_all(GTK_WIDGET(table)); } @@ -1567,10 +1568,9 @@ extern int update_grid_table(GtkTable *table, List button_list, List node_list) } rc = _add_button_to_list(NULL, &button_processor); - /* This is needed to get the correct width of the grid - window. If it is not given then we get a really narrow - window. */ - gtk_table_set_row_spacing(table, coord_y-1, 1); + /* This is needed to get the correct width of the grid window. + * If it is not given then we get a really narrow window. 
*/ + gtk_table_set_row_spacing(table, coord_y?(coord_y-1):0, 1); end_it: list_iterator_destroy(itr); diff --git a/testsuite/expect/Makefile.am b/testsuite/expect/Makefile.am index 2b40d4a70..2f159b46f 100644 --- a/testsuite/expect/Makefile.am +++ b/testsuite/expect/Makefile.am @@ -191,6 +191,8 @@ EXTRA_DIST = \ test7.14 \ test7.14.prog1.c \ test7.14.prog2.c \ + test7.15 \ + test7.15.prog.c \ test8.1 \ test8.2 \ test8.3 \ diff --git a/testsuite/expect/Makefile.in b/testsuite/expect/Makefile.in index bb98c8481..dad72be94 100644 --- a/testsuite/expect/Makefile.in +++ b/testsuite/expect/Makefile.in @@ -471,6 +471,8 @@ EXTRA_DIST = \ test7.14 \ test7.14.prog1.c \ test7.14.prog2.c \ + test7.15 \ + test7.15.prog.c \ test8.1 \ test8.2 \ test8.3 \ diff --git a/testsuite/expect/README b/testsuite/expect/README index 0e20828eb..da2bc8579 100644 --- a/testsuite/expect/README +++ b/testsuite/expect/README @@ -316,6 +316,7 @@ test7.12 Test of slurm_job_step_stat() API call. test7.13 Verify the correct setting of a job's ExitCode test7.14 Verify the ability to modify the Derived Exit Code/String fields of a job record in the database +test7.15 Verify signal mask of tasks have no ignored signals. test8.# Test of Blue Gene specific functionality. 
diff --git a/testsuite/expect/test1.62 b/testsuite/expect/test1.62 index 6863526b8..072459705 100755 --- a/testsuite/expect/test1.62 +++ b/testsuite/expect/test1.62 @@ -95,6 +95,11 @@ proc run_gpu_test { gres_cnt } { print_header $test_id +if {[test_cray]} { + send_user "\nWARNING: This test is incompatible with Cray systems\n" + exit $exit_code +} + # # Test if gres/gpu is configured # diff --git a/testsuite/expect/test12.2 b/testsuite/expect/test12.2 index a12f8ad54..32f72192c 100755 --- a/testsuite/expect/test12.2 +++ b/testsuite/expect/test12.2 @@ -116,7 +116,7 @@ proc _get_mem {prog} { # Compute error in KB set diff_mem [expr $mem_used - $mem_size] set error_mem [expr abs($diff_mem)] - if {$error_mem > 4000} { + if {$error_mem > 4100} { send_user "\nFAILURE: sstat memory use discrepancy of $error_mem KB\n" send_user " Wanted $mem_size KB, got $mem_used KB\n" return 1 diff --git a/testsuite/expect/test7.15 b/testsuite/expect/test7.15 new file mode 100755 index 000000000..a8d56debd --- /dev/null +++ b/testsuite/expect/test7.15 @@ -0,0 +1,85 @@ +#!/usr/bin/expect +############################################################################ +# Purpose: Verify signal mask of tasks have no ignored signals. +# +# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR +# "FAILURE: ..." otherwise with an explanation of the failure, OR +# anything else indicates a failure mode that must be investigated. +############################################################################ +# Copyright (C) 2010 Lawrence Livermore National Security. +# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). +# CODE-OCEC-09-009. All rights reserved. +# +# This file is part of SLURM, a resource management program. +# For details, see <http://www.schedmd.com/slurmdocs/>. +# Please also read the included file: DISCLAIMER. 
+# +# SLURM is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) +# any later version. +# +# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along +# with SLURM; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +############################################################################ +source ./globals + +set test_id "7.15" +set exit_code 0 +set file_prog "test$test_id.prog" + +print_header $test_id + +# +# Delete left-over programs and rebuild them. +# +exec $bin_rm -f $file_prog +exec $bin_cc -O -o $file_prog ${file_prog}.c + +# +# Run on multiple nodes in case the failure of this test +# is intermittent. +# +if { $partition == "" } { + if {[info exists env(SLURM_PARTITION)] } { + set partition $env(SLURM_PARTITION) + } else { + set partition [default_partition] + } +} +set nnodes [available_nodes $partition] + +# +# Run the test_prog to ensure that no signals are blocked by +# default for the job. +# +set timeout $max_job_delay +set srun_pid [exp_spawn $srun -N$nnodes -p $partition ./$file_prog] +expect { + -re "Signal (.*) is ignored!" 
{ + send_user "FAILURE: At least one signal is ignored!\n" + set exit_code 1 + } + timeout { + send_user "\nFAILURE: srun not responding" + slow_kill $srun_pid + set exit_code 1 + } + eof { + catch wait result + set exit_code [lindex $result 3] + } +} + +if {$exit_code == 0} { + send_user "\nSUCCESS\n" + exec $bin_rm -f $file_prog +} + +exit $exit_code diff --git a/testsuite/expect/test7.15.prog.c b/testsuite/expect/test7.15.prog.c new file mode 100644 index 000000000..d1e379f76 --- /dev/null +++ b/testsuite/expect/test7.15.prog.c @@ -0,0 +1,30 @@ +#include <stdio.h> +#include <signal.h> +#include <unistd.h> +#include <string.h> + +int main (int ac, char **av) +{ + char hostname[1024]; + int i, rc = 0; + struct sigaction act; + + if (gethostname (hostname, sizeof (hostname)) < 0) { + fprintf (stderr, "Failed to get hostname on this node\n"); + strcpy (hostname, "Unknown"); + } + for (i = 1; i < SIGRTMAX; i++) { + sigaction (i, NULL, &act); + if (act.sa_handler == SIG_IGN) { + fprintf (stderr, "%s: Signal %d is ignored!\n", + hostname, i); + rc = 1; + } else if (act.sa_handler != SIG_DFL) { + fprintf (stderr, + "%s: Signal %d has handler function!\n", + hostname, i); + rc = 1; + } + } + return (rc); +} -- GitLab