diff --git a/META b/META index aef01630c7ce89b4abcef4a3b264c89a26cd79c4..d9d11cc0e5c9e0efa9ed81e75338f82b856c7c75 100644 --- a/META +++ b/META @@ -3,9 +3,9 @@ Api_revision: 0 Major: 2 Meta: 1 - Micro: 9 + Micro: 10 Minor: 1 Name: slurm Release: 1 Release_tags: dist - Version: 2.1.9 + Version: 2.1.10 diff --git a/NEWS b/NEWS index 6e070ec89239bca28ae108403926626790a46008..bb56de769ba045a01b91a06df5edfb1cfc363bcf 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,65 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. + +* Changes in SLURM 2.1.11 +========================= + +* Changes in SLURM 2.1.10 +========================= + -- Fix memory leak in sched/builtin plugin. + -- Fixed sbatch to work correctly when no nodes are specified, but + --ntasks-per-node is. + -- Make sure account and wckey for a job are lower case before inserting into + accounting. + -- Added note to squeue documentation about --jobs option displaying jobs + even if they are on hidden partitions. + -- Fix srun to work correctly with --uid when getting an allocation + and creating a step. + -- Fix for when removing a limit from a users association inside the + fairshare tree the parents limit is now inherited automatically in + the slurmctld. Previously the slurmctld would have to be restarted. + This problem only exists when setting a users association limit to -1. + -- Patch from Matthieu Hautreux (CEA) dealing with possible overflows that + could come up with the select/cons_res plugin with uint32_t's being treated + as uint16_t's. + -- Correct logic for creating a reservation with a Duration=Infinite (used to + set reservation end time in the past). + -- Correct logic for creating a reservation that properly handles the OVERLAP + and IGNORE_JOBS flags (flags were ignored under some conditions). + -- Fixed a fair-share calculation bug in the priority/multifactor plugin. + -- Make sure a user entry in the database that was previously deleted is + restored clean when added back, i.e. remove admin privileges previously + given. + -- BLUEGENE - Future start time is set correctly when eligible time for a job + is in the future, but the job can physically run earlier. + -- Updated Documentation for sacctmgr for Wall and CPUMin options stating when + the limit is reached running jobs will be killed. + -- Fix deadlock issue in the slurmctld when lowering limits in accounting to + lower than that of pending jobs. + -- Fix bug in salloc, sbatch and srun that could under some conditions process + the --threads-per-core, --cores-per-socket and --sockets-per-node options + improperly. + -- Fix bug in select/cons_res with memory management plus job preemption with + job removal (e.g. requeue) which under some conditions failed to preempt + jobs. + -- Fix deadlock potential when using qos and associations in the slurmctld. + -- Update documentation to state --ntasks-per-* is for a maximum value + instead of an absolute. + -- Get ReturnToService=2 working for front-end configurations (e.g. Cray or + BlueGene). + -- Fix issue when deleting a QOS on a pending/running job which previously + would create possible memory corruption. + -- Do not make a non-responding node available for use after running + "scontrol update nodename=<name> state=resume". Wait for node to respond + before use. + -- Added slurm_xlator.h to jobacct_gather plugins so they resolve symbols + correctly when linking to the slurm api. + -- You can now update a jobs QOS from scontrol. 
Previously you could only do + this from sview. + -- BLUEGENE - Fixed bug where if running in non-dynamic mode sometimes the + start time returned for a job when using test-only would not be correct. + * Changes in SLURM 2.1.9 ======================== -- In select/linear - Fix logic to prevent over-subscribing memory with shared @@ -36,7 +95,7 @@ documents those changes that are of interest to users and admins. slurmctld's running for a single cluster, which should rarely if ever happen. -- Fixed sacct -c option. - -- Critical bug fix in sched/backfill plugin that caused in memory corruption. + -- Critical bug fix in sched/backfill plugin that caused memory corruption. * Changes in SLURM 2.1.8 ======================== @@ -5074,4 +5133,4 @@ documents those changes that are of interest to users and admins. -- Change directory to /tmp in slurmd if daemonizing. -- Logfiles are reopened on reconfigure. -$Id: NEWS 20444 2010-06-08 22:20:32Z jette $ +$Id: NEWS 20750 2010-07-16 21:32:31Z da $ diff --git a/doc/man/man1/sacctmgr.1 b/doc/man/man1/sacctmgr.1 index 7431f49c81335c1ba04e680e1c4880035e926b7f..f908de3bb8c62f81332726bdc4229393ab3c97fa 100644 --- a/doc/man/man1/sacctmgr.1 +++ b/doc/man/man1/sacctmgr.1 @@ -223,15 +223,16 @@ Maximum number of CPU minutes running jobs are able to be allocated in aggregate for this association and all association which are children of this association. To clear a previously set value use the modify command with a new -value of \-1. (NOTE: This limit is not enforced if set on the root +value of \-1. +NOTE: This limit is not enforced if set on the root association of a cluster. So even though it may appear in sacctmgr output it will not be enforced. NOTE: This limit only applys when using the Priority Multifactor plugin. The time is decayed using the value of PriorityDecayHalfLife -or PriorityUsageResetPeriod as set in the slurm.conf. Currently when -this limit is reached jobs will be delayed until they are able to run -inside the limit. No jobs will be killed if this limit is reached, -this will change in future versions of SLURM.) +or PriorityUsageResetPeriod as set in the slurm.conf. When this limit +is reached all associated jobs running will be killed and all future jobs +submitted with associations in the group will be delayed until they +are able to run inside the limit. .TP \fIGrpCPUs\fP=<max cpus> @@ -266,15 +267,15 @@ To clear a previously set value use the modify command with a new value of \-1. Maximum wall clock time running jobs are able to be allocated in aggregate for this association and all association which are children of this association. To clear a previously set value use the modify command with a new value of \-1. -(NOTE: This limit is not enforced if set on the root +NOTE: This limit is not enforced if set on the root association of a cluster. So even though it may appear in sacctmgr output it will not be enforced. NOTE: This limit only applys when using the Priority Multifactor plugin. The time is decayed using the value of PriorityDecayHalfLife -or PriorityUsageResetPeriod as set in the slurm.conf. Currently when -this limit is reached jobs will be delayed until they are able to run -inside the limit. No jobs will be killed if this limit is reached, -this will change in future versions of SLURM.) +or PriorityUsageResetPeriod as set in the slurm.conf. 
When this limit +is reached all associated jobs running will be killed and all future jobs +submitted with associations in the group will be delayed until they +are able to run inside the limit. .TP \fIMaxCPUMins\fP=<max cpu minutes> @@ -743,10 +744,10 @@ To clear a previously set value use the modify command with a new value of \-1. NOTE: This limit only applys when using the Priority Multifactor plugin. The time is decayed using the value of PriorityDecayHalfLife -or PriorityUsageResetPeriod as set in the slurm.conf. Currently when -this limit is reached jobs will be delayed until they are able to run -inside the limit. No jobs will be killed if this limit is reached, -this will change in future versions of SLURM.) +or PriorityUsageResetPeriod as set in the slurm.conf. When this limit +is reached all associated jobs running will be killed and all future jobs +submitted with this QOS will be delayed until they are able to run +inside the limit. .TP \fIGrpCPUs\fP @@ -781,10 +782,10 @@ this QOS. To clear a previously set value use the modify command with a new value of \-1. NOTE: This limit only applys when using the Priority Multifactor plugin. The time is decayed using the value of PriorityDecayHalfLife -or PriorityUsageResetPeriod as set in the slurm.conf. Currently when -this limit is reached jobs will be delayed until they are able to run -inside the limit. No jobs will be killed if this limit is reached, -this will change in future versions of SLURM.) +or PriorityUsageResetPeriod as set in the slurm.conf. When this limit +is reached all associated jobs running will be killed and all future jobs +submitted with this QOS will be delayed until they are able to run +inside the limit. .TP \fIMaxCPUMins\fP diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index 414d438181413483dcc276b75eedc0d47ab183cc..87c25b64b8cfb3db1b1b3278cb64e7ba2b734220 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -657,7 +657,7 @@ ignored if \fISchedulerType=sched/wiki\fR or .TP \fB\-\-ntasks\-per\-core\fR=<\fIntasks\fR> -Request that \fIntasks\fR be invoked on each core. +Request the maximum \fIntasks\fR be invoked on each core. Meant to be used with the \fB\-\-ntasks\fR option. Related to \fB\-\-ntasks\-per\-node\fR except at the core level instead of the node level. Masks will automatically be generated @@ -669,7 +669,7 @@ NOTE: This option is not supported unless .TP \fB\-\-ntasks\-per\-socket\fR=<\fIntasks\fR> -Request that \fIntasks\fR be invoked on each socket. +Request the maximum \fIntasks\fR be invoked on each socket. Meant to be used with the \fB\-\-ntasks\fR option. Related to \fB\-\-ntasks\-per\-node\fR except at the socket level instead of the node level. Masks will automatically be generated @@ -681,7 +681,7 @@ NOTE: This option is not supported unless .TP \fB\-\-ntasks\-per\-node\fR=<\fIntasks\fR> -Request that \fIntasks\fR be invoked on each node. +Request the maximum \fIntasks\fR be invoked on each node. Meant to be used with the \fB\-\-nodes\fR option. This is related to \fB\-\-cpus\-per\-task\fR=\fIncpus\fR, but does not require knowledge of the actual number of cpus on diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index c62b53894694dd48faf047c40a2bd58cc175ce67..3b68213542f048566b410baa5074542eda1be2f3 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -696,7 +696,7 @@ behavior on the cluster. .TP \fB\-\-ntasks\-per\-core\fR=<\fIntasks\fR> -Request that \fIntasks\fR be invoked on each core. 
+Request the maximum \fIntasks\fR be invoked on each core. Meant to be used with the \fB\-\-ntasks\fR option. Related to \fB\-\-ntasks\-per\-node\fR except at the core level instead of the node level. Masks will automatically be generated @@ -708,7 +708,7 @@ NOTE: This option is not supported unless .TP \fB\-\-ntasks\-per\-socket\fR=<\fIntasks\fR> -Request that \fIntasks\fR be invoked on each socket. +Request the maximum \fIntasks\fR be invoked on each socket. Meant to be used with the \fB\-\-ntasks\fR option. Related to \fB\-\-ntasks\-per\-node\fR except at the socket level instead of the node level. Masks will automatically be generated @@ -720,7 +720,7 @@ NOTE: This option is not supported unless .TP \fB\-\-ntasks\-per\-node\fR=<\fIntasks\fR> -Request that \fIntasks\fR be invoked on each node. +Request the maximum \fIntasks\fR be invoked on each node. Meant to be used with the \fB\-\-nodes\fR option. This is related to \fB\-\-cpus\-per\-task\fR=\fIncpus\fR, but does not require knowledge of the actual number of cpus on diff --git a/doc/man/man1/squeue.1 b/doc/man/man1/squeue.1 index d528284d20acaa63c1dd88995c475fe46f73abf8..8b7adc7bd422c7ed12438a7a2cdd5bc83637fdef 100644 --- a/doc/man/man1/squeue.1 +++ b/doc/man/man1/squeue.1 @@ -48,6 +48,8 @@ By default, prints a time stamp with the header. Requests a comma separated list of job ids to display. Defaults to all jobs. The \fB\-\-jobs=<job_id_list>\fR option may be used in conjunction with the \fB\-\-steps\fR option to print step information about specific jobs. +Note: If a list of job ids is provided, the jobs are displayed even if +they are on hidden partitions. .TP \fB\-l\fR, \fB\-\-long\fR diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 9524159ef7416294a22f0a9a36f6aba55e3e94d3..100f124599f5b0f09a400af163b9c42ab9f9a1e4 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -759,7 +759,7 @@ ignored if \fISchedulerType=sched/wiki\fR or .TP \fB\-\-ntasks\-per\-core\fR=<\fIntasks\fR> -Request that \fIntasks\fR be invoked on each core. +Request the maximum \fIntasks\fR be invoked on each core. Meant to be used with the \fB\-\-ntasks\fR option. Related to \fB\-\-ntasks\-per\-node\fR except at the core level instead of the node level. Masks will automatically be generated @@ -771,7 +771,7 @@ NOTE: This option is not supported unless .TP \fB\-\-ntasks\-per\-socket\fR=<\fIntasks\fR> -Request that \fIntasks\fR be invoked on each socket. +Request the maximum \fIntasks\fR be invoked on each socket. Meant to be used with the \fB\-\-ntasks\fR option. Related to \fB\-\-ntasks\-per\-node\fR except at the socket level instead of the node level. Masks will automatically be generated @@ -783,7 +783,7 @@ NOTE: This option is not supported unless .TP \fB\-\-ntasks\-per\-node\fR=<\fIntasks\fR> -Request that \fIntasks\fR be invoked on each node. +Request the maximum \fIntasks\fR be invoked on each node. Meant to be used with the \fB\-\-nodes\fR option. This is related to \fB\-\-cpus\-per\-task\fR=\fIncpus\fR, but does not require knowledge of the actual number of cpus on @@ -928,7 +928,7 @@ with the \fB\-\-relative\fR option, a warning message will be printed and the \fB\-\-relative\fR option will take precedence. .TP -\fB\-\-resv-ports\fR +\fB\-\-resv\-ports\fR Reserve communication ports for this job. Used for OpenMPI. 
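(Editorial aside, not part of the patch.) The salloc.1, sbatch.1 and srun.1 hunks above reword \-\-ntasks\-per\-node, \-\-ntasks\-per\-socket and \-\-ntasks\-per\-core as a maximum rather than an exact per-unit count. A minimal C sketch of that cap-style semantics follows; the helper name tasks_on_node() and the sample numbers are hypothetical and do not come from SLURM's real task-layout code.

	/* Illustrative sketch only -- hypothetical helper, not SLURM source.
	 * Shows --ntasks-per-node acting as an upper bound rather than an
	 * exact per-node task count. */
	#include <stdio.h>

	static int tasks_on_node(int ntasks, int nnodes, int ntasks_per_node)
	{
		/* even spread of tasks across nodes, rounded up */
		int even = (ntasks + nnodes - 1) / nnodes;

		/* the option is a cap: never place more than
		 * ntasks_per_node tasks on one node */
		if (ntasks_per_node && (even > ntasks_per_node))
			return ntasks_per_node;
		return even;
	}

	int main(void)
	{
		/* e.g. "srun -n 10 -N 4 --ntasks-per-node=2" would place
		 * at most 2 tasks on a node, not exactly 10/4 rounded up */
		printf("%d tasks on the first node\n", tasks_on_node(10, 4, 2));
		return 0;
	}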
@@ -1415,6 +1415,9 @@ Same as \fB\-l, \-\-label\fR \fBSLURM_MEM_BIND\fR Same as \fB\-\-mem_bind\fR .TP +\fBSLURM_MPI_TYPE\fR +Same as \fB\-\-mpi\fR +.TP \fBSLURM_NETWORK\fR Same as \fB\-\-network\fR .TP @@ -1457,6 +1460,9 @@ Same as \fB\-D, \-\-chdir=\fR \fBSLURM_RESTART_DIR\fR Same as \fB\-\-restart\-dir\fR .TP +\fBSLURM_RESV_PORTS\fR +Same as \fB\-\-resv\-ports\fR +.TP \fBSLURM_SIGNAL\fR Same as \fB\-\-signal\fR .TP diff --git a/slurm.spec b/slurm.spec index 40eb411235de739f3309577b550ea8d210eeccc2..dd82601f3c64fb835b8b6177cfd815e1c0650035 100644 --- a/slurm.spec +++ b/slurm.spec @@ -83,14 +83,14 @@ %endif Name: slurm -Version: 2.1.9 +Version: 2.1.10 Release: 1%{?dist} Summary: Simple Linux Utility for Resource Management License: GPL Group: System Environment/Base -Source: slurm-2.1.9.tar.bz2 +Source: slurm-2.1.10.tar.bz2 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release} URL: https://computing.llnl.gov/linux/slurm/ @@ -352,7 +352,7 @@ Gives the ability for SLURM to use Berkeley Lab Checkpoint/Restart ############################################################################# %prep -%setup -n slurm-2.1.9 +%setup -n slurm-2.1.10 %build %configure --program-prefix=%{?_program_prefix:%{_program_prefix}} \ diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index 6a8ac27e018da988248cc23fec2218162380d39c..96cd17121d81c7825ce7264442ea6b578965597e 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -61,6 +61,7 @@ static char *assoc_mgr_cluster_name = NULL; static int setup_childern = 0; void (*remove_assoc_notify) (acct_association_rec_t *rec) = NULL; +void (*remove_qos_notify) (acct_qos_rec_t *rec) = NULL; pthread_mutex_t assoc_mgr_association_lock = PTHREAD_MUTEX_INITIALIZER; @@ -802,6 +803,8 @@ extern int assoc_mgr_init(void *db_conn, assoc_init_args_t *args) enforce = args->enforce; if(args->remove_assoc_notify) remove_assoc_notify = args->remove_assoc_notify; + if(args->remove_qos_notify) + remove_qos_notify = args->remove_qos_notify; cache_level = args->cache_level; assoc_mgr_refresh_lists(db_conn, args); } @@ -1685,6 +1688,10 @@ extern int assoc_mgr_update_assocs(acct_update_object_t *update) if(!assoc_mgr_association_list) return SLURM_SUCCESS; + /* Since we could possibly need the qos lock handle it + now to avoid deadlock. Always do QOS first. 
+ */ + slurm_mutex_lock(&assoc_mgr_qos_lock); slurm_mutex_lock(&assoc_mgr_association_lock); itr = list_iterator_create(assoc_mgr_association_list); while((object = list_pop(update->objects))) { @@ -1814,7 +1821,11 @@ extern int assoc_mgr_update_assocs(acct_update_object_t *update) rec->valid_qos); rec->valid_qos = bit_alloc(g_qos_count); - } + } else + bit_nclear(rec->valid_qos, 0, + (bit_size(rec-> + valid_qos) + - 1)); set_qos_bitstr_from_list( rec->valid_qos, rec->qos_list); } @@ -1822,9 +1833,7 @@ extern int assoc_mgr_update_assocs(acct_update_object_t *update) if(!slurmdbd_conf && !parents_changed) { debug("updating assoc %u", rec->id); - slurm_mutex_lock(&assoc_mgr_qos_lock); log_assoc_rec(rec, assoc_mgr_qos_list); - slurm_mutex_unlock(&assoc_mgr_qos_lock); } break; case ACCT_ADD_ASSOC: @@ -1942,18 +1951,17 @@ extern int assoc_mgr_update_assocs(acct_update_object_t *update) } if(setup_childern) { /* Now normalize the static shares */ - slurm_mutex_lock(&assoc_mgr_qos_lock); list_iterator_reset(itr); while((object = list_next(itr))) { _normalize_assoc_shares(object); log_assoc_rec(object, assoc_mgr_qos_list); } - slurm_mutex_unlock(&assoc_mgr_qos_lock); } } list_iterator_destroy(itr); slurm_mutex_unlock(&assoc_mgr_association_lock); + slurm_mutex_unlock(&assoc_mgr_qos_lock); /* This needs to happen outside of the assoc_mgr_association_lock */ @@ -2173,11 +2181,16 @@ extern int assoc_mgr_update_qos(acct_update_object_t *update) acct_association_rec_t *assoc = NULL; int rc = SLURM_SUCCESS; bool resize_qos_bitstr = 0; + List remove_list = NULL; if(!assoc_mgr_qos_list) return SLURM_SUCCESS; slurm_mutex_lock(&assoc_mgr_qos_lock); + /* Since we could possibly need the association lock handle it + now to avoid deadlock. Always do QOS first. + */ + slurm_mutex_lock(&assoc_mgr_association_lock); itr = list_iterator_create(assoc_mgr_qos_list); while((object = list_pop(update->objects))) { list_iterator_reset(itr); @@ -2269,7 +2282,22 @@ extern int assoc_mgr_update_qos(acct_update_object_t *update) break; case ACCT_REMOVE_QOS: - if(rec) + if(!rec) { + //rc = SLURM_ERROR; + break; + } + + if(remove_qos_notify) { + /* since there are some deadlock + issues while inside our lock here + we have to process a notify later + */ + if(!remove_list) + remove_list = list_create( + destroy_acct_qos_rec); + list_remove(itr); + list_append(remove_list, rec); + } else list_delete_item(itr); if(!assoc_mgr_association_list) @@ -2277,7 +2305,6 @@ extern int assoc_mgr_update_qos(acct_update_object_t *update) /* Remove this qos from all the associations on this cluster. 
*/ - slurm_mutex_lock(&assoc_mgr_association_lock); assoc_itr = list_iterator_create( assoc_mgr_association_list); while((assoc = list_next(assoc_itr))) { @@ -2288,7 +2315,6 @@ extern int assoc_mgr_update_qos(acct_update_object_t *update) bit_clear(assoc->valid_qos, object->id); } list_iterator_destroy(assoc_itr); - slurm_mutex_unlock(&assoc_mgr_association_lock); break; default: @@ -2310,7 +2336,6 @@ extern int assoc_mgr_update_qos(acct_update_object_t *update) g_qos_count); } if(assoc_mgr_association_list) { - slurm_mutex_lock(&assoc_mgr_association_lock); assoc_itr = list_iterator_create( assoc_mgr_association_list); while((assoc = list_next(assoc_itr))) { @@ -2321,13 +2346,25 @@ extern int assoc_mgr_update_qos(acct_update_object_t *update) g_qos_count); } list_iterator_destroy(assoc_itr); - slurm_mutex_unlock(&assoc_mgr_association_lock); } } list_iterator_destroy(itr); + slurm_mutex_unlock(&assoc_mgr_association_lock); slurm_mutex_unlock(&assoc_mgr_qos_lock); + /* This needs to happen outside of the + assoc_mgr_association_lock */ + if(remove_list) { + itr = list_iterator_create(remove_list); + + while((rec = list_next(itr))) + remove_qos_notify(rec); + + list_iterator_destroy(itr); + list_destroy(remove_list); + } + return rc; } @@ -2708,13 +2745,16 @@ extern int load_assoc_usage(char *state_save_location) if(assoc->id == assoc_id) break; + /* We want to do this all the way up to and including + root. This way we can keep track of how much usage + has occured on the entire system and use that to + normalize against. + */ while(assoc) { assoc->grp_used_wall += grp_used_wall; assoc->usage_raw += (long double)usage_raw; assoc = assoc->parent_assoc_ptr; - if(assoc == assoc_mgr_root_assoc) - break; } list_iterator_reset(itr); } diff --git a/src/common/assoc_mgr.h b/src/common/assoc_mgr.h index a57f5a36abfe7893957cfe8adadd39ddea5465bb..1e2a723d0835ea660fac3a64594b1cf24c1cff29 100644 --- a/src/common/assoc_mgr.h +++ b/src/common/assoc_mgr.h @@ -61,6 +61,7 @@ typedef struct { uint16_t cache_level; uint16_t enforce; void (*remove_assoc_notify) (acct_association_rec_t *rec); + void (*remove_qos_notify) (acct_qos_rec_t *rec); } assoc_init_args_t; extern List assoc_mgr_association_list; diff --git a/src/common/slurm_accounting_storage.c b/src/common/slurm_accounting_storage.c index 372e3d45d661c8eba14626985c09e5a332d4fc23..be473b46d9d9b92c9874e476a8d8c58f08a5e772 100644 --- a/src/common/slurm_accounting_storage.c +++ b/src/common/slurm_accounting_storage.c @@ -7910,6 +7910,12 @@ extern void log_assoc_rec(acct_association_rec_t *assoc_ptr, List qos_list) if(temp_char) { debug2(" Qos : %s", temp_char); xfree(temp_char); + if(assoc_ptr->valid_qos) { + temp_char = get_qos_complete_str_bitstr( + qos_list, assoc_ptr->valid_qos); + debug3(" Valid Qos : %s", temp_char); + xfree(temp_char); + } } } else { debug2(" Qos : %s", "Normal"); diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index fb382893ef533bba8d2a730cfa0579ab2be4e398..2e13bdaf0c011fec459f520a8cbaf75d8da5a37b 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -76,7 +76,6 @@ static void _slurm_free_reserve_info_members (reserve_info_t * part); static void _free_all_step_info (job_step_info_response_msg_t *msg); static void _slurm_free_job_step_info_members (job_step_info_t * msg); -static void _make_lower(char *change); /* * slurm_msg_t_init - initialize a slurm message @@ -172,7 +171,7 @@ extern int slurm_addto_char_list(List char_list, char *names) else count++; - 
_make_lower(name); + xstrtolower(name); list_append(char_list, name); list_iterator_reset(itr); @@ -206,7 +205,7 @@ extern int slurm_addto_char_list(List char_list, char *names) else count++; - _make_lower(name); + xstrtolower(name); list_append(char_list, name); } list_iterator_destroy(itr); @@ -2007,20 +2006,6 @@ void inline slurm_free_job_notify_msg(job_notify_msg_t * msg) } } -/* make everything lowercase should not be called on static char *'s */ -static void _make_lower(char *change) -{ - if(change) { - int j = 0; - while(change[j]) { - char lower = tolower(change[j]); - if(lower != change[j]) - change[j] = lower; - j++; - } - } -} - /* * Sanitize spank_job_env by prepending "SPANK_" to all entries, * thus rendering them harmless in environment of scripts and diff --git a/src/common/xstring.c b/src/common/xstring.c index 7ae5df560f6ddef97594b552a32543f15b50fe9d..675c105324324976c5bac3b2c3fdcc8d0e81678c 100644 --- a/src/common/xstring.c +++ b/src/common/xstring.c @@ -86,6 +86,7 @@ strong_alias(_xstrsubstitute, slurm_xstrsubstitute); strong_alias(xstrstrip, slurm_xstrstrip); strong_alias(xshort_hostname, slurm_xshort_hostname); strong_alias(xstring_is_whitespace, slurm_xstring_is_whitespace); +strong_alias(xstrtolower, slurm_xstrtolower); /* * Ensure that a string has enough space to add 'needed' characters. @@ -487,3 +488,16 @@ bool xstring_is_whitespace(const char *str) return true; } +/* + * If str make everything lowercase. Should not be called on static char *'s + */ +void xstrtolower(char *str) +{ + if(str) { + int j = 0; + while(str[j]) { + str[j] = tolower(str[j]); + j++; + } + } +} diff --git a/src/common/xstring.h b/src/common/xstring.h index 7c7147f615134cb9c0cb0d271692ba6b384a92cd..458e5909c02435e2c6a255231c8ec4d7b067c7f4 100644 --- a/src/common/xstring.h +++ b/src/common/xstring.h @@ -149,4 +149,9 @@ char *xshort_hostname(void); */ bool xstring_is_whitespace(const char *str); +/* + * If str make everything lowercase. 
Should not be called on static char *'s + */ +void xstrtolower(char *str); + #endif /* !_XSTRING_H */ diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index c47073e7b4a28a96f6b5271dc3cf90c21f5b1af9..e460e546b241dec16700590182a1b17e81c41310 100644 --- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -3919,14 +3919,17 @@ extern int acct_storage_p_add_users(mysql_conn_t *mysql_conn, uint32_t uid, xstrfmtcat(vals, ", %u", object->admin_level); xstrfmtcat(extra, ", admin_level=%u", object->admin_level); - } + } else + xstrfmtcat(extra, ", admin_level=%u", + ACCT_ADMIN_NONE); if(object->default_wckey) { xstrcat(cols, ", default_wckey"); xstrfmtcat(vals, ", \"%s\"", object->default_wckey); xstrfmtcat(extra, ", default_wckey=\"%s\"", object->default_wckey); - } + } else + xstrcat(extra, ", default_wckey=\"\""); query = xstrdup_printf( "insert into %s (%s) values (%s) " @@ -5540,8 +5543,8 @@ extern List acct_storage_p_modify_associations( time_t now = time(NULL); char *user_name = NULL; int set = 0, i = 0, is_admin=0; - MYSQL_RES *result = NULL; - MYSQL_ROW row; + MYSQL_RES *result = NULL, *result2 = NULL; + MYSQL_ROW row, row2; acct_user_rec_t user; char *tmp_char1=NULL, *tmp_char2=NULL; int set_qos_vals = 0; @@ -5572,6 +5575,18 @@ extern List acct_storage_p_modify_associations( MASSOC_COUNT }; + enum { + ASSOC2_REQ_PARENT_ID, + ASSOC2_REQ_MJ, + ASSOC2_REQ_MSJ, + ASSOC2_REQ_MCPJ, + ASSOC2_REQ_MNPJ, + ASSOC2_REQ_MWPJ, + ASSOC2_REQ_MCMPJ, + ASSOC2_REQ_QOS, + ASSOC2_REQ_DELTA_QOS, + }; + if(!assoc_cond || !assoc) { error("we need something to change"); return NULL; @@ -5677,24 +5692,26 @@ extern List acct_storage_p_modify_associations( */ uint32_t lft = atoi(row[MASSOC_LFT]); uint32_t rgt = atoi(row[MASSOC_RGT]); + char *account; + + /* Here we want to see if the person + * is a coord of the parent account + * since we don't want him to be able + * to alter the limits of the account + * he is directly coord of. They + * should be able to alter the + * sub-accounts though. If no parent account + * that means we are talking about a user + * association so account is really the parent + * of the user a coord can change that all day long. + */ + if(row[MASSOC_PACCT][0]) + account = row[MASSOC_PACCT]; + else + account = row[MASSOC_ACCT]; if(!is_admin) { acct_coord_rec_t *coord = NULL; - char *account = row[MASSOC_ACCT]; - - /* Here we want to see if the person - * is a coord of the parent account - * since we don't want him to be able - * to alter the limits of the account - * he is directly coord of. They - * should be able to alter the - * sub-accounts though. If no parent account - * that means we are talking about a user - * association so account is really the parent - * of the user a coord can change that all day long. - */ - if(row[MASSOC_PACCT][0]) - account = row[MASSOC_PACCT]; if(!user.coord_accts) { // This should never // happen @@ -5800,6 +5817,50 @@ extern List acct_storage_p_modify_associations( xstrfmtcat(name_char, " || id=%s", row[MASSOC_ID]); } + /* If there is a variable cleared here we need to make + sure we get the parent's information, if any. 
*/ + query = xstrdup_printf( + "call get_parent_limits('%s', " + "'%s', '%s', %u);" + "select @par_id, @mj, @msj, @mcpj, " + "@mnpj, @mwpj, @mcmpj, @qos, @delta_qos;", + assoc_table, account, + row[MASSOC_CLUSTER], 0); + debug4("%d(%d) query\n%s", + mysql_conn->conn, __LINE__, query); + if(!(result2 = mysql_db_query_ret( + mysql_conn->db_conn, query, 1))) { + xfree(query); + break; + } + xfree(query); + + if((row2 = mysql_fetch_row(result2))) { + if((assoc->max_jobs == INFINITE) && row2[ASSOC2_REQ_MJ]) + assoc->max_jobs = atoi(row2[ASSOC2_REQ_MJ]); + if((assoc->max_submit_jobs == INFINITE) + && row2[ASSOC2_REQ_MSJ]) + assoc->max_submit_jobs = + atoi(row2[ASSOC2_REQ_MSJ]); + if((assoc->max_cpus_pj == INFINITE) + && row2[ASSOC2_REQ_MCPJ]) + assoc->max_cpus_pj = + atoi(row2[ASSOC2_REQ_MCPJ]); + if((assoc->max_nodes_pj == INFINITE) + && row2[ASSOC2_REQ_MNPJ]) + assoc->max_nodes_pj = + atoi(row2[ASSOC2_REQ_MNPJ]); + if((assoc->max_wall_pj == INFINITE) + && row2[ASSOC2_REQ_MWPJ]) + assoc->max_wall_pj = + atoi(row2[ASSOC2_REQ_MWPJ]); + if((assoc->max_cpu_mins_pj == INFINITE) + && row2[ASSOC2_REQ_MCMPJ]) + assoc->max_cpu_mins_pj = + atoi(row2[ASSOC2_REQ_MCMPJ]); + } + mysql_free_result(result2); + mod_assoc = xmalloc(sizeof(acct_association_rec_t)); init_acct_association_rec(mod_assoc); mod_assoc->id = atoi(row[MASSOC_ID]); @@ -8587,11 +8648,15 @@ empty: parent_id = atoi(row2[ASSOC2_REQ_PARENT_ID]); if(!without_parent_limits) { - if(row2[ASSOC2_REQ_MCMPJ]) - parent_mcmpj = - atoi(row2[ASSOC2_REQ_MCMPJ]); + if(row2[ASSOC2_REQ_MJ]) + parent_mj = atoi(row2[ASSOC2_REQ_MJ]); else - parent_mcmpj = INFINITE; + parent_mj = INFINITE; + + if(row2[ASSOC2_REQ_MSJ]) + parent_msj = atoi(row2[ASSOC2_REQ_MSJ]); + else + parent_msj = INFINITE; if(row2[ASSOC2_REQ_MCPJ]) parent_mcpj = @@ -8599,11 +8664,6 @@ empty: else parent_mcpj = INFINITE; - if(row2[ASSOC2_REQ_MJ]) - parent_mj = atoi(row2[ASSOC2_REQ_MJ]); - else - parent_mj = INFINITE; - if(row2[ASSOC2_REQ_MNPJ]) parent_mnpj = atoi(row2[ASSOC2_REQ_MNPJ]); @@ -8631,15 +8691,10 @@ empty: xfree(parent_delta_qos); if(row2[ASSOC2_REQ_DELTA_QOS][0]) - xstrcat(parent_delta_qos, + parent_delta_qos = xstrdup( row2[ASSOC2_REQ_DELTA_QOS]); else parent_delta_qos = NULL; - - if(row2[ASSOC2_REQ_MSJ]) - parent_msj = atoi(row2[ASSOC2_REQ_MSJ]); - else - parent_msj = INFINITE; } last_acct = parent_acct; last_cluster = row[ASSOC_REQ_CLUSTER]; diff --git a/src/plugins/jobacct_gather/aix/jobacct_gather_aix.c b/src/plugins/jobacct_gather/aix/jobacct_gather_aix.c index 32a1e0304dd372ae4cf74b4fec56c410b5399a73..aa50224e24eb258a0090e2df5f59a76afe632798 100644 --- a/src/plugins/jobacct_gather/aix/jobacct_gather_aix.c +++ b/src/plugins/jobacct_gather/aix/jobacct_gather_aix.c @@ -41,6 +41,7 @@ \*****************************************************************************/ #include <signal.h> +#include "src/common/slurm_xlator.h" #include "src/common/jobacct_common.h" #include "src/common/slurm_protocol_api.h" #include "src/common/slurm_protocol_defs.h" diff --git a/src/plugins/jobacct_gather/linux/jobacct_gather_linux.c b/src/plugins/jobacct_gather/linux/jobacct_gather_linux.c index c4b42643bcb2fe13d5651c4bf48cef0c24cf2ba4..d2c63108212f468fafbe4755c16b0add752efd0d 100644 --- a/src/plugins/jobacct_gather/linux/jobacct_gather_linux.c +++ b/src/plugins/jobacct_gather/linux/jobacct_gather_linux.c @@ -41,6 +41,7 @@ #include <fcntl.h> #include <signal.h> +#include "src/common/slurm_xlator.h" #include "src/common/jobacct_common.h" #include "src/common/slurm_protocol_api.h" #include 
"src/common/slurm_protocol_defs.h" diff --git a/src/plugins/jobacct_gather/none/jobacct_gather_none.c b/src/plugins/jobacct_gather/none/jobacct_gather_none.c index 1ced7a60fb598bd30827045a77fb31c9f7bc14fb..0cda67945302b8aba772a592aeaa179a3dd1dd76 100644 --- a/src/plugins/jobacct_gather/none/jobacct_gather_none.c +++ b/src/plugins/jobacct_gather/none/jobacct_gather_none.c @@ -39,6 +39,7 @@ * Copyright (C) 2002 The Regents of the University of California. \*****************************************************************************/ +#include "src/common/slurm_xlator.h" #include "src/common/slurm_jobacct_gather.h" /* diff --git a/src/plugins/priority/multifactor/priority_multifactor.c b/src/plugins/priority/multifactor/priority_multifactor.c index 2d19542596c9514ddc792e58612f1bfa7957d6cd..db468ef5712d0beb4b1a6b885ed8b5f9b9115924 100644 --- a/src/plugins/priority/multifactor/priority_multifactor.c +++ b/src/plugins/priority/multifactor/priority_multifactor.c @@ -65,6 +65,11 @@ #define SECS_PER_DAY (24 * 60 * 60) #define SECS_PER_WEEK (7 * 24 * 60 * 60) +/* These are defined here so when we link with something other than + * the slurmctld we will have these symbols defined. They will get + * overwritten when linking with the slurmctld. + */ +time_t last_job_update; /* * These variables are required by the generic plugin interface. If they @@ -141,9 +146,10 @@ static int _apply_decay(double decay_factor) slurm_mutex_lock(&assoc_mgr_association_lock); itr = list_iterator_create(assoc_mgr_association_list); + /* We want to do this to all associations including + root. All usage_raws are calculated from the bottom up. + */ while((assoc = list_next(itr))) { - if (assoc == assoc_mgr_root_assoc) - continue; assoc->usage_raw *= decay_factor; assoc->grp_used_wall *= decay_factor; } @@ -180,9 +186,10 @@ static int _reset_usage() slurm_mutex_lock(&assoc_mgr_association_lock); itr = list_iterator_create(assoc_mgr_association_list); + /* We want to do this to all associations including + root. All usage_raws are calculated from the bottom up. + */ while((assoc = list_next(itr))) { - if (assoc == assoc_mgr_root_assoc) - continue; assoc->usage_raw = 0; assoc->grp_used_wall = 0; } @@ -781,25 +788,24 @@ static void *_decay_thread(void *no_data) } slurm_mutex_lock(&assoc_mgr_association_lock); + /* We want to do this all the way up + to and including root. This way we + can keep track of how much usage + has occured on the entire system + and use that to normalize against. + */ while(assoc) { - /* we don't want to make the - root assoc responsible for - keeping track of time - */ - if (assoc == assoc_mgr_root_assoc) - break; assoc->grp_used_wall += run_decay; assoc->usage_raw += (long double)real_decay; debug4("adding %f new usage to " "assoc %u (user='%s' acct='%s') " "raw usage is now %Lf. Group " - "wall added %d making it %f.", + "wall added %f making it %f.", real_decay, assoc->id, assoc->user, assoc->acct, assoc->usage_raw, run_decay, assoc->grp_used_wall); - assoc = assoc->parent_assoc_ptr; } slurm_mutex_unlock(&assoc_mgr_association_lock); @@ -1033,10 +1039,10 @@ extern int priority_p_set_max_cluster_usage(uint32_t procs, uint32_t half_life) last_procs = procs; last_half_life = half_life; - /* get the total decay for the entire cluster */ - assoc_mgr_root_assoc->usage_raw = - (long double)procs * (long double)half_life * (long double)2; - assoc_mgr_root_assoc->usage_norm = 1.0; + /* This should always be 1 and it doesn't get calculated later + so set it now. 
usage_raw and usage_norm get calculated the + same way the other associations do. */ + assoc_mgr_root_assoc->usage_efctv = 1.0; debug3("Total possible cpu usage for half_life of %d secs " "on the system is %.0Lf", half_life, assoc_mgr_root_assoc->usage_raw); @@ -1057,10 +1063,18 @@ extern void priority_p_set_assoc_usage(acct_association_rec_t *assoc) } xassert(assoc_mgr_root_assoc); - xassert(assoc_mgr_root_assoc->usage_raw); xassert(assoc->parent_assoc_ptr); - assoc->usage_norm = assoc->usage_raw / assoc_mgr_root_assoc->usage_raw; + if(assoc_mgr_root_assoc->usage_raw) + assoc->usage_norm = + assoc->usage_raw / assoc_mgr_root_assoc->usage_raw; + else + /* This should only happen when no usage has occured + at all so no big deal, the other usage should be 0 + as well here. + */ + assoc->usage_norm = 0; + debug4("Normalized usage for %s %s off %s %Lf / %Lf = %Lf", child, child_str, assoc->parent_assoc_ptr->acct, assoc->usage_raw, assoc_mgr_root_assoc->usage_raw, diff --git a/src/plugins/proctrack/linuxproc/kill_tree.c b/src/plugins/proctrack/linuxproc/kill_tree.c index 4f32c3827e7f588e087428b8609c1b90548dca6d..88a8211f9583f6e5f9c1fea6824b3ca3d8c5d8d9 100644 --- a/src/plugins/proctrack/linuxproc/kill_tree.c +++ b/src/plugins/proctrack/linuxproc/kill_tree.c @@ -54,9 +54,10 @@ #include <limits.h> #include "slurm/slurm.h" +#include "slurm/slurm_errno.h" +#include "src/common/log.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" -#include "src/common/log.h" #include "kill_tree.h" typedef struct xpid_s { @@ -163,12 +164,14 @@ static xppid_t **_build_hashtbl(void) hashtbl = (xppid_t **)xmalloc(HASH_LEN * sizeof(xppid_t *)); + slurm_seterrno(0); while ((de = readdir(dir)) != NULL) { num = de->d_name; if ((num[0] < '0') || (num[0] > '9')) continue; ret_l = strtol(num, &endptr, 10); - if(errno == ERANGE) { + if ((ret_l == LONG_MIN) || (ret_l == LONG_MAX) || + (errno == ERANGE)) { error("couldn't do a strtol on str %s(%d): %m", num, ret_l); } diff --git a/src/plugins/sched/builtin/builtin_wrapper.c b/src/plugins/sched/builtin/builtin_wrapper.c index 30a28eb2c7629a9a45e92d6894d6a2684b689154..b7ea5acf10bba286223c916d9b0811adfb1c0a54 100644 --- a/src/plugins/sched/builtin/builtin_wrapper.c +++ b/src/plugins/sched/builtin/builtin_wrapper.c @@ -177,6 +177,7 @@ void slurm_sched_plugin_job_is_pending( void ) FREE_NULL_BITMAP(avail_bitmap); } + xfree(job_queue); } /**************************************************************************/ diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index 8ff04f89178960bb6bac15eb49107a0eb9589d64..9b111fb16544b170be12ad4e02bf93a4d2b6faaa 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -314,8 +314,8 @@ static bg_record_t *_find_matching_block(List block_list, continue; } else if((bg_record->job_running != NO_JOB_RUNNING) && (bg_record->job_running != job_ptr->job_id) - && (bg_conf->layout_mode == LAYOUT_DYNAMIC - || (SELECT_IS_MODE_RUN_NOW(query_mode) + && ((bg_conf->layout_mode == LAYOUT_DYNAMIC) + || (!SELECT_IS_CHECK_FULL_SET(query_mode) && bg_conf->layout_mode != LAYOUT_DYNAMIC))) { debug("block %s in use by %s job %d", bg_record->bg_block_id, @@ -1525,6 +1525,10 @@ preempt: } else if(bg_record->job_running == BLOCK_ERROR_STATE) starttime = INFINITE; + /* make sure the job is eligible to run */ + if(job_ptr->details->begin_time > starttime) + starttime = job_ptr->details->begin_time; + job_ptr->start_time = 
starttime; select_g_select_jobinfo_set(job_ptr->select_jobinfo, diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c index af31c8f1c6a1420cf4c9e3bc4141986b442cebf2..fabf2575c9e2718381690f0b910ca505504ff88c 100644 --- a/src/plugins/select/cons_res/job_test.c +++ b/src/plugins/select/cons_res/job_test.c @@ -133,9 +133,10 @@ uint16_t _allocate_sockets(struct job_record *job_ptr, bitstr_t *core_map, uint16_t si, cps, avail_cpus = 0, num_tasks = 0; uint32_t core_begin = cr_get_coremap_offset(node_i); uint32_t core_end = cr_get_coremap_offset(node_i+1); + uint32_t c; uint16_t cpus_per_task = job_ptr->details->cpus_per_task; uint16_t *used_cores, *free_cores, free_core_count = 0; - uint16_t i, c, sockets = select_node_record[node_i].sockets; + uint16_t i, j, sockets = select_node_record[node_i].sockets; uint16_t cores_per_socket = select_node_record[node_i].cores; uint16_t threads_per_core = select_node_record[node_i].vpus; uint16_t min_cores = 1, min_sockets = 1, ntasks_per_socket = 0; @@ -220,7 +221,7 @@ uint16_t _allocate_sockets(struct job_record *job_ptr, bitstr_t *core_map, used_cores = xmalloc(sockets * sizeof(uint16_t)); for (c = core_begin; c < core_end; c++) { - i = (c - core_begin) / cores_per_socket; + i = (uint16_t) (c - core_begin) / cores_per_socket; if (bit_test(core_map, c)) { free_cores[i]++; free_core_count++; @@ -241,7 +242,7 @@ uint16_t _allocate_sockets(struct job_record *job_ptr, bitstr_t *core_map, used_cores = NULL; /* Step 2: check min_cores per socket and min_sockets per node */ - c = 0; + j = 0; for (i = 0; i < sockets; i++) { if (free_cores[i] < min_cores) { /* cannot use this socket */ @@ -250,16 +251,16 @@ uint16_t _allocate_sockets(struct job_record *job_ptr, bitstr_t *core_map, continue; } /* count this socket as usable */ - c++; + j++; } - if (c < min_sockets) { + if (j < min_sockets) { /* cannot use this node */ num_tasks = 0; goto fini; } /* check max_cores and max_sockets */ - c = 0; + j = 0; for (i = 0; i < sockets; i++) { if (free_cores[i] > max_cores) { /* remove extra cores from this socket */ @@ -268,8 +269,8 @@ uint16_t _allocate_sockets(struct job_record *job_ptr, bitstr_t *core_map, free_cores[i] -= tmp; } if (free_cores[i] > 0) - c++; - if (free_cores[i] && c > max_sockets) { + j++; + if (free_cores[i] && j > max_sockets) { /* remove extra sockets from use */ free_core_count -= free_cores[i]; free_cores[i] = 0; @@ -310,8 +311,8 @@ uint16_t _allocate_sockets(struct job_record *job_ptr, bitstr_t *core_map, avail_cpus = num_tasks; cps = num_tasks; } else { - c = avail_cpus / cpus_per_task; - num_tasks = MIN(num_tasks, c); + j = avail_cpus / cpus_per_task; + num_tasks = MIN(num_tasks, j); avail_cpus = num_tasks * cpus_per_task; } if (job_ptr->details->ntasks_per_node && @@ -334,7 +335,7 @@ uint16_t _allocate_sockets(struct job_record *job_ptr, bitstr_t *core_map, for (c = core_begin; c < core_end && avail_cpus > 0; c++) { if (bit_test(core_map, c) == 0) continue; - i = (c - core_begin) / cores_per_socket; + i = (uint16_t) (c - core_begin) / cores_per_socket; if (free_cores[i] > 0) { /* this socket has free cores, but make sure * we don't use more than are needed for @@ -390,9 +391,10 @@ uint16_t _allocate_cores(struct job_record *job_ptr, bitstr_t *core_map, uint16_t cpu_count = 0, avail_cpus = 0, num_tasks = 0; uint32_t core_begin = cr_get_coremap_offset(node_i); uint32_t core_end = cr_get_coremap_offset(node_i+1); + uint32_t c; uint16_t cpus_per_task = job_ptr->details->cpus_per_task; uint16_t *free_cores, 
free_core_count = 0; - uint16_t i, c, sockets = select_node_record[node_i].sockets; + uint16_t i, j, sockets = select_node_record[node_i].sockets; uint16_t cores_per_socket = select_node_record[node_i].cores; uint16_t threads_per_core = select_node_record[node_i].vpus; uint16_t min_cores = 1, min_sockets = 1; @@ -471,7 +473,7 @@ uint16_t _allocate_cores(struct job_record *job_ptr, bitstr_t *core_map, free_cores = xmalloc(sockets * sizeof(uint16_t)); for (c = core_begin; c < core_end; c++) { - i = (c - core_begin) / cores_per_socket; + i = (uint16_t) (c - core_begin) / cores_per_socket; if (bit_test(core_map, c)) { free_cores[i]++; free_core_count++; @@ -479,7 +481,7 @@ uint16_t _allocate_cores(struct job_record *job_ptr, bitstr_t *core_map, } /* Step 2a: check min_cores per socket and min_sockets per node */ - c = 0; + j = 0; for (i = 0; i < sockets; i++) { if (free_cores[i] < min_cores) { /* cannot use this socket */ @@ -488,16 +490,16 @@ uint16_t _allocate_cores(struct job_record *job_ptr, bitstr_t *core_map, continue; } /* count this socket as usable */ - c++; + j++; } - if (c < min_sockets) { + if (j < min_sockets) { /* cannot use this node */ num_tasks = 0; goto fini; } /* Step 2b: check max_cores per socket and max_sockets per node */ - c = 0; + j = 0; for (i = 0; i < sockets; i++) { if (free_cores[i] > max_cores) { /* remove extra cores from this socket */ @@ -506,8 +508,8 @@ uint16_t _allocate_cores(struct job_record *job_ptr, bitstr_t *core_map, free_cores[i] -= tmp; } if (free_cores[i] > 0) - c++; - if (free_cores[i] && (c > max_sockets)) { + j++; + if (free_cores[i] && (j > max_sockets)) { /* remove extra sockets from use */ free_core_count -= free_cores[i]; free_cores[i] = 0; @@ -548,8 +550,8 @@ uint16_t _allocate_cores(struct job_record *job_ptr, bitstr_t *core_map, if (cpus_per_task < 2) { avail_cpus = num_tasks; } else { - c = avail_cpus / cpus_per_task; - num_tasks = MIN(num_tasks, c); + j = avail_cpus / cpus_per_task; + num_tasks = MIN(num_tasks, j); avail_cpus = num_tasks * cpus_per_task; } if (job_ptr->details->ntasks_per_node && @@ -563,7 +565,7 @@ uint16_t _allocate_cores(struct job_record *job_ptr, bitstr_t *core_map, for (c = core_begin; c < core_end && avail_cpus > 0; c++) { if (bit_test(core_map, c) == 0) continue; - i = (c - core_begin) / cores_per_socket; + i = (uint16_t) (c - core_begin) / cores_per_socket; if (free_cores[i] == 0) bit_clear(core_map, c); else { diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 7278eea888b3f7916b396f7da14257fc4abe9608..cc6c9643af5ab39fb2165cf4fe3c13f1f067a2b8 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -101,7 +101,6 @@ # endif #endif -#include "src/common/slurm_xlator.h" #include "select_cons_res.h" #include "dist_tasks.h" #include "job_test.h" @@ -1182,7 +1181,7 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap, preemptee_candidates)) { /* Remove preemptable job now */ _rm_job_from_res(future_part, future_usage, - tmp_job_ptr, 2); + tmp_job_ptr, 0); bit_or(bitmap, orig_map); rc = cr_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, diff --git a/src/salloc/opt.c b/src/salloc/opt.c index a45c5c0a5fc31e1ce4fdb2878f84ce328a4d3ca7..a013d2b2307a4e61f68f920a1937f3a32413201e 100644 --- a/src/salloc/opt.c +++ b/src/salloc/opt.c @@ -980,6 +980,7 @@ void set_options(const int argc, char **argv) opt.qos = xstrdup(optarg); break; case LONG_OPT_SOCKETSPERNODE: + max_val = 0; 
get_resource_arg_range( optarg, "sockets-per-node", &opt.min_sockets_per_node, &max_val, true ); @@ -988,6 +989,7 @@ void set_options(const int argc, char **argv) opt.min_sockets_per_node = NO_VAL; break; case LONG_OPT_CORESPERSOCKET: + max_val = 0; get_resource_arg_range( optarg, "cores-per-socket", &opt.min_cores_per_socket, &max_val, true ); @@ -996,6 +998,7 @@ void set_options(const int argc, char **argv) opt.min_cores_per_socket = NO_VAL; break; case LONG_OPT_THREADSPERCORE: + max_val = 0; get_resource_arg_range( optarg, "threads-per-core", &opt.min_threads_per_core, &max_val, true ); @@ -1218,7 +1221,7 @@ static bool _opt_verify(void) /* check for realistic arguments */ if (opt.nprocs <= 0) { - error("invalid number of processes (-n %d)", + error("invalid number of tasks (-n %d)", opt.nprocs); verified = false; } diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 09d63983904777293538e0ab5402a1a3fd7bbc0f..24ace1341cee96640b7d4958ecee7ec6ed53794b 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -339,9 +339,9 @@ int main(int argc, char *argv[]) return 1; } else { allocation_state = GRANTED; + pthread_mutex_unlock(&allocation_state_lock); command_pid = pid = _fork_command(command_argv); } - pthread_mutex_unlock(&allocation_state_lock); /* * Wait for command to exit, OR for waitpid to be interrupted by a diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index 1f9155a03aa392e36dcbeead39af7af11ddd530f..d7f1c4940be336654c505edc0756e86b26c8ec5f 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -283,7 +283,7 @@ static void _opt_default() opt.nprocs_set = false; opt.cpus_per_task = 1; opt.cpus_set = false; - opt.min_nodes = 0; + opt.min_nodes = 1; opt.max_nodes = 0; opt.nodes_set = false; opt.min_sockets_per_node = NO_VAL; /* requested min sockets */ @@ -1421,6 +1421,7 @@ static void _set_options(int argc, char **argv) opt.qos = xstrdup(optarg); break; case LONG_OPT_SOCKETSPERNODE: + max_val = 0; get_resource_arg_range( optarg, "sockets-per-node", &opt.min_sockets_per_node, &max_val, true ); @@ -1429,6 +1430,7 @@ static void _set_options(int argc, char **argv) opt.min_sockets_per_node = NO_VAL; break; case LONG_OPT_CORESPERSOCKET: + max_val = 0; get_resource_arg_range( optarg, "cores-per-socket", &opt.min_cores_per_socket, &max_val, true ); @@ -1437,6 +1439,7 @@ static void _set_options(int argc, char **argv) opt.min_cores_per_socket = NO_VAL; break; case LONG_OPT_THREADSPERCORE: + max_val = 0; get_resource_arg_range( optarg, "threads-per-core", &opt.min_threads_per_core, &max_val, true ); @@ -1996,7 +1999,7 @@ static bool _opt_verify(void) /* check for realistic arguments */ if (opt.nprocs <= 0) { - error("invalid number of processes (-n %d)", opt.nprocs); + error("invalid number of tasks (-n %d)", opt.nprocs); verified = false; } diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index 9cc255b13c4fe910220730f7aecd0c08b4fa8022..3c4502f4cbce3d9c1a11c9ce670668f72357bb6d 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -405,6 +405,10 @@ scontrol_update_job (int argc, char *argv[]) job_msg.partition = val; update_cnt++; } + else if (strncasecmp(tag, "QOS", MAX(taglen, 2)) == 0) { + job_msg.qos = val; + update_cnt++; + } else if (strncasecmp(tag, "ReservationName", MAX(taglen, 3)) == 0) { job_msg.reservation = val; diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c index 0fcaba9601a8578acad8df836ffa40c0bea07b43..c09bc3850ee439dc18924ee91978b3b865033a63 100644 --- a/src/slurmctld/acct_policy.c +++ 
b/src/slurmctld/acct_policy.c @@ -367,6 +367,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) bool rc = true; uint64_t usage_mins; uint32_t wall_mins; + bool cancel_job = 0; int parent = 0; /*flag to tell us if we are looking at the * parent or not */ @@ -391,6 +392,10 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) job_ptr->state_reason = WAIT_NO_REASON; slurm_mutex_lock(&assoc_mgr_qos_lock); + /* Since we could possibly need the association lock handle it + now to avoid deadlock. Always do QOS first. + */ + slurm_mutex_lock(&assoc_mgr_association_lock); qos_ptr = job_ptr->qos_ptr; if(qos_ptr) { usage_mins = (uint64_t)(qos_ptr->usage_raw / 60.0); @@ -407,7 +412,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) qos_ptr->name, qos_ptr->grp_cpu_mins, usage_mins); rc = false; - goto end_qos; + goto end_it; } /* NOTE: We can't enforce qos_ptr->grp_cpus at this @@ -426,7 +431,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) qos_ptr->grp_used_jobs, qos_ptr->name); rc = false; - goto end_qos; + goto end_it; } if (qos_ptr->grp_nodes != INFINITE) { @@ -438,7 +443,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) job_ptr->details->min_nodes, qos_ptr->grp_nodes, qos_ptr->name); - _cancel_job(job_ptr); + cancel_job = 1; } else if ((qos_ptr->grp_used_nodes + job_ptr->details->min_nodes) > qos_ptr->grp_nodes) { @@ -456,7 +461,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) job_ptr->details->min_nodes, qos_ptr->name); rc = false; - goto end_qos; + goto end_it; } } @@ -475,7 +480,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) wall_mins, qos_ptr->name); rc = false; - goto end_qos; + goto end_it; } /* NOTE: We can't enforce qos_ptr->max_cpu_mins_pj at this @@ -506,7 +511,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) qos_ptr->max_jobs_pu, used_limits->jobs, qos_ptr->name); rc = false; - goto end_qos; + goto end_it; } } @@ -519,9 +524,9 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) job_ptr->job_id, job_ptr->details->min_nodes, qos_ptr->max_nodes_pj); - _cancel_job(job_ptr); + cancel_job = 1; rc = false; - goto end_qos; + goto end_it; } } @@ -537,14 +542,13 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) "time limit %u exceeds account max %u", job_ptr->job_id, job_ptr->time_limit, time_limit); - _cancel_job(job_ptr); + cancel_job = 1; rc = false; - goto end_qos; + goto end_it; } } } - slurm_mutex_lock(&assoc_mgr_association_lock); assoc_ptr = job_ptr->assoc_ptr; while(assoc_ptr) { usage_mins = (uint64_t)(assoc_ptr->usage_raw / 60.0); @@ -599,7 +603,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) job_ptr->job_id, job_ptr->details->min_nodes, assoc_ptr->grp_nodes, assoc_ptr->acct); - _cancel_job(job_ptr); + cancel_job = 1; } else if ((assoc_ptr->grp_used_nodes + job_ptr->details->min_nodes) > assoc_ptr->grp_nodes) { @@ -686,7 +690,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) job_ptr->job_id, job_ptr->details->min_nodes, assoc_ptr->max_nodes_pj); - _cancel_job(job_ptr); + cancel_job = 1; rc = false; goto end_it; } @@ -706,7 +710,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) "time limit %u exceeds account max %u", job_ptr->job_id, job_ptr->time_limit, time_limit); - _cancel_job(job_ptr); + cancel_job = 1; rc = false; goto end_it; } @@ -717,7 +721,10 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) } end_it: 
slurm_mutex_unlock(&assoc_mgr_association_lock); -end_qos: slurm_mutex_unlock(&assoc_mgr_qos_lock); + + if(cancel_job) + _cancel_job(job_ptr); + return rc; } diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index b05e78e7a3bb43bc6e46069a9e86fd80a69e1629..0054135508835b5859b81e75c7bb09c00226659c 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -187,6 +187,7 @@ static void _init_pidfile(void); static void _kill_old_slurmctld(void); static void _parse_commandline(int argc, char *argv[]); static void _remove_assoc(acct_association_rec_t *rec); +static void _remove_qos(acct_qos_rec_t *rec); inline static int _report_locks_set(void); static void * _service_connection(void *arg); static int _shutdown_backup_controller(int wait_time); @@ -326,6 +327,7 @@ int main(int argc, char *argv[]) memset(&assoc_init_arg, 0, sizeof(assoc_init_args_t)); assoc_init_arg.enforce = accounting_enforce; assoc_init_arg.remove_assoc_notify = _remove_assoc; + assoc_init_arg.remove_qos_notify = _remove_qos; assoc_init_arg.cache_level = ASSOC_MGR_CACHE_ASSOC | ASSOC_MGR_CACHE_USER | ASSOC_MGR_CACHE_QOS; @@ -1127,6 +1129,18 @@ static void _remove_assoc(acct_association_rec_t *rec) debug("Removed association id:%u user:%s", rec->id, rec->user); } +static void _remove_qos(acct_qos_rec_t *rec) +{ + int cnt = 0; + + cnt = job_cancel_by_qos_id(rec->id); + + if (cnt) { + info("Removed QOS:%s cancelled %u jobs", rec->name, cnt); + } else + debug("Removed QOS:%s", rec->name); +} + /* * _slurmctld_background - process slurmctld background activities * purge defunct job records, save state, schedule jobs, and diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 634119c59ba63b6b21d025b223dedc7b9b910c38..de213c6f030162de9f8e9ba91c86765ec078e3c5 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -934,6 +934,7 @@ static int _load_job_state(Buf buffer) xfree(job_ptr->account); job_ptr->account = account; + xstrtolower(job_ptr->account); account = NULL; /* reused, nothing left to free */ xfree(job_ptr->alloc_node); job_ptr->alloc_node = alloc_node; @@ -965,6 +966,7 @@ static int _load_job_state(Buf buffer) name = NULL; /* reused, nothing left to free */ xfree(job_ptr->wckey); /* in case duplicate record */ job_ptr->wckey = wckey; + xstrtolower(job_ptr->wckey); wckey = NULL; /* reused, nothing left to free */ xfree(job_ptr->network); job_ptr->network = network; @@ -2461,6 +2463,11 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, return error_code; } + /* Make sure anything that may be put in the database will be + lower case */ + xstrtolower(job_desc->account); + xstrtolower(job_desc->wckey); + if ((error_code = _validate_job_desc(job_desc, allocate, submit_uid))) return error_code; @@ -3358,7 +3365,6 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, return ESLURM_INVALID_WCKEY; } } - job_desc->wckey = xstrdup(job_desc->wckey); } else if (accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS) { /* This should never happen */ info("_job_create: no wckey was given for " @@ -3645,9 +3651,6 @@ void job_time_limit(void) if (!IS_JOB_RUNNING(job_ptr)) continue; - qos = (acct_qos_rec_t *)job_ptr->qos_ptr; - assoc = (acct_association_rec_t *)job_ptr->assoc_ptr; - /* find out how many cpu minutes this job has been * running for. 
*/ job_cpu_usage_mins = (uint64_t) @@ -3709,6 +3712,14 @@ void job_time_limit(void) (list_count(job_ptr->step_list) > 0)) check_job_step_time_limit(job_ptr, now); + slurm_mutex_lock(&assoc_mgr_qos_lock); + /* Handle both locks here to avoid deadlock. Always do + * QOS first. + */ + slurm_mutex_lock(&assoc_mgr_association_lock); + qos = (acct_qos_rec_t *)job_ptr->qos_ptr; + assoc = (acct_association_rec_t *)job_ptr->assoc_ptr; + /* The idea here is for qos to trump what an association * has set for a limit, so if an association set of * wall 10 mins and the qos has 20 mins set and the @@ -3716,7 +3727,6 @@ void job_time_limit(void) * until 20. */ if(qos) { - slurm_mutex_lock(&assoc_mgr_qos_lock); usage_mins = (uint64_t)(qos->usage_raw / 60.0); wall_mins = qos->grp_used_wall / 60; @@ -3730,7 +3740,6 @@ void job_time_limit(void) qos->name, qos->grp_cpu_mins, usage_mins); job_ptr->state_reason = FAIL_TIMEOUT; - slurm_mutex_unlock(&assoc_mgr_qos_lock); goto job_failed; } @@ -3744,7 +3753,6 @@ void job_time_limit(void) qos->name, qos->grp_wall, wall_mins); job_ptr->state_reason = FAIL_TIMEOUT; - slurm_mutex_unlock(&assoc_mgr_qos_lock); goto job_failed; } @@ -3758,14 +3766,11 @@ void job_time_limit(void) qos->name, qos->max_cpu_mins_pj, job_cpu_usage_mins); job_ptr->state_reason = FAIL_TIMEOUT; - slurm_mutex_unlock(&assoc_mgr_qos_lock); goto job_failed; } - slurm_mutex_unlock(&assoc_mgr_qos_lock); } /* handle any association stuff here */ - slurm_mutex_lock(&assoc_mgr_association_lock); while(assoc) { usage_mins = (uint64_t)(assoc->usage_raw / 60.0); wall_mins = assoc->grp_used_wall / 60; @@ -3817,8 +3822,10 @@ void job_time_limit(void) if(assoc == assoc_mgr_root_assoc) break; } - slurm_mutex_unlock(&assoc_mgr_association_lock); job_failed: + slurm_mutex_unlock(&assoc_mgr_association_lock); + slurm_mutex_unlock(&assoc_mgr_qos_lock); + if(job_ptr->state_reason == FAIL_TIMEOUT) { last_job_update = now; _job_timed_out(job_ptr); @@ -4843,6 +4850,11 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) &cpus_per_node); #endif + /* Make sure anything that may be put in the database will be + lower case */ + xstrtolower(job_specs->account); + xstrtolower(job_specs->wckey); + job_ptr = find_job_record(job_specs->job_id); if (job_ptr == NULL) { error("update_job: job_id %u does not exist.", @@ -4940,32 +4952,41 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) if (wiki_sched && strstr(job_ptr->comment, "QOS:")) { acct_qos_rec_t qos_rec; + if (!IS_JOB_PENDING(job_ptr)) + error_code = ESLURM_DISABLED; + else { + memset(&qos_rec, 0, sizeof(acct_qos_rec_t)); + + if (strstr(job_ptr->comment, + "FLAGS:PREEMPTOR")) + qos_rec.name = "expedite"; + else if (strstr(job_ptr->comment, + "FLAGS:PREEMPTEE")) + qos_rec.name = "standby"; + + job_ptr->qos_ptr = _determine_and_validate_qos( + job_ptr->assoc_ptr, &qos_rec, + &error_code); + job_ptr->qos = qos_rec.id; + update_accounting = true; + } + } + } else if(job_specs->qos) { + acct_qos_rec_t qos_rec; + if (!IS_JOB_PENDING(job_ptr)) + error_code = ESLURM_DISABLED; + else { + info("update_job: setting qos to %s for job_id %u", + job_specs->qos, job_specs->job_id); memset(&qos_rec, 0, sizeof(acct_qos_rec_t)); - - if (strstr(job_ptr->comment, "FLAGS:PREEMPTOR")) - qos_rec.name = "expedite"; - else if (strstr(job_ptr->comment, "FLAGS:PREEMPTEE")) - qos_rec.name = "standby"; + qos_rec.name = job_specs->qos; job_ptr->qos_ptr = _determine_and_validate_qos( job_ptr->assoc_ptr, &qos_rec, &error_code); job_ptr->qos = qos_rec.id; update_accounting = true; } - } else 
if(job_specs->qos) { - acct_qos_rec_t qos_rec; - - info("update_job: setting qos to %s for job_id %u", - job_specs->qos, job_specs->job_id); - - memset(&qos_rec, 0, sizeof(acct_qos_rec_t)); - qos_rec.name = job_specs->qos; - - job_ptr->qos_ptr = _determine_and_validate_qos( - job_ptr->assoc_ptr, &qos_rec, &error_code); - job_ptr->qos = qos_rec.id; - update_accounting = true; } if (!super_user && (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) && @@ -6965,7 +6986,9 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc, xassert(limit_set_max_nodes); //(*limit_set_max_nodes) = 0; + /* Handle both locks here to avoid deadlock. Always do QOS first. */ slurm_mutex_lock(&assoc_mgr_qos_lock); + slurm_mutex_lock(&assoc_mgr_association_lock); if(qos_ptr) { /* for validation we don't need to look at * qos_ptr->grp_cpu_mins. @@ -6990,7 +7013,7 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc, qos_ptr->grp_nodes, qos_ptr->name); rc = false; - goto end_qos; + goto end_it; } else if (job_desc->max_nodes == 0 || (*limit_set_max_nodes && (job_desc->max_nodes @@ -7023,7 +7046,7 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc, qos_ptr->grp_submit_jobs, qos_ptr->name); rc = false; - goto end_qos; + goto end_it; } @@ -7054,7 +7077,7 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc, job_desc->min_nodes, qos_ptr->max_nodes_pj); rc = false; - goto end_qos; + goto end_it; } else if (job_desc->max_nodes == 0 || (*limit_set_max_nodes && (job_desc->max_nodes @@ -7096,7 +7119,7 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc, job_desc->user_id, qos_ptr->max_submit_jobs_pu); rc = false; - goto end_qos; + goto end_it; } } @@ -7120,13 +7143,12 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc, job_desc->user_id, job_desc->time_limit, time_limit); rc = false; - goto end_qos; + goto end_it; } } } - slurm_mutex_lock(&assoc_mgr_association_lock); while(assoc_ptr) { /* for validation we don't need to look at * assoc_ptr->grp_cpu_mins. @@ -7294,8 +7316,8 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc, assoc_ptr = assoc_ptr->parent_assoc_ptr; parent = 1; } +end_it: slurm_mutex_unlock(&assoc_mgr_association_lock); -end_qos: slurm_mutex_unlock(&assoc_mgr_qos_lock); return rc; @@ -7347,6 +7369,50 @@ extern int job_cancel_by_assoc_id(uint32_t assoc_id) return cnt; } +/* + * job_cancel_by_qos_id - Cancel all pending and running jobs with a given + * QOS ID. This happens when a QOS is deleted (e.g. when + * a QOS is removed from the association database). + * RET count of cancelled jobs + */ +extern int job_cancel_by_qos_id(uint16_t qos_id) +{ + int cnt = 0; + ListIterator job_iterator; + struct job_record *job_ptr; + /* Write lock on jobs */ + slurmctld_lock_t job_write_lock = + { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; + + if (!job_list) + return cnt; + + lock_slurmctld(job_write_lock); + job_iterator = list_iterator_create(job_list); + while ((job_ptr = (struct job_record *) list_next(job_iterator))) { + if (job_ptr->qos != qos_id) + continue; + + /* move up to the parent that should still exist */ + if(job_ptr->qos_ptr) + job_ptr->qos_ptr = NULL; + + if(IS_JOB_FINISHED(job_ptr)) + continue; + + info("QOS deleted, cancelling job %u", + job_ptr->job_id); + /* make sure the assoc_mgr_qos_lock isn't locked before this. 
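
In _validate_acct_policy() both locks are now taken up front and every rejection path jumps to the single end_it label, which releases the association lock and then the QOS lock. A compact illustration of that single-exit cleanup pattern, using hypothetical names rather than the real function:

#include <pthread.h>
#include <stdbool.h>

/* Every rejection path jumps to one label that undoes the locking in
 * reverse order, so no branch can return with a lock still held. */
static bool validate_example(pthread_mutex_t *qos_lock,
                             pthread_mutex_t *assoc_lock,
                             int requested, int limit)
{
        bool rc = true;

        pthread_mutex_lock(qos_lock);
        pthread_mutex_lock(assoc_lock);

        if (limit && (requested > limit)) {
                rc = false;
                goto end_it;            /* no per-branch unlocks to forget */
        }
        /* ... further QOS and association checks ... */

end_it:
        pthread_mutex_unlock(assoc_lock);
        pthread_mutex_unlock(qos_lock);
        return rc;
}
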
*/ + job_signal(job_ptr->job_id, SIGKILL, 0, 0); + job_ptr->state_reason = FAIL_BANK_ACCOUNT; + xfree(job_ptr->state_desc); + cnt++; + } + list_iterator_destroy(job_iterator); + unlock_slurmctld(job_write_lock); + return cnt; +} + /* * Modify the account associated with a pending job * IN module - where this is called from diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index ec70dd080a774a1579100e4f280b99bc6f236779..175a991a5ef806cf5fe18abf051027be0520c2d7 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -760,14 +760,16 @@ int update_node ( update_node_msg_t * update_node_msg ) } /* else already fully available */ node_ptr->node_state &= (~NODE_STATE_DRAIN); node_ptr->node_state &= (~NODE_STATE_FAIL); - bit_set (avail_node_bitmap, node_inx); + if (!IS_NODE_NO_RESPOND(node_ptr)) + bit_set (avail_node_bitmap, node_inx); bit_set (idle_node_bitmap, node_inx); bit_set (up_node_bitmap, node_inx); node_ptr->last_idle = now; reset_job_priority(); } else if (state_val == NODE_STATE_ALLOCATED) { if (!IS_NODE_DRAIN(node_ptr) && - !IS_NODE_FAIL(node_ptr)) + !IS_NODE_FAIL(node_ptr) && + !IS_NODE_NO_RESPOND(node_ptr)) bit_set(avail_node_bitmap, node_inx); bit_set (up_node_bitmap, node_inx); bit_clear (idle_node_bitmap, node_inx); @@ -1600,7 +1602,11 @@ extern int validate_nodes_via_front_end( node_ptr, now); } } else if (IS_NODE_DOWN(node_ptr) && - (slurmctld_conf.ret2service == 1)) { + ((slurmctld_conf.ret2service == 2) || + ((slurmctld_conf.ret2service == 1) && + (node_ptr->reason != NULL) && + (strncmp(node_ptr->reason, + "Not responding", 14) == 0)))) { updated_job = true; if (jobs_on_node) { node_ptr->node_state = @@ -1684,7 +1690,7 @@ static void _sync_bitmaps(struct node_record *node_ptr, int job_count) bit_set (share_node_bitmap, node_inx); } if (IS_NODE_DOWN(node_ptr) || IS_NODE_DRAIN(node_ptr) || - IS_NODE_FAIL(node_ptr)) + IS_NODE_FAIL(node_ptr) || IS_NODE_NO_RESPOND(node_ptr)) bit_clear (avail_node_bitmap, node_inx); else bit_set (avail_node_bitmap, node_inx); diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 46f05a2112e22dfd89d410b632f7a5abdfeb9828..0ddf8037b36e9e1c09cc0a87ad608977a6809fe8 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -1157,6 +1157,9 @@ extern int create_resv(resv_desc_msg_t *resv_desc_ptr) rc = ESLURM_INVALID_TIME_VALUE; goto bad_parse; } + } else if (resv_desc_ptr->duration == INFINITE) { + resv_desc_ptr->end_time = resv_desc_ptr->start_time + + (365 * 24 * 60 * 60); } else if (resv_desc_ptr->duration) { resv_desc_ptr->end_time = resv_desc_ptr->start_time + (resv_desc_ptr->duration * 60); @@ -1224,7 +1227,8 @@ extern int create_resv(resv_desc_msg_t *resv_desc_ptr) } if (resv_desc_ptr->node_cnt == NO_VAL) resv_desc_ptr->node_cnt = 0; - if (_resv_overlap(resv_desc_ptr->start_time, + if (!(resv_desc_ptr->flags & RESERVE_FLAG_OVERLAP) && + _resv_overlap(resv_desc_ptr->start_time, resv_desc_ptr->end_time, resv_desc_ptr->flags, node_bitmap, NULL)) { @@ -1233,7 +1237,8 @@ extern int create_resv(resv_desc_msg_t *resv_desc_ptr) goto bad_parse; } resv_desc_ptr->node_cnt = bit_set_count(node_bitmap); - if (_job_overlap(resv_desc_ptr->start_time, + if (!(resv_desc_ptr->flags & RESERVE_FLAG_IGN_JOBS) && + _job_overlap(resv_desc_ptr->start_time, resv_desc_ptr->flags, node_bitmap)) { info("Reservation request overlaps jobs"); rc = ESLURM_NODES_BUSY; @@ -2283,19 +2288,21 @@ static int _select_nodes(resv_desc_msg_t *resv_desc_ptr, node_bitmap = bit_copy((*part_ptr)->node_bitmap); /* Don't 
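
The validate_nodes_via_front_end() hunk above widens the return-to-service test: a DOWN node now comes back automatically either when ReturnToService=2, or when ReturnToService=1 and the node was only marked down for not responding. The same condition restated as a small helper (illustrative only, not the slurmctld code):

#include <string.h>

/* Nonzero when a DOWN node should be returned to service, mirroring the
 * condition added in the hunk above. */
static int node_may_return_to_service(int ret2service, const char *reason)
{
        if (ret2service == 2)
                return 1;
        if ((ret2service == 1) && (reason != NULL) &&
            (strncmp(reason, "Not responding", 14) == 0))
                return 1;
        return 0;
}
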
use node already reserved */ - iter = list_iterator_create(resv_list); - if (!iter) - fatal("malloc: list_iterator_create"); - while ((resv_ptr = (slurmctld_resv_t *) list_next(iter))) { - if ((resv_ptr->node_bitmap == NULL) || - (resv_ptr->start_time >= resv_desc_ptr->end_time) || - (resv_ptr->end_time <= resv_desc_ptr->start_time)) - continue; - bit_not(resv_ptr->node_bitmap); - bit_and(node_bitmap, resv_ptr->node_bitmap); - bit_not(resv_ptr->node_bitmap); + if (!(resv_desc_ptr->flags & RESERVE_FLAG_OVERLAP)) { + iter = list_iterator_create(resv_list); + if (!iter) + fatal("malloc: list_iterator_create"); + while ((resv_ptr = (slurmctld_resv_t *) list_next(iter))) { + if ((resv_ptr->node_bitmap == NULL) || + (resv_ptr->start_time >= resv_desc_ptr->end_time) || + (resv_ptr->end_time <= resv_desc_ptr->start_time)) + continue; + bit_not(resv_ptr->node_bitmap); + bit_and(node_bitmap, resv_ptr->node_bitmap); + bit_not(resv_ptr->node_bitmap); + } + list_iterator_destroy(iter); } - list_iterator_destroy(iter); /* Satisfy feature specification */ if (resv_desc_ptr->features) { diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 19159834633dabf1ff2031a9fa181ca62f11c17d..bfa1107330061b7eadb29a7406563eb0f5a6fd0a 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -815,6 +815,14 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, */ extern int job_cancel_by_assoc_id(uint32_t assoc_id); +/* + * job_cancel_by_qos_id - Cancel all pending and running jobs with a given + * QOS ID. This happens when a QOS is deleted (e.g. when + * a QOS is removed from the association database). + * RET count of cancelled jobs + */ +extern int job_cancel_by_qos_id(uint16_t qos_id); + /* Perform checkpoint operation on a job */ extern int job_checkpoint(checkpoint_msg_t *ckpt_ptr, uid_t uid, slurm_fd conn_fd); diff --git a/src/srun/opt.c b/src/srun/opt.c index 430588a1f8bdd92135f5e1dde30649e29e38e30a..489467458f3e40e3b75a0f65ac0da46f8a08f11b 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -1302,6 +1302,7 @@ static void set_options(const int argc, char **argv) opt.qos = xstrdup(optarg); break; case LONG_OPT_SOCKETSPERNODE: + max_val = 0; get_resource_arg_range( optarg, "sockets-per-node", &opt.min_sockets_per_node, &max_val, true ); @@ -1310,6 +1311,7 @@ static void set_options(const int argc, char **argv) opt.min_sockets_per_node = NO_VAL; break; case LONG_OPT_CORESPERSOCKET: + max_val = 0; get_resource_arg_range( optarg, "cores-per-socket", &opt.min_cores_per_socket, &max_val, true ); @@ -1318,6 +1320,7 @@ static void set_options(const int argc, char **argv) opt.min_cores_per_socket = NO_VAL; break; case LONG_OPT_THREADSPERCORE: + max_val = 0; get_resource_arg_range( optarg, "threads-per-core", &opt.min_threads_per_core, &max_val, true ); @@ -1810,15 +1813,11 @@ static bool _opt_verify(void) opt.nprocs = opt.min_nodes; /* 1 proc / min_[socket * core * thread] default */ - if (opt.min_sockets_per_node > 0) { + if ((opt.min_sockets_per_node != NO_VAL) && + (opt.min_cores_per_socket != NO_VAL) && + (opt.min_threads_per_core != NO_VAL)) { opt.nprocs *= opt.min_sockets_per_node; - opt.nprocs_set = true; - } - if (opt.min_cores_per_socket > 0) { opt.nprocs *= opt.min_cores_per_socket; - opt.nprocs_set = true; - } - if (opt.min_threads_per_core > 0) { opt.nprocs *= opt.min_threads_per_core; opt.nprocs_set = true; } diff --git a/src/srun/srun.c b/src/srun/srun.c index 8182ba6078d2a3fc8bc83a9c40fbe44ca281bbca..35684307dd7bdccabe08f6d641420b36b3dfad7f 100644 --- 
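
The srun opt.c change above resets max_val before each get_resource_arg_range() call and, in _opt_verify(), only multiplies the default task count by the per-node topology values when all three of --sockets-per-node, --cores-per-socket and --threads-per-core were actually supplied, i.e. none of them is still the NO_VAL sentinel. A standalone sketch of the corrected computation, assuming NO_VAL is the 0xfffffffe sentinel from slurm.h:

#include <stdint.h>

#define NO_VAL 0xfffffffe       /* "option not set" sentinel (slurm.h) */

/* One task per node by default; scale by sockets*cores*threads only when
 * the user supplied all three per-node topology options. */
static uint32_t default_ntasks(uint32_t min_nodes, uint32_t sockets,
                               uint32_t cores, uint32_t threads)
{
        uint32_t ntasks = min_nodes;

        if ((sockets != NO_VAL) && (cores != NO_VAL) && (threads != NO_VAL))
                ntasks *= sockets * cores * threads;

        return ntasks;
}
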
a/src/srun/srun.c +++ b/src/srun/srun.c @@ -257,6 +257,12 @@ int srun(int ac, char **av) /* Set up slurmctld message handler */ slurmctld_msg_init(); + /* + * Become --uid user + */ + if (_become_user () < 0) + info ("Warning: Unable to assume uid=%lu\n", opt.uid); + /* now global "opt" should be filled in and available, * create a job from opt */ @@ -279,9 +285,9 @@ int srun(int ac, char **av) job_id = resp->job_id; if (opt.nodes_set_env && !opt.nodes_set_opt && (opt.min_nodes > resp->node_cnt)) { - /* This signifies the job used the --no-kill option + /* This signifies the job used the --no-kill option * and a node went DOWN or it used a node count range - * specification, was checkpointed from one size and + * specification, was checkpointed from one size and * restarted at a different size */ error("SLURM_NNODES environment varariable " "conflicts with allocated node count (%u!=%u).", @@ -294,7 +300,7 @@ int srun(int ac, char **av) opt.nprocs = opt.min_nodes; } if (opt.alloc_nodelist == NULL) - opt.alloc_nodelist = xstrdup(resp->node_list); + opt.alloc_nodelist = xstrdup(resp->node_list); if (opt.exclusive) _step_opt_exclusive(); _set_cpu_env_var(resp); @@ -351,12 +357,6 @@ int srun(int ac, char **av) slurm_free_resource_allocation_response_msg(resp); } - /* - * Become --uid user - */ - if (_become_user () < 0) - info ("Warning: Unable to assume uid=%lu\n", opt.uid); - /* * Enhance environment for job */ @@ -638,7 +638,7 @@ _print_job_information(resource_allocation_response_msg_t *resp) for (i = 0; i < resp->num_cpu_groups; i++) { xstrfmtcat(str, "%s%u(x%u)", sep, resp->cpus_per_node[i], - resp->cpu_count_reps[i]); + resp->cpu_count_reps[i]); sep = ","; } verbose("%s", str); @@ -1418,4 +1418,3 @@ static int _setup_signals() return rc; } - diff --git a/testsuite/expect/test1.40 b/testsuite/expect/test1.40 index edf53896a7c5cc9229c8d5b22acac072bdbcbce1..4353ed23f42c578faf661c78856e6ddc9cd67fe3 100755 --- a/testsuite/expect/test1.40 +++ b/testsuite/expect/test1.40 @@ -91,7 +91,7 @@ expect { set job_id2 $expect_out(1,string) exp_continue } - -re "Account=MY_ACCT" { + -re "Account=my_acct" { set match_acct 1 exp_continue } @@ -115,7 +115,7 @@ if {$match_acct == 0} { set match_acct 0 spawn $scontrol show job $job_id2 expect { - -re "Account=QA_ACCT" { + -re "Account=qa_acct" { set match_acct 1 exp_continue } diff --git a/testsuite/expect/test1.83 b/testsuite/expect/test1.83 index 51500e2cc0ea763d548999ce4d7398b108e04442..d51929014ad7125177473eb55c83c0e801675800 100755 --- a/testsuite/expect/test1.83 +++ b/testsuite/expect/test1.83 @@ -14,7 +14,7 @@ # Change tha node name parsing logic as needed for other formats. ############################################################################ # Copyright (C) 2002-2007 The Regents of the University of California. -# Copyright (C) 2008-2009 Lawrence Livermore National Security. +# Copyright (C) 2008-2010 Lawrence Livermore National Security. # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). # Written by Morris Jette <jette1@llnl.gov> # CODE-OCEC-09-009. All rights reserved. 
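
The srun.c hunks above move the _become_user() call so the process switches to the --uid user before the allocation request is sent, rather than after the allocation has already been obtained. A simplified sketch of that kind of privilege switch, with abbreviated error handling and not the actual _become_user() body:

#include <grp.h>
#include <pwd.h>
#include <sys/types.h>
#include <unistd.h>

/* Switch to the target user's gid, supplementary groups and uid, in that
 * order, while the process still has the privilege to do so. */
static int become_user(uid_t uid)
{
        struct passwd *pw = getpwuid(uid);

        if (pw == NULL)
                return -1;
        if (setgid(pw->pw_gid) < 0)
                return -1;
        if (initgroups(pw->pw_name, pw->pw_gid) < 0)
                return -1;
        return setuid(uid);
}
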
@@ -77,12 +77,9 @@ set host_2_num 0 set timeout $max_job_delay set srun_pid [spawn $srun -N3-3 --contiguous -l -t1 $bin_printenv SLURMD_NODENAME] expect { - -re "Node count specification invalid" { - send_user "\nWARNING: can't test srun task distribution\n" - exit $exit_code - } - -re "Requested node configuration is not available" { + -re "Node count specification invalid|configuration not available" { send_user "\nWARNING: can't test srun task distribution\n" + slow_kill $srun_pid exit $exit_code } -re "($number): ($alpha_numeric_under)" { diff --git a/testsuite/expect/test15.19 b/testsuite/expect/test15.19 index 69e0ea9577c75769147d2c6befc1495bb8054307..0a2b7482e9957ac078899f90c90585a313763892 100755 --- a/testsuite/expect/test15.19 +++ b/testsuite/expect/test15.19 @@ -10,7 +10,7 @@ # anything else indicates a failure mode that must be investigated. ############################################################################ # Copyright (C) 2002-2007 The Regents of the University of California. -# Copyright (C) 2008 Lawrence Livermore National Security. +# Copyright (C) 2008-2010 Lawrence Livermore National Security. # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). # Written by Morris Jette <jette1@llnl.gov> # CODE-OCEC-09-009. All rights reserved. @@ -43,7 +43,6 @@ print_header $test_id # # Submit a 1 node job and validate that we don't get more than one # -set can_not_run 0 set job_id 0 set host_0 "" set task_cnt 0 @@ -54,12 +53,16 @@ expect { set job_id $expect_out(1,string) exp_continue } - -re "(Task count specification invalid|configuration is not available)" { + -re "(Task count specification invalid|configuration not available)" { if { [test_front_end] } { - set can_not_run 1 send_user "\nWARNING: error expected, Testing is incompatible with front-end systems\n" + } else { + send_user "\nWARNING: test not compatable with this configuration\n" } - exp_continue + if {$job_id != 0} { + cancel_job $job_id + } + exit $exit_code } -re "($number): ($alpha_numeric_under)" { if {$task_cnt == 0} { @@ -80,9 +83,6 @@ expect { wait } } -if {$can_not_run == 1} { - exit 0 -} if {[string compare $host_0 ""] == 0} { send_user "\nFAILURE: Did not get SLURMD_NODENAME of task 0\n" @@ -262,12 +262,11 @@ expect { set job_id $expect_out(1,string) exp_continue } - -re "More processors requested than permitted" { - send_user "\nWARNING: can't test salloc task distribution\n" - exit $exit_code - } - -re "Node count specification invalid" { + -re "More processors requested than permitted|Node count specification invalid|configuration not available" { send_user "\nWARNING: can't test salloc task distribution\n" + if {$job_id != 0} { + cancel_job $job_id + } exit $exit_code } -re "($number): ($alpha_numeric_under)" { diff --git a/testsuite/expect/test15.25 b/testsuite/expect/test15.25 index 9f03b947a1ef8de9b8f3d1e746594b0827f23304..fc8623de1a18ca375440889d5ddcc4c8655ab198 100755 --- a/testsuite/expect/test15.25 +++ b/testsuite/expect/test15.25 @@ -38,6 +38,7 @@ set exit_code 0 set job_id1 0 set job_id2 0 set job_acct "TEST_ACCT" +set job_acct_lc "test_acct" print_header $test_id @@ -92,7 +93,7 @@ expect { set job_id2 $expect_out(1,string) exp_continue } - -re "Account=$job_acct" { + -re "Account=$job_acct_lc" { set match_acct 1 exp_continue } @@ -119,7 +120,7 @@ if {$match_acct == 0} { set match_acct 0 spawn $scontrol show job $job_id2 expect { - -re "Account=$job_acct" { + -re "Account=$job_acct_lc" { set match_acct 1 exp_continue } diff --git a/testsuite/expect/test24.1 
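
The expect-script hunks above (test1.83, test15.19) merge what used to be two separate error patterns into one alternation such as "Node count specification invalid|configuration not available", so a single match arm handles every unrunnable-configuration message and can clean up the spawned command before exiting. For reference, the same alternation check expressed in C with POSIX extended regular expressions (illustrative only):

#include <regex.h>

/* Nonzero when msg matches either "can't run this" message, using the
 * same alternation the updated test scripts expect on. */
static int is_unrunnable_error(const char *msg)
{
        regex_t re;
        int hit;
        const char *pat =
            "Node count specification invalid|configuration not available";

        if (regcomp(&re, pat, REG_EXTENDED | REG_NOSUB) != 0)
                return 0;
        hit = (regexec(&re, msg, 0, NULL, 0) == 0);
        regfree(&re);
        return hit;
}
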
b/testsuite/expect/test24.1 index 95717691c0734c8cd457c7932d6f2afa629feb9d..4a20ff5b575ee57d1891a94c8a6abc9962c346bb 100755 --- a/testsuite/expect/test24.1 +++ b/testsuite/expect/test24.1 @@ -91,23 +91,31 @@ expect { incr matches exp_continue } - "AccountD||60|0.600000|25|0.250000|0.250000|" { + "AccountD||30|0.300000|25|0.250000|0.250000|" { incr matches exp_continue } - "AccountE||25|0.250000|25|0.250000|0.250000|" { + "AccountE||25|0.125000|25|0.250000|0.250000|" { incr matches exp_continue } - "AccountE|User4|1|0.250000|25|0.250000|0.250000|" { + "AccountE|User4|1|0.125000|25|0.250000|0.250000|" { incr matches exp_continue } - "AccountF||35|0.350000|0|0.000000|0.145833|" { + "AccountF||35|0.175000|0|0.000000|0.145833|" { incr matches exp_continue } - "AccountF|User5|1|0.350000|0|0.000000|0.145833|" { + "AccountF|User5|1|0.175000|0|0.000000|0.145833|" { + incr matches + exp_continue + } + "AccountG||30|0.300000|30|0.300000|0.300000|" { + incr matches + exp_continue + } + "AccountG|User6|1|0.300000|30|0.300000|0.300000|" { incr matches exp_continue } @@ -121,7 +129,7 @@ expect { } } -if {$matches != 11} { +if {$matches != 13} { send_user "\nFAILURE: we didn't get the correct priorities from the plugin $matches\n" set exit_code 1 } diff --git a/testsuite/expect/test24.1.prog.c b/testsuite/expect/test24.1.prog.c index 1eb7df76aa092aa8a733ed59754ae4afb720fb3e..48774465cb4cc4c5bc6ee2b39b12e07b3b47965e 100644 --- a/testsuite/expect/test24.1.prog.c +++ b/testsuite/expect/test24.1.prog.c @@ -71,6 +71,7 @@ int _setup_assoc_list() { acct_update_object_t update; acct_association_rec_t *assoc = NULL; + /* make the main list */ assoc_mgr_association_list = list_create(destroy_acct_association_rec); @@ -155,7 +156,7 @@ int _setup_assoc_list() assoc = xmalloc(sizeof(acct_association_rec_t)); assoc->id = 3; assoc->parent_id = 1; - assoc->shares_raw = 60; + assoc->shares_raw = 30; assoc->acct = xstrdup("AccountD"); list_push(update.objects, assoc); @@ -195,8 +196,26 @@ int _setup_assoc_list() assoc->user = xstrdup("User5"); list_push(update.objects, assoc); + assoc = xmalloc(sizeof(acct_association_rec_t)); + assoc->id = 4; + assoc->parent_id = 1; + assoc->shares_raw = 30; + assoc->acct = xstrdup("AccountG"); + list_push(update.objects, assoc); + + /* sub of AccountG id 4 */ + assoc = xmalloc(sizeof(acct_association_rec_t)); + assoc->id = 41; + assoc->parent_id = 4; + assoc->shares_raw = 1; + assoc->usage_raw = 30; + assoc->acct = xstrdup("AccountG"); + assoc->user = xstrdup("User6"); + list_push(update.objects, assoc); + assoc_mgr_update_assocs(&update); list_destroy(update.objects); + return SLURM_SUCCESS; } @@ -250,7 +269,7 @@ int main (int argc, char **argv) if (slurm_priority_init() != SLURM_SUCCESS) fatal("failed to initialize priority plugin"); /* on some systems that don't have multiple cores we need to - sleep to make sure the tread get started. */ + sleep to make sure the thread get started. 
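
The new expected values in test24.1 are consistent with renormalising the share tree once AccountD drops from 60 to 30 raw shares and AccountG (30 raw shares, User6 carrying 30 raw usage) is added beside it: an association's normalised shares are its parent's normalised shares scaled by its fraction of the raw shares at its own level. Assuming the totals used by the test (100 raw shares at the top level, 25 + 35 = 60 under AccountD), the arithmetic works out as sketched below; the helper is purely illustrative, not the priority/multifactor plugin code.

/* norm = parent's norm * (my raw shares / raw shares at my level) */
static double norm_shares(double parent_norm, double raw, double level_total)
{
        return parent_norm * (raw / level_total);
}

/* AccountD: 1.00 * 30/100       = 0.300000
 * AccountE: 0.30 * 25/(25 + 35) = 0.125000   (User4 inherits 0.125000)
 * AccountF: 0.30 * 35/(25 + 35) = 0.175000   (User5 inherits 0.175000)
 * AccountG: 1.00 * 30/100       = 0.300000   (User6 inherits 0.300000) */
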
*/ sleep(1); memset(&resp, 0, sizeof(shares_response_msg_t)); resp.assoc_shares_list = assoc_mgr_get_shares(NULL, 0, NULL, NULL); diff --git a/testsuite/expect/test4.12 b/testsuite/expect/test4.12 index d7d8ae84b63cde7f00096a7404b9e2694889c292..b657d9ba4adf515c187ae516bc0a8058b0aa3163 100755 --- a/testsuite/expect/test4.12 +++ b/testsuite/expect/test4.12 @@ -283,29 +283,7 @@ proc allocate_and_quit { node proc_cnt total_procs } { print_header $test_id # find the default partition -log_user 0 -set def_part "" -set def_part_found 0 -spawn $sinfo -expect { - -re "($alpha_numeric_under)\\\* " { - set def_part $expect_out(1,string) - set def_part_found 1 - exp_continue - } - timeout { - send_user "\nFAILURE: sinfo not responding\n" - set exit_code 1 - } - eof { - wait - } -} -log_user 1 -if {!$def_part_found} { - send_user "\nFAILURE: could not default partition on the system\n" - exit 1 -} +set def_part [default_partition] # find the nodes in the default partition log_user 0 @@ -357,27 +335,26 @@ if {$host_cnt == 0} { # find me an idle node in default partition log_user 0 +set inode_name "" +set inode_cores_per_socket 0 set inode_procs 0 -spawn $scontrol -o show node $def_hostlist -expect { - -re "NodeName=($alpha_numeric_under) .*CoresPerSocket=($number) .*CPUTot=($number)(K?).* Sockets=($number) State=IDLE ThreadsPerCore=($number)" { - set inode_name $expect_out(1,string) - set inode_cores_per_socket $expect_out(2,string) - set inode_procs $expect_out(3,string) - if {[string compare $expect_out(4,string) ""]} { - set inode_procs [expr $inode_procs * 1024] - } - set inode_sockets $expect_out(5,string) - set inode_threads_per_core $expect_out(6,string) - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait +set units "" +set inode_sockets 0 +set inode_threads_per_core 0 + +set fd [open "|$scontrol --oneliner show node $def_hostlist"] +exp_internal 1 +while {[gets $fd line] != -1} { + if {[regexp {NodeName=(\w+).*CoresPerSocket=(\d+).*CPUTot=(\d+)(K?).*Sockets=(\d+) State=IDLE ThreadsPerCore=(\d+)} $line frag inode_name inode_cores_per_socket inode_procs units inode_sockets inode_threads_per_core] == 1} { + break } } +exp_internal 0 +if {[string compare $units ""]} { + set inode_procs [expr $inode_procs * 1024] +} +catch {close $fd} + log_user 1 if {!$inode_procs} {