diff --git a/META b/META index fc8bcabd020a06166663c380c202b75714b26ab9..ea82c67c315987cb26bbc0fc6ff386b1a46f32dd 100644 --- a/META +++ b/META @@ -3,9 +3,9 @@ Api_revision: 0 Major: 2 Meta: 1 - Micro: 6 + Micro: 7 Minor: 2 Name: slurm Release: 1 Release_tags: dist - Version: 2.2.6 + Version: 2.2.7 diff --git a/NEWS b/NEWS index 1dc00e02f1383c0605c1089b1523b04c9a009dfd..b7e40afd9927942aa264605b76a7b7b178b1fb48 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,20 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. + +* Changes in SLURM 2.2.7 +======================== + -- Eliminate zombie process created if salloc exits with stopped child + process. Patch from Gerrit Renker, CSCS. + -- With default configuration on non-Cray systems, enable salloc to be + spawned as a background process. Based upon work by Don Albert (Bull) and + Gerrit Renker (CSCS). + -- Fixed Regression from 2.2.4 in accounting where an inherited limit + would not be set correctly in the added child association. + -- Fixed issue with accounting when asking for jobs with a hostlist. + -- Avoid clearing a node's Arch, OS, BootTime and SlurmdStartTime when + "scontrol reconfig" is run. Patch from Martin Perry, Bull. + * Changes in SLURM 2.2.6 ======================== -- Fix displaying of account coordinators with sacctmgr. Possiblity to show diff --git a/doc/html/accounting.shtml b/doc/html/accounting.shtml index 32a425e7f1e02e55281a2caef572371ece112597..887f4c2d0edcc5b2eb9c6165235b5b1990a615b8 100644 --- a/doc/html/accounting.shtml +++ b/doc/html/accounting.shtml @@ -394,7 +394,7 @@ usage there is a line that starts with '->'. This a continuation prompt since the previous mysql statement did not end with a ';'. It assumes that you wish to input more info.)</p> -live example: +<p>live example:</p> <pre> mysql@snowflake:~$ mysql @@ -406,24 +406,40 @@ Type 'help;' or '\h' for help. Type '\c' to clear the buffer. 
mysql> grant all on slurm_acct_db.* TO 'slurm'@'localhost'; Query OK, 0 rows affected (0.00 sec) +</pre> -or with a password... - +<p>You may also need to do the same with the system name in order +for mysql to work correctly:</p> +<pre> +mysql> grant all on slurm_acct_db.* TO 'slurm'@'system0'; +Query OK, 0 rows affected (0.00 sec) +where 'system0' is the localhost or database storage host. +</pre> +<p>or with a password...</p> +<pre> mysql> grant all on slurm_acct_db.* TO 'slurm'@'localhost' -> identified by 'some_pass' with grant option; Query OK, 0 rows affected (0.00 sec) </pre> -<p>This will grant user 'slurm' access to do what it needs to do on -the local host. This should be done before the SlurmDBD will work -properly. After you grant permission to the Slurm user in mysql then -you can start SlurmDBD and Slurm. You start SlurmDBD by typing -'slurmdbd'. You can verify that SlurmDBD is running by typing 'ps aux -| grep slurmdbd'. After SlurmDBD and the slurmctld start you can -verify that the database was created by using the mysql command 'show -databases;'. You can display the tables that slurm created in the -database by using the mysql command 'use slurm_acct_db;' and then 'show -tables;'.</p> +<p>The same is true in the case where you need to do the same with the +system name:</p> +<pre> +mysql> grant all on slurm_acct_db.* TO 'slurm'@'system0' + -> identified by 'some_pass' with grant option; +where 'system0' is the localhost or database storage host. +</pre> + +<p>This will grant user 'slurm' access to do what it needs to do on the local +host or the storage host system. This must be done before the SlurmDBD will +work properly. After you grant permission to the user 'slurm' in mysql then +you can start SlurmDBD and the other SLURM daemons. You start SlurmDBD by +typing its pathname '/usr/sbin/slurmdbd' or '/etc/init.d/slurmdbd start'. You +can verify that SlurmDBD is running by typing 'ps aux | grep slurmdbd'. 
After +the SlurmDBD and slurmctld daemons start, you can verify that the database +was created by using the mysql command 'show databases;'. You can display the +tables that slurm created in the database by using the mysql command +'use slurm_acct_db;' and then 'show tables;'.</p> <p>Use the mysql 'show databases;' command</p> diff --git a/doc/man/man1/sacct.1 b/doc/man/man1/sacct.1 index 4c71cd68d7e21f6fb6b4d63b02d4eb11afad6bbc..890d064983bf65a590ec76a80ecbddd74f27c6bb 100644 --- a/doc/man/man1/sacct.1 +++ b/doc/man/man1/sacct.1 @@ -1138,7 +1138,7 @@ This example shows the same job accounting information with the .nf .ft 3 # sacct \-\-brief -Jobid Status ExitCode + Jobid State ExitCode \-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\- 2 RUNNING 0 3 RUNNING 0 @@ -1175,8 +1175,8 @@ on the command line. .PP .nf .ft 3 -# sacct \-\-format=jobid,ncpus,ntasks,nsignals,status -Jobid Elapsed Ncpus Ntasks Status +# sacct \-\-format=jobid,elapsed,ncpus,ntasks,state + Jobid Elapsed Ncpus Ntasks State \-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\- 3 00:01:30 2 1 COMPLETED 3.0 00:01:30 2 1 COMPLETED diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index 658c18cb9c54245780580b03e62dc86ca19e3666..d10e5d96f0e894dc843a659de784d4f54ab898e1 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -313,7 +313,7 @@ in some failed state (non-zero exit code, node failure, timed out, etc). .TP \fBafterok:job_id[:jobid...]\fR This job can begin execution after the specified jobs have successfully -executed (ran to completion with non-zero exit code). +executed (ran to completion with an exit code of zero). 
.TP \fBsingleton\fR This job can begin execution after any previously launched jobs diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index 0c0ed5ecd02be4c0b6ec178b97dc2e25b48f7623..0cdfa03e1e3cd47f580f8379821d362984e9d2d8 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -322,7 +322,7 @@ in some failed state (non-zero exit code, node failure, timed out, etc). .TP \fBafterok:job_id[:jobid...]\fR This job can begin execution after the specified jobs have successfully -executed (ran to completion with non-zero exit code). +executed (ran to completion with an exit code of zero). .TP \fBsingleton\fR This job can begin execution after any previously launched jobs diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 385c51449c41bc02a539a8c7338551e6d77f97e0..5999834c206ccbf5b4e3aa279bef15621cbf06bd 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -424,7 +424,7 @@ in some failed state (non-zero exit code, node failure, timed out, etc). .TP \fBafterok:job_id[:jobid...]\fR This job can begin execution after the specified jobs have successfully -executed (ran to completion with non-zero exit code). +executed (ran to completion with an exit code of zero). .TP \fBsingleton\fR This job can begin execution after any previously launched jobs diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index b17e144e0a2ebaabe42a96a09b29b4c2036f7ff9..8187ac65df8b344ef8281126d67bbcc4e5b5c2e2 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -336,7 +336,7 @@ in some failed state (non-zero exit code, node failure, timed out, etc). .TP \fBafterok:job_id[:jobid...]\fR This job can begin execution after the specified jobs have successfully -executed (ran to completion with non-zero exit code). +executed (ran to completion with an exit code of zero). 
.TP \fBsingleton\fR This job can begin execution after any previously launched jobs diff --git a/slurm.spec b/slurm.spec index 10263924b66259429f86f85e0d8fed63412d555c..37a89df7e2bfd4e303aef7b10e534013de8adb05 100644 --- a/slurm.spec +++ b/slurm.spec @@ -86,14 +86,14 @@ %endif Name: slurm -Version: 2.2.6 +Version: 2.2.7 Release: 1%{?dist} Summary: Simple Linux Utility for Resource Management License: GPL Group: System Environment/Base -Source: slurm-2.2.6.tar.bz2 +Source: slurm-2.2.7.tar.bz2 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release} URL: https://computing.llnl.gov/linux/slurm/ @@ -368,7 +368,7 @@ Gives the ability for SLURM to use Berkeley Lab Checkpoint/Restart ############################################################################# %prep -%setup -n slurm-2.2.6 +%setup -n slurm-2.2.7 %build %configure --program-prefix=%{?_program_prefix:%{_program_prefix}} \ diff --git a/src/plugins/accounting_storage/mysql/as_mysql_assoc.c b/src/plugins/accounting_storage/mysql/as_mysql_assoc.c index 15d97df59d4c7d1b772107ff1175b468dc274914..3b696b704b6ce0a871f33dc51fa50560b8e37b88 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_assoc.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_assoc.c @@ -643,24 +643,26 @@ static int _set_assoc_limits_for_add( if (!(row = mysql_fetch_row(result))) goto end_it; - if (row[ASSOC2_REQ_DEF_QOS] && assoc->def_qos_id == NO_VAL) + if (row[ASSOC2_REQ_DEF_QOS] && assoc->def_qos_id == INFINITE) assoc->def_qos_id = slurm_atoul(row[ASSOC2_REQ_DEF_QOS]); - else if (assoc->def_qos_id == NO_VAL) + else if (assoc->def_qos_id == INFINITE) assoc->def_qos_id = 0; - if (row[ASSOC2_REQ_MCMPJ] && assoc->max_cpu_mins_pj == (uint64_t)NO_VAL) + if (row[ASSOC2_REQ_MCMPJ] + && assoc->max_cpu_mins_pj == (uint64_t)INFINITE) assoc->max_cpu_mins_pj = slurm_atoull(row[ASSOC2_REQ_MCMPJ]); - if (row[ASSOC2_REQ_MCRM] && assoc->max_cpu_run_mins == (uint64_t)NO_VAL) + if (row[ASSOC2_REQ_MCRM] + && assoc->max_cpu_run_mins == (uint64_t)INFINITE) 
assoc->max_cpu_run_mins = slurm_atoull(row[ASSOC2_REQ_MCRM]); - if (row[ASSOC2_REQ_MCPJ] && assoc->max_cpus_pj == NO_VAL) + if (row[ASSOC2_REQ_MCPJ] && assoc->max_cpus_pj == INFINITE) assoc->max_cpus_pj = slurm_atoul(row[ASSOC2_REQ_MCPJ]); - if (row[ASSOC2_REQ_MJ] && assoc->max_jobs == NO_VAL) + if (row[ASSOC2_REQ_MJ] && assoc->max_jobs == INFINITE) assoc->max_jobs = slurm_atoul(row[ASSOC2_REQ_MJ]); - if (row[ASSOC2_REQ_MNPJ] && assoc->max_nodes_pj == NO_VAL) + if (row[ASSOC2_REQ_MNPJ] && assoc->max_nodes_pj == INFINITE) assoc->max_nodes_pj = slurm_atoul(row[ASSOC2_REQ_MNPJ]); - if (row[ASSOC2_REQ_MSJ] && assoc->max_submit_jobs == NO_VAL) + if (row[ASSOC2_REQ_MSJ] && assoc->max_submit_jobs == INFINITE) assoc->max_submit_jobs = slurm_atoul(row[ASSOC2_REQ_MSJ]); - if (row[ASSOC2_REQ_MWPJ] && assoc->max_wall_pj == NO_VAL) + if (row[ASSOC2_REQ_MWPJ] && assoc->max_wall_pj == INFINITE) assoc->max_wall_pj = slurm_atoul(row[ASSOC2_REQ_MWPJ]); if (assoc->qos_list) { diff --git a/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c b/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c index 05345ecadfdb30dba6e8ba48f56ac9ee4f7ccabe..6befc441696f126c725e2a6d2d536c65d40ab79f 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c @@ -843,7 +843,7 @@ extern List setup_cluster_list_with_inx(mysql_conn_t *mysql_conn, } temp_hl = hostlist_create(job_cond->used_nodes); - if (!hostlist_count(temp_hl)) { + if (hostlist_count(temp_hl) <= 0) { error("we didn't get any real hosts to look for."); goto no_hosts; } @@ -869,8 +869,7 @@ extern List setup_cluster_list_with_inx(mysql_conn_t *mysql_conn, mysql_conn->conn, THIS_FILE, __LINE__, query); if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) { xfree(query); - hostlist_destroy(temp_hl); - return NULL; + goto no_hosts; } xfree(query); @@ -902,14 +901,16 @@ extern List setup_cluster_list_with_inx(mysql_conn_t 
*mysql_conn, _destroy_local_cluster(local_cluster); } mysql_free_result(result); - hostlist_iterator_destroy(h_itr); + if (!list_count(local_cluster_list)) { - hostlist_destroy(temp_hl); list_destroy(local_cluster_list); - return NULL; + local_cluster_list = NULL; + goto no_hosts; } + no_hosts: + hostlist_iterator_destroy(h_itr); hostlist_destroy(temp_hl); return local_cluster_list; diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index a36ef2fc98bb7e6f4e95dd862ea69c25ecc6074a..1112ed6d54e2fab0b45a52c86761e28edd2ae7b1 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -222,6 +222,7 @@ int main(int argc, char *argv[]) * a) input is from a terminal (stdin has valid termios attributes), * b) controlling terminal exists (non-negative tpgid), * c) salloc is not run in allocation-only (--no-shell) mode, + * NOTE: d and e below are configuration dependent * d) salloc runs in its own process group (true in interactive * shells that support job control), * e) salloc has been configured at compile-time to support background @@ -237,10 +238,10 @@ int main(int argc, char *argv[]) error("no controlling terminal: please set --no-shell"); exit(error_exit); } +#ifdef SALLOC_RUN_FOREGROUND } else if ((!opt.no_shell) && (pid == getpgrp())) { if (tpgid == pid) is_interactive = true; -#ifdef SALLOC_RUN_FOREGROUND while (tcgetpgrp(STDIN_FILENO) != pid) { if (!is_interactive) { error("Waiting for program to be placed in " @@ -249,8 +250,12 @@ int main(int argc, char *argv[]) } killpg(pid, SIGTTIN); } -#endif } +#else + } else if (!opt.no_shell) { + is_interactive = true; + } +#endif /* * Reset saved tty attributes at exit, in case a child * process died before properly resetting terminal. 
@@ -487,6 +492,9 @@ relinquish: if (WIFEXITED(status)) { rc = WEXITSTATUS(status); + } else if (WIFSTOPPED(status)) { + /* Terminate stopped child process */ + _forward_signal(SIGKILL); } else if (WIFSIGNALED(status)) { verbose("Command \"%s\" was terminated by signal %d", command_argv[0], WTERMSIG(status)); diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 132039efc8b062fdf5eafb23680029b11a5bb079..7c388adf005ce74bd116acc0dc4a50773ec3c755 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -658,9 +658,7 @@ int read_slurm_conf(int recover, bool reconfig) old_node_table_ptr = node_record_table_ptr; for (i=0, node_ptr=old_node_table_ptr; i<node_record_count; i++, node_ptr++) { - xfree(node_ptr->arch); xfree(node_ptr->features); - xfree(node_ptr->os); node_ptr->features = xstrdup( node_ptr->config_ptr->feature); /* Store the original configured CPU count somewhere @@ -919,11 +917,13 @@ static int _restore_node_state(int recover, node_ptr->name, old_node_ptr->port, node_ptr->config_ptr->cpus); } + node_ptr->boot_time = old_node_ptr->boot_time; node_ptr->cpus = old_node_ptr->cpus; node_ptr->cores = old_node_ptr->cores; node_ptr->sockets = old_node_ptr->sockets; node_ptr->threads = old_node_ptr->threads; node_ptr->real_memory = old_node_ptr->real_memory; + node_ptr->slurmd_start_time = old_node_ptr->slurmd_start_time; node_ptr->tmp_disk = old_node_ptr->tmp_disk; node_ptr->weight = old_node_ptr->weight;