From a1a037e17405ae0c9551278b4059215e3c1fe588 Mon Sep 17 00:00:00 2001 From: Mehdi Dogguy <mehdi@debian.org> Date: Mon, 8 Sep 2014 21:31:03 +0200 Subject: [PATCH] Imported Upstream version 1.2.24 --- META | 4 +- NEWS | 13 +++- contribs/perlapi/Makefile.am | 6 +- contribs/perlapi/Makefile.in | 6 +- slurm.spec | 17 ++-- src/api/step_launch.c | 3 +- src/plugins/jobacct/common/common_slurmctld.c | 6 +- src/plugins/sched/wiki/get_jobs.c | 36 +++------ src/plugins/sched/wiki/get_nodes.c | 32 +++----- src/plugins/sched/wiki2/get_jobs.c | 43 ++++------- src/plugins/sched/wiki2/get_nodes.c | 36 +++------ .../bluegene/block_allocator/bridge_linker.c | 8 +- src/plugins/select/cons_res/select_cons_res.c | 4 +- src/sacct/process.c | 6 +- src/slurmctld/job_mgr.c | 3 +- src/slurmctld/proc_req.c | 4 +- src/slurmd/slurmd/req.c | 77 +++++++++++++++++-- src/slurmd/slurmstepd/mgr.c | 15 +++- src/slurmd/slurmstepd/req.c | 9 ++- src/slurmd/slurmstepd/slurmstepd.h | 3 +- 20 files changed, 190 insertions(+), 141 deletions(-) diff --git a/META b/META index 60cb66771..55d315864 100644 --- a/META +++ b/META @@ -3,9 +3,9 @@ Api_revision: 0 Major: 1 Meta: 1 - Micro: 23 + Micro: 24 Minor: 2 Name: slurm Release: 1 Release_tags: - Version: 1.2.23 + Version: 1.2.24 diff --git a/NEWS b/NEWS index 2cb6a555b..195b78d72 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,17 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 1.2.24 +========================= + -- In sched/wiki and sched/wiki2, support non-zero UPDATE_TIME specification + for GETNODES and GETJOBS commands. + -- Bug fix for sending accounting information multiple times for same + info. patch from Hongjia Cao (NUDT). + -- BLUEGENE - try FILE pointer rotation logic to avoid core dump on + bridge log rotate + -- Spread out in time the EPILOG_COMPLETE messages from slurmd to slurmctld + to avoid message congestions and retransmission. + * Changes in SLURM 1.2.23 ========================= -- Fix for libpmi to not export unneeded variables like xstr* @@ -2815,4 +2826,4 @@ documents those changes that are of interest to users and admins. -- Change directory to /tmp in slurmd if daemonizing. -- Logfiles are reopened on reconfigure. -$Id: NEWS 13293 2008-02-15 21:51:16Z jette $ +$Id: NEWS 13393 2008-02-27 23:07:43Z da $ diff --git a/contribs/perlapi/Makefile.am b/contribs/perlapi/Makefile.am index 6cbdfb132..00939f2da 100644 --- a/contribs/perlapi/Makefile.am +++ b/contribs/perlapi/Makefile.am @@ -24,7 +24,7 @@ $(perl_dir)/Makefile: $(perl_dir)/Makefile.PL ${LN_S} -f ../${srcdir}/$$f $$f; \ done; \ fi - @cd $(perl_dir) && $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3 + @cd $(perl_dir) && $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} # # Note on linking logic below @@ -39,7 +39,7 @@ all-local: $(perl_dir)/Makefile #libslurm if HAVE_AIX @cd $(perl_dir) && \ if [ ! -f Makefile ]; then \ - $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3; \ + $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}; \ fi && \ ($(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \ $(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \ @@ -47,7 +47,7 @@ if HAVE_AIX else @cd $(perl_dir) && \ if [ ! -f Makefile ]; then \ - $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3; \ + $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}; \ fi && \ ($(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \ $(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \ diff --git a/contribs/perlapi/Makefile.in b/contribs/perlapi/Makefile.in index fe82cbd59..c11e027f8 100644 --- a/contribs/perlapi/Makefile.in +++ b/contribs/perlapi/Makefile.in @@ -416,7 +416,7 @@ $(perl_dir)/Makefile: $(perl_dir)/Makefile.PL ${LN_S} -f ../${srcdir}/$$f $$f; \ done; \ fi - @cd $(perl_dir) && $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3 + @cd $(perl_dir) && $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} # # Note on linking logic below @@ -430,14 +430,14 @@ $(perl_dir)/Makefile: $(perl_dir)/Makefile.PL all-local: $(perl_dir)/Makefile #libslurm @HAVE_AIX_TRUE@ @cd $(perl_dir) && \ @HAVE_AIX_TRUE@ if [ ! -f Makefile ]; then \ -@HAVE_AIX_TRUE@ $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3; \ +@HAVE_AIX_TRUE@ $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}; \ @HAVE_AIX_TRUE@ fi && \ @HAVE_AIX_TRUE@ ($(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \ @HAVE_AIX_TRUE@ $(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \ @HAVE_AIX_TRUE@ cd ..; @HAVE_AIX_FALSE@ @cd $(perl_dir) && \ @HAVE_AIX_FALSE@ if [ ! -f Makefile ]; then \ -@HAVE_AIX_FALSE@ $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3; \ +@HAVE_AIX_FALSE@ $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}; \ @HAVE_AIX_FALSE@ fi && \ @HAVE_AIX_FALSE@ ($(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \ @HAVE_AIX_FALSE@ $(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \ diff --git a/slurm.spec b/slurm.spec index 5f7e05c94..3e8c10fde 100644 --- a/slurm.spec +++ b/slurm.spec @@ -1,4 +1,4 @@ -# $Id: slurm.spec 13266 2008-02-13 21:54:50Z da $ +# $Id: slurm.spec 13299 2008-02-19 19:46:58Z da $ # # Note that this package is not relocatable @@ -60,14 +60,14 @@ %endif Name: slurm -Version: 1.2.23 +Version: 1.2.24 Release: 1 Summary: Simple Linux Utility for Resource Management License: GPL Group: System Environment/Base -Source: slurm-1.2.23.tar.bz2 +Source: slurm-1.2.24.tar.bz2 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release} URL: https://computing.llnl.gov/linux/slurm/ BuildRequires: openssl-devel >= 0.9.6 openssl >= 0.9.6 @@ -112,8 +112,9 @@ BuildRequires: readline-devel # http://slforums.typo3-factory.net/index.php?showtopic=11378 %define _unpackaged_files_terminate_build 0 - -%define _perlarch %(perl -e 'use Config; $T=$Config{installsitearch}; $P=$Config{installprefix}; $T =~ s/$P//; print $T;') +# First we remove $prefix/local and then just prefix to make +# sure we get the correct installdir +%define _perlarch %(perl -e 'use Config; $T=$Config{installsitearch}; $P=$Config{installprefix}; $P1="$P/local"; $T =~ s/$P1//; $T =~ s/$P//; print $T;') %define _perldir %{_prefix}%{_perlarch} @@ -211,7 +212,7 @@ SLURM process tracking plugin for SGI job containers. ############################################################################# %prep -%setup -n slurm-1.2.23 +%setup -n slurm-1.2.24 %build %configure --program-prefix=%{?_program_prefix:%{_program_prefix}} \ @@ -289,6 +290,10 @@ test -f $RPM_BUILD_ROOT/%{_perldir}/Slurm.pm && echo "%{_perldir}/Slurm.pm" >> $LIST test -f $RPM_BUILD_ROOT/%{_perldir}/auto/Slurm/Slurm.so && echo "%{_perldir}/auto/Slurm/Slurm.so" >> $LIST +test -f $RPM_BUILD_ROOT/%{_perldir}/auto/Slurm/Slurm.bs && + echo "%{_perldir}/auto/Slurm/Slurm.bs" >> $LIST +test -f $RPM_BUILD_ROOT/%{_perldir}/auto/Slurm/autosplit.ix && + echo "%{_perldir}/auto/Slurm/autosplit.ix" >> $LIST LIST=./torque.files touch $LIST diff --git a/src/api/step_launch.c b/src/api/step_launch.c index d700e76f7..c018fabe6 100644 --- a/src/api/step_launch.c +++ b/src/api/step_launch.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * step_launch.c - launch a parallel job step * - * $Id: step_launch.c 10920 2007-02-02 03:01:14Z morrone $ + * $Id: step_launch.c 13373 2008-02-27 16:47:13Z jette $ ***************************************************************************** * Copyright (C) 2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -678,6 +678,7 @@ _node_fail_handler(struct step_launch_state *sls, slurm_msg_t *fail_msg) int i, j; int node_id, num_tasks; + error("Node failure on %s", nf->nodelist); fail_nodes = hostset_create(nf->nodelist); fail_itr = hostset_iterator_create(fail_nodes); num_node_ids = hostset_count(fail_nodes); diff --git a/src/plugins/jobacct/common/common_slurmctld.c b/src/plugins/jobacct/common/common_slurmctld.c index 22a442d0f..872a756ca 100644 --- a/src/plugins/jobacct/common/common_slurmctld.c +++ b/src/plugins/jobacct/common/common_slurmctld.c @@ -53,7 +53,7 @@ const char *_jobstep_format = "%d " "%u " /* stepid */ "%d " /* completion status */ -"%d " /* completion code */ +"%u " /* completion code */ "%u " /* nprocs */ "%u " /* number of cpus */ "%u " /* elapsed seconds */ @@ -96,7 +96,7 @@ const char *_jobstep_format = "%u " /* max pages node */ "%u " /* min cpu node */ "%s " /* account */ -"%d"; /* requester user id */ +"%u"; /* requester user id */ /* * Print the record to the log file. @@ -255,7 +255,7 @@ extern int common_job_complete_slurmctld(struct job_record *job_ptr) } /* leave the requid as a %d since we want to see if it is -1 in sacct */ - snprintf(buf, BUFFER_SIZE, "%d %u %d %d", + snprintf(buf, BUFFER_SIZE, "%d %u %d %u", JOB_TERMINATED, (int) (job_ptr->end_time - job_ptr->start_time), job_ptr->job_state & (~JOB_COMPLETING), diff --git a/src/plugins/sched/wiki/get_jobs.c b/src/plugins/sched/wiki/get_jobs.c index 6c6fd1f62..7da06dc9d 100644 --- a/src/plugins/sched/wiki/get_jobs.c +++ b/src/plugins/sched/wiki/get_jobs.c @@ -45,8 +45,8 @@ #include "src/slurmctld/locks.h" #include "src/slurmctld/slurmctld.h" -static char * _dump_all_jobs(int *job_cnt, int state_info); -static char * _dump_job(struct job_record *job_ptr, int state_info); +static char * _dump_all_jobs(int *job_cnt, time_t update_time); +static char * _dump_job(struct job_record *job_ptr, time_t update_time); static char * _get_group_name(gid_t gid); static uint16_t _get_job_cpus_per_task(struct job_record *job_ptr); static uint32_t _get_job_end_time(struct job_record *job_ptr); @@ -62,11 +62,6 @@ static uint32_t _get_job_time_limit(struct job_record *job_ptr); static int _hidden_job(struct job_record *job_ptr); static char * _task_list(struct job_record *job_ptr); - -#define SLURM_INFO_ALL 0 -#define SLURM_INFO_VOLITILE 1 -#define SLURM_INFO_STATE 2 - /* * get_jobs - get information on specific job(s) changed since some time * cmd_ptr IN - CMD=GETJOBS ARG=[<UPDATETIME>:<JOBID>[:<JOBID>]...] @@ -102,7 +97,7 @@ extern int get_jobs(char *cmd_ptr, int *err_code, char **err_msg) /* Locks: read job, partition */ slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, READ_LOCK }; - int job_rec_cnt = 0, buf_size = 0, state_info; + int job_rec_cnt = 0, buf_size = 0; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { @@ -126,16 +121,9 @@ extern int get_jobs(char *cmd_ptr, int *err_code, char **err_msg) } tmp_char++; lock_slurmctld(job_read_lock); - if (update_time == 0) - state_info = SLURM_INFO_ALL; - else if (update_time > last_job_update) - state_info = SLURM_INFO_STATE; - else - state_info = SLURM_INFO_VOLITILE; - if (strncmp(tmp_char, "ALL", 3) == 0) { /* report all jobs */ - buf = _dump_all_jobs(&job_rec_cnt, state_info); + buf = _dump_all_jobs(&job_rec_cnt, update_time); } else { struct job_record *job_ptr; char *job_name, *tmp2_char; @@ -145,7 +133,7 @@ extern int get_jobs(char *cmd_ptr, int *err_code, char **err_msg) while (job_name) { job_id = (uint32_t) strtoul(job_name, NULL, 10); job_ptr = find_job_record(job_id); - tmp_buf = _dump_job(job_ptr, state_info); + tmp_buf = _dump_job(job_ptr, update_time); if (job_rec_cnt > 0) xstrcat(buf, "#"); xstrcat(buf, tmp_buf); @@ -180,7 +168,7 @@ static int _hidden_job(struct job_record *job_ptr) return 0; } -static char * _dump_all_jobs(int *job_cnt, int state_info) +static char * _dump_all_jobs(int *job_cnt, time_t update_time) { int cnt = 0; struct job_record *job_ptr; @@ -191,7 +179,7 @@ static char * _dump_all_jobs(int *job_cnt, int state_info) while ((job_ptr = (struct job_record *) list_next(job_iterator))) { if (_hidden_job(job_ptr)) continue; - tmp_buf = _dump_job(job_ptr, state_info); + tmp_buf = _dump_job(job_ptr, update_time); if (cnt > 0) xstrcat(buf, "#"); xstrcat(buf, tmp_buf); @@ -202,7 +190,7 @@ static char * _dump_all_jobs(int *job_cnt, int state_info) return buf; } -static char * _dump_job(struct job_record *job_ptr, int state_info) +static char * _dump_job(struct job_record *job_ptr, time_t update_time) { char tmp[16384], *buf = NULL; uint32_t end_time, suspend_time; @@ -210,15 +198,13 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) if (!job_ptr) return NULL; - /* SLURM_INFO_STATE or SLURM_INFO_VOLITILE or SLURM_INFO_ALL */ snprintf(tmp, sizeof(tmp), "%u:STATE=%s;", job_ptr->job_id, _get_job_state(job_ptr)); xstrcat(buf, tmp); - if (state_info == SLURM_INFO_STATE) + if (update_time > last_job_update) return buf; - /* SLURM_INFO_VOLITILE or SLURM_INFO_ALL */ if ((job_ptr->job_state == JOB_PENDING) && (job_ptr->details) && (job_ptr->details->req_nodes) @@ -318,10 +304,10 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) xstrcat(buf,tmp); } - if (state_info == SLURM_INFO_VOLITILE) + if (job_ptr->details && + (update_time > job_ptr->details->submit_time)) return buf; - /* SLURM_INFO_ALL only */ snprintf(tmp, sizeof(tmp), "UNAME=%s;GNAME=%s;", uid_to_string((uid_t) job_ptr->user_id), diff --git a/src/plugins/sched/wiki/get_nodes.c b/src/plugins/sched/wiki/get_nodes.c index 1177bbe7e..8569e9580 100644 --- a/src/plugins/sched/wiki/get_nodes.c +++ b/src/plugins/sched/wiki/get_nodes.c @@ -39,14 +39,10 @@ #include "src/slurmctld/locks.h" #include "src/slurmctld/slurmctld.h" -static char * _dump_all_nodes(int *node_cnt, int state_info); -static char * _dump_node(struct node_record *node_ptr, int state_info); +static char * _dump_all_nodes(int *node_cnt, time_t update_time); +static char * _dump_node(struct node_record *node_ptr, time_t update_time); static char * _get_node_state(struct node_record *node_ptr); -#define SLURM_INFO_ALL 0 -#define SLURM_INFO_VOLITILE 1 -#define SLURM_INFO_STATE 2 - /* * get_nodes - get information on specific node(s) changed since some time * cmd_ptr IN - CMD=GETNODES ARG=[<UPDATETIME>:<NODEID>[:<NODEID>]...] @@ -66,7 +62,7 @@ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) /* Locks: read node, read partition */ slurmctld_lock_t node_read_lock = { NO_LOCK, NO_LOCK, READ_LOCK, READ_LOCK }; - int node_rec_cnt = 0, buf_size = 0, state_info; + int node_rec_cnt = 0, buf_size = 0; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { @@ -84,16 +80,9 @@ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) } tmp_char++; lock_slurmctld(node_read_lock); - if (update_time == 0) - state_info = SLURM_INFO_ALL; - else if (update_time > last_node_update) - state_info = SLURM_INFO_STATE; - else - state_info = SLURM_INFO_VOLITILE; - if (strncmp(tmp_char, "ALL", 3) == 0) { /* report all nodes */ - buf = _dump_all_nodes(&node_rec_cnt, state_info); + buf = _dump_all_nodes(&node_rec_cnt, update_time); } else { struct node_record *node_ptr; char *node_name, *tmp2_char; @@ -101,7 +90,7 @@ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) node_name = strtok_r(tmp_char, ":", &tmp2_char); while (node_name) { node_ptr = find_node_record(node_name); - tmp_buf = _dump_node(node_ptr, state_info); + tmp_buf = _dump_node(node_ptr, update_time); if (node_rec_cnt > 0) xstrcat(buf, "#"); xstrcat(buf, tmp_buf); @@ -123,7 +112,7 @@ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) return 0; } -static char * _dump_all_nodes(int *node_cnt, int state_info) +static char * _dump_all_nodes(int *node_cnt, time_t update_time) { int i, cnt = 0; struct node_record *node_ptr = node_record_table_ptr; @@ -132,7 +121,7 @@ static char * _dump_all_nodes(int *node_cnt, int state_info) for (i=0; i<node_record_count; i++, node_ptr++) { if (node_ptr->name == NULL) continue; - tmp_buf = _dump_node(node_ptr, state_info); + tmp_buf = _dump_node(node_ptr, update_time); if (cnt > 0) xstrcat(buf, "#"); xstrcat(buf, tmp_buf); @@ -143,7 +132,7 @@ static char * _dump_all_nodes(int *node_cnt, int state_info) return buf; } -static char * _dump_node(struct node_record *node_ptr, int state_info) +static char * _dump_node(struct node_record *node_ptr, time_t update_time) { char tmp[512], *buf = NULL; int i; @@ -151,17 +140,14 @@ static char * _dump_node(struct node_record *node_ptr, int state_info) if (!node_ptr) return NULL; - /* SLURM_INFO_STATE or SLURM_INFO_VOLITILE or SLURM_INFO_ALL */ snprintf(tmp, sizeof(tmp), "%s:STATE=%s;", node_ptr->name, _get_node_state(node_ptr)); xstrcat(buf, tmp); - if ((state_info == SLURM_INFO_STATE) || - (state_info == SLURM_INFO_VOLITILE)) + if (update_time > 0) return buf; - /* SLURM_INFO_ALL only */ if (slurmctld_conf.fast_schedule) { /* config from slurm.conf */ snprintf(tmp, sizeof(tmp), diff --git a/src/plugins/sched/wiki2/get_jobs.c b/src/plugins/sched/wiki2/get_jobs.c index d05a99036..824bbdac5 100644 --- a/src/plugins/sched/wiki2/get_jobs.c +++ b/src/plugins/sched/wiki2/get_jobs.c @@ -45,8 +45,8 @@ #include "src/common/uid.h" #include "src/slurmctld/locks.h" -static char * _dump_all_jobs(int *job_cnt, int state_info); -static char * _dump_job(struct job_record *job_ptr, int state_info); +static char * _dump_all_jobs(int *job_cnt, time_t update_time); +static char * _dump_job(struct job_record *job_ptr, time_t update_time); static char * _get_group_name(gid_t gid); static void _get_job_comment(struct job_record *job_ptr, char *buffer, int buf_size); @@ -63,10 +63,6 @@ static uint32_t _get_job_tasks(struct job_record *job_ptr); static uint32_t _get_job_time_limit(struct job_record *job_ptr); static int _hidden_job(struct job_record *job_ptr); -#define SLURM_INFO_ALL 0 -#define SLURM_INFO_VOLITILE 1 -#define SLURM_INFO_STATE 2 - static uint32_t cr_enabled = 0, cr_test = 0; /* @@ -112,7 +108,7 @@ extern int get_jobs(char *cmd_ptr, int *err_code, char **err_msg) /* Locks: read job, partition */ slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, READ_LOCK }; - int job_rec_cnt = 0, buf_size = 0, state_info; + int job_rec_cnt = 0, buf_size = 0; if (cr_test == 0) { select_g_get_info_from_plugin(SELECT_CR_PLUGIN, @@ -142,16 +138,9 @@ extern int get_jobs(char *cmd_ptr, int *err_code, char **err_msg) } tmp_char++; lock_slurmctld(job_read_lock); - if (update_time == 0) - state_info = SLURM_INFO_ALL; - else if (update_time > last_job_update) - state_info = SLURM_INFO_STATE; - else - state_info = SLURM_INFO_VOLITILE; - if (strncmp(tmp_char, "ALL", 3) == 0) { /* report all jobs */ - buf = _dump_all_jobs(&job_rec_cnt, state_info); + buf = _dump_all_jobs(&job_rec_cnt, update_time); } else { struct job_record *job_ptr = NULL; char *job_name = NULL, *tmp2_char = NULL; @@ -161,7 +150,7 @@ extern int get_jobs(char *cmd_ptr, int *err_code, char **err_msg) while (job_name) { job_id = (uint32_t) strtoul(job_name, NULL, 10); job_ptr = find_job_record(job_id); - tmp_buf = _dump_job(job_ptr, state_info); + tmp_buf = _dump_job(job_ptr, update_time); if (job_rec_cnt > 0) xstrcat(buf, "#"); xstrcat(buf, tmp_buf); @@ -201,7 +190,7 @@ static int _hidden_job(struct job_record *job_ptr) return 0; } -static char * _dump_all_jobs(int *job_cnt, int state_info) +static char * _dump_all_jobs(int *job_cnt, time_t update_time) { int cnt = 0; struct job_record *job_ptr; @@ -212,7 +201,7 @@ static char * _dump_all_jobs(int *job_cnt, int state_info) while ((job_ptr = (struct job_record *) list_next(job_iterator))) { if (_hidden_job(job_ptr)) continue; - tmp_buf = _dump_job(job_ptr, state_info); + tmp_buf = _dump_job(job_ptr, update_time); if (cnt > 0) xstrcat(buf, "#"); xstrcat(buf, tmp_buf); @@ -223,7 +212,7 @@ static char * _dump_all_jobs(int *job_cnt, int state_info) return buf; } -static char * _dump_job(struct job_record *job_ptr, int state_info) +static char * _dump_job(struct job_record *job_ptr, time_t update_time) { char tmp[16384], *buf = NULL; uint32_t end_time, suspend_time; @@ -231,15 +220,13 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) if (!job_ptr) return NULL; - /* SLURM_INFO_STATE or SLURM_INFO_VOLITILE or SLURM_INFO_ALL */ snprintf(tmp, sizeof(tmp), "%u:STATE=%s;", job_ptr->job_id, _get_job_state(job_ptr)); xstrcat(buf, tmp); - if (state_info == SLURM_INFO_STATE) + if (update_time > last_job_update) return buf; - /* SLURM_INFO_VOLITILE or SLURM_INFO_ALL */ if (job_ptr->job_state == JOB_PENDING) { char *req_features = _get_job_features(job_ptr); if (req_features) { @@ -337,20 +324,20 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) xstrcat(buf, tmp); } - if (state_info == SLURM_INFO_VOLITILE) + snprintf(tmp, sizeof(tmp), + "NAME=\"%s\";", job_ptr->name); + xstrcat(buf, tmp); + + if (job_ptr->details && + (update_time > job_ptr->details->submit_time)) return buf; - /* SLURM_INFO_ALL only */ snprintf(tmp, sizeof(tmp), "UNAME=%s;GNAME=%s;", uid_to_string((uid_t) job_ptr->user_id), _get_group_name(job_ptr->group_id)); xstrcat(buf, tmp); - snprintf(tmp, sizeof(tmp), - "NAME=\"%s\";", job_ptr->name); - xstrcat(buf, tmp); - return buf; } diff --git a/src/plugins/sched/wiki2/get_nodes.c b/src/plugins/sched/wiki2/get_nodes.c index 44afd1a2e..ddf66db69 100644 --- a/src/plugins/sched/wiki2/get_nodes.c +++ b/src/plugins/sched/wiki2/get_nodes.c @@ -39,14 +39,10 @@ #include "src/slurmctld/locks.h" #include "src/slurmctld/slurmctld.h" -static char * _dump_all_nodes(int *node_cnt, int state_info); -static char * _dump_node(struct node_record *node_ptr, int state_info); +static char * _dump_all_nodes(int *node_cnt, time_t update_time); +static char * _dump_node(struct node_record *node_ptr, time_t update_time); static char * _get_node_state(struct node_record *node_ptr); -#define SLURM_INFO_ALL 0 -#define SLURM_INFO_VOLITILE 1 -#define SLURM_INFO_STATE 2 - /* * get_nodes - get information on specific node(s) changed since some time * cmd_ptr IN - CMD=GETNODES ARG=[<UPDATETIME>:<NODEID>[:<NODEID>]...] @@ -73,7 +69,7 @@ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) /* Locks: read node, read partition */ slurmctld_lock_t node_read_lock = { NO_LOCK, NO_LOCK, READ_LOCK, READ_LOCK }; - int node_rec_cnt = 0, buf_size = 0, state_info; + int node_rec_cnt = 0, buf_size = 0; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { @@ -91,16 +87,9 @@ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) } tmp_char++; lock_slurmctld(node_read_lock); - if (update_time == 0) - state_info = SLURM_INFO_ALL; - else if (update_time > last_node_update) - state_info = SLURM_INFO_STATE; - else - state_info = SLURM_INFO_VOLITILE; - if (strncmp(tmp_char, "ALL", 3) == 0) { /* report all nodes */ - buf = _dump_all_nodes(&node_rec_cnt, state_info); + buf = _dump_all_nodes(&node_rec_cnt, update_time); } else { struct node_record *node_ptr = NULL; char *node_name = NULL, *tmp2_char = NULL; @@ -108,7 +97,7 @@ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) node_name = strtok_r(tmp_char, ":", &tmp2_char); while (node_name) { node_ptr = find_node_record(node_name); - tmp_buf = _dump_node(node_ptr, state_info); + tmp_buf = _dump_node(node_ptr, update_time); if (node_rec_cnt > 0) xstrcat(buf, "#"); xstrcat(buf, tmp_buf); @@ -130,7 +119,7 @@ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) return 0; } -static char * _dump_all_nodes(int *node_cnt, int state_info) +static char * _dump_all_nodes(int *node_cnt, time_t update_time) { int i, cnt = 0; struct node_record *node_ptr = node_record_table_ptr; @@ -139,7 +128,7 @@ static char * _dump_all_nodes(int *node_cnt, int state_info) for (i=0; i<node_record_count; i++, node_ptr++) { if (node_ptr->name == NULL) continue; - tmp_buf = _dump_node(node_ptr, state_info); + tmp_buf = _dump_node(node_ptr, update_time); if (cnt > 0) xstrcat(buf, "#"); xstrcat(buf, tmp_buf); @@ -150,7 +139,7 @@ static char * _dump_all_nodes(int *node_cnt, int state_info) return buf; } -static char * _dump_node(struct node_record *node_ptr, int state_info) +static char * _dump_node(struct node_record *node_ptr, time_t update_time) { char tmp[512], *buf = NULL; int i; @@ -159,7 +148,6 @@ static char * _dump_node(struct node_record *node_ptr, int state_info) if (!node_ptr) return NULL; - /* SLURM_INFO_STATE or SLURM_INFO_VOLITILE or SLURM_INFO_ALL */ snprintf(tmp, sizeof(tmp), "%s:STATE=%s;", node_ptr->name, _get_node_state(node_ptr)); @@ -169,11 +157,9 @@ static char * _dump_node(struct node_record *node_ptr, int state_info) xstrcat(buf, tmp); } - if (state_info == SLURM_INFO_STATE) + if (update_time > last_node_update) return buf; - - /* SLURM_INFO_VOLITILE or SLURM_INFO_ALL */ if (slurmctld_conf.fast_schedule) { /* config from slurm.conf */ cpu_cnt = node_ptr->config_ptr->cpus; @@ -192,11 +178,9 @@ static char * _dump_node(struct node_record *node_ptr, int state_info) if (i > 0) xstrcat(buf, ";"); - if (state_info == SLURM_INFO_VOLITILE) + if (update_time > 0) return buf; - - /* SLURM_INFO_ALL only */ if (slurmctld_conf.fast_schedule) { /* config from slurm.conf */ snprintf(tmp, sizeof(tmp), diff --git a/src/plugins/select/bluegene/block_allocator/bridge_linker.c b/src/plugins/select/bluegene/block_allocator/bridge_linker.c index d86631b7b..f2a0428b4 100644 --- a/src/plugins/select/bluegene/block_allocator/bridge_linker.c +++ b/src/plugins/select/bluegene/block_allocator/bridge_linker.c @@ -577,14 +577,16 @@ extern status_t bridge_destroy_block(pm_partition_id_t pid) extern int bridge_set_log_params(char *api_file_name, unsigned int level) { static FILE *fp = NULL; + FILE *fp2 = NULL; int rc = SLURM_SUCCESS; if(!bridge_init()) return SLURM_ERROR; slurm_mutex_lock(&api_file_mutex); - if(fp) - fclose(fp); + if(fp) + fp2 = fp; + fp = fopen(api_file_name, "a"); if (fp == NULL) { @@ -596,6 +598,8 @@ extern int bridge_set_log_params(char *api_file_name, unsigned int level) (*(bridge_api.set_log_params))(fp, level); + if(fp2) + fclose(fp2); end_it: slurm_mutex_unlock(&api_file_mutex); return rc; diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index ecb3aa3c6..a246aefac 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -2,7 +2,7 @@ * select_cons_res.c - node selection plugin supporting consumable * resources policies. * - * $Id: select_cons_res.c 12649 2007-11-15 18:02:35Z da $ + * $Id: select_cons_res.c 13373 2008-02-27 16:47:13Z jette $ *****************************************************************************\ * * The following example below illustrates how four jobs are allocated @@ -1309,6 +1309,8 @@ extern int select_p_state_restore(char *dir_name) safe_unpack16(&restore_plugin_crtype, buffer); safe_unpack32(&restore_pstate_version, buffer); + if (restore_plugin_type == NULL) + goto unpack_error; if ((strcmp(restore_plugin_type, plugin_type) != 0) || (restore_plugin_version != plugin_version) || (restore_plugin_crtype != cr_type) || diff --git a/src/sacct/process.c b/src/sacct/process.c index 45032cae7..d09346eff 100644 --- a/src/sacct/process.c +++ b/src/sacct/process.c @@ -419,12 +419,12 @@ void process_step(char *f[], int lc, int show_full, int len) got_step: - + if ( job->exitcode == 0 ) + job->exitcode = step->exitcode; + if (job->job_terminated_seen == 0) { /* If the job is still running, this is the most recent status */ - if ( job->exitcode == 0 ) - job->exitcode = step->exitcode; job->status = JOB_RUNNING; job->elapsed = step->header.timestamp - job->header.timestamp; } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 923670c92..9397d9e10 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -3,7 +3,7 @@ * Note: there is a global job list (job_list), time stamp * (last_job_update), and hash table (job_hash) * - * $Id: job_mgr.c 13176 2008-02-04 16:56:57Z jette $ + * $Id: job_mgr.c 13373 2008-02-27 16:47:13Z jette $ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -1087,6 +1087,7 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test) } else { info("Killing job_id %u on failed node %s", job_ptr->job_id, node_name); + srun_node_fail(job_ptr->job_id, node_name); job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; job_ptr->exit_code = diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index e222d1e86..c1e6c4cbc 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * proc_req.c - process incomming messages to slurmctld * - * $Id: proc_req.c 13237 2008-02-08 23:16:16Z jette $ + * $Id: proc_req.c 13341 2008-02-25 17:20:07Z jette $ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -1401,7 +1401,7 @@ static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg) uid_t uid; START_TIMER; - debug2("Processing RPC: REQUEST_RECONFIGURE"); + info("Processing RPC: REQUEST_RECONFIGURE"); uid = g_slurm_auth_get_uid(msg->auth_cred); if (!_is_super_user(uid)) { error("Security violation, RECONFIGURE RPC from uid=%u", diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index ad5ca1b78..84242acf6 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * src/slurmd/slurmd/req.c - slurmd request handling - * $Id: req.c 11813 2007-07-11 17:03:30Z jette $ + * $Id: req.c 13326 2008-02-21 20:37:56Z jette $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -96,6 +96,7 @@ typedef struct { static int _abort_job(uint32_t job_id); static int _abort_step(uint32_t job_id, uint32_t step_id); static char ** _build_env(uint32_t jobid, uid_t uid, char *bg_part_id); +static void _delay_rpc(int host_inx, int host_cnt, int usec_per_rpc); static void _destroy_env(char **env); static bool _slurm_authorized_user(uid_t uid); static bool _job_still_running(uint32_t job_id); @@ -124,6 +125,7 @@ static int _run_epilog(uint32_t jobid, uid_t uid, char *bg_part_id); static bool _pause_for_job_completion(uint32_t jobid, char *nodes, int maxtime); +static void _sync_messages_kill(kill_job_msg_t *req); static int _waiter_init (uint32_t jobid); static int _waiter_complete (uint32_t jobid); @@ -822,7 +824,7 @@ _rpc_batch_job(slurm_msg_t *msg) rc = ESLURMD_CREDENTIAL_REVOKED; /* job already ran */ } - if (req->step_id != SLURM_BATCH_SCRIPT && req->step_id != 0) + if ((req->step_id != SLURM_BATCH_SCRIPT) && (req->step_id != 0)) first_job_run = false; /* @@ -1818,8 +1820,6 @@ _epilog_complete(uint32_t jobid, int rc) slurm_msg_t_init(&msg); - _wait_state_completed(jobid, 5); - req.job_id = jobid; req.return_code = rc; req.node_name = conf->node_name; @@ -2185,8 +2185,75 @@ _rpc_terminate_job(slurm_msg_t *msg) debug("completed epilog for jobid %u", req->job_id); done: - _epilog_complete(req->job_id, rc); + _wait_state_completed(req->job_id, 5); _waiter_complete(req->job_id); + _sync_messages_kill(req); + _epilog_complete(req->job_id, rc); +} + +/* On a parallel job, every slurmd may send the EPILOG_COMPLETE + * message to the slurmctld at the same time, resulting in lost + * messages. We add a delay here to spead out the message traffic + * assuming synchronized clocks across the cluster. + * Allow 10 msec processing time in slurmctld for each RPC. */ +static void _sync_messages_kill(kill_job_msg_t *req) +{ + int host_cnt, host_inx; + char *host; + hostset_t hosts; + + hosts = hostset_create(req->nodes); + host_cnt = hostset_count(hosts); + if (host_cnt <= 32) + goto fini; + if (conf->hostname == NULL) + goto fini; /* should never happen */ + + for (host_inx=0; host_inx<host_cnt; host_inx++) { + host = hostset_shift(hosts); + if (host == NULL) + break; + if (strcmp(host, conf->node_name) == 0) { + free(host); + break; + } + free(host); + } + _delay_rpc(host_inx, host_cnt, 10000); + + fini: hostset_destroy(hosts); +} + +/* Delay a message based upon the host index, total host count and RPC_TIME. + * This logic depends upon synchronized clocks across the cluster. */ +static void _delay_rpc(int host_inx, int host_cnt, int usec_per_rpc) +{ + struct timeval tv1; + uint32_t cur_time; /* current time in usec (just 9 digits) */ + uint32_t tot_time; /* total time expected for all RPCs */ + uint32_t offset_time; /* relative time within tot_time */ + uint32_t target_time; /* desired time to issue the RPC */ + uint32_t delta_time; + +again: if (gettimeofday(&tv1, NULL)) { + usleep(host_inx * usec_per_rpc); + return; + } + + cur_time = (tv1.tv_sec % 1000) + tv1.tv_usec; + tot_time = host_cnt * usec_per_rpc; + offset_time = cur_time % tot_time; + target_time = host_inx * usec_per_rpc; + if (target_time < offset_time) + delta_time = target_time - offset_time + tot_time; + else + delta_time = target_time - offset_time; + if (usleep(delta_time)) { + if (errno == EINVAL) /* usleep for more than 1 sec */ + usleep(900000); + /* errno == EINTR */ + goto again; + } } /* diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 6ac03c22d..0fc820149 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * src/slurmd/slurmstepd/mgr.c - job manager functions for slurmstepd - * $Id: mgr.c 13229 2008-02-08 01:02:06Z jette $ + * $Id: mgr.c 13322 2008-02-21 19:06:27Z da $ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -139,6 +139,7 @@ step_complete_t step_complete = { {}, -1, -1, + true, (bitstr_t *)NULL, 0, NULL @@ -478,6 +479,8 @@ _wait_for_children_slurmstepd(slurmd_job_t *job) step_complete.step_rc = MAX(step_complete.step_rc, WEXITSTATUS(job->task[i]->estatus)); + step_complete.wait_children = false; + pthread_mutex_unlock(&step_complete.lock); } @@ -495,6 +498,7 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last) int rc = -1; int retcode; int i; + static bool acct_sent = false; debug2("_one_step_complete_msg: first=%d, last=%d", first, last); msg.job_id = job->jobid; @@ -504,9 +508,12 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last) msg.step_rc = step_complete.step_rc; msg.jobacct = jobacct_g_alloc(NULL); /************* acct stuff ********************/ - jobacct_g_aggregate(step_complete.jobacct, job->jobacct); - jobacct_g_getinfo(step_complete.jobacct, JOBACCT_DATA_TOTAL, - msg.jobacct); + if(!acct_sent) { + jobacct_g_aggregate(step_complete.jobacct, job->jobacct); + jobacct_g_getinfo(step_complete.jobacct, JOBACCT_DATA_TOTAL, + msg.jobacct); + acct_sent = true; + } /*********************************************/ slurm_msg_t_init(&req); req.msg_type = REQUEST_STEP_COMPLETE; diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index dacf1123d..fc744d250 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * src/slurmd/slurmstepd/req.c - slurmstepd domain socket request handling - * $Id: req.c 11856 2007-07-19 02:36:24Z morrone $ + * $Id: req.c 13322 2008-02-21 19:06:27Z da $ ***************************************************************************** * Copyright (C) 2005 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -1056,6 +1056,12 @@ _handle_completion(int fd, slurmd_job_t *job, uid_t uid) * Record the completed nodes */ pthread_mutex_lock(&step_complete.lock); + if (! step_complete.wait_children) { + rc = -1; + errnum = ETIMEDOUT; /* not used anyway */ + goto timeout; + } + /* debug2("Setting range %d(bit %d) through %d(bit %d)", */ /* first, first-(step_complete.rank+1), */ /* last, last-(step_complete.rank+1)); */ @@ -1070,6 +1076,7 @@ _handle_completion(int fd, slurmd_job_t *job, uid_t uid) /************* acct stuff ********************/ jobacct_g_aggregate(step_complete.jobacct, jobacct); +timeout: jobacct_g_free(jobacct); /*********************************************/ diff --git a/src/slurmd/slurmstepd/slurmstepd.h b/src/slurmd/slurmstepd/slurmstepd.h index d5bfee041..2ef0cc40f 100644 --- a/src/slurmd/slurmstepd/slurmstepd.h +++ b/src/slurmd/slurmstepd/slurmstepd.h @@ -1,6 +1,6 @@ /*****************************************************************************\ * src/slurmd/slurmstepd/slurmstepd.h - slurmstepd general header file - * $Id: slurmstepd.h 11590 2007-05-25 18:52:33Z da $ + * $Id: slurmstepd.h 13322 2008-02-21 19:06:27Z da $ ***************************************************************************** * Copyright (C) 2005 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -55,6 +55,7 @@ typedef struct { slurm_addr parent_addr; int children; int max_depth; + bool wait_children; bitstr_t *bits; int step_rc; jobacctinfo_t *jobacct; -- GitLab