From a1a037e17405ae0c9551278b4059215e3c1fe588 Mon Sep 17 00:00:00 2001
From: Mehdi Dogguy <mehdi@debian.org>
Date: Mon, 8 Sep 2014 21:31:03 +0200
Subject: [PATCH] Imported Upstream version 1.2.24

---
 META                                          |  4 +-
 NEWS                                          | 13 +++-
 contribs/perlapi/Makefile.am                  |  6 +-
 contribs/perlapi/Makefile.in                  |  6 +-
 slurm.spec                                    | 17 ++--
 src/api/step_launch.c                         |  3 +-
 src/plugins/jobacct/common/common_slurmctld.c |  6 +-
 src/plugins/sched/wiki/get_jobs.c             | 36 +++------
 src/plugins/sched/wiki/get_nodes.c            | 32 +++-----
 src/plugins/sched/wiki2/get_jobs.c            | 43 ++++-------
 src/plugins/sched/wiki2/get_nodes.c           | 36 +++------
 .../bluegene/block_allocator/bridge_linker.c  |  8 +-
 src/plugins/select/cons_res/select_cons_res.c |  4 +-
 src/sacct/process.c                           |  6 +-
 src/slurmctld/job_mgr.c                       |  3 +-
 src/slurmctld/proc_req.c                      |  4 +-
 src/slurmd/slurmd/req.c                       | 77 +++++++++++++++++--
 src/slurmd/slurmstepd/mgr.c                   | 15 +++-
 src/slurmd/slurmstepd/req.c                   |  9 ++-
 src/slurmd/slurmstepd/slurmstepd.h            |  3 +-
 20 files changed, 190 insertions(+), 141 deletions(-)

diff --git a/META b/META
index 60cb66771..55d315864 100644
--- a/META
+++ b/META
@@ -3,9 +3,9 @@
   Api_revision:  0
   Major:         1
   Meta:          1
-  Micro:         23
+  Micro:         24
   Minor:         2
   Name:          slurm
   Release:       1
   Release_tags:  
-  Version:       1.2.23
+  Version:       1.2.24
diff --git a/NEWS b/NEWS
index 2cb6a555b..195b78d72 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,17 @@
 This file describes changes in recent versions of SLURM. It primarily
 documents those changes that are of interest to users and admins.
 
+* Changes in SLURM 1.2.24
+=========================
+ -- In sched/wiki and sched/wiki2, support non-zero UPDATE_TIME specification
+    for GETNODES and GETJOBS commands.
+ -- Bug fix for sending the same accounting information multiple times.
+    Patch from Hongjia Cao (NUDT).
+ -- BLUEGENE - try FILE pointer rotation logic to avoid core dump on 
+    bridge log rotate
+ -- Spread out in time the EPILOG_COMPLETE messages from slurmd to slurmctld
+    to avoid message congestion and retransmission.
+
 * Changes in SLURM 1.2.23
 =========================
  -- Fix for libpmi to not export unneeded variables like xstr*
@@ -2815,4 +2826,4 @@ documents those changes that are of interest to users and admins.
  -- Change directory to /tmp in slurmd if daemonizing.
  -- Logfiles are reopened on reconfigure.
  
-$Id: NEWS 13293 2008-02-15 21:51:16Z jette $
+$Id: NEWS 13393 2008-02-27 23:07:43Z da $
diff --git a/contribs/perlapi/Makefile.am b/contribs/perlapi/Makefile.am
index 6cbdfb132..00939f2da 100644
--- a/contribs/perlapi/Makefile.am
+++ b/contribs/perlapi/Makefile.am
@@ -24,7 +24,7 @@ $(perl_dir)/Makefile:	$(perl_dir)/Makefile.PL
 			${LN_S} -f ../${srcdir}/$$f $$f; \
 		done; \
 	fi
-	@cd $(perl_dir) && $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3
+	@cd $(perl_dir) && $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}
 
 #
 # Note on linking logic below
@@ -39,7 +39,7 @@ all-local: $(perl_dir)/Makefile #libslurm
 if HAVE_AIX
 	@cd $(perl_dir) && \
 	if [ ! -f Makefile ]; then \
-		$(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3; \
+		$(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}; \
 	fi && \
 	($(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \
 	 $(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \
@@ -47,7 +47,7 @@ if HAVE_AIX
 else
 	@cd $(perl_dir) && \
 	if [ ! -f Makefile ]; then \
-		$(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3; \
+		$(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}; \
 	fi && \
 	($(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \
 	 $(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \
diff --git a/contribs/perlapi/Makefile.in b/contribs/perlapi/Makefile.in
index fe82cbd59..c11e027f8 100644
--- a/contribs/perlapi/Makefile.in
+++ b/contribs/perlapi/Makefile.in
@@ -416,7 +416,7 @@ $(perl_dir)/Makefile:	$(perl_dir)/Makefile.PL
 			${LN_S} -f ../${srcdir}/$$f $$f; \
 		done; \
 	fi
-	@cd $(perl_dir) && $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3
+	@cd $(perl_dir) && $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}
 
 #
 # Note on linking logic below
@@ -430,14 +430,14 @@ $(perl_dir)/Makefile:	$(perl_dir)/Makefile.PL
 all-local: $(perl_dir)/Makefile #libslurm
 @HAVE_AIX_TRUE@	@cd $(perl_dir) && \
 @HAVE_AIX_TRUE@	if [ ! -f Makefile ]; then \
-@HAVE_AIX_TRUE@		$(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3; \
+@HAVE_AIX_TRUE@		$(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}; \
 @HAVE_AIX_TRUE@	fi && \
 @HAVE_AIX_TRUE@	($(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \
 @HAVE_AIX_TRUE@	 $(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \
 @HAVE_AIX_TRUE@	cd ..;
 @HAVE_AIX_FALSE@	@cd $(perl_dir) && \
 @HAVE_AIX_FALSE@	if [ ! -f Makefile ]; then \
-@HAVE_AIX_FALSE@		$(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix} INSTALLDIRS=perl LIB=@libdir@/perl5 installman1dir=@mandir@/man1 installman3dir=@mandir@/man3; \
+@HAVE_AIX_FALSE@		$(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}; \
 @HAVE_AIX_FALSE@	fi && \
 @HAVE_AIX_FALSE@	($(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \
 @HAVE_AIX_FALSE@	 $(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \
diff --git a/slurm.spec b/slurm.spec
index 5f7e05c94..3e8c10fde 100644
--- a/slurm.spec
+++ b/slurm.spec
@@ -1,4 +1,4 @@
-# $Id: slurm.spec 13266 2008-02-13 21:54:50Z da $
+# $Id: slurm.spec 13299 2008-02-19 19:46:58Z da $
 #
 # Note that this package is not relocatable
 
@@ -60,14 +60,14 @@
 %endif
 
 Name:    slurm
-Version: 1.2.23
+Version: 1.2.24
 Release: 1
 
 Summary: Simple Linux Utility for Resource Management
 
 License: GPL 
 Group: System Environment/Base
-Source: slurm-1.2.23.tar.bz2
+Source: slurm-1.2.24.tar.bz2
 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}
 URL: https://computing.llnl.gov/linux/slurm/
 BuildRequires: openssl-devel >= 0.9.6 openssl >= 0.9.6
@@ -112,8 +112,9 @@ BuildRequires: readline-devel
 # http://slforums.typo3-factory.net/index.php?showtopic=11378
 %define _unpackaged_files_terminate_build      0
 
-
-%define _perlarch %(perl -e 'use Config; $T=$Config{installsitearch}; $P=$Config{installprefix}; $T =~ s/$P//; print $T;') 
+# First strip $prefix/local, then plain $prefix, from installsitearch to
+# make sure we get the correct install directory.
+%define _perlarch %(perl -e 'use Config; $T=$Config{installsitearch}; $P=$Config{installprefix}; $P1="$P/local"; $T =~ s/$P1//; $T =~ s/$P//; print $T;') 
 
 %define _perldir %{_prefix}%{_perlarch}
 
@@ -211,7 +212,7 @@ SLURM process tracking plugin for SGI job containers.
 #############################################################################
 
 %prep
-%setup -n slurm-1.2.23
+%setup -n slurm-1.2.24
 
 %build
 %configure --program-prefix=%{?_program_prefix:%{_program_prefix}} \
@@ -289,6 +290,10 @@ test -f $RPM_BUILD_ROOT/%{_perldir}/Slurm.pm &&
   echo "%{_perldir}/Slurm.pm"                 >> $LIST
 test -f $RPM_BUILD_ROOT/%{_perldir}/auto/Slurm/Slurm.so &&
   echo "%{_perldir}/auto/Slurm/Slurm.so"      >> $LIST
+test -f $RPM_BUILD_ROOT/%{_perldir}/auto/Slurm/Slurm.bs &&
+  echo "%{_perldir}/auto/Slurm/Slurm.bs"      >> $LIST
+test -f $RPM_BUILD_ROOT/%{_perldir}/auto/Slurm/autosplit.ix &&
+  echo "%{_perldir}/auto/Slurm/autosplit.ix"      >> $LIST
 
 LIST=./torque.files
 touch $LIST
diff --git a/src/api/step_launch.c b/src/api/step_launch.c
index d700e76f7..c018fabe6 100644
--- a/src/api/step_launch.c
+++ b/src/api/step_launch.c
@@ -1,7 +1,7 @@
 /*****************************************************************************\
  *  step_launch.c - launch a parallel job step
  *
- *  $Id: step_launch.c 10920 2007-02-02 03:01:14Z morrone $
+ *  $Id: step_launch.c 13373 2008-02-27 16:47:13Z jette $
  *****************************************************************************
  *  Copyright (C) 2006 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
@@ -678,6 +678,7 @@ _node_fail_handler(struct step_launch_state *sls, slurm_msg_t *fail_msg)
 	int i, j;
 	int node_id, num_tasks;
 
+	error("Node failure on %s", nf->nodelist);
 	fail_nodes = hostset_create(nf->nodelist);
 	fail_itr = hostset_iterator_create(fail_nodes);
 	num_node_ids = hostset_count(fail_nodes);
diff --git a/src/plugins/jobacct/common/common_slurmctld.c b/src/plugins/jobacct/common/common_slurmctld.c
index 22a442d0f..872a756ca 100644
--- a/src/plugins/jobacct/common/common_slurmctld.c
+++ b/src/plugins/jobacct/common/common_slurmctld.c
@@ -53,7 +53,7 @@ const char *_jobstep_format =
 "%d "
 "%u "	/* stepid */
 "%d "	/* completion status */
-"%d "	/* completion code */
+"%u "	/* completion code */
 "%u "	/* nprocs */
 "%u "	/* number of cpus */
 "%u "	/* elapsed seconds */
@@ -96,7 +96,7 @@ const char *_jobstep_format =
 "%u "	/* max pages node */
 "%u "	/* min cpu node */
 "%s "   /* account */
-"%d";   /* requester user id */
+"%u";   /* requester user id */
 
 /*
  * Print the record to the log file.
@@ -255,7 +255,7 @@ extern int common_job_complete_slurmctld(struct job_record *job_ptr)
 	}
 	/* leave the requid as a %d since we want to see if it is -1
 	   in sacct */
-	snprintf(buf, BUFFER_SIZE, "%d %u %d %d",
+	snprintf(buf, BUFFER_SIZE, "%d %u %d %u",
 		 JOB_TERMINATED,
 		 (int) (job_ptr->end_time - job_ptr->start_time),
 		 job_ptr->job_state & (~JOB_COMPLETING),
diff --git a/src/plugins/sched/wiki/get_jobs.c b/src/plugins/sched/wiki/get_jobs.c
index 6c6fd1f62..7da06dc9d 100644
--- a/src/plugins/sched/wiki/get_jobs.c
+++ b/src/plugins/sched/wiki/get_jobs.c
@@ -45,8 +45,8 @@
 #include "src/slurmctld/locks.h"
 #include "src/slurmctld/slurmctld.h"
 
-static char *	_dump_all_jobs(int *job_cnt, int state_info);
-static char *	_dump_job(struct job_record *job_ptr, int state_info);
+static char *	_dump_all_jobs(int *job_cnt, time_t update_time);
+static char *	_dump_job(struct job_record *job_ptr, time_t update_time);
 static char *	_get_group_name(gid_t gid);
 static uint16_t _get_job_cpus_per_task(struct job_record *job_ptr);
 static uint32_t	_get_job_end_time(struct job_record *job_ptr);
@@ -62,11 +62,6 @@ static uint32_t	_get_job_time_limit(struct job_record *job_ptr);
 static int	_hidden_job(struct job_record *job_ptr);
 static char *	_task_list(struct job_record *job_ptr);
 
-
-#define SLURM_INFO_ALL		0
-#define SLURM_INFO_VOLITILE	1
-#define SLURM_INFO_STATE	2
-
 /*
  * get_jobs - get information on specific job(s) changed since some time
  * cmd_ptr IN - CMD=GETJOBS ARG=[<UPDATETIME>:<JOBID>[:<JOBID>]...]
@@ -102,7 +97,7 @@ extern int	get_jobs(char *cmd_ptr, int *err_code, char **err_msg)
 	/* Locks: read job, partition */
 	slurmctld_lock_t job_read_lock = {
 		NO_LOCK, READ_LOCK, NO_LOCK, READ_LOCK };
-	int job_rec_cnt = 0, buf_size = 0, state_info;
+	int job_rec_cnt = 0, buf_size = 0;
 
 	arg_ptr = strstr(cmd_ptr, "ARG=");
 	if (arg_ptr == NULL) {
@@ -126,16 +121,9 @@ extern int	get_jobs(char *cmd_ptr, int *err_code, char **err_msg)
 	}
 	tmp_char++;
 	lock_slurmctld(job_read_lock);
-	if (update_time == 0)
-		state_info = SLURM_INFO_ALL;
-	else if (update_time > last_job_update)
-		state_info = SLURM_INFO_STATE;
-	else
-		state_info = SLURM_INFO_VOLITILE;
-
 	if (strncmp(tmp_char, "ALL", 3) == 0) {
 		/* report all jobs */
-		buf = _dump_all_jobs(&job_rec_cnt, state_info);
+		buf = _dump_all_jobs(&job_rec_cnt, update_time);
 	} else {
 		struct job_record *job_ptr;
 		char *job_name, *tmp2_char;
@@ -145,7 +133,7 @@ extern int	get_jobs(char *cmd_ptr, int *err_code, char **err_msg)
 		while (job_name) {
 			job_id = (uint32_t) strtoul(job_name, NULL, 10);
 			job_ptr = find_job_record(job_id);
-			tmp_buf = _dump_job(job_ptr, state_info);
+			tmp_buf = _dump_job(job_ptr, update_time);
 			if (job_rec_cnt > 0)
 				xstrcat(buf, "#");
 			xstrcat(buf, tmp_buf);
@@ -180,7 +168,7 @@ static int	_hidden_job(struct job_record *job_ptr)
 	return 0;
 }
 
-static char *   _dump_all_jobs(int *job_cnt, int state_info)
+static char *   _dump_all_jobs(int *job_cnt, time_t update_time)
 {
 	int cnt = 0;
 	struct job_record *job_ptr;
@@ -191,7 +179,7 @@ static char *   _dump_all_jobs(int *job_cnt, int state_info)
 	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
 		if (_hidden_job(job_ptr))
 			continue;
-		tmp_buf = _dump_job(job_ptr, state_info);
+		tmp_buf = _dump_job(job_ptr, update_time);
 		if (cnt > 0)
 			xstrcat(buf, "#");
 		xstrcat(buf, tmp_buf);
@@ -202,7 +190,7 @@ static char *   _dump_all_jobs(int *job_cnt, int state_info)
 	return buf;
 }
 
-static char *	_dump_job(struct job_record *job_ptr, int state_info)
+static char *	_dump_job(struct job_record *job_ptr, time_t update_time)
 {
 	char tmp[16384], *buf = NULL;
 	uint32_t end_time, suspend_time;
@@ -210,15 +198,13 @@ static char *	_dump_job(struct job_record *job_ptr, int state_info)
 	if (!job_ptr)
 		return NULL;
 
-	/* SLURM_INFO_STATE or SLURM_INFO_VOLITILE or SLURM_INFO_ALL */
 	snprintf(tmp, sizeof(tmp), "%u:STATE=%s;",
 		job_ptr->job_id, _get_job_state(job_ptr));
 	xstrcat(buf, tmp);
 
-	if (state_info == SLURM_INFO_STATE)
+	if (update_time > last_job_update)
 		return buf;
 
-	/* SLURM_INFO_VOLITILE or SLURM_INFO_ALL */
 	if ((job_ptr->job_state == JOB_PENDING)
 	&&  (job_ptr->details)
 	&&  (job_ptr->details->req_nodes)
@@ -318,10 +304,10 @@ static char *	_dump_job(struct job_record *job_ptr, int state_info)
 		xstrcat(buf,tmp);
 	}
 
-	if (state_info == SLURM_INFO_VOLITILE)
+	if (job_ptr->details &&
+	    (update_time > job_ptr->details->submit_time))
 		return buf;
 
-	/* SLURM_INFO_ALL only */
 	snprintf(tmp, sizeof(tmp),
 		"UNAME=%s;GNAME=%s;",
 		uid_to_string((uid_t) job_ptr->user_id),
diff --git a/src/plugins/sched/wiki/get_nodes.c b/src/plugins/sched/wiki/get_nodes.c
index 1177bbe7e..8569e9580 100644
--- a/src/plugins/sched/wiki/get_nodes.c
+++ b/src/plugins/sched/wiki/get_nodes.c
@@ -39,14 +39,10 @@
 #include "src/slurmctld/locks.h"
 #include "src/slurmctld/slurmctld.h"
 
-static char *	_dump_all_nodes(int *node_cnt, int state_info);
-static char *	_dump_node(struct node_record *node_ptr, int state_info);
+static char *	_dump_all_nodes(int *node_cnt, time_t update_time);
+static char *	_dump_node(struct node_record *node_ptr, time_t update_time);
 static char *	_get_node_state(struct node_record *node_ptr);
 
-#define SLURM_INFO_ALL		0
-#define SLURM_INFO_VOLITILE	1
-#define SLURM_INFO_STATE	2
-
 /*
  * get_nodes - get information on specific node(s) changed since some time
  * cmd_ptr IN - CMD=GETNODES ARG=[<UPDATETIME>:<NODEID>[:<NODEID>]...]
@@ -66,7 +62,7 @@ extern int	get_nodes(char *cmd_ptr, int *err_code, char **err_msg)
 	/* Locks: read node, read partition */
 	slurmctld_lock_t node_read_lock = {
 		NO_LOCK, NO_LOCK, READ_LOCK, READ_LOCK };
-	int node_rec_cnt = 0, buf_size = 0, state_info;
+	int node_rec_cnt = 0, buf_size = 0;
 
 	arg_ptr = strstr(cmd_ptr, "ARG=");
 	if (arg_ptr == NULL) {
@@ -84,16 +80,9 @@ extern int	get_nodes(char *cmd_ptr, int *err_code, char **err_msg)
 	}
 	tmp_char++;
 	lock_slurmctld(node_read_lock);
-	if (update_time == 0)
-		state_info = SLURM_INFO_ALL;
-	else if (update_time > last_node_update)
-		state_info = SLURM_INFO_STATE;
-	else
-		state_info = SLURM_INFO_VOLITILE;
-
 	if (strncmp(tmp_char, "ALL", 3) == 0) {
 		/* report all nodes */
-		buf = _dump_all_nodes(&node_rec_cnt, state_info);
+		buf = _dump_all_nodes(&node_rec_cnt, update_time);
 	} else {
 		struct node_record *node_ptr;
 		char *node_name, *tmp2_char;
@@ -101,7 +90,7 @@ extern int	get_nodes(char *cmd_ptr, int *err_code, char **err_msg)
 		node_name = strtok_r(tmp_char, ":", &tmp2_char);
 		while (node_name) {
 			node_ptr = find_node_record(node_name);
-			tmp_buf = _dump_node(node_ptr, state_info);
+			tmp_buf = _dump_node(node_ptr, update_time);
 			if (node_rec_cnt > 0)
 				xstrcat(buf, "#");
 			xstrcat(buf, tmp_buf);
@@ -123,7 +112,7 @@ extern int	get_nodes(char *cmd_ptr, int *err_code, char **err_msg)
 	return 0;
 }
 
-static char *	_dump_all_nodes(int *node_cnt, int state_info)
+static char *	_dump_all_nodes(int *node_cnt, time_t update_time)
 {
 	int i, cnt = 0;
 	struct node_record *node_ptr = node_record_table_ptr;
@@ -132,7 +121,7 @@ static char *	_dump_all_nodes(int *node_cnt, int state_info)
 	for (i=0; i<node_record_count; i++, node_ptr++) {
 		if (node_ptr->name == NULL)
 			continue;
-		tmp_buf = _dump_node(node_ptr, state_info);
+		tmp_buf = _dump_node(node_ptr, update_time);
 		if (cnt > 0)
 			xstrcat(buf, "#");
 		xstrcat(buf, tmp_buf);
@@ -143,7 +132,7 @@ static char *	_dump_all_nodes(int *node_cnt, int state_info)
 	return buf;
 }
 
-static char *	_dump_node(struct node_record *node_ptr, int state_info)
+static char *	_dump_node(struct node_record *node_ptr, time_t update_time)
 {
 	char tmp[512], *buf = NULL;
 	int i;
@@ -151,17 +140,14 @@ static char *	_dump_node(struct node_record *node_ptr, int state_info)
 	if (!node_ptr)
 		return NULL;
 
-	/* SLURM_INFO_STATE or SLURM_INFO_VOLITILE or SLURM_INFO_ALL */
 	snprintf(tmp, sizeof(tmp), "%s:STATE=%s;",
 		node_ptr->name, 
 		_get_node_state(node_ptr));
 	xstrcat(buf, tmp);
 	
-	if ((state_info == SLURM_INFO_STATE) ||
-	    (state_info == SLURM_INFO_VOLITILE))
+	if (update_time > 0)
 		return buf;
 
-	/* SLURM_INFO_ALL only */
 	if (slurmctld_conf.fast_schedule) {
 		/* config from slurm.conf */
 		snprintf(tmp, sizeof(tmp),
diff --git a/src/plugins/sched/wiki2/get_jobs.c b/src/plugins/sched/wiki2/get_jobs.c
index d05a99036..824bbdac5 100644
--- a/src/plugins/sched/wiki2/get_jobs.c
+++ b/src/plugins/sched/wiki2/get_jobs.c
@@ -45,8 +45,8 @@
 #include "src/common/uid.h"
 #include "src/slurmctld/locks.h"
 
-static char *	_dump_all_jobs(int *job_cnt, int state_info);
-static char *	_dump_job(struct job_record *job_ptr, int state_info);
+static char *	_dump_all_jobs(int *job_cnt, time_t update_time);
+static char *	_dump_job(struct job_record *job_ptr, time_t update_time);
 static char *	_get_group_name(gid_t gid);
 static void	_get_job_comment(struct job_record *job_ptr, 
 			char *buffer, int buf_size);
@@ -63,10 +63,6 @@ static uint32_t	_get_job_tasks(struct job_record *job_ptr);
 static uint32_t	_get_job_time_limit(struct job_record *job_ptr);
 static int	_hidden_job(struct job_record *job_ptr);
 
-#define SLURM_INFO_ALL		0
-#define SLURM_INFO_VOLITILE	1
-#define SLURM_INFO_STATE	2
-
 static uint32_t cr_enabled = 0, cr_test = 0;
 
 /*
@@ -112,7 +108,7 @@ extern int	get_jobs(char *cmd_ptr, int *err_code, char **err_msg)
 	/* Locks: read job, partition */
 	slurmctld_lock_t job_read_lock = {
 		NO_LOCK, READ_LOCK, NO_LOCK, READ_LOCK };
-	int job_rec_cnt = 0, buf_size = 0, state_info;
+	int job_rec_cnt = 0, buf_size = 0;
 
 	if (cr_test == 0) {
 		select_g_get_info_from_plugin(SELECT_CR_PLUGIN,
@@ -142,16 +138,9 @@ extern int	get_jobs(char *cmd_ptr, int *err_code, char **err_msg)
 	}
 	tmp_char++;
 	lock_slurmctld(job_read_lock);
-	if (update_time == 0)
-		state_info = SLURM_INFO_ALL;
-	else if (update_time > last_job_update)
-		state_info = SLURM_INFO_STATE;
-	else
-		state_info = SLURM_INFO_VOLITILE;
-
 	if (strncmp(tmp_char, "ALL", 3) == 0) {
 		/* report all jobs */
-		buf = _dump_all_jobs(&job_rec_cnt, state_info);
+		buf = _dump_all_jobs(&job_rec_cnt, update_time);
 	} else {
 		struct job_record *job_ptr = NULL;
 		char *job_name = NULL, *tmp2_char = NULL;
@@ -161,7 +150,7 @@ extern int	get_jobs(char *cmd_ptr, int *err_code, char **err_msg)
 		while (job_name) {
 			job_id = (uint32_t) strtoul(job_name, NULL, 10);
 			job_ptr = find_job_record(job_id);
-			tmp_buf = _dump_job(job_ptr, state_info);
+			tmp_buf = _dump_job(job_ptr, update_time);
 			if (job_rec_cnt > 0)
 				xstrcat(buf, "#");
 			xstrcat(buf, tmp_buf);
@@ -201,7 +190,7 @@ static int	_hidden_job(struct job_record *job_ptr)
 	return 0;
 }
 
-static char *   _dump_all_jobs(int *job_cnt, int state_info)
+static char *   _dump_all_jobs(int *job_cnt, time_t update_time)
 {
 	int cnt = 0;
 	struct job_record *job_ptr;
@@ -212,7 +201,7 @@ static char *   _dump_all_jobs(int *job_cnt, int state_info)
 	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
 		if (_hidden_job(job_ptr))
 			continue;
-		tmp_buf = _dump_job(job_ptr, state_info);
+		tmp_buf = _dump_job(job_ptr, update_time);
 		if (cnt > 0)
 			xstrcat(buf, "#");
 		xstrcat(buf, tmp_buf);
@@ -223,7 +212,7 @@ static char *   _dump_all_jobs(int *job_cnt, int state_info)
 	return buf;
 }
 
-static char *	_dump_job(struct job_record *job_ptr, int state_info)
+static char *	_dump_job(struct job_record *job_ptr, time_t update_time)
 {
 	char tmp[16384], *buf = NULL;
 	uint32_t end_time, suspend_time;
@@ -231,15 +220,13 @@ static char *	_dump_job(struct job_record *job_ptr, int state_info)
 	if (!job_ptr)
 		return NULL;
 
-	/* SLURM_INFO_STATE or SLURM_INFO_VOLITILE or SLURM_INFO_ALL */
 	snprintf(tmp, sizeof(tmp), "%u:STATE=%s;",
 		job_ptr->job_id, _get_job_state(job_ptr));
 	xstrcat(buf, tmp);
 
-	if (state_info == SLURM_INFO_STATE)
+	if (update_time > last_job_update)
 		return buf;
 
-	/* SLURM_INFO_VOLITILE or SLURM_INFO_ALL */
 	if (job_ptr->job_state == JOB_PENDING) {
 		char *req_features = _get_job_features(job_ptr);
 		if (req_features) {
@@ -337,20 +324,20 @@ static char *	_dump_job(struct job_record *job_ptr, int state_info)
 		xstrcat(buf, tmp);
 	}
 
-	if (state_info == SLURM_INFO_VOLITILE)
+	snprintf(tmp, sizeof(tmp),
+		"NAME=\"%s\";", job_ptr->name);
+	xstrcat(buf, tmp);
+
+	if (job_ptr->details &&
+	    (update_time > job_ptr->details->submit_time))
 		return buf;
 
-	/* SLURM_INFO_ALL only */
 	snprintf(tmp, sizeof(tmp),
 		"UNAME=%s;GNAME=%s;",
 		uid_to_string((uid_t) job_ptr->user_id),
 		_get_group_name(job_ptr->group_id));
 	xstrcat(buf, tmp);
 
-	snprintf(tmp, sizeof(tmp),
-		"NAME=\"%s\";", job_ptr->name);
-	xstrcat(buf, tmp);
-
 	return buf;
 }
 
diff --git a/src/plugins/sched/wiki2/get_nodes.c b/src/plugins/sched/wiki2/get_nodes.c
index 44afd1a2e..ddf66db69 100644
--- a/src/plugins/sched/wiki2/get_nodes.c
+++ b/src/plugins/sched/wiki2/get_nodes.c
@@ -39,14 +39,10 @@
 #include "src/slurmctld/locks.h"
 #include "src/slurmctld/slurmctld.h"
 
-static char *	_dump_all_nodes(int *node_cnt, int state_info);
-static char *	_dump_node(struct node_record *node_ptr, int state_info);
+static char *	_dump_all_nodes(int *node_cnt, time_t update_time);
+static char *	_dump_node(struct node_record *node_ptr, time_t update_time);
 static char *	_get_node_state(struct node_record *node_ptr);
 
-#define SLURM_INFO_ALL		0
-#define SLURM_INFO_VOLITILE	1
-#define SLURM_INFO_STATE	2
-
 /*
  * get_nodes - get information on specific node(s) changed since some time
  * cmd_ptr IN - CMD=GETNODES ARG=[<UPDATETIME>:<NODEID>[:<NODEID>]...]
@@ -73,7 +69,7 @@ extern int	get_nodes(char *cmd_ptr, int *err_code, char **err_msg)
 	/* Locks: read node, read partition */
 	slurmctld_lock_t node_read_lock = {
 		NO_LOCK, NO_LOCK, READ_LOCK, READ_LOCK };
-	int node_rec_cnt = 0, buf_size = 0, state_info;
+	int node_rec_cnt = 0, buf_size = 0;
 
 	arg_ptr = strstr(cmd_ptr, "ARG=");
 	if (arg_ptr == NULL) {
@@ -91,16 +87,9 @@ extern int	get_nodes(char *cmd_ptr, int *err_code, char **err_msg)
 	}
 	tmp_char++;
 	lock_slurmctld(node_read_lock);
-	if (update_time == 0)
-		state_info = SLURM_INFO_ALL;
-	else if (update_time > last_node_update)
-		state_info = SLURM_INFO_STATE;
-	else
-		state_info = SLURM_INFO_VOLITILE;
-
 	if (strncmp(tmp_char, "ALL", 3) == 0) {
 		/* report all nodes */
-		buf = _dump_all_nodes(&node_rec_cnt, state_info);
+		buf = _dump_all_nodes(&node_rec_cnt, update_time);
 	} else {
 		struct node_record *node_ptr = NULL;
 		char *node_name = NULL, *tmp2_char = NULL;
@@ -108,7 +97,7 @@ extern int	get_nodes(char *cmd_ptr, int *err_code, char **err_msg)
 		node_name = strtok_r(tmp_char, ":", &tmp2_char);
 		while (node_name) {
 			node_ptr = find_node_record(node_name);
-			tmp_buf = _dump_node(node_ptr, state_info);
+			tmp_buf = _dump_node(node_ptr, update_time);
 			if (node_rec_cnt > 0)
 				xstrcat(buf, "#");
 			xstrcat(buf, tmp_buf);
@@ -130,7 +119,7 @@ extern int	get_nodes(char *cmd_ptr, int *err_code, char **err_msg)
 	return 0;
 }
 
-static char *	_dump_all_nodes(int *node_cnt, int state_info)
+static char *	_dump_all_nodes(int *node_cnt, time_t update_time)
 {
 	int i, cnt = 0;
 	struct node_record *node_ptr = node_record_table_ptr;
@@ -139,7 +128,7 @@ static char *	_dump_all_nodes(int *node_cnt, int state_info)
 	for (i=0; i<node_record_count; i++, node_ptr++) {
 		if (node_ptr->name == NULL)
 			continue;
-		tmp_buf = _dump_node(node_ptr, state_info);
+		tmp_buf = _dump_node(node_ptr, update_time);
 		if (cnt > 0)
 			xstrcat(buf, "#");
 		xstrcat(buf, tmp_buf);
@@ -150,7 +139,7 @@ static char *	_dump_all_nodes(int *node_cnt, int state_info)
 	return buf;
 }
 
-static char *	_dump_node(struct node_record *node_ptr, int state_info)
+static char *	_dump_node(struct node_record *node_ptr, time_t update_time)
 {
 	char tmp[512], *buf = NULL;
 	int i;
@@ -159,7 +148,6 @@ static char *	_dump_node(struct node_record *node_ptr, int state_info)
 	if (!node_ptr)
 		return NULL;
 
-	/* SLURM_INFO_STATE or SLURM_INFO_VOLITILE or SLURM_INFO_ALL */
 	snprintf(tmp, sizeof(tmp), "%s:STATE=%s;",
 		node_ptr->name, 
 		_get_node_state(node_ptr));
@@ -169,11 +157,9 @@ static char *	_dump_node(struct node_record *node_ptr, int state_info)
 		xstrcat(buf, tmp);
 	}
 	
-	if (state_info == SLURM_INFO_STATE)
+	if (update_time > last_node_update)
 		return buf;
 
-
-	/* SLURM_INFO_VOLITILE or SLURM_INFO_ALL */
 	if (slurmctld_conf.fast_schedule) {
 		/* config from slurm.conf */
 		cpu_cnt = node_ptr->config_ptr->cpus;
@@ -192,11 +178,9 @@ static char *	_dump_node(struct node_record *node_ptr, int state_info)
 	if (i > 0)
 		xstrcat(buf, ";");
 
-	if (state_info == SLURM_INFO_VOLITILE)
+	if (update_time > 0)
 		return buf;
 
-
-	/* SLURM_INFO_ALL only */
 	if (slurmctld_conf.fast_schedule) {
 		/* config from slurm.conf */
 		snprintf(tmp, sizeof(tmp),
diff --git a/src/plugins/select/bluegene/block_allocator/bridge_linker.c b/src/plugins/select/bluegene/block_allocator/bridge_linker.c
index d86631b7b..f2a0428b4 100644
--- a/src/plugins/select/bluegene/block_allocator/bridge_linker.c
+++ b/src/plugins/select/bluegene/block_allocator/bridge_linker.c
@@ -577,14 +577,16 @@ extern status_t bridge_destroy_block(pm_partition_id_t pid)
 extern int bridge_set_log_params(char *api_file_name, unsigned int level)
 {
 	static FILE *fp = NULL;
+        FILE *fp2 = NULL;
 	int rc = SLURM_SUCCESS;
 
 	if(!bridge_init())
 		return SLURM_ERROR;
 	
 	slurm_mutex_lock(&api_file_mutex);
-	if(fp)
-		fclose(fp);
+	if(fp) 
+		fp2 = fp;
+	
 	fp = fopen(api_file_name, "a");
 	
 	if (fp == NULL) { 
@@ -596,6 +598,8 @@ extern int bridge_set_log_params(char *api_file_name, unsigned int level)
 
 	
 	(*(bridge_api.set_log_params))(fp, level);
+	if(fp2)
+		fclose(fp2);
 end_it:
 	slurm_mutex_unlock(&api_file_mutex);
 	return rc;
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index ecb3aa3c6..a246aefac 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -2,7 +2,7 @@
  *  select_cons_res.c - node selection plugin supporting consumable 
  *  resources policies.
  *
- *  $Id: select_cons_res.c 12649 2007-11-15 18:02:35Z da $
+ *  $Id: select_cons_res.c 13373 2008-02-27 16:47:13Z jette $
  *****************************************************************************\
  *
  *  The following example below illustrates how four jobs are allocated
@@ -1309,6 +1309,8 @@ extern int select_p_state_restore(char *dir_name)
 	safe_unpack16(&restore_plugin_crtype,  buffer);
 	safe_unpack32(&restore_pstate_version, buffer);
 
+	if (restore_plugin_type == NULL)
+		goto unpack_error;
 	if ((strcmp(restore_plugin_type, plugin_type) != 0) ||
 	    (restore_plugin_version != plugin_version) ||
 	    (restore_plugin_crtype  != cr_type) ||
diff --git a/src/sacct/process.c b/src/sacct/process.c
index 45032cae7..d09346eff 100644
--- a/src/sacct/process.c
+++ b/src/sacct/process.c
@@ -419,12 +419,12 @@ void process_step(char *f[], int lc, int show_full, int len)
 	
 got_step:
 	
-		
+	if ( job->exitcode == 0 )
+		job->exitcode = step->exitcode;
+	
 	if (job->job_terminated_seen == 0) {	/* If the job is still running,
 						   this is the most recent
 						   status */
-		if ( job->exitcode == 0 )
-			job->exitcode = step->exitcode;
 		job->status = JOB_RUNNING;
 		job->elapsed = step->header.timestamp - job->header.timestamp;
 	}
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 923670c92..9397d9e10 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -3,7 +3,7 @@
  *	Note: there is a global job list (job_list), time stamp 
  *	(last_job_update), and hash table (job_hash)
  *
- *  $Id: job_mgr.c 13176 2008-02-04 16:56:57Z jette $
+ *  $Id: job_mgr.c 13373 2008-02-27 16:47:13Z jette $
  *****************************************************************************
  *  Copyright (C) 2002-2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
@@ -1087,6 +1087,7 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test)
 			} else {
 				info("Killing job_id %u on failed node %s",
 				     job_ptr->job_id, node_name);
+				srun_node_fail(job_ptr->job_id, node_name);
 				job_ptr->job_state = JOB_NODE_FAIL | 
 					JOB_COMPLETING;
 				job_ptr->exit_code = 
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index e222d1e86..c1e6c4cbc 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -1,7 +1,7 @@
 /*****************************************************************************\
  *  proc_req.c - process incomming messages to slurmctld
  *
- *  $Id: proc_req.c 13237 2008-02-08 23:16:16Z jette $
+ *  $Id: proc_req.c 13341 2008-02-25 17:20:07Z jette $
  *****************************************************************************
  *  Copyright (C) 2002-2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
@@ -1401,7 +1401,7 @@ static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg)
 	uid_t uid;
 
 	START_TIMER;
-	debug2("Processing RPC: REQUEST_RECONFIGURE");
+	info("Processing RPC: REQUEST_RECONFIGURE");
 	uid = g_slurm_auth_get_uid(msg->auth_cred);
 	if (!_is_super_user(uid)) {
 		error("Security violation, RECONFIGURE RPC from uid=%u",
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index ad5ca1b78..84242acf6 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -1,6 +1,6 @@
 /*****************************************************************************\
  *  src/slurmd/slurmd/req.c - slurmd request handling
- *  $Id: req.c 11813 2007-07-11 17:03:30Z jette $
+ *  $Id: req.c 13326 2008-02-21 20:37:56Z jette $
  *****************************************************************************
  *  Copyright (C) 2002-2006 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
@@ -96,6 +96,7 @@ typedef struct {
 static int  _abort_job(uint32_t job_id);
 static int  _abort_step(uint32_t job_id, uint32_t step_id);
 static char ** _build_env(uint32_t jobid, uid_t uid, char *bg_part_id);
+static void _delay_rpc(int host_inx, int host_cnt, int usec_per_rpc);
 static void _destroy_env(char **env);
 static bool _slurm_authorized_user(uid_t uid);
 static bool _job_still_running(uint32_t job_id);
@@ -124,6 +125,7 @@ static int  _run_epilog(uint32_t jobid, uid_t uid, char *bg_part_id);
 
 static bool _pause_for_job_completion(uint32_t jobid, char *nodes, 
 		int maxtime);
+static void _sync_messages_kill(kill_job_msg_t *req);
 static int _waiter_init (uint32_t jobid);
 static int _waiter_complete (uint32_t jobid);
 
@@ -822,7 +824,7 @@ _rpc_batch_job(slurm_msg_t *msg)
 		 rc = ESLURMD_CREDENTIAL_REVOKED;	/* job already ran */
 	}
 
-	if (req->step_id != SLURM_BATCH_SCRIPT && req->step_id != 0)
+	if ((req->step_id != SLURM_BATCH_SCRIPT) && (req->step_id != 0))
 		first_job_run = false;
 		
 	/*
@@ -1818,8 +1820,6 @@ _epilog_complete(uint32_t jobid, int rc)
 
 	slurm_msg_t_init(&msg);
 	
-	_wait_state_completed(jobid, 5);
-
 	req.job_id      = jobid;
 	req.return_code = rc;
 	req.node_name   = conf->node_name;
@@ -2185,8 +2185,75 @@ _rpc_terminate_job(slurm_msg_t *msg)
 		debug("completed epilog for jobid %u", req->job_id);
 	
     done:
-	_epilog_complete(req->job_id, rc);
+	_wait_state_completed(req->job_id, 5);
 	_waiter_complete(req->job_id);
+	_sync_messages_kill(req);
+	_epilog_complete(req->job_id, rc);
+}
+
+/* Stagger this node's EPILOG_COMPLETE message to slurmctld. On a parallel
+ * job every slurmd may send it at the same time, resulting in lost messages,
+ * so spread out the traffic by delaying according to this node's position in
+ * req->nodes (assumes synchronized clocks). Jobs with 32 or fewer nodes are
+ * not delayed. Allow 10 msec processing time in slurmctld for each RPC. */
+static void _sync_messages_kill(kill_job_msg_t *req)
+{
+	int host_cnt, host_inx;
+	char *host;
+	hostset_t hosts;
+
+	hosts = hostset_create(req->nodes);
+	host_cnt = hostset_count(hosts);
+	if (host_cnt <= 32)
+		goto fini;
+	if (conf->node_name == NULL)	/* guard the name strcmp() uses below */
+		goto fini;	/* should never happen */
+
+	for (host_inx=0; host_inx<host_cnt; host_inx++) {
+		host = hostset_shift(hosts);
+		if (host == NULL)
+			break;
+		if (strcmp(host, conf->node_name) == 0) {
+			free(host);	/* found our own index in the job */
+			break;
+		}
+		free(host);
+	}
+	_delay_rpc(host_inx, host_cnt, 10000);
+
+ fini:	hostset_destroy(hosts);
+}
+
+/* Sleep until this host's time slot: slot host_inx out of host_cnt slots,
+ * each usec_per_rpc usec long. Requires synchronized clocks cluster-wide. */
+static void _delay_rpc(int host_inx, int host_cnt, int usec_per_rpc)
+{
+	struct timeval tv1;
+	uint32_t cur_time;	/* current time in usec (just 9 digits) */
+	uint32_t tot_time;	/* total time expected for all RPCs */
+	uint32_t offset_time;	/* relative time within tot_time */
+	uint32_t target_time;	/* desired time to issue the RPC */
+	uint32_t delta_time;	/* how long to sleep to reach target_time */
+
+again:	if (gettimeofday(&tv1, NULL)) {
+		usleep(host_inx * usec_per_rpc);	/* no clock, fixed delay */
+		return;
+	}
+
+	/* Scale seconds to usec; 3 digits of seconds keeps it within 32 bits */
+	cur_time = ((tv1.tv_sec % 1000) * 1000000) + tv1.tv_usec;
+	tot_time = host_cnt * usec_per_rpc;
+	offset_time = cur_time % tot_time;
+	target_time = host_inx * usec_per_rpc;
+	delta_time = target_time - offset_time;
+	if (target_time < offset_time)	/* slot passed, wait for next cycle */
+		delta_time += tot_time;
+	if (usleep(delta_time)) {
+		if (errno == EINVAL) /* usleep for more than 1 sec */
+			usleep(900000);
+		/* errno == EINTR */
+		goto again;
+	}
+}
 
 /*
diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c
index 6ac03c22d..0fc820149 100644
--- a/src/slurmd/slurmstepd/mgr.c
+++ b/src/slurmd/slurmstepd/mgr.c
@@ -1,6 +1,6 @@
 /*****************************************************************************\
  *  src/slurmd/slurmstepd/mgr.c - job manager functions for slurmstepd
- *  $Id: mgr.c 13229 2008-02-08 01:02:06Z jette $
+ *  $Id: mgr.c 13322 2008-02-21 19:06:27Z da $
  *****************************************************************************
  *  Copyright (C) 2002-2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
@@ -139,6 +139,7 @@ step_complete_t step_complete = {
 	{},
 	-1,
 	-1,
+	true,
 	(bitstr_t *)NULL,
 	0,
         NULL
@@ -478,6 +479,8 @@ _wait_for_children_slurmstepd(slurmd_job_t *job)
 		step_complete.step_rc = MAX(step_complete.step_rc,
 					 WEXITSTATUS(job->task[i]->estatus));
 
+	step_complete.wait_children = false;
+
 	pthread_mutex_unlock(&step_complete.lock);
 }
 
@@ -495,6 +498,7 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last)
 	int rc = -1;
 	int retcode;
 	int i;
+	static bool acct_sent = false;
 
 	debug2("_one_step_complete_msg: first=%d, last=%d", first, last);
 	msg.job_id = job->jobid;
@@ -504,9 +508,12 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last)
 	msg.step_rc = step_complete.step_rc;
 	msg.jobacct = jobacct_g_alloc(NULL);
 	/************* acct stuff ********************/
-	jobacct_g_aggregate(step_complete.jobacct, job->jobacct);
-	jobacct_g_getinfo(step_complete.jobacct, JOBACCT_DATA_TOTAL, 
-			  msg.jobacct);
+	if(!acct_sent) {
+		jobacct_g_aggregate(step_complete.jobacct, job->jobacct);
+		jobacct_g_getinfo(step_complete.jobacct, JOBACCT_DATA_TOTAL, 
+				  msg.jobacct);
+		acct_sent = true;
+	}
 	/*********************************************/	
 	slurm_msg_t_init(&req);
 	req.msg_type = REQUEST_STEP_COMPLETE;
diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c
index dacf1123d..fc744d250 100644
--- a/src/slurmd/slurmstepd/req.c
+++ b/src/slurmd/slurmstepd/req.c
@@ -1,6 +1,6 @@
 /*****************************************************************************\
  *  src/slurmd/slurmstepd/req.c - slurmstepd domain socket request handling
- *  $Id: req.c 11856 2007-07-19 02:36:24Z morrone $
+ *  $Id: req.c 13322 2008-02-21 19:06:27Z da $
  *****************************************************************************
  *  Copyright (C) 2005 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
@@ -1056,6 +1056,12 @@ _handle_completion(int fd, slurmd_job_t *job, uid_t uid)
 	 * Record the completed nodes
 	 */
 	pthread_mutex_lock(&step_complete.lock);
+	if (! step_complete.wait_children) {
+		rc = -1;
+		errnum = ETIMEDOUT; /* not used anyway */
+		goto timeout;
+	}
+
 /* 	debug2("Setting range %d(bit %d) through %d(bit %d)", */
 /* 	       first, first-(step_complete.rank+1), */
 /* 	       last, last-(step_complete.rank+1)); */
@@ -1070,6 +1076,7 @@ _handle_completion(int fd, slurmd_job_t *job, uid_t uid)
 	
 	/************* acct stuff ********************/
 	jobacct_g_aggregate(step_complete.jobacct, jobacct);
+timeout:
 	jobacct_g_free(jobacct);
 	/*********************************************/
 	
diff --git a/src/slurmd/slurmstepd/slurmstepd.h b/src/slurmd/slurmstepd/slurmstepd.h
index d5bfee041..2ef0cc40f 100644
--- a/src/slurmd/slurmstepd/slurmstepd.h
+++ b/src/slurmd/slurmstepd/slurmstepd.h
@@ -1,6 +1,6 @@
 /*****************************************************************************\
  * src/slurmd/slurmstepd/slurmstepd.h - slurmstepd general header file
- * $Id: slurmstepd.h 11590 2007-05-25 18:52:33Z da $
+ * $Id: slurmstepd.h 13322 2008-02-21 19:06:27Z da $
  *****************************************************************************
  *  Copyright (C) 2005 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
@@ -55,6 +55,7 @@ typedef struct {
 	slurm_addr parent_addr;
 	int children;
 	int max_depth;
+	bool wait_children;
 	bitstr_t *bits;
 	int step_rc;
 	jobacctinfo_t *jobacct;
-- 
GitLab