diff --git a/META b/META index fff8ca7d73500aee1bbd95be05241f497cb511c2..9a761e08a91a0c488bc02d7c2fe767ca73e08fdd 100644 --- a/META +++ b/META @@ -3,8 +3,8 @@ Api_revision: 0 Major: 1 Meta: 1 - Micro: 17 + Micro: 18 Minor: 2 Name: slurm Release: 1 - Version: 1.2.17 + Version: 1.2.18 diff --git a/NEWS b/NEWS index 0e4caf4b02b34552c67747a0aaae0497cf1cce86..dfbe5cc4036c9a2bac837273ac951be638bd5f84 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,25 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.2.18 ========================= + -- BLUEGENE - bug fix for smap stating passthroughs are used when they aren't + -- Fixed bug in sview to be able to edit partitions correctly + -- Fixed bug so in slurm.conf files where SlurmdPort isn't defined things + work correctly. + -- In sched/wiki2 and sched/wiki add support for batch job being requeued + in Slurm either when nodes fail or upon request. + -- In sched/wiki2 and sched/wiki with FastSchedule=2 configured and nodes + configured with more CPUs than actually exist, return a value of TASKS + equal to the number of configured CPUs that are allocated to a job rather + than the number of physical CPUs allocated. + -- For sched/wiki2, timeout "srun --get-user-env ..." command after 3 seconds + if unable to perform pseudo-login and get user environment variables. + -- Add contribs/time_login.c program to test how long pseudo-login takes + for specific users or all users. This can identify users for which Moab + job submissions are unable to set the proper environment variables. + -- Fix problem in parallel make of Slurm. + -- Fixed bug in consumable resources when CR_Core_Memory is enabled + -- Add delay in slurmctld for "scontrol shutdown" RPC to get propagated + to slurmd daemons. * Changes in SLURM 1.2.17 ========================= @@ -2640,4 +2659,4 @@ documents those changes that are of interest to users and admins. -- Change directory to /tmp in slurmd if daemonizing. 
-- Logfiles are reopened on reconfigure. -$Id: NEWS 12383 2007-09-21 21:15:45Z da $ +$Id: NEWS 12462 2007-10-08 17:42:47Z jette $ diff --git a/contribs/Makefile.am b/contribs/Makefile.am index 4ea5912bbe739f5d5a1020a77c893af7b6d97f3a..7e6b666ba19e4e396c30c96a584b4d2f4605dc62 100644 --- a/contribs/Makefile.am +++ b/contribs/Makefile.am @@ -4,4 +4,5 @@ EXTRA_DIST = \ make.slurm.patch \ mpich1.slurm.patch \ ptrace.patch \ + time_login.c \ README diff --git a/contribs/Makefile.in b/contribs/Makefile.in index b6c6834e3fb680f7da25c2291062359c1d85e746..88e882faded169b78ee63861dc8aa97c7a539a0b 100644 --- a/contribs/Makefile.in +++ b/contribs/Makefile.in @@ -236,6 +236,7 @@ EXTRA_DIST = \ make.slurm.patch \ mpich1.slurm.patch \ ptrace.patch \ + time_login.c \ README all: all-recursive diff --git a/contribs/README b/contribs/README index aa511f34cbd2c8e7120a3510242cbbadd905a1c7..ad9792738294c490ad31f8db94a614f03cb0f91d 100644 --- a/contribs/README +++ b/contribs/README @@ -11,6 +11,12 @@ of the SLURM contribs distribution follows: API to SLURM using perl. Making available all SLURM command that exist in the SLURM proper API. + time_login.c [ C program ] + This program will report how long a pseudo-login will take for specific + users or all users on the system. Users identified by this program + will not have their environment properly set for jobs submitted through + Moab. Build with "make -f /dev/null time_login" and execute as user root. + torque/ [ Wrapper Scripts for Torque migration to SLURM ] Helpful scripts to make transition to SLURM easier from PBS or Torque. These scripts are easily updatable if there is functionality missing. diff --git a/contribs/perlapi/Makefile.am b/contribs/perlapi/Makefile.am index 678cbd5a068d22ce858a3513b831ba7aad480a18..58685cbe40e3784ad4ac2af19a2ddd19ef103503 100644 --- a/contribs/perlapi/Makefile.am +++ b/contribs/perlapi/Makefile.am @@ -31,8 +31,8 @@ all-local: $(perl_dir)/Makefile #libslurm if [ ! 
-f Makefile ]; then \ $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}; \ fi && \ - ($(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \ - $(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \ + ($(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \ + $(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \ cd ..; install-exec-local: diff --git a/contribs/perlapi/Makefile.in b/contribs/perlapi/Makefile.in index e758eb5169212357933abfb8732146916c38cdee..30296bef76cd8e5a8fb1e7e01e7fdde2b86ab37f 100644 --- a/contribs/perlapi/Makefile.in +++ b/contribs/perlapi/Makefile.in @@ -422,8 +422,8 @@ all-local: $(perl_dir)/Makefile #libslurm if [ ! -f Makefile ]; then \ $(perlpath) Makefile.PL $(PERL_MM_PARAMS) prefix=${prefix}; \ fi && \ - ($(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \ - $(MAKE) CC="$(CC)" CCFLAGS="$(PERL_CFLAGS) -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \ + ($(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS) || \ + $(MAKE) CC="$(CC)" LD="$(CC) $(CFLAGS)" CCFLAGS="$(PERL_CFLAGS) -g -static $(CFLAGS)" $(PERL_EXTRA_OPTS)) && \ cd ..; install-exec-local: diff --git a/contribs/time_login.c b/contribs/time_login.c new file mode 100644 index 0000000000000000000000000000000000000000..529188627a0ad50742ed56fdfb8b5912668df4ee --- /dev/null +++ b/contribs/time_login.c @@ -0,0 +1,195 @@ +/*****************************************************************************\ + * This program is used to identify users for whom a pseudo-login takes + * more than SU_WAIT_MSEC to complete. Either enter specific user names + * on the execute line (e.g.,
"time_login alice bob") or provide no input + * on the execute line to test all users in the /etc/passwd file with a + * UID greater than 100 (avoiding various system users). + * + * Users identified for whom the pseudo-login takes too long will not + * have their environment variables set by Moab on job submit, which + * relies upon the srun "--get-user-env" option to get this information. + * See SLURM's env_array_user_default() code in src/common/env.c. + * This option is presently used only by Moab. + ***************************************************************************** + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov>. + * UCRL-CODE-226842. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here.
+ * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/wait.h> + +#define SU_WAIT_MSEC 3000 +static void _parse_line(char *in_line, char **user_name, int *user_id); +static long int _time_login(char *user_name); + +main (int argc, char **argv) +{ + FILE *passwd_fd; + char in_line[256], *user_name; + int i, user_id; + long int delta_t; + + if (geteuid() != (uid_t)0) { + printf("need to run as user root\n"); + exit(1); + } + + for (i=1; i<argc; i++) { + delta_t = _time_login(argv[i]); + printf("user %-8s time %ld usec\n", argv[i], delta_t); + } + if (i > 1) + exit(0); + + passwd_fd = fopen("/etc/passwd", "r"); + if (!passwd_fd) { + perror("fopen(/etc/passwd)"); + exit(1); + } + + while (fgets(in_line, sizeof(in_line), passwd_fd)) { + _parse_line(in_line, &user_name, &user_id); + if (user_id <= 100) + continue; + delta_t = _time_login(user_name); + if (delta_t < ((SU_WAIT_MSEC * 0.8) * 1000)) + continue; + printf("user %-8s time %ld usec\n", user_name, delta_t); + } + fclose(passwd_fd); +} + +static void _parse_line(char *in_line, char **user_name, int *user_id) +{ + char *tok; + + *user_name = strtok(in_line, ":"); + (void) strtok(NULL, ":"); + tok = strtok(NULL, ":"); + if (tok) + *user_id = atoi(tok); + else { + perror("strtok"); + *user_id = 0; + } +} + +static long int _time_login(char *user_name) +{ + FILE 
*su; + char line[BUFSIZ]; + char name[BUFSIZ]; + char value[BUFSIZ]; + int fildes[2], found, fval, rc, timeleft; + pid_t child; + struct timeval begin, now; + struct pollfd ufds; + long int delta_t; + + if (pipe(fildes) < 0) { + perror("pipe"); + return -1; + } + + child = fork(); + if (child == -1) { + perror("fork"); + return -1; + } + if (child == 0) { + close(0); + open("/dev/null", O_RDONLY); + dup2(fildes[1], 1); + close(2); + open("/dev/null", O_WRONLY); +#if 0 + /* execute .profile only */ + execl("/bin/su", "su", user_name, "-c", + "echo; echo; echo HELLO", NULL); +#else + /* execute .login plus .profile */ + execl("/bin/su", "su", "-", user_name, "-c", + "echo; echo; echo HELLO", NULL); +#endif + exit(1); + } + + close(fildes[1]); + if ((fval = fcntl(fildes[0], F_GETFL, 0)) >= 0) + fcntl(fildes[0], F_SETFL, fval | O_NONBLOCK); + su= fdopen(fildes[0], "r"); + + gettimeofday(&begin, NULL); + ufds.fd = fildes[0]; + ufds.events = POLLIN; + found = 0; + while (!found) { + gettimeofday(&now, NULL); + timeleft = SU_WAIT_MSEC; + timeleft -= (now.tv_sec - begin.tv_sec) * 1000; + timeleft -= (now.tv_usec - begin.tv_usec) / 1000; + if (timeleft <= 0) + break; + if ((rc = poll(&ufds, 1, timeleft)) <= 0) { + if (rc == 0) /* timeout */ + break; + if ((errno == EINTR) || (errno == EAGAIN)) + continue; + perror("poll"); + break; + } + if ((ufds.revents & POLLERR) || (ufds.revents & POLLHUP)) + break; + while (fgets(line, BUFSIZ, su)) { + if (!strncmp(line, "HELLO", 5)) { + found = 1; + break; + } + } + } + close(fildes[0]); + waitpid(-1, NULL, WNOHANG); + + if (!found) + return (SU_WAIT_MSEC * 1000); + + delta_t = (now.tv_sec - begin.tv_sec) * 1000000; + delta_t += now.tv_usec - begin.tv_usec; + return delta_t; +} diff --git a/doc/html/Makefile.am b/doc/html/Makefile.am index e8c0ff7bc6ca3694a3c7f27993d9354ae975fbb6..c7b713fd7abfc6ae63c4a556fd3f49105de5fc89 100644 --- a/doc/html/Makefile.am +++ b/doc/html/Makefile.am @@ -49,7 +49,6 @@ html_DATA = \ lci.7.tutorial.pdf \ 
lll.gif \ mc_support.gif \ - nnsa_doe_uc_comp.gif \ plane_ex1.gif \ plane_ex2.gif \ plane_ex3.gif \ @@ -60,6 +59,7 @@ html_DATA = \ slurm_banner.gif \ slurm_design.pdf \ slurmstyles.css \ + sponsors.gif \ linuxstyles.css MOSTLYCLEANFILES = ${generated_html} diff --git a/doc/html/Makefile.in b/doc/html/Makefile.in index 5fec7cc689c1c5c0da4158025fa729e703cffd13..22172644094acb8694150182005d6ef85402946e 100644 --- a/doc/html/Makefile.in +++ b/doc/html/Makefile.in @@ -278,7 +278,6 @@ html_DATA = \ lci.7.tutorial.pdf \ lll.gif \ mc_support.gif \ - nnsa_doe_uc_comp.gif \ plane_ex1.gif \ plane_ex2.gif \ plane_ex3.gif \ @@ -289,6 +288,7 @@ html_DATA = \ slurm_banner.gif \ slurm_design.pdf \ slurmstyles.css \ + sponsors.gif \ linuxstyles.css MOSTLYCLEANFILES = ${generated_html} diff --git a/doc/html/footer.txt b/doc/html/footer.txt index f7c5449e59612c5579d27d08de9805cc8fb56ac2..af8ad069c6e333c1a5a04e8a5d3b4e09bc5c5e84 100644 --- a/doc/html/footer.txt +++ b/doc/html/footer.txt @@ -6,14 +6,15 @@ </div> <div id="footer2"> -<div id="left2"><img src="nnsa_doe_uc_comp.gif" width="250" height="30" border="0" usemap="#Map2"></div> +<div id="left2"><img src="sponsors.gif" width="129" height="30" border="0" usemap="#Map2"></div> <div id="center2"><a href="http://www.llnl.gov/" target="_blank" class="footer">Lawrence Livermore National Laboratory</a><br /> <span class="smalltextblue">7000 East Avenue • Livermore, CA 94550</span></div> <div id="right2"><span class="smalltextblue">Operated by -the</span> <a href="http://www.universityofcalifornia.edu/" target="_blank" class="footer">University of California</a><br /> -<span class="smalltextblue">for the</span> <a href="http://www.energy.gov/" target="_blank" class="footer">Department of Energy's</a><br /> -<a href="http://www.nnsa.doe.gov/" target="_blank" class="footer">National -Nuclear Security Administration</a></div> +Lawrence Livermore National Security, LLC, for the</span> +<a href="http://www.energy.gov/" target="_blank" 
class="footer"> +Department of Energy's</a><br /> +<a href="http://www.nnsa.doe.gov/" target="_blank" class="footer"> +National Nuclear Security Administration</a></div> <div style="clear:both;"></div> </div> @@ -23,10 +24,8 @@ Nuclear Security Administration</a></div> <area shape="rect" coords="571,1,799,15" href="http://www.llnl.gov/"> </map> <map name="Map2"> -<area shape="rect" coords="3,2,95,34" href="http://www.nnsa.doe.gov/" target="_blank" alt="NNSA logo links to the NNSA Web site"> -<area shape="rect" coords="97,1,133,28" href="http://www.energy.gov/" target="_blank" alt="Department of Energy logo links to the DOE Web site"> -<area shape="rect" coords="136,0,164,28" href="http://www.universityofcalifornia.edu/" target="_blank" alt="University of California logo links to the UC Web site"> -<area shape="rect" coords="172,2,249,31" href="http://www.llnl.gov/computing/hpc/" target="_blank"> +<area shape="rect" coords="1,1,92,30" href="http://www.nnsa.doe.gov/" target="_blank" alt="NNSA logo links to the NNSA Web site"> +<area shape="rect" coords="98,1,132,30" href="http://www.energy.gov/" target="_blank" alt="Department of Energy logo links to the DOE Web site"> </map> </body> diff --git a/doc/html/header.txt b/doc/html/header.txt index 49a74abc5e30dd1da276b3f5e7a8bbe6d24f783c..097af4411c7c4a2c19dd0511fd4d1381be0d06de 100644 --- a/doc/html/header.txt +++ b/doc/html/header.txt @@ -14,7 +14,7 @@ Linux clusters, high-performance computing, Livermore Computing"> <meta name="description" content="Simple Linux Utility for Resource Management"> <meta name="copyright" content="This document is copyrighted U.S. 
-Department of Energy under Contract W-7405-Eng-48"> +Department of Energy under Contract DE-AC52-07NA27344"> <meta name="Author" content="Morris Jette"> <meta name="email" content="jette1@llnl.gov"> <meta name="Classification" diff --git a/doc/html/nnsa_doe_uc_comp.gif b/doc/html/nnsa_doe_uc_comp.gif deleted file mode 100644 index 57322db13690c74f8f2d52b35081ab31647a1ea2..0000000000000000000000000000000000000000 Binary files a/doc/html/nnsa_doe_uc_comp.gif and /dev/null differ diff --git a/doc/html/quickstart.shtml b/doc/html/quickstart.shtml index 6a979803f8d1e73494b329545c67a315b8df6911..7dae93bd5415a8d17799040870c63603dd510783 100644 --- a/doc/html/quickstart.shtml +++ b/doc/html/quickstart.shtml @@ -328,7 +328,9 @@ or srun's <i>--mpi=mvapich</i> option. $ mpicc ... $ srun -n16 --mpi=mvapich a.out </pre> -<p><b>Note for system administrators:</b> Configure +<b>NOTE:</b> If MVAPICH is used in the shared memory model, with all tasks +running on a single node, then use the <i>mpich1_shmem</i> MPI plugin instead.<br> +<b>NOTE (for system administrators):</b> Configure <i>PropagateResourceLimitsExcept=MEMLOCK</i> in <b>slurm.conf</b> and start the <i>slurmd</i> daemons with an unlimited locked memory limit. For more details, see diff --git a/doc/html/sponsors.gif b/doc/html/sponsors.gif new file mode 100644 index 0000000000000000000000000000000000000000..2d5bf921dd688b3eed3798e824fd9cfccecc16f9 Binary files /dev/null and b/doc/html/sponsors.gif differ diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 4b1a2842558eeade0a23b212ae954e4d6f36617f..603547e2559ba2ddac5f7ed23c581d06b048893d 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -1,4 +1,4 @@ -\." $Id: srun.1 12315 2007-09-13 23:56:02Z jette $ +\." $Id: srun.1 12445 2007-10-03 22:24:35Z jette $ .\" .TH SRUN "1" "July 2007" "srun 1.2" "slurm components" @@ -554,6 +554,7 @@ environment variables for LAM/MPI. 
.B mpich1_shmem Initiates one process per node and establishes necessary environment variables for mpich1 shared memory model. +This also works for mvapich built for shared memory. .TP .B mpichgm For use with Myrinet. diff --git a/slurm.spec b/slurm.spec index 4c494a9f2c96eb2662d899ffbf18cd85f43ccd7c..858f4eb4e80c74a9d58824735faffab7e8bf1786 100644 --- a/slurm.spec +++ b/slurm.spec @@ -3,14 +3,14 @@ # Note that this package is not relocatable Name: slurm -Version: 1.2.17 +Version: 1.2.18 Release: 1%{?dist} Summary: Simple Linux Utility for Resource Management License: GPL Group: System Environment/Base -Source: slurm-1.2.17.bz2 +Source: slurm-1.2.18.tar.bz2 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release} URL: http://www.llnl.gov/linux/slurm %ifos linux @@ -142,7 +142,7 @@ SLURM process tracking plugin for SGI job containers. (See http://oss.sgi.com/projects/pagg). %prep -%setup -n slurm-1.2.17 +%setup -n slurm-1.2.18 %build %configure --program-prefix=%{?_program_prefix:%{_program_prefix}} \ diff --git a/src/api/Makefile.am b/src/api/Makefile.am index 8330ade070908350eeff4053363d389ff60e1b92..6aeddd8699f614fef3f3dcb098dc456125b7b501 100644 --- a/src/api/Makefile.am +++ b/src/api/Makefile.am @@ -85,13 +85,6 @@ slurmapi_add = \ $(common_dir)/libeio.la \ -lpthread -libslurm_la_SOURCES = $(slurmapi_src) -libslurm_la_LIBADD = $(slurmapi_add) -libslurm_la_LDFLAGS = \ - $(LIB_LDFLAGS) \ - -version-info $(current):$(rev):$(age) \ - $(OTHER_FLAGS) - libslurmhelper_la_SOURCES = $(slurmapi_src) libslurmhelper_la_LIBADD = $(slurmapi_add) libslurmhelper_la_LDFLAGs = \ @@ -100,13 +93,18 @@ libslurmhelper_la_LDFLAGs = \ convenience_libs = $(top_builddir)/src/api/libslurmhelper.la +libslurm_la_SOURCES = +libslurm_la_LIBADD = $(convenience_libs) +libslurm_la_LDFLAGS = \ + $(LIB_LDFLAGS) \ + -version-info $(current):$(rev):$(age) \ + $(OTHER_FLAGS) + libpmi_la_SOURCES = pmi.c libpmi_la_LIBADD = $(convenience_libs) libpmi_la_LDFLAGS = $(LIB_LDFLAGS) force: -$(libslurm_la_LIBADD) 
: force - @cd `dirname $@` && $(MAKE) `basename $@` $(convenience_libs) : force @cd `dirname $@` && $(MAKE) `basename $@` diff --git a/src/api/Makefile.in b/src/api/Makefile.in index 1050f91280f4c2b4bd214a5a14ce8b9319d6b42d..bf7c01b6c7460bb5696b4b7ddb10e795a35015f6 100644 --- a/src/api/Makefile.in +++ b/src/api/Makefile.in @@ -80,21 +80,21 @@ libpmi_la_OBJECTS = $(am_libpmi_la_OBJECTS) libpmi_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ $(libpmi_la_LDFLAGS) $(LDFLAGS) -o $@ +libslurm_la_DEPENDENCIES = $(convenience_libs) +am_libslurm_la_OBJECTS = +libslurm_la_OBJECTS = $(am_libslurm_la_OBJECTS) +libslurm_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(libslurm_la_LDFLAGS) $(LDFLAGS) -o $@ am__DEPENDENCIES_1 = $(common_dir)/libcommon.la \ $(common_dir)/libspank.la $(common_dir)/libeio.la -libslurm_la_DEPENDENCIES = $(am__DEPENDENCIES_1) +libslurmhelper_la_DEPENDENCIES = $(am__DEPENDENCIES_1) am__objects_1 = allocate.lo cancel.lo checkpoint.lo complete.lo \ config_info.lo init_msg.lo job_info.lo job_step_info.lo \ node_info.lo node_select_info.lo partition_info.lo signal.lo \ slurm_pmi.lo step_ctx.lo step_io.lo step_launch.lo \ pmi_server.lo submit.lo suspend.lo triggers.lo reconfigure.lo \ update_config.lo -am_libslurm_la_OBJECTS = $(am__objects_1) -libslurm_la_OBJECTS = $(am_libslurm_la_OBJECTS) -libslurm_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ - $(libslurm_la_LDFLAGS) $(LDFLAGS) -o $@ -libslurmhelper_la_DEPENDENCIES = $(am__DEPENDENCIES_1) am_libslurmhelper_la_OBJECTS = $(am__objects_1) libslurmhelper_la_OBJECTS = $(am_libslurmhelper_la_OBJECTS) DEFAULT_INCLUDES = -I. 
-I$(top_builddir) -I$(top_builddir)/slurm@am__isrc@ @@ -350,13 +350,6 @@ slurmapi_add = \ $(common_dir)/libeio.la \ -lpthread -libslurm_la_SOURCES = $(slurmapi_src) -libslurm_la_LIBADD = $(slurmapi_add) -libslurm_la_LDFLAGS = \ - $(LIB_LDFLAGS) \ - -version-info $(current):$(rev):$(age) \ - $(OTHER_FLAGS) - libslurmhelper_la_SOURCES = $(slurmapi_src) libslurmhelper_la_LIBADD = $(slurmapi_add) libslurmhelper_la_LDFLAGs = \ @@ -364,6 +357,13 @@ libslurmhelper_la_LDFLAGs = \ -version-info $(current):$(rev):$(age) convenience_libs = $(top_builddir)/src/api/libslurmhelper.la +libslurm_la_SOURCES = +libslurm_la_LIBADD = $(convenience_libs) +libslurm_la_LDFLAGS = \ + $(LIB_LDFLAGS) \ + -version-info $(current):$(rev):$(age) \ + $(OTHER_FLAGS) + libpmi_la_SOURCES = pmi.c libpmi_la_LIBADD = $(convenience_libs) libpmi_la_LDFLAGS = $(LIB_LDFLAGS) @@ -694,8 +694,6 @@ uninstall-am: uninstall-libLTLIBRARIES force: -$(libslurm_la_LIBADD) : force - @cd `dirname $@` && $(MAKE) `basename $@` $(convenience_libs) : force @cd `dirname $@` && $(MAKE) `basename $@` diff --git a/src/api/init_msg.c b/src/api/init_msg.c index a8e2ea98c3e8c0d9c6bf389bcabee33c1d422e00..528eb3850a888145ba8f60be095ddba555becb9f 100644 --- a/src/api/init_msg.c +++ b/src/api/init_msg.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * init_msg.c - initialize RPC messages contents - * $Id: init_msg.c 10912 2007-02-01 19:48:22Z jette $ + * $Id: init_msg.c 12457 2007-10-05 23:15:28Z jette $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -74,6 +74,7 @@ void slurm_init_job_desc_msg(job_desc_msg_t * job_desc_msg) job_desc_msg->job_min_procs = (uint16_t) NO_VAL; job_desc_msg->job_min_sockets = (uint16_t) NO_VAL; job_desc_msg->job_min_threads = (uint16_t) NO_VAL; + job_desc_msg->job_max_memory = NO_VAL; job_desc_msg->job_min_memory = NO_VAL; job_desc_msg->job_min_tmp_disk= NO_VAL; job_desc_msg->kill_on_node_fail = (uint16_t) NO_VAL; diff --git a/src/common/bitstring.c b/src/common/bitstring.c index c786560d8def6821abe78adb7737338510bcd37a..69d7029c763a8e97dd1626c1600037acf328dceb 100644 --- a/src/common/bitstring.c +++ b/src/common/bitstring.c @@ -981,7 +981,7 @@ bitfmt2int (char *bit_str_ptr) } else if (bit_str_ptr[i] == ',' || - bit_str_ptr[i] == (char) NULL) { + bit_str_ptr[i] == '\0') { if (i == 0) break; if (start_val == -1) diff --git a/src/common/env.c b/src/common/env.c index aeecc36a68e26ffea9c647f97c5c41e618770fdf..a8d139c6f9602faf1f0d390fd1226d435ed27f02 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * src/common/env.c - add an environment variable to environment vector - * $Id: env.c 12233 2007-09-07 22:03:42Z jette $ + * $Id: env.c 12448 2007-10-05 00:45:10Z jette $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -40,11 +40,13 @@ # include "config.h" #endif +#include <fcntl.h> #include <stdio.h> #include <stdarg.h> #include <stdlib.h> #include <string.h> #include <unistd.h> +#include <sys/poll.h> #include <sys/types.h> #include "src/common/macros.h" @@ -73,6 +75,8 @@ strong_alias(env_array_append_fmt, slurm_env_array_append_fmt); strong_alias(env_array_overwrite, slurm_env_array_overwrite); strong_alias(env_array_overwrite_fmt, slurm_env_array_overwrite_fmt); +#define SU_WAIT_MSEC 3000 /* 3000 msec for /bin/su to return user + * env vars for --get-user-env option */ /* * Return pointer to `name' entry in environment if found, or * pointer to the last entry (i.e. NULL) if `name' is not @@ -1230,40 +1234,120 @@ char **env_array_user_default(const char *username) char line[BUFSIZ]; char name[BUFSIZ]; char value[BUFSIZ]; - char *cmdstr = xstrdup(""); char **env = NULL; char *starttoken = "XXXXSLURMSTARTPARSINGHEREXXXX"; - char *stoptoken = "XXXXSLURMSTOPPARSINGHEREXXXXX"; - int len; + char *stoptoken = "XXXXSLURMSTOPPARSINGHEREXXXXX"; + char cmdstr[256]; + int fildes[2], found, fval, len, rc, timeleft; + pid_t child; + struct timeval begin, now; + struct pollfd ufds; if (geteuid() != (uid_t)0) { info("WARNING: you must be root to use --get-user-env"); return NULL; } - xstrfmtcat(cmdstr, "/bin/su - %s -c \"echo; echo; echo; echo %s; env; echo %s\" 2>/dev/null", - username, starttoken, stoptoken); - su = popen(cmdstr, "r"); - xfree(cmdstr); - if (su == NULL) { + if (pipe(fildes) < 0) { + error("pipe: %m"); return NULL; } - env = env_array_create(); + child = fork(); + if (child == -1) { + error("fork: %m"); + return NULL; + } + if (child == 0) { + close(0); + open("/dev/null", O_RDONLY); + dup2(fildes[1], 1); + close(2); + open("/dev/null", O_WRONLY); + snprintf(cmdstr, sizeof(cmdstr), + "echo; echo; echo; echo %s; env; echo %s", + starttoken, stoptoken); +#if 0 + /* execute .profile only */ + execl("/bin/su", "su", username, "-c", cmdstr, NULL); +#else + /* execute .login 
plus .profile */ + execl("/bin/su", "su", "-", username, "-c", cmdstr, NULL); +#endif + exit(1); + } + + close(fildes[1]); + if ((fval = fcntl(fildes[0], F_GETFL, 0)) >= 0) + fcntl(fildes[0], F_SETFL, fval | O_NONBLOCK); + su= fdopen(fildes[0], "r"); + + gettimeofday(&begin, NULL); + ufds.fd = fildes[0]; + ufds.events = POLLIN; /* First look for the start token in the output */ len = strlen(starttoken); - while (fgets(line, BUFSIZ, su) != NULL) { - if (0 == strncmp(line, starttoken, len)) { + found = 0; + while (!found) { + gettimeofday(&now, NULL); + timeleft = SU_WAIT_MSEC; + timeleft -= (now.tv_sec - begin.tv_sec) * 1000; + timeleft -= (now.tv_usec - begin.tv_usec) / 1000; + if (timeleft <= 0) + break; + if ((rc = poll(&ufds, 1, timeleft)) <= 0) { + if (rc == 0) { + verbose("timeout waiting for /bin/su to complete"); + break; + } + if ((errno == EINTR) || (errno == EAGAIN)) + continue; + error("poll: %m"); + break; + } + if ((ufds.revents & POLLERR) || (ufds.revents & POLLHUP)) break; + while (fgets(line, BUFSIZ, su)) { + if (!strncmp(line, starttoken, len)) { + found = 1; + break; + } } } + if (!found) { + error("Failed to get user environment variables"); + close(fildes[0]); + return NULL; + } /* Now read in the environment variable strings. 
*/ + env = env_array_create(); len = strlen(stoptoken); - while (fgets(line, BUFSIZ, su) != NULL) { + found = 0; + while (!found) { + gettimeofday(&now, NULL); + timeleft = SU_WAIT_MSEC; + timeleft -= (now.tv_sec - begin.tv_sec) * 1000; + timeleft -= (now.tv_usec - begin.tv_usec) / 1000; + if (timeleft <= 0) + break; + if ((rc = poll(&ufds, 1, timeleft)) <= 0) { + if (rc == 0) { + verbose("timeout waiting for /bin/su to complete"); + break; + } + if ((errno == EINTR) || (errno == EAGAIN)) + continue; + error("poll: %m"); + break; + } /* stop at the line containing the stoptoken string */ - if (0 == strncmp(line, stoptoken, len)) { + if ((ufds.revents & POLLERR) || (ufds.revents & POLLHUP)) + break; + if ((fgets(line, BUFSIZ, su) == 0) || + (!strncmp(line, stoptoken, len))) { + found = 1; break; } @@ -1271,7 +1355,8 @@ char **env_array_user_default(const char *username) _env_array_entry_splitter(line, name, BUFSIZ, value, BUFSIZ); env_array_overwrite(&env, name, value); } - pclose(su); + close(fildes[0]); return env; } + diff --git a/src/common/parse_spec.c b/src/common/parse_spec.c index f1146f55dbbd48aeb6f68a9c0bf5e4e5ee464f11..0bfbc78f0d179d23bf325ce408a8e4188c09d799 100644 --- a/src/common/parse_spec.c +++ b/src/common/parse_spec.c @@ -1,4 +1,4 @@ -/* $Id: parse_spec.c 10574 2006-12-15 23:38:29Z jette $ */ +/* $Id: parse_spec.c 12452 2007-10-05 19:07:07Z da $ */ /*****************************************************************************\ * parse_spec.c - configuration file parser ***************************************************************************** @@ -178,7 +178,7 @@ _load_integer (int *destination, char *keyword, char *in_line) if (str_ptr1 != NULL) { str_len1 = strlen (keyword); strcpy (scratch, str_ptr1 + str_len1); - if ((scratch[0] == (char) NULL) || + if ((scratch[0] == '\0') || (isspace ((int) scratch[0]))) { /* keyword with no value set */ *destination = 1; @@ -235,7 +235,7 @@ _load_long (long *destination, char *keyword, char *in_line) if 
(str_ptr1 != NULL) { str_len1 = strlen (keyword); strcpy (scratch, str_ptr1 + str_len1); - if ((scratch[0] == (char) NULL) || + if ((scratch[0] == '\0') || (isspace ((int) scratch[0]))) { /* keyword with no value set */ *destination = 1; diff --git a/src/common/parse_time.c b/src/common/parse_time.c index 1c24e614c4ec2b35c8d69a66c9d734d43afbe3ad..e70fe5f29a9cdc99cf80018f28ae256806b8a6b5 100644 --- a/src/common/parse_time.c +++ b/src/common/parse_time.c @@ -42,6 +42,8 @@ #define __USE_ISOC99 /* isblank() */ #include <ctype.h> +#include <slurm/slurm.h> + #define _RUN_STAND_ALONE 0 time_t time_now; @@ -434,7 +436,8 @@ slurm_make_time_str (time_t *time, char *string, int size) * days-hr:min:sec * days-hr * output: - * minutes (or -1 on error) + * minutes (or -1 on error) (or INFINITE value defined in slurm.h + * if unlimited is the value of string) */ extern int time_str2mins(char *string) { @@ -444,6 +447,10 @@ extern int time_str2mins(char *string) if ((string == NULL) || (string[0] == '\0')) return -1; /* invalid input */ + if (!strcasecmp(string, "UNLIMITED")) { + return INFINITE; + } + for (i=0; ; i++) { if ((string[i] >= '0') && (string[i] <= '9')) { tmp = (tmp * 10) + (string[i] - '0'); @@ -491,3 +498,25 @@ extern int time_str2mins(char *string) res++; /* round up */ return res; } + +extern void secs2time_str(time_t time, char *string, int size) +{ + if (time == INFINITE) { + snprintf(string, size, "UNLIMITED"); + } else { + long days, hours, minutes, seconds; + seconds = time % 60; + minutes = (time / 60) % 60; + hours = (time / 3600) % 24; + days = time / 86400; + + if (days) + snprintf(string, size, + "%ld-%2.2ld:%2.2ld:%2.2ld", + days, hours, minutes, seconds); + else + snprintf(string, size, + "%2.2ld:%2.2ld:%2.2ld", + hours, minutes, seconds); + } +} diff --git a/src/common/parse_time.h b/src/common/parse_time.h index 5b2caf0499c7d0dd44cb99f02cf154651392ae89..5b4a7b547c8f73c8f4b56d222d7ed720f376f9e6 100644 --- a/src/common/parse_time.h +++ 
b/src/common/parse_time.h @@ -77,4 +77,10 @@ slurm_make_time_str (time_t *time, char *string, int size); */ extern int time_str2mins(char *string); +/* Convert a time value into a string that can be converted back by + * time_str2mins. + * fill in string with HH:MM:SS or D-HH:MM:SS + */ +extern void secs2time_str(time_t time, char *string, int size); + #endif diff --git a/src/common/read_config.c b/src/common/read_config.c index b1dd0d7da1e141e42a88c737fc1c40d25719395b..6eb76072a47ce011bca30ac15f87b3d0d48f82ef 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -75,12 +75,7 @@ static s_p_hashtbl_t *conf_hashtbl = NULL; static slurm_ctl_conf_t *conf_ptr = &slurmctld_conf; static bool conf_initialized = false; -/* - * FIXME - If we eliminate the SlurmdPort option altogether, then - * default_slurmd_port and parse_slurmd_port can - * be removed. - */ -static uint16_t default_slurmd_port; +static uint16_t global_slurmd_port = SLURMD_PORT; static int parse_slurmd_port(void **dest, slurm_parser_enum_t type, const char *key, const char *value, const char *line, char **leftover); @@ -210,7 +205,7 @@ s_p_options_t slurm_conf_options[] = { /* * This function works almost exactly the same as the * default S_P_UINT32 handler, except that it also sets the - * global variable default_slurmd_port. + * global variable global_slurmd_port. 
*/ static int parse_slurmd_port(void **dest, slurm_parser_enum_t type, const char *key, const char *value, @@ -238,7 +233,7 @@ static int parse_slurmd_port(void **dest, slurm_parser_enum_t type, return -1; } - default_slurmd_port = (uint16_t) num; + global_slurmd_port = (uint16_t) num; ptr = (uint32_t *)xmalloc(sizeof(uint32_t)); *ptr = (uint32_t)num; @@ -328,10 +323,7 @@ static int parse_nodename(void **dest, slurm_parser_enum_t type, if (!s_p_get_uint16(&n->port, "Port", tbl) && !s_p_get_uint16(&n->port, "Port", dflt)) { - if (default_slurmd_port != 0) - n->port = default_slurmd_port; - else - n->port = SLURMD_PORT; + n->port = global_slurmd_port; } if (!s_p_get_uint16(&n->cpus, "Procs", tbl) @@ -917,8 +909,8 @@ extern uint16_t slurm_conf_get_port(const char *node_name) while (p) { if (strcmp(p->alias, node_name) == 0) { uint16_t port = p->port; - if ((!port || port == SLURMD_PORT)) - port = default_slurmd_port; + if (!port) + port = global_slurmd_port; slurm_conf_unlock(); return port; } @@ -1171,7 +1163,6 @@ _init_slurm_conf(const char *file_name) { char *name = (char *)file_name; /* conf_ptr = (slurm_ctl_conf_t *)xmalloc(sizeof(slurm_ctl_conf_t)); */ - default_slurmd_port = 0; if (name == NULL) { name = getenv("SLURM_CONF"); diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index b632b528b032f701de4691370bfc6c2a34815872..ff413445e38fc50afd3a6c32c616d7f9bb601d74 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -152,7 +152,7 @@ static slurm_errtab_t slurm_errtab[] = { { ESLURM_TRANSITION_STATE_NO_UPDATE, "Job can not be altered now, try again later" }, { ESLURM_ALREADY_DONE, - "Job/step already completed" }, + "Job/step already completing or completed" }, { ESLURM_INTERCONNECT_FAILURE, "Error configuring interconnect" }, { ESLURM_BAD_DIST, diff --git a/src/common/timers.c b/src/common/timers.c index 421cdae91824652024907fb9eef8a8c7100f56e6..06cdec28d09638e656cbe194f399f09414f2c4e1 100644 --- a/src/common/timers.c +++ 
b/src/common/timers.c @@ -45,16 +45,19 @@ * IN tv2 - end of event * OUT tv_str - place to put delta time in format "usec=%ld" * IN len_tv_str - size of tv_str in bytes + * IN from - where the function was called from */ inline void diff_tv_str(struct timeval *tv1,struct timeval *tv2, - char *tv_str, int len_tv_str) + char *tv_str, int len_tv_str, char *from) { long delta_t; delta_t = (tv2->tv_sec - tv1->tv_sec) * 1000000; delta_t += tv2->tv_usec - tv1->tv_usec; snprintf(tv_str, len_tv_str, "usec=%ld", delta_t); - if (delta_t > 1000000) - verbose("Warning: Note very large processing time: %s",tv_str); + if ((delta_t > 1000000) && from) { + verbose("Warning: Note very large processing time from %s: %s", + from, tv_str); + } } /* diff --git a/src/common/timers.h b/src/common/timers.h index 5d1f1588925a71f0118cce40c58296896f77845b..0404c4e67d4a5344128dceada713d58af930f069 100644 --- a/src/common/timers.h +++ b/src/common/timers.h @@ -43,7 +43,9 @@ #define DEF_TIMERS struct timeval tv1, tv2; char tv_str[20] #define START_TIMER gettimeofday(&tv1, NULL) #define END_TIMER gettimeofday(&tv2, NULL); \ - diff_tv_str(&tv1, &tv2, tv_str, 20) + diff_tv_str(&tv1, &tv2, tv_str, 20, NULL) +#define END_TIMER2(from) gettimeofday(&tv2, NULL); \ + diff_tv_str(&tv1, &tv2, tv_str, 20, from) #define DELTA_TIMER diff_tv(&tv1, &tv2) #define TIME_STR tv_str @@ -54,8 +56,8 @@ * OUT tv_str - place to put delta time in format "usec=%ld" * IN len_tv_str - size of tv_str in bytes */ -extern inline void diff_tv_str(struct timeval *tv1,struct timeval *tv2, - char *tv_str, int len_tv_str); +extern inline void diff_tv_str(struct timeval *tv1,struct timeval *tv2, + char *tv_str, int len_tv_str, char *from); /* * diff_tv - return the difference between two times diff --git a/src/plugins/sched/wiki/get_jobs.c b/src/plugins/sched/wiki/get_jobs.c index 5a54c9b6f548428c3b2755fd4ccb44cafdbc5162..8592fcf34c8bc7f395709b7682db0f9e1269516e 100644 --- a/src/plugins/sched/wiki/get_jobs.c +++ 
b/src/plugins/sched/wiki/get_jobs.c @@ -68,14 +68,27 @@ static uint32_t _get_job_time_limit(struct job_record *job_ptr); * RET 0 on success, -1 on failure * * Response format - * ARG=<cnt>#<JOBID>;UPDATE_TIME=<uts>;STATE=<state>;WCLIMIT=<time_limit>; - * TASKS=<cpus>;QUEUETIME=<submit_time>;STARTTIME=<time>; - * UNAME=<user>;GNAME=<group>;PARTITIONMASK=<part>; - * NODES=<node_cnt>;RMEM=<mem_size>;RDISK=<disk_space>; - * [COMPLETETIME=<end_time>;] - * [#<JOBID>;...]; + * ARG=<cnt>#<JOBID>; + * STATE=<state>; + * [HOSTLIST=<required_hosts>;] + * [TASKLIST=<allocated_hosts>;] + * [REJMESSAGE=<reason_job_failed>;] + * UPDATE_TIME=<uts>; + * WCLIMIT=<time_limit>; + * [TASKS=<required_cpus>;] + * [NODES=<required_node_cnt>;] + * QUEUETIME=<submit_time>; + * STARTTIME=<time>; + * PARTITIONMASK=<partition>; + * RMEM=<mem_size>; + * RDISK=<disk_space>; + * [COMPLETETIME=<end_time>;] + * [SUSPENDTIME=<time_suspended>;] + * [UNAME=<user>;] + * [GNAME=<group>;] + * [#<JOBID>;...]; + * */ -/* RET 0 on success, -1 on failure */ extern int get_jobs(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *tmp_char, *tmp_buf, *buf = NULL; @@ -217,17 +230,21 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) (uint32_t) _get_job_time_limit(job_ptr)); xstrcat(buf, tmp); - snprintf(tmp, sizeof(tmp), - "TASKS=%u;QUEUETIME=%u;STARTTIME=%u;", - _get_job_tasks(job_ptr), - _get_job_submit_time(job_ptr), - (uint32_t) job_ptr->start_time); - xstrcat(buf, tmp); + if (job_ptr->job_state == JOB_PENDING) { + /* Don't report actual tasks or nodes allocated since + * this can impact requeue on heterogenous clusters */ + snprintf(tmp, sizeof(tmp), + "TASKS=%u;NODES=%u;", + _get_job_tasks(job_ptr), + _get_job_min_nodes(job_ptr)); + xstrcat(buf, tmp); + } snprintf(tmp, sizeof(tmp), - "PARTITIONMASK=%s;NODES=%u;", - job_ptr->partition, - _get_job_min_nodes(job_ptr)); + "QUEUETIME=%u;STARTTIME=%u;PARTITIONMASK=%s;", + _get_job_submit_time(job_ptr), + (uint32_t) 
job_ptr->start_time, + job_ptr->partition); xstrcat(buf, tmp); snprintf(tmp, sizeof(tmp), diff --git a/src/plugins/sched/wiki/start_job.c b/src/plugins/sched/wiki/start_job.c index ceab8840e744c2a2b25cc96322ea0e8ee05c79a7..b581aef83502cda104be4331570ef8e03865e5f4 100644 --- a/src/plugins/sched/wiki/start_job.c +++ b/src/plugins/sched/wiki/start_job.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * start_job.c - Process Wiki start job request ***************************************************************************** - * Copyright (C) 2006 The Regents of the University of California. + * Copyright (C) 2006-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * UCRL-CODE-226842. @@ -117,9 +117,9 @@ static int _start_job(uint32_t jobid, char *hostlist, /* Write lock on job info, read lock on node info */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; - char *new_node_list; + char *new_node_list, *save_req_nodes = NULL; static char tmp_msg[128]; - bitstr_t *new_bitmap; + bitstr_t *new_bitmap, *save_req_bitmap = (bitstr_t *) NULL; lock_slurmctld(job_write_lock); job_ptr = find_job_record(jobid); @@ -170,9 +170,9 @@ static int _start_job(uint32_t jobid, char *hostlist, } /* start it now */ - xfree(job_ptr->details->req_nodes); + save_req_nodes = job_ptr->details->req_nodes; job_ptr->details->req_nodes = new_node_list; - FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); + save_req_bitmap = job_ptr->details->req_node_bitmap; job_ptr->details->req_node_bitmap = new_bitmap; job_ptr->priority = 100000000; @@ -182,6 +182,18 @@ static int _start_job(uint32_t jobid, char *hostlist, /* Check to insure the job was actually started */ lock_slurmctld(job_write_lock); /* job_ptr = find_job_record(jobid); don't bother */ + if ((job_ptr->job_id == jobid) && job_ptr->details && + 
(job_ptr->job_state == JOB_RUNNING)) { + /* Restore required node list */ + xfree(job_ptr->details->req_nodes); + job_ptr->details->req_nodes = save_req_nodes; + FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); + job_ptr->details->req_node_bitmap = save_req_bitmap; + } else { + xfree(save_req_nodes); + FREE_NULL_BITMAP(save_req_bitmap); + } + if ((job_ptr->job_id == jobid) && (job_ptr->job_state != JOB_RUNNING)) { uint16_t wait_reason = 0; diff --git a/src/plugins/sched/wiki2/event.c b/src/plugins/sched/wiki2/event.c index e56f7832dcfe05552a133c7ad7cec25e496b6718..f3795d2d5e3fc1d86c01c5672031429ddce4afa8 100644 --- a/src/plugins/sched/wiki2/event.c +++ b/src/plugins/sched/wiki2/event.c @@ -110,7 +110,9 @@ extern int event_notify(int event_code, char *desc) time_t now = time(NULL); int rc = 0, retry = 2; char *event_msg; + DEF_TIMERS; + START_TIMER; if (e_port == 0) { /* Event notification disabled */ return 0; @@ -169,6 +171,7 @@ extern int event_notify(int event_code, char *desc) } } pthread_mutex_unlock(&event_mutex); + END_TIMER2("event_notify"); return rc; } diff --git a/src/plugins/sched/wiki2/get_jobs.c b/src/plugins/sched/wiki2/get_jobs.c index c8bd554429bd8716bf8fcb17200bf03fba0a2c1b..0105748a07e4dfeb1371dc4e29315e13e3aa5c42 100644 --- a/src/plugins/sched/wiki2/get_jobs.c +++ b/src/plugins/sched/wiki2/get_jobs.c @@ -87,10 +87,10 @@ static uint32_t cr_enabled = 0, cr_test = 0; * [FLAGS=INTERACTIVE;] set if interactive (not batch) job * WCLIMIT=<secs>; wall clock time limit, seconds * TASKS=<cpus>; CPUs required + * NODES=<nodes>; nodes required * QUEUETIME=<uts>; submission time * STARTTIME=<uts>; time execution started * RCLASS=<partition>; SLURM partition name - * NODES=<nodes>; nodes required * RMEM=<MB>; MB of memory required * RDISK=<MB>; MB of disk space required * [COMMENT=<whatever>;] job dependency or account number @@ -105,7 +105,7 @@ static uint32_t cr_enabled = 0, cr_test = 0; /* RET 0 on success, -1 on failure */ extern int get_jobs(char 
*cmd_ptr, int *err_code, char **err_msg) { - char *arg_ptr, *tmp_char, *tmp_buf, *buf = NULL; + char *arg_ptr = NULL, *tmp_char = NULL, *tmp_buf = NULL, *buf = NULL; time_t update_time; /* Locks: read job, partition */ slurmctld_lock_t job_read_lock = { @@ -151,8 +151,8 @@ extern int get_jobs(char *cmd_ptr, int *err_code, char **err_msg) /* report all jobs */ buf = _dump_all_jobs(&job_rec_cnt, state_info); } else { - struct job_record *job_ptr; - char *job_name, *tmp2_char; + struct job_record *job_ptr = NULL; + char *job_name = NULL, *tmp2_char = NULL; uint32_t job_id; job_name = strtok_r(tmp_char, ":", &tmp2_char); @@ -269,16 +269,16 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) xstrcat(buf, tmp); snprintf(tmp, sizeof(tmp), - "TASKS=%u;QUEUETIME=%u;STARTTIME=%u;", + "TASKS=%u;NODES=%u;", _get_job_tasks(job_ptr), - _get_job_submit_time(job_ptr), - (uint32_t) job_ptr->start_time); + _get_job_min_nodes(job_ptr)); xstrcat(buf, tmp); snprintf(tmp, sizeof(tmp), - "RCLASS=%s;NODES=%u;", - job_ptr->partition, - _get_job_min_nodes(job_ptr)); + "QUEUETIME=%u;STARTTIME=%u;RCLASS=%s;", + _get_job_submit_time(job_ptr), + (uint32_t) job_ptr->start_time, + job_ptr->partition); xstrcat(buf, tmp); snprintf(tmp, sizeof(tmp), diff --git a/src/plugins/sched/wiki2/get_nodes.c b/src/plugins/sched/wiki2/get_nodes.c index 94cc9f9cad8572dd1fdfe9d436d1b8893c8ad097..44afd1a2ef73e5a221aa45af3e81a96c39e90b54 100644 --- a/src/plugins/sched/wiki2/get_nodes.c +++ b/src/plugins/sched/wiki2/get_nodes.c @@ -68,7 +68,7 @@ static char * _get_node_state(struct node_record *node_ptr); */ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) { - char *arg_ptr, *tmp_char, *tmp_buf, *buf = NULL; + char *arg_ptr = NULL, *tmp_char = NULL, *tmp_buf = NULL, *buf = NULL; time_t update_time; /* Locks: read node, read partition */ slurmctld_lock_t node_read_lock = { @@ -102,8 +102,8 @@ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) /* report all nodes */ 
buf = _dump_all_nodes(&node_rec_cnt, state_info); } else { - struct node_record *node_ptr; - char *node_name, *tmp2_char; + struct node_record *node_ptr = NULL; + char *node_name = NULL, *tmp2_char = NULL; node_name = strtok_r(tmp_char, ":", &tmp2_char); while (node_name) { @@ -134,7 +134,7 @@ static char * _dump_all_nodes(int *node_cnt, int state_info) { int i, cnt = 0; struct node_record *node_ptr = node_record_table_ptr; - char *tmp_buf, *buf = NULL; + char *tmp_buf = NULL, *buf = NULL; for (i=0; i<node_record_count; i++, node_ptr++) { if (node_ptr->name == NULL) diff --git a/src/plugins/sched/wiki2/job_modify.c b/src/plugins/sched/wiki2/job_modify.c index 82f17550ae70e03b45e6d48329bee1232fa9a7c5..e3a62654951e8e3241d248157ca37ac629aa2c60 100644 --- a/src/plugins/sched/wiki2/job_modify.c +++ b/src/plugins/sched/wiki2/job_modify.c @@ -70,7 +70,7 @@ static int32_t _get_depend_id(char *str) } static int _job_modify(uint32_t jobid, char *bank_ptr, - int32_t depend_id, + int32_t depend_id, char *new_hostlist, uint32_t new_node_cnt, char *part_name_ptr, uint32_t new_time_limit) { @@ -110,6 +110,60 @@ static int _job_modify(uint32_t jobid, char *bank_ptr, last_job_update = time(NULL); } + if (new_hostlist) { + int i, rc = 0, task_cnt; + hostlist_t hl; + char *tasklist; + + if (!job_ptr->details) { + /* Job is done, nothing to reset */ + if (new_hostlist[0] == '\0') + goto host_fini; + error("wiki: MODIFYJOB tasklist of non-pending " + "job %u", jobid); + return ESLURM_DISABLED; + } + + xfree(job_ptr->details->req_nodes); + FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); + if (new_hostlist[0] == '\0') + goto host_fini; + + tasklist = moab2slurm_task_list(new_hostlist, &task_cnt); + if (tasklist == NULL) { + rc = 1; + goto host_fini; + } + hl = hostlist_create(tasklist); + if (hl == 0) { + rc = 1; + goto host_fini; + } + hostlist_uniq(hl); + hostlist_sort(hl); + i = strlen(new_hostlist) + 16; + job_ptr->details->req_nodes = xmalloc(i); + i = hostlist_ranged_string(hl, i, 
job_ptr->details->req_nodes); + hostlist_destroy(hl); + if (i < 0) { + rc = 1; + goto host_fini; + } + if (node_name2bitmap(job_ptr->details->req_nodes, false, + &job_ptr->details->req_node_bitmap)) { + rc = 1; + goto host_fini; + } + +host_fini: if (rc) { + info("wiki: change job %u invalid hostlist %s", jobid, new_hostlist); + xfree(job_ptr->details->req_nodes); + return EINVAL; + } else { + info("wiki: change job %u hostlist %s", jobid, new_hostlist); + } + } + if (part_name_ptr) { struct part_record *part_ptr; part_ptr = find_part_record(part_name_ptr); @@ -151,7 +205,7 @@ static int _job_modify(uint32_t jobid, char *bank_ptr, extern int job_modify_wiki(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *bank_ptr, *depend_ptr, *nodes_ptr; - char *part_ptr, *time_ptr, *tmp_char; + char *host_ptr, *part_ptr, *time_ptr, *tmp_char; int slurm_rc; int depend_id = -1; uint32_t jobid, new_node_cnt = 0, new_time_limit = 0; @@ -180,6 +234,7 @@ extern int job_modify_wiki(char *cmd_ptr, int *err_code, char **err_msg) } bank_ptr = strstr(cmd_ptr, "BANK="); depend_ptr = strstr(cmd_ptr, "DEPEND="); + host_ptr = strstr(cmd_ptr, "HOSTLIST="); nodes_ptr = strstr(cmd_ptr, "NODES="); part_ptr = strstr(cmd_ptr, "PARTITION="); time_ptr = strstr(cmd_ptr, "TIMELIMIT="); @@ -200,6 +255,11 @@ extern int job_modify_wiki(char *cmd_ptr, int *err_code, char **err_msg) return -1; } } + if (host_ptr) { + host_ptr[8] = ':'; + host_ptr += 9; + _null_term(host_ptr); + } if (nodes_ptr) { nodes_ptr[5] = ':'; nodes_ptr += 6; @@ -226,7 +286,7 @@ extern int job_modify_wiki(char *cmd_ptr, int *err_code, char **err_msg) } lock_slurmctld(job_write_lock); - slurm_rc = _job_modify(jobid, bank_ptr, depend_id, + slurm_rc = _job_modify(jobid, bank_ptr, depend_id, host_ptr, new_node_cnt, part_ptr, new_time_limit); unlock_slurmctld(job_write_lock); if (slurm_rc != SLURM_SUCCESS) { diff --git a/src/plugins/sched/wiki2/msg.c b/src/plugins/sched/wiki2/msg.c index 
ff3f6a1a36db6a93e969fe0805030f46fac86db7..8161e15235575199cadd88c1abce6a1a30f81bb0 100644 --- a/src/plugins/sched/wiki2/msg.c +++ b/src/plugins/sched/wiki2/msg.c @@ -287,7 +287,7 @@ extern int parse_wiki_config(void) s_p_get_uint16(&job_aggregation_time, "JobAggregationTime", tbl); if (s_p_get_string(&exclude_partitions, "ExcludePartitions", tbl)) { - char *tok, *tok_p; + char *tok = NULL, *tok_p = NULL; tok = strtok_r(exclude_partitions, ",", &tok_p); i = 0; while (tok) { @@ -539,12 +539,14 @@ static int _parse_msg(char *msg, char **req) \*****************************************************************************/ static void _proc_msg(slurm_fd new_fd, char *msg) { - char *req, *cmd_ptr; + DEF_TIMERS; + char *req, *cmd_ptr, *msg_type = NULL; char response[128]; if (new_fd < 0) return; + START_TIMER; if (!msg) { err_code = -300; err_msg = "NULL request message"; @@ -565,40 +567,55 @@ static void _proc_msg(slurm_fd new_fd, char *msg) cmd_ptr +=4; err_code = 0; if (strncmp(cmd_ptr, "GETJOBS", 7) == 0) { + msg_type = "wiki:GETJOBS"; if (!get_jobs(cmd_ptr, &err_code, &err_msg)) goto free_resp_msg; } else if (strncmp(cmd_ptr, "GETNODES", 8) == 0) { + msg_type = "wiki:GETNODES"; if (!get_nodes(cmd_ptr, &err_code, &err_msg)) goto free_resp_msg; } else if (strncmp(cmd_ptr, "STARTJOB", 8) == 0) { + msg_type = "wiki:STARTJOB"; start_job(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "CANCELJOB", 9) == 0) { + msg_type = "wiki:CANCELJOB"; cancel_job(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "REQUEUEJOB", 10) == 0) { + msg_type = "wiki:REQUEUEJOB"; job_requeue_wiki(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "SUSPENDJOB", 10) == 0) { + msg_type = "wiki:SUSPENDJOB"; suspend_job(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "RESUMEJOB", 9) == 0) { + msg_type = "wiki:RESUMEJOB"; resume_job(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "JOBADDTASK", 10) == 0) { + msg_type = "wiki:JOBADDTASK"; 
job_add_task(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "JOBRELEASETASK", 14) == 0) { + msg_type = "wiki:JOBRELEASETASK"; job_release_task(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "JOBWILLRUN", 10) == 0) { + msg_type = "wiki:JOBWILLRUN"; job_will_run(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "MODIFYJOB", 9) == 0) { + msg_type = "wiki:MODIFYJOB"; job_modify_wiki(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "NOTIFYJOB", 9) == 0) { + msg_type = "wiki:NOTIFYJOB"; job_notify_wiki(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "SIGNALJOB", 9) == 0) { + msg_type = "wiki:SIGNALJOB"; job_signal_wiki(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "INITIALIZE", 10) == 0) { + msg_type = "wiki:INITIALIZE"; initialize_wiki(cmd_ptr, &err_code, &err_msg); } else { err_code = -300; err_msg = "unsupported request type"; error("wiki: unrecognized request type: %s", req); } + END_TIMER2(msg_type); resp_msg: snprintf(response, sizeof(response), diff --git a/src/plugins/sched/wiki2/start_job.c b/src/plugins/sched/wiki2/start_job.c index 224f05da8f097bafd1ad17e75b1bfee53f2ab59b..26d7f0e3b999b8cfd695121db25df0f8d9d4571a 100644 --- a/src/plugins/sched/wiki2/start_job.c +++ b/src/plugins/sched/wiki2/start_job.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * start_job.c - Process Wiki start job request ***************************************************************************** - * Copyright (C) 2006 The Regents of the University of California. + * Copyright (C) 2006-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * UCRL-CODE-226842. 
@@ -134,10 +134,10 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; char *new_node_list; static char tmp_msg[128]; - bitstr_t *new_bitmap; + bitstr_t *new_bitmap, *save_req_bitmap = (bitstr_t *) NULL; bitoff_t i, bsize; int ll; /* layout info index */ - char *node_name, *node_idx, *node_cur; + char *node_name, *node_idx, *node_cur, *save_req_nodes = NULL; size_t node_name_len; static uint32_t cr_test = 0, cr_enabled = 0; @@ -231,9 +231,9 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, } /* get job ready to start now */ - xfree(job_ptr->details->req_nodes); + save_req_nodes = job_ptr->details->req_nodes; job_ptr->details->req_nodes = new_node_list; - FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); + save_req_bitmap = job_ptr->details->req_node_bitmap; job_ptr->details->req_node_bitmap = new_bitmap; old_task_cnt = job_ptr->num_procs; job_ptr->num_procs = MAX(task_cnt, old_task_cnt); @@ -245,10 +245,27 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, /* No errors so far */ (void) schedule(); /* provides own locking */ + /* Check to insure the job was actually started */ lock_slurmctld(job_write_lock); - /* job_ptr = find_job_record(jobid); don't bother */ - if ((job_ptr->job_id == jobid) + if (job_ptr->job_id != jobid) + job_ptr = find_job_record(jobid); + if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details && + (job_ptr->job_state == JOB_RUNNING)) { + /* Restore required node list */ + xfree(job_ptr->details->req_nodes); + job_ptr->details->req_nodes = save_req_nodes; + FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); + job_ptr->details->req_node_bitmap = save_req_bitmap; + FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); + } else { + xfree(save_req_nodes); + FREE_NULL_BITMAP(save_req_bitmap); + if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details) + FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); + } + + if (job_ptr && 
(job_ptr->job_id == jobid) && (job_ptr->job_state != JOB_RUNNING)) { uint16_t wait_reason = 0; char *wait_string; diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.c b/src/plugins/select/bluegene/block_allocator/block_allocator.c index d9c6f3019946ccb4df74bbe0ffba6007ba70db16..e21260d2241fc4ba1fcdee5bc413d0c9d35395fd 100644 --- a/src/plugins/select/bluegene/block_allocator/block_allocator.c +++ b/src/plugins/select/bluegene/block_allocator/block_allocator.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * block_allocator.c - Assorted functions for layout of bglblocks, * wiring, mapping for smap, etc. - * $Id: block_allocator.c 12382 2007-09-21 21:13:23Z da $ + * $Id: block_allocator.c 12412 2007-09-26 17:16:01Z da $ ***************************************************************************** * Copyright (C) 2004 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -1046,13 +1046,13 @@ extern void ba_update_node_state(ba_node_t *ba_node, uint16_t state) } #ifdef HAVE_BG - debug("ba_update_node_state: new state of node[%c%c%c] is %s", - alpha_num[ba_node->coord[X]], alpha_num[ba_node->coord[Y]], - alpha_num[ba_node->coord[Z]], node_state_string(state)); + debug2("ba_update_node_state: new state of node[%c%c%c] is %s", + alpha_num[ba_node->coord[X]], alpha_num[ba_node->coord[Y]], + alpha_num[ba_node->coord[Z]], node_state_string(state)); #else - debug("ba_update_node_state: new state of node[%d] is %s", - ba_node->coord[X], - node_state_string(state)); + debug2("ba_update_node_state: new state of node[%d] is %s", + ba_node->coord[X], + node_state_string(state)); #endif /* basically set the node as used */ @@ -2465,7 +2465,7 @@ static int _copy_the_path(List nodes, ba_switch_t *curr_switch, [mark_node_tar[Z]]); _new_ba_node(ba_node, mark_node_tar, false); list_push(nodes, ba_node); - debug3("adding %c%c%c as a pass through", + debug3("haven't seen %c%c%c adding it", alpha_num[ba_node->coord[X]], alpha_num[ba_node->coord[Y]], alpha_num[ba_node->coord[Z]]); @@ -3416,8 +3416,8 @@ static int _set_external_wires(int dim, int count, ba_node_t* source, target = &ba_system_ptr->grid[4] [source->coord[Y]] [source->coord[Z]]; - /* 3->4 of 4th */ - _switch_config(source, target, dim, 3, 4); + /* 4->3 of 4th */ + _switch_config(source, target, dim, 4, 3); break; case 3: /* 3rd node */ @@ -3425,8 +3425,8 @@ static int _set_external_wires(int dim, int count, ba_node_t* source, target = &ba_system_ptr->grid[2] [source->coord[Y]] [source->coord[Z]]; - /* 3->4 of 2nd */ - _switch_config(source, target, dim, 3, 4); + /* 4->3 of 2nd */ + _switch_config(source, target, dim, 4, 3); break; case 4: /* 4th node */ @@ -3434,8 +3434,8 @@ static int _set_external_wires(int dim, int count, ba_node_t* source, target = &ba_system_ptr->grid[1] [source->coord[Y]] [source->coord[Z]]; - /* 3->4 of 2nd */ - _switch_config(source, target, dim, 3, 4); + /* 4->3 
of 1st */ + _switch_config(source, target, dim, 4, 3); break; default: @@ -3456,40 +3456,40 @@ static int _set_external_wires(int dim, int count, ba_node_t* source, target = &ba_system_ptr->grid[count-1] [source->coord[Y]] [source->coord[Z]]; - /* 3->4 of previous */ - _switch_config(source, target, dim, 3, 4); + /* 4->3 of previous */ + _switch_config(source, target, dim, 4, 3); break; case 2: /* 2nd Node */ target = &ba_system_ptr->grid[7] [source->coord[Y]] [source->coord[Z]]; - /* 3->4 of last */ - _switch_config(source, target, dim, 3, 4); + /* 4->3 of last */ + _switch_config(source, target, dim, 4, 3); break; case 3: /* 3rd Node */ target = &ba_system_ptr->grid[6] [source->coord[Y]] [source->coord[Z]]; - /* 3->4 of 6th */ - _switch_config(source, target, dim, 3, 4); + /* 4->3 of 6th */ + _switch_config(source, target, dim, 4, 3); break; case 6: /* 6th Node */ target = &ba_system_ptr->grid[3] [source->coord[Y]] [source->coord[Z]]; - /* 3->4 of 3rd */ - _switch_config(source, target, dim, 3, 4); + /* 4->3 of 3rd */ + _switch_config(source, target, dim, 4, 3); break; case 7: /* 7th Node */ target = &ba_system_ptr->grid[2] [source->coord[Y]] [source->coord[Z]]; - /* 3->4 of 2nd */ - _switch_config(source, target, dim, 3, 4); + /* 4->3 of 2nd */ + _switch_config(source, target, dim, 4, 3); break; default: fatal("got %d for a count on a %d X-dim system", @@ -3539,8 +3539,8 @@ static int _set_external_wires(int dim, int count, ba_node_t* source, target = &ba_system_ptr->grid[count-1] [source->coord[Y]] [source->coord[Z]]; - /* 3->4 of previous */ - _switch_config(source, target, dim, 3, 4); + /* 4->3 of previous */ + _switch_config(source, target, dim, 4, 3); break; default: fatal("got %d for a count on a %d X-dim system", @@ -4792,12 +4792,16 @@ static int *_set_best_path() return NULL; itr = list_iterator_create(best_path); while((path_switch = (ba_path_switch_t*) list_next(itr))) { - if(passthrough) + if(passthrough && path_switch->in > 1 && path_switch->out > 
1) { *passthrough = true; + debug2("got a passthrough"); + } #ifdef HAVE_BG - debug3("mapping %c%c%c",alpha_num[path_switch->geometry[X]], + debug3("mapping %c%c%c %d->%d", + alpha_num[path_switch->geometry[X]], alpha_num[path_switch->geometry[Y]], - alpha_num[path_switch->geometry[Z]]); + alpha_num[path_switch->geometry[Z]], + path_switch->in, path_switch->out); if(!geo) geo = path_switch->geometry; curr_switch = &ba_system_ptr-> @@ -4909,33 +4913,33 @@ int main(int argc, char** argv) /* } */ /* list_destroy(results); */ - results = list_create(NULL); - request->geometry[0] = 1; - request->geometry[1] = 1; - request->geometry[2] = 1; - request->start[0] = 0; - request->start[1] = 0; - request->start[2] = 0; - request->start_req = 1; - request->size = 1; - request->rotate = 0; - request->elongate = 0; - request->conn_type = SELECT_TORUS; - new_ba_request(request); - print_ba_request(request); - if(!allocate_block(request, results)) { - debug("couldn't allocate %c%c%c", - alpha_num[request->geometry[0]], - alpha_num[request->geometry[1]], - alpha_num[request->geometry[2]]); - } - list_destroy(results); +/* results = list_create(NULL); */ +/* request->geometry[0] = 1; */ +/* request->geometry[1] = 1; */ +/* request->geometry[2] = 1; */ +/* request->start[0] = 0; */ +/* request->start[1] = 0; */ +/* request->start[2] = 0; */ +/* request->start_req = 1; */ +/* request->size = 1; */ +/* request->rotate = 0; */ +/* request->elongate = 0; */ +/* request->conn_type = SELECT_TORUS; */ +/* new_ba_request(request); */ +/* print_ba_request(request); */ +/* if(!allocate_block(request, results)) { */ +/* debug("couldn't allocate %c%c%c", */ +/* alpha_num[request->geometry[0]], */ +/* alpha_num[request->geometry[1]], */ +/* alpha_num[request->geometry[2]]); */ +/* } */ +/* list_destroy(results); */ results = list_create(NULL); - request->geometry[0] = 4; + request->geometry[0] = 12; request->geometry[1] = 1; request->geometry[2] = 1; - request->start[0] = 1; + request->start[0] = 
0; request->start[1] = 0; request->start[2] = 0; request->start_req = 1; @@ -4990,6 +4994,7 @@ int main(int argc, char** argv) int endx=DIM_SIZE[X]; int endy=1;//DIM_SIZE[Y]; int endz=1;//DIM_SIZE[Z]; + for(x=startx;x<endx;x++) { for(y=starty;y<endy;y++) { for(z=startz;z<endz;z++) { diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index 96453ff37eeb9d47c348cea5a0e7361d7cbc6cc2..a134fd8136ae7619394cdef4e1ccfac992970586 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -2,7 +2,7 @@ * bg_job_place.c - blue gene job placement (e.g. base block selection) * functions. * - * $Id: bg_job_place.c 11400 2007-04-24 18:50:38Z da $ + * $Id: bg_job_place.c 12450 2007-10-05 18:22:36Z da $ ***************************************************************************** * Copyright (C) 2004 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -655,6 +655,10 @@ try_again: if(check_block_bp_states( (*found_bg_record)->bg_block_id) == SLURM_ERROR) { + error("_find_best_block_match: Marking " + "block %s in an error state " + "because of bad bps.", + (*found_bg_record)->bg_block_id); (*found_bg_record)->job_running = BLOCK_ERROR_STATE; (*found_bg_record)->state = RM_PARTITION_ERROR; diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index 5944fd621b0182f15850ae14ef86042b7ca824e8..c684d32f53578c3149d8ad49244e927e0f86b4fb 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * bluegene.c - blue gene node configuration processing module. 
* - * $Id: bluegene.c 11620 2007-06-04 20:03:56Z da $ + * $Id: bluegene.c 12450 2007-10-05 18:22:36Z da $ ***************************************************************************** * Copyright (C) 2004 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -683,7 +683,9 @@ end_it: bg_record->bg_block_id, bg_record->job_running); sleep(1); } + slurm_mutex_lock(&block_state_mutex); + error("Setting Block %s to ERROR state.", bg_record->bg_block_id); bg_record->job_running = BLOCK_ERROR_STATE; bg_record->state = RM_PARTITION_ERROR; slurm_mutex_unlock(&block_state_mutex); diff --git a/src/plugins/select/bluegene/plugin/opts.c b/src/plugins/select/bluegene/plugin/opts.c index a641b3d7cd1fd4cb06f593d0af6389cf3577f260..e196a982e6979addc39ffaea863f94df77594818 100644 --- a/src/plugins/select/bluegene/plugin/opts.c +++ b/src/plugins/select/bluegene/plugin/opts.c @@ -1,6 +1,6 @@ /****************************************************************************\ * opts.c - sfree command line option processing functions - * $Id: opts.c 10574 2006-12-15 23:38:29Z jette $ + * $Id: opts.c 12403 2007-09-25 18:36:42Z da $ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -97,31 +97,6 @@ void parse_command_line(int argc, char *argv[]) } -void snprint_time(char *buf, size_t buf_size, time_t time) -{ - if (time == INFINITE) { - snprintf(buf, buf_size, "UNLIMITED"); - } else { - long days, hours, minutes, seconds; - seconds = time % 60; - minutes = (time / 60) % 60; - hours = (time / 3600) % 24; - days = time / 86400; - - if (days) - snprintf(buf, buf_size, - "%ld:%2.2ld:%2.2ld:%2.2ld", - days, hours, minutes, seconds); - else if (hours) - snprintf(buf, buf_size, - "%ld:%2.2ld:%2.2ld", - hours, minutes, seconds); - else - snprintf(buf, buf_size, - "%ld:%2.2ld", minutes,seconds); - } -} - static void _print_version(void) { printf("%s %s\n", PACKAGE, SLURM_VERSION); diff --git a/src/plugins/select/bluegene/plugin/select_bluegene.c b/src/plugins/select/bluegene/plugin/select_bluegene.c index 8bb976eaacad053ab5e79983f037f51ccbd9d4ee..48752fc999b5a46c19328d2774e648061c9c459d 100644 --- a/src/plugins/select/bluegene/plugin/select_bluegene.c +++ b/src/plugins/select/bluegene/plugin/select_bluegene.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * select_bluegene.c - node selection plugin for Blue Gene system. * - * $Id: select_bluegene.c 11400 2007-04-24 18:50:38Z da $ + * $Id: select_bluegene.c 12409 2007-09-26 16:32:16Z jette $ ***************************************************************************** * Copyright (C) 2004-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -315,8 +315,7 @@ extern int select_p_state_save(char *dir_name) xfree(new_file); free_buf(buffer); - END_TIMER; - debug3("select_p_state_save %s", TIME_STR); + END_TIMER2("select_p_state_save"); return SLURM_SUCCESS; } diff --git a/src/plugins/select/cons_res/dist_tasks.c b/src/plugins/select/cons_res/dist_tasks.c index 8acd4d772afc147c1bb21686975f3c00faa1ae5a..1f6e1f4036d4c6a3c1682d64bfa0c3425a37a884 100644 --- a/src/plugins/select/cons_res/dist_tasks.c +++ b/src/plugins/select/cons_res/dist_tasks.c @@ -541,7 +541,7 @@ int cr_dist(struct select_cr_job *job, int cyclic, job->alloc_sockets[job_index] = 0; if ((cr_type == CR_CORE) || (cr_type == CR_CORE_MEMORY)) { for (j = 0; - j < node_record_table_ptr[host_index].cores; + j < job->num_sockets[job_index]; j++) job->alloc_cores[job_index][j] = 0; } @@ -752,7 +752,7 @@ int cr_plane_dist(struct select_cr_job *job, job->alloc_sockets[job_index] = 0; if ((cr_type == CR_CORE) || (cr_type == CR_CORE_MEMORY)) { for (j = 0; - j < node_record_table_ptr[host_index].cores; + j < job->num_sockets[job_index]; j++) job->alloc_cores[job_index][j] = 0; } diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 923348d84a4c30d5b06dfef107e3502e9b6cec43..6ee686efd8247dc22d6cca202142cb84e6a6f495 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -2,7 +2,7 @@ * select_cons_res.c - node selection plugin supporting consumable * resources policies. 
* - * $Id: select_cons_res.c 12266 2007-09-11 20:34:43Z jette $ + * $Id: select_cons_res.c 12452 2007-10-05 19:07:07Z da $ *****************************************************************************\ * * The following example below illustrates how four jobs are allocated @@ -1083,11 +1083,11 @@ extern int select_p_state_save(char *dir_name) { int error_code = SLURM_SUCCESS; ListIterator job_iterator; - struct select_cr_job *job; + struct select_cr_job *job = NULL; Buf buffer = NULL; int state_fd, i; uint16_t job_cnt; - char *file_name; + char *file_name = NULL; static time_t last_save_time; if (last_save_time > last_cr_update_time) @@ -1910,33 +1910,6 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, extern int select_p_job_begin(struct job_record *job_ptr) { - ListIterator job_iterator; - struct select_cr_job *job; - uint32_t cnt = 0; - int i; - - xassert(job_ptr); - xassert(select_cr_job_list); - - /* set job's processor count (for accounting purposes) */ - job_iterator = list_iterator_create(select_cr_job_list); - if (job_iterator == NULL) - fatal("list_iterator_create: %m"); - while ((job = (struct select_cr_job *) list_next(job_iterator))) { - if (job->job_id != job_ptr->job_id) - continue; - for (i=0; i<job->nhosts; i++) - cnt += MIN(job->cpus[i], job->alloc_lps[i]); - if (job_ptr->num_procs != cnt) { - debug2("cons_res: reset num_procs for %u from " - "%u to %u", - job_ptr->job_id, job_ptr->num_procs, cnt); - job_ptr->num_procs = cnt; - } - break; - } - list_iterator_destroy(job_iterator); - return SLURM_SUCCESS; } diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index 15ab34c94b5d5a9f0e9d1307c7c4c41e4c51ed5e..c97907a1d3f8c81b36d66eade58aa8d328c82b49 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -3,7 +3,7 @@ * address space. 
Selects nodes for a job so as to minimize the number * of sets of consecutive nodes using a best-fit algorithm. * - * $Id: select_linear.c 11246 2007-03-27 00:59:46Z jette $ + * $Id: select_linear.c 12419 2007-09-27 16:33:13Z jette $ ***************************************************************************** * Copyright (C) 2004-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -610,25 +610,17 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap, extern int select_p_job_begin(struct job_record *job_ptr) { - int i; - uint32_t cnt=0; #ifdef HAVE_XCPU - /* FIXME - rc is not returned! */ - int rc=SLURM_SUCCESS; -#endif + int i; + int rc = SLURM_SUCCESS; + char clone_path[128]; + xassert(job_ptr); xassert(job_ptr->node_bitmap); - /* set job's processor count (for accounting purposes) */ for (i=0; i<select_node_cnt; i++) { if (bit_test(job_ptr->node_bitmap, i) == 0) continue; - if (select_fast_schedule) - cnt += select_node_ptr[i].config_ptr->cpus; - else - cnt += select_node_ptr[i].cpus; -#ifdef HAVE_XCPU -{ char clone_path[128]; snprintf(clone_path, sizeof(clone_path), "%s/%s/xcpu/clone", XCPU_DIR, select_node_ptr[i].name); @@ -640,14 +632,11 @@ extern int select_p_job_begin(struct job_record *job_ptr) debug("chown %s to %u", clone_path, job_ptr->user_id); } -} -#endif } - debug2("reset num_proc for %u from %u to %u",job_ptr->job_id, - job_ptr->num_procs, cnt); - job_ptr->num_procs = cnt; - + return rc; +#else return SLURM_SUCCESS; +#endif } extern int select_p_job_fini(struct job_record *job_ptr) diff --git a/src/plugins/task/affinity/dist_tasks.c b/src/plugins/task/affinity/dist_tasks.c index abe3c3a3f0ed1f796bbee01b8064b0f44b6baa44..f1ba95232f51325c5acd3add522021b70a9da1e6 100644 --- a/src/plugins/task/affinity/dist_tasks.c +++ b/src/plugins/task/affinity/dist_tasks.c @@ -463,7 +463,6 @@ static bitstr_t *_lllp_map_abstract_mask (bitstr_t *bitmask) int num_bits = 
bit_size(bitmask); bitstr_t *newmask = bit_alloc(num_bits); - bit_nclear(newmask,0,num_bits-1); /* init to zero */ /* remap to physical machine */ for (i = 0; i < num_bits; i++) { if (bit_test(bitmask,i)) { @@ -1327,7 +1326,6 @@ static void _single_mask(const uint16_t nsockets, bitoff_t bit; bitoff_t num_bits = nsockets * ncores * nthreads; bitstr_t * bitmask = bit_alloc(num_bits); - bit_nclear(bitmask,0,num_bits-1); /* init to zero */ if (bind_to_exact_socket) { nsockets_left = 1; @@ -1355,7 +1353,10 @@ static void _single_mask(const uint16_t nsockets, while (nthreads_left-- > 0) { bit = SCT_TO_LLLP(socket, core, thread, ncores, nthreads); - bit_set(bitmask, bit); + if (bit < num_bits) + bit_set(bitmask, bit); + else + info("Invalid job cpu_bind mask"); thread++; } core++; @@ -1429,7 +1430,8 @@ static void _cr_reserve_unit(bitstr_t *bitmask, int cr_type) reserve_this_core = true; nthreads_left = 0; } - } + } else + info("Invalid job cpu_bind mask"); thread++; } /* mark entire core */ @@ -1443,7 +1445,10 @@ static void _cr_reserve_unit(bitstr_t *bitmask, int cr_type) ncores, nthreads); /* map abstract to machine */ bit = BLOCK_MAP(bit); - bit_set(bitmask, bit); + if (bit < num_bits) + bit_set(bitmask, bit); + else + info("Invalid job cpu_bind mask"); thread++; } } @@ -1463,7 +1468,10 @@ static void _cr_reserve_unit(bitstr_t *bitmask, int cr_type) ncores, nthreads); /* map abstract to machine */ bit = BLOCK_MAP(bit); - bit_set(bitmask, bit); + if (bit < num_bits) + bit_set(bitmask, bit); + else + info("Invalid job cpu_bind mask"); thread++; } core++; @@ -1475,22 +1483,28 @@ static void _cr_reserve_unit(bitstr_t *bitmask, int cr_type) } -void get_bitmap_from_cpu_bind(bitstr_t *bitmap_test, - cpu_bind_type_t cpu_bind_type, - char *cpu_bind, uint32_t numtasks) +static int _get_bitmap_from_cpu_bind(bitstr_t *bitmap_test, + cpu_bind_type_t cpu_bind_type, + char *cpu_bind, uint32_t numtasks) { char opt_dist[10]; char *dist_str = NULL; char *dist_str_next = NULL; int 
bitmap_size = bit_size(bitmap_test); + int rc = SLURM_SUCCESS; unsigned int i; dist_str = cpu_bind; if (cpu_bind_type & CPU_BIND_RANK) { - for (i = 0; i < MIN(numtasks,bitmap_size); i++) { - bit_set(bitmap_test, i); + for (i=0; i<numtasks; i++) { + if (i < bitmap_size) + bit_set(bitmap_test, i); + else { + info("Invalid job cpu_bind mask"); + return SLURM_ERROR; + } } - return; + return rc; } i = 0; @@ -1520,8 +1534,12 @@ void get_bitmap_from_cpu_bind(bitstr_t *bitmap_test, } else { mycpu = strtoul(opt_dist, NULL, 10); } - if (mycpu < bitmap_size) { + if (mycpu < bitmap_size) bit_set(bitmap_test, mycpu); + else { + info("Invalid job cpu_bind mask"); + rc = SLURM_ERROR; + /* continue and try to map remaining tasks */ } } @@ -1529,6 +1547,7 @@ void get_bitmap_from_cpu_bind(bitstr_t *bitmap_test, dist_str_next = NULL; i++; } + return rc; } @@ -1573,9 +1592,8 @@ static void _cr_update_lllp(int reserve, uint32_t job_id, uint32_t job_step_id, bitoff_t num_bits = conf->sockets * conf->cores * conf->threads; bitstr_t * bitmap_test = bit_alloc(num_bits); - bit_nclear(bitmap_test,0,num_bits-1); /* init to zero */ - get_bitmap_from_cpu_bind(bitmap_test, - cpu_bind_type, cpu_bind, numtasks); + _get_bitmap_from_cpu_bind(bitmap_test, + cpu_bind_type, cpu_bind, numtasks); _cr_reserve_unit(bitmap_test, conf->cr_type); diff --git a/src/slaunch/multi_prog.c b/src/slaunch/multi_prog.c index 46c3383ef9ae4fddf3da9b0385bc84de1ecd71c5..84125355adb7b699ef7609099b9469090c628195 100644 --- a/src/slaunch/multi_prog.c +++ b/src/slaunch/multi_prog.c @@ -55,7 +55,7 @@ static char * _build_path(char* fname) { int i; - char *path_env = NULL, *dir, *ptrptr; + char *path_env = NULL, *dir = NULL, *ptrptr = NULL; static char file_name[256], file_path[256]; /* return values */ struct stat buf; @@ -115,7 +115,8 @@ _set_range(int low_num, int high_num, char *exec_name) static void _set_exec_names(char *ranks, char *exec_name, int ntasks) { - char *range, *p, *ptrptr, *exec_path, *upper; + char *range = 
NULL, *p = NULL, *ptrptr = NULL; + char *exec_path = NULL, *upper = NULL; int low_num, high_num; if (ranks[0] == '*' && ranks[1] == '\0') { @@ -160,7 +161,7 @@ mpir_set_multi_name(int ntasks, const char *config_fname) { FILE *config_fd; char line[256]; - char *ranks, *exec_name, *p, *ptrptr; + char *ranks = NULL, *exec_name = NULL, *p = NULL, *ptrptr = NULL; int line_num = 0, i; for (i=0; i<ntasks; i++) { diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index 5a933616fdcfd67d0d7b34c3145c9951da8ecc8b..fff8cc7f2ba8870cc94e391a34917064a0d1fe36 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -2,7 +2,7 @@ * agent.c - parallel background communication functions. This is where * logic could be placed for broadcast communications. * - * $Id: agent.c 12370 2007-09-20 19:18:17Z jette $ + * $Id: agent.c 12462 2007-10-08 17:42:47Z jette $ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -1191,8 +1191,7 @@ void agent_queue_request(agent_arg_t *agent_arg_ptr) { queued_request_t *queued_req_ptr = NULL; - if ((agent_cnt < MAX_AGENT_CNT) || /* execute now */ - (agent_arg_ptr->msg_type == REQUEST_SHUTDOWN)) { + if (agent_arg_ptr->msg_type == REQUEST_SHUTDOWN) { /* execute now */ pthread_attr_t attr_agent; pthread_t thread_agent; int rc; @@ -1203,8 +1202,12 @@ void agent_queue_request(agent_arg_t *agent_arg_ptr) rc = pthread_create(&thread_agent, &attr_agent, agent, (void *) agent_arg_ptr); slurm_attr_destroy(&attr_agent); - if (rc == 0) + if (rc == 0) { + sleep(1); + if (!pthread_kill(thread_agent, 0)) + info("Shutdown agent still running"); return; + } } queued_req_ptr = xmalloc(sizeof(queued_request_t)); diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 5f0839bd908488cf4bddd5dd0a4063109e48deb1..10b0703e275bf565f3e4ec641b3cd064232312d9 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * controller.c - main control machine daemon for slurm - * $Id: controller.c 11973 2007-08-08 23:59:56Z jette $ + * $Id: controller.c 12452 2007-10-05 19:07:07Z da $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -921,16 +921,13 @@ static void *_slurmctld_background(void *no_data) slurmctld_conf.backup_addr[0] && (difftime(now, last_assert_primary_time) >= slurmctld_conf.slurmctld_timeout) && - node_name && slurmctld_conf.backup_controller && + slurmctld_conf.backup_controller && strcmp(node_name, slurmctld_conf.backup_controller)) { last_assert_primary_time = now; (void) _shutdown_backup_controller(0); } unlock_slurmctld(config_read_lock); - - END_TIMER; - if (DELTA_TIMER > 1000000) /* more than one second */ - info("_slurmctld_background loop %s", TIME_STR); + END_TIMER2("_slurmctld_background"); } debug3("_slurmctld_background shutting down"); @@ -1130,7 +1127,7 @@ static int _shutdown_backup_controller(int wait_time) START_TIMER; if (slurm_send_recv_rc_msg_only_one(&req, &rc, (CONTROL_TIMEOUT * 1000)) < 0) { - END_TIMER; + END_TIMER2("_shutdown_backup_controller"); error("_shutdown_backup_controller:send/recv: %m, %s", TIME_STR); return SLURM_ERROR; } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 4f929aa6333fcef735b1af83a98edbc326ee12df..521700b5e27b2b86b016cd49cc4b3d68568d519d 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -3,7 +3,7 @@ * Note: there is a global job list (job_list), time stamp * (last_job_update), and hash table (job_hash) * - * $Id: job_mgr.c 12339 2007-09-17 19:25:19Z jette $ + * $Id: job_mgr.c 12460 2007-10-05 23:50:48Z jette $ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -346,8 +346,7 @@ int dump_all_job_state(void) unlock_state_files(); free_buf(buffer); - END_TIMER; - debug3("dump_all_job_state %s", TIME_STR); + END_TIMER2("dump_all_job_state"); return error_code; } @@ -2027,12 +2026,14 @@ _copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id) { int error_code = 0; char *dir_name, job_dir[20], *file_name; + DEF_TIMERS; + START_TIMER; /* Create state_save_location directory */ dir_name = xstrdup(slurmctld_conf.state_save_location); /* Create job_id specific directory */ - sprintf(job_dir, "/job.%d", job_id); + sprintf(job_dir, "/job.%u", job_id); xstrcat(dir_name, job_dir); if (mkdir(dir_name, 0700)) { error("mkdir(%s) error %m", dir_name); @@ -2058,6 +2059,7 @@ _copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id) } xfree(dir_name); + END_TIMER2("_copy_job_desc_to_file"); return error_code; } diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index a56f8f99a90705ae12f68cfe068b30df53c7ba30..1823ae06d998d410b810661c2fa16eb810b85897 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -166,7 +166,9 @@ int schedule(void) #endif static bool wiki_sched = false; static bool wiki_sched_test = false; + DEF_TIMERS; + START_TIMER; /* don't bother trying to avoid fragmentation with sched/wiki */ if (!wiki_sched_test) { char *sched_type = slurm_get_sched_type(); @@ -271,6 +273,7 @@ int schedule(void) xfree(failed_parts); xfree(job_queue); unlock_slurmctld(job_write_lock); + END_TIMER2("schedule"); return job_cnt; } diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 574937bcd18626641b36d49ffd06ae3cbdd3beb8..5d83687d7c84560e1e16c0e2d938fead0a96f0be 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -4,7 +4,7 @@ * hash table (node_hash_table), time stamp (last_node_update) and * configuration list (config_list) * - * $Id: node_mgr.c 11759 2007-06-28 20:25:25Z jette $ + * $Id: node_mgr.c 12407 2007-09-25 22:51:43Z jette $ 
***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -306,8 +306,7 @@ int dump_all_node_state ( void ) unlock_state_files (); free_buf (buffer); - END_TIMER; - debug3("dump_all_node_state %s", TIME_STR); + END_TIMER2("dump_all_node_state"); return error_code; } @@ -973,9 +972,7 @@ void set_slurmd_addr (void) node_ptr->comm_name); } - END_TIMER; - debug("set_slurmd_addr: got IP addresses for all nodes %s", - TIME_STR); + END_TIMER2("set_slurmd_addr"); return; } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 6b684effbbbf82b6928f377c5b1d40c50682eca0..3aeb4c69e6eb6074ff9c7ef829bdff6bf2bf4c85 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -2,7 +2,7 @@ * node_scheduler.c - select and allocated nodes to jobs * Note: there is a global node table (node_record_table_ptr) * - * $Id: node_scheduler.c 12092 2007-08-22 20:39:02Z jette $ + * $Id: node_scheduler.c 12452 2007-10-05 19:07:07Z da $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -243,7 +243,7 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, */ static int _match_feature(char *seek, char *available) { - char *tmp_available, *str_ptr3, *str_ptr4; + char *tmp_available = NULL, *str_ptr3 = NULL, *str_ptr4 = NULL; int found; if (seek == NULL) @@ -1533,7 +1533,7 @@ static bitstr_t *_valid_features(char *requested, char *available) result = 1; /* assume good for now */ last_op = FEATURE_OP_AND; for (i=0; ; i++) { - if (tmp_requested[i] == (char) NULL) { + if (tmp_requested[i] == '\0') { if (strlen(str_ptr1) == 0) break; found = _match_feature(str_ptr1, available); @@ -1551,7 +1551,7 @@ static bitstr_t *_valid_features(char *requested, char *available) result = 0; break; } - tmp_requested[i] = (char) NULL; + tmp_requested[i] = '\0'; found = _match_feature(str_ptr1, available); if (last_op == FEATURE_OP_AND) result &= found; @@ -1561,7 +1561,7 @@ static bitstr_t *_valid_features(char *requested, char *available) last_op = FEATURE_OP_AND; } else if (tmp_requested[i] == '|') { - tmp_requested[i] = (char) NULL; + tmp_requested[i] = '\0'; found = _match_feature(str_ptr1, available); if (bracket != 0) { if (found) { @@ -1591,7 +1591,7 @@ static bitstr_t *_valid_features(char *requested, char *available) str_ptr1 = &tmp_requested[i + 1]; } else if (tmp_requested[i] == ']') { - tmp_requested[i] = (char) NULL; + tmp_requested[i] = '\0'; found = _match_feature(str_ptr1, available); if (found) { if (!result_bits) @@ -1615,7 +1615,7 @@ static bitstr_t *_valid_features(char *requested, char *available) && (bracket == 1)) { last_op = FEATURE_OP_OR; str_ptr1 = &tmp_requested[i + 2]; - } else if ((tmp_requested[i + 1] == (char) NULL) + } else if ((tmp_requested[i + 1] == '\0') && (bracket == 1)) { break; } else { diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c index 065a7363fdf5aca5a831d3da79bbb97e148ac106..62ab07a80b6ff2323ad1bf63397e78ffe391f5be 100644 --- a/src/slurmctld/partition_mgr.c +++ 
b/src/slurmctld/partition_mgr.c @@ -2,7 +2,7 @@ * partition_mgr.c - manage the partition information of slurm * Note: there is a global partition list (part_list) and * time stamp (last_part_update) - * $Id: partition_mgr.c 11781 2007-07-02 23:00:56Z jette $ + * $Id: partition_mgr.c 12452 2007-10-05 19:07:07Z da $ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -341,8 +341,7 @@ int dump_all_part_state(void) unlock_state_files(); free_buf(buffer); - END_TIMER; - debug3("dump_all_part_state %s", TIME_STR); + END_TIMER2("dump_all_part_state"); return 0; } @@ -945,7 +944,7 @@ uid_t *_get_groups_members(char *group_names) uid_t *group_uids = NULL; uid_t *temp_uids = NULL; int i, j, k; - char *tmp_names, *name_ptr, *one_group_name; + char *tmp_names = NULL, *name_ptr = NULL, *one_group_name = NULL; if (group_names == NULL) return NULL; diff --git a/src/slurmctld/power_save.c b/src/slurmctld/power_save.c index d3ee319a80c593029612c9b2dfc9002ceab4334c..8e894fef915c7d83673bbe3d9a189e0622e15822 100644 --- a/src/slurmctld/power_save.c +++ b/src/slurmctld/power_save.c @@ -344,8 +344,8 @@ static int _init_power_config(void) } if (exc_parts) { - char *tmp, *one_part, *part_list; - struct part_record *part_ptr; + char *tmp = NULL, *one_part = NULL, *part_list = NULL; + struct part_record *part_ptr = NULL; int rc = 0; part_list = xstrdup(exc_parts); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 8086588790262fd48a8c4acd8c71c13bce9068a5..f5768a3fb9d687662d8013cd58777262aea2727f 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * proc_req.c - process incomming messages to slurmctld * - * $Id: proc_req.c 11873 2007-07-25 21:08:46Z jette $ + * $Id: proc_req.c 12413 2007-09-26 
17:32:47Z jette $ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -482,7 +482,7 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) immediate, false, true, uid, &job_ptr); /* unlock after finished using the job structure data */ - END_TIMER; + END_TIMER2("_slurm_rpc_allocate_resources"); } /* return result */ @@ -558,8 +558,7 @@ static void _slurm_rpc_dump_conf(slurm_msg_t * msg) } else { _fill_ctld_conf(&config_tbl); unlock_slurmctld(config_read_lock); - END_TIMER; - debug2("_slurm_rpc_dump_conf %s", TIME_STR); + END_TIMER2("_slurm_rpc_dump_conf"); /* init response_msg structure */ slurm_msg_t_init(&response_msg); @@ -599,7 +598,7 @@ static void _slurm_rpc_dump_jobs(slurm_msg_t * msg) job_info_request_msg->show_flags, g_slurm_auth_get_uid(msg->auth_cred)); unlock_slurmctld(job_read_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_dump_jobs"); debug2("_slurm_rpc_dump_jobs, size=%d %s", dump_size, TIME_STR); @@ -634,7 +633,7 @@ static void _slurm_rpc_end_time(slurm_msg_t * msg) lock_slurmctld(job_read_lock); rc = job_end_time(time_req_msg, &timeout_msg); unlock_slurmctld(job_read_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_end_time"); if (rc != SLURM_SUCCESS) { slurm_send_rc_msg(msg, rc); @@ -674,7 +673,7 @@ static void _slurm_rpc_dump_nodes(slurm_msg_t * msg) pack_all_node(&dump, &dump_size, node_req_msg->show_flags, g_slurm_auth_get_uid(msg->auth_cred)); unlock_slurmctld(node_read_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_dump_nodes"); debug2("_slurm_rpc_dump_nodes, size=%d %s", dump_size, TIME_STR); @@ -698,13 +697,15 @@ static void _slurm_rpc_dump_partitions(slurm_msg_t * msg) char *dump; int dump_size; slurm_msg_t response_msg; - part_info_request_msg_t *part_req_msg = (part_info_request_msg_t *) msg->data; + part_info_request_msg_t *part_req_msg; + /* Locks: Read partition */ 
slurmctld_lock_t part_read_lock = { NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; START_TIMER; debug2("Processing RPC: REQUEST_PARTITION_INFO"); + part_req_msg = (part_info_request_msg_t *) msg->data; lock_slurmctld(part_read_lock); if ((part_req_msg->last_update - 1) >= last_part_update) { @@ -715,7 +716,7 @@ static void _slurm_rpc_dump_partitions(slurm_msg_t * msg) pack_all_part(&dump, &dump_size, part_req_msg->show_flags, g_slurm_auth_get_uid(msg->auth_cred)); unlock_slurmctld(part_read_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_dump_partitions"); debug2("_slurm_rpc_dump_partitions, size=%d %s", dump_size, TIME_STR); @@ -759,7 +760,7 @@ static void _slurm_rpc_epilog_complete(slurm_msg_t * msg) epilog_msg->return_code)) run_scheduler = true; unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_epilog_complete"); if (epilog_msg->return_code) error("_slurm_rpc_epilog_complete JobId=%u Node=%s Err=%s %s", @@ -805,7 +806,7 @@ static void _slurm_rpc_job_step_kill(slurm_msg_t * msg) job_step_kill_msg->signal, job_step_kill_msg->batch_flag, uid); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_job_step_kill"); /* return result */ if (error_code) { @@ -827,7 +828,7 @@ static void _slurm_rpc_job_step_kill(slurm_msg_t * msg) job_step_kill_msg->signal, uid); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_job_step_kill"); /* return result */ if (error_code) { @@ -877,7 +878,7 @@ static void _slurm_rpc_complete_job_allocation(slurm_msg_t * msg) error_code = job_complete(comp_msg->job_id, uid, job_requeue, comp_msg->job_rc); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_complete_job_allocation"); /* return result */ if (error_code) { @@ -917,7 +918,7 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg) if (!_is_super_user(uid)) { /* Only the slurmstepd can complete a batch script */ - END_TIMER; + END_TIMER2("_slurm_rpc_complete_batch_script"); return; } @@ -958,7 +959,7 @@ 
static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg) error_code = job_complete(comp_msg->job_id, uid, job_requeue, comp_msg->job_rc); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_complete_batch_script"); /* return result */ if (error_code) { @@ -1026,7 +1027,7 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg) } if (error_code == SLURM_SUCCESS) error_code = _make_step_cred(step_rec, &slurm_cred); - END_TIMER; + END_TIMER2("_slurm_rpc_job_step_create"); /* return result */ if (error_code) { @@ -1091,7 +1092,7 @@ static void _slurm_rpc_job_step_get_info(slurm_msg_t * msg) request->job_id, request->step_id, uid, request->show_flags, buffer); unlock_slurmctld(job_read_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_job_step_get_info"); if (error_code) { /* job_id:step_id not found or otherwise *\ \* error message is printed elsewhere */ @@ -1152,7 +1153,7 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg) error_code = job_allocate(job_desc_msg, true, true, true, uid, &job_ptr); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_job_will_run"); } /* return result */ @@ -1214,7 +1215,7 @@ static void _slurm_rpc_node_registration(slurm_msg_t * msg) node_reg_stat_msg->status); #endif unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_node_registration"); } /* return result */ @@ -1254,7 +1255,7 @@ static void _slurm_rpc_job_alloc_info(slurm_msg_t * msg) do_unlock = true; lock_slurmctld(job_read_lock); error_code = job_alloc_info(uid, job_info_msg->job_id, &job_ptr); - END_TIMER; + END_TIMER2("_slurm_rpc_job_alloc_info"); /* return result */ if (error_code || (job_ptr == NULL)) { @@ -1329,7 +1330,7 @@ static void _slurm_rpc_job_alloc_info_lite(slurm_msg_t * msg) do_unlock = true; lock_slurmctld(job_read_lock); error_code = job_alloc_info(uid, job_info_msg->job_id, &job_ptr); - END_TIMER; + END_TIMER2("_slurm_rpc_job_alloc_info_lite"); /* return result */ if (error_code || (job_ptr 
== NULL)) { @@ -1423,7 +1424,7 @@ static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg) unlock_slurmctld(config_write_lock); trigger_reconfig(); } - END_TIMER; + END_TIMER2("_slurm_rpc_reconfigure_controller"); /* return result */ if (error_code) { @@ -1582,7 +1583,7 @@ static void _slurm_rpc_step_complete(slurm_msg_t *msg) error_code = job_complete(req->job_id, uid, job_requeue, step_rc); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_step_complete"); /* return result */ if (error_code) { @@ -1599,7 +1600,7 @@ static void _slurm_rpc_step_complete(slurm_msg_t *msg) error_code = job_step_complete(req->job_id, req->job_step_id, uid, job_requeue, step_rc); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_step_complete"); /* return result */ if (error_code) { @@ -1643,7 +1644,7 @@ static void _slurm_rpc_step_layout(slurm_msg_t *msg) lock_slurmctld(job_read_lock); error_code = job_alloc_info(uid, req->job_id, &job_ptr); - END_TIMER; + END_TIMER2("_slurm_rpc_step_layout"); /* return result */ if (error_code || (job_ptr == NULL)) { unlock_slurmctld(job_read_lock); @@ -1727,7 +1728,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) error_code = _launch_batch_step(job_desc_msg, uid, &step_id); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_submit_batch_job"); if (error_code != SLURM_SUCCESS) { info("_launch_batch_step: %s", @@ -1756,7 +1757,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) job_desc_msg->immediate, false, false, uid, &job_ptr); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_submit_batch_job"); } /* return result */ @@ -1804,7 +1805,7 @@ static void _slurm_rpc_update_job(slurm_msg_t * msg) lock_slurmctld(job_write_lock); error_code = update_job(job_desc_msg, uid); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_update_job"); /* return result */ if (error_code) { @@ -1834,13 +1835,16 @@ static void 
_slurm_rpc_update_job(slurm_msg_t * msg) extern int slurm_drain_nodes(char *node_list, char *reason) { int error_code; + DEF_TIMERS; /* Locks: Write node */ slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK }; + START_TIMER; lock_slurmctld(node_write_lock); error_code = drain_nodes(node_list, reason); unlock_slurmctld(node_write_lock); + END_TIMER2("slurm_drain_nodes"); return error_code; } @@ -1856,13 +1860,16 @@ extern int slurm_drain_nodes(char *node_list, char *reason) extern int slurm_fail_job(uint32_t job_id) { int error_code; + DEF_TIMERS; /* Locks: Write job and node */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; + START_TIMER; lock_slurmctld(job_write_lock); error_code = job_fail(job_id); unlock_slurmctld(job_write_lock); + END_TIMER2("slurm_fail_job"); return error_code; } @@ -1895,7 +1902,7 @@ static void _slurm_rpc_update_node(slurm_msg_t * msg) lock_slurmctld(node_write_lock); error_code = update_node(update_node_msg_ptr); unlock_slurmctld(node_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_update_node"); } /* return result */ @@ -1951,7 +1958,7 @@ static void _slurm_rpc_update_partition(slurm_msg_t * msg) error_code = update_part(part_desc_ptr); unlock_slurmctld(part_write_lock); } - END_TIMER; + END_TIMER2("_slurm_rpc_update_partition"); } /* return result */ @@ -2000,7 +2007,7 @@ static void _slurm_rpc_delete_partition(slurm_msg_t * msg) lock_slurmctld(part_write_lock); error_code = delete_partition(part_desc_ptr); unlock_slurmctld(part_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_delete_partition"); } /* return result */ @@ -2031,7 +2038,7 @@ static void _slurm_rpc_job_ready(slurm_msg_t * msg) START_TIMER; error_code = job_node_ready(id_msg->job_id, &result); - END_TIMER; + END_TIMER2("_slurm_rpc_job_ready"); if (error_code) { debug2("_slurm_rpc_job_ready: %s", @@ -2063,7 +2070,7 @@ static void _slurm_rpc_node_select_info(slurm_msg_t * msg) debug2("Processing RPC: 
REQUEST_NODE_SELECT_INFO"); error_code = select_g_pack_node_info(sel_req_msg->last_update, &buffer); - END_TIMER; + END_TIMER2("_slurm_rpc_node_select_info"); if (error_code) { debug3("_slurm_rpc_node_select_info: %s", @@ -2120,7 +2127,7 @@ inline static void _slurm_rpc_suspend(slurm_msg_t * msg) lock_slurmctld(job_write_lock); error_code = job_suspend(sus_ptr, uid, msg->conn_fd); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_suspend"); if (error_code) { info("_slurm_rpc_suspend(%s) %u: %s", op, @@ -2153,7 +2160,7 @@ inline static void _slurm_rpc_requeue(slurm_msg_t * msg) error_code = job_requeue(uid, requeue_ptr->job_id, msg->conn_fd); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_requeue"); if (error_code) { info("_slurm_rpc_requeue %u: %s", requeue_ptr->job_id, @@ -2211,7 +2218,7 @@ inline static void _slurm_rpc_checkpoint(slurm_msg_t * msg) lock_slurmctld(job_write_lock); error_code = job_step_checkpoint(ckpt_ptr, uid, msg->conn_fd); unlock_slurmctld(job_write_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_checkpoint"); if (error_code) { if (ckpt_ptr->step_id == SLURM_BATCH_SCRIPT) @@ -2256,7 +2263,7 @@ inline static void _slurm_rpc_checkpoint_comp(slurm_msg_t * msg) lock_slurmctld(job_read_lock); error_code = job_step_checkpoint_comp(ckpt_ptr, uid, msg->conn_fd); unlock_slurmctld(job_read_lock); - END_TIMER; + END_TIMER2("_slurm_rpc_checkpoint_comp"); if (error_code) { info("_slurm_rpc_checkpoint_comp %u.%u: %s", @@ -2488,11 +2495,15 @@ inline static void _slurm_rpc_trigger_clear(slurm_msg_t * msg) int rc; uid_t uid; trigger_info_msg_t * trigger_ptr = (trigger_info_msg_t *) msg->data; + DEF_TIMERS; + START_TIMER; debug("Processing RPC: REQUEST_TRIGGER_CLEAR"); uid = g_slurm_auth_get_uid(msg->auth_cred); rc = trigger_clear(uid, trigger_ptr); + END_TIMER2("_slurm_rpc_trigger_clear"); + slurm_send_rc_msg(msg, rc); } @@ -2502,11 +2513,14 @@ inline static void _slurm_rpc_trigger_get(slurm_msg_t * msg) 
trigger_info_msg_t *resp_data; trigger_info_msg_t * trigger_ptr = (trigger_info_msg_t *) msg->data; slurm_msg_t response_msg; + DEF_TIMERS; + START_TIMER; debug("Processing RPC: REQUEST_TRIGGER_GET"); uid = g_slurm_auth_get_uid(msg->auth_cred); resp_data = trigger_get(uid, trigger_ptr); + END_TIMER2("_slurm_rpc_trigger_get"); slurm_msg_t_init(&response_msg); response_msg.address = msg->address; @@ -2522,11 +2536,15 @@ inline static void _slurm_rpc_trigger_set(slurm_msg_t * msg) uid_t uid; gid_t gid; trigger_info_msg_t * trigger_ptr = (trigger_info_msg_t *) msg->data; + DEF_TIMERS; + START_TIMER; debug("Processing RPC: REQUEST_TRIGGER_SET"); uid = g_slurm_auth_get_uid(msg->auth_cred); gid = g_slurm_auth_get_gid(msg->auth_cred); rc = trigger_set(uid, gid, trigger_ptr); + END_TIMER2("_slurm_rpc_trigger_set"); + slurm_send_rc_msg(msg, rc); } diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 9b01d27bcee2e7ce2239ff78ddf2ad77d4eeba47..09617451a21ca7964eb802b1bbae1c4dcd4cf7fb 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -803,10 +803,7 @@ int read_slurm_conf(int recover) old_select_type_p); slurmctld_conf.last_update = time(NULL); - END_TIMER; - debug("read_slurm_conf: finished loading configuration %s", - TIME_STR); - + END_TIMER2("read_slurm_conf"); return error_code; } diff --git a/src/slurmctld/trigger_mgr.c b/src/slurmctld/trigger_mgr.c index 8fa90cccc78874206e2b3422b0d33fb4af4da846..6b4871e491e0358ff00273d71f8882eb6d2d0098 100644 --- a/src/slurmctld/trigger_mgr.c +++ b/src/slurmctld/trigger_mgr.c @@ -817,6 +817,12 @@ static void _trigger_node_event(trig_mgr_info_t *trig_in, time_t now) } } +/* Ideally we would use the existing proctrack plugin to prevent any + * processes from escaping our control, but that plugin is tied + * to various slurmd data structures. We just the process group ID + * to kill the spawned program after MAX_PROG_TIME. 
Since triggers are + * meant primarily for system administrators rather than users, this + * may be sufficient. */ static void _trigger_run_program(trig_mgr_info_t *trig_in) { char program[1024], arg0[1024], arg1[1024], user_name[1024], *pname; diff --git a/src/smap/job_functions.c b/src/smap/job_functions.c index cf6f63195fcbf90aaf865b4a6d6d1d0dd0fef2ae..8536e27b773e7bff9216cc33827217d72ffcbfdf 100644 --- a/src/smap/job_functions.c +++ b/src/smap/job_functions.c @@ -38,6 +38,7 @@ #include "src/common/uid.h" #include "src/common/node_select.h" +#include "src/common/parse_time.h" #include "src/smap/smap.h" static int _get_node_cnt(job_info_t * job); @@ -308,7 +309,7 @@ static int _print_text_job(job_info_t * job_ptr) sprintf(time_buf,"0:00:00"); } else { time_diff = now_time - job_ptr->start_time; - snprint_time(time_buf, sizeof(time_buf), time_diff); + secs2time_str(time_diff, time_buf, sizeof(time_buf)); } width = strlen(time_buf); mvwprintw(text_win, main_ycord, @@ -374,7 +375,7 @@ static int _print_text_job(job_info_t * job_ptr) sprintf(time_buf,"0:00:00"); } else { time_diff = now_time - job_ptr->start_time; - snprint_time(time_buf, sizeof(time_buf), time_diff); + secs2time_str(time_diff, time_buf, sizeof(time_buf)); } printf("%10.10s ", time_buf); diff --git a/src/smap/opts.c b/src/smap/opts.c index 795b28ab094db6c61f7f88e2c33006cff5a55605..2abc184edd9126ce6a51fc4847027343b3a8940d 100644 --- a/src/smap/opts.c +++ b/src/smap/opts.c @@ -122,31 +122,6 @@ extern void parse_command_line(int argc, char *argv[]) } -extern void snprint_time(char *buf, size_t buf_size, time_t time) -{ - if (time == INFINITE) { - snprintf(buf, buf_size, "UNLIMITED"); - } else { - long days, hours, minutes, seconds; - seconds = time % 60; - minutes = (time / 60) % 60; - hours = (time / 3600) % 24; - days = time / 86400; - - if (days) - snprintf(buf, buf_size, - "%ld-%2.2ld:%2.2ld:%2.2ld", - days, hours, minutes, seconds); - else if (hours) - snprintf(buf, buf_size, - "%ld:%2.2ld:%2.2ld", 
- hours, minutes, seconds); - else - snprintf(buf, buf_size, - "%ld:%2.2ld", minutes,seconds); - } -} - extern void print_date() { time_t now_time = time(NULL); diff --git a/src/smap/partition_functions.c b/src/smap/partition_functions.c index 19a3880308172e6d6eaa41f1a54010256fd00aa8..5445894088c9fba31245081e2aec3dea90e1d565 100644 --- a/src/smap/partition_functions.c +++ b/src/smap/partition_functions.c @@ -39,6 +39,7 @@ #include "src/smap/smap.h" #include "src/common/node_select.h" +#include "src/common/parse_time.h" #include "src/api/node_select_info.h" #define _DEBUG 0 @@ -554,10 +555,10 @@ static int _print_text_part(partition_info_t *part_ptr, snprintf(time_buf, sizeof(time_buf), "infinite"); else { - snprint_time(time_buf, - sizeof(time_buf), - (part_ptr->max_time - * 60)); + secs2time_str((part_ptr->max_time + * 60), + time_buf, + sizeof(time_buf)); } width = strlen(time_buf); @@ -692,10 +693,10 @@ static int _print_text_part(partition_info_t *part_ptr, snprintf(time_buf, sizeof(time_buf), "infinite"); else { - snprint_time(time_buf, - sizeof(time_buf), - (part_ptr->max_time - * 60)); + secs2time_str((part_ptr->max_time + * 60), + time_buf, + sizeof(time_buf)); } width = strlen(time_buf); @@ -836,12 +837,17 @@ static int _addto_nodelist(List nodelist, int *start, int *end) int *coord = NULL; int x,y,z; - assert(end[X] < DIM_SIZE[X]); + if(end[X] < DIM_SIZE[X] + || end[Y] < DIM_SIZE[Y] + || end[Z] < DIM_SIZE[Z]) { + fatal("It appears the slurm.conf file has changed since " + "the last restart.\nThings are in an incompatible " + "state, please restart the slurmctld."); + } + assert(start[X] >= 0); - assert(end[Y] < DIM_SIZE[Y]); assert(start[Y] >= 0); - assert(end[Z] < DIM_SIZE[Z]); - assert(start[Z] >= 0); + assert(start[X] >= 0); for (x = start[X]; x <= end[X]; x++) { for (y = start[Y]; y <= end[Y]; y++) { diff --git a/src/smap/smap.h b/src/smap/smap.h index 8f1702e27baf51598406d298724cf3e8ef33c925..b6f3413f662a4dbe60ab511e6a3a49f3bae352e4 100644 --- 
a/src/smap/smap.h +++ b/src/smap/smap.h @@ -150,7 +150,6 @@ extern int set_grid_bg(int *start, int *end, int count, int set); extern void print_grid(int dir); extern void parse_command_line(int argc, char *argv[]); -extern void snprint_time(char *buf, size_t buf_size, time_t time); extern void print_date(); extern void clear_window(WINDOW *win); diff --git a/src/squeue/opts.c b/src/squeue/opts.c index f741828bfa90d037ae443e4f5729f0d30e2cc898..53112f89eb6eb9b6593cbb08474ce53aa2a1dc9e 100644 --- a/src/squeue/opts.c +++ b/src/squeue/opts.c @@ -1,7 +1,7 @@ /****************************************************************************\ * opts.c - srun command line option parsing * - * $Id: opts.c 10971 2007-02-13 16:44:23Z jette $ + * $Id: opts.c 12452 2007-10-05 19:07:07Z da $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -346,7 +346,8 @@ extern int parse_format( char* format ) { int field_size; bool right_justify; - char *prefix, *suffix, *token, *tmp_char, *tmp_format; + char *prefix = NULL, *suffix = NULL, *token = NULL; + char *tmp_char = NULL, *tmp_format = NULL; char field[1]; if (format == NULL) { @@ -772,9 +773,9 @@ static List _build_job_list( char* str ) { List my_list; - char *job, *tmp_char, *my_job_list; + char *job = NULL, *tmp_char = NULL, *my_job_list = NULL; int i; - uint32_t *job_id; + uint32_t *job_id = NULL; if ( str == NULL) return NULL; @@ -805,7 +806,7 @@ static List _build_part_list( char* str ) { List my_list; - char *part, *tmp_char, *my_part_list; + char *part = NULL, *tmp_char = NULL, *my_part_list = NULL; if ( str == NULL) return NULL; @@ -829,8 +830,8 @@ static List _build_state_list( char* str ) { List my_list; - char *state, *tmp_char, *my_state_list; - enum job_states *state_id; + char *state = NULL, *tmp_char = NULL, *my_state_list = NULL; + enum job_states *state_id = NULL; if ( str == NULL) return NULL; @@ -887,10 +888,10 @@ static List _build_step_list( char* str ) { List my_list; - char *step, *tmp_char, *tmps_char, *my_step_list; - char *job_name, *step_name; + char *step = NULL, *tmp_char = NULL, *tmps_char = NULL; + char *job_name = NULL, *step_name = NULL, *my_step_list = NULL; int i, j; - squeue_job_step_t *job_step_id; + squeue_job_step_t *job_step_id = NULL; if ( str == NULL) return NULL; @@ -931,9 +932,10 @@ static List _build_user_list( char* str ) { List my_list; - char *user, *tmp_char, *my_user_list, *end_ptr; - uint32_t *uid; - struct passwd *passwd_ptr; + char *user = NULL; + char *tmp_char = NULL, *my_user_list = NULL, *end_ptr = NULL; + uint32_t *uid = NULL; + struct passwd *passwd_ptr = NULL; if ( str == NULL) return NULL; diff --git a/src/srun/multi_prog.c b/src/srun/multi_prog.c index 5a98ebd03565d8643d098846bd40903d68d8aee8..af36c99f71927068263124b3c41f79f42f865dd1 100644 --- a/src/srun/multi_prog.c +++ 
b/src/srun/multi_prog.c @@ -66,7 +66,7 @@ static char * _build_path(char* fname) { int i; - char *path_env = NULL, *dir, *ptrptr; + char *path_env = NULL, *dir = NULL, *ptrptr = NULL; static char file_name[256], file_path[256]; /* return values */ struct stat buf; @@ -126,7 +126,8 @@ _set_range(int low_num, int high_num, char *exec_name) static void _set_exec_names(char *ranks, char *exec_name, int ntasks) { - char *range, *p, *ptrptr, *exec_path, *upper; + char *range = NULL, *p = NULL; + char *ptrptr = NULL, *exec_path = NULL, *upper = NULL; int low_num, high_num; if (ranks[0] == '*' && ranks[1] == '\0') { diff --git a/src/sview/block_info.c b/src/sview/block_info.c index b14ea447f5e3e2533f3ab38bdabb9d08ef0c3f8a..47654ea7f6defd63557a2143ad8bb4359ee0001c 100644 --- a/src/sview/block_info.c +++ b/src/sview/block_info.c @@ -573,24 +573,30 @@ extern int get_new_info_node_select(node_select_info_msg_t **node_select_ptr, static node_select_info_msg_t *new_bg_ptr = NULL; time_t now = time(NULL); static time_t last; + static bool changed = 0; if(!force && ((now - last) < global_sleep_time)) { *node_select_ptr = bg_info_ptr; + if(changed) + return SLURM_SUCCESS; return error_code; } last = now; if (bg_info_ptr) { error_code = slurm_load_node_select(bg_info_ptr->last_update, &new_bg_ptr); - if (error_code == SLURM_SUCCESS) + if (error_code == SLURM_SUCCESS) { select_g_free_node_info(&bg_info_ptr); - else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { + changed = 1; + } else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_NO_CHANGE_IN_DATA; new_bg_ptr = bg_info_ptr; + changed = 0; } } else { error_code = slurm_load_node_select((time_t) NULL, &new_bg_ptr); + changed = 1; } bg_info_ptr = new_bg_ptr; diff --git a/src/sview/common.c b/src/sview/common.c index 943842a4fc25cba99cb76c3bb632287ab6f5cc4f..62330728bc6cb0baec64a9950a5c9cab90e56ac9 100644 --- a/src/sview/common.c +++ b/src/sview/common.c @@ -273,31 +273,6 @@ static void 
_selected_page(GtkMenuItem *menuitem, xfree(treedata); } -extern void snprint_time(char *buf, size_t buf_size, time_t time) -{ - if (time == INFINITE) { - snprintf(buf, buf_size, "UNLIMITED"); - } else { - long days, hours, minutes, seconds; - seconds = time % 60; - minutes = (time / 60) % 60; - hours = (time / 3600) % 24; - days = time / 86400; - - if (days) - snprintf(buf, buf_size, - "%ld-%2.2ld:%2.2ld:%2.2ld", - days, hours, minutes, seconds); - else if (hours) - snprintf(buf, buf_size, - "%ld:%2.2ld:%2.2ld", - hours, minutes, seconds); - else - snprintf(buf, buf_size, - "%ld:%2.2ld", minutes,seconds); - } -} - extern int get_row_number(GtkTreeView *tree_view, GtkTreePath *path) { GtkTreeModel *model = gtk_tree_view_get_model(tree_view); diff --git a/src/sview/grid.c b/src/sview/grid.c index 7c3055b395b4b31265ea14d7469e439a204feaca..4d0fa0566b85eeb23a9d8383be1246d010b8cc15 100644 --- a/src/sview/grid.c +++ b/src/sview/grid.c @@ -628,7 +628,7 @@ extern int get_system_stats(GtkTable *table) if(label) gtk_widget_destroy(label); - + if((error_code = get_new_info_node(&node_info_ptr, force_refresh)) == SLURM_NO_CHANGE_IN_DATA) { changed = 0; @@ -817,7 +817,7 @@ extern void sview_init_grid() ListIterator itr = NULL; grid_button_t *grid_button = NULL; GdkColor color; - + if((error_code = get_new_info_node(&node_info_ptr, force_refresh)) == SLURM_NO_CHANGE_IN_DATA) { return; diff --git a/src/sview/job_info.c b/src/sview/job_info.c index a91cef4800124e4e3b69f77e58e8d3672fa7e571..d7c2be59bfed478a411494bd43ea9c013f309402 100644 --- a/src/sview/job_info.c +++ b/src/sview/job_info.c @@ -577,7 +577,7 @@ static const char *_set_job_msg(job_desc_msg_t *job_msg, const char *new_text, if(!strcasecmp(new_text, "infinite")) temp_int = INFINITE; else - temp_int = strtol(new_text, (char **)NULL, 10); + temp_int = time_str2mins((char *)new_text); type = "timelimit"; if(temp_int <= 0 && temp_int != INFINITE) @@ -883,7 +883,7 @@ static void _admin_edit_combo_box_job(GtkComboBox *combo, 
gtk_tree_model_get(model, &iter, 0, &name, -1); gtk_tree_model_get(model, &iter, 1, &column, -1); - + _set_job_msg(job_msg, name, column); g_free(name); @@ -1064,7 +1064,7 @@ static void _layout_job_record(GtkTreeView *treeview, + job_ptr->pre_sus_time); now_time = difftime(now_time, job_ptr->start_time); } - snprint_time(tmp_char, sizeof(tmp_char), now_time); + secs2time_str(now_time, tmp_char, sizeof(tmp_char)); nodes = sview_job_info_ptr->nodes; } add_display_treestore_line(update, treestore, &iter, @@ -1094,18 +1094,17 @@ static void _layout_job_record(GtkTreeView *treeview, find_col_name(display_data_job, SORTID_END_TIME), tmp_char); - snprint_time(tmp_char, sizeof(tmp_char), job_ptr->suspend_time); + secs2time_str(job_ptr->suspend_time, tmp_char, sizeof(tmp_char)); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_job, SORTID_SUSPEND_TIME), tmp_char); - if (job_ptr->time_limit == INFINITE) - sprintf(tmp_char, "UNLIMITED"); - else if (job_ptr->time_limit == NO_VAL) + if (job_ptr->time_limit == NO_VAL) sprintf(tmp_char, "Partition Limit"); else - snprint_time(tmp_char, sizeof(tmp_char), job_ptr->time_limit); + secs2time_str((job_ptr->time_limit * 60), + tmp_char, sizeof(tmp_char)); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_job, SORTID_TIMELIMIT), @@ -1455,7 +1454,7 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr, + job_ptr->pre_sus_time); now_time = difftime(now_time, job_ptr->start_time); } - snprint_time(tmp_char, sizeof(tmp_char), now_time); + secs2time_str(now_time, tmp_char, sizeof(tmp_char)); nodes = sview_job_info_ptr->nodes; } gtk_tree_store_set(treestore, iter, SORTID_TIME, tmp_char, -1); @@ -1476,12 +1475,11 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr, sizeof(tmp_char)); gtk_tree_store_set(treestore, iter, SORTID_SUSPEND_TIME, tmp_char, -1); - if (job_ptr->time_limit == INFINITE) - sprintf(tmp_char, "UNLIMITED"); - else if 
(job_ptr->time_limit == NO_VAL) + if (job_ptr->time_limit == NO_VAL) sprintf(tmp_char, "Partition Limit"); else - snprint_time(tmp_char, sizeof(tmp_char), job_ptr->time_limit); + secs2time_str((job_ptr->time_limit * 60), + tmp_char, sizeof(tmp_char)); gtk_tree_store_set(treestore, iter, SORTID_TIMELIMIT, tmp_char, -1); gtk_tree_store_set(treestore, iter, SORTID_ALLOC, 1, -1); @@ -1765,7 +1763,7 @@ static void _layout_step_record(GtkTreeView *treeview, state = JOB_PENDING; } else { now_time -= step_ptr->start_time; - snprint_time(tmp_time, sizeof(tmp_time), now_time); + secs2time_str(now_time, tmp_time, sizeof(tmp_time)); nodes = step_ptr->nodes; #ifdef HAVE_BG convert_num_unit((float)step_ptr->num_tasks, @@ -1855,7 +1853,7 @@ static void _update_step_record(job_step_info_t *step_ptr, state = JOB_PENDING; } else { now_time -= step_ptr->start_time; - snprint_time(tmp_time, sizeof(tmp_time), now_time); + secs2time_str(now_time, tmp_time, sizeof(tmp_time)); nodes = step_ptr->nodes; #ifdef HAVE_BG convert_num_unit((float)step_ptr->num_tasks, @@ -2360,10 +2358,13 @@ extern int get_new_info_job(job_info_msg_t **info_ptr, int error_code = SLURM_NO_CHANGE_IN_DATA; time_t now = time(NULL); static time_t last; + static bool changed = 0; if(!force && ((now - last) < global_sleep_time)) { error_code = SLURM_NO_CHANGE_IN_DATA; *info_ptr = job_info_ptr; + if(changed) + return SLURM_SUCCESS; return error_code; } last = now; @@ -2371,15 +2372,19 @@ extern int get_new_info_job(job_info_msg_t **info_ptr, if (job_info_ptr) { error_code = slurm_load_jobs(job_info_ptr->last_update, &new_job_ptr, show_flags); - if (error_code == SLURM_SUCCESS) + if (error_code == SLURM_SUCCESS) { slurm_free_job_info_msg(job_info_ptr); - else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { + changed = 1; + } else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_NO_CHANGE_IN_DATA; new_job_ptr = job_info_ptr; + changed = 0; } - } else + } else { error_code = 
slurm_load_jobs((time_t) NULL, &new_job_ptr, show_flags); + changed = 1; + } job_info_ptr = new_job_ptr; *info_ptr = new_job_ptr; return error_code; diff --git a/src/sview/node_info.c b/src/sview/node_info.c index 65c7446b2585a866265373bcee1efc793931bcc0..6348522e5736d1c288e826bb06c78b2027c0176b 100644 --- a/src/sview/node_info.c +++ b/src/sview/node_info.c @@ -455,9 +455,13 @@ extern int get_new_info_node(node_info_msg_t **info_ptr, int force) int error_code = SLURM_NO_CHANGE_IN_DATA; time_t now = time(NULL); static time_t last; - + static bool changed = 0; + if(!force && ((now - last) < global_sleep_time)) { *info_ptr = node_info_ptr; + if(changed) + return SLURM_SUCCESS; + return error_code; } last = now; @@ -466,15 +470,19 @@ extern int get_new_info_node(node_info_msg_t **info_ptr, int force) if (node_info_ptr) { error_code = slurm_load_node(node_info_ptr->last_update, &new_node_ptr, show_flags); - if (error_code == SLURM_SUCCESS) + if (error_code == SLURM_SUCCESS) { slurm_free_node_info_msg(node_info_ptr); - else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { + changed = 1; + } else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_NO_CHANGE_IN_DATA; new_node_ptr = node_info_ptr; + changed = 0; } - } else + } else { error_code = slurm_load_node((time_t) NULL, &new_node_ptr, show_flags); + changed = 1; + } node_info_ptr = new_node_ptr; *info_ptr = new_node_ptr; return error_code; diff --git a/src/sview/part_info.c b/src/sview/part_info.c index 47ce54d6057a9b54df76ccb3bdde9c30a45ff7cd..369da9dda97146eb16b01ab185846c95fa976e9d 100644 --- a/src/sview/part_info.c +++ b/src/sview/part_info.c @@ -27,6 +27,7 @@ \*****************************************************************************/ #include "src/sview/sview.h" +#include "src/common/parse_time.h" #define _DEBUG 0 @@ -203,6 +204,7 @@ enum { static display_data_t *local_display_data = NULL; static char *got_edit_signal = NULL; +static char *got_features_edit_signal = NULL; static void 
_update_part_sub_record(sview_part_sub_t *sview_part_sub, GtkTreeStore *treestore, @@ -303,7 +305,7 @@ static void _set_active_combo_part(GtkComboBox *combo, { char *temp_char = NULL; int action = 0; - int i = 0; + int i = 0, unknown_found = 0; char *upper = NULL; gtk_tree_model_get(model, iter, type, &temp_char, -1); @@ -330,6 +332,7 @@ static void _set_active_combo_part(GtkComboBox *combo, action = 2; else action = 0; + break; case SORTID_AVAIL: if(!strcmp(temp_char, "up")) action = 0; @@ -346,10 +349,13 @@ static void _set_active_combo_part(GtkComboBox *combo, else for(i = 0; i < NODE_STATE_END; i++) { upper = node_state_string(i); - if(!strcmp(upper, "UNKNOWN")) + if(!strcmp(upper, "UNKNOWN")) { + unknown_found++; continue; + } + if(!strcasecmp(temp_char, upper)) { - action = i + 2; + action = i + 2 - unknown_found; break; } } @@ -396,7 +402,7 @@ static const char *_set_part_msg(update_part_msg_t *part_msg, if ((strcasecmp(new_text,"infinite") == 0)) temp_int = INFINITE; else - temp_int = strtol(new_text, (char **)NULL, 10); + temp_int = time_str2mins((char *)new_text); type = "timelimit"; if(temp_int <= 0 && temp_int != INFINITE) @@ -425,25 +431,26 @@ static const char *_set_part_msg(update_part_msg_t *part_msg, break; case SORTID_ROOT: if (!strcasecmp(new_text, "yes")) { - part_msg->default_part = 1; + part_msg->root_only = 1; } else { - part_msg->default_part = 0; + part_msg->root_only = 0; } type = "root"; break; case SORTID_SHARE: if (!strcasecmp(new_text, "yes")) { - part_msg->default_part = SHARED_YES; + part_msg->shared = SHARED_YES; } else if (!strcasecmp(new_text, "no")) { - part_msg->default_part = SHARED_NO; + part_msg->shared = SHARED_NO; } else { - part_msg->default_part = SHARED_FORCE; + part_msg->shared = SHARED_FORCE; } type = "share"; break; case SORTID_GROUPS: type = "groups"; + part_msg->allow_groups = xstrdup(new_text); break; case SORTID_NODELIST: part_msg->nodes = xstrdup(new_text); @@ -459,7 +466,11 @@ static const char 
*_set_part_msg(update_part_msg_t *part_msg, case SORTID_STATE: type = (char *)new_text; got_edit_signal = xstrdup(new_text); - break; + break; + case SORTID_FEATURES: + type = "Update Features"; + got_features_edit_signal = xstrdup(new_text); + break; } return type; @@ -490,7 +501,7 @@ static void _admin_edit_combo_box_part(GtkComboBox *combo, g_print("nothing selected\n"); return; } - + gtk_tree_model_get(model, &iter, 0, &name, -1); gtk_tree_model_get(model, &iter, 1, &column, -1); @@ -765,8 +776,8 @@ static void _layout_part_record(GtkTreeView *treeview, if (part_ptr->max_time == INFINITE) snprintf(time_buf, sizeof(time_buf), "infinite"); else { - snprint_time(time_buf, sizeof(time_buf), - (part_ptr->max_time * 60)); + secs2time_str((part_ptr->max_time * 60), + time_buf, sizeof(time_buf)); } add_display_treestore_line(update, treestore, &iter, @@ -933,8 +944,8 @@ static void _update_part_record(sview_part_info_t *sview_part_info, if (part_ptr->max_time == INFINITE) snprintf(time_buf, sizeof(time_buf), "infinite"); else { - snprint_time(time_buf, sizeof(time_buf), - (part_ptr->max_time * 60)); + secs2time_str((part_ptr->max_time * 60), + time_buf, sizeof(time_buf)); } gtk_tree_store_set(treestore, iter, SORTID_TIMELIMIT, time_buf, -1); @@ -1032,6 +1043,7 @@ static void _update_part_sub_record(sview_part_sub_t *sview_part_sub, gtk_tree_store_set(treestore, iter, SORTID_STATE, lower, -1); xfree(lower); + gtk_tree_store_set(treestore, iter, SORTID_STATE_NUM, sview_part_sub->node_state, -1); @@ -1235,7 +1247,6 @@ static void _update_sview_part_sub(sview_part_sub_t *sview_part_sub, if(!node_scaling) node_scaling = 1; #endif - if (sview_part_sub->node_cnt == 0) { /* first node added */ sview_part_sub->node_state = node_ptr->node_state; sview_part_sub->features = xstrdup(node_ptr->features); @@ -1311,7 +1322,6 @@ static sview_part_sub_t *_create_sview_part_sub(partition_info_t *part_ptr, return NULL; } sview_part_sub_ptr->part_ptr = part_ptr; - 
sview_part_sub_ptr->node_state = node_ptr->node_state; sview_part_sub_ptr->node_cnt = node_scaling; @@ -1623,24 +1633,30 @@ extern int get_new_info_part(partition_info_msg_t **part_ptr, int force) int error_code = SLURM_NO_CHANGE_IN_DATA; time_t now = time(NULL); static time_t last; + static bool changed = 0; if(!force && ((now - last) < global_sleep_time)) { *part_ptr = part_info_ptr; + if(changed) + return SLURM_SUCCESS; return error_code; } last = now; if (part_info_ptr) { error_code = slurm_load_partitions(part_info_ptr->last_update, &new_part_ptr, SHOW_ALL); - if (error_code == SLURM_SUCCESS) + if (error_code == SLURM_SUCCESS) { slurm_free_partition_info_msg(part_info_ptr); - else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { + changed = 1; + } else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_NO_CHANGE_IN_DATA; new_part_ptr = part_info_ptr; + changed = 0; } } else { error_code = slurm_load_partitions((time_t) NULL, &new_part_ptr, SHOW_ALL); + changed = 1; } part_info_ptr = new_part_ptr; @@ -1656,59 +1672,66 @@ extern GtkListStore *create_model_part(int type) int i=0; switch(type) { case SORTID_DEFAULT: - model = gtk_list_store_new(1, G_TYPE_STRING, - G_TYPE_INT); + model = gtk_list_store_new(2, G_TYPE_STRING, G_TYPE_INT); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "yes", + 1, SORTID_DEFAULT, -1); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "no", + 1, SORTID_DEFAULT, -1); break; case SORTID_HIDDEN: - model = gtk_list_store_new(1, G_TYPE_STRING); + model = gtk_list_store_new(2, G_TYPE_STRING, G_TYPE_INT); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "yes", + 1, SORTID_HIDDEN, -1); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "no", + 1, SORTID_HIDDEN, -1); break; case SORTID_TIMELIMIT: case SORTID_MIN_NODES: - break; case SORTID_MAX_NODES: break; case SORTID_ROOT: - model = gtk_list_store_new(1, G_TYPE_STRING); + 
model = gtk_list_store_new(2, G_TYPE_STRING, G_TYPE_INT); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "yes", + 1, SORTID_ROOT, -1); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "no", + 1, SORTID_ROOT, -1); break; case SORTID_SHARE: - model = gtk_list_store_new(1, G_TYPE_STRING); + model = gtk_list_store_new(2, G_TYPE_STRING, G_TYPE_INT); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "yes", + 1, SORTID_SHARE, -1); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "no", + 1, SORTID_SHARE, -1); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "force", + 1, SORTID_SHARE, -1); break; case SORTID_GROUPS: @@ -1716,26 +1739,29 @@ extern GtkListStore *create_model_part(int type) case SORTID_NODELIST: break; case SORTID_AVAIL: - model = gtk_list_store_new(1, G_TYPE_STRING); + model = gtk_list_store_new(2, G_TYPE_STRING, G_TYPE_INT); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "up", + 1, SORTID_AVAIL, -1); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "down", + 1, SORTID_AVAIL, -1); break; case SORTID_STATE: - model = gtk_list_store_new(1, G_TYPE_STRING, - G_TYPE_INT); + model = gtk_list_store_new(2, G_TYPE_STRING, G_TYPE_INT); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "drain", + 1, SORTID_STATE, -1); gtk_list_store_append(model, &iter); gtk_list_store_set(model, &iter, 0, "resume", + 1, SORTID_STATE, -1); for(i = 0; i < NODE_STATE_END; i++) { upper = node_state_string(i); @@ -1746,6 +1772,7 @@ extern GtkListStore *create_model_part(int type) lower = str_tolower(upper); gtk_list_store_set(model, &iter, 0, lower, + 1, SORTID_STATE, -1); xfree(lower); } @@ -1797,8 +1824,13 @@ extern void admin_edit_part(GtkCellRendererText *cell, xfree(temp); goto no_input; } + + if(got_features_edit_signal) { + admin_part(GTK_TREE_MODEL(treestore), &iter, (char *)type); + goto 
no_input; + } - if(column != SORTID_STATE) { + if(column != SORTID_STATE && column != SORTID_FEATURES ) { if(old_text && !strcmp(old_text, new_text)) { temp = g_strdup_printf("No change in value."); display_edit_note(temp); @@ -2388,11 +2420,18 @@ extern void admin_part(GtkTreeModel *model, GtkTreeIter *iter, char *type) entry = _admin_full_edit_part(part_msg, model, iter); } else if(!strncasecmp("Update", type, 6)) { char *old_features = NULL; - gtk_tree_model_get(model, iter, SORTID_FEATURES, - &old_features, -1); + if(got_features_edit_signal) + old_features = got_features_edit_signal; + else + gtk_tree_model_get(model, iter, SORTID_FEATURES, + &old_features, -1); update_features_node(GTK_DIALOG(popup), nodelist, old_features); - g_free(old_features); + if(got_features_edit_signal) { + got_features_edit_signal = NULL; + xfree(old_features); + } else + g_free(old_features); goto end_it; } else { /* something that has to deal with a node state change */ @@ -2407,27 +2446,19 @@ extern void admin_part(GtkTreeModel *model, GtkTreeIter *iter, char *type) entry, TRUE, TRUE, 0); gtk_widget_show_all(popup); response = gtk_dialog_run (GTK_DIALOG(popup)); + if (response == GTK_RESPONSE_OK) { - switch(edit_type) { - case EDIT_AVAIL: - if(got_edit_signal) - goto end_it; - if(slurm_update_partition(part_msg) == SLURM_SUCCESS) { - temp = g_strdup_printf( - "Partition %s updated successfully", - partid); - } else { - temp = g_strdup_printf( - "Problem updating partition %s.", - partid); - } - display_edit_note(temp); - g_free(temp); - break; - default: - break; - + if(slurm_update_partition(part_msg) == SLURM_SUCCESS) { + temp = g_strdup_printf( + "Partition %s updated successfully", + partid); + } else { + temp = g_strdup_printf( + "Problem updating partition %s.", + partid); } + display_edit_note(temp); + g_free(temp); } end_it: @@ -2442,7 +2473,10 @@ end_it: admin_part(model, iter, type); xfree(type); } - + if(got_features_edit_signal) { + type = "Update Features"; + 
admin_part(model, iter, type); + } return; } diff --git a/src/sview/sview.h b/src/sview/sview.h index 6f508ead45a06eb5da4a0575aea04b78677939a6..5a57eefc84d26173650a679a0ca7ffa1bdbb63a7 100644 --- a/src/sview/sview.h +++ b/src/sview/sview.h @@ -357,7 +357,6 @@ extern void get_info_submit(GtkTable *table, display_data_t *display_data); extern void set_menus_submit(void *arg, GtkTreePath *path, GtkMenu *menu, int type); // common.c -extern void snprint_time(char *buf, size_t buf_size, time_t time); extern int get_row_number(GtkTreeView *tree_view, GtkTreePath *path); extern int find_col(display_data_t *display_data, int type); extern const char *find_col_name(display_data_t *display_data, int type); diff --git a/testsuite/expect/test1.90 b/testsuite/expect/test1.90 index 70e1192d887334383c58dbeb45c0e4a11f62889f..9b2dd825e3e5794f630008c8ca9054c5b665a028 100755 --- a/testsuite/expect/test1.90 +++ b/testsuite/expect/test1.90 @@ -42,9 +42,14 @@ print_header $test_id # Test if memory affinity support is supported. 
# set affinity 0 +set fast_sched 0 log_user 0 spawn $scontrol show config expect { + -re "FastSchedule *= ($number)" { + set fast_sched $expect_out(1,string) + exp_continue + } -re "task/affinity" { set affinity 1 exp_continue @@ -53,6 +58,10 @@ expect { wait } } +if {$fast_sched > 1} { + send_user "\nWARNING: FastSchedule > 1 not compatable with this test\n" + exit 0 +} spawn ls /usr/include/numa.h expect { -nocase "no such file" { @@ -80,7 +89,7 @@ exec $bin_chmod 700 $file_prog # # Create an allocation # -set srun_pid [spawn $srun --allocate -N1 --verbose -t2] +set srun_pid [spawn $srun --allocate -N1 --exclusive --verbose -t2] # # Run a job step to get allocated processor count and affinity diff --git a/testsuite/expect/test18.37 b/testsuite/expect/test18.37 index 0befe301b94f24e30d1e5f92f36909fcdfa13214..b4dcb6c0fc36c557aa4332bfececee72db130e7c 100755 --- a/testsuite/expect/test18.37 +++ b/testsuite/expect/test18.37 @@ -43,9 +43,14 @@ print_header $test_id # Test if memory affinity support is supported. 
# set affinity 0 +set fast_sched 0 log_user 0 spawn $scontrol show config expect { + -re "FastSchedule *= ($number)" { + set fast_sched $expect_out(1,string) + exp_continue + } -re "task/affinity" { set affinity 1 exp_continue @@ -54,6 +59,10 @@ expect { wait } } +if {$fast_sched > 1} { + send_user "\nWARNING: FastSchedule > 1 not compatable with this test\n" + exit 0 +} spawn ls /usr/include/numa.h expect { -nocase "no such file" { @@ -81,7 +90,7 @@ exec $bin_chmod 700 $file_prog # # Create an allocation # -set salloc_pid [spawn $salloc -N1 --verbose -t2 $bin_bash] +set salloc_pid [spawn $salloc -N1 --exclusive --verbose -t2 $bin_bash] expect { -re "Granted job allocation ($number)" { set jobid $expect_out(1,string) @@ -102,7 +111,7 @@ expect { expect -re $prompt set task_cnt 0 set full_mask 0 -send "$slaunch -c1 $file_prog\n" +send "$srun -c1 $file_prog\n" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { set full_mask $expect_out(2,string) @@ -115,7 +124,7 @@ expect { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 } -re $prompt @@ -126,7 +135,7 @@ expect { # set cpu_mask 0 set mem_mask 0 -send "$slaunch -c1 --mem_bind=rank $file_prog\n" +send "$srun -c1 --mem_bind=rank $file_prog\n" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { incr cpu_mask $expect_out(2,string) @@ -139,7 +148,7 @@ expect { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 } -re $prompt @@ -154,7 +163,7 @@ if {$mem_mask != $full_mask} { # set task_mask 0 set verbose_cnt 0 -send "$slaunch -c1 --mem_bind=verbose,map_mem:0 $file_prog\n" +send "$srun -c1 --mem_bind=verbose,map_mem:0 $file_prog\n" expect { -re 
"TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { incr task_mask $expect_out(3,string) @@ -166,7 +175,7 @@ expect { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 } -re $prompt @@ -176,7 +185,7 @@ if {$task_mask != $task_cnt} { set exit_code 1 } set verbose_cnt 0 -send "$slaunch -c1 --mem_bind=verbose,map_mem:0 $file_prog\n" +send "$srun -c1 --mem_bind=verbose,map_mem:0 $file_prog\n" expect { -re "mem_bind=MAP" { incr verbose_cnt @@ -188,7 +197,7 @@ expect { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 } -re $prompt @@ -201,7 +210,7 @@ if {$verbose_cnt != $task_cnt} { # # Run all tasks all bound to the same CPU's memory (local CPU) # -send "$slaunch -c1 --cpu_bind=rank --mem_bind=local $file_prog\n" +send "$srun -c1 --cpu_bind=rank --mem_bind=local $file_prog\n" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { if {$expect_out(2,string) != $expect_out(3,string)} { @@ -216,7 +225,7 @@ expect { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 exp_continue } @@ -230,7 +239,7 @@ set cpu_cnt 0 while {$cpu_cnt < $task_cnt} { set mask_sum 0 set mask [ expr 1 << $cpu_cnt ] - send "$slaunch -c1 --mem_bind=map_mem:$cpu_cnt $file_prog\n" + send "$srun -c1 --mem_bind=map_mem:$cpu_cnt $file_prog\n" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { incr mask_sum $expect_out(3,string) @@ -242,7 +251,7 @@ while {$cpu_cnt < $task_cnt} { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not 
responding or failure to recognize prompt\n" set exit_code 1 exp_continue } @@ -263,7 +272,7 @@ while {$cpu_cnt < $task_cnt} { set mask_sum 0 set mask [ expr 1 << $cpu_cnt ] set mstr [ dec2hex16 $mask] - send "$slaunch -c1 --mem_bind=mask_mem:$mstr $file_prog\n" + send "$srun -c1 --mem_bind=mask_mem:$mstr $file_prog\n" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { incr mask_sum $expect_out(3,string) @@ -275,7 +284,7 @@ while {$cpu_cnt < $task_cnt} { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 exp_continue } @@ -338,7 +347,7 @@ send_user "alt_mask: $alt_mask\n" # Run all tasks bound to a different CPU's memory by specifying a forward map # set task_mask 0 -send "$slaunch -c1 --mem_bind=map_mem:$fwd_map $file_prog\n" +send "$srun -c1 --mem_bind=map_mem:$fwd_map $file_prog\n" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { incr task_mask $expect_out(3,string) @@ -350,7 +359,7 @@ expect { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 exp_continue } @@ -365,7 +374,7 @@ if {$task_mask != $full_mask} { # Run all tasks bound to a different CPU's memory by specifying a reverse map # set task_mask 0 -send "$slaunch -c1 --mem_bind=map_mem:$rev_map $file_prog\n" +send "$srun -c1 --mem_bind=map_mem:$rev_map $file_prog\n" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { incr task_mask $expect_out(3,string) @@ -377,7 +386,7 @@ expect { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 exp_continue } @@ -392,7 +401,7 @@ if {$task_mask != $full_mask} { # 
Run all tasks bound to a different CPU's memroy by specifying an alternating map # set task_mask 0 -send "$slaunch -c1 --mem_bind=map_mem:$alt_map $file_prog\n" +send "$srun -c1 --mem_bind=map_mem:$alt_map $file_prog\n" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { incr task_mask $expect_out(3,string) @@ -404,7 +413,7 @@ expect { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 exp_continue } @@ -419,7 +428,7 @@ if {$task_mask != $full_mask} { # Run all tasks bound to a different CPU's memory by specifying a forward mask # set task_mask 0 -send "$slaunch -c1 --mem_bind=mask_mem:$fwd_mask $file_prog\n" +send "$srun -c1 --mem_bind=mask_mem:$fwd_mask $file_prog\n" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { incr task_mask $expect_out(3,string) @@ -431,7 +440,7 @@ expect { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 exp_continue } @@ -446,7 +455,7 @@ if {$task_mask != $full_mask} { # Run all tasks bound to a different CPU's memory by specifying a reverse mask # set task_mask 0 -send "$slaunch -c1 --mem_bind=mask_mem:$rev_mask $file_prog\n" +send "$srun -c1 --mem_bind=mask_mem:$rev_mask $file_prog\n" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { incr task_mask $expect_out(3,string) @@ -458,7 +467,7 @@ expect { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 exp_continue } @@ -473,7 +482,7 @@ if {$task_mask != $full_mask} { # Run all tasks bound to a different CPU's memory by specifying an alternating mask # set task_mask 0 -send "$slaunch 
-c1 --mem_bind=mask_mem:$alt_mask $file_prog\n" +send "$srun -c1 --mem_bind=mask_mem:$alt_mask $file_prog\n" expect { -re "TASK_ID:($number),CPU_MASK:($number),MEM_MASK:($number)" { incr task_mask $expect_out(3,string) @@ -485,7 +494,7 @@ expect { exp_continue } timeout { - send_user "\nFAILURE: slaunch not responding or failure to recognize prompt\n" + send_user "\nFAILURE: srun not responding or failure to recognize prompt\n" set exit_code 1 exp_continue } diff --git a/testsuite/expect/test5.4 b/testsuite/expect/test5.4 index c22a345a073a1fbfbec6774dc499274b3541bb61..3bb4112c6b0182573970ac3d75067a1bca750f14 100755 --- a/testsuite/expect/test5.4 +++ b/testsuite/expect/test5.4 @@ -90,8 +90,8 @@ if {$job_id2 == 0} { if {[wait_for_job $job_id1 RUNNING] != 0} { send_user "\nFAILURE to start job $job_id1\n" - cancel_job $$job_id1 - cancel_job $$job_id2 + cancel_job $job_id1 + cancel_job $job_id2 exit 1 } exec $bin_rm -f $file_in