diff --git a/AUTHORS b/AUTHORS index 1ca83d05b9f1ac10f19d4e99ebcbdf445803cda3..4811a692e77f512e9a29c4e5fe7b9c281271101c 100644 --- a/AUTHORS +++ b/AUTHORS @@ -30,6 +30,7 @@ Dan Phung <phung4(at)llnl.gov> Ashley Pitman <ashley(at)quadrics.com> Andy Riebs <Andy.Riebs(at)hp.com> Asier Roa <asier.roa(at)bsc.es> +Miguel Ros <miguel.ros(at)bsc.es> Federico Sacerdoti <Federico.Sacerdoti(at)deshaw.com> Jeff Squyres <jsquyres(at)lam-mpi.org> Keven Tew <tew1(at)llnl.gov> diff --git a/BUILD.NOTES b/BUILD.NOTES index 8e0a08d2dbe591e9719ca4c104e5fb821ad6fd74..273a2d816622b539e78ca7f2c137b447adebe538 100644 --- a/BUILD.NOTES +++ b/BUILD.NOTES @@ -10,6 +10,11 @@ Simple build/install on Linux: make make install +To build the files in the contribs directory: + make contrib + make install-contrib + (The RPMs are built by default) + If you make changes to any auxdir/* or Makefile.am file, then run _snowflake_ (where there are recent versions of autoconf, automake and libtool installed): @@ -41,16 +46,17 @@ Linux cluster (See BlueGene and AIX specific notes below for some differences). %_with_sgijob 1 (ON SYSTEMS WITHOUT ELAN SWITCH) I usually build with using the following syntax: build -s https://eris.llnl.gov/svn/slurm/tags/slurm-1-2-0-0-pre3 - NOTE: For v1.0 and earlier add: --pre-exec='./autogen.sh' -4. Move the RPMs to +4. Remove the RPMs that we don't want: + rm -f slurm-perlapi*rpm slurm-torque*rpm +5. Move the RPMs to /usr/local/admin/rpms/llnl/RPMS-RHEL4/x86_64 (odevi, or gauss) /usr/local/admin/rpms/llnl/RPMS-RHEL4/i386/ (adevi) /usr/local/admin/rpms/llnl/RPMS-RHEL4/ia64/ (tdevi) send an announcement email (with the latest entry from the NEWS file) out to linux-admin@lists.llnl.gov. -5. Copy tagged bzip file (e.g. slurm-0.6.0-0.pre3.bz2) to FTP server +6. Copy tagged bzip file (e.g. slurm-0.6.0-0.pre3.bz2) to FTP server for external SLURM users. -6. Copy bzip file and rpms (including src.rpm) to sourceforge.net: +7. 
Copy bzip file and rpms (including src.rpm) to sourceforge.net: ncftp upload.sf.net cd upload put filename @@ -64,8 +70,7 @@ BlueGene build notes: %_with_bluegene 1 %with_cflags CFLAGS=-m64 Build on Service Node with using the following syntax - build -s https://eris.llnl.gov/svn/slurm/tags/slurm-1-2-0-0-pre3 - 4. Copy RPMs to /usr/admin/sles/llnl/RPMS-SLES9 + rpmbuild -ta slurm-...bz2 To build and run on AIX: 0. svn co https://eris.llnl.gov/svn/slurm/trunk slurm @@ -114,8 +119,10 @@ To build and run on AIX: There will be a log file create named /tmp/mplog.<jobid>.<taskid> 7. If you update proctrack, be sure to run "slibclean" to clear cached version. -8. Install the rpms int /usr/admin/inst.images/slurm/aix5.3 on an OCF AIX - machine (pdev is a good choice). +8. Remove the RPMs that we don't want: + rm -f slurm-perlapi*rpm slurm-torque*rpm + and install the other RPMs into /usr/admin/inst.images/slurm/aix5.3 on an + OCF AIX machine (pdev is a good choice). AIX/Federation switch window problems To clean switch windows: ntblclean =w 8 -a sni0 diff --git a/META b/META index 51846978ecdf72fc28169e677f2aa335012f2eb0..8723309f560d0e1b1e016f37a22f9105a64a57be 100644 --- a/META +++ b/META @@ -3,8 +3,9 @@ Api_revision: 0 Major: 1 Meta: 1 - Micro: 19 + Micro: 20 Minor: 2 Name: slurm Release: 1 - Version: 1.2.19 + Release_tags: dist + Version: 1.2.20 diff --git a/NEWS b/NEWS index 113cb3f80bb6442d41d1300bef5903e249dec75b..964b4f1c3e0d968e3e71e1d9bc4927ab6f08bbf1 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,37 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 1.2.20 +========================= + -- In switch/federation, fix small memory leak affecting slurmd. + -- Add PMI_FANOUT_OFF_HOST environment variable to control how message + forwarding is done for PMI (MPICH2). See "man srun" for details. 
+ -- From sbatch set SLURM_NTASKS_PER_NODE when --ntasks-per-node option is + specified. + -- BLUEGENE: Documented the prefix should always be lower case and the 3 + digit suffix should be uppercase if any letters are used as digits. + -- In sched/wiki and sched/wiki2, add support for --cpus-per-task option. + From Miguel Ros, BSC. + -- In sched/wiki2, prevent invalid memory pointer (and likely seg fault) + for job associated with a partition that has since been deleted. + -- In sched/wiki2 plus select/cons_res, prevent invalid memory pointer + (and likely seg fault) when a job is requeued. + -- In sched/wiki, add support for job suspend, resume, and modify. + -- In sched/wiki, add support for processor allocation (not just node allocation) + with layout control. + -- Prevent re-sending job termination RPC to a node that has already completed + the job. Only send it to specific nodes which have not reported completion. + -- Support larger environment variables 64K instead of BUFSIZ (8k on some + systems). + -- If a job is being requeued, job step create requests will print a + warning and repeatedly retry rather than aborting. + -- Add optional mode value to srun and sbatch --get-user-env option. + -- Print error message and retry job submit commands when MaxJobCount + is reached. From Don Albert, Bull. + -- Treat invalid begin time specification as a fatal error in sbatch and + srun. From Don Albert, Bull. + -- Validate begin time specification to avoid hours >24, minutes >59, etc. + * Changes in SLURM 1.2.19 ========================= *** NOTE IMPORTANT CHANGE IN RPM BUILD BELOW **** @@ -2707,4 +2738,4 @@ documents those changes that are of interest to users and admins. -- Change directory to /tmp in slurmd if daemonizing. -- Logfiles are reopened on reconfigure. 
-$Id: NEWS 12599 2007-11-01 21:19:34Z jette $ +$Id: NEWS 12712 2007-11-29 00:12:08Z jette $ diff --git a/doc/html/bluegene.shtml b/doc/html/bluegene.shtml index 9121d3ce10df4a07c79aa107f8825de5ab87d002..8a4d01db2f3301ea0f98ae8eb645d9781c346e28 100644 --- a/doc/html/bluegene.shtml +++ b/doc/html/bluegene.shtml @@ -83,7 +83,7 @@ date <h3><a name="naming">Naming Convensions</a></h3> <p>The naming of base partitions includes a three-digit suffix representing the its -coordinates in the X, Y and Z dimensions with a zero origin. +coordinates in the X, Y and Z dimensions with a zero origin. For example, "bg012" represents the base partition whose coordinate is at X=0, Y=1 and Z=2. In a system configured with <i>small blocks</i> (any block less than a full base partition) there will be divisions into the base partition notation. For example, if there were 64 psets in the @@ -100,6 +100,24 @@ For example, "bg[620x731]" is used to represent the eight base partitions enclos with endpoints bg620 and bg731 (bg620, bg621, bg630, bg631, bg720, bg721, bg730 and bg731).</p></a> +<p> +<b>IMPORTANT:</b> As of SLURM version 1.2 SLURM can handle a bluegene +system of size 36x36x36. To try to keep with the 'three-digit suffix +representing the its coordinates in the X, Y and Z dimensions with a +zero origin', we now support A-Z as valid numbers. This makes it so +the prefix <b>must always be lower case</b>, and any letters in the +three-digit suffix <b> must always be upper case</b>. This schema +should be used in your slurm.conf file and in your bluegene.conf file +if you put a prefix there even though it is not necessary there. This +schema should also be used to specify midplanes or locations in +configure mode of smap. + +<br> +valid: bgl[000xC44] bgl000 bglZZZ +<br> +invalid: BGL[000xC44] BglC00 bglb00 Bglzzz +</p> + <p>One new tool provided is <i>smap</i>. As of SLURM verison 1.2, <i>sview</i> is another new tool offering even more viewing and configuring options. 
diff --git a/doc/html/team.shtml b/doc/html/team.shtml index 33695062f6da22eb6fcf4e998f4af53824159cdb..a2b16f1ff8e312350081d9480c225b2905598e87 100644 --- a/doc/html/team.shtml +++ b/doc/html/team.shtml @@ -49,6 +49,7 @@ Networking, Italy)</li> <li>Ashley Pittman (Quadrics)</li> <li>Andy Riebs (HP)</li> <li>Asier Roa (Barcelona Supercomputer Center, Spain)<li> +<li>Miguel Ros (Barcelona Supercomputer Center, Spain)</li> <li>Federico Sacerdoti (D.E. Shaw)<li> <li>Jeff Squyres (LAM MPI)</li> <li>Prashanth Tamraparni (HP, India)</li> diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index 490c9c252ad78ec5acbd8642a77d19041fa40a5d..3fe996045934959cae130c86a170e777dc7a56a1 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -24,10 +24,9 @@ a specific time of day (seconds are optional). You may also specify \fImidnight\fR, \fInoon\fR, or \fIteatime\fR (4pm) and you can have a time\-of\-day suffixed with \fIAM\fR or \fIPM\fR for running in the morning or the evening. -You can also say what day the job will be run, by giving -a date in the form \fImonth\-name\fR day with an optional year, -or giving a date of the form \fIMMDDYY\fR or \fIMM/DD/YY\fR -or \fIDD.MM.YY\fR. You can also +You can also say what day the job will be run, by specifying +a date of the form \fIMMDDYY\fR or \fIMM/DD/YY\fR +or \fIMM.DD.YY\fR. You can also give times like \fInow + count time\-units\fR, where the time\-units can be \fIminutes\fR, \fIhours\fR, \fIdays\fR, or \fIweeks\fR and you can tell SLURM to run the job today with the keyword @@ -61,7 +60,7 @@ If no nodes have the requested features, then the job will be rejected by the slurm job manager. .TP -\fB\-\-comment\fR +\fB\-\-comment\fR=<\fIstring\fR> An arbitrary comment. 
.TP diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index 532d4d8e6e7b1357f43c77876b86b97a1d218c1b..e317a291817843db5fc868272c1750d92fd97f11 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -34,10 +34,9 @@ a specific time of day (seconds are optional). You may also specify \fImidnight\fR, \fInoon\fR, or \fIteatime\fR (4pm) and you can have a time\-of\-day suffixed with \fIAM\fR or \fIPM\fR for running in the morning or the evening. -You can also say what day the job will be run, by giving -a date in the form \fImonth\-name\fR day with an optional year, -or giving a date of the form \fIMMDDYY\fR or \fIMM/DD/YY\fR -or \fIDD.MM.YY\fR. You can also +You can also say what day the job will be run, by specifying +a date of the form \fIMMDDYY\fR or \fIMM/DD/YY\fR +or \fIMM.DD.YY\fR. You can also give times like \fInow + count time\-units\fR, where the time\-units can be \fIminutes\fR, \fIhours\fR, \fIdays\fR, or \fIweeks\fR and you can tell SLURM to run the job today with the keyword @@ -78,7 +77,7 @@ the \-\-cpus\-per\-task=3 options, the controller knows that each task requires of 4 nodes, one for each of the 4 tasks. .TP -\fB\-\-comment\fR +\fB\-\-comment\fR=<\fIstring\fR> An arbitrary comment. .TP @@ -120,15 +119,25 @@ The order of the node names in the list is not important; the node names will be sorted my SLURM. .TP -\fB\-\-get\-user\-env\fR[=\fItimeout\fR] +\fB\-\-get\-user\-env\fR[=\fItimeout\fR][\fImode\fR] This option will tell sbatch to retrieve the -login environment variables for the user specified in the \-\-uid option. -The environment variables are retrieved by running "su - <username> -c -/usr/bin/env" and parsing the output. Be aware that any environment -variables already set in sbatch's environment will take precedence over any -environment variables in the user's login environment. -Optional timeout value is in seconds. Default value is 8 seconds. -NOTE: This option only works if the caller has an effective uid of "root". 
+login environment variables for the user specified in the \fB\-\-uid\fR option. +The environment variables are retrieved by running something of this sort +"su - <username> -c /usr/bin/env" and parsing the output. +Be aware that any environment variables already set in sbatch's environment +will take precedence over any environment variables in the user's +login environment. +The optional \fItimeout\fR value is in seconds. Default value is 8 seconds. +The optional \fImode\fR value controls the "su" options. +With a \fImode\fR value of "S", "su" is executed without the "\-" option. +With a \fImode\fR value of "L", "su" is executed with the "\-" option, +replicating the login environment. +If \fImode\fR is not specified, the mode established at SLURM build time +is used. +Examples of use include "\-\-get\-user\-env", "\-\-get\-user\-env=10", +"\-\-get\-user\-env=10L", and "\-\-get\-user\-env=S". +NOTE: This option only works if the caller has an +effective uid of "root". This option was originally created for use by Moab. .TP diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 13255266ca8dd84c31a00c325b15458af6d73ef2..74ad6370fa2ce841af14ff2a4bae12553a76b49f 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -348,10 +348,9 @@ a specific time of day (seconds are optional). You may also specify \fImidnight\fR, \fInoon\fR, or \fIteatime\fR (4pm) and you can have a time\-of\-day suffixed with \fIAM\fR or \fIPM\fR for running in the morning or the evening. -You can also say what day the job will be run, by giving -a date in the form \fImonth\-name\fR day with an optional year, -or giving a date of the form \fIMMDDYY\fR or \fIMM/DD/YY\fR -or \fIDD.MM.YY\fR. You can also +You can also say what day the job will be run, by specifying +a date of the form \fIMMDDYY\fR or \fIMM/DD/YY\fR +or \fIMM.DD.YY\fR. 
You can also give times like \fInow + count time\-units\fR, where the time\-units can be \fIminutes\fR, \fIhours\fR, \fIdays\fR, or \fIweeks\fR and you can tell SLURM to run the job today with the keyword diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index bcbc3cb435205636644f00727385f1c492d8bc42..c2ab45e664d27cd6840c0ae8dbb32d77ec125938 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -1,4 +1,4 @@ -\." $Id: srun.1 12574 2007-10-26 17:00:52Z jette $ +\." $Id: srun.1 12697 2007-11-27 22:02:29Z jette $ .\" .TH SRUN "1" "July 2007" "srun 1.2" "slurm components" @@ -99,10 +99,9 @@ a specific time of day (seconds are optional). You may also specify \fImidnight\fR, \fInoon\fR, or \fIteatime\fR (4pm) and you can have a time\-of\-day suffixed with \fIAM\fR or \fIPM\fR for running in the morning or the evening. -You can also say what day the job will be run, by giving -a date in the form \fImonth\-name\fR day with an optional year, -or giving a date of the form \fIMMDDYY\fR or \fIMM/DD/YY\fR -or \fIDD.MM.YY\fR. You can also +You can also say what day the job will be run, by specifying +a date of the form \fIMMDDYY\fR or \fIMM/DD/YY\fR +or \fIMM.DD.YY\fR. You can also give times like \fInow + count time\-units\fR, where the time\-units can be \fIseconds\fR (default), \fIminutes\fR, \fIhours\fR, \fIdays\fR, or \fIweeks\fR and you can tell SLURM to run @@ -221,7 +220,7 @@ tasks will be allocated per node as possible while satisfying the \fB\-c\fR restriction. .TP -\fB\-\-comment\fR +\fB\-\-comment\fR=<\fIstring\fR> An arbitrary comment. .TP @@ -282,16 +281,26 @@ even if consumable resources are enabled (e.g. \fBSelectType=select/cons_res\fR). .TP -\fB\-\-get\-user\-env\fR[=\fItimeout\fR] +\fB\-\-get\-user\-env\fR[=\fItimeout\fR][\fImode\fR] For a batch script submission, this option will tell srun to retrieve the -login environment variables for the user specified in the \-\-uid option. 
-The environment variables are retrieved by running "su - <username> -c -/usr/bin/env" and parsing the output. Be aware that any environment -variables already set in srun's environment will take precedence over any -environment variables in the user's login environment. -Optional timeout value is in seconds. Default value is 8 seconds. +login environment variables for the user specified in the \fB\-\-uid\fR option. +The environment variables are retrieved by running something of this sort +"su - <username> -c /usr/bin/env" and parsing the output. +Be aware that any environment variables already set in srun's environment +will take precedence over any environment variables in the user's +login environment. +The optional \fItimeout\fR value is in seconds. Default value is 8 seconds. +The optional \fImode\fR value controls the "su" options. +With a \fImode\fR value of "S", "su" is executed without the "\-" option. +With a \fImode\fR value of "L", "su" is executed with the "\-" option, +replicating the login environment. +If \fImode\fR is not specified, the mode established at SLURM build time +is used. +Examples of use include "\-\-get\-user\-env", "\-\-get\-user\-env=10", +"\-\-get\-user\-env=10L", and "\-\-get\-user\-env=S". NOTE: This option only works if the caller has an -effective uid of "root", and only takes effect in batch mode (\-b/\-\-batch). +effective uid of "root", and only takes effect in batch mode +(\fB\-b\fR/\fB\-\-batch\fR). This option was originally created for use by Moab. .TP @@ -622,6 +631,8 @@ comma\-separated and case insensitive types are recongnized: \fBBULK_XFER\fR and adapter names (e.g. \fBSNI0\fR and \fBSNI1\fR). For more information, on IBM systems see \fIpoe\fR documenation on the environment variables \fBMP_EUIDEVICE\fR and \fBMP_USE_BULK_XFER\fR. +Note that only four job steps may be active at once on a node with the +\fBBULK_XFER\fR option due to limitations in the Federation switch driver. 
.TP \fB\-\-nice\fR[=\fIadjustment]\fR @@ -1185,6 +1196,19 @@ offload work from the srun command to the applications and likely increase the vulernability to failures. The default value is 32. .TP +\fBPMI_FANOUT_OFF_HOST\fR +This is used exclusively with PMI (MPICH2 and MVAPICH2) and +controls the fanout of data communications. The srun command +sends messages to application programs (via the PMI library) +and those applications may be called upon to forward that +data to additional tasks. By default, srun sends one message +per host and one task on that host forwards the data to other +tasks on that host up to \fBPMI_FANOUT\fR. +If \fBPMI_FANOUT_OFF_HOST\fR is defined, the user task +may be required to forward the data to tasks on other hosts. +Setting \fBPMI_FANOUT_OFF_HOST\fR may increase performance +and vulnerability to failures. +.TP \fBPMI_TIME\fR This is used exclusively with PMI (MPICH2 and MVAPICH2) and controls how much the communications from the tasks to the diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 2883fa5cae2b6697e04368be7c288859006d3307..2ff61088c9bc4ac46756626a3c51b52218250942 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -234,6 +234,12 @@ If the job fails to terminate gracefully in the interval specified, it will be forcably terminated. The default value is 30 seconds. May not exceed 65533. + +.TP +\fBMailProg\fR +Fully qualified pathname to the program used to send email per user request. +The default value is "/bin/mail". + .TP \fBMaxJobCount\fR The maximum number of jobs SLURM can have in its active database @@ -1035,10 +1041,6 @@ users from directly using those resources. Possible values are "YES" and "NO". The default value is "NO". .TP -\fBMailProg\fR -Fully qualified pathname to the program used to send email per user request. -The default value is "/bin/mail". 
-.TP \fBMaxNodes\fR Maximum count of nodes (or base partitions for BlueGene systems) which may be allocated to any single job. diff --git a/slurm.spec b/slurm.spec index d3c1fbc892d4d574e6f307660f3aba8041594520..33e2a50cbbecc5997057cfc7e57abc68d127a69d 100644 --- a/slurm.spec +++ b/slurm.spec @@ -1,4 +1,4 @@ -# $Id: slurm.spec 12605 2007-11-01 23:12:43Z jette $ +# $Id: slurm.spec 12730 2007-11-29 19:51:09Z jette $ # # Note that this package is not relocatable @@ -34,7 +34,6 @@ %slurm_without_opt bluegene %slurm_without_opt auth_none %slurm_without_opt debug -%slurm_without_opt sgijob # Build with munge by default on all platforms (disable with --without munge) %slurm_with_opt munge @@ -48,20 +47,27 @@ %endif # Define with_aix on AIX systems (for proctrack) -%ifos aix +%ifos aix5.3 %slurm_with_opt aix %endif +# Build with sgijob on CHAOS systems +# (add elan too when it is available) +%if %{?chaos}0 +%slurm_with_opt sgijob +%else +%slurm_without_opt sgijob +%endif Name: slurm -Version: 1.2.19 +Version: 1.2.20 Release: 1%{?dist} Summary: Simple Linux Utility for Resource Management License: GPL Group: System Environment/Base -Source: slurm-1.2.19.tar.bz2 +Source: slurm-1.2.20.tar.bz2 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release} URL: http://www.llnl.gov/linux/slurm BuildRequires: openssl-devel >= 0.9.6 openssl >= 0.9.6 @@ -205,7 +211,7 @@ SLURM process tracking plugin for SGI job containers. 
############################################################################# %prep -%setup -n slurm-1.2.19 +%setup -n slurm-1.2.20 %build %configure --program-prefix=%{?_program_prefix:%{_program_prefix}} \ diff --git a/src/api/job_info.c b/src/api/job_info.c index b3a70a6afa737bf0c40273b3170f8bc8d240ff89..829c6e32905889437dde9578dbebb8070964da8e 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * job_info.c - get/print the job state information of slurm - * $Id: job_info.c 12249 2007-09-11 00:48:52Z jette $ + * $Id: job_info.c 12627 2007-11-06 19:48:55Z jette $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -87,10 +87,11 @@ slurm_print_job_info_msg ( FILE* out, job_info_msg_t *jinfo, int one_liner ) static void _sprint_range(char *str, uint16_t lower, uint16_t upper) { - char tmp[128]; - convert_num_unit((float)lower, str, UNIT_NONE); + /* Note: We don't have the size of str here */ + convert_num_unit((float)lower, str, 16, UNIT_NONE); if (upper > 0) { - convert_num_unit((float)upper, tmp, UNIT_NONE); + char tmp[128]; + convert_num_unit((float)upper, tmp, sizeof(tmp), UNIT_NONE); strcat(str, "-"); strcat(str, tmp); } @@ -252,7 +253,8 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) /****** Line 6a (optional) ******/ #if 0 /* mainly for debugging */ - convert_num_unit((float)job_ptr->num_cpu_groups, tmp1, UNIT_NONE); + convert_num_unit((float)job_ptr->num_cpu_groups, tmp1, sizeof(tmp1), + UNIT_NONE); snprintf(tmp_line, sizeof(tmp_line), "NumCPUGroups=%s ", tmp1); @@ -300,9 +302,11 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) } /****** Line 7 ******/ - convert_num_unit((float)job_ptr->num_procs, tmp1, UNIT_NONE); + 
convert_num_unit((float)job_ptr->num_procs, tmp1, sizeof(tmp1), + UNIT_NONE); #ifdef HAVE_BG - convert_num_unit((float)job_ptr->num_nodes, tmp2, UNIT_NONE); + convert_num_unit((float)job_ptr->num_nodes, tmp2, sizeof(tmp2), + UNIT_NONE); sprintf(tmp_line, "ReqProcs=%s MinBPs=%s ", tmp1, tmp2); #else _sprint_range(tmp2, job_ptr->num_nodes, job_ptr->max_nodes); @@ -356,8 +360,10 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) xstrcat(out, "\n "); /****** Line 10 ******/ - convert_num_unit((float)job_ptr->job_min_memory, tmp1, UNIT_NONE); - convert_num_unit((float)job_ptr->job_min_tmp_disk, tmp2, UNIT_NONE); + convert_num_unit((float)job_ptr->job_min_memory, tmp1, sizeof(tmp1), + UNIT_NONE); + convert_num_unit((float)job_ptr->job_min_tmp_disk, tmp2, sizeof(tmp2), + UNIT_NONE); snprintf(tmp_line, sizeof(tmp_line), "MinMemory=%s MinTmpDisk=%s Features=%s", tmp1, tmp2, job_ptr->features); diff --git a/src/api/partition_info.c b/src/api/partition_info.c index 02135e3be50180b99be56da04102d6808600d94c..5729074ce9cccd7ec1d7dff609ccc79498eb2a2a 100644 --- a/src/api/partition_info.c +++ b/src/api/partition_info.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * partition_info.c - get/print the partition state information of slurm - * $Id: partition_info.c 11781 2007-07-02 23:00:56Z jette $ + * $Id: partition_info.c 12627 2007-11-06 19:48:55Z jette $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -105,17 +105,19 @@ char *slurm_sprint_partition_info ( partition_info_t * part_ptr, int one_liner ) { int j; - char tmp1[7], tmp2[7]; + char tmp1[16], tmp2[16]; char tmp_line[MAXHOSTRANGELEN]; char *out = NULL; /****** Line 1 ******/ #ifdef HAVE_BG - convert_num_unit((float)part_ptr->total_nodes, tmp1, UNIT_NONE); + convert_num_unit((float)part_ptr->total_nodes, tmp1, sizeof(tmp1), + UNIT_NONE); #else - sprintf(tmp1, "%u", part_ptr->total_nodes); + snprintf(tmp1, sizeof(tmp1), "%u", part_ptr->total_nodes); #endif - convert_num_unit((float)part_ptr->total_cpus, tmp2, UNIT_NONE); + convert_num_unit((float)part_ptr->total_cpus, tmp2, sizeof(tmp2), + UNIT_NONE); snprintf(tmp_line, sizeof(tmp_line), "PartitionName=%s TotalNodes=%s TotalCPUs=%s ", part_ptr->name, tmp1, tmp2); @@ -169,9 +171,10 @@ char *slurm_sprint_partition_info ( partition_info_t * part_ptr, /****** Line 3 ******/ #ifdef HAVE_BG - convert_num_unit((float)part_ptr->min_nodes, tmp1, UNIT_NONE); + convert_num_unit((float)part_ptr->min_nodes, tmp1, sizeof(tmp1), + UNIT_NONE); #else - sprintf(tmp1, "%u", part_ptr->min_nodes); + snprintf(tmp1, sizeof(tmp1), "%u", part_ptr->min_nodes); #endif sprintf(tmp_line, "MinNodes=%s ", tmp1); xstrcat(out, tmp_line); @@ -180,9 +183,10 @@ char *slurm_sprint_partition_info ( partition_info_t * part_ptr, sprintf(tmp_line, "MaxNodes=UNLIMITED "); else { #ifdef HAVE_BG - convert_num_unit((float)part_ptr->max_nodes, tmp1, UNIT_NONE); + convert_num_unit((float)part_ptr->max_nodes, tmp1, sizeof(tmp1), + UNIT_NONE); #else - sprintf(tmp1, "%u", part_ptr->max_nodes); + snprintf(tmp1, sizeof(tmp1),"%u", part_ptr->max_nodes); #endif sprintf(tmp_line, "MaxNodes=%s ", tmp1); } diff --git a/src/api/pmi_server.c b/src/api/pmi_server.c index d0385508917ad14dda2e37c58c76e74fb238d962..406da910d5b099fda0796573897a1cf0a01a17c0 100644 --- a/src/api/pmi_server.c +++ b/src/api/pmi_server.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * pmi.c 
- Global PMI data as maintained within srun - * $Id: pmi_server.c 12315 2007-09-13 23:56:02Z jette $ + * $Id: pmi_server.c 12620 2007-11-05 19:00:45Z jette $ ***************************************************************************** * Copyright (C) 2005-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -182,7 +182,7 @@ static void *_agent(void *x) struct kvs_hosts *kvs_host_list; int i, j, kvs_set_cnt = 0, host_cnt, pmi_fanout = 32; int msg_sent = 0, max_forward = 0; - char *tmp; + char *tmp, *fanout_off_host; pthread_t msg_id; pthread_attr_t attr; DEF_TIMERS; @@ -193,6 +193,7 @@ static void *_agent(void *x) if (pmi_fanout < 1) pmi_fanout = 32; } + fanout_off_host = getenv("PMI_FANOUT_OFF_HOST"); /* only send one message to each host, * build table of the ports on each host */ @@ -213,7 +214,8 @@ static void *_agent(void *x) for (j=(i+1); j<args->barrier_xmit_cnt; j++) { if (args->barrier_xmit_ptr[j].port == 0) continue; /* already sent message */ - if (strcmp(args->barrier_xmit_ptr[i].hostname, + if ((fanout_off_host == NULL) && + strcmp(args->barrier_xmit_ptr[i].hostname, args->barrier_xmit_ptr[j].hostname)) continue; /* another host */ kvs_host_list[host_cnt].task_id = 0; /* not avail */ diff --git a/src/api/signal.c b/src/api/signal.c index c46aa41d997a0d1ea855fbb3fba0e6cdf59dc68e..3091fa6de220ccc1b2856ec3b8c996ca7e7fc784 100644 --- a/src/api/signal.c +++ b/src/api/signal.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * signal.c - Send a signal to a slurm job or job step - * $Id: signal.c 10574 2006-12-15 23:38:29Z jette $ + * $Id: signal.c 12647 2007-11-12 17:09:47Z da $ ***************************************************************************** * Copyright (C) 2005 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -172,7 +172,7 @@ _local_send_recv_rc_msgs(const char *nodelist, slurm_msg_type_t type, msg->msg_type = type; msg->data = data; - if((ret_list = slurm_send_recv_msgs(nodelist, msg, 10000))) { + if((ret_list = slurm_send_recv_msgs(nodelist, msg, 0))) { while((ret_data_info = list_pop(ret_list))) { temp_rc = slurm_get_return_code(ret_data_info->type, ret_data_info->data); @@ -397,7 +397,7 @@ static int _terminate_batch_script_step( return -1; } free(name); - i = slurm_send_recv_rc_msg_only_one(&msg, &rc, 10000); + i = slurm_send_recv_rc_msg_only_one(&msg, &rc, 0); if (i != 0) rc = i; diff --git a/src/common/daemonize.c b/src/common/daemonize.c index f62ef9221fd14f71c81d485008ef9bfa860fdd4b..528aaaa145511b81ca7da01a1706fc53368dd99d 100644 --- a/src/common/daemonize.c +++ b/src/common/daemonize.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * daemonize.c - daemonization routine - * $Id: daemonize.c 10927 2007-02-05 18:43:52Z jette $ + * $Id: daemonize.c 12723 2007-11-29 18:55:48Z jette $ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -139,12 +139,14 @@ read_pidfile(const char *pidfile, int *pidfd) if (fscanf(fp, "%lu", &pid) < 1) { error ("Possible corrupt pidfile `%s'", pidfile); + (void) close(fd); return ((pid_t) 0); } if ((lpid = fd_is_read_lock_blocked(fd)) == (pid_t) 0) { verbose ("pidfile not locked, assuming no running daemon"); - return (lpid); + (void) close(fd); + return ((pid_t) 0); } if (lpid != (pid_t) pid) @@ -154,7 +156,7 @@ read_pidfile(const char *pidfile, int *pidfd) if (pidfd != NULL) *pidfd = fd; else - (void) close(fd); /* Ignore errors */ + (void) close(fd); return (lpid); } diff --git a/src/common/env.c b/src/common/env.c index 6502fd56b9e808142545e96d172ae613e1446c91..ac32d5254bc407784c5faa6bffee0023e5584675 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * src/common/env.c - add an environment variable to environment vector - * $Id: env.c 12592 2007-10-31 21:12:49Z jette $ + * $Id: env.c 12697 2007-11-27 22:02:29Z jette $ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -77,6 +77,8 @@ strong_alias(env_array_overwrite_fmt, slurm_env_array_overwrite_fmt); #define SU_WAIT_MSEC 8000 /* 8000 msec for /bin/su to return user * env vars for --get-user-env option */ +#define ENV_BUFSIZE (64 * 1024) + /* * Return pointer to `name' entry in environment if found, or * pointer to the last entry (i.e. NULL) if `name' is not @@ -153,12 +155,12 @@ int setenvfs(const char *fmt, ...) { va_list ap; - char buf[BUFSIZ]; + char buf[ENV_BUFSIZE]; char *bufcpy; int rc; va_start(ap, fmt); - vsnprintf(buf, BUFSIZ, fmt, ap); + vsnprintf(buf, ENV_BUFSIZE, fmt, ap); va_end(ap); bufcpy = xstrdup(buf); @@ -169,7 +171,7 @@ setenvfs(const char *fmt, ...) int setenvf(char ***envp, const char *name, const char *fmt, ...) 
{ - char buf[BUFSIZ]; + char buf[ENV_BUFSIZE]; char **ep = NULL; char *str = NULL; va_list ap; @@ -177,7 +179,7 @@ setenvf(char ***envp, const char *name, const char *fmt, ...) char *bufcpy; va_start(ap, fmt); - vsnprintf (buf, BUFSIZ, fmt, ap); + vsnprintf (buf, ENV_BUFSIZE, fmt, ap); va_end(ap); bufcpy = xstrdup(buf); @@ -944,7 +946,7 @@ char **env_array_create(void) int env_array_append_fmt(char ***array_ptr, const char *name, const char *value_fmt, ...) { - char buf[BUFSIZ]; + char buf[ENV_BUFSIZE]; char **ep = NULL; char *str = NULL; va_list ap; @@ -959,7 +961,7 @@ int env_array_append_fmt(char ***array_ptr, const char *name, } va_start(ap, value_fmt); - vsnprintf (buf, BUFSIZ, value_fmt, ap); + vsnprintf (buf, ENV_BUFSIZE, value_fmt, ap); va_end(ap); ep = _find_name_in_env(*array_ptr, name); @@ -1020,7 +1022,7 @@ int env_array_append(char ***array_ptr, const char *name, int env_array_overwrite_fmt(char ***array_ptr, const char *name, const char *value_fmt, ...) { - char buf[BUFSIZ]; + char buf[ENV_BUFSIZE]; char **ep = NULL; char *str = NULL; va_list ap; @@ -1035,7 +1037,7 @@ int env_array_overwrite_fmt(char ***array_ptr, const char *name, } va_start(ap, value_fmt); - vsnprintf (buf, BUFSIZ, value_fmt, ap); + vsnprintf (buf, ENV_BUFSIZE, value_fmt, ap); va_end(ap); xstrfmtcat (str, "%s=%s", name, buf); @@ -1154,10 +1156,11 @@ static int _env_array_entry_splitter(const char *entry, */ static int _env_array_putenv(const char *string) { - char name[BUFSIZ]; - char value[BUFSIZ]; + char name[ENV_BUFSIZE]; + char value[ENV_BUFSIZE]; - if (!_env_array_entry_splitter(string, name, BUFSIZ, value, BUFSIZ)) + if (!_env_array_entry_splitter(string, name, ENV_BUFSIZE, value, + ENV_BUFSIZE)) return 0; if (setenv(name, value, 1) == -1) return 0; @@ -1189,14 +1192,15 @@ void env_array_set_environment(char **env_array) void env_array_merge(char ***dest_array, const char **src_array) { char **ptr; - char name[BUFSIZ]; - char value[BUFSIZ]; + char name[ENV_BUFSIZE]; + char 
value[ENV_BUFSIZE]; if (src_array == NULL) return; for (ptr = (char **)src_array; *ptr != NULL; ptr++) { - _env_array_entry_splitter(*ptr, name, BUFSIZ, value, BUFSIZ); + _env_array_entry_splitter(*ptr, name, ENV_BUFSIZE, value, + ENV_BUFSIZE); env_array_overwrite(dest_array, name, value); } } @@ -1224,8 +1228,8 @@ static void _strip_cr_nl(char *line) */ char **_load_env_cache(const char *username) { - char *state_save_loc, fname[BUFSIZ]; - char line[BUFSIZ], name[BUFSIZ], value[BUFSIZ]; + char *state_save_loc, fname[ENV_BUFSIZE]; + char line[ENV_BUFSIZE], name[ENV_BUFSIZE], value[ENV_BUFSIZE]; char **env = NULL; FILE *fp; int i; @@ -1247,10 +1251,11 @@ char **_load_env_cache(const char *username) info("Getting cached environment variables at %s", fname); env = env_array_create(); while (1) { - if (!fgets(line, BUFSIZ, fp)) + if (!fgets(line, ENV_BUFSIZE, fp)) break; _strip_cr_nl(line); - _env_array_entry_splitter(line, name, BUFSIZ, value, BUFSIZ); + _env_array_entry_splitter(line, name, ENV_BUFSIZE, value, + ENV_BUFSIZE); env_array_overwrite(&env, name, value); } fclose(fp); @@ -1267,17 +1272,18 @@ char **_load_env_cache(const char *username) * in the event that option 1 times out. * * timeout value is in seconds or zero for default (8 secs) + * mode is 1 for short ("su <user>"), 2 for long ("su - <user>") * On error, returns NULL. * * NOTE: The calling process must have an effective uid of root for * this function to succeed. 
*/ -char **env_array_user_default(const char *username, int timeout) +char **env_array_user_default(const char *username, int timeout, int mode) { FILE *su; - char line[BUFSIZ]; - char name[BUFSIZ]; - char value[BUFSIZ]; + char line[ENV_BUFSIZE]; + char name[ENV_BUFSIZE]; + char value[ENV_BUFSIZE]; char **env = NULL; char *starttoken = "XXXXSLURMSTARTPARSINGHEREXXXX"; char *stoptoken = "XXXXSLURMSTOPPARSINGHEREXXXXX"; @@ -1311,11 +1317,17 @@ char **env_array_user_default(const char *username, int timeout) snprintf(cmdstr, sizeof(cmdstr), "echo; echo; echo; echo %s; env; echo %s", starttoken, stoptoken); + if (mode == 1) + execl("/bin/su", "su", username, "-c", cmdstr, NULL); + else if (mode == 2) + execl("/bin/su", "su", "-", username, "-c", cmdstr, NULL); + else { /* Default system configuration */ #ifdef LOAD_ENV_NO_LOGIN - execl("/bin/su", "su", username, "-c", cmdstr, NULL); + execl("/bin/su", "su", username, "-c", cmdstr, NULL); #else - execl("/bin/su", "su", "-", username, "-c", cmdstr, NULL); + execl("/bin/su", "su", "-", username, "-c", cmdstr, NULL); #endif + } exit(1); } @@ -1353,7 +1365,7 @@ char **env_array_user_default(const char *username, int timeout) } if (!(ufds.revents & POLLIN)) break; - while (fgets(line, BUFSIZ, su)) { + while (fgets(line, ENV_BUFSIZE, su)) { if (!strncmp(line, starttoken, len)) { found = 1; break; @@ -1393,14 +1405,15 @@ char **env_array_user_default(const char *username, int timeout) /* stop at the line containing the stoptoken string */ if (!(ufds.revents & POLLIN)) break; - if ((fgets(line, BUFSIZ, su) == 0) || + if ((fgets(line, ENV_BUFSIZE, su) == 0) || (!strncmp(line, stoptoken, len))) { found = 1; break; } _strip_cr_nl(line); - _env_array_entry_splitter(line, name, BUFSIZ, value, BUFSIZ); + _env_array_entry_splitter(line, name, ENV_BUFSIZE, value, + ENV_BUFSIZE); env_array_overwrite(&env, name, value); } close(fildes[0]); diff --git a/src/common/env.h b/src/common/env.h index 
01ed6c770402abd97e1eee63a4176f151ca9337a..94000ed24f202354845e431ba3c42b5d84fa4925 100644 --- a/src/common/env.h +++ b/src/common/env.h @@ -238,15 +238,20 @@ void env_array_set_environment(char **env_array); /* * Return an array of strings representing the specified user's default - * environment variables, as determined by calling (more-or-less) - * "/bin/su - <username> -c /usr/bin/env". + * environment variables following a two-prongged approach. + * 1. Execute (more or less): "/bin/su - <username> -c /usr/bin/env" + * Depending upon the user's login scripts, this may take a very + * long time to complete or possibly never return + * 2. Load the user environment from a cache file. This is used + * in the event that option 1 times out. * * timeout value is in seconds or zero for default (8 secs) + * mode is 1 for short ("su <user>"), 2 for long ("su - <user>") * On error, returns NULL. * * NOTE: The calling process must have an effective uid of root for * this function to succeed. */ -char **env_array_user_default(const char *username, int timeout); +char **env_array_user_default(const char *username, int timeout, int mode); #endif diff --git a/src/common/hostlist.c b/src/common/hostlist.c index 61fee0aba43f00a07e3fb4b653e29af47ac5158a..36b489af1844a10cf32b1cd9987a308d8a555c11 100644 --- a/src/common/hostlist.c +++ b/src/common/hostlist.c @@ -1,5 +1,5 @@ /*****************************************************************************\ - * $Id: hostlist.c 12538 2007-10-23 17:11:04Z jette $ + * $Id: hostlist.c 12632 2007-11-06 23:27:07Z da $ ***************************************************************************** * $LSDId: hostlist.c,v 1.14 2003/10/14 20:11:54 grondo Exp $ ***************************************************************************** @@ -1569,8 +1569,6 @@ static int _parse_box_range(char *str, struct _range *ranges, int len, int *coun for(i = 0; i<3; i++) { if ((str[i] >= '0') && (str[i] <= '9')) a[i] = str[i] - '0'; - else if ((str[i] >= 'a') && 
(str[i] <= 'z')) - a[i] = str[i] - 'a' + 10; else if ((str[i] >= 'A') && (str[i] <= 'Z')) a[i] = str[i] - 'A' + 10; else @@ -1578,8 +1576,6 @@ static int _parse_box_range(char *str, struct _range *ranges, int len, int *coun if ((str[i+4] >= '0') && (str[i+4] <= '9')) b[i] = str[i+4] - '0'; - else if ((str[i+4] >= 'a') && (str[i+4] <= 'z')) - b[i] = str[i+4] - 'a' + 10; else if ((str[i+4] >= 'A') && (str[i+4] <= 'Z')) b[i] = str[i+4] - 'A' + 10; else diff --git a/src/common/node_select.c b/src/common/node_select.c index e6020814d53f63b5ca57fb39660dc9bb8abca705..2f9a57d0cb56856cd0d57f96ce6d211bb85fb2d5 100644 --- a/src/common/node_select.c +++ b/src/common/node_select.c @@ -9,7 +9,7 @@ * the plugin. This is because functions required by the plugin can not be * resolved on the front-end nodes, so we can't load the plugins there. * - * $Id: node_select.c 11590 2007-05-25 18:52:33Z da $ + * $Id: node_select.c 12627 2007-11-06 19:48:55Z jette $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -1071,7 +1071,7 @@ extern char *select_g_sprint_jobinfo(select_jobinfo_t jobinfo, { uint16_t geometry[SYSTEM_DIMENSIONS]; int i; - char max_procs_char[7], start_char[32]; + char max_procs_char[8], start_char[32]; char *tmp_image = "default"; if (buf == NULL) { @@ -1108,7 +1108,8 @@ extern char *select_g_sprint_jobinfo(select_jobinfo_t jobinfo, sprintf(max_procs_char, "None"); else convert_num_unit((float)jobinfo->max_procs, - max_procs_char, UNIT_NONE); + max_procs_char, sizeof(max_procs_char), + UNIT_NONE); if (jobinfo->start[0] == (uint16_t) NO_VAL) sprintf(start_char, "None"); else { @@ -1130,7 +1131,8 @@ extern char *select_g_sprint_jobinfo(select_jobinfo_t jobinfo, sprintf(max_procs_char, "None"); else convert_num_unit((float)jobinfo->max_procs, - max_procs_char, UNIT_NONE); + max_procs_char, sizeof(max_procs_char), + UNIT_NONE); if (jobinfo->start[0] == (uint16_t) NO_VAL) sprintf(start_char, "None"); else { @@ -1181,7 +1183,8 @@ extern char *select_g_sprint_jobinfo(select_jobinfo_t jobinfo, sprintf(max_procs_char, "None"); else convert_num_unit((float)jobinfo->max_procs, - max_procs_char, UNIT_NONE); + max_procs_char, sizeof(max_procs_char), + UNIT_NONE); snprintf(buf, size, "%s", max_procs_char); break; diff --git a/src/common/parse_time.c b/src/common/parse_time.c index e70fe5f29a9cdc99cf80018f28ae256806b8a6b5..ea7882fdf06af30e856994359c9ff92afe4894a3 100644 --- a/src/common/parse_time.c +++ b/src/common/parse_time.c @@ -117,6 +117,10 @@ _get_time(char *time_str, int *pos, int *hour, int *minute, int * second) if ((time_str[offset] < '0') || (time_str[offset] > '9')) goto prob; hr = (hr * 10) + time_str[offset++] - '0'; + if (hr > 23) { + offset -= 2; + goto prob; + } if (time_str[offset] != ':') goto prob; offset++; @@ -128,6 +132,10 @@ _get_time(char *time_str, int *pos, int *hour, int *minute, int * second) if ((time_str[offset] < '0') || (time_str[offset] > '9')) goto prob; min = (min * 10) + time_str[offset++] - '0'; + if (min > 59) { + offset -= 2; 
+ goto prob; + } /* get optional second */ if (time_str[offset] == ':') { @@ -138,6 +146,10 @@ _get_time(char *time_str, int *pos, int *hour, int *minute, int * second) if ((time_str[offset] < '0') || (time_str[offset] > '9')) goto prob; sec = (sec * 10) + time_str[offset++] - '0'; + if (sec > 59) { + offset -= 2; + goto prob; + } } else sec = 0; @@ -146,8 +158,20 @@ _get_time(char *time_str, int *pos, int *hour, int *minute, int * second) } if (strncasecmp(time_str+offset, "pm", 2)== 0) { hr += 12; + if (hr > 23) { + if (hr == 24) + hr = 12; + else + goto prob; + } offset += 2; } else if (strncasecmp(time_str+offset, "am", 2) == 0) { + if (hr > 11) { + if (hr == 12) + hr = 0; + else + goto prob; + } offset += 2; } diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 2da16712c941a54cf2d47c6181d8d27faa7f2031..9d1da7691417de41310a076e5f1eb2d10842caf9 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -2282,13 +2282,13 @@ extern int nodelist_find(const char *nodelist, const char *name) return id; } -extern void convert_num_unit(float num, char *buf, int orig_type) +extern void convert_num_unit(float num, char *buf, int buf_size, int orig_type) { char *unit = "\0KMGP?"; int i = (int)num % 512; if(i > 0 || (int)num == 0) { - sprintf(buf, "%d%c", (int)num, unit[orig_type]); + snprintf(buf, buf_size, "%d%c", (int)num, unit[orig_type]); return; } @@ -2301,9 +2301,9 @@ extern void convert_num_unit(float num, char *buf, int orig_type) orig_type = UNIT_UNKNOWN; i = (int)num; if(i == num) - sprintf(buf, "%d%c", i, unit[orig_type]); + snprintf(buf, buf_size, "%d%c", i, unit[orig_type]); else - sprintf(buf, "%.2f%c", num, unit[orig_type]); + snprintf(buf, buf_size, "%.2f%c", num, unit[orig_type]); } extern int revert_num_unit(const char *buf) diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 2efb4854c42a40ecab2fb866538bc12c73ab0a6a..bd5614c56b4f7efd8d8edcc009b3c1710be746fd 
100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -716,7 +716,7 @@ extern void slurm_free_msg(slurm_msg_t * msg); /* must free this memory with free not xfree */ extern char *nodelist_nth_host(const char *nodelist, int inx); extern int nodelist_find(const char *nodelist, const char *name); -extern void convert_num_unit(float num, char *buf, int orig_type); +extern void convert_num_unit(float num, char *buf, int buf_size, int orig_type); extern int revert_num_unit(const char *buf); /* diff --git a/src/plugins/sched/wiki/Makefile.am b/src/plugins/sched/wiki/Makefile.am index b164be29e2395cd93184ee34347551f56f71777c..ba054203a77b030fadf5a9e26c88fe68e9cfa78a 100644 --- a/src/plugins/sched/wiki/Makefile.am +++ b/src/plugins/sched/wiki/Makefile.am @@ -16,8 +16,12 @@ sched_wiki_la_SOURCES = \ crypto.h \ get_jobs.c \ get_nodes.c \ + hostlist.c \ + job_modify.c \ msg.c \ msg.h \ + resume_job.c \ sched_wiki.c \ - start_job.c + start_job.c \ + suspend_job.c sched_wiki_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) diff --git a/src/plugins/sched/wiki/Makefile.in b/src/plugins/sched/wiki/Makefile.in index 55320a07219392c7b898954b84fd49e21d7c3f9e..17ec4ae8413bfebc5baf1db7ba7e934685ea2c7f 100644 --- a/src/plugins/sched/wiki/Makefile.in +++ b/src/plugins/sched/wiki/Makefile.in @@ -75,7 +75,8 @@ pkglibLTLIBRARIES_INSTALL = $(INSTALL) LTLIBRARIES = $(pkglib_LTLIBRARIES) sched_wiki_la_LIBADD = am_sched_wiki_la_OBJECTS = cancel_job.lo crypto.lo get_jobs.lo \ - get_nodes.lo msg.lo sched_wiki.lo start_job.lo + get_nodes.lo hostlist.lo job_modify.lo msg.lo resume_job.lo \ + sched_wiki.lo start_job.lo suspend_job.lo sched_wiki_la_OBJECTS = $(am_sched_wiki_la_OBJECTS) sched_wiki_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ @@ -263,10 +264,14 @@ sched_wiki_la_SOURCES = \ crypto.h \ get_jobs.c \ get_nodes.c \ + hostlist.c \ + job_modify.c \ msg.c \ msg.h \ + resume_job.c \ sched_wiki.c \ - 
start_job.c + start_job.c \ + suspend_job.c sched_wiki_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) all: all-am @@ -342,9 +347,13 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/crypto.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/get_jobs.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/get_nodes.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hostlist.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/job_modify.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/msg.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/resume_job.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sched_wiki.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/start_job.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/suspend_job.Plo@am__quote@ .c.o: @am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< diff --git a/src/plugins/sched/wiki/get_jobs.c b/src/plugins/sched/wiki/get_jobs.c index df478987d4c531109eea445934cad7ab26d293e0..e021e05a847044ba2b4dc4e0705421af9ed85a68 100644 --- a/src/plugins/sched/wiki/get_jobs.c +++ b/src/plugins/sched/wiki/get_jobs.c @@ -39,6 +39,7 @@ #include <sys/types.h> #include "./msg.h" +#include "src/common/hostlist.h" #include "src/common/list.h" #include "src/common/uid.h" #include "src/slurmctld/locks.h" @@ -47,6 +48,7 @@ static char * _dump_all_jobs(int *job_cnt, int state_info); static char * _dump_job(struct job_record *job_ptr, int state_info); static char * _get_group_name(gid_t gid); +static uint16_t _get_job_cpus_per_task(struct job_record *job_ptr); static uint32_t _get_job_end_time(struct job_record *job_ptr); static uint32_t _get_job_min_disk(struct job_record *job_ptr); static uint32_t _get_job_min_mem(struct job_record *job_ptr); @@ -56,6 +58,8 @@ static uint32_t _get_job_submit_time(struct job_record *job_ptr); static uint32_t _get_job_suspend_time(struct 
job_record *job_ptr); static uint32_t _get_job_tasks(struct job_record *job_ptr); static uint32_t _get_job_time_limit(struct job_record *job_ptr); +static char * _task_list(struct job_record *job_ptr); + #define SLURM_INFO_ALL 0 #define SLURM_INFO_VOLITILE 1 @@ -209,8 +213,7 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) xstrcat(buf, tmp); xfree(hosts); } else if (!IS_JOB_FINISHED(job_ptr)) { - char *hosts = bitmap2wiki_node_name( - job_ptr->node_bitmap); + char *hosts = _task_list(job_ptr); snprintf(tmp, sizeof(tmp), "TASKLIST=%s;", hosts); xstrcat(buf, tmp); @@ -230,15 +233,18 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) (uint32_t) _get_job_time_limit(job_ptr)); xstrcat(buf, tmp); - if (job_ptr->job_state == JOB_PENDING) { - /* Don't report actual tasks or nodes allocated since - * this can impact requeue on heterogenous clusters */ - snprintf(tmp, sizeof(tmp), - "TASKS=%u;NODES=%u;", - _get_job_tasks(job_ptr), - _get_job_min_nodes(job_ptr)); - xstrcat(buf, tmp); - } + /* Don't report actual tasks or nodes allocated since + * this can impact requeue on heterogenous clusters */ + snprintf(tmp, sizeof(tmp), + "TASKS=%u;NODES=%u;", + _get_job_tasks(job_ptr), + _get_job_min_nodes(job_ptr)); + xstrcat(buf, tmp); + + snprintf(tmp, sizeof(tmp), + "DPROCS=%u;", + _get_job_cpus_per_task(job_ptr)); + xstrcat(buf, tmp); snprintf(tmp, sizeof(tmp), "QUEUETIME=%u;STARTTIME=%u;PARTITIONMASK=%s;", @@ -267,6 +273,18 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) xstrcat(buf, tmp); } + if (job_ptr->account) { + snprintf(tmp, sizeof(tmp), + "ACCOUNT=%s;", job_ptr->account); + xstrcat(buf, tmp); + } + + if (job_ptr->comment && job_ptr->comment[0]) { + snprintf(tmp,sizeof(tmp), + "COMMENT=%s;", job_ptr->comment); + xstrcat(buf,tmp); + } + if (state_info == SLURM_INFO_VOLITILE) return buf; @@ -280,6 +298,15 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) return buf; } +static uint16_t 
_get_job_cpus_per_task(struct job_record *job_ptr) +{ + uint16_t cpus_per_task = 1; + + if (job_ptr->details && job_ptr->details->cpus_per_task) + cpus_per_task = job_ptr->details->cpus_per_task; + return cpus_per_task; +} + static uint32_t _get_job_min_mem(struct job_record *job_ptr) { if (job_ptr->details) @@ -337,7 +364,7 @@ static uint32_t _get_job_tasks(struct job_record *job_ptr) job_ptr->details->ntasks_per_node)); } - return task_cnt; + return task_cnt / _get_job_cpus_per_task(job_ptr); } static uint32_t _get_job_time_limit(struct job_record *job_ptr) @@ -427,3 +454,37 @@ extern char * bitmap2wiki_node_name(bitstr_t *bitmap) } return buf; } + + +/* Return task list in Maui format: tux0:tux0:tux1:tux1:tux2 */ +static char * _task_list(struct job_record *job_ptr) +{ + int i, j, task_cnt; + char *buf = NULL, *host; + hostlist_t hl = hostlist_create(job_ptr->nodes); + + buf = xstrdup(""); + if (hl == NULL) + return buf; + + for (i=0; i<job_ptr->alloc_lps_cnt; i++) { + host = hostlist_shift(hl); + if (host == NULL) { + error("bad alloc_lps_cnt for job %u (%s, %d)", + job_ptr->job_id, job_ptr->nodes, + job_ptr->alloc_lps_cnt); + break; + } + task_cnt = job_ptr->alloc_lps[i]; + if (job_ptr->details && job_ptr->details->cpus_per_task) + task_cnt /= job_ptr->details->cpus_per_task; + for (j=0; j<task_cnt; j++) { + if (buf) + xstrcat(buf, ":"); + xstrcat(buf, host); + } + free(host); + } + hostlist_destroy(hl); + return buf; +} diff --git a/src/plugins/sched/wiki/hostlist.c b/src/plugins/sched/wiki/hostlist.c new file mode 100644 index 0000000000000000000000000000000000000000..c31dc6bf89207b154d55f354e6f062353055becc --- /dev/null +++ b/src/plugins/sched/wiki/hostlist.c @@ -0,0 +1,291 @@ +/*****************************************************************************\ + * hostlist.c - Convert hostlist expressions between Slurm and Moab formats + ***************************************************************************** + * Copyright (C) 2007 The Regents of the 
University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov> + * UCRL-CODE-226842. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+\*****************************************************************************/ + +#if HAVE_CONFIG_H +# include "config.h" +# if HAVE_INTTYPES_H +# include <inttypes.h> +# else +# if HAVE_STDINT_H +# include <stdint.h> +# endif +# endif /* HAVE_INTTYPES_H */ +#else /* !HAVE_CONFIG_H */ +# include <inttypes.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <string.h> + +#include "./msg.h" +#include "src/common/hostlist.h" +#include "src/common/node_select.h" +#include "src/common/xmalloc.h" +#include "src/common/xstring.h" + +static void _append_hl_buf(char **buf, hostlist_t *hl_tmp, int *reps); +static char * _task_list(struct job_record *job_ptr); +static char * _task_list_exp(struct job_record *job_ptr); + +/* + * Convert Moab supplied TASKLIST expression into a SLURM hostlist expression + * + * Moab format 1: tux0:tux0:tux1:tux1:tux2 (list host for each cpu) + * Moab format 2: tux[0-1]*2:tux2 (list cpu count after host name) + * + * SLURM format: tux0,tux0,tux1,tux1,tux2 (if consumable resources enabled) + * SLURM format: tux0,tux1,tux2 (if consumable resources disabled) + * + * NOTE: returned string must be released with xfree() + */ +extern char * moab2slurm_task_list(char *moab_tasklist, int *task_cnt) +{ + char *slurm_tasklist, *host, *tmp1, *tmp2, *tok, *tok_p; + int i, reps; + hostlist_t hl; + static uint32_t cr_test = 0, cr_enabled = 0; + + if (cr_test == 0) { + select_g_get_info_from_plugin(SELECT_CR_PLUGIN, + &cr_enabled); + cr_test = 1; + } + + *task_cnt = 0; + + /* Moab format 2 if string contains '*' or '[' */ + tmp1 = strchr(moab_tasklist, (int) '*'); + if (tmp1 == NULL) + tmp1 = strchr(moab_tasklist, (int) '['); + + if (tmp1 == NULL) { /* Moab format 1 */ + slurm_tasklist = xstrdup(moab_tasklist); + if (moab_tasklist[0]) + *task_cnt = 1; + for (i=0; slurm_tasklist[i]!='\0'; i++) { + if (slurm_tasklist[i] == ':') { + slurm_tasklist[i] = ','; + (*task_cnt)++; + } else if (slurm_tasklist[i] == ',') + (*task_cnt)++; + } + return 
slurm_tasklist; + } + + /* Moab format 2 */ + slurm_tasklist = xstrdup(""); + tmp1 = xstrdup(moab_tasklist); + tok = strtok_r(tmp1, ":", &tok_p); + while (tok) { + /* find task count, assume 1 if no "*" */ + tmp2 = strchr(tok, (int) '*'); + if (tmp2) { + reps = atoi(tmp2 + 1); + tmp2[0] = '\0'; + } else + reps = 1; + + /* find host expression */ + hl = hostlist_create(tok); + while ((host = hostlist_shift(hl))) { + for (i=0; i<reps; i++) { + if (slurm_tasklist[0]) + xstrcat(slurm_tasklist, ","); + xstrcat(slurm_tasklist, host); + if (!cr_enabled) + break; + } + free(host); + (*task_cnt) += reps; + } + hostlist_destroy(hl); + + /* get next token */ + tok = strtok_r(NULL, ":", &tok_p); + } + xfree(tmp1); + return slurm_tasklist; +} + +/* + * Report a job's tasks a a MOAB TASKLIST expression + * + * Moab format 1: tux0:tux0:tux1:tux1:tux2 (list host for each cpu) + * Moab format 2: tux[0-1]*2:tux2 (list cpu count after host name) + * + * NOTE: returned string must be released with xfree() + */ +extern char * slurm_job2moab_task_list(struct job_record *job_ptr) +{ + if (use_host_exp) + return _task_list_exp(job_ptr); + else + return _task_list(job_ptr); +} + +/* Return task list in Moab format 1: tux0:tux0:tux1:tux1:tux2 */ +static char * _task_list(struct job_record *job_ptr) +{ + int i, j; + char *buf = NULL, *host; + hostlist_t hl = hostlist_create(job_ptr->nodes); + + if (hl == NULL) { + error("hostlist_create error for job %u, %s", + job_ptr->job_id, job_ptr->nodes); + return buf; + } + + for (i=0; i<job_ptr->alloc_lps_cnt; i++) { + host = hostlist_shift(hl); + if (host == NULL) { + error("bad alloc_lps_cnt for job %u (%s, %d)", + job_ptr->job_id, job_ptr->nodes, + job_ptr->alloc_lps_cnt); + break; + } + for (j=0; j<job_ptr->alloc_lps[i]; j++) { + if (buf) + xstrcat(buf, ":"); + xstrcat(buf, host); + } + free(host); + } + hostlist_destroy(hl); + return buf; +} + +/* Append to buf a compact tasklist expression (e.g. 
"tux[0-1]*2") + * Prepend ":" to expression as needed */ +static void _append_hl_buf(char **buf, hostlist_t *hl_tmp, int *reps) +{ + int host_str_len = 4096; + char *host_str, tmp_str[64]; + char *tok, *sep; + int i, in_bracket = 0, fini = 0; + + host_str = xmalloc(host_str_len); + hostlist_uniq(*hl_tmp); + while (hostlist_ranged_string(*hl_tmp, host_str_len, host_str) < 0) { + host_str_len *= 2; + xrealloc(*host_str, host_str_len); + } + + /* Note that host_str may be of this form "alpha,beta". We want + * to record this as "alpha*#:beta*#" and NOT "alpha,beta*#". + * NOTE: Do not break up command within brackets (e.g. "tux[1,2-4]") */ + if (*buf) + sep = ":"; + else + sep = ""; + tok = host_str; + for (i=0; (fini == 0) ; i++) { + switch (tok[i]) { + case '[': + in_bracket = 1; + break; + case ']': + in_bracket = 0; + break; + case '\0': + fini = 1; + if (in_bracket) + error("badly formed hostlist %s", tok); + case ',': + if (in_bracket) + break; + tok[i] = '\0'; + snprintf(tmp_str, sizeof(tmp_str), "%s%s*%d", + sep, tok, *reps); + xstrcat(*buf, tmp_str); + sep = ":"; + tok += (i + 1); + i = -1; + break; + } + } + xfree(host_str); + hostlist_destroy(*hl_tmp); + *hl_tmp = (hostlist_t) NULL; + *reps = 0; +} + +/* Return task list in Moab format 2: tux[0-1]*2:tux2 */ +static char * _task_list_exp(struct job_record *job_ptr) +{ + int i, reps = -1; + char *buf = NULL, *host; + hostlist_t hl = hostlist_create(job_ptr->nodes); + hostlist_t hl_tmp = (hostlist_t) NULL; + + if (hl == NULL) { + error("hostlist_create error for job %u, %s", + job_ptr->job_id, job_ptr->nodes); + return buf; + } + + for (i=0; i<job_ptr->alloc_lps_cnt; i++) { + host = hostlist_shift(hl); + if (host == NULL) { + error("bad alloc_lps_cnt for job %u (%s, %d)", + job_ptr->job_id, job_ptr->nodes, + job_ptr->alloc_lps_cnt); + break; + } + + if (reps == job_ptr->alloc_lps[i]) { + /* append to existing hostlist record */ + if (hostlist_push(hl_tmp, host) == 0) + error("hostlist_push failure"); + } else 
{ + if (hl_tmp) + _append_hl_buf(&buf, &hl_tmp, &reps); + + /* start new hostlist record */ + hl_tmp = hostlist_create(host); + if (hl_tmp) + reps = job_ptr->alloc_lps[i]; + else + error("hostlist_create failure"); + } + free(host); + } + hostlist_destroy(hl); + if (hl_tmp) + _append_hl_buf(&buf, &hl_tmp, &reps); + return buf; +} diff --git a/src/plugins/sched/wiki/job_modify.c b/src/plugins/sched/wiki/job_modify.c new file mode 100644 index 0000000000000000000000000000000000000000..e3a62654951e8e3241d248157ca37ac629aa2c60 --- /dev/null +++ b/src/plugins/sched/wiki/job_modify.c @@ -0,0 +1,303 @@ +/*****************************************************************************\ + * job_modify.c - Process Wiki job modify request + ***************************************************************************** + * Copyright (C) 2006-2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov> + * UCRL-CODE-226842. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. 
If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#include "./msg.h" +#include <strings.h> +#include "src/slurmctld/locks.h" +#include "src/slurmctld/slurmctld.h" + +static void _null_term(char *str) +{ + char *tmp_ptr; + for (tmp_ptr=str; ; tmp_ptr++) { + if (tmp_ptr[0] == '\0') + break; + if (isspace(tmp_ptr[0])) { + tmp_ptr[0] = '\0'; + break; + } + } +} + +/* return -1 on error */ +static int32_t _get_depend_id(char *str) +{ + /* stand-alone job_id */ + if (isdigit(str[0])) + return (int32_t) atol(str); + + if (strncasecmp(str, "afterany:", 9) != 0) /* invalid spec */ + return (int32_t) -1; + + str += 9; + if (!isdigit(str[0])) + return (int32_t) -1; + return (int32_t) atol(str); +} + +static int _job_modify(uint32_t jobid, char *bank_ptr, + int32_t depend_id, char *new_hostlist, + uint32_t new_node_cnt, char *part_name_ptr, + uint32_t new_time_limit) +{ + struct job_record *job_ptr; + + job_ptr = find_job_record(jobid); + if (job_ptr == NULL) { + error("wiki: MODIFYJOB has invalid jobid %u", jobid); + return ESLURM_INVALID_JOB_ID; + } + if (IS_JOB_FINISHED(job_ptr)) { + error("wiki: MODIFYJOB jobid %u is finished", jobid); + return ESLURM_DISABLED; + } + + if (depend_id != -1) { + info("wiki: changing job dependency to %d", depend_id); + job_ptr->dependency = depend_id; + } + + if 
(new_time_limit) { + time_t old_time = job_ptr->time_limit; + job_ptr->time_limit = new_time_limit; + info("wiki: change job %u time_limit to %u", + jobid, new_time_limit); + /* Update end_time based upon change + * to preserve suspend time info */ + job_ptr->end_time = job_ptr->end_time + + ((job_ptr->time_limit - + old_time) * 60); + last_job_update = time(NULL); + } + if (bank_ptr) { + info("wiki: change job %u bank %s", jobid, bank_ptr); + xfree(job_ptr->account); + job_ptr->account = xstrdup(bank_ptr); + last_job_update = time(NULL); + } + + if (new_hostlist) { + int i, rc = 0, task_cnt; + hostlist_t hl; + char *tasklist; + + if (!job_ptr->details) { + /* Job is done, nothing to reset */ + if (new_hostlist == '\0') + goto host_fini; + error("wiki: MODIFYJOB tasklist of non-pending " + "job %u", jobid); + return ESLURM_DISABLED; + } + + xfree(job_ptr->details->req_nodes); + FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); + if (new_hostlist == '\0') + goto host_fini; + + tasklist = moab2slurm_task_list(new_hostlist, &task_cnt); + if (tasklist == NULL) { + rc = 1; + goto host_fini; + } + hl = hostlist_create(tasklist); + if (hl == 0) { + rc = 1; + goto host_fini; + } + hostlist_uniq(hl); + hostlist_sort(hl); + i = strlen(new_hostlist) + 16; + job_ptr->details->req_nodes = xmalloc(i); + i = hostlist_ranged_string(hl, i, job_ptr->details->req_nodes); + hostlist_destroy(hl); + if (i < 0) { + rc = 1; + goto host_fini; + } + if (node_name2bitmap(job_ptr->details->req_nodes, false, + &job_ptr->details->req_node_bitmap)) { + rc = 1; + goto host_fini; + } + +host_fini: if (rc) { + info("wiki: change job %u invalid hostlist %s", jobid, new_hostlist); + xfree(job_ptr->details->req_nodes); + return EINVAL; + } else { + info("wiki: change job %u hostlist %s", jobid, new_hostlist); + } + } + + if (part_name_ptr) { + struct part_record *part_ptr; + part_ptr = find_part_record(part_name_ptr); + if (part_ptr == NULL) { + error("wiki: MODIFYJOB has invalid partition %s", + 
part_name_ptr); + return ESLURM_INVALID_PARTITION_NAME; + } + info("wiki: change job %u partition %s", + jobid, part_name_ptr); + strncpy(job_ptr->partition, part_name_ptr, MAX_SLURM_NAME); + job_ptr->part_ptr = part_ptr; + last_job_update = time(NULL); + } + + if (new_node_cnt) { + if (IS_JOB_PENDING(job_ptr) && job_ptr->details) { + job_ptr->details->min_nodes = new_node_cnt; + if (job_ptr->details->max_nodes + && (job_ptr->details->max_nodes < new_node_cnt)) + job_ptr->details->max_nodes = new_node_cnt; + info("wiki: change job %u min_nodes to %u", + jobid, new_node_cnt); + last_job_update = time(NULL); + } else { + error("wiki: MODIFYJOB node count of non-pending " + "job %u", jobid); + return ESLURM_DISABLED; + } + } + + return SLURM_SUCCESS; +} + +/* Modify a job: + * CMD=MODIFYJOB ARG=<jobid> PARTITION=<name> NODES=<number> + * DEPEND=afterany:<jobid> TIMELIMT=<seconds> BANK=<name> + * RET 0 on success, -1 on failure */ +extern int job_modify_wiki(char *cmd_ptr, int *err_code, char **err_msg) +{ + char *arg_ptr, *bank_ptr, *depend_ptr, *nodes_ptr; + char *host_ptr, *part_ptr, *time_ptr, *tmp_char; + int slurm_rc; + int depend_id = -1; + uint32_t jobid, new_node_cnt = 0, new_time_limit = 0; + static char reply_msg[128]; + /* Locks: write job, read node and partition info */ + slurmctld_lock_t job_write_lock = { + NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; + + arg_ptr = strstr(cmd_ptr, "ARG="); + if (arg_ptr == NULL) { + *err_code = -300; + *err_msg = "MODIFYJOB lacks ARG="; + error("wiki: MODIFYJOB lacks ARG="); + return -1; + } + /* Change all parsed "=" to ":" then search for remaining "=" + * and report results as unrecognized options */ + arg_ptr[3] = ':'; + arg_ptr += 4; + jobid = strtoul(arg_ptr, &tmp_char, 10); + if ((tmp_char[0] != '\0') && (!isspace(tmp_char[0]))) { + *err_code = -300; + *err_msg = "Invalid ARG value"; + error("wiki: MODIFYJOB has invalid jobid"); + return -1; + } + bank_ptr = strstr(cmd_ptr, "BANK="); + depend_ptr = 
strstr(cmd_ptr, "DEPEND="); + host_ptr = strstr(cmd_ptr, "HOSTLIST="); + nodes_ptr = strstr(cmd_ptr, "NODES="); + part_ptr = strstr(cmd_ptr, "PARTITION="); + time_ptr = strstr(cmd_ptr, "TIMELIMIT="); + if (bank_ptr) { + bank_ptr[4] = ':'; + bank_ptr += 5; + _null_term(bank_ptr); + } + if (depend_ptr) { + depend_ptr[6] = ':'; + depend_ptr += 7; + depend_id = _get_depend_id(depend_ptr); + if (depend_id == -1) { + *err_code = -300; + *err_msg = "MODIFYJOB has invalid DEPEND specificiation"; + error("wiki: MODIFYJOB has invalid DEPEND spec: %s", + depend_ptr); + return -1; + } + } + if (host_ptr) { + host_ptr[8] = ':'; + host_ptr += 9; + _null_term(bank_ptr); + } + if (nodes_ptr) { + nodes_ptr[5] = ':'; + nodes_ptr += 6; + new_node_cnt = strtoul(nodes_ptr, NULL, 10); + } + if (part_ptr) { + part_ptr[9] = ':'; + part_ptr += 10; + _null_term(part_ptr); + } + if (time_ptr) { + time_ptr[9] = ':'; + time_ptr += 10; + new_time_limit = strtoul(time_ptr, NULL, 10); + } + + /* Look for any un-parsed "=" */ + tmp_char = strchr(cmd_ptr, '='); + if (tmp_char) { + tmp_char[0] = '\0'; + while (tmp_char[-1] && (!isspace(tmp_char[-1]))) + tmp_char--; + error("wiki: Invalid MODIFYJOB option %s", tmp_char); + } + + lock_slurmctld(job_write_lock); + slurm_rc = _job_modify(jobid, bank_ptr, depend_id, host_ptr, + new_node_cnt, part_ptr, new_time_limit); + unlock_slurmctld(job_write_lock); + if (slurm_rc != SLURM_SUCCESS) { + *err_code = -700; + *err_msg = slurm_strerror(slurm_rc); + error("wiki: Failed to modify job %u (%m)", jobid); + return -1; + } + + snprintf(reply_msg, sizeof(reply_msg), + "job %u modified successfully", jobid); + *err_msg = reply_msg; + return 0; +} diff --git a/src/plugins/sched/wiki/msg.c b/src/plugins/sched/wiki/msg.c index 0f9b9e9b303a5b348d20bdfe30c86a1fb01f5acc..20cedd7e3ee3f0ef00e95f35d3f5056d79be02e4 100644 --- a/src/plugins/sched/wiki/msg.c +++ b/src/plugins/sched/wiki/msg.c @@ -505,12 +505,14 @@ static int _parse_msg(char *msg, char **req) 
\*****************************************************************************/ static void _proc_msg(slurm_fd new_fd, char *msg) { - char *req, *cmd_ptr; + DEF_TIMERS; + char *req, *cmd_ptr, *msg_type = NULL; char response[128]; if (new_fd < 0) return; + START_TIMER; if (!msg) { err_code = -300; err_msg = "NULL request message"; @@ -531,20 +533,34 @@ static void _proc_msg(slurm_fd new_fd, char *msg) cmd_ptr +=4; err_code = 0; if (strncmp(cmd_ptr, "GETJOBS", 7) == 0) { + msg_type = "wiki:GETJOBS"; if (!get_jobs(cmd_ptr, &err_code, &err_msg)) goto free_resp_msg; } else if (strncmp(cmd_ptr, "GETNODES", 8) == 0) { + msg_type = "wiki:GETNODES"; if (!get_nodes(cmd_ptr, &err_code, &err_msg)) goto free_resp_msg; } else if (strncmp(cmd_ptr, "STARTJOB", 8) == 0) { + msg_type = "wiki:STARTJOB"; start_job(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "CANCELJOB", 9) == 0) { + msg_type = "wiki:CANCELJOB"; cancel_job(cmd_ptr, &err_code, &err_msg); + } else if (strncmp(cmd_ptr, "SUSPENDJOB", 10) == 0) { + msg_type = "wiki:SUSPENDJOB"; + suspend_job(cmd_ptr, &err_code, &err_msg); + } else if (strncmp(cmd_ptr, "RESUMEJOB", 9) == 0) { + msg_type = "wiki:RESUMEJOB"; + resume_job(cmd_ptr, &err_code, &err_msg); + } else if (strncmp(cmd_ptr, "MODIFYJOB", 9) == 0) { + msg_type = "wiki:MODIFYJOB"; + job_modify_wiki(cmd_ptr, &err_code, &err_msg); } else { err_code = -300; err_msg = "unsupported request type"; error("wiki: unrecognized request type: %s", req); } + END_TIMER2(msg_type); resp_msg: snprintf(response, sizeof(response), diff --git a/src/plugins/sched/wiki/msg.h b/src/plugins/sched/wiki/msg.h index 76458521a45c5bac36e9b5eb5a8a03fb6946eaa4..e19e45b1f08e449f47100c7101465e89fc5f65a4 100644 --- a/src/plugins/sched/wiki/msg.h +++ b/src/plugins/sched/wiki/msg.h @@ -114,6 +114,7 @@ extern int job_release_task(char *cmd_ptr, int *err_code, char **err_msg); extern int job_requeue_wiki(char *cmd_ptr, int *err_code, char **err_msg); extern int job_signal_wiki(char *cmd_ptr, 
int *err_code, char **err_msg); extern int job_will_run(char *cmd_ptr, int *err_code, char **err_msg); +extern char * moab2slurm_task_list(char *moab_tasklist, int *task_cnt); extern int parse_wiki_config(void); extern int start_job(char *cmd_ptr, int *err_code, char **err_msg); extern int suspend_job(char *cmd_ptr, int *err_code, char **err_msg); diff --git a/src/plugins/sched/wiki/resume_job.c b/src/plugins/sched/wiki/resume_job.c new file mode 100644 index 0000000000000000000000000000000000000000..319219244b55f6128c64c10797901e298a3b2f0a --- /dev/null +++ b/src/plugins/sched/wiki/resume_job.c @@ -0,0 +1,85 @@ +/*****************************************************************************\ + * resume_job.c - Process Wiki resume job request + ***************************************************************************** + * Copyright (C) 2006 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov> + * UCRL-CODE-226842. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. 
If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#include "./msg.h" +#include "src/slurmctld/locks.h" +#include "src/slurmctld/slurmctld.h" + +/* RET 0 on success, -1 on failure */ +extern int resume_job(char *cmd_ptr, int *err_code, char **err_msg) +{ + char *arg_ptr, *tmp_char; + int slurm_rc; + suspend_msg_t msg; + uint32_t jobid; + static char reply_msg[128]; + /* Locks: write job and node info */ + slurmctld_lock_t job_write_lock = { + NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; + + arg_ptr = strstr(cmd_ptr, "ARG="); + if (arg_ptr == NULL) { + *err_code = -300; + *err_msg = "RESUMEJOB lacks ARG"; + error("wiki: RESUMEJOB lacks ARG"); + return -1; + } + jobid = strtoul(arg_ptr+4, &tmp_char, 10); + if ((tmp_char[0] != '\0') && (!isspace(tmp_char[0]))) { + *err_code = -300; + *err_msg = "Invalid ARG value"; + error("wiki: RESUMEJOB has invalid jobid"); + return -1; + } + + msg.job_id = jobid; + msg.op = RESUME_JOB; + lock_slurmctld(job_write_lock); + slurm_rc = job_suspend(&msg, 0, -1); + unlock_slurmctld(job_write_lock); + if (slurm_rc != SLURM_SUCCESS) { + *err_code = -700; + *err_msg = slurm_strerror(slurm_rc); + error("wiki: Failed to resume job %u (%m)", jobid); + return -1; + } + + snprintf(reply_msg, sizeof(reply_msg), + "job %u resumed successfully", jobid); + *err_msg = reply_msg; + 
return 0; +} diff --git a/src/plugins/sched/wiki/start_job.c b/src/plugins/sched/wiki/start_job.c index b581aef83502cda104be4331570ef8e03865e5f4..f7cdd46d189ad8b080c780beb3ac794e7a9ca0ee 100644 --- a/src/plugins/sched/wiki/start_job.c +++ b/src/plugins/sched/wiki/start_job.c @@ -36,21 +36,23 @@ \*****************************************************************************/ #include "./msg.h" +#include "src/common/node_select.h" +#include "src/common/slurm_protocol_defs.h" #include "src/common/xstring.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/slurmctld.h" #include "src/slurmctld/state_save.h" -static int _start_job(uint32_t jobid, char *hostlist, - int *err_code, char **err_msg); +static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, + char *tasklist, int *err_code, char **err_msg); /* RET 0 on success, -1 on failure */ extern int start_job(char *cmd_ptr, int *err_code, char **err_msg) { - char *arg_ptr, *task_ptr, *node_ptr, *tmp_char; - int i; + char *arg_ptr, *task_ptr, *tasklist, *tmp_char; + int i, rc, task_cnt; uint32_t jobid; - hostlist_t hl; + hostlist_t hl = (hostlist_t) NULL; char host_string[MAXHOSTRANGELEN]; static char reply_msg[128]; @@ -76,20 +78,19 @@ extern int start_job(char *cmd_ptr, int *err_code, char **err_msg) error("wiki: STARTJOB lacks TASKLIST"); return -1; } - node_ptr = task_ptr + 9; - for (i=0; node_ptr[i]!='\0'; i++) { - if (node_ptr[i] == ':') - node_ptr[i] = ','; - } - hl = hostlist_create(node_ptr); - if (hl == NULL) { + task_ptr += 9; /* skip over "TASKLIST=" */ + tasklist = moab2slurm_task_list(task_ptr, &task_cnt); + if (tasklist) + hl = hostlist_create(tasklist); + if ((tasklist == NULL) || (hl == NULL)) { *err_code = -300; *err_msg = "STARTJOB TASKLIST is invalid"; error("wiki: STARTJOB TASKLIST is invalid: %s", - node_ptr); + task_ptr); + xfree(tasklist); return -1; } - hostlist_uniq(hl); /* for now, don't worry about task layout */ + hostlist_uniq(hl); hostlist_sort(hl); i = 
hostlist_ranged_string(hl, sizeof(host_string), host_string); hostlist_destroy(hl); @@ -98,28 +99,54 @@ extern int start_job(char *cmd_ptr, int *err_code, char **err_msg) *err_msg = "STARTJOB has invalid TASKLIST"; error("wiki: STARTJOB has invalid TASKLIST: %s", host_string); + xfree(tasklist); return -1; } - if (_start_job(jobid, host_string, err_code, err_msg) != 0) - return -1; - snprintf(reply_msg, sizeof(reply_msg), - "job %u started successfully", jobid); - *err_msg = reply_msg; - return 0; + rc = _start_job(jobid, task_cnt, host_string, tasklist, + err_code, err_msg); + xfree(tasklist); + if (rc == 0) { + snprintf(reply_msg, sizeof(reply_msg), + "job %u started successfully", jobid); + *err_msg = reply_msg; + } + return rc; } -static int _start_job(uint32_t jobid, char *hostlist, - int *err_code, char **err_msg) +/* + * Attempt to start a job + * jobid (IN) - job id + * task_cnt (IN) - total count of tasks to start + * hostlist (IN) - SLURM hostlist expression with no repeated hostnames + * tasklist (IN/OUT) - comma separated list of hosts with tasks to be started, + * list hostname once per task to start + * err_code (OUT) - Moab error code + * err_msg (OUT) - Moab error message + */ +static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, + char *tasklist, int *err_code, char **err_msg) { - int rc = 0; + int rc = 0, old_task_cnt = 1; struct job_record *job_ptr; /* Write lock on job info, read lock on node info */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; - char *new_node_list, *save_req_nodes = NULL; + char *new_node_list = NULL; static char tmp_msg[128]; - bitstr_t *new_bitmap, *save_req_bitmap = (bitstr_t *) NULL; + bitstr_t *new_bitmap = (bitstr_t *) NULL; + bitstr_t *save_req_bitmap = (bitstr_t *) NULL; + bitoff_t i, bsize; + int ll; /* layout info index */ + char *node_name, *node_idx, *node_cur, *save_req_nodes = NULL; + size_t node_name_len; + static uint32_t cr_test = 0, cr_enabled = 0; + + if 
(cr_test == 0) { + select_g_get_info_from_plugin(SELECT_CR_PLUGIN, + &cr_enabled); + cr_test = 1; + } lock_slurmctld(job_write_lock); job_ptr = find_job_record(jobid); @@ -134,105 +161,135 @@ static int _start_job(uint32_t jobid, char *hostlist, if ((job_ptr->details == NULL) || (job_ptr->job_state != JOB_PENDING)) { *err_code = -700; - *err_msg = "Job not pending, can't update"; - error("wiki: Attempt to start non-pending job %u", - jobid); + *err_msg = "Job not pending, can't start"; + error("wiki: Attempt to start job %u in state %s", + jobid, job_state_string(job_ptr->job_state)); rc = -1; goto fini; } - new_node_list = xstrdup(hostlist); - if (hostlist && (new_node_list == NULL)) { - *err_code = -700; - *err_msg = "Invalid TASKLIST"; - error("wiki: Attempt to set invalid node list for job %u, %s", - jobid, hostlist); - rc = -1; - goto fini; - } + if (task_cnt) { + new_node_list = xstrdup(hostlist); + if (node_name2bitmap(new_node_list, false, &new_bitmap) != 0) { + *err_code = -700; + *err_msg = "Invalid TASKLIST"; + error("wiki: Attempt to set invalid node list for " + "job %u, %s", + jobid, hostlist); + xfree(new_node_list); + rc = -1; + goto fini; + } - if (node_name2bitmap(new_node_list, false, &new_bitmap) != 0) { - *err_code = -700; - *err_msg = "Invalid TASKLIST"; - error("wiki: Attempt to set invalid node list for job %u, %s", - jobid, hostlist); - xfree(new_node_list); - rc = -1; - goto fini; + /* User excluded node list incompatable with Wiki + * Exclude all nodes not explicitly requested */ + FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); + job_ptr->details->exc_node_bitmap = bit_copy(new_bitmap); + bit_not(job_ptr->details->exc_node_bitmap); } - /* Remove any excluded nodes, incompatable with Wiki */ - if (job_ptr->details->exc_nodes) { - error("wiki: clearing exc_nodes for job %u", jobid); - xfree(job_ptr->details->exc_nodes); - if (job_ptr->details->exc_node_bitmap) - FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); + /* Build layout 
information from tasklist (assuming that Moab + * sends a non-bracketed list of nodes, repeated as many times + * as cpus should be used per node); at this point, node names + * are comma-separated. This is _not_ a fast algorithm as it + * performs many string compares. */ + xfree(job_ptr->details->req_node_layout); + if (task_cnt && cr_enabled) { + job_ptr->details->req_node_layout = (uint16_t *) + xmalloc(bit_set_count(new_bitmap) * sizeof(uint16_t)); + bsize = bit_size(new_bitmap); + for (i = 0, ll = -1; i < bsize; i++) { + if (!bit_test(new_bitmap, i)) + continue; + ll++; + node_name = node_record_table_ptr[i].name; + node_name_len = strlen(node_name); + if (node_name_len == 0) + continue; + node_cur = tasklist; + while (*node_cur) { + if ((node_idx = strstr(node_cur, node_name))) { + if ((node_idx[node_name_len] == ',') || + (node_idx[node_name_len] == '\0')) { + job_ptr->details-> + req_node_layout[ll]++; + } + node_cur = strchr(node_idx, ','); + if (node_cur) + continue; + } + break; + } + } } - /* start it now */ + /* save and update job state to start now */ save_req_nodes = job_ptr->details->req_nodes; job_ptr->details->req_nodes = new_node_list; save_req_bitmap = job_ptr->details->req_node_bitmap; job_ptr->details->req_node_bitmap = new_bitmap; + old_task_cnt = job_ptr->num_procs; + job_ptr->num_procs = MAX(task_cnt, old_task_cnt); job_ptr->priority = 100000000; fini: unlock_slurmctld(job_write_lock); - if (rc == 0) { /* New job to start ASAP */ - (void) schedule(); /* provides own locking */ - /* Check to insure the job was actually started */ - lock_slurmctld(job_write_lock); - /* job_ptr = find_job_record(jobid); don't bother */ - if ((job_ptr->job_id == jobid) && job_ptr->details && - (job_ptr->job_state == JOB_RUNNING)) { - /* Restore required node list */ - xfree(job_ptr->details->req_nodes); - job_ptr->details->req_nodes = save_req_nodes; - FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); - job_ptr->details->req_node_bitmap = save_req_bitmap; 
- } else { - xfree(save_req_nodes); - FREE_NULL_BITMAP(save_req_bitmap); - } + if (rc) + return rc; - if ((job_ptr->job_id == jobid) - && (job_ptr->job_state != JOB_RUNNING)) { - uint16_t wait_reason = 0; - char *wait_string; - - /* restore job state */ - job_ptr->priority = 0; - if (job_ptr->details) { - /* Details get cleared on job abort; happens - * if the request is sufficiently messed up. - * This happens when Moab tries to start a - * a job on invalid nodes (wrong partition). */ - xfree(job_ptr->details->req_nodes); - FREE_NULL_BITMAP(job_ptr->details-> - req_node_bitmap); - } - if (job_ptr->job_state == JOB_FAILED) - wait_string = "Invalid request, job aborted"; - else { - wait_reason = job_ptr->state_reason; - if (wait_reason == WAIT_HELD) { - /* some job is completing, slurmctld did - * not even try to schedule this job */ - wait_reason = WAIT_RESOURCES; - } - wait_string = job_reason_string(wait_reason); - job_ptr->state_reason = WAIT_HELD; + /* No errors so far */ + (void) schedule(); /* provides own locking */ + + /* Check to insure the job was actually started */ + lock_slurmctld(job_write_lock); + if (job_ptr->job_id != jobid) + job_ptr = find_job_record(jobid); + + if (job_ptr && (job_ptr->job_id == jobid) + && (job_ptr->job_state != JOB_RUNNING)) { + uint16_t wait_reason = 0; + char *wait_string; + + if (job_ptr->job_state == JOB_FAILED) + wait_string = "Invalid request, job aborted"; + else { + wait_reason = job_ptr->state_reason; + if (wait_reason == WAIT_HELD) { + /* some job is completing, slurmctld did + * not even try to schedule this job */ + wait_reason = WAIT_RESOURCES; } - *err_code = -910 - wait_reason; - snprintf(tmp_msg, sizeof(tmp_msg), - "Could not start job %u: %s", - jobid, wait_string); - *err_msg = tmp_msg; - error("wiki: %s", tmp_msg); - rc = -1; + wait_string = job_reason_string(wait_reason); + job_ptr->state_reason = WAIT_HELD; } - unlock_slurmctld(job_write_lock); - schedule_node_save(); /* provides own locking */ - 
schedule_job_save(); /* provides own locking */ + *err_code = -910 - wait_reason; + snprintf(tmp_msg, sizeof(tmp_msg), + "Could not start job %u(%s): %s", + jobid, new_node_list, wait_string); + *err_msg = tmp_msg; + error("wiki: %s", tmp_msg); + + /* restore some of job state */ + job_ptr->priority = 0; + job_ptr->num_procs = old_task_cnt; + rc = -1; } + + if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details) { + /* Restore required node list in case job requeued */ + xfree(job_ptr->details->req_nodes); + job_ptr->details->req_nodes = save_req_nodes; + FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); + job_ptr->details->req_node_bitmap = save_req_bitmap; + FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); + xfree(job_ptr->details->req_node_layout); + } else { + error("wiki: start_job(%u) job missing", jobid); + xfree(save_req_nodes); + FREE_NULL_BITMAP(save_req_bitmap); + } + + unlock_slurmctld(job_write_lock); + schedule_node_save(); /* provides own locking */ + schedule_job_save(); /* provides own locking */ return rc; } diff --git a/src/plugins/sched/wiki/suspend_job.c b/src/plugins/sched/wiki/suspend_job.c new file mode 100644 index 0000000000000000000000000000000000000000..058d2dce3f271bb4f7775a8a3aa5701c80b39bfe --- /dev/null +++ b/src/plugins/sched/wiki/suspend_job.c @@ -0,0 +1,85 @@ +/*****************************************************************************\ + * suspend_job.c - Process Wiki suspend job request + ***************************************************************************** + * Copyright (C) 2006 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov> + * UCRL-CODE-226842. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. 
+ * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+\*****************************************************************************/ + +#include "./msg.h" +#include "src/slurmctld/locks.h" +#include "src/slurmctld/slurmctld.h" + +/* RET 0 on success, -1 on failure */ +extern int suspend_job(char *cmd_ptr, int *err_code, char **err_msg) +{ + char *arg_ptr, *tmp_char; + int slurm_rc; + suspend_msg_t msg; + uint32_t jobid; + static char reply_msg[128]; + /* Locks: write job and node info */ + slurmctld_lock_t job_write_lock = { + NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; + + arg_ptr = strstr(cmd_ptr, "ARG="); + if (arg_ptr == NULL) { + *err_code = -300; + *err_msg = "SUSPENDJOB lacks ARG"; + error("wiki: SUSPENDJOB lacks ARG"); + return -1; + } + jobid = strtoul(arg_ptr+4, &tmp_char, 10); + if ((tmp_char[0] != '\0') && (!isspace(tmp_char[0]))) { + *err_code = -300; + *err_msg = "Invalid ARG value"; + error("wiki: SUSPENDJOB has invalid jobid"); + return -1; + } + + msg.job_id = jobid; + msg.op = SUSPEND_JOB; + lock_slurmctld(job_write_lock); + slurm_rc = job_suspend(&msg, 0, -1); + unlock_slurmctld(job_write_lock); + if (slurm_rc != SLURM_SUCCESS) { + *err_code = -700; + *err_msg = slurm_strerror(slurm_rc); + error("wiki: Failed to suspend job %u (%m)", jobid); + return -1; + } + + snprintf(reply_msg, sizeof(reply_msg), + "job %u suspended successfully", jobid); + *err_msg = reply_msg; + return 0; +} diff --git a/src/plugins/sched/wiki2/get_jobs.c b/src/plugins/sched/wiki2/get_jobs.c index e8688b3d5919a5d994f108a3d9741cefb8c17178..ea2afa63f7a5f7f8f8227a59c49472fd14304355 100644 --- a/src/plugins/sched/wiki2/get_jobs.c +++ b/src/plugins/sched/wiki2/get_jobs.c @@ -50,6 +50,7 @@ static char * _dump_job(struct job_record *job_ptr, int state_info); static char * _get_group_name(gid_t gid); static void _get_job_comment(struct job_record *job_ptr, char *buffer, int buf_size); +static uint16_t _get_job_cpus_per_task(struct job_record *job_ptr); static uint32_t _get_job_end_time(struct job_record *job_ptr); static char * 
_get_job_features(struct job_record *job_ptr); static uint32_t _get_job_min_disk(struct job_record *job_ptr); @@ -274,6 +275,11 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) _get_job_min_nodes(job_ptr)); xstrcat(buf, tmp); + snprintf(tmp, sizeof(tmp), + "DPROCS=%u;", + _get_job_cpus_per_task(job_ptr)); + xstrcat(buf, tmp); + snprintf(tmp, sizeof(tmp), "QUEUETIME=%u;STARTTIME=%u;RCLASS=%s;", _get_job_submit_time(job_ptr), @@ -346,7 +352,8 @@ static void _get_job_comment(struct job_record *job_ptr, /* SHARED NODES */ if (cr_enabled) { /* consumable resources */ - if (job_ptr->part_ptr->shared == SHARED_EXCLUSIVE) + if (job_ptr->part_ptr && + (job_ptr->part_ptr->shared == SHARED_EXCLUSIVE)) sharing = 0; else if (job_ptr->details && (job_ptr->details->shared != 0)) sharing = 1; @@ -383,6 +390,15 @@ static void _get_job_comment(struct job_record *job_ptr, size += snprintf((buffer + size), (buf_size - size), "\";"); } +static uint16_t _get_job_cpus_per_task(struct job_record *job_ptr) +{ + uint16_t cpus_per_task = 1; + + if (job_ptr->details && job_ptr->details->cpus_per_task) + cpus_per_task = job_ptr->details->cpus_per_task; + return cpus_per_task; +} + static uint32_t _get_job_min_mem(struct job_record *job_ptr) { if (job_ptr->details) @@ -440,7 +456,7 @@ static uint32_t _get_job_tasks(struct job_record *job_ptr) job_ptr->details->ntasks_per_node)); } - return task_cnt; + return task_cnt / _get_job_cpus_per_task(job_ptr); } static uint32_t _get_job_time_limit(struct job_record *job_ptr) diff --git a/src/plugins/sched/wiki2/hostlist.c b/src/plugins/sched/wiki2/hostlist.c index c31dc6bf89207b154d55f354e6f062353055becc..7a96bd30cc5956bd22ab0b1c1826373e33ec285a 100644 --- a/src/plugins/sched/wiki2/hostlist.c +++ b/src/plugins/sched/wiki2/hostlist.c @@ -160,7 +160,7 @@ extern char * slurm_job2moab_task_list(struct job_record *job_ptr) /* Return task list in Moab format 1: tux0:tux0:tux1:tux1:tux2 */ static char * _task_list(struct job_record 
*job_ptr) { - int i, j; + int i, j, task_cnt; char *buf = NULL, *host; hostlist_t hl = hostlist_create(job_ptr->nodes); @@ -178,7 +178,10 @@ static char * _task_list(struct job_record *job_ptr) job_ptr->alloc_lps_cnt); break; } - for (j=0; j<job_ptr->alloc_lps[i]; j++) { + task_cnt = job_ptr->alloc_lps[i]; + if (job_ptr->details && job_ptr->details->cpus_per_task) + task_cnt /= job_ptr->details->cpus_per_task; + for (j=0; j<task_cnt; j++) { if (buf) xstrcat(buf, ":"); xstrcat(buf, host); @@ -247,7 +250,7 @@ static void _append_hl_buf(char **buf, hostlist_t *hl_tmp, int *reps) /* Return task list in Moab format 2: tux[0-1]*2:tux2 */ static char * _task_list_exp(struct job_record *job_ptr) { - int i, reps = -1; + int i, reps = -1, task_cnt; char *buf = NULL, *host; hostlist_t hl = hostlist_create(job_ptr->nodes); hostlist_t hl_tmp = (hostlist_t) NULL; @@ -267,7 +270,10 @@ static char * _task_list_exp(struct job_record *job_ptr) break; } - if (reps == job_ptr->alloc_lps[i]) { + task_cnt = job_ptr->alloc_lps[i]; + if (job_ptr->details && job_ptr->details->cpus_per_task) + task_cnt /= job_ptr->details->cpus_per_task; + if (reps == task_cnt) { /* append to existing hostlist record */ if (hostlist_push(hl_tmp, host) == 0) error("hostlist_push failure"); @@ -278,7 +284,7 @@ static char * _task_list_exp(struct job_record *job_ptr) /* start new hostlist record */ hl_tmp = hostlist_create(host); if (hl_tmp) - reps = job_ptr->alloc_lps[i]; + reps = task_cnt; else error("hostlist_create failure"); } diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index 8fc6c6b8a83828d393ea3425a071be8d5102a669..f03eae075ad2d68af028d2bc7f778946e34fdfe1 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -2,7 +2,7 @@ * bg_job_place.c - blue gene job placement (e.g. base block selection) * functions. 
* - * $Id: bg_job_place.c 12533 2007-10-22 23:19:23Z jette $ + * $Id: bg_job_place.c 12627 2007-11-06 19:48:55Z jette $ ***************************************************************************** * Copyright (C) 2004-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -474,7 +474,8 @@ try_again: /* We use the proccessor count per partition here mostly to see if we can run on a smaller partition. */ - convert_num_unit((float)proc_cnt, tmp_char, UNIT_NONE); + convert_num_unit((float)proc_cnt, tmp_char, + sizeof(tmp_char), UNIT_NONE); debug("block %s CPU count (%s) not suitable", record->bg_block_id, tmp_char); @@ -490,7 +491,7 @@ try_again: || (req_nodes != 0 && record->bp_count > req_nodes) || (record->bp_count < target_size)) { convert_num_unit((float)record->node_cnt, tmp_char, - UNIT_NONE); + sizeof(tmp_char), UNIT_NONE); debug("block %s node count (%s) not suitable", record->bg_block_id, tmp_char); diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 6ee686efd8247dc22d6cca202142cb84e6a6f495..ecb3aa3c68a8a99d506c59ab7e738f5afeb6e86f 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -2,7 +2,7 @@ * select_cons_res.c - node selection plugin supporting consumable * resources policies. 
* - * $Id: select_cons_res.c 12452 2007-10-05 19:07:07Z da $ + * $Id: select_cons_res.c 12649 2007-11-15 18:02:35Z da $ *****************************************************************************\ * * The following example below illustrates how four jobs are allocated @@ -545,7 +545,7 @@ static void _append_to_job_list(struct select_cr_job *new_job) /* * _count_cpus - report how many cpus are available with the identified nodes */ -static void _count_cpus(unsigned *bitmap, uint16_t *sum) +static void _count_cpus(bitstr_t *bitmap, uint16_t *sum) { int i, allocated_lps; *sum = 0; diff --git a/src/plugins/switch/federation/federation.c b/src/plugins/switch/federation/federation.c index e67383afb5d4f92d9b04d98eb4c9489b6b8dbe2e..aa0a8d37c8796a418950f38435e3c6d9c1276c3a 100644 --- a/src/plugins/switch/federation/federation.c +++ b/src/plugins/switch/federation/federation.c @@ -1,6 +1,6 @@ /*****************************************************************************\ ** federation.c - Library routines for initiating jobs on IBM Federation - ** $Id: federation.c 11600 2007-05-31 23:59:57Z morrone $ + ** $Id: federation.c 12618 2007-11-02 22:31:39Z jette $ ***************************************************************************** * Copyright (C) 2004 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -2250,14 +2250,20 @@ _check_rdma_job_count(char *adapter) { unsigned int job_count; unsigned int *job_keys; - int z; + int rc, z; + + rc = ntbl_rdma_jobs(NTBL_VERSION, adapter, + &job_count, &job_keys); + if (rc != NTBL_SUCESSS) { + error("ntbl_rdma_jobs(): %d", rc); + return SLURM_ERROR; + } - ntbl_rdma_jobs(NTBL_VERSION, adapter, - &job_count, &job_keys); debug3("Adapter %s, RDMA job_count = %u", adapter, job_count); for (z = 0; z < job_count; z++) debug3(" job key = %u", job_keys[z]); + free(job_keys); if (job_count >= 4) { error("RDMA job_count is too high: %u", job_count); return SLURM_ERROR; diff --git a/src/sacct/print.c b/src/sacct/print.c index 3de3f88d2e9e110a20bd3a6bcc7b285d97fc62cb..dd7172657ee1856eaa9eafa6d70578ce362aca7d 100644 --- a/src/sacct/print.c +++ b/src/sacct/print.c @@ -238,7 +238,8 @@ void print_idrss(type_t type, void *object) rusage = step->rusage; break; } - convert_num_unit((float)rusage.ru_idrss, outbuf, UNIT_NONE); + convert_num_unit((float)rusage.ru_idrss, outbuf, sizeof(outbuf), + UNIT_NONE); printf("%8s", outbuf); } @@ -726,13 +727,14 @@ void print_pages(type_t type, void *object) sacct = job->sacct; nodes = job->nodes; pos = sacct.min_cpu_id.nodeid; - convert_num_unit((float)sacct.max_pages, buf1, UNIT_NONE); + convert_num_unit((float)sacct.max_pages, + buf1, sizeof(buf1), UNIT_NONE); if(job->track_steps) snprintf(outbuf, FORMAT_STRING_SIZE, "%s/- - -", buf1); else { convert_num_unit((float)sacct.ave_pages, - buf2, UNIT_NONE); + buf2, sizeof(buf2), UNIT_NONE); find_hostname(pos, nodes, buf3); snprintf(outbuf, FORMAT_STRING_SIZE, "%s/%s:%u - %s", buf1, @@ -746,8 +748,10 @@ void print_pages(type_t type, void *object) sacct = step->sacct; nodes = step->nodes; pos = sacct.min_cpu_id.nodeid; - convert_num_unit((float)sacct.max_pages, buf1, UNIT_NONE); - convert_num_unit((float)sacct.ave_pages, buf2, UNIT_NONE); + convert_num_unit((float)sacct.max_pages, buf1, sizeof(buf1), + UNIT_NONE); + convert_num_unit((float)sacct.ave_pages, 
buf2, sizeof(buf2), + UNIT_NONE); find_hostname(pos, nodes, buf3); snprintf(outbuf, FORMAT_STRING_SIZE, "%s/%s:%u - %s", buf1, @@ -782,13 +786,14 @@ void print_rss(type_t type, void *object) sacct = job->sacct; nodes = job->nodes; pos = sacct.min_cpu_id.nodeid; - convert_num_unit((float)sacct.max_rss, buf1, UNIT_NONE); + convert_num_unit((float)sacct.max_rss, buf1, sizeof(buf1), + UNIT_NONE); if(job->track_steps) snprintf(outbuf, FORMAT_STRING_SIZE, "%s/- - -", buf1); else { convert_num_unit((float)sacct.ave_rss, - buf2, UNIT_NONE); + buf2, sizeof(buf2), UNIT_NONE); find_hostname(pos, nodes, buf3); snprintf(outbuf, FORMAT_STRING_SIZE, "%s/%s:%u - %s", buf1, @@ -802,8 +807,10 @@ void print_rss(type_t type, void *object) sacct = step->sacct; nodes = step->nodes; pos = sacct.min_cpu_id.nodeid; - convert_num_unit((float)sacct.max_rss, buf1, UNIT_NONE); - convert_num_unit((float)sacct.ave_rss, buf2, UNIT_NONE); + convert_num_unit((float)sacct.max_rss, buf1, sizeof(buf1), + UNIT_NONE); + convert_num_unit((float)sacct.ave_rss, buf2, sizeof(buf2), + UNIT_NONE); find_hostname(pos, nodes, buf3); snprintf(outbuf, FORMAT_STRING_SIZE, "%s/%s:%u - %s", buf1, @@ -1059,12 +1066,13 @@ void print_vsize(type_t type, void *object) sacct = job->sacct; nodes = job->nodes; pos = sacct.min_cpu_id.nodeid; - convert_num_unit((float)sacct.max_vsize, buf1, UNIT_NONE); + convert_num_unit((float)sacct.max_vsize, + buf1, sizeof(buf1),UNIT_NONE); if(job->track_steps) snprintf(outbuf, FORMAT_STRING_SIZE, "%s/- - -", buf1); else { convert_num_unit((float)sacct.ave_vsize, - buf2, UNIT_NONE); + buf2, sizeof(buf2), UNIT_NONE); find_hostname(pos, nodes, buf3); snprintf(outbuf, FORMAT_STRING_SIZE, "%s/%s:%u - %s", buf1, @@ -1078,8 +1086,10 @@ void print_vsize(type_t type, void *object) sacct = step->sacct; nodes = step->nodes; pos = sacct.min_cpu_id.nodeid; - convert_num_unit((float)sacct.max_vsize, buf1, UNIT_NONE); - convert_num_unit((float)sacct.ave_vsize, buf2, UNIT_NONE); + 
convert_num_unit((float)sacct.max_vsize, buf1, sizeof(buf1), + UNIT_NONE); + convert_num_unit((float)sacct.ave_vsize, buf2, sizeof(buf2), + UNIT_NONE); find_hostname(pos, nodes, buf3); snprintf(outbuf, FORMAT_STRING_SIZE, "%s/%s:%u - %s", buf1, diff --git a/src/salloc/opt.c b/src/salloc/opt.c index c6d4e84950a456de2c55199cf355acd188f5a84b..80b5435bcbf52f36d907a8906961f208a9c70672 100644 --- a/src/salloc/opt.c +++ b/src/salloc/opt.c @@ -907,9 +907,8 @@ void set_options(const int argc, char **argv) case LONG_OPT_BEGIN: opt.begin = parse_time(optarg); if (opt.begin == 0) { - error("Invalid time specification %s", + fatal("Invalid time specification %s", optarg); - exit(1); } break; case LONG_OPT_MAIL_TYPE: diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 6d9aa5a3fd17355bfabae0cc5f9346ebc5f5a2bb..66981c6a12ad7f41039babb3d5d59683f2b31017 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -2,7 +2,7 @@ * salloc.c - Request a SLURM job allocation and * launch a user-specified command. * - * $Id: salloc.c 12536 2007-10-22 23:57:53Z jette $ + * $Id: salloc.c 12700 2007-11-27 23:39:24Z jette $ ***************************************************************************** * Copyright (C) 2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -51,6 +51,8 @@ #include "src/salloc/opt.h" #include "src/salloc/msg.h" +#define MAX_RETRIES 3 + char **command_argv; int command_argc; pid_t command_pid = -1; @@ -79,9 +81,11 @@ int main(int argc, char *argv[]) char **env = NULL; int status = 0; int errnum = 0; + int retries = 0; pid_t pid; pid_t rc_pid = 0; int rc = 0; + static char *msg = "Slurm job queue full, sleeping and retrying."; log_init(xbasename(argv[0]), logopt, 0, NULL); if (initialize_and_process_args(argc, argv) < 0) { @@ -116,8 +120,18 @@ int main(int argc, char *argv[]) xsignal(SIGUSR2, _signal_while_allocating); before = time(NULL); - alloc = slurm_allocate_resources_blocking(&desc, opt.max_wait, - _pending_callback); + while ((alloc = slurm_allocate_resources_blocking(&desc, opt.max_wait, + _pending_callback)) == NULL) { + if ((errno != ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) || + (retries >= MAX_RETRIES)) + break; + if (retries == 0) + error(msg); + else + debug(msg); + sleep (++retries); + } + if (alloc == NULL) { if (allocation_interrupted) { /* cancelled by signal */ diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index 24427f8f66d08be25bfb344d27b1e757dce79153..ecdffd4538709179825a5620c8f165e0f30f87aa 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -141,6 +141,7 @@ static void _opt_pbs_batch_script(const void *body, int size); /* set options based upon env vars */ static void _opt_env(void); +static void _proc_get_user_env(char *optarg); /* list known options and their settings */ static void _opt_list(void); @@ -446,7 +447,9 @@ static void _opt_default() opt.ifname = xstrdup("/dev/null"); opt.ofname = NULL; opt.efname = NULL; - opt.get_user_env = -1; + + opt.get_user_env_time = -1; + opt.get_user_env_mode = -1; } /*---[ env var processing ]-----------------------------------------------*/ @@ -616,7 +619,7 @@ static struct option long_options[] = { {"immediate", no_argument, 0, 'I'}, {"job-name", required_argument, 0, 'J'}, {"no-kill", no_argument, 0, 'k'}, - {"tasks", 
required_argument, 0, 'n'}, + {"ntasks", required_argument, 0, 'n'}, {"nodes", required_argument, 0, 'N'}, {"output", required_argument, 0, 'o'}, {"overcommit", no_argument, 0, 'O'}, @@ -1227,6 +1230,10 @@ static void _set_options(int argc, char **argv) break; case LONG_OPT_BEGIN: opt.begin = parse_time(optarg); + if (opt.begin == 0) { + fatal("Invalid time specification %s", + optarg); + } break; case LONG_OPT_MAIL_TYPE: opt.mail_type |= _parse_mail_type(optarg); @@ -1285,15 +1292,17 @@ static void _set_options(int argc, char **argv) break; case LONG_OPT_TASKSPERNODE: opt.tasks_per_node = _get_int(optarg, "ntasks-per-node"); + setenvf(NULL, "SLURM_NTASKS_PER_NODE", "%d", + opt.tasks_per_node); break; case LONG_OPT_WRAP: /* handled in process_options_first_pass() */ break; case LONG_OPT_GET_USER_ENV: if (optarg) - opt.get_user_env = strtol(optarg, NULL, 10); + _proc_get_user_env(optarg); else - opt.get_user_env = 0; + opt.get_user_env_time = 0; break; default: fatal("Unrecognized command line parameter %c", @@ -1306,6 +1315,25 @@ static void _set_options(int argc, char **argv) } } +static void _proc_get_user_env(char *optarg) +{ + char *end_ptr; + + if ((optarg[0] >= '0') && (optarg[0] <= '9')) + opt.get_user_env_time = strtol(optarg, &end_ptr, 10); + else { + opt.get_user_env_time = 0; + end_ptr = optarg; + } + + if ((end_ptr == NULL) || (end_ptr[0] == '\0')) + return; + if ((end_ptr[0] == 's') || (end_ptr[0] == 'S')) + opt.get_user_env_mode = 1; + else if ((end_ptr[0] == 'l') || (end_ptr[0] == 'L')) + opt.get_user_env_mode = 2; +} + static void _set_pbs_options(int argc, char **argv) { int opt_char, option_index = 0; diff --git a/src/sbatch/opt.h b/src/sbatch/opt.h index a14602553c88ae4d7250109c8f8a75e83e60176c..9591a83869a1b5c93bf4616e169d3d4690230f4d 100644 --- a/src/sbatch/opt.h +++ b/src/sbatch/opt.h @@ -1,6 +1,6 @@ /*****************************************************************************\ * opt.h - definitions for srun option processing - * $Id: opt.h 
12574 2007-10-26 17:00:52Z jette $ + * $Id: opt.h 12697 2007-11-27 22:02:29Z jette $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -116,7 +116,8 @@ typedef struct sbatch_options { char *ifname; /* input file name */ char *ofname; /* output file name */ char *efname; /* error file name */ - int get_user_env; /* --get-user-env[=timeout] */ + int get_user_env_time; /* --get-user-env[=timeout] */ + int get_user_env_mode; /* --get-user-env=[S|L] */ } opt_t; extern opt_t opt; diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c index 7e2e11049b7525c37c683039ff27af68ce736b6a..b5b340cda91387498e3eeee7afcb114b8dc61562 100644 --- a/src/sbatch/sbatch.c +++ b/src/sbatch/sbatch.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * sbatch.c - Submit a SLURM batch script. * - * $Id: sbatch.c 12574 2007-10-26 17:00:52Z jette $ + * $Id: sbatch.c 12700 2007-11-27 23:39:24Z jette $ ***************************************************************************** * Copyright (C) 2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -46,6 +46,8 @@ #include "src/sbatch/opt.h" +#define MAX_RETRIES 3 + static int fill_job_desc_from_opts(job_desc_msg_t *desc); static void *get_script_buffer(const char *filename, int *size); static void set_prio_process_env(void); @@ -60,6 +62,7 @@ int main(int argc, char *argv[]) char *script_name; void *script_body; int script_size = 0; + int retries = 0; log_init(xbasename(argv[0]), logopt, 0, NULL); script_name = process_options_first_pass(argc, argv); @@ -93,10 +96,21 @@ int main(int argc, char *argv[]) desc.script = (char *)script_body; - if (slurm_submit_batch_job(&desc, &resp) == -1) { - error("Batch job submission failed: %m"); - exit(3); - } + while (slurm_submit_batch_job(&desc, &resp) < 0) { + static char *msg = "Slurm job queue full, sleeping and retrying."; + + if ((errno != ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) || + (retries >= MAX_RETRIES)) { + error("Batch job submission failed: %m"); + exit(3); + } + + if (retries) + debug(msg); + else + error(msg); + sleep (++retries); + } info("Submitted batch job %d", resp->job_id); xfree(desc.script); @@ -194,12 +208,13 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc) desc->shared = opt.shared; desc->environment = NULL; - if (opt.get_user_env >= 0) { + if (opt.get_user_env_time >= 0) { struct passwd *pw = NULL; pw = getpwuid(opt.uid); if (pw != NULL) { desc->environment = env_array_user_default(pw->pw_name, - opt.get_user_env); + opt.get_user_env_time, + opt.get_user_env_mode); /* FIXME - should we abort if j->environment * is NULL? 
*/ } diff --git a/src/sinfo/print.c b/src/sinfo/print.c index 9a235a17627ba10e43023c9d567ea0dd83854ef0..6e4704c245d9d170673d4e6aa984c58dfe7b3cdb 100644 --- a/src/sinfo/print.c +++ b/src/sinfo/print.c @@ -172,10 +172,10 @@ static int _print_secs(long time, int width, bool right, bool cut_output) static int _build_min_max_16_string(char *buffer, int buf_size, uint16_t min, uint16_t max, bool range) { - char tmp_min[7]; - char tmp_max[7]; - convert_num_unit((float)min, tmp_min, UNIT_NONE); - convert_num_unit((float)max, tmp_max, UNIT_NONE); + char tmp_min[8]; + char tmp_max[8]; + convert_num_unit((float)min, tmp_min, sizeof(tmp_min), UNIT_NONE); + convert_num_unit((float)max, tmp_max, sizeof(tmp_max), UNIT_NONE); if (max == min) return snprintf(buffer, buf_size, "%s", tmp_max); @@ -193,10 +193,10 @@ _build_min_max_16_string(char *buffer, int buf_size, uint16_t min, uint16_t max, static int _build_min_max_32_string(char *buffer, int buf_size, uint32_t min, uint32_t max, bool range) { - char tmp_min[7]; - char tmp_max[7]; - convert_num_unit((float)min, tmp_min, UNIT_NONE); - convert_num_unit((float)max, tmp_max, UNIT_NONE); + char tmp_min[8]; + char tmp_max[8]; + convert_num_unit((float)min, tmp_min, sizeof(tmp_min), UNIT_NONE); + convert_num_unit((float)max, tmp_max, sizeof(tmp_max), UNIT_NONE); if (max == min) return snprintf(buffer, buf_size, "%s", tmp_max); @@ -305,20 +305,20 @@ int _print_cpus_aiot(sinfo_data_t * sinfo_data, int width, bool right_justify, char *suffix) { char id[FORMAT_STRING_SIZE]; - char tmpa[7]; - char tmpi[7]; - char tmpo[7]; - char tmpt[7]; + char tmpa[8]; + char tmpi[8]; + char tmpo[8]; + char tmpt[8]; if (sinfo_data) { #ifdef HAVE_BG convert_num_unit((float)sinfo_data->cpus_alloc, - tmpa, UNIT_NONE); + tmpa, sizeof(tmpa), UNIT_NONE); convert_num_unit((float)sinfo_data->cpus_idle, - tmpi, UNIT_NONE); + tmpi, sizeof(tmpi), UNIT_NONE); convert_num_unit((float)sinfo_data->cpus_other, - tmpo, UNIT_NONE); + tmpo, sizeof(tmpo), UNIT_NONE); 
convert_num_unit((float)sinfo_data->cpus_total, - tmpt, UNIT_NONE); + tmpt, sizeof(tmpt), UNIT_NONE); #else sprintf(tmpa, "%u", sinfo_data->cpus_alloc); sprintf(tmpi, "%u", sinfo_data->cpus_idle); @@ -516,13 +516,13 @@ int _print_nodes_t(sinfo_data_t * sinfo_data, int width, bool right_justify, char *suffix) { char id[FORMAT_STRING_SIZE]; - char tmp[7]; + char tmp[8]; if (sinfo_data) { #ifdef HAVE_BG convert_num_unit((float)sinfo_data->nodes_total, tmp, - UNIT_NONE); + sizeof(tmp), UNIT_NONE); #else - sprintf(tmp, "%d", sinfo_data->nodes_total); + snprintf(tmp, sizeof(tmp), "%d", sinfo_data->nodes_total); #endif snprintf(id, FORMAT_STRING_SIZE, "%s", tmp); _print_str(id, width, right_justify, true); @@ -538,17 +538,17 @@ int _print_nodes_ai(sinfo_data_t * sinfo_data, int width, bool right_justify, char *suffix) { char id[FORMAT_STRING_SIZE]; - char tmpa[7]; - char tmpi[7]; + char tmpa[8]; + char tmpi[8]; if (sinfo_data) { #ifdef HAVE_BG convert_num_unit((float)sinfo_data->nodes_alloc, - tmpa, UNIT_NONE); + tmpa, sizeof(tmpa), UNIT_NONE); convert_num_unit((float)sinfo_data->nodes_idle, - tmpi, UNIT_NONE); + tmpi, sizeof(tmpi), UNIT_NONE); #else - sprintf(tmpa, "%d", sinfo_data->nodes_alloc); - sprintf(tmpi, "%d", sinfo_data->nodes_idle); + snprintf(tmpa, sizeof(tmpa), "%d", sinfo_data->nodes_alloc); + snprintf(tmpi, sizeof(tmpi), "%d", sinfo_data->nodes_idle); #endif snprintf(id, FORMAT_STRING_SIZE, "%s/%s", tmpa, tmpi); @@ -565,25 +565,25 @@ int _print_nodes_aiot(sinfo_data_t * sinfo_data, int width, bool right_justify, char *suffix) { char id[FORMAT_STRING_SIZE]; - char tmpa[7]; - char tmpi[7]; - char tmpo[7]; - char tmpt[7]; + char tmpa[8]; + char tmpi[8]; + char tmpo[8]; + char tmpt[8]; if (sinfo_data) { #ifdef HAVE_BG convert_num_unit((float)sinfo_data->nodes_alloc, - tmpa, UNIT_NONE); + tmpa, sizeof(tmpa), UNIT_NONE); convert_num_unit((float)sinfo_data->nodes_idle, - tmpi, UNIT_NONE); + tmpi, sizeof(tmpi), UNIT_NONE); 
convert_num_unit((float)sinfo_data->nodes_other, - tmpo, UNIT_NONE); + tmpo, sizeof(tmpo), UNIT_NONE); convert_num_unit((float)sinfo_data->nodes_total, - tmpt, UNIT_NONE); + tmpt, sizeof(tmpt), UNIT_NONE); #else - sprintf(tmpa, "%u", sinfo_data->nodes_alloc); - sprintf(tmpi, "%u", sinfo_data->nodes_idle); - sprintf(tmpo, "%u", sinfo_data->nodes_other); - sprintf(tmpt, "%u", sinfo_data->nodes_total); + snprintf(tmpa, sizeof(tmpa), "%u", sinfo_data->nodes_alloc); + snprintf(tmpi, sizeof(tmpi), "%u", sinfo_data->nodes_idle); + snprintf(tmpo, sizeof(tmpo), "%u", sinfo_data->nodes_other); + snprintf(tmpt, sizeof(tmpt), "%u", sinfo_data->nodes_total); #endif snprintf(id, FORMAT_STRING_SIZE, "%s/%s/%s/%s", diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 521700b5e27b2b86b016cd49cc4b3d68568d519d..7d589f168eeae287921eed73cfb5f503de756b82 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -3,7 +3,7 @@ * Note: there is a global job list (job_list), time stamp * (last_job_update), and hash table (job_hash) * - * $Id: job_mgr.c 12460 2007-10-05 23:50:48Z jette $ + * $Id: job_mgr.c 12655 2007-11-20 21:02:43Z jette $ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -3090,15 +3090,15 @@ void reset_job_bitmaps(void) job_ptr->part_ptr = part_ptr; FREE_NULL_BITMAP(job_ptr->node_bitmap); - if ((job_ptr->nodes_completing) - && (node_name2bitmap(job_ptr->nodes_completing, - false, &job_ptr->node_bitmap))) { + if ((job_ptr->nodes_completing) && + (node_name2bitmap(job_ptr->nodes_completing, + false, &job_ptr->node_bitmap))) { error("Invalid nodes (%s) for job_id %u", job_ptr->nodes_completing, job_ptr->job_id); job_fail = true; - } else if ((job_ptr->nodes) - && (node_name2bitmap(job_ptr->nodes, false, + } else if ((job_ptr->node_bitmap == NULL) && job_ptr->nodes && + (node_name2bitmap(job_ptr->nodes, false, &job_ptr->node_bitmap))) { error("Invalid nodes (%s) for job_id %u", job_ptr->nodes, job_ptr->job_id); @@ -4786,6 +4786,8 @@ extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd conn_fd) else job_ptr->end_time = now; deallocate_nodes(job_ptr, false, suspended); + if (job_ptr->details) + xfree(job_ptr->details->req_node_layout); job_completion_logger(job_ptr); //FIXME: Test accounting @@ -4834,12 +4836,11 @@ extern void update_job_nodes_completing(void) job_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_iterator))) { - if ((job_ptr->job_state & JOB_COMPLETING) == 0) + if (((job_ptr->job_state & JOB_COMPLETING) == 0) || + (job_ptr->node_bitmap == NULL)) continue; - if (job_ptr->nodes_completing) /* no change */ - continue; - node_name2bitmap(job_ptr->nodes_completing, - false, &job_ptr->node_bitmap); + xfree(job_ptr->nodes_completing); + job_ptr->nodes_completing = bitmap2node_name(job_ptr->node_bitmap); } list_iterator_destroy(job_iterator); } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 3aeb4c69e6eb6074ff9c7ef829bdff6bf2bf4c85..20cf46bccef4a17b41a262b220614918ed0c4d80 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -2,7 +2,7 @@ * node_scheduler.c - select and allocated nodes to jobs * Note: there 
is a global node table (node_record_table_ptr) * - * $Id: node_scheduler.c 12452 2007-10-05 19:07:07Z da $ + * $Id: node_scheduler.c 12631 2007-11-06 22:48:18Z da $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -137,7 +137,7 @@ extern void allocate_nodes(struct job_record *job_ptr) * globals: node_record_count - number of nodes configured * node_record_table_ptr - pointer to global node table */ -extern int count_cpus(unsigned *bitmap) +extern int count_cpus(bitstr_t *bitmap) { int i, sum; diff --git a/src/slurmctld/node_scheduler.h b/src/slurmctld/node_scheduler.h index 7febc9ad74a628daca6aab9bcf92a0f264911bc9..d44fa4eff0ccf26c3f1ea96459ef5df98cb92130 100644 --- a/src/slurmctld/node_scheduler.h +++ b/src/slurmctld/node_scheduler.h @@ -61,7 +61,7 @@ extern void build_node_details(struct job_record *job_ptr); * globals: node_record_count - number of nodes configured * node_record_table_ptr - pointer to global node table */ -extern int count_cpus(unsigned *bitmap); +extern int count_cpus(bitstr_t *bitmap); /* * deallocate_nodes - for a given job, deallocate its nodes and make diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 09617451a21ca7964eb802b1bbae1c4dcd4cf7fb..aca9e9bba65c27f6e9d00a335723c7b0d9dea0ad 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -697,6 +697,12 @@ int read_slurm_conf(int recover) /* initialization */ START_TIMER; + if (recover == 0) { + /* in order to re-use job state information, + * update nodes_completing string (based on node_bitmap) */ + update_job_nodes_completing(); + } + /* save node states for reconfig RPC */ old_node_record_count = node_record_count; old_node_table_ptr = node_record_table_ptr; @@ -708,11 +714,6 @@ int read_slurm_conf(int recover) node_record_table_ptr = NULL; node_record_count = 0; - if 
(recover == 0) { - /* in order to re-use job state information, - * update nodes_completing string (based on node_bitmap) */ - update_job_nodes_completing(); - } if ((error_code = _init_all_slurm_conf())) { node_record_table_ptr = old_node_table_ptr; return error_code; @@ -1005,6 +1006,7 @@ static int _sync_nodes_to_comp_job(void) deallocate_nodes(job_ptr, false, false); } } + list_iterator_destroy(job_iterator); if (update_cnt) info("_sync_nodes_to_comp_job completing %d jobs", update_cnt); diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 483fbe39a577670db16a3277a829ab5f1868ab75..43aa77897f19ea43a3e85d9bb5c09bafee175639 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * step_mgr.c - manage the job step information of slurm - * $Id: step_mgr.c 11969 2007-08-08 23:13:12Z da $ + * $Id: step_mgr.c 12681 2007-11-26 18:56:25Z jette $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -701,7 +701,7 @@ step_create(job_step_create_request_msg_t *step_specs, if (job_ptr == NULL) return ESLURM_INVALID_JOB_ID ; - if (job_ptr->job_state == JOB_SUSPENDED) + if ((job_ptr->job_state == JOB_SUSPENDED) || IS_JOB_PENDING(job_ptr)) return ESLURM_DISABLED; if (batch_step) { @@ -716,9 +716,6 @@ step_create(job_step_create_request_msg_t *step_specs, (step_specs->user_id != 0)) return ESLURM_ACCESS_DENIED ; - if (IS_JOB_PENDING(job_ptr)) - return ESLURM_INVALID_JOB_ID ; - if (IS_JOB_FINISHED(job_ptr) || (job_ptr->end_time <= time(NULL))) return ESLURM_ALREADY_DONE; diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 7d1f3aa16ede8b9fa5d3b16480713de7adc48fe3..0091fadc28f0b3c48fb7fa12437a9a1170f4aadf 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * src/slurmd/slurmstepd/mgr.c - job manager functions for slurmstepd - * $Id: mgr.c 12580 2007-10-29 20:17:09Z jette $ + * $Id: mgr.c 12647 2007-11-12 17:09:47Z da $ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -537,7 +537,7 @@ _one_step_complete_msg(slurmd_job_t *job, int first, int last) for (i = 0; i < REVERSE_TREE_PARENT_RETRY; i++) { if (i) sleep(1); - retcode = slurm_send_recv_rc_msg_only_one(&req, &rc, 10000); + retcode = slurm_send_recv_rc_msg_only_one(&req, &rc, 0); if (retcode == 0 && rc == 0) goto finished; } diff --git a/src/smap/job_functions.c b/src/smap/job_functions.c index 8536e27b773e7bff9216cc33827217d72ffcbfdf..21eedfc16d9ddb797df147b1db7b014b3a33b572 100644 --- a/src/smap/job_functions.c +++ b/src/smap/job_functions.c @@ -251,7 +251,7 @@ static int _print_text_job(job_info_t * job_ptr) int i = 0; int width = 0; char time_buf[20]; - char tmp_cnt[7]; + char tmp_cnt[8]; uint32_t node_cnt = 0; char *ionodes = NULL; time_t now_time = time(NULL); @@ -271,9 +271,9 @@ static int _print_text_job(job_info_t * job_ptr) if ((node_cnt == 0) || (node_cnt == NO_VAL)) node_cnt = _get_node_cnt(job_ptr); #ifdef HAVE_BG - convert_num_unit((float)node_cnt, tmp_cnt, UNIT_NONE); + convert_num_unit((float)node_cnt, tmp_cnt, sizeof(tmp_cnt), UNIT_NONE); #else - sprintf(tmp_cnt, "%d", node_cnt); + snprintf(tmp_cnt, sizeof(tmp_cnt), "%d", node_cnt); #endif if(!params.commandline) { mvwprintw(text_win, main_ycord, diff --git a/src/smap/partition_functions.c b/src/smap/partition_functions.c index 2a04de9abecf8b6bd8706ee5afea83f1b4fb61be..2dd270b00740fe0bfd3aece334267731ad25059b 100644 --- a/src/smap/partition_functions.c +++ b/src/smap/partition_functions.c @@ -517,12 +517,13 @@ static int _print_text_part(partition_info_t *part_ptr, int i = 0; int width = 0; char *nodes = NULL, time_buf[20]; - char tmp_cnt[7]; + char tmp_cnt[8]; #ifdef HAVE_BG - convert_num_unit((float)part_ptr->total_nodes, tmp_cnt, UNIT_NONE); + convert_num_unit((float)part_ptr->total_nodes, tmp_cnt, + sizeof(tmp_cnt), UNIT_NONE); #else - sprintf(tmp_cnt, "%u", part_ptr->total_nodes); + snprintf(tmp_cnt, sizeof(tmp_cnt), "%u", part_ptr->total_nodes); #endif if(!params.commandline) { diff --git 
a/src/squeue/print.c b/src/squeue/print.c index e43b17f4418d9de54dcf8a699e974dec9aa634e5..ff4c3ffe642c3106e89832bccd8e437f43ca5145 100644 --- a/src/squeue/print.c +++ b/src/squeue/print.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * print.c - squeue print job functions - * $Id: print.c 12594 2007-10-31 22:27:56Z jette $ + * $Id: print.c 12627 2007-11-06 19:48:55Z jette $ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -543,7 +543,7 @@ int _print_job_reason_list(job_info_t * job, int width, bool right, char* suffix) { char *ionodes = NULL; - char tmp_char[6]; + char tmp_char[16]; if (job == NULL) { /* Print the Header instead */ #ifdef HAVE_BG @@ -567,7 +567,8 @@ int _print_job_reason_list(job_info_t * job, int width, bool right, _print_nodes(job->nodes, width, right, false); if(ionodes) { - sprintf(tmp_char, "[%s]", ionodes); + snprintf(tmp_char, sizeof(tmp_char), "[%s]", + ionodes); _print_str(tmp_char, width, right, false); } } @@ -600,7 +601,7 @@ int _print_job_node_inx(job_info_t * job, int width, bool right, char* suffix) int _print_job_num_procs(job_info_t * job, int width, bool right, char* suffix) { - char tmp_char[6]; + char tmp_char[8]; if (job == NULL) /* Print the Header instead */ _print_str("CPUS", width, right, true); else { @@ -610,9 +611,11 @@ int _print_job_num_procs(job_info_t * job, int width, bool right, char* suffix) cnt += job->cpus_per_node[i] * job->cpu_count_reps[i]; } - convert_num_unit((float)cnt, tmp_char, UNIT_NONE); + convert_num_unit((float)cnt, tmp_char, + sizeof(tmp_char), UNIT_NONE); } else { - convert_num_unit((float)job->num_procs, tmp_char, UNIT_NONE); + convert_num_unit((float)job->num_procs, tmp_char, + sizeof(tmp_char), UNIT_NONE); } _print_str(tmp_char, width, right, true); } @@ -625,7 +628,7 @@ int 
_print_job_num_nodes(job_info_t * job, int width, bool right_justify, char* suffix) { uint32_t node_cnt = 0; - char tmp_char[6]; + char tmp_char[8]; if (job == NULL) /* Print the Header instead */ _print_str("NODES", width, right_justify, true); @@ -639,9 +642,10 @@ int _print_job_num_nodes(job_info_t * job, int width, bool right_justify, node_cnt = _get_node_cnt(job); #ifdef HAVE_BG - convert_num_unit((float)node_cnt, tmp_char, UNIT_NONE); + convert_num_unit((float)node_cnt, tmp_char, sizeof(tmp_char), + UNIT_NONE); #else - sprintf(tmp_char, "%d", node_cnt); + snprintf(tmp_char, sizeof(tmp_char), "%d", node_cnt); #endif _print_str(tmp_char, width, right_justify, true); } @@ -675,9 +679,12 @@ int _print_job_num_sct(job_info_t * job, int width, bool right_justify, char threads[10]; char sct[(10+1)*3]; if (job) { - convert_num_unit((float)job->min_sockets, sockets, UNIT_NONE); - convert_num_unit((float)job->min_cores, cores, UNIT_NONE); - convert_num_unit((float)job->min_threads, threads, UNIT_NONE); + convert_num_unit((float)job->min_sockets, sockets, + sizeof(sockets), UNIT_NONE); + convert_num_unit((float)job->min_cores, cores, + sizeof(cores), UNIT_NONE); + convert_num_unit((float)job->min_threads, threads, + sizeof(threads), UNIT_NONE); sct[0] = '\0'; strcat(sct, sockets); strcat(sct, ":"); @@ -701,7 +708,8 @@ int _print_job_num_sockets(job_info_t * job, int width, bool right_justify, if (job == NULL) /* Print the Header instead */ _print_str("SOCKETS", width, right_justify, true); else { - convert_num_unit((float)job->min_sockets, tmp_char, UNIT_NONE); + convert_num_unit((float)job->min_sockets, tmp_char, + sizeof(tmp_char), UNIT_NONE); _print_str(tmp_char, width, right_justify, true); } if (suffix) @@ -716,7 +724,8 @@ int _print_job_num_cores(job_info_t * job, int width, bool right_justify, if (job == NULL) /* Print the Header instead */ _print_str("CORES", width, right_justify, true); else { - convert_num_unit((float)job->min_cores, tmp_char, UNIT_NONE); + 
convert_num_unit((float)job->min_cores, tmp_char, + sizeof(tmp_char), UNIT_NONE); _print_str(tmp_char, width, right_justify, true); } if (suffix) @@ -731,7 +740,8 @@ int _print_job_num_threads(job_info_t * job, int width, bool right_justify, if (job == NULL) /* Print the Header instead */ _print_str("THREADS", width, right_justify, true); else { - convert_num_unit((float)job->min_threads, tmp_char, UNIT_NONE); + convert_num_unit((float)job->min_threads, tmp_char, + sizeof(tmp_char), UNIT_NONE); _print_str(tmp_char, width, right_justify, true); } if (suffix) @@ -789,12 +799,13 @@ int _print_job_contiguous(job_info_t * job, int width, bool right_justify, int _print_job_min_procs(job_info_t * job, int width, bool right_justify, char* suffix) { - char tmp_char[6]; + char tmp_char[8]; if (job == NULL) /* Print the Header instead */ _print_str("MIN_PROCS", width, right_justify, true); else { - convert_num_unit((float)job->job_min_procs, tmp_char, UNIT_NONE); + convert_num_unit((float)job->job_min_procs, tmp_char, + sizeof(tmp_char), UNIT_NONE); _print_str(tmp_char, width, right_justify, true); } if (suffix) @@ -805,12 +816,13 @@ int _print_job_min_procs(job_info_t * job, int width, bool right_justify, int _print_job_min_sockets(job_info_t * job, int width, bool right_justify, char* suffix) { - char tmp_char[6]; + char tmp_char[8]; if (job == NULL) /* Print the Header instead */ _print_str("MIN_SOCKETS", width, right_justify, true); else { - convert_num_unit((float)job->job_min_sockets, tmp_char, UNIT_NONE); + convert_num_unit((float)job->job_min_sockets, tmp_char, + sizeof(tmp_char), UNIT_NONE); _print_str(tmp_char, width, right_justify, true); } if (suffix) @@ -821,12 +833,13 @@ int _print_job_min_sockets(job_info_t * job, int width, bool right_justify, int _print_job_min_cores(job_info_t * job, int width, bool right_justify, char* suffix) { - char tmp_char[6]; + char tmp_char[8]; if (job == NULL) /* Print the Header instead */ _print_str("MIN_CORES", width, 
right_justify, true); else { - convert_num_unit((float)job->job_min_cores, tmp_char, UNIT_NONE); + convert_num_unit((float)job->job_min_cores, tmp_char, + sizeof(tmp_char), UNIT_NONE); _print_str(tmp_char, width, right_justify, true); } if (suffix) @@ -837,12 +850,13 @@ int _print_job_min_cores(job_info_t * job, int width, bool right_justify, int _print_job_min_threads(job_info_t * job, int width, bool right_justify, char* suffix) { - char tmp_char[6]; + char tmp_char[8]; if (job == NULL) /* Print the Header instead */ _print_str("MIN_THREADS", width, right_justify, true); else { - convert_num_unit((float)job->job_min_threads, tmp_char, UNIT_NONE); + convert_num_unit((float)job->job_min_threads, tmp_char, + sizeof(tmp_char), UNIT_NONE); _print_str(tmp_char, width, right_justify, true); } if (suffix) @@ -862,11 +876,13 @@ int _print_job_min_memory(job_info_t * job, int width, bool right_justify, else { tmp_char[0] = '\0'; if (job->job_max_memory < job->job_min_memory) { - convert_num_unit((float)job->job_max_memory, max_mem, UNIT_NONE); + convert_num_unit((float)job->job_max_memory, max_mem, + sizeof(max_mem), UNIT_NONE); strcat(tmp_char, max_mem); strcat(tmp_char, "-"); - } - convert_num_unit((float)job->job_min_memory, min_mem, UNIT_NONE); + } + convert_num_unit((float)job->job_min_memory, min_mem, + sizeof(min_mem), UNIT_NONE); strcat(tmp_char, min_mem); _print_str(tmp_char, width, right_justify, true); } @@ -880,13 +896,13 @@ int _print_job_min_tmp_disk(job_info_t * job, int width, bool right_justify, char* suffix) { - char tmp_char[6]; + char tmp_char[10]; if (job == NULL) /* Print the Header instead */ _print_str("MIN_TMP_DISK", width, right_justify, true); else { convert_num_unit((float)job->job_min_tmp_disk, - tmp_char, UNIT_NONE); + tmp_char, sizeof(tmp_char), UNIT_NONE); _print_str(tmp_char, width, right_justify, true); } diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 
92ea573ee6a1ae677678bc1939909717042c775a..a06ea95966d6af57affd4ec8dfbbc3c5fa0b218f 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * src/srun/allocate.c - srun functions for managing node allocations - * $Id: allocate.c 12574 2007-10-26 17:00:52Z jette $ + * $Id: allocate.c 12700 2007-11-27 23:39:24Z jette $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -363,8 +363,7 @@ static bool _retry() { static int retries = 0; - static char *msg = "Slurm controller not responding, " - "sleeping and retrying."; + static char *msg = "Slurm job queue full, sleeping and retrying."; if (errno == ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) { if (retries == 0) @@ -558,13 +557,14 @@ job_desc_msg_create_from_opts (char *script) xassert (opt.batch); j->environment = NULL; - if (opt.get_user_env >= 0) { + if (opt.get_user_env_time >= 0) { struct passwd *pw = NULL; pw = getpwuid(opt.uid); if (pw != NULL) { j->environment = env_array_user_default(pw->pw_name, - opt.get_user_env); + opt.get_user_env_time, + opt.get_user_env_mode); /* FIXME - should we abort if j->environment is NULL? */ } diff --git a/src/srun/opt.c b/src/srun/opt.c index 17a78067d998f589dfc12563de3b4931cb2d040d..e00bf4f232e8ae13d2daed56fc06f40459eb3e54 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -1,6 +1,6 @@ /*****************************************************************************\ * opt.c - options processing for srun - * $Id: opt.c 12583 2007-10-30 17:01:31Z jette $ + * $Id: opt.c 12711 2007-11-29 00:04:01Z jette $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -78,7 +78,6 @@ #include "src/common/xmalloc.h" #include "src/common/xstring.h" #include "src/common/slurm_rlimits_info.h" -#include "src/common/parse_time.h" #include "src/common/plugstack.h" #include "src/common/optz.h" #include "src/api/pmi_server.h" @@ -193,8 +192,8 @@ static void _opt_default(void); /* set options based upon env vars */ static void _opt_env(void); - static void _opt_args(int argc, char **argv); +static void _proc_get_user_env(char *optarg); /* list known options and their settings */ static void _opt_list(void); @@ -1045,7 +1044,8 @@ static void _opt_default() opt.msg_timeout = 15; } - opt.get_user_env = -1; + opt.get_user_env_time = -1; + opt.get_user_env_mode = -1; } /*---[ env var processing ]-----------------------------------------------*/ @@ -1884,6 +1884,10 @@ void set_options(const int argc, char **argv, int first) break; case LONG_OPT_BEGIN: opt.begin = parse_time(optarg); + if (opt.begin == 0) { + fatal("Invalid time specification %s", + optarg); + } break; case LONG_OPT_MAIL_TYPE: opt.mail_type |= _parse_mail_type(optarg); @@ -2006,9 +2010,9 @@ void set_options(const int argc, char **argv, int first) break; case LONG_OPT_GET_USER_ENV: if (optarg) - opt.get_user_env = strtol(optarg, NULL, 10); + _proc_get_user_env(optarg); else - opt.get_user_env = 0; + opt.get_user_env_time = 0; break; default: if (spank_process_option (opt_char, optarg) < 0) { @@ -2027,6 +2031,25 @@ void set_options(const int argc, char **argv, int first) spank_option_table_destroy (optz); } +static void _proc_get_user_env(char *optarg) +{ + char *end_ptr; + + if ((optarg[0] >= '0') && (optarg[0] <= '9')) + opt.get_user_env_time = strtol(optarg, &end_ptr, 10); + else { + opt.get_user_env_time = 0; + end_ptr = optarg; + } + + if ((end_ptr == NULL) || (end_ptr[0] == '\0')) + return; + if ((end_ptr[0] == 's') || (end_ptr[0] == 'S')) + opt.get_user_env_mode = 1; + else if ((end_ptr[0] == 'l') || (end_ptr[0] == 'L')) + opt.get_user_env_mode = 2; +} + /* Load the 
multi_prog config file into argv, pass the entire file contents * in order to avoid having to read the file on every node. We could parse * the infomration here too for loading the MPIR records for TotalView */ diff --git a/src/srun/opt.h b/src/srun/opt.h index 1bd844e00b1c1178f87cacfeda0ab7ec3a9a83ae..e7e67e2b7c397f529ec5c13800f574154a724e8c 100644 --- a/src/srun/opt.h +++ b/src/srun/opt.h @@ -1,6 +1,6 @@ /*****************************************************************************\ * opt.h - definitions for srun option processing - * $Id: opt.h 12574 2007-10-26 17:00:52Z jette $ + * $Id: opt.h 12697 2007-11-27 22:02:29Z jette $ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -212,7 +212,8 @@ typedef struct srun_options { uint16_t mail_type; /* --mail-type */ char *mail_user; /* --mail-user */ char *ctrl_comm_ifhn; /* --ctrl-comm-ifhn */ - int get_user_env; /* --get-user-env[=secs] */ + int get_user_env_time; /* --get-user-env[=secs] */ + int get_user_env_mode; /* --get-user-env=[S|L] */ } opt_t; extern opt_t opt; diff --git a/src/srun/srun.c b/src/srun/srun.c index 4fb3f0b001c143e2a824ee06b0717be9f7fb1770..82b01cd39ad14b1d60739fed76384b7f491f70d9 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -597,12 +597,12 @@ _print_job_information(resource_allocation_response_msg_t *resp) static int _run_batch_job(const char *argv0) { - int file_type, retries; + int file_type, retries = 0; int rc = SLURM_SUCCESS; job_desc_msg_t *req; submit_response_msg_t *resp; char *script; - void (*log_msg) (const char *fmt, ...) 
= (void (*)) &error; + static char *msg = "Slurm job queue full, sleeping and retrying."; if ((remote_argc == 0) || (remote_argv[0] == NULL)) return SLURM_ERROR; @@ -629,15 +629,15 @@ _run_batch_job(const char *argv0) if (!opt.jobid_set) req->job_id = NO_VAL; - retries = 0; - while ( (retries < MAX_RETRIES) - && (rc = slurm_submit_batch_job(req, &resp)) < 0) { - - if (errno != ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) + while ((rc = slurm_submit_batch_job(req, &resp)) < 0) { + if ((errno != ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) || + (retries >= MAX_RETRIES)) return (error("Unable to submit batch job: %m")); - (*log_msg) ("Controller not responding, retrying..."); - log_msg = &debug; + if (retries == 0) + error(msg); + else + debug(msg); sleep (++retries); } diff --git a/src/sview/block_info.c b/src/sview/block_info.c index 47654ea7f6defd63557a2143ad8bb4359ee0001c..ee1afa88e4063eaec4d3fdbedc030eb268244fc9 100644 --- a/src/sview/block_info.c +++ b/src/sview/block_info.c @@ -207,7 +207,7 @@ static void _layout_block_record(GtkTreeView *treeview, sview_block_info_t *block_ptr, int update) { - char tmp_cnt[7]; + char tmp_cnt[8]; GtkTreeIter iter; GtkTreeStore *treestore = GTK_TREE_STORE(gtk_tree_view_get_model(treeview)); @@ -238,7 +238,8 @@ static void _layout_block_record(GtkTreeView *treeview, SORTID_USE), _convert_node_use(block_ptr->bg_node_use)); - convert_num_unit((float)block_ptr->node_cnt, tmp_cnt, UNIT_NONE); + convert_num_unit((float)block_ptr->node_cnt, tmp_cnt, sizeof(tmp_cnt), + UNIT_NONE); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_block, SORTID_NODES), @@ -269,7 +270,7 @@ static void _layout_block_record(GtkTreeView *treeview, static void _update_block_record(sview_block_info_t *block_ptr, GtkTreeStore *treestore, GtkTreeIter *iter) { - char tmp_cnt[7]; + char tmp_cnt[8]; gtk_tree_store_set(treestore, iter, SORTID_BLOCK, block_ptr->bg_block_name, -1); @@ -284,7 +285,8 @@ static void _update_block_record(sview_block_info_t 
*block_ptr, gtk_tree_store_set(treestore, iter, SORTID_USE, _convert_node_use(block_ptr->bg_node_use), -1); - convert_num_unit((float)block_ptr->node_cnt, tmp_cnt, UNIT_NONE); + convert_num_unit((float)block_ptr->node_cnt, tmp_cnt, sizeof(tmp_cnt), + UNIT_NONE); gtk_tree_store_set(treestore, iter, SORTID_NODES, tmp_cnt, -1); gtk_tree_store_set(treestore, iter, SORTID_NODELIST, @@ -350,7 +352,7 @@ static void _update_info_block(List block_list, if(block_ptr->node_cnt == 0) block_ptr->node_cnt = block_ptr->size; if(!block_ptr->slurm_part_name) - block_ptr->slurm_part_name = "no part"; + block_ptr->slurm_part_name = xstrdup("no part"); /* get the iter, or find out the list is empty goto add */ if (!gtk_tree_model_get_iter(model, &iter, path)) { diff --git a/src/sview/job_info.c b/src/sview/job_info.c index e07749e1d453ef76c9aa0b58d3aa5d26535ec762..2bbc43be2e2515d27974940737e6aea9107ea35d 100644 --- a/src/sview/job_info.c +++ b/src/sview/job_info.c @@ -1188,9 +1188,10 @@ static void _layout_job_record(GtkTreeView *treeview, #ifdef HAVE_BG convert_num_unit((float)sview_job_info_ptr->node_cnt, - tmp_char, UNIT_NONE); + tmp_char, sizeof(tmp_char), UNIT_NONE); #else - sprintf(tmp_char, "%u", sview_job_info_ptr->node_cnt); + snprintf(tmp_char, sizeof(tmp_char), "%u", + sview_job_info_ptr->node_cnt); #endif add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_job, @@ -1198,9 +1199,10 @@ static void _layout_job_record(GtkTreeView *treeview, tmp_char); #ifdef HAVE_BG - convert_num_unit((float)job_ptr->num_procs, tmp_char, UNIT_NONE); + convert_num_unit((float)job_ptr->num_procs, tmp_char, sizeof(tmp_char), + UNIT_NONE); #else - sprintf(tmp_char, "%u", job_ptr->num_procs); + snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->num_procs); #endif add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_job, @@ -1592,7 +1594,7 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr, #ifdef HAVE_BG 
convert_num_unit((float)sview_job_info_ptr->node_cnt, - tmp_char, UNIT_NONE); + tmp_char, sizeof(tmp_char), UNIT_NONE); #else sprintf(tmp_char, "%u", sview_job_info_ptr->node_cnt); #endif @@ -1600,9 +1602,10 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr, SORTID_NODES, tmp_char, -1); #ifdef HAVE_BG - convert_num_unit((float)job_ptr->num_procs, tmp_char, UNIT_NONE); + convert_num_unit((float)job_ptr->num_procs, tmp_char, sizeof(tmp_char), + UNIT_NONE); #else - sprintf(tmp_char, "%u", job_ptr->num_procs); + snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->num_procs); #endif gtk_tree_store_set(treestore, iter, SORTID_NUM_PROCS, tmp_char, -1); @@ -1643,57 +1646,43 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr, SORTID_MAX_NODES, tmp_char, -1); } if(job_ptr->cpus_per_task > 0) { - //convert_num_unit((float)job_ptr->cpus_per_task, - // tmp_char, UNIT_NONE); sprintf(tmp_char, "%u", job_ptr->cpus_per_task); gtk_tree_store_set(treestore, iter, SORTID_CPUS_PER_TASK, tmp_char, -1); } - //convert_num_unit((float)job_ptr->job_min_procs, tmp_char, UNIT_NONE); sprintf(tmp_char, "%u", job_ptr->job_min_procs); gtk_tree_store_set(treestore, iter, SORTID_REQ_PROCS, tmp_char, -1); - //convert_num_unit((float)job_ptr->min_sockets, tmp_char, UNIT_NONE); sprintf(tmp_char, "%u", job_ptr->min_sockets); gtk_tree_store_set(treestore, iter, SORTID_MIN_SOCKETS, tmp_char, -1); - //convert_num_unit((float)job_ptr->max_sockets, tmp_char, UNIT_NONE); sprintf(tmp_char, "%u", job_ptr->max_sockets); gtk_tree_store_set(treestore, iter, SORTID_MAX_SOCKETS, tmp_char, -1); - //convert_num_unit((float)job_ptr->min_cores, tmp_char, UNIT_NONE); sprintf(tmp_char, "%u", job_ptr->min_cores); gtk_tree_store_set(treestore, iter, SORTID_MIN_CORES, tmp_char, -1); - //convert_num_unit((float)job_ptr->max_cores, tmp_char, UNIT_NONE); sprintf(tmp_char, "%u", job_ptr->max_cores); gtk_tree_store_set(treestore, iter, SORTID_MAX_CORES, tmp_char, -1); - 
//convert_num_unit((float)job_ptr->min_threads, tmp_char, UNIT_NONE); sprintf(tmp_char, "%u", job_ptr->min_threads); gtk_tree_store_set(treestore, iter, SORTID_MIN_THREADS, tmp_char, -1); - //convert_num_unit((float)job_ptr->max_threads, tmp_char, UNIT_NONE); sprintf(tmp_char, "%u", job_ptr->max_threads); gtk_tree_store_set(treestore, iter, SORTID_MAX_THREADS, tmp_char, -1); - //convert_num_unit((float)job_ptr->job_min_memory, - // tmp_char, UNIT_NONE); sprintf(tmp_char, "%u", job_ptr->job_min_memory); gtk_tree_store_set(treestore, iter, SORTID_MIN_MEM, tmp_char, -1); - //convert_num_unit((float)job_ptr->job_max_memory, - // tmp_char, UNIT_NONE); + sprintf(tmp_char, "%u", job_ptr->job_max_memory); gtk_tree_store_set(treestore, iter, SORTID_MAX_MEM, tmp_char, -1); - //convert_num_unit((float)job_ptr->job_min_tmp_disk, - // tmp_char, UNIT_NONE); sprintf(tmp_char, "%u", job_ptr->job_min_tmp_disk); gtk_tree_store_set(treestore, iter, SORTID_TMP_DISK, tmp_char, -1); @@ -1767,10 +1756,10 @@ static void _layout_step_record(GtkTreeView *treeview, nodes = step_ptr->nodes; #ifdef HAVE_BG convert_num_unit((float)step_ptr->num_tasks, - tmp_char, UNIT_NONE); + tmp_char, sizeof(tmp_char), UNIT_NONE); #else convert_num_unit((float)_nodes_in_list(nodes), - tmp_char, UNIT_NONE); + tmp_char, sizeof(tmp_char), UNIT_NONE); #endif add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_job, @@ -1818,7 +1807,8 @@ static void _layout_step_record(GtkTreeView *treeview, SORTID_NAME), step_ptr->name); - convert_num_unit((float)step_ptr->num_tasks, tmp_char, UNIT_NONE); + convert_num_unit((float)step_ptr->num_tasks, tmp_char, sizeof(tmp_char), + UNIT_NONE); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_job, SORTID_TASKS), @@ -1857,10 +1847,10 @@ static void _update_step_record(job_step_info_t *step_ptr, nodes = step_ptr->nodes; #ifdef HAVE_BG convert_num_unit((float)step_ptr->num_tasks, - tmp_char, UNIT_NONE); + tmp_char, 
sizeof(tmp_char), UNIT_NONE); #else convert_num_unit((float)_nodes_in_list(nodes), - tmp_char, UNIT_NONE); + tmp_char, sizeof(tmp_char), UNIT_NONE); #endif gtk_tree_store_set(treestore, iter, SORTID_NODES, tmp_char, -1); @@ -1894,7 +1884,8 @@ static void _update_step_record(job_step_info_t *step_ptr, gtk_tree_store_set(treestore, iter, SORTID_NAME, step_ptr->name, -1); - convert_num_unit((float)step_ptr->num_tasks, tmp_char, UNIT_NONE); + convert_num_unit((float)step_ptr->num_tasks, tmp_char, sizeof(tmp_char), + UNIT_NONE); gtk_tree_store_set(treestore, iter, SORTID_TASKS, tmp_char, -1); diff --git a/src/sview/node_info.c b/src/sview/node_info.c index 6348522e5736d1c288e826bb06c78b2027c0176b..2330f0f8f71ece64c86c29da46cb8e2517bced10 100644 --- a/src/sview/node_info.c +++ b/src/sview/node_info.c @@ -137,43 +137,50 @@ static void _layout_node_record(GtkTreeView *treeview, lower); xfree(lower); - convert_num_unit((float)node_ptr->cpus, tmp_cnt, UNIT_NONE); + convert_num_unit((float)node_ptr->cpus, tmp_cnt, sizeof(tmp_cnt), + UNIT_NONE); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_node, SORTID_CPUS), tmp_cnt); - convert_num_unit((float)node_ptr->used_cpus, tmp_cnt, UNIT_NONE); + convert_num_unit((float)node_ptr->used_cpus, tmp_cnt, sizeof(tmp_cnt), + UNIT_NONE); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_node, SORTID_USED_CPUS), tmp_cnt); - convert_num_unit((float)node_ptr->cores, tmp_cnt, UNIT_NONE); + convert_num_unit((float)node_ptr->cores, tmp_cnt, sizeof(tmp_cnt), + UNIT_NONE); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_node, SORTID_CORES), tmp_cnt); - convert_num_unit((float)node_ptr->sockets, tmp_cnt, UNIT_NONE); + convert_num_unit((float)node_ptr->sockets, tmp_cnt, sizeof(tmp_cnt), + UNIT_NONE); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_node, SORTID_SOCKETS), tmp_cnt); - convert_num_unit((float)node_ptr->threads, 
tmp_cnt, UNIT_NONE); + convert_num_unit((float)node_ptr->threads, tmp_cnt, sizeof(tmp_cnt), + UNIT_NONE); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_node, SORTID_THREADS), tmp_cnt); - convert_num_unit((float)node_ptr->real_memory, tmp_cnt, UNIT_MEGA); + convert_num_unit((float)node_ptr->real_memory, tmp_cnt, sizeof(tmp_cnt), + UNIT_MEGA); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_node, SORTID_MEMORY), tmp_cnt); - convert_num_unit((float)node_ptr->tmp_disk, tmp_cnt, UNIT_MEGA); + convert_num_unit((float)node_ptr->tmp_disk, tmp_cnt, sizeof(tmp_cnt), + UNIT_MEGA); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_node, SORTID_DISK), @@ -217,9 +224,11 @@ static void _update_node_record(node_info_t *node_ptr, node_ptr->sockets, -1); gtk_tree_store_set(treestore, iter, SORTID_THREADS, node_ptr->threads, -1); - convert_num_unit((float)node_ptr->real_memory, tmp_cnt, UNIT_MEGA); + convert_num_unit((float)node_ptr->real_memory, tmp_cnt, sizeof(tmp_cnt), + UNIT_MEGA); gtk_tree_store_set(treestore, iter, SORTID_MEMORY, tmp_cnt, -1); - convert_num_unit((float)node_ptr->tmp_disk, tmp_cnt, UNIT_MEGA); + convert_num_unit((float)node_ptr->tmp_disk, tmp_cnt, sizeof(tmp_cnt), + UNIT_MEGA); gtk_tree_store_set(treestore, iter, SORTID_DISK, tmp_cnt, -1); gtk_tree_store_set(treestore, iter, SORTID_WEIGHT, node_ptr->weight, -1); diff --git a/src/sview/part_info.c b/src/sview/part_info.c index 369da9dda97146eb16b01ab185846c95fa976e9d..48c274ea1ea0330d8c7a3d84df225f31f8202a14 100644 --- a/src/sview/part_info.c +++ b/src/sview/part_info.c @@ -258,11 +258,13 @@ static int _build_min_max_16_string(char *buffer, int buf_size, uint16_t min, uint16_t max, bool range) { - char tmp_min[7]; - char tmp_max[7]; - convert_num_unit((float)min, tmp_min, UNIT_NONE); - if(max != (uint16_t) INFINITE) - convert_num_unit((float)max, tmp_max, UNIT_NONE); + char tmp_min[8]; + char tmp_max[8]; + 
convert_num_unit((float)min, tmp_min, sizeof(tmp_min), UNIT_NONE); + if(max != (uint16_t) INFINITE) { + convert_num_unit((float)max, tmp_max, sizeof(tmp_max), + UNIT_NONE); + } if (max == min) return snprintf(buffer, buf_size, "%s", tmp_max); @@ -281,10 +283,10 @@ static int _build_min_max_32_string(char *buffer, int buf_size, uint32_t min, uint32_t max, bool range) { - char tmp_min[7]; - char tmp_max[7]; - convert_num_unit((float)min, tmp_min, UNIT_NONE); - convert_num_unit((float)max, tmp_max, UNIT_NONE); + char tmp_min[8]; + char tmp_max[8]; + convert_num_unit((float)min, tmp_min, sizeof(tmp_min), UNIT_NONE); + convert_num_unit((float)max, tmp_max, sizeof(tmp_max), UNIT_NONE); if (max == min) return snprintf(buffer, buf_size, "%s", tmp_max); @@ -721,9 +723,9 @@ static void _layout_part_record(GtkTreeView *treeview, GtkTreeIter iter; ListIterator itr = NULL; char time_buf[20]; - char tmp_cnt[7]; - char tmp_cnt1[7]; - char tmp_cnt2[7]; + char tmp_cnt[8]; + char tmp_cnt1[8]; + char tmp_cnt2[8]; partition_info_t *part_ptr = sview_part_info->part_ptr; sview_part_sub_t *sview_part_sub = NULL; sview_part_sub_t *temp_part_sub = NULL; @@ -797,7 +799,7 @@ static void _layout_part_record(GtkTreeView *treeview, snprintf(time_buf, sizeof(time_buf), "infinite"); else { convert_num_unit((float)part_ptr->min_nodes, - time_buf, UNIT_NONE); + time_buf, sizeof(time_buf), UNIT_NONE); } add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_part, @@ -807,7 +809,7 @@ static void _layout_part_record(GtkTreeView *treeview, snprintf(time_buf, sizeof(time_buf), "infinite"); else { convert_num_unit((float)part_ptr->max_nodes, - time_buf, UNIT_NONE); + time_buf, sizeof(time_buf), UNIT_NONE); } add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_part, @@ -844,7 +846,8 @@ static void _layout_part_record(GtkTreeView *treeview, temp_char); #ifdef HAVE_BG - convert_num_unit((float)part_ptr->total_nodes, tmp_cnt, UNIT_NONE); + 
convert_num_unit((float)part_ptr->total_nodes, tmp_cnt, + sizeof(tmp_cnt), UNIT_NONE); #else sprintf(tmp_cnt, "%u", part_ptr->total_nodes); #endif @@ -852,7 +855,8 @@ static void _layout_part_record(GtkTreeView *treeview, find_col_name(display_data_part, SORTID_NODES), tmp_cnt); - convert_num_unit((float)part_ptr->total_cpus, tmp_cnt, UNIT_NONE); + convert_num_unit((float)part_ptr->total_cpus, tmp_cnt, sizeof(tmp_cnt), + UNIT_NONE); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_part, SORTID_CPUS), @@ -891,9 +895,12 @@ static void _layout_part_record(GtkTreeView *treeview, other_part_sub.reason = temp_part_sub->reason; } } - convert_num_unit((float)alloc_part_sub.node_cnt, tmp_cnt, UNIT_NONE); - convert_num_unit((float)idle_part_sub.node_cnt, tmp_cnt1, UNIT_NONE); - convert_num_unit((float)other_part_sub.node_cnt, tmp_cnt2, UNIT_NONE); + convert_num_unit((float)alloc_part_sub.node_cnt, + tmp_cnt, sizeof(tmp_cnt), UNIT_NONE); + convert_num_unit((float)idle_part_sub.node_cnt, + tmp_cnt1, sizeof(tmp_cnt1), UNIT_NONE); + convert_num_unit((float)other_part_sub.node_cnt, + tmp_cnt2, sizeof(tmp_cnt2), UNIT_NONE); snprintf(tmp, sizeof(tmp), "%s/%s/%s", tmp_cnt, tmp_cnt1, tmp_cnt2); add_display_treestore_line(update, treestore, &iter, @@ -915,7 +922,7 @@ static void _update_part_record(sview_part_info_t *sview_part_info, GtkTreeIter *iter) { char time_buf[20]; - char tmp_cnt[7]; + char tmp_cnt[8]; char *temp_char = NULL; partition_info_t *part_ptr = sview_part_info->part_ptr; GtkTreeIter sub_iter; @@ -959,7 +966,7 @@ static void _update_part_record(sview_part_info_t *sview_part_info, snprintf(time_buf, sizeof(time_buf), "infinite"); else { convert_num_unit((float)part_ptr->min_nodes, - time_buf, UNIT_NONE); + time_buf, sizeof(time_buf), UNIT_NONE); } gtk_tree_store_set(treestore, iter, SORTID_MIN_NODES, time_buf, -1); @@ -967,7 +974,7 @@ static void _update_part_record(sview_part_info_t *sview_part_info, snprintf(time_buf, sizeof(time_buf), 
"infinite"); else { convert_num_unit((float)part_ptr->max_nodes, - time_buf, UNIT_NONE); + time_buf, sizeof(time_buf), UNIT_NONE); } gtk_tree_store_set(treestore, iter, SORTID_MAX_NODES, time_buf, -1); @@ -993,7 +1000,8 @@ static void _update_part_record(sview_part_info_t *sview_part_info, gtk_tree_store_set(treestore, iter, SORTID_GROUPS, temp_char, -1); #ifdef HAVE_BG - convert_num_unit((float)part_ptr->total_nodes, tmp_cnt, UNIT_NONE); + convert_num_unit((float)part_ptr->total_nodes, tmp_cnt, + sizeof(tmp_cnt), UNIT_NONE); #else sprintf(tmp_cnt, "%u", part_ptr->total_nodes); #endif @@ -1031,7 +1039,7 @@ static void _update_part_sub_record(sview_part_sub_t *sview_part_sub, GtkTreeStore *treestore, GtkTreeIter *iter) { char time_buf[20]; - char tmp_cnt[7]; + char tmp_cnt[8]; partition_info_t *part_ptr = sview_part_sub->part_ptr; char *upper = NULL, *lower = NULL; char tmp[MAXHOSTRANGELEN]; @@ -1067,7 +1075,8 @@ static void _update_part_sub_record(sview_part_sub_t *sview_part_sub, sview_part_sub->max_weight, false); gtk_tree_store_set(treestore, iter, SORTID_WEIGHT, time_buf, -1); - convert_num_unit((float)sview_part_sub->node_cnt, tmp_cnt, UNIT_NONE); + convert_num_unit((float)sview_part_sub->node_cnt, tmp_cnt, + sizeof(tmp_cnt), UNIT_NONE); gtk_tree_store_set(treestore, iter, SORTID_NODES, tmp_cnt, -1); hostlist_ranged_string(sview_part_sub->hl, sizeof(tmp), tmp); diff --git a/testsuite/expect/test1.32 b/testsuite/expect/test1.32 index da2d9bd348e365fd0197bb4de544c289ff61e96b..f833cb4ec90bb06e6b684560ba80795ae2e41fb7 100755 --- a/testsuite/expect/test1.32 +++ b/testsuite/expect/test1.32 @@ -88,12 +88,12 @@ expect { incr matches exp_continue } - -re "SIGUSR1" { - set usr1cnt [expr $usr1cnt + 1] - exp_continue - } - -re "SIGUSR2" { - set usr2cnt [expr $usr2cnt + 1] + -re "SIGUSR($number)" { + if {$expect_out(1,string) == 1} { + set usr1cnt [expr $usr1cnt + 1] + } else { + set usr2cnt [expr $usr2cnt + 1] + } exp_continue } -re "error.*not running" { diff --git 
a/testsuite/expect/test10.5 b/testsuite/expect/test10.5 index 349aade2d14f509afd8f6e9a7f4fd154ae23c1e7..d48da0b53189df911b0a2e722386a93d451d413a 100755 --- a/testsuite/expect/test10.5 +++ b/testsuite/expect/test10.5 @@ -35,7 +35,7 @@ source ./globals set test_id "10.5" set exit_code 0 set matches 0 -set non_bg 0 +set non_bg 0 set too_small 0 print_header $test_id @@ -97,7 +97,16 @@ expect { send "q" exp_continue } - + -re "error" { + send_user "\nFAILURE: smap error\n" + set exit_code 1 + exp_continue + } + -re "fatal" { + send_user "\nFAILURE: smap error\n" + set exit_code 1 + exp_continue + } timeout { send_user "\nFAILURE: smap not responding\n" set exit_code 1 diff --git a/testsuite/expect/test10.6 b/testsuite/expect/test10.6 index 48388e84a2179ae21c92e11fb38b6a86bfd3126d..038e9331be98d992465eef326637926f15e54b76 100755 --- a/testsuite/expect/test10.6 +++ b/testsuite/expect/test10.6 @@ -86,7 +86,16 @@ expect { incr matches exp_continue } - + -re "error" { + send_user "\nFAILURE: smap error\n" + set exit_code 1 + exp_continue + } + -re "fatal" { + send_user "\nFAILURE: smap error\n" + set exit_code 1 + exp_continue + } timeout { send_user "\nFAILURE: smap not responding\n" set exit_code 1 diff --git a/testsuite/expect/test17.32 b/testsuite/expect/test17.32 index b26af694904dab4fbdf46cd997d6cde1b9ddd5ae..0a408b0197b6c9fa48cf84e837c68c3d92d76653 100755 --- a/testsuite/expect/test17.32 +++ b/testsuite/expect/test17.32 @@ -51,7 +51,7 @@ make_bash_script $file_in " # # Submit a slurm job that will execute 'id' on 1 node and over task_cnt tasks # -spawn $sbatch --tasks=$task_cnt --overcommit -N1 --output=$file_out -t1 $file_in +spawn $sbatch --ntasks=$task_cnt --overcommit -N1 --output=$file_out -t1 $file_in expect { -re "Submitted batch job ($number)" { set job_id $expect_out(1,string) diff --git a/testsuite/expect/test17.6 b/testsuite/expect/test17.6 index 2d845385d160f5a65b47092639fbdaee2ef6fd80..ef1df747e2c0396d6f277358b8f2983bbcbcd9fb 100755 --- 
a/testsuite/expect/test17.6 +++ b/testsuite/expect/test17.6 @@ -42,13 +42,13 @@ set tasks 0 print_header $test_id # -# Submit a slurm job that will execute 'id' on 4 tasks (or try anyway) +# Submit a slurm job that will execute 'id' on $task_cnt tasks (or try anyway) # file delete $file_in $file_out make_bash_script $file_in "$slaunch $bin_id" set job_id 0 set no_run 0 -spawn $sbatch --tasks=$task_cnt --output=$file_out -t1 $file_in +spawn $sbatch --ntasks=$task_cnt --output=$file_out -t1 $file_in expect { -re "Submitted batch job ($number)" { set job_id $expect_out(1,string) @@ -107,7 +107,7 @@ file delete $file_in $file_out make_bash_script $file_in "$srun -n $task_cnt --overcommit $bin_id" set job_id 0 set tasks 0 -spawn $sbatch --tasks=1 --output=$file_out -t1 $file_in +spawn $sbatch --ntasks=1 --output=$file_out -t1 $file_in expect { -re "Submitted batch job ($number)" { set job_id $expect_out(1,string) diff --git a/testsuite/expect/test7.8 b/testsuite/expect/test7.8 index 61e8c9d9da59ad90934041640eb9b128fc81ac5d..8f5c0a134c203c43b46b3566700a5983c277bdda 100755 --- a/testsuite/expect/test7.8 +++ b/testsuite/expect/test7.8 @@ -94,6 +94,14 @@ if {$sched_port == 0} { exit 1 } +if { [test_bluegene] } { + set is_bluegene 1 + set node_cnt "1-1024" +} else { + set is_bluegene 0 + set node_cnt "1-2" +} + # # Submit two jobs to work with # @@ -106,15 +114,15 @@ make_bash_script $file_in " set job_id1 0 set job_id2 0 -set srun_pid [spawn $srun -N1-6 --output=$file_out --comment=test --batch -t1 $file_in] +set sbatch_pid [spawn $sbatch -N $node_cnt --output=$file_out --comment=test -t1 $file_in] expect { - -re "jobid ($number) submitted" { + -re "Submitted batch job ($number)" { set job_id1 $expect_out(1,string) exp_continue } timeout { - send_user "\nFAILURE: srun not responding\n" - catch {exec $bin_kill -KILL $srun_pid} + send_user "\nFAILURE: sbatch not responding\n" + catch {exec $bin_kill -KILL $sbatch_pid} set exit_code 1 } eof { @@ -126,15 +134,15 @@ if {$job_id1 
== 0} { exit 1 } -set srun_pid [spawn $srun -N1-6 --output=$file_out --comment=test --batch -t1 $file_in] +set sbatch_pid [spawn $sbatch -N $node_cnt --output=$file_out --comment=test -t1 $file_in] expect { - -re "jobid ($number) submitted" { + -re "Submitted batch job ($number)" { set job_id2 $expect_out(1,string) exp_continue } timeout { - send_user "\nFAILURE: srun not responding\n" - catch {exec $bin_kill -KILL $srun_pid} + send_user "\nFAILURE: sbatch not responding\n" + catch {exec $bin_kill -KILL $sbatch_pid} set exit_code 1 } eof { @@ -154,7 +162,7 @@ if {$job_id2 == 0} { exec $bin_rm -f $test_prog exec $bin_make -f /dev/null $test_prog set success 0 -set moab_pid [spawn $test_prog $control_addr $job_id1 $job_id2 $sched_port] +set moab_pid [spawn $test_prog $control_addr $job_id1 $job_id2 $sched_port $is_bluegene] expect { -re "SUCCESS" { set success 1 diff --git a/testsuite/expect/test7.8.prog.c b/testsuite/expect/test7.8.prog.c index 046ab7f65b6e9bd7de12aedbedf4ae9af1af468a..d88ca12450c9d535e0d57d44872910796533feee 100644 --- a/testsuite/expect/test7.8.prog.c +++ b/testsuite/expect/test7.8.prog.c @@ -33,7 +33,7 @@ /* global variables */ char *control_addr; -int sched_port; +int is_bluegene, sched_port; long job_id1, job_id2; static int _conn_wiki_port(char *host, int port) @@ -202,6 +202,33 @@ static void _cancel_job(long my_job_id) _xmit(out_msg); } +static void _modify_job(long my_job_id) +{ + time_t now = time(NULL); + char out_msg[256]; + + snprintf(out_msg, sizeof(out_msg), + "TS=%u AUTH=root DT=CMD=MODIFYJOB ARG=%ld " + /* "PARTITION=pdebug " */ + /* "NODES=2 " */ + /* "DEPEND=afterany:3 " */ + /* "INVALID=123 " */ + "TIMELIMIT=10 BANK=test_bank", + (uint32_t) now, my_job_id); + _xmit(out_msg); +} + +static void _resume_job(long my_job_id) +{ + time_t now = time(NULL); + char out_msg[128]; + + snprintf(out_msg, sizeof(out_msg), + "TS=%u AUTH=root DT=CMD=RESUMEJOB ARG=%ld", + (uint32_t) now, my_job_id); + _xmit(out_msg); +} + static void 
_start_job(long my_job_id) { time_t now = time(NULL); @@ -214,10 +241,21 @@ static void _start_job(long my_job_id) _xmit(out_msg); } +static void _suspend_job(long my_job_id) +{ + time_t now = time(NULL); + char out_msg[128]; + + snprintf(out_msg, sizeof(out_msg), + "TS=%u AUTH=root DT=CMD=SUSPENDJOB ARG=%ld", + (uint32_t) now, my_job_id); + _xmit(out_msg); +} + int main(int argc, char * argv[]) { - if (argc != 5) { - printf("Usage: %s, control_addr job_id1 job_id2 sched_port\n", + if (argc != 6) { + printf("Usage: %s, control_addr job_id1 job_id2 sched_port is_bluegene\n", argv[0]); exit(1); } @@ -226,12 +264,19 @@ int main(int argc, char * argv[]) job_id1 = atoi(argv[2]); job_id2 = atoi(argv[3]); sched_port = atoi(argv[4]); - printf("control_addr=%s job_id=%ld,%ld sched_port=%d\n", - control_addr, job_id1, job_id2, sched_port); + is_bluegene = atoi(argv[5]); + printf("control_addr=%s job_id=%ld,%ld sched_port=%d is_bluegene=%d\n", + control_addr, job_id1, job_id2, sched_port, is_bluegene); _get_jobs(); _get_nodes(); + _modify_job(job_id1); + _get_jobs(); _start_job(job_id1); + if (!is_bluegene) { + _suspend_job(job_id1); + _resume_job(job_id1); + } _cancel_job(job_id2); sleep(5); _get_jobs();