From 325044b206717c184a03cfe4c1d4fa2bae5ae8a9 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Fri, 26 Sep 2008 22:26:02 +0000 Subject: [PATCH] Initial import of current spank plugins project to googlecode. --- COPYING | 340 ++++ ChangeLog | 516 +++++ DISCLAIMER | 24 + META | 4 + Makefile | 44 + NEWS | 73 + README | 156 ++ README.use-env | 343 ++++ addr-no-randomize.c | 114 ++ auto-affinity.c | 552 +++++ chaos-spankings.spec | 174 ++ cpuset/Makefile | 37 + cpuset/README | 71 + cpuset/conf-parser.l | 78 + cpuset/conf-parser.y | 199 ++ cpuset/conf.c | 299 +++ cpuset/conf.h | 101 + cpuset/cpuset.c | 493 +++++ cpuset/cpuset.init | 47 + cpuset/create.c | 411 ++++ cpuset/create.h | 51 + cpuset/log.c | 232 +++ cpuset/log.h | 56 + cpuset/nodemap.c | 616 ++++++ cpuset/nodemap.h | 51 + cpuset/pam_slurm_cpuset.8 | 81 + cpuset/pam_slurm_cpuset.c | 295 +++ cpuset/release-agent.c | 86 + cpuset/slurm-cpuset.8 | 378 ++++ cpuset/slurm.c | 114 ++ cpuset/slurm.h | 36 + cpuset/test.c | 90 + cpuset/use-cpusets.1 | 114 ++ cpuset/util.c | 464 +++++ cpuset/util.h | 64 + cpuset/version.map | 9 + iorelay/Makefile | 13 + iorelay/iorelay-bind-nfs.sh | 84 + iorelay/iorelay-mount-nodezero.sh | 81 + iorelay/iorelay-mrsh-sshfs-wrap.sh | 50 + iorelay/iorelay.c | 142 ++ iotrace.c | 126 ++ lib/fd.c | 273 +++ lib/fd.h | 129 ++ lib/hostlist.c | 2715 +++++++++++++++++++++++++ lib/hostlist.h | 419 ++++ lib/list.c | 835 ++++++++ lib/list.h | 281 +++ lib/split.c | 149 ++ lib/split.h | 35 + oom-detect.c | 315 +++ overcommit-memory/Makefile | 17 + overcommit-memory/overcommit-memory.c | 220 ++ overcommit-memory/overcommit.c | 383 ++++ overcommit-memory/overcommit.h | 47 + overcommit-memory/util.c | 201 ++ preserve-env.c | 244 +++ pty.c | 565 +++++ renice.c | 190 ++ system-safe-preload.c | 343 ++++ system-safe.c | 123 ++ tmpdir.c | 111 + use-env/Makefile | 27 + use-env/log_msg.c | 241 +++ use-env/log_msg.h | 41 + use-env/main.c | 92 + use-env/test.conf | 79 + use-env/test.conf.include | 3 + 
use-env/use-env-parser.l | 906 +++++++++ use-env/use-env-parser.y | 676 ++++++ use-env/use-env.c | 460 +++++ use-env/use-env.h | 123 ++ use-env/version.map | 9 + 73 files changed, 17561 insertions(+) create mode 100644 COPYING create mode 100644 ChangeLog create mode 100644 DISCLAIMER create mode 100644 META create mode 100644 Makefile create mode 100644 NEWS create mode 100644 README create mode 100644 README.use-env create mode 100644 addr-no-randomize.c create mode 100644 auto-affinity.c create mode 100644 chaos-spankings.spec create mode 100644 cpuset/Makefile create mode 100644 cpuset/README create mode 100644 cpuset/conf-parser.l create mode 100644 cpuset/conf-parser.y create mode 100644 cpuset/conf.c create mode 100644 cpuset/conf.h create mode 100644 cpuset/cpuset.c create mode 100644 cpuset/cpuset.init create mode 100644 cpuset/create.c create mode 100644 cpuset/create.h create mode 100644 cpuset/log.c create mode 100644 cpuset/log.h create mode 100644 cpuset/nodemap.c create mode 100644 cpuset/nodemap.h create mode 100644 cpuset/pam_slurm_cpuset.8 create mode 100644 cpuset/pam_slurm_cpuset.c create mode 100644 cpuset/release-agent.c create mode 100644 cpuset/slurm-cpuset.8 create mode 100644 cpuset/slurm.c create mode 100644 cpuset/slurm.h create mode 100644 cpuset/test.c create mode 100644 cpuset/use-cpusets.1 create mode 100644 cpuset/util.c create mode 100644 cpuset/util.h create mode 100644 cpuset/version.map create mode 100644 iorelay/Makefile create mode 100755 iorelay/iorelay-bind-nfs.sh create mode 100755 iorelay/iorelay-mount-nodezero.sh create mode 100755 iorelay/iorelay-mrsh-sshfs-wrap.sh create mode 100644 iorelay/iorelay.c create mode 100644 iotrace.c create mode 100644 lib/fd.c create mode 100644 lib/fd.h create mode 100644 lib/hostlist.c create mode 100644 lib/hostlist.h create mode 100644 lib/list.c create mode 100644 lib/list.h create mode 100644 lib/split.c create mode 100644 lib/split.h create mode 100644 oom-detect.c create mode 100644 
overcommit-memory/Makefile create mode 100644 overcommit-memory/overcommit-memory.c create mode 100644 overcommit-memory/overcommit.c create mode 100644 overcommit-memory/overcommit.h create mode 100644 overcommit-memory/util.c create mode 100644 preserve-env.c create mode 100644 pty.c create mode 100644 renice.c create mode 100644 system-safe-preload.c create mode 100644 system-safe.c create mode 100644 tmpdir.c create mode 100644 use-env/Makefile create mode 100644 use-env/log_msg.c create mode 100644 use-env/log_msg.h create mode 100644 use-env/main.c create mode 100644 use-env/test.conf create mode 100644 use-env/test.conf.include create mode 100644 use-env/use-env-parser.l create mode 100644 use-env/use-env-parser.y create mode 100644 use-env/use-env.c create mode 100644 use-env/use-env.h create mode 100644 use-env/version.map diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..3912109 --- /dev/null +++ b/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License.
diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..c2a5f36 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,516 @@ +2008-09-25 Mark Grondona + + * : tag v0.34. + + * preserve-env.c : + Added. + + * auto-affinity.c : + Check current CPU mask in task context as well as in + post opt and abort auto-set of affinity if CPU masks + have changed. This probably means something else is + controlling CPU affinity (or cpusets). + +2008-09-11 Mark Grondona + + * : tag v0.33. + + * cpuset/log.c : + Fix off-by-one use of log level. + + * cpuset/conf-parser.y : + Change message about opening config file from verbose to debug. + + * cpuset/cpuset.c, cpuset/create.c, cpuset/util.c : + Change locking methodology to use global lockfile in /var/lock + instead of using lockfile under /dev/cpuset/slurm. Advisory + locks are dropped if any fd open against a locked file is closed, + not just the original fd. Since libcpuset opens all files under + the /dev/cpuset hierarchy, no file within that fs is usable + as a lockfile for slurm cpuset. + +2008-09-10 Mark Grondona + + * use-env/use-env.c : + Fix generation of SLURM_CMDLINE. (Was accidentally generated + in reverse order). + +2008-08-21 Mark Grondona + + * : tag v0.32. + + * oom-detect.c : + Add 'do_syslog' parameter, which, when true, + sends a message via syslog(3) when any task of + a job step is thought to have been terminated by + the OOM killer. + +2008-08-19 Mark Grondona + + * : tag v0.31. + + * oom-detect.c : + Delay slightly if an OOM killed task is detected. + This should give srun more time to recv the error + message. + +2008-08-04 Mark Grondona + + * : tag v0.30. + + * cpuset/conf-parser.y, cpuset/cpuset.c : + Improve config file parse errors. + + * auto-affinity.c : + Update --auto-affinity=help. + +2008-07-29 Mark Grondona + + * : tag v0.29. + + * auto-affinity.c : + Do not set CPU affinity by default if the number of CPUs + is not a multiple of the number of tasks.
+ + * lib/hostlist.c, lib/hostlist.h, Makefile: + Include hostlist code, used by cpuset PAM module. + + * cpuset/* : Overhaul cpuset support, including new config file + parser, PAM module, and man pages. + +2008-07-22 Mark Grondona + + * : tag v0.28. + + * auto-affinity.c : + Only run spank_init_post_opt() hook on remote side (slurmd). + +2008-07-16 Mark Grondona + + * : tag v0.27. + + * cpuset/README : + Document `tasks' option to --use-cpusets. + + * cpuset/cpuset.c, cpuset/util.c : + Add --use-cpusets=tasks support to constrain tasks to + their own cpusets under the job step cpuset. + +2008-07-16 Mark Grondona + + * : tag v0.26 + + * cpuset/README : + Add documentation for --use-cpusets option. + + * cpuset/cpuset.c, cpuset/util.h, cpuset/util.c, + cpuset/nodemap.c, cpuset/Makefile : + Add spank user option --use-cpusets to optionally allow + per-job-step cpusets, which are created under the overall + job cpuset. + + * auto-affinity.so : + Move check for cpuset to after user options have been + processed, in case cpuset was changed. Open cpuset related + proc files with O_RDONLY instead of O_RDWR. + +2008-07-10 Mark Grondona + + * : tag v0.25. + + * chaos-spankings.spec : + Add cpuset subpackage for SLURM cpuset plugin, + /etc/init.d/slurm-cpuset init script, and + /sbin/cpuset_release_agent binary. + + * cpuset/cpuset.init : + Add initscript to mount /dev/cpuset. + + * cpuset/nodemap.c : + Allocate CPUs from nodes in reverse for best-fit and + worst-fit, but in order for first-fit. + + * cpuset/cpuset.c : + Be sure to call slurm_cpuset_create() early in plugin, + before slurm_cpuset_lock(). + + * auto-affinity.c : + Fix bug in auto-affinity plugin when cpuset filesystem + is not mounted. + + * cpuset/README : + Added. + +2008-07-09 Mark Grondona + + * cpuset.c, util.c, util.h : + Add !mem or !mem-constrain option to disable constraint + of memory nodes. Change "idle-first" options to + !idle-1st, idle-1st=gt, idle-1st=mult, idle-1st=no.
+ + * Makefile : + Add cpuset to subdirs. + + * cpuset/Makefile, cpuset/util.h, cpuset/util.c, + cpuset/nodemap.h, cpuset/nodemap.c, cpuset/cpuset.c, + cpuset/test.c, cpuset/release-agent.c : + Add initial version of SLURM cpuset.so module. + + * Makefile : + Add dependence on lib/fd.o to auto-affinity.so. + + * auto-affinity.c : + Allow auto-affinity to work when running inside a cpuset. + Map CPUs as chosen for CPU affinity back to actual CPUs + available to tasks inside their cpuset. (Plugin should + work the same as before, except that the number of available + CPUs is adjusted to the number of CPUs in the cpuset). + +2008-06-10 Mark Grondona + + * : tag v0.24. + + * Makefile : + auto-affinity.so now needs to link against libslurm. + + * auto-affinity.c : + If SLURM_JOB_CPUS_PER_NODE is not set, fall back to querying + slurm controller for necessary information. This is only + used in exclusive_only mode, and is a temporary solution + until the env var above is set for all SLURM jobs. + +2008-06-10 Mark Grondona + + * : tag v0.23. + + * auto-affinity.c : + Change `exclusive' option to `exclusive_only'. + +2008-06-09 Mark Grondona + + * auto-affinity.c : + Add `exclusive' option to auto-affinity plugin, which, when + used, will disable auto-affinity when the running job does + not have exclusive access to the node. + +2008-05-15 Mark Grondona + + * addr-no-randomize.c : + Added plugin to set ADDR_NO_RANDOMIZE personality on + processes, thus disabling address space randomization. + +2007-08-13 Jim Garlick + + * iorelay/* : New. + +2007-07-27 Mark Grondona + + * pty.c : + Add ability to process window size changes. + + * : tag v0.20. + + * pty.c : Instead of closing stdin/out/err, dup onto /dev/null. + Allow SLURM_PTY_NO_CLOSE_STDIO env variable to disable close + of stdio in tasks != task0. + + * overcommit-memory/overcommit.c (unregister_job) : + Fix bug that caused improper cleanup when running against all steps + for a given jobid, i.e. stepid = -1.
+ + * overcommit-memory/overcommit.c, overcommit-memory/util.c : + Properly report failed job cleanup. + + * : tag v0.21. + +2007-07-27 Mark Grondona + + * pty.c : + Added. New --pty option to srun(1) runs task 0 under a pseudo-tty. + + * Makefile, chaos-spankings.spec : + Build and package pty.so. + +2007-07-03 Mark Grondona + + * chaos-spankings.spec : + Include proper BuildRequires. + + * : tag v0.19. + +2007-02-12 Mark Grondona + + * auto-affinity.c : Don't ignore 1 task/node if CPUs/task is set. + + * overcommit-memory/overcommit-memory.c, + overcommit-memory/overcommit.c, overcommit-memory/overcommit.h : + Also adjust overcommit_ratio when overcommit-memory plugin is + used (mainly for the "no overcommit" case). Reset original + value when the last user exits. + + * : tag v0.18. + +2007-02-02 Mark Grondona + + * : tag v0.17. + +2007-02-02 Mark Grondona + + * overcommit-memory.c, overcommit-memory/overcommit-memory.c + overcommit-memory/overcommit.h, overcommit-memory/overcommit.c, + overcommit-memory/util.c, lib/fd.c, lib/fd.h : + Move overcommit-memory source into its own dir. + Plugin now uses a shared memory file to track current users + and restores default overcommit policy when the last user + exits. Supply a utility, overcommit-util, to clean up state + of shared memory file, query current users, etc. + + * chaos-spankings.spec : + Updates for changes in overcommit-memory plugin. + +2007-02-02 Mark Grondona + + * auto-affinity.c : Force enable auto-affinity if any user option is + passed to --auto-affinity (except "off" of course). + + * auto-affinity.c : Rename "last_cpu_first" to "reverse." + Add start=N option to begin CPU affinity at CPU [N] instead + of CPU 0. Add shorthands "v" for verbose, "rev" for reverse. + +2007-01-24 Mark Grondona + + * auto-affinity.c : + Added. Set up some sane CPU affinity defaults. + + * use-env/use-env-parser.l : + Expand `~' to $HOME in POSTOP and STRING conditions. + + * : tag v0.16.
+ +2007-01-19 Mark Grondona + + * use-env/use-env-parser.y : + Be sure to not evaluate "matches" keyword when the condition + state is not true. + + * use-env/use-env.c : + Do not report errors from spank_setenv() when overwrite == 0. + + * overcommit-memory.c : + Added. Allow users to change overcommit behavior on nodes + of their job. + + * : tag v0.15. + +2007-01-10 Mark Grondona + + * io-watchdog/* : + Remove io-watchdog code. It is now its own project. + + * use-env/use-env-parser.l, use-env/use-env-parser.y, + README.use-env: + Change fnmatch() function to ``STRING matches PATTERN'' + + * : tag v0.13. + + * use-env/use-env-parser.y : + Allow empty input file. + + * : tag v0.14. + +2006-12-29 Mark Grondona + + * io-watchdog/io-watchdog-interposer.c : + Glob for proper libc using pattern /lib{64,}/libc.so* instead + of explicitly specifying libc filenames. + + * io-watchdog/io-watchdog-interposer.c : + Intercept calls to glibc IO functions _IO_putc and IO_puts. + + * io-watchdog/io-watchdog-interposer.c : + Set ctx.progname even if IO_WATCHDOG_TARGET not set. + +2006-12-28 Mark Grondona + + * io-watchdog/io-watchdog-interposer.c : + Also check for libc.so.6.1 if libc.so.6 is not found. + + * use-env/use-env-parser.y, use-env/use-env-parser.l, + use-env/use-env.c : + Add fnmatch() "function" to use-env config file. + Additional comments in use-env.c. + +2006-12-27 Mark Grondona + + * use-env/use-env.c : + Replace slurm_spank_local_user_init() which was inadvertently + removed earlier. + + * use-env/use-env.c : + Set SLURM_CMDLINE and SLURM_ARGV*/SLURM_ARGC keywords for + use in use-env config files. + + * README.use-env : Update documentation. + +2006-12-26 Mark Grondona + + * use-env/use-env.c, use-env/use-env.h , + use-env/use-env-parser.y, use-env/use-env-parser.l, + use-env/main.c, use-env/test.conf : + Add support for "in task" blocks in use-env config files + that are only parsed from spank_task_init.
Provide wrappers + for {get,set,unset}env to access job environment in remote + context. + +2006-12-21 Mark Grondona + + * io-watchdog/io-watchdog-interposer.c : + Undefine fwrite_unlocked if it is a #define. Fix for compile + problem. + + * : tag v0.12. + +2006-12-20 Mark Grondona + + * lib/split.c, lib/split.h, lib/list.c, lib/list.h : + Move src files that may be used by multiple plugins into + a lib dir. + + * use-env/split.c, use-env/split.h, + use-env/list.c, use-env/list.h : + Removed. + + * use-env/Makefile : + Use sources from ../lib/. + + * Makefile, chaos-spankings.spec : + Better use of subdirectories. + + * chaos-spankings.spec : + Package tmpdir.so. + + * Makefile, io-watchdog/* : + Initial support for io-watchdog plugin. + +2006-12-13 Mark Grondona + + * tmpdir.c, Makefile : Add toy module that creates and + destroys job-step specific TMPDIR. + +2006-11-30 Mark Grondona + + * use-env/use-env.c : Only run cleanup in local context, + i.e. when !spank_remote(). + + * : tag v0.11. + +2006-11-28 Mark Grondona + + * use-env : + Moved use-env plugin into its own directory. + Complete redesign of use-env parser implemented with lex & yacc. + - Support for double-quoted strings. + - Added support for conditional if/else if/else/endif blocks. + - Added support for expansion of symbols with $ID and ${ID} + constructs. Symbols are use-env keywords, locally defined + symbols, or environment variables (searched in that order). + - Added support for keywords SLURM_NNODES, SLURM_NPROCS, SLURM_JOBID, + SLURM_STEPID for testing attributes of the current job. + - New "set" command for setting parser options (currently only + debuglevel is supported) + - New "dump" command to dump either the current list of "symbols" + "keywords" or both ("all"). + - New "print" command for printing arbitrary strings to stdout. + - New "define" command for defining symbols not exported to + the environment and "undefine" for deleting local symbols. 
+ - See README.use-env for more information. + + * : tag v0.10. + +2006-11-15 Mark Grondona + + * use-env.c, env-override.c : + - Prefer files in /etc/slurm/environment/name instead + of /etc/slurm/env-name.conf (same for ~/.slurm/) + - Always read both system and user "default" file. Apply + user defaults after system defaults so user can override + system settings. + - User default file is always called "default" + - Allow a list of names to be specified to --use-env, e.g. + --use-env=mvapich,test. The settings are applied in order + i.e. test after mvapich. + + * use-env.c : + - Check for slurm_spank_local_user_init support from SPANK, + and if it exists, read environment overrides in that + callback instead of in spank_init and option processing hooks. + + * list.c, list.h, split.c, split.h : Added. + + * README.use-env : Updated documentation for --use-env. + + * : tag v0.9. + +2006-11-11 Mark Grondona + + * use-env.c, env-override.c : + - Fix environment variable value overwrite. + - Allow config files to be included from other files with + the "include" directive. + - Add "unset" directive for unsetting + - Improve environment override file parsing a bit. + + * README.use-env, chaos-spankings.spec : + Add and install README for the use-env plugin. + + * : tag v0.8. + +2006-11-09 Mark Grondona + + * Makefile, use-env.c, env-override.c, env-override.h, + strlcpy.c, strlcpy.h, list.c, list.h, chaos-spankings.spec : + Add --use-env capability for overriding environment variables + in srun before sending environment to the remote job. + + * : tag v0.7. + +2006-10-18 Jim Garlick + + * iotrace.c : Added ability to pass flags to plasticfs log module. + Remove extraneous code. + + * : tag v0.6. + +2006-10-17 Jim Garlick + + * iotrace.c : New --iotrace capability using plasticfs via LD_PRELOAD. + Derived from Mark's system-safe.c. + + * chaos-spankings.spec, Makefile : Add iotrace. + + * : tag v0.5. 
+ +2006-10-09 Mark Grondona + + * system-safe.c, system-safe-preload.c : Add srun option to place + system-safe-preload.so in job's LD_PRELOAD, which replaces + system(3) with a version that calls fork(2) before application's + main(), thus allowing MPI applications to use system(3) on MPI + implementations that might not be fork()-safe. + + * oom-detect.c : Fix "(null)" at end of error message. + + * : tag v0.3. + + * system-safe-preload.c : Fix some stray fprintf's. + + * : tag v0.4. + +2006-07-25 Mark Grondona + + * renice.c : Fix format for verbose message. + + * oom-detect.c : Fix formatting of error message. + +2006-07-21 Mark Grondona + * : Initial version. diff --git a/DISCLAIMER b/DISCLAIMER new file mode 100644 index 0000000..1bb04be --- /dev/null +++ b/DISCLAIMER @@ -0,0 +1,24 @@ +This work was produced at the Lawrence Livermore National Laboratory +(LLNL) under Contract No. DE-AC52-07NA27344 (Contract 44) between +the U.S. Department of Energy (DOE) and Lawrence Livermore National +Security, LLC (LLNS) for the operation of LLNL. + +This work was prepared as an account of work sponsored by an agency of +the United States Government. Neither the United States Government nor +Lawrence Livermore National Security, LLC nor any of their employees, +makes any warranty, express or implied, or assumes any liability or +responsibility for the accuracy, completeness, or usefulness of any +information, apparatus, product, or process disclosed, or represents +that its use would not infringe privately-owned rights. + +Reference herein to any specific commercial products, process, or +services by trade name, trademark, manufacturer or otherwise does +not necessarily constitute or imply its endorsement, recommendation, +or favoring by the United States Government or Lawrence Livermore +National Security, LLC. 
The views and opinions of authors expressed +herein do not necessarily state or reflect those of the United States +Government or Lawrence Livermore National Security, LLC, and shall +not be used for advertising or product endorsement purposes. + +The precise terms and conditions for copying, distribution, and +modification are specified in the file "COPYING". diff --git a/META b/META new file mode 100644 index 0000000..f1b68de --- /dev/null +++ b/META @@ -0,0 +1,4 @@ + Name: chaos-spankings + Version: 0.34 + Release: 1 + Author: Mark Grondona diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..bf2cb17 --- /dev/null +++ b/Makefile @@ -0,0 +1,44 @@ + +CFLAGS = -Wall -ggdb + +all: renice.so \ + oom-detect.so \ + system-safe-preload.so system-safe.so \ + iotrace.so \ + tmpdir.so \ + auto-affinity.so \ + pty.so \ + addr-no-randomize.so \ + preserve-env.so \ + subdirs + +SUBDIRS = use-env overcommit-memory cpuset + +.SUFFIXES: .c .o .so + +.c.o: + $(CC) $(CFLAGS) -o $@ -fPIC -c $< +.o.so: + $(CC) -shared -o $*.so $< $(LIBS) + +subdirs: + @for d in $(SUBDIRS); do make -C $$d; done + +system-safe-preload.so : system-safe-preload.o + $(CC) -shared -o $*.so $< -ldl + +auto-affinity.so : auto-affinity.o lib/split.o lib/list.o lib/fd.o + $(CC) -shared -o $*.so auto-affinity.o lib/split.o lib/list.o -lslurm + +preserve-env.so : preserve-env.o lib/list.o + $(CC) -shared -o $*.so preserve-env.o lib/list.o + +pty.so : pty.o + $(CC) -shared -o $*.so $< -lutil + +clean: subdirs-clean + rm -f *.so *.o lib/*.o + +subdirs-clean: + @for d in $(SUBDIRS); do make -C $$d clean; done + diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..ff29cea --- /dev/null +++ b/NEWS @@ -0,0 +1,73 @@ +Version 0.34 (2008-09-25): +- auto-affinity: Fix for using auto-affinity module with jobs using + --use-cpusets=task. The auto-affinity module now checks to make sure + CPU mask has not changed in task context, and if so, silently + does nothing.
+- preserve-env: New plugin which, when enabled with --preserve-slurm-env + option, will attempt to keep the remote SLURM_* environment variables + the same as in the current context. Useful for invoking + "srun -n1 --pty bash" from within an allocation shell. + +Version 0.33 (2008-09-11): +- Fix for critical locking bug in cpuset plugin. The cpuset plugin + now uses a global lockfile in /var/lock instead of locking files + under /dev/cpuset. +- Fix for generation of SLURM_CMDLINE in use-env plugin. + +Version 0.32 (2008-08-21): +- oom-detect: Optionally log OOM killed jobs via syslog(3), if + the do_syslog parameter is used in plugstack.conf. The syslog + message has the form "slurmd: OOM detected: jobid=JOBID uid=UID" + +Version 0.31 (2008-08-19): +- oom-detect: Delay slightly if an OOM killed process is detected + to give the error message time to make it to srun stderr. + +Version 0.30 (2008-08-04): +- cpuset: Slightly improve config file error messages. +- cpuset: Minor fixes for man pages. +- auto-affinity: Update --auto-affinity=help message. + +Version 0.29 (2008-07-29): +- cpuset: Major overhaul of SLURM cpuset support. Now includes a PAM + module, pam_slurm_cpuset.so, and a global config file in + /etc/slurm/slurm-cpuset.conf. For more information, see the + new manual pages included with the distribution. +- auto-affinity: Do not set CPU affinity by default if the number + of available CPUs is not evenly divisible by the number of tasks. + +Version 0.28 (2008-07-22): +- auto-affinity: Fix error where spank_post_opt hook was incorrectly + run in srun, which caused an immediate error and abort. + +Version 0.27 (2008-07-16): +- cpuset: Expand cpuset support to per-task cpusets via --use-cpusets=tasks. + +Version 0.26 (2008-07-16): +- cpuset: Add support for per-job-step cpusets via the new srun option + '--use-cpusets'. See the README or --use-cpusets=help for more information. 
+- auto-affinity: Delay detection of current cpuset until after user + option processing in the event that user option changed our cpuset. + +Version 0.25 (2008-07-10): +- cpuset: Added cpuset plugin to constrain jobs to number of CPUs + allocated on shared, but not oversubscribed nodes. +- auto-affinity: Make auto-affinity plugin cpuset-aware. CPU affinity + is assigned as if the job were running on a node the size of the + current cpuset. If cpusets are not enabled, the auto-affinity behavior + is unchanged. + +Version 0.24 (2008-06-10): + - auto-affinity: Query SLURM controller for number of CPUs allocated + to the current job in exclusive_only mode if the environment variable + SLURM_JOB_CPUS_PER_NODE is not set. + +Version 0.23 (2008-06-10): + - auto-affinity: Add 'exclusive_only' flag to auto-affinity plugin + to constrain plugin activity to only those jobs that have exclusive + use of the current node. + +(2008-06-10): + - Started NEWS file. + +$Id: NEWS 7811 2008-09-25 22:21:11Z grondo $ diff --git a/README b/README new file mode 100644 index 0000000..b330b27 --- /dev/null +++ b/README @@ -0,0 +1,156 @@ +SLURM spank plugins README +================================== + +This package includes several SLURM spank plugins developed +at LLNL and used on production compute clusters onsite. A few +of these plugins are only valid when used on LLNL's software +stack (oom-detect.so, for example, requires LLNL-specific patches +to track jobs terminated by the OOM killer). However, the +source for all plugins is provided here in the hope that they +might be useful to other plugin developers. The following +is a short description of most of the plugins in this package. + +addr-no-randomize +----------------- + +The addr-no-randomize plugin allows sysadmins to set a default +policy for address space randomization (when supported and +enabled in the Linux kernel), and provides an option for users +to enable/disable randomization on a per-job basis.
+ +auto-affinity +----------------- + +Automatically assign CPU affinity using best-guess defaults. + +The default behavior of this plugin attempts to accommodate +multi-threaded apps by assigning more than one CPU per task +if the number of tasks running on the node is evenly divisible +into the number of CPUs. Otherwise, CPU affinity is not enabled +unless the cpus_per_task (cpt) option is specified. The default +behavior may be modified using the --auto-affinity options +listed below. Also, the srun(1) --cpu_bind option is processed +after auto-affinity, and thus may be used to override any CPU +affinity settings from this module. + +This plugin should not be used alone on systems using node +sharing. In that case, it should be used along with +the cpuset plugin below (and auto-affinity.so should be listed +*after* cpuset.so in the plugstack.conf). + +cpuset +----------------- + +The cpuset plugin uses Linux cpusets to constrain jobs to the +number of CPUs they have been allocated on nodes. The plugin +is specifically designed for systems sharing nodes and using CPU +scheduling (i.e. using the select/cons_res plugin). The plugin +will not work on systems where CPUs are oversubscribed to jobs +(i.e. strict node sharing without the use of select/cons_res). + +The plugin also has a pam_slurm_cpuset counterpart, which +replaces pam_slurm and serves an identical functionality, +except that user login sessions are constrained to their +currently allocated CPUs on a node. + +The cpuset plugin requires the SGI libbitmask and libcpuset +libraries available from + + http://oss.sgi.com/projects/cpusets + +(See also cpuset/README) + +iorelay +----------------- + +The iorelay plugin is an experimental proof-of-concept plugin +for remounting required filesystems for a parallel job from +the first allocated node to all others. It is meant to reduce +the load on global NFS servers. + +It has not been used in production.
+ + +iotrace +----------------- + +The iotrace plugin is another experimental plugin which +uses "plasticfs" to log filesystem access on a per-job +basis. + + +oom-detect +----------------- + +The oom-detect plugin detects jobs that have been victims +of the OOM killer using some special code added to the LLNL +Linux kernel. As tasks exit after having been killed by +the OOM killer, a message is printed to the user's stderr +along with some memory information about the task. + +overcommit-memory +----------------- + +The overcommit-memory plugin is an attempt to allow users +to tune global overcommit behavior of the Linux kernel on +a per-job basis. It is currently buggy and thus not used. + +preserve-env +----------------- + +The preserve-env plugin adds an srun option + + --preserve-slurm-env + +which attempts to preserve the current state of all SLURM_* +environment variables in the remotely executed environment. This +is meant solely to be used from an allocation shell with +the syntax + + srun -n1 -N1 --pty --preserve-slurm-env $SHELL + +as a sort of "remote" allocation shell. + +pty +----------------- + +The pty plugin provides the SLURM --pty option, introduced +in slurm-1.3, for slurm-1.2. It isn't fully functional at this +point, but is a good example of a complex feature added solely +from a spank plugin. + + +renice +----------------- + +The renice plugin is the same as the example code in the +spank(8) man page. It provides a new srun option "--renice=VALUE" +which allows users to set the nice value of their remote +tasks (down to a minimum value configured by sysadmin). + +system-safe +------------------ + +The system-safe plugin provides an MPI-safe system(3) +replacement through an LD_PRELOAD library (most of the work +is done in system-safe-preload.c). The preloaded library +interposes a version of system(3) that does not fork. Instead, +the command line is passed through a pipe to a copy of the +program which was pre-forked before MPI_Init(). 
The return +value of the real system() call is passed back through the +pipe and returned to the calling application, for which there +is no noticeable difference with the real system(3). + +use-env +------------------ + +The use-env plugin allows system administrators and users to +modify the environment of SLURM jobs using a set of simple +yet very flexible config files. Environment variables can +be overridden, set only if unset, set based on conditional +syntax, and even defined in a per-task context. The config +files have access to key slurm variables such as SLURM_NNODES, +SLURM_NPROCS, etc., so variables can even be defined differently +depending on the size of the job. + +See README.use-env for further information. diff --git a/README.use-env b/README.use-env new file mode 100644 index 0000000..ab5fdde --- /dev/null +++ b/README.use-env @@ -0,0 +1,343 @@ +The use-env.so plugin for SLURM +============================================================================ + +SYNOPSIS + +The "use-env" spank(8) plugin for SLURM provides a simple facility for +utilizing SLURM to initialize and/or modify the current environment for +users launching jobs through srun(1). When the plugin is enabled in the +spank plugin stack (plugstack.conf by default), it reads environment +overrides from a default config file at srun initialization, and +also allows user-selected environment overrides via the srun option +"--use-env=name." When using --use-env=name, the config file +loaded is from ~/.slurm/environment/name or /etc/slurm/environment/name. +(~/.slurm/env-name.conf or /etc/slurm/env-name.conf is also +supported for backwards compatibility, but these locations are +deprecated and the file in environment/ is preferred) +The format of the config file is described below. + +This plugin also supports generation of a different environment per +task through use of "in task" blocks, which are parsed by slurmd +in task context just before calling exec().
See TASK BLOCKS below +for more information. + +DEFAULT CONFIG + +The default config file is read from /etc/slurm/environment/default +and is always used if it exists. A user default is also read +from ~/.slurm/environment/default. Settings in the user file are applied +after the global defaults in /etc/slurm so that user settings can +override system defaults. The default environment settings are +applied before any user-selected environment via the --use-env +option. + +The name of the global default config can be overridden by use of the +"default=" option to the plugin, e.g., with the following line in +plugstack.conf: + + required use-env.so default=mvapich + +would read /etc/slurm/environment/mvapich by default instead of +/etc/slurm/environment/default. The user default file is always +named "default" however. + + +CONFIG FILE FORMAT + +Lines in the use-env config file(s) can have the following format. +A '#' anywhere on the line indicates a comment. Statements +are separated by newlines or semicolons ";". + +Config files can be included from other files with the "include" +statement + + include name + +will include file "name" from the same directory as the file +in which the "include" was invoked. An absolute pathname +may also be specified, e.g.: + + include /etc/slurm/environment/foo + +TASK BLOCKS + +Configuration that should only be applied to remotely executed +tasks may be specified in special "in task" blocks, which +have the form + + in task { statements... } + +This block, if present, will be read by each task in the job +just before exec() is called. This allows the environment +to be tailored for a specific task, for example: + + in task { + if ($SLURM_PROCID == 0) + LD_PRELOAD = "$LD_PRELOAD libdebug.so" + endif + } + +would append libdebug.so to LD_PRELOAD only for task 0 +in the job. The rest of the config file is ignored +by the task. Likewise, these task blocks are ignored +when the config file is parsed by srun (except for +syntax checking).
+ + +ASSIGNMENT EXPRESSIONS + +The simplest form of expression in the config file is to assign +a new value to an environment variable + + identifier = value + +Where identifier is a valid environment variable of the form +[A-Za-z_][0-9A-Za-z_]* and value can be any arbitrary quoted string +or string literal. Environment variables (and other locally defined +symbols or keywords) can be expanded by the familiar form of +$ID or ${ID}. Variable expansion will occur in both unquoted and +quoted strings. Whitespace outside of quoted strings is ignored. + +Examples of assignment are + + MYENV = foo # Valid: MYENV="foo" + MYENV2 = $MYENV/bar # Valid: MYENV2="foo/bar" + MYENV3 = ${MYENV}bar # Valid: MYENV3="foobar" + MYENV3 = "${MYENV}bar" # Valid: MYENV3="foobar" + MYENV4 = foo bar # Invalid + MYENV5 = "foo bar" # Valid: MYENV5="foo bar" + MYENV6 = "foo \"bar\"" # Valid: MYENV6="foo "bar"" + +There are additional assignment operators that may be used in +assignment expressions as well. These include: + + "|=" Set new value only if identifier was previously unset. + "+=" Prepend value to colon-separated identifier (e.g. PATH) + "=+" Append value to colon-separated identifier (e.g. PATH) + +For example: + + MYENV = foo + MYENV |= bar # Does nothing. MYENV="foo" + + PATH = /usr/bin # PATH = "/usr/bin" + PATH += /bin # PATH = "/bin:/usr/bin" + PATH =+ /usr/local/bin # PATH = "/bin:/usr/bin:/usr/local/bin" + +Note that + + PATH += "/usr/bin" + +is the same as + + PATH = /usr/bin:$PATH + +except that when $PATH is empty, the trailing ":" will not appear +when using the "+=" operator. + + +UNSET EXPRESSION + +Environment variables may also be unset using the simple +unset expression + + unset identifier + +For example: + + unset MYENV + +would unset the environment variable "MYENV" from the current env. + + +AVAILABLE KEYWORDS + +A small set of keywords are available within the config file +which describe current parameters of the executing job.
These +include + + SLURM_JOBID Current SLURM job id + SLURM_STEPID Current SLURM job step id + SLURM_NNODES Number of nodes in current job + SLURM_NPROCS Number of processes in current job + SLURM_CMDLINE Remote command line for this job + SLURM_ARGC Number of command line arguments + SLURM_ARGV* Command line argument(s) ARGV0-ARGVN + +In task context, the following additional keywords are also available + + SLURM_PROCID Global task id or rank + SLURM_NODEID Global node id + +These are called keywords because their values cannot be overridden +by the user. However, they are referenced just like any other +variable. + +A full list of keywords and their values can be dumped to +stderr with the "dump keywords" command. See the DEBUGGING +section below. + + +DEFINING LOCAL SYMBOLS + +Occasionally it may be desirable to define new variables that are +not exported to the current environment. The "define" keyword is used +for this purpose + + define identifier = value + +works much like the assignment expression, except that the variable +is not exported to the local environment (and thus, not to the job). + +Locally defined variables such as these are undefined with the +"undefine" keyword: + + undefine n + +will delete "n" from the symbol table. + + +CONDITIONAL EXPRESSIONS + +The use-env configuration file supports conditional expressions of +the form + + if (tests) + statements + else if (tests) + statements + else if ... + statements + else + statements + endif + +Where ``tests'' can have combinations of the following forms + + value < value # Numeric comparison only + value > value # Numeric comparison only + value >= value # Numeric comparison only + value <= value # Numeric comparison only + value == value # Numeric or string compare + value != value # Numeric or string compare + value # True if var is not 0 or empty string; + defined var # True if var is defined + S matches P # True if string S matches the glob expression P + + !
tests + tests && tests + tests || tests + ( tests ) + + +For example: + + if ($SLURM_NNODES > 100) + MORE_THAN_100_NODES = 1 + else if ($SLURM_NNODES > 50) + MORE_THAN_50_NODES = 1 + else + FIFTY_NODES_OR_LESS = 1 + endif + + if ( "$SLURM_ARGV0" matches "*myapp*") + include env.myapp + endif + + +DEBUGGING + +Other commands that are mainly useful for debugging include: + + print "STRING" Print the value of string to stdout + set debuglevel N Set the debug level for the parser to value N + dump keywords Dump a list of currently defined keywords + dump symbols Dump a list of currently defined local symbols + dump all Dump both of the above + +The use-env plugin also looks for the environment variable: + + SPANK_USE_ENV_DEBUG + +which will increase the verbosity of debug logs for the use-env +parser if non-zero. + + + +EXAMPLES + +/etc/slurm/environment/default: + # + # Include global defaults + include global + # + # Include environment for mvapich + include mvapich + +/etc/slurm/environment/global + # + # If TMPDIR not set, set to /tmp + TMPDIR |= /tmp + # + +/etc/slurm/environment/mvapich + # + # Force MVAPICH timeout to 22 + # + VIADEV_DEFAULT_TIME_OUT=22 + # + # Prepend /usr/lib/mpi/dbg/mvapich-gen2/lib/shared to LD_LIBRARY_PATH + LD_LIBRARY_PATH += /usr/lib/mpi/dbg/mvapich-gen2/lib/shared + + +~/.slurm/environment/mvapich-test + # + # environment for testing new versions of MVAPICH + # + PATH += /home/grondo/mvapich-test/root/bin + LD_LIBRARY_PATH += /home/grondo/mvapich-test/root/lib/shared + +PATH and LD_LIBRARY_PATH can then be adjusted to use the mvapich-test +version with the srun command line: + + srun --use-env=mvapich-test ...
+ + +Using conditional expressions + +~/.slurm/environment/default + # + # Using different environment variables based on job size + # + + define n = $SLURM_NPROCS + define N = $SLURM_NNODES + + if ($N > 128 || $n > 1024) + include large-env + else if (($N > 16) || ($n > 128)) + include medium-env + else + include small-env + endif + + if (defined $DEBUG) + print "environment setup for $SLURM_JOBID.$SLURM_STEPID complete" + dump keywords + dump symbols + endif + + +Output for this config file for a run with DEBUG set might look like: + +~ > DEBUG=1 srun hostname +environment setup for 4862.4 complete +use-env: default: 18: Dumping keywords +use-env: default: 18: SLURM_STEPID = "4" +use-env: default: 18: SLURM_JOBID = "4862" +use-env: default: 18: SLURM_NPROCS = "16" +use-env: default: 18: SLURM_NNODES = "2" +use-env: default: 19: Dumping symbols +use-env: default: 19: N = "2" +use-env: default: 19: n = "16" + diff --git a/addr-no-randomize.c b/addr-no-randomize.c new file mode 100644 index 0000000..38eedbe --- /dev/null +++ b/addr-no-randomize.c @@ -0,0 +1,114 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +#include +#include +#include + +#include + +/* + * All spank plugins must define this macro for the SLURM plugin loader. + */ +SPANK_PLUGIN(no-randomize, 1); + +#define ADDR_NO_RANDOMIZE 0x0040000 + +static int default_randomize = 0; +static int randomize = -1; + +#define OPT_RANDOMIZE 1 +#define OPT_NO_RANDOMIZE 2 + +static int process_opts (int val, const char *optarg, int remote); + +/* + * Provide options to srun: + */ +struct spank_option spank_options[] = +{ + { "addr-randomize", NULL, + "Enable address space randomization", 0, OPT_RANDOMIZE, + (spank_opt_cb_f) process_opts + }, + { "no-addr-randomize", NULL, + "Disable address space randomization", 0, OPT_NO_RANDOMIZE, + (spank_opt_cb_f) process_opts + }, + SPANK_OPTIONS_TABLE_END +}; + + +/* + * Called from both srun and slurmd. + */ +int slurm_spank_init (spank_t sp, int ac, char **av) +{ + int i; + + for (i = 0; i < ac; i++) { + if (strncmp ("default_randomize=", av[i], 8) == 0) { + const char *optarg = av[i] + 18; + if (*optarg == '0') + default_randomize = 0; + else if (*optarg == '1') + default_randomize = 1; + else + slurm_error ("no-randomize: Ignoring invalid default value: " + "\"%s\"", av[i]); + } + else { + slurm_error ("no-randomize: Invalid option \"%s\"", av[i]); + } + } + + randomize = default_randomize; + + return (0); +} + +static int process_opts (int val, const char *optarg, int remote) +{ + if (val == OPT_RANDOMIZE) + randomize = 1; + else if (val == OPT_NO_RANDOMIZE) + randomize = 0; + else + randomize = default_randomize; + + return (0); +} + +int slurm_spank_task_init (spank_t sp, int ac, char **av) +{ + if (randomize == -1) + randomize = default_randomize; + + slurm_info ("randomize = %d\n", randomize); + + if (randomize == 0 && (personality (ADDR_NO_RANDOMIZE) < 0)) + slurm_error 
("Failed to set personality: %m"); + return 0; +} + diff --git a/auto-affinity.c b/auto-affinity.c new file mode 100644 index 0000000..a0d67b3 --- /dev/null +++ b/auto-affinity.c @@ -0,0 +1,552 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ ****************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#define __USE_GNU +#include + +#include +#include + +#include "lib/split.h" +#include "lib/fd.h" + +SPANK_PLUGIN(auto-affinity, 1); + +static int ncpus = -1; +static int ntasks = -1; +static int enabled = 1; +static int verbose = 0; +static int reverse = 0; +static int startcpu = 0; +static int requested_cpus_per_task = 0; +static int exclusive_only = 0; + +static cpu_set_t cpus_available; +static int ncpus_available; + +static const char auto_affinity_help [] = +"\ +auto-affinity: Automatically assign CPU affinity using best-guess defaults.\n\ +\n\ +The default behavior attempts to accomodate multi-threaded apps by \n\ +assigning more than one CPU per task if the number of tasks running \n\ +on the node is evenly divisible into the number of CPUs. Otherwise, \n\ +CPU affinity is not enabled unless the cpus_per_task (cpt) option is \n\ +specified. The default behavior may be modified using the \n\ +--auto-affinity options listed below. Also, the srun(1) --cpu_bind option\n\ +is processed after auto-affinity, and thus may be used to override any \n\ +CPU affinity settings from this module.\n\ + \n\ +Option Usage: --auto-affinity=[args...]\n\ + \n\ +where args... is a comma separated list of one or more of the following\n\ + help Display this message.\n\ + v(erbose) Print CPU affinty list for each remote task\n\ + \n\ + off Disable automatic CPU affinity.\n\ + \n\ + start=N Start affinity assignment at CPU [N]. 
If assigning CPUs\n\ + in reverse, start [N] CPUs from the last CPU.\n\ + rev(erse) Allocate last CPU first instead of starting with CPU0.\n\ + cpus_per_task=N Allocate [N] CPUs to each task.\n\ + cpt=N Shorthand for cpus_per_task.\n\n"; + + +static int parse_user_option (int val, const char *optarg, int remote); + +struct spank_option spank_options [] = { + { "auto-affinity", "[args]", + "Automatic, best guess CPU affinity for SMP machines " + "(args=`help' for more info)", + 2, 0, (spank_opt_cb_f) parse_user_option + }, + SPANK_OPTIONS_TABLE_END +}; + +static int str2int (const char *str) +{ + char *p; + long l = strtol (str, &p, 10); + + if (p && (*p != '\0')) + return (-1); + + return ((int) l); +} + +static int parse_option (const char *opt, int *remotep) +{ + if (strcmp (opt, "off") == 0) + enabled = 0; + else if ((strcmp (opt, "reverse") == 0) || (strcmp (opt, "rev") == 0)) + reverse = 1; + else if (strncmp (opt, "cpt=", 4) == 0) { + if ((requested_cpus_per_task = str2int (opt+4)) < 0) + goto fail; + } + else if (strncmp (opt, "cpus_per_task=", 14) == 0) { + if ((requested_cpus_per_task = str2int (opt+14)) < 0) + goto fail; + } + else if (strncmp (opt, "start=", 6) == 0) { + if ((startcpu = str2int (opt+6)) < 0) + goto fail; + } + else if (strcmp (opt, "verbose") == 0 || strcmp (opt, "v") == 0) + verbose = 1; + else if ((strcmp (opt, "help") == 0) && !(*remotep)) { + fprintf (stderr, auto_affinity_help); + exit (0); + } + + return (0); + + fail: + slurm_error ("auto-affinity: Invalid option: `%s'", opt); + return (-1); +} + +static int parse_user_option (int val, const char *arg, int remote) +{ + char *str; + List l; + int rc = 1; + + if (arg == NULL) + return (0); + + l = list_split (",", (str = strdup (arg))); + rc = list_for_each (l, (ListForF) parse_option, &remote); + + list_destroy (l); + free (str); + + return (rc); +} + +static int parse_argv (int ac, char **av, int remote) +{ + int i; + for (i = 0; i < ac; i++) { + if (strcmp (av[i], "off") == 0) + 
enabled = 0; + else if (strcmp (av[i], "exclusive_only") == 0) + exclusive_only = 1; + else + return (-1); + } + return (0); +} + + +/* + * XXX: Since we don't have a good way to determine the number of + * CPUs allocated to this job on this node, we have to query + * the slurm controller (!). + * + * Hopefully this function can be removed in the near future. + * It should only be called when SLURM_JOB_CPUS_PER_NODE is not + * set in the environment. + */ +static int query_ncpus_per_node (spank_t sp) +{ + job_info_msg_t * msg; + uint32_t jobid; + int cpus_per_node = -1; + int i; + + if (spank_get_item (sp, S_JOB_ID, &jobid) != ESPANK_SUCCESS) { + if (verbose) + fprintf (stderr, "auto-affinity: Failed to get my JOBID!\n"); + return (-1); + } + + if (slurm_load_jobs (0, &msg, 0) < 0) { + slurm_error ("auto-affinity: slurm_load_jobs: %m\n"); + return (-1); + } + + for (i = 0; i < msg->record_count; i++) { + job_info_t *j = &msg->job_array[i]; + + if (j->job_id == jobid) { + /* + * XXX: Assume cpus_per_node is the same across the whole job. 
+ */ + cpus_per_node = (int) j->cpus_per_node[0]; + break; + } + } + + slurm_free_job_info_msg (msg); + return (cpus_per_node); +} + + +/* + * Return 1 if job has allocated all CPUs on this node + */ +static int job_is_exclusive (spank_t sp) +{ + const char var[] = "SLURM_JOB_CPUS_PER_NODE"; + char val[16]; + int n; + + if (spank_getenv (sp, var, val, sizeof (val)) != ESPANK_SUCCESS) { + if (verbose) + fprintf (stderr, "auto-affinity: Failed to find %s in env\n", + "SLURM_JOB_CPUS_PER_NODE"); + + /* XXX: Now query slurm controller for this information */ + if ((n = query_ncpus_per_node (sp)) < 0) { + fprintf (stderr, "auto-affinity: Unabled to determine ncpus!\n"); + return (0); + } + } + else if ((n = str2int (val)) < 0) { + fprintf (stderr, "auto-affinity: %s=%s invalid\n", + "SLURM_JOB_CPUS_PER_NODE", val); + return (0); + } + + return (n == ncpus); +} + + +int slurm_spank_init (spank_t sp, int ac, char **av) +{ + if (!spank_remote (sp)) + return (0); + + if (parse_argv (ac, av, spank_remote (sp)) < 0) + return (-1); + + /* + * First get total number of online CPUs + */ + if ((ncpus = (int) sysconf (_SC_NPROCESSORS_ONLN)) < 0) { + slurm_error ("Failed to get number of processors: %m\n"); + return (-1); + } + + if (spank_get_item (sp, S_JOB_LOCAL_TASK_COUNT, &ntasks) != ESPANK_SUCCESS) + { + slurm_error ("Failed to get number of local tasks\n"); + return (-1); + } + + return (0); +} + +/* + * Use the slurm_spank_user_init callback to check for exclusivity + * becuase user options are processed prior to calling here. + * Otherwise, we would not be able to use the `verbose' flag. + */ +int slurm_spank_user_init (spank_t sp, int ac, char **av) +{ + if (!spank_remote (sp)) + return (0); + + if (exclusive_only && !job_is_exclusive (sp)) { + if (verbose) + fprintf (stderr, "auto-affinity: Disabling. 
" + "(job doesn't have exclusive access to this node)\n"); + enabled = 0; + } + + if (exclusive_only && + (ntasks < ncpus_available) && (ncpus_available % ntasks)) { + if (verbose) + fprintf (stderr, "auto-affinity: Disabling. " + "ncpus must be evenly divisible by number of tasks\n"); + enabled = 0; + } + + return (0); +} + +static int cpu_set_count (cpu_set_t *setp) +{ + int i; + int n = 0; + for (i = 0; i < ncpus; i++) { + if (CPU_ISSET (i, setp)) + n++; + } + return (n); +} + +static char * cpuset_to_cstr (cpu_set_t *mask, char *str) +{ + int i; + char *ptr = str; + int entry_made = 0; + + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, mask)) { + int j; + int run = 0; + entry_made = 1; + for (j = i + 1; j < CPU_SETSIZE; j++) { + if (CPU_ISSET(j, mask)) + run++; + else + break; + } + if (!run) + sprintf(ptr, "%d,", i); + else if (run == 1) { + sprintf(ptr, "%d,%d,", i, i + 1); + i++; + } else { + sprintf(ptr, "%d-%d,", i, i + run); + i += run; + } + while (*ptr != 0) + ptr++; + } + } + ptr -= entry_made; + *ptr = 0; + + return str; +} + +static int get_cpus_per_task () +{ + if (requested_cpus_per_task) + return (requested_cpus_per_task); + else if ((ncpus_available % ntasks) == 0) + return (ncpus_available / ntasks); + else + return (1); +} + +/* + * Return the absolute cpu number for relative CPU cpu within + * the available cpus mask 'cpus_available'. + */ +static int mask_to_available (int cpu) +{ + int i; + int j = 0; + for (i = 0; i < ncpus; i++) { + if (CPU_ISSET (i, &cpus_available) && (cpu == j++)) + return (i); + } + slurm_error ("Yikes! 
Couldn't convert CPU%d to available CPU!", cpu); + return (-1); +} + +static int generate_mask (cpu_set_t *setp, int localid) +{ + int i = 0; + int cpu; + int cpus_per_task = get_cpus_per_task (); + + if (cpus_per_task == 1) { + if ((cpu = mask_to_available (localid + startcpu)) < 0) + return (-1); + CPU_SET (cpu, setp); + return (0); + } + + cpu = ((localid * cpus_per_task) + startcpu) % ncpus_available; + + while (i++ < cpus_per_task) { + int bit = mask_to_available (cpu); + if (bit < 0) + return (-1); + CPU_SET (bit, setp); + cpu = (cpu + 1) % ncpus_available; + } + + return (0); +} + +static int generate_mask_reverse (cpu_set_t *setp, int localid) +{ + int i = 0; + int cpu; + int cpus_per_task = get_cpus_per_task (); + int lastcpu = ncpus_available - 1; + + if (cpus_per_task == 1) { + cpu = (lastcpu - (localid + startcpu) % ncpus_available); + if ((cpu = mask_to_available (cpu)) < 0) + return (-1); + CPU_SET (cpu, setp); + return (0); + } + + cpu = lastcpu - (((localid * cpus_per_task) + startcpu) % ncpus_available); + + while (i++ < cpus_per_task) { + int bit = mask_to_available (cpu); + if (bit < 0) + return (-1); + CPU_SET (bit, setp); + cpu = (--cpu >= 0) ? cpu : (ncpus_available - 1); + } + + return (0); +} + +/* + * Set the provided cpu set to the actual CPUs available to the + * current task (which may be restricted by cpusets or other + * mechanism). + * + * Returns the number of cpus set in setp. + * + */ +static int get_cpus_available (cpu_set_t *setp) +{ + if (sched_getaffinity (0, sizeof (cpu_set_t), setp) < 0) { + slurm_error ("auto-affinity: sched_getaffinity: %m"); + return (-1); + } + + return (cpu_set_count (setp)); +} + +int slurm_spank_init_post_opt (spank_t sp, int ac, char **av) +{ + if (!spank_remote (sp)) + return (0); + /* + * Set available cpus mask after user options have been processed, + * in case our cpuset changed. 
+ */ + ncpus_available = get_cpus_available (&cpus_available); + return (0); +} + +int check_task_cpus_available (void) +{ + int n; + + /* + * Check number of available cpus again. If it has + * changed since checking in spank_init_post_opt, + * then abort, because likely something else is adjusting + * the cpu mask (or we are using per-task cpusets) + * and auto-affinity is not warranted. + */ + if ((n = get_cpus_available (&cpus_available)) && + (n != ncpus_available) ) { + if (ncpus_available > 0) { + if (verbose) + fprintf (stderr, "auto-affinity: Not adjusting CPU mask. " + "(task cpu mask adjusted externally)\n"); + return (-1); + } + + ncpus_available = n; + } + + return (0); +} + +int slurm_spank_task_init (spank_t sp, int ac, char **av) +{ + int localid; + cpu_set_t setp[1]; + char buf[4096]; + + if (!enabled) + return (0); + + if (check_task_cpus_available () < 0) + return (0); + + if (ncpus_available <= 1) + return (0); + + if ((ntasks <= 1) && !requested_cpus_per_task) { + if (verbose) + fprintf (stderr, "auto-affinity: Not adjusting CPU mask. " + "(%d task on this node)\n", ntasks); + return (0); + } + + /* + * Do nothing if user is overcommitting resources + */ + if (ntasks > ncpus_available) + return (0); + + /* + * Do nothing by default if number of CPUs is not a multiple + * of the number of tasks + */ + if ((ncpus_available % ntasks) && !requested_cpus_per_task) { + if (verbose) { + fprintf (stderr, "auto-affinity: Not adjusting mask. " + "(%d tasks not evenly divided among %d CPUs)\n", + ntasks, ncpus_available); + fprintf (stderr, "To force, explicity set cpus-per-task\n"); + } + return (0); + } + + spank_get_item (sp, S_TASK_ID, &localid); + + if (requested_cpus_per_task > ncpus_available) { + if (localid == 0) + slurm_error ("auto-affinity cpus_per_task=%d > ncpus=%d. 
%s...", + requested_cpus_per_task, ncpus_available, "Ignoring"); + requested_cpus_per_task = 0; + } + + CPU_ZERO (setp); + + if (reverse) + generate_mask_reverse (setp, localid); + else + generate_mask (setp, localid); + + if (verbose) + fprintf (stderr, "%s: local task %d: CPUs: %s\n", + "auto-affinity", localid, cpuset_to_cstr (setp, buf)); + + if (sched_setaffinity (getpid (), sizeof (*setp), setp) < 0) { + slurm_error ("Failed to set auto-affinity for task %d\n", localid); + return (-1); + } + + return (0); +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/chaos-spankings.spec b/chaos-spankings.spec new file mode 100644 index 0000000..610da57 --- /dev/null +++ b/chaos-spankings.spec @@ -0,0 +1,174 @@ +## +# $Id: chaos-spankings.spec 7813 2008-09-25 23:08:25Z grondo $ +## + +Name: +Version: +Release: + +Summary: SLURM SPANK modules for CHAOS systems +Group: System Environment/Base +License: GPL + +BuildRoot: %{_tmppath}/%{name}-%{version} +Source0: %{name}-%{version}.tgz + +BuildRequires: slurm-devel job bison flex +BuildRequires: libbitmask libcpuset +BuildRequires: pam-devel + +Requires: slurm + +%description +This package contains a set of SLURM SPANK modules for CHAOS clusters. +Currently includes: + - renice.so : add --renice option to srun allowing users to set priority + of job + - oom-detect.so : Detect tasks killed by OOM killer via /proc/oomkilled file. + - system-safe.so : Implement pre-forked system(3) replacement in case MPI + implementation doesn't support fork(2). + - iotrace.so : Enable tracing of IO calls through LD_PRELOAD trick + - use-env.so : Add --use-env flag to srun to override environment + variables for job + - tmpdir.so : Create a job-specific TMPDIR and remove it (as the user) + after the job has exited. + + - auto-affinity.so: + Try to set CPU affinity on jobs using some kind of + presumably sane defaults. Also adds an --auto-affinity + option for tweaking the default behavior. 
+ + - overcommit-memory.so : + Allow users to choose overcommit mode on nodes of + their job. + + - pty.so : Run task 0 of SLURM job under pseudo tty. + +%package cpuset +Summary: Cpuset spank plugin for slurm. +Group: System Environment/Base +Requires: libbitmask libcpuset slurm + +%description cpuset +This package contains a SLURM spank plugin for enabling +the use of cpusets to constrain CPU use of jobs on nodes to +the number of CPUs allocated. This plugin is specifically +designed for systems sharing nodes and using CPU scheduling +(i.e. using the select/cons_res plugin). Most importantly the +plugin will be harmful when overallocating CPUs on nodes. The +plugin is enabled by adding the line: + + required cpuset.so [options] + +to /etc/slurm/plugstack.conf. + +A PAM module - pam_slurm_cpuset.so - is also provided for +constraining user logins in a similar fashion. For more +information see the slurm-cpuset(8) man page provided with +this package. + + +%prep +%setup + +%build +make CFLAGS="$RPM_OPT_FLAGS" + +%install +rm -rf "$RPM_BUILD_ROOT" +mkdir -p "$RPM_BUILD_ROOT" + +plugins="renice.so \ + oom-detect.so \ + system-safe.so \ + iotrace.so \ + tmpdir.so \ + use-env/use-env.so \ + overcommit-memory/overcommit-memory.so \ + auto-affinity.so \ + preserve-env.so \ + pty.so + " + +libs="system-safe-preload.so" +utilities="overcommit-memory/overcommit-util" + +libdir=$RPM_BUILD_ROOT%{_libdir} +plugindir=${libdir}/slurm +utildir=$RPM_BUILD_ROOT%{_libexecdir}/chaos-spankings/ + +mkdir -p --mode=0755 $plugindir +mkdir -p --mode=0755 $utildir + +cat /dev/null > std-plugins.list +for plugin in $plugins; do + install -m0755 $plugin $plugindir + echo %{_libdir}/slurm/$(basename $plugin) >>std-plugins.list +done + +for lib in $libs; do + install -m0755 $lib $libdir +done + +for utility in $utilities; do + install -m0755 $utility $utildir +done + +# +# cpuset_release_agent goes into /sbin +# +mkdir -p $RPM_BUILD_ROOT/sbin +install -m0755 cpuset/cpuset_release_agent 
$RPM_BUILD_ROOT/sbin +install -m0755 cpuset/cpuset.so $plugindir +mkdir -p $RPM_BUILD_ROOT/%{_sysconfdir}/init.d/ +install -m0755 cpuset/cpuset.init \ + $RPM_BUILD_ROOT/%{_sysconfdir}/init.d/slurm-cpuset + +mkdir -p $RPM_BUILD_ROOT/%{_mandir}/man1 +mkdir -p $RPM_BUILD_ROOT/%{_mandir}/man8 +mkdir -p $RPM_BUILD_ROOT/%{_lib}/security + +install -m0755 cpuset/pam_slurm_cpuset.so $RPM_BUILD_ROOT/%{_lib}/security +install -m0644 cpuset/slurm-cpuset.8 cpuset/pam_slurm_cpuset.8 \ + $RPM_BUILD_ROOT/%{_mandir}/man8 +install -m0644 cpuset/use-cpusets.1 \ + $RPM_BUILD_ROOT/%{_mandir}/man1 + +# create /etc/slurm/plugstack.d directory +mkdir -p $RPM_BUILD_ROOT/%{_sysconfdir}/slurm/plugstack.conf.d + +# create entry for preserve-env.so +echo " required preserve-env.so" > \ + $RPM_BUILD_ROOT/%{_sysconfdir}/slurm/plugstack.conf.d/99-preserve-env + +%clean +rm -rf "$RPM_BUILD_ROOT" + +%post cpuset +if [ -x /sbin/chkconfig ]; then /sbin/chkconfig --add slurm-cpuset; fi + +%preun cpuset +if [ "$1" = 0 ]; then + if [ -x /sbin/chkconfig ]; then /sbin/chkconfig --del slurm-cpuset; fi +fi + +%files -f std-plugins.list +%defattr(-,root,root,0755) +%doc NEWS ChangeLog README.use-env +/%{_libdir}/*.so +/%{_libexecdir}/chaos-spankings/* +%dir %attr(0755,root,root) %{_sysconfdir}/slurm/plugstack.conf.d +%config(noreplace) %{_sysconfdir}/slurm/plugstack.conf.d/* + +%files cpuset +%defattr(-,root,root,0755) +%doc NEWS ChangeLog cpuset/README +%{_sysconfdir}/init.d/slurm-cpuset +%{_libdir}/slurm/cpuset.so +/%{_lib}/security/pam_slurm_cpuset.so +/sbin/cpuset_release_agent +%{_mandir}/man1/use-cpusets.* +%{_mandir}/man8/pam_slurm_cpuset.* +%{_mandir}/man8/slurm-cpuset.* + + diff --git a/cpuset/Makefile b/cpuset/Makefile new file mode 100644 index 0000000..4279039 --- /dev/null +++ b/cpuset/Makefile @@ -0,0 +1,37 @@ +NAME := cpuset + +FLAGS := -ggdb -Wall -I../lib +SHOPTS := -shared -Wl,--version-script=version.map +LLIBS := -lbitmask -lcpuset -ldl -lfl +OBJS := nodemap.o util.o create.o log.o 
slurm.o \ + conf.o conf-lexer.o conf-parser.o \ + ../lib/fd.o ../lib/list.o ../lib/split.o + +all: $(NAME).so test cpuset_release_agent pam_slurm_cpuset.so + +$(NAME).so: $(OBJS) $(NAME).o + $(CC) $(SHOPTS) -o $(NAME).so $(OBJS) $(NAME).o $(LLIBS) + +test: test.o $(OBJS) + $(CC) -o test $(OBJS) test.o $(LLIBS) + +cpuset_release_agent: release-agent.o $(OBJS) + $(CC) -o cpuset_release_agent $(OBJS) release-agent.o $(LLIBS) + + +pam_slurm_cpuset.so : $(OBJS) pam_slurm_cpuset.o ../lib/hostlist.o + $(CC) -shared -o pam_slurm_cpuset.so $(OBJS) ../lib/hostlist.o \ + pam_slurm_cpuset.o -lbitmask $(LLIBS) -lpam -lpam_misc +.c.o: + $(CC) $(CFLAGS) $(FLAGS) -o $@ -fPIC -c $< + +conf.o : conf-parser.h + +conf-lexer.c : conf-parser.l conf-parser.h + flex -oconf-lexer.c conf-parser.l + +conf-parser.c conf-parser.h : conf-parser.y + bison -d -oconf-parser.c conf-parser.y + +clean: + -rm -f *.o *.so conf-parser.[ch] conf-lexer.c cpuset_release_agent test diff --git a/cpuset/README b/cpuset/README new file mode 100644 index 0000000..5fabc24 --- /dev/null +++ b/cpuset/README @@ -0,0 +1,71 @@ + +INTRODUCTION + +The SLURM 'cpuset' plugin uses Linux cpusets to constrain jobs to +the number of CPUs they have been allocated on nodes. The plugin is +specifically designed for systems sharing nodes and using CPU scheduling +(i.e. using the select/cons_res plugin). The plugin will not work on +systems where CPUs are oversubscribed to jobs (i.e. strict node sharing +without the use of select/cons_res). + +The plugin uses SLURM's spank framework, and thus it is enabled by adding +the following line to /etc/slurm/plugstack.conf: + + required cpuset.so [options] + +where [options] may be supplied to tune module behavior. + +The plugin may also constrain job steps to their own cpusets under +the job cpuset. This may be useful when running multiple job steps +under a single allocation, as the resources of the job may +be partitioned among separate job steps. 
This functionality is enabled +by the srun user option + + --use-cpusets=[args...] + +Use of the --use-cpusets option for job steps is described below. + + +REQUIREMENTS + +The cpuset plugin of course requires cpuset support. It also uses the +libbitmask and libcpuset libraries from SGI for creating and managing +cpusets. Source for these libraries is available at + + http://oss.sgi.com/projects/cpusets/ + +The cpuset filesystem must also be mounted at runtime in order for +the plugin to be able to query and create cpusets. To mount the cpuset +filesystem, use: + + mount -t cpuset none /dev/cpuset + +The plugin currently assumes that the cpuset filesystem will be available +under /dev/cpuset. + +Included with the cpuset plugin source is a cpusets "release +agent" (release-agent.c) which may optionally be installed as +/sbin/cpuset_release_agent on any nodes using the SLURM cpuset plugin. +This release agent will be run for each SLURM cpuset when the last task +within the cpuset exits, and will free the cpuset immediately (with +proper locking so as not to race with other jobs). This release agent +is optional for a couple of reasons: + + 1. In the current version of Linux for which this plugin was written + (RHEL5), there can only be one release-agent system-wide. We don't + want to interfere with other uses of cpusets if they exist. + + 2. The cpuset plugin removes stale cpusets at startup anyway. So, + the cpuset_release_agent is not a critical component. However, + it is nice to clean up job cpusets as the jobs exit, instead of + waiting until the *next* job is run. Unused cpusets lying around + may be confusing to users and sysadmins. + + +MAN PAGES + +This file is out of date. For up-to-date information see the +man pages provided with this software: slurm-cpuset(8), +use-cpusets(1), and pam_slurm_cpuset(8). 
+ +$Id: README 7653 2008-07-29 22:33:31Z grondo $ diff --git a/cpuset/conf-parser.l b/cpuset/conf-parser.l new file mode 100644 index 0000000..d23b613 --- /dev/null +++ b/cpuset/conf-parser.l @@ -0,0 +1,78 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +%{ +#include +#include +#include + +#include "conf.h" + +#define YYSTYPE char * +#include "conf-parser.h" + +%} + +%option nounput + +%% + +#[^\n]* ; /* Ignore comments. */ +[ \t\r]+ ; /* Ignore whitespace. */ +; { return ';'; } +, { return ','; } +(#.*)?\\?\n { return '\n'; } + +\"[^\"]*\" | +\'[^\']*\' { + yytext [strlen (yytext) - 1] = '\0'; + yylval = strdup (yytext+1); + return STRING; + } + +(fit-)?policy { return POLICY; } +order { return ORDER; } +use-idle | +alloc-idle { return USE_IDLE; } +constrain-mem(s)? 
{ return CONST_MEM; } +kill-orph(an)?s { return KILL_ORPHS; } += { return '='; } + +0 | +no | +No { return FALSE; } +1 | +yes | +Yes { return TRUE; } + +[^=;, \t\r\n]+ { + yylval = strdup (yytext); + return STRING; + } + +%% + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/cpuset/conf-parser.y b/cpuset/conf-parser.y new file mode 100644 index 0000000..838b0e1 --- /dev/null +++ b/cpuset/conf-parser.y @@ -0,0 +1,199 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ ****************************************************************************/ + + +%{ +#include +#include +#include +#include + +#include "conf.h" +#include "log.h" + +extern int yylex (); +void yyerror (const char *s); +extern FILE *yyin; + +static int cpuset_conf_line; + +#define YYSTYPE char * +#define YYDEBUG 1 +int yydebug = 0; + +static int cf_policy (const char *); +static int cf_use_idle (const char *); +static int cf_order (const char *); +static int cf_const_mem (int); +static int cf_kill_orphs (int); + +%} + +%token POLICY "policy" +%token USE_IDLE "use-idle" +%token CONST_MEM "constrain-mem" +%token KILL_ORPHS "kill-orphs" +%token ORDER "order" +%token TRUE "true" +%token FALSE "false" +%token STRING "string" + +%error-verbose + +%% + +file : /* empty */ + | file stmts + ; + +stmts : end + | stmt end + | stmts stmt + ; + +stmt : POLICY '=' STRING { if (cf_policy ($3) < 0) YYABORT; } + | USE_IDLE '=' STRING { if (cf_use_idle ($3) < 0) YYABORT; } + | USE_IDLE '=' FALSE { if (cf_use_idle ("no") < 0) YYABORT; } + | USE_IDLE '=' TRUE { if (cf_use_idle ("yes") < 0) YYABORT; } + | CONST_MEM '=' TRUE { if (cf_const_mem (1) < 0) YYABORT; } + | CONST_MEM '=' FALSE { if (cf_const_mem (0) < 0) YYABORT; } + | KILL_ORPHS '=' TRUE { if (cf_kill_orphs (1) < 0) YYABORT; } + | KILL_ORPHS '=' FALSE { if (cf_kill_orphs (0) < 0) YYABORT; } + | ORDER '=' STRING { if (cf_order ($3) < 0) YYABORT; } + +end : '\n' { cpuset_conf_line++; } + | ';' + ; + +%% + +static cpuset_conf_t conf; +static const char * cpuset_conf_filename = NULL; + +void cpuset_conf_debug () +{ + yydebug = 1; +} + +static const char * cf_file () +{ + if (!cpuset_conf_filename) + return ("stdin"); + return (cpuset_conf_filename); +} + +static int cf_line () +{ + return (cpuset_conf_line); +} + +void yyerror (const char *s) +{ + log_err ("%s: %d: %s\n", cf_file (), cf_line (), s); +} + +int cpuset_conf_parse (cpuset_conf_t cf, const char *path) +{ + cpuset_conf_filename = NULL; + + cpuset_conf_set_file (cf, 
path); + + if (strcmp (path, "-") == 0) + yyin = stdin; + else if (!(yyin = fopen (path, "r"))) { + int err = errno; + log_err ("open: %s: %s\n", path, strerror (errno)); + errno = err; + return (-1); + } + + cpuset_conf_filename = path; + cpuset_conf_line = 1; + conf = cf; + + log_debug ("reading config from \"%s\"\n", cf_file ()); + + if (yyparse ()) { + log_err ("%s: %d: parser failed\n", cf_file (), cf_line ()); + errno = 0; + return (-1); + } + + fclose (yyin); + + return (0); +} + +static int cf_policy (const char *name) +{ + log_debug ("%s: %d: Setting allocation policy to %s.\n", + cf_file (), cf_line(), name); + if (cpuset_conf_set_policy_string (conf, name) < 0) + return log_err ("%s: %d: Invalid allocation policy '%s'.\n", + cf_file (), cf_line (), name); + return (0); +} + +static int cf_use_idle (const char *s) +{ + log_debug ("%s: %d: Setting idle node use policy to %s.\n", + cf_file (), cf_line(), s); + if (cpuset_conf_set_alloc_idle_string (conf, s) < 0) + return log_err ("%s: %d: Invalid alloc-idle string '%s'\n", + cf_file (), cf_line (), s); + return (0); +} + +static int cf_order (const char *s) +{ + log_debug ("%s: %d: Setting order to %s.\n", + cf_file (), cf_line (), s); + + if (strcasecmp (s, "reverse") == 0) + return cpuset_conf_set_order (conf, 1); + else if (strcasecmp (s, "normal") == 0) + return cpuset_conf_set_order (conf, 0); + + return log_err ("%s: %d: Invalid setting for order: %s\n", + cf_file (), cf_line (), s); +} + +static int cf_const_mem (int val) +{ + log_debug ("%s: %d: Setting constrain-memsto %s.\n", + cf_file (), cf_line(), val ? "true" : "false"); + return (cpuset_conf_set_constrain_mem (conf, val)); +} + +static int cf_kill_orphs (int val) +{ + log_debug ("%s: %d: Setting kill-orphans to %s.\n", + cf_file (), cf_line(), val ? 
"true" : "false"); + return (cpuset_conf_set_kill_orphans (conf, val)); +} + +/* + * vi: ts=4 sw=4 expandtab + */ + diff --git a/cpuset/conf.c b/cpuset/conf.c new file mode 100644 index 0000000..0aa8b64 --- /dev/null +++ b/cpuset/conf.c @@ -0,0 +1,299 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ ****************************************************************************/ + +#include +#include +#include +#include + +#include "conf.h" +#include "log.h" + +#include "conf-parser.h" + +static const char * default_config = "/etc/slurm/slurm-cpuset.conf"; + +struct cpuset_conf { + char filename [1024]; + + enum fit_policy policy; + + unsigned filename_valid:1; + unsigned reverse_order:1; + unsigned alloc_idle_nodes:1; + unsigned use_idle_if_multiple:1; + unsigned constrain_mems:1; + unsigned kill_orphans:1; +}; + + +/* + * Accessor routines + */ +enum fit_policy cpuset_conf_policy (cpuset_conf_t conf) +{ + return (conf->policy); +} + +int cpuset_conf_alloc_idle (cpuset_conf_t conf) +{ + return (conf->alloc_idle_nodes); +} + +int cpuset_conf_alloc_idle_gt (cpuset_conf_t conf) +{ + return (conf->alloc_idle_nodes && !conf->use_idle_if_multiple); +} + +int cpuset_conf_alloc_idle_multiple (cpuset_conf_t conf) +{ + return (conf->alloc_idle_nodes && conf->use_idle_if_multiple); +} + +int cpuset_conf_constrain_mem (cpuset_conf_t conf) +{ + return (conf->constrain_mems); +} + +int cpuset_conf_kill_orphans (cpuset_conf_t conf) +{ + return (conf->kill_orphans); +} + +int cpuset_conf_reverse_order (cpuset_conf_t conf) +{ + return (conf->reverse_order); +} + +int cpuset_conf_set_policy (cpuset_conf_t conf, enum fit_policy policy) +{ + if (!conf) + return (-1); + conf->policy = policy; + return (0); +} + +int cpuset_conf_set_policy_string (cpuset_conf_t conf, const char *name) +{ + if (strcmp (name, "best-fit") == 0) + return (cpuset_conf_set_policy (conf, BEST_FIT)); + else if (strcmp (name, "worst-fit") == 0) + return (cpuset_conf_set_policy (conf, WORST_FIT)); + else if (strcmp (name, "first-fit") == 0) + return (cpuset_conf_set_policy (conf, FIRST_FIT)); + else + return (-1); +} + +int cpuset_conf_set_alloc_idle (cpuset_conf_t conf, int alloc_idle) +{ + if (!conf) + return (-1); + conf->alloc_idle_nodes = alloc_idle; + return (0); +} + +int 
cpuset_conf_set_alloc_idle_mode (cpuset_conf_t conf, int multiple_only)
+{
+    if (!conf)
+        return (-1);
+    conf->use_idle_if_multiple = multiple_only;
+    return (0);
+}
+
+/*
+ * Apply the idle-node allocation setting named by string [s] to [conf]:
+ *   "0", "never", "no"   -> cpuset_conf_set_alloc_idle (conf, 0)
+ *   "1", "yes"           -> cpuset_conf_set_alloc_idle (conf, 1)
+ *   "multiple", "mult"   -> cpuset_conf_set_alloc_idle_mode (conf, 1)
+ *   "gt", "greater"      -> cpuset_conf_set_alloc_idle_mode (conf, 0)
+ * "0" and "1" are matched exactly, the words case-insensitively.
+ * Returns the setter's result, or -1 after logging an error when [s]
+ * is not one of the strings above.
+ */
+int cpuset_conf_set_alloc_idle_string (cpuset_conf_t conf, const char *s)
+{
+    if (strcmp (s, "0") == 0 ||
+        strcasecmp (s, "never") == 0 ||
+        strcasecmp (s, "no") == 0)
+        return (cpuset_conf_set_alloc_idle (conf, 0));
+
+    if (strcmp (s, "1") == 0 ||
+        strcasecmp (s, "yes") == 0)
+        return (cpuset_conf_set_alloc_idle (conf, 1));
+
+    if (strcasecmp (s, "multiple") == 0 ||
+        strcasecmp (s, "mult") == 0)
+        return (cpuset_conf_set_alloc_idle_mode (conf, 1));
+
+    if (strcasecmp (s, "gt") == 0 ||
+        strcasecmp (s, "greater") == 0)
+        return (cpuset_conf_set_alloc_idle_mode (conf, 0));
+
+    log_err ("Unknown alloc-idle setting \"%s\"\n", s);
+
+    return (-1);
+}
+
+/*
+ * Parse one option string [opt] and apply it to [conf].
+ * Forms are tested in this order: a bare policy name or "policy=NAME"
+ * (both via cpuset_conf_set_policy_string), "conf=FILE",
+ * "!idle-1st" / "no-idle", "idle-1st=MODE" / "idle-first=MODE",
+ * "!mem" / "nomem" / "!constrain-mem", "mem" / "constrain-mem",
+ * "reverse" / "order=reverse" and "order=normal".
+ * Returns 0 on success, otherwise the failing handler's result or the
+ * value of log_err () for an unrecognized option.
+ */
+int cpuset_conf_parse_opt (cpuset_conf_t conf, const char *opt)
+{
+    /*
+     * First check to see if we're setting a policy
+     */
+    if (cpuset_conf_set_policy_string (conf, opt) == 0)
+        return (0);
+
+    if (strncmp ("policy=", opt, 7) == 0) {
+        if (cpuset_conf_set_policy_string (conf, opt + 7) < 0)
+            return (log_err ("Unknown allocation policy \"%s\"", opt));
+        /*
+         * Policy accepted, so stop here. Falling through would end
+         * at the "Unknown option" error below even though the
+         * policy has just been applied.
+         */
+        return (0);
+    }
+
+    /*
+     * Next check for new config file via "conf="
+     */
+    if (strncmp ("conf=", opt, 5) == 0)
+        return (cpuset_conf_parse (conf, opt + 5));
+
+    if ((strcmp ("!idle-1st", opt) == 0) ||
+        (strcmp ("no-idle", opt) == 0))
+        return (cpuset_conf_set_alloc_idle (conf, 0));
+
+    if (strncmp ("idle-1st=", opt, 9) == 0)
+        return (cpuset_conf_set_alloc_idle_string (conf, opt + 9));
+
+    if (strncmp ("idle-first=", opt, 11) == 0)
+        return (cpuset_conf_set_alloc_idle_string (conf, opt + 11));
+
+    if ((strcmp ("!mem", opt) == 0) ||
+        (strcmp ("nomem", opt) == 0) ||
+        (strcmp ("!constrain-mem", opt) == 0))
+        return (cpuset_conf_set_constrain_mem (conf, 0));
+
+    if ((strcmp ("mem", opt) == 0) ||
+        (strcmp ("constrain-mem", opt) == 0))
+        return (cpuset_conf_set_constrain_mem (conf, 1));
+
+    if ((strcmp ("reverse", opt) == 0) ||
+        (strcmp ("order=reverse", opt) == 0))
+        return (cpuset_conf_set_order (conf, 1));
+
+    if ((strcmp ("order=normal", opt) == 0))
+        return (cpuset_conf_set_order (conf, 0));
+
+    return (log_err ("Unknown option \"%s\"\n", opt));
+}
+
+/*
+ * Store [constrain_mem] in conf->constrain_mems.
+ * Returns 0, or -1 if [conf] is NULL.
+ */
+int cpuset_conf_set_constrain_mem (cpuset_conf_t conf, int constrain_mem)
+{
+    if (!conf)
+        return (-1);
+    conf->constrain_mems = constrain_mem;
+    return (0);
+}
+
+/*
+ * Store [kill_orphans] in conf->kill_orphans.
+ * Returns 0, or -1 if [conf] is NULL.
+ */
+int cpuset_conf_set_kill_orphans (cpuset_conf_t conf, int kill_orphans)
+{
+    if (!conf)
+        return (-1);
+    conf->kill_orphans = kill_orphans;
+    return (0);
+}
+
+/*
+ * Store [reverse] in conf->reverse_order.
+ * Returns 0, or -1 if [conf] is NULL.
+ */
+int cpuset_conf_set_order (cpuset_conf_t conf, int reverse)
+{
+    if (!conf)
+        return (-1);
+    conf->reverse_order = reverse;
+    return (0);
+}
+
+
+/*
+ * Create and Destroy:
+ */
+
+/*
+ * Allocate a configuration object holding the defaults set below and
+ * no config filename. Returns NULL if allocation fails. Release the
+ * object with cpuset_conf_destroy ().
+ */
+cpuset_conf_t cpuset_conf_create (void)
+{
+    cpuset_conf_t conf = malloc (sizeof (*conf));
+
+    if (conf == NULL)
+        return (NULL);
+
+    memset (conf->filename, 0, sizeof (conf->filename));
+    conf->filename_valid = 0;
+
+    /*
+     * Set defaults
+     */
+    conf->policy = BEST_FIT;
+    conf->reverse_order = 0;
+    conf->alloc_idle_nodes = 1;
+    conf->use_idle_if_multiple = 1;
+    conf->constrain_mems = 1;
+    conf->kill_orphans = 0;
+
+    return (conf);
+}
+
+/*
+ * Free a configuration object. A NULL [conf] is accepted.
+ */
+void cpuset_conf_destroy (cpuset_conf_t conf)
+{
+    free (conf);
+}
+
+
+/*
+ * Parsing
+ */
+
+/*
+ * Parse [file] into [conf] if the file exists.
+ * Returns 0 when the file is absent or was parsed successfully, and
+ * -1 when it exists but is unreadable or fails to parse.
+ */
+static int parse_if_exists (cpuset_conf_t conf, const char *file)
+{
+    if (access (file, F_OK) < 0)
+        return (0);
+
+    if (access (file, R_OK) < 0) {
+        log_err ("File %s exists but is not readable.\n", file);
+        return (-1);
+    }
+
+    if (cpuset_conf_parse (conf, file) < 0)
+        return (-1);
+
+    /* Successfully read config file */
+    return (0);
+}
+
+/*
+ * Parse the file named by default_config into [conf], if it exists.
+ */
+int cpuset_conf_parse_system (cpuset_conf_t conf)
+{
+    return (parse_if_exists (conf, default_config));
+}
+
+/*
+ * Return the filename recorded by cpuset_conf_set_file (), or NULL
+ * if [conf] is NULL or no filename has been recorded.
+ */
+const char * cpuset_conf_file (cpuset_conf_t conf)
+{
+    if (!conf || !conf->filename_valid)
+        return (NULL);
+    return (conf->filename);
+}
+
+/*
+ * Record [file] as the config filename of [conf]. Names longer than
+ * the filename buffer are truncated; the stored string is always
+ * NUL terminated. Does nothing if [conf] or [file] is NULL.
+ */
+void cpuset_conf_set_file (cpuset_conf_t conf, const char *file)
+{
+    if (!conf || !file)
+        return;
+    /* strncpy does not terminate on truncation, so do it by hand */
+    strncpy (conf->filename, file, sizeof (conf->filename) - 1);
+    conf->filename [sizeof (conf->filename) - 1] = '\0';
+    conf->filename_valid = 1;
+}
+
+/*
+ * Later, perhaps allow a per-user conf file in ~/.slurm/cpuset.conf...
+ */
+
+/*
+ * vi: ts=4 sw=4 expandtab
+ */
diff --git a/cpuset/conf.h b/cpuset/conf.h
new file mode 100644
index 0000000..bc8b31b
--- /dev/null
+++ b/cpuset/conf.h
@@ -0,0 +1,101 @@
+/*****************************************************************************
+ *
+ * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory.
+ * Written by Mark Grondona .
+ *
+ * UCRL-CODE-235358
+ *
+ * This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ *
+ * This is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ ****************************************************************************/
+
+
+#ifndef _CPUSET_CONF_H
+#define _CPUSET_CONF_H
+
+/*
+ * Opaque handle to a cpuset configuration object.
+ */
+typedef struct cpuset_conf * cpuset_conf_t;
+
+/*
+ * Valid allocation policies for cpusets
+ */
+enum fit_policy {
+    BEST_FIT,
+    FIRST_FIT,
+    WORST_FIT,
+};
+
+
+/*
+ * Accessor routines
+ */
+enum fit_policy cpuset_conf_policy (cpuset_conf_t conf);
+
+int cpuset_conf_alloc_idle (cpuset_conf_t conf);
+
+int cpuset_conf_constrain_mem (cpuset_conf_t conf);
+
+int cpuset_conf_alloc_idle_gt (cpuset_conf_t conf);
+
+int cpuset_conf_alloc_idle_multiple (cpuset_conf_t conf);
+
+int cpuset_conf_kill_orphans (cpuset_conf_t conf);
+
+int cpuset_conf_reverse_order (cpuset_conf_t conf);
+
+/*
+ * Setter routines. Each stores its argument in [conf]; the *_string
+ * variants take the value by name.
+ */
+int cpuset_conf_set_policy (cpuset_conf_t conf, enum fit_policy policy);
+
+int cpuset_conf_set_alloc_idle (cpuset_conf_t conf, int alloc_idle);
+
+int cpuset_conf_set_alloc_idle_mode (cpuset_conf_t conf, int multiple_only);
+
+int cpuset_conf_set_kill_orphans (cpuset_conf_t conf, int kill_orphans);
+
+int cpuset_conf_set_alloc_idle_string (cpuset_conf_t conf, const char *s);
+
+int cpuset_conf_set_policy_string (cpuset_conf_t conf, const char *name);
+
+int cpuset_conf_set_constrain_mem (cpuset_conf_t conf, int constrain_mem);
+
+int cpuset_conf_set_order (cpuset_conf_t conf, int reverse);
+/*
+ * Create and Destroy:
+ */
+cpuset_conf_t cpuset_conf_create (void);
+
+void cpuset_conf_destroy (cpuset_conf_t conf);
+
+
+/*
+ * Parsing
+ */
+
+/* Parse the config file at [path] into [conf]. */
+int cpuset_conf_parse (cpuset_conf_t conf, const char *path);
+
+/* Parse the system default config file into [conf]. */
+int cpuset_conf_parse_system (cpuset_conf_t conf);
+
+/* Parse a single option string [opt] into [conf]. */
+int cpuset_conf_parse_opt (cpuset_conf_t conf, const char *opt);
+
+/*
+ * Return filename of last config file parsed
+ */
+const char *cpuset_conf_file (cpuset_conf_t conf);
+
+void cpuset_conf_set_file (cpuset_conf_t conf, const char *file);
+
+#endif
+/*
+ * vi: ts=4 sw=4 expandtab
+ */
diff --git a/cpuset/cpuset.c b/cpuset/cpuset.c
new file mode 100644
index 0000000..7fe6442
--- /dev/null
+++
b/cpuset/cpuset.c
@@ -0,0 +1,493 @@
+/*****************************************************************************
+ *
+ * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory.
+ * Written by Mark Grondona .
+ *
+ * UCRL-CODE-235358
+ *
+ * This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ *
+ * This is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ ****************************************************************************/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+
+#include <slurm/spank.h>
+
+/* SGI libcpuset */
+#include <bitmask.h>
+#include <cpuset.h>
+
+#include "fd.h"
+#include "list.h"
+#include "split.h"
+#include "util.h"
+#include "create.h"
+#include "conf.h"
+#include "log.h"
+#include "slurm.h"
+
+SPANK_PLUGIN (cpuset, 1)
+
+/*
+ * Help message for user option
+ */
+static const char cpuset_help_string [] =
+"\
+use-cpusets: Automatically allocate cpusets to each step within a job.\n\
+\n\
+When using the SLURM cpuset.so plugin, the default behavior is to allocate\n\
+one cpuset per job, and run all subsequent job steps within the job cpuset.\n\
+When using --use-cpusets, the cpuset plugin will re-allocate CPUs and\n\
+optionally memory nodes from the job cpuset into a child cpuset for the\n\
+executing job step. This allows convenient separation of multiple job steps \n\
+being run in parallel within a single job allocation.\n\
+\n\
+By default, the same allocation options are used for job steps as are\n\
+configured for jobs. These options can be tuned by providing arguments to\n\
+the --use-cpusets option.\n\
+\n\
+Option Usage: --use-cpusets=[args...]\n\
+\n\
+where args... is a comma separated list of one or more of the following\n\
+ help Display this message.\n\
+ debug Enable verbose debugging messages.\n\
+ tasks Additionally constrain tasks to cpusets.\n\
+\n\
+ Policy options:\n\
+ best-fit Allocate tasks to most full nodes/sockets first.\n\
+ worst-fit Allocate tasks to least full nodes/sockets first.\n\
+ first-fit Allocate tasks to first free slots found.\n\
+ reverse Reverse CPU allocation order (start at last CPU).\n\
+ order=normal Normal CPU allocation order (start at first CPU).\n\
+ no-idle Do not try to allocate whole idle nodes first.\n\
+\n\
+ idle-first=[policy] Use [policy] to allocate idle nodes first, where\n\
+ policy is one of:\n\
+ gt Allocate idle nodes first if the number of \n\
+ tasks is greater than or equal to the size \n\
+ of a socket/NUMA node.\n\
+ mult Allocate idle nodes first only if the number of\n\
+ tasks in the job step is a multiple of the\n\
+ size of a socket/NUMA node.\n\
+ no Equivalent to no-idle.\n\
+\n\
+ nomem Do not also constrain memory to the local nodes of\n\
+ the selected CPUs.\n\n";
+
+/* Comma-split arguments of --use-cpusets, set by parse_user_option () */
+static List user_options = NULL;
+
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
+/* Plugin-wide configuration object */
+static cpuset_conf_t conf = NULL;
+
+//static int step_cpuset_created = 0;
+static int per_task_cpuset = 0; /* --use-cpuset=tasks */
+
+static uint32_t jobid;
+static uint32_t stepid;
+static int step_ncpus = -1;
+static int ncpus_per_task = -1;
+static int debug_level = 0;
+static int user_debug_level = 0;
+
+/*
+ * Handle one plugin option: "debug" and "debug=N" set debug_level
+ * here, anything else is passed to cpuset_conf_parse_opt ().
+ * Returns 0 for the debug forms, otherwise the parser's result.
+ */
+static int parse_one_option (const char *opt)
+{
+    if (strncmp ("debug=", opt, 6) == 0)
+        debug_level = str2int (opt + 6);
+    else if (strcmp ("debug", opt) == 0)
+        debug_level = 1;
+    else
+        return (cpuset_conf_parse_opt (conf, opt));
+
+    return (0);
+}
+
+/*
+ * Apply the [ac] plugin arguments in [av]. Results of individual
+ * options are ignored, so one bad option does not stop the rest.
+ * Always returns 0.
+ */
+static int parse_options (int ac, char **av)
+{
+    int i;
+    for (i = 0; i < ac; i++)
+        parse_one_option (av[i]);
+    return (0);
+}
+
+/*
+ * XXX: Since we don't have a good way to determine the number of
+ * CPUs allocated to this job on this node, we have to query
+ * the slurm controller (!).
+ *
+ * Returns the CPU count for job [jobid], or a negative value when it
+ * cannot be determined.
+ */
+static int query_ncpus_per_node (spank_t sp, uint32_t jobid)
+{
+    const char var[] = "SLURM_JOB_CPUS_PER_NODE";
+    char val[16];
+    job_info_msg_t * msg;
+    int cpus_per_node = -1;
+    uint32_t i;
+
+    /*
+     * If SLURM_JOB_CPUS_PER_NODE is set in environment,
+     * return that value so we don't have to contact SLURM controller.
+     */
+    if (spank_getenv (sp, var, val, sizeof (val)) == ESPANK_SUCCESS) {
+        cpuset_debug ("SLURM_JOB_CPUS_PER_NODE=%s\n", val);
+        return (str2int (val));
+    }
+
+    /*
+     * Otherwise, we have to query all jobs and find the right job record.
+     */
+    if (dyn_slurm_load_jobs (&msg) < 0) {
+        cpuset_error ("slurm_load_jobs: %s\n", slurm_strerror (errno));
+        return (-1);
+    }
+
+    for (i = 0; i < msg->record_count; i++) {
+        job_info_t *j = &msg->job_array[i];
+
+        if (j->job_id == jobid) {
+            /*
+             * XXX: Assumes cpus_per_node is the same across the whole job.
+             */
+            cpus_per_node = (int) j->cpus_per_node[0];
+            break;
+        }
+    }
+
+    dyn_slurm_free_job_info_msg (msg);
+    if (cpus_per_node < 0)
+        cpuset_error ("Failed to get nCPUs for this node: %s\n", slurm_strerror (errno));
+    return (cpus_per_node);
+}
+
+/*
+ * Move process [pid] (0 is passed for the calling process) into the
+ * cpuset named "<base>/<jobid>", where <base> is the current cpuset
+ * path when that already starts with "/slurm", and "/slurm/<uid>"
+ * otherwise.
+ * Returns 0 on success and -1 on failure.
+ */
+int migrate_job_to_cpuset (uint32_t jobid, uid_t uid, pid_t pid)
+{
+    int rc;
+    char path[4096];
+    int n = 0;
+
+    /*
+     * Start from an empty string so a failed lookup is detectable
+     * instead of leaving [path] uninitialized. The path is expected
+     * to be absolute within the cpuset hierarchy.
+     */
+    path[0] = '\0';
+    cpuset_getcpusetpath (0, path, sizeof (path));
+    if (path[0] != '/') {
+        cpuset_error ("job%u: Failed to get current cpuset path: %s\n",
+                jobid, strerror (errno));
+        return (-1);
+    }
+
+    if (pid)
+        cpuset_debug ("Migrate: Moving %d from cpuset %s\n", (int) pid, path);
+    else
+        cpuset_debug ("Migrate: Moving from cpuset %s\n", path);
+    /*
+     * If we're not under /slurm, prepend user cpuset
+     */
+    if (strncmp (path, "/slurm", 6) != 0)
+        n = snprintf (path, sizeof (path), "/slurm/%d", (int) uid);
+    else
+        n = strlen (path);
+
+    /*
+     * Now everything happens relative to current cpuset
+     */
+    rc = snprintf (path + n, sizeof (path) - n, "/%u", jobid);
+
+    /* Only sizeof (path) - n bytes were available for this append */
+    if (rc < 0 || (size_t) rc >= sizeof (path) - n) {
+        cpuset_error ("job%u: snprintf failed: %s\n", jobid, strerror (errno));
+        return (-1);
+    }
+
+    if (pid)
+        cpuset_debug ("Migrate: Moving %d to cpuset %s\n", (int) pid, path);
+    else
+        cpuset_debug ("Migrate: Moving to cpuset %s\n", path);
+
+    if (cpuset_move (pid, path) < 0)
+        return (-1);
+    return (0);
+}
+
+/*
+ * Return the value of SLURM_CPUS_PER_TASK from the job environment.
+ * The value is cached in ncpus_per_task after the first successful
+ * lookup. Returns -1 if the variable is not available.
+ */
+static int job_ncpus_per_task (spank_t sp)
+{
+    const char var[] = "SLURM_CPUS_PER_TASK";
+    char val [128];
+
+    if (ncpus_per_task < 0) {
+        if (spank_getenv (sp, var, val, sizeof (val)) != ESPANK_SUCCESS) {
+            //cpuset_error ("getenv (SLURM_CPUS_PER_TASK) failed\n");
+            return (-1);
+        }
+        ncpus_per_task = str2int(val);
+    }
+    return (ncpus_per_task);
+}
+
+/*
+ * Return cpus-per-task multiplied by the local task count for the
+ * step, or -1 if either value is unavailable.
+ */
+static int job_step_ncpus (spank_t sp)
+{
+    uint32_t ntasks;
+    int cpus_per_task;
+
+    if (spank_get_item (sp, S_JOB_LOCAL_TASK_COUNT, &ntasks) != ESPANK_SUCCESS)
+        return (-1);
+
+    /* Do not multiply the -1 error value by an unsigned task count */
+    if ((cpus_per_task = job_ncpus_per_task (sp)) < 0)
+        return (-1);
+
+    return (cpus_per_task * (int) ntasks);
+}
+
+/* Log destination that forwards [msg] to slurm_info (). */
+static int log_slurm (const char *msg)
+{
+    slurm_info ("%s", msg);
+    return (0);
+}
+
+/* Log destination that writes [msg] to stderr. */
+static int log_stderr (const char *msg)
+{
+    fprintf (stderr, "%s", msg);
+    return (0);
+}
+
+/*
+ * SPANK init callback. Does nothing unless spank_remote () is true.
+ * Otherwise it loads the configuration, then, while holding the lock
+ * returned by slurm_cpuset_create (), moves into the cpuset for this
+ * job, creating that cpuset first if the move fails.
+ * Returns 0 on success and a negative value on failure.
+ */
+int slurm_spank_init (spank_t sp, int ac, char *av[])
+{
+    int rc;
+    int lockfd;
+    uid_t uid;
+
+    if (!spank_remote (sp))
+        return (0);
+
+    log_add_dest (1, log_slurm);
+
+    if ((conf = cpuset_conf_create ()) == NULL) {
+        cpuset_error ("Failed to create cpuset configuration\n");
+        return (-1);
+    }
+    cpuset_conf_parse_system (conf);
+
+    parse_options (ac, av);
+
+    log_update (debug_level, log_slurm);
+
+    /*
+     * Get jobid
+     */
+    if (spank_get_item (sp, S_JOB_ID, &jobid) != ESPANK_SUCCESS) {
+        cpuset_error ("Failed to get jobid: %s\n", strerror (errno));
+        return (-1);
+    }
+
+    if (spank_get_item (sp, S_JOB_STEPID, &stepid) != ESPANK_SUCCESS) {
+        cpuset_error ("Failed to get stepid: %s\n", strerror (errno));
+        return (-1);
+    }
+
+    if (spank_get_item (sp, S_JOB_UID, &uid) != ESPANK_SUCCESS) {
+        cpuset_error ("Failed to get uid: %s\n", strerror (errno));
+        return (-1);
+    }
+
+    cpuset_debug ("Attempting to create slurm cpuset\n");
+    /*
+     * Try to migrate to existing cpuset for this job. If
+     * successful, then we're done.
+     */
+    if ((lockfd = slurm_cpuset_create (conf)) < 0) {
+        cpuset_error ("Failed to create/lock slurm cpuset: %s\n",
+                strerror (errno));
+        return (-1);
+    }
+
+    if ((rc = migrate_job_to_cpuset (jobid, uid, 0)) != 0) {
+        /*
+         * No existing job cpuset on this node, create one:
+         */
+        int ncpus = query_ncpus_per_node (sp, jobid);
+
+        if (ncpus < 0) {
+            cpuset_error ("Failed to determine CPU count for job %u\n", jobid);
+            rc = -1;
+            goto done;
+        }
+
+        cpuset_debug ("Creating cpuset for job=%u uid=%d ncpus=%d\n",
+                jobid, (int) uid, ncpus);
+
+        if ((rc = create_cpuset_for_job (conf, jobid, uid, ncpus)) < 0)
+            goto done;
+
+        if ((rc = migrate_job_to_cpuset (jobid, uid, 0)) < 0) {
+            log_err ("Failed to migrate jobid %u to cpuset: %s\n",
+                    jobid, strerror (errno));
+            goto done;
+        }
+    }
+
+    step_ncpus = job_step_ncpus (sp);
+
+done:
+    slurm_cpuset_unlock (lockfd);
+    return (rc);
+}
+
+/*
+ * User optional per-step cpuset option parsing
+ * Options are processed *after* slurm_spank_init completes,
+ * so we have to create the step cpuset within the option
+ * handler.
+ */
+
+/*
+ * Apply every entry of user_options, then destroy the list.
+ * "help" (only when not [remote]) prints the help text and exits;
+ * "tasks" enables per-task cpusets; "debug" and "debug=N" set
+ * user_debug_level; anything else goes to parse_one_option ().
+ * Returns 0 if all options were accepted and -1 otherwise.
+ *
+ * NOTE: user_options is deliberately not reset to NULL after the
+ * list is destroyed, because slurm_spank_init_post_opt () tests the
+ * pointer to see whether the option was given. It must not be
+ * dereferenced after this call.
+ */
+static int set_user_options (int remote)
+{
+    char *opt;
+    ListIterator i;
+    int rc = 0;
+
+    if (user_options == NULL)
+        return (0);
+
+    i = list_iterator_create (user_options);
+    while ((opt = list_next (i))) {
+        if (!remote && (strcmp (opt, "help") == 0)) {
+            /* Plain text: never pass it as a printf format string */
+            fputs (cpuset_help_string, stderr);
+            exit (0);
+        }
+        else if (strcmp (opt, "tasks") == 0)
+            per_task_cpuset = 1;
+        else if (strncmp ("debug=", opt, 6) == 0)
+            user_debug_level = str2int (opt + 6);
+        else if (strcmp ("debug", opt) == 0)
+            user_debug_level = 1;
+        else if (parse_one_option (opt) < 0)
+            rc = -1;
+    }
+    /*
+     * Done with user_options now.
+     */
+    list_destroy (user_options);
+    return (rc);
+}
+
+/*
+ * Callback for the --use-cpusets option. Splits [optarg] on commas
+ * and applies each argument. When running remotely on a SLURM
+ * without slurm_spank_init_post_opt support, the step cpuset is
+ * also created and entered here.
+ * Returns -1 on failure. When the step cpuset is not created here
+ * the initial value 1 is returned.
+ */
+static int parse_user_option (int val, const char *optarg, int remote)
+{
+    int rc = 1;
+
+    log_add_dest (0, log_stderr);
+
+    if (optarg) {
+        char *str;
+
+        if ((str = strdup (optarg)) == NULL) {
+            cpuset_error ("Failed to copy option string: %s\n",
+                    strerror (errno));
+            return (-1);
+        }
+        user_options = list_split (",", str);
+        free (str);
+
+        /*
+         * If running 'local' (i.e. in srun), then we may
+         * not yet have created a cpuset configuration object.
+         * We'll need this to test-parse options, so create it now.
+         */
+        if (!conf) conf = cpuset_conf_create ();
+
+        if (set_user_options (remote) < 0)
+            return (-1);
+
+    }
+
+    log_update (user_debug_level, log_stderr);
+
+    if (remote && !spank_symbol_supported ("slurm_spank_init_post_opt")) {
+        /*
+         * Must create job step cpuset in option handler unless
+         * init_post_opt callback exists in this version of SLURM.
+         */
+        int lockfd = slurm_cpuset_lock ();
+
+        /* Same guard as slurm_spank_init_post_opt () */
+        if (lockfd < 0)
+            return (-1);
+
+        if (debug_level > 0 || user_debug_level > 0)
+            print_current_cpuset_info ();
+        if ((rc = create_cpuset_for_step (conf, stepid, step_ncpus)) < 0) {
+            /*
+             * If step cpuset creation failed, ensure we don't try
+             * to create per-task cpuset.
+             */
+            cpuset_error ("Failed to create cpuset for step %u: %s\n",
+                    stepid, strerror (errno));
+            per_task_cpuset = 0;
+        }
+        else
+            rc = migrate_job_to_cpuset (stepid, -1, 0);
+        slurm_cpuset_unlock (lockfd);
+    }
+
+    return (rc);
+}
+
+/*
+ * Options registered with SLURM by this plugin.
+ */
+struct spank_option spank_options [] = {
+    { "use-cpusets", "[args..]",
+      "Use per-job-step and per-task cpusets. (args=`help' for more info)",
+      2, 0, (spank_opt_cb_f) parse_user_option
+    },
+    SPANK_OPTIONS_TABLE_END
+};
+
+/*
+ * SPANK post-option callback. When running remotely and
+ * --use-cpusets was given, create the step cpuset and move into it
+ * while holding the cpuset lock.
+ * Returns 0 when there is nothing to do, otherwise the result of
+ * the create/migrate sequence, or -1 if the lock cannot be taken.
+ */
+int slurm_spank_init_post_opt (spank_t sp, int ac, char **av)
+{
+    int lockfd;
+    int rc;
+
+    if (!spank_remote (sp) || !user_options)
+        return (0);
+
+    if ((lockfd = slurm_cpuset_lock ()) < 0)
+        return (-1);
+
+    if (debug_level > 0 || user_debug_level > 0)
+        print_current_cpuset_info ();
+
+    if ((rc = create_cpuset_for_step (conf, stepid, step_ncpus)) < 0)
+        per_task_cpuset = 0;
+    else
+        rc = migrate_job_to_cpuset (stepid, -1, 0);
+
+    if (debug_level > 0)
+        print_current_cpuset_info ();
+
+    slurm_cpuset_unlock (lockfd);
+
+    return (rc);
+}
+
+/*
+ * SPANK post-fork callback. When per-task cpusets were requested,
+ * create a cpuset for the task and move the task's pid into it
+ * while holding the cpuset lock.
+ * Returns 0 when per-task cpusets are off, -1 if the task id, task
+ * pid or lock cannot be obtained, otherwise the create/migrate
+ * result.
+ */
+int slurm_spank_task_post_fork (spank_t sp, int ac, char **av)
+{
+    pid_t task_pid;
+    int taskid;
+    int lockfd;
+    int cpus_per_task;
+    int rc;
+
+    if (!per_task_cpuset)
+        return (0);
+
+    if (spank_get_item (sp, S_TASK_ID, &taskid) != ESPANK_SUCCESS) {
+        cpuset_error ("Failed to get taskid\n");
+        return (-1);
+    }
+
+    if (spank_get_item (sp, S_TASK_PID, &task_pid) != ESPANK_SUCCESS) {
+        cpuset_error ("Failed to get task pid\n");
+        return (-1);
+    }
+
+    if ((lockfd = slurm_cpuset_lock ()) < 0)
+        return (-1);
+
+    cpus_per_task = job_ncpus_per_task (sp);
+
+    if ((rc = create_cpuset_for_task (conf, taskid, cpus_per_task)) == 0)
+        rc = migrate_job_to_cpuset (taskid, -1, task_pid);
+
+    slurm_cpuset_unlock (lockfd);
+
+    return (rc);
+
+}
+
+/*
+ * vi: ts=4 sw=4 expandtab
+ */
diff --git a/cpuset/cpuset.init b/cpuset/cpuset.init
new file mode 100644
index 0000000..6759809
--- /dev/null
+++ b/cpuset/cpuset.init
@@ -0,0 +1,47 @@
+#!/bin/sh
+###############################################################################
+# chkconfig: 12345 01 99
+###############################################################################
+### BEGIN INIT INFO
+# Provides: slurm-cpuset
+# Required-Start: $named $time
+# Default-Start: 3 4 5
+# Default-Stop: 0 1 2 6
+# Description: Mount /dev/cpuset filesystem
+### END INIT INFO
+###############################################################################
+
+# The file "cpus" exists at the top of a mounted cpuset filesystem,
+# so its presence is used as the "mounted" test below.
+
+case "$1" in
+  start)
+    echo -n "Mounting /dev/cpuset filesystem: "
+    # Skip the mount when already mounted so a repeated "start" succeeds
+    if [ ! -f /dev/cpuset/cpus ]; then
+      mkdir -m 0755 -p /dev/cpuset
+      mount -t cpuset none /dev/cpuset
+      if [ $? -ne 0 ]; then
+        echo "Failed"
+        exit 1
+      fi
+    fi
+
+    # Spread slab allocations over all memory nodes
+    echo 1 > /dev/cpuset/memory_spread_slab
+    echo "Success"
+    ;;
+
+  stop)
+    # Do nothing
+    exit 0;
+    ;;
+
+  status)
+    if [ -f /dev/cpuset/cpus ]; then
+      echo "cpuset filesystem is mounted."
+      exit 0
+    fi
+    echo "cpuset filesystem is not mounted."
+    # LSB status code 3: service is not running
+    exit 3
+    ;;
+
+  *)
+    echo "Usage: $0 start|stop|status"
+    exit 1
+    ;;
+esac
+
+exit 0
diff --git a/cpuset/create.c b/cpuset/create.c
new file mode 100644
index 0000000..edbe1ed
--- /dev/null
+++ b/cpuset/create.c
@@ -0,0 +1,411 @@
+/*****************************************************************************
+ *
+ * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory.
+ * Written by Mark Grondona .
+ *
+ * UCRL-CODE-235358
+ *
+ * This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ *
+ * This is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ ****************************************************************************/
+
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+/* SGI libcpuset */
+#include <bitmask.h>
+#include <cpuset.h>
+
+#include "log.h"
+#include "conf.h"
+#include "create.h"
+#include "util.h"
+#include "nodemap.h"
+
+/*
+ * Write the /dev/cpuset relative path for job, step, or task [id]
+ * into [path], which holds [len] bytes.
+ * If the caller is in the "/" or "/slurm" cpuset the result is
+ * "/slurm/<uid>/<id>", otherwise it is "<current cpuset>/<id>".
+ * Returns 0 on success, and -1 if the current cpuset cannot be
+ * determined or the result does not fit.
+ */
+static int job_cpuset_path (uint32_t id, uid_t uid, char *path, int len)
+{
+    int n;
+    char buf [4096];
+
+    buf [0] = '\0';
+    if (cpuset_getcpusetpath (0, buf, sizeof (buf)) < 0)
+        return (-1);
+    /*
+     * NOTE(review): confirm the return type of
+     * cpuset_getcpusetpath (); if it returns a pointer the test
+     * above never fires. A cpuset path is expected to be absolute,
+     * so also reject a buffer that was left empty.
+     */
+    if (buf [0] != '/')
+        return (-1);
+
+    /*
+     * If we are in root or /slurm cpuset, prepend path to user cpuset
+     */
+    if (strcmp (buf, "/") == 0 || strcmp (buf, "/slurm") == 0) {
+        n = snprintf (buf, sizeof (buf), "/slurm/%d", (int) uid);
+        if ((n < 0) || ((size_t) n >= sizeof (buf)))
+            return (-1);
+    }
+
+    n = snprintf (path, len, "%s/%u", buf, id);
+    if ((n < 0) || (n >= len))
+        return (-1);
+
+    return (0);
+}
+
+/*
+ * Return a struct cpuset with cpus set to those in [alloc] and
+ * memory constrained to local memories if constrain_mems == 1.
+ */
+static struct cpuset *
+do_cpuset_create (cpuset_conf_t cf, const struct bitmask *alloc)
+{
+    struct cpuset *cp;
+    struct bitmask *mems;
+
+    if ((cp = cpuset_alloc ()) == NULL) {
+        cpuset_error ("Failed to alloc job cpuset: %m");
+        return (NULL);
+    }
+
+    if (cpuset_setcpus (cp, alloc) < 0) {
+        cpuset_error ("Failed to set cpus: %m");
+        goto fail1;
+    }
+
+    if ((mems = bitmask_alloc (cpuset_mems_nbits ())) == NULL) {
+        cpuset_error ("failed to alloc mems bitmask: %m");
+        goto fail1;
+    }
+
+    /*
+     * Either restrict memory to the nodes local to [alloc], or use
+     * the memory nodes returned by cpuset_getmems (NULL, ...).
+     */
+    if (cpuset_conf_constrain_mem (cf)) {
+        if (cpuset_localmems (alloc, mems) < 0) {
+            cpuset_error ("cpuset_localmems failed: %m");
+            goto fail2;
+        }
+    } else {
+        if (cpuset_getmems (NULL, mems) < 0) {
+            cpuset_error ("cpuset_getmems: %m");
+            goto fail2;
+        }
+    }
+
+    if (cpuset_setmems (cp, mems) < 0) {
+        cpuset_error ("cpuset_setmems failed: %m");
+        goto fail2;
+    }
+
+    cpuset_set_iopt (cp, "notify_on_release", 1);
+
+    bitmask_free (mems);
+    return (cp);
+
+fail2:
+    bitmask_free (mems);
+fail1:
+    cpuset_free (cp);
+    return (NULL);
+}
+
+/*
+ * Return 1 if cpuset_query () succeeds on the cpuset path for job
+ * [jobid] of user [uid], and 0 otherwise (including when the path
+ * or the query object cannot be built).
+ */
+int job_cpuset_exists (uint32_t jobid, uid_t uid)
+{
+    char path [4096];
+    struct cpuset *cp;
+    int rc;
+
+    if (job_cpuset_path (jobid, uid, path, sizeof (path)) < 0) {
+        cpuset_error ("Failed to geneerate job cpuset path\n");
+        return (0);
+    }
+
+    if ((cp = cpuset_alloc ()) == NULL) {
+        cpuset_error ("Failed to alloc cpuset: %m");
+        return (0);
+    }
+    rc = cpuset_query (cp, path);
+    cpuset_free (cp);
+
+    return (rc == 0);
+}
+
+/*
+ * Create a job cpuset for job [jobid] user [uid] with cpus in [alloc]
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+job_cpuset_create (cpuset_conf_t cf, uint32_t jobid, uid_t uid,
+        const struct bitmask *alloc)
+{
+    /* Assume failure so every error path returns a defined value */
+    int rc = -1;
+    struct cpuset *cp;
+    char path [4096];
+    mode_t oldmask;
+
+    /* do_cpuset_create () reports failure with NULL */
+    if ((cp = do_cpuset_create (cf, alloc)) == NULL)
+        return (-1);
+
+    if (job_cpuset_path (jobid, uid, path, sizeof (path)) < 0) {
+        cpuset_error ("Failed to generate job cpuset path: %s\n",
+                strerror (errno));
+        goto out;
+    }
+
+    oldmask = umask (022);
+    if (cpuset_create (path, cp) < 0)
+        cpuset_error ("create [%s]: %s", path, strerror (errno));
+    else
+        rc = 0;
+    umask (oldmask);
+
+    print_cpuset_info (path, cp);
+
+out:
+    cpuset_free (cp);
+    return (rc);
+}
+
+/*
+ * Create a cpuset for [id] user [uid] with ncpus.
+ * The CPUs are chosen with nodemap_allocate (). A [uid] that is
+ * negative when converted to int means "no user": the user cpuset
+ * update is skipped.
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+create_cpuset (cpuset_conf_t cf, unsigned int id, uid_t uid, int ncpus)
+{
+    struct nodemap *map;
+    struct bitmask *alloc = NULL;
+    int rc = -1;
+
+    if (!(map = nodemap_create (cf, NULL)))
+        return (-1);
+
+    if ((alloc = nodemap_allocate (map, ncpus)) == NULL)
+        goto out;
+
+    /*
+     * Create and/or update user cpuset, under which job cpuset will
+     * be created.
+     */
+    if ((int) uid >= 0) {
+        cpuset_debug ("Updating user %d cpuset with %d cpus\n", uid, ncpus);
+        if (user_cpuset_update (cf, uid, alloc) < 0) {
+            cpuset_error ("Failed to update user cpuset");
+            goto out;
+        }
+    }
+
+    if (job_cpuset_create (cf, id, uid, alloc) < 0)
+        goto out;
+
+    rc = 0;
+out:
+    if (map)
+        nodemap_destroy (map);
+    if (alloc)
+        bitmask_free (alloc);
+
+    if (rc < 0)
+        log_debug2 ("create_cpuset: id=%u uid=%d ncpus=%d: Failed.\n",
+                id, uid, ncpus);
+
+    return (rc);
+}
+
+/*
+ * Create the cpuset for job [jobid] of user [uid] with [ncpus] CPUs.
+ */
+int create_cpuset_for_job (cpuset_conf_t cf, unsigned jobid, uid_t uid,
+        int ncpus)
+{
+    return (create_cpuset (cf, jobid, uid, ncpus));
+}
+
+/*
+ * Create the cpuset for step [stepid] with [ncpus] CPUs. No user
+ * cpuset is updated (uid is passed as -1).
+ */
+int create_cpuset_for_step (cpuset_conf_t cf, unsigned int stepid, int ncpus)
+{
+    return (create_cpuset (cf, stepid, -1, ncpus));
+}
+
+/*
+ * Create the cpuset for task [taskid] with [ncpus] CPUs. No user
+ * cpuset is updated (uid is passed as -1).
+ */
+int create_cpuset_for_task (cpuset_conf_t cf, unsigned int taskid, int ncpus)
+{
+    return (create_cpuset (cf, taskid, -1, ncpus));
+}
+
+/*
+ * Rename the user cpuset directory [path] to
+ * /dev/cpuset/slurm/orphan:<uid>.
+ * Returns 0 on success and -1 if the name cannot be built or the
+ * rename fails.
+ */
+static int user_cpuset_orphan (uid_t uid, const char *path)
+{
+    char orphan [1024];
+    int n;
+    n = snprintf (orphan, sizeof (orphan), "/dev/cpuset/slurm/orphan:%d", uid);
+    if ((n <= 0) || ((size_t) n >= sizeof (orphan)))
+        return (-1);
+    if (rename (path, orphan) < 0) {
+        cpuset_error ("Failed to rename %s to %s: %m", path, orphan);
+        return (-1);
+    }
+    return (0);
+}
+
+/*
+ * Send SIGKILL to every pid listed for cpuset [name].
+ * Returns 0, or -1 if the pid list cannot be read.
+ */
+static int kill_orphan (const char *name)
+{
+    struct cpuset_pidlist *pids;
+    int i;
+
+    if ((pids = cpuset_init_pidlist (name, 0)) == NULL) {
+        cpuset_error ("cpuset_init_pidlist: %s: %s\n",
+                name, strerror (errno));
+        return (-1);
+    }
+
+    for (i = 0; i < cpuset_pidlist_length (pids); i++)
+        kill (cpuset_get_pidlist (pids, i), SIGKILL);
+
+    cpuset_freepidlist (pids);
+    return (0);
+}
+
+/*
+ * Rename /dev/cpuset/slurm/orphan:<uid> back to [path].
+ * Returns 1 if the rename succeeded and 0 otherwise. The result is
+ * used as a boolean by the caller, so it is never negative.
+ */
+static int user_cpuset_unorphan (uid_t uid, const char *path)
+{
+    char orphan [1024];
+    int n;
+    n = snprintf (orphan, sizeof (orphan), "/dev/cpuset/slurm/orphan:%d", uid);
+    if ((n <= 0) || ((size_t) n >= sizeof (orphan)))
+        return (0);
+    cpuset_debug ("rename (%s, %s)\n", orphan, path);
+    if (rename (orphan, path) < 0)
+        return (0);
+    return (1);
+}
+
+/*
+ * If user cpuset does not exist, keep its cpus and mems empty
+ * They'll be filled in later.
+ *
+ * Creates directory [path] with mode 0755; an already existing
+ * directory is not an error. Returns 0 on success, -1 on failure.
+ */
+static int user_cpuset_create (const char *path)
+{
+    int rc = 0;
+    mode_t oldmask = umask (022);
+
+    if ((mkdir (path, 0755)) < 0 && errno != EEXIST) {
+        cpuset_error ("mkdir %s: %m", path);
+        rc = -1;
+    }
+    umask (oldmask);
+    return (rc);
+}
+
+
+/*
+ * Bring the cpuset /dev/cpuset/slurm/<uid> up to date.
+ * Its CPUs become those reported in use below that path by
+ * used_cpus_bitmask_path (), plus [alloc] when [alloc] is not NULL.
+ * If the resulting set is empty the cpuset is orphaned: its tasks
+ * are killed when kill_orphans is configured, otherwise the cpuset
+ * is renamed out of the way.
+ * Returns 0 on success (including the orphan case), -1 on failure.
+ */
+int
+user_cpuset_update (cpuset_conf_t cf, uid_t uid, const struct bitmask *alloc)
+{
+    int rc = -1;
+    char path [1024];
+    const char *name;
+    struct bitmask *used;
+    struct cpuset *cp;
+    int orphan = 0;
+
+    snprintf (path, sizeof (path), "/dev/cpuset/slurm/%d", uid);
+    name = cpuset_path_to_name (path);
+
+    /*
+     * If there is an orphan user login, move it back
+     * Otherwise, create regular user cpuset if it doesn't
+     * already exist.
+     */
+    orphan = (user_cpuset_unorphan (uid, path) > 0);
+    if (!orphan && (user_cpuset_create (path) < 0))
+        return (-1);
+
+    cpuset_debug ("Updating user cpuset at %s\n", path);
+    if ((used = used_cpus_bitmask_path (path, 1)) == NULL) {
+        cpuset_error ("Failed to get used cpus for %s\n", path);
+        return (-1);
+    }
+    if (orphan)
+        bitmask_clearall (used);
+    if (alloc)
+        bitmask_or (used, used, alloc);
+
+    if (bitmask_weight (used) == 0) {
+        /*
+         * This is an orphaned user cpuset.
+         * We can't leave it with 0 cpus, and
+         * we can't leave it allocated under /slurm
+         * since those cpusets are used for tracking
+         * in-use cpusets. Instead, just rename the
+         * current cpuset to an orphans directory.
+         */
+        cpuset_debug ("user_cpuset_orphan: uid=%d\n", uid);
+        if (cpuset_conf_kill_orphans (cf))
+            kill_orphan (name);
+        else
+            user_cpuset_orphan (uid, path);
+        /* [used] is no longer needed on this path either */
+        bitmask_free (used);
+        return (0);
+    }
+
+    if (!(cp = do_cpuset_create (cf, used))) {
+        bitmask_free (used);
+        return (-1);
+    }
+
+again:
+    if ((rc = cpuset_modify (name, cp)) < 0) {
+        /*
+         * cpuset_modify can potentially return EBUSY.
+         * Retry once a second for as long as it does.
+         */
+        if (errno == EBUSY || errno == EAGAIN) {
+            sleep (1);
+            goto again;
+        }
+        cpuset_error ("Failed to modify %s: %m", name);
+    }
+
+    bitmask_free (used);
+    cpuset_free (cp);
+    return (rc);
+}
+
+/*
+ * Run user_cpuset_update () with no new allocation for every entry
+ * of /dev/cpuset/slurm whose name str2int () converts to a
+ * non-negative number. Results of the individual updates are
+ * ignored.
+ * Returns 0, or -1 if the directory cannot be opened.
+ */
+int update_user_cpusets (cpuset_conf_t cf)
+{
+    DIR *dirp;
+    struct dirent *dp;
+
+    if ((dirp = opendir ("/dev/cpuset/slurm")) == NULL) {
+        cpuset_error ("Unable to open /dev/cpuset/slurm: %m");
+        return (-1);
+    }
+
+    while ((dp = readdir (dirp))) {
+        int uid;
+        if ((uid = str2int (dp->d_name)) < 0)
+            continue;
+        cpuset_debug ("Checking cpuset for uid %d\n", uid);
+        user_cpuset_update (cf, uid, NULL);
+    }
+    closedir (dirp);
+
+    return (0);
+}
+
+/*
+ * vi: ts=4 sw=4 expandtab
+ */
diff --git a/cpuset/create.h b/cpuset/create.h
new file mode 100644
index 0000000..62ab5de
--- /dev/null
+++ b/cpuset/create.h
@@ -0,0 +1,51 @@
+/*****************************************************************************
+ *
+ * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory.
+ * Written by Mark Grondona .
+ *
+ * UCRL-CODE-235358
+ *
+ * This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ *
+ * This is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ ****************************************************************************/
+
+
+#ifndef _HAVE_CREATE_H
+#define _HAVE_CREATE_H
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <bitmask.h>
+
+#include "conf.h"
+
+/* Return nonzero if a cpuset exists for job [jobid] of user [uid]. */
+int job_cpuset_exists (uint32_t jobid, uid_t uid);
+
+/* Create the cpuset for job [jobid] of user [uid] with [ncpus] CPUs. */
+int create_cpuset_for_job (cpuset_conf_t cf,
+        unsigned int jobid, uid_t uid, int ncpus);
+
+/* Create the cpuset for job step [stepid] with [ncpus] CPUs. */
+int create_cpuset_for_step (cpuset_conf_t cf,
+        unsigned int stepid, int ncpus);
+
+/* Create the cpuset for task [taskid] with [ncpus_per_task] CPUs. */
+int create_cpuset_for_task (cpuset_conf_t cf,
+        unsigned int taskid, int ncpus_per_task);
+
+/* Update the cpuset of user [uid]; [b] may be NULL for no new CPUs. */
+int user_cpuset_update (cpuset_conf_t cf,
+        uid_t uid, const struct bitmask *b);
+
+/* Update the cpusets of all users found under /dev/cpuset/slurm. */
+int update_user_cpusets (cpuset_conf_t cf);
+
+#endif
diff --git a/cpuset/log.c b/cpuset/log.c
new file mode 100644
index 0000000..36a3fd9
--- /dev/null
+++ b/cpuset/log.c
@@ -0,0 +1,232 @@
+/*****************************************************************************
+ *
+ * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory.
+ * Written by Mark Grondona .
+ *
+ * UCRL-CODE-235358
+ *
+ * This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ *
+ * This is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ ****************************************************************************/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include "list.h"
+#include "log.h"
+
+/* Prefix written at the start of every message */
+static char facility [64] = "cpuset";
+
+/* One log destination: a callback and the highest level it accepts */
+struct logger {
+    int level;
+    log_f *logfn;
+};
+
+static List log_list = NULL;
+
+/*
+ * Allocate a logger for callback [fn] at [level].
+ * Returns NULL if allocation fails.
+ */
+static struct logger * logger_create (int level, log_f *fn)
+{
+    struct logger *l = malloc (sizeof (*l));
+
+    if (l != NULL) {
+        l->level = level;
+        l->logfn = fn;
+    }
+
+    return (l);
+}
+
+/* Free a logger; installed as the element destructor of log_list. */
+void logger_destroy (struct logger *l)
+{
+    free (l);
+}
+
+/*
+ * Register callback [fn] as a log destination at [level]. The
+ * destination list is created on first use.
+ * Returns 0 on success and -1 if allocation fails.
+ */
+int log_add_dest (int level, log_f *fn)
+{
+    struct logger *l;
+
+    if (log_list == NULL) {
+        log_list = list_create ((ListDelF) logger_destroy);
+        if (log_list == NULL)
+            return (-1);
+    }
+
+    if ((l = logger_create (level, fn)) == NULL)
+        return (-1);
+
+    list_push (log_list, l);
+    return (0);
+}
+
+/*
+ * Replace the message prefix with [prefix]. A prefix that does not
+ * fit in the facility buffer is truncated. Always returns 0.
+ */
+int log_set_prefix (const char *prefix)
+{
+    /* strncpy does not terminate on truncation, so do it by hand */
+    strncpy (facility, prefix, sizeof (facility) - 1);
+    facility [sizeof (facility) - 1] = '\0';
+    return (0);
+}
+
+/* List search predicate: true if logger [l] uses callback [fn]. */
+int find_fn (struct logger *l, log_f *fn)
+{
+    return (l->logfn == fn);
+}
+
+/*
+ * Change the level of the first destination registered with
+ * callback [fn]. Returns 0, or -1 if no such destination exists.
+ */
+int log_update (int level, log_f *fn)
+{
+    struct logger *l;
+
+    /* Nothing has been registered yet */
+    if (log_list == NULL)
+        return (-1);
+
+    l = list_find_first (log_list, (ListFindF) find_fn, fn);
+    if (l == NULL)
+        return (-1);
+
+    l->level = level;
+    return (0);
+}
+
+
+/*
+ * Destroy all log destinations. Safe to call when nothing was
+ * registered, and safe to call more than once.
+ */
+void log_cleanup (void)
+{
+    if (log_list == NULL)
+        return;
+    list_destroy (log_list);
+    log_list = NULL;
+}
+
+/*
+ * Pass [buf] to every destination whose level is at least [level].
+ */
+static int do_log_all (int level, const char *buf)
+{
+    struct logger *l;
+    ListIterator i = list_iterator_create (log_list);
+
+    while ((l = list_next (i))) {
+        if (l->level >= level)
+            (*l->logfn) (buf);
+    }
+
+    list_iterator_destroy (i);
+    return (0);
+}
+
+/*
+ * Format one message as "<facility>: <prefix>: <text>" into a fixed
+ * buffer and hand it to do_log_all () at [level]. [prefix] may be
+ * NULL. A message that does not fit is cut short and ends in "+".
+ * Does nothing until a destination has been registered.
+ */
+static void vlog_msg (const char *prefix, int level, const char *format, va_list ap)
+{
+    char buf[4096];
+    char *p;
+    int n;
+    int len;
+
+    if (!log_list)
+        return;
+
+    /* [p] is the write position, [len] the space left in [buf] */
+    p = buf;
+    len = sizeof (buf);
+
+    if (strlen (facility)) {
+        n = snprintf (p, len, "%s: ", facility);
+        if ((n < 0) || (n >= len)) {
+            p += len - 1;
+            len = 0;
+        }
+        else {
+            p += n;
+            len -= n;
+        }
+    }
+
+    /* Add a log level prefix.
+     */
+    if ((len > 0) && prefix) {
+        n = snprintf (p, len, "%s: ", prefix);
+        if ((n < 0) || (n >= len)) {
+            p += len - 1;
+            len = 0;
+        }
+        else {
+            p += n;
+            len -= n;
+        }
+    }
+
+    if ((len > 0) && (format)) {
+        n = vsnprintf (p, len, format, ap);
+        if ((n < 0) || (n >= len)) {
+            p += len - 1;
+            len = 0;
+        }
+        else {
+            p += n;
+            len -= n;
+        }
+    }
+
+    /* Add suffix for truncation if necessary.
+     */
+    if (len <= 0) {
+        char *q;
+        const char *suffix = "+";
+        q = buf + sizeof (buf) - 1 - strlen (suffix);
+        p = (p < q) ? p : q;
+        strcpy (p, suffix);
+        p += strlen (suffix);
+    }
+
+    *p = '\0';
+
+    do_log_all (level, buf);
+
+    return;
+}
+
+/* Log an "Error" message at level -1. Always returns -1. */
+int log_err (const char *format, ...)
+{
+    va_list ap;
+    va_start (ap, format);
+    vlog_msg ("Error", -1, format, ap);
+    va_end (ap);
+    return (-1); /* So we can do return (log_err (...)) */
+}
+
+/* Log a message with no level prefix at level 0. */
+void log_msg (const char *format, ...)
+{
+    va_list ap;
+    va_start (ap, format);
+    vlog_msg (NULL, 0, format, ap);
+    va_end (ap);
+    return;
+}
+
+/* Log a message with no level prefix at level 1. */
+void log_verbose (const char *format, ...)
+{
+    va_list ap;
+    va_start (ap, format);
+    vlog_msg (NULL, 1, format, ap);
+    va_end (ap);
+    return;
+}
+
+/* Log a "Debug" message at level 2. */
+void log_debug (const char *format, ...)
+{
+    va_list ap;
+    va_start (ap, format);
+    vlog_msg ("Debug", 2, format, ap);
+    va_end (ap);
+    return;
+}
+
+/* Log a "Debug" message at level 3. */
+void log_debug2 (const char *format, ...)
+{
+    va_list ap;
+    va_start (ap, format);
+    vlog_msg ("Debug", 3, format, ap);
+    va_end (ap);
+    return;
+}
+
+/*
+ * vi: ts=4 sw=4 expandtab
+ */
diff --git a/cpuset/log.h b/cpuset/log.h
new file mode 100644
index 0000000..a8e2b26
--- /dev/null
+++ b/cpuset/log.h
@@ -0,0 +1,56 @@
+/*****************************************************************************
+ *
+ * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory.
+ * Written by Mark Grondona .
+ *
+ * UCRL-CODE-235358
+ *
+ * This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + + +#ifndef _CPUSET_LOG_H +#define _CPUSET_LOG_H + +#define C_LOG_QUIET -2 +#define C_LOG_CRIT -1 +#define C_LOG_NORMAL 0 +#define C_LOG_VERBOSE 1 +#define C_LOG_DEBUG 2 +#define C_LOG_DEBUG2 3 + +typedef int (log_f) (const char *msg); + +int log_add_dest (int level, log_f *fn); +int log_update (int level, log_f *fn); +int log_set_prefix (const char *prefix); +void log_cleanup (); +int log_err (const char *format, ...); +void log_msg (const char *format, ...); +void log_verbose (const char *format, ...); +void log_debug (const char *format, ...); +void log_debug2 (const char *format, ...); + +/* + * Legacy logging functions + */ +#define cpuset_error(args...) log_err (args) +#define cpuset_verbose(args...) log_verbose (args) +#define cpuset_debug(args...) log_debug (args) +#define cpuset_debug2(args...) log_debug2 (args) + +#endif diff --git a/cpuset/nodemap.c b/cpuset/nodemap.c new file mode 100644 index 0000000..b0d7521 --- /dev/null +++ b/cpuset/nodemap.c @@ -0,0 +1,616 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . 
+ * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include + +#include "log.h" +#include "list.h" +#include "util.h" +#include "conf.h" +#include "nodemap.h" + + +/* + * Description of one NUMA node on the system. 
+ */ +struct node { + int nodeid; /* The NUMA node id */ + int ncpus; /* Total Number of CPUs */ + int navail; /* Number of currently available CPUs */ + struct bitmask *localcpus; /* Bitmask mapping local CPUs to global */ + struct bitmask *usedcpus; /* Bitmask of used cpus (size = ncpus) */ + struct nodemap *map; /* Pointer back to the nodemap */ +}; + +#define ALLOC_IDLE_MULTIPLE 0 /* Allocate idle nodes first if + ntasks is multiple of node size */ +#define ALLOC_IDLE_GT 1 /* Allocate idle nodes first if + ntasks is >= node size */ +#define ALLOC_NO_IDLE 2 /* Do not allocate idle nodes first */ + +struct policy { + unsigned int reverse:1; + unsigned int best_fit:1; + unsigned int first_fit:1; + unsigned int worst_fit:1; + unsigned int alloc_idle_first:1; + unsigned int alloc_idle_multiples_only:1; +}; + +static struct policy default_policy = { + .best_fit = 1, + .alloc_idle_first = 1, +}; + +/* + * Store the current mapping of CPUs to memory nodes as + * well as the currently in-use CPUs. + */ +struct nodemap { + struct policy policy; /* Allocation policy: best fit, first fit... */ + + int nnodes; /* Number of NUMA nodes */ + int ncpus; /* Total number of CPUs online */ + int navail; /* Total number of CPUs currently available */ + struct bitmask *usedcpus; /* Bitmask of used CPUs */ + struct bitmask *cpus; /* Bitmask of available CPUs relative to the + current cpuset */ + List nodelist; /* List of nodes in this map */ +}; + +/* + * A temporary object used to create a new allocation. 
+ */ +struct allocation { + int ntasks; /* Number of total tasks to allocate */ + int nleft; /* Number of CPUs left to allocate */ + struct nodemap * map; /* pointer back to nodemap */ + struct bitmask * allocated_cpus; + /* The final bitmask of allocated CPUs */ +}; + +int nodemap_policy_update (struct nodemap *map, cpuset_conf_t cf) +{ + map->policy.best_fit = cpuset_conf_policy (cf) == BEST_FIT; + map->policy.worst_fit = cpuset_conf_policy (cf) == WORST_FIT; + map->policy.first_fit = cpuset_conf_policy (cf) == FIRST_FIT; + map->policy.alloc_idle_first = cpuset_conf_alloc_idle (cf); + map->policy.alloc_idle_multiples_only = + cpuset_conf_alloc_idle_multiple (cf); + map->policy.reverse = cpuset_conf_reverse_order (cf); + return (0); +} + + +static struct bitmask *current_cpuset_cpus () +{ + struct bitmask *cpus; + struct cpuset *cp; + + if ((cp = cpuset_alloc ()) == NULL) { + cpuset_error ("Failed to alloc cpuset: %s\n", strerror (errno)); + return (NULL); + } + + if ((cpus = bitmask_alloc (cpumask_size ())) == NULL) { + cpuset_error ("Failed to alloc bitmask: %s\n", strerror (errno)); + cpuset_free (cp); + return (NULL); + } + + cpuset_query (cp, "."); + cpuset_getcpus (cp, cpus); + cpuset_free (cp); + + return (cpus); +} + +static struct bitmask *used_cpus_bitmask () +{ + return (used_cpus_bitmask_path (NULL, 0)); +} + +static struct node * node_create (struct nodemap *map, int id) +{ + int i, offset; + struct bitmask *mems; + struct node *n = malloc (sizeof (*n)); + + if (n == NULL) + return (NULL); + + + n->map = map; + + n->nodeid = id; + n->ncpus = 0; + n->localcpus = bitmask_alloc (cpumask_size ()); + + /* + * Get the bitmask of local cpus for this node + */ + mems = bitmask_alloc (memmask_size ()); + bitmask_setbit (mems, n->nodeid); + cpuset_localcpus (mems, n->localcpus); + bitmask_free (mems); + + /* + * Now count the number of local CPUs + */ + n->ncpus = bitmask_weight (n->localcpus); + + + /* + * Now set used cpus from node map + */ + n->usedcpus = 
bitmask_alloc (n->ncpus); + + offset = bitmask_first (n->localcpus); + for (i = 0; i < n->ncpus; i++) { + if (bitmask_isbitset (map->usedcpus, offset + i)) + bitmask_setbit (n->usedcpus, i); + } + + n->navail = n->ncpus - bitmask_weight (n->usedcpus); + + cpuset_debug2 ("Done creating node%d with %d/%d CPUs\n", + n->nodeid, n->navail, n->ncpus); + + return (n); +} + +static void node_destroy (struct node *n) +{ + bitmask_free (n->localcpus); + bitmask_free (n->usedcpus); + free (n); +} + + +void nodemap_destroy (struct nodemap *map) +{ + bitmask_free (map->usedcpus); + list_destroy (map->nodelist); + free (map); +} + +static int node_cpus_available (struct nodemap *map, int i) +{ + int rc; + struct bitmask * mems = bitmask_alloc (memmask_size ()); + + if (mems == NULL) + return log_err ("failed to allocate mems mask!!\n"); + + if (cpuset_localmems (map->cpus, mems) < 0) + return log_err ("cpuset_localmems: %s\n", strerror (errno)); + + rc = bitmask_isbitset (mems, i); + + bitmask_free (mems); + return (rc); +} + +struct nodemap * nodemap_create (cpuset_conf_t cf, struct bitmask *used) +{ + int i; + struct nodemap *map = malloc (sizeof (*map)); + + if (map == NULL) + return (NULL); + + map->policy = default_policy; + + map->nodelist = list_create ((ListDelF) node_destroy); + + map->nnodes = memmask_size (); + map->ncpus = cpumask_size (); + + if (used) { + map->usedcpus = bitmask_alloc (bitmask_weight (used)); + bitmask_copy (map->usedcpus, used); + } + else { + map->usedcpus = used_cpus_bitmask (); + } + + if (!map->usedcpus) { + list_destroy (map->nodelist); + free (map); + return (NULL); + } + + map->cpus = current_cpuset_cpus (); + + for (i = 0; i < map->nnodes; i++) { + struct node *n; + + /* + * Don't bother appending this node if none of its CPUs + * are available in the current cpuset + */ + if (!node_cpus_available (map, i)) + continue; + + if ((n = node_create (map, i)) == NULL) { + nodemap_destroy (map); + return (NULL); + } + list_push (map->nodelist, 
n); + } + + map->navail = map->ncpus - bitmask_weight (map->usedcpus); + + log_debug2 ("Created nodemap with %d nodes, %d/%d CPUs\n", + map->nnodes, map->navail, map->ncpus); + + nodemap_policy_update (map, cf); + + return (map); +} + +void print_nodemap (const struct nodemap *map) +{ + struct node *n; + struct bitmask *b; + ListIterator i = list_iterator_create (map->nodelist); + + print_bitmask ("Available CPUs: %s\n", map->cpus); + + b = bitmask_alloc (cpumask_size ()); + bitmask_and (b, map->cpus, map->usedcpus); + + print_bitmask ("Used CPUs: %s\n", b); + bitmask_free (b); + + while ((n = list_next (i))) { + //slurm_info ("Node%d:", n->nodeid); + print_bitmask ("Local CPUs: %s\n", n->localcpus); + print_bitmask ("Used CPUs: %s\n", n->usedcpus); + } + + list_iterator_destroy (i); +} + + +static int find_multiple_of_node_size (struct node *n, int *np) +{ + if (!(*np % n->ncpus) && (n->navail == n->ncpus)) + return (1); + return (0); +} + +static int find_node_lt_size (struct node *n, int *np) +{ + if ((*np >= n->ncpus) && (n->navail == n->ncpus)) + return (1); + return (0); +} + +static int should_allocate_idle_nodes (struct nodemap *m, int count) +{ + ListFindF fn; + + log_debug ("should_allocate_idle_nodes: %d\n", m->policy.alloc_idle_first); + + if (!m->policy.alloc_idle_first) + return (0); + + if (m->policy.alloc_idle_multiples_only) + fn = (ListFindF) find_multiple_of_node_size; + else + fn = (ListFindF) find_node_lt_size; + + if (list_find_first (m->nodelist, fn, &count)) + return (1); + return (0); +} + +static struct allocation * allocation_create (struct nodemap *map, int ntasks) +{ + struct allocation *a = malloc (sizeof (*a)); + + if (a == NULL) + return (NULL); + + a->map = map; + + a->ntasks = a->nleft = ntasks; + a->allocated_cpus = bitmask_alloc (cpumask_size ()); + + return (a); +} + +static void allocation_destroy (struct allocation *a) +{ + free (a); +} + +static int node_cpu_to_global (struct node *n, int cpu) +{ + int firstcpu = 
bitmask_first (n->localcpus); + return (firstcpu + cpu); +} + +static int node_allocate_cpu (struct node *n, int cpu) +{ + int gcpu; + if (bitmask_isbitset (n->usedcpus, cpu)) + return (-1); + + gcpu = node_cpu_to_global (n, cpu); + if (bitmask_isbitset (n->map->usedcpus, gcpu)) + return (-1); + + bitmask_setbit (n->usedcpus, cpu); + n->navail--; + bitmask_setbit (n->map->usedcpus, gcpu); + n->map->navail--; + return (gcpu); +} + +static void allocation_add_cpu (struct allocation *a, int cpu) +{ + bitmask_setbit (a->allocated_cpus, cpu); + a->nleft--; +} + +static int try_alloc (struct node *n, struct allocation *a, int cpu) +{ + int globalcpu = node_allocate_cpu (n, cpu); + + if (globalcpu < 0) /* CPU is in use */ + return (-1); + + cpuset_debug2 ("Node%d: allocated local CPU%d = CPU%d\n", + n->nodeid, cpu, globalcpu); + + allocation_add_cpu (a, globalcpu); + + return (0); +} + +static int node_allocate_n (struct node *n, struct allocation *a, int count) +{ + int nalloc = 0; + int i; + + if (a->nleft == 0) + return (0); + + /* + * Allocate all CPUs left in node if count == -1 + */ + if (count < 0) + count = n->navail; + + cpuset_debug2 ("Allocating %d CPUs from node%d. 
nleft = %d\n", + count, n->nodeid, a->nleft); + + if (!n->map->policy.reverse) { + /* + * Start with first CPU in node + */ + for (i = 0; i < n->ncpus && a->nleft && nalloc < count; i++) { + if (try_alloc (n, a, i) < 0) + continue; + nalloc++; + } + } + else { + /* + * Start with last CPU in node + */ + for (i = n->ncpus - 1; i >= 0 && a->nleft && nalloc < count; i--) { + if (try_alloc (n, a, i) < 0) + continue; + nalloc++; + } + } + + return (nalloc); +} + +static int node_allocate_all (struct node *n, struct allocation *a) +{ + return (node_allocate_n (n, a, -1)); +} + +static int alloc_idle_nodes (struct allocation *a) +{ + ListIterator i; + struct node *n; + int nalloc = 0; + + cpuset_debug ("Attempting to allocate idle nodes\n"); + i = list_iterator_create (a->map->nodelist); + + while ((n = list_next (i)) && (a->nleft > 0)) { + + log_debug2 ("alloc_idle: node%d; avail=%d\n", n->nodeid, n->navail); + if(n->navail == 0) + continue; + + /* + * Ignore this node if we're only allocating multiples + * and the number of tasks left is not a multiple. + */ + if (a->map->policy.alloc_idle_multiples_only + && ((a->nleft % n->navail) != 0)) + continue; + + /* + * Otherwise, allocate whole, idle node. 
+ */ + if ((n->navail == n->ncpus) && (a->nleft >= n->navail)) { + log_debug2 ("Allocating up to %d CPUs from node%d\n", + n->ncpus, n->nodeid); + nalloc += node_allocate_n (n, a, n->ncpus); + } + } + + return (nalloc); +} + +static int node_cmp_free (struct node *n1, struct node *n2) +{ + if (n1->navail == n2->navail) + return (0); + else if (n1->navail < n2->navail) + return (-1); + else + return (1); +} + +static int node_cmp_avail (struct node *n1, struct node *n2) +{ + int rc = node_cmp_free (n1, n2); + return (-rc); +} + +static int node_cmp_nodeid (struct node *n1, struct node *n2) +{ + if (n1->nodeid < n2->nodeid) + return (-1); + else if (n1->nodeid > n2->nodeid) + return (1); + else /* Shouldn't happen, but we'll check anyway */ + return (0); +} + +static int node_cmp_reverse (struct node *n1, struct node *n2) +{ + return (-node_cmp_nodeid (n1, n2)); +} + + +static int do_allocation (struct allocation *a, ListCmpF sort_f) +{ + if (sort_f) + list_sort (a->map->nodelist, sort_f); + list_for_each (a->map->nodelist, (ListForF) node_allocate_all, a); + return (0); +} + +static int allocation_best_fit (struct allocation *a) +{ + ListCmpF fn; + + log_debug ("allocation: best-fit\n"); + /* + * Best fit: + * + * Sort NUMA nodes by amount of CPUs free in ascending + * order, then pack in first-fit mode. + */ + fn = (ListCmpF) node_cmp_free; + + return (do_allocation (a, fn)); +} + +static int allocation_first_fit (struct allocation *a) +{ + log_debug ("allocation: first-fit\n"); + return (do_allocation (a, NULL)); +} + +static int allocation_worst_fit (struct allocation *a) +{ + log_debug ("allocation: worst-fit\n"); + while (a->nleft) { + /* + * For worst-fit, we have to sort by available CPUs in + * desending order, then allocate 1 CPU. Then re-sort, and + * so on. 
+ */ + list_sort (a->map->nodelist, (ListCmpF) node_cmp_avail); + if (node_allocate_n (list_peek (a->map->nodelist), a, 1) < 0) + return (-1); + } + return (0); +} + +struct bitmask * nodemap_allocate (struct nodemap *map, int ncpus) +{ + struct bitmask *allocated; + struct allocation * a; + + log_debug ("nodemap_allocate (ncpus=%d, navail=%d)\n", + ncpus, map->navail); + + if (ncpus > map->navail) { + cpuset_error ("%d CPUs requested, but only %d available\n", + ncpus, map->navail); + return (NULL); + } + + if (!map->policy.reverse) + list_sort (map->nodelist, (ListCmpF) node_cmp_nodeid); + else + list_sort (map->nodelist, (ListCmpF) node_cmp_reverse); + + if ((a = allocation_create (map, ncpus)) == NULL) + return (NULL); + + if (should_allocate_idle_nodes (a->map, ncpus)) + alloc_idle_nodes (a); + + if (a->nleft > 0) { + /* + * Allocate based on policy. + */ + if (a->map->policy.best_fit) + allocation_best_fit (a); + else if (a->map->policy.first_fit) + allocation_first_fit (a); + else if (a->map->policy.worst_fit) + allocation_worst_fit (a); + } + + if (a->nleft > 0) + cpuset_error ("Failed to allocate %d tasks.\n", a->nleft); + + allocated = a->allocated_cpus; + a->allocated_cpus = NULL; + + allocation_destroy (a); + + return (allocated); +} + +const struct bitmask * nodemap_used (struct nodemap *map) +{ + return (map->usedcpus); +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/cpuset/nodemap.h b/cpuset/nodemap.h new file mode 100644 index 0000000..026068d --- /dev/null +++ b/cpuset/nodemap.h @@ -0,0 +1,51 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. 
+ * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + + +#ifndef HAVE_NODEMAP_H +#define HAVE_NODEMAP_H + +#include "conf.h" + +/* + * Create a nodemap with optional used CPUs bitmask + * if used == NULL, then the nodemap will be initialized + * with the actual utilized CPUs. + */ +struct nodemap * nodemap_create (cpuset_conf_t cf, struct bitmask *used); +int nodemap_policy_update (struct nodemap *map, cpuset_conf_t cf); + +void nodemap_destroy (struct nodemap *); + +void print_nodemap (const struct nodemap *); + +/* + * Allocate ncpus from nodemap + */ +struct bitmask * nodemap_allocate (struct nodemap *map, int ncpus); + +const struct bitmask * nodemap_used (struct nodemap *map); + + +#endif /* !HAVE_NODEMAP_H */ diff --git a/cpuset/pam_slurm_cpuset.8 b/cpuset/pam_slurm_cpuset.8 new file mode 100644 index 0000000..9a1887f --- /dev/null +++ b/cpuset/pam_slurm_cpuset.8 @@ -0,0 +1,81 @@ + +.TH "PAM_SLURM_CPUSET" "8" + +.SH NAME +pam_slurm_cpuset \- restrict user logins to SLURM cpusets + +.SH SYNOPSIS +\fBpam_slurm_cpuset.so\fR [\fIOPTIONS\fR]... + +.SH DESCRIPTION +.PP +The \fBpam_slurm_cpuset\fR module may be used to restrict user +login sessions on compute nodes to only the CPUs which they have +been allocated by SLURM. It will also deny access to users attempting +to log in to nodes which they have not been allocated. 
Thus, it +should replace \fBpam_slurm.so\fR in the PAM stack. +.PP +Like the \fBpam_slurm\fR module, the \fBpam_slurm_cpuset.so\fR module +should be enabled in the account section of the PAM stack. +.PP +User login session tasks are placed into the \fBuser\fR cpuset created +by the \fBslurm-cpuset\fR(8) utilities. If a \fBuser\fR cpuset doesn't +exist at the time of operation of this module, and the user has one +or more valid SLURM jobs assigned to the current system, then a user +cpuset under + +.B /dev/cpuset/slurm/UID + +will be created with access to all CPUs to which the user has access. +.PP +As jobs begin and are terminated on the node, the set of CPUs in the +user cpuset is automatically adjusted to the union of all job cpusets. +If and when all the user's jobs on the node are complete, and the +user has no CPUs allocated to them, SLURM will either \fBorphan\fR +the user cpuset by renaming it to + +.B /dev/cpuset/slurm/orphan:UID + +or will immediately terminate the user login and clean up the +user cpuset. The method used depends on the \fBkill-orphs\fR +setting in \fBslurm-cpuset.conf\fR. +.PP +For more information about the SLURM cpuset suite and its +operation, see the \fBslurm-cpuset\fR(8) man page. + +.SH OPTIONS +.TP +.BI debug [=level] +Enable verbose module logging via \fBpam_syslog\fR(3). Optionally +a \fIlevel\fR may be specified. +.TP +.BI conf= FILENAME +Read configuration from config file \fIFILENAME\fR. By default, the +configuration is read from /etc/slurm/slurm-cpuset.conf. +.PP +For valid configuration file syntax and options, see the +\fBslurm-cpuset\fR(8) man page. + +.SH "MODULE SERVICES PROVIDED" +.PP +Currently, only the \fBaccount\fR service is supported. + +.SH "RETURN VALUES" +.TP 3n +PAM_SUCCESS +Access was granted. +.TP +PAM_PERM_DENIED +Access was not granted. +.TP +PAM_USER_UNKNOWN +Failed to read \fBPAM_USER\fR or user not in passwd file. +.TP +PAM_SYSTEM_ERR +System or module configuration error. 
+ +.SH "SEE ALSO" +.BR slurm-cpuset (8), +.BR cpuset (4), +.BR pam (8), +.BR pam.d (8) diff --git a/cpuset/pam_slurm_cpuset.c b/cpuset/pam_slurm_cpuset.c new file mode 100644 index 0000000..46d50a2 --- /dev/null +++ b/cpuset/pam_slurm_cpuset.c @@ -0,0 +1,295 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ ****************************************************************************/ + + + +#include +#include +#include +#include + +#define PAM_SM_ACCOUNT +#include +#include +#include + +#include "create.h" +#include "util.h" +#include "hostlist.h" +#include "slurm.h" +#include "conf.h" +#include "log.h" + +
static int create_all_job_cpusets (cpuset_conf_t conf, uid_t uid);
static int migrate_to_user_cpuset (uid_t uid);
static int in_user_cpuset (uid_t uid);

static pam_handle_t *pam_handle = NULL;

static const char msg_prefix [] = "";
static const char msg_suffix [] = "\r";

static int debuglevel = 1;


/*  Log destination: forward [msg] to pam_syslog (). */
static int log_pam_syslog (const char *msg) {
    pam_syslog (pam_handle, 0, "%s", msg);
    return (0);
}

/*  Log destination: show [msg] to the user via pam_error (). */
static int log_pam_error (const char *msg) {
    pam_error (pam_handle, "%s%s%s", msg_prefix, msg, msg_suffix);
    return (0);
}

/*
 *  Parse module arguments. "debug" and "debug=N" adjust the debug
 *   level; anything else is handed to cpuset_conf_parse_opt ().
 *   Returns 0 on success, -1 on an invalid option.
 */
static int parse_options (cpuset_conf_t conf, int ac, const char **av)
{
    int i;
    for (i = 0; i < ac; i++) {
        if (strcmp ("debug", av[i]) == 0)
            debuglevel++;
        else if (strncmp ("debug=", av[i], 6) == 0)
            debuglevel = 1 + str2int (av[i] + 6);
        else if (cpuset_conf_parse_opt (conf, av[i]) < 0)
            return (-1);
    }
    return (0);
}

/*
 *  PAM account hook.
 *
 *  root, and callers already inside the user's cpuset, are let
 *   through. Otherwise cpusets are created for all of the user's
 *   running jobs on this node and the calling task is moved into the
 *   user cpuset. Users with no CPUs allocated here are denied.
 *
 *  The configuration object is released on every return path.
 */
PAM_EXTERN int
pam_sm_acct_mgmt (pam_handle_t *pamh, int flags, int ac, const char **av)
{
    int rc;
    int n;
    int lockfd;
    int result = PAM_SYSTEM_ERR;
    const char *user = NULL;
    const void *item = NULL;
    struct passwd *pw;
    uid_t uid;

    cpuset_conf_t conf = cpuset_conf_create ();

    pam_handle = pamh;

    log_add_dest (debuglevel, log_pam_syslog);
    log_add_dest (0, log_pam_error);
    log_set_prefix ("");

    rc = pam_get_item (pamh, PAM_USER, &item);
    user = item;
    if ((rc != PAM_SUCCESS) || (user == NULL) || (*user == '\0')) {
        log_err ("get PAM_USER: %s", pam_strerror (pamh, rc));
        result = PAM_USER_UNKNOWN;
        goto out;
    }

    if (!(pw = getpwnam (user))) {
        log_err ("User (%s) does not exist.", user);
        result = PAM_USER_UNKNOWN;
        goto out;
    }

    uid = pw->pw_uid;

    if (uid == 0) {
        result = PAM_SUCCESS;
        goto out;
    }

    /*
     *  If we're already in the user's cpuset, bail early
     */
    if (in_user_cpuset (uid)) {
        log_msg ("User %s (uid=%d) already in cpuset", user, (int) uid);
        result = PAM_SUCCESS;
        goto out;
    }

    /*
     *  Read any configuration:
     */
    if (parse_options (conf, ac, av) < 0)
        goto out;

    log_update (debuglevel, log_pam_syslog);

    /*
     *  If we didn't parse a config file due to "conf=" above,
     *   then parse the system config.
     */
    if (!cpuset_conf_file (conf))
        cpuset_conf_parse_system (conf);

    /*
     *  Now we have to create cpusets for all running jobs
     *   on the system for this user, so that they have the
     *   correct number of CPUs accounted to them upon logging
     *   in.
     */
    if ((lockfd = slurm_cpuset_create (conf)) < 0) {
        log_err ("Unable to initialilze slurm cpuset");
        goto out;
    }

    /*
     *  create_all_job_cpusets returns the number of CPUs
     *   the user has allocated on this node (or -1 for failure)
     */
    if ((n = create_all_job_cpusets (conf, uid)) < 0) {
        log_err ("Failed to create user cpuset for uid=%d", (int) uid);
        slurm_cpuset_unlock (lockfd);
        goto out;
    }
    else if (n == 0) {
        log_err ("Access denied: User %s (uid=%d) has no active SLURM jobs.",
                user, (int) uid);
        slurm_cpuset_unlock (lockfd);
        result = PAM_PERM_DENIED;
        goto out;
    }

    if (migrate_to_user_cpuset (uid) < 0) {
        log_err ("Failed to migrate to user cpuset for uid=%d", (int) uid);
        slurm_cpuset_unlock (lockfd);
        goto out;
    }
    slurm_cpuset_unlock (lockfd);

    log_msg ("Access granted for user %s (uid=%d) with %d CPUs",
            user, (int) uid, n);

    result = PAM_SUCCESS;

out:
    cpuset_conf_destroy (conf);
    return (result);
}

/*
 *  Return nonzero if the calling task's cpuset path is /slurm/UID
 *   or lies below it.
 */
static int in_user_cpuset (uid_t uid)
{
    char p [1024];
    char q [1024];
    int n;
    size_t len;

    if (!cpuset_getcpusetpath (0, p, sizeof (p)))
        return (0);

    n = snprintf (q, sizeof (q), "/slurm/%d", (int) uid);
    if ((n <= 0) || ((size_t) n >= sizeof (q)))
        return (0);

    len = (size_t) n;
    if (strncmp (p, q, len) != 0)
        return (0);

    /*
     *  Require a path component boundary after the prefix, otherwise
     *   uid 100 would match the cpuset of uid 1000.
     */
    return ((p[len] == '\0') || (p[len] == '/'));
}

/*
 *  Move the calling task into cpuset /slurm/UID.
 *   Returns 0 on success, -1 on failure.
 */
static int migrate_to_user_cpuset (uid_t uid)
{
    int rc;
    char path [128];

    rc = snprintf (path, sizeof (path), "/slurm/%d", (int) uid);
    if ((rc < 0) || ((size_t) rc >= sizeof (path)))  /* error or truncated */
        return (-1);

    if (cpuset_move (0, path) < 0)
        return (-1);

    return (0);
}

/*
 *  Return the result of hostlist_find () for [host] in the hostlist
 *   string [nodes], or -1 if the hostlist cannot be created.
 */
int hostname_hostid (const char *host, const char *nodes)
{
    int n;
    hostlist_t h;

    /*
     *  Create the hostlist once (it used to be created twice, leaking
     *   the first copy) and report failure as -1, since 0 is a valid
     *   index that callers would take as "first host".
     */
    if (!(h = hostlist_create (nodes)))
        return (-1);

    n = hostlist_find (h, host);
    hostlist_destroy (h);

    return (n);
}

/*
 *  Return the CPU count of job [j] on the host with index [hostid],
 *   found by walking the cpus_per_node[] / cpu_count_reps[] groups.
 *   Returns 0 if hostid is not covered.
 */
int cpus_on_node (job_info_t *j, int hostid)
{
    int i;
    int start = 0;

    for (i = 0; i < j->num_cpu_groups; i++) {
        if (hostid >= start && hostid < (start + j->cpu_count_reps[i]))
            return (j->cpus_per_node[i]);
        start += j->cpu_count_reps[i];
    }

    return (0);
}

/*
 *  Ensure a cpuset exists for every running job of [uid] on this
 *   host. Returns the total number of CPUs those jobs have here,
 *   or -1 on failure.
 */
int create_all_job_cpusets (cpuset_conf_t conf, uid_t uid)
{
    int i;
    char hostname[256];
    char *p;
    job_info_msg_t * msg;
    int total_cpus = 0;

    if (gethostname (hostname, sizeof (hostname)) < 0) {
        return (-1);
    }
    /*  gethostname need not terminate a truncated name */
    hostname[sizeof (hostname) - 1] = '\0';

    if ((p = strchr (hostname, '.')))
        *p = '\0';

    if (dyn_slurm_load_jobs (&msg) < 0) {
        return (-1);
    }

    for (i = 0; i < msg->record_count; i++) {
        job_info_t *j = &msg->job_array[i];
        int hostid;
        int ncpus;

        if ((j->user_id != uid) || (j->job_state != JOB_RUNNING))
            continue;

        if ((hostid = hostname_hostid (hostname, j->nodes)) < 0)
            continue;

        if (!(ncpus = cpus_on_node (j, hostid))) {
            log_err ("job %u: Failed to find ncpus for this node", j->job_id);
            continue;
        }

        if (!job_cpuset_exists (j->job_id, j->user_id) &&
            create_cpuset_for_job (conf, j->job_id, j->user_id, ncpus) < 0) {
            log_err ("job %u: Failed to create cpuset: %m", j->job_id);
            continue;
        }

        total_cpus += ncpus;
    }

    dyn_slurm_free_job_info_msg (msg);

    return (total_cpus);
}

+/* + * vi: ts=4 sw=4 expandtab + */ + diff --git a/cpuset/release-agent.c b/cpuset/release-agent.c new file mode 100644 index 0000000..947ea93 --- /dev/null +++ b/cpuset/release-agent.c @@ -0,0 +1,86 @@ 
+/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + + +#include +#include +#include +#include +#include + +#include "util.h" +#include "create.h" +#include "conf.h" +#include "log.h" + +
const char cpuset_path[] = "/dev/cpuset";

const char * basename (const char *path);
static FILE *fp = NULL;

/*
 *  Log destination: append [msg] to the log file, if it is open.
 */
static int log_fp (const char *msg)
{
    if (fp)
        fprintf (fp, "%s", msg);
    return (0);
}

/*
 *  Cpuset release agent. Invoked with the released cpuset's path as
 *   av[1]; takes the slurm cpuset lock and calls
 *   update_user_cpusets ().
 *
 *  Returns 0 on success, 1 on a usage error or if the lock cannot
 *   be taken.
 */
int main (int ac, char **av)
{
    int lockfd;
    char path [4096];
    const char *prog = basename (av[0]);
    cpuset_conf_t conf;

    if (ac < 2) {
        fprintf (stderr, "Usage: %s cpuset_path\n", prog);
        return (1);
    }

    conf = cpuset_conf_create ();

    /*
     *  Logging is best effort: if the log file cannot be opened fp
     *   stays NULL and log_fp () drops the messages.
     */
    fp = fopen ("/var/log/slurm-cpuset.log", "a");

    log_add_dest (C_LOG_VERBOSE, log_fp);
    cpuset_conf_parse_system (conf); /* Ignore errors, we must proceed */

    snprintf (path, sizeof (path), "%s%s", cpuset_path, av[1]);

    if ((lockfd = slurm_cpuset_create (conf)) < 0) {
        log_err ("Failed to lock slurm cpuset: %s\n", strerror (errno));
        cpuset_conf_destroy (conf);
        if (fp)
            fclose (fp);
        return (1);
    }

    log_verbose ("Cleaning path %s\n", path);

    update_user_cpusets (conf);
    slurm_cpuset_unlock (lockfd);
    cpuset_conf_destroy (conf);

    /*  fclose (NULL) is undefined; fp is NULL if fopen () failed */
    if (fp)
        fclose (fp);

    return (0);
}

+/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/cpuset/slurm-cpuset.8 b/cpuset/slurm-cpuset.8 new file mode 100644 index 0000000..c742fdf --- /dev/null +++ b/cpuset/slurm-cpuset.8 @@ -0,0 +1,378 @@ +.\" $Id: slurm-cpuset.8 7653 2008-07-29 22:33:31Z grondo $ + +.TH slurm-cpuset 8 "SLURM cpuset plugin" + +.SH NAME +slurm-cpuset \- confine SLURM jobs to CPUs using cpusets + +.SH DESCRIPTION +The SLURM \fBcpuset\fR suite enables the use of Linux \fBcpuset\fR(4) +functionality to constrain user jobs and login sessions to the +number of CPUs allocated on compute nodes. The suite consists of a +\fBspank\fR(8) plugin, a \fBPAM\fR module, and a cpuset \fIrelease +agent\fR. Together, these three components may effectively restrict +user access to shared nodes based on actual SLURM allocations. +.PP +The SLURM cpuset components are specifically designed for +systems sharing nodes using CPU scheduling (i.e. using SLURM's +\fIselect/cons_res\fR plugin) These plugins and utilities will not +be effective on systems where CPUs may be oversubscribed to jobs +(e.g. strict node sharing without the use of \fIselect/cons_res\fR). +.PP +For more details see the OPERATION section below. + +.SH SLURM PLUGIN +The core cpuset functionality for SLURM jobs is provided +by a SLURM \fBspank\fR(8) plugin \fBcpuset.so\fR. Since this plugin +uses SLURM's \fBspank\fR(8) framework, it must be enabled +in the plugstack.conf for the system, via the following +line +.nf + + required cpuset.so [options] + +.fi +where \fIoptions\fR are described further in the \fIOPTIONS\fR +section below. +.PP +The slurm cpuset plugin (as well as other SLURM cpuset components) +works on a single node. It knows nothing about the global state of +SLURM, its queues, etc. 
Local CPUs are allocated dynamically to +incoming jobs based on the number of CPUs assigned to the job by +SLURM. The cpuset plugin does not keep any state across jobs, nor +across the nodes of a job. Instead, it uses past created cpusets +to track which CPUs are currently in use, and which are available. +.PP +The SLURM cpuset plugin may also constrain job steps to their +own cpusets under the job cpuset. This may be useful when running +multiple job steps under a single allocation, as the resources of +each job step may be partitioned into separate, non-overlapping +cpusets. This functionality is enabled by the srun user option +.TP +.BI "--use-cpusets="[args...] +.PP +Where the optional arguments in \fIargs\fR modify the cpuset plugin +behavior for job steps and/or tasks. Any plugin option as described +in the OPTIONS section can be specified. + +.SH PAM MODULE +The \fBpam_slurm_cpuset\fR(8) module may be used to restrict user +login sessions on compute nodes to only the CPUs which they have +been allocated by SLURM. If enabled in the PAM stack, it will also +deny access to users attempting to log in to nodes which they +have not been allocated. +.PP +The \fBpam_slurm_cpuset\fR PAM module +uses the same configuration file and algorithms as the SLURM cpuset +plugin, and is further documented in the \fBpam_slurm_cpuset\fR(8) +man page. + +.SH RELEASE AGENT +Included with the SLURM cpuset utilities is a cpuset release-agent +which may optionally be installed into /sbin/cpuset_release_agent +on any nodes using the SLURM cpuset plugin or PAM module. This release +agent will be run for each SLURM cpuset when the last +task within the cpuset exits, and will free the cpuset immediately +(with proper locking so as to not race with other jobs). The release +agent is optional for a couple reasons: +.RS 8 +.TP 3 +1. +Some versions of Linux may only allow a single \fBcpuset_release_agent\fR +and we don't want to interfere with other uses of cpusets if they exist. +.TP +2. 
+The cpuset plugin and PAM modules remove stale cpusets as they initialize +anyway. Therefore \fBcpuset_release_agent\fR is not a critical component +for operation. However, it is nice to clean up job cpusets as jobs exit, +instead of waiting until the next job is run. Unused cpusets lying around +may be confusing to sysadmins and users. + +.SH CONFIGURATION +All SLURM cpuset components will first attempt to read the systemwide +config file at /etc/slurm/slurm-cpuset.conf. This location may be overridden +in the PAM module and SLURM plugin with the \fBconf=\fR parameter. +However, this is not suggested, because there is no way currently +to override the config file location for the cpuset release agent. +.PP +Available configuration parameters that may be set in slurm-cpuset.conf +are: +.TP 8 +\fBpolicy\fR = \fIPOLICY\fR +Set the allocation policy for cpusets to \fIPOLICY\fR. Currently +supported policies include: +.RS +.TP +.B best-fit +Allocate tasks to the most full NUMA nodes first. This is the default. +.TP +.B first-fit +Allocate tasks to nodes in order of node ID. +.TP +.B worst-fit +Allocate tasks to least full nodes first. +.RE + +.TP +\fBorder\fR = [\fInormal\fR|\fIreverse\fR] +Set the allocation order of tasks to CPUs. In \fInormal\fR +mode, tasks are allocated starting with the first available +CPU and in increasing order, while with \fIreverse\fR order, +tasks are allocated starting with the last available CPU. The +default order is \fInormal\fR. +.TP +\fBuse-idle\fR = \fISTRATEGY\fR +The \fBuse-idle\fR parameter indicates when to allocate tasks +to fully idle NUMA nodes first. The default behavior is +to use idle nodes first when the number of tasks is a multiple +of the number of CPUs within a node. Other options include +.RS +.TP 12 +.B mult[iple] +The default. Allocate idle nodes first if number of tasks is a +multiple of the node size.
+.TP +.B [greater|gt] +Allocate idle nodes first if the number of tasks is \fBgreater\fR +than the number of CPUs in a node. +.TP +.B [0|no|never] +Do not allocate idle nodes first, no matter the job size. +.TP +.B [1|yes] +Allocate idle nodes first using the default policy. +.RE +.TP +\fBconstrain-mem\fR = \fIBOOLEAN\fR +If set to 1 or yes, constrain memory nodes along with CPUs when +creating cpusets. If set to 0 or no, let all cpusets access all +memory nodes on the system (i.e. do not constrain memory). The +default is yes. +.TP +\fBkill-orphs\fR = \fIBOOLEAN\fR +If set to 1 or yes, kill orphaned user logins, i.e. those logins +for which there are no longer any SLURM jobs running. If 0 or no, +then leave orphan user logins (in a special orphan login cpuset). +The default is no. + +.SH USER OPTIONS + +The \fB--use-cpusets\fR option may be used to override some of +the options above, in addition to providing a couple of extra options. +Currently supported arguments for this option include: +.TP +.B help +Print a short usage message to stderr and exit. +.TP +.B debug +Enable debug messages. +.TP +.BI "debug=" N +Increase debugging verbosity to \fIN\fR +.TP +.BI "conf=" FILENAME +Read configuration from file \fIFILENAME\fR. Settings in this +config file will override system configuration, as well as options +previously set on the command line. +.TP +.BI "policy=" POLICY +As above, set the allocation policy for cpusets to \fIPOLICY\fR. +For the user option, this only overrides the policy as applied to +job steps and tasks. +.TP +.BI "order=" ORDER +Set allocation order to \fInormal\fR or \fIreverse\fR. +.TP +.B reverse +Same as \fBorder=\fR\fIreverse\fR. +.TP +.B best-fit | worst-fit | first-fit +Shortcut for \fBpolicy\fR=\fIPOLICY\fR. +.TP +.BI "idle-first=" WHEN +As above, set \fIWHEN\fR to allocate idle nodes first. +.TP +.BI "no-idle" +Same as \fBidle-first\fR=\fIno\fR. +.TP +.B mem | constrain-mem +Constrain memory as well as CPUs. 
Same as \fBconstrain-mem\fR = \fIyes\fR +in the config file. +.TP +.B nomem | !constrain-mem +Do not constrain memory. +.TP +.B tasks +Also constrain individual tasks to cpusets. + +.SH OPERATION +All SLURM cpusets for jobs and login sessions are created +under the /slurm cpuset hierarchy, and require that the +cpuset filesystem be mounted under /dev/cpuset (an init script +is provided for this purpose). +.PP +The first level of cpusets +created under the /slurm directory are UID cpusets. Each +user with a job or login to the current node will have +a cpuset under +.nf + + \fB/slurm/UID\fR + +.fi +which will contain the set of +all CPUs that user is allowed to use on the system. Processes +which are part of a login session are contained within this +cpuset, and thus have access to all CPUs which the user has +been allocated. +.PP +Under each UID cpuset will be one cpuset per active job. +These cpusets are named with the JOBID, and thus fall +under the path +.nf + + \fB/slurm/UID/JOBID\fR + +.fi +The CPUs allocated to the JOBID cpusets will obviously +be a subset of the UID cpuset. +.PP +Finally, if the user requests per-job-step or per-task +cpusets, these cpusets will fall under the JOBID cpuset, +and will of course be a subset of the job cpuset. Thus, +the final cpuset path for a task would be: +.nf + + \fB/slurm/UID/JOBID/STEPID/TASKID\fR + +.fi +where there would be N TASKID cpusets for an N task job. +.PP +As cpusets are created by the SLURM cpuset utilities, +the \fBnotify_on_release\fR flag is set. This causes +the cpuset release agent at /sbin/cpuset_release_agent +to be called after the last task exits from the cpuset. +The SLURM cpuset version of \fBcpuset_release_agent\fR takes +care of removing the cpuset and releasing CPUs for use +if necessary. Use of the release agent is optional, however, +because the SLURM cpuset utilities will also try to +free unused cpusets on demand as well.
+.PP +The general algorithm the SLURM cpuset utilities use for +allocating a new JOB cpuset is as follows: +.PP +.RS 2 +.TP 3 +1. +Lock SLURM cpuset at /dev/cpuset/slurm. +.TP +2. +Clean up current slurm cpuset hierarchy by removing all unused cpusets, +and ensuring user cpusets (/slurm/UID) are up to date. +.TP +3. +Check for an existing cpuset for this job in /slurm/UID/JOBID. If +it exists, go directly to step 8. +.TP +4. +Scan the slurm cpuset hierarchy and gather the list of currently +used CPUs. This is the union of all active user cpusets, which are +in turn the union of all active user job cpusets. +.TP +5. +Abort if the number of CPUs assigned to the starting job is greater +than the number of available CPUs. +.TP +6. +Assign CPUs and optionally memory nodes based on the currently +configured policy. (See the CONFIGURATION section for valid policies.) +.TP +7. +Create new cpuset under /dev/cpuset/slurm/UID/JOBID, updating +the user cpuset if necessary with newly allocated cpus. +.TP +8. +Migrate job to cpuset /dev/cpuset/slurm/UID/JOBID. +.TP +9. Unlock SLURM cpuset at /dev/cpuset/slurm. +.RE +.PP + +.SH EXAMPLES +Default allocation policy, job sizes 2 cpus, 1 cpu, 1 cpu, 4 cpus: +.nf + + cpuset: /slurm/6885/69946: 2 cpus [0-1], 1 mem [0] + cpuset: /slurm/6885/69947: 1 cpu [2], 1 mem [1] + cpuset: /slurm/6885/69948: 1 cpu [3], 1 mem [1] + cpuset: /slurm/6885/69950: 4 cpus [4-7], 2 mems [2-3] + +.fi +Same as above with order = reverse.
+.nf + + cpuset: /slurm/6885/69954: 2 cpus [6-7], 1 mem [3] + cpuset: /slurm/6885/69955: 1 cpu [5], 1 mem [2] + cpuset: /slurm/6885/69956: 1 cpu [4], 1 mem [2] + cpuset: /slurm/6885/69957: 4 cpus [0-3], 2 mems [0-1] + +.fi +use-idle = never, policy = worst-fit: job sizes 1, 1, 1, 4, 1 +.nf + + cpuset: /slurm/6885/69976: 1 cpu [0], 1 mem [0] + cpuset: /slurm/6885/69977: 1 cpu [2], 1 mem [1] + cpuset: /slurm/6885/69978: 1 cpu [4], 1 mem [2] + cpuset: /slurm/6885/69979: 4 cpus [1,3,6-7], 3 mems [0-1,3] + cpuset: /slurm/6885/69980: 1 cpu [5], 1 mem [2] + +.fi +policy = first-fit: job sizes 1, 1, 1, 4, 1 +Note that 4 cpu job is allocated to idle nodes first. +.nf + + cpuset: /slurm/6885/69985: 1 cpu [0], 1 mem [0] + cpuset: /slurm/6885/69986: 1 cpu [1], 1 mem [0] + cpuset: /slurm/6885/69987: 1 cpu [2], 1 mem [1] + cpuset: /slurm/6885/69988: 4 cpus [4-7], 2 mems [2-3] + cpuset: /slurm/6885/69989: 1 cpu [3], 1 mem [1] + +.fi +Using cpusets for multiple job steps under an allocate of 1 node +with 8 cpus. 
+ +.nf + + > srun --use-cpusets=debug -n1 sleep 100 & + + cpuset: /slurm/6885/69993: 8 cpus [0-7], 4 mems [0-3] + cpuset: /slurm/6885/69993/0: 1 cpu [0], 1 mem [0] + + > srun --use-cpusets=debug -n2 sleep 100 & + + cpuset: /slurm/6885/69993: 8 cpus [0-7], 4 mems [0-3] + cpuset: /slurm/6885/69993/1: 2 cpus [2-3], 1 mem [1] + +.fi +Use of --use-cpusets=tasks + +.nf + + > srun --use-cpusets=debug,tasks -n4 sleep 100 + + cpuset: /slurm/6885/69993: 8 cpus [0-7], 4 mems [0-3] + cpuset: /slurm/6885/69993/2: 4 cpus [0-3], 2 mems [0-1] + cpuset: /slurm/6885/69993/2/0: 1 cpu [0], 1 mem [0] + cpuset: /slurm/6885/69993/2/1: 1 cpu [1], 1 mem [0] + cpuset: /slurm/6885/69993/2/2: 1 cpu [2], 1 mem [1] + cpuset: /slurm/6885/69993/2/3: 1 cpu [3], 1 mem [1] +.fi + +.SH AUTHOR +Mark Grondona + +.SH "SEE ALSO" +.BR use-cpusets (1), +.BR pam_slurm_cpuset (8), +.BR spank (8), +.BR cpuset (4) diff --git a/cpuset/slurm.c b/cpuset/slurm.c new file mode 100644 index 0000000..eb107b4 --- /dev/null +++ b/cpuset/slurm.c @@ -0,0 +1,114 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ ****************************************************************************/ + + +#include +#include "slurm.h" +#include "log.h" +/* + * Handle for libslurm.so + * + * We open libslurm.so via dlopen () in order to pass the + * flag RTDL_GLOBAL so that subsequently loaded modules have + * access to libslurm symbols. This is pretty much only needed + * for dynamically loaded modules that would otherwise be + * linked against libslurm. + * + */ +static void * slurm_h = NULL; + + +static int dyn_slurm_open () +{ + if (slurm_h) + return (0); + if (!(slurm_h = dlopen("libslurm.so", RTLD_NOW|RTLD_GLOBAL))) { + log_err ("Unable to dlopen libslurm: %s\n", dlerror ()); + return (-1); + } + return (0); +} + +/* + * Wrapper for SLURM API function slurm_load_jobs () + */ +int dyn_slurm_load_jobs (job_info_msg_t **msgp) +{ + static int (*load_jobs) (time_t, job_info_msg_t **) = NULL; + + dyn_slurm_open (); + + if (!load_jobs && !(load_jobs = dlsym (slurm_h, "slurm_load_jobs"))) { + log_err ("Unable to resolve slurm_load_jobs\n"); + return -1; + } + + return load_jobs ((time_t) NULL, msgp); +} + +/* + * Wrapper for SLURM API function slurm_strerror () + */ +char * dyn_slurm_strerror (int errnum) +{ + static char * (*f) (int) = NULL; + + dyn_slurm_open (); + + if (!f && !(f = dlsym (slurm_h, "slurm_strerror"))) { + log_err ("Unable to resolve slurm_strerror\n"); + return "unknown error"; + } + + return f (errnum); +} + + +/* + * Wrapper for slurm_free_job_info_msg () + */ +void dyn_slurm_free_job_info_msg (job_info_msg_t *msg) +{ + static void (*free_msg) (job_info_msg_t *) = NULL; + + dyn_slurm_open (); + + if (!free_msg && !(free_msg = dlsym (slurm_h, "slurm_free_job_info_msg"))) { + log_err ("Unable to resolve slurm_free_job...\n"); + return; + } + + free_msg (msg); + + return; +} + +void dyn_slurm_close () +{ + if (slurm_h) dlclose (slurm_h); +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/cpuset/slurm.h b/cpuset/slurm.h new file mode 100644 index 0000000..80db18a 
--- /dev/null +++ b/cpuset/slurm.h @@ -0,0 +1,36 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + + +#ifndef _HAVE_DYN_SLURM_H +#define _HAVE_DYN_SLURM_H + +#include + +int dyn_slurm_load_jobs (job_info_msg_t **msgp); +char * dyn_slurm_strerror (int errnum); +void dyn_slurm_free_job_info_msg (job_info_msg_t *msg); +void dyn_slurm_close (); + +#endif diff --git a/cpuset/test.c b/cpuset/test.c new file mode 100644 index 0000000..a1a6c49 --- /dev/null +++ b/cpuset/test.c @@ -0,0 +1,90 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. 
+ * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + + +#include +#include +#include +#include + +#include "nodemap.h" +#include "util.h" +#include "conf.h" +#include "log.h" + +static int log_stderr (const char *msg) +{ + fprintf (stderr, "%s", msg); return 0; +} + +int main (int ac, char **av) +{ + cpuset_conf_t conf; + struct bitmask * b; + struct nodemap * map; + int n = 0; /* parsed below, once av[1] is known to exist */ + + log_add_dest (4, log_stderr); + + conf = cpuset_conf_create (); + //cpuset_conf_debug (); + + if (cpuset_conf_parse_system (conf) < 0) + exit (1); + + if (ac < 2) + exit (1); + + if (av[1] == NULL || ((n = str2int (av[1])) <= 0)) { + fprintf (stderr, "Usage: %s NCPUS\n", av[0]); + exit (1); + } + + fprintf (stdout, "Faking a job with %d CPUs\n", n); + + if ((map = nodemap_create (conf, NULL)) == NULL) { + fprintf (stderr, "Failed to create nodemap\n"); + exit (1); + } + + print_nodemap (map); + + if (!(b = nodemap_allocate (map, n))) { + fprintf (stderr, "Failed to allocate %d tasks in nodemap\n", n); + exit (1); + } + + print_bitmask ("Used CPUs: %s\n", nodemap_used (map)); + + nodemap_destroy (map); + + cpuset_conf_destroy (conf); + + exit (0); + +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/cpuset/use-cpusets.1 b/cpuset/use-cpusets.1 new file mode 100644 index 0000000..354a260 --- /dev/null +++ b/cpuset/use-cpusets.1 @@ -0,0 +1,114
@@ +.TH use-cpusets 1 "user options for SLURM cpuset plugin" + +.SH NAME +use-cpusets \- user options for SLURM cpuset plugin + +.SH SYNOPSIS +\fB--use-cpusets=\fR[\fIargs\fR]... + +.SH DESCRIPTION +The \fB--use-cpusets\fR option is added to \fBsrun\fR(1) +by the SLURM cpuset plugin, which is described fully +in the \fBslurm-cpuset\fR(8) manpage. This option allows +users to request that job steps and optionally individual +tasks be contained within cpusets under a SLURM job cpuset. +This may be useful when running multiple job steps under +an allocation, as the resources of each job step may be +partitioned into separate cpus and/or memory nodes. + +.SH OPTIONS +The \fB--use-cpusets\fR option may be used to override some of +the SLURM cpuset defaults and system configuration. Additionally, +some extra options are provided. +.PP +Used alone, the \fB--use-cpusets\fR option enables per-job-step +cpusets for the spawned tasks. Options that change policies +and behavior of the SLURM cpuset plugin may be specified with an +optional list of comma-separated arguments to the \fB--use-cpusets\fR +option, e.g. + +.BI "--use-cpusets=" debug,tasks + +.PP +Currently supported arguments for this option include: +.TP +.B help +Print a short usage message to stderr and exit. +.TP +.B debug +Enable debug messages. +.TP +.BI "debug=" N +Increase debugging verbosity to \fIN\fR +.TP +.BI "conf=" FILENAME +Read configuration from file \fIFILENAME\fR. Settings in this +config file will override system configuration, as well as options +previously set on the command line. +.TP +.BI "policy=" POLICY +As above, set the allocation policy for cpusets to \fIPOLICY\fR. +For the user option, this only overrides the policy as applied to +job steps and tasks. +.TP +.BI "order=" ORDER +Set allocation order to \fInormal\fR or \fIreverse\fR. +.TP +.B reverse +Same as \fBorder=\fR\fIreverse\fR. +.TP +.B best-fit | worst-fit | first-fit +Shortcut for \fBpolicy\fR=\fIPOLICY\fR.
+.TP +.BI "idle-first=" WHEN +As above, set \fIWHEN\fR to allocate idle nodes first. +.TP +.BI "no-idle" +Same as \fBidle-first\fR=\fIno\fR. +.TP +.B mem | constrain-mem +Constrain memory as well as CPUs. Same as \fBconstrain-mem\fR = \fIyes\fR +in the config file. +.TP +.B nomem | !constrain-mem +Do not constrain memory. +.TP +.B tasks +Also constrain individual tasks to cpusets. + +.SH EXAMPLES +Using cpusets for multiple job steps under an allocate of 1 node +with 8 cpus. + +.nf + + > srun --use-cpusets=debug -n1 sleep 100 & + + cpuset: /slurm/6885/69993: 8 cpus [0-7], 4 mems [0-3] + cpuset: /slurm/6885/69993/0: 1 cpu [0], 1 mem [0] + + > srun --use-cpusets=debug -n2 sleep 100 & + + cpuset: /slurm/6885/69993: 8 cpus [0-7], 4 mems [0-3] + cpuset: /slurm/6885/69993/1: 2 cpus [2-3], 1 mem [1] + +.fi +Use of --use-cpusets=tasks + +.nf + + > srun --use-cpusets=debug,tasks -n4 sleep 100 + + cpuset: /slurm/6885/69993: 8 cpus [0-7], 4 mems [0-3] + cpuset: /slurm/6885/69993/2: 4 cpus [0-3], 2 mems [0-1] + cpuset: /slurm/6885/69993/2/0: 1 cpu [0], 1 mem [0] + cpuset: /slurm/6885/69993/2/1: 1 cpu [1], 1 mem [0] + cpuset: /slurm/6885/69993/2/2: 1 cpu [2], 1 mem [1] + cpuset: /slurm/6885/69993/2/3: 1 cpu [3], 1 mem [1] +.fi +.SH AUTHOR +Mark Grondona + +.SH "SEE ALSO" +.BR slurm-cpuset (8), +.BR cpuset (4) diff --git a/cpuset/util.c b/cpuset/util.c new file mode 100644 index 0000000..c419f8d --- /dev/null +++ b/cpuset/util.c @@ -0,0 +1,464 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. 
+ * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +#include +#include +#include +#define __USE_GNU 1 +#include +#include +#include +#include +#include + +#include +#include + +#include "fd.h" +#include "util.h" +#include "nodemap.h" +#include "create.h" +#include "slurm.h" +#include "log.h" + +/* + * Path to base SLURM cpuset, which contains all other cpusets. 
+ */ +static const char slurm_cpuset[] = "/dev/cpuset/slurm"; + +void print_bitmask (const char *fmt, const struct bitmask *b) +{ + char buf [16]; + bitmask_displaylist (buf, sizeof (buf), b); + log_msg (fmt, buf); +} + +static struct cpuset * get_cpuset (const char *path) +{ + struct cpuset *cpuset = NULL; + + if (!(cpuset = cpuset_alloc ())) + return (NULL); + + if (cpuset_query (cpuset, path) < 0) { + cpuset_free (cpuset); + return (NULL); + } + + return (cpuset); +} + +int cpumask_size (void) +{ + struct cpuset *cp; + static int totalcpus = -1; + if (totalcpus < 0) { + cp = get_cpuset ("/"); + totalcpus = cpuset_cpus_weight (cp); + cpuset_free (cp); + } + return (totalcpus); +} + +int memmask_size (void) +{ + struct cpuset *cp; + static int totalmems = -1; + if (totalmems < 0) { + cp = get_cpuset ("/"); + totalmems = cpuset_mems_weight (cp); + cpuset_free (cp); + } + return (totalmems); +} + +void print_cpuset_info (const char *path, struct cpuset *cp) +{ + char cstr [16]; + char mstr [16]; + struct bitmask *cpus, *mems; + int ncpus, nmems; + + ncpus = cpuset_cpus_weight (cp); + nmems = cpuset_mems_weight (cp); + + cpus = bitmask_alloc (cpumask_size ()); + mems = bitmask_alloc (memmask_size ()); + + cpuset_getcpus (cp, cpus); + cpuset_getmems (cp, mems); + + bitmask_displaylist (cstr, sizeof (cstr), cpus); + bitmask_displaylist (mstr, sizeof (mstr), mems); + + cpuset_verbose ("%s: %d cpu%s [%s], %d mem%s [%s]\n", + path, + ncpus, (ncpus == 1 ? "" : "s"), cstr, + nmems, (nmems == 1 ? 
"" : "s"), mstr); + + bitmask_free (cpus); + bitmask_free (mems); +} + +void print_current_cpuset_info () +{ + char path [4096]; + struct cpuset *cp = cpuset_alloc (); + + cpuset_getcpusetpath (0, path, sizeof (path)); + cpuset_query (cp, path); + + print_cpuset_info (path, cp); + + cpuset_free (cp); +} + +static int current_cpuset_path (char *path, int len) +{ + if (len < 12) + return (-1); + + strncpy (path, "/dev/cpuset", len); + + if (!cpuset_getcpusetpath (0, path + 11, len - 11)) + return (-1); + + if (strcmp (path, "/dev/cpuset/") == 0) { + /* + * If we are in the root cpuset, pretend we're in /slurm instead. + */ + strncat (path, "slurm", len - strlen (path) - 1); /* bound is the room left, not the buffer size */ + } + + return (0); +} + +const char * cpuset_path_to_name (const char *path) +{ + return (path + 11); +} + +struct bitmask *used_cpus_bitmask_path (char *path, int clearall) +{ + char buf [4096]; + const char *current; + struct bitmask *b, *used; + DIR *dirp; + struct dirent *dp; + struct cpuset *cp; + + if (path == NULL) { + path = buf; + if (current_cpuset_path (buf, sizeof (buf)) < 0) { + cpuset_error ("Unable to get current cpuset path: %m"); + return (NULL); + } + cpuset_debug ("used_cpus_bitmask_path (%s)\n", path); + } + + if ((dirp = opendir (path)) == NULL) { + cpuset_error ("Couldn't open %s: %m", path); + return NULL; + } + if ((cp = cpuset_alloc ()) == NULL) { + cpuset_error ("Couldn't alloc cpuset: %m"); + closedir (dirp); + return (NULL); + } + + current = cpuset_path_to_name (path); + + b = bitmask_alloc (cpumask_size ()); + used = bitmask_alloc (cpumask_size ()); + + if (!clearall) { + /* + * First, set all CPUs not in this cpuset as used + */ + cpuset_query (cp, current); + cpuset_getcpus (cp, used); + bitmask_complement (used, used); + } + + while ((dp = readdir (dirp))) { + char name [4096]; + + if (*dp->d_name == '.') + continue; + + /* + * Skip any orphans + */ + if (strncmp (dp->d_name, "orphan:", 7) == 0) + continue; + + /* + * Generate cpuset name relative to /dev/cpuset + */ + snprintf (name, sizeof (name),
"%s/%s", current, dp->d_name); + if (cpuset_query (cp, name) < 0) + continue; + + if (cpuset_getcpus (cp, b) < 0) + cpuset_error ("Failed to get CPUs for %s: %m", name); + + used = bitmask_or (used, b, used); + } + closedir (dirp); + cpuset_free (cp); + bitmask_free (b); + return (used); +} + +int slurm_jobid_is_valid (int jobid) +{ + static job_info_msg_t *msg = NULL; + int i; + + cpuset_debug ("slurm_jobid_is_valid (%d)\n", jobid); + + if (jobid == -1) { /* reset request: drop the cached job list */ + if (msg) dyn_slurm_free_job_info_msg (msg); + msg = NULL; + return (0); + } + if (msg == NULL && (dyn_slurm_load_jobs (&msg) < 0 || msg == NULL)) return (0); /* no job list: nothing can be confirmed running */ + + for (i = 0; i < msg->record_count; i++) { + job_info_t *j = &msg->job_array[i]; + + if (j->job_id == jobid && j->job_state == JOB_RUNNING) + return (1); + } + + return (0); +} + +int cpuset_ntasks (const char *path) +{ + struct cpuset_pidlist *pids; + int n; + + if ((pids = cpuset_init_pidlist (path, 0)) == NULL) { + cpuset_error ("cpuset_init_pidlist %s: %m", path); + return (-1); + } + + n = cpuset_pidlist_length (pids); + + cpuset_freepidlist (pids); + + return (n); +} + +int slurm_cpuset_clean_path (const char *path) +{ + int userid; + int jobid; + int stepid; + const char *name = cpuset_path_to_name (path); + + if (sscanf (name, "/slurm/%d/%d/%d", &userid, &jobid, &stepid) == 2) { + /* + * We only destroy jobid cpusets when the owner uid + * cpuset is also empty. This is because the jobid + * cpusets are used for accounting the CPUs in the + * uid cpuset. + */ + char user_cpuset [128]; + snprintf (user_cpuset, sizeof (user_cpuset), "/slurm/%d", userid); + if ((cpuset_ntasks (user_cpuset) > 0) && + slurm_jobid_is_valid (jobid)) + return (0); + } + + rmdir (path); + return (0); +} + +int slurm_cpuset_clean (cpuset_conf_t cf) +{ + struct cpuset_fts_tree *fts; + const struct cpuset_fts_entry *entry; + + if (!(fts = cpuset_fts_open ("/slurm"))) + return (-1); + /* + * Reverse cpuset fts tree so that child cpusets + * are returned before parents.
This is important + * because a cpuset can seemingly only be removed + * after all its children have been removed. + */ + + cpuset_fts_reverse (fts); + + while ((entry = cpuset_fts_read (fts))) { + const char *name = cpuset_fts_get_path (entry); + + + if (strcmp (name, "/slurm") != 0) { + char path [4096]; + snprintf (path, sizeof (path), "/dev/cpuset%s", name); + cpuset_debug ("clean: %s\n", name); + slurm_cpuset_clean_path (path); + } + } + + cpuset_fts_close (fts); + + update_user_cpusets (cf); + + return (0); +} + +static int do_cpuset_lock (const char *name) +{ + int fd; + char path [1024]; + + /* + * We can't just use any files under the cpuset for advisory locking as + * we used to do. Recall that the advisory lock is dropped for the + * process if _any_ open file descriptor for the locked file is closed. + * Since libcpuset opens and closes *all* files under all our cpusets, + * we instead use a more typical lockfile under /var/lock. + */ + snprintf (path, sizeof (path), "/var/lock/%s-cpuset", name); + +again: + if ((fd = open (path, O_RDWR|O_CREAT|O_NOFOLLOW, 0644)) < 0) { + static int first = 1; + if ((errno == ELOOP || errno == EEXIST) && first) { /* A symlink: O_NOFOLLOW reports ELOOP */ + unlink (path); + first = 0; + goto again; + } + log_err ("Open of lockfile [%s] failed: %s\n", path, strerror (errno)); + return (-1); + } + if (fd_get_writew_lock (fd) < 0) { + close (fd); + return (-1); + } + return (fd); +} + +static int do_cpuset_unlock (int fd) +{ + if (fd < 0) + return (-1); + /*if (fd_release_lock (fd) < 0) + return (-1); */ + return (close (fd)); +} + +int slurm_cpuset_lock (void) +{ + return (do_cpuset_lock ("/slurm")); +} + +int slurm_cpuset_unlock (int fd) +{ + return (do_cpuset_unlock (fd)); +} + +/* + * Create slurm cpuset if necessary and return + * with lock held.
+ */ +static int create_and_lock_cpuset_dir (cpuset_conf_t cf, const char *name) +{ + char path [1024] = "/dev/cpuset"; + struct cpuset *cp; + int fd; + mode_t oldmask = umask (022); + + strncat (path, name, sizeof (path) - strlen (path) - 1); /* bound is the room left in path */ + + cpuset_debug2 ("create_and_lock_cpuset_dir (%s)\n", name); + + /* + * First grab cpuset lock from /var/lock: + */ + if ((fd = do_cpuset_lock (name)) < 0) { + cpuset_error ("Failed to lock %s: %m", path); + umask (oldmask); return (-1); /* do not leave the caller's umask changed */ + } + + if ((mkdir (path, 0755)) < 0) { + /* If mkdir fails with EEXIST, then slurm cpuset already + * exists and we can simply return lockfd after ensuring + * the cpuset is "clean" + */ + umask (oldmask); + if (errno == EEXIST) { + slurm_cpuset_clean (cf); + return (fd); + } + else { + cpuset_error ("mkdir %s: %m", path); + do_cpuset_unlock (fd); return (-1); /* caller never sees fd, so release the lock here */ + } + } + umask (oldmask); + + /* + * Initialize SLURM cpuset with all CPUs and MEMs: + */ + cp = cpuset_alloc (); + if (cpuset_query (cp, "/") < 0) { + cpuset_error ("Failed to query root cpuset: %m"); + cpuset_free (cp); do_cpuset_unlock (fd); return (-1); + } + + cpuset_debug2 ("modifying %s cpuset\n", name); + + if (cpuset_modify (name, cp) < 0) { + cpuset_error ("Failed to modify %s cpuset: %m", name); + cpuset_free (cp); do_cpuset_unlock (fd); return (-1); + } + + cpuset_free (cp); + + return (fd); +} + +int slurm_cpuset_create (cpuset_conf_t cf) +{ + return (create_and_lock_cpuset_dir (cf, "/slurm")); +} + +int str2int (const char *str) +{ + char *p; + long l = strtol (str, &p, 10); + + if (p && (*p != '\0')) + return (-1); + + return ((int) l); +} +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/cpuset/util.h b/cpuset/util.h new file mode 100644 index 0000000..174a6b0 --- /dev/null +++ b/cpuset/util.h @@ -0,0 +1,64 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +#ifndef _HAVE_CPUSET_UTIL_H +#define _HAVE_CPUSET_UTIL_H + +#include + +#include +#include +#include +#include + +#include "fd.h" +#include "conf.h" + +int cpumask_size (void); +int memmask_size (void); + +int slurm_cpuset_lock (void); +int slurm_cpuset_unlock (int fd); + +int user_cpuset_lock (uid_t uid); +void user_cpuset_unlock (int fd); + +void print_current_cpuset_info (); +void print_cpuset_info (const char *path, struct cpuset *cp); + +void print_bitmask (const char * fmt, const struct bitmask *b); + +struct bitmask *used_cpus_bitmask_path (char *path, int clearall); + +int slurm_cpuset_create (cpuset_conf_t conf); +int slurm_cpuset_clean_path (const char *path); + +int str2int (const char *str); + +const char * cpuset_path_to_name (const char *path); +#endif + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/cpuset/version.map b/cpuset/version.map new file mode 100644 index 0000000..e234ff4 --- /dev/null +++ b/cpuset/version.map @@ -0,0 +1,9 @@ +{ global: + plugin_name; + plugin_type; + plugin_version; + spank*; + slurm_spank*; + local: + *; +}; diff --git a/iorelay/Makefile b/iorelay/Makefile new file mode 100644 index 0000000..f0c78b7 --- /dev/null +++ b/iorelay/Makefile @@ -0,0 +1,13 @@ +CFLAGS = -Wall -ggdb + +all: iorelay.so + +.SUFFIXES: .c .o .so + +.c.o: + $(CC) 
$(CFLAGS) -o $@ -fPIC -c $< +.o.so: + $(CC) -shared -o $*.so $< $(LIBS) + +clean: + rm -f *.so *.o diff --git a/iorelay/iorelay-bind-nfs.sh b/iorelay/iorelay-bind-nfs.sh new file mode 100755 index 0000000..3a4cb51 --- /dev/null +++ b/iorelay/iorelay-bind-nfs.sh @@ -0,0 +1,84 @@ +#!/bin/bash +############################################################################### +# +# Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. +# Produced at Lawrence Livermore National Laboratory. +# Written by Jim Garlick . +# +# UCRL-CODE-235358 +# +# This file is part of chaos-spankings, a set of spank plugins for SLURM. +# +# This is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
#
# iorelay-bind-nfs - bind directories from mntpt over all nfs mounted
#                    file systems
#
# Run as root in private namespace
#
declare -r prog=iorelay-bind-nfs

# Print an error message to stderr and exit with status 1.
die ()
{
    echo "$prog: $1" >&2
    exit 1
}
# Print a message to stderr and keep going.
warn ()
{
    echo "$prog: $1" >&2
}
usage ()
{
    echo "Usage: $prog -m mntpt"
    exit 1
}
# Print the mount point of every /proc/mounts entry whose type is "nfs".
listnfs ()
{
    local src dst typ opts a1 a2

    while read -r src dst typ opts a1 a2; do
        if [ "${typ}" = nfs ]; then
            echo "${dst}"
        fi
    done < /proc/mounts
}

[ -n "$SLURM_NODELIST" ] || die "SLURM_NODELIST is not set"
relayhost=$(echo "$SLURM_NODELIST" | glob-hosts -n1)
[ -n "$relayhost" ] || die "could not determine relayhost"
[ "$(hostname)" = "$relayhost" ] && exit 0  # silently exit if relayhost

mntpt=""
while getopts "m:" opt; do
    case ${opt} in
        m) mntpt=${OPTARG} ;;
        *) usage ;;
    esac
done
shift $((OPTIND - 1))
[ $# = 0 ] || usage
[ -n "$mntpt" ] || usage
[ -d "$mntpt" ] || die "not a directory: $mntpt"

# Snapshot the nfs mount list before binding anything, so the binds
# made below cannot influence the list being walked.
nfsdirs=$(listnfs)

count=0
while read -r dir; do
    # an empty snapshot still yields one empty line from the here-string
    [ -n "$dir" ] || continue
    if [ -d "${mntpt}/${dir}" ]; then
        # only count file systems that were actually bound
        if mount --bind "${mntpt}/${dir}" "${dir}"; then
            count=$((count + 1))
        else
            warn "bind ${dir} failed"
        fi
    fi
done <<< "$nfsdirs"
warn "relayed $count file systems"

exit 0
#
# iorelay-mount-nodezero - mount / from first slurm node on /mnt
#
# Run as root in private namespace.
#
declare -r prog=iorelay-mount-nodezero
declare -r sshcmd=/usr/libexec/iorelay-mrsh-sshfs-wrap

# Print an error message to stderr and exit with status 1.
die ()
{
    echo "$prog: $1" >&2
    exit 1
}
# Print a message to stderr and keep going.
warn ()
{
    echo "$prog: $1" >&2
}
usage ()
{
    echo "Usage: $prog -m mntpt -u username"
    exit 1
}


[ -n "$SLURM_NODELIST" ] || die "SLURM_NODELIST is not set"
relayhost=$(echo "$SLURM_NODELIST" | glob-hosts -n1)
[ -n "$relayhost" ] || die "could not determine relayhost"
[ "$(hostname)" = "$relayhost" ] && exit 0  # silently exit if relayhost

mntpt=""
username=""
while getopts "u:m:" opt; do
    case ${opt} in
        m) mntpt=${OPTARG} ;;
        u) username=${OPTARG} ;;
        *) usage ;;
    esac
done
shift $((OPTIND - 1))
[ $# = 0 ] || usage
[ -n "$mntpt" ] || usage
[ -d "$mntpt" ] || die "not a directory: $mntpt"
[ -n "$username" ] || usage
# discard id's diagnostics so that only the numeric uid lands in $uid
uid=$(id -u "$username" 2>/dev/null) || die "no such user: $username"
[ "$uid" != 0 ] || die "sshfs as root is unsupported"

grep -q sshfs /proc/mounts && die "sshfs is already mounted"

# NOTE: work around missing -n option in sshfs/fusermount
mv -f /etc/mtab /etc/mtab-iorelay || die "failed to back up /etc/mtab"
# sshfs takes its source as [user@]host:[dir]; the ':' is what marks the
# argument as the remote host rather than a mount point.
sshfs -o ssh_command="${sshcmd}" "${username}@${relayhost}:/" "${mntpt}"
result=$?
mv -f /etc/mtab-iorelay /etc/mtab || warn "failed to restore /etc/mtab"
[ $result = 0 ] || die "sshfs mount ${username}@${relayhost}:/ ${mntpt} failed"

exit 0
#
# sshfs-mrsh-wrap - wrapper for mrsh for sshfs usage
#
declare -r prog=iorelay-sshfs-mrsh-wrap

# Print an error message to stderr and exit with status 1.
die () {
    echo "$prog: $1" >&2
    exit 1
}

# Start empty so values cannot leak in from the caller's environment.
user=""
host=""

# Expected args:
#   -x -a -oClearAllForwardings=yes -2 user@host -s sftp
# We ignore everything except user@host arg
for arg in "$@"; do
    case "$arg" in
        *@*)
            user=${arg%%@*}     # text before the first '@'
            host=${arg#*@}      # text after the first '@' ...
            host=${host%%@*}    # ... up to the next '@', like cut -d@ -f2
            ;;
    esac
done

[ -n "$user" ] && [ -n "$host" ] || die "no user@host arg"

exec /usr/bin/mrsh -l "$user" "$host" /usr/libexec/openssh/sftp-server
die "failed to exec mrsh"
# NOTREACHED
+ ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * All spank plugins must define this macro for the SLURM plugin loader. + */ +SPANK_PLUGIN(iorelay, 1) + +#define IORELAY_ENABLE 1 + +/* Usage: iorelay-mount-nodezero -u user -m mntpt */ +#define MOUNT_SCRIPT "/usr/libexec/iorelay-mount-nodezero" + +/* Usage: iorelay-bind-nfs -m mntpt */ +#define BIND_SCRIPT "/usr/libexec/iorelay-bind-nfs" + +static int enabled = 0; + +static int _opt_process (int val, const char *optarg, int remote); + +/* + * Provide a --iorelay option to srun: + */ +struct spank_option spank_options[] = +{ + { "iorelay", NULL, "Enable NFS I/O relaying.", + 1, IORELAY_ENABLE, + (spank_opt_cb_f) _opt_process + }, + SPANK_OPTIONS_TABLE_END +}; + +/* + * Called from both srun and slurmd. + */ +int slurm_spank_init (spank_t sp, int ac, char **av) +{ + char cmd[256]; + struct passwd *pw; + uid_t uid; + + if (!enabled || !spank_remote (sp)) + return (0); + + spank_get_item (sp, S_JOB_UID, &uid); + pw = getpwuid (uid); + if (!pw) { + slurm_error ("Error looking up uid in /etc/passwd"); + return (-1); + } + + /* Unshare file namespace. This means only this process and its children + * will see the following mounts, and when this process and its children + * terminate, the mounts go away automatically. + */ + if (unshare (CLONE_NEWNS) < 0) { + slurm_error ("unshare CLONE_NEWNS: %m"); + return (-1); + } + + /* Mount node zero root on /mnt using sshfs. + * Script has no effect on node zero. + */ + snprintf (cmd, sizeof(cmd), "%s -u %s -m /mnt", MOUNT_SCRIPT, pw->pw_name); + if (system (cmd) != 0) { + slurm_error ("Error running `%s': %m", cmd); + return (-1); + } + + /* Bind NFS-mounted directories now mirrored in /mnt via sshfs + * over their NFS mount points. + * Script has no effect on node zero. 
+ */ + snprintf (cmd, sizeof(cmd), "%s -m /mnt", BIND_SCRIPT); + if (system (cmd) != 0) { + slurm_error ("Error running `%s': %m", cmd); + return (-1); + } + + return (0); +} + +/* + * Called from both srun and slurmd. + */ +int slurm_spank_exit (spank_t sp, int ac, char **av) +{ + /* Do nothing here as mounts in private namespace will take care of + * themselves. + */ + return (0); +} + +static int _opt_process (int val, const char *optarg, int remote) +{ + switch (val) { + case IORELAY_ENABLE: + enabled = 1; + break; + default: + slurm_error ("Ignoring unknown iorelay option value %d\n", val); + break; + } + + return (0); +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/iotrace.c b/iotrace.c new file mode 100644 index 0000000..8b154e6 --- /dev/null +++ b/iotrace.c @@ -0,0 +1,126 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +#include +#include +#include +#include +#include + +#include + +/* + * All spank plugins must define this macro for the SLURM plugin loader. 
+ */ +SPANK_PLUGIN(iotrace, 1) + +#define IOTRACE_ENABLE 1 + +static int enabled = 0; +static char *flags = ""; + +static int _opt_process (int val, const char *optarg, int remote); + +/* + * Provide a --iotrace option to srun: + */ +struct spank_option spank_options[] = +{ + { "iotrace", "[flags]", "Enable application I/O tracing.", + 2, IOTRACE_ENABLE, + (spank_opt_cb_f) _opt_process + }, + SPANK_OPTIONS_TABLE_END +}; + + +static void _iotrace_label(spank_t sp, char *buf, int len) +{ + char hostname[128], *p; + uint32_t taskid = 0; + spank_err_t rc; + + rc = spank_get_item (sp, S_TASK_GLOBAL_ID, &taskid); + if (rc != ESPANK_SUCCESS) + slurm_error ("iotrace: error fetching taskid: %d", rc); + + if (gethostname (hostname, sizeof (hostname)) == 0) { + hostname[sizeof(hostname) - 1] = '\0'; + if ((p = strchr (hostname, '.'))) + *p = '\0'; + } else + strncpy (hostname, "unknown", sizeof(hostname)); + + snprintf (buf, len, "iotrace-%d@%s", taskid, hostname); +} + +int slurm_spank_task_init (spank_t sp, int ac, char **av) +{ + char nbuf [4096], obuf [4096]; + char label [64]; + const char *preload = "libplasticfs.so"; + + if (!enabled) + return (0); + + /* append to LD_PRELOAD (with a space) */ + if (spank_getenv (sp, "LD_PRELOAD", obuf, sizeof (obuf)) == ESPANK_SUCCESS) + snprintf (nbuf, sizeof (nbuf), "%s %s", obuf, preload); + else + strncpy (nbuf, preload, strlen (preload)); + if (spank_setenv (sp, "LD_PRELOAD", nbuf, 1) != ESPANK_SUCCESS) + slurm_error ("Failed to set LD_PRELOAD=%s\n", nbuf); + + /* prepend to PLASTICFS (with a pipe) */ + _iotrace_label (sp, label, sizeof (label)); + if (spank_getenv (sp, "PLASTICFS", obuf, sizeof (obuf)) == ESPANK_SUCCESS) + snprintf (nbuf, sizeof (nbuf), "log - %s %s | %s", label, flags, obuf); + else + snprintf (nbuf, sizeof (nbuf), "log - %s %s", label, flags); + + if (spank_setenv (sp, "PLASTICFS", nbuf, 1) != ESPANK_SUCCESS) + slurm_error ("Failed to set PLASTICFS=%s\n", nbuf); + + return (0); +} + +static int _opt_process 
(int val, const char *optarg, int remote) +{ + switch (val) { + case IOTRACE_ENABLE: + enabled = 1; + if (optarg) + flags = optarg; + break; + default: + slurm_error ("Ignoring unknown iotrace option value %d\n", val); + break; + } + + return (0); +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/lib/fd.c b/lib/fd.c new file mode 100644 index 0000000..99f2270 --- /dev/null +++ b/lib/fd.c @@ -0,0 +1,273 @@ +/***************************************************************************** + * $Id: fd.c 412 2003-06-03 21:31:19Z achu $ + ***************************************************************************** + * This file is part of the Munge Uid 'N' Gid Emporium (MUNGE). + * For details, see . + * UCRL-CODE-2003-???. + * + * Copyright (C) 2001-2003 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Chris Dunlap . + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License; + * if not, write to the Free Software Foundation, Inc., 59 Temple Place, + * Suite 330, Boston, MA 02111-1307 USA. + ***************************************************************************** + * Refer to "fd.h" for documentation on public functions. 
static int _fd_get_lock (int fd, int cmd, int type);
static pid_t _fd_test_lock (int fd, int type);


/*
 *  Add FD_CLOEXEC to the descriptor flags of [fd], keeping any flags
 *   that are already set.
 */
int
fd_set_close_on_exec (int fd)
{
    int fdflags;

    assert (fd >= 0);

    if ((fdflags = fcntl (fd, F_GETFD, 0)) < 0)
        return (-1);
    if (fcntl (fd, F_SETFD, fdflags | FD_CLOEXEC) < 0)
        return (-1);
    return (0);
}


/*
 *  Add O_NONBLOCK to the file status flags of [fd].
 */
int
fd_set_nonblocking (int fd)
{
    int fval;

    assert (fd >= 0);

    if ((fval = fcntl (fd, F_GETFL, 0)) < 0)
        return (-1);
    if (fcntl (fd, F_SETFL, fval | O_NONBLOCK) < 0)
        return (-1);
    return (0);
}


int
fd_get_read_lock (int fd)
{
    return (_fd_get_lock (fd, F_SETLK, F_RDLCK));
}


int
fd_get_readw_lock (int fd)
{
    return (_fd_get_lock (fd, F_SETLKW, F_RDLCK));
}


int
fd_get_write_lock (int fd)
{
    return (_fd_get_lock (fd, F_SETLK, F_WRLCK));
}


int
fd_get_writew_lock (int fd)
{
    return (_fd_get_lock (fd, F_SETLKW, F_WRLCK));
}


int
fd_release_lock (int fd)
{
    return (_fd_get_lock (fd, F_SETLK, F_UNLCK));
}


pid_t
fd_is_read_lock_blocked (int fd)
{
    return (_fd_test_lock (fd, F_RDLCK));
}


pid_t
fd_is_write_lock_blocked (int fd)
{
    return (_fd_test_lock (fd, F_WRLCK));
}


/*
 *  Apply fcntl command [cmd] with a lock of [type] covering the whole
 *   file (start 0 from SEEK_SET, length 0). Returns fcntl's result.
 */
static int
_fd_get_lock (int fd, int cmd, int type)
{
    struct flock lock;

    assert (fd >= 0);

    lock.l_type = type;
    lock.l_start = 0;
    lock.l_whence = SEEK_SET;
    lock.l_len = 0;

    return (fcntl (fd, cmd, &lock));
}


/*
 *  Ask via F_GETLK whether a whole-file lock of [type] could be placed.
 *  Returns 0 if it could, the l_pid reported by fcntl if it would be
 *   blocked, or -1 on error.
 */
static pid_t
_fd_test_lock (int fd, int type)
{
    struct flock lock;

    assert (fd >= 0);

    lock.l_type = type;
    lock.l_start = 0;
    lock.l_whence = SEEK_SET;
    lock.l_len = 0;

    if (fcntl (fd, F_GETLK, &lock) < 0)
        return (-1);
    if (lock.l_type == F_UNLCK)
        return (0);
    return (lock.l_pid);
}


/*
 *  Read until [n] bytes have been stored in [buf] or EOF is reached,
 *   restarting reads interrupted by a signal (EINTR).
 *  Returns the number of bytes read, or -1 on error.
 */
ssize_t
fd_read_n (int fd, void *buf, size_t n)
{
    size_t nleft;
    ssize_t nread;
    unsigned char *p;

    p = buf;
    nleft = n;
    while (nleft > 0) {
        if ((nread = read (fd, p, nleft)) < 0) {
            if (errno == EINTR)
                continue;
            else
                return (-1);
        }
        else if (nread == 0) {          /* EOF */
            break;
        }
        nleft -= nread;
        p += nread;
    }
    return (n - nleft);
}


/*
 *  Write all [n] bytes of [buf] to [fd], restarting writes interrupted
 *   by a signal (EINTR). Returns [n], or -1 on error.
 */
ssize_t
fd_write_n (int fd, void *buf, size_t n)
{
    size_t nleft;
    ssize_t nwritten;
    unsigned char *p;

    p = buf;
    nleft = n;
    while (nleft > 0) {
        if ((nwritten = write (fd, p, nleft)) < 0) {
            if (errno == EINTR)
                continue;
            else
                return (-1);
        }
        nleft -= nwritten;
        p += nwritten;
    }
    return (n);
}


/*
 *  Read one byte at a time from [fd] into [buf] until a newline has
 *   been stored, [maxlen]-1 bytes have been read, or EOF is reached.
 *   [buf] is always NUL-terminated on a non-error return.
 *  Returns the number of bytes read, 0 on EOF with no data, or -1 on
 *   error. A [maxlen] of 0 leaves no room for the terminator, so it is
 *   rejected with errno set to EINVAL and [buf] is not written.
 */
ssize_t
fd_read_line (int fd, void *buf, size_t maxlen)
{
    ssize_t n, rc;
    unsigned char c, *p;

    if (maxlen == 0) {
        errno = EINVAL;
        return (-1);
    }

    n = 0;
    p = buf;
    while (n < (ssize_t) maxlen - 1) {  /* reserve space for NUL-termination */

        if ((rc = read (fd, &c, 1)) == 1) {
            n++;
            *p++ = c;
            if (c == '\n')
                break;                  /* store newline, like fgets() */
        }
        else if (rc == 0) {
            if (n == 0)                 /* EOF, no data read */
                return (0);
            else                        /* EOF, some data read */
                break;
        }
        else {
            if (errno == EINTR)
                continue;
            return (-1);
        }
    }

    *p = '\0';                          /* NUL-terminate, like fgets() */
    return (n);
}

/*
 *  Following added by Mike Haskell
 *
 *  Like fd_read_n(), but also stops as soon as the data read so far
 *   contains a NUL byte. Returns the number of bytes read, or -1 on
 *   error.
 */
ssize_t
fd_null_read_n (int fd, void *buf, size_t n)
{
    size_t nleft;
    ssize_t nread;
    unsigned char *p;

    p = (unsigned char *) buf;
    nleft = n;
    while (nleft > 0) {
        if ((nread = read (fd, p, nleft)) < 0) {
            if (errno == EINTR)
                continue;
            else
                return (-1);
        }
        else if (nread == 0) {          /* EOF */
            break;
        }
        nleft -= nread;
        /*
         *  Earlier chunks held no NUL (we would have stopped already),
         *   so only the bytes just read need to be searched.
         */
        if (memchr (p, '\0', (size_t) nread) != NULL)
            break;
        p += nread;
    }
    return (n - nleft);
}
21:31:19Z achu $ + ***************************************************************************** + * This file is part of the Munge Uid 'N' Gid Emporium (MUNGE). + * For details, see . + * UCRL-CODE-2003-???. + * + * Copyright (C) 2001-2003 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Chris Dunlap . + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License; + * if not, write to the Free Software Foundation, Inc., 59 Temple Place, + * Suite 330, Boston, MA 02111-1307 USA. + *****************************************************************************/ + + +#ifndef FD_H +#define FD_H + + +#if HAVE_CONFIG_H +# include "config.h" +#endif /* HAVE_CONFIG_H */ + +#include +#include + + +int fd_set_close_on_exec (int fd); +/* + * Sets the file descriptor [fd] to be closed on exec(). + * Returns 0 on success, or -1 on error. + */ + +int fd_set_nonblocking (int fd); +/* + * Sets the file descriptor [fd] for non-blocking I/O. + * Returns 0 on success, or -1 on error. + */ + +int fd_get_read_lock (int fd); +/* + * Obtain a read lock on the file specified by [fd]. + * Returns 0 on success, or -1 if prevented from obtaining the lock. + */ + +int fd_get_readw_lock (int fd); +/* + * Obtain a read lock on the file specified by [fd], + * blocking until one becomes available. + * Returns 0 on success, or -1 on error. 
+ */ + +int fd_get_write_lock (int fd); +/* + * Obtain a write lock on the file specified by [fd]. + * Returns 0 on success, or -1 if prevented from obtaining the lock. + */ + +int fd_get_writew_lock (int fd); +/* + * Obtain a write lock on the file specified by [fd], + * blocking until one becomes available. + * Returns 0 on success, or -1 on error. + */ + +int fd_release_lock (int fd); +/* + * Release a lock held on the file specified by [fd]. + * Returns 0 on success, or -1 on error. + */ + +pid_t fd_is_read_lock_blocked (int fd); +/* + * Checks to see if a lock exists on [fd] that would block a request for a + * read-lock (ie, if a write-lock is already being held on the file). + * Returns the pid of the process holding the lock, 0 if no lock exists, + * or -1 on error. + */ + +pid_t fd_is_write_lock_blocked (int fd); +/* + * Checks to see if a lock exists on [fd] that would block a request for a + * write-lock (ie, if any lock is already being held on the file). + * Returns the pid of the process holding the lock, 0 if no lock exists, + * or -1 on error. + */ + +ssize_t fd_read_n (int fd, void *buf, size_t n); +/* + * Reads up to [n] bytes from [fd] into [buf]. + * Returns the number of bytes read, 0 on EOF, or -1 on error. + */ + +ssize_t fd_write_n (int fd, void *buf, size_t n); +/* + * Writes [n] bytes from [buf] to [fd]. + * Returns the number of bytes written, or -1 on error. + */ + +ssize_t fd_read_line (int fd, void *buf, size_t maxlen); +/* + * Reads at most [maxlen-1] bytes up to a newline from [fd] into [buf]. + * The [buf] is guaranteed to be NUL-terminated and will contain the + * newline if it is encountered within [maxlen-1] bytes. + * Returns the number of bytes read, 0 on EOF, or -1 on error. + */ + +ssize_t fd_null_read_n (int fd, void *buf, size_t maxlen); +/* + * Reads up to [n] bytes from [fd] into [buf]. + * Returns the number of bytes read, 0 on EOF, or -1 on error. 
+ * Differs from fd_read_n() in that it checks for the presence + * a null along the partial read and breaks out if it does. + * Added by Mike Haskell + */ + +#endif /* !FD_H */ diff --git a/lib/hostlist.c b/lib/hostlist.c new file mode 100644 index 0000000..b55a78a --- /dev/null +++ b/lib/hostlist.c @@ -0,0 +1,2715 @@ +/*****************************************************************************\ + * $Id: hostlist.c 7582 2008-07-11 22:38:28Z grondo $ + ***************************************************************************** + * Copyright (C) 2002 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Mark Grondona + * UCRL-CODE-2002-040. + * + * This file is part of SLURM, a resource management program. + * For details, see . + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+\*****************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +# if HAVE_STRING_H +# include +# endif +# if HAVE_PTHREAD_H +# include +# endif +#else /* !HAVE_CONFIG_H */ +# include +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hostlist.h" + +/* + * lsd_fatal_error : fatal error macro + */ +#ifdef WITH_LSD_FATAL_ERROR_FUNC +# undef lsd_fatal_error + extern void lsd_fatal_error(char *file, int line, char *mesg); +#else /* !WITH_LSD_FATAL_ERROR_FUNC */ +# ifndef lsd_fatal_error +# define lsd_fatal_error(file, line, mesg) \ + do { \ + fprintf(stderr, "ERROR: [%s:%d] %s: %s\n", \ + file, line, mesg, strerror(errno)); \ + } while (0) +# endif /* !lsd_fatal_error */ +#endif /* !WITH_LSD_FATAL_ERROR_FUNC */ + +/* + * lsd_nomem_error + */ +#ifdef WITH_LSD_NOMEM_ERROR_FUNC +# undef lsd_nomem_error + extern void * lsd_nomem_error(char *file, int line, char *mesg); +#else /* !WITH_LSD_NOMEM_ERROR_FUNC */ +# ifndef lsd_nomem_error +# define lsd_nomem_error(file, line, mesg) (NULL) +# endif /* !lsd_nomem_error */ +#endif /* !WITH_LSD_NOMEM_ERROR_FUNC */ + +/* + * OOM helper function + * Automatically call lsd_nomem_error with appropriate args + * and set errno to ENOMEM + */ +#define out_of_memory(mesg) \ + do { \ + errno = ENOMEM; \ + return(lsd_nomem_error(__FILE__, __LINE__, mesg)); \ + } while (0) + +/* + * Some constants and tunables: + */ + +/* number of elements to allocate when extending the hostlist array */ +#define HOSTLIST_CHUNK 16 + +/* max host range: anything larger will be assumed to be an error */ +#define MAX_RANGE 16384 /* 16K Hosts */ + +/* max host suffix value */ +#define MAX_HOST_SUFFIX 1<<25 + +/* max number of ranges that will be processed between brackets */ +#define MAX_RANGES 10240 /* 10K Ranges */ + +/* size of internal hostname buffer (+ some slop), hostnames will probably + * be truncated if 
longer than MAXHOSTNAMELEN */ +#ifndef MAXHOSTNAMELEN +#define MAXHOSTNAMELEN 64 +#endif + +/* max size of internal hostrange buffer */ +#define MAXHOSTRANGELEN 1024 + +/* ----[ Internal Data Structures ]---- */ + +/* hostname type: A convenience structure used in parsing single hostnames */ +struct hostname_components { + char *hostname; /* cache of initialized hostname */ + char *prefix; /* hostname prefix */ + unsigned long num; /* numeric suffix */ + + /* string representation of numeric suffix + * points into `hostname' */ + char *suffix; +}; + +typedef struct hostname_components *hostname_t; + +/* hostrange type: A single prefix with `hi' and `lo' numeric suffix values */ +struct hostrange_components { + char *prefix; /* alphanumeric prefix: */ + + /* beginning (lo) and end (hi) of suffix range */ + unsigned long lo, hi; + + /* width of numeric output format + * (pad with zeros up to this width) */ + int width; + + /* If singlehost is 1, `lo' and `hi' are invalid */ + unsigned singlehost:1; +}; + +typedef struct hostrange_components *hostrange_t; + +/* The hostlist type: An array based list of hostrange_t's */ +struct hostlist { +#ifndef NDEBUG +#define HOSTLIST_MAGIC 57005 + int magic; +#endif +#if WITH_PTHREADS + pthread_mutex_t mutex; +#endif /* WITH_PTHREADS */ + + /* current number of elements available in array */ + int size; + + /* current number of ranges stored in array */ + int nranges; + + /* current number of hosts stored in hostlist */ + int nhosts; + + /* pointer to hostrange array */ + hostrange_t *hr; + + /* list of iterators */ + struct hostlist_iterator *ilist; + +}; + + +/* a hostset is a wrapper around a hostlist */ +struct hostset { + hostlist_t hl; +}; + +struct hostlist_iterator { +#ifndef NDEBUG + int magic; +#endif + /* hostlist we are traversing */ + hostlist_t hl; + + /* current index of iterator in hl->hr[] */ + int idx; + + /* current hostrange object in list hl, i.e. 
hl->hr[idx] */ + hostrange_t hr; + + /* current depth we've traversed into range hr */ + int depth; + + /* next ptr for lists of iterators */ + struct hostlist_iterator *next; +}; + + +/* ---- ---- */ + +/* ------[ static function prototypes ]------ */ + +static void _error(char *file, int line, char *mesg, ...); +static char * _next_tok(char *, char **); +static int _zero_padded(unsigned long, int); +static int _width_equiv(unsigned long, int *, unsigned long, int *); + +static int host_prefix_end(const char *); +static hostname_t hostname_create(const char *); +static void hostname_destroy(hostname_t); +static int hostname_suffix_is_valid(hostname_t); +static int hostname_suffix_width(hostname_t); + +static hostrange_t hostrange_new(void); +static hostrange_t hostrange_create_single(const char *); +static hostrange_t hostrange_create(char *, unsigned long, unsigned long, int); +static unsigned long hostrange_count(hostrange_t); +static hostrange_t hostrange_copy(hostrange_t); +static void hostrange_destroy(hostrange_t); +static hostrange_t hostrange_delete_host(hostrange_t, unsigned long); +static int hostrange_cmp(hostrange_t, hostrange_t); +static int hostrange_prefix_cmp(hostrange_t, hostrange_t); +static int hostrange_within_range(hostrange_t, hostrange_t); +static int hostrange_width_combine(hostrange_t, hostrange_t); +static int hostrange_empty(hostrange_t); +static char * hostrange_pop(hostrange_t); +static char * hostrange_shift(hostrange_t); +static int hostrange_join(hostrange_t, hostrange_t); +static hostrange_t hostrange_intersect(hostrange_t, hostrange_t); +static int hostrange_hn_within(hostrange_t, hostname_t); +static size_t hostrange_to_string(hostrange_t hr, size_t, char *, char *); +static size_t hostrange_numstr(hostrange_t, size_t, char *); + +static hostlist_t hostlist_new(void); +static hostlist_t _hostlist_create_bracketed(const char *, char *, char *); +static int hostlist_resize(hostlist_t, size_t); +static int 
hostlist_expand(hostlist_t); +static int hostlist_push_range(hostlist_t, hostrange_t); +static int hostlist_push_hr(hostlist_t, char *, unsigned long, + unsigned long, int); +static int hostlist_insert_range(hostlist_t, hostrange_t, int); +static void hostlist_delete_range(hostlist_t, int n); +static void hostlist_coalesce(hostlist_t hl); +static void hostlist_collapse(hostlist_t hl); +static hostlist_t _hostlist_create(const char *, char *, char *); +static void hostlist_shift_iterators(hostlist_t, int, int, int); +static int _attempt_range_join(hostlist_t, int); +static int _is_bracket_needed(hostlist_t, int); + +static hostlist_iterator_t hostlist_iterator_new(void); +static void _iterator_advance(hostlist_iterator_t); +static void _iterator_advance_range(hostlist_iterator_t); + +static int hostset_find_host(hostset_t, const char *); + +/* ------[ macros ]------ */ + +#ifdef WITH_PTHREADS +# define mutex_init(mutex) \ + do { \ + int e = pthread_mutex_init(mutex, NULL); \ + if (e) { \ + errno = e; \ + lsd_fatal_error(__FILE__, __LINE__, "hostlist mutex init:"); \ + abort(); \ + } \ + } while (0) + +# define mutex_lock(mutex) \ + do { \ + int e = pthread_mutex_lock(mutex); \ + if (e) { \ + errno = e; \ + lsd_fatal_error(__FILE__, __LINE__, "hostlist mutex lock:"); \ + abort(); \ + } \ + } while (0) + +# define mutex_unlock(mutex) \ + do { \ + int e = pthread_mutex_unlock(mutex); \ + if (e) { \ + errno = e; \ + lsd_fatal_error(__FILE__, __LINE__, "hostlist mutex unlock:"); \ + abort(); \ + } \ + } while (0) + +# define mutex_destroy(mutex) \ + do { \ + int e = pthread_mutex_destroy(mutex); \ + if (e) { \ + errno = e; \ + lsd_fatal_error(__FILE__, __LINE__, "hostlist mutex destroy:"); \ + abort(); \ + } \ + } while (0) + +#else /* !WITH_PTHREADS */ + +# define mutex_init(mutex) +# define mutex_lock(mutex) +# define mutex_unlock(mutex) +# define mutex_destroy(mutex) + +#endif /* WITH_PTHREADS */ + +#define LOCK_HOSTLIST(_hl) \ + do { \ + assert(_hl != NULL); \ + 
mutex_lock(&(_hl)->mutex); \ + assert((_hl)->magic == HOSTLIST_MAGIC); \ + } while (0) + +#define UNLOCK_HOSTLIST(_hl) \ + do { \ + mutex_unlock(&(_hl)->mutex); \ + } while (0) + +#define seterrno_ret(_errno, _rc) \ + do { \ + errno = _errno; \ + return _rc; \ + } while (0) + +/* ------[ Function Definitions ]------ */ + +/* ----[ general utility functions ]---- */ + + +/* + * Varargs capable error reporting via lsd_fatal_error() + */ +static void _error(char *file, int line, char *msg, ...) +{ + va_list ap; + char buf[1024]; + int len = 0; + va_start(ap, msg); + + len = vsnprintf(buf, 1024, msg, ap); + if ((len < 0) || (len > 1024)) + buf[1023] = '\0'; + + lsd_fatal_error(file, line, buf); + + va_end(ap); + return; +} + +static int _advance_past_brackets (char *tok, char **str) +{ + /* if _single_ opening bracket exists b/w tok and str, push str + * past first closing bracket to next seperator */ + if ( memchr(tok, '[', *str - tok) != NULL + && memchr(tok, ']', *str - tok) == NULL ) { + char *q = strchr(*str, ']'); + if (q && memchr(*str, '[', q - *str) == NULL) { + *str = q + 1; + return (1); + } + } + + return 0; +} + +/* + * Helper function for host list string parsing routines + * Returns a pointer to the next token; additionally advance *str + * to the next separator. + * + * next_tok was taken directly from pdsh courtesy of Jim Garlick. + * (with modifications to support bracketed hostlists, i.e.: + * xxx[xx,xx,xx] is a single token) + * + */ +static char * _next_tok(char *sep, char **str) +{ + char *tok; + + /* push str past any leading separators */ + while (**str != '\0' && strchr(sep, **str) != '\0') + (*str)++; + + if (**str == '\0') + return NULL; + + /* assign token ptr */ + tok = *str; + + /* + * Advance str past any separators, but if a separator occurs between + * brackets, e.g. foo[0-3,5], then advance str past closing brackets and + * try again. 
+ */ + do { + /* push str past token and leave pointing to first separator */ + while (**str != '\0' && strchr(sep, **str) == '\0') + (*str)++; + } while (_advance_past_brackets (tok, str)); + + /* nullify consecutive separators and push str beyond them */ + while (**str != '\0' && strchr(sep, **str) != '\0') + *(*str)++ = '\0'; + + return tok; +} + + +/* return the number of zeros needed to pad "num" to "width" + */ +static int _zero_padded(unsigned long num, int width) +{ + int n = 1; + while (num /= 10L) + n++; + return width > n ? width - n : 0; +} + +/* test whether two format `width' parameters are "equivalent" + * The width arguments "wn" and "wm" for integers "n" and "m" + * are equivalent if: + * + * o wn == wm OR + * + * o applying the same format width (either wn or wm) to both of + * 'n' and 'm' will not change the zero padding of *either* 'm' nor 'n'. + * + * If this function returns 1 (or true), the appropriate width value + * (either 'wm' or 'wn') will have been adjusted such that both format + * widths are equivalent. 
+ */ +static int _width_equiv(unsigned long n, int *wn, unsigned long m, int *wm) +{ + int npad, nmpad, mpad, mnpad; + + if (wn == wm) + return 1; + + npad = _zero_padded(n, *wn); + nmpad = _zero_padded(n, *wm); + mpad = _zero_padded(m, *wm); + mnpad = _zero_padded(m, *wn); + + if (npad != nmpad && mpad != mnpad) + return 0; + + if (npad != nmpad) { + if (mpad == mnpad) { + *wm = *wn; + return 1; + } else + return 0; + } else { /* mpad != mnpad */ + if (npad == nmpad) { + *wn = *wm; + return 1; + } else + return 0; + } + + /* not reached */ +} + + +/* ----[ hostname_t functions ]---- */ + +/* + * return the location of the last char in the hostname prefix + */ +static int host_prefix_end(const char *hostname) +{ + int idx = strlen(hostname) - 1; + + while (idx >= 0 && isdigit((char) hostname[idx])) + idx--; + return idx; +} + +/* + * create a hostname_t object from a string hostname + */ +static hostname_t hostname_create(const char *hostname) +{ + hostname_t hn = NULL; + char *p = '\0'; + int idx = 0; + + assert(hostname != NULL); + + if (!(hn = (hostname_t) malloc(sizeof(*hn)))) + out_of_memory("hostname create"); + + idx = host_prefix_end(hostname); + + if (!(hn->hostname = strdup(hostname))) { + free(hn); + out_of_memory("hostname create"); + } + + hn->num = 0; + hn->prefix = NULL; + hn->suffix = NULL; + + if (idx == strlen(hostname) - 1) { + if ((hn->prefix = strdup(hostname)) == NULL) { + hostname_destroy(hn); + out_of_memory("hostname prefix create"); + } + return hn; + } + + hn->suffix = hn->hostname + idx + 1; + hn->num = strtoul(hn->suffix, &p, 10); + + if ((*p == '\0') && (hn->num <= MAX_HOST_SUFFIX)) { + if (!(hn->prefix = malloc((idx + 2) * sizeof(char)))) { + hostname_destroy(hn); + out_of_memory("hostname prefix create"); + } + memcpy(hn->prefix, hostname, idx + 1); + hn->prefix[idx + 1] = '\0'; + } else { + if (!(hn->prefix = strdup(hostname))) { + hostname_destroy(hn); + out_of_memory("hostname prefix create"); + } + hn->suffix = NULL; + } + + 
return hn; +} + +/* free a hostname object + */ +static void hostname_destroy(hostname_t hn) +{ + if (hn == NULL) + return; + hn->suffix = NULL; + if (hn->hostname) + free(hn->hostname); + if (hn->prefix) + free(hn->prefix); + free(hn); +} + +/* return true if the hostname has a valid numeric suffix + */ +static int hostname_suffix_is_valid(hostname_t hn) +{ + return hn->suffix != NULL; +} + +/* return the width (in characters) of the numeric part of the hostname + */ +static int hostname_suffix_width(hostname_t hn) +{ + assert(hn->suffix != NULL); + return (int) strlen(hn->suffix); +} + + +/* ----[ hostrange_t functions ]---- */ + +/* allocate a new hostrange object + */ +static hostrange_t hostrange_new(void) +{ + hostrange_t new = (hostrange_t) malloc(sizeof(*new)); + if (!new) + out_of_memory("hostrange create"); + return new; +} + +/* Create a hostrange_t containing a single host without a valid suffix + * hr->prefix will represent the entire hostname. + */ +static hostrange_t hostrange_create_single(const char *prefix) +{ + hostrange_t new; + + assert(prefix != NULL); + + if ((new = hostrange_new()) == NULL) + goto error1; + + if ((new->prefix = strdup(prefix)) == NULL) + goto error2; + + new->singlehost = 1; + new->lo = 0L; + new->hi = 0L; + new->width = 0; + + return new; + + error2: + free(new); + error1: + out_of_memory("hostrange create single"); +} + + +/* Create a hostrange object with a prefix, hi, lo, and format width + */ +static hostrange_t +hostrange_create(char *prefix, unsigned long lo, unsigned long hi, int width) +{ + hostrange_t new; + + assert(prefix != NULL); + + if ((new = hostrange_new()) == NULL) + goto error1; + + if ((new->prefix = strdup(prefix)) == NULL) + goto error2; + + new->lo = lo; + new->hi = hi; + new->width = width; + + new->singlehost = 0; + + return new; + + error2: + free(new); + error1: + out_of_memory("hostrange create"); +} + + +/* Return the number of hosts stored in the hostrange object + */ +static unsigned long 
hostrange_count(hostrange_t hr) +{ + assert(hr != NULL); + if (hr->singlehost) + return 1; + else + return hr->hi - hr->lo + 1; +} + +/* Copy a hostrange object + */ +static hostrange_t hostrange_copy(hostrange_t hr) +{ + assert(hr != NULL); + + if (hr->singlehost) + return hostrange_create_single(hr->prefix); + else + return hostrange_create(hr->prefix, hr->lo, hr->hi, + hr->width); +} + + +/* free memory allocated by the hostrange object + */ +static void hostrange_destroy(hostrange_t hr) +{ + if (hr == NULL) + return; + if (hr->prefix) + free(hr->prefix); + free(hr); +} + +/* hostrange_delete_host() deletes a specific host from the range. + * If the range is split into two, the greater range is returned, + * and `hi' of the lesser range is adjusted accordingly. If the + * highest or lowest host is deleted from a range, NULL is returned + * and the hostrange hr is adjusted properly. + */ +static hostrange_t hostrange_delete_host(hostrange_t hr, unsigned long n) +{ + hostrange_t new = NULL; + + assert(hr != NULL); + assert(n >= hr->lo && n <= hr->hi); + + if (n == hr->lo) + hr->lo++; + else if (n == hr->hi) + hr->hi--; + else { + if (!(new = hostrange_copy(hr))) + out_of_memory("hostrange copy"); + hr->hi = n - 1; + new->lo = n + 1; + } + + return new; +} + +/* hostrange_cmp() is used to sort hostrange objects. It will + * sort based on the following (in order): + * o result of strcmp on prefixes + * o if widths are compatible, then: + * sort based on lowest suffix in range + * else + * sort based on width */ +static int hostrange_cmp(hostrange_t h1, hostrange_t h2) +{ + int retval; + + assert(h1 != NULL); + assert(h2 != NULL); + + if ((retval = hostrange_prefix_cmp(h1, h2)) == 0) + retval = hostrange_width_combine(h1, h2) ? + h1->lo - h2->lo : h1->width - h2->width; + + return retval; +} + + +/* compare the prefixes of two hostrange objects. + * returns: + * < 0 if h1 prefix is less than h2 OR h1 == NULL. 
+ * + * 0 if h1's prefix and h2's prefix match, + * UNLESS, either h1 or h2 (NOT both) do not have a valid suffix. + * + * > 0 if h1's prefix is greater than h2's OR h2 == NULL. */ +static int hostrange_prefix_cmp(hostrange_t h1, hostrange_t h2) +{ + int retval; + if (h1 == NULL) + return 1; + if (h2 == NULL) + return -1; + + retval = strcmp(h1->prefix, h2->prefix); + return retval == 0 ? h2->singlehost - h1->singlehost : retval; +} + +/* returns true if h1 and h2 would be included in the same bracketed hostlist. + * h1 and h2 will be in the same bracketed list iff: + * + * 1. h1 and h2 have same prefix + * 2. neither h1 nor h2 are singlet hosts (i.e. invalid suffix) + * + * (XXX: Should incompatible widths be placed in the same bracketed list? + * There's no good reason not to, except maybe aesthetics) + */ +static int hostrange_within_range(hostrange_t h1, hostrange_t h2) +{ + if (hostrange_prefix_cmp(h1, h2) == 0) + return h1->singlehost || h2->singlehost ? 0 : 1; + else + return 0; +} + + +/* compare two hostrange objects to determine if they are width + * compatible, returns: + * 1 if widths can safely be combined + * 0 if widths cannot be safely combined + */ +static int hostrange_width_combine(hostrange_t h0, hostrange_t h1) +{ + assert(h0 != NULL); + assert(h1 != NULL); + + return _width_equiv(h0->lo, &h0->width, h1->lo, &h1->width); +} + + +/* Return true if hostrange hr contains no hosts, i.e. hi < lo + */ +static int hostrange_empty(hostrange_t hr) +{ + assert(hr != NULL); + return ((hr->hi < hr->lo) || (hr->hi == (unsigned long) -1)); +} + +/* return the string representation of the last host in hostrange hr + * and remove that host from the range (i.e. 
decrement hi if possible) + * + * Returns NULL if malloc fails OR there are no more hosts left + */ +static char *hostrange_pop(hostrange_t hr) +{ + size_t size = 0; + char *host = NULL; + + assert(hr != NULL); + + if (hr->singlehost) { + hr->lo++; /* effectively set count == 0 */ + host = strdup(hr->prefix); + } else if (hostrange_count(hr) > 0) { + size = strlen(hr->prefix) + hr->width + 16; + if (!(host = (char *) malloc(size * sizeof(char)))) + out_of_memory("hostrange pop"); + snprintf(host, size, "%s%0*lu", hr->prefix, + hr->width, hr->hi--); + } + + return host; +} + +/* Same as hostrange_pop(), but remove host from start of range */ +static char *hostrange_shift(hostrange_t hr) +{ + size_t size = 0; + char *host = NULL; + + assert(hr != NULL); + + if (hr->singlehost) { + hr->lo++; + if (!(host = strdup(hr->prefix))) + out_of_memory("hostrange shift"); + } else if (hostrange_count(hr) > 0) { + size = strlen(hr->prefix) + hr->width + 16; + if (!(host = (char *) malloc(size * sizeof(char)))) + out_of_memory("hostrange shift"); + snprintf(host, size, "%s%0*lu", hr->prefix, + hr->width, hr->lo++); + } + + return host; +} + + +/* join two hostrange objects. + * + * returns: + * + * -1 if ranges do not overlap (including incompatible zero padding) + * 0 if ranges join perfectly + * >0 number of hosts that were duplicated in h1 and h2 + * + * h2 will be coalesced into h1 if rc >= 0 + * + * it is assumed that h1->lo <= h2->lo, i.e. 
hr1 <= hr2 + * + */ +static int hostrange_join(hostrange_t h1, hostrange_t h2) +{ + int duplicated = -1; + + assert(h1 != NULL); + assert(h2 != NULL); + assert(hostrange_cmp(h1, h2) <= 0); + + if (hostrange_prefix_cmp(h1, h2) == 0 && + hostrange_width_combine(h1, h2)) { + + if (h1->singlehost && h2->singlehost) { /* matching singlets */ + duplicated = 1; + } else if (h1->hi == h2->lo - 1) { /* perfect join */ + h1->hi = h2->hi; + duplicated = 0; + } else if (h1->hi >= h2->lo) { /* some duplication */ + if (h1->hi < h2->hi) { + duplicated = h1->hi - h2->lo + 1; + h1->hi = h2->hi; + } else + duplicated = hostrange_count(h2); + } + } + + return duplicated; +} + +/* hostrange intersect returns the intersection (common hosts) + * of hostrange objects h1 and h2. If there is no intersection, + * NULL is returned. + * + * It is assumed that h1 <= h2 (i.e. h1->lo <= h2->lo) + */ +static hostrange_t hostrange_intersect(hostrange_t h1, hostrange_t h2) +{ + hostrange_t new = NULL; + + assert(h1 != NULL); + assert(h2 != NULL); + + if (h1->singlehost || h2->singlehost) + return NULL; + + assert(hostrange_cmp(h1, h2) <= 0); + + if ((hostrange_prefix_cmp(h1, h2) == 0) + && (h1->hi > h2->lo) + && (hostrange_width_combine(h1, h2))) { + + if (!(new = hostrange_copy(h1))) + return NULL; + new->lo = h2->lo; + new->hi = h2->hi < h1->hi ? h2->hi : h1->hi; + } + + return new; +} + +/* return 1 if hostname hn is within the hostrange hr + * 0 if not. + */ +static int hostrange_hn_within(hostrange_t hr, hostname_t hn) +{ + if (hr->singlehost) { + /* + * If the current hostrange [hr] is a `singlehost' (no valid + * numeric suffix (lo and hi)), then the hostrange [hr] + * stores just one host with name == hr->prefix. + * + * Thus the full hostname in [hn] must match hr->prefix, in + * which case we return true. Otherwise, there is no + * possibility that [hn] matches [hr]. 
+ */ + if (strcmp (hn->hostname, hr->prefix) == 0) + return 1; + else + return 0; + } + + /* + * Now we know [hr] is not a "singlehost", so hostname + * better have a valid numeric suffix, or there is no + * way we can match + */ + if (!hostname_suffix_is_valid (hn)) + return 0; + + /* + * If hostrange and hostname prefixes don't match, then + * there is no way the hostname falls within the range [hr]. + */ + if (strcmp(hr->prefix, hn->prefix) != 0) + return 0; + + /* + * Finally, check whether [hn], with a valid numeric suffix, + * falls within the range of [hr]. + */ + if (hn->num <= hr->hi && hn->num >= hr->lo) { + int width = hostname_suffix_width(hn); + int num = hn->num; + return (_width_equiv(hr->lo, &hr->width, num, &width)); + } + + return 0; +} + + +/* copy a string representation of the hostrange hr into buffer buf, + * writing at most n chars including NUL termination + */ +static size_t +hostrange_to_string(hostrange_t hr, size_t n, char *buf, char *separator) +{ + unsigned long i; + int truncated = 0; + int len = 0; + char sep = separator == NULL ? ',' : separator[0]; + + if (n == 0) + return 0; + + if (hr->singlehost) + return snprintf(buf, n, "%s", hr->prefix); + + for (i = hr->lo; i <= hr->hi; i++) { + size_t m = (n - len) <= n ? n - len : 0; /* check for < 0 */ + int ret = snprintf(buf + len, m, "%s%0*lu", + hr->prefix, hr->width, i); + if (ret < 0 || ret >= m) { + len = n; + truncated = 1; + break; + } + len+=ret; + buf[len++] = sep; + } + + if (truncated) { + buf[n-1] = '\0'; + return -1; + } else { + /* back up over final separator */ + buf[--len] = '\0'; + return len; + } +} + +/* Place the string representation of the numeric part of hostrange into buf + * writing at most n chars including NUL termination. 
+ */ +static size_t hostrange_numstr(hostrange_t hr, size_t n, char *buf) +{ + int len = 0; + + assert(buf != NULL); + + if (hr->singlehost || n == 0) + return 0; + + len = snprintf(buf, n, "%0*lu", hr->width, hr->lo); + + if ((len >= 0) && (len < n) && (hr->lo < hr->hi)) { + int len2 = snprintf(buf+len, n-len, "-%0*lu", hr->width, hr->hi); + if (len2 < 0) + len = -1; + else + len += len2; + } + + return len; +} + + +/* ----[ hostlist functions ]---- */ + +/* Create a new hostlist object. + * Returns an empty hostlist, or NULL if memory allocation fails. + */ +static hostlist_t hostlist_new(void) +{ + int i; + hostlist_t new = (hostlist_t) malloc(sizeof(*new)); + if (!new) + goto fail1; + + assert(new->magic = HOSTLIST_MAGIC); + mutex_init(&new->mutex); + + new->hr = (hostrange_t *) malloc(HOSTLIST_CHUNK * sizeof(hostrange_t)); + if (!new->hr) + goto fail2; + + /* set entries in hostrange array to NULL */ + for (i = 0; i < HOSTLIST_CHUNK; i++) + new->hr[i] = NULL; + + new->size = HOSTLIST_CHUNK; + new->nranges = 0; + new->nhosts = 0; + new->ilist = NULL; + return new; + + fail2: + free(new); + fail1: + out_of_memory("hostlist_create"); +} + + +/* Resize the internal array used to store the list of hostrange objects. 
+ * + * returns 1 for a successful resize, + * 0 if call to _realloc fails + * + * It is assumed that the caller has the hostlist hl locked + */ +static int hostlist_resize(hostlist_t hl, size_t newsize) +{ + int i; + size_t oldsize; + assert(hl != NULL); + assert(hl->magic == HOSTLIST_MAGIC); + oldsize = hl->size; + hl->size = newsize; + hl->hr = realloc((void *) hl->hr, hl->size*sizeof(hostrange_t)); + if (!(hl->hr)) + return 0; + + for (i = oldsize; i < newsize; i++) + hl->hr[i] = NULL; + + return 1; +} + +/* Resize hostlist by one HOSTLIST_CHUNK + * Assumes that hostlist hl is locked by caller + */ +static int hostlist_expand(hostlist_t hl) +{ + if (!hostlist_resize(hl, hl->size + HOSTLIST_CHUNK)) + return 0; + else + return 1; +} + +/* Push a hostrange object onto hostlist hl + * Returns the number of hosts successfully pushed onto hl + * or -1 if there was an error allocating memory + */ +static int hostlist_push_range(hostlist_t hl, hostrange_t hr) +{ + hostrange_t tail; + int retval; + + assert(hr != NULL); + LOCK_HOSTLIST(hl); + + tail = (hl->nranges > 0) ? 
hl->hr[hl->nranges-1] : hl->hr[0]; + + if (hl->size == hl->nranges && !hostlist_expand(hl)) + goto error; + + if (hl->nranges > 0 + && hostrange_prefix_cmp(tail, hr) == 0 + && tail->hi == hr->lo - 1 + && hostrange_width_combine(tail, hr)) { + tail->hi = hr->hi; + } else { + if ((hl->hr[hl->nranges++] = hostrange_copy(hr)) == NULL) + goto error; + } + + retval = hl->nhosts += hostrange_count(hr); + + UNLOCK_HOSTLIST(hl); + + return retval; + + error: + UNLOCK_HOSTLIST(hl); + return -1; +} + + + +/* Same as hostlist_push_range() above, but prefix, lo, hi, and width + * are passed as args + */ +static int +hostlist_push_hr(hostlist_t hl, char *prefix, unsigned long lo, + unsigned long hi, int width) +{ + hostrange_t hr = hostrange_create(prefix, lo, hi, width); + int retval = hostlist_push_range(hl, hr); + hostrange_destroy(hr); + return retval; +} + +/* Insert a range object hr into position n of the hostlist hl + * Assumes that hl->mutex is already held by calling process + */ +static int hostlist_insert_range(hostlist_t hl, hostrange_t hr, int n) +{ + int i; + hostrange_t tmp; + hostlist_iterator_t hli; + + assert(hl != NULL); + assert(hl->magic == HOSTLIST_MAGIC); + assert(hr != NULL); + + if (n > hl->nranges) + return 0; + + if (hl->size == hl->nranges && !hostlist_expand(hl)) + return 0; + + /* copy new hostrange into slot "n" in array */ + tmp = hl->hr[n]; + hl->hr[n] = hostrange_copy(hr); + + /* push remaining hostrange entries up */ + for (i = n + 1; i < hl->nranges + 1; i++) { + hostrange_t last = hl->hr[i]; + hl->hr[i] = tmp; + tmp = last; + } + hl->nranges++; + + /* adjust hostlist iterators if needed */ + for (hli = hl->ilist; hli; hli = hli->next) { + if (hli->idx >= n) + hli->hr = hli->hl->hr[++hli->idx]; + } + + return 1; +} + +/* Delete the range at position n in the range array + * Assumes the hostlist lock is already held. 
+ */ +static void hostlist_delete_range(hostlist_t hl, int n) +{ + int i; + hostrange_t old; + + assert(hl != NULL); + assert(hl->magic == HOSTLIST_MAGIC); + assert(n < hl->nranges && n >= 0); + + old = hl->hr[n]; + for (i = n; i < hl->nranges - 1; i++) + hl->hr[i] = hl->hr[i + 1]; + hl->nranges--; + hl->hr[hl->nranges] = NULL; + hostlist_shift_iterators(hl, n, 0, 1); + + /* XXX caller responsible for adjusting nhosts */ + /* hl->nhosts -= hostrange_count(old) */ + + hostrange_destroy(old); +} + +#if WANT_RECKLESS_HOSTRANGE_EXPANSION + +/* The reckless hostrange expansion function. + * See comment in hostlist.h:hostlist_create() for more info on + * the different choices for hostlist notation. + */ +hostlist_t _hostlist_create(const char *hostlist, char *sep, char *r_op) +{ + char *str, *orig; + char *tok, *cur; + int high, low, fmt = 0; + char prefix[256] = ""; + int pos = 0; + int error = 0; + char range_op = r_op[0];/* XXX support > 1 char range ops in future? */ + + hostlist_t new = hostlist_new(); + + orig = str = strdup(hostlist); + + /* return an empty list if an empty string was passed in */ + if (str == NULL || strlen(str) == 0) + goto done; + + /* Use hostlist_create_bracketed if we see "[" */ + if (strchr(str, '[') != NULL) + return _hostlist_create_bracketed(hostlist, sep, r_op); + + while ((tok = _next_tok(sep, &str)) != NULL) { + + /* save the current string for error messages */ + cur = tok; + + high = low = 0; + + /* find end of alpha part + * do this by finding last occurence of range_op in str */ + pos = strlen(tok) - 1; + if (strstr(tok, r_op) != '\0') { + while (pos >= 0 && (char) tok[pos] != range_op) + pos--; + } + + /* now back up past any digits */ + while (pos >= 0 && isdigit((char) tok[--pos])) {;} + + /* Check for valid x-y range (x must be a digit) + * Reset pos if the range is not valid */ + if (!isdigit((char) tok[++pos])) + pos = strlen(tok) - 1; + + /* create prefix string + * if prefix will be zero length, but prefix already exists 
+ * use the previous prefix and fmt + */ + if ((pos > 0) || (prefix[0] == '\0')) { + memcpy(prefix, tok, (size_t) pos * sizeof(char)); + prefix[pos] = '\0'; + + /* push pointer past prefix */ + tok += pos; + + /* count number of digits for ouput fmt */ + for (fmt = 0; isdigit(tok[fmt]); ++fmt) {;} + + if (fmt == 0) + error = 1; + + } else + tok += pos; + + /* get lower bound */ + low = strtoul(tok, (char **) &tok, 10); + + if (*tok == range_op) { /* now get range upper bound */ + /* push pointer past range op */ + ++tok; + + /* find length of alpha part */ + for (pos = 0; tok[pos] && !isdigit(tok[pos]); ++pos) {;} + + /* alpha part must match prefix or error + * this could mean we've got something like "rtr1-a2" + * so just record an error + */ + if (pos > 0) { + if (pos != strlen(prefix) || + strncmp(prefix, tok, pos) != 0) + error = 1; + } + + if (*tok != '\0') + tok += pos; + + /* make sure we have digits to the end */ + for (pos = 0; tok[pos] && isdigit((char) tok[pos]); ++pos) {;} + + if (pos > 0) { /* we have digits to process */ + high = strtoul(tok, (char **) &tok, 10); + } else { /* bad boy, no digits */ + error = 1; + } + + if ((low > high) || (high - low > MAX_RANGE)) + error = 1; + + } else { /* single value */ + high = 0; /* special case, ugh. */ + } + + /* error if: + * 1. we are not at end of string + * 2. 
upper bound equals lower bound + */ + if (*tok != '\0' || high == low) + error = 1; + + if (error) { /* assume this is not a range on any error */ + hostlist_push_host(new, cur); + } else { + if (high < low) + high = low; + hostlist_push_hr(new, prefix, low, high, fmt); + } + + error = 0; + } + + done: + free(orig); + + return new; +} + +#else /* !WANT_RECKLESS_HOSTRANGE_EXPANSION */ + +hostlist_t _hostlist_create(const char *hostlist, char *sep, char *r_op) +{ + return _hostlist_create_bracketed(hostlist, sep, r_op); +} + +#endif /* WANT_RECKLESS_HOSTRANGE_EXPANSION */ + +struct _range { + unsigned long lo, hi; + int width; +}; + +/* Grab a single range from str + * returns 1 if str contained a valid number or range, + * 0 if conversion of str to a range failed. + */ +static int _parse_single_range(const char *str, struct _range *range) +{ + char *p, *q; + char *orig = strdup(str); + if (!orig) + seterrno_ret(ENOMEM, 0); + + if ((p = strchr(str, '-'))) { + *p++ = '\0'; + if (*p == '-') /* do NOT allow negative numbers */ + goto error; + } + range->lo = strtoul(str, &q, 10); + if (q == str) + goto error; + + range->hi = (p && *p) ? strtoul(p, &q, 10) : range->lo; + + if (q == p || *q != '\0') + goto error; + + if (range->lo > range->hi) + goto error; + + if (range->hi - range->lo + 1 > MAX_RANGE ) { + _error(__FILE__, __LINE__, "Too many hosts in range `%s'", orig); + free(orig); + seterrno_ret(ERANGE, 0); + } + + free(orig); + range->width = strlen(str); + return 1; + + error: + _error(__FILE__, __LINE__, "Invalid range: `%s'", orig); + free(orig); + seterrno_ret(EINVAL, 0); +} + + +/* + * Convert 'str' containing comma separated digits and ranges into an array + * of struct _range types (max 'len' elements). + * + * Return number of ranges created, or -1 on error. 
+ */ +static int _parse_range_list(char *str, struct _range *ranges, int len) +{ + char *p; + int count = 0; + + while (str) { + if (count == len) + return -1; + if ((p = strchr(str, ','))) + *p++ = '\0'; + if (!_parse_single_range(str, &ranges[count++])) + return -1; + str = p; + } + return count; +} + +static void +_push_range_list(hostlist_t hl, char *pfx, struct _range *rng, + int n) +{ + int i; + for (i = 0; i < n; i++) { + hostlist_push_hr(hl, pfx, rng->lo, rng->hi, rng->width); + rng++; + } +} + +static void +_push_range_list_with_suffix(hostlist_t hl, char *pfx, char *sfx, + struct _range *rng, int n) +{ + int i; + unsigned long j; + for (i = 0; i < n; i++) { + for (j = rng->lo; j <= rng->hi; j++) { + char host[4096]; + hostrange_t hr; + snprintf (host, 4096, "%s%0*lu%s", pfx, rng->width, j, sfx); + hr = hostrange_create_single (host); + hostlist_push_range (hl, hr); + /* + * hr is copied in hostlist_push_range. Need to free here. + */ + hostrange_destroy (hr); + } + rng++; + } +} + +/* + * Create a hostlist from a string with brackets '[' ']' to aid + * detection of ranges and compressed lists + */ +static hostlist_t +_hostlist_create_bracketed(const char *hostlist, char *sep, char *r_op) +{ + hostlist_t new = hostlist_new(); + struct _range ranges[MAX_RANGES]; + int nr, err; + char *p, *tok, *str, *orig; + char cur_tok[1024]; + + if (hostlist == NULL) + return new; + + if (!(orig = str = strdup(hostlist))) { + hostlist_destroy(new); + return NULL; + } + + while ((tok = _next_tok(sep, &str)) != NULL) { + strncpy(cur_tok, tok, 1024); + + if ((p = strchr(tok, '[')) != NULL) { + char *q, *prefix = tok; + *p++ = '\0'; + + if ((q = strchr(p, ']'))) { + *q = '\0'; + nr = _parse_range_list(p, ranges, MAX_RANGES); + if (nr < 0) + goto error; + + if (*(++q) != '\0') + _push_range_list_with_suffix (new, prefix, q, ranges, nr); + else + _push_range_list(new, prefix, ranges, nr); + + + } else + hostlist_push_host(new, cur_tok); + + } else + hostlist_push_host(new, 
cur_tok); + } + + free(orig); + return new; + + error: + err = errno; + hostlist_destroy(new); + free(orig); + seterrno_ret(err, NULL); +} + + + +hostlist_t hostlist_create(const char *str) +{ + return _hostlist_create(str, "\t, ", "-"); +} + + +hostlist_t hostlist_copy(const hostlist_t hl) +{ + int i; + hostlist_t new; + + if (hl == NULL) + return NULL; + + LOCK_HOSTLIST(hl); + if (!(new = hostlist_new())) + goto done; + + new->nranges = hl->nranges; + new->nhosts = hl->nhosts; + if (new->nranges > new->size) + hostlist_resize(new, new->nranges); + + for (i = 0; i < hl->nranges; i++) + new->hr[i] = hostrange_copy(hl->hr[i]); + + done: + UNLOCK_HOSTLIST(hl); + return new; +} + + +void hostlist_destroy(hostlist_t hl) +{ + int i; + if (hl == NULL) + return; + LOCK_HOSTLIST(hl); + while (hl->ilist) { + mutex_unlock(&hl->mutex); + hostlist_iterator_destroy(hl->ilist); + mutex_lock(&hl->mutex); + } + for (i = 0; i < hl->nranges; i++) + hostrange_destroy(hl->hr[i]); + free(hl->hr); + assert(hl->magic = 0x1); + UNLOCK_HOSTLIST(hl); + mutex_destroy(&hl->mutex); + free(hl); +} + + +int hostlist_push(hostlist_t hl, const char *hosts) +{ + hostlist_t new; + int retval; + if (hosts == NULL) + return 0; + new = hostlist_create(hosts); + if (!new) + return 0; + mutex_lock(&new->mutex); + retval = new->nhosts; + mutex_unlock(&new->mutex); + hostlist_push_list(hl, new); + hostlist_destroy(new); + return retval; +} + +int hostlist_push_host(hostlist_t hl, const char *str) +{ + hostrange_t hr; + hostname_t hn; + + if (str == NULL) + return 0; + + hn = hostname_create(str); + + if (hostname_suffix_is_valid(hn)) { + hr = hostrange_create(hn->prefix, hn->num, hn->num, + hostname_suffix_width(hn)); + } else + hr = hostrange_create_single(str); + + hostlist_push_range(hl, hr); + + hostrange_destroy(hr); + hostname_destroy(hn); + + return 1; +} + +int hostlist_push_list(hostlist_t h1, hostlist_t h2) +{ + int i, n = 0; + + if (h2 == NULL) + return 0; + + LOCK_HOSTLIST(h2); + + for (i = 0; 
i < h2->nranges; i++) + n += hostlist_push_range(h1, h2->hr[i]); + + UNLOCK_HOSTLIST(h2); + + return n; +} + + +char *hostlist_pop(hostlist_t hl) +{ + char *host = NULL; + + LOCK_HOSTLIST(hl); + if (hl->nhosts > 0) { + hostrange_t hr = hl->hr[hl->nranges - 1]; + host = hostrange_pop(hr); + hl->nhosts--; + if (hostrange_empty(hr)) { + hostrange_destroy(hl->hr[--hl->nranges]); + hl->hr[hl->nranges] = NULL; + } + } + UNLOCK_HOSTLIST(hl); + return host; +} + +/* find all iterators affected by a shift (or deletion) at + * hl->hr[idx], depth, with the deletion of n ranges */ +static void +hostlist_shift_iterators(hostlist_t hl, int idx, int depth, int n) +{ + hostlist_iterator_t i; + for (i = hl->ilist; i; i = i->next) { + if (n == 0) { + if (i->idx == idx && i->depth >= depth) + i->depth = i->depth > -1 ? i->depth - 1 : -1; + } else { + if (i->idx >= idx) { + if ((i->idx -= n) >= 0) + i->hr = i->hl->hr[i->idx]; + else + hostlist_iterator_reset(i); + } + } + } +} + +char *hostlist_shift(hostlist_t hl) +{ + char *host = NULL; + + LOCK_HOSTLIST(hl); + + if (hl->nhosts > 0) { + hostrange_t hr = hl->hr[0]; + + host = hostrange_shift(hr); + hl->nhosts--; + + if (hostrange_empty(hr)) { + hostlist_delete_range(hl, 0); + /* hl->nranges--; */ + } else + hostlist_shift_iterators(hl, 0, 0, 0); + } + + UNLOCK_HOSTLIST(hl); + + return host; +} + + +char *hostlist_pop_range(hostlist_t hl) +{ + int i; + char buf[MAXHOSTRANGELEN + 1]; + hostlist_t hltmp; + hostrange_t tail; + + LOCK_HOSTLIST(hl); + if (hl->nranges < 1 || !(hltmp = hostlist_new())) { + UNLOCK_HOSTLIST(hl); + return NULL; + } + + i = hl->nranges - 2; + tail = hl->hr[hl->nranges - 1]; + while (i >= 0 && hostrange_within_range(tail, hl->hr[i])) + i--; + + for (i++; i < hl->nranges; i++) { + hostlist_push_range(hltmp, hl->hr[i]); + hostrange_destroy(hl->hr[i]); + hl->hr[i] = NULL; + } + hl->nhosts -= hltmp->nhosts; + hl->nranges -= hltmp->nranges; + + UNLOCK_HOSTLIST(hl); + hostlist_ranged_string(hltmp, MAXHOSTRANGELEN, 
buf); + hostlist_destroy(hltmp); + return strdup(buf); +} + + +char *hostlist_shift_range(hostlist_t hl) +{ + int i; + char buf[1024]; + hostlist_t hltmp = hostlist_new(); + if (!hltmp) + return NULL; + + LOCK_HOSTLIST(hl); + + if (hl->nranges == 0) { + hostlist_destroy(hltmp); + UNLOCK_HOSTLIST(hl); + return NULL; + } + + i = 0; + do { + hostlist_push_range(hltmp, hl->hr[i]); + hostrange_destroy(hl->hr[i]); + } while ( (++i < hl->nranges) + && hostrange_within_range(hltmp->hr[0], hl->hr[i]) ); + + hostlist_shift_iterators(hl, i, 0, hltmp->nranges); + + /* shift rest of ranges back in hl */ + for (; i < hl->nranges; i++) { + hl->hr[i - hltmp->nranges] = hl->hr[i]; + hl->hr[i] = NULL; + } + hl->nhosts -= hltmp->nhosts; + hl->nranges -= hltmp->nranges; + + UNLOCK_HOSTLIST(hl); + + hostlist_ranged_string(hltmp, 1024, buf); + hostlist_destroy(hltmp); + + return strdup(buf); +} + +/* XXX: Note: efficiency improvements needed */ +int hostlist_delete(hostlist_t hl, const char *hosts) +{ + int n = 0; + char *hostname = NULL; + hostlist_t hltmp; + + if (!(hltmp = hostlist_create(hosts))) + seterrno_ret(EINVAL, 0); + + while ((hostname = hostlist_pop(hltmp)) != NULL) { + n += hostlist_delete_host(hl, hostname); + free(hostname); + } + hostlist_destroy(hltmp); + + return n; +} + + +/* XXX watch out! poor implementation follows! (fix it at some point) */ +int hostlist_delete_host(hostlist_t hl, const char *hostname) +{ + int n = hostlist_find(hl, hostname); + if (n >= 0) + hostlist_delete_nth(hl, n); + return n >= 0 ? 
1 : 0; +} + + +static char * +_hostrange_string(hostrange_t hr, int depth) +{ + char buf[MAXHOSTNAMELEN + 16]; + int len = snprintf(buf, MAXHOSTNAMELEN + 15, "%s", hr->prefix); + + if (!hr->singlehost) + snprintf(buf+len, MAXHOSTNAMELEN+15 - len, "%0*lu", + hr->width, hr->lo + depth); + return strdup(buf); +} + +char * hostlist_nth(hostlist_t hl, int n) +{ + char *host = NULL; + int i, count; + + LOCK_HOSTLIST(hl); + count = 0; + for (i = 0; i < hl->nranges; i++) { + int num_in_range = hostrange_count(hl->hr[i]); + + if (n <= (num_in_range - 1 + count)) { + host = _hostrange_string(hl->hr[i], n - count); + break; + } else + count += num_in_range; + } + + UNLOCK_HOSTLIST(hl); + + return host; +} + + +int hostlist_delete_nth(hostlist_t hl, int n) +{ + int i, count; + + LOCK_HOSTLIST(hl); + assert(n >= 0 && n <= hl->nhosts); + + count = 0; + + for (i = 0; i < hl->nranges; i++) { + int num_in_range = hostrange_count(hl->hr[i]); + hostrange_t hr = hl->hr[i]; + + if (n <= (num_in_range - 1 + count)) { + unsigned long num = hr->lo + n - count; + hostrange_t new; + + if (hr->singlehost) { /* this wasn't a range */ + hostlist_delete_range(hl, i); + } else if ((new = hostrange_delete_host(hr, num))) { + hostlist_insert_range(hl, new, i + 1); + hostrange_destroy(new); + } else if (hostrange_empty(hr)) + hostlist_delete_range(hl, i); + + goto done; + } else + count += num_in_range; + + } + + done: + UNLOCK_HOSTLIST(hl); + hl->nhosts--; + return 1; +} + +int hostlist_count(hostlist_t hl) +{ + int retval; + LOCK_HOSTLIST(hl); + retval = hl->nhosts; + UNLOCK_HOSTLIST(hl); + return retval; +} + +int hostlist_find(hostlist_t hl, const char *hostname) +{ + int i, count, ret = -1; + hostname_t hn; + + if (!hostname) + return -1; + + hn = hostname_create(hostname); + + LOCK_HOSTLIST(hl); + + for (i = 0, count = 0; i < hl->nranges; i++) { + if (hostrange_hn_within(hl->hr[i], hn)) { + if (hostname_suffix_is_valid(hn) && !hl->hr[i]->singlehost) + ret = count + hn->num - hl->hr[i]->lo; + 
else + ret = count; + goto done; + } else + count += hostrange_count(hl->hr[i]); + } + + done: + UNLOCK_HOSTLIST(hl); + hostname_destroy(hn); + return ret; +} + +/* hostrange compare with void * arguments to allow use with + * libc qsort() + */ +int _cmp(const void *hr1, const void *hr2) +{ + hostrange_t *h1 = (hostrange_t *) hr1; + hostrange_t *h2 = (hostrange_t *) hr2; + return hostrange_cmp((hostrange_t) * h1, (hostrange_t) * h2); +} + + +void hostlist_sort(hostlist_t hl) +{ + hostlist_iterator_t i; + LOCK_HOSTLIST(hl); + + if (hl->nranges <= 1) { + UNLOCK_HOSTLIST(hl); + return; + } + + qsort(hl->hr, hl->nranges, sizeof(hostrange_t), &_cmp); + + /* reset all iterators */ + for (i = hl->ilist; i; i = i->next) + hostlist_iterator_reset(i); + + UNLOCK_HOSTLIST(hl); + + hostlist_coalesce(hl); + +} + + +/* search through hostlist for ranges that can be collapsed + * does =not= delete any hosts + */ +static void hostlist_collapse(hostlist_t hl) +{ + int i; + + LOCK_HOSTLIST(hl); + for (i = hl->nranges - 1; i > 0; i--) { + hostrange_t hprev = hl->hr[i - 1]; + hostrange_t hnext = hl->hr[i]; + + if (hostrange_prefix_cmp(hprev, hnext) == 0 && + hprev->hi == hnext->lo - 1 && + hostrange_width_combine(hprev, hnext)) { + hprev->hi = hnext->hi; + hostlist_delete_range(hl, i); + } + } + UNLOCK_HOSTLIST(hl); +} + +/* search through hostlist (hl) for intersecting ranges + * split up duplicates and coalesce ranges where possible + */ +static void hostlist_coalesce(hostlist_t hl) +{ + int i, j; + hostrange_t new; + + LOCK_HOSTLIST(hl); + + for (i = hl->nranges - 1; i > 0; i--) { + + new = hostrange_intersect(hl->hr[i - 1], hl->hr[i]); + + if (new) { + hostrange_t hprev = hl->hr[i - 1]; + hostrange_t hnext = hl->hr[i]; + j = i; + + if (new->hi < hprev->hi) + hnext->hi = hprev->hi; + + hprev->hi = new->lo; + hnext->lo = new->hi; + + if (hostrange_empty(hprev)) + hostlist_delete_range(hl, i); + + while (new->lo <= new->hi) { + hostrange_t hr = hostrange_create( new->prefix, + 
new->lo, new->lo, + new->width ); + + if (new->lo > hprev->hi) + hostlist_insert_range(hl, hr, j++); + + if (new->lo < hnext->lo) + hostlist_insert_range(hl, hr, j++); + + hostrange_destroy(hr); + + new->lo++; + } + i = hl->nranges; + hostrange_destroy(new); + } + } + UNLOCK_HOSTLIST(hl); + + hostlist_collapse(hl); + +} + +/* attempt to join ranges at loc and loc-1 in a hostlist */ +/* delete duplicates, return the number of hosts deleted */ +/* assumes that the hostlist hl has been locked by caller */ +/* returns -1 if no range join occurred */ +static int _attempt_range_join(hostlist_t hl, int loc) +{ + int ndup; + assert(hl != NULL); + assert(hl->magic == HOSTLIST_MAGIC); + assert(loc > 0); + assert(loc < hl->nranges); + ndup = hostrange_join(hl->hr[loc - 1], hl->hr[loc]); + if (ndup >= 0) { + hostlist_delete_range(hl, loc); + hl->nhosts -= ndup; + } + return ndup; +} + +void hostlist_uniq(hostlist_t hl) +{ + int i = 1; + hostlist_iterator_t hli; + LOCK_HOSTLIST(hl); + if (hl->nranges <= 1) { + UNLOCK_HOSTLIST(hl); + return; + } + qsort(hl->hr, hl->nranges, sizeof(hostrange_t), &_cmp); + + while (i < hl->nranges) { + if (_attempt_range_join(hl, i) < 0) /* No range join occurred */ + i++; + } + + /* reset all iterators */ + for (hli = hl->ilist; hli; hli = hli->next) + hostlist_iterator_reset(hli); + + UNLOCK_HOSTLIST(hl); +} + + +ssize_t hostlist_deranged_string(hostlist_t hl, size_t n, char *buf) +{ + int i; + int len = 0; + int truncated = 0; + + LOCK_HOSTLIST(hl); + for (i = 0; i < hl->nranges; i++) { + size_t m = (n - len) <= n ? n - len : 0; + int ret = hostrange_to_string(hl->hr[i], m, buf + len, ","); + if (ret < 0 || ret > m) { + len = n; + truncated = 1; + break; + } + len+=ret; + buf[len++] = ','; + } + UNLOCK_HOSTLIST(hl); + + buf[len > 0 ? --len : 0] = '\0'; + if (len == n) + truncated = 1; + + return truncated ? 
-1 : len; +} + +/* return true if a bracket is needed for the range at i in hostlist hl */ +static int _is_bracket_needed(hostlist_t hl, int i) +{ + hostrange_t h1 = hl->hr[i]; + hostrange_t h2 = i < hl->nranges - 1 ? hl->hr[i + 1] : NULL; + return hostrange_count(h1) > 1 || hostrange_within_range(h1, h2); +} + +/* write the next bracketed hostlist, i.e. prefix[n-m,k,...] + * into buf, writing at most n chars including the terminating '\0' + * + * leaves start pointing to one past last range object in bracketed list, + * and returns the number of bytes written into buf. + * + * Assumes hostlist is locked. + */ +static int +_get_bracketed_list(hostlist_t hl, int *start, const size_t n, char *buf) +{ + hostrange_t *hr = hl->hr; + int i = *start; + int m, len = 0; + int bracket_needed = _is_bracket_needed(hl, i); + + len = snprintf(buf, n, "%s", hr[i]->prefix); + + if ((len < 0) || (len > n)) + return n; /* truncated, buffer filled */ + + if (bracket_needed && len < n && len >= 0) + buf[len++] = '['; + + do { + m = (n - len) <= n ? n - len : 0; + len += hostrange_numstr(hr[i], m, buf + len); + if (len >= n) + break; + if (bracket_needed) /* Only need commas inside brackets */ + buf[len++] = ','; + } while (++i < hl->nranges && hostrange_within_range(hr[i], hr[i-1])); + + if (bracket_needed && len < n && len > 0) { + + /* Add trailing bracket (change trailing "," from above to "]" */ + buf[len - 1] = ']'; + + /* NUL terminate for safety, but do not add terminator to len */ + buf[len] = '\0'; + + } else if (len >= n) { + if (n > 0) + buf[n-1] = '\0'; + + } else { + /* If len is > 0, NUL terminate (but do not add to len) */ + buf[len > 0 ? 
len : 0] = '\0'; + } + + *start = i; + return len; +} + +ssize_t hostlist_ranged_string(hostlist_t hl, size_t n, char *buf) +{ + int i = 0; + int len = 0; + int truncated = 0; + + LOCK_HOSTLIST(hl); + while (i < hl->nranges && len < n) { + len += _get_bracketed_list(hl, &i, n - len, buf + len); + if ((len > 0) && (len < n) && (i < hl->nranges)) + buf[len++] = ','; + } + UNLOCK_HOSTLIST(hl); + + /* NUL terminate */ + if (len >= n) { + truncated = 1; + if (n > 0) + buf[n-1] = '\0'; + } else + buf[len > 0 ? len : 0] = '\0'; + + return truncated ? -1 : len; +} + +/* ----[ hostlist iterator functions ]---- */ + +static hostlist_iterator_t hostlist_iterator_new(void) +{ + hostlist_iterator_t i = (hostlist_iterator_t) malloc(sizeof(*i)); + if (!i) + return NULL; + i->hl = NULL; + i->hr = NULL; + i->idx = 0; + i->depth = -1; + i->next = i; + assert(i->magic = HOSTLIST_MAGIC); + return i; +} + +hostlist_iterator_t hostlist_iterator_create(hostlist_t hl) +{ + hostlist_iterator_t i; + + if (!(i = hostlist_iterator_new())) + out_of_memory("hostlist_iterator_create"); + + LOCK_HOSTLIST(hl); + i->hl = hl; + i->hr = hl->hr[0]; + i->next = hl->ilist; + hl->ilist = i; + UNLOCK_HOSTLIST(hl); + return i; +} + +hostlist_iterator_t hostset_iterator_create(hostset_t set) +{ + return hostlist_iterator_create(set->hl); +} + +void hostlist_iterator_reset(hostlist_iterator_t i) +{ + assert(i != NULL); + assert(i->magic == HOSTLIST_MAGIC); + i->idx = 0; + i->hr = i->hl->hr[0]; + i->depth = -1; + return; +} + +void hostlist_iterator_destroy(hostlist_iterator_t i) +{ + hostlist_iterator_t *pi; + if (i == NULL) + return; + assert(i != NULL); + assert(i->magic == HOSTLIST_MAGIC); + LOCK_HOSTLIST(i->hl); + for (pi = &i->hl->ilist; *pi; pi = &(*pi)->next) { + assert((*pi)->magic == HOSTLIST_MAGIC); + if (*pi == i) { + *pi = (*pi)->next; + break; + } + } + UNLOCK_HOSTLIST(i->hl); + assert(i->magic = 0x1); + free(i); +} + +static void _iterator_advance(hostlist_iterator_t i) +{ + assert(i != NULL); 
+ assert(i->magic == HOSTLIST_MAGIC); + if (i->idx > i->hl->nranges - 1) + return; + if (++(i->depth) > (i->hr->hi - i->hr->lo)) { + i->depth = 0; + i->hr = i->hl->hr[++i->idx]; + } +} + +/* advance iterator to end of current range (meaning within "[" "]") + * i.e. advance iterator past all range objects that could be represented + * in on bracketed hostlist. + */ +static void _iterator_advance_range(hostlist_iterator_t i) +{ + int nr, j; + hostrange_t *hr; + assert(i != NULL); + assert(i->magic == HOSTLIST_MAGIC); + + nr = i->hl->nranges; + hr = i->hl->hr; + j = i->idx; + if (++i->depth > 0) { + while (++j < nr && hostrange_within_range(i->hr, hr[j])) {;} + i->idx = j; + i->hr = i->hl->hr[i->idx]; + i->depth = 0; + } +} + +char *hostlist_next(hostlist_iterator_t i) +{ + char *buf = NULL; + char suffix[16]; + int len = 0; + assert(i != NULL); + assert(i->magic == HOSTLIST_MAGIC); + LOCK_HOSTLIST(i->hl); + _iterator_advance(i); + + if (i->idx > i->hl->nranges - 1) { + UNLOCK_HOSTLIST(i->hl); + return NULL; + } + + suffix[0] = '\0'; + + if (!i->hr->singlehost) + snprintf (suffix, 15, "%0*lu", i->hr->width, i->hr->lo + i->depth); + + len = strlen (i->hr->prefix) + strlen (suffix) + 1; + if (!(buf = malloc (len))) + out_of_memory("hostlist_next"); + + buf[0] = '\0'; + strcat (buf, i->hr->prefix); + strcat (buf, suffix); + + UNLOCK_HOSTLIST(i->hl); + return (buf); +} + +char *hostlist_next_range(hostlist_iterator_t i) +{ + char buf[MAXHOSTRANGELEN + 1]; + int j; + + assert(i != NULL); + assert(i->magic == HOSTLIST_MAGIC); + LOCK_HOSTLIST(i->hl); + + _iterator_advance_range(i); + + if (i->idx > i->hl->nranges - 1) { + UNLOCK_HOSTLIST(i->hl); + return NULL; + } + + j = i->idx; + _get_bracketed_list(i->hl, &j, MAXHOSTRANGELEN, buf); + + UNLOCK_HOSTLIST(i->hl); + + return strdup(buf); +} + +int hostlist_remove(hostlist_iterator_t i) +{ + hostrange_t new; + assert(i != NULL); + assert(i->magic == HOSTLIST_MAGIC); + LOCK_HOSTLIST(i->hl); + new = hostrange_delete_host(i->hr, 
i->hr->lo + i->depth); + if (new) { + hostlist_insert_range(i->hl, new, i->idx + 1); + hostrange_destroy(new); + i->hr = i->hl->hr[++i->idx]; + i->depth = -1; + } else if (hostrange_empty(i->hr)) { + hostlist_delete_range(i->hl, i->idx); + } else + i->depth--; + + i->hl->nhosts--; + UNLOCK_HOSTLIST(i->hl); + + return 1; +} + +/* ----[ hostset functions ]---- */ + +hostset_t hostset_create(const char *hostlist) +{ + hostset_t new; + + if (!(new = (hostset_t) malloc(sizeof(*new)))) + goto error1; + + if (!(new->hl = hostlist_create(hostlist))) + goto error2; + + hostlist_uniq(new->hl); + return new; + + error2: + free(new); + error1: + return NULL; +} + +hostset_t hostset_copy(const hostset_t set) +{ + hostset_t new; + if (!(new = (hostset_t) malloc(sizeof(*new)))) + goto error1; + + if (!(new->hl = hostlist_copy(set->hl))) + goto error2; + + return new; + error2: + free(new); + error1: + return NULL; +} + +void hostset_destroy(hostset_t set) +{ + if (set == NULL) + return; + hostlist_destroy(set->hl); + free(set); +} + +/* inserts a single range object into a hostset + * Assumes that the set->hl lock is already held + * Updates hl->nhosts + */ +static int hostset_insert_range(hostset_t set, hostrange_t hr) +{ + int i = 0; + int inserted = 0; + int nhosts = 0; + int ndups = 0; + hostlist_t hl; + + hl = set->hl; + + if (hl->size == hl->nranges && !hostlist_expand(hl)) + return 0; + + nhosts = hostrange_count(hr); + + for (i = 0; i < hl->nranges; i++) { + if (hostrange_cmp(hr, hl->hr[i]) <= 0) { + + if ((ndups = hostrange_join(hr, hl->hr[i])) >= 0) + hostlist_delete_range(hl, i); + else if (ndups < 0) + ndups = 0; + + hostlist_insert_range(hl, hr, i); + + /* now attempt to join hr[i] and hr[i-1] */ + if (i > 0) { + int m; + if ((m = _attempt_range_join(hl, i)) > 0) + ndups += m; + } + hl->nhosts += nhosts - ndups; + inserted = 1; + break; + } + } + + if (inserted == 0) { + hl->hr[hl->nranges++] = hostrange_copy(hr); + hl->nhosts += nhosts; + if (hl->nranges > 1) { + if 
((ndups = _attempt_range_join(hl, hl->nranges - 1)) <= 0) + ndups = 0; + } + } + + /* + * Return the number of unique hosts inserted + */ + return nhosts - ndups; +} + +int hostset_insert(hostset_t set, const char *hosts) +{ + int i, n = 0; + hostlist_t hl = hostlist_create(hosts); + if (!hl) + return 0; + + hostlist_uniq(hl); + LOCK_HOSTLIST(set->hl); + for (i = 0; i < hl->nranges; i++) + n += hostset_insert_range(set, hl->hr[i]); + UNLOCK_HOSTLIST(set->hl); + hostlist_destroy(hl); + return n; +} + + +/* linear search through N ranges for hostname "host" + * */ +static int hostset_find_host(hostset_t set, const char *host) +{ + int i; + int retval = 0; + hostname_t hn; + LOCK_HOSTLIST(set->hl); + hn = hostname_create(host); + for (i = 0; i < set->hl->nranges; i++) { + if (hostrange_hn_within(set->hl->hr[i], hn)) { + retval = 1; + goto done; + } + } + done: + UNLOCK_HOSTLIST(set->hl); + hostname_destroy(hn); + return retval; +} + +int hostset_within(hostset_t set, const char *hosts) +{ + int nhosts, nfound; + hostlist_t hl; + char *hostname; + + assert(set->hl->magic == HOSTLIST_MAGIC); + + if (!(hl = hostlist_create(hosts))) + return (0); + + nhosts = hostlist_count(hl); + nfound = 0; + + while ((hostname = hostlist_pop(hl)) != NULL) { + nfound += hostset_find_host(set, hostname); + free(hostname); + } + + hostlist_destroy(hl); + + return (nhosts == nfound); +} + +int hostset_delete(hostset_t set, const char *hosts) +{ + return hostlist_delete(set->hl, hosts); +} + +int hostset_delete_host(hostset_t set, const char *hostname) +{ + return hostlist_delete_host(set->hl, hostname); +} + +char *hostset_shift(hostset_t set) +{ + return hostlist_shift(set->hl); +} + +char *hostset_pop(hostset_t set) +{ + return hostlist_pop(set->hl); +} + +char *hostset_shift_range(hostset_t set) +{ + return hostlist_shift_range(set->hl); +} + +char *hostset_pop_range(hostset_t set) +{ + return hostlist_pop_range(set->hl); +} + +int hostset_count(hostset_t set) +{ + return 
hostlist_count(set->hl); +} + +ssize_t hostset_ranged_string(hostset_t set, size_t n, char *buf) +{ + return hostlist_ranged_string(set->hl, n, buf); +} + +ssize_t hostset_deranged_string(hostset_t set, size_t n, char *buf) +{ + return hostlist_deranged_string(set->hl, n, buf); +} + +#if TEST_MAIN + +int hostlist_nranges(hostlist_t hl) +{ + return hl->nranges; +} + +int hostset_nranges(hostset_t set) +{ + return set->hl->nranges; +} + +/* test iterator functionality on the list of hosts represented + * by list + */ +int iterator_test(char *list) +{ + int j; + char buf[1024]; + hostlist_t hl = hostlist_create(list); + hostset_t set = hostset_create(list); + + hostlist_iterator_t i = hostlist_iterator_create(hl); + hostlist_iterator_t seti = hostset_iterator_create(set); + hostlist_iterator_t i2 = hostlist_iterator_create(hl); + char *host; + + + hostlist_ranged_string(hl, 1024, buf); + printf("iterator_test: hl = `%s' passed in `%s'\n", buf, list); + host = hostlist_next(i); + printf("first host in list hl = `%s'\n", host); + free(host); + + /* forge ahead three hosts with i2 */ + for (j = 0; j < 4; j++) { + host = hostlist_next(i2); + free(host); + } + + host = hostlist_shift(hl); + printf("result of shift(hl) = `%s'\n", host); + free(host); + host = hostlist_next(i); + printf("next host in list hl = `%s'\n", host); + free(host); + host = hostlist_next(i2); + printf("next host for i2 = `%s'\n", host); + free(host); + + hostlist_iterator_destroy(i); + + hostlist_destroy(hl); + hostset_destroy(set); + return 1; +} + +int main(int ac, char **av) +{ + char buf[1024000]; + int i; + char *str; + + hostlist_t hl1, hl2, hl3; + hostset_t set, set1; + hostlist_iterator_t iter, iter2; + + if (!(hl1 = hostlist_create(ac > 1 ? av[1] : NULL))) + perror("hostlist_create"); + if (!(set = hostset_create(ac > 1 ? 
av[1] : NULL))) + perror("hostlist_create"); + + hl3 = hostlist_create("f[0-5]"); + hostlist_delete(hl3, "f[1-3]"); + hostlist_ranged_string(hl3, 102400, buf); + printf("after delete = `%s'\n", buf); + hostlist_destroy(hl3); + + for (i = 2; i < ac; i++) { + hostlist_push(hl1, av[i]); + hostset_insert(set, av[i]); + } + + hostlist_ranged_string(hl1, 102400, buf); + printf("ranged = `%s'\n", buf); + + iterator_test(buf); + + hostlist_deranged_string(hl1, 10240, buf); + printf("deranged = `%s'\n", buf); + + hostset_ranged_string(set, 1024, buf); + printf("hostset = `%s'\n", buf); + + hostlist_sort(hl1); + hostlist_ranged_string(hl1, 1024, buf); + printf("sorted = `%s'\n", buf); + + hostlist_uniq(hl1); + hostlist_ranged_string(hl1, 1024, buf); + printf("uniqed = `%s'\n", buf); + + hl2 = hostlist_copy(hl1); + printf("pop_range: "); + while ((str = hostlist_pop_range(hl2))) { + printf("`%s' ", str); + free(str); + } + hostlist_destroy(hl2); + printf("\n"); + + hl2 = hostlist_copy(hl1); + printf("shift_range: "); + while ((str = hostlist_shift_range(hl2))) { + printf("`%s' ", str); + free(str); + } + hostlist_destroy(hl2); + printf("\n"); + + iter = hostset_iterator_create(set); + iter2 = hostset_iterator_create(set); + hostlist_iterator_destroy(iter2); + + printf("next: "); + while ((str = hostlist_next(iter))) { + printf("`%s' ", str); + free(str); + } + printf("\n"); + + hostlist_iterator_reset(iter); + printf("next_range: "); + while ((str = hostlist_next_range(iter))) { + printf("`%s' ", str); + free(str); + } + printf("\n"); + + printf("nranges = %d\n", hostset_nranges(set)); + + hostset_ranged_string(set, 1024, buf); + printf("set = %s\n", buf); + + hostset_destroy(set); + hostlist_destroy(hl1); + return 0; +} + +#endif /* TEST_MAIN */ + +/* + * vi: tabstop=4 shiftwidth=4 expandtab + */ diff --git a/lib/hostlist.h b/lib/hostlist.h new file mode 100644 index 0000000..d87d6ef --- /dev/null +++ b/lib/hostlist.h @@ -0,0 +1,419 @@ 
+/*****************************************************************************\ + * $Id: hostlist.h 7428 2008-05-23 16:08:31Z grondo $ + ***************************************************************************** + * Copyright (C) 2002 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Mark Grondona + * UCRL-CODE-2002-040. + * + * This file is part of SLURM, a resource management program. + * For details, see . + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ + +#ifndef _HOSTLIST_H +#define _HOSTLIST_H + +#include + +/* Notes: + * + * If WITH_LSD_FATAL_ERROR_FUNC is defined, the linker will expect to + * find an external lsd_fatal_error(file,line,mesg) function. By default, + * lsd_fatal_error(file,line,mesg) is a macro definition that outputs an + * error message to stderr. This macro may be redefined to invoke another + * routine instead. e.g.: + * + * #define lsd_fatal_error(file,line,mesg) \ + * error("%s:%s %s\n",file,line,mesg); + * + * If WITH_LSD_NOMEM_ERROR_FUNC is defined, the linker will expect to + * find an external lsd_nomem_error(file,line,mesg) function. 
By default, + * lsd_nomem_error(file,line,mesg) is a macro definition that returns NULL. + * This macro may be redefined to invoke another routine instead. + * + * If WITH_PTHREADS is defined, these routines will be thread-safe. + * + */ + +/* The hostlist opaque data type + * + * A hostlist is a list of hostnames optimized for a prefixXXXX style + * naming convention, where XXXX is a decimal, numeric suffix. + */ +typedef struct hostlist * hostlist_t; + +/* A hostset is a special case of a hostlist. It: + * + * 1. never contains duplicates + * 2. is always sorted + * (Note: sort occurs first on alphanumeric prefix -- where prefix + * matches, numeric suffixes will be sorted *by value*) + */ +typedef struct hostset * hostset_t; + +/* The hostlist iterator type (may be used with a hostset as well) + * used for non-destructive access to hostlist members. + * + */ +typedef struct hostlist_iterator * hostlist_iterator_t; + +/* ----[ hostlist_t functions: ]---- */ + +/* ----[ hostlist creation and destruction ]---- */ + +/* + * hostlist_create(): + * + * Create a new hostlist from a string representation. + * + * The string representation (str) may contain one or more hostnames or + * bracketed hostlists separated by either `,' or whitespace. A bracketed + * hostlist is denoted by a common prefix followed by a list of numeric + * ranges contained within brackets: e.g. "tux[0-5,12,20-25]" + * + * Note: if this module is compiled with WANT_RECKLESS_HOSTRANGE_EXPANSION + * defined, a much more loose interpretation of host ranges is used. + * Reckless hostrange expansion allows all of the following (in addition to + * bracketed hostlists): + * + * o tux0-5,tux12,tux20-25 + * o tux0-tux5,tux12,tux20-tux25 + * o tux0-5,12,20-25 + * + * If str is NULL, an empty hostlist is created and returned. + * + * If the create fails, hostlist_create() returns NULL. 
+ * + * The returned hostlist must be freed with hostlist_destroy() + * + */ +hostlist_t hostlist_create(const char *hostlist); + +/* hostlist_copy(): + * + * Allocate a copy of a hostlist object. Returned hostlist must be freed + * with hostlist_destroy. + */ +hostlist_t hostlist_copy(const hostlist_t hl); + +/* hostlist_destroy(): + * + * Destroy a hostlist object. Frees all memory allocated to the hostlist. + */ +void hostlist_destroy(hostlist_t hl); + + +/* ----[ hostlist list operations ]---- */ + +/* hostlist_push(): + * + * push a string representation of hostnames onto a hostlist. + * + * The hosts argument may take the same form as in hostlist_create() + * + * Returns the number of hostnames inserted into the list, + * or 0 on failure. + */ +int hostlist_push(hostlist_t hl, const char *hosts); + + +/* hostlist_push_host(): + * + * Push a single host onto the hostlist hl. + * This function is more efficient than hostlist_push() for a single + * hostname, since the argument does not need to be checked for ranges. + * + * return value is 1 for success, 0 for failure. + */ +int hostlist_push_host(hostlist_t hl, const char *host); + + +/* hostlist_push_list(): + * + * Push a hostlist (hl2) onto another list (hl1) + * + * Returns 1 for success, 0 for failure. + * + */ +int hostlist_push_list(hostlist_t hl1, hostlist_t hl2); + + +/* hostlist_pop(): + * + * Returns the string representation of the last host pushed onto the list + * or NULL if hostlist is empty or there was an error allocating memory. + * The host is removed from the hostlist. + * + * Note: Caller is responsible for freeing the returned memory. + */ +char * hostlist_pop(hostlist_t hl); + + +char * hostlist_nth(hostlist_t hl, int n); + +/* hostlist_shift(): + * + * Returns the string representation of the first host in the hostlist + * or NULL if the hostlist is empty or there was an error allocating memory. + * The host is removed from the hostlist. 
+ * + * Note: Caller is responsible for freeing the returned memory. + */ +char * hostlist_shift(hostlist_t hl); + + +/* hostlist_pop_range(): + * + * Pop the last bracketed list of hosts of the hostlist hl. + * Returns the string representation in bracketed list form. + * All hosts associated with the returned list are removed + * from hl. + * + * Caller is responsible for freeing returned memory + */ +char * hostlist_pop_range(hostlist_t hl); + +/* hostlist_shift_range(): + * + * Shift the first bracketed hostlist (improperly: range) off the + * hostlist hl. Returns the string representation in bracketed list + * form. All hosts associated with the list are removed from the + * hostlist. + * + * Caller is responsible for freeing returned memory. + */ +char * hostlist_shift_range(hostlist_t hl); + + +/* hostlist_find(): + * + * Searches hostlist hl for the first host matching hostname + * and returns position in list if found. + * + * Returns -1 if host is not found. + * + */ +int hostlist_find(hostlist_t hl, const char *hostname); + +/* hostlist_delete(): + * + * Deletes all hosts in the list represented by `hosts' + * + * Returns the number of hosts successfully deleted + */ +int hostlist_delete(hostlist_t hl, const char *hosts); + + +/* hostlist_delete_host(): + * + * Deletes the first host that matches `hostname' from the hostlist hl. + * Note: "hostname" argument cannot contain a range of hosts + * (see hostlist_delete() for this functionality.) + * + * Returns 1 if successful, 0 if hostname is not found in list. + */ +int hostlist_delete_host(hostlist_t hl, const char *hostname); + + +/* hostlist_delete_nth(): + * + * Deletes the host from position n in the hostlist. + * + * Returns 1 if successful 0 on error. + * + */ +int hostlist_delete_nth(hostlist_t hl, int n); + + +/* hostlist_count(): + * + * Return the number of hosts in hostlist hl. + */ +int hostlist_count(hostlist_t hl); + +/* hostlist_is_empty(): return true if hostlist is empty. 
*/ +#define hostlist_is_empty(__hl) ( hostlist_count(__hl) == 0 ) + +/* ----[ Other hostlist operations ]---- */ + +/* hostlist_sort(): + * + * Sort the hostlist hl. + * + */ +void hostlist_sort(hostlist_t hl); + +/* hostlist_uniq(): + * + * Sort the hostlist hl and remove duplicate entries. + * + */ +void hostlist_uniq(hostlist_t hl); + + +/* ----[ hostlist print functions ]---- */ + +/* hostlist_ranged_string(): + * + * Write the string representation of the hostlist hl into buf, + * writing at most n chars. Returns the number of bytes written, + * or -1 if truncation occurred. + * + * The result will be NULL terminated. + * + * hostlist_ranged_string() will write a bracketed hostlist representation + * where possible. + */ +ssize_t hostlist_ranged_string(hostlist_t hl, size_t n, char *buf); +ssize_t hostset_ranged_string(hostset_t hs, size_t n, char *buf); + +/* hostlist_deranged_string(): + * + * Writes the string representation of the hostlist hl into buf, + * writing at most n chars. Returns the number of bytes written, + * or -1 if truncation occurred. + * + * hostlist_deranged_string() will not attempt to write a bracketed + * hostlist representation. Every hostname will be explicitly written. + */ +ssize_t hostlist_deranged_string(hostlist_t hl, size_t n, char *buf); +ssize_t hostset_deranged_string(hostset_t hs, size_t n, char *buf); + + +/* ----[ hostlist utility functions ]---- */ + + +/* hostlist_nranges(): + * + * Return the number of ranges currently held in hostlist hl. + */ +int hostlist_nranges(hostlist_t hl); + + +/* ----[ hostlist iterator functions ]---- */ + +/* hostlist_iterator_create(): + * + * Creates and returns a hostlist iterator used for non destructive + * access to a hostlist or hostset. Returns NULL on failure. + */ +hostlist_iterator_t hostlist_iterator_create(hostlist_t hl); + +/* hostset_iterator_create(): + * + * Same as hostlist_iterator_create(), but creates a hostlist_iterator + * from a hostset. 
+ */ +hostlist_iterator_t hostset_iterator_create(hostset_t set); + +/* hostlist_iterator_destroy(): + * + * Destroys a hostlist iterator. + */ +void hostlist_iterator_destroy(hostlist_iterator_t i); + +/* hostlist_iterator_reset(): + * + * Reset an iterator to the beginning of the list. + */ +void hostlist_iterator_reset(hostlist_iterator_t i); + +/* hostlist_next(): + * + * Returns a pointer to the next hostname on the hostlist + * or NULL at the end of the list + * + * The caller is responsible for freeing the returned memory. + */ +char * hostlist_next(hostlist_iterator_t i); + + +/* hostlist_next_range(): + * + * Returns the next bracketed hostlist or NULL if the iterator i is + * at the end of the list. + * + * The caller is responsible for freeing the returned memory. + * + */ +char * hostlist_next_range(hostlist_iterator_t i); + + +/* hostlist_remove(): + * Removes the last host returned by hostlist iterator i + * + * Returns 1 for success, 0 for failure. + */ +int hostlist_remove(hostlist_iterator_t i); + + +/* ----[ hostset operations ]---- */ + +/* hostset_create(): + * + * Create a new hostset object from a string representation of a list of + * hosts. See hostlist_create() for valid hostlist forms. + */ +hostset_t hostset_create(const char *hostlist); + +/* hostset_copy(): + * + * Copy a hostset object. Returned set must be freed with hostset_destroy(). + */ +hostset_t hostset_copy(hostset_t set); + +/* hostset_destroy(): + */ +void hostset_destroy(hostset_t set); + +/* hostset_insert(): + * Add a host or list of hosts into hostset "set." + * + * Returns number of hosts successfully added to "set" + * (insertion of a duplicate is not considered successful) + */ +int hostset_insert(hostset_t set, const char *hosts); + +/* hostset_delete(): + * Delete a host or list of hosts from hostset "set." + * Returns number of hosts deleted from set. 
+ */ +int hostset_delete(hostset_t set, const char *hosts); + +/* hostset_within(): + * Return 1 if all hosts specified by "hosts" are within the hostset "set" + * Return 0 if any host in "hosts" is not in the hostset "set" + */ +int hostset_within(hostset_t set, const char *hosts); + +/* hostset_shift(): + * hostset equivalent to hostlist_shift() + */ +char * hostset_shift(hostset_t set); + +/* hostset_shift_range(): + * hostset equivalent to hostlist_shift_range() + */ +char * hostset_shift_range(hostset_t set); + +/* hostset_count(): + * Count the number of hosts currently in hostset + */ +int hostset_count(hostset_t set); + + +#endif /* !_HOSTLIST_H */ diff --git a/lib/list.c b/lib/list.c new file mode 100644 index 0000000..de4de72 --- /dev/null +++ b/lib/list.c @@ -0,0 +1,835 @@ +/***************************************************************************** + * $Id$ + ***************************************************************************** + * $LSDId: list.c 3709 2006-11-29 00:51:22Z dun $ + ***************************************************************************** + * Copyright (C) 2001-2002 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Chris Dunlap . + * + * This file is from LSD-Tools, the LLNL Software Development Toolbox. + * + * LSD-Tools is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * LSD-Tools is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with LSD-Tools; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + ***************************************************************************** + * Refer to "list.h" for documentation on public functions. + *****************************************************************************/ + + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif /* HAVE_CONFIG_H */ + +#ifdef WITH_PTHREADS +# include <pthread.h> +#endif /* WITH_PTHREADS */ + +#include <assert.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include "list.h" + + +/********************* + * lsd_fatal_error * + *********************/ + +#ifdef WITH_LSD_FATAL_ERROR_FUNC +# undef lsd_fatal_error + extern void lsd_fatal_error(char *file, int line, char *mesg); +#else /* !WITH_LSD_FATAL_ERROR_FUNC */ +# ifndef lsd_fatal_error +# include <errno.h> +# include <stdio.h> +# include <string.h> +# define lsd_fatal_error(file, line, mesg) \ + do { \ + fprintf(stderr, "ERROR: [%s:%d] %s: %s\n", \ + file, line, mesg, strerror(errno)); \ + } while (0) +# endif /* !lsd_fatal_error */ +#endif /* !WITH_LSD_FATAL_ERROR_FUNC */ + + +/********************* + * lsd_nomem_error * + *********************/ + +#ifdef WITH_LSD_NOMEM_ERROR_FUNC +# undef lsd_nomem_error + extern void * lsd_nomem_error(char *file, int line, char *mesg); +#else /* !WITH_LSD_NOMEM_ERROR_FUNC */ +# ifndef lsd_nomem_error +# define lsd_nomem_error(file, line, mesg) (NULL) +# endif /* !lsd_nomem_error */ +#endif /* !WITH_LSD_NOMEM_ERROR_FUNC */ + + +/*************** + * Constants * + ***************/ + +#define LIST_ALLOC 32 +#define LIST_MAGIC 0xDEADBEEF + + +/**************** + * Data Types * + ****************/ + +struct listNode { + void *data; /* node's data */ + struct listNode *next; /* next node in list */ +}; + +struct listIterator { + struct list *list; /* the list being iterated */ + struct listNode *pos; /* the next node to be iterated */ + struct listNode **prev; /*
addr of 'next' ptr to prv It node */ + struct listIterator *iNext; /* iterator chain for list_destroy() */ +#ifndef NDEBUG + unsigned int magic; /* sentinel for asserting validity */ +#endif /* !NDEBUG */ +}; + +struct list { + struct listNode *head; /* head of the list */ + struct listNode **tail; /* addr of last node's 'next' ptr */ + struct listIterator *iNext; /* iterator chain for list_destroy() */ + ListDelF fDel; /* function to delete node data */ + int count; /* number of nodes in list */ +#ifdef WITH_PTHREADS + pthread_mutex_t mutex; /* mutex to protect access to list */ +#endif /* WITH_PTHREADS */ +#ifndef NDEBUG + unsigned int magic; /* sentinel for asserting validity */ +#endif /* !NDEBUG */ +}; + +typedef struct listNode * ListNode; + + +/**************** + * Prototypes * + ****************/ + +static void * list_node_create (List l, ListNode *pp, void *x); +static void * list_node_destroy (List l, ListNode *pp); +static List list_alloc (void); +static void list_free (List l); +static ListNode list_node_alloc (void); +static void list_node_free (ListNode p); +static ListIterator list_iterator_alloc (void); +static void list_iterator_free (ListIterator i); +static void * list_alloc_aux (int size, void *pfreelist); +static void list_free_aux (void *x, void *pfreelist); + + +/*************** + * Variables * + ***************/ + +static List list_free_lists = NULL; +static ListNode list_free_nodes = NULL; +static ListIterator list_free_iterators = NULL; + +#ifdef WITH_PTHREADS +static pthread_mutex_t list_free_lock = PTHREAD_MUTEX_INITIALIZER; +#endif /* WITH_PTHREADS */ + + +/************ + * Macros * + ************/ + +#ifdef WITH_PTHREADS + +# define list_mutex_init(mutex) \ + do { \ + int e = pthread_mutex_init(mutex, NULL); \ + if (e != 0) { \ + errno = e; \ + lsd_fatal_error(__FILE__, __LINE__, "list mutex init"); \ + abort(); \ + } \ + } while (0) + +# define list_mutex_lock(mutex) \ + do { \ + int e = pthread_mutex_lock(mutex); \ + if (e != 0) { \ 
+ errno = e; \ + lsd_fatal_error(__FILE__, __LINE__, "list mutex lock"); \ + abort(); \ + } \ + } while (0) + +# define list_mutex_unlock(mutex) \ + do { \ + int e = pthread_mutex_unlock(mutex); \ + if (e != 0) { \ + errno = e; \ + lsd_fatal_error(__FILE__, __LINE__, "list mutex unlock"); \ + abort(); \ + } \ + } while (0) + +# define list_mutex_destroy(mutex) \ + do { \ + int e = pthread_mutex_destroy(mutex); \ + if (e != 0) { \ + errno = e; \ + lsd_fatal_error(__FILE__, __LINE__, "list mutex destroy"); \ + abort(); \ + } \ + } while (0) + +# ifndef NDEBUG + static int list_mutex_is_locked (pthread_mutex_t *mutex); +# endif /* !NDEBUG */ + +#else /* !WITH_PTHREADS */ + +# define list_mutex_init(mutex) +# define list_mutex_lock(mutex) +# define list_mutex_unlock(mutex) +# define list_mutex_destroy(mutex) +# define list_mutex_is_locked(mutex) (1) + +#endif /* !WITH_PTHREADS */ + + +/*************** + * Functions * + ***************/ + +List +list_create (ListDelF f) +{ + List l; + + if (!(l = list_alloc())) + return(lsd_nomem_error(__FILE__, __LINE__, "list create")); + l->head = NULL; + l->tail = &l->head; + l->iNext = NULL; + l->fDel = f; + l->count = 0; + list_mutex_init(&l->mutex); + assert(l->magic = LIST_MAGIC); /* set magic via assert abuse */ + return(l); +} + + +void +list_destroy (List l) +{ + ListIterator i, iTmp; + ListNode p, pTmp; + + assert(l != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + i = l->iNext; + while (i) { + assert(i->magic == LIST_MAGIC); + iTmp = i->iNext; + assert(i->magic = ~LIST_MAGIC); /* clear magic via assert abuse */ + list_iterator_free(i); + i = iTmp; + } + p = l->head; + while (p) { + pTmp = p->next; + if (p->data && l->fDel) + l->fDel(p->data); + list_node_free(p); + p = pTmp; + } + assert(l->magic = ~LIST_MAGIC); /* clear magic via assert abuse */ + list_mutex_unlock(&l->mutex); + list_mutex_destroy(&l->mutex); + list_free(l); + return; +} + + +int +list_is_empty (List l) +{ + int n; + + assert(l != 
NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + n = l->count; + list_mutex_unlock(&l->mutex); + return(n == 0); +} + + +int +list_count (List l) +{ + int n; + + assert(l != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + n = l->count; + list_mutex_unlock(&l->mutex); + return(n); +} + + +void * +list_append (List l, void *x) +{ + void *v; + + assert(l != NULL); + assert(x != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + v = list_node_create(l, l->tail, x); + list_mutex_unlock(&l->mutex); + return(v); +} + + +void * +list_prepend (List l, void *x) +{ + void *v; + + assert(l != NULL); + assert(x != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + v = list_node_create(l, &l->head, x); + list_mutex_unlock(&l->mutex); + return(v); +} + + +void * +list_find_first (List l, ListFindF f, void *key) +{ + ListNode p; + void *v = NULL; + + assert(l != NULL); + assert(f != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + for (p=l->head; p; p=p->next) { + if (f(p->data, key)) { + v = p->data; + break; + } + } + list_mutex_unlock(&l->mutex); + return(v); +} + + +int +list_delete_all (List l, ListFindF f, void *key) +{ + ListNode *pp; + void *v; + int n = 0; + + assert(l != NULL); + assert(f != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + pp = &l->head; + while (*pp) { + if (f((*pp)->data, key)) { + if ((v = list_node_destroy(l, pp))) { + if (l->fDel) + l->fDel(v); + n++; + } + } + else { + pp = &(*pp)->next; + } + } + list_mutex_unlock(&l->mutex); + return(n); +} + + +int +list_for_each (List l, ListForF f, void *arg) +{ + ListNode p; + int n = 0; + + assert(l != NULL); + assert(f != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + for (p=l->head; p; p=p->next) { + n++; + if (f(p->data, arg) < 0) { + n = -n; + break; + } + } + list_mutex_unlock(&l->mutex); + return(n); +} + + +void +list_sort (List l, 
ListCmpF f) +{ +/* Note: Time complexity O(n^2). + */ + ListNode *pp, *ppPrev, *ppPos, pTmp; + ListIterator i; + + assert(l != NULL); + assert(f != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + if (l->count > 1) { + ppPrev = &l->head; + pp = &(*ppPrev)->next; + while (*pp) { + if (f((*pp)->data, (*ppPrev)->data) < 0) { + ppPos = &l->head; + while (f((*pp)->data, (*ppPos)->data) >= 0) + ppPos = &(*ppPos)->next; + pTmp = (*pp)->next; + (*pp)->next = *ppPos; + *ppPos = *pp; + *pp = pTmp; + if (ppPrev == ppPos) + ppPrev = &(*ppPrev)->next; + } + else { + ppPrev = pp; + pp = &(*pp)->next; + } + } + l->tail = pp; + + for (i=l->iNext; i; i=i->iNext) { + assert(i->magic == LIST_MAGIC); + i->pos = i->list->head; + i->prev = &i->list->head; + } + } + list_mutex_unlock(&l->mutex); + return; +} + + +void * +list_push (List l, void *x) +{ + void *v; + + assert(l != NULL); + assert(x != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + v = list_node_create(l, &l->head, x); + list_mutex_unlock(&l->mutex); + return(v); +} + + +void * +list_pop (List l) +{ + void *v; + + assert(l != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + v = list_node_destroy(l, &l->head); + list_mutex_unlock(&l->mutex); + return(v); +} + + +void * +list_peek (List l) +{ + void *v; + + assert(l != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + v = (l->head) ? 
l->head->data : NULL; + list_mutex_unlock(&l->mutex); + return(v); +} + + +void * +list_enqueue (List l, void *x) +{ + void *v; + + assert(l != NULL); + assert(x != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + v = list_node_create(l, l->tail, x); + list_mutex_unlock(&l->mutex); + return(v); +} + + +void * +list_dequeue (List l) +{ + void *v; + + assert(l != NULL); + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + v = list_node_destroy(l, &l->head); + list_mutex_unlock(&l->mutex); + return(v); +} + + +ListIterator +list_iterator_create (List l) +{ + ListIterator i; + + assert(l != NULL); + if (!(i = list_iterator_alloc())) + return(lsd_nomem_error(__FILE__, __LINE__, "list iterator create")); + i->list = l; + list_mutex_lock(&l->mutex); + assert(l->magic == LIST_MAGIC); + i->pos = l->head; + i->prev = &l->head; + i->iNext = l->iNext; + l->iNext = i; + assert(i->magic = LIST_MAGIC); /* set magic via assert abuse */ + list_mutex_unlock(&l->mutex); + return(i); +} + + +void +list_iterator_reset (ListIterator i) +{ + assert(i != NULL); + assert(i->magic == LIST_MAGIC); + list_mutex_lock(&i->list->mutex); + assert(i->list->magic == LIST_MAGIC); + i->pos = i->list->head; + i->prev = &i->list->head; + list_mutex_unlock(&i->list->mutex); + return; +} + + +void +list_iterator_destroy (ListIterator i) +{ + ListIterator *pi; + + assert(i != NULL); + assert(i->magic == LIST_MAGIC); + list_mutex_lock(&i->list->mutex); + assert(i->list->magic == LIST_MAGIC); + for (pi=&i->list->iNext; *pi; pi=&(*pi)->iNext) { + assert((*pi)->magic == LIST_MAGIC); + if (*pi == i) { + *pi = (*pi)->iNext; + break; + } + } + list_mutex_unlock(&i->list->mutex); + assert(i->magic = ~LIST_MAGIC); /* clear magic via assert abuse */ + list_iterator_free(i); + return; +} + + +void * +list_next (ListIterator i) +{ + ListNode p; + + assert(i != NULL); + assert(i->magic == LIST_MAGIC); + list_mutex_lock(&i->list->mutex); + assert(i->list->magic == LIST_MAGIC); + if 
((p = i->pos)) + i->pos = p->next; + if (*i->prev != p) + i->prev = &(*i->prev)->next; + list_mutex_unlock(&i->list->mutex); + return(p ? p->data : NULL); +} + + +void * +list_insert (ListIterator i, void *x) +{ + void *v; + + assert(i != NULL); + assert(x != NULL); + assert(i->magic == LIST_MAGIC); + list_mutex_lock(&i->list->mutex); + assert(i->list->magic == LIST_MAGIC); + v = list_node_create(i->list, i->prev, x); + list_mutex_unlock(&i->list->mutex); + return(v); +} + + +void * +list_find (ListIterator i, ListFindF f, void *key) +{ + void *v; + + assert(i != NULL); + assert(f != NULL); + assert(i->magic == LIST_MAGIC); + while ((v=list_next(i)) && !f(v,key)) {;} + return(v); +} + + +void * +list_remove (ListIterator i) +{ + void *v = NULL; + + assert(i != NULL); + assert(i->magic == LIST_MAGIC); + list_mutex_lock(&i->list->mutex); + assert(i->list->magic == LIST_MAGIC); + if (*i->prev != i->pos) + v = list_node_destroy(i->list, i->prev); + list_mutex_unlock(&i->list->mutex); + return(v); +} + + +int +list_delete (ListIterator i) +{ + void *v; + + assert(i != NULL); + assert(i->magic == LIST_MAGIC); + if ((v = list_remove(i))) { + if (i->list->fDel) + i->list->fDel(v); + return(1); + } + return(0); +} + + +static void * +list_node_create (List l, ListNode *pp, void *x) +{ +/* Inserts data pointed to by [x] into list [l] after [pp], + * the address of the previous node's "next" ptr. + * Returns a ptr to data [x], or NULL if insertion fails. + * This routine assumes the list is already locked upon entry. 
+ */ + ListNode p; + ListIterator i; + + assert(l != NULL); + assert(l->magic == LIST_MAGIC); + assert(list_mutex_is_locked(&l->mutex)); + assert(pp != NULL); + assert(x != NULL); + if (!(p = list_node_alloc())) + return(lsd_nomem_error(__FILE__, __LINE__, "list node create")); + p->data = x; + if (!(p->next = *pp)) + l->tail = &p->next; + *pp = p; + l->count++; + for (i=l->iNext; i; i=i->iNext) { + assert(i->magic == LIST_MAGIC); + if (i->prev == pp) + i->prev = &p->next; + else if (i->pos == p->next) + i->pos = p; + assert((i->pos == *i->prev) || (i->pos == (*i->prev)->next)); + } + return(x); +} + + +static void * +list_node_destroy (List l, ListNode *pp) +{ +/* Removes the node pointed to by [*pp] from from list [l], + * where [pp] is the address of the previous node's "next" ptr. + * Returns the data ptr associated with list item being removed, + * or NULL if [*pp] points to the NULL element. + * This routine assumes the list is already locked upon entry. + */ + void *v; + ListNode p; + ListIterator i; + + assert(l != NULL); + assert(l->magic == LIST_MAGIC); + assert(list_mutex_is_locked(&l->mutex)); + assert(pp != NULL); + if (!(p = *pp)) + return(NULL); + v = p->data; + if (!(*pp = p->next)) + l->tail = pp; + l->count--; + for (i=l->iNext; i; i=i->iNext) { + assert(i->magic == LIST_MAGIC); + if (i->pos == p) + i->pos = p->next, i->prev = pp; + else if (i->prev == &p->next) + i->prev = pp; + assert((i->pos == *i->prev) || (i->pos == (*i->prev)->next)); + } + list_node_free(p); + return(v); +} + + +static List +list_alloc (void) +{ + return(list_alloc_aux(sizeof(struct list), &list_free_lists)); +} + + +static void +list_free (List l) +{ + list_free_aux(l, &list_free_lists); + return; +} + + +static ListNode +list_node_alloc (void) +{ + return(list_alloc_aux(sizeof(struct listNode), &list_free_nodes)); +} + + +static void +list_node_free (ListNode p) +{ + list_free_aux(p, &list_free_nodes); + return; +} + + +static ListIterator +list_iterator_alloc (void) +{ + 
return(list_alloc_aux(sizeof(struct listIterator), &list_free_iterators)); +} + + +static void +list_iterator_free (ListIterator i) +{ + list_free_aux(i, &list_free_iterators); + return; +} + + +static void * +list_alloc_aux (int size, void *pfreelist) +{ +/* Allocates an object of [size] bytes from the freelist [*pfreelist]. + * Memory is added to the freelist in chunks of size LIST_ALLOC. + * Returns a ptr to the object, or NULL if the memory request fails. + */ + void **px; + void **pfree = pfreelist; + void **plast; + + assert(sizeof(char) == 1); + assert(size >= sizeof(void *)); + assert(pfreelist != NULL); + assert(LIST_ALLOC > 0); + list_mutex_lock(&list_free_lock); + if (!*pfree) { + if ((*pfree = malloc(LIST_ALLOC * size))) { + px = *pfree; + plast = (void **) ((char *) *pfree + ((LIST_ALLOC - 1) * size)); + while (px < plast) + *px = (char *) px + size, px = *px; + *plast = NULL; + } + } + if ((px = *pfree)) + *pfree = *px; + else + errno = ENOMEM; + list_mutex_unlock(&list_free_lock); + return(px); +} + + +static void +list_free_aux (void *x, void *pfreelist) +{ +/* Frees the object [x], returning it to the freelist [*pfreelist]. + */ + void **px = x; + void **pfree = pfreelist; + + assert(x != NULL); + assert(pfreelist != NULL); + list_mutex_lock(&list_free_lock); + *px = *pfree; + *pfree = px; + list_mutex_unlock(&list_free_lock); + return; +} + + +#ifndef NDEBUG +#ifdef WITH_PTHREADS +static int +list_mutex_is_locked (pthread_mutex_t *mutex) +{ +/* Returns true if the mutex is locked; o/w, returns false. + */ + int rc; + + assert(mutex != NULL); + rc = pthread_mutex_trylock(mutex); + return(rc == EBUSY ? 
1 : 0); +} +#endif /* WITH_PTHREADS */ +#endif /* !NDEBUG */ diff --git a/lib/list.h b/lib/list.h new file mode 100644 index 0000000..b031fc7 --- /dev/null +++ b/lib/list.h @@ -0,0 +1,281 @@ +/***************************************************************************** + * $Id: list.h,v 1.14 2002/12/11 19:00:36 dun Exp $ + ***************************************************************************** + * Copyright (C) 2001-2002 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Chris Dunlap . + * + * This file is from LSD-Tools, the LLNL Software Development Toolbox. + * + * LSD-Tools is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * LSD-Tools is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with LSD-Tools; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + *****************************************************************************/ + + +#ifndef LSD_LIST_H +#define LSD_LIST_H + + +/*********** + * Notes * + ***********/ +/* + * If NDEBUG is not defined, internal debug code will be enabled. This is + * intended for development use only and production code should define NDEBUG. + * + * If WITH_LSD_FATAL_ERROR_FUNC is defined, the linker will expect to + * find an external lsd_fatal_error(file,line,mesg) function. By default, + * lsd_fatal_error(file,line,mesg) is a macro definition that outputs an + * error message to stderr. 
This macro may be redefined to invoke another + * routine instead. + * + * If WITH_LSD_NOMEM_ERROR_FUNC is defined, the linker will expect to + * find an external lsd_nomem_error(file,line,mesg) function. By default, + * lsd_nomem_error(file,line,mesg) is a macro definition that returns NULL. + * This macro may be redefined to invoke another routine instead. + * + * If WITH_PTHREADS is defined, these routines will be thread-safe. + */ + + +/**************** + * Data Types * + ****************/ + +typedef struct list * List; +/* + * List opaque data type. + */ + +typedef struct listIterator * ListIterator; +/* + * List Iterator opaque data type. + */ + +typedef void (*ListDelF) (void *x); +/* + * Function prototype to deallocate data stored in a list. + * This function is responsible for freeing all memory associated + * with an item, including all subordinate items (if applicable). + */ + +typedef int (*ListCmpF) (void *x, void *y); +/* + * Function prototype for comparing two items in a list. + * Returns less-than-zero if (x<y), zero if (x==y), and + * greater-than-zero if (x>y). + */ + +typedef int (*ListFindF) (void *x, void *key); +/* + * Function prototype for matching items in a list. + * Returns non-zero if (x==key); o/w returns zero. + */ + +typedef int (*ListForF) (void *x, void *arg); +/* + * Function prototype for operating on each item in a list. + * Returns less-than-zero on error. + */ + + +/******************************* + * General-Purpose Functions * + *******************************/ + +List list_create (ListDelF f); +/* + * Creates and returns a new empty list, or lsd_nomem_error() on failure. + * The deletion function [f] is used to deallocate memory used by items + * in the list; if this is NULL, memory associated with these items + * will not be freed when the list is destroyed. + * Note: Abandoning a list without calling list_destroy() will result + * in a memory leak.
+ */ + +void list_destroy (List l); +/* + * Destroys list [l], freeing memory used for list iterators and the + * list itself; if a deletion function was specified when the list + * was created, it will be called for each item in the list. + */ + +int list_is_empty (List l); +/* + * Returns non-zero if list [l] is empty; o/w returns zero. + */ + +int list_count (List l); +/* + * Returns the number of items in list [l]. + */ + + +/*************************** + * List Access Functions * + ***************************/ + +void * list_append (List l, void *x); +/* + * Inserts data [x] at the end of list [l]. + * Returns the data's ptr, or lsd_nomem_error() if insertion failed. + */ + +void * list_prepend (List l, void *x); +/* + * Inserts data [x] at the beginning of list [l]. + * Returns the data's ptr, or lsd_nomem_error() if insertion failed. + */ + +void * list_find_first (List l, ListFindF f, void *key); +/* + * Traverses list [l] using [f] to match each item with [key]. + * Returns a ptr to the first item for which the function [f] + * returns non-zero, or NULL if no such item is found. + * Note: This function differs from list_find() in that it does not require + * a list iterator; it should only be used when all list items are known + * to be unique (according to the function [f]). + */ + +int list_delete_all (List l, ListFindF f, void *key); +/* + * Traverses list [l] using [f] to match each item with [key]. + * Removes all items from the list for which the function [f] returns + * non-zero; if a deletion function was specified when the list was + * created, it will be called to deallocate each item being removed. + * Returns a count of the number of items removed from the list. + */ + +int list_for_each (List l, ListForF f, void *arg); +/* + * For each item in list [l], invokes the function [f] with [arg]. + * Returns a count of the number of items on which [f] was invoked. 
+ * If [f] returns <0 for a given item, the iteration is aborted and the + * function returns the negative of that item's position in the list. + */ + +void list_sort (List l, ListCmpF f); +/* + * Sorts list [l] into ascending order according to the function [f]. + * Note: Sorting a list resets all iterators associated with the list. + * Note: The sort algorithm is stable. + */ + + +/**************************** + * Stack Access Functions * + ****************************/ + +void * list_push (List l, void *x); +/* + * Pushes data [x] onto the top of stack [l]. + * Returns the data's ptr, or lsd_nomem_error() if insertion failed. + */ + +void * list_pop (List l); +/* + * Pops the data item at the top of the stack [l]. + * Returns the data's ptr, or NULL if the stack is empty. + */ + +void * list_peek (List l); +/* + * Peeks at the data item at the top of the stack (or head of the queue) [l]. + * Returns the data's ptr, or NULL if the stack (or queue) is empty. + * Note: The item is not removed from the list. + */ + + +/**************************** + * Queue Access Functions * + ****************************/ + +void * list_enqueue (List l, void *x); +/* + * Enqueues data [x] at the tail of queue [l]. + * Returns the data's ptr, or lsd_nomem_error() if insertion failed. + */ + +void * list_dequeue (List l); +/* + * Dequeues the data item at the head of the queue [l]. + * Returns the data's ptr, or NULL if the queue is empty. + */ + + +/***************************** + * List Iterator Functions * + *****************************/ + +ListIterator list_iterator_create (List l); +/* + * Creates and returns a list iterator for non-destructively traversing + * list [l], or lsd_nomem_error() on failure. + */ + +void list_iterator_reset (ListIterator i); +/* + * Resets the list iterator [i] to start traversal at the beginning + * of the list. 
+ */ + +void list_iterator_destroy (ListIterator i); +/* + * Destroys the list iterator [i]; list iterators not explicitly destroyed + * in this manner will be destroyed when the list is deallocated via + * list_destroy(). + */ + +void * list_next (ListIterator i); +/* + * Returns a ptr to the next item's data, + * or NULL once the end of the list is reached. + * Example: i=list_iterator_create(l); while ((x=list_next(i))) {...} + */ + +void * list_insert (ListIterator i, void *x); +/* + * Inserts data [x] immediately before the last item returned via list + * iterator [i]; once the list iterator reaches the end of the list, + * insertion is made at the list's end. + * Returns the data's ptr, or lsd_nomem_error() if insertion failed. + */ + +void * list_find (ListIterator i, ListFindF f, void *key); +/* + * Traverses the list from the point of the list iterator [i] + * using [f] to match each item with [key]. + * Returns a ptr to the next item for which the function [f] + * returns non-zero, or NULL once the end of the list is reached. + * Example: list_iterator_reset(i); while ((x=list_find(i,f,k))) {...} + */ + +void * list_remove (ListIterator i); +/* + * Removes from the list the last item returned via list iterator [i] + * and returns the data's ptr. + * Note: The client is responsible for freeing the returned data. + */ + +int list_delete (ListIterator i); +/* + * Removes from the list the last item returned via list iterator [i]; + * if a deletion function was specified when the list was created, + * it will be called to deallocate the item being removed. + * Returns a count of the number of items removed from the list + * (ie, '1' if the item was removed, and '0' otherwise).
+ */ + + +#endif /* !LSD_LIST_H */ diff --git a/lib/split.c b/lib/split.c new file mode 100644 index 0000000..e85b704 --- /dev/null +++ b/lib/split.c @@ -0,0 +1,149 @@ +/*****************************************************************************\ + * $Id: split.c 1042 2006-03-30 20:55:59Z grondo $ + ***************************************************************************** + * Copyright (C) 2006 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Jim Garlick . + * UCRL-CODE-2003-005. + * + * This file is part of Pdsh, a parallel remote shell program. + * For details, see . + * + * Pdsh is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with Pdsh; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ + +#if HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include "split.h" + +/* + * Helper function for list_split(). Extract tokens from str. + * Return a pointer to the next token; at the same time, advance + * *str to point to the next separator. 
+ * sep (IN) string containing list of separator characters + * str (IN) double-pointer to string containing tokens and separators + * RETURN next token + */ +static char *_next_tok(char *sep, char **str) +{ + char *tok; + + /* push str past any leading separators */ + while (**str != '\0' && strchr(sep, **str) != NULL) + (*str)++; + + if (**str == '\0') + return NULL; + + /* assign token pointer */ + tok = *str; + + /* push str past token and leave pointing to first separator */ + while (**str != '\0' && strchr(sep, **str) == NULL) + (*str)++; + + /* nullify consecutive separators and push str beyond them */ + while (**str != '\0' && strchr(sep, **str) != NULL) + *(*str)++ = '\0'; + + return tok; +} + +/* + * Given a list of separators and a string, generate a list + * sep (IN) string containing separater characters + * str (IN) string containing tokens and separators + * RETURN new list containing all tokens + */ +List list_split(char *sep, char *str) +{ + List new = list_create((ListDelF) free); + char *tok; + + if (sep == NULL) + sep = " \t"; + + while ((tok = _next_tok(sep, &str)) != NULL) { + if (strlen(tok) > 0) + list_append(new, strdup(tok)); + } + + return new; +} + +List list_split_append (List l, char *sep, char *str) +{ + char *tok; + + if (sep == NULL) + sep = " \t"; + + while ((tok = _next_tok(sep, &str)) != NULL) { + if (strlen(tok) > 0) + list_append(l, strdup(tok)); + } + + return l; +} + +int list_join (char *result, size_t len, const char *sep, List l) +{ + char *str = NULL; + int n = 0; + int truncated = 0; + ListIterator i; + + memset (result, 0, len); + + if (list_count(l) == 0) + return (0); + + i = list_iterator_create(l); + while ((str = list_next(i))) { + int count; + + if (!truncated) { + count = snprintf(result + n, len - n, "%s%s", str, sep); + + if ((count >= (len - n)) || (count < 0)) + truncated = 1; + else + n += count; + } + else + n += strlen (str) + strlen (sep); + } + list_iterator_destroy(i); + + if (truncated) + result [len - 
1] = '\0'; + else { + /* + * Delete final separator + */ + result[strlen(result) - strlen(sep)] = '\0'; + } + + return (n); +} + +/* vi: ts=4 sw=4 expandtab + */ + diff --git a/lib/split.h b/lib/split.h new file mode 100644 index 0000000..6201ea4 --- /dev/null +++ b/lib/split.h @@ -0,0 +1,35 @@ +/*****************************************************************************\ + * $Id$ + ***************************************************************************** + * Copyright (C) 2006 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Jim Garlick . + * UCRL-CODE-2003-005. + * + * This file is part of Pdsh, a parallel remote shell program. + * For details, see . + * + * Pdsh is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with Pdsh; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+\*****************************************************************************/ +#ifndef _SPLIT_H +#define _SPLIT_H + +#include "list.h" + +List list_split (char *sep, char *str); +List list_split_append (List l, char *sep, char *str); +int list_join (char *result, size_t len, const char *sep, List l); + +#endif /* !_SPLIT_H */ diff --git a/oom-detect.c b/oom-detect.c new file mode 100644 index 0000000..512765b --- /dev/null +++ b/oom-detect.c @@ -0,0 +1,315 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +/*############################################################################ + * $Id$ + *############################################################################ + * + * SLURM spank plugin to detect tasks killed by OOM killer using CHAOS + * kernel /proc/oomkilled file. + * + * Requires SGI Job container-based process tracking. 
+ *
+ *############################################################################
+ */
+/*
+ *  NOTE(review): the header names on the #include lines were missing from
+ *   this copy of the patch. The list below is reconstructed from the symbols
+ *   this file uses and should be confirmed against the upstream source.
+ */
+#include <sys/types.h>
+#include <assert.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+#include <unistd.h>
+
+
+#include <job.h>
+#include <slurm/spank.h>
+
+SPANK_PLUGIN(oom-detect, 1)
+
+typedef jid_t (*getjid_f) (pid_t pid);
+
+/*  Nonzero when the plugin was configured with the "do_syslog" argument */
+static int do_syslog = 0;
+
+static void * libjob = NULL;            /* dlopen() handle for libjob.so     */
+static getjid_f getjid = NULL;          /* job_getjid() resolved from libjob */
+static jid_t jid = (jid_t) -1;          /* our job container id, -1 = unset  */
+static uint32_t ntasks = (uint32_t) -1; /* number of local tasks, -1 = unset */
+
+
+/*
+ *  Plugin initialization. Does nothing unless running in remote context.
+ *
+ *  Recognizes "do_syslog" as the first plugin argument, resolves
+ *   job_getjid() from libjob.so, records the job container id of the
+ *   current process and the local task count.
+ *
+ *  Returns 0 on success, -1 if libjob could not be loaded or resolved.
+ */
+int slurm_spank_init (spank_t sp, int ac, char *av[])
+{
+    if (!spank_remote (sp))
+        return (0);
+
+    if (ac > 0 && strcmp (av[0], "do_syslog") == 0)
+        do_syslog = 1;
+
+    if (!(libjob = dlopen ("libjob.so", RTLD_LAZY))) {
+        slurm_error ("Failed to open libjob.so: %s", dlerror ());
+        return (-1);
+    }
+
+    if (!(getjid = (getjid_f) dlsym (libjob, "job_getjid"))) {
+        slurm_error ("Failed to resolve job_getjid in libjob: %s",
+                     dlerror ());
+        /*  Don't leak the library handle when we are about to fail */
+        dlclose (libjob);
+        libjob = NULL;
+        return (-1);
+    }
+
+    /*
+     *  spank_init runs after slurm job container has been created, so
+     *   now determine our jid.
+     */
+    if ((jid = (*getjid) (getpid ())) == (jid_t) -1)
+        slurm_info ("Failed to get job container id");
+
+    if (spank_get_item (sp, S_JOB_LOCAL_TASK_COUNT, &ntasks) != ESPANK_SUCCESS) {
+        slurm_error ("spank_get_item (S_JOB_LOCAL_TASK_COUNT) failed.");
+        /* must be at least one task */
+        ntasks = 1;
+    }
+
+    return (0);
+}
+
+#define OOMKILLED_FILENAME "/proc/oomkilled"
+
+/*
+ *  One record of OOMKILLED_FILENAME, as parsed by _parse_oomkilled():
+ *   "jobid pid vmsize rss comm"
+ */
+struct oomkilled_data {
+    uint64_t jobid;
+    pid_t    pid;
+    long     vmsize;
+    long     rss;
+    char     comm[16];   /* at most 15 characters + terminating NUL */
+};
+
+/*
+ *  Parse OOMKILLED_FILENAME into [data], which has room for [size] entries.
+ *
+ *  Returns the number of entries parsed (0 for an empty file), or -1 if
+ *   the file cannot be opened or read, a line does not parse, or there
+ *   are more than [size] entries (errno = ENOSPC).
+ */
+static int _parse_oomkilled (struct oomkilled_data *data, size_t size)
+{
+    char buf [4096];
+    char *bufptr;
+    char *line;
+    ssize_t len;
+    size_t count = 0;
+    int fd;
+    int rv = -1;
+    int saved_errno;
+
+    assert (data && size);
+
+    /*  Nothing to clean up yet, so return directly (never close (-1)) */
+    if ((fd = open (OOMKILLED_FILENAME, O_RDONLY)) < 0)
+        return (-1);
+
+    /*
+     *  Read at most sizeof (buf) - 1 bytes so that the zeroed buffer
+     *   always ends in a NUL for the string functions below.
+     */
+    memset (buf, '\0', sizeof (buf));
+    if ((len = read (fd, buf, sizeof (buf) - 1)) < 0)
+        goto cleanup;
+
+    /*
+     *  strtok_r() returns NULL right away for an empty (or newline-only)
+     *   buffer, so test before the first sscanf() rather than after.
+     */
+    for (line = strtok_r (buf, "\n", &bufptr);
+         line != NULL;
+         line = strtok_r (NULL, "\n", &bufptr)) {
+        struct oomkilled_data *d;
+        unsigned long long id;
+
+        if (count >= size) {
+            errno = ENOSPC;
+            goto cleanup;
+        }
+
+        d = &data[count];
+        memset (d, 0, sizeof (*d));
+        /*  Scan jobid via unsigned long long: "%lu" is wrong for uint64_t
+         *   on platforms where long is 32 bits. */
+        if (sscanf (line, "%llu %d %ld %ld %15c",
+                    &id, &d->pid, &d->vmsize, &d->rss, d->comm) != 5) {
+            goto cleanup;
+        }
+        d->jobid = (uint64_t) id;
+        count++;
+    }
+
+    rv = (int) count;
+cleanup:
+    saved_errno = errno;    /* close() must not hide ENOSPC from caller */
+    close (fd);
+    errno = saved_errno;
+    return rv;
+}
+
+
+/*
+ *  Copy into [d] (room for [len] entries) every OOM-killed record whose
+ *   job id matches [jid].
+ *
+ *  Returns the number of entries copied, or -1 on parse failure or if
+ *   more than [len] entries match (errno = ENOSPC).
+ */
+int oomkilled_pids (jid_t jid, struct oomkilled_data *d, size_t len)
+{
+    struct oomkilled_data data [64];
+    int count;
+    int i;
+    size_t index = 0;
+
+    count = _parse_oomkilled (data, sizeof (data) / sizeof (data[0]));
+    if (count < 0)
+        return -1;
+
+    for (i = 0; i < count; i++) {
+        if ((jid_t) data[i].jobid != jid)
+            continue;
+        if (index >= len) {
+            errno = ENOSPC;
+            return -1;
+        }
+        d[index++] = data[i];
+    }
+
+    return ((int) index);
+}
+
+/*
+ *  Return 1 if [pid] was already recorded by an earlier call, otherwise
+ *   record it and return 0. Once all 64 slots are in use new pids can no
+ *   longer be recorded and 0 is returned for them.
+ */
+static int pid_reported (pid_t pid)
+{
+    static pid_t pids[64];      /* static storage: starts out all zero */
+    size_t i;
+
+    for (i = 0; i < sizeof (pids) / sizeof (pids[0]); i++) {
+        if (pids[i] == 0) {
+            pids[i] = pid;
+            return (0);
+        }
+        if (pids[i] == pid)
+            return (1);
+    }
+
+    return (0);
+}
+
+/*
+ *  Report OOM-killed process [d] via slurm_error(). If [taskid] is
+ *   negative the task id is unknown and the pid is printed instead.
+ */
+static void print_oomkilled_error (struct oomkilled_data *d, int taskid)
+{
+    char buf [256];
+    const size_t siz = sizeof (buf);
+    size_t len = 0;
+    int n;
+
+    memset (buf, 0, sizeof (buf));
+
+    if (d->vmsize) {
+        n = snprintf (buf, siz, " VmSize: %ldM", d->vmsize/1024);
+        if (n > 0)
+            len = (size_t) n;
+    }
+    if (d->rss && len < siz) {
+        n = snprintf (buf + len, siz - len, " RSS: %ldM", d->rss/1024);
+        if (n > 0)
+            len += (size_t) n;
+    }
+
+    /*  Mark truncated output with a trailing '+' */
+    if (len >= siz) {
+        buf [siz - 2] = '+';
+        buf [siz - 1] = '\0';
+    }
+
+    if (taskid >= 0) {
+        slurm_error ("task%d: [%s] terminated by OOM killer.", taskid, d->comm);
+        if (d->vmsize || d->rss)
+            slurm_error ("task%d:%s", taskid, buf);
+    } else {
+        /*  pid_t is not long: cast explicitly to match "%ld" */
+        slurm_error ("pid %ld: [%s] %s terminated by OOM killer.\n",
+                     (long) d->pid, d->comm, "(task id unknown)");
+        if (d->vmsize || d->rss)
+            slurm_error ("pid %ld:%s", (long) d->pid, buf);
+    }
+    return;
+}
+
+/*
+ *  Log a single "OOM detected" warning for this job step via syslog.
+ */
+static void send_syslog_oom_msg (spank_t sp)
+{
+    uint32_t jobid;
+    uint32_t stepid;
+    uid_t uid;
+
+    if ((spank_get_item (sp, S_JOB_ID, &jobid) != ESPANK_SUCCESS) ||
+        (spank_get_item (sp, S_JOB_STEPID, &stepid) != ESPANK_SUCCESS) ||
+        (spank_get_item (sp, S_JOB_UID, &uid) != ESPANK_SUCCESS)) {
+        slurm_error ("Failed to get jobid, stepid, or uid for syslog msg.");
+        return;
+    }
+
+    openlog ("slurmd", 0, LOG_USER);
+    syslog (LOG_WARNING, "OOM detected: jobid=%u.%u uid=%u",
+            (unsigned) jobid, (unsigned) stepid, (unsigned) uid);
+    closelog ();
+    slurm_verbose ("Sent OOM message via syslog for this job.");
+
+}
+
+/*
+ *  Called as each task exits: report any processes of this job that
+ *   were terminated by the OOM killer and not yet reported. When the
+ *   last local task exits and OOM kills were found, optionally send a
+ *   syslog message and delay briefly so the user sees the output.
+ */
+int slurm_spank_task_exit (spank_t sp, int ac, char *av[])
+{
+    static uint32_t nexited = 0;
+    struct oomkilled_data killed [16];
+    int n;
+    int i;
+
+    if ((jid == (jid_t) -1) || (ntasks == (uint32_t) -1))
+        return (0);
+
+    ++nexited;
+
+    /*
+     *  As each task exits, report to user if any processes
+     *   were terminated by OOM killer.
+     *
+     *  n < 0 means /proc/oomkilled could not be read or parsed. That
+     *   is not evidence of an OOM kill, so stop here as well instead
+     *   of falling through to the syslog message below.
+     */
+    n = oomkilled_pids (jid, killed, sizeof (killed) / sizeof (killed[0]));
+    if (n <= 0)
+        return (0);
+
+    for (i = 0; i < n; i++) {
+        struct oomkilled_data *d = &killed[i];
+        uint32_t id;
+        int taskid = -1;    /* stays -1 (unknown) if the lookup fails */
+
+        if (pid_reported (d->pid))
+            continue;
+
+        if (spank_get_item (sp, S_JOB_PID_TO_GLOBAL_ID, d->pid, &id)
+                == ESPANK_SUCCESS)
+            taskid = (int) id;
+
+        print_oomkilled_error (d, taskid);
+    }
+
+    if (nexited == ntasks) {
+        if (do_syslog)
+            send_syslog_oom_msg (sp);
+        /*
+         *  If we got here, then we printed one or more OOM killed message
+         *   to user's stderr. Delay a bit here to make it more likely
+         *   that the user gets the message.
+         */
+        sleep (2);
+    }
+    return (0);
+}
+
+/*
+ *  Nothing to do at plugin exit.
+ */
+int slurm_spank_exit (spank_t sp, int ac, char *av[])
+{
+    return (0);
+}
+
+/*
+ * vi:tabstop=4 shiftwidth=4 expandtab
+ */
diff --git a/overcommit-memory/Makefile b/overcommit-memory/Makefile
new file mode 100644
index 0000000..a065cc9
--- /dev/null
+++ b/overcommit-memory/Makefile
@@ -0,0 +1,17 @@
+SHOPTS := -shared
+OBJS   := overcommit-memory.o overcommit.o ../lib/fd.o
+
+all: overcommit-memory.so overcommit-util
+
+overcommit-memory.so : $(OBJS)
+	$(CC) $(SHOPTS) -o overcommit-memory.so $(OBJS)
+
+overcommit-util : util.o overcommit.o ../lib/fd.o
+	$(CC) -o overcommit-util util.o overcommit.o ../lib/fd.o -lpthread
+
+.c.o :
+	$(CC) -ggdb -I../lib -Wall $(CFLAGS) -o $@ -fPIC -c $<
+
+
+clean:
+	rm -f *.o *.so overcommit-util
diff --git a/overcommit-memory/overcommit-memory.c b/overcommit-memory/overcommit-memory.c
new file mode 100644
index 0000000..41200ec
--- /dev/null
+++ b/overcommit-memory/overcommit-memory.c
@@ -0,0 +1,220 @@
+/*****************************************************************************
+ *
+ *  Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ *  Produced at Lawrence Livermore National Laboratory.
+ *  Written by Mark Grondona .
+ *
+ *  UCRL-CODE-235358
+ *
+ *  This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ *
+ *  This is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ *  NOTE(review): the header names on the #include lines were missing from
+ *   this copy of the patch. The list below is reconstructed from the symbols
+ *   this file uses and should be confirmed against the upstream source.
+ */
+#include <sys/types.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <slurm/spank.h>
+
+#include "overcommit.h"
+
+SPANK_PLUGIN (overcommit, 1);
+
+/*  Set in srun's environment when --overcommit-memory was on the command
+ *   line, so the remote side ignores SLURM_OVERCOMMIT_MEMORY. */
+const char env_flag [] = "SPANK_OVERCOMMIT_MEMORY_FLAG";
+
+static int jobid;
+static int stepid;
+static int overcommit_ratio = 100;
+static overcommit_shared_ctx_t ctx = NULL;
+
+static int overcommit_opt_process (int val, const char *arg, int remote);
+
+struct spank_option spank_options [] =
+{
+    { "overcommit-memory", "[m]",
+      "Choose memory overcommit mode [m] (always|off|on) for all nodes of job.",
+      1, 0,
+      (spank_opt_cb_f) overcommit_opt_process
+    },
+    SPANK_OPTIONS_TABLE_END
+};
+
+/*
+ *  Register this job step in the node-wide shared state and, if no other
+ *   job is using a conflicting mode, apply overcommit mode [val] and the
+ *   configured overcommit ratio.
+ *
+ *  Returns -1 only if the shared context could not be created. Failure
+ *   to apply the mode is logged but is not fatal to the job.
+ */
+static int set_overcommit_policy (int val)
+{
+    ctx = overcommit_shared_ctx_create (jobid, stepid);
+
+    if (ctx == NULL)
+        return (-1);
+
+    if (overcommit_in_use (ctx, val)) {
+        slurm_error ("overcommit-memory: Cannot set desired mode on this node");
+        overcommit_shared_ctx_destroy (ctx);
+        /*
+         *  destroy() frees ctx. Clear the global so slurm_spank_exit()
+         *   does not unregister through a dangling pointer.
+         */
+        ctx = NULL;
+    }
+    else if (overcommit_memory_set_current_state (val) < 0)
+        slurm_error ("overcommit-memory: Failed to set overcommit = %d", val);
+    else if (overcommit_ratio_set (overcommit_ratio) < 0)
+        slurm_error ("overcommit-memory: Failed to set overcommit_ratio to %d\n",
+                     overcommit_ratio);
+
+    return (0);
+}
+
+/*
+ *  Return 1 if [src] equals any of the [n] strings that follow, else 0.
+ *   [n] must be exactly the number of variadic arguments supplied.
+ */
+static int strnmatch (const char *src, int n, ...)
+{
+    int i;
+    int rc = 0;
+    va_list ap;
+
+    if (src == NULL)
+        return (0);
+
+    va_start (ap, n);
+
+    for (i = 0; i < n && !rc; i++)
+        rc = (strcmp (src, va_arg (ap, char *)) == 0);
+
+    va_end (ap);
+
+    return (rc);
+}
+
+/*
+ *  Option callback for --overcommit-memory=[arg].
+ *
+ *  Maps off|no|never|2 to mode 2, always|1 to mode 1 and on|yes|0 to
+ *   mode 0. In local context only a flag is left in the environment;
+ *   in remote context the policy is applied on this node.
+ *
+ *  Returns 0 on success, -1 on an invalid argument or setup failure.
+ */
+static int overcommit_opt_process (int val, const char *arg, int remote)
+{
+    int overcommit_mode = 0;
+
+    if (strnmatch (arg, 4, "off", "no", "never", "2"))
+        overcommit_mode = 2;
+    else if (strnmatch (arg, 2, "always", "1"))
+        overcommit_mode = 1;
+    /*  Three candidates here: a count of 2 silently ignored "0" */
+    else if (strnmatch (arg, 3, "on", "yes", "0"))
+        overcommit_mode = 0;
+    else {
+        slurm_error ("--overcommit-memory: invalid argument: %s",
+                     arg ? arg : "(null)");
+        return (-1);
+    }
+
+    if (!remote) {
+        /*  Need to set a flag in environment so slurmd knows that a
+         *   command line option is called and won't apply any environment
+         *   options.
+         */
+        setenv (env_flag, "1", 1);
+        return (0);
+    }
+
+    if (set_overcommit_policy (overcommit_mode) < 0)
+        return (-1);
+
+    return (0);
+}
+
+/*
+ *  Apply SLURM_OVERCOMMIT_MEMORY from the job environment, unless the
+ *   command line option was used (signalled through env_flag).
+ *
+ *  Returns 0 on success or if nothing is set, -1 on an invalid value.
+ */
+static int check_env (spank_t sp, int remote)
+{
+    char buf [64];
+    const char var[] = "SLURM_OVERCOMMIT_MEMORY";
+
+    /*  If env_flag is set in environment, ignore options set from
+     *   environment since command line option should override
+     */
+    if (spank_getenv (sp, env_flag, buf, sizeof (buf)) == ESPANK_SUCCESS) {
+        spank_unsetenv (sp, env_flag);
+        return (0);
+    }
+
+    if (spank_getenv (sp, var, buf, sizeof (buf)) == ESPANK_SUCCESS) {
+        if (overcommit_opt_process (0, buf, remote) < 0) {
+            slurm_error ("Environment setting %s=%s invalid", var, buf);
+            return (-1);
+        }
+    }
+
+    return (0);
+}
+
+/*
+ *  Convert [str] to a non-negative int.
+ *
+ *  Returns -1 if [str] is NULL or empty, has trailing garbage, is
+ *   negative, or does not fit in an int.
+ */
+static int str2int (const char *str)
+{
+    char *p;
+    long l;
+
+    /*  strtol() happily returns 0 for "", so reject it up front */
+    if (str == NULL || *str == '\0')
+        return (-1);
+
+    errno = 0;
+    l = strtol (str, &p, 10);
+
+    if (*p != '\0' || errno == ERANGE)
+        return (-1);
+
+    if (l < 0 || l > INT_MAX)
+        return (-1);
+
+    return ((int) l);
+}
+
+/*
+ *  Parse plugin arguments. The only argument understood is "ratio=N".
+ *
+ *  Returns 0 on success, -1 if any argument is invalid (all arguments
+ *   are still examined so every problem gets logged).
+ */
+int parse_options (int ac, char **av)
+{
+    int i;
+    int retval = 0;
+
+    for (i = 0; i < ac; i++) {
+        if (strncmp ("ratio=", av[i], 6) == 0) {
+            char *ratio = av[i] + 6;
+            int value = str2int (ratio);
+
+            if (value < 0) {
+                slurm_error ("overcommit-memory: Invalid ratio = %s\n", ratio);
+                retval = -1;
+            }
+            else {
+                /*  Only overwrite the default with a valid value */
+                overcommit_ratio = value;
+            }
+        }
+        else {
+            slurm_error ("overcommit-memory: Invalid option %s\n", av[i]);
+            retval = -1;
+        }
+    }
+
+    return (retval);
+}
+
+/*
+ *  Plugin initialization: parse plugin arguments, then process any
+ *   SLURM_OVERCOMMIT_MEMORY setting from the environment.
+ */
+int slurm_spank_init (spank_t sp, int ac, char **av)
+{
+    uint32_t id;
+
+    if (parse_options (ac, av) < 0)
+        return (-1);
+
+    if (!spank_remote (sp)) {
+        if (check_env (sp, 0) < 0)
+            return (-1);
+        return (0);
+    }
+
+    /*
+     *  Set jobid and stepid from spank_init. Options are processed
+     *   *after* spank_init, but the option handler does not have access
+     *   to the spank_t handle.
+     *
+     *  The ids are fetched through a uint32_t, which is the type the
+     *   other plugins in this patch pass for these items, rather than
+     *   writing through an int pointer.
+     */
+    if (spank_get_item (sp, S_JOB_ID, &id) != ESPANK_SUCCESS) {
+        slurm_error ("overcommit-memory: Failed to get job id");
+        return (-1);
+    }
+    jobid = (int) id;
+
+    if (spank_get_item (sp, S_JOB_STEPID, &id) != ESPANK_SUCCESS) {
+        slurm_error ("overcommit-memory: Failed to get job step id");
+        return (-1);
+    }
+    stepid = (int) id;
+
+    if (check_env (sp, 1) < 0)
+        return (-1);
+
+    return (0);
+}
+
+
+/*
+ *  Plugin exit: in remote context, drop this job step's registration
+ *   from the shared overcommit state, if it has one.
+ */
+int slurm_spank_exit (spank_t sp, int ac, char **av)
+{
+    if (!spank_remote (sp) || !ctx)
+        return (0);
+
+    overcommit_shared_ctx_unregister (ctx);
+    ctx = NULL;     /* unregister() frees the context */
+
+    return (0);
+}
+
+
+/*
+ * vi: ts=4 sw=4 expandtab
+ */
diff --git a/overcommit-memory/overcommit.c b/overcommit-memory/overcommit.c
new file mode 100644
index 0000000..eb556a1
--- /dev/null
+++ b/overcommit-memory/overcommit.c
@@ -0,0 +1,383 @@
+/*****************************************************************************
+ *
+ *  Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ *  Produced at Lawrence Livermore National Laboratory.
+ *  Written by Mark Grondona .
+ *
+ *  UCRL-CODE-235358
+ *
+ *  This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ *
+ *  This is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ *  NOTE(review): the header names on the #include lines were missing from
+ *   this copy of the patch. The list below is reconstructed from the symbols
+ *   this file uses and should be confirmed against the upstream source.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <semaphore.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "overcommit.h"
+#include "fd.h"
+
+static const char shared_filename [] = "/tmp/spank-overcommit-memory";
+static const char overcommit_file [] = "/proc/sys/vm/overcommit_memory";
+static const char overcommit_ratio_file [] = "/proc/sys/vm/overcommit_ratio";
+
+/*  Number of job steps that can be registered at once */
+#define OVERCOMMIT_MAX_USERS 64
+
+/*  One registered job step. [used] is nonzero for an occupied slot */
+struct overcommit_job_info {
+    int jobid;
+    int stepid;
+    int used;
+};
+
+/*  Layout of the memory-mapped shared file */
+struct overcommit_shared_info {
+    sem_t  sem;
+    int    initialized;
+    int    overcommit_value;
+    int    previous_overcommit_ratio;
+    int    nusers;
+    struct overcommit_job_info users [OVERCOMMIT_MAX_USERS];
+};
+
+/*  Per-process handle on the shared state */
+struct overcommit_shared_context {
+    int  fd;
+    int  jobid;
+    int  stepid;
+    struct overcommit_shared_info *shared;
+};
+
+/*
+ *  Remove the first registered entry matching ctx->jobid (and
+ *   ctx->stepid, unless that is negative, which matches any step).
+ *
+ *  Returns 0 if an entry was removed, -1 if none matched.
+ */
+static int
+unregister_job (overcommit_shared_ctx_t ctx)
+{
+    int i;
+
+    /*
+     *  Iterate over array *elements*. Dividing the array size by
+     *   sizeof (int) gave three times the real count and walked off
+     *   the end of the shared mapping.
+     */
+    for (i = 0; i < OVERCOMMIT_MAX_USERS; i++) {
+        struct overcommit_job_info *j = &ctx->shared->users[i];
+
+        if (!j->used)
+            continue;
+
+        if ((j->jobid == ctx->jobid)
+            && ((ctx->stepid < 0) || (j->stepid == ctx->stepid))) {
+            memset (j, 0, sizeof (*j));
+            ctx->shared->nusers--;
+            return (0);
+        }
+    }
+
+    return (-1);
+}
+
+/*
+ *  Record ctx->jobid/ctx->stepid in the first free slot.
+ *
+ *  Returns 0 on success, -1 if all slots are in use.
+ */
+static int register_job (overcommit_shared_ctx_t ctx)
+{
+    int i;
+
+    for (i = 0; i < OVERCOMMIT_MAX_USERS; i++) {
+        struct overcommit_job_info *j = &ctx->shared->users[i];
+        if (!j->used) {
+            j->used = 1;
+            j->jobid = ctx->jobid;
+            j->stepid = ctx->stepid;
+            ctx->shared->nusers++;
+            return (0);
+        }
+    }
+
+    return (-1);
+}
+
+/*
+ *  Check the shared file behind ctx->fd.
+ *
+ *  Returns 1 if the file already has the size of the shared structure,
+ *   0 if it does not, -1 if it cannot be examined or is not owned by
+ *   the effective user.
+ */
+static int overcommit_shared_file_initialized (overcommit_shared_ctx_t ctx)
+{
+    struct stat st;
+
+    if (fstat (ctx->fd, &st) < 0) {
+        fprintf (stderr, "fstat (%s): %s\n", shared_filename, strerror (errno));
+        return (-1);
+    }
+
+    if (st.st_uid != geteuid ()) {
+        fprintf (stderr, "Bad owner on %s: uid=%d\n",
+                 shared_filename, (int) st.st_uid);
+        return (-1);
+    }
+
+    if (st.st_size == (off_t) sizeof (*ctx->shared))
+        return (1);
+
+    return (0);
+}
+
+/*
+ *  Map the shared file into ctx->shared, sizing and initializing it
+ *   (including the process-shared semaphore) if this is the first use.
+ *   The file is write-locked for the duration.
+ *
+ *  Returns 0 on success. On failure returns -1 with ctx->shared == NULL.
+ */
+static int overcommit_shared_info_init (overcommit_shared_ctx_t ctx)
+{
+    size_t len = sizeof (*ctx->shared);
+    int initialized;
+    int rc = -1;
+
+    if (ctx->fd < 0) {
+        fprintf (stderr, "ctx->fd < 0!\n");
+        return (-1);
+    }
+    if (fd_get_write_lock (ctx->fd) < 0)
+        fprintf (stderr, "Failed to get write lock: %s\n", strerror (errno));
+
+    if (fd_set_close_on_exec (ctx->fd))
+        fprintf (stderr, "fd_set_close_on_exec(): %s\n", strerror (errno));
+
+    if ((initialized = overcommit_shared_file_initialized (ctx)) < 0)
+        goto out;
+
+    if (!initialized && ftruncate (ctx->fd, (off_t) len) < 0) {
+        fprintf (stderr, "ftruncate (%s): %s\n",
+                 shared_filename, strerror (errno));
+        goto out;
+    }
+
+    ctx->shared = mmap (0, len, PROT_READ|PROT_WRITE, MAP_SHARED, ctx->fd, 0);
+
+    if (ctx->shared == MAP_FAILED) {
+        fprintf (stderr, "mmap (%s): %s\n", shared_filename, strerror (errno));
+        /*  Never leave MAP_FAILED where cleanup code would use it */
+        ctx->shared = NULL;
+        goto out;
+    }
+
+    if (!initialized) {
+        memset (ctx->shared, 0, len);
+
+        if (sem_init (&ctx->shared->sem, 1, 1) < 0) {
+            fprintf (stderr, "sem_init: %s\n", strerror (errno));
+            munmap (ctx->shared, len);
+            ctx->shared = NULL;
+            /*  Shrink the file again so the next opener does not take
+             *   it for initialized and wait on an unusable semaphore. */
+            if (ftruncate (ctx->fd, 0) < 0)
+                fprintf (stderr, "ftruncate (%s): %s\n",
+                         shared_filename, strerror (errno));
+            goto out;
+        }
+    }
+
+    rc = 0;
+out:
+    /*  Release the lock on every path, not only on success */
+    if (fd_release_lock (ctx->fd) < 0)
+        fprintf (stderr, "Failed to release file lock: %s\n", strerror (errno));
+
+    return (rc);
+}
+
+/*
+ *  Unmap, close and free [ctx] without touching the semaphore or any
+ *   system state. Safe on a partially constructed context.
+ */
+static void ctx_release (overcommit_shared_ctx_t ctx)
+{
+    if (ctx->shared != NULL)
+        munmap (ctx->shared, sizeof (*ctx->shared));
+    if (ctx->fd >= 0)
+        close (ctx->fd);
+    free (ctx);
+}
+
+/*
+ *  Attach to an existing shared file without creating it. The returned
+ *   context has jobid == stepid == -1 and holds the shared semaphore.
+ *
+ *  Returns NULL if the file does not exist or cannot be mapped.
+ */
+overcommit_shared_ctx_t overcommit_shared_ctx_attach (void)
+{
+    overcommit_shared_ctx_t ctx = malloc (sizeof (*ctx));
+
+    if (!ctx)
+        return (NULL);
+
+    memset (ctx, 0, sizeof (*ctx));
+    ctx->jobid = ctx->stepid = -1;
+
+    if ((ctx->fd = open (shared_filename, O_RDWR)) < 0) {
+        ctx_release (ctx);
+        return (NULL);
+    }
+
+    if (overcommit_shared_info_init (ctx) < 0) {
+        ctx_release (ctx);
+        return (NULL);
+    }
+
+    sem_wait (&ctx->shared->sem);
+
+    return (ctx);
+}
+
+/*
+ *  Open (creating if necessary) the shared file and return a context
+ *   for job [jobid], step [stepid]. The returned context holds the
+ *   shared semaphore.
+ *
+ *  Returns NULL on failure.
+ */
+overcommit_shared_ctx_t
+overcommit_shared_ctx_create (int jobid, int stepid)
+{
+    int flags = O_RDWR | O_CREAT | O_EXCL;
+
+    overcommit_shared_ctx_t ctx = malloc (sizeof (*ctx));
+
+    if (!ctx)
+        return (NULL);
+
+    memset (ctx, 0, sizeof (*ctx));
+    ctx->jobid = jobid;
+    ctx->stepid = stepid;
+
+    if ((ctx->fd = open (shared_filename, flags, 0600)) < 0) {
+        if ((errno != EEXIST)
+           || ((ctx->fd = open (shared_filename, O_RDWR)) < 0)) {
+            fprintf (stderr, "Failed to open overcommit shared info: %s\n",
+                     strerror (errno));
+            /*  Nothing is mapped yet: only release, do not "destroy" */
+            ctx_release (ctx);
+            return (NULL);
+        }
+    }
+
+
+    if (overcommit_shared_info_init (ctx) < 0) {
+        ctx_release (ctx);
+        return (NULL);
+    }
+
+    sem_wait (&ctx->shared->sem);
+
+    return (ctx);
+}
+
+
+/*
+ *  Remove the registration of [jobid].[stepid] (any step if [stepid] is
+ *   negative). If the shared state cannot be opened at all, fall back to
+ *   resetting overcommit_memory to 0.
+ *
+ *  Returns 0 on success, -1 if the job was not registered.
+ */
+int overcommit_shared_cleanup (int jobid, int stepid)
+{
+    int rc = 0;
+    overcommit_shared_ctx_t ctx;
+
+    if ((ctx = overcommit_shared_ctx_create (jobid, stepid))) {
+        rc = unregister_job (ctx);
+        overcommit_shared_ctx_destroy (ctx);
+    } else if (overcommit_memory_get_current_state () != 0) {
+        overcommit_memory_set_current_state (0);
+    }
+    return (rc);
+}
+
+/*
+ *  Remove the shared file and reset the overcommit settings regardless
+ *   of any registered users.
+ *
+ *  Returns 0 on success, -1 if overcommit_memory could not be reset.
+ */
+int overcommit_force_cleanup (void)
+{
+    if ((unlink (shared_filename) < 0) && (errno != ENOENT))
+        fprintf (stderr, "Failed to remove %s: %s\n", shared_filename,
+                strerror (errno));
+    if (overcommit_memory_get_current_state () != 0) {
+        int rc = overcommit_memory_set_current_state (0);
+        /*  This used to sit after the return and never ran */
+        overcommit_ratio_set (50); /* XXX: Need a way to set default!! */
+        return (rc);
+    }
+    return (0);
+}
+
+/*
+ *  Release [ctx]. If no users remain registered, also remove the shared
+ *   file and restore the overcommit settings. Posts the shared semaphore,
+ *   so the caller is expected to hold it.
+ *
+ *  NOTE(review): callers that already posted the semaphore through
+ *   overcommit_in_use() post it a second time here. Confirm whether
+ *   that is intended.
+ */
+void overcommit_shared_ctx_destroy (overcommit_shared_ctx_t ctx)
+{
+    if (ctx == NULL)
+        return;
+
+    /*  A context that failed before mapping has nothing to reset */
+    if (ctx->shared != NULL) {
+        if (ctx->shared->nusers == 0) {
+            unlink (shared_filename);
+            if (overcommit_memory_get_current_state () != 0)
+                overcommit_memory_set_current_state (0);
+            overcommit_ratio_set (ctx->shared->previous_overcommit_ratio);
+        }
+        sem_post (&ctx->shared->sem);
+    }
+
+    ctx_release (ctx);
+}
+
+/*
+ *  Take the shared semaphore, drop this context's registration and
+ *   destroy the context.
+ */
+void overcommit_shared_ctx_unregister (overcommit_shared_ctx_t ctx)
+{
+    sem_wait (&ctx->shared->sem);
+    unregister_job (ctx);
+    overcommit_shared_ctx_destroy (ctx);
+}
+
+/*
+ *  Print the registered job steps and current settings to stdout.
+ *   Always returns 0.
+ */
+int overcommit_shared_list_users (void)
+{
+    overcommit_shared_ctx_t ctx;
+    int i;
+
+    ctx = overcommit_shared_ctx_attach ();
+
+    if (ctx == NULL || ctx->shared->nusers == 0) {
+        fprintf (stdout, "No users currently using overcommit-memory\n");
+        if (ctx != NULL) {
+            /*
+             *  attach() left us holding the semaphore: give it back and
+             *   release the context. Listing must not reset any system
+             *   state, so do not go through destroy() here.
+             */
+            sem_post (&ctx->shared->sem);
+            ctx_release (ctx);
+        }
+        return (0);
+    }
+
+    fprintf (stdout, "%d users of overcommit-memory on this node:\n",
+            ctx->shared->nusers);
+
+    for (i = 0; i < OVERCOMMIT_MAX_USERS; i++) {
+        struct overcommit_job_info *j = &ctx->shared->users[i];
+        if (j->used)
+            fprintf (stdout, "%d.%d\n", j->jobid, j->stepid);
+    }
+    fprintf (stdout, "\n");
+    fprintf (stdout, "Current setting = %d\n", ctx->shared->overcommit_value);
+    fprintf (stdout, "Current ratio   = %d\n", overcommit_ratio_get ());
+    fprintf (stdout, "Previous ratio  = %d\n",
+            ctx->shared->previous_overcommit_ratio);
+
+
+    overcommit_shared_ctx_destroy (ctx);
+    return (0);
+}
+
+/*
+ *  Try to register [ctx] as a user of overcommit mode [value].
+ *
+ *  Returns 1, without registering, if other users are registered with
+ *   a different mode. Otherwise records the mode and the current ratio
+ *   (first user only), registers the job and returns 0. The shared
+ *   semaphore is posted in both cases.
+ */
+int overcommit_in_use (overcommit_shared_ctx_t ctx, int value)
+{
+    int rc = 0;
+    if ((ctx->shared->nusers > 0) && (ctx->shared->overcommit_value != value))
+        rc = 1;
+    else {
+        if (!ctx->shared->nusers) {
+            ctx->shared->overcommit_value = value;
+            ctx->shared->previous_overcommit_ratio = overcommit_ratio_get ();
+        }
+        if (register_job (ctx) < 0)
+            fprintf (stderr, "overcommit: no free slot to register %d.%d\n",
+                     ctx->jobid, ctx->stepid);
+    }
+    sem_post (&ctx->shared->sem);
+
+    return (rc);
+}
+
+/*
+ *  Return the value in overcommit_file, or -1 if it cannot be read.
+ */
+int overcommit_memory_get_current_state (void)
+{
+    int val = -1;
+    FILE *fp;
+
+    if (!(fp = fopen (overcommit_file, "r")))
+        return (-1);
+
+    if (fscanf (fp, "%d", &val) != 1)
+        val = -1;
+
+    fclose (fp);
+
+    return (val);
+}
+
+/*
+ *  Write [val] (0, 1 or 2) to overcommit_file.
+ *
+ *  Returns 0 on success, -1 on an out-of-range value or write failure.
+ */
+int overcommit_memory_set_current_state (int val)
+{
+    FILE *fp;
+    int rc = 0;
+
+    if (val > 2 || val < 0)
+        return (-1);
+
+    if (!(fp = fopen (overcommit_file, "w"))) {
+        fprintf (stderr, "open (%s): %s\n", overcommit_file, strerror (errno));
+        return (-1);
+    }
+
+    if (fprintf (fp, "%d\n", val) < 0)
+        rc = -1;
+
+    /*  Buffered output is flushed by fclose(), so a failed write may
+     *   only show up here. */
+    if (fclose (fp) != 0)
+        rc = -1;
+
+    return (rc);
+}
+
+/*
+ *  Write [val] to overcommit_ratio_file.
+ *
+ *  Returns 0 on success, -1 on failure.
+ */
+int overcommit_ratio_set (int val)
+{
+    FILE *fp;
+    int rc = 0;
+
+    if (!(fp = fopen (overcommit_ratio_file, "w")))
+        return (-1);
+
+    if (fprintf (fp, "%d\n", val) < 0)
+        rc = -1;
+
+    if (fclose (fp) != 0)
+        rc = -1;
+
+    return (rc);
+}
+
+/*
+ *  Return the value in overcommit_ratio_file, or -1 if it cannot be read.
+ */
+int overcommit_ratio_get (void)
+{
+    int val = -1;
+    FILE *fp;
+
+    if (!(fp = fopen (overcommit_ratio_file, "r")))
+        return (-1);
+
+    if (fscanf (fp, "%d", &val) != 1)
+        val = -1;
+
+    fclose (fp);
+
+    return (val);
+}
+
+/*
+ * vi: ts=4 sw=4 expandtab
+ */
diff --git a/overcommit-memory/overcommit.h b/overcommit-memory/overcommit.h
new file mode 100644
index 0000000..0341980
--- /dev/null
+++ b/overcommit-memory/overcommit.h
@@ -0,0 +1,47 @@
+/*****************************************************************************
+ *
+ *  Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ *  Produced at Lawrence Livermore National Laboratory.
+ *  Written by Mark Grondona .
+ *
+ *  UCRL-CODE-235358
+ *
+ *  This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ *
+ *  This is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#ifndef _HAVE_OVERCOMMIT_H
+#define _HAVE_OVERCOMMIT_H
+
+/*
+ *  Opaque handle on the node-wide overcommit state, which overcommit.c
+ *   keeps in a memory-mapped file shared between job steps.
+ */
+typedef struct overcommit_shared_context * overcommit_shared_ctx_t;
+
+/*
+ *  Open (creating if needed) the shared state for [jobid].[stepid].
+ *   Returns NULL on failure. On success the shared semaphore has been
+ *   waited on before the context is returned.
+ */
+overcommit_shared_ctx_t overcommit_shared_ctx_create (int jobid, int stepid);
+
+/*
+ *  destroy: post the shared semaphore, unmap and close the shared file
+ *   and free [ctx]. When no users are registered it first removes the
+ *   shared file and restores the overcommit settings.
+ *  unregister: wait on the semaphore, remove this context's job from
+ *   the registered users, then destroy the context.
+ */
+void overcommit_shared_ctx_destroy (overcommit_shared_ctx_t ctx);
+void overcommit_shared_ctx_unregister (overcommit_shared_ctx_t ctx);
+
+/*
+ *  in_use: returns 1 if other users are registered with a mode other
+ *   than [value]. Otherwise registers the job and returns 0. Posts the
+ *   shared semaphore either way.
+ *  list_users: print registered job steps and settings to stdout.
+ */
+int overcommit_in_use (overcommit_shared_ctx_t ctx, int value);
+int overcommit_shared_list_users ();
+
+/*
+ *  cleanup: remove the registration of [jobid].[stepid]. A negative
+ *   [stepid] matches any step. Returns -1 if no such registration.
+ *  force_cleanup: remove the shared file and reset overcommit_memory
+ *   to 0 if it is currently nonzero.
+ */
+int overcommit_shared_cleanup (int jobid, int stepid);
+int overcommit_force_cleanup ();
+
+/*
+ *  Read or write /proc/sys/vm/overcommit_memory. get returns -1 if the
+ *   file cannot be opened. set accepts only 0..2 and returns -1 otherwise.
+ */
+int overcommit_memory_get_current_state ();
+int overcommit_memory_set_current_state (int value);
+
+/*
+ *  Read or write /proc/sys/vm/overcommit_ratio. Both return -1 if the
+ *   file cannot be opened.
+ */
+int overcommit_ratio_get ();
+int overcommit_ratio_set (int value);
+
+#endif /* !_HAVE_OVERCOMMIT_H */
diff --git a/overcommit-memory/util.c b/overcommit-memory/util.c
new file mode 100644
index 0000000..dd5f253
--- /dev/null
+++ b/overcommit-memory/util.c
@@ -0,0 +1,201 @@
+/*****************************************************************************
+ *
+ *  Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ *  Produced at Lawrence Livermore National Laboratory.
+ *  Written by Mark Grondona .
+ *
+ *  UCRL-CODE-235358
+ *
+ *  This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ *
+ *  This is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ *  _GNU_SOURCE (not __GNU_SOURCE) is the feature-test macro glibc looks
+ *   for, and it only has an effect when defined before the first system
+ *   header is included.
+ */
+#define _GNU_SOURCE
+
+/*
+ *  NOTE(review): the header names on the #include lines were missing from
+ *   this copy of the patch. The list below is reconstructed from the symbols
+ *   this file uses and should be confirmed against the upstream source.
+ */
+#include <errno.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "overcommit.h"
+
+char *prog = NULL;
+
+static int cleanup = 0;
+static int list_users = 0;
+static int force_reset = 0;
+static int jobid = -1;
+
+#include <getopt.h>
+
+struct option opt_table [] = {
+    { "help",         0, NULL, 'h' },
+    { "cleanup",      0, NULL, 'c' },
+    { "list-users",   0, NULL, 'l' },
+    { "force-reset",  0, NULL, 'f' },
+    { "jobid",        1, NULL, 'j' },
+    { NULL,           0, NULL,  0  }
+};
+
+const char opt_string[] = "hclfj:";
+
+#define USAGE "\
+Usage: %s [OPTONS]\n\
+  -h, --help           Display this message\n\
+  -l, --list-users     List current jobs using overcommit-memory plugin.\n\
+  -c, --cleanup        Cleanup any overcommit-memory usage by a SLURM job.\n\
+                        SLURM_JOBID and SLURM_STEPID should be set in current\n\
+                        environment. Removes shared memory file and resets\n\
+                        overcommit_memory to default if no more references\n\
+                        to overcommit-memory exist.\n\
+  -f, --force-reset    Force total cleanup of overcommit-memory state. Reset\n\
+                        overcommit_memory setting to default and remove\n\
+                        overcommit shared file.\n\
+  -j, --jobid=ID       Specify SLURM jobid to clean up after if SLURM_JOBID\n\
+                        not set in environment\n"
+
+static int get_env_int (const char *var);
+static int str2int (const char *str);
+static int parse_cmdline (int ac, char **av);
+static void log_fatal (char *fmt, ...);
+
+/*
+ *  overcommit-util: list, clean up, or forcibly reset the state kept by
+ *   the overcommit-memory plugin on this node.
+ */
+int main (int ac, char *av[])
+{
+    int stepid = -1;
+
+    parse_cmdline (ac, av);
+
+    if (jobid < 0)
+        jobid = get_env_int ("SLURM_JOBID");
+    if (stepid < 0)
+        stepid = get_env_int ("SLURM_STEPID");
+
+    if (cleanup && jobid < 0)
+        log_fatal ("--cleanup requires SLURM_JOBID in environment\n");
+
+    if (!cleanup && !list_users && !force_reset)
+        log_fatal ("Specify one of --cleanup, --force-reset, or --list-users.\n");
+
+    if (list_users)
+        overcommit_shared_list_users ();
+
+    if (force_reset) {
+        if (overcommit_force_cleanup () < 0)
+            return (1);
+        printf ("Successfuly reset overcommit-memory state\n");
+    }
+    else if (cleanup) {
+        /*
+         *  If overcommit_shared_cleanup returns < 0, this probably just
+         *   means that the jobid.stepid is not in the shared memory state.
+         */
+        if (overcommit_shared_cleanup (jobid, stepid) < 0)
+            printf ("No overcommit state for job %d\n", jobid);
+        else
+            printf ("Succesfully cleaned up overcommit state for job %d\n",
+                    jobid);
+    }
+
+    return (0);
+}
+
+/*
+ *  Print the usage message for program [name] to stderr.
+ */
+static void usage (const char *name)
+{
+    fprintf (stderr, USAGE, name);
+}
+
+/*
+ *  Parse the command line into the file-scope option flags. Exits on
+ *   --help and on an invalid --jobid argument. Always returns 0.
+ */
+static int parse_cmdline (int ac, char **av)
+{
+    prog = basename (av[0]);
+
+    for (;;) {
+        /*
+         *  getopt_long() returns an int. Storing it in a char breaks
+         *   the -1 test on platforms where char is unsigned, and the
+         *   loop would then never end.
+         */
+        int c = getopt_long (ac, av, opt_string, opt_table, NULL);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+            case 'h':
+                usage (prog);
+                exit (0);
+            case 'c':
+                cleanup = 1;
+                break;
+            case 'l':
+                list_users = 1;
+                break;
+            case 'f':
+                force_reset = 1;
+                break;
+            case 'j':
+                if ((jobid = str2int (optarg)) < 0)
+                    log_fatal ("Invalid argument: --jobid=%s\n", optarg);
+                break;
+            case '?':
+                if (optopt > 0)
+                    fprintf (stderr, "%s: Invalid option \"-%c\"\n",
+                            prog, optopt);
+                else
+                    fprintf (stderr, "%s: Invalid option \"%s\"\n",
+                            prog, av[optind-1]);
+                break;
+            default:
+                fprintf (stderr, "%s: Unimplemented option \"%s\"\n",
+                        prog, av[optind-1]);
+                break;
+        }
+    }
+
+    return (0);
+}
+
+/*
+ *  Print "prog: " followed by the formatted message to stderr, then
+ *   exit with status 1.
+ */
+static void log_fatal (char *fmt, ...)
+{
+    va_list ap;
+    va_start (ap, fmt);
+    fprintf (stderr, "%s: ", prog);
+    vfprintf (stderr, fmt, ap);
+    va_end (ap);
+    exit (1);
+}
+
+/*
+ *  Convert [str] to a non-negative int.
+ *
+ *  Returns -1 if [str] is NULL or empty, has trailing garbage, is
+ *   negative, or does not fit in an int.
+ */
+static int str2int (const char *str)
+{
+    char *p;
+    long l;
+
+    /*  strtol() returns 0 for "", which would pass for job id 0 */
+    if (str == NULL || *str == '\0')
+        return (-1);
+
+    errno = 0;
+    l = strtol (str, &p, 10);
+
+    if (*p != '\0' || errno == ERANGE)
+        return (-1);
+
+    if (l < 0 || l > INT_MAX)
+        return (-1);
+
+    return ((int) l);
+}
+
+/*
+ *  Return the integer value of environment variable [var], or -1 if it
+ *   is not set. Exits if the variable is set to an invalid value.
+ */
+static int get_env_int (const char *var)
+{
+    char *val;
+    int id;
+
+    if (!(val = getenv (var)))
+        return (-1);
+
+    if ((id = str2int (val)) < 0)
+        log_fatal ("Bad environment value: %s=%s\n", var, val);
+
+    return (id);
+}
+
+
+/*
+ * vi: ts=4 sw=4 expandtab
+ */
diff --git a/preserve-env.c b/preserve-env.c
new file mode 100644
index 0000000..bf738ad
--- /dev/null
+++ b/preserve-env.c
@@ -0,0 +1,244 @@
+/*****************************************************************************
+ *
+ *  Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +/**************************************************************************** + * + * preserve-env.so + * + * This SLURM spank plugin will preserve all SLURM_* environment + * variables from srun's invoking shell to the remote node or nodes + * on which the command specified by srun is invoked. The main purpose + * is to preserve the environment from a SLURM allocation shell + * (e.g. salloc), onto a remote "login" shell spawned with + * + * srun -n1 --pty $SHELL. + * + * Normally, SLURM environment variables would be reset in the + * remote shell, but when using --preserve-slurm-env, they will + * remain essentially the same as in the shell spawned by salloc. 
+ *
+ ****************************************************************************/
+
+/*
+ *  NOTE(review): the header names on the #include lines were missing from
+ *   this copy of the patch. The list below is reconstructed from the symbols
+ *   this file uses and should be confirmed against the upstream source.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "lib/list.h"
+
+#include <slurm/spank.h>
+
+SPANK_PLUGIN (preserve-env, 1)
+
+
+/****************************************************************************
+ *
+ *  Set up a --preserve-slurm-env option for srun:
+ *
+ ****************************************************************************/
+static unsigned int enabled = 0;
+
+/*
+ *  Option callback: remember that --preserve-slurm-env was given.
+ */
+static int preserve_opt_process (int val, const char *optarg, int remote)
+{
+    enabled = 1;
+    return (0);
+}
+
+struct spank_option spank_options [] =
+{
+    { "preserve-slurm-env", NULL,
+      "Preserve all current SLURM_ env vars in remote session",
+      0, 0, (spank_opt_cb_f) preserve_opt_process
+    },
+    SPANK_OPTIONS_TABLE_END
+};
+
+/****************************************************************************/
+
+/*
+ *  Copy the name part of env var entry [entry] ("NAME=value") into
+ *   buffer [var] of size [len], always NUL terminated. Furthermore,
+ *   if [valp] is non-NULL, set [valp] to point to the first character
+ *   after the '='.
+ *
+ *  Only the name is copied, and never more than [len] - 1 bytes of it,
+ *   so an entry with a long value cannot overflow [var]. [valp] points
+ *   into [entry] itself (at its terminating NUL if there is no '='),
+ *   so it is valid only as long as [entry] is.
+ *
+ *  Returns 0 on success, -1 if [len] is not positive or the name had
+ *   to be truncated to fit.
+ */
+static int get_env_var (const char *entry, char *var, int len, char **valp)
+{
+    const char *eq = strchr (entry, '=');
+    size_t namelen = eq ? (size_t) (eq - entry) : strlen (entry);
+    int rc = 0;
+
+    if (valp)
+        *valp = (char *) (eq ? eq + 1 : entry + namelen);
+
+    if (len <= 0)
+        return (-1);
+
+    memset (var, 0, len);
+
+    if (namelen >= (size_t) len) {
+        namelen = (size_t) len - 1;
+        rc = -1;
+    }
+
+    memcpy (var, entry, namelen);
+
+    return (rc);
+}
+
+/*
+ *  Preserve the SLURM_* environment entry in [entry] by renaming
+ *   it save_SLURM_*.
+ */
+static int preserve_slurm_var (const char *entry)
+{
+    char *val = NULL;
+    char *var = NULL;
+    char *newvar = NULL;
+    /*
+     *  Size the buffers from the entry itself. Fixed-size buffers were
+     *   too small for long entries such as node lists.
+     */
+    size_t len = strlen (entry) + 1;
+    size_t newlen = len + sizeof ("save_") - 1;
+    int rc = -1;
+
+    var = malloc (len);
+    newvar = malloc (newlen);
+    if (var == NULL || newvar == NULL) {
+        fprintf (stderr, "Out of memory preserving %s\n", entry);
+        goto out;
+    }
+
+    get_env_var (entry, var, (int) len, &val);
+
+    /*  Entry without '=': there is no value to preserve */
+    if (val == NULL) {
+        rc = 0;
+        goto out;
+    }
+
+    snprintf (newvar, newlen, "save_%s", var);
+
+    /*  val may point into var, so call setenv() before freeing it */
+    if (setenv (newvar, val, 1) < 0) {
+        fprintf (stderr, "Failed to set %s=%s: %s\n",
+                 newvar, val, strerror (errno));
+        goto out;
+    }
+
+    rc = 0;
+out:
+    free (newvar);
+    free (var);
+    return (rc);
+}
+
+extern char **environ;
+
+/*
+ *  In srun, after option processing: if --preserve-slurm-env was given,
+ *   save a copy of each SLURM_* variable of the invoking environment
+ *   as save_SLURM_*.
+ */
+int slurm_spank_local_user_init (spank_t sp, int ac, char **av)
+{
+    char **p = environ;
+
+    if (!enabled)
+        return (0);
+
+    while (*p != NULL) {
+        /*
+         *  Preserve SLURM environment variables
+         *   (except for those we know we don't need)
+         */
+        if (strncmp (*p, "SLURM_", 6) == 0 &&
+            strncmp (*p, "SLURM_RLIMIT", 12) != 0 &&
+            strncmp (*p, "SLURM_UMASK", 11) != 0 &&
+            strncmp (*p, "SLURM_PRIO", 10) != 0 &&
+            preserve_slurm_var (*p) < 0)
+            return (-1);
+        ++p;
+    }
+
+    return (0);
+}
+
+/*
+ *  Split [entry] ("NAME=value") into a newly allocated copy of NAME,
+ *   which the caller must free, and set [valp] to the value (NULL if
+ *   it could not be determined). [valp] is only valid while both
+ *   [entry] and the returned name are.
+ *
+ *  Returns NULL if out of memory.
+ */
+static char * env_entry_name (const char *entry, char **valp)
+{
+    size_t len = strlen (entry) + 1;    /* big enough for the whole entry */
+    char *name = malloc (len);
+
+    *valp = NULL;
+    if (name != NULL)
+        get_env_var (entry, name, (int) len, valp);
+
+    return (name);
+}
+
+/*
+ *  Free any entries still held in [l], then destroy the list.
+ */
+static void env_list_destroy (List l)
+{
+    char *entry;
+
+    while ((entry = list_pop (l)))
+        free (entry);
+    list_destroy (l);
+}
+
+/*
+ *  Push onto [l] a copy of every entry of [env] that starts with
+ *   [prefix1], or with [prefix2] if that is non-NULL.
+ */
+static void env_list_collect (List l, const char **env,
+                              const char *prefix1, const char *prefix2)
+{
+    for (; *env != NULL; env++) {
+        char *copy;
+
+        if (strncmp (*env, prefix1, strlen (prefix1)) != 0 &&
+            (!prefix2 || strncmp (*env, prefix2, strlen (prefix2)) != 0))
+            continue;
+
+        if ((copy = strdup (*env)))
+            list_push (l, copy);
+    }
+}
+
+/*
+ *  In each task, if --preserve-slurm-env was given: remove all SLURM_*
+ *   and MPIRUN_* variables from the job environment, then restore the
+ *   saved save_SLURM_* variables under their original names.
+ */
+int slurm_spank_task_init (spank_t sp, int ac, char **av)
+{
+    List l;
+    const char **env;
+    char *entry;
+    char *var;
+    char *val;
+
+    if (!enabled)
+        return (0);
+
+    /*
+     *  The following routine unsets all SLURM_* and MPIRUN_*
+     *   environment variables, and resets the saved variables
+     *   in save_*. We are careful not to walk the env array
+     *   at the same time as adding and removing variables, so
+     *   we instead use the list 'l' to hold environment entries
+     *   for the next operation.
+     *
+     *  The first step accumulates and removes all unwanted variables,
+     *   then the second step resets the saved variables.
+     */
+    if (!(l = list_create (NULL))) {
+        fprintf (stderr, "Failed to create list!\n");
+        return (-1);
+    }
+
+    if (spank_get_item (sp, S_JOB_ENV, &env) != ESPANK_SUCCESS) {
+        fprintf (stderr, "Failed to get job environment!\n");
+        env_list_destroy (l);
+        return (-1);
+    }
+
+    /*
+     *  First collect all env vars to unset
+     */
+    env_list_collect (l, env, "SLURM_", "MPIRUN_");
+
+    while ((entry = list_pop (l))) {
+        if ((var = env_entry_name (entry, &val))) {
+            spank_unsetenv (sp, var);
+            free (var);
+        }
+        free (entry);
+    }
+
+    /*
+     *  Now search for saved SLURM env vars to reset
+     */
+
+    if (spank_get_item (sp, S_JOB_ENV, &env) != ESPANK_SUCCESS) {
+        fprintf (stderr, "Failed to get job environment!\n");
+        env_list_destroy (l);
+        return (-1);
+    }
+
+    env_list_collect (l, env, "save_SLURM_", NULL);
+
+    while ((entry = list_pop (l))) {
+        if ((var = env_entry_name (entry, &val))) {
+            /*  var + 5 skips the "save_" prefix matched above */
+            if (val == NULL ||
+                spank_setenv (sp, var + 5, val, 1) != ESPANK_SUCCESS) {
+                fprintf (stderr, "spank_setenv (%s) failed\n", var + 5);
+            }
+
+            /*
+             *  Now unset the unneeded save_* var
+             */
+            spank_unsetenv (sp, var);
+            free (var);
+        }
+
+        free (entry);
+    }
+
+    list_destroy (l);
+
+    return (0);
+}
+
+/*
+ * vi: ts=4 sw=4 expandtab
+ */
diff --git a/pty.c b/pty.c
new file mode 100644
index 0000000..fd4451a
--- /dev/null
+++ b/pty.c
@@ -0,0 +1,565 @@
+/*****************************************************************************
+ *
+ *  Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC.
+ *  Produced at Lawrence Livermore National Laboratory.
+ *  Written by Mark Grondona .
+ *
+ *  UCRL-CODE-235358
+ *
+ *  This file is part of chaos-spankings, a set of spank plugins for SLURM.
+ *
+ *  This is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +/* + * Hack to run task 0 under a pty for a slurm job. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + + +#include + +SPANK_PLUGIN (pty, 1) + +/* + * Globals: + */ +static int do_pty = 0; +static int master = -1; +static int listenfd = -1; +static pid_t pid; +static struct termios termdefaults; + +static int pty_opt_process (int val, const char *optarg, int remote); + +struct spank_option spank_options[] = +{ + { "pty", NULL, + "Allocate a pty for rank 0. Must also specify -u." 
+ " (Use of --pty implies --output=0)", + 0, 0, (spank_opt_cb_f) pty_opt_process + }, + SPANK_OPTIONS_TABLE_END +}; + + +struct pty_winsz { + unsigned rows; + unsigned cols; +}; + +static void pty_winsz_pack (struct pty_winsz *w) +{ + w->rows = htonl (w->rows); + w->cols = htonl (w->cols); +} + +static void pty_winsz_unpack (struct pty_winsz *w) +{ + w->rows = ntohl (w->rows); + w->cols = ntohl (w->cols); +} + +static int pty_opt_process (int val, const char *optarg, int remote) +{ + do_pty = 1; + return (0); +} + +void process_pty () +{ + unsigned char buf [4096]; + int len; + + if ((len = read (master, buf, sizeof (buf))) < 0) { + if (errno == EAGAIN) + return; + if (errno == EIO) /* Why do we get this sometimes */ + return; + slurm_error ("read (pty master): %m\n"); + exit (1); + } + else if (len == 0) { + close (STDOUT_FILENO); + close (master); + master = -1; + return; + } + + write (STDOUT_FILENO, buf, len); +} + +void process_stdin () +{ + unsigned char buf [4096]; + int len; + + if ((len = read (STDIN_FILENO, buf, sizeof (buf))) < 0) { + slurm_error ("stdin read: %m\n"); + exit (1); + } + else if (len == 0) { + close (STDOUT_FILENO); + master = -1; + return; + } + + write (master, buf, len); +} + +void check_for_slave_exit () +{ + int status = 0; + + if (waitpid (pid, &status, WNOHANG) <= 0) + return; + + if (WIFEXITED (status)) + exit (status); +} + +static int fd_set_nonblocking (int fd) +{ + int fval; + + assert (fd >= 0); + + if ((fval = fcntl (fd, F_GETFL, 0)) < 0) + return (-1); + if (fcntl (fd, F_SETFL, fval | O_NONBLOCK) < 0) + return (-1); + return (0); +} + +static int get_winsize (spank_t sp, struct winsize *wsp) +{ + char val [64]; + + memset (wsp, 0, sizeof (*wsp)); + + if (spank_getenv (sp, "SLURM_PTY_WIN_ROW", val, 64) == ESPANK_SUCCESS) { + spank_unsetenv (sp, "SLURM_PTY_WIN_ROW"); + wsp->ws_row = atoi (val); + } + + if (spank_getenv (sp, "SLURM_PTY_WIN_COL", val, 64) == ESPANK_SUCCESS) { + spank_unsetenv (sp, "SLURM_PTY_WIN_COL"); + 
wsp->ws_col = atoi (val); + } + + if (!wsp->ws_row && !wsp->ws_col) + return (0); + return (1); +} + +int pty_connect_back (spank_t sp) +{ + char ip [64], port [16]; + struct sockaddr_in addr; + int s; + + int rc = spank_getenv (sp, "SLURM_LAUNCH_NODE_IPADDR", ip, 64); + if (rc != ESPANK_SUCCESS) { + slurm_error ("failed to read SLURM_NODE_IPADDR in env!"); + return (-1); + } + + if (spank_getenv (sp, "SLURM_PTY_PORT", port, 16) != ESPANK_SUCCESS) { + slurm_error ("failed to read SLURM_PTY_PORT in env!"); + return (-1); + } + + addr.sin_family = AF_INET; + inet_aton (ip, &addr.sin_addr); + addr.sin_port = htons (atoi (port)); + + + if ((s = socket (AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + slurm_error ("pty: socket: %m"); + return (-1); + } + + + if (connect (s, (struct sockaddr *) &addr, sizeof (addr)) < 0) { + slurm_error ("pty: connect: %m"); + close (s); + return (-1); + } + + return (s); +} + +static int write_pty_winsize (int fd, struct winsize *ws) +{ + int len; + struct pty_winsz winsz; + + winsz.rows = ws->ws_row; + winsz.cols = ws->ws_col; + + pty_winsz_pack (&winsz); + + if ((len = write (fd, &winsz, sizeof (winsz))) < 0) { + slurm_error ("write_pty_winsz: %m"); + return (-1); + } + + return (len); +} + +static int read_pty_winsize (int fd, struct winsize *ws) +{ + struct pty_winsz winsz; + int len; + + if ((len = read (fd, &winsz, sizeof (winsz))) < 0) { + slurm_error ("read_pty_winsz: %m"); + return (-1); + } + + if (len == 0) { + slurm_error ("read_pty_winsz: Remote closed connection."); + return (-1); + } + + pty_winsz_unpack (&winsz); + + memset (ws, 0, sizeof (*ws)); + + ws->ws_col = winsz.cols; + ws->ws_row = winsz.rows; + + return (0); +} + +static void process_winsz_event (int fd, int master) +{ + struct winsize ws; + + if (read_pty_winsize (fd, &ws) < 0) + return; + + ioctl (master, TIOCSWINSZ, &ws); + kill (0, SIGWINCH); +} + +static int no_close_stdio (spank_t sp) +{ + char val [64]; + const char var[] = "SLURM_PTY_NO_CLOSE_STDIO"; + + if 
(spank_getenv (sp, var, val, 64) == ESPANK_SUCCESS) + return (1); + return 0; +} + +static void close_stdio (void) +{ + int devnull; + + if ((devnull = open ("/dev/null", O_RDWR)) < 0) { + slurm_error ("Failed to open /dev/null: %m"); + } + else { + dup2 (devnull, STDOUT_FILENO); + dup2 (devnull, STDIN_FILENO); + dup2 (devnull, STDERR_FILENO); + close (devnull); + } +} + +int slurm_spank_task_init (spank_t sp, int ac, char **av) +{ + int taskid; + int rfd; + struct winsize ws; + struct winsize *wsp = NULL; + + if (!do_pty) + return (0); + + spank_get_item (sp, S_TASK_GLOBAL_ID, &taskid); + + if (taskid != 0) { + if (!no_close_stdio (sp)) + close_stdio (); + return (0); + } + + if ((rfd = pty_connect_back (sp)) < 0) { + slurm_error ("Failed to connect back to pty server"); + } + + if (get_winsize (sp, &ws)) + wsp = &ws; + + if ((pid = forkpty (&master, NULL, NULL, wsp)) < 0) { + slurm_error ("Failed to allocate a pty for rank 0: %m\n"); + return (0); + } + else if (pid == 0) { + /* Child. 
Continue with SLURM code */ + return (0); + } + + /* Parent: process data from client */ + + while (1) { + struct pollfd fds[3]; + int rc; + int nfds = 2; + + fd_set_nonblocking (master); + fd_set_nonblocking (STDIN_FILENO); + + fds[0].fd = master; + fds[1].fd = STDIN_FILENO; + fds[0].events = POLLIN | POLLERR; + fds[1].events = POLLIN | POLLERR; + + if (rfd >= 0) { + fd_set_nonblocking (rfd); + fds[2].fd = rfd; + fds[2].events = POLLIN | POLLERR; + nfds++; + } + + + if ((rc = poll (fds, 3, -1)) < 0) { + slurm_error ("poll: %m\n"); + exit (1); + } + + if (fds[0].revents & POLLERR) { + check_for_slave_exit (); + continue; + } + + if (fds[0].revents & POLLIN) + process_pty (); + + if (fds[1].revents & POLLIN) + process_stdin (); + + if (fds[2].revents & POLLIN) + process_winsz_event (rfd, master); + + check_for_slave_exit (); + } + + return (0); +} + +static void pty_restore (void) +{ + /* STDIN is probably closed by now */ + if (tcsetattr (STDOUT_FILENO, TCSANOW, &termdefaults) < 0) + fprintf (stderr, "tcsetattr: %s\n", strerror (errno)); +} + +static int set_winsize (spank_t sp) +{ + struct winsize ws; + char buf[64]; + ioctl (STDIN_FILENO, TIOCGWINSZ, &ws); + + snprintf (buf, sizeof (buf), "%d", ws.ws_row); + setenv ("SLURM_PTY_WIN_ROW", buf, 1); + + snprintf (buf, sizeof (buf), "%d", ws.ws_col); + setenv ("SLURM_PTY_WIN_COL", buf, 1); + + return (0); +} + +static void sigset_sigwinch (sigset_t *pset) +{ + sigemptyset (pset); + sigaddset (pset, SIGWINCH); +} + +static int notify_winsize_change (int fd) +{ + struct winsize ws; + ioctl (STDOUT_FILENO, TIOCGWINSZ, &ws); + write_pty_winsize (fd, &ws); + return (0); +} + +/* + * Detect when a window size change event occurs. 
+ */ +static int winch = 0; +static void handle_sigwinch (int sig) +{ + winch = 1; + signal (SIGWINCH, handle_sigwinch); +} + +static void * pty_thread (void *arg) +{ + int fd; + sigset_t set; + + sigset_sigwinch (&set); + pthread_sigmask (SIG_UNBLOCK, &set, NULL); + + signal (SIGWINCH, handle_sigwinch); + + if ((fd = accept (listenfd, NULL, NULL)) < 0) { + slurm_error ("pty: accept: %m"); + return NULL; + } + + for (;;) { + poll (NULL, 0, -1); + if (winch && notify_winsize_change (fd) < 0) + return NULL; + winch = 0; + } + + return (NULL); +} + +static int bind_wild (int sockfd) +{ + socklen_t len; + struct sockaddr_in sin; + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(0); /* bind ephemeral port */ + + if (bind (sockfd, (struct sockaddr *) &sin, sizeof(sin)) < 0) { + slurm_error ("bind: %m\n"); + return (-1); + } + len = sizeof(sin); + if (getsockname(sockfd, (struct sockaddr *) &sin, &len) < 0) + return (-1); + return ntohs(sin.sin_port); + +} + +static int do_listen (int *fd, short *port) +{ + int rc, val; + + if ((*fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) + return -1; + + val = 1; + rc = setsockopt(*fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(int)); + if (rc > 0) { + goto cleanup; + } + + *port = bind_wild (*fd); + + if ((rc = listen(*fd, 16)) < 0) { + slurm_error ("listen: %m"); + goto cleanup; + } + + return (0); + +cleanup: + close (*fd); + return (-1); +} + +static void set_pty_env (short port) +{ + char buf [64]; + + snprintf (buf, sizeof (buf), "%hu", port); + setenv ("SLURM_PTY_PORT", buf, 1); +} + +static int pty_thread_create (spank_t sp) +{ + short port; + int err; + pthread_attr_t attr; + pthread_t tid; + + if (do_listen (&listenfd, &port) < 0) { + slurm_error ("Unable to create pty listen port: %m"); + return (-1); + } + set_pty_env (port); + + pthread_attr_init (&attr); + pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED); + err = pthread_create 
(&tid, &attr, &pty_thread, NULL); + pthread_attr_destroy (&attr); + if (err) + return (-1); + return (0); +} + + +static void block_sigwinch (void) +{ + sigset_t set; + sigset_sigwinch (&set); + pthread_sigmask (SIG_BLOCK, &set, NULL); +} + +int slurm_spank_local_user_init (spank_t sp, int ac, char **av) +{ + struct termios term; + int fd = STDIN_FILENO; + + if (!do_pty) + return (0); + + + /* Save terminal settings for restore */ + tcgetattr (fd, &termdefaults); + tcgetattr (fd, &term); + /* Set raw mode on local tty */ + cfmakeraw (&term); + tcsetattr (fd, TCSANOW, &term); + atexit (&pty_restore); + + set_winsize (sp); + + block_sigwinch (); + + pty_thread_create (sp); + + return (0); +} diff --git a/renice.c b/renice.c new file mode 100644 index 0000000..8e69722 --- /dev/null +++ b/renice.c @@ -0,0 +1,190 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ ****************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include + +/* + * All spank plugins must define this macro for the SLURM plugin loader. + */ +SPANK_PLUGIN(renice, 1) + +#define PRIO_ENV_VAR "SLURM_RENICE" +#define PRIO_NOT_SET 42 + +/* + * Minimum allowable value for priority. May be set globally + * via plugin option min_prio= + */ +static int min_prio = -20; +static int default_prio = 0; + +static int prio = PRIO_NOT_SET; + +static int _renice_opt_process (int val, const char *optarg, int remote); +static int _str2prio (const char *str, int *p2int); +static int _check_env (spank_t sp); + +/* + * Provide a --renice=[prio] option to srun: + */ +struct spank_option spank_options[] = +{ + { "renice", "[prio]", "Re-nice job tasks to priority [prio].", 1, 0, + (spank_opt_cb_f) _renice_opt_process + }, + SPANK_OPTIONS_TABLE_END +}; + + +/* + * Called from both srun and slurmd. + */ +int slurm_spank_init (spank_t sp, int ac, char **av) +{ + int i; + + for (i = 0; i < ac; i++) { + if (strncmp ("min_prio=", av[i], 9) == 0) { + const char *optarg = av[i] + 9; + if (_str2prio (optarg, &min_prio) < 0) + slurm_error ("Ignoring invalid min_prio value \"%s\"", av[i]); + } + else if (strncmp ("default=", av[i], 8) == 0) { + const char *optarg = av[i] + 8; + if (_str2prio (optarg, &default_prio) < 0) + slurm_error ("renice: Ignoring invalid default value \"%s\"", + av[i]); + } + else { + slurm_error ("renice: Invalid option \"%s\"", av[i]); + } + } + + if (!spank_remote (sp)) + slurm_verbose ("renice: min_prio = %d", min_prio); + + return (0); +} + + +int slurm_spank_task_post_fork (spank_t sp, int ac, char **av) +{ + pid_t pid; + int taskid; + + /* + * Use default priority if prio not set by command line or env var + */ + if ((prio == PRIO_NOT_SET) && (_check_env (sp) < 0)) + prio = default_prio; + + if (prio < min_prio) + prio = min_prio; + + spank_get_item (sp, S_TASK_GLOBAL_ID, 
&taskid); + spank_get_item (sp, S_TASK_PID, &pid); + + /* + * No need to do any thing if priority is system default + */ + if (prio == getpriority (PRIO_PROCESS, (int) pid)) + return (0); + + slurm_verbose ("re-nicing task%d pid %ld to %d\n", taskid, pid, prio); + + if (setpriority (PRIO_PROCESS, (int) pid, (int) prio) < 0) { + slurm_error ("setpriority: %m"); + return (-1); + } + + return (0); +} + +static int _renice_opt_process (int val, const char *optarg, int remote) +{ + if (optarg == NULL) { + slurm_error ("--renice: invalid argument!"); + return (-1); + } + + if (_str2prio (optarg, &prio) < 0) { + slurm_error ("Bad value for --renice: \"%s\"\n", optarg); + return (-1); + } + + if (prio < min_prio) + slurm_error ("--renice=%d not allowed, will use min=%d", + prio, min_prio); + + return (0); +} + +static int _str2prio (const char *str, int *p2int) +{ + long int l; + char *p; + + l = strtol (str, &p, 10); + if ((*p != '\0') || (l < -20) || (l > 20)) + return (-1); + + *p2int = (int) l; + + return (0); +} + +static int _check_env (spank_t sp) +{ + /* + * See if SLURM_RENICE env var is set by user + */ + char val [1024]; + + if (spank_getenv (sp, PRIO_ENV_VAR, val, 1024) != ESPANK_SUCCESS) + return (-1); + + if (_str2prio (val, &prio) < 0) { + slurm_error ("Bad value for %s: \"%s\".\n", PRIO_ENV_VAR, val); + return (-1); + } + + if (prio < min_prio) { + slurm_error ("%s=%d not allowed, using min=%d", + PRIO_ENV_VAR, prio, min_prio); + } + + return (0); +} + + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/system-safe-preload.c b/system-safe-preload.c new file mode 100644 index 0000000..bb9373e --- /dev/null +++ b/system-safe-preload.c @@ -0,0 +1,343 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . 
/*
 *
 *  UCRL-CODE-235358
 *
 *  This file is part of chaos-spankings, a set of spank plugins for SLURM.
 *
 *  This is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

/*
 *  safe-system.so : Making system(3) safe for MPI jobs everywhere.
 */

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <dlfcn.h>
#include <signal.h>

extern char **environ;

typedef int (*system_f) (const char * cmd);

static void * libc_handle;
static system_f real_system;

static int client_fd = -1;
static int server_fd = -1;

/*
 *  Write exactly [n] bytes from [buf] to [fd], retrying on EINTR and
 *   after partial writes. Returns [n], or -1 on error.
 */
static int write_n (int fd, const void *buf, size_t n)
{
    size_t nleft;
    ssize_t nwritten;
    unsigned const char *p;

    p = buf;
    nleft = n;
    while (nleft > 0) {
        if ((nwritten = write (fd, p, nleft)) < 0) {
            if (errno == EINTR)
                continue;
            else
                return (-1);
        }
        nleft -= nwritten;
        p += nwritten;
    }
    return (n);
}

/*
 *  Read up to [n] bytes from [fd] into [buf], retrying on EINTR and
 *   after partial reads. Returns the number of bytes read, which is
 *   less than [n] only if EOF was reached, or -1 on error.
 */
static int read_n (int fd, void *buf, size_t n)
{
    size_t nleft;
    ssize_t nread;
    unsigned char *p;

    p = buf;
    nleft = n;
    while (nleft > 0) {
        if ((nread = read (fd, p, nleft)) < 0) {
            if (errno == EINTR)
                continue;
            else
                return (-1);
        }
        else if (nread == 0) { /* EOF */
            break;
        }
        nleft -= nread;
        p += nread;
    }
    return (n - nleft);
}


/*
 *  Create the close-on-exec socket pair stored in client_fd/server_fd.
 *  Returns 0 on success, -1 on failure.
 */
static int create_socketpair (void)
{
    int pfds[2];

    if (socketpair (AF_UNIX, SOCK_STREAM, 0, pfds) < 0) {
        fprintf (stderr, "systemsafe: socketpair failed: %s\n",
                 strerror (errno));
        return (-1);
    }

    client_fd = pfds[0];
    server_fd = pfds[1];

    fcntl (client_fd, F_SETFD, FD_CLOEXEC);
    fcntl (server_fd, F_SETFD, FD_CLOEXEC);

    return (0);
}

/*
 *  Read a length-prefixed string (as sent by write_string) from [fd].
 *
 *  On success returns the string length and stores a malloc'ed,
 *   NUL-terminated copy in [*bufp] (caller frees). An empty string
 *   returns 0 with [*bufp] pointing to "".
 *  On EOF before any data returns 0 with [*bufp] == NULL.
 *  On error, including a malformed or truncated message, returns -1
 *   with [*bufp] == NULL.
 */
static int read_string (int fd, char **bufp)
{
    int len = 0;
    int rc;
    char *buf;

    *bufp = NULL;

    /*
     *  Read string length
     */
    if ((rc = read_n (fd, &len, sizeof (int))) < 0) {
        fprintf (stderr, "systemsafe: read_string: %s\n", strerror (errno));
        return (-1);
    }

    if (rc == 0)
        return (0);

    if ((rc != (int) sizeof (int)) || (len < 0)) {
        fprintf (stderr, "systemsafe: read_string: bad length header\n");
        errno = EINVAL;
        return (-1);
    }

    if ((buf = malloc ((size_t) len + 1)) == NULL) {
        fprintf (stderr, "systemsafe: read_string: malloc (%d): %s\n",
                 len, strerror (errno));
        return (-1);
    }

    if ((rc = read_n (fd, buf, len)) != len) {
        if (rc >= 0)
            errno = EINVAL;     /* EOF in the middle of a string */
        fprintf (stderr, "systemsafe: read_string: %s\n", strerror (errno));
        free (buf);
        return (-1);
    }

    buf [len] = '\0';
    *bufp = buf;

    return (len);
}

/*
 *  Write [str] to [fd] as an int length followed by the bytes of the
 *   string (no NUL). A NULL [str] is sent as the empty string.
 *  Returns the string length, or -1 on error.
 */
static int write_string (int fd, const char *str)
{
    int len;
    int rc;

    if (str == NULL)
        str = "";

    len = strlen (str);

    if (write_n (fd, &len, sizeof (int)) < 0) {
        fprintf (stderr, "systemsafe: write: %s\n", strerror (errno));
        return (-1);
    }

    rc = write_n (fd, str, len);

    return (rc);
}

/*
 *  Free a NULL-terminated environment array and all of its entries.
 */
void free_env (char **env)
{
    int i = 0;

    if (env == NULL)
        return;

    while (env [i])
        free (env [i++]);
    free (env);
    return;
}

/*
 *  Read an environment (int count followed by that many strings) from
 *   [fd] into a newly allocated NULL-terminated array stored in
 *   [*envp]; release it with free_env(). The value of LD_PRELOAD is
 *   cleared so the preload library is not loaded again by whatever
 *   runs with this environment.
 *  Returns 0 on success, -1 on failure with [*envp] set to NULL.
 */
int read_env (int fd, char ***envp)
{
    static const char preload [] = "LD_PRELOAD=";
    const size_t preload_len = sizeof (preload) - 1;
    char **env;
    int envc = 0;
    int rc;
    int i;

    *envp = NULL;

    if ((rc = read_n (fd, &envc, sizeof (int))) < 0) {
        fprintf (stderr, "systemsafe: read_env: %s\n", strerror (errno));
        return (-1);
    }

    if ((rc != (int) sizeof (int)) || (envc < 0)) {
        fprintf (stderr, "systemsafe: read_env: bad entry count\n");
        errno = EINVAL;
        return (-1);
    }

    /*
     *  calloc keeps the array NULL-terminated at every step, so
     *   free_env() is safe on a partially filled array.
     */
    if (!(env = calloc ((size_t) envc + 1, sizeof (*env)))) {
        fprintf (stderr, "systemsafe: read_env: malloc: %s\n",
                 strerror (errno));
        return (-1);
    }

    for (i = 0; i < envc; i++) {
        char *entry;

        if ((read_string (fd, &entry) < 0) || (entry == NULL)) {
            fprintf (stderr, "systemsafe: read_env: failed to read entry\n");
            free_env (env);
            return (-1);
        }

        /*
         *  Compare the whole "LD_PRELOAD=" prefix, including '=', so
         *   that variables such as LD_PRELOAD_FOO are left alone.
         */
        if (strncmp (preload, entry, preload_len) == 0)
            entry [preload_len] = '\0';

        env [i] = entry;
    }

    *envp = env;

    return (0);
}

+static void handle_system_request (int fd) +{ + char *cmd, *path, **env, **oldenv; + int rc; + + if ((rc = read_string (fd, &cmd)) < 0) { + fprintf (stderr, "systemsafe: read cmd: %s\n", strerror (errno)); + exit (0); + } + + if (rc == 0) /* EOF, time to exit */ + exit (0); + + if (read_string (fd, &path) < 0) { + fprintf (stderr, "systemsafe: read path: %s\n", strerror (errno)); + exit (0); + } + + if (read_env (fd, &env) < 0) { + fprintf (stderr, "systemsafe: read env: %s\n", strerror (errno)); + exit (0); + } + + if (chdir (path) < 0) + fprintf (stderr, "systemsafe: Failed to chdir to %s: %s\n", + path, strerror (errno)); + + oldenv = environ; + environ = env; + + rc = (*real_system) (cmd); + + write_n (fd, &rc, sizeof (int)); + + environ = oldenv; + free_env (env); + free (cmd); + free (path); + + return; +} + +static void system_server (void) +{ + char c = 0; + close (client_fd); + write (server_fd, &c, 1); + for (;;) + handle_system_request (server_fd); + return; +} + +static int create_system_server (void) +{ + pid_t pid; + char c; + + create_socketpair (); + + if ((pid = fork ()) < 0) + return (-1); + + if (pid == 0) { + system_server (); + exit (0); + } + + close (server_fd); + + /* + * Wait for system_server setup to complete + */ + read (client_fd, &c, 1); + + return (0); +} + +static int write_env (int fd) +{ + int i, envc = 0; + + while (environ[envc]) + envc++; + + write (fd, &envc, sizeof (int)); + + for (i = 0; i < envc; i++) + write_string (fd, environ [i]); + + return (0); +} + +int system (const char *cmd) +{ + int rc; + char path [4096]; + + if (cmd == NULL) { + errno = EINVAL; + return (-1); + } + + write_string (client_fd, cmd); + write_string (client_fd, getcwd (path, sizeof (path))); + write_env (client_fd); + + if (read (client_fd, &rc, sizeof (int)) < 0) { + fprintf (stderr, "system: failed to read status from server: %s\n", + strerror (errno)); + return (-1); + } + + return (rc); +} + +void __attribute__ ((constructor)) fork_safe_init 
(void) +{ + if ((libc_handle = dlopen ("libc.so.6", RTLD_LAZY)) == NULL) { + exit (1); + } + + if ((real_system = dlsym (libc_handle, "system")) == NULL) + exit (2); + + create_system_server (); + + return; +} + + +/* + * vi: ts=4 sw=4 expandtab + */ + diff --git a/system-safe.c b/system-safe.c new file mode 100644 index 0000000..af07096 --- /dev/null +++ b/system-safe.c @@ -0,0 +1,123 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +#include +#include +#include + +#include + +/* + * All spank plugins must define this macro for the SLURM plugin loader. 
+ */ +SPANK_PLUGIN(system-safe, 1) + +#define SYSTEM_SAFE_ENABLE 0x0 +#define SYSTEM_SAFE_DISABLE 0x1 + +/* + * Disabled by default + */ +static int enabled = 0; +static int opt_enable = 0; +static int opt_disable = 0; + +static int _opt_process (int val, const char *optarg, int remote); + +/* + * Provide a --renice=[prio] option to srun: + */ +struct spank_option spank_options[] = +{ + { "system-safe", NULL, "Replace system(3) with version safe for MPI.", + 0, SYSTEM_SAFE_ENABLE, + (spank_opt_cb_f) _opt_process + }, + { "no-system-safe", NULL, "Disable system(3) replacement.", + 0, SYSTEM_SAFE_DISABLE, + (spank_opt_cb_f) _opt_process + }, + SPANK_OPTIONS_TABLE_END +}; + + +/* + * Called from both srun and slurmd. + */ +int slurm_spank_init (spank_t sp, int ac, char **av) +{ + int i; + + if (!spank_remote (sp)) + return (0); + + for (i = 0; i < ac; i++) { + if (strncmp ("enabled", av[i], 7) == 0) { + enabled = 1; + } + else if (strncmp ("disabled", av[i], 8) == 0) { + enabled = 0; + } + else { + slurm_error ("system-safe: Invalid option \"%s\"", av[i]); + } + } + + return (0); +} + +int slurm_spank_user_init (spank_t sp, int ac, char **av) +{ + char buf [4096]; + const char *preload = "system-safe-preload.so"; + + if (opt_disable || (!enabled && !opt_enable)) + return (0); + + if (spank_getenv (sp, "LD_PRELOAD", buf, sizeof (buf)) == ESPANK_SUCCESS) + snprintf (buf, sizeof (buf), "%s %s", buf, preload); + else + strncpy (buf, preload, strlen (preload)); + + if (spank_setenv (sp, "LD_PRELOAD", buf, 1) != ESPANK_SUCCESS) + slurm_error ("Failed to set LD_PRELOAD=%s\n", buf); + + return (0); +} + +static int _opt_process (int val, const char *optarg, int remote) +{ + if (val == SYSTEM_SAFE_ENABLE) + opt_enable = 1; + else + opt_disable = 0; + + return (0); +} + + + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/tmpdir.c b/tmpdir.c new file mode 100644 index 0000000..c3bad13 --- /dev/null +++ b/tmpdir.c @@ -0,0 +1,111 @@ 
+/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +#include +#include +#include +#include + +SPANK_PLUGIN (tmpdir, 1); + +/* + * Create job-specific TMPDIR. + * Called from srun after allocation before launch. + * Does the equivalent of TMPDIR=${TMPDIR-/tmp}/$SLURM_JOBID.$SLURM_STEPID + */ +int slurm_spank_local_user_init (spank_t sp, int ac, char **av) +{ + uint32_t jobid, stepid; + const char *tmpdir; + char buf [1024]; + int n; + + if (spank_get_item (sp, S_JOB_ID, &jobid) != ESPANK_SUCCESS) { + slurm_error ("Failed to get jobid from SLURM"); + return (-1); + } + + if (spank_get_item (sp, S_JOB_STEPID, &stepid) != ESPANK_SUCCESS) { + slurm_error ("Failed to get job step id from SLURM"); + return (-1); + } + + if (!(tmpdir = getenv ("TMPDIR"))) + tmpdir = "/tmp"; + + n = snprintf (buf, sizeof (buf), "%s/%u.%u", tmpdir, jobid, stepid); + + if ((n < 0) || (n > sizeof (buf) - 1)) { + slurm_error ("TMPDIR = \"%s\" too large. 
Aborting"); + return (-1); + } + + if (setenv ("TMPDIR", buf, 1) < 0) { + slurm_error ("setenv (TMPDIR, \"%s\"): %m", buf); + return (-1); + } + + return (0); +} + +/* + * ``rm -rf TMPDIR'' *as user* after job tasks have exited + */ +int slurm_spank_exit (spank_t sp, int ac, char **av) +{ + const char sudo [] = "/usr/bin/sudo -u"; + const char rm [] = "/bin/rm -rf"; + char tmp [1024]; + char cmd [4096]; + int n; + int status; + uid_t uid = (uid_t) -1; + + if (!spank_remote (sp)) + return (0); + + if (spank_getenv (sp, "TMPDIR", tmp, sizeof (tmp)) != ESPANK_SUCCESS) { + slurm_error ("Unable to remove TMPDIR at exit!"); + return (-1); + } + + if (spank_get_item (sp, S_JOB_UID, &uid) != ESPANK_SUCCESS) { + slurm_error ("tmpdir: Unable to get job's user id"); + return (-1); + } + + n = snprintf (cmd, sizeof (cmd), "%s \\#%d %s %s", sudo, uid, rm, tmp); + + if ((n < 0) || (n > sizeof (cmd) - 1)) { + slurm_error ("Unable to remove TMPDIR at exit!"); + return (-1); + } + + if ((status = system (cmd)) != 0) { + slurm_error ("\"%s\" exited with status=0x%04x\n", cmd, status); + return (-1); + } + + return (0); +} diff --git a/use-env/Makefile b/use-env/Makefile new file mode 100644 index 0000000..7043884 --- /dev/null +++ b/use-env/Makefile @@ -0,0 +1,27 @@ + +OBJS := lex.yy.o use-env-parser.o ../lib/list.o log_msg.o ../lib/split.o +HDRS := use-env.h ../lib/list.h ../lib/split.h log_msg.h use-env-parser.h +SHOPTS := -shared -Wl,--version-script=version.map + +all: use-env.so test + +use-env.so : $(OBJS) use-env.o + $(CC) $(SHOPTS) -o use-env.so $(OBJS) use-env.o + +test: $(OBJS) main.o + $(CC) -ggdb -o test $(OBJS) main.o + +check: test + ./test -f test.conf + +.c.o : + $(CC) -ggdb -I../lib -Wall $(CFLAGS) -o $@ -fPIC -c $< + +use-env-parser.c use-env-parser.h : use-env-parser.y + bison -d -o use-env-parser.c $< + +lex.yy.c : use-env-parser.l use-env-parser.h + lex $< + +clean: + rm -f test *.o use-env-parser.[ch] lex.yy.c *.so diff --git a/use-env/log_msg.c 
/*
 *  Logging state shared by every log_* entry point in this file.
 */
struct log_ctx {
    int quiet;      /* non-zero: suppress all output                     */
    int verbose;    /* verbosity level tested by log_verbose/log_debug*  */
    char *prefix;   /* heap copy of the facility name, NULL when unset   */
};

/*
 *  prefix starts out NULL (not a string literal) because log_msg_fini()
 *   passes it to free().
 */
static struct log_ctx log_ctx = { 0, 0, NULL };

/*
 *  Set the facility name printed in front of every message.
 *   A NULL prefix leaves the current setting untouched. Any previous
 *   prefix is released. Returns 0 on success, -1 if the name could not
 *   be copied (the previous prefix is then kept).
 */
int log_msg_init (const char *prefix)
{
    char *copy;

    if (prefix == NULL)
        return (0);

    if ((copy = strdup (prefix)) == NULL)
        return (-1);

    free (log_ctx.prefix);
    log_ctx.prefix = copy;
    return (0);
}

/*
 *  Release the facility name. Safe to call more than once.
 */
void log_msg_fini (void)
{
    free (log_ctx.prefix);
    log_ctx.prefix = NULL;
}

/*
 *  Raise the verbosity level by one. Returns the level *before* the
 *   increment.
 */
int log_msg_verbose (void)
{
    return (log_ctx.verbose++);
}

/*
 *  Set the verbosity level and return it.
 */
int log_msg_set_verbose (int level)
{
    return (log_ctx.verbose = level);
}

/*
 *  Raise the quiet level by one; any non-zero level silences all
 *   output. Returns the level *before* the increment.
 */
int log_msg_quiet (void)
{
    return (log_ctx.quiet++);
}

/*
 *  Account for the result `n' of an snprintf-style call that wrote at
 *   `*pp' with `*lenp' bytes of space available.
 *
 *  On success the cursor moves past the new text. On truncation the
 *   cursor is parked on the buffer's final NUL and *lenp drops to 0.
 *   On an output error nothing usable was written, so the text is
 *   terminated at the cursor and *lenp drops to 0. A zero *lenp tells
 *   the caller to stop appending and to mark the message as cut short.
 */
static void buf_advance (char **pp, int *lenp, int n)
{
    if (n < 0) {
        **pp = '\0';
        *lenp = 0;
    }
    else if (n >= *lenp) {
        *pp += *lenp - 1;
        *lenp = 0;
    }
    else {
        *pp += n;
        *lenp -= n;
    }
}

/*
 *  Format one message into a fixed-size buffer and write it to stderr.
 *
 *  The message is assembled from, in order:
 *   - the facility name set by log_msg_init (), if non-empty,
 *   - the level `prefix' (e.g. "Error"), if non-NULL,
 *   - "file: line: " when lex_file () reports a current file; the
 *     file name is reduced to its basename when `use_basename' is set,
 *   - the caller's formatted text.
 *
 *  If the result does not fit, it is cut and a "+" is placed at the end.
 */
static void
vlog_msg (const char *prefix, int use_basename, const char *format, va_list ap)
{
    static const char suffix[] = "+";
    char buf[4096];
    char *p = buf;
    int len = sizeof (buf);
    const char *path;

    buf[0] = '\0';

    /*  Prefix output with facility name.
     */
    if (log_ctx.prefix && (*log_ctx.prefix != '\0'))
        buf_advance (&p, &len, snprintf (p, len, "%s: ", log_ctx.prefix));

    /*  Add a log level prefix.
     */
    if ((len > 0) && (prefix))
        buf_advance (&p, &len, snprintf (p, len, "%s: ", prefix));

    /*  Add file and line number information
     */
    if ((len > 0) && ((path = lex_file ()) != NULL)) {
        /*
         *  basename () may modify its argument, so it is given a private
         *   copy. If the copy cannot be made, fall back to the full path
         *   rather than passing NULL along.
         */
        char *copy = use_basename ? strdup (path) : NULL;
        const char *name = copy ? basename (copy) : path;

        buf_advance (&p, &len,
                     snprintf (p, len, "%s: %d: ", name, lex_line ()));
        free (copy);
    }

    if ((len > 0) && (format))
        buf_advance (&p, &len, vsnprintf (p, len, format, ap));

    /*  Add suffix for truncation if necessary.
     */
    if (len <= 0) {
        char *q = buf + sizeof (buf) - 1 - strlen (suffix);

        if (q < p)
            p = q;
        strcpy (p, suffix);
        p += strlen (suffix);
    }

    *p = '\0';

    fprintf (stderr, "%s", buf);
}

/*
 *  Print an error message (level prefix "Error", full file path)
 *   unless quiet. Always returns -1 so callers can write
 *   `return log_err (...)'.
 */
int log_err (const char *format, ...)
{
    va_list ap;

    if (log_ctx.quiet)
        return (-1);

    va_start (ap, format);
    vlog_msg ("Error", 0, format, ap);
    va_end (ap);
    return (-1);
}

/*
 *  Print a message unless quiet.
 */
void log_msg (const char *format, ...)
{
    va_list ap;

    if (log_ctx.quiet)
        return;

    va_start (ap, format);
    vlog_msg (NULL, 1, format, ap);
    va_end (ap);
}

/*
 *  Print a message when not quiet and verbosity is at least 1.
 */
void log_verbose (const char *format, ...)
{
    va_list ap;

    if (log_ctx.quiet || !log_ctx.verbose)
        return;

    va_start (ap, format);
    vlog_msg (NULL, 1, format, ap);
    va_end (ap);
}

/*
 *  Print a message when not quiet and verbosity is at least 2.
 */
void log_debug (const char *format, ...)
{
    va_list ap;

    if ((log_ctx.quiet) || (log_ctx.verbose < 2))
        return;

    va_start (ap, format);
    vlog_msg (NULL, 1, format, ap);
    va_end (ap);
}

/*
 *  Print a message when not quiet and verbosity is at least 3.
 */
void log_debug2 (const char *format, ...)
{
    va_list ap;

    if ((log_ctx.quiet) || (log_ctx.verbose < 3))
        return;

    va_start (ap, format);
    vlog_msg (NULL, 1, format, ap);
    va_end (ap);
}

/*
 *  Print a message when not quiet and verbosity is at least 4.
 */
void log_debug3 (const char *format, ...)
{
    va_list ap;

    if ((log_ctx.quiet) || (log_ctx.verbose < 4))
        return;

    va_start (ap, format);
    vlog_msg (NULL, 1, format, ap);
    va_end (ap);
}

/*
 * vi: ts=4 sw=4 expandtab
 */
#ifndef _LOG_MSG_H
#define _LOG_MSG_H

/*
 *  Set the facility name printed in front of every message.
 *   A NULL prefix leaves the current name unchanged.
 */
int log_msg_init (const char *prefix);

/*
 *  Release the facility name set by log_msg_init ().
 */
void log_msg_fini (void);

/*
 *  Increment the verbosity level; returns the level before the
 *   increment.
 */
int log_msg_verbose (void);

/*
 *  Set the verbosity level to `level' and return it.
 */
int log_msg_set_verbose (int level);

/*
 *  Increment the quiet level (non-zero silences all output); returns
 *   the level before the increment.
 */
int log_msg_quiet (void);

/*
 *  Print an error message unless quiet. Always returns -1.
 */
int log_err (const char *format, ...);

/*
 *  Print a message unless quiet.
 */
void log_msg (const char *format, ...);

/*
 *  Print a message unless quiet, once verbosity is at least
 *   1 (log_verbose), 2 (log_debug), 3 (log_debug2) or 4 (log_debug3).
 */
void log_verbose (const char *format, ...);
void log_debug (const char *format, ...);
void log_debug2 (const char *format, ...);
void log_debug3 (const char *format, ...);

#endif /* !_LOG_MSG_H */
+ ****************************************************************************/ + +#include +#include + +#include "use-env.h" +#include "log_msg.h" + +extern int yydebug; +static char *run_as_task = NULL; + +int get_options (int ac, char **av, char **ppath, char **nnodes, char **nprocs) +{ + int c; + + while ((c = getopt (ac, av, "dvt:f:n:N:")) >= 0) { + switch (c) { + case 'd' : + yydebug = 1; + break; + case 'v': + log_msg_verbose (); + break; + case 'f': + *ppath = optarg; + break; + case 'n': + *nprocs = optarg; + break; + case 'N': + *nnodes = optarg; + break; + case 't': + run_as_task = optarg; + break; + case '?' : + default: + exit (1); + } + } + return (0); +} + + +int main (int ac, char **av) +{ + int rc = 0; + char *filename = NULL; + char *nnodes = "0"; + char *nprocs = "0"; + + log_msg_init ("use-env"); + + get_options (ac, av, &filename, &nnodes, &nprocs); + + keyword_define ("SLURM_NNODES", nnodes); + keyword_define ("SLURM_NPROCS", nprocs); + + if (run_as_task) { + keyword_define ("SLURM_PROCID", run_as_task); + keyword_define ("SLURM_NODEID", "0"); + } + + use_env_parser_init (run_as_task != NULL); + rc = use_env_parse (filename); + use_env_parser_fini (); + log_msg_fini (); + + return (rc); +} diff --git a/use-env/test.conf b/use-env/test.conf new file mode 100644 index 0000000..349b136 --- /dev/null +++ b/use-env/test.conf @@ -0,0 +1,79 @@ +# Test file for use-env parser + +# Comment + # Comment + # Comment # +FOO = 1 # Comment +FOO=2# + +A = 1 +A |= 2 + + B = 3 + +C = 1;D=1; + +if ($A == 2) + print "ERROR |= didn't seem to work" +endif + +PATH += /foo/bin + +PATH = "${PATH}:/usr/local/bin" + +print "$PATH" + +if (($A == 1) && ($B >= 3)) + C = 10 +else if ($A == 1) + print "ERROR else if fallthrough not working" +else + print "ERROR else fallthrough not working" +endif + +EMPTY = "" +EMPTY = + +print "EMPTY = \"$EMPTY\"" + +unset EMPTY + +define n = ${EMPTY}$SLURM_NPROCS +define N = $SLURM_NNODES + +define x = 101 +define y = 10 + +if ($x < 100) 
+ print "ERROR: x not < 100" +else if ($x < 200) + if ($y > 1) + # + else if ($y > 5) + print "ERROR: nested else if fallthrough failed" + else + print "ERROR: nested else fallthrough failed" + endif +else + print "ERROR: else fallthrough failed" +endif + +include test.conf.include + +undefine n + +dump all + + +set debuglevel 3 + +in task { + print "In task $SLURM_PROCID"; + if (defined $LD_PRELOAD) + LD_PRELOAD = "$LD_PRELOAD libfoo.so" + else + LD_PRELOAD = libfoo.so + endif +} + +print ~/bin diff --git a/use-env/test.conf.include b/use-env/test.conf.include new file mode 100644 index 0000000..f8e6a3d --- /dev/null +++ b/use-env/test.conf.include @@ -0,0 +1,3 @@ + +print "Included file" + diff --git a/use-env/use-env-parser.l b/use-env/use-env-parser.l new file mode 100644 index 0000000..9275075 --- /dev/null +++ b/use-env/use-env-parser.l @@ -0,0 +1,906 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ ****************************************************************************/ + +%{ +#include +#include +#include +#include +#include +#include +#include + +#include "use-env.h" +#include "use-env-parser.h" +#include "list.h" +#include "log_msg.h" + +static char *s; +static char buf [4096]; + +/* + * True if we've returned an item in POSTOP condition + */ +static int postop_got_item = 0; + +extern int yyerror (char *); + +/* + * Macro for entering POSTOP start condition: + * - Initialize buf and string pointer `s' + * - reset postop_got_item to 0 + */ +#define BEGIN_POSTOP \ + do { \ + memset (s = buf, 0, sizeof (buf)); \ + BEGIN (POSTOP); \ + postop_got_item = 0; \ + } while (0) + +/* + * Initialize string buffer and begin STR condition. + */ +#define BEGIN_STR \ + do { \ + memset (s = buf, 0, sizeof (buf)); \ + BEGIN (STR); \ + } while (0) + +/* + * Place a bracketed identifier ${id} item into yylval.item + */ +#define GET_BRACKETED_ITEM \ + do { \ + yytext [strlen(yytext) - 1] = '\0'; /* Nullify closing brace */ \ + yylval.item = lex_item_create (yytext+2, TYPE_SYM); \ + } while (0) + + + +%} + +%option noyywrap + +digit [0-9] +alpha [a-zA-Z] +alnum [0-9a-zA-Z] +ident [0-9a-zA-Z_] +id [_a-zA-z][0-9a-zA-Z_]* +p [\)<>;=+\n \t#] + + +%x STR STR2 POSTOP + +%% + +[ \t]+ ; /* Ignore whitespace */ +#[^\n]* ; /* Ignore comments */ + +\n lex_line_increment (); return '\n'; + +\" BEGIN_STR; + +dump return DUMP; +define return DEF; +undefine return UNDEF; +set return SET; +unset return UNSET; +if return IF; +else return ELSE; +endif return ENDIF; +defined return DEFINED; +"in task" return IN_TASK; +match(es)? return MATCH; + +print BEGIN_POSTOP; return PRINT; +include BEGIN_POSTOP; return INCLUDE; +"|=" BEGIN_POSTOP; return COND_SET; +"+=" BEGIN_POSTOP; return PREPEND; +"=+" BEGIN_POSTOP; return APPEND; +"=" BEGIN_POSTOP; return '='; +"," return ','; +"!" 
return '!'; +"(" return '('; +")" return ')'; +"{" return '{'; +"}" return '}'; +";" return ';'; +"<" return LT; +">" return GT; +"==" return EQ; +"<=" return LE; +">=" return GE; +"!=" return NE; +"&&" return AND; +"||" return OR; + + +[0-9]+/{p} { yylval.item = lex_item_create (yytext, TYPE_INT); return ITEM; } + +{ident}+ { yylval.item = lex_item_create (yytext, TYPE_STR); return ITEM; } + +\${id} { yylval.item = lex_item_create (yytext+1, TYPE_SYM); return ITEM;} + +\$\{{id}\} { GET_BRACKETED_ITEM; return ITEM; } + + +{ + \" { BEGIN (STR2); } + + (\n|;) { + BEGIN INITIAL; + unput (*yytext); /* Return the newline or ; to the stream */ + if (strlen (buf) || !postop_got_item) { + yylval.item = lex_item_create (buf, TYPE_STR); + return ITEM; + } + } + + [ \t]+ { + if (strlen (buf)) { /* Don't return an empty string separated by ws */ + postop_got_item = 1; + yylval.item = lex_item_create (buf, TYPE_STR); + memset (s = buf, 0, sizeof (buf)); + return ITEM; + } + } + + #[^\n]* ; /* Skip comments */ + + \\\ { *s++ = ' '; } +} + +{ + \" { + BEGIN INITIAL; + yylval.item = lex_item_create (buf, TYPE_STR); + return ITEM; + } +} + +{ + \" { + postop_got_item = 1; + *s = '\0'; + yylval.item = lex_item_create (buf, TYPE_STR); + memset (s = buf, 0, sizeof (buf)); + BEGIN POSTOP; + return ITEM; + } +} + +{ + \n { + log_err ("Unterminated double-quoted string?\n"); + lex_line_increment (); + BEGIN (INITIAL); + } +} + +{ + ~ { + const char *home; + if ((s == buf) && (home = getenv ("HOME"))) { + strncat (buf, home, sizeof (buf)); + s += strlen (home); + } else + *s++ = '~'; + } + + \${id} { + const struct sym *m = sym (yytext+1); + if (m) { + strncat (buf, m->string, sizeof (buf)); + s += strlen (m->string); + } + } + \$\{{id}\} { + const struct sym *m; + yytext[strlen(yytext)-1] = '\0'; /* Nullify closing brace */ + if ((m = sym (yytext+2))) { + *s = '\0'; + strncat (buf, m->string, sizeof (buf)); + s += strlen (m->string); + } + } + \\$ { *s++ = '$'; } + \\n { *s++ = '\n'; } + 
\\t { *s++ = '\t'; } + \\r { *s++ = '\r'; } + \\\" { *s++ = '\"'; } + . { *s++ = *yytext; } +} + + +<> { + if (!lex_include_pop ()) + yyterminate (); +} + +%% + + +/**************************************************************************** + * Data Types + ****************************************************************************/ + +struct file_info { + FILE * fp; + char * path; + int line; + YY_BUFFER_STATE yybuf; +}; + + +/**************************************************************************** + * Static Globals + ****************************************************************************/ + +static List includes = NULL; +static struct file_info *current; + +/* + * Three-level symbol table. I know, overly complex - but it is actually + * pretty simple. + * + * Keywords (stored in the keytab) have the highest precedence and + * cannot be overridden by the config file nor environment. + * + * Local symbols defined by the user using the "define" command are + * stored in the symtab. These have higher precedence than environment + * variables, and can be updated and changed by the user with + * subsequent ``define'' invocations. + * + * The envtab contains cached environment variable "symbol" records + * for later destruction. 
+ */ +static List keytab = NULL; +static List symtab = NULL; +static List envtab = NULL; + +static List itemcache = NULL; + +/**************************************************************************** + * Include file funtions + ****************************************************************************/ + +static void file_info_destroy (struct file_info *f) +{ + if (f == NULL) + return; + if (f->path) + free (f->path); + if (f->fp && f->fp != stdin) + fclose (f->fp); + if (f->yybuf) + yy_delete_buffer (f->yybuf); + free (f); + return; +} + +static struct file_info * file_info_create (const char *path) +{ + struct file_info *f = malloc (sizeof (*f)); + + memset (f, 0, sizeof (*f)); + + if (f == NULL) + return (NULL); + + f->line = 1; + + if (path == NULL) { + f->path = strdup ("stdin"); + f->fp = stdin; + } else { + f->path = strdup (path); + + if ((f->fp = fopen (path, "r")) == NULL) { + if (current) + log_err ("failed to include \"%s\"\n", path); + else + log_err ("Failed to open %s: %s\n", path, strerror (errno)); + + file_info_destroy (f); + return (NULL); + } + } + + f->yybuf = yy_create_buffer (f->fp, YY_BUF_SIZE); + + return (f); +} + +static int lex_switch_buffer (struct file_info *f) +{ + yyin = f->fp; + yy_switch_to_buffer (f->yybuf); + current = f; + return (0); +} + +static int find_f (struct file_info *f, char *file) +{ + return (strcmp (f->path, file) == 0); +} + +int lex_file_init (const char *path) +{ + struct file_info *f = file_info_create (path); + + if (f == NULL) + return (-1); + + lex_switch_buffer (f); + + return (0); +} + +const char * lex_file () +{ + if (!current) + return (NULL); + return (current->path); +} + +int lex_line () +{ + if (!current) + return (0); + return (current->line); +} + +int lex_line_increment () +{ + if (!current) + return (0); + return (current->line++); +} + +static char * full_path (const char *path, const char *include, + char *buf, size_t len) +{ + char *p = strdup (path); + char *prefix; + + if (p == NULL) + 
return (NULL); + + if (include[0] == '/') + return (strdup (include)); + + if (strcmp ("stdin", path) == 0) + prefix = "."; + else + prefix = dirname (p); + + snprintf (buf, len, "%s/%s", prefix, include); + + buf [len - 1] = '\0'; + + free (p); + + return (buf); +} + + +int lex_include_push (const char *include) +{ + struct file_info *f; + char buf [4096]; + char *path; + + assert (include != NULL); + + /* + * Decrement line counter for this file so that error messages + * correspond to the line that the include is on. + */ + current->line--; + + path = full_path (current->path, include, buf, sizeof (buf)); + + if ((path == NULL) || !(f = file_info_create (path))) + return (-1); + + if (!includes) + includes = list_create ((ListDelF) file_info_destroy); + else if (list_find_first (includes, (ListFindF) find_f, f->path)) { + log_err ("Recursively included file\n"); + file_info_destroy (f); + return (-1); + } + else if (list_count (includes) > 20) { + log_err ("include files nested too deep\n"); + file_info_destroy (f); + return (-1); + } + log_verbose ("including file %s\n", f->path); + + current->fp = yyin; + current->yybuf = YY_CURRENT_BUFFER; + + list_push (includes, current); + + lex_switch_buffer (f); + + return (0); +} + +int lex_include_pop () +{ + struct file_info *f, *tmp = current; + + if (!includes) + return (0); + + assert (current); + + if (!(f = list_pop (includes))) + return (0); + + lex_switch_buffer (f); + + /* + * Re-increment line counter when popping back to original file. 
+ */ + current->line++; + + log_verbose ("popping back to file %s\n", current->path); + + file_info_destroy (tmp); + + return (1); +} + + +/**************************************************************************** + * Lex Item Functions + ****************************************************************************/ + + +static void lex_item_clear (struct lex_item *i) +{ + if ((i->type == TYPE_SYM) && (i->val.sym == NULL) && i->str) + free (i->str); + if (i->name) + free (i->name); + memset (i, 0, sizeof (*i)); + return; +} + +static void lex_item_destroy (struct lex_item *i) +{ + lex_item_clear (i); + free (i); +} + +static int item_unused (struct lex_item *i, void *arg) +{ + return (i->used == 0); +} + +static struct lex_item * item_cache_find_unused () +{ + if (itemcache == NULL) + itemcache = list_create ((ListDelF) lex_item_destroy); + + return (list_find_first (itemcache, (ListFindF) item_unused, NULL)); +} + +static struct lex_item * lex_item_alloc () +{ + struct lex_item *i = item_cache_find_unused (); + + if (i == NULL) { + log_debug3 ("allocated new lex_item\n"); + i = malloc (sizeof (*i)); + list_append (itemcache, i); + } else + log_debug3 ("pulled lex_item off cache with %d items\n", + list_count (itemcache)); + + i->used = 1; + + return (i); +} + +struct lex_item * lex_item_create (char *name, int type) +{ + struct lex_item *i = lex_item_alloc (); + + i->name = strdup (name); + i->str = i->name; + i->type = type; + + if (type == TYPE_STR) + i->val.str = i->name; + else if (type == TYPE_INT) + i->val.num = atoi (name); + else if (type == TYPE_SYM) { + if ((i->val.sym = sym (name))) + i->str = i->val.sym->string; + else + i->str = strdup (""); + } + + log_debug2 ("creating item \"%s\"\n", name); + + return (i); +} + +static int item_clear (struct lex_item *i, void *arg) +{ + if (i->used) { + lex_item_clear (i); + i->used = 0; + } + return (0); +} + +void lex_item_cache_clear () +{ + int a = 1; + + if (itemcache == NULL) + return; + + log_debug3 
("clearing %d items in cache\n", list_count (itemcache)); + + list_for_each (itemcache, (ListForF) item_clear, (void *) &a); +} + +int item_type_int (struct lex_item *i) +{ + if (i->type == TYPE_INT) + return (1); + if ((i->type == TYPE_SYM) && i->val.sym && (i->val.sym->type == SYM_INT)) + return (1); + return (0); +} + +int item_val (struct lex_item *item) +{ + assert (item_type_int (item)); + + if (item->type == TYPE_INT) + return (item->val.num); + + if (item->type == TYPE_SYM) + return (item->val.sym->val); + + return (0); +} + +char * item_str (struct lex_item *item) +{ + return (item->str); +} + +int item_strcmp (struct lex_item *x, struct lex_item *y) +{ + return (strcmp (item_str (x), item_str (y))); +} + +static const char * cmp_str (int cmp) +{ + switch (cmp) { + case LT: return "<"; + case GT: return ">"; + case LE: return "<="; + case GE: return ">="; + case EQ: return "=="; + case NE: return "!="; + } + return ("??"); +} + +int item_cmp (int cmp, struct lex_item *x, struct lex_item *y) +{ + int rv = -1; + + switch (cmp) { + case LT: + if (item_type_int (x) && item_type_int (y)) + rv = (item_val (x) < item_val (y)); + break; + case GT: + if (item_type_int (x) && item_type_int (y)) + rv = (item_val (x) > item_val(y)); + break; + case LE: + if (item_type_int (x) && item_type_int (y)) + rv = (item_val (x) <= item_val (y)); + break; + case GE: + if (item_type_int (x) && item_type_int (y)) + rv = (item_val (x) >= item_val (y)); + break; + case EQ: + if (item_type_int (x) && item_type_int (y)) + rv = (item_val (x) == item_val (y)); + else + rv = (item_strcmp (x, y) == 0); + break; + case NE: + if (item_type_int (x) && item_type_int (y)) + rv = (x->val.num != y->val.num); + else + rv = (item_strcmp (x, y) != 0); + break; + default: + log_err ("Invalid comparitor %d\n", cmp); + } + + if (rv < 0) + log_err ("Invalid comparison: `%s'(=%s) %s `%s'(=%s)\n", + x->name, item_str (x), cmp_str (cmp), + y->name, item_str (y)); + else + log_debug ("testing: (`%s'(=%s) 
%s `%s'(=%s)) = %s\n", + x->name, item_str (x), cmp_str (cmp), + y->name, item_str (y), (rv ? "true":"false")); + + return (rv); +} + +int is_valid_identifier (const char *str) +{ + const char *p; + + if (!str) + return (0); + + /* + * First character must be [a-zA-Z_] + */ + if (!(isalpha (str[0]) || str[0] == '_')) + return (0); + + for (p = str + 1; *p != '\0'; p++) { + if (!(isalnum (*p) || *p == '_')) + return (0); + } + + return (1); +} + + +/**************************************************************************** + * Symbol functions + ****************************************************************************/ + +int sym_find (struct sym *s, char *name) +{ + return (strcmp (s->name, name) == 0); +} + +void sym_destroy (struct sym *s) +{ + if (s == NULL) + return; + + if (s->name) + free (s->name); + if (s->string) + free (s->string); + free (s); +} + +static int sym_reset_value (struct sym *s, const char *value) +{ + long val; + char *p; + + if (s->string) + free (s->string); + + s->string = strdup (value); + s->type = SYM_STR; + s->val = -1; + + val = strtol (value, &p, 10); + + if (p && *p == '\0') { + s->type = SYM_INT; + s->val = (int) val; + } + + return (0); +} + +struct sym * sym_create (const char *name, const char *value) +{ + struct sym *s = malloc (sizeof (*s)); + + memset (s, 0, sizeof (*s)); + + s->name = strdup (name); + + sym_reset_value (s, value); + + return (s); + +} + +static struct sym * sym_lookup (List l, char *s) +{ + if (l == NULL) + return (NULL); + return (list_find_first (l, (ListFindF) sym_find, s)); +} + +int sym_delete (char *name) +{ + int rc = 0; + + log_verbose ("undef \"%s\"\n", name); + + if (symtab) + rc = list_delete_all (symtab, (ListFindF) sym_find, name); + + return (rc); +} + +int env_cache_delete (char *name) +{ + int rc = 0; + if (envtab) + rc = list_delete_all (envtab, (ListFindF) sym_find, name); + + return (rc); +} + +const struct sym * keyword_define (char *name, const char *value) +{ + struct sym *s; + + 
if (!keytab) + keytab = list_create ((ListDelF) sym_destroy); + else + list_delete_all (keytab, (ListFindF) sym_find, name); + + if ((s = sym_create (name, value))) + list_prepend (keytab, s); + + return (s); +} + +const struct sym * sym_define (char *name, const char *value) +{ + struct sym *s; + + /* + * Do not override a keyword with a symbol + */ + if (sym_lookup (keytab, name)) + return (NULL); + + if (!symtab) + symtab = list_create ((ListDelF) sym_destroy); + + if ((s = sym_lookup (symtab, name))) + sym_reset_value (s, value); + else if ((s = sym_create (name, value))) + list_prepend (symtab, s); + + return (s); +} + +static const struct sym * env_sym_create (char *name, const char *value) +{ + struct sym *s = NULL; + + if (envtab == NULL) + envtab = list_create ((ListDelF) sym_destroy); + + if ((s = sym_create (name, value))) + list_prepend (envtab, s); + else + log_err ("Failed to create env symbol \"%s\". Out of memory?", name); + + return (s); +} + +const struct sym * sym (char *name) +{ + const char *rv; + const struct sym *s; + + if ((s = sym_lookup (keytab, name))) + return (s); + + if ((s = sym_lookup (symtab, name))) + return (s); + + if ((rv = xgetenv (name))) + return (env_sym_create (name, rv)); + + return (NULL); +} + +void symtab_destroy () +{ + if (symtab) { + list_destroy (symtab); + symtab = NULL; + } + + if (envtab) { + list_destroy (envtab); + envtab = NULL; + } +} + +void keytab_destroy () +{ + if (keytab) { + list_destroy (keytab); + keytab = NULL; + } +} + +int print_sym (struct sym *s, void *arg) +{ + log_msg (" %s = \"%s\"\n", s->name, s->string); + return (0); +} + +void dump_symbols (void) +{ + log_msg ("Dumping symbols\n"); + list_for_each (symtab, (ListForF) print_sym, NULL); +} + +void dump_keywords (void) +{ + log_msg ("Dumping keywords\n"); + list_for_each (keytab, (ListForF) print_sym, NULL); +} + +/**************************************************************************** + * Initialization and Cleanup + 
****************************************************************************/ + +void lex_fini () +{ + symtab_destroy (); + + if (itemcache) { + list_destroy (itemcache); + itemcache = NULL; + } + + if (includes) { + list_destroy (includes); + includes = NULL; + } + + file_info_destroy (current); + current = NULL; +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/use-env/use-env-parser.y b/use-env/use-env-parser.y new file mode 100644 index 0000000..f1fbbea --- /dev/null +++ b/use-env/use-env-parser.y @@ -0,0 +1,676 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ ****************************************************************************/ + +%{ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "use-env.h" +#include "log_msg.h" +#include "list.h" + +#define YYDEBUG 1 +int yydebug = 0; + +extern int yylex (); +void yyerror (const char *); + +/* + * Set parser options from config file + */ +static int set_parser_option (const char *option, struct lex_item *x); +static int define_symbol (char *name, struct lex_item *x); + +/* + * Environment manipulation functions + */ +static int env_var_set (char *name, char *val, int op); +static int env_var_unset (char *name); + +/* + * Condition functions + */ +static int condition_push_if (int val); +static int condition_push_else_if (int val); +static int condition_push_else (); +static int condition_pop_endif (); +static int condition_pop (); +static int condition (); + +/* + * Special in-task block + */ +static int in_task_begin (); +static int in_task_end (); + +/* + * Item tests + */ +static int do_fnmatch (struct lex_item *x, struct lex_item *y); +static int item_defined (struct lex_item *i); +static int cmp_items (int cmp, struct lex_item *x, struct lex_item *y); +static int test_item (struct lex_item *i); +static int include_file (char *name); +static void dump_item (char *name); + + +struct parser_ctx { + int in_task; + struct use_env_ops *ops; + void *arg; +}; + +static struct parser_ctx ctx = { 0, NULL, NULL }; + + +%} + +%union { + int val; + struct lex_item *item; +} + +%token APPEND +%token PREPEND +%token UNSET +%token INCLUDE +%token COND_SET +%token IF +%token ELSE +%token ENDIF +%token AND +%token OR +%token DEFINED +%token PRINT +%token SET +%token DEF +%token UNDEF +%token DUMP +%token IN_TASK +%token MATCH +%token ITEM +%token EQ LT GT LE GE NE + +%type test tests cmp op + +%left EQ LT GT LE GE NE AND OR '!' 
%%

/*
 *  A config file is a (possibly empty) list of statements. The lexer's
 *   item cache is cleared each time a complete statement is reduced.
 */
stmts   : /* empty */
        | stmt                { lex_item_cache_clear (); }
        | stmts stmt          { lex_item_cache_clear (); }
        ;

stmt    : stmt_end
        | expr stmt_end
        | if_stmt stmt_end
        | in_task stmt_end
        | print stmt_end
        | error stmt_end
        | INCLUDE ITEM '\n'   { if (include_file ($2->name) < 0) YYABORT; }
        ;

stmt_end: '\n'
        | ';'
        ;

print   : PRINT ITEM          { if (condition()) printf ("%s\n", item_str ($2)); }
        ;

/*
 *  The action after ')' is a mid-rule action: the new condition is
 *   pushed before the statements of the `if' body are parsed.
 */
if_stmt : IF '(' tests ')'    { if (condition_push_if ($3) < 0) YYABORT; }
          '\n'
          stmts if_tail
        ;

/*
 *  In the `else if' alternative the mid-rule action { condition_pop (); }
 *   occupies position $3, which is why `tests' is referenced as $5.
 */
if_tail : ENDIF               { condition_pop_endif (); }

        | ELSE '\n'           { if (condition_push_else () < 0) YYABORT; }
          stmts ENDIF         { condition_pop_endif (); }

        | ELSE IF             { condition_pop (); }
          '(' tests ')'       { if (condition_push_else_if ($5) < 0) YYABORT; }
          '\n'
          stmts if_tail
        ;

in_task : IN_TASK             { in_task_begin (); }
          block               { in_task_end (); }
        | IN_TASK '\n'        { in_task_begin (); }
          block               { in_task_end (); }
        ;

block   : '{' stmts '}'
        ;

tests   : tests AND test      { $$ = ($1 && $3); }
        | tests OR test       { $$ = ($1 || $3); }
        | test
        ;

test    : ITEM cmp ITEM       { if (($$ = cmp_items ($2, $1, $3)) < 0) YYABORT; }
        | DEFINED ITEM        { if (($$ = item_defined ($2)) < 0) YYABORT; }
        | ITEM                { if (($$ = test_item ($1)) < 0) YYABORT; }
        | '(' test ')'        { $$ = $2; }
        | '!' test            { if (condition ()) $$ = !($2); else $$ = 0; }
        | ITEM MATCH ITEM     { if (($$ = do_fnmatch ($3, $1)) < 0) YYABORT; }
        ;


expr    : ITEM op ITEM        { env_var_set ($1->name, item_str ($3), $2); }
        | UNSET ITEM          { env_var_unset ($2->name); }
        | SET ITEM ITEM       { set_parser_option ($2->name, $3); }
        | DUMP ITEM           { dump_item ($2->name); }
        | DEF ITEM '=' ITEM   { if (define_symbol ($2->name, $4) < 0) YYABORT; }
        | UNDEF ITEM          { if (condition ()) sym_delete ($2->name); }
        ;

op      : '='                 { $$ = '='; }
        | COND_SET            { $$ = COND_SET; }
        | APPEND              { $$ = APPEND; }
        | PREPEND             { $$ = PREPEND; }
        ;

cmp     : EQ                  { $$ = EQ; }
        | LT                  { $$ = LT; }
        | GT                  { $$ = GT; }
        | LE                  { $$ = LE; }
        | GE                  { $$ = GE; }
        | NE                  { $$ = NE; }
        ;

%%

/*
 *  Parser error callback: forward the parser's message to the error log.
 */
void yyerror (const char *msg)
{
    log_err ("%s\n", msg);
}

/****************************************************************************
 *  Data Types
 ****************************************************************************/

/*
 *  One entry of the conditional stack.
 *   val:      nonzero if statements in the current block are evaluated.
 *   fallthru: nonzero once a branch of an if/else chain nested directly
 *             inside this block has been taken.
 */
struct cond {
    unsigned int val:1;
    unsigned int fallthru:1;
};

/****************************************************************************
 *  Global static variables
 ****************************************************************************/

static List cond_stack = NULL;

/****************************************************************************
 *  Includes
 ****************************************************************************/

/*
 *  Push include file `name' onto the lexer input, but only if the
 *   current condition is true. Returns the result of lex_include_push()
 *   or 0 if the include was skipped.
 */
static int include_file (char *name)
{
    if (!condition ())
        return (0);

    return (lex_include_push (name));
}

/****************************************************************************
 *  Item tests
 ****************************************************************************/

/*
 *  Match the string of item `y' against the fnmatch(3) pattern held
 *   in item `x'. Returns 1 on match, 0 otherwise (always 0 when the
 *   current condition is false).
 */
static int do_fnmatch (struct lex_item *x, struct lex_item *y)
{
    log_debug ("fnmatch (\"%s\", \"%s\")\n", item_str (x), item_str (y));

    if (!condition ())
        return (0);

    return (fnmatch (item_str (x), item_str (y), 0) == 0);
}

/*
 *  Implement the `defined' keyword. Returns -1 if the item is not
 *   a symbol, otherwise 1 if the symbol has a definition and 0 if not
 *   (always 0 when the current condition is false).
 */
static int item_defined (struct lex_item *i)
{
    if (i->type != TYPE_SYM) {
        log_err ("use of `defined' keyword on non-symbol \"%s\"\n", i->name);
        return (-1);
    }

    if (!condition ())
        return (0);

    return (i->val.sym != NULL);
}

/*
 *  Truth value of a bare item used as a test.
 */
static int test_item (struct lex_item *i)
{
    char *p;

    if (!condition ())
        return (0);

    /*
     *  Return 0 unless item is INT and non-zero, or
     *   item string is not empty.
     */
    if (item_type_int (i))
        return (item_val (i) != 0);

    p = item_str (i);
    return (p ? strlen (p) > 0 : 0);
}

/*
 *  Compare items `x' and `y' with comparison token `cmp'. Returns the
 *   result of item_cmp(), or 0 when the current condition is false.
 */
static int cmp_items (int cmp, struct lex_item *x, struct lex_item *y)
{
    if (condition () == 0)
        return (0);

    return (item_cmp (cmp, x, y));
}

/****************************************************************************
 *  Set parser options
 ****************************************************************************/

/*
 *  Handle "set <option> <value>". Only the "debuglevel" option is
 *   recognized, and its value must be an integer item.
 *   Returns 0 on success (or when skipped), -1 on error.
 */
static int set_parser_option (const char *option, struct lex_item *x)
{
    if (condition() == 0)
        return (0);

    if (strcmp (option, "debuglevel") != 0) {
        log_err ("Unknown option \"%s\" to set keyword\n", option);
        return (-1);
    }

    if (!item_type_int (x)) {
        log_err ("Invalid value in \"set debuglevel %s\"\n", item_str (x));
        return (-1);
    }

    log_msg_set_verbose (item_val (x));

    log_verbose ("set debuglevel %d\n", item_val (x));

    return (0);
}

/*
 *  Handle "dump <what>". Because the comparison length is strlen (name),
 *   any leading prefix of "symbols", "keywords" or "all" is accepted.
 */
static void dump_item (char *name)
{
    size_t len;

    if (condition() == 0)
        return;

    len = strlen (name);

    if (strncmp (name, "symbols", len) == 0)
        dump_symbols ();
    else if (strncmp (name, "keywords", len) == 0)
        dump_keywords ();
    else if (strncmp (name, "all", len) == 0) {
        dump_keywords ();
        dump_symbols ();
    }
    else
        log_err ("Invalid argument \"%s\" to `dump' command\n", name);

    return;
}

/*
 *  Handle "define <name> = <value>".
 *
 *  Returns 0 on success (or when the current condition is false) and
 *   -1 on failure. The grammar only aborts on a negative return value,
 *   so a failed sym_define() has to be reported as -1 rather than as
 *   a boolean.
 */
static int define_symbol (char *name, struct lex_item *x)
{
    if (!is_valid_identifier (name)) {
        log_err ("Unable to define invalid identifier \"%s\"\n", name);
        return (-1);
    }

    if (condition() == 0)
        return (0);

    log_verbose ("define %s = \"%s\"\n", name, item_str (x));

    if (sym_define (name, item_str (x)) == NULL) {
        log_err ("Failed to define symbol \"%s\"\n", name);
        return (-1);
    }

    return (0);
}

/****************************************************************************
 *  Environment manipulation
 ****************************************************************************/

/*
 *  The x*env() functions use the operations registered with
 *   use_env_set_operations() when the matching callback is present,
 *   and fall back to the libc function otherwise.
 */
const char * xgetenv (const char *name)
{
    if (ctx.ops && ctx.ops->getenv)
        return ((*ctx.ops->getenv) (ctx.arg, name));
    else
        return (getenv (name));
}

int xunsetenv (const char *name)
{
    /*
     *  Check the callback that is actually invoked here (unsetenv,
     *   not getenv) so a partially filled ops table cannot lead to
     *   a call through a NULL function pointer.
     */
    if (ctx.ops && ctx.ops->unsetenv)
        return ((*ctx.ops->unsetenv) (ctx.arg, name));
    else
        return (unsetenv (name));
}

int xsetenv (const char *name, const char *value, int overwrite)
{
    if (ctx.ops && ctx.ops->setenv)
        return ((*ctx.ops->setenv) (ctx.arg, name, value, overwrite));
    else
        return (setenv (name, value, overwrite));
}

/*
 *  Build a colon-separated value in `buf' from existing value `orig'
 *   and new value `val'. If `append' is nonzero the result is
 *   "orig:val", otherwise "val:orig". An empty `orig' yields just `val'.
 *
 *  Returns `buf', or NULL if the result would not fit in `size' bytes.
 */
static char * env_var_add (char *buf, size_t size,
        const char *orig, const char *val, int append)
{
    size_t vlen = strlen (val);
    size_t olen = strlen (orig);

    if (vlen >= size)
        return (NULL);

    if (olen == 0)
        return (strcpy (buf, val));

    /* Room for both strings, the ':' separator and the terminating NUL */
    if ((vlen + olen + 2) > size)
        return (NULL);

    if (append)
        snprintf (buf, size, "%s:%s", orig, val);
    else
        snprintf (buf, size, "%s:%s", val, orig);

    return (buf);
}

/*
 *  Handle "unset <name>". Returns 0 on success or when skipped,
 *   otherwise the return value of log_err().
 */
static int env_var_unset (char *name)
{
    if (!is_valid_identifier (name))
        return (log_err ("Invalid identifier \"%s\" in unset\n", name));

    if (condition () == 0)
        return (0);

    log_verbose ("unsetenv (%s)\n", name);

    /*
     *  Delete any references to this value in the local env_cache
     */
    env_cache_delete (name);

    if (xunsetenv (name) < 0)
        return ((log_err ("unsetenv (%s): %s\n", name, strerror (errno))));

    return (0);
}


/*
 *  Handle "<name> <op> <value>", where `op' is one of '=', COND_SET,
 *   APPEND or PREPEND.
 *
 *  COND_SET sets the variable without overwriting an existing value.
 *   APPEND/PREPEND combine `val' with the current value (if any) using
 *   env_var_add(); if the combined value does not fit the local buffer
 *   an error is logged and the variable is left untouched.
 */
static int env_var_set (char *name, char *val, int op)
{
    char buf [4096];
    const char *orig = NULL;
    char *newval = val;
    int overwrite = 1;

    if (!is_valid_identifier (name))
        return (log_err ("Invalid identifier \"%s\" in expression\n", name));

    if (condition () == 0)
        return (0);

    if (op == COND_SET)
        overwrite = 0;

    if (((op == APPEND) || (op == PREPEND)) && (orig = xgetenv (name))) {
        newval = env_var_add (buf, sizeof (buf), orig, val, op == APPEND);
        /*
         *  env_var_add() returns NULL on overflow; never hand that
         *   NULL to log_verbose ("%s") or xsetenv().
         */
        if (newval == NULL)
            return (log_err ("Value too long to %s to \"%s\"\n",
                        (op == APPEND) ? "append" : "prepend", name));
    }

    /*
     *  Delete any references to this value in the local env_cache
     */
    env_cache_delete (name);

    log_verbose ("setenv (%s, \"%s\", overwrite=%d)\n",
            name, newval, overwrite);

    return (xsetenv (name, newval, overwrite));
}


/****************************************************************************
 *  Conditional stack
 ****************************************************************************/

/*
 *  Allocate a stack entry with value `v' and fallthru cleared.
 *   Returns NULL if allocation fails.
 */
static struct cond * cond_create (int v)
{
    struct cond *c = malloc (sizeof (*c));

    if (c == NULL)
        return (NULL);

    /*
     *  val is a 1-bit field: normalize so that any nonzero value
     *   (not only odd ones) is stored as true.
     */
    c->val = (v != 0);
    c->fallthru = 0;
    return (c);
}

static void cond_destroy (struct cond *c)
{
    free (c);
}

/*
 *  The helpers below operate on the top of the conditional stack and
 *   quietly do nothing (or report false) if the stack is empty.
 */
static void condition_fallthru_set (void)
{
    struct cond *c = list_peek (cond_stack);
    if (c != NULL)
        c->fallthru = 1;
}

static void condition_fallthru_clear (void)
{
    struct cond *c = list_peek (cond_stack);
    if (c != NULL)
        c->fallthru = 0;
}

static int condition_fallthru (void)
{
    struct cond *c = list_peek (cond_stack);
    return (c != NULL && c->fallthru);
}

int condition (void)
{
    struct cond *c = list_peek (cond_stack);
    /*
     *  The current condition value must be true
     *   AND fallthru must NOT be set in order
     *   to evaluate expressions.
     *
     *  If fallthru is set this means we are in
     *   the middle of evaluating an if/else(if)*
     *   within this block, and the true condition
     *   was already evaluated. Thus we no longer need
     *   to evaluate expressions in else if's for this
     *   block.
     */
    if (c == NULL)
        return (0);

    return (c->val && !c->fallthru);
}

/*
 *  Push a new condition with value `val'. Returns the value stored
 *   (0 or 1), or -1 if the entry could not be allocated.
 */
static int condition_push_val (int val)
{
    struct cond *c;

    if (!(c = cond_create (val)))
        return (-1);

    log_debug2 ("Pushing new condition %d\n", val);

    list_push (cond_stack, c);

    return (c->val);
}

void condition_init (void)
{
    cond_stack = list_create ((ListDelF) cond_destroy);

    /*
     *  Initial condition is false if we're in task context.
     */
    if (ctx.in_task)
        condition_push_val (0);
    else
        condition_push_val (1);
}

void condition_fini (void)
{
    if (cond_stack) {
        list_destroy (cond_stack);
        cond_stack = NULL;
    }
}

/*
 *  Pop and free the top of the conditional stack. Returns the popped
 *   value, or the return value of log_err() if the stack was empty.
 */
static int condition_pop (void)
{
    int rv;
    struct cond *c;


    if (!(c = list_pop (cond_stack)))
        return (log_err ("else/endif without if"));

    log_debug2 ("Popped old condition %d\n", c->val);

    rv = c->val;
    cond_destroy (c);

    return (rv);
}

static int condition_pop_endif (void)
{
    int rv = condition_pop ();
    /*
     *  endif resets fallthru state
     */
    condition_fallthru_clear ();
    return (rv);
}

static int condition_push_if (int val)
{
    /*
     *  If this `if' statement is true, then update current
     *   fall thru state to true so we fall through any
     *   subsequent else statements (and don't evaluate
     *   any else if expressions).
     */
    if (val)
        condition_fallthru_set ();

    return (condition_push_val (val));
}

static int condition_push_else (void)
{
    int val;

    /*
     *  Discard the condition of the preceding if/else-if branch; its
     *   value is not needed because a taken branch is already
     *   recorded as fallthru on the enclosing block.
     */
    (void) condition_pop ();

    /*
     *  The else branch is evaluated only if the enclosing block is
     *   being evaluated (condition == true) and no earlier branch
     *   of this chain was taken (fallthru not set).
     */
    val = (condition () && !condition_fallthru ());

    return (condition_push_val (val));
}

static int condition_push_else_if (int val)
{
    if (!condition () || condition_fallthru ())
        val = 0;
    else if (val != 0)
        condition_fallthru_set ();

    return (condition_push_val (val));
}

/****************************************************************************
 *  In-task support
 ****************************************************************************/

/*
 *  An `in task' block is evaluated only when ctx.in_task is nonzero.
 */
static int in_task_begin (void)
{
    log_debug ("Found `in task' block: in_task = %d\n", ctx.in_task);
    return condition_push_val (ctx.in_task);
}

static int in_task_end (void)
{
    return condition_pop ();
}


/****************************************************************************
 *  Initialization and Cleanup
 ****************************************************************************/

void use_env_parser_init (int in_task)
{
    ctx.in_task = in_task;
    /*
     *  Keytab created on-demand
     */
    condition_init ();
}

void use_env_set_operations (struct use_env_ops *ops, void *arg)
{
    ctx.ops = ops;
    ctx.arg = arg;
}

/*
 *  Parse config file `filename'. Returns 0 on success, -1 if the file
 *   could not be opened by the lexer or the parser reported failure.
 */
int use_env_parse (const char *filename)
{
    if (lex_file_init (filename) < 0) {
        log_err ("Failed to open config file %s\n", filename);
        return (-1);
    }

    /*
     *  NOTE(review): lex_fini () is not called when yyparse () fails;
     *   confirm whether the lexer needs cleanup on that path.
     */
    if (yyparse ()) {
        log_err ("%s: Parser failed.\n", filename);
        return (-1);
    }

    lex_fini ();

    return (0);
}

void use_env_parser_fini (void)
{
    condition_fini ();
    keytab_destroy ();
}

/*
 * vi: ts=4 sw=4 expandtab
 */
diff --git a/use-env/use-env.c b/use-env/use-env.c
new file mode 100644
index 0000000..d91d15b
--- /dev/null
+++
b/use-env/use-env.c @@ -0,0 +1,460 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + ****************************************************************************/ + +#include +#include +#include +#include +#include + +#include + +#include "use-env.h" +#include "list.h" +#include "split.h" +#include "log_msg.h" + +#define NO_SEARCH_SYSTEM 1<<0 +#define NO_SEARCH_USER 1<<1 + +SPANK_PLUGIN(use-env, 1) + +/**************************************************************************** + * Static Variables + ****************************************************************************/ + +static int local_user_cb_supported = 0; /* 1 if spank_local_user is avail*/ + +static int disable_in_task = 0; /* Don't run in task if nonzero */ +static char * default_name = "default"; /* Name of system default file */ +static List env_list = NULL; /* Global list of files to read */ + + +/**************************************************************************** + * Wrappers for spank environment manipulation + ****************************************************************************/ 
+ +static int use_env_setenv (spank_t, const char *, const char *, int); +static int use_env_unsetenv (spank_t, const char *); +static const char *use_env_getenv (spank_t, const char *); + +static struct use_env_ops spank_env_ops = { + (getenv_f) use_env_getenv, + (setenv_f) use_env_setenv, + (unsetenv_f) use_env_unsetenv +}; + +/**************************************************************************** + * SPANK Options + ****************************************************************************/ + +static int use_env_opt_process (int val, char *optarg, int remote); + +struct spank_option spank_options[] = +{ + { "use-env", "[name]", + "Read env from ~/.slurm/environment/[name] or " + "/etc/slurm/environment/[name]", 1, 0, + (spank_opt_cb_f) use_env_opt_process + }, + SPANK_OPTIONS_TABLE_END +}; + +/**************************************************************************** + * Forward Declarations + ****************************************************************************/ + +static int check_local_user_symbol (); +static int use_env_debuglevel (); +static int process_args (int ac, char **av); +static char * env_override_file_search (char *, size_t, const char *, int); +static int do_env_override (const char *path, spank_t sp); +static int define_all_keywords (spank_t sp); + +/**************************************************************************** + * SPANK Functions + ****************************************************************************/ + +/* + * slurm_spank_init is called as root in slurmd, but I don't + * think this matters in this case because all we do here + * is initialize the parser and search for default environment + * override files. Maybe later this can be duplicated in + * slurm_spank_user_init for safety. 
+ */ +int slurm_spank_init (spank_t sp, int ac, char **av) +{ + char buf [4096]; + size_t len = sizeof (buf); + + check_local_user_symbol (); + + if (process_args (ac, av) < 0) + return (-1); + + env_list = list_create ((ListDelF) free); + + /* + * Set environment access functions to spank versions + * if we're running remotely. + */ + if (spank_remote (sp)) + use_env_set_operations (&spank_env_ops, sp); + + /* + * Check for default files in the following order: + * /etc/slurm/environment/default || /etc/slurm/env-default.conf + * ~/.slurm/environment/default || ~/.slurm/env-default.conf + */ + if (env_override_file_search (buf, len, default_name, NO_SEARCH_USER)) + list_append (env_list, strdup (buf)); + + /* + * Always use name "default" for user default environment + */ + if (env_override_file_search (buf, len, "default", NO_SEARCH_SYSTEM)) + list_append (env_list, strdup (buf)); + + /* + * Initialize logging and parser: + */ + log_msg_init ("use-env"); + use_env_parser_init (spank_remote (sp)); + log_msg_set_verbose (use_env_debuglevel ()); + + /* + * if we don't have the local_user callback, then we have + * to instantiate the default environment here. + */ + if (!local_user_cb_supported && !spank_remote (sp)) { + list_for_each (env_list, (ListForF) do_env_override, NULL); + list_destroy (env_list); + } + + return (0); +} + +int slurm_spank_local_user_init (spank_t sp, int ac, char **av) +{ + if (define_all_keywords (sp) < 0) + return (-1); + + list_for_each (env_list, (ListForF) do_env_override, NULL); + list_destroy (env_list); + + return (0); +} + +int slurm_spank_task_init (spank_t sp, int ac, char **av) +{ + /* + * Reset operations to make sure the right spank handle is + * available. 
+ */ + use_env_set_operations (&spank_env_ops, sp); + + if (define_all_keywords (sp) < 0) + return (-1); + + list_for_each (env_list, (ListForF) do_env_override, (void *) sp); + list_destroy (env_list); + return (0); +} + +int slurm_spank_exit (spank_t sp, int ac, char **av) +{ + use_env_parser_fini (); + log_msg_fini (); + return (0); +} + +/**************************************************************************** + * Static Functions + ****************************************************************************/ + +static int check_local_user_symbol () +{ + int (*sym_supported) (const char *); + + if ( (sym_supported = dlsym (NULL, "spank_symbol_supported")) + && (*sym_supported) ("slurm_spank_local_user_init")) + local_user_cb_supported = 1; + else + slurm_debug3 ("use-env: slurm_spank_local_user_init not supported"); + + return (0); +} + +static int use_env_debuglevel () +{ + const char *val; + int rv = 0; + + if ((val = xgetenv ("SPANK_USE_ENV_DEBUG"))) { + char *p; + long n = strtol (val, &p, 10); + if (p && (*p == '\0')) + rv = n; + else + slurm_error ("Invalid value %s for SPANK_USE_ENV_DEBUG", val); + } + + return (rv); +} + +static char * +env_override_file_search (char *path, size_t len, const char *name, int flags) +{ + const char *home; + int check_user = !(flags & NO_SEARCH_USER); + int check_sys = !(flags & NO_SEARCH_SYSTEM); + + if (check_user && (home = xgetenv ("HOME"))) { + snprintf (path, len, "%s/.slurm/environment/%s", home, name); + if (access (path, R_OK) >= 0) + return (path); + snprintf (path, len, "%s/.slurm/env-%s.conf", home, name); + if (access (path, R_OK) >= 0) { + return (path); + } + } + + if (check_sys) { + snprintf (path, len, "/etc/slurm/environment/%s", name); + if (access (path, R_OK) >= 0) + return (path); + snprintf (path, len, "/etc/slurm/env-%s.conf", name); + if (access (path, R_OK) >= 0) + return (path); + } + + return (NULL); +} + +static int do_env_override (const char *path, spank_t sp) +{ + slurm_verbose 
("use_env_parse (%s)", path); + + if (use_env_parse (path) < 0) { + slurm_error ("--use-env: Errors reading %s\n", path); + return (-1); + } + return (0); +} + +static int path_cmp (char *x, char *y) +{ + return (strcmp (x, y) == 0); +} + +static int check_and_append_env_opt (char *name, List l) +{ + int rc = 0; + char buf [4096]; + size_t len = sizeof (buf); + + if (!env_override_file_search (buf, len, name, 0)) { + slurm_error ("use-env: Unable to find env override file \"%s\"", name); + return (-1); + } + + /* + * If we don't have the local_user callback, then we have + * to call do_env_override immediately + */ + if (!local_user_cb_supported) + rc = do_env_override (buf, NULL); + else if (!list_find_first (env_list, (ListFindF) path_cmp, buf)) + list_append (env_list, strdup (buf)); + + return (rc); +} + +static int use_env_opt_process (int val, char *optarg, int remote) +{ + List l; + + if (optarg == NULL) { + slurm_error ("--use-env: Invalid argument"); + return (-1); + } + + l = list_split (",", optarg); + if (list_for_each (l, (ListForF) check_and_append_env_opt, env_list) < 0) + return (-1); + list_destroy (l); + + return (0); +} + +static int +define_use_env_keyword (spank_t sp, char *name, spank_item_t item) +{ + int n; + int val; + char buf [64]; + + if (spank_get_item (sp, item, &val) != ESPANK_SUCCESS) { + slurm_error ("use-env: spank_get_item failed for %s\n", name); + return (-1); + } + + n = snprintf (buf, sizeof (buf), "%u", val); + + if ((n < 0) || (n >= sizeof (buf))) { + slurm_error ("use-env: value of %s too large for buffer\n", name); + return (-1); + } + + if (keyword_define (name, buf) == NULL) + return (-1); + + return (0); +} + +static int set_argv_keywords (spank_t sp) +{ + char cmdline [4096]; + char buf [64]; + const char **av; + int ac; + int i; + int n; + + if (spank_get_item (sp, S_JOB_ARGV, &ac, &av) != ESPANK_SUCCESS) { + slurm_error ("use-env: spank_get_item failed for argv"); + return (-1); + } + + n = snprintf (buf, sizeof 
(buf), "%d", ac); + + if ((n < 0) || (n >= sizeof (buf))) { + slurm_error ("use-env: value of ARGC too large"); + return (-1); + } + + keyword_define ("SLURM_ARGC", buf); + + memset (cmdline, 0, sizeof (cmdline)); + + for (i = 0; i < ac; i++) { + snprintf (buf, sizeof (buf), "SLURM_ARGV%d", i); + keyword_define (buf, av[i]); + + if ((n = strlen (cmdline)) != 0) { + strcat (cmdline, " "); + n++; + } + + if (sizeof (cmdline) > (n + strlen (av[i]) + 1)) + strcat (cmdline, av[i]); + } + + keyword_define ("SLURM_CMDLINE", cmdline); + + return (0); +} + +static int define_all_keywords (spank_t sp) +{ + /* + * These keywords are only accessible from this context + */ + if (define_use_env_keyword (sp, "SLURM_NNODES", S_JOB_NNODES) < 0) + return (-1); + if (define_use_env_keyword (sp, "SLURM_NPROCS", S_JOB_TOTAL_TASK_COUNT) < 0) + return (-1); + if (define_use_env_keyword (sp, "SLURM_JOBID", S_JOB_ID) < 0) + return (-1); + if (define_use_env_keyword (sp, "SLURM_STEPID", S_JOB_STEPID) < 0) + return (-1); + + if (set_argv_keywords (sp) < 0) + return (-1); + + if (!spank_remote (sp)) + return (0); + + if (define_use_env_keyword (sp, "SLURM_PROCID", S_TASK_GLOBAL_ID) < 0) + return (-1); + if (define_use_env_keyword (sp, "SLURM_LOCALID", S_TASK_ID) < 0) + return (-1); + if (define_use_env_keyword (sp, "SLURM_NODEID", S_JOB_NODEID) < 0) + return (-1); + + return (0); +} + +static int process_args (int ac, char **av) +{ + int i; + for (i = 0; i < ac; i++) { + if (strncmp ("default=", av[i], 8) == 0) + default_name = av[i] + 8; + else if (strcmp ("disable_in_task", av[i]) == 0) + disable_in_task = 1; + else { + slurm_error ("use-env: Invalid option \"%s\"", av[i]); + return (-1); + } + } + + return (0); +} + +/**************************************************************************** + * Environment manipulation wrappers + ****************************************************************************/ + +static const char *use_env_getenv (spank_t sp, const char *name) +{ + static 
char buf [4096]; + + memset (buf, 0, sizeof (buf)); + + if (spank_getenv (sp, name, buf, sizeof (buf)) != ESPANK_SUCCESS) + return (NULL); + + return (buf); +} + +static int use_env_unsetenv (spank_t sp, const char *name) +{ + if (spank_unsetenv (sp, name) != ESPANK_SUCCESS) + return (-1); + return (0); +} + + +static int use_env_setenv (spank_t sp, const char *name, const char *val, + int overwrite) +{ + if (spank_setenv (sp, name, val, overwrite) != ESPANK_SUCCESS && overwrite) + return (-1); + return (0); +} + +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/use-env/use-env.h b/use-env/use-env.h new file mode 100644 index 0000000..ee1b068 --- /dev/null +++ b/use-env/use-env.h @@ -0,0 +1,123 @@ +/***************************************************************************** + * + * Copyright (C) 2007-2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory. + * Written by Mark Grondona . + * + * UCRL-CODE-235358 + * + * This file is part of chaos-spankings, a set of spank plugins for SLURM. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ ****************************************************************************/ + +#ifndef _USE_ENV_H +#define _USE_ENV_H + +enum { TYPE_STR, TYPE_INT, TYPE_SYM }; +enum { SYM_INT, SYM_STR }; + +struct lex_item { + int used; /* Is item still used (for item cache) */ + char * name; /* Name of item */ + int type; /* Type of item (int, string, symbol) */ + char * str; /* String representation of item */ + + union { /* Union of different item types */ + int num; + char *str; + const struct sym *sym; + } val; +}; + +struct sym { + char * name; /* Name of symbol */ + int type; /* Type of symbol (INT || STRING) */ + int val; /* Value if type is INT */ + char * string; /* String representation */ +}; + +typedef char * (*getenv_f) (void *arg, const char *name); +typedef int (*unsetenv_f) (void *arg, const char *name); +typedef int (*setenv_f) (void *arg, const char *name, + const char *value, int overwrite); + +struct use_env_ops { + getenv_f getenv; + setenv_f setenv; + unsetenv_f unsetenv; +}; + +/* + * Environment manipulation + */ +const char * xgetenv (const char *name); +int xunsetenv (const char *name); +int xsetenv (const char *name, const char *value, int overwrite); + + +/* + * Parser operations: + */ +void use_env_parser_init (); +void use_env_set_operations (struct use_env_ops *ops, void *arg); +int use_env_parse (const char *filename); +void use_env_parser_fini (); + +/* + * Lexer cleanup + */ +void lex_fini (); + +/* + * lex_item functions + */ +void lex_item_cache_clear (); +struct lex_item * lex_item_create (char *name, int type); +int is_valid_identifier (const char *s); + +int item_cmp (int cmp, struct lex_item *x, struct lex_item *y); +int item_strcmp (struct lex_item *x, struct lex_item *y); +char * item_str (struct lex_item *item); +int item_val (struct lex_item *item); +int item_type_int (struct lex_item *i); + +/* + * symbol lookup and definition functions + */ +const struct sym * sym (char *name); +const struct sym * sym_define (char *name, const 
char *value); +const struct sym * keyword_define (char *name, const char *value); +int sym_delete (char *name); +int env_cache_delete (char *name); +void symtab_destroy (); +void keytab_destroy (); +void dump_keywords (); +void dump_symbols (); + +/* + * include file functions + */ +int lex_file_init (const char *file); +int lex_include_push (const char *include); +int lex_include_pop (); + +const char *lex_file (); +int lex_line (); +int lex_line_increment (); + +#endif +/* + * vi: ts=4 sw=4 expandtab + */ diff --git a/use-env/version.map b/use-env/version.map new file mode 100644 index 0000000..e234ff4 --- /dev/null +++ b/use-env/version.map @@ -0,0 +1,9 @@ +{ global: + plugin_name; + plugin_type; + plugin_version; + spank*; + slurm_spank*; + local: + *; +};