response error when submitting and waiting on batch jobs over SSH #3156
-
Not sure what the true source of the error is. It could be how we've structured the submission. It could be the use of the SSH URI proxy. It could be something else entirely. The idea started as trying to do the API equivalent of `flux proxy flux mini batch`. Here is the Python script we are using to do the submission from outside the Flux instance:
#!/usr/bin/env python3
import json
import os
import sys
import flux
from flux import job
from flux.job import JobspecV1
# open connection to broker
uri = "ssh://quartz44/var/tmp/herbein1/flux-zyJfaP/0/local"
h = flux.Flux(uri)
jobspec = JobspecV1.from_command(
command=["./CDT3Docking.job"],
num_tasks=1, num_nodes=1, cores_per_task=1
)
#jobspec.duration = "16h"
jobspec.environment = dict(os.environ)
jobspec.environment["PATH"] = "/collab/usr/global/tools/flux/toss_3_x86_64_ib/flux-c0.18.0-s0.10.0/bin/:" + jobspec.environment["PATH"]
jobspec.cwd = os.getcwd()
jobspec.stdout = "flux-{{id}}.out"
# submit job
jobid = flux.job.submit(h, jobspec, waitable=True)
print("submit: {} jobspec".format(jobid))
# wait for job to complete
#result = job.wait(h, jobid)
result = job.wait(h)
if result.success:
print("wait: {} Success".format(result.jobid))
else:
print("wait: {} Error: {}".format(result.jobid, result.errstr))
The contents of CDT3Docking.job:
which flux
flux mini run -N1 -n1 hostname
echo "Successfully ran!"
The output on the node outside the Flux instance
The output on the node within the Flux instance (rank 0 pty)
The jobspec that was generated for the job: {"resources": [{"type": "node", "count": 1, "with": [{"type": "slot", "count": 1, "with": [{"type": "core", "count": 1}], "label": "task"}]}], "tasks": [{"command": ["./CDT3Docking.job"], "slot": "task", "count": {"per_slot": 1}}], "at
tributes": {"system": {"duration": 0, "environment": {"MANPATH": "/collab/usr/global/tools/flux/toss_3_x86_64_ib/flux-c0.18.0-s0.10.0/share/man:/usr/tce/packages/dotkit/dotkit/man:/usr/man:/usr/share/man:/usr/local/man:/usr/X11R6/man:/
usr/lib64/mvapich/default/man", "FLUX_MODULE_PATH": "/collab/usr/global/tools/flux/toss_3_x86_64_ib/flux-c0.18.0-s0.10.0/lib/flux/modules", "GUESTFISH_INIT": "\\e[1;34m", "HOSTNAME": "quartz2498", "SPACK_ROOT": "/g/g0/herbein1/opt/pack
ages/toss3/spack", "_ModuleTable003_": "ci5sdWEiLFsiZnVsbE5hbWUiXT0iZ2l0aHViY2xpL21hc3RlciIsWyJsb2FkT3JkZXIiXT0zLHByb3BUPXt9LFsic3RhY2tEZXB0aCJdPTEsWyJzdGF0dXMiXT0iYWN0aXZlIixbInVzZXJOYW1lIl09ImdpdGh1YmNsaSIsfSx9LG1wYXRoQT17Ii91c3IvdGN
lL21vZHVsZWZpbGVzL0NvbXBpbGVyL2djYy80LjkuMyIsIi9nL2cwL2hlcmJlaW4xL29wdC9tb2R1bGVmaWxlcy90b3NzMy9MaW51eCIsIi9nL2cwL2hlcmJlaW4xL29wdC9tb2R1bGVmaWxlcy90b3NzMy9Db3JlIiwiL3Vzci90Y2UvbW9kdWxlZmlsZXMvQ29yZSIsIi91c3IvYXBwcy9tb2R1bGVmaWxlcyIsIi
91c3Ivc2hhcmUvbW9kdWxlZmlsZXMvTGludXgiLCIvdXNyL3NoYXJlL21vZHVsZWZpbGVzL0NvcmUiLCIv", "SHELL": "/bin/zsh", "TERM": "xterm-256color", "__LMOD_REF_COUNT_MODULEPATH": "/g/g0/herbein1/opt/modulefiles/toss3/Linux:2;/g/g0/herbein1/opt/modulef
iles/toss3/Core:2;/usr/tce/modulefiles/Core:1;/usr/apps/modulefiles:1;/usr/share/modulefiles/Linux:1;/usr/share/modulefiles/Core:1;/usr/share/lmod/lmod/modulefiles/Core:1", "HISTSIZE": "50000", "LMOD_ROOT": "/usr/share/lmod", "WISECONF
IGDIR": "/usr/share/wise2/", "LMOD_SYSTEM_DEFAULT_MODULES": "StdEnv", "MODULEPATH_ROOT": "/usr/share/modulefiles", "SSH_CLIENT": "134.9.49.5 43164 622", "TMPDIR": "/var/tmp/herbein1", "CONDA_SHLVL": "0", "LIBRARY_PATH": "/g/g0/herbein1
/opt/packages/toss3/spack-views/daily-driver-2020-07-24/lib:/usr/lib64", "FPATH": "/usr/tce/packages/dotkit/dotkit/ksh:/usr/share/zsh/site-functions:/usr/share/zsh/5.0.2/functions:/g/g0/herbein1/.antigen/bundles/robbyrussell/oh-my-zsh/
lib:/g/g0/herbein1/.antigen/bundles/agkozak/agkozak-zsh-prompt", "LMOD_PKG": "/usr/share/lmod/lmod", "QTDIR": "/usr/lib64/qt-3.3", "LMOD_VERSION": "8.2.7", "QTINC": "/usr/lib64/qt-3.3/include", "SSH_TTY": "/dev/pts/86", "LC_ALL": "en_U
S.UTF-8", "HISTFILESIZE": "1000000", "USER": "herbein1", "LD_LIBRARY_PATH": "/usr/lib64", "LMOD_sys": "Linux", "LS_COLORS": "rs=0:di=38;5;27:ln=38;5;51:mh=44;38;5;15:pi=40;38;5;11:so=38;5;13:do=38;5;5:bd=48;5;232;38;5;11:cd=48;5;232;38
;5;3:or=48;5;232;38;5;9:mi=05;48;5;232;38;5;15:su=48;5;196;38;5;15:sg=48;5;11;38;5;16:ca=48;5;196;38;5;226:tw=48;5;10;38;5;16:ow=48;5;10;38;5;21:st=48;5;21;38;5;15:ex=38;5;34:*.tar=38;5;9:*.tgz=38;5;9:*.arc=38;5;9:*.arj=38;5;9:*.taz=38
;5;9:*.lha=38;5;9:*.lz4=38;5;9:*.lzh=38;5;9:*.lzma=38;5;9:*.tlz=38;5;9:*.txz=38;5;9:*.tzo=38;5;9:*.t7z=38;5;9:*.zip=38;5;9:*.z=38;5;9:*.Z=38;5;9:*.dz=38;5;9:*.gz=38;5;9:*.lrz=38;5;9:*.lz=38;5;9:*.lzo=38;5;9:*.xz=38;5;9:*.bz2=38;5;9:*.b
z=38;5;9:*.tbz=38;5;9:*.tbz2=38;5;9:*.tz=38;5;9:*.deb=38;5;9:*.rpm=38;5;9:*.jar=38;5;9:*.war=38;5;9:*.ear=38;5;9:*.sar=38;5;9:*.rar=38;5;9:*.alz=38;5;9:*.ace=38;5;9:*.zoo=38;5;9:*.cpio=38;5;9:*.7z=38;5;9:*.rz=38;5;9:*.cab=38;5;9:*.jpg=
38;5;13:*.jpeg=38;5;13:*.gif=38;5;13:*.bmp=38;5;13:*.pbm=38;5;13:*.pgm=38;5;13:*.ppm=38;5;13:*.tga=38;5;13:*.xbm=38;5;13:*.xpm=38;5;13:*.tif=38;5;13:*.tiff=38;5;13:*.png=38;5;13:*.svg=38;5;13:*.svgz=38;5;13:*.mng=38;5;13:*.pcx=38;5;13:
*.mov=38;5;13:*.mpg=38;5;13:*.mpeg=38;5;13:*.m2v=38;5;13:*.mkv=38;5;13:*.webm=38;5;13:*.ogm=38;5;13:*.mp4=38;5;13:*.m4v=38;5;13:*.mp4v=38;5;13:*.vob=38;5;13:*.qt=38;5;13:*.nuv=38;5;13:*.wmv=38;5;13:*.asf=38;5;13:*.rm=38;5;13:*.rmvb=38;
5;13:*.flc=38;5;13:*.avi=38;5;13:*.fli=38;5;13:*.flv=38;5;13:*.gl=38;5;13:*.dl=38;5;13:*.xcf=38;5;13:*.xwd=38;5;13:*.yuv=38;5;13:*.cgm=38;5;13:*.emf=38;5;13:*.axv=38;5;13:*.anx=38;5;13:*.ogv=38;5;13:*.ogx=38;5;13:*.aac=38;5;45:*.au=38;
5;45:*.flac=38;5;45:*.mid=38;5;45:*.midi=38;5;45:*.mka=38;5;45:*.mp3=38;5;45:*.mpc=38;5;45:*.ogg=38;5;45:*.ra=38;5;45:*.wav=38;5;45:*.axa=38;5;45:*.oga=38;5;45:*.spx=38;5;45:*.xspf=38;5;45:", "GIT_SSH": "/g/g0/herbein1/.dotfiles/vendor
/ssh-ident/ssh-ident", "_dk_shell": "ksh", "AGKOZAK_GIT_VERSION": "2.27.0", "ENV": "/g/g0/herbein1/.bashrc", "PFTP_CONFIG_FILENAME": "/etc/pftp_config", "HOST_GRP": "linux", "FLUX_SEC_DIRECTORY": "/g/g0/herbein1/.flux", "PAGER": "less"
, "VIRTUAL_ENV_DISABLE_PROMPT": "1", "FLUX_CONNECTOR_PATH": "/collab/usr/global/tools/flux/toss_3_x86_64_ib/flux-c0.18.0-s0.10.0/lib/flux/connectors", "DOTFILES": "/g/g0/herbein1/.dotfiles", "TMUX": "/tmp/tmux-51249/default,27638,0", "
DK_UEQRU": "1", "GUESTFISH_PS1": "\\[\\e[1;32m\\]><fs>\\[\\e[0;31m\\] ", "LMOD_PREPEND_BLOCK": "normal", "LSCOLORS": "Gxfxcxdxbxegedabagacad", "MAIL": "/var/spool/mail/herbein1", "PATH": "/collab/usr/global/tools/flux/toss_3_x86_64_ib/
flux-c0.18.0-s0.10.0/bin:/g/g0/herbein1/Repositories/llnl-scripts:/g/g0/herbein1/.cargo/bin:/g/g0/herbein1/.dotfiles/bin:/usr/lib64/qt-3.3/bin:/usr/condabin:/usr/local/bin:/usr/bin:/g/g0/herbein1/bin:/usr/local/sbin:/usr/sbin:/g/g0/her
bein1/.antigen/bundles/robbyrussell/oh-my-zsh/lib:/g/g0/herbein1/.antigen/bundles/agkozak/agkozak-zsh-prompt:/collab/usr/global/tools/flux/toss_3_x86_64_ib/flux-c0.18.0-s0.10.0/bin/", "_ModuleTable001_": "X01vZHVsZVRhYmxlXz17WyJNVHZlcn
Npb24iXT0zLFsiY19yZWJ1aWxkVGltZSJdPWZhbHNlLFsiY19zaG9ydFRpbWUiXT1mYWxzZSxkZXB0aFQ9e30sZmFtaWx5PXt9LG1UPXt9LG1wYXRoQT17Ii9nL2cwL2hlcmJlaW4xL29wdC9tb2R1bGVmaWxlcy90b3NzMy9MaW51eCIsIi9nL2cwL2hlcmJlaW4xL29wdC9tb2R1bGVmaWxlcy90b3NzMy9Db3JlI
iwiL3Vzci90Y2UvbW9kdWxlZmlsZXMvQ29yZSIsIi91c3IvYXBwcy9tb2R1bGVmaWxlcyIsIi91c3Ivc2hhcmUvbW9kdWxlZmlsZXMvTGludXgiLCIvdXNyL3NoYXJlL21vZHVsZWZpbGVzL0NvcmUiLCIvdXNyL3NoYXJlL2xtb2QvbG1vZC9tb2R1bGVmaWxlcy9Db3JlIix9LFsic3lzdGVtQmFzZU1QQVRIIl09
Ii91c3IvdGNl", "LCSCHEDCLUSTER": "quartz", "LMOD_SETTARG_CMD": ":", "FLUX_PMI_LIBRARY_PATH": "/collab/usr/global/tools/flux/toss_3_x86_64_ib/flux-c0.18.0-s0.10.0/lib/flux/libpmi.so", "INPUTRC": "/etc/inputrc", "PWD": "/g/g0/herbein1/Re
positories/flux-framework/vinaLC", "EDITOR": "emacsclient -nw", "LANG": "en_US.UTF-8", "LUA_PATH": "/collab/usr/global/tools/flux/toss_3_x86_64_ib/flux-c0.18.0-s0.10.0/share/lua/5.1/?.lua;;;", "MODULEPATH": "/g/g0/herbein1/opt/modulefi
les/toss3/Linux:/g/g0/herbein1/opt/modulefiles/toss3/Core:/usr/tce/modulefiles/Core:/usr/apps/modulefiles:/usr/share/modulefiles/Linux:/usr/share/modulefiles/Core:/usr/share/lmod/lmod/modulefiles/Core", "LUA_CPATH": "/collab/usr/global
/tools/flux/toss_3_x86_64_ib/flux-c0.18.0-s0.10.0/lib/lua/5.1/?.so;;;", "GUESTFISH_OUTPUT": "\\e[0m", "KDEDIRS": "/usr", "_ModuleTable_Sz_": "2", "TMUX_PANE": "%3", "DK_SUBNODE": "ksh/toss_3_x86_64_ib:ksh:toss_3_x86_64_ib:.", "LMOD_CMD
": "/usr/share/lmod/lmod/libexec/lmod", "HISTIGNORE": "ls:bg:fg:history:hist", "ENVIRONMENT": "INTERACTIVE", "HISTCONTROL": "ignorespace", "SSH_ASKPASS": "/usr/libexec/openssh/gnome-ssh-askpass", "_dk_inuse": "lcinit.0 reuse.1", "FLUX_
EXEC_PATH": "/collab/usr/global/tools/flux/toss_3_x86_64_ib/flux-c0.18.0-s0.10.0/libexec/flux/cmd", "HOME": "/g/g0/herbein1", "SHLVL": "2", "SUDO_EDITOR": "emacsclient -nw", "LANGUAGE": "en_US.UTF-8", "__LMOD_REF_COUNT_PATH": "/g/g0/he
rbein1/Repositories/llnl-scripts:2;/g/g0/herbein1/.cargo/bin:2;/g/g0/herbein1/.dotfiles/bin:1;/usr/lib64/qt-3.3/bin:1;/usr/condabin:1;/usr/local/bin:1;/usr/bin:1;/g/g0/herbein1/bin:1;/usr/local/sbin:1;/usr/sbin:1;/g/g0/herbein1/.antige
n/bundles/robbyrussell/oh-my-zsh/lib:1;/g/g0/herbein1/.antigen/bundles/agkozak/agkozak-zsh-prompt:1", "_ModuleTable002_": "L21vZHVsZWZpbGVzL0NvcmU6L3Vzci9hcHBzL21vZHVsZWZpbGVzOi91c3Ivc2hhcmUvbW9kdWxlZmlsZXMvTGludXg6L3Vzci9zaGFyZS9tb2R1
bGVmaWxlcy9Db3JlOi91c3Ivc2hhcmUvbG1vZC9sbW9kL21vZHVsZWZpbGVzL0NvcmUiLH0=", "BASH_ENV": "/usr/share/lmod/lmod/init/bash", "LESS": "-XR", "LMOD_arch": "x86_64", "LOGNAME": "herbein1", "PYTHONPATH": "/collab/usr/global/tools/flux/toss_3_x
86_64_ib/flux-c0.18.0-s0.10.0/lib/flux/python3.7", "CVS_RSH": "ssh", "QTLIB": "/usr/lib64/qt-3.3/lib", "VISUAL": "emacsclient -nw", "LC_CTYPE": "en_US.UTF-8", "SSH_CONNECTION": "134.9.49.5 43164 134.9.54.16 622", "XDG_DATA_DIRS": "/g/g
0/herbein1/.local/share/flatpak/exports/share:/var/lib/flatpak/exports/share:/usr/local/share:/usr/share", "MODULESHOME": "/usr/share/lmod/lmod", "SYS_TYPE": "toss_3_x86_64_ib", "LESSOPEN": "||/usr/bin/lesspipe.sh %s", "LMOD_SETTARG_FU
LL_SUPPORT": "no", "__LMOD_REF_COUNT_LD_LIBRARY_PATH": "/usr/lib64:2", "GTAGSLIBPATH": "/g/g0/herbein1/Repositories/flux-framework/flux-core:/g/g0/herbein1/Repositories/flux-framework/capacitor", "PROMPT_COMMAND": "history -a", "LMOD_F
ULL_SETTARG_SUPPORT": "no", "ALTERNATE_EDITOR": "", "QT_PLUGIN_PATH": "/usr/lib64/kde4/plugins:/usr/lib/kde4/plugins", "LMOD_DIR": "/usr/share/lmod/lmod/libexec", "__LMOD_REF_COUNT_MANPATH": "/usr/tce/packages/dotkit/dotkit/man:1;/usr/
man:1;/usr/share/man:1;/usr/local/man:1;/usr/X11R6/man:1;/usr/lib64/mvapich/default/man:1", "GUESTFISH_RESTORE": "\\e[0m", "DK_NODE": "/usr/global/tools/dotkit", "DK_ROOT": "/usr/tce/packages/dotkit/dotkit", "HISTFILE": "/g/g0/herbein1
/.history/2020/08/20/15-18-36_95.hist", "HISTTIMEFORMAT": "[%F %T] ", "LMOD_COLORIZE": "yes"}, "cwd": "/g/g0/herbein1/Repositories/flux-framework/vinaLC", "shell": {"options": {"output": {"stdout": {"type": "file", "path": "flux-{{id}}
.out"}}}}}}, "version": 1} |
Beta Was this translation helpful? Give feedback.
Replies: 2 comments
-
Luckily, there seems to only be one place that calls just If I didn't miss something, it looked like the broker was going to send an error response to a request, but the I think I'd start by adding some better logging on failure if the problem is reproducible. I may have missed something obvious, though. Oh, can you dump the raw job eventlogs? Both the main eventlog and the |
Beta Was this translation helpful? Give feedback.
-
Thanks @grondo for the debugging suggestions! I instrumented that error location and went to reproduce when I realized that I was launching flux with:
When it should have been:
The response error went away as did the job-shell error. So this was a total PEBCAK, and one of the pitfalls of using an SSH proxy. I thought we had an open issue about some sort of version negotiation or warning that Flux could spit out when there is a mismatch between the Flux instance version and the connector version (not sure if I'm using the right terminology), but I cannot find it. If there isn't, maybe we should open one. |
Beta Was this translation helpful? Give feedback.
Thanks @grondo for the debugging suggestions! I instrumented that error location and went to reproduce when I realized that I was launching flux with:
When it should have been:
The response error went away as did the job-shell error. So this was a total PEBCAK, and one of the pitfalls of using an SSH proxy.
I thought we had an open issue about some sort of version …