diff --git a/CMakeLists.txt b/CMakeLists.txt index afdef1d955..8c76e15403 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,6 +155,9 @@ option(OMP "Enable OpenMP support in Charm++" OFF) option(TCP "use TCP sockets for communication (only for netlrts)" OFF) option(PTHREADS "compile with pthreads Converse threads" OFF) +# Slingshot-11 specific, off by default, auto enabled if support is found +option(CXI "Enable CXI (Cassini/Slingshot-11) extensions to OFI" OFF) + # Advanced options: option(OOC "Compile with out of core support" OFF) option(SYNCFT "Compile with Charm++ fault tolerance support" OFF) @@ -211,7 +214,7 @@ option(CUDA "Build with CUDA support" OFF) option(PXSHM "Build with PXSHM" OFF) # LRTS PMI options -set(LRTS_PMI "" CACHE STRING "PMI type for UCX and OFI layers, can be one of simplepmi, slurmpmi, slurmpmi2, ompipmix, or openpmix") +set(LRTS_PMI "" CACHE STRING "PMI type for UCX and OFI layers, can be one of simplepmi, slurmpmi, slurmpmi2, slurmpmi2cray, ompipmix, or openpmix") set(EXTRA_OPTS "" CACHE STRING "Extra flags to pass to compilers.") @@ -439,11 +442,10 @@ if(${NETWORK} STREQUAL "ucx") endif() set(CMK_BUILD_OFI 0) -if(${NETWORK} STREQUAL "ofi") +if(${NETWORK} MATCHES "ofi") set(CMK_BUILD_OFI 1) endif() - set(CMK_ERROR_CHECKING ${ERROR_CHECKING}) set(CMK_LOCKLESS_QUEUE 0) @@ -533,6 +535,7 @@ endif() include(cmake/detect-features.cmake) include(cmake/ci-files.cmake) + if(${TARGET} STREQUAL "all-test") add_custom_target(all-test ALL COMMAND $(MAKE) -C ${CMAKE_BINARY_DIR}/tmp all-test COMMENT "Building all-test. 
This will take several minutes.") @@ -708,6 +711,16 @@ if(CUDA) endif() endif() +if(EXISTS ${CMAKE_SOURCE_DIR}/src/arch/${VDIR}/conv-mach-cxi.sh) + configure_file(src/arch/${VDIR}/conv-mach-cxi.sh include/ COPYONLY) + configure_file(src/arch/${VDIR}/conv-mach-cxi.h include/ COPYONLY) +endif() + +if(EXISTS ${CMAKE_SOURCE_DIR}/src/arch/${VDIR}/conv-mach-slurmpmi2cray.sh) + configure_file(src/arch/${VDIR}/conv-mach-slurmpmi2cray.sh include/ COPYONLY) + configure_file(src/arch/${VDIR}/conv-mach-slurmpmi2cray.h include/ COPYONLY) +endif() + if(EXISTS ${CMAKE_SOURCE_DIR}/src/arch/${VDIR}/conv-mach-pxshm.sh) configure_file(src/arch/${VDIR}/conv-mach-pxshm.sh include/ COPYONLY) configure_file(src/arch/${VDIR}/conv-mach-pxshm.h include/ COPYONLY) @@ -832,6 +845,8 @@ if(${NETWORK} MATCHES "ucx" OR ${NETWORK} MATCHES "ofi") # file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/include/proc_management/simple_pmi) set(proc_management-sources + src/arch/util/proc_management/runtime-craypmi2.C + src/arch/util/proc_management/runtime-craypmi.C src/arch/util/proc_management/runtime-pmi.C src/arch/util/proc_management/runtime-pmix.C src/arch/util/proc_management/runtime-codec.h @@ -1013,6 +1028,12 @@ else() set(CMK_USER_DISABLED_TLS "0") endif() + +if(CMK_BUILD_ON_CXI) + set(CMK_CXI 1) + set(CXI ON) +endif() + foreach(l BUILDOPTS CMK_AMPI_WITH_ROMIO CMK_BUILD_PYTHON CMK_CAN_LINK_FORTRAN CMK_CHARMDEBUG CMK_COMPILER_KNOWS_TLSDIRECTSEGREFS CMK_HAS_INT16 CMK_HAS_MMAP CMK_LIBJPEG CMK_MOD_EXT CMK_SUPPORTS_FSGLOBALS CMK_SUPPORTS_PIPGLOBALS @@ -1022,7 +1043,7 @@ foreach(l BUILDOPTS CMK_AMPI_WITH_ROMIO CMK_BUILD_PYTHON CMK_CAN_LINK_FORTRAN CXX_NO_AS_NEEDED LDXX_WHOLE_ARCHIVE_PRE LDXX_WHOLE_ARCHIVE_POST CMK_MACOSX CMK_POST_EXE CMK_SHARED_SUF CMK_USER_SUFFIX OPTS_LD CMK_COMPILER_KNOWS_FVISIBILITY CMK_LINKER_KNOWS_UNDEFINED - CMK_SUPPORTS_MEMORY_ISOMALLOC CUDA_DIR CMK_USER_DISABLED_TLS) + CMK_SUPPORTS_MEMORY_ISOMALLOC CUDA_DIR CMK_USER_DISABLED_TLS CMK_CXI) file(APPEND ${optfile_sh} "${l}=\"${${l}}\"\n" ) 
endforeach(l) @@ -1043,7 +1064,7 @@ set(optfile_h ${CMAKE_BINARY_DIR}/include/conv-mach-opt.h) file(WRITE ${optfile_h} "/* Build-time options header, automatically generated by cmake. */\n") foreach(l CMK_AMPI_WITH_ROMIO CMK_OPTIMIZE CMK_AMPI_ONLY CMK_POST_EXE CMK_SHARED_SUF - CMK_USER_SUFFIX) + CMK_USER_SUFFIX CMK_CXI) file(APPEND ${optfile_h} "#define ${l} ${${l}}\n") endforeach(l) @@ -1065,12 +1086,13 @@ foreach(l CUDA_DIR BUILD_CUDA CMK_AMPI_WITH_ROMIO CMK_MACOSX CMK_BUILD_PYTHON CMK_SMP CMK_SUPPORTS_FSGLOBALS CMK_SUPPORTS_PIPGLOBALS CMK_SUPPORTS_PIEGLOBALS CMK_SUPPORTS_SWAPGLOBALS CMK_SUPPORTS_TLSGLOBALS CMK_HAS_OPENMP CMK_TRACE_ENABLED CMK_USE_LRTS CMK_VDIR OPTSATBUILDTIME CMK_AMPI_ONLY CMK_WINDOWS - CMK_USE_CMA CMK_USER_SUFFIX CMK_CAN_LINK_FORTRAN CMK_SUPPORTS_MEMORY_ISOMALLOC) + CMK_USE_CMA CMK_USER_SUFFIX CMK_CAN_LINK_FORTRAN CMK_SUPPORTS_MEMORY_ISOMALLOC + CMK_CXI) file(APPEND ${optfile_mak} "${l}:=${${l}}\n" ) endforeach(l) # Add options -foreach(opt SMP OMP TCP PTHREADS SYNCFT PXSHM PERSISTENT OOC CUDA PAPI) +foreach(opt SMP OMP TCP PTHREADS SYNCFT PXSHM PERSISTENT OOC CUDA PAPI CXI) if(${opt}) string(TOLOWER ${opt} optl) file(APPEND ${optfile_sh} ". 
${CMAKE_BINARY_DIR}/include/conv-mach-${optl}.sh\n") @@ -1078,7 +1100,6 @@ foreach(opt SMP OMP TCP PTHREADS SYNCFT PXSHM PERSISTENT OOC CUDA PAPI) set(opts_enabled "${opts_enabled}${opt} ") endif() endforeach() - # Options that need no .h/.sh additions foreach(opt TRACING TRACING_COMMTHREAD ERROR_CHECKING LBUSERDATA QLOGIC BUILD_SHARED TASK_QUEUE DRONE_MODE LOCKLESS_QUEUE CHARMDEBUG CCS CONTROLPOINT @@ -1160,6 +1181,7 @@ message(" Charmc flags: ${MAIN_CFLAGS}") message(" Enabled options: ${opts_enabled}") message("==============================") + # Make symlinks if(CMK_WINDOWS) # add_executable(createlink src/arch/win/createlink.cpp) diff --git a/README.md b/README.md index 6ce3080e32..e2ed0a3723 100644 --- a/README.md +++ b/README.md @@ -165,7 +165,7 @@ for more information: * `regularpages` - On Cray systems, Charm++'s default is to use `hugepages`. This option disables `hugepages`, and uses regular `malloc` for messages. * `persistent` - On Cray systems, this option enables use of persistent mode for - communication. +* `cxi` - On HPE Slingshot-11 systems, this option enables use of Cassini extensions for communication. Usually autodetected and enabled where available. * `pxshm` - Use POSIX Shared Memory for communication between Charm++ processes within a shared-memory host. * `syncft` - Enable in-memory fault tolerance support in Charm++. 
diff --git a/buildcmake b/buildcmake index eed2a7a150..ab65c2407b 100755 --- a/buildcmake +++ b/buildcmake @@ -92,7 +92,8 @@ function parse_triplet() { parse_triplet extra_triplet_opts=${extra_triplet_opts//-/ } - +opt_build_ofi=0 +opt_cxi=0 opt_ampi_error_checking=0 opt_ampi_mpich_tests=0 opt_ampi_only=0 @@ -175,6 +176,9 @@ function parse_platform_compilers() { cuda) opt_cuda=1 ;; + cxi) + opt_cxi=1 + ;; smp) opt_smp=1 ;; @@ -193,7 +197,7 @@ function parse_platform_compilers() { pxshm) opt_pxshm=1 ;; - simplepmi|slurmpmi|slurmpmi2|ompipmix|openpmix) + simplepmi|slurmpmi|slurmpmi2|ompipmix|openpmix|slurmpmi2cray) opt_lrts_pmi="$arg" ;; persistent) @@ -492,7 +496,7 @@ fi # Append certain features and compilers to the end of $builddir builddir_extra="" -for flag in opt_omp opt_smp opt_tcp opt_pthreads opt_pxshm opt_syncft opt_ooc opt_persistent opt_cuda; do +for flag in opt_omp opt_smp opt_tcp opt_pthreads opt_pxshm opt_syncft opt_ooc opt_persistent opt_cuda opt_cxi; do [[ $flag -eq 1 ]] && builddir_extra+="-${flag/opt_/}" done @@ -500,10 +504,14 @@ for c in "$opt_lrts_pmi" "${opt_compiler[@]}"; do [[ -n "$c" ]] && builddir_extra+="-$c" done -# Use slurmpmi2 by default for ofi builds on Cray platforms (e.g. Cray -# Shasta/EX) since it matches the interface of cray-pmi +if [[ "$actual_triplet" = ofi* ]]; then + opt_build_ofi=1 +fi + +# Use slurmpmi2cray by default for ofi builds on Cray platforms (e.g. 
Cray +# Shasta/EX) to access cray extensions to PMI from cray-pmi if [[ "$actual_triplet" = ofi-cray* && -z "$opt_lrts_pmi" ]]; then - opt_lrts_pmi="slurmpmi2" + opt_lrts_pmi="slurmpmi2cray" fi # Default to using simplepmi on non-Cray OFI platforms @@ -673,6 +681,8 @@ CC=$opt_CC CXX=$opt_CXX FC=$opt_FC cmake "$my_srcdir" \ -DTCP="$opt_tcp" \ -DTRACING="$opt_tracing" \ -DTRACING_COMMTHREAD="$opt_tracing_commthread" \ + -DCXI="$opt_cxi" \ + -DCMK_BUILD_OFI="$opt_build_ofi" \ -DZLIB="$opt_zlib" diff --git a/cmake/detect-features-c.cmake b/cmake/detect-features-c.cmake index 868b6fd01c..313c407c19 100644 --- a/cmake/detect-features-c.cmake +++ b/cmake/detect-features-c.cmake @@ -367,9 +367,43 @@ int main() { } " CMK_BALANCED_INJECTION_API) -if(${CMK_BUILD_OFI} EQUAL 1) +if(${NETWORK} STREQUAL "ofi" OR ${NETWORK} STREQUAL "ofi-crayshasta" OR ${NETWORK} STREQUAL "ofi-linux") +# assume HPC installation +include(CMakePrintHelpers) + find_package(EnvModules REQUIRED) + find_package(PkgConfig REQUIRED) + if(EnvModules_FOUND) + #at least get libfabric loaded if it isn't already + env_module(load libfabric) + endif() set(tmp ${CMAKE_REQUIRED_LIBRARIES}) - set(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES} -lfabric") + if(${PkgConfig_FOUND}) +# this is tortured because pkg-config and cmake are infuriating + set(myconfigCommand "pkg-config") + set(myargs1 "libfabric") + set(myargs2 "--libs") + execute_process(COMMAND ${myconfigCommand} ${myargs1} ${myargs2} + OUTPUT_VARIABLE PKG_CONFIG_OFI_LIBS_OUTPUT + RESULT_VARIABLE PKG_CONFIG_OFI_LIBS_RESULT + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + ERROR_VARIABLE thiserror + ) + string(STRIP ${PKG_CONFIG_OFI_LIBS_OUTPUT} CMAKE_PKG_CONFIG_OFI_LIBS) + set(myargs2 "--cflags") + execute_process(COMMAND ${myconfigCommand} ${myargs1} ${myargs2} + OUTPUT_VARIABLE PKG_CONFIG_OFI_CFLAGS_OUTPUT + RESULT_VARIABLE PKG_CONFIG_OFI_CFLAGS_RESULT + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + ERROR_VARIABLE thaterror + ) + string(STRIP 
${PKG_CONFIG_OFI_CFLAGS_OUTPUT} CMAKE_PKG_CONFIG_OFI_CFLAGS) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${CMAKE_PKG_CONFIG_OFI_CFLAGS}") + set(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES} ${CMAKE_PKG_CONFIG_OFI_LIBS}") + else() + message(WARNING "cmake can't find pkg-config") + set(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + endif() + check_c_source_compiles(" #include int main(int argc, char **argv) @@ -379,9 +413,13 @@ if(${CMK_BUILD_OFI} EQUAL 1) return 0; } " CMK_BUILD_ON_OFI) - set(CMAKE_REQUIRED_LIBRARIES ${tmp}) + if("${CMK_BUILD_ON_OFI}" STREQUAL "") - message(FATAL_ERROR "Unable to build ofi.") + message(FATAL_ERROR "Unable to build ofi with FLAGS ${CMAKE_REQUIRED_FLAGS} LIBS ${CMAKE_REQUIRED_LIBRARIES} for network ${NETWORK}.") + set(CMAKE_REQUIRED_LIBRARIES ${tmp}) + else() +# set(CMAKE_EXTRA_INCLUDE_FILES "{CMAKE_EXTRA_INCLUDE_FILES} CMAKE_PKG_CONFIG_OFI_CFLAGS") +# set(CMK_LIBDIR "{CMK_LIBS} CMAKE_PKG_CONFIG_OFI_LIBS") endif() endif() diff --git a/cmake/detect-features-cxx.cmake b/cmake/detect-features-cxx.cmake index bda53febfd..ae85002c64 100644 --- a/cmake/detect-features-cxx.cmake +++ b/cmake/detect-features-cxx.cmake @@ -165,3 +165,55 @@ int main() { # Unset workaround from above set(CMAKE_REQUIRED_FLAGS "") +if(${NETWORK} STREQUAL "ofi" OR ${NETWORK} STREQUAL "ofi-crayshasta" OR ${NETWORK} STREQUAL "ofi-linux") +# assume HPC installation with LMOD + include(CMakePrintHelpers) + find_package(EnvModules REQUIRED) + find_package(PkgConfig REQUIRED) + if(EnvModules_FOUND) + # we need libfabric cray-libpals and cray-pmi + env_module(load libfabric) + env_module(load cray-libpals) + env_module(load cray-pmi) + endif() + set(tmp ${CMAKE_REQUIRED_LIBRARIES}) + if(${PkgConfig_FOUND}) +# this is tortured because pkg-config and cmake are infuriating + set(myconfigCommand "pkg-config") + set(myargs1 "libfabric") + set(myargs2 "--libs") + execute_process(COMMAND ${myconfigCommand} ${myargs1} ${myargs2} + OUTPUT_VARIABLE 
PKG_CONFIG_OFI_LIBS_OUTPUT + RESULT_VARIABLE PKG_CONFIG_OFI_LIBS_RESULT + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + ERROR_VARIABLE thiserror + ) + string(STRIP ${PKG_CONFIG_OFI_LIBS_OUTPUT} CMAKE_PKG_CONFIG_OFI_LIBS) + set(myargs2 "--cflags") + execute_process(COMMAND ${myconfigCommand} ${myargs1} ${myargs2} + OUTPUT_VARIABLE PKG_CONFIG_OFI_CFLAGS_OUTPUT + RESULT_VARIABLE PKG_CONFIG_OFI_CFLAGS_RESULT + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + ERROR_VARIABLE $thaterror + ) + string(STRIP ${PKG_CONFIG_OFI_CFLAGS_OUTPUT} CMAKE_PKG_CONFIG_OFI_CFLAGS) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${CMAKE_PKG_CONFIG_OFI_CFLAGS}") + set(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES} ${CMAKE_PKG_CONFIG_OFI_LIBS}") + else() + message(WARNING "cmake can't find pkg-config") + set(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + endif() + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${CMAKE_PKG_CONFIG_OFI_CFLAGS}") + set(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES} ${CMAKE_PKG_CONFIG_OFI_LIBS}") + +check_cxx_source_compiles(" + #include + #include + int main(int argc, char **argv) + { + struct fi_info *providers; + int ret = fi_getinfo(FI_VERSION(1,0), NULL, NULL, 0ULL, NULL, &providers); + return 0; + } + " CMK_BUILD_ON_CXI) +endif() diff --git a/doc/libraries/manual.rst b/doc/libraries/manual.rst index 4862ad0104..7908b9c0bc 100644 --- a/doc/libraries/manual.rst +++ b/doc/libraries/manual.rst @@ -795,11 +795,15 @@ CkIO Overview -------- -CkIO is a library for parallel I/O in Charm++. Currently it only supports -writing files, not reading them. CkIO improves the performance of write +CkIO is a library for parallel I/O in Charm++. It supports both reading and writing via two independent library components which both involve aggregation. The CkIO abstraction helps get the best performance out of the parallel file system and avoid contention on I/O nodes, while supporting any user-level chare decomposition. 
+ +CkIO Output +----------- + +The CkIO output library improves the performance of write operations by aggregating data at intermediate nodes and batching writes to align with the stripe size of the underlying parallel file system (such as -Lustre). This avoids contention on the I/O nodes by using fewer messages to +Lustre). This helps avoid contention on the I/O nodes by using fewer messages to communicate with them and preventing small or non-contiguous disk operations. Under the hood, when a write is issued, the associated data is sent to the PE(s) @@ -809,22 +813,30 @@ entire stripe is actually written to the filesystem all in one fell swoop. The size and layout of stripes and the number and organization of aggregating PEs are available as options for the user to customize. +CkIO Input +---------- + +The CkIO input library similarly aggregates read requests for a single file via an intermediate layer of chares, called "Buffer Chares." The number of Buffer Chares should be chosen to read from the file system with optimal granularity. Currently, the choice of the number of Buffer Chares must be made by the user (via the Options parameter, discussed below), considering factors such as file size, number of PEs, and number of nodes. + Using CkIO ---------- CkIO is designed as a session-oriented, callback-centric library. The steps to -using the library are as follows (each step is invoked via a callback specified -in an earlier step): - -#. Open a file via ``Ck::IO::open``. -#. Create a session for writing to the file via ``Ck::IO::startSession``. -#. Write to the file via ``Ck::IO::write``. Note that this function takes a - session token that is passed into the callback. -#. When the specified amount of data for the session has been written, a +using the library are different for input and output, but follow the same basic structure: + +#. Open a file via ``Ck::IO::open``. 
Note that at the lowest level CkIO uses POSIX seek, read, and write (or the Microsoft equivalent for Windows) and therefore must only be used on seek-able file types. +#. Create a session for writing to the file via ``Ck::IO::startSession`` or create a session for reading from a file via ``Ck::IO::startReadSession``. +#. Write or read via ``Ck::IO::write`` or ``Ck::IO::read``. Note that these function take a + session token that is passed into the callback, which should refer to the current session. +#. In the case of a read session, the session must be closed manually when the read is complete via ``Ck::IO::closeReadSession`` +#. When the specified amount of data for the session has been written or a read session has been closed, a completion callback is invoked, from which one may start another session or - close the file via ``Ck::IO::close``. + close the file via ``Ck::IO::close`` (same call for writing or reading). + +Parallel Output API +~~~~~~~~~~~~~~~~~~~ -The following functions comprise the interface to the library: +The following functions comprise the interface to the library for parallel file output: - Opening a file: @@ -839,7 +851,7 @@ The following functions comprise the interface to the library: specified file does not exist, it will be created. Should only be called from a single PE, once per file. 
- ``Ck::IO::Options`` is a struct with the following fields: + ``Ck::IO::Options`` is a struct with the following output-relevant fields: - ``writeStripe`` - Amount of contiguous data (in bytes) to gather before writing to the file (default: file system stripe size if using Lustre and @@ -852,7 +864,7 @@ The following functions comprise the interface to the library: - ``skipPEs`` - Gap between participating PEs (default : ``CkMyNodeSize()``) -- Starting a session: +- Starting a write session: Note there are two variants of the ``startSession`` function, a regular one and one that writes a user specified chunk of data to the file at the end of a @@ -909,8 +921,78 @@ The following functions comprise the interface to the library: the ``FileReadyMsg`` sent to the ``opened`` callback after a file has been opened. Should only be called from a single PE, once per file. +Parallel Input API +~~~~~~~~~~~~~~~~~~ -Example -------- +The following functions comprise the interface to the library for parallel file input: + + +- Opening a file: + + .. code-block:: c++ + + void Ck::IO::open(std::string path, CkCallback opened, Ck::IO::Options opts) + + Open the given file with the options specified in ``opts``, and send a + ``FileReadyMsg`` (wraps a ``Ck::IO::File file``) to the ``opened`` callback + when the system is ready to accept session requests on that file. If the + specified file does not exist, it will be created. Should only be called from + a single PE, once per file. + + ``Ck::IO::Options`` is a struct with the following input-relevant fields: + + - ``numReaders`` - number of Buffer Chares, or aggregators. The user should chose this number to optimally decompose the read. Typically, chosing the number of Buffer Chares to be the number of PEs performs well. 
+ + +- Starting a read session: + + Note there are two variants of the ``startReadSession`` function, a regular one and a variant which takes an additional argument allowing the user to map Buffer Chares to specified PEs in a round-robin fashion. + + .. code-block:: c++ + + void startReadSession(File file, size_t bytes, size_t offset, CkCallback ready) + + Prepare to read data from ``file``, in the window defined by ``size`` and + ``offset`` (both specified in bytes). On starting the session, the buffer + chares begin eagerly reading all requested data into memory. The ready callback + is invoked once these reads have been initiated (but they are not guaranteed to be complete at this point). + + .. code-block:: c++ + + void startReadSession(File file, size_t bytes, size_t offset, CkCallback ready, std::vector pes_to_map) + + This function is similar to the previous one, but the extra argument pes_to_map allows the user to specify a list of PEs to map the Buffer Chares to. + This argument should contain a sequence of numbers representing pes. The Buffer Chares will be mapped to the PEs in a round-robin fashion. + This can be useful when the user has a specific decomposition in mind for the read. + +- Reading data: + + .. code-block:: c++ + + void read(Session session, size_t bytes, size_t offset, char* data, CkCallback after_read); + + This method is invoked to read data asynchronously from the read session. This method returns immediately to the caller, but the + read is only guaranteed complete once the callback ``after_read`` is called. Internally, the read request is buffered + until the Buffer Chares can respond with the requested data. After the read finishes, the + after_read callback is invoked taking a ReadCompleteMsg* which points to a vector buffer, the offset, + and the number of bytes of the read. + + +- Closing a file: + + .. code-block:: c++ + + void Ck::IO::close(Ck::IO::File file, CkCallback closed) + + Close a previously-opened file. 
All read sessions on that file must have already + been closed. Note that ``file`` is provided as a member of + the ``FileReadyMsg`` sent to the ``opened`` callback after a file has been + opened. This method should only be called from a single PE, once per file. + + +Examples +-------- + +For example code showing how to use CkIO for output, see ``tests/charm++/io/``. -For example code showing how to use CkIO, see ``tests/charm++/io/``. +For example code showing how to use CkIO for input, see ``tests/charm++/io_read/``. diff --git a/src/arch/ofi-crayshasta/conv-mach-cxi.h b/src/arch/ofi-crayshasta/conv-mach-cxi.h new file mode 100644 index 0000000000..8267a8fdb8 --- /dev/null +++ b/src/arch/ofi-crayshasta/conv-mach-cxi.h @@ -0,0 +1,94 @@ +#ifndef _CONV_MACH_H +#define _CONV_MACH_H + +#define CMK_OFI 1 +/* for Slingshot-11 the provider is CXI, this is notably different + in how memory registration is handled from the old OFI. */ +#ifdef CMK_CXI +#undef CMK_CXI +#endif +#define CMK_CXI 1 + +/* define the default linker, together with its options */ +#define CMK_DLL_CC "g++ -shared -O3 -o " + +/* 1 if the machine has a function called "getpagesize()", 0 otherwise . + used in the memory files of converse */ +#define CMK_GETPAGESIZE_AVAILABLE 1 + +/* defines which version of memory handlers should be used. + used in conv-core/machine.C */ +#define CMK_MALLOC_USE_GNU_MALLOC 0 +#define CMK_MALLOC_USE_OS_BUILTIN 1 + +#define CMK_MEMORY_PAGESIZE 4096 +#define CMK_MEMORY_PROTECTABLE 1 + +/* the following definitions set the type of shared variables to be used. only + one of them must be 1, all the others 0. The different implementations are in + converse.h. Typically used are UNAVAILABLE for non SMP versions and + POSIX_THREADS_SMP for SMP versions. The others are used only in special + cases: NT_THREADS for Windows. 
*/ +#define CMK_SHARED_VARS_UNAVAILABLE 1 /* non SMP versions */ +#define CMK_SHARED_VARS_POSIX_THREADS_SMP 0 /* SMP versions */ +#define CMK_SHARED_VARS_NT_THREADS 0 + +/* the following define if signal handlers should be used, both equal to zero + means that signals will not be used. only one of the following can be 1, the + other must be 0. they differ in the fact that the second (_WITH_RESTART) + enables retry on interrupt (a function is recalled upon interrupt and does + not return EINTR as in the first case) */ +#define CMK_SIGNAL_USE_SIGACTION 0 +#define CMK_SIGNAL_USE_SIGACTION_WITH_RESTART 1 + +/* specifies whether the CthCpv variables should be defined as Cpv (0) or + directly as normal c variables (1) */ +#define CMK_THREADS_REQUIRE_NO_CPV 0 + +/* decide which is the default implementation of the threads (see threads.C) + Only one of the following can be 1. If none of them is selected, qthreads + will be used as default. This default can be overwritten at compile time + using -DCMK_THREADS_BUILD_"type"=1 */ +#define CMK_THREADS_USE_CONTEXT 0 +#define CMK_THREADS_USE_FCONTEXT 1 +#define CMK_THREADS_USE_JCONTEXT 0 +#define CMK_THREADS_USE_PTHREADS 0 + +/* Specifies what kind of timer to use, and the correspondent headers will be + included in convcore.C. If none is selected, then the machine.C file needs to + implement the timer primitives. */ +#define CMK_TIMER_USE_RTC 0 +#define CMK_TIMER_USE_RDTSC 0 +#define CMK_TIMER_USE_GETRUSAGE 1 +#define CMK_TIMER_USE_SPECIAL 0 +#define CMK_TIMER_USE_TIMES 0 + +/* Specifies what the processor will do when it is idle, either sleep (1) or go + into busy waiting mode (0). In convcore.C there are a few files included if + sleeping mode, but the real distinct implementation is in the machine.C + file. 
*/ +#define CMK_WHEN_PROCESSOR_IDLE_USLEEP 0 + +/* specifies whether there is a web server collecting utilization statistics (1) + or not (0) */ +#define CMK_WEB_MODE 1 + +#define CMK_DEBUG_MODE 0 + +/* enables the load balancer framework. set to 1 for almost all the machines */ +#define CMK_LBDB_ON 1 + +#define CMK_64BIT 1 +#define CMK_AMD64 1 + +/* Other possible definitions: + +In fault tolerant architectures, CK_MEM_CHECKPOINT can be set. In this case the +extended header must contain also another field called "pn" (phase number). + +*/ + +/* Use PMI2 by default on Cray systems with cray-pmi */ +#include "conv-mach-slurmpmi2cray.h" + +#endif diff --git a/src/arch/ofi-crayshasta/conv-mach-cxi.sh b/src/arch/ofi-crayshasta/conv-mach-cxi.sh new file mode 100644 index 0000000000..3e6a6b5dba --- /dev/null +++ b/src/arch/ofi-crayshasta/conv-mach-cxi.sh @@ -0,0 +1,27 @@ + + +# For libfabric If the user doesn't pass --basedir, use pkg-config for +#libfabric headers and library to avoid some linker wackiness, we +#order them: pal libs, PMI libs, lib64. So that if someplace (i.e., +#NCSA) puts regular pmi libs in /usr/lib64, we get them from the +#package's cray-pmi dir not their unextended pmi. libpals comes along +#for the ride here due to a dependency in pmi. fabric can just go +#after the others. 
+ + +if test -z "$USER_OPTS_LD" +then + module load cray-libpals cray-pmi libfabric + CMK_LIBFABRIC_INC=`pkg-config --cflags libfabric` + CMK_LIBFABRIC_LIBS=`pkg-config --libs libfabric` + CMK_LIBPALS_LIBS=`pkg-config --libs libpals` + CMK_LIBPALS_LDPATH=`pkg-config libpals --variable=libdir` + CMK_PMI_INC=`pkg-config --cflags cray-pmi` + CMK_PMI_LIBS=`pkg-config --libs cray-pmi` + CMK_LIBPMI_LDPATH=`pkg-config cray-pmi --variable=libdir` + CMK_INCDIR="$CMK_PMI_INC -I/usr/include/slurm/ $CMK_LIBFABRIC_INC $CMK_INCDIR " + CMK_LIBS="-Wl,-rpath,$CMK_LIBPALS_LDPATH,-rpath,$CMK_LIBPMI_LDPATH $CMK_LIBPALS_LIBS $CMK_PMI_LIBS -L/usr/lib64/ $CMK_LIBFABRIC_LIBS $CMK_LIBS " +fi + +# For runtime +CMK_INCDIR="$CMK_INCDIR -I./proc_management/" diff --git a/src/arch/ofi-crayshasta/conv-mach.h b/src/arch/ofi-crayshasta/conv-mach.h index 893d3d74b0..bef80a9499 100644 --- a/src/arch/ofi-crayshasta/conv-mach.h +++ b/src/arch/ofi-crayshasta/conv-mach.h @@ -75,7 +75,12 @@ #define CMK_64BIT 1 #define CMK_AMD64 1 - +#ifdef CMK_HAS_GET_MYADDRESS +#undef CMK_HAS_GET_MYADDRESS +#define CMK_HAS_GET_MYADDRESS 0 +#else +#define CMK_HAS_GET_MYADDRESS 0 +#endif /* Other possible definitions: In fault tolerant architectures, CK_MEM_CHECKPOINT can be set. In this case the @@ -83,7 +88,7 @@ extended header must contain also another field called "pn" (phase number). 
*/ -/* Use PMI2 by default on Cray systems with cray-pmi */ -#include "conv-mach-slurmpmi2.h" +/* Use PMI2 with cray extensions by default on Cray systems with cray-pmi */ +#include "conv-mach-slurmpmi2cray.h" #endif diff --git a/src/arch/ofi-crayshasta/conv-mach.sh b/src/arch/ofi-crayshasta/conv-mach.sh index 8c96994910..f87bafcf31 100644 --- a/src/arch/ofi-crayshasta/conv-mach.sh +++ b/src/arch/ofi-crayshasta/conv-mach.sh @@ -6,26 +6,20 @@ CMK_CRAY_NOGNI=1 #If the user doesn't pass --basedir, use defaults for libfabric headers and library if test -z "$USER_OPTS_LD" then - if test -z "$LIBFABRIC" + if test -z "$CMK_LIBFABRIC_INC" then - CMK_INCDIR="$CMK_INCDIR -I/usr/include/" - CMK_LIBDIR="$CMK_LIBDIR -L/usr/lib64/" - else - CMK_INCDIR="$CMK_INCDIR -I$LIBFABRIC/include/" - CMK_LIBDIR="$CMK_LIBDIR -L$LIBFABRIC/lib64/" + CMK_LIBFABRIC_INC=`pkg-config --cflags libfabric` + CMK_LIBFABRIC_LIBS=`pkg-config --libs libfabric` + CMK_LIBPALS_LIBS=`pkg-config --libs libpals` + CMK_LIBPALS_LDPATH=`pkg-config libpals --variable=libdir` fi fi -# For cray-pmi -if test -n "$CRAY_PMI_PREFIX" -then - CMK_INCDIR="$CMK_INCDIR -I$CRAY_PMI_PREFIX/include" - CMK_LIBDIR="$CMK_LIBDIR -L$CRAY_PMI_PREFIX/lib" -fi - -CMK_LIBS="$CMK_LIBS -lfabric" # Use PMI2 by default on Cray systems with cray-pmi -. $CHARMINC/conv-mach-slurmpmi2.sh +. 
$CHARMINC/conv-mach-slurmpmi2cray.sh + +CMK_INCDIR="$CMK_PMI_INC -I/usr/include/slurm/ $CMK_LIBFABRIC_INC $CMK_INCDIR " +CMK_LIBS="-Wl,-rpath,$CMK_LIBPALS_LDPATH,-rpath,$CMK_LIBPMI_LDPATH $CMK_LIBPALS_LIBS $CMK_PMI_LIBS -L/usr/lib64/ $CMK_LIBFABRIC_LIBS $CMK_LIBS " # For runtime CMK_INCDIR="$CMK_INCDIR -I./proc_management/" diff --git a/src/arch/ofi-linux-x86_64/conv-mach-cxi.h b/src/arch/ofi-linux-x86_64/conv-mach-cxi.h new file mode 100644 index 0000000000..8267a8fdb8 --- /dev/null +++ b/src/arch/ofi-linux-x86_64/conv-mach-cxi.h @@ -0,0 +1,94 @@ +#ifndef _CONV_MACH_H +#define _CONV_MACH_H + +#define CMK_OFI 1 +/* for Slingshot-11 the provider is CXI, this is notably different + in how memory registration is handled from the old OFI. */ +#ifdef CMK_CXI +#undef CMK_CXI +#endif +#define CMK_CXI 1 + +/* define the default linker, together with its options */ +#define CMK_DLL_CC "g++ -shared -O3 -o " + +/* 1 if the machine has a function called "getpagesize()", 0 otherwise . + used in the memory files of converse */ +#define CMK_GETPAGESIZE_AVAILABLE 1 + +/* defines which version of memory handlers should be used. + used in conv-core/machine.C */ +#define CMK_MALLOC_USE_GNU_MALLOC 0 +#define CMK_MALLOC_USE_OS_BUILTIN 1 + +#define CMK_MEMORY_PAGESIZE 4096 +#define CMK_MEMORY_PROTECTABLE 1 + +/* the following definitions set the type of shared variables to be used. only + one of them must be 1, all the others 0. The different implementations are in + converse.h. Typically used are UNAVAILABLE for non SMP versions and + POSIX_THREADS_SMP for SMP versions. The others are used only in special + cases: NT_THREADS for Windows. */ +#define CMK_SHARED_VARS_UNAVAILABLE 1 /* non SMP versions */ +#define CMK_SHARED_VARS_POSIX_THREADS_SMP 0 /* SMP versions */ +#define CMK_SHARED_VARS_NT_THREADS 0 + +/* the following define if signal handlers should be used, both equal to zero + means that signals will not be used. only one of the following can be 1, the + other must be 0. 
they differ in the fact that the second (_WITH_RESTART) + enables retry on interrupt (a function is recalled upon interrupt and does + not return EINTR as in the first case) */ +#define CMK_SIGNAL_USE_SIGACTION 0 +#define CMK_SIGNAL_USE_SIGACTION_WITH_RESTART 1 + +/* specifies whether the CthCpv variables should be defined as Cpv (0) or + directly as normal c variables (1) */ +#define CMK_THREADS_REQUIRE_NO_CPV 0 + +/* decide which is the default implementation of the threads (see threads.C) + Only one of the following can be 1. If none of them is selected, qthreads + will be used as default. This default can be overwritten at compile time + using -DCMK_THREADS_BUILD_"type"=1 */ +#define CMK_THREADS_USE_CONTEXT 0 +#define CMK_THREADS_USE_FCONTEXT 1 +#define CMK_THREADS_USE_JCONTEXT 0 +#define CMK_THREADS_USE_PTHREADS 0 + +/* Specifies what kind of timer to use, and the correspondent headers will be + included in convcore.C. If none is selected, then the machine.C file needs to + implement the timer primitives. */ +#define CMK_TIMER_USE_RTC 0 +#define CMK_TIMER_USE_RDTSC 0 +#define CMK_TIMER_USE_GETRUSAGE 1 +#define CMK_TIMER_USE_SPECIAL 0 +#define CMK_TIMER_USE_TIMES 0 + +/* Specifies what the processor will do when it is idle, either sleep (1) or go + into busy waiting mode (0). In convcore.C there are a few files included if + sleeping mode, but the real distinct implementation is in the machine.C + file. */ +#define CMK_WHEN_PROCESSOR_IDLE_USLEEP 0 + +/* specifies whether there is a web server collecting utilization statistics (1) + or not (0) */ +#define CMK_WEB_MODE 1 + +#define CMK_DEBUG_MODE 0 + +/* enables the load balancer framework. set to 1 for almost all the machines */ +#define CMK_LBDB_ON 1 + +#define CMK_64BIT 1 +#define CMK_AMD64 1 + +/* Other possible definitions: + +In fault tolerant architectures, CK_MEM_CHECKPOINT can be set. In this case the +extended header must contain also another field called "pn" (phase number). 
+ +*/ + +/* Use PMI2 by default on Cray systems with cray-pmi */ +#include "conv-mach-slurmpmi2cray.h" + +#endif diff --git a/src/arch/ofi-linux-x86_64/conv-mach-cxi.sh b/src/arch/ofi-linux-x86_64/conv-mach-cxi.sh new file mode 100644 index 0000000000..3e6a6b5dba --- /dev/null +++ b/src/arch/ofi-linux-x86_64/conv-mach-cxi.sh @@ -0,0 +1,27 @@ + + +# For libfabric If the user doesn't pass --basedir, use pkg-config for +#libfabric headers and library to avoid some linker wackiness, we +#order them: pal libs, PMI libs, lib64. So that if someplace (i.e., +#NCSA) puts regular pmi libs in /usr/lib64, we get them from the +#package's cray-pmi dir not their unextended pmi. libpals comes along +#for the ride here due to a dependency in pmi. fabric can just go +#after the others. + + +if test -z "$USER_OPTS_LD" +then + module load cray-libpals cray-pmi libfabric + CMK_LIBFABRIC_INC=`pkg-config --cflags libfabric` + CMK_LIBFABRIC_LIBS=`pkg-config --libs libfabric` + CMK_LIBPALS_LIBS=`pkg-config --libs libpals` + CMK_LIBPALS_LDPATH=`pkg-config libpals --variable=libdir` + CMK_PMI_INC=`pkg-config --cflags cray-pmi` + CMK_PMI_LIBS=`pkg-config --libs cray-pmi` + CMK_LIBPMI_LDPATH=`pkg-config cray-pmi --variable=libdir` + CMK_INCDIR="$CMK_PMI_INC -I/usr/include/slurm/ $CMK_LIBFABRIC_INC $CMK_INCDIR " + CMK_LIBS="-Wl,-rpath,$CMK_LIBPALS_LDPATH,-rpath,$CMK_LIBPMI_LDPATH $CMK_LIBPALS_LIBS $CMK_PMI_LIBS -L/usr/lib64/ $CMK_LIBFABRIC_LIBS $CMK_LIBS " +fi + +# For runtime +CMK_INCDIR="$CMK_INCDIR -I./proc_management/" diff --git a/src/arch/ofi/charmrun b/src/arch/ofi/charmrun index 0f0a117043..7bdf8b3e40 100755 --- a/src/arch/ofi/charmrun +++ b/src/arch/ofi/charmrun @@ -184,7 +184,39 @@ then elif [[ -n "$LSB_HOSTS" ]] then # Tungsten - runCmd cmpirun -lsf -poll -no_smp -gm_long 200000 "${args[@]}" + runCmd cmpirun -lsf -poll -no_smp -gm_long 200000 "${args[@]}" +elif [[ -n "$SLURM_JOB_ID" ]] +then + #use srun + # srun in a cray-shasta environment should support --mpi=cray-shasta 
regardless, the question here is how this charm was built + # assume built with craype if that is loaded + craype=`module -t list craype 2>&1 | grep craype` + if [[ $SLURM_JOB_NUM_NODES -eq 1 ]] + then + NET_ARGS="--network=single_node_vni" + else + if [[ $pes -eq 1 ]] + then + NET_ARGS="--network=single_node_vni" + else + if [[ $nodes -eq 1 ]] + then + NET_ARGS="--network=single_node_vni" + else + NET_ARGS="" + fi + fi + fi + + if [[ $? -eq 0 ]] + then + runCmd srun --mpi=cray_shasta $NET_ARGS -n "$nodes" "${args[@]}" + else + #someday this should be pmix, but our pmix launcher needs some + # work and cray hasn't really adopted PMIX at this time. + # unclear what the right answer is in other OFI environments + runCmd srun --mpi=pmi2 -n "$nodes" "${args[@]}" + fi elif [[ -n "$PBS_QUEUE" || -n "$LSF_QUEUE" ]] then # Interactive mode: create, and submit a batch job diff --git a/src/arch/ofi/conv-common.h b/src/arch/ofi/conv-common.h index 5aeac37bae..3cc1653852 100644 --- a/src/arch/ofi/conv-common.h +++ b/src/arch/ofi/conv-common.h @@ -91,11 +91,11 @@ #define CMK_USE_PMI 1 #define CMK_USE_PMI2 0 #define CMK_USE_PMIX 0 - +#define CMK_USE_CRAYPMI 0 /* * Use Simple client-side implementation of PMI. * Valid only for CMK_USE_PMI. * Optional in an SLURM environment. 
* See src/arch/util/proc_management/simple_pmi/ */ -#define CMK_USE_SIMPLEPMI 1 +#define CMK_USE_SIMPLEPMI 0 diff --git a/src/arch/ofi/conv-mach-ompipmix.sh b/src/arch/ofi/conv-mach-ompipmix.sh index 5378d44f36..edc5964ece 100644 --- a/src/arch/ofi/conv-mach-ompipmix.sh +++ b/src/arch/ofi/conv-mach-ompipmix.sh @@ -1 +1,2 @@ -CMK_LIBS="$CMK_LIBS -lpmix -lopen-pal -lopen-rte" +#CMK_LIBS="$CMK_LIBS -lpmix -lopen-pal -lopen-rte" +CMK_LIBS="$CMK_LIBS -lpmix" diff --git a/src/arch/ofi/conv-mach-slurmpmi2cray.h b/src/arch/ofi/conv-mach-slurmpmi2cray.h new file mode 100644 index 0000000000..caa1aba8f0 --- /dev/null +++ b/src/arch/ofi/conv-mach-slurmpmi2cray.h @@ -0,0 +1,5 @@ +#undef CMK_USE_PMI +#undef CMK_USE_PMI2 +#undef CMK_USE_PMIX +#undef CMK_USE_SIMPLEPMI +#define CMK_USE_CRAYPMI2 1 diff --git a/src/arch/ofi/conv-mach-slurmpmi2cray.sh b/src/arch/ofi/conv-mach-slurmpmi2cray.sh new file mode 100644 index 0000000000..df67fe569b --- /dev/null +++ b/src/arch/ofi/conv-mach-slurmpmi2cray.sh @@ -0,0 +1,7 @@ +if test -z $"CMK_PMI_INC" +then + CMK_PMI_INC=`pkg-config --cflags cray-pmi` + CMK_PMI_LIBS=`pkg-config --libs cray-pmi` + CMK_LIBPMI_LDPATH=`pkg-config cray-pmi --variable=libdir` +fi + diff --git a/src/arch/ofi/conv-mach-slurmpmicray.h b/src/arch/ofi/conv-mach-slurmpmicray.h new file mode 100644 index 0000000000..bc15717da4 --- /dev/null +++ b/src/arch/ofi/conv-mach-slurmpmicray.h @@ -0,0 +1,5 @@ +#undef CMK_USE_PMI +#undef CMK_USE_PMI2 +#undef CMK_USE_PMIX +#undef CMK_USE_SIMPLEPMI +#define CMK_USE_CRAYPMI 1 diff --git a/src/arch/ofi/conv-mach-slurmpmicray.sh b/src/arch/ofi/conv-mach-slurmpmicray.sh new file mode 100644 index 0000000000..0be05dde89 --- /dev/null +++ b/src/arch/ofi/conv-mach-slurmpmicray.sh @@ -0,0 +1,6 @@ +CMK_LIBFABRIC_INC=`pkg-config --cflags libfabric` +CMK_LIBFABRIC_LIBS=`pkg-config --libs libfabric` +CMK_PMI_INC=`pkg-config --cflags cray-pmi` +CMK_PMI_LIBS=`pkg-config --libs cray-pmi` +CMK_INCDIR="$CMK_INCDIR $CMK_PMI_INC -I/usr/include/slurm/ 
$CMK_LIBFABRIC_INC" +CMK_LIBS="$CMK_LIBS $CMK_PMI_LIBS $CMK_LIBFABRIC_LIBS" diff --git a/src/arch/ofi/conv-mach-xpmem.h b/src/arch/ofi/conv-mach-xpmem.h new file mode 100644 index 0000000000..eca0d33b95 --- /dev/null +++ b/src/arch/ofi/conv-mach-xpmem.h @@ -0,0 +1,16 @@ +#ifndef _CONV_MACH_XPMEM_ +#define _CONV_MACH_XPMEM + +#undef CMK_USE_PXSHM +#undef CMK_USE_XPMEM +#define CMK_USE_XPMEM 1 + +#undef CMK_IMMEDIATE_MSG +#define CMK_IMMEDIATE_MSG 0 + +#undef CMK_WHEN_PROCESSOR_IDLE_USLEEP +#define CMK_WHEN_PROCESSOR_IDLE_USLEEP 0 + +#define XPMEM_LOCK 1 + +#endif diff --git a/src/arch/ofi/conv-mach-xpmem.sh b/src/arch/ofi/conv-mach-xpmem.sh new file mode 100644 index 0000000000..08304098a4 --- /dev/null +++ b/src/arch/ofi/conv-mach-xpmem.sh @@ -0,0 +1,4 @@ +CMK_XPMEM_INC=`pkg-config --cflags cray-xpmem` +CMK_XPMEM_LIBS=`pkg-config --libs cray-xpmem` +CMK_INCDIR="$CMK_INCDIR $CMK_XPMEM_INC" +CMK_LIBS="$CMK_LIBS $CMK_XPMEM_LIBS -lrt -lpthread" diff --git a/src/arch/ofi/machine-onesided.C b/src/arch/ofi/machine-onesided.C index 3c219b3e7a..1da03de1ac 100644 --- a/src/arch/ofi/machine-onesided.C +++ b/src/arch/ofi/machine-onesided.C @@ -1,23 +1,36 @@ + +// if FI_MR_VIRT_ADDR is not enabled, then we use offset from the +// registered address which would be a 0 offset in this case. +#define DETERMINE_OFFSET(x) ((FI_MR_SCALABLE == context.mr_mode) || ( FI_MR_VIRT_ADDR & context.mr_mode)==0) ? 
0 : (const char*)((x)) + void registerDirectMemory(void *info, const void *addr, int size) { CmiOfiRdmaPtr_t *rdmaInfo = (CmiOfiRdmaPtr_t *)info; uint64_t requested_key = 0; int err; - if(FI_MR_SCALABLE == context.mr_mode) { + if(FI_MR_ENDPOINT & context.mr_mode) + { + ofi_reg_bind_enable(addr, size, &(rdmaInfo->mr),&context); + } + else if(FI_MR_SCALABLE == context.mr_mode){ requested_key = __sync_fetch_and_add(&(context.mr_counter), 1); - } - err = fi_mr_reg(context.domain, - addr, - size, - FI_REMOTE_READ | FI_REMOTE_WRITE | FI_READ | FI_WRITE, - 0ULL, - requested_key, - 0ULL, - &(rdmaInfo->mr), - NULL); - if (err) { - CmiAbort("registerDirectMemory: fi_mr_reg failed!\n"); - } + err = fi_mr_reg(context.domain, + addr, + size, + FI_REMOTE_READ | FI_REMOTE_WRITE | FI_READ | FI_WRITE, + 0ULL, + requested_key, + 0ULL, + &(rdmaInfo->mr), + NULL); + if (err) { + CmiAbort("registerDirectMemory: fi_mr_reg failed!\n"); + } + } + else + { + CmiAbort("registerDirectMemory: fi_mr_reg failed!\n"); + } rdmaInfo->key = fi_mr_key(rdmaInfo->mr); } @@ -142,8 +155,7 @@ void process_onesided_reg_and_put(struct fi_cq_tagged_entry *e, OFIRequest *req) ncpyOpInfo->srcSize); ncpyOpInfo->isSrcRegistered = 1; // Set isSrcRegistered to 1 after registration - - const char *rbuf = (FI_MR_SCALABLE == context.mr_mode) ? 0 : (const char*)(ncpyOpInfo->destPtr); + const char *rbuf = DETERMINE_OFFSET(ncpyOpInfo->destPtr); // Allocate a completion object for tracking write completion and ack handling CmiOfiRdmaComp_t* rdmaComp = (CmiOfiRdmaComp_t *)malloc(sizeof(CmiOfiRdmaComp_t)); @@ -181,8 +193,7 @@ void process_onesided_reg_and_get(struct fi_cq_tagged_entry *e, OFIRequest *req) ncpyOpInfo->destSize); ncpyOpInfo->isDestRegistered = 1; // Set isDestRegistered to 1 after registration - - const char *rbuf = (FI_MR_SCALABLE == context.mr_mode) ? 
0 : (const char*)(ncpyOpInfo->srcPtr); + const char *rbuf = DETERMINE_OFFSET(ncpyOpInfo->srcPtr); // Allocate a completion object for tracking write completion and ack handling CmiOfiRdmaComp_t* rdmaComp = (CmiOfiRdmaComp_t *)malloc(sizeof(CmiOfiRdmaComp_t)); @@ -232,18 +243,29 @@ void LrtsIssueRget(NcpyOperationInfo *ncpyOpInfo) { req->size = ncpyOpInfo->ncpyOpInfoSize; req->callback = send_short_callback; req->data.short_msg = ncpyOpInfo; - - ofi_send(ncpyOpInfo, + // in CXI we cannot just send this unregistered thing + // +#if CMK_CXI + ofi_register_and_send(ncpyOpInfo, ncpyOpInfo->ncpyOpInfoSize, CmiNodeOf(ncpyOpInfo->srcPe), OFI_RDMA_DIRECT_REG_AND_PUT, req); + +#else + CmiOfiRdmaPtr_t *dest_info = (CmiOfiRdmaPtr_t *)((char *)ncpyOpInfo->destLayerInfo + CmiGetRdmaCommonInfoSize()); + ofi_send_reg(ncpyOpInfo, + ncpyOpInfo->ncpyOpInfoSize, + CmiNodeOf(ncpyOpInfo->srcPe), + OFI_RDMA_DIRECT_REG_AND_PUT, + req, dest_info->mr); +#endif } else { CmiOfiRdmaPtr_t *dest_info = (CmiOfiRdmaPtr_t *)((char *)ncpyOpInfo->destLayerInfo + CmiGetRdmaCommonInfoSize()); CmiOfiRdmaPtr_t *src_info = (CmiOfiRdmaPtr_t *)((char *)ncpyOpInfo->srcLayerInfo + CmiGetRdmaCommonInfoSize()); - const char *rbuf = (FI_MR_SCALABLE == context.mr_mode) ? 
0 : (const char*)(ncpyOpInfo->srcPtr); + const char *rbuf = DETERMINE_OFFSET(ncpyOpInfo->srcPtr); // Allocate a completion object for tracking read completion and ack handling CmiOfiRdmaComp_t* rdmaComp = (CmiOfiRdmaComp_t *)malloc(sizeof(CmiOfiRdmaComp_t)); @@ -285,18 +307,25 @@ void LrtsIssueRput(NcpyOperationInfo *ncpyOpInfo) { req->size = ncpyOpInfo->ncpyOpInfoSize; req->callback = send_short_callback; req->data.short_msg = ncpyOpInfo; - +#if CMK_CXI + ofi_register_and_send(ncpyOpInfo, + ncpyOpInfo->ncpyOpInfoSize, + CmiNodeOf(ncpyOpInfo->destPe), + OFI_RDMA_DIRECT_REG_AND_GET, + req); +#else ofi_send(ncpyOpInfo, ncpyOpInfo->ncpyOpInfoSize, CmiNodeOf(ncpyOpInfo->destPe), OFI_RDMA_DIRECT_REG_AND_GET, req); +#endif } else { CmiOfiRdmaPtr_t *dest_info = (CmiOfiRdmaPtr_t *)((char *)(ncpyOpInfo->destLayerInfo) + CmiGetRdmaCommonInfoSize()); CmiOfiRdmaPtr_t *src_info = (CmiOfiRdmaPtr_t *)((char *)(ncpyOpInfo->srcLayerInfo) + CmiGetRdmaCommonInfoSize()); - const char *rbuf = (FI_MR_SCALABLE == context.mr_mode) ? 0 : (const char*)(ncpyOpInfo->destPtr); + const char *rbuf = DETERMINE_OFFSET(ncpyOpInfo->destPtr); // Allocate a completion object for tracking write completion and ack handling CmiOfiRdmaComp_t* rdmaComp = (CmiOfiRdmaComp_t *)malloc(sizeof(CmiOfiRdmaComp_t)); diff --git a/src/arch/ofi/machine.C b/src/arch/ofi/machine.C index 544c8610f5..42ec03d44b 100644 --- a/src/arch/ofi/machine.C +++ b/src/arch/ofi/machine.C @@ -21,6 +21,40 @@ * - The receiver uses a OFILongMsg structure to keep track of an * ongoing long message retrieval. * + * Changes For CXI (as found on Slingshot-11): + * Date : 2024-01-04 + * Author: Eric Bohm + * + * * Add support for CXI extensions for Cassini (AKA Slingshot-11) + * + * - CXI required FI_MR_ENDPOINT + * + * 1) Which requires that all message memory be: registered, bound to + * the endpoint, and activated before use. 
+ * + * 2) CXI supporting endpoint must be selected for in fi_getinfo + * + * 3) CXI is reportedly not optimized for within node communication, + * so process to process schemes, i.e., XPMEM or CMA should be + * pursued. However, the current implementations have not been shown + * to be robust and performant, so they are not enabled by default. + * + * 4) Memory requirements add tracking for the memory registration + * key. This is kept in a prefix header for each allocated buffer. + * Most use cases are managed by the memory pool, which is also on by + * default and should not be disabled without good reason. + * + * 5) CXI comes with FI_MR_VIRT_ADDR=0, which means RMA transactions + * require both the key and the offset from the base address of the + * allocated buffer associated with that key. + * + * 6) We update to the build time environment version of libfabric + * instead of forcing 1.0. (e.g. libfabric 1.15.2.0 at time of writing) + * + * 7) CXI defines that memory keys 0-99 support CXI optimized + * operations (such as reductions, or reducing depency on delivery + * ordering ). We set aside 0-50 for TBD use and build up from 51. + * * Runtime options: * +ofi_eager_maxsize: (default: 65536) Threshold between buffered and RMA * paths. 
@@ -58,11 +92,48 @@ #include "machine.h" +// Trace communication thread + +#if CMK_TRACE_ENABLED && CMK_SMP_TRACE_COMMTHREAD +#define TRACE_THRESHOLD 0.00001 +#undef CMI_MACH_TRACE_USEREVENTS +#define CMI_MACH_TRACE_USEREVENTS 1 +#else +#undef CMK_SMP_TRACE_COMMTHREAD +#define CMK_SMP_TRACE_COMMTHREAD 0 +#endif + +#define CMK_TRACE_COMMOVERHEAD 0 +#if CMK_TRACE_ENABLED && CMK_TRACE_COMMOVERHEAD +#undef CMI_MACH_TRACE_USEREVENTS +#define CMI_MACH_TRACE_USEREVENTS 1 +#else +#undef CMK_TRACE_COMMOVERHEAD +#define CMK_TRACE_COMMOVERHEAD 0 +#endif + +#if CMI_MACH_TRACE_USEREVENTS && CMK_TRACE_ENABLED +CpvStaticDeclare(double, projTraceStart); +#define START_EVENT() CpvAccess(projTraceStart) = CmiWallTimer(); +#define END_EVENT(x) traceUserBracketEvent(x, CpvAccess(projTraceStart), CmiWallTimer()); +#define EVENT_TIME() CpvAccess(projTraceStart) +#else +#define START_EVENT() +#define END_EVENT(x) +#define EVENT_TIME() (0.0) +#endif + + /* TODO: macros regarding redefining locks that will affect pcqueue.h*/ #include "pcqueue.h" /* =======Beginning of Definitions of Performance-Specific Macros =======*/ /* TODO: add any that are related */ + +/* ======= This where we define the macros for the 0-99 special MRs ====== */ + +#define OFI_POSTED_RECV_MR_KEY 0 +#define CMK_SMP_SENDQ 0 /* =======End of Definitions of Performance-Specific Macros =======*/ @@ -72,10 +143,12 @@ /* =====Beginning of Declarations of Machine Specific Variables===== */ -/* TODO: add any that are related */ + + /* =====End of Declarations of Machine Specific Variables===== */ #include "machine-lrts.h" + #include "machine-common-core.C" /* Libfabric headers */ @@ -87,40 +160,97 @@ #include #include -#define USE_OFIREQUEST_CACHE 0 +#define USE_OFIREQUEST_CACHE 1 /* Definition of OFIRequest + request cache */ #include "request.h" /* Runtime to exchange EP addresses during LrtsInit() */ -#if CMK_USE_PMI || CMK_USE_SIMPLEPMI +/* someday, we'll update to pmix, today is not that day */ +#if CMK_CXI +#define 
CMK_USE_CRAYPMI2 1 +#endif +#if CMK_USE_CRAYPMI2 +#include "runtime-craypmi2.C" +#elif CMK_USE_CRAYPMI +#include "runtime-craypmi.C" +#elif CMK_USE_PMI || CMK_USE_SIMPLEPMI #include "runtime-pmi.C" #elif CMK_USE_PMI2 #include "runtime-pmi2.C" #elif CMK_USE_PMIX #include "runtime-pmix.C" #endif - +#define ALIGN64(x) (size_t)((~63)&((x)+63)) +#if CMK_CXI + /** use mempools in CXI to aggregate FI_MR_ENDPOINT registration reqs into big blocks */ +#define ONE_MB (1024ll*1024) +#define ALIGN64(x) (size_t)((~63)&((x)+63)) +#define ALIGNHUGEPAGE(x) (size_t)((~(_tlbpagesize-1))&((x)+_tlbpagesize-1)) + +#define USE_MEMPOOL 1 +#define LARGEPAGE 0 +#else #define USE_MEMPOOL 0 +#endif +static int _tlbpagesize = 4096; #if USE_MEMPOOL +#if LARGEPAGE +// separate pool of memory mapped huge pages +static CmiInt8 BIG_MSG = 16 * ONE_MB; +#else +static CmiInt8 BIG_MSG = 2 * ONE_MB; +#endif + +void* LrtsPoolAlloc(int n_bytes); #include "mempool.h" -#define MEMPOOL_INIT_SIZE_MB_DEFAULT 8 +#if CMK_SMP +// nothing to do here +#else +//minimal per process memory pool use for nonsmp mode +#define USE_SMALL_BASE_POOL_DEFAULTS 1 +#endif + +#if USE_SMALL_BASE_POOL_DEFAULTS +#define MEMPOOL_INIT_SIZE_MB_DEFAULT 1 #define MEMPOOL_EXPAND_SIZE_MB_DEFAULT 4 +#define MEMPOOL_MAX_SIZE_MB_DEFAULT 16 +#define MEMPOOL_LB_DEFAULT 0 +#define MEMPOOL_RB_DEFAULT 32*ONE_MB +#else +#define MEMPOOL_INIT_SIZE_MB_DEFAULT 4 +#define MEMPOOL_EXPAND_SIZE_MB_DEFAULT 16 #define MEMPOOL_MAX_SIZE_MB_DEFAULT 512 -#define MEMPOOL_LB_DEFAULT 1024 -#define MEMPOOL_RB_DEFAULT 67108864 -#define ONE_MB 1048576 +#define MEMPOOL_LB_DEFAULT 0 +#define MEMPOOL_RB_DEFAULT 134217728 +#endif + +#define ALIGNBUF (sizeof(mempool_header)+sizeof(CmiChunkHeader)) +#define GetMempoolBlockPtr(x) MEMPOOL_GetBlockPtr(MEMPOOL_GetMempoolHeader(x,ALIGNBUF)) +#define GetMempoolPtr(x) MEMPOOL_GetMempoolPtr(MEMPOOL_GetMempoolHeader(x,ALIGNBUF)) + +#define GetMempoolsize(x) MEMPOOL_GetSize(MEMPOOL_GetMempoolHeader(x,ALIGNBUF)) +#define 
GetMemHndl(x) MEMPOOL_GetMemHndl(MEMPOOL_GetMempoolHeader(x,ALIGNBUF)) + +#define GetMemHndlFromBlockHeader(x) MEMPOOL_GetBlockMemHndl(x) +#define GetSizeFromBlockHeader(x) MEMPOOL_GetBlockSize(x) +#define GetBaseAllocPtr(x) GetMempoolBlockPtr(x) +#define GetMemOffsetFromBase(x) ((char*)(x) - (char *) GetBaseAllocPtr(x)) + -CpvDeclare(mempool_type*, mempool); +CpvDeclare(mempool_type*, mempool); +#else +#define ALIGNBUF sizeof(CmiChunkHeader) #endif /* USE_MEMPOOL */ #define CmiSetMsgSize(msg, sz) ((((CmiMsgHeaderBasic *)msg)->size) = (sz)) +#define CmiGetMsgSize(msg) ((((CmiMsgHeaderBasic *)msg)->size)) #define CACHELINE_LEN 64 - +#if CMK_SMP #define OFI_NUM_RECV_REQS_DEFAULT 8 #define OFI_NUM_RECV_REQS_MAX 4096 @@ -129,6 +259,16 @@ CpvDeclare(mempool_type*, mempool); #define OFI_CQ_ENTRIES_COUNT_DEFAULT 8 #define OFI_CQ_ENTRIES_COUNT_MAX 1024 +#else +#define OFI_NUM_RECV_REQS_DEFAULT 4 +#define OFI_NUM_RECV_REQS_MAX 64 + +#define OFI_EAGER_MAXSIZE_DEFAULT 65536 +#define OFI_EAGER_MAXSIZE_MAX 1048576 + +#define OFI_CQ_ENTRIES_COUNT_DEFAULT 4 +#define OFI_CQ_ENTRIES_COUNT_MAX 64 +#endif #define OFI_USE_INJECT_DEFAULT 1 @@ -149,7 +289,7 @@ CpvDeclare(mempool_type*, mempool); #define OFI_OP_MASK 0x7ULL -#define MR_ACCESS_PERMISSIONS (FI_REMOTE_READ | FI_READ | FI_RECV | FI_SEND) +#define MR_ACCESS_PERMISSIONS (FI_REMOTE_READ | FI_READ | FI_RECV | FI_SEND | FI_REMOTE_WRITE | FI_WRITE) static inline int process_completion_queue(); @@ -195,16 +335,18 @@ static inline int process_completion_queue(); * OFI RMA Header * Message sent by sender to receiver during RMA Read of long messages. 
* - nodeNo: Target node number - * - src_msg: Address of source msg; Sent back as part of OFIRmaAck + * - src_msg: Address or offset from registered source address * - len: Length of message * - key: Remote key * - mr: Address of memory region; Sent back as part of OFIRmaAck + * - orig_msg: actual address of source message; Sent back as part of OFIRmaAck */ typedef struct OFIRmaHeader { uint64_t src_msg; uint64_t len; uint64_t key; uint64_t mr; + uint64_t orig_msg; int nodeNo; } OFIRmaHeader; @@ -276,7 +418,7 @@ typedef struct OFIContext { request_cache_t *request_cache; #endif -#if CMK_SMP +#if CMK_SMP && CMK_SMP_SENDQ /** * Producer/Consumer Queue used in CMK_SMP mode: * - worker thread pushes messages to the queue @@ -304,15 +446,23 @@ typedef struct OFIContext { /** * MR mode: * - FI_MR_SCALABLE allows us to register all the memory with our own key, - * - FI_MR_BASIC requires us to register the RMA buffers - * and to exchange the keys. + * - FI_MR_BASIC requires us to register the RMA buffers and to exchange the keys. + * - FI_MR_ENDPOINT requires us to register and bind and enable our MRs, but we can use our own 32 bit keys locally. 
*/ +#if CMK_CXI + uint32_t mr_mode; +#else enum fi_mr_mode mr_mode; +#endif - /** Used as unique key value in FI_MR_SCALABLE mode */ - uint64_t mr_counter; - - /** Number of pre-posted receive requests */ +#if CMK_CXI + /** Used as unique key value in FI_MR_ENDPOINT mode */ + // only 32 bits available to us + uint32_t mr_counter; +#else + /** Used as unique key value in FI_MR_SCALABLE mode */ + uint64_t mr_counter; +#endif int num_recv_reqs; /** Pre-posted receive requests */ @@ -321,7 +471,7 @@ typedef struct OFIContext { #if USE_MEMPOOL size_t mempool_init_size; size_t mempool_expand_size; - size_t mempool_max_size; + long long mempool_max_size; size_t mempool_lb_size; size_t mempool_rb_size; #endif @@ -332,308 +482,670 @@ static int fill_av(int myid, int nnodes, struct fid_ep *ep, struct fid_av *av, struct fid_cq *cq); static int fill_av_ofi(int myid, int nnodes, struct fid_ep *ep, struct fid_av *av, struct fid_cq *cq); - -static OFIContext context; - -#include "machine-rdma.h" -#if CMK_ONESIDED_IMPL -#include "machine-onesided.h" +#if CMK_CXI +static int ofi_reg_bind_enable(const void *buf, + size_t len, struct fid_mr **mr, OFIContext *context); #endif -/* ### Beginning of Machine-startup Related Functions ### */ -void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) -{ - struct fi_info *providers; - struct fi_info *prov; - struct fi_info *hints; - struct fi_domain_attr domain_attr = {0}; - struct fi_tx_attr tx_attr = { 0 }; - struct fi_cq_attr cq_attr = { 0 }; - struct fi_av_attr av_attr = { (enum fi_av_type)0 }; - int fi_version; - size_t max_header_size; - int i; - int ret; - - /** - * Initialize our runtime environment -- e.g. PMI. 
- */ - ret = runtime_init(myNodeID, numNodes); - if (ret) { - CmiAbort("OFI::LrtsInit::runtime_init failed"); - } - /** - * Hints to filter providers - * See man fi_getinfo for a list of all filters - * mode: This OFI machine will pass in context into communication calls - * ep_type: Reliable datagram operation - * resource_mgmt: Let the provider manage the resources - * caps: Capabilities required from the provider. We want to use the - * tagged message queue and rma read APIs. - */ - hints = fi_allocinfo(); - CmiAssert(NULL != hints); - hints->mode = FI_CONTEXT; - hints->ep_attr->type = FI_EP_RDM; - hints->domain_attr->resource_mgmt = FI_RM_ENABLED; - hints->caps = FI_TAGGED; - hints->caps |= FI_RMA; - hints->caps |= FI_REMOTE_READ; +static OFIContext context; +#if LARGEPAGE - /** - * FI_VERSION provides binary backward and forward compatibility support - * Specify the version of OFI this machine is coded to, the provider will - * select struct layouts that are compatible with this version. - */ - fi_version = FI_VERSION(1, 0); +/* directly mmap memory from hugetlbfs for large pages */ - ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints, &providers); - if (ret < 0) { - CmiAbort("OFI::LrtsInit::fi_getinfo error"); - } +#include +#include +#include - if (providers == NULL) { - CmiAbort("OFI::LrtsInit::No provider found"); - } +#ifdef __cplusplus +extern "C" { +#endif +#include +#ifdef __cplusplus +} +#endif +/** copied from the GNI layer */ +// size must be _tlbpagesize aligned +void *my_get_huge_pages(size_t size) +{ + char filename[512]; + int fd; + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + void *ptr = NULL; - /** - * Here we elect to use the first provider from the list. - * Further filtering could be done at this point (e.g. name). 
- */ - prov = providers; - - OFI_INFO("provider: %s\n", prov->fabric_attr->prov_name); - OFI_INFO("control progress: %d\n", prov->domain_attr->control_progress); - OFI_INFO("data progress: %d\n", prov->domain_attr->data_progress); - - context.inject_maxsize = prov->tx_attr->inject_size; - OFI_INFO("maximum inject message size: %ld\n", context.inject_maxsize); - - context.eager_maxsize = OFI_EAGER_MAXSIZE_DEFAULT; - CmiGetArgInt(*argv, "+ofi_eager_maxsize", (int*)&context.eager_maxsize); - if (context.eager_maxsize > prov->ep_attr->max_msg_size) - CmiAbort("OFI::LrtsInit::Eager max size > max msg size."); - if (context.eager_maxsize > OFI_EAGER_MAXSIZE_MAX || context.eager_maxsize < 0) - CmiAbort("OFI::LrtsInit::Eager max size range error."); - max_header_size = (sizeof(OFIRmaHeader) >= sizeof(OFIRmaAck)) ? sizeof(OFIRmaHeader) : sizeof(OFIRmaAck); - if (context.eager_maxsize < max_header_size) - CmiAbort("OFI::LrtsInit::Eager max size too small to fit headers."); - OFI_INFO("eager maximum message size: %ld (maximum header size: %ld)\n", - context.eager_maxsize, max_header_size); - - context.cq_entries_count = OFI_CQ_ENTRIES_COUNT_DEFAULT; - CmiGetArgInt(*argv, "+ofi_cq_entries_count", (int*)&context.cq_entries_count); - if (context.cq_entries_count > OFI_CQ_ENTRIES_COUNT_MAX || context.cq_entries_count <= 0) - CmiAbort("OFI::LrtsInit::Cq entries count range error"); - OFI_INFO("cq entries count: %ld\n", context.cq_entries_count); - - context.use_inject = OFI_USE_INJECT_DEFAULT; - CmiGetArgInt(*argv, "+ofi_use_inject", &context.use_inject); - if (context.use_inject < 0) - CmiAbort("OFI::LrtsInit::Use inject value error"); - OFI_INFO("use inject: %d\n", context.use_inject); - - context.rma_maxsize = prov->ep_attr->max_msg_size; - context.mr_mode = static_cast(prov->domain_attr->mr_mode); - context.mr_counter = 0; - - OFI_INFO("maximum rma size: %ld\n", context.rma_maxsize); - OFI_INFO("mr mode: 0x%x\n", context.mr_mode); - - if ((context.mr_mode != FI_MR_BASIC) && - 
(context.mr_mode != FI_MR_SCALABLE)) { - CmiAbort("OFI::LrtsInit::Unsupported MR mode"); + snprintf(filename, sizeof(filename), "%s/charm_mempool.%d.%d", hugetlbfs_find_path_for_size(_tlbpagesize), getpid(), rand()); + fd = open(filename, O_RDWR | O_CREAT, mode); + if (fd == -1) { + CmiAbort("my_get_huge_pages: open filed"); } + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (ptr == MAP_FAILED) ptr = NULL; +//printf("[%d] my_get_huge_pages: %s %d %p\n", myrank, filename, size, ptr); + close(fd); + unlink(filename); + return ptr; +} - OFI_INFO("use memory pool: %d\n", USE_MEMPOOL); - -#if USE_MEMPOOL - size_t mempool_init_size_mb = MEMPOOL_INIT_SIZE_MB_DEFAULT; - CmiGetArgInt(*argv, "+ofi_mempool_init_size_mb", (int*)&mempool_init_size_mb); - context.mempool_init_size = mempool_init_size_mb * ONE_MB; - - size_t mempool_expand_size_mb = MEMPOOL_EXPAND_SIZE_MB_DEFAULT; - CmiGetArgInt(*argv, "+ofi_mempool_expand_size_mb", (int*)&mempool_expand_size_mb); - context.mempool_expand_size = mempool_expand_size_mb * ONE_MB; - - size_t mempool_max_size_mb = MEMPOOL_MAX_SIZE_MB_DEFAULT; - CmiGetArgInt(*argv, "+ofi_mempool_max_size_mb", (int*)&mempool_max_size_mb); - context.mempool_max_size = mempool_max_size_mb * ONE_MB; - - context.mempool_lb_size = MEMPOOL_LB_DEFAULT; - CmiGetArgInt(*argv, "+ofi_mempool_lb_size", (int*)&context.mempool_lb_size); +void my_free_huge_pages(void *ptr, int size) +{ +//printf("[%d] my_free_huge_pages: %p %d\n", myrank, ptr, size); + int ret = munmap(ptr, size); + if (ret == -1) CmiAbort("munmap failed in my_free_huge_pages"); +} - context.mempool_rb_size = MEMPOOL_RB_DEFAULT; - CmiGetArgInt(*argv, "+ofi_mempool_rb_size", (int*)&context.mempool_rb_size); +#endif - if (context.mempool_lb_size > context.mempool_rb_size) - CmiAbort("OFI::LrtsInit::Mempool left border should be less or equal to right border"); +#include "machine-rdma.h" +#if CMK_ONESIDED_IMPL +#include "machine-onesided.h" +#endif - OFI_INFO("mempool init 
size: %ld\n", context.mempool_init_size); - OFI_INFO("mempool expand size: %ld\n", context.mempool_expand_size); - OFI_INFO("mempool max size: %ld\n", context.mempool_max_size); - OFI_INFO("mempool left border size: %ld\n", context.mempool_lb_size); - OFI_INFO("mempool right border size: %ld\n", context.mempool_rb_size); +#if CMK_CXI +/* transformed from cpuaffinity.C due to our need to parse the same + sort of arg string, but having to do so before CmiNumPesGlobal (and + similar quantities) have been defined +*/ +static int search_map(char *mapstring, int pe) +{ + int NumPesGlobal; + PMI_Get_universe_size(&NumPesGlobal); + int *map = (int *)malloc(NumPesGlobal*sizeof(int)); + char *ptr = NULL; + int h, i, j, k, count; + int plusarr[128]; + char *str; + + char *mapstr = (char*)malloc(strlen(mapstring)+1); + strcpy(mapstr, mapstring); + + str = strtok_r(mapstr, ",", &ptr); + count = 0; + while (str && count < NumPesGlobal) + { + int hasdash=0, hascolon=0, hasdot=0, hasstar1=0, hasstar2=0, numplus=0; + int start, end, stride=1, block=1; + int iter=1; + plusarr[0] = 0; + for (i=0; i stride) { + printf("Warning: invalid block size in \"%s\" ignored.\n", str); + block=1; + } + //if (CmiMyPe() == 0) printf("iter: %d start: %d end: %d stride: %d, block: %d. plus %d \n", iter, start, end, stride, block, numplus); + for (k = 0; kend) break; + for (h=0; h<=numplus; h++) { + map[count++] = i+j+plusarr[h]; + if (count == NumPesGlobal) break; + } + if (count == NumPesGlobal) break; + } + if (count == NumPesGlobal) break; + } + if (count == NumPesGlobal) break; + } + str = strtok_r(NULL, ",", &ptr); + } + i = map[pe % count]; + + free(map); + free(mapstr); + return i; +} #endif - /** - * Open fabric - * The getinfo struct returns a fabric attribute struct that can be used to - * instantiate the virtual or physical network. This opens a "fabric - * provider". See man fi_fabric for details. 
- */ - ret = fi_fabric(prov->fabric_attr, &context.fabric, NULL); - if (ret < 0) { - fi_freeinfo(providers); - CmiAbort("OFI::LrtsInit::fi_fabric error"); +/* ### Beginning of Machine-startup Related Functions ### */ +void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) +{ + struct fi_info *providers; + struct fi_info *prov; + struct fi_info *hints; + struct fi_domain_attr domain_attr = {0}; + struct fi_tx_attr tx_attr = { 0 }; + struct fi_cq_attr cq_attr = { 0 }; + struct fi_av_attr av_attr = { (enum fi_av_type)0 }; + int fi_version; + size_t max_header_size; + + int i; + int ret; + + /** + * Initialize our runtime environment -- e.g. PMI. + */ + ret = runtime_init(myNodeID, numNodes); + // CmiPrintf("[%d] nodeid %d, numnodes %d\n", *myNodeID, *myNodeID, *numNodes); + if (ret) { + CmiAbort("OFI::LrtsInit::runtime_init failed"); + } + /* + int namelength; + PMI_KVS_Get_name_length_max(&namelength); + char *name1=(char *) malloc(namelength+1); + char *name2=(char *) malloc(namelength+1); + PMI_KVS_Get_my_name(name1, namelength); + CmiPrintf("[%d] PMI keyspace %s\n", *myNodeID, PMI); + */ + /** + * Hints to filter providers + * See man fi_getinfo for a list of all filters + * mode: This OFI machine will pass in context into communication calls + * ep_type: Reliable datagram operation + * resource_mgmt: Let the provider manage the resources + * caps: Capabilities required from the provider. We want to use the + * tagged message queue and rma read APIs. 
+ */ + hints = fi_allocinfo(); + CmiAssert(NULL != hints); + hints->mode = ~0; + hints->domain_attr->mode = ~0; +#if CMK_CXI + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; +#endif + hints->mode = FI_CONTEXT; + hints->ep_attr->type = FI_EP_RDM; +#if CMK_CXI + hints->ep_attr->protocol = FI_PROTO_CXI; + hints->domain_attr->threading = FI_THREAD_SAFE; + //hints->domain_attr->threading = FI_THREAD_DOMAIN; + hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; + hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + hints->domain_attr->auth_key =NULL; + // hints->ep_attr->type = FI_EP_MSG; +#endif + hints->domain_attr->resource_mgmt = FI_RM_ENABLED; + hints->caps = FI_TAGGED; + hints->caps |= FI_RMA; + hints->caps |= FI_REMOTE_READ; +#if CMK_CXI + // Figure out which NIC we should request based on the one that + // should be closest. + /* This is overly complicated for several reasons: + + * 1. The hardware itself is not built to have the numerical ID + * ordering of different types of hardware correlate with + * proximity at all. E.g., on frontier core 0 is in NUMA 0 which + * means it is closest to GPU 4 and HSN (NIC) 2, but is a direct + * peer of cores 0-15. So, proximal ordering outside of type + * should not be considered predictive of proximity. That + * relationship has to be detected by other means. + + * 2. HWLOC doesn't have a hwloc_get_closest_nic because... NIC + * doesn't even rate an object type in their ontology, let + * alone get first class treatment. Given that PCI devices + * don't have a cpuset, there are a bunch of HWLOC features + * that don't work for them. But it is the portable hardware + * interrogation API we have to hand. So, instead we get our + * NUMAnode, and then get the PCI objects inside it. Get the + * (Ethernet)->Net(Slingshot) object and take the name from it, + * (e.g., hsn2). Get the last digit and append it to "cxi". + * There may be a better way to do this, but it isn't apparent + * to me based on their documentation. 
+ + * 2a. How one actually extracts that information from HWLOC is + * difficult to unravel. As it somehow accessible to their + * lstopo utility, but from within their C API the PCI devices + * do *not* have such convenient labeling as something special + * needs to happen to get their linuxfs utilities to inject + * that derived information into your topology object. As an + * interim solution we allow the user to map their cxi[0..3] + * selection using command line arguments. + + * 2b. Likewise the 1:1 relationship we assume here between + * cxi[0..3] and hsn[0..3] is informed speculation backed up by + * no documentation. Because, why have cxi0..3 at all if they + * don't correlate with the underlying hsn0..3? We assume the + * designers aren't insane or malicious, just stuck on the other + * side of an NDA. + + * 3. LrtsInit is of necessity fairly early in the startup + * process, so a lot of the infrastructure we might otherwise rely + * upon hasn't been set up yet. But, we do have the hwloc + * topology and cray-pmi. + + * 4. We might not (depending on what does the binding) have + * bound this process yet, so exactly where we are and how + * close that is to any particular NIC is sort of fluid. + + * 5. How many CXI domain interfaces exist? You can't tell on + * the head node, the answer could easily be zero there. You + * also can't be sure that whatever was true at compile time + * will be true at run time. Crusher and Frontier have four. + * Delta has one. Perlmutter has four on GPU nodes and one on + * CPU nodes. The user could easily be confused, so we can't + * rely on them telling us. This has to be determined at + * run time. 
+ */ + + char *cximap=NULL; + CmiGetArgStringDesc(*argv, "+cximap", &cximap, "define cxi interface to process mapping"); +#endif + /** + * FI_VERSION provides binary backward and forward compatibility support + * Specify the version of OFI this machine is coded to, the provider will + * select struct layouts that are compatible with this version. + */ + // fi_version = FI_VERSION(1, 15); + // CXI versions itself differently from OFI + +#if CMK_CXI + /* CXI has its own versioning, so just use whatever the build env + is until we come up with some CXI version specific changes */ + fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION); +#else + fi_version = FI_VERSION(1, 0); +#endif + ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints, &providers); + + if (ret < 0) { + CmiAbort("OFI::LrtsInit::fi_getinfo error"); + } + + if (providers == NULL) { + CmiAbort("OFI::LrtsInit::No provider found"); + } + +#if CMK_CXI + char myDomainName[5]="cxi0"; + char priorDomain[5]="null"; + short numcxi=0; + for(fi_info *aprov = providers; aprov!=NULL; aprov=aprov->next) + { // count up the CXI interfaces + if(strncmp(aprov->domain_attr->name,myDomainName,3)==0 && strncmp(aprov->domain_attr->name,priorDomain,4)!=0) + { + numcxi++; + strncpy(priorDomain,aprov->domain_attr->name,4); + } } - - /** - * Create the access domain, which is the physical or virtual network or - * hardware port/collection of ports. Returns a domain object that can be - * used to create endpoints. See man fi_domain for details. - */ - ret = fi_domain(context.fabric, prov, &context.domain, NULL); - if (ret < 0) { - fi_freeinfo(providers); - CmiAbort("OFI::LrtsInit::fi_domain error"); + short myNet; + int numPesOnNode; +#define HAS_PMI_Get_numpes_in_app_on_smp 1 +#if HAS_PMI_Get_numpes_in_app_on_smp + PMI_Get_numpes_in_app_on_smp(&numPesOnNode); +#else + // how do we learn how many processes there are on this node? 
+#endif + int myRank=*myNodeID%numPesOnNode; + if(cximap != NULL) + { + myNet=search_map(cximap,myRank); + // CmiPrintf("map sets process %d to rank %d to cxi%d\n",*myNodeID, myRank, myNet); } - - /** - * Create a transport level communication endpoint. To use the endpoint, - * it must be bound to completion counters or event queues and enabled, - * and the resources consumed by it, such as address vectors, counters, - * completion queues, etc. See man fi_endpoint for more details. - */ - ret = fi_endpoint(context.domain, /* In: Domain object */ - prov, /* In: Provider */ - &context.ep, /* Out: Endpoint object */ - NULL); /* Optional context */ - if (ret < 0) { - fi_freeinfo(providers); - CmiAbort("OFI::LrtsInit::fi_endpoint error"); + else + { + int quad= (numPesOnNode>=numcxi) ? numcxi : numPesOnNode; + // CmiPrintf("[%d] divnumPesOnNode %d numcxi %d quad %d\n",myRank, numPesOnNode, numcxi, quad); + // determine where we fall in the ordering + // Default OS id order on frontier + /* 0-15 -> HSN-2 + * 16-31 -> HSN-1 + * 32-47 -> HSN-3 + * 48-63 -> HSN-0 + but experimentally, the best order seems to be 1302 + */ + + + /// short hsnOrder[numcxi]={2,1,3,0}; + if(numcxi==4) + { + short hsnOrder[4]= {1,3,0,2}; + if(myRank%quad>numcxi) + { + CmiPrintf("Error: myrank %d quad %d myrank/quad %d",myRank,quad, myRank/quad); + CmiAbort("cxi mapping failure"); + } + myNet=hsnOrder[myRank%quad]; + } + else + { + CmiAssert(numcxi==1); + //theoretically there are cases other than 4 and 1, but + //until someone sights such an incrayptid on a machine floor, + //we're just going to assume they don't exist. + myNet=0; + } } + snprintf(myDomainName,5, "cxi%d", myNet); - /** - * Create the objects that will be bound to the endpoint.
- * The objects include: - * - completion queue for events - * - address vector of other endpoint addresses - */ - cq_attr.format = FI_CQ_FORMAT_TAGGED; - ret = fi_cq_open(context.domain, &cq_attr, &context.cq, NULL); - if (ret < 0) { - CmiAbort("OFI::LrtsInit::fi_cq_open error"); + for(fi_info *aprov = providers; aprov!=NULL; aprov=aprov->next) + { + // if we're running multiple processes per node, we should + // choose the CXI interface closest to our placement. This is + // a little awkward as we're at an information low moment + // early in the bootstrapping process. + //OFI_INFO("aprovider: %s domain %s\n", aprov->fabric_attr->prov_name, aprov->domain_attr->name); + if(strncmp(aprov->domain_attr->name,myDomainName,4)==0) + { + prov = aprov; +#if OFI_VERBOSE_STARTUP + if(*myNodeID<=numPesOnNode) + CmiPrintf("Process [%d] will use domain %s\n", *myNodeID, myDomainName); +#else + //assume a manual map wants confirming output + if(cximap != NULL) + CmiPrintf("Process [%d] will use domain %s\n", *myNodeID, myDomainName); +#endif + } } +#endif - /** - * Since the communications happen between Nodes and that each Node - * has a number (NodeNo), we can use the Address Vector in FI_AV_TABLE - * mode. The addresses of the Nodes simply need to be inserted in order - * so that the NodeNo becomes the index in the AV. The advantage being - * that the fi_addrs are stored by the OFI provider. 
- */ - av_attr.type = FI_AV_TABLE; - ret = fi_av_open(context.domain, &av_attr, &context.av, NULL); - if (ret < 0) { - CmiAbort("OFI::LrtsInit::fi_av_open error"); - } + context.inject_maxsize = prov->tx_attr->inject_size; + context.eager_maxsize = OFI_EAGER_MAXSIZE_DEFAULT; + CmiGetArgInt(*argv, "+ofi_eager_maxsize", (int*)&context.eager_maxsize); + if (context.eager_maxsize > prov->ep_attr->max_msg_size) + CmiAbort("OFI::LrtsInit::Eager max size > max msg size."); + if (context.eager_maxsize > OFI_EAGER_MAXSIZE_MAX || context.eager_maxsize < 0) + CmiAbort("OFI::LrtsInit::Eager max size range error."); + max_header_size = (sizeof(OFIRmaHeader) >= sizeof(OFIRmaAck)) ? sizeof(OFIRmaHeader) : sizeof(OFIRmaAck); + if (context.eager_maxsize < max_header_size) + CmiAbort("OFI::LrtsInit::Eager max size too small to fit headers."); + context.cq_entries_count = OFI_CQ_ENTRIES_COUNT_DEFAULT; + CmiGetArgInt(*argv, "+ofi_cq_entries_count", (int*)&context.cq_entries_count); + if (context.cq_entries_count > OFI_CQ_ENTRIES_COUNT_MAX || context.cq_entries_count <= 0) + CmiAbort("OFI::LrtsInit::Cq entries count range error"); + context.use_inject = OFI_USE_INJECT_DEFAULT; + CmiGetArgInt(*argv, "+ofi_use_inject", &context.use_inject); + if (context.use_inject < 0) + CmiAbort("OFI::LrtsInit::Use inject value error"); + context.rma_maxsize = prov->ep_attr->max_msg_size; +#if CMK_CXI + context.mr_mode = prov->domain_attr->mr_mode; +#else + // the old code path uses the defunct enum + context.mr_mode = static_cast<fi_mr_mode>(prov->domain_attr->mr_mode); +#endif - /** - * Bind the CQ and AV to the endpoint object.
- */ - ret = fi_ep_bind(context.ep, - (fid_t)context.cq, - FI_RECV | FI_TRANSMIT); - if (ret < 0) { - CmiAbort("OFI::LrtsInit::fi_bind EP-CQ error"); - } - ret = fi_ep_bind(context.ep, - (fid_t)context.av, - 0); - if (ret < 0) { - CmiAbort("OFI::LrtsInit::fi_bind EP-AV error"); - } +#define OFI_VERBOSE_STARTUP 0 +#if OFI_VERBOSE_STARTUP + OFI_INFO("[%d]provider: %s\n", *myNodeID, prov->fabric_attr->prov_name); + OFI_INFO("[%d]domain: %s\n", *myNodeID, prov->domain_attr->name); + OFI_INFO("control progress: %d\n", prov->domain_attr->control_progress); + OFI_INFO("data progress: %d\n", prov->domain_attr->data_progress); + OFI_INFO("maximum inject message size: %ld\n", context.inject_maxsize); + OFI_INFO("eager maximum message size: %ld (maximum header size: %ld)\n", + context.eager_maxsize, max_header_size); + OFI_INFO("cq entries count: %ld\n", context.cq_entries_count); + OFI_INFO("use inject: %d\n", context.use_inject); + +#if CMK_CXI + OFI_INFO("requested mr mode: 0x%x\n", FI_MR_ENDPOINT); + OFI_INFO("requested mr mode & mr_mode: 0x%x\n", (FI_MR_ENDPOINT) & context.mr_mode); +#endif + // start at 51 for the normal stuff, like pool messages + context.mr_counter = 51; + OFI_INFO("maximum rma size: %ld\n", context.rma_maxsize); + OFI_INFO("mr mode: 0x%x\n", context.mr_mode); - /** - * Enable the endpoint for communication - * This commits the bind operations. - */ - ret = fi_enable(context.ep); - if (ret < 0) { - CmiAbort("OFI::LrtsInit::fi_enable error"); - } + OFI_INFO("mr virtual address support : 0x%x\n", context.mr_mode & FI_MR_VIRT_ADDR); + OFI_INFO("use memory pool: %d\n", USE_MEMPOOL); +#endif //verbose - OFI_INFO("use request cache: %d\n", USE_OFIREQUEST_CACHE); +#if CMK_CXI + if ((context.mr_mode & FI_MR_ENDPOINT)==0) + CmiAbort("OFI::LrtsInit::Unsupported MR mode FI_MR_ENDPOINT"); +#else + /* keeping this for now, should debug this on a non-cray and make + sure we get a basic OFI working there without these defunct MR + modes. 
Currently, we don't actually care about non CXI OFI, + but it could be good on AWS EFA and potentially good on future + platforms where there is an optimal provider for the underlying + hardware. */ + if ((context.mr_mode != FI_MR_BASIC) && + (context.mr_mode != FI_MR_SCALABLE)) { + CmiAbort("OFI::LrtsInit::Unsupported MR mode"); + } +#endif -#if USE_OFIREQUEST_CACHE - /** - * Create request cache. - */ - context.request_cache = create_request_cache(); + +#if USE_MEMPOOL + size_t mempool_init_size_mb = MEMPOOL_INIT_SIZE_MB_DEFAULT; + CmiGetArgInt(*argv, "+ofi_mempool_init_size_mb", (int*)&mempool_init_size_mb); + context.mempool_init_size = mempool_init_size_mb * ONE_MB; + + size_t mempool_expand_size_mb = MEMPOOL_EXPAND_SIZE_MB_DEFAULT; + CmiGetArgInt(*argv, "+ofi_mempool_expand_size_mb", (int*)&mempool_expand_size_mb); + context.mempool_expand_size = mempool_expand_size_mb * ONE_MB; + + long long mempool_max_size_mb = MEMPOOL_MAX_SIZE_MB_DEFAULT; + CmiGetArgInt(*argv, "+ofi_mempool_max_size_mb", (int*)&mempool_max_size_mb); + context.mempool_max_size = mempool_max_size_mb * ONE_MB; + + context.mempool_lb_size = MEMPOOL_LB_DEFAULT; + CmiGetArgInt(*argv, "+ofi_mempool_lb_size", (int*)&context.mempool_lb_size); + + context.mempool_rb_size = MEMPOOL_RB_DEFAULT; + CmiGetArgInt(*argv, "+ofi_mempool_rb_size", (int*)&context.mempool_rb_size); + + if (context.mempool_lb_size > context.mempool_rb_size) + CmiAbort("OFI::LrtsInit::Mempool left border should be less or equal to right border"); +#if OFI_VERBOSE_STARTUP + OFI_INFO("mempool init size: %ld\n", context.mempool_init_size); + OFI_INFO("mempool expand size: %ld\n", context.mempool_expand_size); + OFI_INFO("mempool max size: %lld\n", context.mempool_max_size); + OFI_INFO("mempool left border size: %ld\n", context.mempool_lb_size); + OFI_INFO("mempool right border size: %ld\n", context.mempool_rb_size); +#endif #endif - /** - * Create local receive buffers and pre-post them. 
- */ - context.num_recv_reqs = OFI_NUM_RECV_REQS_DEFAULT; - CmiGetArgInt(*argv, "+ofi_num_recvs", &context.num_recv_reqs); - if (context.num_recv_reqs > OFI_NUM_RECV_REQS_MAX || context.num_recv_reqs <= 0) - CmiAbort("OFI::LrtsInit::Num recv reqs range error"); - OFI_INFO("number of pre-allocated recvs: %i\n", context.num_recv_reqs); + /** + * Open fabric + * The getinfo struct returns a fabric attribute struct that can be used to + * instantiate the virtual or physical network. This opens a "fabric + * provider". See man fi_fabric for details. + */ + // CmiPrintf("[%d] PMI_initialized %d : %d\n",*myNodeID, PMI2_Initialized(), PMI_SUCCESS); + ret = fi_fabric(prov->fabric_attr, &context.fabric, NULL); + if (ret < 0) { + MACHSTATE1(3, "fi_fabric error: %d\n", ret); + fi_freeinfo(providers); + CmiAbort("OFI::LrtsInit::fi_fabric error"); + } + + /** + * Create the access domain, which is the physical or virtual network or + * hardware port/collection of ports. Returns a domain object that can be + * used to create endpoints. See man fi_domain for details. + */ + + ret = fi_domain(context.fabric, prov, &context.domain, NULL); + if (ret < 0) { + MACHSTATE2(3, "[%d] fi_domain error: %d\n",*myNodeID, ret); + fi_freeinfo(providers); + // CmiPrintf("[%d] fi_domain error: %d\n", *myNodeID, ret); + CmiAbort("OFI::LrtsInit::fi_domain error, for single node use try --network=single_node_vni"); + } + /** + * Create a transport level communication endpoint. To use the endpoint, + * it must be bound to completion counters or event queues and enabled, + * and the resources consumed by it, such as address vectors, counters, + * completion queues, etc. See man fi_endpoint for more details. 
+ */ + ret = fi_endpoint(context.domain, /* In: Domain object */ + prov, /* In: Provider */ + &context.ep, /* Out: Endpoint object */ + NULL); /* Optional context */ + if (ret < 0) { + MACHSTATE1(3, "fi_endpoint error: %d\n", ret); + fi_freeinfo(providers); + CmiAbort("OFI::LrtsInit::fi_endpoint error %d", ret); + } + + /** + * Create the objects that will be bound to the endpoint. + * The objects include: + * - completion queue for events + * - address vector of other endpoint addresses + */ + cq_attr.format = FI_CQ_FORMAT_TAGGED; + ret = fi_cq_open(context.domain, &cq_attr, &context.cq, NULL); + if (ret < 0) { + CmiAbort("OFI::LrtsInit::fi_cq_open error"); + } + + /** + * Since the communications happen between Nodes and that each Node + * has a number (NodeNo), we can use the Address Vector in FI_AV_TABLE + * mode. The addresses of the Nodes simply need to be inserted in order + * so that the NodeNo becomes the index in the AV. The advantage being + * that the fi_addrs are stored by the OFI provider. + */ + av_attr.type = FI_AV_TABLE; + ret = fi_av_open(context.domain, &av_attr, &context.av, NULL); + if (ret < 0) { + CmiAbort("OFI::LrtsInit::fi_av_open error"); + } + + /** + * Bind the CQ and AV to the endpoint object. + */ + ret = fi_ep_bind(context.ep, + (fid_t)context.cq, + FI_RECV | FI_TRANSMIT); + if (ret < 0) { + CmiAbort("OFI::LrtsInit::fi_bind EP-CQ error"); + } + ret = fi_ep_bind(context.ep, + (fid_t)context.av, + 0); + if (ret < 0) { + CmiAbort("OFI::LrtsInit::fi_bind EP-AV error"); + } + + /** + * Enable the endpoint for communication + * This commits the bind operations. + */ + ret = fi_enable(context.ep); + if (ret < 0) { + CmiAbort("OFI::LrtsInit::fi_enable error"); + } +#if OFI_VERBOSE_STARTUP + OFI_INFO("use request cache: %d\n", USE_OFIREQUEST_CACHE); +#endif +#if USE_OFIREQUEST_CACHE + /** + * Create request cache. + */ + context.request_cache = create_request_cache(); +#endif - /** - * Exchange EP names and insert them into the AV. 
- */ - if (CmiGetArgFlag(*argv, "+ofi_runtime_tcp")) { - OFI_INFO("exchanging addresses over TCP\n"); - ret = fill_av(*myNodeID, *numNodes, context.ep, - context.av, context.cq); - if (ret < 0) { - CmiAbort("OFI::LrtsInit::fill_av"); - } - } else { - OFI_INFO("exchanging addresses over OFI\n"); - ret = fill_av_ofi(*myNodeID, *numNodes, context.ep, - context.av, context.cq); - if (ret < 0) { - CmiAbort("OFI::LrtsInit::fill_av_ofi"); - } + /** + * Create local receive buffers and pre-post them. + */ + context.num_recv_reqs = OFI_NUM_RECV_REQS_DEFAULT; + CmiGetArgInt(*argv, "+ofi_num_recvs", &context.num_recv_reqs); + if (context.num_recv_reqs > OFI_NUM_RECV_REQS_MAX || context.num_recv_reqs <= 0) + CmiAbort("OFI::LrtsInit::Num recv reqs range error"); +#if OFI_VERBOSE_STARTUP + OFI_INFO("number of pre-allocated recvs: %i\n", context.num_recv_reqs); +#endif + /** + * Exchange EP names and insert them into the AV. + */ + //this is now the default because it is stable, but allow the + //argument for backward compatibility + CmiGetArgFlag(*argv, "+ofi_runtime_tcp"); + + if (CmiGetArgFlag(*argv, "+ofi_runtime_ofi")) { + OFI_INFO("exchanging addresses over OFI\n"); + ret = fill_av_ofi(*myNodeID, *numNodes, context.ep, + context.av, context.cq); + if (ret < 0) { + CmiAbort("OFI::LrtsInit::fill_av_ofi"); + } + } + else // + { + OFI_INFO("exchanging addresses over TCP\n"); + ret = fill_av(*myNodeID, *numNodes, context.ep, + context.av, context.cq); + if (ret < 0) { + CmiAbort("OFI::LrtsInit::fill_av"); } + } -#if CMK_SMP - /** - * Initialize send queue. - */ - context.send_queue = PCQueueCreate(); +#if CMK_SMP && CMK_SMP_SENDQ + /** + * Initialize send queue. + */ + context.send_queue = PCQueueCreate(); #endif - /** - * Free providers info since it's not needed anymore. - */ - fi_freeinfo(hints); - hints = NULL; - fi_freeinfo(providers); - providers = NULL; + /** + * Free providers info since it's not needed anymore. 
+ */ + fi_freeinfo(hints); + hints = NULL; + fi_freeinfo(providers); + providers = NULL; } static inline void prepost_buffers() { - OFIRequest **reqs; - ALIGNED_ALLOC(reqs, sizeof(void*) * context.num_recv_reqs); + OFIRequest **reqs=NULL; +#if CMK_CXI + // CmiAlloc will go through LrtsAlloc, which will use a memory + // pool, which should do all the right things wrt register, bind, + // enable behind the scenes + reqs = (OFIRequest **) CmiAlloc(sizeof(void*) * context.num_recv_reqs); +#else + ALIGNED_ALLOC(reqs,(sizeof(void*) * context.num_recv_reqs)); +#endif int i; for (i = 0; i < context.num_recv_reqs; i++) { #if USE_OFIREQUEST_CACHE reqs[i] = alloc_request(context.request_cache); #else - reqs[i] = (OFIRequest *)CmiAlloc(sizeof(OFIRequest)); + reqs[i] = (OFIRequest *) CmiAlloc(sizeof(OFIRequest)); #endif reqs[i]->callback = recv_callback; - reqs[i]->data.recv_buffer = CmiAlloc(context.eager_maxsize); + + reqs[i]->data.recv_buffer = CmiAlloc(context.eager_maxsize); CmiAssert(reqs[i]->data.recv_buffer); MACHSTATE2(3, "---> posting recv req %p buf=%p", @@ -659,7 +1171,7 @@ void send_short_callback(struct fi_cq_tagged_entry *e, OFIRequest *req) * A short message was sent. * Free up resources. */ - char *msg; + char *msg=NULL; MACHSTATE(3, "OFI::send_short_callback {"); @@ -686,7 +1198,7 @@ void send_rma_callback(struct fi_cq_tagged_entry *e, OFIRequest *req) * An OFIRmaHeader was sent. * Free up resources. */ - OFIRmaHeader *header; + OFIRmaHeader *header=NULL; MACHSTATE(3, "OFI::send_rma_callback {"); @@ -702,6 +1214,47 @@ void send_rma_callback(struct fi_cq_tagged_entry *e, OFIRequest *req) MACHSTATE(3, "} OFI::send_rma_callback done"); } + +#ifdef CMK_CXI +static inline +void ofi_send_reg(void *buf, size_t buf_size, int addr, uint64_t tag, OFIRequest *req, struct fid_mr* mr) +{ + if (context.use_inject && buf_size <= context.inject_maxsize) + { + /** + * The message is small enough to be injected. 
+ * This won't generate any completion, so we can free the msg now. + */ + MACHSTATE(3, "----> inject"); + + OFI_RETRY(fi_tinject(context.ep, + buf, + buf_size, + addr, + tag)); + req->callback(NULL, req); + } + else + { + + MACHSTATE3(3, "msg send mr %p: mr key %lu buf %p\n", mr, fi_mr_key(mr), buf); + /* Else, use regular send. */ + OFI_RETRY(fi_tsend(context.ep, + buf, + buf_size, +#if CMK_CXI + fi_mr_desc(mr), +#else + NULL, +#endif + addr, + tag, + &req->context)); + } +} +#endif + + static inline void ofi_send(void *buf, size_t buf_size, int addr, uint64_t tag, OFIRequest *req) { @@ -721,13 +1274,70 @@ void ofi_send(void *buf, size_t buf_size, int addr, uint64_t tag, OFIRequest *re req->callback(NULL, req); } else + { +#if CMK_CXI + + struct fid_mr* mr = (struct fid_mr *) GetMemHndl(buf); +#endif + + MACHSTATE3(3, "msg send mr %p: mr key %lu buf %p\n", mr, fi_mr_key(mr), buf); + /* Else, use regular send. */ + OFI_RETRY(fi_tsend(context.ep, + buf, + buf_size, +#if CMK_CXI + fi_mr_desc(mr), +#else + NULL, +#endif + addr, + tag, + &req->context)); + } +} + +static inline +void ofi_register_and_send(void *buf, size_t buf_size, int addr, uint64_t tag, OFIRequest *req) +{ + if (context.use_inject && buf_size <= context.inject_maxsize) { + /** + * The message is small enough to be injected. + * This won't generate any completion, so we can free the msg now. + */ + MACHSTATE(3, "----> inject"); +#if CMK_CXI + + struct fid_mr* mr; + ofi_reg_bind_enable(buf, buf_size, &mr,&context); +#endif + + OFI_RETRY(fi_tinject(context.ep, + buf, + buf_size, + addr, + tag)); + req->callback(NULL, req); + } + else + { +#if CMK_CXI + + struct fid_mr* mr; + ofi_reg_bind_enable(buf, buf_size, &mr,&context); +#endif + + MACHSTATE3(3, "msg send mr %p: mr key %lu buf %p\n", mr, fi_mr_key(mr), buf); /* Else, use regular send. 
*/ OFI_RETRY(fi_tsend(context.ep, buf, buf_size, - NULL, - addr, +#if CMK_CXI + fi_mr_desc(mr), +#else + NULL, +#endif + addr, tag, &req->context)); } @@ -741,12 +1351,12 @@ static inline int sendMsg(OFIRequest *req) { int ret; uint64_t op; - char *buf; + char *buf=NULL; size_t len; MACHSTATE5(2, "OFI::sendMsg destNode=%i destPE=%i size=%i msg=%p mode=%i {", - req->destNode, req->destPE, req->size, req->data, req->mode); + req->destNode, req->destPE, req->size, req->data.short_msg, req->mode); if (req->size <= context.eager_maxsize) { /** @@ -775,6 +1385,42 @@ static inline int sendMsg(OFIRequest *req) return 0; } +const int event_send_short_callback = 10333; +const int event_send_rma_callback = 10444; +const int event_ofi_send = 10555; +const int event_sendMsg = 10556; +const int event_LrtsSendFunc = 10557; +const int event_send_ack_callback = 10560; +const int event_rma_read_callback = 10561; +const int event_process_short_recv = 10600; +const int event_process_long_recv = 10601; +const int event_process_long_send_ack = 10602; +const int event_recv_callback = 10610; +const int event_process_completion_queue = 10650; +const int event_process_send_queue = 10660; +const int event_reg_bind_enable = 10670; +static int postInit = 0; + +static void registerUserTraceEvents(void) { +#if CMI_MACH_TRACE_USEREVENTS && CMK_TRACE_ENABLED + traceRegisterUserEvent("send_short_callback", event_send_short_callback); + traceRegisterUserEvent("send_rma_callback", event_send_rma_callback); + traceRegisterUserEvent("ofi_send", event_ofi_send); + traceRegisterUserEvent("sendMsg", event_sendMsg); + traceRegisterUserEvent("LrtsSendFunc", event_LrtsSendFunc); + traceRegisterUserEvent("send_ack_callback", event_send_ack_callback); + traceRegisterUserEvent("rma_read_callback", event_rma_read_callback); + traceRegisterUserEvent("process_short_recv", event_process_short_recv); + traceRegisterUserEvent("process_long_recv", event_process_long_recv); + 
traceRegisterUserEvent("process_long_send_ack", event_process_long_send_ack); + traceRegisterUserEvent("recv_callback", event_recv_callback); + traceRegisterUserEvent("process_completion_queue", event_process_completion_queue); + traceRegisterUserEvent("process_send_queue", event_process_send_queue); + traceRegisterUserEvent("reg_bind_enable", event_reg_bind_enable); +#endif +} + + /** * In non-SMP mode, this is used to send a message. * In CMK_SMP mode, this is called by a worker thread to send a message. @@ -783,18 +1429,22 @@ CmiCommHandle LrtsSendFunc(int destNode, int destPE, int size, char *msg, int mo { int ret; - OFIRequest *req; - + OFIRequest *req=NULL; +#if CMK_SMP_TRACE_COMMTHREAD + double startT, endT; +#endif MACHSTATE5(2, "OFI::LrtsSendFunc destNode=%i destPE=%i size=%i msg=%p mode=%i {", destNode, destPE, size, msg, mode); - +#if CMK_SMP_TRACE_COMMTHREAD + startT = CmiWallTimer(); +#endif CmiSetMsgSize(msg, size); #if USE_OFIREQUEST_CACHE req = alloc_request(context.request_cache); #else - req = (OFIRequest *)CmiAlloc(sizeof(OFIRequest)); + req = (OFIRequest *) CmiAlloc(sizeof(OFIRequest)); #endif CmiAssert(req); @@ -812,54 +1462,66 @@ CmiCommHandle LrtsSendFunc(int destNode, int destPE, int size, char *msg, int mo req->callback = send_short_callback; req->data.short_msg = msg; } else { - /** - * The message is too long to be sent directly. - * Let other side use RMA Read instead by sending an OFIRmaHeader. - */ - OFIRmaHeader *rma_header; - struct fid_mr *mr; - uint64_t requested_key = 0; - - MACHSTATE(3, "--> long"); - - ALIGNED_ALLOC(rma_header, sizeof(*rma_header)); - - if (FI_MR_SCALABLE == context.mr_mode) { - /** - * In FI_MR_SCALABLE mode, we need to specify a unique key when - * registering memory. Here we simply increment a counter - * atomically. 
- */ - requested_key = __sync_fetch_and_add(&(context.mr_counter), 1); - } - - /* Register new MR to RMA Read from */ - ret = fi_mr_reg(context.domain, /* In: domain object */ - msg, /* In: lower memory address */ - size, /* In: length */ - MR_ACCESS_PERMISSIONS, /* In: access permissions */ - 0ULL, /* In: offset (not used) */ - requested_key, /* In: requested key */ - 0ULL, /* In: flags */ - &mr, /* Out: memregion object */ - NULL); /* In: context (not used) */ - - if (ret) { - MACHSTATE1(3, "fi_mr_reg error: %d\n", ret); - CmiAbort("fi_mr_reg error"); - } - - rma_header->nodeNo = CmiMyNodeGlobal(); - rma_header->src_msg = (uint64_t)msg; - rma_header->len = size; - rma_header->key = fi_mr_key(mr); - rma_header->mr = (uint64_t)mr; + /** + * The message is too long to be sent directly. + * Let other side use RMA Read instead by sending an OFIRmaHeader. + */ + OFIRmaHeader *rma_header; + struct fid_mr *mr=NULL; +#if CMK_CXI + uint32_t requested_key = 0; + block_header *base_addr; +#else + uint64_t requested_key = 0; +#endif - req->callback = send_rma_callback; - req->data.rma_header = rma_header; + if ((FI_MR_BASIC & context.mr_mode) || + (FI_MR_SCALABLE & context.mr_mode)) + { + requested_key = __sync_fetch_and_add(&(context.mr_counter), 1); + /* Register new MR to RMA Read from */ + ret = fi_mr_reg(context.domain, /* In: domain object */ + msg, /* In: lower memory address */ + size, /* In: length */ + MR_ACCESS_PERMISSIONS, /* In: access permissions */ + 0ULL, /* In: offset (not used) */ + requested_key, /* In: requested key */ + 0ULL, /* In: flags */ + &mr, /* Out: memregion object */ + NULL); /* In: context (not used) */ + } + else if (FI_MR_ENDPOINT & context.mr_mode) + { + +#if CMK_CXI + mr = (struct fid_mr *) GetMemHndl(msg); + size_t offset = GetMemOffsetFromBase(msg); + MACHSTATE4(3, "msg send mr %p: mr key %lu buf %p offset %lu\n", mr, fi_mr_key(mr), msg, offset); +#else + CmiAbort("not implemented"); +#endif + } + MACHSTATE(3, "--> long"); + 
ALIGNED_ALLOC(rma_header,sizeof(OFIRmaHeader)); + rma_header->nodeNo = CmiMyNodeGlobal(); +#if CMK_CXI + rma_header->src_msg = GetMemOffsetFromBase(msg); + rma_header->orig_msg = (uint64_t) msg; +#else + rma_header->src_msg = (uint64_t)msg; +#endif + rma_header->len = size; + rma_header->key = fi_mr_key(mr); + rma_header->mr = (uint64_t) mr; + req->callback = send_rma_callback; + req->data.rma_header = rma_header; + MACHSTATE3(3, "sending msg size=%d, hdl=%d, xhdl=%d",CmiGetMsgSize(msg),CmiGetHandler(msg), CmiGetXHandler(msg)); } - -#if CMK_SMP +#if CMK_SMP_TRACE_COMMTHREAD + endT = CmiWallTimer(); + if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_LrtsSendFunc, startT, endT); +#endif +#if CMK_SMP && CMK_SMP_SENDQ /* Enqueue message */ MACHSTATE2(2, " --> (PE=%i) enqueuing message (queue depth=%i)", CmiMyPe(), PCQueueLength(context.send_queue)); @@ -881,16 +1543,17 @@ void send_ack_callback(struct fi_cq_tagged_entry *e, OFIRequest *req) * An OFIRmaAck was sent (see rma_read_callback()). * We are done with the RMA Read operation. Free up the resources. */ - OFILongMsg *long_msg; + OFILongMsg *long_msg=NULL; +#if CMK_SMP_TRACE_COMMTHREAD + double startT, endT; + startT = CmiWallTimer(); +#endif MACHSTATE(3, "OFI::send_ack_callback {"); long_msg = req->data.long_msg; CmiAssert(long_msg); - if (long_msg->mr) - fi_close((struct fid*)long_msg->mr); - free(long_msg); #if USE_OFIREQUEST_CACHE @@ -898,7 +1561,10 @@ void send_ack_callback(struct fi_cq_tagged_entry *e, OFIRequest *req) #else CmiFree(req); #endif - +#if CMK_SMP_TRACE_COMMTHREAD + endT = CmiWallTimer(); + if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_send_ack_callback, startT, endT); +#endif MACHSTATE(3, "} OFI::send_ack_callback done"); } @@ -908,7 +1574,11 @@ void rma_read_callback(struct fi_cq_tagged_entry *e, OFIRequest *req) /** * An RMA Read operation completed. 
*/ - OFILongMsg *long_msg; + OFILongMsg *long_msg=NULL; +#if CMK_SMP_TRACE_COMMTHREAD + double startT, endT; + startT = CmiWallTimer(); +#endif MACHSTATE(3, "OFI::rma_read_callback {"); @@ -944,13 +1614,21 @@ void rma_read_callback(struct fi_cq_tagged_entry *e, OFIRequest *req) * Pass received message to upper layer. */ MACHSTATE1(3, "--> Finished receiving msg size=%i", CMI_MSG_SIZE(asm_msg)); - - handleOneRecvedMsg(CMI_MSG_SIZE(asm_msg), asm_msg); + MACHSTATE4(3, "received msg size=%d, hdl=%d, xhdl=%d last=%x",CmiGetMsgSize(asm_msg),CmiGetHandler(asm_msg), CmiGetXHandler(asm_msg), asm_msg[CMI_MSG_SIZE(asm_msg)-1]); +#if CMK_SMP_TRACE_COMMTHREAD + endT = CmiWallTimer(); + if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_rma_read_callback, startT, endT); +#endif + handleOneRecvedMsg(CMI_MSG_SIZE(asm_msg), asm_msg); } else { #if USE_OFIREQUEST_CACHE free_request(req); #else CmiFree(req); +#endif +#if CMK_SMP_TRACE_COMMTHREAD + endT = CmiWallTimer(); + if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_rma_read_callback, startT, endT); #endif } @@ -966,9 +1644,13 @@ void process_short_recv(struct fi_cq_tagged_entry *e, OFIRequest *req) * - Allocate new recv buffer. 
*/ - char *data; + char *data=NULL; size_t msg_size; - +#if CMK_SMP_TRACE_COMMTHREAD + double startT, endT; + startT = CmiWallTimer(); +#endif + MACHSTATE(3, "OFI::process_short_recv"); data = (char *)req->data.recv_buffer; CmiAssert(data); @@ -977,7 +1659,11 @@ void process_short_recv(struct fi_cq_tagged_entry *e, OFIRequest *req) req->data.recv_buffer = CmiAlloc(context.eager_maxsize); CmiAssert(req->data.recv_buffer); - + MACHSTATE3(3, "received msg size=%d, hdl=%d, xhdl=%d",CmiGetMsgSize(data),CmiGetHandler(data), CmiGetXHandler(data)); +#if CMK_SMP_TRACE_COMMTHREAD + endT = CmiWallTimer(); + if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_process_short_recv, startT, endT); +#endif handleOneRecvedMsg(e->len, data); } @@ -992,21 +1678,25 @@ void process_long_recv(struct fi_cq_tagged_entry *e, OFIRequest *req) */ int ret; - OFILongMsg *long_msg; - OFIRequest *rma_req; - OFIRmaHeader *rma_header; + OFILongMsg *long_msg=NULL; + OFIRequest *rma_req=NULL; + OFIRmaHeader *rma_header=NULL; struct fid_mr *mr = NULL; - char *asm_buf; + char *asm_buf=NULL; int nodeNo; uint64_t rbuf; size_t len; uint64_t rkey; uint64_t rmsg; uint64_t rmr; - char *lbuf; + char *lbuf=NULL; size_t remaining; size_t chunk_size; - +#if CMK_SMP_TRACE_COMMTHREAD + double startT, endT; + startT = CmiWallTimer(); +#endif + MACHSTATE(3, "OFI::process_long_recv"); CmiAssert(e->len == sizeof(OFIRmaHeader)); /** @@ -1020,16 +1710,19 @@ void process_long_recv(struct fi_cq_tagged_entry *e, OFIRequest *req) rkey = rma_header->key; rmr = rma_header->mr; - MACHSTATE2(3, "--> Receiving long msg len=%ld rkey=0x%lx", len, rkey); + MACHSTATE5(3, "--> Receiving long msg src node %d len=%ld rptr=0x%lx rmsg=0x%lu rmr=0x%lx", nodeNo, len, rma_header->orig_msg, rmsg, rmr); + MACHSTATE3(3, "--> Receiving long msg rptr=0x%lx rkey=0x%lu rmr=0x%lx", rma_header->orig_msg, rkey, rmr); /** * Prepare buffer */ - asm_buf = (char *)CmiAlloc(len); - CmiAssert(asm_buf); - if (FI_MR_BASIC == context.mr_mode) { - /* 
Register local MR to read into */ + + if (FI_MR_BASIC & context.mr_mode) + { + MACHSTATE1(3, "FI_MR_BASIC %d", context.mr_mode); + asm_buf = (char *)CmiAlloc(len); + /* Register local MR to read into */ ret = fi_mr_reg(context.domain, /* In: domain object */ asm_buf, /* In: lower memory address */ len, /* In: length */ @@ -1040,22 +1733,33 @@ void process_long_recv(struct fi_cq_tagged_entry *e, OFIRequest *req) &mr, /* Out: memregion object */ NULL); /* In: context (not used) */ if (ret) { - MACHSTATE1(3, "fi_mr_reg error: %d\n", ret); + MACHSTATE1(3, "fi_mr_reg short buf error: %d\n", ret); CmiAbort("fi_mr_reg error"); } - } - + } + else if (FI_MR_ENDPOINT & context.mr_mode) { + asm_buf = (char *)CmiAlloc(len); + // memset(asm_buf,0,len); + } + CmiAssert(asm_buf); /** * Save some information about the RMA Read operation(s) */ - ALIGNED_ALLOC(long_msg, sizeof(*long_msg)); + ALIGNED_ALLOC(long_msg, sizeof(OFILongMsg)); + long_msg->asm_msg = asm_buf; long_msg->nodeNo = nodeNo; - long_msg->rma_ack.src_msg = rmsg; long_msg->rma_ack.mr = rmr; long_msg->completion_count = 0; - long_msg->mr = mr; - +#if CMK_CXI + // so the other side can free the right buffer in the offset case + long_msg->rma_ack.src_msg = rma_header->orig_msg; + long_msg->mr = (struct fid_mr *) GetMemHndl(asm_buf); + MACHSTATE2(3, "long msg mempool mr %p: mr key %lu\n", long_msg->mr, fi_mr_key(long_msg->mr)); +#else + long_msg->rma_ack.src_msg = rmsg; + long_msg->mr =mr; +#endif /** * Issue RMA Read request(s) */ @@ -1070,7 +1774,7 @@ void process_long_recv(struct fi_cq_tagged_entry *e, OFIRequest *req) #if USE_OFIREQUEST_CACHE rma_req = alloc_request(context.request_cache); #else - rma_req = (OFIRequest *)CmiAlloc(sizeof(OFIRequest)); + rma_req = (OFIRequest *) CmiAlloc(sizeof(OFIRequest)); #endif CmiAssert(rma_req); rma_req->callback = rma_read_callback; @@ -1079,44 +1783,58 @@ void process_long_recv(struct fi_cq_tagged_entry *e, OFIRequest *req) /* Increment number of expected completions */ 
long_msg->completion_count++; - MACHSTATE5(3, "---> RMA Read lbuf %p rbuf %p rmsg %p len %ld chunk #%d", + MACHSTATE5(3, "---> RMA Read lbuf %p rbuf %lu rmsg %lu len %ld chunk #%lu", lbuf, rbuf, rmsg, chunk_size, long_msg->completion_count); - OFI_RETRY(fi_read(context.ep, + + OFI_RETRY(fi_read(context.ep, lbuf, chunk_size, - (mr) ? fi_mr_desc(mr) : NULL, + fi_mr_desc(long_msg->mr), nodeNo, rbuf, rkey, &rma_req->context)); - - remaining -= chunk_size; - lbuf += chunk_size; - rbuf += chunk_size; + remaining -= chunk_size; + lbuf += chunk_size; + rbuf += chunk_size; } + MACHSTATE4(3, "---> RMA completed lbuf %p rbuf %lu len %lu comp %lu", + lbuf, rbuf, len, long_msg->completion_count); +#if CMK_SMP_TRACE_COMMTHREAD + endT = CmiWallTimer(); + if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_process_long_recv, startT, endT); +#endif } static inline void process_long_send_ack(struct fi_cq_tagged_entry *e, OFIRequest *req) { /** - * An OFIRmaAck was received; Close memory region and free original msg. + * An OFIRmaAck was received; free original msg. 
*/ - struct fid *mr; - char *msg; - + struct fid *mr=NULL; + char *msg=NULL; +#if CMK_SMP_TRACE_COMMTHREAD + double startT, endT; + startT = CmiWallTimer(); +#endif + MACHSTATE(3, "OFI::process_long_send_ack"); mr = (struct fid*)req->data.rma_ack->mr; CmiAssert(mr); - fi_close(mr); msg = (char *)req->data.rma_ack->src_msg; + MACHSTATE2(3, "OFI::process_long_send_ack for msg %p mr %p",msg, mr); CmiAssert(msg); - MACHSTATE1(3, "--> Finished sending msg size=%i", CMI_MSG_SIZE(msg)); + MACHSTATE2(3, "--> Finished sending msg size=%i msg ptr %p", CMI_MSG_SIZE(msg), msg); CmiFree(msg); +#if CMK_SMP_TRACE_COMMTHREAD + endT = CmiWallTimer(); + if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_process_long_send_ack, startT, endT); +#endif } static inline @@ -1151,10 +1869,13 @@ void recv_callback(struct fi_cq_tagged_entry *e, OFIRequest *req) break; #endif default: - MACHSTATE2(3, "--> unknown operation %x len=%ld", e->tag, e->len); + MACHSTATE2(3, "--> unknown operation %lu len=%lu", e->tag, e->len); CmiAbort("!! 
Wrong operation !!"); } - +#if CMK_SMP_TRACE_COMMTHREAD + double startT, endT; + startT = CmiWallTimer(); +#endif MACHSTATE2(3, "Reposting recv req %p buf=%p", req, req->data.recv_buffer); OFI_RETRY(fi_trecv(context.ep, req->data.recv_buffer, @@ -1164,7 +1885,10 @@ void recv_callback(struct fi_cq_tagged_entry *e, OFIRequest *req) 0, OFI_OP_MASK, &req->context)); - +#if CMK_SMP_TRACE_COMMTHREAD + endT = CmiWallTimer(); + if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_recv_callback, startT, endT); +#endif MACHSTATE(3, "} OFI::recv_callback done"); } @@ -1174,8 +1898,12 @@ int process_completion_queue() int ret; struct fi_cq_tagged_entry entries[context.cq_entries_count]; struct fi_cq_err_entry error; - OFIRequest *req; - + OFIRequest *req=NULL; +#if CMK_SMP_TRACE_COMMTHREAD + double startT, endT; + startT = CmiWallTimer(); +#endif + MACHSTATE(3, "OFI::process_completion_queue"); ret = fi_cq_read(context.cq, entries, context.cq_entries_count); if (ret > 0) { @@ -1198,7 +1926,7 @@ int process_completion_queue() } else { - MACHSTATE1(3, "Missed event with flags=%x", e->flags); + MACHSTATE1(3, "Missed event with flags=%lu", e->flags); CmiAbort("!! 
Missed an event !!"); } } @@ -1221,8 +1949,8 @@ int process_completion_queue() { CmiAbort("can't retrieve error"); } - MACHSTATE2(3, "POLL: error is %d (ret=%d)\n", error.err, ret); - CmiPrintf("POLL: error is %d (ret=%d)\n", error.err, ret); + MACHSTATE4(3, "POLL: error is %d (ret=%d) len %lu tag %lu\n", error.err, ret, error.len, error.tag); + CmiPrintf("POLL: error is %d (ret=%d) len %lu tag %lu\n", error.err, ret, error.len, error.tag); const char* strerror = fi_cq_strerror(context.cq, error.prov_errno, error.err_data, nullptr, 0); if (strerror == nullptr) { @@ -1233,15 +1961,23 @@ int process_completion_queue() } CmiAbort("Polling error"); } +#if CMK_SMP_TRACE_COMMTHREAD + endT = CmiWallTimer(); + if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_process_completion_queue, startT, endT); +#endif return ret; } -#if CMK_SMP +#if CMK_SMP && CMK_SMP_SENDQ static inline int process_send_queue() { - OFIRequest *req; + OFIRequest *req=NULL; int ret = 0; +#if CMK_SMP_TRACE_COMMTHREAD + double startT, endT; + startT = CmiWallTimer(); +#endif /** * Comm thread sends the next message that is waiting in the send_queue. 
*/ @@ -1256,6 +1992,10 @@ int process_send_queue() sendMsg(req); ret = 1; } +#if CMK_SMP_TRACE_COMMTHREAD + endT = CmiWallTimer(); + if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_process_send_queue, startT, endT); +#endif return ret; } #endif @@ -1268,14 +2008,16 @@ void *alloc_mempool_block(size_t *size, mem_handle_t *mem_hndl, int expand_flag) if (*size < alloc_size) *size = alloc_size; if (*size > context.mempool_max_size) { - CmiPrintf("Error: there is attempt to allocate memory block with size %lld which is greater than the maximum mempool allowed %lld.\n" + CmiPrintf("Error: there is attempt to allocate memory block with size %ld which is greater than the maximum mempool allowed %lld.\n" "Please increase the maximum mempool size by using +ofi-mempool-max-size\n", *size, context.mempool_max_size); CmiAbort("alloc_mempool_block"); } void *pool; - ALIGNED_ALLOC(pool, *size); + posix_memalign(&pool,ALIGNBUF,*size); + ofi_reg_bind_enable(pool, *size, mem_hndl,&context); + MACHSTATE4(3, "alloc_mempool_block ptr %p mr %p key %lu inkey %d\n", pool, *mem_hndl, fi_mr_key(*mem_hndl) , context.mr_counter-1); return pool; } @@ -1294,10 +2036,15 @@ void LrtsPreCommonInit(int everReturn) #if USE_MEMPOOL CpvInitialize(mempool_type*, mempool); + CpvAccess(mempool) = mempool_init(context.mempool_init_size, alloc_mempool_block, free_mempool_block, context.mempool_max_size); + + block_header* current = &(CpvAccess(mempool)->block_head); + struct fid_mr* extractedmr = (struct fid_mr *) MEMPOOL_GetBlockMemHndl(current); + MACHSTATE2(3, "LrtsPreCommonInit mempool->block_head.mem_hndl %p extracted %p\n", CpvAccess(mempool)->block_head.mem_hndl, extractedmr ); #endif if (!CmiMyRank()) prepost_buffers(); @@ -1308,6 +2055,15 @@ void LrtsPreCommonInit(int everReturn) void LrtsPostCommonInit(int everReturn) { MACHSTATE(2, "OFI::LrtsPostCommonInit {"); +#if CMI_MACH_TRACE_USEREVENTS && CMK_TRACE_ENABLED + CpvInitialize(double, projTraceStart); + /* only PE 0 needs to care 
about registration (to generate sts file). */ + //if (CmiMyPe() == 0) + { + registerMachineUserEventsFunction(®isterUserTraceEvents); + } +#endif + postInit=1; MACHSTATE(2, "} OFI::LrtsPostCommonInit"); } @@ -1320,11 +2076,10 @@ void LrtsAdvanceCommunication(int whileidle) { processed_count = 0; processed_count += process_completion_queue(); -#if CMK_SMP +#if CMK_SMP && CMK_SMP_SENDQ processed_count += process_send_queue(); #endif } while (processed_count > 0); - MACHSTATE(2, "} OFI::LrtsAdvanceCommunication done"); } @@ -1341,46 +2096,144 @@ void LrtsDrainResources() /* used when exiting */ MACHSTATE(2, "} OFI::LrtsDrainResources"); } +#if USE_MEMPOOL +/* useful for Onesided so that it can avoid per buffer overheads, + which will otherwise totally dominate most micro benchmarks in an + unpleasant way. Basically same logic as LrtsAlloc, just no + converse header */ + +void* LrtsPoolAlloc(int n_bytes) +{ + return(LrtsAlloc(n_bytes,0)); +} +#endif + void* LrtsAlloc(int n_bytes, int header) { - void *ptr = NULL; + char *ptr = NULL; size_t size = n_bytes + header; - + MACHSTATE(3, "OFI::LrtsAlloc"); #if USE_MEMPOOL - if (size <= context.mempool_lb_size || size >= context.mempool_rb_size) - ALIGNED_ALLOC(ptr, size); + if (size <= context.mempool_lb_size) + { + CmiAbort("OFI pool lower boundary violation"); + } else - ptr = mempool_malloc(CpvAccess(mempool), size, 1); -#else - ALIGNED_ALLOC(ptr, size); + { + CmiAssert(header+sizeof(mempool_header) <= ALIGNBUF); + n_bytes=ALIGN64(n_bytes); + if( n_bytes < BIG_MSG) + { + char *res = (char *)mempool_malloc(CpvAccess(mempool), ALIGNBUF+n_bytes, 1); + + // note CmiAlloc wrapper will move the pointer past the header + if (res) ptr = res; + + MACHSTATE3(3, "OFI::LrtsAlloc ptr %p - header %d = %p", res, header, ptr); + size_t offset1=GetMemOffsetFromBase(ptr+header); + struct fid_mr* extractedmr = (struct fid_mr *) GetMemHndl(ptr+header); + MACHSTATE5(3, "OFI::LrtsAlloc not big from pool ret %p ptr %p memhndl %p 
mempoolptrfromret %p offset %lu", res, ptr, extractedmr, MEMPOOL_GetMempoolPtr(MEMPOOL_GetMempoolHeader(ptr+header,sizeof(mempool_header)+header)), offset1); + } + else + { +#if LARGEPAGE + n_bytes = ALIGNHUGEPAGE(n_bytes+ALIGNBUF); + char *res = (char *)my_get_huge_pages(n_bytes); +#else // not largepage + n_bytes = size+ sizeof(out_of_pool_header); + n_bytes = ALIGN64(n_bytes); + char *res; + + MACHSTATE1(3, "OFI::LrtsAlloc unpooled RB big %d", n_bytes); + posix_memalign((void **)&res,ALIGNBUF, n_bytes); + out_of_pool_header *mptr= (out_of_pool_header*) res; + // construct the minimal version of the + // mempool_header+block_header like a memory pool message + // so that all messages can be handled the same way with + // the same macros and functions. We need the mptr, + // block_ptr, and mem_hndl fields and can test the size to + // know to not put it back in the normal pool on free +#if CMK_CXI + struct fid_mr *mr; + ofi_reg_bind_enable(res, n_bytes, &mr,&context); + mptr->block_head.mem_hndl=mr; +#endif + mptr->block_head.mptr=(struct mempool_type*) res; + mptr->block.block_ptr=(struct block_header *)res; + ptr=(char *) res + (sizeof(out_of_pool_header)); + // char *testptr = ptr+sizeof(CmiChunkHeader); + // CmiAssert(GetBaseAllocPtr(testptr)==mptr->block.block_ptr); + // MACHSTATE5(3, "OFI::LrtsAlloc unpooled base %p, msg %p, size %lu, mr %p macrooffset %lu", res, testptr, n_bytes, mr, GetMemOffsetFromBase(testptr)); +#endif //LARGEPAGE + + } +#else //not MEMPOOL + n_bytes = ALIGN64(n_bytes); /* make sure size if 4 aligned */ + char *res; + posix_memalign((void **)&res, ALIGNBUF, n_bytes+ALIGNBUF); +#if CMK_CXI + struct fid_mr *mr; + ofi_reg_bind_enable(res, n_bytes+ALIGNBUF, &mr,&context); + ((block_header *)res)->mem_hndl = mr; +#endif // not cxi + ptr = res; +#endif //mempool +#if USE_MEMPOOL + } #endif - if (!ptr) CmiAbort("LrtsAlloc"); return ptr; } + void LrtsFree(void *msg) { + + int headersize = sizeof(CmiChunkHeader); + char *aligned_addr = (char 
*)msg + headersize - ALIGNBUF; + CmiUInt4 size = SIZEFIELD((char*)msg+headersize); + MACHSTATE1(3, "OFI::LrtsFree %p", msg); #if USE_MEMPOOL - CmiUInt4 size = SIZEFIELD((char*)msg + sizeof(CmiChunkHeader)) + sizeof(CmiChunkHeader); - if (size <= context.mempool_lb_size || size >= context.mempool_rb_size) - free(msg); - else + if (size <= context.mempool_lb_size) + CmiAbort("OFI: mempool lower boundary violation"); + else + size = ALIGN64(size); + if(size>=BIG_MSG) + { +#if LARGEPAGE + int s = ALIGNHUGEPAGE(size+ALIGNBUF); + my_free_huge_pages(msg, s); +#else +#if CMK_CXI + MACHSTATE1(3, "OFI::LrtsFree fi_close mr %p", (struct fid *)GetMemHndl( (char* )msg +sizeof(CmiChunkHeader))); + fi_close( (struct fid *)GetMemHndl( (char* )msg +sizeof(CmiChunkHeader))); + MACHSTATE2(3, "OFI::LrtsFree free msg next ptr %p vs ptr %p", GetBaseAllocPtr((char*)msg+sizeof(CmiChunkHeader)), (char *)msg-sizeof(out_of_pool_header)); + free((char *)msg-sizeof(out_of_pool_header)); +#else + free((char*)msg); +#endif //CXI + +#endif //LARGEPAGE + } + else + { #if CMK_SMP - mempool_free_thread(msg); + mempool_free_thread(msg); #else - mempool_free(CpvAccess(mempool), msg); + mempool_free(CpvAccess(mempool), msg); #endif /* CMK_SMP */ + } #else - free(msg); + free(aligned_addr); #endif /* USE_MEMPOOL */ + } void LrtsExit(int exitcode) { int ret; int i; - OFIRequest *req; + OFIRequest *req=NULL; MACHSTATE(2, "OFI::LrtsExit {"); @@ -1391,20 +2244,25 @@ void LrtsExit(int exitcode) req = context.recv_reqs[i]; ret = fi_cancel((fid_t)context.ep, (void *)&(req->context)); if (ret < 0) CmiAbort("fi_cancel error"); - CmiFree(req->data.recv_buffer); + CmiFree(req->data.recv_buffer); #if USE_OFIREQUEST_CACHE free_request(req); #else - CmiFree(req); + CmiFree(req); #endif } -#if CMK_SMP +#if CMK_SMP && CMK_SMP_SENDQ PCQueueDestroy(context.send_queue); #endif +#if CMK_CXI + if (context.recv_reqs) + CmiFree(context.recv_reqs); +#else if (context.recv_reqs) free(context.recv_reqs); +#endif if (context.av) 
fi_close((struct fid *)(context.av)); if (context.cq) @@ -1532,6 +2390,7 @@ int fill_av_ofi(int myid, */ epnamelen = sizeof(my_epname); ret = fi_getname((fid_t)ep, &my_epname, &epnamelen); + MACHSTATE1(3, "OFI::fill_av_ofi name %s", my_epname); CmiAssert(FI_NAME_MAX >= epnamelen); if (ret < 0) { CmiAbort("OFI::LrtsInit::fi_getname error"); @@ -1778,6 +2637,76 @@ int fill_av(int myid, return 0; } +//! convenience function to do registration, binding and enabling in one go +// primarily for CXI to support FI_MR_ENDPOINT, but it has no +// CXI specific dependencies +static int ofi_reg_bind_enable(const void *buf, + size_t len, struct fid_mr **mr, OFIContext *context) +{ + + uint32_t requested_key = __sync_fetch_and_add(&(context->mr_counter), 1); +#if CMK_SMP_TRACE_COMMTHREAD + double startT, endT; + startT = CmiWallTimer(); +#endif + /* Register new MR */ + int ret = fi_mr_reg(context->domain, /* In: domain object */ + buf, /* In: lower memory address */ + len, /* In: length */ + MR_ACCESS_PERMISSIONS, /* In: access permissions */ + 0ULL, /* In: offset (not used) */ + requested_key, /* In: requested key */ + 0ULL, /* In: flags */ + mr, /* Out: memregion object */ + NULL); /* In: context (not used) */ + + if (ret) { + MACHSTATE1(3, "fi_mr_reg error: %d\n", ret); + char errstring[100]; + snprintf(errstring, 100, "fi_mr_reg error: %d", ret); + CmiAbort(errstring); + } + else{ + MACHSTATE3(3, "fi_mr_reg success: %d buf %p mr %lu\n", ret, buf, fi_mr_key(*mr)); + } +#if CMK_CXI + ret = fi_mr_bind(*mr, (struct fid *)context->ep, 0); + if (ret) { + MACHSTATE1(3, "fi_mr_bind error: %d\n", ret); + char errstring[100]; + snprintf(errstring, 100, "fi_mr_bind error: %d", ret); + CmiAbort(errstring); + } + else + { + MACHSTATE3(3, "fi_mr_bind success: %d ep %p mr %lu\n", ret, context->ep, fi_mr_key(*mr)); + } + + ret = fi_mr_enable(*mr); + if (ret) { + MACHSTATE1(3, "fi_mr_enable error: %d\n", ret); + char errstring[100]; + snprintf(errstring, 100, "fi_mr_enable error: %d", ret); 
+ CmiAbort(errstring); + } + else + { + MACHSTATE2(3, "fi_mr_enable success: %d mr %lu\n", ret, fi_mr_key(*mr)); + } +#endif +#if CMK_SMP_TRACE_COMMTHREAD + endT = CmiWallTimer(); + if (postInit==1 && ((endT-startT>=TRACE_THRESHOLD))) traceUserBracketEvent(event_reg_bind_enable, startT, endT); +#endif + return(ret); +} + +INLINE_KEYWORD void LrtsPrepareEnvelope(char *msg, int size) +{ + CmiSetMsgSize(msg, size); + // CMI_SET_CHECKSUM(msg, size); +} + #if CMK_ONESIDED_IMPL #include "machine-onesided.C" #endif diff --git a/src/arch/util/machine-xpmem.C b/src/arch/util/machine-xpmem.C index e145161511..92ed88ceff 100644 --- a/src/arch/util/machine-xpmem.C +++ b/src/arch/util/machine-xpmem.C @@ -22,7 +22,10 @@ There are three options here for synchronization: #include #include #include - +#include +#if CMK_CXI +#include +#endif #include "xpmem.h" /************** @@ -36,7 +39,9 @@ There are three options here for synchronization: /* Default to using fences */ #define XPMEM_FENCE 1 #endif - +#if CMK_CXI +#define CmiGetMsgSize(msg) ((((CmiMsgHeaderBasic *)msg)->size)) +#endif #define MEMDEBUG(x) //x #define XPMEM_STATS 0 @@ -70,7 +75,7 @@ There are three options here for synchronization: #endif #if CMK_SMP -#error "PXSHM can only be used in non-smp build of Charm++" +#error "XPMEM can only be used in non-smp build of Charm++" #endif /***************************************************************************************/ @@ -225,7 +230,7 @@ void CmiInitXpmem(char **argv){ CmiAbort("Opening /dev/xpmem"); } -#if CMK_CRAYXE || CMK_CRAYXC +#if CMK_CRAYXE || CMK_CRAYXC || CMK_OFI srand(getpid()); int Cmi_charmrun_pid = rand(); PMI_Bcast(&Cmi_charmrun_pid, sizeof(int)); @@ -345,8 +350,8 @@ void CmiSendMessageXpmem(char *msg, int size, int dstnode, int *refcount) int dstRank = XpmemRank(dstnode); MEMDEBUG(CmiMemoryCheck()); - MACHSTATE4(3,"Send Msg Xpmem ogm %p size %d dst %d dstRank %d",ogm,ogm->size,ogm->dst,dstRank); - MACHSTATE4(3,"Send Msg Xpmem ogm %p size %d dst %d 
dstRank %d",ogm,ogm->size,ogm->dst,dstRank); + // MACHSTATE4(3,"Send Msg Xpmem msg %p size %d dst %d dstRank %d",msg,msg->size,msg->dst,dstRank); + // MACHSTATE4(3,"Send Msg Xpmem msg %p size %d dst %d dstRank %d",msg,msg->size,msg->dst,dstRank); CmiAssert(dstRank >=0 && dstRank != xpmemContext->noderank); @@ -397,7 +402,7 @@ void CmiSendMessageXpmem(char *msg, int size, int dstnode, int *refcount) }else{ (*refcount)+=2;/*this message should not get deleted when the queue is flushed*/ pushSendQ(sendQ,msg,size,refcount); - MACHSTATE3(3,"Xpmem ogm %p pushed to sendQ length %d refcount %d",ogm,sendQ->numEntries,ogm->refcount); + // MACHSTATE3(3,"Xpmem msg %p pushed to sendQ length %d refcount %d",msg,sendQ->numEntries,msg->refcount); int sent = flushSendQ(sendQ); (*refcount)--; /*if it has been sent, can be deleted by caller, if not will be deleted when queue is flushed*/ MACHSTATE1(3,"Xpmem flushSendQ sent %d messages",sent); @@ -533,7 +538,7 @@ void allocBufNameStrings(char ***bufName) { int i,count; int totalAlloc = sizeof(char)*NAMESTRLEN*(xpmemContext->nodesize-1); - char *tmp = malloc(totalAlloc); + char *tmp = (char *) malloc(totalAlloc); MACHSTATE2(3,"allocBufNameStrings tmp %p totalAlloc %d",tmp,totalAlloc); @@ -553,7 +558,7 @@ __s64 createXpmemObject(int size,char **pPtr) struct xpmem_cmd_make make_info; int ret; - *pPtr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); + *pPtr = (char*) mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (*pPtr == MAP_FAILED) { perror("Creating mapping."); return -1; @@ -599,7 +604,7 @@ void attachXpmemObject(__s64 segid, int size, char **pPtr) CmiAbort("xpmem_attach"); } - *pPtr = (void *)attach_info.vaddr; + *pPtr = (char *)attach_info.vaddr; } void createRecvXpmemAndSems(sharedBufData **bufs,char **bufNames){ @@ -608,7 +613,7 @@ void createRecvXpmemAndSems(sharedBufData **bufs,char **bufNames){ int size, pagesize = getpagesize(); *bufs = (sharedBufData 
*)calloc(xpmemContext->nodesize, sizeof(sharedBufData)); - segid_arr = malloc(sizeof(__s64)*xpmemContext->nodesize); + segid_arr = (__s64 *) malloc(sizeof(__s64)*xpmemContext->nodesize); size = XPMEMBUFLEN+sizeof(sharedBufHeader); size = ((~(pagesize-1))&(size+pagesize-1)); @@ -751,7 +756,7 @@ int sendMessage(char *msg, int size, int *refcount, sharedBufData *dstBuf,XpmemS dstBuf->header->count++; CmiMemcpy(dstBuf->data+dstBuf->header->bytes,msg,size); dstBuf->header->bytes += size; - MACHSTATE4(3,"Xpmem send done ogm %p size %d dstBuf->header->count %d dstBuf->header->bytes %d",ogm,ogm->size,dstBuf->header->count,dstBuf->header->bytes); + // MACHSTATE4(3,"Xpmem send done msg %p size %d dstBuf->header->count %d dstBuf->header->bytes %d",msg,msg->size,dstBuf->header->count,dstBuf->header->bytes); CmiFree(msg); return 1; } @@ -761,7 +766,7 @@ int sendMessage(char *msg, int size, int *refcount, sharedBufData *dstBuf,XpmemS //printf("[%d] send buffer is too full\n", CmiMyPe()); pushSendQ(dstSendQ,msg,size,refcount); (*refcount)++; - MACHSTATE3(3,"Xpmem send ogm %p size %d queued refcount %d",ogm,ogm->size,ogm->refcount); + // MACHSTATE3(3,"Xpmem send msg %p size %d queued refcount %d",ogm,ogm->size,ogm->refcount); return 0; } @@ -779,7 +784,7 @@ inline int flushSendQ(XpmemSendQ *dstSendQ){ while(count > 0){ OutgoingMsgRec *ogm = popSendQ(dstSendQ); (*ogm->refcount)--; - MACHSTATE4(3,"Xpmem trysending ogm %p size %d to dstRank %d refcount %d",ogm,ogm->size,dstSendQ->rank,ogm->refcount); + // MACHSTATE4(3,"Xpmem trysending ogm %p size %d to dstRank %d refcount %d",ogm,ogm->size,dstSendQ->rank,ogm->refcount); int ret = sendMessageRec(ogm,dstBuf,dstSendQ); if(ret==1){ sent++; diff --git a/src/arch/util/mempool.C b/src/arch/util/mempool.C index abff7ea2be..a312d155cb 100644 --- a/src/arch/util/mempool.C +++ b/src/arch/util/mempool.C @@ -110,6 +110,7 @@ INLINE_KEYWORD void fillblock(mempool_type* mptr, block_header* block_head, size { ((slot_header*)((char*)mptr + 
prev))->gnext = block_head->freelists[i]; } + DEBUG_PRINT("Pow %d size %ld addr %p offset %ld block_head->freelists[i]=%p\n", head->power, head->size, head, head- (slot_header*)((char *)mptr), block_head->freelists[i]); prev = block_head->freelists[i]; } } @@ -224,6 +225,7 @@ void removeblocks(mempool_type* mptr) } } + /** initialize mempool */ mempool_type* mempool_init(size_t pool_size, mempool_newblockfn allocfn, mempool_freeblock freefn, size_t limit) { @@ -241,7 +243,7 @@ mempool_type* mempool_init(size_t pool_size, mempool_newblockfn allocfn, mempool mptr->block_tail = 0; mptr->limit = limit; mptr->size = pool_size; -#if CMK_SMP && CMK_CONVERSE_UGNI +#if CMK_SMP && (CMK_CONVERSE_UGNI || CMK_OFI) mptr->mempoolLock = CmiCreateLock(); #endif mptr->block_head.mptr = (struct mempool_type*)pool; @@ -285,7 +287,7 @@ void mempool_destroy(mempool_type* mptr) // append slot_header size before the real memory buffer void* mempool_malloc(mempool_type* mptr, size_t size, int expand) { -#if CMK_SMP && CMK_CONVERSE_UGNI +#if CMK_SMP && (CMK_CONVERSE_UGNI || CMK_OFI) CmiLock(mptr->mempoolLock); #endif @@ -375,7 +377,7 @@ void* mempool_malloc(mempool_type* mptr, size_t size, int expand) head_free->block_ptr = current; current->used += power; -#if CMK_SMP && CMK_CONVERSE_UGNI +#if CMK_SMP && (CMK_CONVERSE_UGNI || CMK_OFI) CmiUnlock(mptr->mempoolLock); #endif DEBUG_PRINT("Malloc done\n"); @@ -422,16 +424,18 @@ void* mempool_large_malloc(mempool_type* mptr, size_t size, int expand) head_free->block_ptr = (block_header*)current; head_free->size = expand_size - sizeof(large_block_header); head_free->status = -1; -#if CMK_SMP && CMK_CONVERSE_UGNI +#if CMK_SMP && (CMK_CONVERSE_UGNI || CMK_OFI) CmiUnlock(mptr->mempoolLock); #endif DEBUG_PRINT("Large malloc done\n"); return (char*)head_free + sizeof(used_header); } -#if CMK_SMP && CMK_CONVERSE_UGNI +#if CMK_SMP && (CMK_CONVERSE_UGNI || CMK_OFI) void mempool_free_thread(void* ptr_free) + { + slot_header* to_free = 
(slot_header*)((char*)ptr_free - sizeof(used_header)); mempool_type* mptr = to_free->status == -1 ? (mempool_type*)(((large_block_header*)(to_free->block_ptr))->mptr) diff --git a/src/arch/util/mempool.h b/src/arch/util/mempool.h index 4cabad9648..9a8d1a9dd1 100644 --- a/src/arch/util/mempool.h +++ b/src/arch/util/mempool.h @@ -9,6 +9,9 @@ #include "gni_pub.h" #include "pmi.h" typedef gni_mem_handle_t mem_handle_t; +#elif CMK_OFI +#include "rdma/fi_domain.h" +typedef struct fid_mr* mem_handle_t; #else // in uGNI, it is memory handler, other versions, this is an integer // a unique integer to represent the memory block @@ -90,6 +93,12 @@ typedef struct block_header #undef freelists_extra } block_header; +typedef struct out_of_pool_header +{ + block_header block_head; + mempool_header block; +} out_of_pool_header; + typedef struct large_block_header { mem_handle_t mem_hndl; @@ -113,7 +122,7 @@ typedef struct mempool_type size_t block_tail; size_t limit; size_t size; -#if CMK_SMP && CMK_CONVERSE_UGNI +#if CMK_SMP && (CMK_CONVERSE_UGNI || CMK_OFI) CmiNodeLock mempoolLock; char padding[CMIPADDING((6 * sizeof(size_t) + sizeof(CmiNodeLock)), 16)]; #elif !CMK_64BIT @@ -136,7 +145,8 @@ void mempool_destroy(mempool_type* mptr); void* mempool_malloc(mempool_type* mptr, size_t size, int expand); void* mempool_large_malloc(mempool_type* mptr, size_t size, int expand); void mempool_free(mempool_type* mptr, void* ptr_free); -#if CMK_SMP && CMK_CONVERSE_UGNI + +#if CMK_SMP && (CMK_CONVERSE_UGNI || CMK_OFI) void mempool_free_thread(void* ptr_free); #endif diff --git a/src/arch/util/proc_management/runtime-craypmi.C b/src/arch/util/proc_management/runtime-craypmi.C new file mode 100644 index 0000000000..6ea55875e3 --- /dev/null +++ b/src/arch/util/proc_management/runtime-craypmi.C @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2017, Intel Corporation. All rights reserved. + * See LICENSE in src/arch/ofi. 
+ * + * Runtime functions used by OFI LRTS machine layer to exchange + * addresses during the initialization. + * + * This example uses the PMI API as described in pmi.h. + */ + +/* EJB 2024/2/13 This has been modified to use the cray PMI + extensions, which are a superset of PMI1 and PMI2, plus within node + topology (i.e., cliques). That permits interactions using the PMI2 + API, with the hidden caveat that doing so does not support multiple + processes per node. Therefore, these revisions lean heavily on the + PMI1 protocol and Cray's extensions to that, so that process launch + can work normally within and across nodes. + */ + +#include +#include + +#include + +#include "runtime.h" + +#if CMK_USE_SIMPLEPMI +#include "simple_pmi.C" +#include "simple_pmiutil.C" +#endif + +/* For encode/decode functions */ +#include "runtime-codec.h" + +static int initialized; +static int max_keylen; +static int max_valuelen; +static char *kvsname; +static char *key; +static char *value; + +int runtime_init(int *rank, int *jobsize) +{ + int ret; + int first_spawned; + int max_kvsnamelen; + + ret = PMI_Init(&first_spawned); + if (PMI_SUCCESS != ret) { + return 1; + } + + ret = PMI_Get_size(jobsize); + if (PMI_SUCCESS != ret) { + return 2; + } + + ret = PMI_Get_rank(rank); + if (PMI_SUCCESS != ret) { + return 3; + } + + ret = PMI_KVS_Get_name_length_max(&max_kvsnamelen); + if (PMI_SUCCESS != ret) { + return 4; + } + + kvsname = (char *)calloc(max_kvsnamelen, sizeof(char)); + if (!kvsname) { + return 5; + } + + ret = PMI_KVS_Get_my_name(kvsname, max_kvsnamelen); + if (PMI_SUCCESS != ret) { + free(kvsname); + return 6; + } + + ret = PMI_KVS_Get_key_length_max(&max_keylen); + if (PMI_SUCCESS != ret) { + free(kvsname); + return 7; + } + + key = (char *)calloc(max_keylen, sizeof(char)); + if (!key) { + free(kvsname); + return 8; + } + + ret = PMI_KVS_Get_value_length_max(&max_valuelen); + if (PMI_SUCCESS != ret) { + free(key); + free(kvsname); + return 9; + } + + value = (char 
*)calloc(max_valuelen, sizeof(char)); + if (!value) { + free(key); + free(kvsname); + return 10; + } + + initialized = 1; + return 0; +} + +int runtime_fini() +{ + int ret; + + if (initialized) { + ret = PMI_Finalize(); + if (PMI_SUCCESS != ret) { + return 1; + } + } + + if (value) { + free(value); + value = NULL; + } + if (key) { + free(key); + key = NULL; + } + if (kvsname) { + free(kvsname); + kvsname = NULL; + } + + initialized = 0; + return 0; +} + +int runtime_get_max_keylen(int *len) +{ + if (!initialized) { + return 1; + } + *len = max_keylen; + return 0; +} + +int runtime_get_max_vallen(int *len) +{ + if (!initialized) { + return 1; + } + *len = (max_valuelen -1) / 2; + return 0; +} + +int runtime_kvs_put(const char *k, const void *v, int vlen) +{ + int ret; + int keylen; + + if (!initialized) { + return 1; + } + + keylen = strlen(k); + if (keylen > max_keylen) { + return 2; + } + + if (vlen > max_valuelen) { + return 3; + } + + ret = encode(v, vlen, value, max_valuelen); + if (ret) { + return 4; + } + + ret = PMI_KVS_Put(kvsname, k, value); + if (ret) { + return 5; + } + + ret = PMI_KVS_Commit(kvsname); + if (ret) { + return 6; + } + + return 0; +} + +int runtime_kvs_get(const char *k, void *v, int vlen, int id) +{ + int ret; + + if (!initialized) { + return 1; + } + + ret = PMI_KVS_Get(kvsname, k, value, max_valuelen); + if (ret) { + return 2; + } + + ret = decode(value, v, vlen); + if (ret) { + return 3; + } + + return 0; +} + +int runtime_barrier() +{ + int ret; + + if (!initialized) { + return 1; + } + + ret = PMI_Barrier(); + if (ret) { + return 2; + } + return 0; +} diff --git a/src/arch/util/proc_management/runtime-craypmi2.C b/src/arch/util/proc_management/runtime-craypmi2.C new file mode 100644 index 0000000000..63e4d2ee34 --- /dev/null +++ b/src/arch/util/proc_management/runtime-craypmi2.C @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2017, Intel Corporation. All rights reserved. + * See LICENSE in src/arch/ofi. 
+ * + * Runtime functions used by OFI LRTS machine layer to exchange + * addresses during the initialization. + * + * This example uses the PMI2 API as described in pmi2.h. + */ + +/* This example has been modified to use the cray extensions + */ +#include +#include + +#include + +#include "runtime.h" + +/* For encode/decode functions */ +#include "runtime-codec.h" + +static int initialized; +static int max_keylen = PMI2_MAX_KEYLEN; +static int max_valuelen = PMI2_MAX_VALLEN; +static char *kvsname; +static char *key; +static char *value; + +int runtime_init(int *rank, int *jobsize) +{ + int ret; + int spawned; + int appnum; + int max_kvsnamelen = PMI2_MAX_VALLEN; + + ret = PMI2_Init(&spawned, jobsize, rank, &appnum); + // printf("PMI2_init ret %d jobsize %d rank %d\n",ret, *jobsize, *rank); + if (PMI2_SUCCESS != ret) { + return 1; + } + + kvsname = (char*)calloc(max_kvsnamelen, sizeof(char)); + if (!kvsname) { + return 2; + } + + ret = PMI2_Job_GetId(kvsname, max_kvsnamelen); + if (PMI2_SUCCESS != ret) { + return 3; + } + + key = (char*)calloc(max_keylen, sizeof(char)); + if (!key) { + free(kvsname); + return 4; + } + + value = (char*)calloc(max_valuelen, sizeof(char)); + if (!value) { + free(key); + free(kvsname); + return 5; + } + + initialized = 1; + return 0; +} + +int runtime_fini() +{ + int ret; + + if (initialized) { + ret = PMI2_Finalize(); + if (PMI2_SUCCESS != ret) { + return 1; + } + } + + if (value) { + free(value); + value = NULL; + } + if (key) { + free(key); + key = NULL; + } + if (kvsname) { + free(kvsname); + kvsname = NULL; + } + + initialized = 0; + return 0; +} + +int runtime_get_max_keylen(int *len) +{ + if (!initialized) { + return 1; + } + *len = max_keylen; + return 0; +} + +int runtime_get_max_vallen(int *len) +{ + if (!initialized) { + return 1; + } + *len = (max_valuelen - 1) / 2; + return 0; +} + +int runtime_kvs_put(const char *k, const void *v, int vlen) +{ + int ret; + int keylen; + + if (!initialized) { + return 1; + } + + keylen = 
strlen(k); + if (keylen > max_keylen) { + return 2; + } + + if (vlen > max_valuelen) { + return 3; + } + + ret = encode(v, vlen, value, max_valuelen); + if (ret) { + return 4; + } + + ret = PMI2_KVS_Put(k, value); + if (ret) { + return 5; + } + + return 0; +} + +int runtime_kvs_get(const char *k, void *v, int vlen, int id) +{ + int ret; + int len; + + if (!initialized) { + return 1; + } + + ret = PMI2_KVS_Get(kvsname, PMI2_ID_NULL, k, value, max_valuelen, &len); + if (ret) { + return 2; + } + + ret = decode(value, v, vlen); + if (ret) { + return 3; + } + + return 0; +} + +int runtime_barrier() +{ + int ret; + + if (!initialized) { + return 1; + } + + ret = PMI2_KVS_Fence(); + if (ret) { + return 2; + } + return 0; +} diff --git a/src/ck-core/debug-charm.C b/src/ck-core/debug-charm.C index ef74b25a46..b4750ed81e 100644 --- a/src/ck-core/debug-charm.C +++ b/src/ck-core/debug-charm.C @@ -334,7 +334,11 @@ public: p((char*)elt,size); } }; - +#if CMK_OFI +// EJB TODO the fix for this belongs elsewhere, but this is ok for now +#undef CMK_HAS_GET_MYADDRESS +#define CMK_HAS_GET_MYADDRESS 0 +#endif #if CMK_HAS_GET_MYADDRESS #include #endif diff --git a/src/libs/ck-libs/ampi/romio/configure b/src/libs/ck-libs/ampi/romio/configure index 6c2f5136c6..b43aa0e4ea 100755 --- a/src/libs/ck-libs/ampi/romio/configure +++ b/src/libs/ck-libs/ampi/romio/configure @@ -871,7 +871,6 @@ infodir docdir oldincludedir includedir -runstatedir localstatedir sharedstatedir sysconfdir @@ -977,7 +976,6 @@ datadir='${datarootdir}' sysconfdir='${prefix}/etc' sharedstatedir='${prefix}/com' localstatedir='${prefix}/var' -runstatedir='${localstatedir}/run' includedir='${prefix}/include' oldincludedir='/usr/include' docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' @@ -1230,15 +1228,6 @@ do | -silent | --silent | --silen | --sile | --sil) silent=yes ;; - -runstatedir | --runstatedir | --runstatedi | --runstated \ - | --runstate | --runstat | --runsta | --runst | --runs \ - | --run | --ru | --r) - 
ac_prev=runstatedir ;; - -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \ - | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \ - | --run=* | --ru=* | --r=*) - runstatedir=$ac_optarg ;; - -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) ac_prev=sbindir ;; -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ @@ -1376,7 +1365,7 @@ fi for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ datadir sysconfdir sharedstatedir localstatedir includedir \ oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ - libdir localedir mandir runstatedir + libdir localedir mandir do eval ac_val=\$$ac_var # Remove trailing slashes. @@ -1529,7 +1518,6 @@ Fine tuning of the installation directories: --sysconfdir=DIR read-only single-machine data [PREFIX/etc] --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] --localstatedir=DIR modifiable single-machine data [PREFIX/var] - --runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run] --libdir=DIR object code libraries [EPREFIX/lib] --includedir=DIR C header files [PREFIX/include] --oldincludedir=DIR C header files for non-gcc [/usr/include] @@ -23331,7 +23319,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) +#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -23377,7 +23365,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. 
*/ -#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) +#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -23401,7 +23389,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) +#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -23446,7 +23434,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) +#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -23470,7 +23458,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) +#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 
1 : -1]; diff --git a/src/libs/ck-libs/io/ckio.h b/src/libs/ck-libs/io/ckio.h index ca2c61296b..f3281427be 100644 --- a/src/libs/ck-libs/io/ckio.h +++ b/src/libs/ck-libs/io/ckio.h @@ -1,209 +1,223 @@ #ifndef CK_IO_H #define CK_IO_H -#include -#include -#include #include #include +#include +#include +#include #include "CkIO.decl.h" - -namespace Ck { namespace IO { class Session; }} - -namespace Ck { namespace IO { - /// Note: The values in options are not currently a stable or working interface. - /// Users should not set anything in them. - struct Options { - Options() - : peStripe(0), writeStripe(0), activePEs(-1), basePE(-1), skipPEs(-1), read_stride(0), numReaders(0) - { } - - /// How much contiguous data (in bytes) should be assigned to each active PE - size_t peStripe; - /// How much contiguous data (in bytes) should a PE gather before writing it out - size_t writeStripe; - /// How many PEs should participate in this activity - int activePEs; - /// Which PE should be the first to participate in this activity - int basePE; - /// How should active PEs be spaced out? - int skipPEs; - // How many bytes each Read Session should hold - size_t read_stride; - // How many IO buffers should there be - size_t numReaders; - - void pup(PUP::er &p) { - p|peStripe; - p|writeStripe; - p|activePEs; - p|basePE; - p|skipPEs; - p|read_stride; - p | numReaders; - } - }; - - class File; - // class ReadAssembler; - /// Open the named file on the selected subset of PEs, and send a - /// FileReadyMsg to the opened callback when the system is ready to accept - /// session requests on that file. - /// Note: The values in options are not currently a stable or working interface. - /// Users should not set anything in them. - void open(std::string name, CkCallback opened, Options opts); - - /// Prepare to write data into the file described by token, in the window - /// defined by the offset and byte length. 
When the session is set up, a - /// SessionReadyMsg will be sent to the ready callback. When all of the data - /// has been written and synced, a message will be sent to the complete - /// callback. - void startSession(File file, size_t bytes, size_t offset, - CkCallback ready, CkCallback complete); - - /// Prepare to write data into @arg file, in the window defined by the @arg - /// offset and length in @arg bytes. When the session is set up, a - /// SessionReadyMsg will be sent to the @arg ready callback. When all of the - /// data has been written and synced, an additional write will be made to the - /// file to `commit' the session's work. When that write has completed, a - /// message will be sent to the @arg complete callback. - void startSession(File file, size_t bytes, size_t offset, CkCallback ready, - const char *commitData, size_t commitBytes, size_t commitOffset, - CkCallback complete); - - /// Write the given data into the file to which session is attached. The - /// offset is relative to the file as a whole, not to the session's offset. - void write(Session session, const char *data, size_t bytes, size_t offset); - - /// Close a previously-opened file. All sessions on that file must have - /// already signalled that they are complete. - void close(File file, CkCallback closed); - - /** - * Prepare to read data from @arg file section specified by @arg bytes and @arg offset. - * This method will proceed to eagerly read all of the data in that window into memory - * for future read calls. After all the data is read in, the ready callback will be invoked. - * The ready callback will take in a SessionReadyMessage* that will contain the offset, the amount of bytes - * , and the buffer in the form of a vector. - */ - void startReadSession(File file, size_t bytes, size_t offset, CkCallback ready); - - /** - * Same as the above start session in function. However, there is an extra @arg pes_to_map. 
pes_to_map will contain a sequence - * of numbers representing pes. CkIO will map the IO Buffer chares to those pes specified in pes_to_map in a round_robin fashion. - */ - void startReadSession(File file, size_t bytes, size_t offset, CkCallback ready, std::vector pes_to_map); - - /** - * Used to end the current read session and will then invoke the after_end callback that takes a CkReductionMsg* with nothing in it - * Will effectively call ckDestroy() on the CProxy_Reader of the associated FileInfo - */ - - void closeReadSession(Session read_session, CkCallback after_end); - /** - * Is a method that reads data from the @arg session of length @arg bytes at offset - * @arg offset (in file). After this read finishes, the @arg after_read callback is invoked, taking - * a ReadCompleteMsg* which points to a vector buffer, the offset, and the number of - * bytes of the read. - * */ - void read(Session session, size_t bytes, size_t offset, char* data, CkCallback after_read); - void read(Session session, size_t bytes, size_t offset, CkCallback after_read, size_t tag); - -// ZERO COPY READ; - void read(Session session, size_t bytes, size_t offset, CkCallback after_read, size_t tag, char* user_buffer); - - - class File { - int token; - friend void startSession(File file, size_t bytes, size_t offset, - CkCallback ready, CkCallback complete); - - friend void startReadSession(File file, size_t bytes, size_t offset, CkCallback ready); - friend void startReadSession(File file, size_t bytes, size_t offset, CkCallback ready, std::vector pes_to_map); - - friend void startSession(File file, size_t bytes, size_t offset, CkCallback ready, - const char *commitData, size_t commitBytes, size_t commitOffset, - CkCallback complete); - friend void close(File file, CkCallback closed); - friend class FileReadyMsg; - - public: - File(int token_) : token(token_) { } - File() : token(-1) { } - void pup(PUP::er &p) { p|token; } - }; - - class FileReadyMsg : public CMessage_FileReadyMsg { - public: 
- File file; - FileReadyMsg(const File &tok) : file(tok) {} - }; - - namespace impl { - class Manager; - int getRDMATag(); - class Director; // forward declare Director class as impl - class ReadAssembler; +namespace Ck +{ +namespace IO +{ +class Session; +} +} // namespace Ck + +namespace Ck +{ +namespace IO +{ +/// Note: The values in options are not currently a stable or working interface. +/// Users should not set anything in them. +struct Options +{ + Options() + : peStripe(0), writeStripe(0), activePEs(-1), basePE(-1), skipPEs(-1), numReaders(0) + { } - class Session { - int file; - size_t bytes, offset; - CkArrayID sessionID; - friend class Ck::IO::impl::Manager; - friend class Ck::IO::impl::Director; - friend class Ck::IO::impl::ReadAssembler; - friend void read(Session session, size_t bytes, size_t offset, char* data, CkCallback after_read); - friend struct std::hash; - public: - Session(int file_, size_t bytes_, size_t offset_, - CkArrayID sessionID_) - : file(file_), bytes(bytes_), offset(offset_), sessionID(sessionID_) - { } - Session() { } - void pup(PUP::er &p) { - p|file; - p|bytes; - p|offset; - p|sessionID; - } - - int getFile() const { return file;} - - size_t getBytes() const { return bytes; } - size_t getOffset() const { return offset;} - CkArrayID getSessionID() const { return sessionID;} - bool operator==(const Ck::IO::Session& other) const{ - return ((file == other.file) && (bytes==other.bytes) && (offset == other.offset) && (sessionID == other.sessionID)); - } + /// How much contiguous data (in bytes) should be assigned to each active PE + size_t peStripe; + /// How much contiguous data (in bytes) should a PE gather before writing it out + size_t writeStripe; + /// How many PEs should participate in this activity + int activePEs; + /// Which PE should be the first to participate in this activity + int basePE; + /// How should active PEs be spaced out? 
+ int skipPEs; + // How many IO buffers should there be + size_t numReaders; + + void pup(PUP::er& p) + { + p | peStripe; + p | writeStripe; + p | activePEs; + p | basePE; + p | skipPEs; + p | numReaders; + } }; +class File; +// class ReadAssembler; +/// Open the named file on the selected subset of PEs, and send a +/// FileReadyMsg to the opened callback when the system is ready to accept +/// session requests on that file. +/// Note: The values in options are not currently a stable or working interface. +/// Users should not set anything in them. +void open(std::string name, CkCallback opened, Options opts); + +/// Prepare to write data into the file described by token, in the window +/// defined by the offset and byte length. When the session is set up, a +/// SessionReadyMsg will be sent to the ready callback. When all of the data +/// has been written and synced, a message will be sent to the complete +/// callback. +void startSession(File file, size_t bytes, size_t offset, CkCallback ready, + CkCallback complete); + +/// Prepare to write data into @arg file, in the window defined by the @arg +/// offset and length in @arg bytes. When the session is set up, a +/// SessionReadyMsg will be sent to the @arg ready callback. When all of the +/// data has been written and synced, an additional write will be made to the +/// file to `commit' the session's work. When that write has completed, a +/// message will be sent to the @arg complete callback. +void startSession(File file, size_t bytes, size_t offset, CkCallback ready, + const char* commitData, size_t commitBytes, size_t commitOffset, + CkCallback complete); + +/// Write the given data into the file to which session is attached. The +/// offset is relative to the file as a whole, not to the session's offset. +void write(Session session, const char* data, size_t bytes, size_t offset); + +/// Close a previously-opened file. All sessions on that file must have +/// already signalled that they are complete. 
+void close(File file, CkCallback closed); + +/** + * Prepare to read data from @arg file section specified by @arg bytes and @arg offset. + * On starting the session, the buffer chares begin eagerly reading all requested data + * into memory. The ready callback is invoked once all buffer chares have been created and + * their reads have been initiated (but the reads are not guaranteed to be complete at + * this point). + */ +void startReadSession(File file, size_t bytes, size_t offset, CkCallback ready); + +/** + * Same as the above start session in function. However, there is an extra @arg + * pes_to_map. pes_to_map will contain a sequence of numbers representing pes. CkIO will + * map the IO Buffer chares to those pes specified in pes_to_map in a round_robin fashion. + */ +void startReadSession(File file, size_t bytes, size_t offset, CkCallback ready, + std::vector pes_to_map); + +/** + * Used to end the current read session and will then invoke the after_end callback that + * takes a CkReductionMsg* with nothing in it Will effectively call ckDestroy() on the + * CProxy_Reader of the associated FileInfo + */ +void closeReadSession(Session read_session, CkCallback after_end); +/** + * Is a method that reads data from the @arg session of length @arg bytes at offset + * @arg offset (in file). After this read finishes, the @arg after_read callback is + * invoked, taking a ReadCompleteMsg* which points to a vector buffer, the offset, + * and the number of bytes of the read. 
+ * */ +void read(Session session, size_t bytes, size_t offset, char* data, + CkCallback after_read); + +class File +{ + int token; + friend void startSession(File file, size_t bytes, size_t offset, CkCallback ready, + CkCallback complete); + + friend void startReadSession(File file, size_t bytes, size_t offset, CkCallback ready); + friend void startReadSession(File file, size_t bytes, size_t offset, CkCallback ready, + std::vector pes_to_map); + + friend void startSession(File file, size_t bytes, size_t offset, CkCallback ready, + const char* commitData, size_t commitBytes, + size_t commitOffset, CkCallback complete); + friend void close(File file, CkCallback closed); + friend class FileReadyMsg; + +public: + File(int token_) : token(token_) {} + File() : token(-1) {} + void pup(PUP::er& p) { p | token; } +}; - class SessionReadyMsg : public CMessage_SessionReadyMsg { - public: - Session session; - SessionReadyMsg(Session session_) : session(session_) { } - }; +class FileReadyMsg : public CMessage_FileReadyMsg +{ +public: + File file; + FileReadyMsg(const File& tok) : file(tok) {} +}; - class ReadCompleteMsg : public CMessage_ReadCompleteMsg { - public: - size_t read_tag; - size_t offset; - size_t bytes; - ReadCompleteMsg(){} - ReadCompleteMsg(size_t in_tag, size_t in_offset, size_t in_bytes) : read_tag(in_tag), offset(in_offset), bytes(in_bytes){ +namespace impl +{ +class Manager; +int getRDMATag(); +class Director; // forward declare Director class as impl +class ReadAssembler; +} // namespace impl + +class Session +{ + int file; + size_t bytes, offset; + CkArrayID sessionID; + friend class Ck::IO::impl::Manager; + friend class Ck::IO::impl::Director; + friend class Ck::IO::impl::ReadAssembler; + friend void read(Session session, size_t bytes, size_t offset, char* data, + CkCallback after_read); + friend struct std::hash; + +public: + Session(int file_, size_t bytes_, size_t offset_, CkArrayID sessionID_) + : file(file_), bytes(bytes_), offset(offset_), 
sessionID(sessionID_) + { + } + Session() {} + void pup(PUP::er& p) + { + p | file; + p | bytes; + p | offset; + p | sessionID; + } - } - + int getFile() const { return file; } - }; + size_t getBytes() const { return bytes; } + size_t getOffset() const { return offset; } + CkArrayID getSessionID() const { return sessionID; } + bool operator==(const Ck::IO::Session& other) const + { + return ((file == other.file) && (bytes == other.bytes) && (offset == other.offset) && + (sessionID == other.sessionID)); + } +}; -}} +class SessionReadyMsg : public CMessage_SessionReadyMsg +{ +public: + Session session; + SessionReadyMsg(Session session_) : session(session_) {} +}; +class ReadCompleteMsg : public CMessage_ReadCompleteMsg +{ +public: + size_t read_tag; + size_t offset; + size_t bytes; + ReadCompleteMsg() {} + ReadCompleteMsg(size_t in_tag, size_t in_offset, size_t in_bytes) + : read_tag(in_tag), offset(in_offset), bytes(in_bytes) + { + } +}; +} // namespace IO +} // namespace Ck #endif - diff --git a/src/scripts/configure.ac b/src/scripts/configure.ac index fa7798259a..cd587d635c 100644 --- a/src/scripts/configure.ac +++ b/src/scripts/configure.ac @@ -1561,6 +1561,18 @@ add_make_flag "CMK_HAS_MALLOC_HOOK:=$pass" 'whether has __malloc_hook' #### test if we can build OFI #### if test "$CMK_BUILD_OFI" = 1 then + CMK_LIBFABRIC_INC=`pkg-config --cflags libfabric` + if test "$?" = 0 + then + add_flag CMK_INCDIR='"$CMK_INCDIR $CMK_LIBFABRIC_INC"' "libfabric inc" + CMK_INCDIR="$CMK_INCDIR $CMK_LIBFABRIC_INC" + fi + CMK_LIBFABRIC_LIBS=`pkg-config --libs libfabric` + if test "$?" 
= 0 + then + add_flag CMK_LIBCDIR='"$CMK_LIBDIR $CMK_LIBFABRIC_LIBONLY"' "libfabric lib" + CMK_LIBDIR="$CMK_LIBDIR $CMK_LIBFABRIC_LIB" + fi cat > $tc < int main(int argc, char **argv) @@ -1595,15 +1607,43 @@ else test_finish 1 fi else - echo "Error: -lfabric not working, $PSM_COMPAT_DIR not found" - echo "Pass '--basedir=/path/to/dir/' if -lfabric is located in a different directory" - test_finish 1 + CMK_LIBFABRIC_LIBONLY=`pkg-config --libs-only-l libfabric` + if test "$?" = 0 + then + test_linkc "whether pkg-config fabric" "ok" "no" "-lfabric" + else + echo "Error: -lfabric not working, $PSM_COMPAT_DIR not found, pkg-config libfabric not working" + echo "Pass '--basedir=/path/to/dir/' if -lfabric is located in a different directory" + test_finish 1 + fi + fi fi fi fi +#### If we can build OFI, test if we can also build CXI #### +if test "$BUILD_OFI" = 1 +then +# get path info + CMK_LIBFABRIC_LIBONLY=`pkg-config --libs-only-l libfabric` + CMK_LIBFABRIC_INC=`pkg-config --cflags libfabric` + cat > $t < +#include +int main(int argc, char **argv) +{ + struct fi_info *providers; + int ret = fi_getinfo(FI_VERSION(1,0), NULL, NULL, 0ULL, NULL, &providers); + return 0; +} +EOT + test_cxx "whether build on CXI" "yes" "no" + AC_DEFINE_UNQUOTED(CMK_CXI, $strictpass, [build CXI.]) + BUILD_CXI=$strictpass +fi + #### test if we can build UCX #### if test "$CMK_BUILD_UCX" = 1 then