From 3e85952ae6761b84c103f9aeff7446404f7fbb1e Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Sat, 27 Apr 2019 17:03:21 -0700 Subject: [PATCH 01/18] Fix for clang versions >= 5 --- include/hptt_types.h | 1 - src/hptt.cpp | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/hptt_types.h b/include/hptt_types.h index 170288e..ebc5796 100644 --- a/include/hptt_types.h +++ b/include/hptt_types.h @@ -1,7 +1,6 @@ #pragma once #include -#include #define REGISTER_BITS 256 // AVX #ifdef HPTT_ARCH_ARM diff --git a/src/hptt.cpp b/src/hptt.cpp index 82d4e73..c3cafe0 100644 --- a/src/hptt.cpp +++ b/src/hptt.cpp @@ -180,8 +180,10 @@ void cTensorTranspose( const int *perm, const int dim, const float _Complex beta, float _Complex *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { + const hptt::FloatComplex* calpha = reinterpret_cast(&alpha); + const hptt::FloatComplex* cbeta = reinterpret_cast(&beta); auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, - (const hptt::FloatComplex*) A, (hptt::FloatComplex) alpha, (hptt::FloatComplex*) B, (hptt::FloatComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + (const hptt::FloatComplex*) A, *calpha, (hptt::FloatComplex*) B, *cbeta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->setConjA(conjA); plan->execute(); } @@ -191,8 +193,10 @@ void zTensorTranspose( const int *perm, const int dim, const double _Complex beta, double _Complex *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { + const hptt::DoubleComplex* calpha = reinterpret_cast(&alpha); + const hptt::DoubleComplex* cbeta = reinterpret_cast(&beta); auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, - (const hptt::DoubleComplex*) A, (hptt::DoubleComplex) alpha, (hptt::DoubleComplex*) B, (hptt::DoubleComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + (const hptt::DoubleComplex*) A, *calpha, (hptt::DoubleComplex*) B, *cbeta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->setConjA(conjA); plan->execute(); } From 555d85b1cf4517454c04e5482de5c2e5c4995a29 Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Wed, 2 Oct 2019 11:03:24 -0700 Subject: [PATCH 02/18] openmp option --- CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 582ada3..f1b3d0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,15 @@ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le") set(ENABLE_IBM ON) endif() +if(ENABLE_OPENMP) + set(HPTT_OMP_FLAGS -fopenmp) + if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + set(HPTT_OMP_FLAGS -qopenmp) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") + set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qsmp=omp) + endif() +endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qopenmp -xhost) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") From 3b9038137b4093975d9e243d9c79279f69df1b77 Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Wed, 2 Oct 2019 11:19:56 -0700 Subject: [PATCH 03/18] fix --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f1b3d0c..9ed32b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ if(ENABLE_OPENMP) if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") set(HPTT_OMP_FLAGS -qopenmp) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qsmp=omp) + set(HPTT_OMP_FLAGS -qsmp=omp) endif() endif() @@ -24,12 +24,12 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qopenmp -xhost) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(ENABLE_IBM) - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -fopenmp) + set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} ${HPTT_OMP_FLAGS}) else() - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -fopenmp -march=native -mtune=native) + set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} ${HPTT_OMP_FLAGS} -march=native -mtune=native) endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -fopenmp -march=native) + set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} ${HPTT_OMP_FLAGS} -march=native) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI") set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -silent -w -Mnovect) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") From b6bc69032558f6b416e7233cc1c4f95ffd107302 Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Wed, 4 Dec 2019 16:57:20 -0800 Subject: [PATCH 04/18] add -O2 to compile line when using GCC --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ed32b0..9d0e3f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,11 @@ add_library(hptt STATIC ${HPTT_SRCS}) target_compile_features(hptt PUBLIC cxx_std_11) target_include_directories(hptt PUBLIC ${PROJECT_SOURCE_DIR}/include) #target_compile_definitions(hptt PRIVATE ${HPTT_CXX_COMPILE_DEFS}) + +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -O2) +endif() + target_compile_options(hptt PUBLIC ${HPTT_CXX_FLAGS}) install(TARGETS hptt From 6fb59952c67063232c0eb9e4672f5c291e7644f0 Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Mon, 30 Dec 2019 14:45:19 -0800 Subject: [PATCH 05/18] minor --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d0e3f7..efe2f0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ target_compile_features(hptt PUBLIC cxx_std_11) target_include_directories(hptt PUBLIC ${PROJECT_SOURCE_DIR}/include) #target_compile_definitions(hptt PRIVATE ${HPTT_CXX_COMPILE_DEFS}) -if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.1") set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -O2) endif() From a8071b28e3accf5fb15b1db4c7ef18ce3f4df001 Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Thu, 13 Aug 2020 20:59:18 -0700 Subject: [PATCH 06/18] install targets, write package config files --- CMakeLists.txt | 94 +++++++++++++++++++++++++++++---------- cmake/HPTTConfig.cmake.in | 11 +++++ 2 files changed, 81 insertions(+), 24 deletions(-) create mode 100644 cmake/HPTTConfig.cmake.in diff --git a/CMakeLists.txt b/CMakeLists.txt index efe2f0c..ad5a922 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,12 +1,16 @@ -cmake_minimum_required(VERSION 3.7 FATAL_ERROR) -project (HPTT C CXX) +cmake_minimum_required(VERSION 3.15 FATAL_ERROR) +project (HPTT VERSION 1.0.0 LANGUAGES C CXX) set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(ENABLE_IBM OFF) +if(NOT CMAKE_BUILD_TYPE) + set (CMAKE_BUILD_TYPE Release) +endif() + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le") set(ENABLE_IBM ON) endif() @@ -44,31 +48,73 @@ elseif(ENABLE_IBM) set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -mtune=native -DHPTT_ARCH_IBM -maltivec -mabi=altivec) endif() +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "8.2") + set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -O2) +endif() + set(HPTT_SRCS src/hptt.cpp src/plan.cpp src/transpose.cpp src/utils.cpp) -add_library(hptt STATIC ${HPTT_SRCS}) -target_compile_features(hptt PUBLIC cxx_std_11) -target_include_directories(hptt PUBLIC ${PROJECT_SOURCE_DIR}/include) -#target_compile_definitions(hptt PRIVATE ${HPTT_CXX_COMPILE_DEFS}) +add_library(hptt ${HPTT_SRCS}) -if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.1") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -O2) -endif() +add_library(HPTT::hptt ALIAS hptt) + +target_include_directories(hptt + PUBLIC + $ + $ +) + +target_compile_options(hptt PRIVATE ${HPTT_CXX_FLAGS}) + +# Install -target_compile_options(hptt PUBLIC ${HPTT_CXX_FLAGS}) +include(GNUInstallDirs) +set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/HPTT) install(TARGETS hptt - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) - -set(HPTT_INCLUDES - include/compute_node.h - include/hptt_types.h - include/hptt.h - include/macros.h - include/plan.h - include/utils.h - include/transpose.h) - -install(FILES ${HPTT_INCLUDES} - DESTINATION ${CMAKE_INSTALL_PREFIX}/include) + EXPORT hptt-targets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} +) + +# set_target_properties(hptt PROPERTIES EXPORT_NAME HPTT) + +install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hptt) + +#Export the targets to a script +install(EXPORT hptt-targets + FILE + HPTTTargets.cmake + NAMESPACE + HPTT:: + DESTINATION + ${INSTALL_CONFIGDIR} +) + +#Create a ConfigVersion.cmake file +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/HPTTConfigVersion.cmake + VERSION ${PROJECT_VERSION} + COMPATIBILITY AnyNewerVersion +) + +configure_package_config_file(${CMAKE_CURRENT_LIST_DIR}/cmake/HPTTConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/HPTTConfig.cmake + INSTALL_DESTINATION ${INSTALL_CONFIGDIR} +) + +#Install the config, configversion and custom find modules +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/HPTTConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/HPTTConfigVersion.cmake + DESTINATION ${INSTALL_CONFIGDIR} +) + + +export(EXPORT hptt-targets + FILE ${CMAKE_CURRENT_BINARY_DIR}/HPTTTargets.cmake + NAMESPACE HPTT::) + +#Register package in user's package registry +export(PACKAGE HPTT) \ No newline at end of file diff --git a/cmake/HPTTConfig.cmake.in b/cmake/HPTTConfig.cmake.in new file mode 100644 index 0000000..80d223f --- /dev/null +++ b/cmake/HPTTConfig.cmake.in @@ -0,0 +1,11 @@ +get_filename_component(HPTT_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +include(CMakeFindDependencyMacro) + +list(APPEND CMAKE_MODULE_PATH ${HPTT_CMAKE_DIR}) +list(REMOVE_AT CMAKE_MODULE_PATH -1) + +if(NOT TARGET HPTT::hptt) + include("${HPTT_CMAKE_DIR}/HPTTTargets.cmake") +endif() + +set(HPTT_LIBRARIES HPTT::hptt) From d39edac9e50df34dda63b713d5d564e14c813d1c Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Mon, 17 Aug 2020 00:11:06 -0700 Subject: [PATCH 07/18] link openmp targets --- CMakeLists.txt | 57 +++++++++++++++++++------------------- cmake/HPTTConfig.cmake.in | 11 -------- cmake/hptt-config.cmake.in | 18 ++++++++++++ 3 files changed, 46 insertions(+), 40 deletions(-) delete mode 100644 cmake/HPTTConfig.cmake.in create mode 100644 cmake/hptt-config.cmake.in diff --git a/CMakeLists.txt b/CMakeLists.txt index ad5a922..828d12f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,33 +11,25 @@ if(NOT CMAKE_BUILD_TYPE) set (CMAKE_BUILD_TYPE Release) endif() +set(CMAKE_NO_SYSTEM_FROM_IMPORTED TRUE) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le") set(ENABLE_IBM ON) endif() -if(ENABLE_OPENMP) - set(HPTT_OMP_FLAGS -fopenmp) - if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") - set(HPTT_OMP_FLAGS -qopenmp) - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") - set(HPTT_OMP_FLAGS -qsmp=omp) - endif() -endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qopenmp -xhost) + set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -xhost) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(ENABLE_IBM) - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} ${HPTT_OMP_FLAGS}) - else() - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} ${HPTT_OMP_FLAGS} -march=native -mtune=native) + if(NOT ENABLE_IBM) + set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -march=native -mtune=native) endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} ${HPTT_OMP_FLAGS} -march=native) + set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -march=native) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI") set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -silent -w -Mnovect) -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qsmp=omp) +# elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") +# set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qsmp=omp) endif() if(ENABLE_AVX) @@ -56,7 +48,7 @@ set(HPTT_SRCS src/hptt.cpp src/plan.cpp src/transpose.cpp src/utils.cpp) add_library(hptt ${HPTT_SRCS}) -add_library(HPTT::hptt ALIAS hptt) +add_library(hptt::hptt ALIAS hptt) target_include_directories(hptt PUBLIC @@ -66,10 +58,17 @@ target_include_directories(hptt target_compile_options(hptt PRIVATE ${HPTT_CXX_FLAGS}) +if(ENABLE_OPENMP) + find_package(OpenMP REQUIRED) + target_link_libraries(hptt PUBLIC OpenMP::OpenMP_CXX) +endif() + +set_target_properties(hptt PROPERTIES EXPORT_NAME hptt) + # Install include(GNUInstallDirs) -set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/HPTT) +set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/hptt) install(TARGETS hptt EXPORT hptt-targets @@ -77,16 +76,15 @@ install(TARGETS hptt ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) -# set_target_properties(hptt PROPERTIES EXPORT_NAME HPTT) install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hptt) #Export the targets to a script install(EXPORT hptt-targets FILE - HPTTTargets.cmake + hptt-targets.cmake NAMESPACE - HPTT:: + hptt:: DESTINATION ${INSTALL_CONFIGDIR} ) @@ -94,27 +92,28 @@ install(EXPORT hptt-targets #Create a ConfigVersion.cmake file include(CMakePackageConfigHelpers) write_basic_package_version_file( - ${CMAKE_CURRENT_BINARY_DIR}/HPTTConfigVersion.cmake + ${CMAKE_CURRENT_BINARY_DIR}/hptt-config-version.cmake VERSION ${PROJECT_VERSION} COMPATIBILITY AnyNewerVersion ) -configure_package_config_file(${CMAKE_CURRENT_LIST_DIR}/cmake/HPTTConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/HPTTConfig.cmake +configure_package_config_file(${CMAKE_CURRENT_LIST_DIR}/cmake/hptt-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/hptt-config.cmake INSTALL_DESTINATION ${INSTALL_CONFIGDIR} + PATH_VARS CMAKE_INSTALL_INCLUDEDIR ) #Install the config, configversion and custom find modules install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/HPTTConfig.cmake - ${CMAKE_CURRENT_BINARY_DIR}/HPTTConfigVersion.cmake + ${CMAKE_CURRENT_BINARY_DIR}/hptt-config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/hptt-config-version.cmake DESTINATION ${INSTALL_CONFIGDIR} ) export(EXPORT hptt-targets - FILE ${CMAKE_CURRENT_BINARY_DIR}/HPTTTargets.cmake - NAMESPACE HPTT::) + FILE ${CMAKE_CURRENT_BINARY_DIR}/hptt-targets.cmake + NAMESPACE hptt::) #Register package in user's package registry -export(PACKAGE HPTT) \ No newline at end of file +export(PACKAGE hptt) \ No newline at end of file diff --git a/cmake/HPTTConfig.cmake.in b/cmake/HPTTConfig.cmake.in deleted file mode 100644 index 80d223f..0000000 --- a/cmake/HPTTConfig.cmake.in +++ /dev/null @@ -1,11 +0,0 @@ -get_filename_component(HPTT_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) -include(CMakeFindDependencyMacro) - -list(APPEND CMAKE_MODULE_PATH ${HPTT_CMAKE_DIR}) -list(REMOVE_AT CMAKE_MODULE_PATH -1) - -if(NOT TARGET HPTT::hptt) - include("${HPTT_CMAKE_DIR}/HPTTTargets.cmake") -endif() - -set(HPTT_LIBRARIES HPTT::hptt) diff --git a/cmake/hptt-config.cmake.in b/cmake/hptt-config.cmake.in new file mode 100644 index 0000000..673e714 --- /dev/null +++ b/cmake/hptt-config.cmake.in @@ -0,0 +1,18 @@ + +@PACKAGE_INIT@ + +set(ENABLE_OPENMP @ENABLE_OPENMP@) + +if(ENABLE_OPENMP) + # include( CMakeFindDependencyMacro ) + find_package(OpenMP REQUIRED) +endif() + +if(NOT TARGET hptt::hptt) + include("${CMAKE_CURRENT_LIST_DIR}/hptt-targets.cmake") +endif() + +set(HPTT_FOUND TRUE) +set(HPTT_LIBRARIES hptt::hptt) +set(HPTT_INCLUDE_DIRS "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@") + From e0d2ca666ecc12d5485ae924c2a59ad1b37dd216 Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Tue, 18 Aug 2020 12:37:39 -0700 Subject: [PATCH 08/18] minor --- CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 828d12f..6a048bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,13 +5,16 @@ project (HPTT VERSION 1.0.0 LANGUAGES C CXX) set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(ENABLE_IBM OFF) +set(CMAKE_NO_SYSTEM_FROM_IMPORTED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY ON) + +option(ENABLE_IBM OFF) if(NOT CMAKE_BUILD_TYPE) set (CMAKE_BUILD_TYPE Release) endif() -set(CMAKE_NO_SYSTEM_FROM_IMPORTED TRUE) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le") set(ENABLE_IBM ON) From 50d997d86d15334ebdfb417c9316736bcf146b3d Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Mon, 4 Jan 2021 09:25:54 -0800 Subject: [PATCH 09/18] minor --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a048bf..7ca8405 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,7 +63,7 @@ target_compile_options(hptt PRIVATE ${HPTT_CXX_FLAGS}) if(ENABLE_OPENMP) find_package(OpenMP REQUIRED) - target_link_libraries(hptt PUBLIC OpenMP::OpenMP_CXX) + target_link_libraries(hptt PUBLIC OpenMP::OpenMP_C) endif() set_target_properties(hptt PROPERTIES EXPORT_NAME hptt) From 45fc077b8be363a2de186bd787a97d7861a6bc85 Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Sat, 9 Jan 2021 13:04:34 -0800 Subject: [PATCH 10/18] revert --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ca8405..f3a27d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,7 +63,7 @@ target_compile_options(hptt PRIVATE ${HPTT_CXX_FLAGS}) if(ENABLE_OPENMP) find_package(OpenMP REQUIRED) - target_link_libraries(hptt PUBLIC OpenMP::OpenMP_C) + target_link_libraries(hptt PUBLIC OpenMP::OpenMP_CXX) endif() set_target_properties(hptt PROPERTIES EXPORT_NAME hptt) @@ -119,4 +119,4 @@ export(EXPORT hptt-targets NAMESPACE hptt::) #Register package in user's package registry -export(PACKAGE hptt) \ No newline at end of file +export(PACKAGE hptt) From 584aa0870d43540b66fe872d298bbf4d96c19cad Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Mon, 31 Jan 2022 13:07:26 -0800 Subject: [PATCH 11/18] arch flags fix --- CMakeLists.txt | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f3a27d5..6b2ef87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,7 @@ if(NOT CMAKE_BUILD_TYPE) set (CMAKE_BUILD_TYPE Release) endif() +include(CheckCXXCompilerFlag) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le") set(ENABLE_IBM ON) @@ -22,17 +23,25 @@ endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -xhost) + set(HPTT_ARCH_FLAGS -xhost) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(NOT ENABLE_IBM) - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -march=native -mtune=native) - endif() + set(HPTT_ARCH_FLAGS -march=native) # -mtune=native + endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -march=native) + set(HPTT_ARCH_FLAGS -march=native) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -silent -w -Mnovect) + set(HPTT_ARCH_FLAGS -silent -w -Mnovect) # elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") -# set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qsmp=omp) +# set(HPTT_ARCH_FLAGS -qsmp=omp) +#elseif(CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(HPTT_ARCH_FLAGS -mtune=a64fx) +endif() + +check_cxx_compiler_flag("${HPTT_ARCH_FLAGS}" __COMPILER_SUPPORTS_MARCH) +if(__COMPILER_SUPPORTS_MARCH) + set(HPTT_CXX_FLAGS "${HPTT_ARCH_FLAGS}") endif() if(ENABLE_AVX) From bf4fa878524d1e99bd7afa1b6c1807ff0fab4e7b Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Tue, 1 Feb 2022 17:35:11 -0800 Subject: [PATCH 12/18] arm mcpu flag --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b2ef87..f0201bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,7 +36,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI") # set(HPTT_ARCH_FLAGS -qsmp=omp) #elseif(CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(HPTT_ARCH_FLAGS -mtune=a64fx) + set(HPTT_ARCH_FLAGS -mcpu=native) endif() check_cxx_compiler_flag("${HPTT_ARCH_FLAGS}" __COMPILER_SUPPORTS_MARCH) From 36ee84a72704091a32234d5d3085baa523b94e1e Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Fri, 18 Feb 2022 11:33:48 -0800 Subject: [PATCH 13/18] allow option for march flags --- CMakeLists.txt | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f0201bf..d498d00 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,21 +22,25 @@ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le") endif() -if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") - set(HPTT_ARCH_FLAGS -xhost) -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(NOT ENABLE_IBM) - set(HPTT_ARCH_FLAGS -march=native) # -mtune=native - endif() -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(HPTT_ARCH_FLAGS -march=native) -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI") - set(HPTT_ARCH_FLAGS -silent -w -Mnovect) -# elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") -# set(HPTT_ARCH_FLAGS -qsmp=omp) -#elseif(CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang") -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(HPTT_ARCH_FLAGS -mcpu=native) +if(DEFINED MARCH_FLAGS) + set(HPTT_ARCH_FLAGS ${MARCH_FLAGS}) +else() + if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + set(HPTT_ARCH_FLAGS -xhost) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(NOT ENABLE_IBM) + set(HPTT_ARCH_FLAGS -march=native) # -mtune=native + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(HPTT_ARCH_FLAGS -march=native) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI") + set(HPTT_ARCH_FLAGS -silent -w -Mnovect) + # elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") + # set(HPTT_ARCH_FLAGS -qsmp=omp) + #elseif(CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(HPTT_ARCH_FLAGS -mcpu=native) + endif() endif() check_cxx_compiler_flag("${HPTT_ARCH_FLAGS}" __COMPILER_SUPPORTS_MARCH) From 73fbab26d280cebcb66340579b654ae3f358b55a Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Fri, 22 Apr 2022 17:43:44 -0700 Subject: [PATCH 14/18] std17 --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d498d00..eca64e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,8 @@ -cmake_minimum_required(VERSION 3.15 FATAL_ERROR) +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) project (HPTT VERSION 1.0.0 LANGUAGES C CXX) -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_NO_SYSTEM_FROM_IMPORTED ON) From aee625b2e51ede30a3b5edf50d159452e7839d95 Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Thu, 4 Aug 2022 21:38:38 -0700 Subject: [PATCH 15/18] oneapi sdk fix --- src/transpose.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transpose.cpp b/src/transpose.cpp index f77cd5b..5cda270 100644 --- a/src/transpose.cpp +++ b/src/transpose.cpp @@ -880,7 +880,7 @@ static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, ) } else { if( useStreamingStores) -#pragma vector nontemporal +// #pragma vector nontemporal HPTT_DUPLICATE(spawnThreads, for(int32_t i = myStart; i < myEnd; i++) if( conjA ) From a9706b49a995249a8567254e29129b602f7bb501 Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Sat, 13 May 2023 11:30:58 -0700 Subject: [PATCH 16/18] fix signedness warnings --- include/plan.h | 2 +- include/transpose.h | 4 +-- include/utils.h | 3 +- src/transpose.cpp | 75 +++++++++++++++++++++++---------------------- 4 files changed, 43 insertions(+), 41 deletions(-) diff --git a/include/plan.h b/include/plan.h index 2ff260d..27e6b3c 100644 --- a/include/plan.h +++ b/include/plan.h @@ -17,7 +17,7 @@ class ComputeNode; class Plan { public: - Plan() : rootNodes_(nullptr), numTasks_(0) { } + Plan() : numTasks_(0), rootNodes_(nullptr) { } Plan(std::vectorloopOrder, std::vectornumThreadsAtLoop); diff --git a/include/transpose.h b/include/transpose.h index 82f0239..fa6fd7d 100644 --- a/include/transpose.h +++ b/include/transpose.h @@ -254,8 +254,8 @@ class Transpose int dim_; //!< dimension of the tensor std::vector sizeA_; //!< size of A std::vector perm_; //!< permutation - std::vector outerSizeA_; //!< outer sizes of A - std::vector outerSizeB_; //!< outer sizes of B + std::vector outerSizeA_; //!< outer sizes of A + std::vector outerSizeB_; //!< outer sizes of B std::vector lda_; //!< strides for all dimensions of A (first dimension has a stride of 1) std::vector ldb_; //!< strides for all dimensions of B (first dimension has a stride of 1) std::vector threadIds_; //!< OpenMP threadIds of the threads involed in the transposition diff --git a/include/utils.h b/include/utils.h index a85b27c..3937f7b 100644 --- a/include/utils.h +++ b/include/utils.h @@ -16,6 +16,7 @@ template static floatType conj(floatType x){ return std::conj(x); } + template<> float conj(float x){ return x; @@ -66,7 +67,7 @@ void getPrimeFactors( int n, std::list &primeFactors ); template int findPos(t value, const std::vector &array) { - for(int i=0;i < array.size() ; ++i) + for(size_t i = 0; i < array.size(); ++i) if( array[i] == value ) return i; return -1; diff --git a/src/transpose.cpp b/src/transpose.cpp index 5cda270..b9d81a0 100644 --- a/src/transpose.cpp +++ b/src/transpose.cpp @@ -604,17 +604,17 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict floatType* __restrict__ B, const floatType* __restrict__ Bnext, const floatType alpha, const floatType beta, const ComputeNode* plan) { - const int32_t end = plan->end - (plan->inc - 1); - const int32_t inc = plan->inc; + const size_t end = plan->end - (plan->inc - 1); + const size_t inc = plan->inc; const size_t lda = plan->lda; const size_t ldb = plan->ldb; - constexpr int blocking_micro_ = REGISTER_BITS/8 / sizeof(floatType); - constexpr int blocking_ = blocking_micro_ * 4; + constexpr size_t blocking_micro_ = REGISTER_BITS/8 / sizeof(floatType); + constexpr size_t blocking_ = blocking_micro_ * 4; if( plan->next->next != nullptr ){ // recurse - int32_t i; + size_t i; for(i = plan->start; i < end; i+= inc) { if( i + inc < end ) @@ -649,7 +649,7 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict const size_t ldb_macro = plan->next->ldb; // invoke macro-kernel - int32_t i; + size_t i; for(i = plan->start; i < end; i+= inc) if( i + inc < end ) macro_kernel(&A[i*lda], &A[(i+1)*lda], lda_macro, &B[i*ldb], &B[(i+1)*ldb], ldb_macro, alpha, beta); @@ -704,11 +704,11 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r } else { if( useStreamingStores) if( conjA ) -#pragma vector nontemporal +// #pragma vector nontemporal for(int32_t i = plan->start; i < end; i+= inc) B[i] = alpha * conj(A[i]); else -#pragma vector nontemporal +// #pragma vector nontemporal for(int32_t i = plan->start; i < end; i+= inc) B[i] = alpha * A[i]; else @@ -742,12 +742,12 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r beta_(beta), dim_(-1), numThreads_(numThreads), - masterPlan_(nullptr), - selectionMethod_(selectionMethod), - maxAutotuningCandidates_(-1), selectedParallelStrategyId_(-1), selectedLoopOrderId_(-1), - conjA_(false) + conjA_(false), + masterPlan_(nullptr), + selectionMethod_(selectionMethod), + maxAutotuningCandidates_(-1) { #ifdef _OPENMP omp_init_lock(&writelock); @@ -793,12 +793,6 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r alpha_(other.alpha_), beta_(other.beta_), dim_(other.dim_), - numThreads_(other.numThreads_), - masterPlan_(other.masterPlan_), - selectionMethod_(other.selectionMethod_), - selectedParallelStrategyId_(other.selectedParallelStrategyId_), - selectedLoopOrderId_(other.selectedLoopOrderId_), - maxAutotuningCandidates_(other.maxAutotuningCandidates_), sizeA_(other.sizeA_), perm_(other.perm_), outerSizeA_(other.outerSizeA_), @@ -806,7 +800,14 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r lda_(other.lda_), ldb_(other.ldb_), threadIds_(other.threadIds_), - conjA_(other.conjA_) + numThreads_(other.numThreads_), + selectedParallelStrategyId_(other.selectedParallelStrategyId_), + selectedLoopOrderId_(other.selectedLoopOrderId_), + conjA_(other.conjA_), + masterPlan_(other.masterPlan_), + selectionMethod_(other.selectionMethod_), + maxAutotuningCandidates_(other.maxAutotuningCandidates_) + { #ifdef _OPENMP omp_init_lock(&writelock); @@ -918,7 +919,7 @@ static void axpy_2D( const floatType* __restrict__ A, const int lda, if( useStreamingStores) HPTT_DUPLICATE(spawnThreads, for(int32_t j = myStart; j < myEnd; j++) -_Pragma("vector nontemporal") +// _Pragma("vector nontemporal") for(int32_t i = 0; i < n0; i++) if( conjA ) B[i + j * ldb] = alpha * conj(A[i + j * lda]); @@ -1007,7 +1008,7 @@ void Transpose::execute_expert() noexcept } const int numTasks = masterPlan_->getNumTasks(); - const int numThreads = numThreads_; + // const int numThreads = numThreads_; getStartEnd(numTasks, myStart, myEnd); HPTT_DUPLICATE(spawnThreads, @@ -1129,7 +1130,7 @@ float Transpose::getLoadBalance( const std::vector ¶llelismS int totalTasks = 1; for(int i=0; i < dim_; ++i){ - int inc = this->getIncrement(i); + size_t inc = this->getIncrement(i); while(sizeA_[i] < inc) inc /= 2; int availableParallelism = (sizeA_[i] + inc - 1) / inc; @@ -1259,7 +1260,7 @@ void Transpose::getBestParallelismStrategy ( std::vector &bestPa float lb2 = getLoadBalance(strat2); // printVector(strat2,"strat2"); // printf("strat2: %f\n",getLoadBalance(strat2)); - if( lb1 > 0.8 && lb2 < 0.85 || lb1 >lb2 && lb1 > 0.75 ) + if( (lb1 > 0.8 && lb2 < 0.85) || (lb1 > lb2 && lb1 > 0.75) ) { std::copy(strat1.begin(), strat1.end(), bestParallelismStrategy.begin()); return; @@ -1514,11 +1515,11 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const } // compact arrays (remove -1) for(int i=0;i < dim ; ++i) - if( sizeA_[i] == -1 ) + if( (int)sizeA_[i] == -1 ) { int j=i+1; for(;j < dim ; ++j) - if( sizeA_[j] != -1 ) + if( (int)sizeA_[j] != -1 ) break; if( j < dim ) std::swap(sizeA_[i], sizeA_[j]); @@ -1614,8 +1615,8 @@ void Transpose::fuseIndices() int toMerge = i; perm.push_back(perm_[i]); while(i+1 < dim_ && perm_[i] + 1 == perm_[i+1] - && (sizeA_[perm_[i]] == outerSizeA_[perm_[i]]) - && (sizeA_[perm_[i]] == outerSizeB_[i]) ){ + && ((int)sizeA_[perm_[i]] == outerSizeA_[perm_[i]]) + && ((int)sizeA_[perm_[i]] == outerSizeB_[i]) ){ #ifdef DEBUG fprintf(stderr,"[HPTT] MERGING indices %d and %d\n",perm_[i], perm_[i+1]); #endif @@ -1641,11 +1642,11 @@ void Transpose::fuseIndices() perm_ = perm; // remove gaps in the perm, if requried (e.g., perm=3,1,0 -> 2,1,0) int currentValue = 0; - for(int i=0;i < perm_.size(); ++i){ + for(size_t i = 0;i < perm_.size(); ++i){ //find smallest element in perm_ and rename it to currentValue int minValue = 1000000; int minPos = -1; - for(int pos=0; pos < perm_.size(); ++pos){ + for(int pos = 0; pos < (int)perm_.size(); ++pos){ if ( perm_[pos] >= currentValue && perm_[pos] < minValue) { minValue = perm_[pos]; minPos = pos; @@ -1917,10 +1918,10 @@ void Transpose::createPlans( std::vector > &pla // heuristics, search the space with a growing rectangle (from best to worst, // see line marked with ***) bool done = false; - for( int start= 0; start< std::max( parallelismStrategies.size(), loopOrders.size() ) && !done; start++ ) - for( int i = 0; i < parallelismStrategies.size() && !done; i++) + for( size_t start= 0; start< std::max( parallelismStrategies.size(), loopOrders.size() ) && !done; start++ ) + for( size_t i = 0; i < parallelismStrategies.size() && !done; i++) { - for( int j = 0; j < loopOrders.size() && !done; j++) + for( size_t j = 0; j < loopOrders.size() && !done; j++) { if( i > start || j > start || (i != start && j != start) ) continue; //these are already done *** @@ -1975,10 +1976,10 @@ void Transpose::createPlans( std::vector > &pla } } plans.push_back(plan); - if( selectionMethod_ == ESTIMATE || - selectionMethod_ == MEASURE && plans.size() > 200 || - selectionMethod_ == PATIENT && plans.size() > 400 || - selectionMethod_ == CRAZY && plans.size() > 800 ) + if( (selectionMethod_ == ESTIMATE) || + (selectionMethod_ == MEASURE && plans.size() > 200) || + (selectionMethod_ == PATIENT && plans.size() > 400) || + (selectionMethod_ == CRAZY && plans.size() > 800) ) done = true; } } @@ -2078,7 +2079,7 @@ std::shared_ptr Transpose::selectPlan( const std::vectorinfoLevel_ > 0 ) - printf("We evaluated %d/%d candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); + printf("We evaluated %d/%ld candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); } return plans[bestPlan_id]; } From 43498c3575761407a835b4a7bd3591d4e0a4b9fa Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Sat, 13 May 2023 11:48:40 -0700 Subject: [PATCH 17/18] fix other warnings --- src/transpose.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/transpose.cpp b/src/transpose.cpp index b9d81a0..dc0d3e1 100644 --- a/src/transpose.cpp +++ b/src/transpose.cpp @@ -835,19 +835,21 @@ void Transpose::executeEstimate(const Plan *plan) noexcept #ifdef _OPENMP #pragma omp parallel for num_threads(numThreads_) if(numThreads_ > 1) #endif + + const floatType* __restrict__ Bnext__ = B_; for( int taskId = 0; taskId < numTasks; taskId++) if ( perm_[0] != 0 ) { auto rootNode = plan->getRootNode_const( taskId ); if( std::abs(beta_) < getZeroThreshold() ) { if( conjA_ ) - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode); else - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode); } else { if( conjA_ ) - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode); else - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode); } } else { auto rootNode = plan->getRootNode_const( taskId ); @@ -1011,14 +1013,15 @@ void Transpose::execute_expert() noexcept // const int numThreads = numThreads_; getStartEnd(numTasks, myStart, myEnd); + const floatType* __restrict__ Bnext__ = B_; HPTT_DUPLICATE(spawnThreads, for( int taskId = myStart; taskId < myEnd; taskId++) if ( perm_[0] != 0 ) { auto rootNode = masterPlan_->getRootNode_const( taskId ); if( conjA_ ) - transpose_int( A_, A_, B_, B_, alpha_, beta_, rootNode); + transpose_int( A_, A_, B_, Bnext__, alpha_, beta_, rootNode); else - transpose_int( A_, A_, B_, B_, alpha_, beta_, rootNode); + transpose_int( A_, A_, B_, Bnext__, alpha_, beta_, rootNode); } else { auto rootNode = masterPlan_->getRootNode_const( taskId ); if( conjA_ ) From eff1bdd79734ddc4993dd4df1d0cdbd40758b9cb Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Sat, 13 May 2023 13:01:07 -0700 Subject: [PATCH 18/18] bug fix for issue introduced in commit @a9706b4 --- include/compute_node.h | 10 +++++----- include/transpose.h | 6 +++--- src/plan.cpp | 2 +- src/transpose.cpp | 14 +++++++------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/compute_node.h b/include/compute_node.h index b777857..a778f7c 100644 --- a/include/compute_node.h +++ b/include/compute_node.h @@ -15,11 +15,11 @@ class ComputeNode delete next; } - size_t start; //!< start index for at the current loop - size_t end; //!< end index for at the current loop - size_t inc; //!< increment for at the current loop - size_t lda; //!< stride of A w.r.t. the loop index - size_t ldb; //!< stride of B w.r.t. the loop index + int start; //!< start index for at the current loop + int end; //!< end index for at the current loop + int inc; //!< increment for at the current loop + int lda; //!< stride of A w.r.t. the loop index + int ldb; //!< stride of B w.r.t. the loop index ComputeNode *next; //!< next ComputeNode, this might be another loop or 'nullptr' (i.e., indicating that the macro-kernel should be called) }; diff --git a/include/transpose.h b/include/transpose.h index fa6fd7d..90dabae 100644 --- a/include/transpose.h +++ b/include/transpose.h @@ -252,12 +252,12 @@ class Transpose floatType alpha_; //!< scaling factor for A floatType beta_; //!< scaling factor for B int dim_; //!< dimension of the tensor - std::vector sizeA_; //!< size of A + std::vector sizeA_; //!< size of A std::vector perm_; //!< permutation std::vector outerSizeA_; //!< outer sizes of A std::vector outerSizeB_; //!< outer sizes of B - std::vector lda_; //!< strides for all dimensions of A (first dimension has a stride of 1) - std::vector ldb_; //!< strides for all dimensions of B (first dimension has a stride of 1) + std::vector lda_; //!< strides for all dimensions of A (first dimension has a stride of 1) + std::vector ldb_; //!< strides for all dimensions of B (first dimension has a stride of 1) std::vector threadIds_; //!< OpenMP threadIds of the threads involed in the transposition int numThreads_; int selectedParallelStrategyId_; diff --git a/src/plan.cpp b/src/plan.cpp index 7c5b9bc..d1d5d13 100644 --- a/src/plan.cpp +++ b/src/plan.cpp @@ -6,7 +6,7 @@ namespace hptt { - Plan::Plan(std::vectorloopOrder, std::vectornumThreadsAtLoop) : rootNodes_(nullptr), loopOrder_(loopOrder), numThreadsAtLoop_(numThreadsAtLoop) { + Plan::Plan(std::vectorloopOrder, std::vectornumThreadsAtLoop) : loopOrder_(loopOrder), numThreadsAtLoop_(numThreadsAtLoop), rootNodes_(nullptr) { numTasks_ = 1; for(auto nt : numThreadsAtLoop) numTasks_ *= nt; diff --git a/src/transpose.cpp b/src/transpose.cpp index dc0d3e1..03ec97e 100644 --- a/src/transpose.cpp +++ b/src/transpose.cpp @@ -604,17 +604,17 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict floatType* __restrict__ B, const floatType* __restrict__ Bnext, const floatType alpha, const floatType beta, const ComputeNode* plan) { - const size_t end = plan->end - (plan->inc - 1); - const size_t inc = plan->inc; + const int32_t end = plan->end - (plan->inc - 1); + const int32_t inc = plan->inc; const size_t lda = plan->lda; const size_t ldb = plan->ldb; - constexpr size_t blocking_micro_ = REGISTER_BITS/8 / sizeof(floatType); - constexpr size_t blocking_ = blocking_micro_ * 4; + constexpr int blocking_micro_ = REGISTER_BITS/8 / sizeof(floatType); + constexpr int blocking_ = blocking_micro_ * 4; if( plan->next->next != nullptr ){ // recurse - size_t i; + int32_t i; for(i = plan->start; i < end; i+= inc) { if( i + inc < end ) @@ -649,7 +649,7 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict const size_t ldb_macro = plan->next->ldb; // invoke macro-kernel - size_t i; + int32_t i; for(i = plan->start; i < end; i+= inc) if( i + inc < end ) macro_kernel(&A[i*lda], &A[(i+1)*lda], lda_macro, &B[i*ldb], &B[(i+1)*ldb], ldb_macro, alpha, beta); @@ -1133,7 +1133,7 @@ float Transpose::getLoadBalance( const std::vector ¶llelismS int totalTasks = 1; for(int i=0; i < dim_; ++i){ - size_t inc = this->getIncrement(i); + int inc = this->getIncrement(i); while(sizeA_[i] < inc) inc /= 2; int availableParallelism = (sizeA_[i] + inc - 1) / inc;