diff --git a/.asf.yaml b/.asf.yaml index f3a8ed9fee90f..685776c9a3872 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -20,7 +20,7 @@ github: homepage: https://arrow.apache.org/ collaborators: - anjakefala - - benibus + - hiroyuki-sato - jbonofre - js8544 - vibhatha diff --git a/.env b/.env index c18a3b066f9b2..bdb74d89e1c6e 100644 --- a/.env +++ b/.env @@ -54,6 +54,7 @@ UBUNTU=22.04 # Default versions for various dependencies CLANG_TOOLS=14 +CMAKE=3.25.0 CUDA=11.2.2 DASK=latest DOTNET=8.0 @@ -62,7 +63,7 @@ HDFS=3.2.1 JDK=11 KARTOTHEK=latest # LLVM 12 and GCC 11 reports -Wmismatched-new-delete. -LLVM=14 +LLVM=18 MAVEN=3.8.7 NODE=18 NUMBA=latest @@ -89,17 +90,17 @@ TZ=UTC # Used through docker-compose.yml and serves as the default version for the # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the # docker tags more readable. -VCPKG="943c5ef1c8f6b5e6ced092b242c8299caae2ff01" # 2024.04.26 Release +VCPKG="f7423ee180c4b7f40d43402c2feb3859161ef625" # 2024.06.15 Release # This must be updated when we update # ci/docker/python-*-windows-*.dockerfile or the vcpkg config. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2025-01-27 -PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2025-01-27 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2025-02-25 +PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2025-02-25 # Use conanio/${CONAN_BASE}:{CONAN_VERSION} for "docker compose run --rm conan". # See https://github.com/conan-io/conan-docker-tools#readme and # https://hub.docker.com/u/conanio for available images. -CONAN_BASE=gcc10 -CONAN_VERSION=1.62.0 +CONAN_BASE=gcc11-ubuntu16.04 +CONAN_VERSION=2.12.1 diff --git a/.github/ISSUE_TEMPLATE/usage_question.yaml b/.github/ISSUE_TEMPLATE/usage_question.yaml index c711190540afa..3c232dd931a54 100644 --- a/.github/ISSUE_TEMPLATE/usage_question.yaml +++ b/.github/ISSUE_TEMPLATE/usage_question.yaml @@ -23,23 +23,26 @@ body: - type: markdown attributes: value: > - While we enable issues as a mechanism for new contributors and passers-by who - are unfamiliar with Apache Software Foundation projects to ask questions and - interact with the project, we encourage users to ask such questions on public - mailing lists: - - * Development discussions: dev@arrow.apache.org (first subscribe by sending an - e-mail to dev-subscribe@arrow.apache.org). - - * User discussions: user@arrow.apache.org (first subscribe by sending an e-mail - to user-subscribe@arrow.apache.org). - - * Mailing list archives: https://arrow.apache.org/community/ - - - Do not be surprised by responses to issues raised here directing you to those - mailing lists, or to report a bug or feature request here. + While we enable issues as a mechanism for new contributors and + passers-by who are unfamiliar with Apache Software Foundation projects + to ask questions and interact with the project, we encourage users to + ask such questions on the [public mailing + lists](https://arrow.apache.org/community/) as these provide higher + visibility than GitHub issues: + + * For usage questions, please email user@arrow.apache.org (first + subscribe by sending an e-mail to user-subscribe@arrow.apache.org). + + * For discussions about contributing or development, please email + dev@arrow.apache.org (first subscribe by sending an e-mail to + dev-subscribe@arrow.apache.org). 
+ Please see the [Apache Arrow Community + page](https://arrow.apache.org/community/) for more information on the + mailing lists as well as for a link to the searchable archives. + + Do not be surprised by responses to issues raised here directing you to those + mailing lists, or to report a bug or feature request here. Thank you! - type: textarea diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 3839d3e2fc889..4b3eac2d43305 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,61 +1,20 @@ - - +Please remove this line and the above text before creating your pull request. ### Rationale for this change - - ### What changes are included in this PR? - - ### Are these changes tested? - - ### Are there any user-facing changes? - - - - +**This PR includes breaking changes to public APIs.** (If there are any breaking changes to public APIs, please explain which changes are breaking. If not, you can remove this.) - - \ No newline at end of file +**This PR contains a "Critical Fix".** (If the changes fix either (a) a security vulnerability, (b) a bug that caused incorrect or invalid data to be produced, or (c) a bug that causes a crash (even when the API contract is upheld), please provide explanation. If not, you can remove this.) diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index 6dc4da306a1ea..e486ef0e16e59 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -65,7 +65,7 @@ jobs: shell: bash run: git branch $ARCHERY_DEFAULT_BRANCH origin/$ARCHERY_DEFAULT_BRANCH || true - name: Setup Python - uses: actions/setup-python@v5.3.0 + uses: actions/setup-python@v5.4.0 with: python-version: '3.9' - name: Install pygit2 binary wheel diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 83b6f6e31ffc3..578b47361b71e 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -42,7 +42,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 7dfe987d2eaff..cbb448cfa07f2 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -94,12 +94,12 @@ jobs: title: AMD64 Ubuntu 22.04 C++ ASAN UBSAN ubuntu: 22.04 - arch: arm64v8 - clang-tools: 10 + clang-tools: 14 image: ubuntu-cpp - llvm: 10 + llvm: 14 runs-on: ubuntu-24.04-arm - title: ARM64 Ubuntu 20.04 C++ - ubuntu: 20.04 + title: ARM64 Ubuntu 22.04 C++ + ubuntu: 22.04 env: ARCH: ${{ matrix.arch }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} @@ -113,7 +113,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Cache Docker Volumes - uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 + uses: actions/cache@v4 with: path: .docker key: ${{ matrix.image }}-${{ hashFiles('cpp/**') }} @@ -121,7 +121,7 @@ jobs: - name: Setup Python on hosted runner if: | matrix.runs-on == 'ubuntu-latest' - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3 - name: Setup Python on self-hosted runner @@ -156,7 +156,7 @@ jobs: build-example: name: C++ Minimal Build Example - runs-on: ubuntu-latest 
+ runs-on: ubuntu-24.04 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 45 steps: @@ -234,7 +234,7 @@ jobs: $(brew --prefix bash)/bin/bash \ ci/scripts/install_minio.sh latest ${ARROW_HOME} - name: Set up Python - uses: actions/setup-python@v5.3.0 + uses: actions/setup-python@v5.4.0 with: python-version: 3.12 - name: Install Google Cloud Storage Testbench @@ -274,11 +274,11 @@ jobs: fail-fast: false matrix: os: - - windows-2019 + - windows-2022 include: - - os: windows-2019 + - os: windows-2022 simd-level: AVX2 - title: AMD64 Windows 2019 C++17 AVX2 + title: AMD64 Windows 2022 AVX2 env: ARROW_BOOST_USE_SHARED: OFF ARROW_BUILD_BENCHMARKS: ON @@ -352,7 +352,7 @@ jobs: - name: Build shell: cmd run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 bash -c "ci/scripts/cpp_build.sh $(pwd) $(pwd)/build" - name: Test shell: bash @@ -453,7 +453,7 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z chmod +x /usr/local/bin/minio.exe - name: Set up Python - uses: actions/setup-python@v5.3.0 + uses: actions/setup-python@v5.4.0 id: python-install with: python-version: 3.9 diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index 72ca0565ebd4c..6622323a7205d 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -54,11 +54,11 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4.2.0 + uses: actions/setup-dotnet@v4.3.0 with: dotnet-version: ${{ matrix.dotnet }} - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3 - name: Checkout Arrow @@ -86,7 +86,7 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4.2.0 + uses: actions/setup-dotnet@v4.3.0 with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow @@ -113,11 +113,11 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4.2.0 + uses: actions/setup-dotnet@v4.3.0 with: dotnet-version: ${{ matrix.dotnet }} - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Checkout Arrow @@ -182,7 +182,7 @@ jobs: "s/^ .+<\/Version>/ ${semver}<\/Version>/" \ csharp/Directory.Build.props - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3 - name: Setup Archery @@ -199,7 +199,7 @@ jobs: dev/release/utils-generate-checksum.sh "${artifact}" done - name: Upload - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 with: name: nuget path: | diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index f9718cbf7bb18..a62e06a7e29b4 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -41,7 +41,7 @@ jobs: lint: name: Lint C++, Python, R, Docker, RAT - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 15 steps: @@ -50,7 +50,7 @@ 
jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Install pre-commit @@ -109,7 +109,7 @@ jobs: with: fetch-depth: 0 - name: Install Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: '3.12' - name: Install Ruby @@ -117,7 +117,7 @@ jobs: with: ruby-version: ruby - name: Install .NET - uses: actions/setup-dotnet@87b7050bc53ea08284295505d98d2aa94301e852 # v4.2.0 + uses: actions/setup-dotnet@3951f0dfe7a07e2313ec93c75700083e2005cbab # v4.3.0 with: dotnet-version: '8.0.x' - name: Install Dependencies diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 83f835d588af2..55ca6e6f2cc2c 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -46,13 +46,13 @@ jobs: run: | ci/scripts/util_free_space.sh - name: Cache Docker Volumes - uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 + uses: actions/cache@v4 with: path: .docker key: debian-docs-${{ hashFiles('cpp/**') }} restore-keys: debian-docs- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 0e23394e8a453..f406c7396c46e 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -53,13 +53,13 @@ jobs: with: fetch-depth: 0 - name: Cache Docker Volumes - uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 + uses: actions/cache@v4 with: path: .docker key: conda-docs-${{ hashFiles('cpp/**') }} restore-keys: conda-docs- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index aef81df074888..a6a6d22d09f92 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -100,13 +100,13 @@ jobs: run: | ci/scripts/util_free_space.sh - name: Cache Docker Volumes - uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 + uses: actions/cache@v4 with: path: .docker key: conda-${{ hashFiles('cpp/**') }} restore-keys: conda- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 5ef5b37c98815..e100e26a05d50 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -61,7 +61,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index 2bdfd0743a547..101724b3e2cd3 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -46,24 +46,8 @@ 
permissions: jobs: ubuntu: - name: AMD64 Ubuntu 20.04 MATLAB - # Explicitly pin the Ubuntu version to 20.04 for the time being because: - # - # 1. The version of GLIBCXX shipped with Ubuntu 22.04 is not binary compatible - # with the GLIBCXX bundled with MATLAB R2023a. This is a relatively common - # issue. - # - # For example, see: - # - # https://www.mathworks.com/matlabcentral/answers/1907290-how-to-manually-select-the-libstdc-library-to-use-to-resolve-a-version-glibcxx_-not-found - # - # 2. The version of GLIBCXX shipped with Ubuntu 22.04 is not binary compatible with - # the version of GLIBCXX shipped with Debian 11. Several of the Arrow community - # members who work on the MATLAB bindings use Debian 11 locally for qualification. - # Using Ubuntu 20.04 eases development workflows for these community members. - # - # In the future, we can investigate adding support for building against more Linux (e.g. `ubuntu-22.04`) and MATLAB versions (e.g. R2023b). - runs-on: ubuntu-20.04 + name: AMD64 Ubuntu 22.04 MATLAB + runs-on: ubuntu-22.04 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Check out repository @@ -155,7 +139,7 @@ jobs: runs-on: windows-2022 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - - name: Check out repository + - name: Check out repository uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml index 1eedacf1abf31..a9ccdad0c52cb 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -82,7 +82,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/pr_review_trigger.yml b/.github/workflows/pr_review_trigger.yml index a6dd5f1275331..4457e31cd986b 100644 --- a/.github/workflows/pr_review_trigger.yml +++ b/.github/workflows/pr_review_trigger.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Upload PR review Payload" - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 with: path: "${{ github.event_path }}" name: "pr_review_payload" diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index ba05fab65ada2..19e7754fd87e1 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -108,13 +108,13 @@ jobs: fetch-depth: 0 submodules: recursive - name: Cache Docker Volumes - uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 + uses: actions/cache@v4 with: path: .docker key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} restore-keys: ${{ matrix.cache }}- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery @@ -183,7 +183,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@v5.3.0 + uses: actions/setup-python@v5.4.0 with: python-version: '3.11' - name: Install Dependencies diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index cb000f8b95c1b..f74abd8e58ab0 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -145,7 
+145,7 @@ jobs: run: | ci/scripts/util_free_space.sh - name: Cache Docker Volumes - uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 + uses: actions/cache@v4 with: path: .docker # As this key is identical on both matrix builds only one will be able to successfully cache, @@ -155,7 +155,7 @@ jobs: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}- ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery @@ -177,7 +177,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 with: name: test-output-${{ matrix.ubuntu }}-${{ matrix.r }} path: r/check/arrow.Rcheck/tests/testthat.Rout* @@ -214,7 +214,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery @@ -237,7 +237,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 with: name: test-output-bundled path: r/check/arrow.Rcheck/tests/testthat.Rout* @@ -299,7 +299,7 @@ jobs: # So that they're unique when multiple are downloaded in the next step shell: bash run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip - - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + - uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 with: name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip @@ -337,7 +337,7 @@ jobs: echo "$HOME/.local/bin" >> $GITHUB_PATH - run: mkdir r/windows - name: Download artifacts - uses: actions/download-artifact@v4.1.8 + uses: actions/download-artifact@v4.1.9 with: name: libarrow-rtools40-ucrt64.zip path: r/windows @@ -361,6 +361,7 @@ jobs: working-directory: 'r' extra-packages: | any::rcmdcheck + any::cyclocomp # TODO(ARROW-17149): figure out why the GCS tests are hanging on Windows # - name: Install Google Cloud Storage Testbench # shell: bash diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index 4fcb399c91fc6..e8a3c58a8451b 100644 --- a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -60,7 +60,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: cache: 'pip' python-version: 3.12 @@ -86,7 +86,7 @@ jobs: exit 1 fi - name: Cache Repo - uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 + uses: actions/cache@v4 with: path: repo key: r-nightly-${{ github.run_id }} @@ -103,6 +103,7 @@ jobs: remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }} remote_host_key: ${{ secrets.NIGHTLIES_RSYNC_HOST_KEY }} - run: tree repo + - uses: 
r-lib/actions/setup-r@v2 - name: Build Repository shell: Rscript {0} run: | diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 589b74cd687fd..7a29d35ee7b1d 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -84,13 +84,13 @@ jobs: fetch-depth: 0 submodules: recursive - name: Cache Docker Volumes - uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 + uses: actions/cache@v4 with: path: .docker key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} restore-keys: ubuntu-${{ matrix.ubuntu }}-ruby- - name: Setup Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml index 2fd55d457c208..971c2590c5af4 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -64,7 +64,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python on hosted runner - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: 3 - name: Setup Archery diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0ee2e233bb19f..54ee2a76b96bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,7 +39,7 @@ repos: files: >- ( ?^ci/docker/conda-python-emscripten\.dockerfile$| - ?^ci/docker/python-.*-wheel-windows-test-vs2019.*\.dockerfile$| + ?^ci/docker/python-.*-wheel-windows-test-vs2022.*\.dockerfile$| ) types: [] - repo: https://github.com/pycqa/flake8 @@ -141,6 +141,18 @@ repos: ( ?^r/src/arrowExports\.cpp$| ) + - repo: https://github.com/rubocop/rubocop + rev: "v1.71.0" + hooks: + - id: rubocop + name: Ruby Format + alias: ruby-format + args: + - "--autocorrect" + exclude: >- + ( + ?^dev/tasks/homebrew-formulae/.*\.rb$| + ) - repo: https://github.com/cheshirekow/cmake-format-precommit rev: v0.6.13 hooks: @@ -185,3 +197,8 @@ repos: ?^c_glib/test/run-test\.sh$| ?^dev/release/utils-generate-checksum\.sh$| ) + - repo: https://github.com/trim21/pre-commit-mirror-meson + rev: v1.6.1 + hooks: + - id: meson-fmt + args: ['--inplace'] diff --git a/ci/docker/ubuntu-20.04-verify-rc.dockerfile b/.rubocop.yml similarity index 75% rename from ci/docker/ubuntu-20.04-verify-rc.dockerfile rename to .rubocop.yml index cee1e50e080c5..3f48689796d2f 100644 --- a/ci/docker/ubuntu-20.04-verify-rc.dockerfile +++ b/.rubocop.yml @@ -15,12 +15,19 @@ # specific language governing permissions and limitations # under the License. -ARG arch=amd64 -FROM ${arch}/ubuntu:20.04 +# The Ruby lint configuration starts minimal: +# all cops are disabled by default. +AllCops: + DisabledByDefault: true -ENV DEBIAN_FRONTEND=noninteractive -COPY dev/release/setup-ubuntu.sh / -RUN /setup-ubuntu.sh && \ - rm /setup-ubuntu.sh && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists* +Lint: + Enabled: false + +Layout/LineLength: + Max: 100 + +Layout/ArgumentAlignment: + Enabled: true + +Layout/SpaceAfterComma: + Enabled: true diff --git a/LICENSE.txt b/LICENSE.txt index 7bb1330a1002b..7d5de9e3bfeb7 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -2257,5 +2257,36 @@ SOFTWARE.
java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java -These file are derived from code from Netty, which is made available under the +These files are derived from code from Netty, which is made available under the Apache License 2.0. + +-------------------------------------------------------------------------------- +cpp/src/arrow/util/math_internal.cc (some portions) + +Some portions of this file are derived from + +https://github.com/ankane/dist-rust/ + +which is made available under the MIT license + +The MIT License (MIT) + +Copyright (c) 2021-2023 Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md index f49ec4b8d98ee..c557716a4a88b 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Major components of the project include: - [Gandiva](https://github.com/apache/arrow/tree/main/cpp/src/gandiva): an [LLVM](https://llvm.org)-based Arrow expression compiler, part of the C++ codebase - [Go libraries](https://github.com/apache/arrow-go) - - [Java libraries](https://github.com/apache/arrow/tree/main/java) + - [Java libraries](https://github.com/apache/arrow-java) - [JavaScript libraries](https://github.com/apache/arrow/tree/main/js) - [Python libraries](https://github.com/apache/arrow/tree/main/python) - [R libraries](https://github.com/apache/arrow/tree/main/r) diff --git a/c_glib/arrow-cuda-glib/meson.build b/c_glib/arrow-cuda-glib/meson.build index 36730dec6c4b7..0f93d95ca01f2 100644 --- a/c_glib/arrow-cuda-glib/meson.build +++ b/c_glib/arrow-cuda-glib/meson.build @@ -17,24 +17,27 @@ # specific language governing permissions and limitations # under the License. 
-sources = files( - 'cuda.cpp', -) +sources = files('cuda.cpp') -c_headers = files( - 'arrow-cuda-glib.h', - 'cuda.h', -) +c_headers = files('arrow-cuda-glib.h', 'cuda.h') -cpp_headers = files( - 'arrow-cuda-glib.hpp', - 'cuda.hpp', -) +cpp_headers = files('arrow-cuda-glib.hpp', 'cuda.hpp') version_h = configure_file( - input: 'version.h.in', - output: 'version.h', - command: [python3, generate_version_header_py, '--library', 'GARROW_CUDA', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], + input: 'version.h.in', + output: 'version.h', + command: [ + python3, + generate_version_header_py, + '--library', + 'GARROW_CUDA', + '--version', + version, + '--input', + '@INPUT@', + '--output', + '@OUTPUT@', + ], ) c_headers += version_h @@ -42,63 +45,61 @@ c_headers += version_h headers = c_headers + cpp_headers install_headers(headers, subdir: 'arrow-cuda-glib') -dependencies = [ - arrow_cuda, - arrow_glib, -] -libarrow_cuda_glib = library('arrow-cuda-glib', - sources: sources, - install: true, - dependencies: dependencies, - implicit_include_directories: false, - include_directories: base_include_directories, - cpp_args: ['-DGARROW_CUDA_COMPILATION'], - soversion: so_version, - version: library_version) -arrow_cuda_glib = declare_dependency(link_with: libarrow_cuda_glib, - include_directories: base_include_directories, - dependencies: dependencies) +dependencies = [arrow_cuda, arrow_glib] +libarrow_cuda_glib = library( + 'arrow-cuda-glib', + sources: sources, + install: true, + dependencies: dependencies, + implicit_include_directories: false, + include_directories: base_include_directories, + cpp_args: ['-DGARROW_CUDA_COMPILATION'], + soversion: so_version, + version: library_version, +) +arrow_cuda_glib = declare_dependency( + link_with: libarrow_cuda_glib, + include_directories: base_include_directories, + dependencies: dependencies, +) if target_machine.system() != 'windows' - pkgconfig.generate(libarrow_cuda_glib, - description: 'C API for Apache Arrow CUDA based on GLib', - filebase: 'arrow-cuda-glib', - name: 'Apache Arrow CUDA GLib', - requires: ['arrow-glib', 'arrow-cuda'], - variables: pkgconfig_variables, - version: version) + pkgconfig.generate( + libarrow_cuda_glib, + description: 'C API for Apache Arrow CUDA based on GLib', + filebase: 'arrow-cuda-glib', + name: 'Apache Arrow CUDA GLib', + requires: ['arrow-glib', 'arrow-cuda'], + variables: pkgconfig_variables, + version: version, + ) endif if have_gi - gir_dependencies = [ - declare_dependency(sources: arrow_glib_gir), - ] - gir_extra_args = [ - '--warn-all', - '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', - ] - arrow_cuda_glib_gir = \ - gnome.generate_gir(libarrow_cuda_glib, - dependencies: gir_dependencies, - export_packages: 'arrow-cuda-glib', - extra_args: gir_extra_args, - header: 'arrow-cuda-glib/arrow-cuda-glib.h', - identifier_prefix: 'GArrowCUDA', - includes: [ - 'Arrow-1.0', - ], - kwargs: generate_gi_common_args, - namespace: 'ArrowCUDA', - sources: sources + c_headers, - symbol_prefix: 'garrow_cuda') + gir_dependencies = [declare_dependency(sources: arrow_glib_gir)] + gir_extra_args = [ + '--warn-all', + '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', + ] + arrow_cuda_glib_gir = gnome.generate_gir( + libarrow_cuda_glib, + dependencies: gir_dependencies, + export_packages: 'arrow-cuda-glib', + extra_args: gir_extra_args, + header: 'arrow-cuda-glib/arrow-cuda-glib.h', + identifier_prefix: 'GArrowCUDA', + includes: ['Arrow-1.0'], + kwargs: generate_gi_common_args, + namespace: 'ArrowCUDA', + 
sources: sources + c_headers, + symbol_prefix: 'garrow_cuda', + ) - if generate_vapi - arrow_cuda_glib_vapi = \ - gnome.generate_vapi('arrow-cuda-glib', - install: true, - packages: [ - arrow_glib_vapi, - 'gio-2.0', - ], - sources: [arrow_cuda_glib_gir[0]]) - endif + if generate_vapi + arrow_cuda_glib_vapi = gnome.generate_vapi( + 'arrow-cuda-glib', + install: true, + packages: [arrow_glib_vapi, 'gio-2.0'], + sources: [arrow_cuda_glib_gir[0]], + ) + endif endif diff --git a/c_glib/arrow-dataset-glib/meson.build b/c_glib/arrow-dataset-glib/meson.build index 3425efc5555c8..5cb61fc462ca4 100644 --- a/c_glib/arrow-dataset-glib/meson.build +++ b/c_glib/arrow-dataset-glib/meson.build @@ -20,110 +20,124 @@ project_name = 'arrow-dataset-glib' sources = files( - 'dataset-factory.cpp', - 'dataset.cpp', - 'file-format.cpp', - 'fragment.cpp', - 'partitioning.cpp', - 'scanner.cpp', + 'dataset-factory.cpp', + 'dataset.cpp', + 'file-format.cpp', + 'fragment.cpp', + 'partitioning.cpp', + 'scanner.cpp', ) c_headers = files( - 'arrow-dataset-glib.h', - 'dataset-definition.h', - 'dataset-factory.h', - 'dataset.h', - 'file-format.h', - 'fragment.h', - 'partitioning.h', - 'scanner.h', + 'arrow-dataset-glib.h', + 'dataset-definition.h', + 'dataset-factory.h', + 'dataset.h', + 'file-format.h', + 'fragment.h', + 'partitioning.h', + 'scanner.h', ) cpp_headers = files( - 'arrow-dataset-glib.hpp', - 'dataset-factory.hpp', - 'dataset.hpp', - 'file-format.hpp', - 'fragment.hpp', - 'partitioning.hpp', - 'scanner.hpp', + 'arrow-dataset-glib.hpp', + 'dataset-factory.hpp', + 'dataset.hpp', + 'file-format.hpp', + 'fragment.hpp', + 'partitioning.hpp', + 'scanner.hpp', ) version_h = configure_file( - input: 'version.h.in', - output: 'version.h', - command: [python3, generate_version_header_py, '--library', 'GADATASET', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], + input: 'version.h.in', + output: 'version.h', + command: [ + python3, + generate_version_header_py, + '--library', + 'GADATASET', + '--version', + version, + '--input', + '@INPUT@', + '--output', + '@OUTPUT@', + ], ) c_headers += version_h -enums = gnome.mkenums('enums', - sources: c_headers, - identifier_prefix: 'GADataset', - symbol_prefix: 'gadataset', - c_template: 'enums.c.template', - h_template: 'enums.h.template', - install_dir: join_paths(include_dir, project_name), - install_header: true) +enums = gnome.mkenums( + 'enums', + sources: c_headers, + identifier_prefix: 'GADataset', + symbol_prefix: 'gadataset', + c_template: 'enums.c.template', + h_template: 'enums.h.template', + install_dir: join_paths(include_dir, project_name), + install_header: true, +) enums_source = enums[0] enums_header = enums[1] headers = c_headers + cpp_headers install_headers(headers, subdir: project_name) -dependencies = [ - arrow_dataset, - arrow_glib, -] -libarrow_dataset_glib = library('arrow-dataset-glib', - sources: sources + enums, - install: true, - dependencies: dependencies, - implicit_include_directories: false, - include_directories: base_include_directories, - cpp_args: ['-DGADATASET_COMPILATION'], - c_args: ['-DGADATASET_COMPILATION'], - soversion: so_version, - version: library_version) -arrow_dataset_glib = declare_dependency(link_with: libarrow_dataset_glib, - include_directories: base_include_directories, - dependencies: dependencies, - sources: enums_header) +dependencies = [arrow_dataset, arrow_glib] +libarrow_dataset_glib = library( + 'arrow-dataset-glib', + sources: sources + enums, + install: true, + dependencies: dependencies, + 
implicit_include_directories: false, + include_directories: base_include_directories, + cpp_args: ['-DGADATASET_COMPILATION'], + c_args: ['-DGADATASET_COMPILATION'], + soversion: so_version, + version: library_version, +) +arrow_dataset_glib = declare_dependency( + link_with: libarrow_dataset_glib, + include_directories: base_include_directories, + dependencies: dependencies, + sources: enums_header, +) -pkgconfig.generate(libarrow_dataset_glib, - description: 'C API for Apache Arrow Dataset based on GLib', - filebase: 'arrow-dataset-glib', - name: 'Apache Arrow Dataset GLib', - requires: ['arrow-glib', 'arrow-dataset'], - variables: pkgconfig_variables, - version: version) +pkgconfig.generate( + libarrow_dataset_glib, + description: 'C API for Apache Arrow Dataset based on GLib', + filebase: 'arrow-dataset-glib', + name: 'Apache Arrow Dataset GLib', + requires: ['arrow-glib', 'arrow-dataset'], + variables: pkgconfig_variables, + version: version, +) if have_gi - arrow_dataset_glib_gir = \ - gnome.generate_gir(libarrow_dataset_glib, - dependencies: declare_dependency(sources: arrow_glib_gir), - export_packages: 'arrow-dataset-glib', - extra_args: [ - '--warn-all', - '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', - ], - header: 'arrow-dataset-glib/arrow-dataset-glib.h', - identifier_prefix: 'GADataset', - includes: [ - 'Arrow-1.0', - ], - kwargs: generate_gi_common_args, - namespace: 'ArrowDataset', - sources: sources + c_headers + enums, - symbol_prefix: 'gadataset') + arrow_dataset_glib_gir = gnome.generate_gir( + libarrow_dataset_glib, + dependencies: declare_dependency(sources: arrow_glib_gir), + export_packages: 'arrow-dataset-glib', + extra_args: [ + '--warn-all', + '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', + ], + header: 'arrow-dataset-glib/arrow-dataset-glib.h', + identifier_prefix: 'GADataset', + includes: ['Arrow-1.0'], + kwargs: generate_gi_common_args, + namespace: 'ArrowDataset', + sources: sources + c_headers + enums, + symbol_prefix: 'gadataset', + ) - if generate_vapi - gnome.generate_vapi('arrow-dataset-glib', - install: true, - packages: [ - arrow_glib_vapi, - 'gio-2.0', - ], - sources: [arrow_dataset_glib_gir[0]]) - endif + if generate_vapi + gnome.generate_vapi( + 'arrow-dataset-glib', + install: true, + packages: [arrow_glib_vapi, 'gio-2.0'], + sources: [arrow_dataset_glib_gir[0]], + ) + endif endif diff --git a/c_glib/arrow-flight-glib/meson.build b/c_glib/arrow-flight-glib/meson.build index c1422e0d10a7d..87fc473431240 100644 --- a/c_glib/arrow-flight-glib/meson.build +++ b/c_glib/arrow-flight-glib/meson.build @@ -17,30 +17,32 @@ # specific language governing permissions and limitations # under the License. 
-sources = files( - 'client.cpp', - 'common.cpp', - 'server.cpp', -) +sources = files('client.cpp', 'common.cpp', 'server.cpp') -c_headers = files( - 'arrow-flight-glib.h', - 'client.h', - 'common.h', - 'server.h', -) +c_headers = files('arrow-flight-glib.h', 'client.h', 'common.h', 'server.h') cpp_headers = files( - 'arrow-flight-glib.hpp', - 'client.hpp', - 'common.hpp', - 'server.hpp', + 'arrow-flight-glib.hpp', + 'client.hpp', + 'common.hpp', + 'server.hpp', ) version_h = configure_file( - input: 'version.h.in', - output: 'version.h', - command: [python3, generate_version_header_py, '--library', 'GAFLIGHT', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], + input: 'version.h.in', + output: 'version.h', + command: [ + python3, + generate_version_header_py, + '--library', + 'GAFLIGHT', + '--version', + version, + '--input', + '@INPUT@', + '--output', + '@OUTPUT@', + ], ) c_headers += version_h @@ -48,58 +50,58 @@ c_headers += version_h headers = c_headers + cpp_headers install_headers(headers, subdir: 'arrow-flight-glib') -dependencies = [ - arrow_flight, - arrow_glib, -] -libarrow_flight_glib = library('arrow-flight-glib', - sources: sources, - install: true, - dependencies: dependencies, - implicit_include_directories: false, - include_directories: base_include_directories, - cpp_args: ['-DGAFLIGHT_COMPILATION'], - soversion: so_version, - version: library_version) -arrow_flight_glib = declare_dependency(link_with: libarrow_flight_glib, - include_directories: base_include_directories, - dependencies: dependencies) +dependencies = [arrow_flight, arrow_glib] +libarrow_flight_glib = library( + 'arrow-flight-glib', + sources: sources, + install: true, + dependencies: dependencies, + implicit_include_directories: false, + include_directories: base_include_directories, + cpp_args: ['-DGAFLIGHT_COMPILATION'], + soversion: so_version, + version: library_version, +) +arrow_flight_glib = declare_dependency( + link_with: libarrow_flight_glib, + include_directories: base_include_directories, + dependencies: dependencies, +) -pkgconfig.generate(libarrow_flight_glib, - description: 'C API for Apache Arrow Flight based on GLib', - filebase: 'arrow-flight-glib', - name: 'Apache Arrow Flight GLib', - requires: ['arrow-glib', 'arrow-flight'], - variables: pkgconfig_variables, - version: version) +pkgconfig.generate( + libarrow_flight_glib, + description: 'C API for Apache Arrow Flight based on GLib', + filebase: 'arrow-flight-glib', + name: 'Apache Arrow Flight GLib', + requires: ['arrow-glib', 'arrow-flight'], + variables: pkgconfig_variables, + version: version, +) if have_gi - arrow_flight_glib_gir = \ - gnome.generate_gir(libarrow_flight_glib, - dependencies: declare_dependency(sources: arrow_glib_gir), - export_packages: 'arrow-flight-glib', - extra_args: [ - '--warn-all', - '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', - ], - header: 'arrow-flight-glib/arrow-flight-glib.h', - identifier_prefix: 'GAFlight', - includes: [ - 'Arrow-1.0', - ], - kwargs: generate_gi_common_args, - namespace: 'ArrowFlight', - sources: sources + c_headers, - symbol_prefix: 'gaflight') + arrow_flight_glib_gir = gnome.generate_gir( + libarrow_flight_glib, + dependencies: declare_dependency(sources: arrow_glib_gir), + export_packages: 'arrow-flight-glib', + extra_args: [ + '--warn-all', + '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', + ], + header: 'arrow-flight-glib/arrow-flight-glib.h', + identifier_prefix: 'GAFlight', + includes: ['Arrow-1.0'], + kwargs: generate_gi_common_args, + 
namespace: 'ArrowFlight', + sources: sources + c_headers, + symbol_prefix: 'gaflight', + ) - if generate_vapi - arrow_flight_glib_vapi = \ - gnome.generate_vapi('arrow-flight-glib', - install: true, - packages: [ - arrow_glib_vapi, - 'gio-2.0', - ], - sources: [arrow_flight_glib_gir[0]]) - endif + if generate_vapi + arrow_flight_glib_vapi = gnome.generate_vapi( + 'arrow-flight-glib', + install: true, + packages: [arrow_glib_vapi, 'gio-2.0'], + sources: [arrow_flight_glib_gir[0]], + ) + endif endif diff --git a/c_glib/arrow-flight-sql-glib/meson.build b/c_glib/arrow-flight-sql-glib/meson.build index d588ba4917c76..aa6798e763c25 100644 --- a/c_glib/arrow-flight-sql-glib/meson.build +++ b/c_glib/arrow-flight-sql-glib/meson.build @@ -17,27 +17,27 @@ # specific language governing permissions and limitations # under the License. -sources = files( - 'client.cpp', - 'server.cpp', -) +sources = files('client.cpp', 'server.cpp') -c_headers = files( - 'arrow-flight-sql-glib.h', - 'client.h', - 'server.h', -) +c_headers = files('arrow-flight-sql-glib.h', 'client.h', 'server.h') -cpp_headers = files( - 'arrow-flight-sql-glib.hpp', - 'client.hpp', - 'server.hpp', -) +cpp_headers = files('arrow-flight-sql-glib.hpp', 'client.hpp', 'server.hpp') version_h = configure_file( - input: 'version.h.in', - output: 'version.h', - command: [python3, generate_version_header_py, '--library', 'GAFLIGHTSQL', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], + input: 'version.h.in', + output: 'version.h', + command: [ + python3, + generate_version_header_py, + '--library', + 'GAFLIGHTSQL', + '--version', + version, + '--input', + '@INPUT@', + '--output', + '@OUTPUT@', + ], ) c_headers += version_h @@ -45,63 +45,62 @@ c_headers += version_h headers = c_headers + cpp_headers install_headers(headers, subdir: 'arrow-flight-sql-glib') -dependencies = [ - arrow_flight_sql, - arrow_flight_glib, -] -libarrow_flight_sql_glib = library('arrow-flight-sql-glib', - sources: sources, - install: true, - dependencies: dependencies, - implicit_include_directories: false, - include_directories: base_include_directories, - cpp_args: ['-DGAFLIGHTSQL_COMPILATION'], - soversion: so_version, - version: library_version) -arrow_flight_sql_glib = \ - declare_dependency(link_with: libarrow_flight_sql_glib, - include_directories: base_include_directories, - dependencies: dependencies) +dependencies = [arrow_flight_sql, arrow_flight_glib] +libarrow_flight_sql_glib = library( + 'arrow-flight-sql-glib', + sources: sources, + install: true, + dependencies: dependencies, + implicit_include_directories: false, + include_directories: base_include_directories, + cpp_args: ['-DGAFLIGHTSQL_COMPILATION'], + soversion: so_version, + version: library_version, +) +arrow_flight_sql_glib = declare_dependency( + link_with: libarrow_flight_sql_glib, + include_directories: base_include_directories, + dependencies: dependencies, +) -pkgconfig.generate(libarrow_flight_sql_glib, - description: 'C API for Apache Arrow Flight SQL based on GLib', - filebase: 'arrow-flight-sql-glib', - name: 'Apache Arrow Flight SQL GLib', - requires: ['arrow-flight-glib', 'arrow-flight-sql'], - variables: pkgconfig_variables, - version: version) +pkgconfig.generate( + libarrow_flight_sql_glib, + description: 'C API for Apache Arrow Flight SQL based on GLib', + filebase: 'arrow-flight-sql-glib', + name: 'Apache Arrow Flight SQL GLib', + requires: ['arrow-flight-glib', 'arrow-flight-sql'], + variables: pkgconfig_variables, + version: version, +) if have_gi - 
arrow_flight_sql_glib_gir_dependencies = \ - declare_dependency(sources: [arrow_glib_gir, arrow_flight_glib_gir]) - arrow_flight_sql_glib_gir = \ - gnome.generate_gir(libarrow_flight_sql_glib, - dependencies: arrow_flight_sql_glib_gir_dependencies, - export_packages: 'arrow-flight-sql-glib', - extra_args: [ - '--warn-all', - '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', - '--include-uninstalled=./arrow-flight-glib/ArrowFlight-1.0.gir', - ], - header: 'arrow-flight-sql-glib/arrow-flight-sql-glib.h', - identifier_prefix: 'GAFlightSQL', - includes: [ - 'Arrow-1.0', - 'ArrowFlight-1.0', - ], - kwargs: generate_gi_common_args, - namespace: 'ArrowFlightSQL', - sources: sources + c_headers, - symbol_prefix: 'gaflightsql') + arrow_flight_sql_glib_gir_dependencies = declare_dependency( + sources: [arrow_glib_gir, arrow_flight_glib_gir], + ) + arrow_flight_sql_glib_gir = gnome.generate_gir( + libarrow_flight_sql_glib, + dependencies: arrow_flight_sql_glib_gir_dependencies, + export_packages: 'arrow-flight-sql-glib', + extra_args: [ + '--warn-all', + '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', + '--include-uninstalled=./arrow-flight-glib/ArrowFlight-1.0.gir', + ], + header: 'arrow-flight-sql-glib/arrow-flight-sql-glib.h', + identifier_prefix: 'GAFlightSQL', + includes: ['Arrow-1.0', 'ArrowFlight-1.0'], + kwargs: generate_gi_common_args, + namespace: 'ArrowFlightSQL', + sources: sources + c_headers, + symbol_prefix: 'gaflightsql', + ) - if generate_vapi - gnome.generate_vapi('arrow-flight-sql-glib', - install: true, - packages: [ - arrow_flight_glib_vapi, - arrow_glib_vapi, - 'gio-2.0', - ], - sources: [arrow_flight_sql_glib_gir[0]]) - endif + if generate_vapi + gnome.generate_vapi( + 'arrow-flight-sql-glib', + install: true, + packages: [arrow_flight_glib_vapi, arrow_glib_vapi, 'gio-2.0'], + sources: [arrow_flight_sql_glib_gir[0]], + ) + endif endif diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index 9e9753c4e007d..19437b01db96b 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -37,6 +37,8 @@ G_BEGIN_DECLS * @title: Basic array classes * @include: arrow-glib/arrow-glib.h * + * #GArrowArrayStatistics is a class for statistics of an array. + * * #GArrowArray is a base class for all array classes such as * #GArrowBooleanArray. * @@ -125,6 +127,11 @@ G_BEGIN_DECLS * string data. If you don't have Arrow format data, you need to * use #GArrowLargeStringArrayBuilder to create a new array. * + * #GArrowBinaryViewArray is a class for variable-size binary view array. + * It can store zero or more binary view data. If you don't have Arrow + * format data, you need to use #GArrowBinaryViewArrayBuilder to create + * a new array. + * * #GArrowFixedSizeBinaryArray is a class for fixed size binary array. * It can store zero or more fixed size binary data.
If you don't have * Arrow format data, you need to use @@ -364,6 +371,106 @@ garrow_equal_options_is_approx(GArrowEqualOptions *options) return priv->approx; } +struct GArrowArrayStatisticsPrivate +{ + arrow::ArrayStatistics statistics; +}; + +enum { + PROP_STATISTICS = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowArrayStatistics, garrow_array_statistics, G_TYPE_OBJECT) + +#define GARROW_ARRAY_STATISTICS_GET_PRIVATE(object) \ + static_cast( \ + garrow_array_statistics_get_instance_private(GARROW_ARRAY_STATISTICS(object))) + +static void +garrow_array_statistics_finalize(GObject *object) +{ + auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(object); + priv->statistics.~ArrayStatistics(); + G_OBJECT_CLASS(garrow_array_statistics_parent_class)->finalize(object); +} + +static void +garrow_array_statistics_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_STATISTICS: + priv->statistics = *static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_array_statistics_init(GArrowArrayStatistics *object) +{ + auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(object); + new (&priv->statistics) arrow::ArrayStatistics; +} + +static void +garrow_array_statistics_class_init(GArrowArrayStatisticsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = garrow_array_statistics_finalize; + gobject_class->set_property = garrow_array_statistics_set_property; + + auto spec = g_param_spec_pointer( + "statistics", + "Statistics", + "The raw arrow::ArrayStatistics *", + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_STATISTICS, spec); +} + +/** + * garrow_array_statistics_has_null_count: + * @statistics: A #GArrowArrayStatistics. + * + * Returns: %TRUE if @statistics has a valid null count value, + * %FALSE otherwise. + * + * Since: 20.0.0 + */ +gboolean +garrow_array_statistics_has_null_count(GArrowArrayStatistics *statistics) +{ + auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics); + return priv->statistics.null_count.has_value(); +} + +/** + * garrow_array_statistics_get_null_count: + * @statistics: A #GArrowArrayStatistics. + * + * Returns: 0 or larger value if @statistics has a valid null count value, + * -1 otherwise. + * + * Since: 20.0.0 + */ +gint64 +garrow_array_statistics_get_null_count(GArrowArrayStatistics *statistics) +{ + auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics); + const auto &null_count = priv->statistics.null_count; + if (null_count) { + return null_count.value(); + } else { + return -1; + } +} + typedef struct GArrowArrayPrivate_ { std::shared_ptr array; @@ -1046,7 +1153,28 @@ gboolean garrow_array_validate_full(GArrowArray *array, GError **error) { const auto arrow_array = garrow_array_get_raw(array); - return garrow::check(error, arrow_array->ValidateFull(), "[array][validate_full]"); + return garrow::check(error, arrow_array->ValidateFull(), "[array][validate-full]"); +} + +/** + * garrow_array_get_statistics: + * @array: A #GArrowArray. + * + * Returns: (transfer full): The associated #GArrowArrayStatistics of @array, + * %NULL if @array doesn't have any associated statistics. 
+ * + * Since: 20.0.0 + */ +GArrowArrayStatistics * +garrow_array_get_statistics(GArrowArray *array) +{ + const auto arrow_array = garrow_array_get_raw(array); + const auto &statistics = arrow_array->statistics(); + if (statistics) { + return garrow_array_statistics_new_raw(statistics.get()); + } else { + return nullptr; + } } G_DEFINE_TYPE(GArrowNullArray, garrow_null_array, GARROW_TYPE_ARRAY) @@ -2407,6 +2535,73 @@ garrow_large_string_array_get_string(GArrowLargeStringArray *array, gint64 i) i); } +G_DEFINE_TYPE(GArrowBinaryViewArray, garrow_binary_view_array, GARROW_TYPE_ARRAY) +static void +garrow_binary_view_array_init(GArrowBinaryViewArray *object) +{ +} + +static void +garrow_binary_view_array_class_init(GArrowBinaryViewArrayClass *klass) +{ +} + +/** + * garrow_binary_view_array_new: + * @length: The number of elements. + * @views: The view buffer. + * @data_buffers: (element-type GArrowBuffer): The data buffers. + * @null_bitmap: (nullable): The bitmap that shows null elements. The + * N-th element is null when the N-th bit is 0, not null otherwise. + * If the array has no null elements, the bitmap must be %NULL and + * @n_nulls is 0. + * @n_nulls: The number of null elements. If -1 is specified, the + * number of nulls are computed from @null_bitmap. + * @offset: The position of the first element. + * + * Returns: A newly created #GArrowBinaryViewArray. + * + * Since: 20.0.0 + */ +GArrowBinaryViewArray * +garrow_binary_view_array_new(gint64 length, + GArrowBuffer *views, + GList *data_buffers, + GArrowBuffer *null_bitmap, + gint64 n_nulls, + gint64 offset) +{ + std::vector> arrow_data_buffers; + for (GList *node = data_buffers; node; node = g_list_next(node)) { + arrow_data_buffers.push_back(garrow_buffer_get_raw(GARROW_BUFFER(node->data))); + } + auto binary_view_array = + std::make_shared(arrow::binary_view(), + length, + garrow_buffer_get_raw(views), + std::move(arrow_data_buffers), + garrow_buffer_get_raw(null_bitmap), + n_nulls, + offset); + return GARROW_BINARY_VIEW_ARRAY( + g_object_new(GARROW_TYPE_BINARY_VIEW_ARRAY, "array", &binary_view_array, nullptr)); +} + +/** + * garrow_binary_view_array_get_value: + * @array: A #GArrowBinaryViewArray. + * @i: The index of the target value. + * + * Returns: (transfer full): The @i-th value. 
+ */ +GBytes * +garrow_binary_view_array_get_value(GArrowBinaryViewArray *array, gint64 i) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + auto view = static_cast(arrow_array.get())->GetView(i); + return g_bytes_new_static(view.data(), view.length()); +} + G_DEFINE_TYPE(GArrowDate32Array, garrow_date32_array, GARROW_TYPE_NUMERIC_ARRAY) static void @@ -3468,6 +3663,13 @@ garrow_equal_options_get_raw(GArrowEqualOptions *equal_options) return &(priv->options); } +GArrowArrayStatistics * +garrow_array_statistics_new_raw(arrow::ArrayStatistics *arrow_statistics) +{ + return GARROW_ARRAY_STATISTICS( + g_object_new(GARROW_TYPE_ARRAY_STATISTICS, "statistics", arrow_statistics, nullptr)); +} + GArrowArray * garrow_array_new_raw(std::shared_ptr *arrow_array) { @@ -3620,6 +3822,9 @@ garrow_array_new_raw_valist(std::shared_ptr *arrow_array, case arrow::Type::type::RUN_END_ENCODED: type = GARROW_TYPE_RUN_END_ENCODED_ARRAY; break; + case arrow::Type::type::BINARY_VIEW: + type = GARROW_TYPE_BINARY_VIEW_ARRAY; + break; default: type = GARROW_TYPE_ARRAY; break; diff --git a/c_glib/arrow-glib/basic-array.h b/c_glib/arrow-glib/basic-array.h index bc597a8a93104..901af822353f6 100644 --- a/c_glib/arrow-glib/basic-array.h +++ b/c_glib/arrow-glib/basic-array.h @@ -42,6 +42,22 @@ GARROW_AVAILABLE_IN_5_0 gboolean garrow_equal_options_is_approx(GArrowEqualOptions *options); +#define GARROW_TYPE_ARRAY_STATISTICS (garrow_array_statistics_get_type()) +GARROW_AVAILABLE_IN_20_0 +G_DECLARE_DERIVABLE_TYPE( + GArrowArrayStatistics, garrow_array_statistics, GARROW, ARRAY_STATISTICS, GObject) +struct _GArrowArrayStatisticsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_20_0 +gboolean +garrow_array_statistics_has_null_count(GArrowArrayStatistics *statistics); +GARROW_AVAILABLE_IN_20_0 +gint64 +garrow_array_statistics_get_null_count(GArrowArrayStatistics *statistics); + GARROW_AVAILABLE_IN_6_0 GArrowArray * garrow_array_import(gpointer c_abi_array, GArrowDataType *data_type, GError **error); @@ -134,6 +150,10 @@ GARROW_AVAILABLE_IN_20_0 gboolean garrow_array_validate_full(GArrowArray *array, GError **error); +GARROW_AVAILABLE_IN_20_0 +GArrowArrayStatistics * +garrow_array_get_statistics(GArrowArray *array); + #define GARROW_TYPE_NULL_ARRAY (garrow_null_array_get_type()) GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( @@ -582,6 +602,28 @@ GARROW_AVAILABLE_IN_0_16 gchar * garrow_large_string_array_get_string(GArrowLargeStringArray *array, gint64 i); +#define GARROW_TYPE_BINARY_VIEW_ARRAY (garrow_binary_view_array_get_type()) +GARROW_AVAILABLE_IN_20_0 +G_DECLARE_DERIVABLE_TYPE( + GArrowBinaryViewArray, garrow_binary_view_array, GARROW, BINARY_VIEW_ARRAY, GArrowArray) +struct _GArrowBinaryViewArrayClass +{ + GArrowArrayClass parent_class; +}; + +GARROW_AVAILABLE_IN_20_0 +GArrowBinaryViewArray * +garrow_binary_view_array_new(gint64 length, + GArrowBuffer *views, + GList *data_buffers, + GArrowBuffer *null_bitmap, + gint64 n_nulls, + gint64 offset); + +GARROW_AVAILABLE_IN_20_0 +GBytes * +garrow_binary_view_array_get_value(GArrowBinaryViewArray *array, gint64 i); + #define GARROW_TYPE_DATE32_ARRAY (garrow_date32_array_get_type()) GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE( diff --git a/c_glib/arrow-glib/basic-array.hpp b/c_glib/arrow-glib/basic-array.hpp index b2a7ed6ae075f..361d367773c2d 100644 --- a/c_glib/arrow-glib/basic-array.hpp +++ b/c_glib/arrow-glib/basic-array.hpp @@ -27,6 +27,10 @@ GARROW_EXTERN arrow::EqualOptions * garrow_equal_options_get_raw(GArrowEqualOptions 
*equal_options); +GARROW_EXTERN +GArrowArrayStatistics * +garrow_array_statistics_new_raw(arrow::ArrayStatistics *arrow_statistics); + GARROW_EXTERN GArrowArray * garrow_array_new_raw(std::shared_ptr *arrow_array); diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index f5130e9344bec..c195af7de0313 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -2402,6 +2402,12 @@ garrow_data_type_new_raw(std::shared_ptr *arrow_data_type) case arrow::Type::type::RUN_END_ENCODED: type = GARROW_TYPE_RUN_END_ENCODED_DATA_TYPE; break; + case arrow::Type::type::STRING_VIEW: + type = GARROW_TYPE_STRING_VIEW_DATA_TYPE; + break; + case arrow::Type::type::BINARY_VIEW: + type = GARROW_TYPE_BINARY_VIEW_DATA_TYPE; + break; default: type = GARROW_TYPE_DATA_TYPE; break; diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build index 854988e348986..a5e67463102d1 100644 --- a/c_glib/arrow-glib/meson.build +++ b/c_glib/arrow-glib/meson.build @@ -18,212 +18,195 @@ # under the License. sources = files( - 'array-builder.cpp', - 'basic-array.cpp', - 'basic-data-type.cpp', - 'buffer.cpp', - 'chunked-array.cpp', - 'codec.cpp', - 'composite-array.cpp', - 'composite-data-type.cpp', - 'datum.cpp', - 'decimal.cpp', - 'decoder.cpp', - 'error.cpp', - 'expression.cpp', - 'field.cpp', - 'interval.cpp', - 'memory-pool.cpp', - 'record-batch.cpp', - 'scalar.cpp', - 'schema.cpp', - 'table-builder.cpp', - 'table.cpp', - 'tensor.cpp', - 'timestamp-parser.cpp', - 'type.cpp', + 'array-builder.cpp', + 'basic-array.cpp', + 'basic-data-type.cpp', + 'buffer.cpp', + 'chunked-array.cpp', + 'codec.cpp', + 'composite-array.cpp', + 'composite-data-type.cpp', + 'datum.cpp', + 'decimal.cpp', + 'decoder.cpp', + 'error.cpp', + 'expression.cpp', + 'field.cpp', + 'interval.cpp', + 'memory-pool.cpp', + 'record-batch.cpp', + 'scalar.cpp', + 'schema.cpp', + 'table-builder.cpp', + 'table.cpp', + 'tensor.cpp', + 'timestamp-parser.cpp', + 'type.cpp', ) sources += files( - 'file.cpp', - 'file-mode.cpp', - 'input-stream.cpp', - 'output-stream.cpp', - 'readable.cpp', - 'writable.cpp', - 'writable-file.cpp', + 'file-mode.cpp', + 'file.cpp', + 'input-stream.cpp', + 'output-stream.cpp', + 'readable.cpp', + 'writable-file.cpp', + 'writable.cpp', ) sources += files( - 'ipc-options.cpp', - 'metadata-version.cpp', - 'reader.cpp', - 'writer.cpp', + 'ipc-options.cpp', + 'metadata-version.cpp', + 'reader.cpp', + 'writer.cpp', ) -sources += files( - 'compute.cpp', -) +sources += files('compute.cpp') -sources += files( - 'file-system.cpp', - 'local-file-system.cpp', -) +sources += files('file-system.cpp', 'local-file-system.cpp') if have_arrow_orc - sources += files( - 'orc-file-reader.cpp', - ) + sources += files('orc-file-reader.cpp') endif c_headers = files( - 'array.h', - 'array-builder.h', - 'arrow-glib.h', - 'basic-array-definition.h', - 'basic-array.h', - 'basic-data-type.h', - 'buffer.h', - 'chunked-array-definition.h', - 'chunked-array.h', - 'codec.h', - 'composite-array.h', - 'composite-data-type.h', - 'data-type.h', - 'datum.h', - 'decimal.h', - 'decoder.h', - 'error.h', - 'expression.h', - 'field.h', - 'interval.h', - 'memory-pool.h', - 'record-batch.h', - 'scalar.h', - 'schema.h', - 'table-builder.h', - 'table.h', - 'tensor.h', - 'timestamp-parser.h', - 'type.h', + 'array-builder.h', + 'array.h', + 'arrow-glib.h', + 'basic-array-definition.h', + 'basic-array.h', + 'basic-data-type.h', + 'buffer.h', + 'chunked-array-definition.h', + 'chunked-array.h', + 
'codec.h', + 'composite-array.h', + 'composite-data-type.h', + 'data-type.h', + 'datum.h', + 'decimal.h', + 'decoder.h', + 'error.h', + 'expression.h', + 'field.h', + 'interval.h', + 'memory-pool.h', + 'record-batch.h', + 'scalar.h', + 'schema.h', + 'table-builder.h', + 'table.h', + 'tensor.h', + 'timestamp-parser.h', + 'type.h', ) c_headers += files( - 'file.h', - 'file-mode.h', - 'input-stream.h', - 'output-stream.h', - 'readable.h', - 'writable.h', - 'writable-file.h', + 'file-mode.h', + 'file.h', + 'input-stream.h', + 'output-stream.h', + 'readable.h', + 'writable-file.h', + 'writable.h', ) -c_headers += files( - 'ipc-options.h', - 'metadata-version.h', - 'reader.h', - 'writer.h', -) +c_headers += files('ipc-options.h', 'metadata-version.h', 'reader.h', 'writer.h') -c_headers += files( - 'compute-definition.h', - 'compute.h', -) +c_headers += files('compute-definition.h', 'compute.h') -c_headers += files( - 'file-system.h', - 'local-file-system.h', -) +c_headers += files('file-system.h', 'local-file-system.h') if have_arrow_orc - c_headers += files( - 'orc-file-reader.h', - ) + c_headers += files('orc-file-reader.h') endif cpp_headers = files( - 'array.hpp', - 'array-builder.hpp', - 'arrow-glib.hpp', - 'basic-array.hpp', - 'basic-data-type.hpp', - 'buffer.hpp', - 'chunked-array.hpp', - 'codec.hpp', - 'data-type.hpp', - 'datum.hpp', - 'decimal.hpp', - 'decoder.hpp', - 'error.hpp', - 'expression.hpp', - 'field.hpp', - 'interval.hpp', - 'memory-pool.hpp', - 'record-batch.hpp', - 'scalar.hpp', - 'schema.hpp', - 'table-builder.hpp', - 'table.hpp', - 'tensor.hpp', - 'timestamp-parser.hpp', - 'type.hpp', + 'array-builder.hpp', + 'array.hpp', + 'arrow-glib.hpp', + 'basic-array.hpp', + 'basic-data-type.hpp', + 'buffer.hpp', + 'chunked-array.hpp', + 'codec.hpp', + 'data-type.hpp', + 'datum.hpp', + 'decimal.hpp', + 'decoder.hpp', + 'error.hpp', + 'expression.hpp', + 'field.hpp', + 'interval.hpp', + 'memory-pool.hpp', + 'record-batch.hpp', + 'scalar.hpp', + 'schema.hpp', + 'table-builder.hpp', + 'table.hpp', + 'tensor.hpp', + 'timestamp-parser.hpp', + 'type.hpp', ) cpp_headers += files( - 'file.hpp', - 'file-mode.hpp', - 'input-stream.hpp', - 'output-stream.hpp', - 'readable.hpp', - 'writable.hpp', - 'writable-file.hpp', + 'file-mode.hpp', + 'file.hpp', + 'input-stream.hpp', + 'output-stream.hpp', + 'readable.hpp', + 'writable-file.hpp', + 'writable.hpp', ) cpp_headers += files( - 'ipc-options.hpp', - 'metadata-version.hpp', - 'reader.hpp', - 'writer.hpp', + 'ipc-options.hpp', + 'metadata-version.hpp', + 'reader.hpp', + 'writer.hpp', ) -cpp_headers += files( - 'compute.hpp', -) +cpp_headers += files('compute.hpp') -cpp_headers += files( - 'file-system.hpp', - 'local-file-system.hpp', -) +cpp_headers += files('file-system.hpp', 'local-file-system.hpp') if have_arrow_orc - cpp_headers += files( - 'orc-file-reader.hpp', - ) + cpp_headers += files('orc-file-reader.hpp') endif -cpp_internal_headers = files( - 'internal-hash-table.hpp', - 'internal-index.hpp', -) +cpp_internal_headers = files('internal-hash-table.hpp', 'internal-index.hpp') version_h = configure_file( - input: 'version.h.in', - output: 'version.h', - command: [python3, generate_version_header_py, '--library', 'GARROW', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], + input: 'version.h.in', + output: 'version.h', + command: [ + python3, + generate_version_header_py, + '--library', + 'GARROW', + '--version', + version, + '--input', + '@INPUT@', + '--output', + '@OUTPUT@', + ], ) c_headers += version_h -enums = 
gnome.mkenums('enums', - sources: c_headers, - identifier_prefix: 'GArrow', - symbol_prefix: 'garrow', - c_template: 'enums.c.template', - h_template: 'enums.h.template', - install_dir: join_paths(include_dir, 'arrow-glib'), - install_header: true) +enums = gnome.mkenums( + 'enums', + sources: c_headers, + identifier_prefix: 'GArrow', + symbol_prefix: 'garrow', + c_template: 'enums.c.template', + h_template: 'enums.h.template', + install_dir: join_paths(include_dir, 'arrow-glib'), + install_header: true, +) enums_source = enums[0] enums_header = enums[1] @@ -238,65 +221,67 @@ gobject_libdir = gobject.get_variable(pkgconfig: 'libdir') # confuses clang++ (/usr/bin/c++). gio = cxx.find_library('gio-2.0', dirs: [gobject_libdir], required: false) if not gio.found() - gio = dependency('gio-2.0') + gio = dependency('gio-2.0') endif -dependencies = [ - arrow, - arrow_acero, - gobject, - gio, -] -libarrow_glib = library('arrow-glib', - sources: sources + enums, - install: true, - dependencies: dependencies, - implicit_include_directories: false, - include_directories: base_include_directories, - cpp_args: ['-DGARROW_COMPILATION'], - c_args: ['-DGARROW_COMPILATION'], - soversion: so_version, - version: library_version) -arrow_glib = declare_dependency(link_with: libarrow_glib, - include_directories: base_include_directories, - dependencies: dependencies, - sources: enums_header) +dependencies = [arrow, arrow_acero, gobject, gio] +libarrow_glib = library( + 'arrow-glib', + sources: sources + enums, + install: true, + dependencies: dependencies, + implicit_include_directories: false, + include_directories: base_include_directories, + cpp_args: ['-DGARROW_COMPILATION'], + c_args: ['-DGARROW_COMPILATION'], + soversion: so_version, + version: library_version, +) +arrow_glib = declare_dependency( + link_with: libarrow_glib, + include_directories: base_include_directories, + dependencies: dependencies, + sources: enums_header, +) -pkgconfig.generate(libarrow_glib, - description: 'C API for Apache Arrow based on GLib', - filebase: meson.project_name(), - name: 'Apache Arrow GLib', - requires: ['gobject-2.0', 'arrow'], - variables: pkgconfig_variables, - version: version) +pkgconfig.generate( + libarrow_glib, + description: 'C API for Apache Arrow based on GLib', + filebase: meson.project_name(), + name: 'Apache Arrow GLib', + requires: ['gobject-2.0', 'arrow'], + variables: pkgconfig_variables, + version: version, +) if have_arrow_orc - pkgconfig.generate(filebase: 'arrow-orc-glib', - description: 'ORC modules for Apache Arrow GLib', - name: 'Apache Arrow GLib ORC', - requires: ['arrow-glib'], - version: version) + pkgconfig.generate( + filebase: 'arrow-orc-glib', + description: 'ORC modules for Apache Arrow GLib', + name: 'Apache Arrow GLib ORC', + requires: ['arrow-glib'], + version: version, + ) endif if have_gi - arrow_glib_gir = gnome.generate_gir(libarrow_glib, - export_packages: 'arrow-glib', - extra_args: [ - '--warn-all', - ], - header: 'arrow-glib/arrow-glib.h', - identifier_prefix: 'GArrow', - includes: [ - 'GObject-2.0', - 'Gio-2.0', - ], - namespace: 'Arrow', - sources: sources + c_headers + enums, - symbol_prefix: 'garrow', - kwargs: generate_gi_common_args) + arrow_glib_gir = gnome.generate_gir( + libarrow_glib, + export_packages: 'arrow-glib', + extra_args: ['--warn-all'], + header: 'arrow-glib/arrow-glib.h', + identifier_prefix: 'GArrow', + includes: ['GObject-2.0', 'Gio-2.0'], + namespace: 'Arrow', + sources: sources + c_headers + enums, + symbol_prefix: 'garrow', + kwargs: 
generate_gi_common_args, + ) - if generate_vapi - arrow_glib_vapi = gnome.generate_vapi('arrow-glib', - install: true, - packages: ['gio-2.0'], - sources: [arrow_glib_gir[0]]) - endif + if generate_vapi + arrow_glib_vapi = gnome.generate_vapi( + 'arrow-glib', + install: true, + packages: ['gio-2.0'], + sources: [arrow_glib_gir[0]], + ) + endif endif diff --git a/c_glib/arrow-glib/record-batch.cpp b/c_glib/arrow-glib/record-batch.cpp index 07e83c9f23ad0..2c94919d96609 100644 --- a/c_glib/arrow-glib/record-batch.cpp +++ b/c_glib/arrow-glib/record-batch.cpp @@ -516,6 +516,24 @@ garrow_record_batch_validate(GArrowRecordBatch *record_batch, GError **error) return garrow::check(error, arrow_record_batch->Validate(), "[record-batch][validate]"); } +/** + * garrow_record_batch_validate_full + * @record_batch: A #GArrowRecordBatch + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 20.0.0 + */ +gboolean +garrow_record_batch_validate_full(GArrowRecordBatch *record_batch, GError **error) +{ + const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + return garrow::check(error, + arrow_record_batch->ValidateFull(), + "[record-batch][validate-full]"); +} + typedef struct GArrowRecordBatchIteratorPrivate_ { arrow::RecordBatchIterator iterator; diff --git a/c_glib/arrow-glib/record-batch.h b/c_glib/arrow-glib/record-batch.h index 8d17a44be5883..5a51ad983bbee 100644 --- a/c_glib/arrow-glib/record-batch.h +++ b/c_glib/arrow-glib/record-batch.h @@ -113,6 +113,10 @@ GARROW_AVAILABLE_IN_20_0 gboolean garrow_record_batch_validate(GArrowRecordBatch *record_batch, GError **error); +GARROW_AVAILABLE_IN_20_0 +gboolean +garrow_record_batch_validate_full(GArrowRecordBatch *record_batch, GError **error); + #define GARROW_TYPE_RECORD_BATCH_ITERATOR (garrow_record_batch_iterator_get_type()) GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchIterator, diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp index f8569366685a2..4595ae7593998 100644 --- a/c_glib/arrow-glib/table.cpp +++ b/c_glib/arrow-glib/table.cpp @@ -339,20 +339,10 @@ garrow_table_new_values(GArrowSchema *schema, GList *values, GError **error) if (!arrow_chunked_arrays.empty()) { auto arrow_table = arrow::Table::Make(arrow_schema, std::move(arrow_chunked_arrays)); - auto status = arrow_table->Validate(); - if (garrow_error_check(error, status, context)) { - return garrow_table_new_raw(&arrow_table); - } else { - return NULL; - } + return garrow_table_new_raw(&arrow_table); } else if (!arrow_arrays.empty()) { auto arrow_table = arrow::Table::Make(arrow_schema, std::move(arrow_arrays)); - auto status = arrow_table->Validate(); - if (garrow_error_check(error, status, context)) { - return garrow_table_new_raw(&arrow_table); - } else { - return NULL; - } + return garrow_table_new_raw(&arrow_table); } else { auto maybe_table = arrow::Table::FromRecordBatches(arrow_schema, std::move(arrow_record_batches)); @@ -390,12 +380,7 @@ garrow_table_new_chunked_arrays(GArrowSchema *schema, } auto arrow_table = arrow::Table::Make(arrow_schema, arrow_chunked_arrays); - auto status = arrow_table->Validate(); - if (garrow_error_check(error, status, "[table][new][chunked-arrays]")) { - return garrow_table_new_raw(&arrow_table); - } else { - return NULL; - } + return garrow_table_new_raw(&arrow_table); } /** @@ -422,12 +407,7 @@ garrow_table_new_arrays(GArrowSchema *schema, } auto arrow_table = arrow::Table::Make(arrow_schema, arrow_arrays); - auto 
status = arrow_table->Validate(); - if (garrow_error_check(error, status, "[table][new][arrays]")) { - return garrow_table_new_raw(&arrow_table); - } else { - return NULL; - } + return garrow_table_new_raw(&arrow_table); } /** @@ -756,6 +736,42 @@ garrow_table_combine_chunks(GArrowTable *table, GError **error) } } +/** + * garrow_table_validate + * @table: A #GArrowTable + * @error: (nullable): Return location for a #GError or %NULL. + * + * Validate the given table. This is a cheap validation. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 20.0.0 + */ +gboolean +garrow_table_validate(GArrowTable *table, GError **error) +{ + const auto arrow_table = garrow_table_get_raw(table); + return garrow::check(error, arrow_table->Validate(), "[table][validate]"); +} + +/** + * garrow_table_validate_full + * @table: A #GArrowTable + * @error: (nullable): Return location for a #GError or %NULL. + * + * Validate the given table. This is an extensive validation. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 20.0.0 + */ +gboolean +garrow_table_validate_full(GArrowTable *table, GError **error) +{ + const auto arrow_table = garrow_table_get_raw(table); + return garrow::check(error, arrow_table->ValidateFull(), "[table][validate-full]"); +} + typedef struct GArrowFeatherWritePropertiesPrivate_ { arrow::ipc::feather::WriteProperties properties; diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h index d790e413df5fc..a78ee47fc9c40 100644 --- a/c_glib/arrow-glib/table.h +++ b/c_glib/arrow-glib/table.h @@ -142,6 +142,14 @@ GARROW_AVAILABLE_IN_0_16 GArrowTable * garrow_table_combine_chunks(GArrowTable *table, GError **error); +GARROW_AVAILABLE_IN_20_0 +gboolean +garrow_table_validate(GArrowTable *table, GError **error); + +GARROW_AVAILABLE_IN_20_0 +gboolean +garrow_table_validate_full(GArrowTable *table, GError **error); + #define GARROW_TYPE_FEATHER_WRITE_PROPERTIES (garrow_feather_write_properties_get_type()) GARROW_AVAILABLE_IN_0_17 G_DECLARE_DERIVABLE_TYPE(GArrowFeatherWriteProperties, diff --git a/c_glib/doc/meson.build b/c_glib/doc/meson.build index 8d0ac4229b2a6..2cdefd48195db 100644 --- a/c_glib/doc/meson.build +++ b/c_glib/doc/meson.build @@ -25,15 +25,17 @@ gi_docgen_toml_conf.set('SOURCE_REFERENCE', source_reference) # We can't use "version.replace('-SNAPSHOT', '.dev')" here because # Ubuntu 20.04's Meson is < 0.58.0. if version_tag == '' - gi_docgen_version_tag = '' + gi_docgen_version_tag = '' else - # GI-DocGen doesn't like MAJOR.MINOR.PATCH-SNAPSHOT format. - gi_docgen_version_tag = '.dev' + # GI-DocGen doesn't like MAJOR.MINOR.PATCH-SNAPSHOT format. + gi_docgen_version_tag = '.dev' endif -gi_docgen_version = '@0@.@1@.@2@@3@'.format(version_major, - version_minor, - version_micro, - gi_docgen_version_tag) +gi_docgen_version = '@0@.@1@.@2@@3@'.format( + version_major, + version_minor, + version_micro, + gi_docgen_version_tag, +) gi_docgen_toml_conf.set('VERSION', gi_docgen_version) gir_top_build_dir = meson.current_build_dir() / '..' 
@@ -41,53 +43,57 @@ arrow_glib_gir_dir = gir_top_build_dir / 'arrow-glib' arrow_flight_glib_gir_dir = gir_top_build_dir / 'arrow-flight-glib' entries = [['arrow-glib', arrow_glib_gir[0]]] if arrow_cuda.found() - entries += [['arrow-cuda-glib', arrow_cuda_glib_gir[0]]] + entries += [['arrow-cuda-glib', arrow_cuda_glib_gir[0]]] endif if arrow_dataset.found() - entries += [['arrow-dataset-glib', arrow_dataset_glib_gir[0]]] + entries += [['arrow-dataset-glib', arrow_dataset_glib_gir[0]]] endif if arrow_flight.found() - entries += [['arrow-flight-glib', arrow_flight_glib_gir[0]]] + entries += [['arrow-flight-glib', arrow_flight_glib_gir[0]]] endif if arrow_flight_sql.found() - entries += [['arrow-flight-sql-glib', arrow_flight_sql_glib_gir[0]]] + entries += [['arrow-flight-sql-glib', arrow_flight_sql_glib_gir[0]]] endif if gandiva.found() - entries += [['gandiva-glib', gandiva_glib_gir[0]]] + entries += [['gandiva-glib', gandiva_glib_gir[0]]] endif if parquet.found() - entries += [['parquet-glib', parquet_glib_gir[0]]] + entries += [['parquet-glib', parquet_glib_gir[0]]] endif foreach entry : entries - module_name = entry[0] - gir = entry[1] - gi_docgen_toml = configure_file(input: '@0@.toml.in'.format(module_name), - output: '@0@.toml'.format(module_name), - configuration: gi_docgen_toml_conf) - gir_dir = gir_top_build_dir / module_name - current_source_dir = meson.current_source_dir() - command = [ - gi_docgen, - 'generate', - '--add-include-path=@0@'.format(arrow_flight_glib_gir_dir), - '--add-include-path=@0@'.format(arrow_glib_gir_dir), - '--add-include-path=@0@'.format(gir_dir), - '--config=@INPUT0@', - '--content-dir=@0@'.format(current_source_dir), - '--no-namespace-dir', - '--output-dir=@OUTPUT@', - '--quiet', - ] - if get_option('werror') - command += ['--fatal-warnings'] - endif - command += ['@INPUT1@'] - custom_target('@0@-doc'.format(module_name), - input: [gi_docgen_toml, gir], - depend_files: ['urlmap.js'], - output: module_name, - command: command, - build_by_default: true, - install: true, - install_dir: doc_dir) + module_name = entry[0] + gir = entry[1] + gi_docgen_toml = configure_file( + input: '@0@.toml.in'.format(module_name), + output: '@0@.toml'.format(module_name), + configuration: gi_docgen_toml_conf, + ) + gir_dir = gir_top_build_dir / module_name + current_source_dir = meson.current_source_dir() + command = [ + gi_docgen, + 'generate', + '--add-include-path=@0@'.format(arrow_flight_glib_gir_dir), + '--add-include-path=@0@'.format(arrow_glib_gir_dir), + '--add-include-path=@0@'.format(gir_dir), + '--config=@INPUT0@', + '--content-dir=@0@'.format(current_source_dir), + '--no-namespace-dir', + '--output-dir=@OUTPUT@', + '--quiet', + ] + if get_option('werror') + command += ['--fatal-warnings'] + endif + command += ['@INPUT1@'] + custom_target( + '@0@-doc'.format(module_name), + input: [gi_docgen_toml, gir], + depend_files: ['urlmap.js'], + output: module_name, + command: command, + build_by_default: true, + install: true, + install_dir: doc_dir, + ) endforeach diff --git a/c_glib/example/lua/meson.build b/c_glib/example/lua/meson.build index 4836001287579..8994da1517305 100644 --- a/c_glib/example/lua/meson.build +++ b/c_glib/example/lua/meson.build @@ -17,12 +17,11 @@ # specific language governing permissions and limitations # under the License. 
-install_data('README.md', - 'read-file.lua', - 'read-stream.lua', - 'write-file.lua', - 'write-stream.lua', - install_dir: join_paths(data_dir, - meson.project_name(), - 'example', - 'lua')) +install_data( + 'README.md', + 'read-file.lua', + 'read-stream.lua', + 'write-file.lua', + 'write-stream.lua', + install_dir: join_paths(data_dir, meson.project_name(), 'example', 'lua'), +) diff --git a/c_glib/example/meson.build b/c_glib/example/meson.build index e2d55d4788ab5..99b9b8ae22f11 100644 --- a/c_glib/example/meson.build +++ b/c_glib/example/meson.build @@ -17,33 +17,48 @@ # specific language governing permissions and limitations # under the License. -executable('build', 'build.c', - dependencies: [arrow_glib], - link_language: 'c') -executable('extension-type', 'extension-type.c', - dependencies: [arrow_glib], - link_language: 'c') -executable('read-file', 'read-file.c', - dependencies: [arrow_glib], - link_language: 'c') -executable('read-stream', 'read-stream.c', - dependencies: [arrow_glib], - link_language: 'c') -executable('receive-network', 'receive-network.c', - dependencies: [arrow_glib], - link_language: 'c') -executable('send-network', 'send-network.c', - dependencies: [arrow_glib], - link_language: 'c') +executable('build', 'build.c', dependencies: [arrow_glib], link_language: 'c') +executable( + 'extension-type', + 'extension-type.c', + dependencies: [arrow_glib], + link_language: 'c', +) +executable( + 'read-file', + 'read-file.c', + dependencies: [arrow_glib], + link_language: 'c', +) +executable( + 'read-stream', + 'read-stream.c', + dependencies: [arrow_glib], + link_language: 'c', +) +executable( + 'receive-network', + 'receive-network.c', + dependencies: [arrow_glib], + link_language: 'c', +) +executable( + 'send-network', + 'send-network.c', + dependencies: [arrow_glib], + link_language: 'c', +) -install_data('README.md', - 'build.c', - 'extension-type.c', - 'read-file.c', - 'read-stream.c', - 'receive-network.c', - 'send-network.c', - install_dir: join_paths(data_dir, meson.project_name(), 'example')) +install_data( + 'README.md', + 'build.c', + 'extension-type.c', + 'read-file.c', + 'read-stream.c', + 'receive-network.c', + 'send-network.c', + install_dir: join_paths(data_dir, meson.project_name(), 'example'), +) subdir('lua') subdir('vala') diff --git a/c_glib/example/vala/meson.build b/c_glib/example/vala/meson.build index b7eb86200ddd6..893b7a5198c17 100644 --- a/c_glib/example/vala/meson.build +++ b/c_glib/example/vala/meson.build @@ -18,41 +18,41 @@ # under the License. 
if generate_vapi - c_flags = [ - '-Wno-unused-but-set-variable', - ] - c_flags = meson.get_compiler('c').get_supported_arguments(c_flags) - vala_example_executable_kwargs = { - 'c_args': [ - '-I' + project_build_root, - '-I' + project_source_root, - ] + c_flags, - 'dependencies': [ - arrow_glib_vapi, - dependency('gio-2.0'), - ], - 'vala_args': [ - '--pkg', 'posix', - ], - } - executable('build', 'build.vala', - kwargs: vala_example_executable_kwargs) - executable('read-file', 'read-file.vala', - kwargs: vala_example_executable_kwargs) - executable('read-stream', 'read-stream.vala', - kwargs: vala_example_executable_kwargs) - executable('write-file', 'write-file.vala', - kwargs: vala_example_executable_kwargs) - executable('write-stream', 'write-stream.vala', - kwargs: vala_example_executable_kwargs) + c_flags = ['-Wno-unused-but-set-variable'] + c_flags = meson.get_compiler('c').get_supported_arguments(c_flags) + vala_example_executable_kwargs = { + 'c_args': ['-I' + project_build_root, '-I' + project_source_root] + c_flags, + 'dependencies': [arrow_glib_vapi, dependency('gio-2.0')], + 'vala_args': ['--pkg', 'posix'], + } + executable('build', 'build.vala', kwargs: vala_example_executable_kwargs) + executable( + 'read-file', + 'read-file.vala', + kwargs: vala_example_executable_kwargs, + ) + executable( + 'read-stream', + 'read-stream.vala', + kwargs: vala_example_executable_kwargs, + ) + executable( + 'write-file', + 'write-file.vala', + kwargs: vala_example_executable_kwargs, + ) + executable( + 'write-stream', + 'write-stream.vala', + kwargs: vala_example_executable_kwargs, + ) endif -install_data('README.md', - 'read-file.vala', - 'read-stream.vala', - 'write-file.vala', - 'write-stream.vala', - install_dir: join_paths(data_dir, - meson.project_name(), - 'example', - 'vala')) +install_data( + 'README.md', + 'read-file.vala', + 'read-stream.vala', + 'write-file.vala', + 'write-stream.vala', + install_dir: join_paths(data_dir, meson.project_name(), 'example', 'vala'), +) diff --git a/c_glib/gandiva-glib/meson.build b/c_glib/gandiva-glib/meson.build index 94b923388b7f2..267b01344f524 100644 --- a/c_glib/gandiva-glib/meson.build +++ b/c_glib/gandiva-glib/meson.build @@ -20,114 +20,128 @@ project_name = 'gandiva-glib' sources = files( - 'expression.cpp', - 'filter.cpp', - 'function-registry.cpp', - 'function-signature.cpp', - 'native-function.cpp', - 'node.cpp', - 'projector.cpp', - 'selection-vector.cpp', + 'expression.cpp', + 'filter.cpp', + 'function-registry.cpp', + 'function-signature.cpp', + 'native-function.cpp', + 'node.cpp', + 'projector.cpp', + 'selection-vector.cpp', ) c_headers = files( - 'expression.h', - 'filter.h', - 'function-registry.h', - 'function-signature.h', - 'gandiva-glib.h', - 'native-function.h', - 'node.h', - 'projector.h', - 'selection-vector.h', + 'expression.h', + 'filter.h', + 'function-registry.h', + 'function-signature.h', + 'gandiva-glib.h', + 'native-function.h', + 'node.h', + 'projector.h', + 'selection-vector.h', ) cpp_headers = files( - 'expression.hpp', - 'filter.hpp', - 'function-signature.hpp', - 'gandiva-glib.hpp', - 'native-function.hpp', - 'node.hpp', - 'projector.hpp', - 'selection-vector.hpp', + 'expression.hpp', + 'filter.hpp', + 'function-signature.hpp', + 'gandiva-glib.hpp', + 'native-function.hpp', + 'node.hpp', + 'projector.hpp', + 'selection-vector.hpp', ) version_h = configure_file( - input: 'version.h.in', - output: 'version.h', - command: [python3, generate_version_header_py, '--library', 'GGANDIVA', '--version', version, '--input', 
'@INPUT@', '--output', '@OUTPUT@'], + input: 'version.h.in', + output: 'version.h', + command: [ + python3, + generate_version_header_py, + '--library', + 'GGANDIVA', + '--version', + version, + '--input', + '@INPUT@', + '--output', + '@OUTPUT@', + ], ) c_headers += version_h -enums = gnome.mkenums('enums', - sources: c_headers, - identifier_prefix: 'GGandiva', - symbol_prefix: 'ggandiva', - c_template: 'enums.c.template', - h_template: 'enums.h.template', - install_dir: join_paths(include_dir, 'gandiva-glib'), - install_header: true) +enums = gnome.mkenums( + 'enums', + sources: c_headers, + identifier_prefix: 'GGandiva', + symbol_prefix: 'ggandiva', + c_template: 'enums.c.template', + h_template: 'enums.h.template', + install_dir: join_paths(include_dir, 'gandiva-glib'), + install_header: true, +) enums_source = enums[0] enums_header = enums[1] headers = c_headers + cpp_headers install_headers(headers, subdir: project_name) -dependencies = [ - gandiva, - arrow_glib, -] -libgandiva_glib = library('gandiva-glib', - sources: sources + enums, - install: true, - dependencies: dependencies, - implicit_include_directories: false, - include_directories: base_include_directories, - cpp_args: ['-DGGANDIVA_COMPILATION'], - c_args: ['-DGGANDIVA_COMPILATION'], - soversion: so_version, - version: library_version) -gandiva_glib = declare_dependency(link_with: libgandiva_glib, - include_directories: base_include_directories, - dependencies: dependencies, - sources: enums_header) +dependencies = [gandiva, arrow_glib] +libgandiva_glib = library( + 'gandiva-glib', + sources: sources + enums, + install: true, + dependencies: dependencies, + implicit_include_directories: false, + include_directories: base_include_directories, + cpp_args: ['-DGGANDIVA_COMPILATION'], + c_args: ['-DGGANDIVA_COMPILATION'], + soversion: so_version, + version: library_version, +) +gandiva_glib = declare_dependency( + link_with: libgandiva_glib, + include_directories: base_include_directories, + dependencies: dependencies, + sources: enums_header, +) -pkgconfig.generate(libgandiva_glib, - description: 'C API for Apache Arrow Gandiva based on GLib', - filebase: project_name, - name: 'Apache Arrow Gandiva GLib', - requires: ['gandiva', 'arrow-glib'], - variables: pkgconfig_variables, - version: version) +pkgconfig.generate( + libgandiva_glib, + description: 'C API for Apache Arrow Gandiva based on GLib', + filebase: project_name, + name: 'Apache Arrow Gandiva GLib', + requires: ['gandiva', 'arrow-glib'], + variables: pkgconfig_variables, + version: version, +) if have_gi - gandiva_glib_gir = \ - gnome.generate_gir(libgandiva_glib, - dependencies: declare_dependency(sources: arrow_glib_gir), - export_packages: 'gandiva-glib', - extra_args: [ - '--warn-all', - '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', - ], - header: 'gandiva-glib/gandiva-glib.h', - identifier_prefix: 'GGandiva', - includes: [ - 'Arrow-1.0' - ], - kwargs: generate_gi_common_args, - namespace: 'Gandiva', - sources: sources + c_headers + enums, - symbol_prefix: 'ggandiva') + gandiva_glib_gir = gnome.generate_gir( + libgandiva_glib, + dependencies: declare_dependency(sources: arrow_glib_gir), + export_packages: 'gandiva-glib', + extra_args: [ + '--warn-all', + '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', + ], + header: 'gandiva-glib/gandiva-glib.h', + identifier_prefix: 'GGandiva', + includes: ['Arrow-1.0'], + kwargs: generate_gi_common_args, + namespace: 'Gandiva', + sources: sources + c_headers + enums, + symbol_prefix: 'ggandiva', + ) - if generate_vapi 
- gnome.generate_vapi('gandiva-glib', - install: true, - packages: [ - arrow_glib_vapi, - 'gio-2.0', - ], - sources: [gandiva_glib_gir[0]]) - endif + if generate_vapi + gnome.generate_vapi( + 'gandiva-glib', + install: true, + packages: [arrow_glib_vapi, 'gio-2.0'], + sources: [gandiva_glib_gir[0]], + ) + endif endif diff --git a/c_glib/meson.build b/c_glib/meson.build index 017765cd14626..11b6ba2f476f8 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -17,31 +17,33 @@ # specific language governing permissions and limitations # under the License. -project('arrow-glib', 'c', 'cpp', - default_options: [ - 'c_std=c99', - 'cpp_std=c++17', - ], - license: 'Apache-2.0', - # Debian: - # https://packages.debian.org/search?keywords=meson - # - # * bookworm: 1.0.0 - # - # Ubuntu: - # https://packages.ubuntu.com/search?keywords=meson - # - # * 20.04: 0.53.2 - # * 22.04: 0.61.2 - meson_version: '>=0.53.2') - -version = '20.0.0-SNAPSHOT' +project( + 'arrow-glib', + 'c', + 'cpp', + default_options: ['c_std=c99', 'cpp_std=c++17'], + license: 'Apache-2.0', + # Debian: + # https://packages.debian.org/search?keywords=meson + # + # * bookworm: 1.0.0 + # + # Ubuntu: + # https://packages.ubuntu.com/search?keywords=meson + # + # * 22.04: 0.61.2 + # * 24.04: 1.3.2 + meson_version: '>=0.61.2', + version: '20.0.0-SNAPSHOT', +) + +version = meson.project_version() if version.endswith('-SNAPSHOT') - version_numbers = version.split('-')[0].split('.') - version_tag = version.split('-')[1] + version_numbers = version.split('-')[0].split('.') + version_tag = version.split('-')[1] else - version_numbers = version.split('.') - version_tag = '' + version_numbers = version.split('.') + version_tag = '' endif version_major = version_numbers[0].to_int() version_minor = version_numbers[1].to_int() @@ -52,13 +54,8 @@ so_version = version_major * 100 + version_minor so_version_patch = version_micro library_version = '@0@.@1@.@2@'.format(so_version, so_version_patch, 0) -if meson.version().version_compare('>=0.56.0') - project_build_root = meson.project_build_root() - project_source_root = meson.project_source_root() -else - project_build_root = meson.build_root() - project_source_root = meson.source_root() -endif +project_build_root = meson.project_build_root() +project_source_root = meson.project_source_root() prefix = get_option('prefix') include_dir = join_paths(prefix, get_option('includedir')) @@ -71,64 +68,84 @@ gnome = import('gnome') pkgconfig = import('pkgconfig') pkgconfig_variables = [] -base_include_directories = [ - include_directories('.') -] +base_include_directories = [include_directories('.')] -generate_gi_common_args = { - 'install': true, - 'nsversion': api_version, -} -if get_option('werror') and meson.version().version_compare('>=0.55.0') - generate_gi_common_args += {'fatal_warnings': true} +generate_gi_common_args = {'install': true, 'nsversion': api_version} +if get_option('werror') + generate_gi_common_args += {'fatal_warnings': true} endif have_gi = dependency('gobject-introspection-1.0', required: false).found() if have_gi - pkgconfig_variables += ['girdir=@0@'.format(gir_dir)] + pkgconfig_variables += ['girdir=@0@'.format(gir_dir)] endif generate_vapi = have_gi and get_option('vapi') if generate_vapi - pkgconfig_variables += ['vapidir=@0@'.format(vapi_dir)] - add_languages('vala') + pkgconfig_variables += ['vapidir=@0@'.format(vapi_dir)] + add_languages('vala') endif arrow_cpp_build_dir = get_option('arrow_cpp_build_dir') arrow_cpp_build_type = get_option('arrow_cpp_build_type') if 
arrow_cpp_build_dir == '' - arrow_cpp_build_lib_dir = '' + arrow_cpp_build_lib_dir = '' else - arrow_cpp_build_lib_dir = join_paths(project_source_root, - arrow_cpp_build_dir, - arrow_cpp_build_type.to_lower()) + arrow_cpp_build_lib_dir = join_paths( + project_source_root, + arrow_cpp_build_dir, + arrow_cpp_build_type.to_lower(), + ) endif if arrow_cpp_build_lib_dir == '' - arrow = dependency('arrow', version: ['>=' + version]) - # They are just for checking required modules are enabled. They are built into - # libarrow.so. So we don't need additional build flags for them. - dependency('arrow-compute', version: ['>=' + version]) - dependency('arrow-csv', version: ['>=' + version]) - dependency('arrow-filesystem', version: ['>=' + version]) - dependency('arrow-json', version: ['>=' + version]) - - have_arrow_orc = dependency('arrow-orc', required: false, version: ['>=' + version]).found() - arrow_cuda = dependency('arrow-cuda', required: false, version: ['>=' + version]) - # we do not support compiling glib without acero engine - arrow_acero = dependency('arrow-acero', required: true, version: ['>=' + version]) - arrow_dataset = dependency('arrow-dataset', required: false, version: ['>=' + version]) - arrow_flight = dependency('arrow-flight', required: false, version: ['>=' + version]) - arrow_flight_sql = dependency('arrow-flight-sql', required: false, version: ['>=' + version]) - gandiva = dependency('gandiva', required: false, version: ['>=' + version]) - parquet = dependency('parquet', required: false, version: ['>=' + version]) + arrow = dependency('arrow', version: ['>=' + version]) + # They are just for checking required modules are enabled. They are built into + # libarrow.so. So we don't need additional build flags for them. + dependency('arrow-compute', version: ['>=' + version]) + dependency('arrow-csv', version: ['>=' + version]) + dependency('arrow-filesystem', version: ['>=' + version]) + dependency('arrow-json', version: ['>=' + version]) + + have_arrow_orc = dependency( + 'arrow-orc', + required: false, + version: ['>=' + version], + ).found() + arrow_cuda = dependency( + 'arrow-cuda', + required: false, + version: ['>=' + version], + ) + # we do not support compiling glib without acero engine + arrow_acero = dependency( + 'arrow-acero', + required: true, + version: ['>=' + version], + ) + arrow_dataset = dependency( + 'arrow-dataset', + required: false, + version: ['>=' + version], + ) + arrow_flight = dependency( + 'arrow-flight', + required: false, + version: ['>=' + version], + ) + arrow_flight_sql = dependency( + 'arrow-flight-sql', + required: false, + version: ['>=' + version], + ) + gandiva = dependency('gandiva', required: false, version: ['>=' + version]) + parquet = dependency('parquet', required: false, version: ['>=' + version]) else - base_include_directories += [ - include_directories(join_paths(arrow_cpp_build_dir, 'src')), - include_directories('../cpp/src'), - ] - cpp_compiler = meson.get_compiler('cpp') - arrow = cpp_compiler.find_library('arrow', - dirs: [arrow_cpp_build_lib_dir]) - arrow_orc_code = ''' + base_include_directories += [ + include_directories(join_paths(arrow_cpp_build_dir, 'src')), + include_directories('../cpp/src'), + ] + cpp_compiler = meson.get_compiler('cpp') + arrow = cpp_compiler.find_library('arrow', dirs: [arrow_cpp_build_lib_dir]) + arrow_orc_code = ''' #include int @@ -138,36 +155,52 @@ main(void) return 0; } ''' - have_arrow_orc = cpp_compiler.links(arrow_orc_code, - include_directories: base_include_directories, - dependencies: 
[arrow]) - arrow_cuda = cpp_compiler.find_library('arrow_cuda', - dirs: [arrow_cpp_build_lib_dir], - required: false) - arrow_acero = cpp_compiler.find_library('arrow_acero', - dirs: [arrow_cpp_build_lib_dir], - required: true) - arrow_dataset = cpp_compiler.find_library('arrow_dataset', - dirs: [arrow_cpp_build_lib_dir], - required: false) - arrow_flight = cpp_compiler.find_library('arrow_flight', - dirs: [arrow_cpp_build_lib_dir], - required: false) - arrow_flight_sql = cpp_compiler.find_library('arrow_flight_sql', - dirs: [arrow_cpp_build_lib_dir], - required: false) - gandiva = cpp_compiler.find_library('gandiva', - dirs: [arrow_cpp_build_lib_dir], - required: false) - parquet = cpp_compiler.find_library('parquet', - dirs: [arrow_cpp_build_lib_dir], - required: false) + have_arrow_orc = cpp_compiler.links( + arrow_orc_code, + include_directories: base_include_directories, + dependencies: [arrow], + ) + arrow_cuda = cpp_compiler.find_library( + 'arrow_cuda', + dirs: [arrow_cpp_build_lib_dir], + required: false, + ) + arrow_acero = cpp_compiler.find_library( + 'arrow_acero', + dirs: [arrow_cpp_build_lib_dir], + required: true, + ) + arrow_dataset = cpp_compiler.find_library( + 'arrow_dataset', + dirs: [arrow_cpp_build_lib_dir], + required: false, + ) + arrow_flight = cpp_compiler.find_library( + 'arrow_flight', + dirs: [arrow_cpp_build_lib_dir], + required: false, + ) + arrow_flight_sql = cpp_compiler.find_library( + 'arrow_flight_sql', + dirs: [arrow_cpp_build_lib_dir], + required: false, + ) + gandiva = cpp_compiler.find_library( + 'gandiva', + dirs: [arrow_cpp_build_lib_dir], + required: false, + ) + parquet = cpp_compiler.find_library( + 'parquet', + dirs: [arrow_cpp_build_lib_dir], + required: false, + ) endif cxx = meson.get_compiler('cpp') cxx_flags = [] if cxx.get_id() != 'msvc' - cxx_flags += ['-Wmissing-declarations'] + cxx_flags += ['-Wmissing-declarations'] endif add_project_arguments(cxx.get_supported_arguments(cxx_flags), language: 'cpp') @@ -177,42 +210,54 @@ generate_version_header_py = project_source_root / 'tool' / 'generate-version-he subdir('arrow-glib') if arrow_cuda.found() - subdir('arrow-cuda-glib') + subdir('arrow-cuda-glib') endif if arrow_dataset.found() - subdir('arrow-dataset-glib') + subdir('arrow-dataset-glib') endif if arrow_flight.found() - subdir('arrow-flight-glib') + subdir('arrow-flight-glib') endif if arrow_flight_sql.found() - subdir('arrow-flight-sql-glib') + subdir('arrow-flight-sql-glib') endif if gandiva.found() - subdir('gandiva-glib') + subdir('gandiva-glib') endif if parquet.found() - subdir('parquet-glib') + subdir('parquet-glib') endif subdir('example') if get_option('doc') - subdir('doc') + subdir('doc') endif -install_data('../LICENSE.txt', - 'README.md', - install_dir: data_dir / meson.project_name()) +install_data( + '../LICENSE.txt', + 'README.md', + install_dir: data_dir / meson.project_name(), +) run_test = find_program('test/run-test.sh') -test('unit test', - run_test, - env: [ - 'ARROW_CUDA_GLIB_TYPELIB_DIR=@0@/arrow-cuda-glib'.format(project_build_root), - 'ARROW_DATASET_GLIB_TYPELIB_DIR=@0@/arrow-dataset-glib'.format(project_build_root), - 'ARROW_FLIGHT_GLIB_TYPELIB_DIR=@0@/arrow-flight-glib'.format(project_build_root), - 'ARROW_FLIGHT_SQL_GLIB_TYPELIB_DIR=@0@/arrow-flight-sql-glib'.format(project_build_root), - 'ARROW_GLIB_TYPELIB_DIR=@0@/arrow-glib'.format(project_build_root), - 'GANDIVA_GLIB_TYPELIB_DIR=@0@/gandiva-glib'.format(project_build_root), - 'PARQUET_GLIB_TYPELIB_DIR=@0@/parquet-glib'.format(project_build_root), - ]) 
+test( + 'unit test', + run_test, + env: [ + 'ARROW_CUDA_GLIB_TYPELIB_DIR=@0@/arrow-cuda-glib'.format( + project_build_root, + ), + 'ARROW_DATASET_GLIB_TYPELIB_DIR=@0@/arrow-dataset-glib'.format( + project_build_root, + ), + 'ARROW_FLIGHT_GLIB_TYPELIB_DIR=@0@/arrow-flight-glib'.format( + project_build_root, + ), + 'ARROW_FLIGHT_SQL_GLIB_TYPELIB_DIR=@0@/arrow-flight-sql-glib'.format( + project_build_root, + ), + 'ARROW_GLIB_TYPELIB_DIR=@0@/arrow-glib'.format(project_build_root), + 'GANDIVA_GLIB_TYPELIB_DIR=@0@/gandiva-glib'.format(project_build_root), + 'PARQUET_GLIB_TYPELIB_DIR=@0@/parquet-glib'.format(project_build_root), + ], +) diff --git a/c_glib/parquet-glib/meson.build b/c_glib/parquet-glib/meson.build index a3de1d0933f7f..a85ba18f30f9c 100644 --- a/c_glib/parquet-glib/meson.build +++ b/c_glib/parquet-glib/meson.build @@ -20,32 +20,43 @@ project_name = 'parquet-glib' sources = files( - 'arrow-file-reader.cpp', - 'arrow-file-writer.cpp', - 'metadata.cpp', - 'statistics.cpp', + 'arrow-file-reader.cpp', + 'arrow-file-writer.cpp', + 'metadata.cpp', + 'statistics.cpp', ) c_headers = files( - 'arrow-file-reader.h', - 'arrow-file-writer.h', - 'metadata.h', - 'statistics.h', - 'parquet-glib.h', + 'arrow-file-reader.h', + 'arrow-file-writer.h', + 'metadata.h', + 'parquet-glib.h', + 'statistics.h', ) cpp_headers = files( - 'arrow-file-reader.hpp', - 'arrow-file-writer.hpp', - 'metadata.hpp', - 'statistics.hpp', - 'parquet-glib.hpp', + 'arrow-file-reader.hpp', + 'arrow-file-writer.hpp', + 'metadata.hpp', + 'parquet-glib.hpp', + 'statistics.hpp', ) version_h = configure_file( - input: 'version.h.in', - output: 'version.h', - command: [python3, generate_version_header_py, '--library', 'GPARQUET', '--version', version, '--input', '@INPUT@', '--output', '@OUTPUT@'], + input: 'version.h.in', + output: 'version.h', + command: [ + python3, + generate_version_header_py, + '--library', + 'GPARQUET', + '--version', + version, + '--input', + '@INPUT@', + '--output', + '@OUTPUT@', + ], ) c_headers += version_h @@ -53,58 +64,58 @@ c_headers += version_h headers = c_headers + cpp_headers install_headers(headers, subdir: project_name) -dependencies = [ - arrow, - parquet, - arrow_glib, -] -libparquet_glib = library('parquet-glib', - sources: sources, - install: true, - dependencies: dependencies, - implicit_include_directories: false, - include_directories: base_include_directories, - cpp_args: ['-DGPARQUET_COMPILATION'], - soversion: so_version, - version: library_version) -parquet_glib = declare_dependency(link_with: libparquet_glib, - include_directories: base_include_directories, - dependencies: dependencies) +dependencies = [arrow, parquet, arrow_glib] +libparquet_glib = library( + 'parquet-glib', + sources: sources, + install: true, + dependencies: dependencies, + implicit_include_directories: false, + include_directories: base_include_directories, + cpp_args: ['-DGPARQUET_COMPILATION'], + soversion: so_version, + version: library_version, +) +parquet_glib = declare_dependency( + link_with: libparquet_glib, + include_directories: base_include_directories, + dependencies: dependencies, +) -pkgconfig.generate(libparquet_glib, - description: 'C API for Apache Parquet based on GLib', - filebase: project_name, - name: 'Apache Parquet GLib', - requires: ['parquet', 'arrow-glib'], - variables: pkgconfig_variables, - version: version) +pkgconfig.generate( + libparquet_glib, + description: 'C API for Apache Parquet based on GLib', + filebase: project_name, + name: 'Apache Parquet GLib', + requires: ['parquet', 
'arrow-glib'], + variables: pkgconfig_variables, + version: version, +) if have_gi - parquet_glib_gir = \ - gnome.generate_gir(libparquet_glib, - dependencies: declare_dependency(sources: arrow_glib_gir), - export_packages: 'parquet-glib', - extra_args: [ - '--warn-all', - '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', - ], - header: 'parquet-glib/parquet-glib.h', - identifier_prefix: 'GParquet', - includes: [ - 'Arrow-1.0', - ], - kwargs: generate_gi_common_args, - namespace: 'Parquet', - sources: sources + c_headers, - symbol_prefix: 'gparquet') + parquet_glib_gir = gnome.generate_gir( + libparquet_glib, + dependencies: declare_dependency(sources: arrow_glib_gir), + export_packages: 'parquet-glib', + extra_args: [ + '--warn-all', + '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', + ], + header: 'parquet-glib/parquet-glib.h', + identifier_prefix: 'GParquet', + includes: ['Arrow-1.0'], + kwargs: generate_gi_common_args, + namespace: 'Parquet', + sources: sources + c_headers, + symbol_prefix: 'gparquet', + ) - if generate_vapi - gnome.generate_vapi('parquet-glib', - install: true, - packages: [ - arrow_glib_vapi, - 'gio-2.0', - ], - sources: [parquet_glib_gir[0]]) - endif + if generate_vapi + gnome.generate_vapi( + 'parquet-glib', + install: true, + packages: [arrow_glib_vapi, 'gio-2.0'], + sources: [parquet_glib_gir[0]], + ) + endif endif diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb index 96deedf6b4eb0..25c50ef9e4ff9 100644 --- a/c_glib/test/dataset/test-file-system-dataset.rb +++ b/c_glib/test/dataset/test-file-system-dataset.rb @@ -91,15 +91,15 @@ def create_dataset dataset = @factory.finish expected_table = build_table(count: [ - build_int32_array([1, 10]), - build_int32_array([2]), - build_int32_array([3]), - ], - label: [ - build_string_array(["a", "a"]), - build_string_array(["b"]), - build_string_array(["c"]), - ]) + build_int32_array([1, 10]), + build_int32_array([2]), + build_int32_array([3]), + ], + label: [ + build_string_array(["a", "a"]), + build_string_array(["b"]), + build_string_array(["c"]), + ]) return dataset, expected_table end diff --git a/c_glib/test/test-array-statistics.rb b/c_glib/test/test-array-statistics.rb new file mode 100644 index 0000000000000..bf470b4e72235 --- /dev/null +++ b/c_glib/test/test-array-statistics.rb @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestArrayStatistics < Test::Unit::TestCase + include Helper::Buildable + + def setup + omit("Parquet is required") unless defined?(::Parquet) + + Tempfile.create(["data", ".parquet"]) do |file| + @file = file + array = build_int64_array([nil, -(2 ** 32), 2 ** 32]) + @table = build_table("int64" => array) + writer = Parquet::ArrowFileWriter.new(@table.schema, @file.path) + chunk_size = 1024 + writer.write_table(@table, chunk_size) + writer.close + reader = Parquet::ArrowFileReader.new(@file.path) + begin + @statistics = reader.read_table.get_column_data(0).get_chunk(0).statistics + yield + ensure + reader.unref + end + end + end + + test("#has_null_count?") do + assert do + @statistics.has_null_count? + end + end + + test("#null_count") do + assert_equal(1, @statistics.null_count) + end +end diff --git a/c_glib/test/test-array.rb b/c_glib/test/test-array.rb index cd62d917cf664..18a54a2963134 100644 --- a/c_glib/test/test-array.rb +++ b/c_glib/test/test-array.rb @@ -118,8 +118,9 @@ def test_to_s sub_test_case("#view") do def test_valid + int32_array = build_int32_array([0, 1069547520, -1071644672, nil]) assert_equal(build_float_array([0.0, 1.5, -2.5, nil]), - build_int32_array([0, 1069547520, -1071644672, nil]).view(Arrow::FloatDataType.new)) + int32_array.view(Arrow::FloatDataType.new)) end def test_invalid @@ -212,7 +213,7 @@ def test_valid end def test_invalid - message = "[array][validate_full]: Invalid: Invalid UTF8 sequence at string index 0" + message = "[array][validate-full]: Invalid: Invalid UTF8 sequence at string index 0" # U+3042 HIRAGANA LETTER A, U+3044 HIRAGANA LETTER I data = "\u3042\u3044".b[0..-2] diff --git a/c_glib/test/test-binary-view-array.rb b/c_glib/test/test-binary-view-array.rb new file mode 100644 index 0000000000000..e1c97ecdced30 --- /dev/null +++ b/c_glib/test/test-binary-view-array.rb @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestBinaryViewArray < Test::Unit::TestCase + def test_new + short_binary_data = "test" + short_view_buffer_space = 12 + short_view_buffer = [short_binary_data.size].pack("l") + short_view_buffer += short_binary_data.ljust(short_view_buffer_space, "\x00") + + arrow_view_buffer = Arrow::Buffer.new(short_view_buffer) + arrow_data_buffer = Arrow::Buffer.new(short_binary_data) + bitmap = Arrow::Buffer.new([0b1].pack("C*")) + + binary_view_array = Arrow::BinaryViewArray.new(1, + arrow_view_buffer, + [arrow_data_buffer], + bitmap, + 0, + 0) + assert do + binary_view_array.validate_full + end + assert_equal(short_binary_data, binary_view_array.get_value(0).to_s) + end +end diff --git a/c_glib/test/test-binary-view-data-type.rb b/c_glib/test/test-binary-view-data-type.rb index f143b62df4ebc..9c97982862fe3 100644 --- a/c_glib/test/test-binary-view-data-type.rb +++ b/c_glib/test/test-binary-view-data-type.rb @@ -30,4 +30,11 @@ def test_to_s data_type = Arrow::BinaryViewDataType.new assert_equal("binary_view", data_type.to_s) end + + def test_export + data_type = Arrow::BinaryViewDataType.new + c_abi_schema = data_type.export + assert_equal(data_type, + Arrow::DataType.import(c_abi_schema)) + end end diff --git a/c_glib/test/test-chunked-array-datum.rb b/c_glib/test/test-chunked-array-datum.rb index b82f3eed8a7af..99e35fc57b085 100644 --- a/c_glib/test/test-chunked-array-datum.rb +++ b/c_glib/test/test-chunked-array-datum.rb @@ -49,7 +49,14 @@ def test_false end def test_to_string - assert_equal("ChunkedArray([\n" + " [\n" + " true,\n" + " false\n" + " ]\n" + "])", @datum.to_s) + assert_equal(<<-DATUM.chomp, @datum.to_s) +ChunkedArray([ + [ + true, + false + ] +]) + DATUM end def test_value diff --git a/c_glib/test/test-large-list-array.rb b/c_glib/test/test-large-list-array.rb index 2f7efab5a074a..fa9c92ec87d0c 100644 --- a/c_glib/test/test-large-list-array.rb +++ b/c_glib/test/test-large-list-array.rb @@ -88,10 +88,10 @@ def test_value_length def test_value_offsets array = build_large_list_array(Arrow::Int8DataType.new, - [ - [-29, 29], - [-1, 0, 1], - ]) + [ + [-29, 29], + [-1, 0, 1], + ]) assert_equal([0, 2, 5], array.value_offsets) end diff --git a/c_glib/test/test-memory-pool.rb b/c_glib/test/test-memory-pool.rb index 515edb0161399..b471437208503 100644 --- a/c_glib/test/test-memory-pool.rb +++ b/c_glib/test/test-memory-pool.rb @@ -20,6 +20,8 @@ class TestMemoryPool < Test::Unit::TestCase def setup @memory_pool = Arrow::MemoryPool.default + # Our tests assume that some memory is allocated. 
+ @buffer = Arrow::ResizableBuffer.new(1) end def test_bytes_allocated diff --git a/c_glib/test/test-record-batch-datum.rb b/c_glib/test/test-record-batch-datum.rb index ec572e0f13023..c50e50f9029e8 100644 --- a/c_glib/test/test-record-batch-datum.rb +++ b/c_glib/test/test-record-batch-datum.rb @@ -49,7 +49,13 @@ def test_false end def test_to_string - assert_equal("RecordBatch(visible: [\n" + " true,\n" + " false\n" + " ]\n" + ")", @datum.to_s) + assert_equal(<<-DATUM.chomp, @datum.to_s) +RecordBatch(visible: [ + true, + false + ] +) + DATUM end def test_value diff --git a/c_glib/test/test-record-batch.rb b/c_glib/test/test-record-batch.rb index ba4b15a67782a..86ae5b2e2d4aa 100644 --- a/c_glib/test/test-record-batch.rb +++ b/c_glib/test/test-record-batch.rb @@ -221,5 +221,46 @@ def test_invalid end end end + + sub_test_case("#validate_full") do + def setup + @id_field = Arrow::Field.new("uint8", Arrow::UInt8DataType.new) + @name_field = Arrow::Field.new("string", Arrow::StringDataType.new) + @schema = Arrow::Schema.new([@id_field, @name_field]) + + @uint8_value = build_uint_array([1]) + @valid_name_value = build_string_array(["abc"]) + @n_rows = @uint8_value.length + + # U+3042 HIRAGANA LETTER A, U+3044 HIRAGANA LETTER I + data = "\u3042\u3044".b[0..-2] + value_offsets = Arrow::Buffer.new([0, data.size].pack("l*")) + @invalid_name_value = Arrow::StringArray.new(1, + value_offsets, + Arrow::Buffer.new(data), + nil, + -1) + end + + def test_valid + columns = [@uint8_value, @valid_name_value] + record_batch = Arrow::RecordBatch.new(@schema, @n_rows, columns) + + assert do + record_batch.validate_full + end + end + + def test_invalid + message = "[record-batch][validate-full]: Invalid: " + + "In column 1: Invalid: Invalid UTF8 sequence at string index 0" + columns = [@uint8_value, @invalid_name_value] + record_batch = Arrow::RecordBatch.new(@schema, @n_rows, columns) + + assert_raise(Arrow::Error::Invalid.new(message)) do + record_batch.validate_full + end + end + end end end diff --git a/c_glib/test/test-string-view-data-type.rb b/c_glib/test/test-string-view-data-type.rb index f1f3655493c8e..bddc9eeec265b 100644 --- a/c_glib/test/test-string-view-data-type.rb +++ b/c_glib/test/test-string-view-data-type.rb @@ -30,4 +30,11 @@ def test_to_s data_type = Arrow::StringViewDataType.new assert_equal("string_view", data_type.to_s) end + + def test_export + data_type = Arrow::StringViewDataType.new + c_abi_schema = data_type.export + assert_equal(data_type, + Arrow::DataType.import(c_abi_schema)) + end end diff --git a/c_glib/test/test-struct-field-options.rb b/c_glib/test/test-struct-field-options.rb index 4a614de6df6e7..f9b492fc4da94 100644 --- a/c_glib/test/test-struct-field-options.rb +++ b/c_glib/test/test-struct-field-options.rb @@ -42,7 +42,8 @@ def test_set_dot_path end def test_set_invalid - message = "[struct-field-options][set-field-ref]: Invalid: Dot path '[foo]' contained an unterminated index" + message = "[struct-field-options][set-field-ref]: " + + "Invalid: Dot path '[foo]' contained an unterminated index" assert_raise(Arrow::Error::Invalid.new(message)) do @options.field_ref = "[foo]" end diff --git a/c_glib/test/test-table.rb b/c_glib/test/test-table.rb index 615a90c2f0baf..1c8c6fb3f796f 100644 --- a/c_glib/test/test-table.rb +++ b/c_glib/test/test-table.rb @@ -243,6 +243,77 @@ def test_combine_chunks all_values) end + sub_test_case("#validate") do + def setup + @id_field = Arrow::Field.new("id", Arrow::UInt8DataType.new) + @name_field = Arrow::Field.new("name", 
Arrow::StringDataType.new) + @schema = Arrow::Schema.new([@id_field, @name_field]) + + @id_array = build_uint_array([1]) + @name_array = build_string_array(["abc"]) + @arrays = [@id_array, @name_array] + end + + def test_valid + table = Arrow::Table.new(@schema, @arrays) + + assert do + table.validate + end + end + + def test_invalid + message = "[table][validate]: Invalid: " + + "Column 1 named name expected length 1 but got length 2" + + invalid_values = [@id_array, build_string_array(["abc", "def"])] + table = Arrow::Table.new(@schema, invalid_values) + assert_raise(Arrow::Error::Invalid.new(message)) do + table.validate + end + end + end + + sub_test_case("#validate_full") do + def setup + @id_field = Arrow::Field.new("uint8", Arrow::UInt8DataType.new) + @name_field = Arrow::Field.new("string", Arrow::StringDataType.new) + @schema = Arrow::Schema.new([@id_field, @name_field]) + + @id_values = build_uint_array([1]) + @valid_name_values = build_string_array(["abc"]) + + # U+3042 HIRAGANA LETTER A, U+3044 HIRAGANA LETTER I + data = "\u3042\u3044".b[0..-2] + value_offsets = Arrow::Buffer.new([0, data.size].pack("l*")) + @invalid_name_values = Arrow::StringArray.new(1, + value_offsets, + Arrow::Buffer.new(data), + nil, + -1) + end + + def test_valid + columns = [@id_values, @valid_name_values] + table = Arrow::Table.new(@schema, columns) + + assert do + table.validate_full + end + end + + def test_invalid + message = "[table][validate-full]: Invalid: " + + "Column 1: In chunk 0: Invalid: Invalid UTF8 sequence at string index 0" + columns = [@id_values, @invalid_name_values] + table = Arrow::Table.new(@schema, columns) + + assert_raise(Arrow::Error::Invalid.new(message)) do + table.validate_full + end + end + end + sub_test_case("#write_as_feather") do def setup super diff --git a/c_glib/test/test-take.rb b/c_glib/test/test-take.rb index f97c7ad730bc6..5b7af2d21f376 100644 --- a/c_glib/test/test-take.rb +++ b/c_glib/test/test-take.rb @@ -23,7 +23,7 @@ class TestTake < Test::Unit::TestCase def test_no_null indices = build_int16_array([1, 0, 2]) assert_equal(build_int16_array([0, 1, 2]), - build_int16_array([1, 0 ,2]).take(indices)) + build_int16_array([1, 0, 2]).take(indices)) end def test_null diff --git a/c_glib/test/test-uint-array-builder.rb b/c_glib/test/test-uint-array-builder.rb index 89621189b4571..3aa3a1c488d83 100644 --- a/c_glib/test/test-uint-array-builder.rb +++ b/c_glib/test/test-uint-array-builder.rb @@ -32,9 +32,9 @@ def test_uint16 values = [0, border_value] assert_equal(build_uint_array([*values, nil]), Arrow::UInt16Array.new(3, - Arrow::Buffer.new(values.pack("S*")), - Arrow::Buffer.new([0b011].pack("C*")), - -1)) + Arrow::Buffer.new(values.pack("S*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) end def test_uint32 @@ -42,9 +42,9 @@ def test_uint32 values = [0, border_value] assert_equal(build_uint_array([*values, nil]), Arrow::UInt32Array.new(3, - Arrow::Buffer.new(values.pack("L*")), - Arrow::Buffer.new([0b011].pack("C*")), - -1)) + Arrow::Buffer.new(values.pack("L*")), + Arrow::Buffer.new([0b011].pack("C*")), + -1)) end def test_uint64 diff --git a/ci/conan/all/conandata.yml b/ci/conan/all/conandata.yml index fb75f3995c62e..a13b31c2e82df 100644 --- a/ci/conan/all/conandata.yml +++ b/ci/conan/all/conandata.yml @@ -21,64 +21,47 @@ # SOFTWARE. 
sources: + "18.1.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-18.1.0/apache-arrow-18.1.0.tar.gz?action=download" + sha256: "2dc8da5f8796afe213ecc5e5aba85bb82d91520eff3cf315784a52d0fa61d7fc" + "18.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-18.0.0/apache-arrow-18.0.0.tar.gz?action=download" + sha256: "abcf1934cd0cdddd33664e9f2d9a251d6c55239d1122ad0ed223b13a583c82a9" + "17.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-17.0.0/apache-arrow-17.0.0.tar.gz?action=download" + sha256: "9d280d8042e7cf526f8c28d170d93bfab65e50f94569f6a790982a878d8d898d" + "16.1.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-16.1.0/apache-arrow-16.1.0.tar.gz?action=download" + sha256: "c9e60c7e87e59383d21b20dc874b17153729ee153264af6d21654b7dff2c60d7" "15.0.0": url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-15.0.0/apache-arrow-15.0.0.tar.gz?action=download" sha256: "01dd3f70e85d9b5b933ec92c0db8a4ef504a5105f78d2d8622e84279fb45c25d" "14.0.2": url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.2/apache-arrow-14.0.2.tar.gz?action=download" sha256: "1304dedb41896008b89fe0738c71a95d9b81752efc77fa70f264cb1da15d9bc2" - "14.0.1": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.1/apache-arrow-14.0.1.tar.gz?action=download" - sha256: "5c70eafb1011f9d124bafb328afe54f62cc5b9280b7080e1e3d668f78c0e407e" - "14.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.0/apache-arrow-14.0.0.tar.gz?action=download" - sha256: "4eb0da50ec071baf15fc163cb48058931e006f1c862c8def0e180fd07d531021" - "13.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-13.0.0/apache-arrow-13.0.0.tar.gz?action=download" - sha256: "35dfda191262a756be934eef8afee8d09762cad25021daa626eb249e251ac9e6" - "12.0.1": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-12.0.1/apache-arrow-12.0.1.tar.gz?action=download" - sha256: "3481c411393aa15c75e88d93cf8315faf7f43e180fe0790128d3840d417de858" - "12.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-12.0.0/apache-arrow-12.0.0.tar.gz?action=download" - sha256: "ddd8347882775e53af7d0965a1902b7d8fcd0a030fd14f783d4f85e821352d52" - "11.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-11.0.0/apache-arrow-11.0.0.tar.gz?action=download" - sha256: "2dd8f0ea0848a58785628ee3a57675548d509e17213a2f5d72b0d900b43f5430" - "10.0.1": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-10.0.1/apache-arrow-10.0.1.tar.gz?action=download" - sha256: "c814e0670112a22c1a6ec03ab420a52ae236a9a42e9e438c3cbd37f37e658fb3" - "10.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-10.0.0/apache-arrow-10.0.0.tar.gz?action=download" - sha256: "5b46fa4c54f53e5df0019fe0f9d421e93fc906b625ebe8e89eed010d561f1f12" - "8.0.1": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-8.0.1/apache-arrow-8.0.1.tar.gz?action=download" - sha256: "82d46929f7574715551da21700f100b39f99c3c4d6790f26cac86d869d64e94e" - "8.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-8.0.0/apache-arrow-8.0.0.tar.gz?action=download" - sha256: "ad9a05705117c989c116bae9ac70492fe015050e1b80fb0e38fde4b5d863aaa3" - "7.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-7.0.0/apache-arrow-7.0.0.tar.gz?action=download" - sha256: "e8f49b149a15ecef4e40fcfab1b87c113c6b1ee186005c169e5cdf95d31a99de" patches: - "8.0.1": - - patch_file: "patches/8.0.0-0005-install-utils.patch" - patch_description: "enable utils installation" + "18.1.0": + - patch_file: "patches/18.0.0-0001-fix-cmake.patch" + 
patch_description: "use cci package" patch_type: "conan" - - patch_file: "patches/8.0.0-0006-fix-cmake.patch" + "18.0.0": + - patch_file: "patches/18.0.0-0001-fix-cmake.patch" patch_description: "use cci package" patch_type: "conan" - "8.0.0": - - patch_file: "patches/8.0.0-0005-install-utils.patch" - patch_description: "enable utils installation" + "17.0.0": + - patch_file: "patches/16.0.0-0001-fix-cmake.patch" + patch_description: "use cci package" patch_type: "conan" - - patch_file: "patches/8.0.0-0006-fix-cmake.patch" + "16.1.0": + - patch_file: "patches/16.0.0-0001-fix-cmake.patch" patch_description: "use cci package" patch_type: "conan" - "7.0.0": - - patch_file: "patches/7.0.0-0006-install-utils.patch" - patch_description: "enable utils installation" + "15.0.0": + - patch_file: "patches/11.0.0-0001-fix-cmake.patch" + patch_description: "use cci package" patch_type: "conan" - - patch_file: "patches/7.0.0-0007-fix-cmake.patch" + "14.0.2": + - patch_file: "patches/11.0.0-0001-fix-cmake.patch" patch_description: "use cci package" patch_type: "conan" + \ No newline at end of file diff --git a/ci/conan/all/conanfile.py b/ci/conan/all/conanfile.py index 178cd03da1555..5db9fe356726a 100644 --- a/ci/conan/all/conanfile.py +++ b/ci/conan/all/conanfile.py @@ -31,7 +31,7 @@ import os import glob -required_conan_version = ">=1.53.0" +required_conan_version = ">=2.1.0" class ArrowConan(ConanFile): name = "arrow" @@ -93,7 +93,7 @@ class ArrowConan(ConanFile): "shared": False, "fPIC": True, "gandiva": False, - "parquet": False, + "parquet": True, "skyhook": False, "substrait": False, "acero": False, @@ -108,7 +108,7 @@ class ArrowConan(ConanFile): "simd_level": "default", "runtime_simd_level": "max", "with_backtrace": False, - "with_boost": False, + "with_boost": True, "with_brotli": False, "with_bz2": False, "with_csv": False, @@ -122,7 +122,7 @@ class ArrowConan(ConanFile): "with_glog": False, "with_grpc": False, "with_json": False, - "with_thrift": False, + "with_thrift": True, "with_llvm": False, "with_openssl": False, "with_opentelemetry": False, @@ -133,7 +133,7 @@ class ArrowConan(ConanFile): "with_utf8proc": False, "with_lz4": False, "with_snappy": False, - "with_zlib": False, + "with_zlib": True, "with_zstd": False, } short_paths = True @@ -144,21 +144,6 @@ def _min_cppstd(self): # https://github.com/apache/arrow/pull/13991 return "11" if Version(self.version) < "10.0.0" else "17" - @property - def _compilers_minimum_version(self): - return { - "11": { - "clang": "3.9", - }, - "17": { - "gcc": "8", - "clang": "7", - "apple-clang": "10", - "Visual Studio": "15", - "msvc": "191", - }, - }.get(self._min_cppstd, {}) - def export_sources(self): export_conandata_patches(self) copy(self, "conan_cmake_project_include.cmake", self.recipe_folder, os.path.join(self.export_sources_folder, "src")) @@ -183,15 +168,15 @@ def _requires_rapidjson(self): def requirements(self): if self.options.with_thrift: - self.requires("thrift/0.17.0") + self.requires("thrift/0.20.0") if self.options.with_protobuf: - self.requires("protobuf/3.21.9") + self.requires("protobuf/3.21.12") if self.options.with_jemalloc: self.requires("jemalloc/5.3.0") if self.options.with_mimalloc: self.requires("mimalloc/1.7.6") if self.options.with_boost: - self.requires("boost/1.84.0") + self.requires("boost/1.85.0") if self.options.with_gflags: self.requires("gflags/2.2.2") if self.options.with_glog: @@ -223,18 +208,23 @@ def requirements(self): if self.options.with_snappy: self.requires("snappy/1.1.9") if 
self.options.get_safe("simd_level") != None or \ - self.options.get_safe("runtime_simd_level") != None: - self.requires("xsimd/9.0.1") + self.options.get_safe("runtime_simd_level") != None: + if Version(self.version) < 8: + self.requires("xsimd/9.0.1") + else: + self.requires("xsimd/13.0.0") if self.options.with_zlib: self.requires("zlib/[>=1.2.11 <2]") if self.options.with_zstd: - self.requires("zstd/1.5.5") + self.requires("zstd/[>=1.5 <1.6]") if self.options.with_re2: self.requires("re2/20230301") if self.options.with_utf8proc: self.requires("utf8proc/2.8.0") if self.options.with_backtrace: self.requires("libbacktrace/cci.20210118") + if self.options.with_orc: + self.requires("orc/2.0.0") def validate(self): # Do not allow options with 'auto' value @@ -247,27 +237,35 @@ def validate(self): # From https://github.com/conan-io/conan-center-index/pull/23163#issuecomment-2039808851 if self.options.gandiva: if not self.options.with_re2: - raise ConanException("'with_re2' option should be True when'gandiva=True'") + raise ConanException("'with_re2' option should be True when 'gandiva=True'") if not self.options.with_boost: - raise ConanException("'with_boost' option should be True when'gandiva=True'") + raise ConanException("'with_boost' option should be True when 'gandiva=True'") if not self.options.with_utf8proc: - raise ConanException("'with_utf8proc' option should be True when'gandiva=True'") + raise ConanException("'with_utf8proc' option should be True when 'gandiva=True'") + if self.options.with_thrift and not self.options.with_boost: + raise ConanException("'with_boost' option should be True when 'thrift=True'") + if self.options.parquet: + if not self.options.with_thrift: + raise ConanException("'with_thrift' option should be True when 'parquet=True'") + if self.options.with_flight_rpc and not self.options.with_protobuf: + raise ConanException("'with_protobuf' option should be True when 'with_flight_rpc=True'") if self.settings.compiler.get_safe("cppstd"): check_min_cppstd(self, self._min_cppstd) - minimum_version = self._compilers_minimum_version.get(str(self.settings.compiler), False) - if minimum_version and Version(self.settings.compiler.version) < minimum_version: + if ( + Version(self.version) < "10.0.0" + and self.settings.compiler == "clang" + and Version(self.settings.compiler.version) < "3.9" + ): raise ConanInvalidConfiguration( - f"{self.ref} requires C++{self._min_cppstd}, which your compiler does not support." 
+ f"{self.ref} requires C++11, which needs at least clang-3.9" ) if self.options.get_safe("skyhook", False): raise ConanInvalidConfiguration("CCI has no librados recipe (yet)") if self.options.with_cuda: raise ConanInvalidConfiguration("CCI has no cuda recipe (yet)") - if self.options.with_orc: - raise ConanInvalidConfiguration("CCI has no orc recipe (yet)") if self.options.with_s3 and not self.dependencies["aws-sdk-cpp"].options.config: raise ConanInvalidConfiguration("arrow:with_s3 requires aws-sdk-cpp:config is True.") @@ -275,6 +273,11 @@ def validate(self): if self.dependencies["jemalloc"].options.enable_cxx: raise ConanInvalidConfiguration("jemmalloc.enable_cxx of a static jemalloc must be disabled") + if self.options.with_thrift and not self.options.with_zlib: + raise ConanInvalidConfiguration("arrow:with_thrift requires arrow:with_zlib") + + if self.options.parquet and not self.options.with_thrift: + raise ConanInvalidConfiguration("arrow:parquet requires arrow:with_thrift") def build_requirements(self): if Version(self.version) >= "13.0.0": @@ -352,6 +355,7 @@ def generate(self): tc.variables["GLOG_SOURCE"] = "SYSTEM" tc.variables["ARROW_WITH_BACKTRACE"] = bool(self.options.with_backtrace) tc.variables["ARROW_WITH_BROTLI"] = bool(self.options.with_brotli) + tc.variables["ARROW_WITH_RE2"] = bool(self.options.with_re2) tc.variables["brotli_SOURCE"] = "SYSTEM" if self.options.with_brotli: tc.variables["ARROW_BROTLI_USE_SHARED"] = bool(self.dependencies["brotli"].options.shared) @@ -383,8 +387,10 @@ def generate(self): tc.variables["ARROW_ZSTD_USE_SHARED"] = bool(self.dependencies["zstd"].options.shared) tc.variables["ORC_SOURCE"] = "SYSTEM" tc.variables["ARROW_WITH_THRIFT"] = bool(self.options.with_thrift) + tc.variables["ARROW_THRIFT"] = bool(self.options.with_thrift) tc.variables["Thrift_SOURCE"] = "SYSTEM" if self.options.with_thrift: + tc.variables["ARROW_THRIFT"] = True tc.variables["THRIFT_VERSION"] = bool(self.dependencies["thrift"].ref.version) # a recent thrift does not require boost tc.variables["ARROW_THRIFT_USE_SHARED"] = bool(self.dependencies["thrift"].options.shared) tc.variables["ARROW_USE_OPENSSL"] = self.options.with_openssl @@ -444,28 +450,53 @@ def _patch_sources(self): def build(self): self._patch_sources() - cmake =CMake(self) + cmake = CMake(self) cmake.configure(build_script_folder=os.path.join(self.source_folder, "cpp")) cmake.build() def package(self): copy(self, pattern="LICENSE.txt", dst=os.path.join(self.package_folder, "licenses"), src=self.source_folder) copy(self, pattern="NOTICE.txt", dst=os.path.join(self.package_folder, "licenses"), src=self.source_folder) - cmake =CMake(self) + cmake = CMake(self) cmake.install() rmdir(self, os.path.join(self.package_folder, "lib", "cmake")) rmdir(self, os.path.join(self.package_folder, "lib", "pkgconfig")) rmdir(self, os.path.join(self.package_folder, "share")) + cmake_suffix = "shared" if self.options.shared else "static" + + alias_map = { f"Arrow::arrow_{cmake_suffix}": f"arrow::arrow_{cmake_suffix}" } + + if self.options.parquet: + alias_map[f"Parquet::parquet_{cmake_suffix}"] = f"arrow::parquet_{cmake_suffix}" + + if self.options.get_safe("substrait"): + alias_map[f"Arrow::arrow_substrait_{cmake_suffix}"] = f"arrow::arrow_substrait_{cmake_suffix}" + + if self.options.acero: + alias_map[f"Arrow::arrow_acero_{cmake_suffix}"] = f"arrow::arrow_acero_{cmake_suffix}" + + if self.options.gandiva: + alias_map[f"Gandiva::gandiva_{cmake_suffix}"] = f"arrow::gandiva_{cmake_suffix}" + + if self.options.with_flight_rpc: + 
alias_map[f"ArrowFlight::arrow_flight_sql_{cmake_suffix}"] = f"arrow::arrow_flight_sql_{cmake_suffix}" + + @property + def _module_subfolder(self): + return os.path.join("lib", "cmake") + def package_info(self): # FIXME: fix CMake targets of components self.cpp_info.set_property("cmake_file_name", "Arrow") suffix = "_static" if is_msvc(self) and not self.options.shared else "" + cmake_suffix = "shared" if self.options.shared else "static" self.cpp_info.components["libarrow"].set_property("pkg_config_name", "arrow") + self.cpp_info.components["libarrow"].set_property("cmake_target_name", f"Arrow::arrow_{cmake_suffix}") self.cpp_info.components["libarrow"].libs = [f"arrow{suffix}"] if not self.options.shared: self.cpp_info.components["libarrow"].defines = ["ARROW_STATIC"] @@ -474,6 +505,7 @@ def package_info(self): if self.options.parquet: self.cpp_info.components["libparquet"].set_property("pkg_config_name", "parquet") + self.cpp_info.components["libparquet"].set_property("cmake_target_name", f"Parquet::parquet_{cmake_suffix}") self.cpp_info.components["libparquet"].libs = [f"parquet{suffix}"] self.cpp_info.components["libparquet"].requires = ["libarrow"] if not self.options.shared: @@ -481,6 +513,7 @@ def package_info(self): if self.options.get_safe("substrait"): self.cpp_info.components["libarrow_substrait"].set_property("pkg_config_name", "arrow_substrait") + self.cpp_info.components["libarrow_substrait"].set_property("cmake_target_name", f"Arrow::arrow_substrait_{cmake_suffix}") self.cpp_info.components["libarrow_substrait"].libs = [f"arrow_substrait{suffix}"] self.cpp_info.components["libarrow_substrait"].requires = ["libparquet", "dataset"] @@ -488,6 +521,8 @@ def package_info(self): del self.options.plasma if self.options.acero: + self.cpp_info.components["libacero"].set_property("pkg_config_name", "acero") + self.cpp_info.components["libacero"].set_property("cmake_target_name", f"Acero::arrow_acero_{cmake_suffix}") self.cpp_info.components["libacero"].libs = [f"arrow_acero{suffix}"] self.cpp_info.components["libacero"].names["cmake_find_package"] = "acero" self.cpp_info.components["libacero"].names["cmake_find_package_multi"] = "acero" @@ -496,6 +531,7 @@ def package_info(self): if self.options.gandiva: self.cpp_info.components["libgandiva"].set_property("pkg_config_name", "gandiva") + self.cpp_info.components["libgandiva"].set_property("cmake_target_name", f"Gandiva::gandiva_{cmake_suffix}") self.cpp_info.components["libgandiva"].libs = [f"gandiva{suffix}"] self.cpp_info.components["libgandiva"].requires = ["libarrow"] if not self.options.shared: @@ -503,11 +539,16 @@ def package_info(self): if self.options.with_flight_rpc: self.cpp_info.components["libarrow_flight"].set_property("pkg_config_name", "flight_rpc") + self.cpp_info.components["libarrow_flight"].set_property("cmake_target_name", f"ArrowFlight::arrow_flight_{cmake_suffix}") self.cpp_info.components["libarrow_flight"].libs = [f"arrow_flight{suffix}"] self.cpp_info.components["libarrow_flight"].requires = ["libarrow"] + # https://github.com/apache/arrow/pull/43137#pullrequestreview-2267476893 + if Version(self.version) >= "18.0.0" and self.options.with_openssl: + self.cpp_info.components["libarrow_flight"].requires.append("openssl::openssl") if self.options.get_safe("with_flight_sql"): self.cpp_info.components["libarrow_flight_sql"].set_property("pkg_config_name", "flight_sql") + self.cpp_info.components["libarrow_flight_sql"].set_property("cmake_target_name", f"ArrowFlightSql::arrow_flight_sql_{cmake_suffix}") 
self.cpp_info.components["libarrow_flight_sql"].libs = [f"arrow_flight_sql{suffix}"] self.cpp_info.components["libarrow_flight_sql"].requires = ["libarrow", "libarrow_flight"] @@ -560,7 +601,8 @@ def package_info(self): if self._requires_rapidjson(): self.cpp_info.components["libarrow"].requires.append("rapidjson::rapidjson") if self.options.with_s3: - self.cpp_info.components["libarrow"].requires.append("aws-sdk-cpp::s3") + # https://github.com/apache/arrow/blob/6b268f62a8a172249ef35f093009c740c32e1f36/cpp/src/arrow/CMakeLists.txt#L98 + self.cpp_info.components["libarrow"].requires.extend([f"aws-sdk-cpp::{x}" for x in ["cognito-identity", "core", "identity-management", "s3", "sts"]]) if self.options.get_safe("with_gcs"): self.cpp_info.components["libarrow"].requires.append("google-cloud-cpp::storage") if self.options.with_orc: @@ -581,32 +623,7 @@ def package_info(self): self.cpp_info.components["libarrow"].requires.append("zlib::zlib") if self.options.with_zstd: self.cpp_info.components["libarrow"].requires.append("zstd::zstd") - if self.options.with_boost: - self.cpp_info.components["libarrow"].requires.append("boost::boost") if self.options.with_grpc: self.cpp_info.components["libarrow"].requires.append("grpc::grpc") if self.options.with_flight_rpc: self.cpp_info.components["libarrow_flight"].requires.append("protobuf::protobuf") - - # TODO: to remove in conan v2 - self.cpp_info.filenames["cmake_find_package"] = "Arrow" - self.cpp_info.filenames["cmake_find_package_multi"] = "Arrow" - self.cpp_info.components["libarrow"].names["cmake_find_package"] = "arrow" - self.cpp_info.components["libarrow"].names["cmake_find_package_multi"] = "arrow" - if self.options.parquet: - self.cpp_info.components["libparquet"].names["cmake_find_package"] = "parquet" - self.cpp_info.components["libparquet"].names["cmake_find_package_multi"] = "parquet" - if self.options.get_safe("substrait"): - self.cpp_info.components["libarrow_substrait"].names["cmake_find_package"] = "arrow_substrait" - self.cpp_info.components["libarrow_substrait"].names["cmake_find_package_multi"] = "arrow_substrait" - if self.options.gandiva: - self.cpp_info.components["libgandiva"].names["cmake_find_package"] = "gandiva" - self.cpp_info.components["libgandiva"].names["cmake_find_package_multi"] = "gandiva" - if self.options.with_flight_rpc: - self.cpp_info.components["libarrow_flight"].names["cmake_find_package"] = "flight_rpc" - self.cpp_info.components["libarrow_flight"].names["cmake_find_package_multi"] = "flight_rpc" - if self.options.get_safe("with_flight_sql"): - self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package"] = "flight_sql" - self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package_multi"] = "flight_sql" - if self.options.cli and (self.options.with_cuda or self.options.with_flight_rpc or self.options.parquet): - self.env_info.PATH.append(os.path.join(self.package_folder, "bin")) diff --git a/ci/conan/all/patches/1.0.0-0003-fix-shared-msvc.patch b/ci/conan/all/patches/1.0.0-0003-fix-shared-msvc.patch deleted file mode 100644 index 45210d1b8cc51..0000000000000 --- a/ci/conan/all/patches/1.0.0-0003-fix-shared-msvc.patch +++ /dev/null @@ -1,35 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - ---- cpp/src/arrow/CMakeLists.txt -+++ cpp/src/arrow/CMakeLists.txt -@@ -490,6 +490,10 @@ - target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) - endif() - -+if(ARROW_BUILD_SHARED AND WIN32) -+target_compile_definitions(arrow_shared PRIVATE ARROW_EXPORTING) -+endif() -+ - if(ARROW_WITH_BACKTRACE) - find_package(Backtrace) - diff --git a/ci/conan/all/patches/1.0.0-0005-fix-make12-namespace.patch b/ci/conan/all/patches/1.0.0-0005-fix-make12-namespace.patch deleted file mode 100644 index 199804bff00ab..0000000000000 --- a/ci/conan/all/patches/1.0.0-0005-fix-make12-namespace.patch +++ /dev/null @@ -1,44 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -diff --git a/cpp/src/arrow/vendored/datetime/date.h b/cpp/src/arrow/vendored/datetime/date.h -index 02a4909..2b168d2 100644 ---- a/cpp/src/arrow/vendored/datetime/date.h -+++ b/cpp/src/arrow/vendored/datetime/date.h -@@ -5152,7 +5152,7 @@ to_stream(std::basic_ostream& os, const CharT* fmt, - if (modified == CharT{}) - #endif - { -- auto h = *fmt == CharT{'I'} ? make12(hms.hours()) : hms.hours(); -+ auto h = *fmt == CharT{'I'} ? 
arrow_vendored::date::make12(hms.hours()) : hms.hours(); - if (h < hours{10}) - os << CharT{'0'}; - os << h.count(); -@@ -5366,7 +5366,7 @@ to_stream(std::basic_ostream& os, const CharT* fmt, - save_ostream _(os); - os.fill('0'); - os.width(2); -- os << make12(tod.hours()).count() << CharT{':'}; -+ os << arrow_vendored::date::make12(tod.hours()).count() << CharT{':'}; - os.width(2); - os << tod.minutes().count() << CharT{':'}; - os.width(2); diff --git a/ci/conan/all/patches/1.0.0-0006-fix-cmake.patch b/ci/conan/all/patches/1.0.0-0006-fix-cmake.patch deleted file mode 100644 index 3ecd0bf9f3968..0000000000000 --- a/ci/conan/all/patches/1.0.0-0006-fix-cmake.patch +++ /dev/null @@ -1,355 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index 300f043..0127a7a 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -654,7 +654,7 @@ endif() - - if(ARROW_WITH_BROTLI) - # Order is important for static linking -- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) -+ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) - list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) - list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) - endif() -@@ -664,7 +664,7 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) -+ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_static) - endif() - - if(ARROW_WITH_SNAPPY) -@@ -800,8 +800,11 @@ endif() - - if(ARROW_MIMALLOC) - add_definitions(-DARROW_MIMALLOC) -- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) -+ if (TARGET mimalloc-static) -+ list(APPEND ARROW_LINK_LIBS mimalloc-static) -+ else() -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) -+ endif() - endif() - - # ---------------------------------------------------------------------- -diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake -index eb10ebe..9c81017 100644 ---- a/cpp/cmake_modules/BuildUtils.cmake -+++ b/cpp/cmake_modules/BuildUtils.cmake -@@ -165,10 +165,10 @@ function(create_merged_static_lib output_target) - set(ar_script_path ${CMAKE_BINARY_DIR}/${ARG_NAME}.ar) - - file(WRITE ${ar_script_path}.in "CREATE ${output_lib_path}\n") -- file(APPEND ${ar_script_path}.in "ADDLIB $\n") -+ file(APPEND ${ar_script_path}.in "ADDLIB $\n") - - foreach(lib ${ARG_TO_MERGE}) -- file(APPEND ${ar_script_path}.in "ADDLIB $\n") -+ file(APPEND ${ar_script_path}.in "ADDLIB $\n") - endforeach() - - file(APPEND ${ar_script_path}.in "SAVE\nEND\n") -diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake -index 807e2b9..016c8db 100644 ---- a/cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -154,16 +154,7 @@ macro(build_dependency DEPENDENCY_NAME) - endmacro() - - macro(resolve_dependency DEPENDENCY_NAME) -- if(${DEPENDENCY_NAME}_SOURCE STREQUAL "AUTO") -- find_package(${DEPENDENCY_NAME} MODULE) -- if(NOT ${${DEPENDENCY_NAME}_FOUND}) -- build_dependency(${DEPENDENCY_NAME}) -- endif() -- elseif(${DEPENDENCY_NAME}_SOURCE STREQUAL "BUNDLED") -- build_dependency(${DEPENDENCY_NAME}) -- elseif(${DEPENDENCY_NAME}_SOURCE STREQUAL "SYSTEM") -- find_package(${DEPENDENCY_NAME} REQUIRED) -- endif() -+ find_package(${DEPENDENCY_NAME} REQUIRED) - endmacro() - - macro(resolve_dependency_with_version DEPENDENCY_NAME REQUIRED_VERSION) -@@ -765,6 +756,7 @@ endif() - # - Tests need Boost at runtime. - # - S3FS and Flight benchmarks need Boost at runtime. 
- if(ARROW_BUILD_INTEGRATION -+ OR ARROW_BOOST_REQUIRED - OR ARROW_BUILD_TESTS - OR ARROW_GANDIVA - OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) -@@ -785,7 +777,7 @@ if(ARROW_BOOST_REQUIRED) - elseif(BOOST_SOURCE STREQUAL "BUNDLED") - build_boost() - elseif(BOOST_SOURCE STREQUAL "SYSTEM") -- find_package(BoostAlt ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) -+ find_package(Boost ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) - endif() - - if(TARGET Boost::system) -@@ -936,11 +928,11 @@ macro(build_brotli) - endmacro() - - if(ARROW_WITH_BROTLI) -- resolve_dependency(Brotli) -+ resolve_dependency(brotli) - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon -+ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon - INTERFACE_INCLUDE_DIRECTORIES) -- include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) -+ include_directories(SYSTEM ${brotli_INCLUDE_DIR}) - endif() - - if(PARQUET_REQUIRE_ENCRYPTION AND NOT ARROW_PARQUET) -@@ -1146,9 +1138,10 @@ if(ARROW_NEED_GFLAGS) - endif() - endif() - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) -+ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) -+ set(GFLAGS_LIBRARIES ${gflags_LIBRARIES}) - -- if(NOT TARGET ${GFLAGS_LIBRARIES}) -+ if(0) - if(TARGET gflags-shared) - set(GFLAGS_LIBRARIES gflags-shared) - elseif(TARGET gflags_shared) -@@ -1237,12 +1230,13 @@ endmacro() - if(ARROW_WITH_THRIFT) - # We already may have looked for Thrift earlier, when considering whether - # to build Boost, so don't look again if already found. -- if(NOT Thrift_FOUND AND NOT THRIFT_FOUND) -+ if(0) - # Thrift c++ code generated by 0.13 requires 0.11 or greater - resolve_dependency_with_version(Thrift 0.11.0) - endif() -+ find_package(Thrift CONFIG REQUIRED) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) -+ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) - endif() - - # ---------------------------------------------------------------------- -@@ -1407,6 +1401,7 @@ endif() - # jemalloc - Unix-only high-performance allocator - - if(ARROW_JEMALLOC) -+if(0) - message(STATUS "Building (vendored) jemalloc from source") - # We only use a vendored jemalloc as we want to control its version. - # Also our build of jemalloc is specially prefixed so that it will not -@@ -1465,12 +1460,18 @@ if(ARROW_JEMALLOC) - add_dependencies(jemalloc::jemalloc jemalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) -+else() -+ find_package(jemalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- - # mimalloc - Cross-platform high-performance allocator, from Microsoft - - if(ARROW_MIMALLOC) -+if(0) - message(STATUS "Building (vendored) mimalloc from source") - # We only use a vendored mimalloc as we want to control its build options. 
- -@@ -1518,6 +1519,11 @@ if(ARROW_MIMALLOC) - add_dependencies(toolchain mimalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) -+else() -+ find_package(mimalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -1918,11 +1924,16 @@ macro(build_lz4) - endmacro() - - if(ARROW_WITH_LZ4) -- resolve_dependency(Lz4) -+ resolve_dependency(lz4) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) -- include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) -+ if(TARGET LZ4::lz4_static) -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) -+ else() -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) -+ endif() -+ include_directories(SYSTEM ${lz4_INCLUDE_DIR}) -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${lz4_LIBRARIES_TARGETS} ) - endif() - - macro(build_zstd) -@@ -2037,10 +2048,10 @@ macro(build_re2) - endmacro() - - if(ARROW_GANDIVA) -- resolve_dependency(RE2) -+ resolve_dependency(re2) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(RE2_INCLUDE_DIR RE2::re2 INTERFACE_INCLUDE_DIRECTORIES) -+ get_target_property(RE2_INCLUDE_DIR re2::re2 INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${RE2_INCLUDE_DIR}) - endif() - -@@ -2480,17 +2491,24 @@ if(ARROW_WITH_GRPC) - endif() - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(GRPC_INCLUDE_DIR gRPC::grpc INTERFACE_INCLUDE_DIRECTORIES) -+ # get_target_property(GRPC_INCLUDE_DIR gRPC::grpc INTERFACE_INCLUDE_DIRECTORIES) -+ if(grpc_INCLUDE_DIRS_RELEASE) -+ set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_RELEASE}) -+ elseif(grpc_INCLUDE_DIRS_DEBUG) -+ set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_DEBUG}) -+ endif() - include_directories(SYSTEM ${GRPC_INCLUDE_DIR}) -+ include_directories(SYSTEM ${absl_INCLUDE_DIR}) -+ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) - - if(GRPC_VENDORED) - set(GRPCPP_PP_INCLUDE TRUE) - else() - # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp - # depending on the gRPC version. 
-- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") -+ if(EXISTS ${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE TRUE) -- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") -+ elseif(EXISTS ${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE FALSE) - else() - message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index 5797a78..da6bd4d 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -292,10 +292,15 @@ set(ARROW_TESTING_SRCS - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() -+ - if(ARROW_MIMALLOC) -- list(APPEND _allocator_dependencies mimalloc_ep) -+ if (TARGET mimalloc-static) -+ list(APPEND _allocator_dependencies mimalloc-static) -+ else() -+ list(APPEND _allocator_dependencies mimalloc) -+ endif() - endif() - - if(_allocator_dependencies) -diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc -index 784bf7b..8f005a5 100644 ---- a/cpp/src/arrow/memory_pool.cc -+++ b/cpp/src/arrow/memory_pool.cc -@@ -31,7 +31,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC -diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt -index 85e8db6..cd70c63 100644 ---- a/cpp/src/gandiva/CMakeLists.txt -+++ b/cpp/src/gandiva/CMakeLists.txt -@@ -25,7 +25,7 @@ add_custom_target(gandiva-benchmarks) - - add_dependencies(gandiva-all gandiva gandiva-tests gandiva-benchmarks) - --find_package(LLVMAlt REQUIRED) -+find_package(LLVM REQUIRED) - - if(LLVM_VERSION_MAJOR LESS "10") - set(GANDIVA_CXX_STANDARD ${CMAKE_CXX_STANDARD}) -@@ -88,9 +88,16 @@ set(SRC_FILES - random_generator_holder.cc - ${GANDIVA_PRECOMPILED_CC_PATH}) - --set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared LLVM::LLVM_INTERFACE RE2::re2) - --set(GANDIVA_STATIC_LINK_LIBS arrow_static LLVM::LLVM_INTERFACE RE2::re2) -+ function(get_all_targets var) -+ set(targets) -+ get_all_targets_recursive(targets ${CMAKE_CURRENT_SOURCE_DIR}) -+ set(${var} ${targets} PARENT_SCOPE) -+endfunction() -+ -+set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared llvm-core::llvm-core re2::re2) -+ -+set(GANDIVA_STATIC_LINK_LIBS arrow_static llvm-core::llvm-core re2::re2) - - if(ARROW_GANDIVA_STATIC_LIBSTDCPP - AND (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)) -@@ -131,7 +138,7 @@ add_arrow_lib(gandiva - arrow_dependencies - precompiled - EXTRA_INCLUDES -- $ -+ $ - SHARED_LINK_FLAGS - ${GANDIVA_SHARED_LINK_FLAGS} - SHARED_LINK_LIBS -@@ -203,7 +210,7 @@ endfunction() - - set(GANDIVA_INTERNALS_TEST_ARGUMENTS) - if(WIN32) -- list(APPEND GANDIVA_INTERNALS_TEST_ARGUMENTS EXTRA_LINK_LIBS LLVM::LLVM_INTERFACE) -+ list(APPEND GANDIVA_INTERNALS_TEST_ARGUMENTS EXTRA_LINK_LIBS llvm-core::llvm-core) - endif() - add_gandiva_test(internals-test - SOURCES -@@ -225,9 +232,9 @@ add_gandiva_test(internals-test - decimal_type_util_test.cc - random_generator_holder_test.cc - EXTRA_DEPENDENCIES -- LLVM::LLVM_INTERFACE -+ llvm-core::llvm-core - EXTRA_INCLUDES -- $ -+ $ - ${GANDIVA_INTERNALS_TEST_ARGUMENTS}) - - if(ARROW_GANDIVA_JAVA) diff --git 
a/ci/conan/all/patches/11.0.0-0001-fix-cmake.patch b/ci/conan/all/patches/11.0.0-0001-fix-cmake.patch new file mode 100644 index 0000000000000..37f36f99a0c33 --- /dev/null +++ b/ci/conan/all/patches/11.0.0-0001-fix-cmake.patch @@ -0,0 +1,64 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +diff --git a/cpp/cmake_modules/FindThriftAlt.cmake b/cpp/cmake_modules/FindThriftAlt.cmake +index f3e49021d..95177c2a6 100644 +--- a/cpp/cmake_modules/FindThriftAlt.cmake ++++ b/cpp/cmake_modules/FindThriftAlt.cmake +@@ -45,22 +45,21 @@ endif() + # * https://github.com/apache/thrift/pull/2725 + # * https://github.com/apache/thrift/pull/2726 + # * https://github.com/conda-forge/thrift-cpp-feedstock/issues/68 +-if(NOT WIN32) +- set(find_package_args "") +- if(ThriftAlt_FIND_VERSION) +- list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) +- endif() +- if(ThriftAlt_FIND_QUIETLY) +- list(APPEND find_package_args QUIET) +- endif() +- find_package(Thrift ${find_package_args}) +- if(Thrift_FOUND) +- set(ThriftAlt_FOUND TRUE) +- add_executable(thrift::compiler IMPORTED) +- set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION +- "${THRIFT_COMPILER}") +- return() +- endif() ++ ++set(find_package_args "") ++if(ThriftAlt_FIND_VERSION) ++ list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) ++endif() ++if(ThriftAlt_FIND_QUIETLY) ++ list(APPEND find_package_args QUIET) ++endif() ++find_package(Thrift ${find_package_args}) ++if(Thrift_FOUND) ++ set(ThriftAlt_FOUND TRUE) ++ add_executable(thrift::compiler IMPORTED) ++ set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION ++ "${THRIFT_COMPILER}") ++ return() + endif() + + function(extract_thrift_version) diff --git a/ci/conan/all/patches/16.0.0-0001-fix-cmake.patch b/ci/conan/all/patches/16.0.0-0001-fix-cmake.patch new file mode 100644 index 0000000000000..6077237139d49 --- /dev/null +++ b/ci/conan/all/patches/16.0.0-0001-fix-cmake.patch @@ -0,0 +1,84 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +diff --git a/cpp/cmake_modules/FindThriftAlt.cmake b/cpp/cmake_modules/FindThriftAlt.cmake +index f3e49021d..3e63f1edf 100644 +--- a/cpp/cmake_modules/FindThriftAlt.cmake ++++ b/cpp/cmake_modules/FindThriftAlt.cmake +@@ -45,23 +45,23 @@ endif() + # * https://github.com/apache/thrift/pull/2725 + # * https://github.com/apache/thrift/pull/2726 + # * https://github.com/conda-forge/thrift-cpp-feedstock/issues/68 +-if(NOT WIN32) +- set(find_package_args "") +- if(ThriftAlt_FIND_VERSION) +- list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) +- endif() +- if(ThriftAlt_FIND_QUIETLY) +- list(APPEND find_package_args QUIET) +- endif() +- find_package(Thrift ${find_package_args}) +- if(Thrift_FOUND) +- set(ThriftAlt_FOUND TRUE) +- add_executable(thrift::compiler IMPORTED) +- set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION +- "${THRIFT_COMPILER}") +- return() +- endif() ++ ++set(find_package_args "") ++if(ThriftAlt_FIND_VERSION) ++ list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) ++endif() ++if(ThriftAlt_FIND_QUIETLY) ++ list(APPEND find_package_args QUIET) + endif() ++find_package(Thrift ${find_package_args}) ++if(Thrift_FOUND) ++ set(ThriftAlt_FOUND TRUE) ++ add_executable(thrift::compiler IMPORTED) ++ set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION ++ "${THRIFT_COMPILER}") ++ return() ++endif() ++ + + function(extract_thrift_version) + if(ThriftAlt_INCLUDE_DIR) +diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt +index 93f2e72d8..e00f73f7d 100644 +--- a/cpp/src/parquet/CMakeLists.txt ++++ b/cpp/src/parquet/CMakeLists.txt +@@ -262,11 +262,11 @@ if(NOT PARQUET_MINIMAL_DEPENDENCY) + + # These are libraries that we will link privately with parquet_shared (as they + # do not need to be linked transitively by other linkers) +- list(APPEND PARQUET_SHARED_PRIVATE_LINK_LIBS thrift::thrift) ++ list(APPEND PARQUET_SHARED_PRIVATE_LINK_LIBS Boost::headers thrift::thrift) + + # Link publicly with parquet_static (because internal users need to + # transitively link all dependencies) +- list(APPEND PARQUET_STATIC_LINK_LIBS thrift::thrift) ++ list(APPEND PARQUET_STATIC_LINK_LIBS Boost::headers thrift::thrift) + if(NOT THRIFT_VENDORED) + list(APPEND PARQUET_STATIC_INSTALL_INTERFACE_LIBS thrift::thrift) + endif() diff --git a/ci/conan/all/patches/18.0.0-0001-fix-cmake.patch b/ci/conan/all/patches/18.0.0-0001-fix-cmake.patch new file mode 100644 index 0000000000000..9abff332e4b6d --- /dev/null +++ b/ci/conan/all/patches/18.0.0-0001-fix-cmake.patch @@ -0,0 +1,81 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following 
conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +diff --git a/cpp/cmake_modules/FindThriftAlt.cmake b/cpp/cmake_modules/FindThriftAlt.cmake +index 98a706d..edf195e 100644 +--- a/cpp/cmake_modules/FindThriftAlt.cmake ++++ b/cpp/cmake_modules/FindThriftAlt.cmake +@@ -45,22 +45,20 @@ endif() + # * https://github.com/apache/thrift/pull/2725 + # * https://github.com/apache/thrift/pull/2726 + # * https://github.com/conda-forge/thrift-cpp-feedstock/issues/68 +-if(NOT WIN32) +- set(find_package_args "") +- if(ThriftAlt_FIND_VERSION) +- list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) +- endif() +- if(ThriftAlt_FIND_QUIETLY) +- list(APPEND find_package_args QUIET) +- endif() +- find_package(Thrift ${find_package_args}) +- if(Thrift_FOUND) +- set(ThriftAlt_FOUND TRUE) +- add_executable(thrift::compiler IMPORTED) +- set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION +- "${THRIFT_COMPILER}") +- return() +- endif() ++set(find_package_args "") ++if(ThriftAlt_FIND_VERSION) ++ list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) ++endif() ++if(ThriftAlt_FIND_QUIETLY) ++ list(APPEND find_package_args QUIET) ++endif() ++find_package(Thrift ${find_package_args}) ++if(Thrift_FOUND) ++ set(ThriftAlt_FOUND TRUE) ++ add_executable(thrift::compiler IMPORTED) ++ set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION ++ "${THRIFT_COMPILER}") ++ return() + endif() + + function(extract_thrift_version) +diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt +index b984ef7..429fc6d 100644 +--- a/cpp/src/parquet/CMakeLists.txt ++++ b/cpp/src/parquet/CMakeLists.txt +@@ -263,11 +263,11 @@ if(NOT PARQUET_MINIMAL_DEPENDENCY) + + # These are libraries that we will link privately with parquet_shared (as they + # do not need to be linked transitively by other linkers) +- list(APPEND PARQUET_SHARED_PRIVATE_LINK_LIBS thrift::thrift) ++ list(APPEND PARQUET_SHARED_PRIVATE_LINK_LIBS Boost::headers thrift::thrift) + + # Link publicly with parquet_static (because internal users need to + # transitively link all dependencies) +- list(APPEND PARQUET_STATIC_LINK_LIBS thrift::thrift) ++ list(APPEND PARQUET_STATIC_LINK_LIBS Boost::headers thrift::thrift) + if(NOT THRIFT_VENDORED) + list(APPEND PARQUET_STATIC_INSTALL_INTERFACE_LIBS thrift::thrift) + endif() diff --git a/ci/conan/all/patches/2.0.0-0003-fix-shared-msvc.patch b/ci/conan/all/patches/2.0.0-0003-fix-shared-msvc.patch deleted file mode 100644 index 3583e5c221707..0000000000000 --- a/ci/conan/all/patches/2.0.0-0003-fix-shared-msvc.patch +++ /dev/null @@ -1,35 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons 
to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - ---- cpp/src/arrow/CMakeLists.txt -+++ cpp/src/arrow/CMakeLists.txt -@@ -504,6 +504,10 @@ - target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) - endif() - -+if(ARROW_BUILD_SHARED AND WIN32) -+target_compile_definitions(arrow_shared PRIVATE ARROW_EXPORTING) -+endif() -+ - if(ARROW_WITH_BACKTRACE) - find_package(Backtrace) - diff --git a/ci/conan/all/patches/2.0.0-0005-gandiva-engine.patch b/ci/conan/all/patches/2.0.0-0005-gandiva-engine.patch deleted file mode 100644 index 6dc0c7947a5e0..0000000000000 --- a/ci/conan/all/patches/2.0.0-0005-gandiva-engine.patch +++ /dev/null @@ -1,35 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - ---- cpp/src/gandiva/engine.cc -+++ cpp/src/gandiva/engine.cc -@@ -64,6 +64,10 @@ - #include - #include - -+#if GANDIVA_LLVM_VERSION >= 11 -+#include -+#endif -+ - #if defined(_MSC_VER) - #pragma warning(pop) - #endif diff --git a/ci/conan/all/patches/2.0.0-0008-fix-cmake.patch b/ci/conan/all/patches/2.0.0-0008-fix-cmake.patch deleted file mode 100644 index abdcf7a0fa36a..0000000000000 --- a/ci/conan/all/patches/2.0.0-0008-fix-cmake.patch +++ /dev/null @@ -1,295 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index 515e6af..7488161 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -109,7 +109,7 @@ set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") - set(ARROW_CMAKE_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") - set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}") - --set(ARROW_LLVM_VERSIONS "10" "9" "8" "7") -+set(ARROW_LLVM_VERSIONS "13" "12" "11" "10" "9" "8" "7") - list(GET ARROW_LLVM_VERSIONS 0 ARROW_LLVM_VERSION_PRIMARY) - string(REGEX - REPLACE "^([0-9]+)(\\..+)?" "\\1" ARROW_LLVM_VERSION_PRIMARY_MAJOR -@@ -667,7 +667,7 @@ endif() - - if(ARROW_WITH_BROTLI) - # Order is important for static linking -- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) -+ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) - list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) - list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) - if(Brotli_SOURCE STREQUAL "SYSTEM") -@@ -683,9 +683,9 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) -+ list(APPEND ARROW_STATIC_LINK_LIBS lz4::lz4) - if(Lz4_SOURCE STREQUAL "SYSTEM") -- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) -+ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS lz4::lz4) - endif() - endif() - -@@ -842,8 +842,14 @@ endif() - - if(ARROW_MIMALLOC) - add_definitions(-DARROW_MIMALLOC) -- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) -+ if (TARGET mimalloc-static) -+ list(APPEND ARROW_LINK_LIBS mimalloc-static) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc-static) -+ else() -+ list(APPEND ARROW_LINK_LIBS mimalloc) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) -+ endif() -+ - endif() - - # ---------------------------------------------------------------------- -diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake -index cc37a3c..8fe6db9 100644 ---- a/cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -171,6 +171,7 @@ macro(provide_find_module DEPENDENCY_NAME) - endmacro() - - macro(resolve_dependency DEPENDENCY_NAME) -+if(0) - set(options) - set(one_value_args REQUIRED_VERSION) - cmake_parse_arguments(ARG -@@ -207,6 +208,14 @@ macro(resolve_dependency DEPENDENCY_NAME) - provide_find_module(${DEPENDENCY_NAME}) - list(APPEND ARROW_SYSTEM_DEPENDENCIES ${DEPENDENCY_NAME}) - endif() -+else() -+ if(ARG_REQUIRED_VERSION) -+ find_package(${DEPENDENCY_NAME} ${ARG_REQUIRED_VERSION} REQUIRED) -+ else() -+ find_package(${DEPENDENCY_NAME} REQUIRED) -+ endif() -+ list(APPEND ARROW_SYSTEM_DEPENDENCIES ${DEPENDENCY_NAME}) -+endif() - endmacro() - - # ---------------------------------------------------------------------- -@@ -826,6 +835,7 @@ endif() - # - Tests need Boost at runtime. - # - S3FS and Flight benchmarks need Boost at runtime. 
- if(ARROW_BUILD_INTEGRATION -+ OR ARROW_BOOST_REQUIRED - OR ARROW_BUILD_TESTS - OR ARROW_GANDIVA - OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) -@@ -846,7 +856,7 @@ if(ARROW_BOOST_REQUIRED) - elseif(BOOST_SOURCE STREQUAL "BUNDLED") - build_boost() - elseif(BOOST_SOURCE STREQUAL "SYSTEM") -- find_package(BoostAlt ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) -+ find_package(Boost ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) - endif() - - if(TARGET Boost::system) -@@ -973,11 +983,11 @@ macro(build_brotli) - endmacro() - - if(ARROW_WITH_BROTLI) -- resolve_dependency(Brotli) -+ resolve_dependency(brotli) - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon -+ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon - INTERFACE_INCLUDE_DIRECTORIES) -- include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) -+ include_directories(SYSTEM ${brotli_INCLUDE_DIR}) - endif() - - if(PARQUET_REQUIRE_ENCRYPTION AND NOT ARROW_PARQUET) -@@ -1200,9 +1210,10 @@ if(ARROW_NEED_GFLAGS) - endif() - endif() - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) -+ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) -+ set(GFLAGS_LIBRARIES ${gflags_LIBRARIES}) - -- if(NOT TARGET ${GFLAGS_LIBRARIES}) -+ if(0) - if(TARGET gflags-shared) - set(GFLAGS_LIBRARIES gflags-shared) - elseif(TARGET gflags_shared) -@@ -1291,12 +1302,13 @@ endmacro() - if(ARROW_WITH_THRIFT) - # We already may have looked for Thrift earlier, when considering whether - # to build Boost, so don't look again if already found. -- if(NOT Thrift_FOUND AND NOT THRIFT_FOUND) -+ if(0) - # Thrift c++ code generated by 0.13 requires 0.11 or greater - resolve_dependency(Thrift REQUIRED_VERSION 0.11.0) - endif() -+ find_package(Thrift CONFIG REQUIRED) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) -+ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) - endif() - - # ---------------------------------------------------------------------- -@@ -1461,6 +1473,7 @@ endif() - # jemalloc - Unix-only high-performance allocator - - if(ARROW_JEMALLOC) -+if(0) - message(STATUS "Building (vendored) jemalloc from source") - # We only use a vendored jemalloc as we want to control its version. - # Also our build of jemalloc is specially prefixed so that it will not -@@ -1519,12 +1532,18 @@ if(ARROW_JEMALLOC) - add_dependencies(jemalloc::jemalloc jemalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) -+else() -+ find_package(jemalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- - # mimalloc - Cross-platform high-performance allocator, from Microsoft - - if(ARROW_MIMALLOC) -+if(0) - message(STATUS "Building (vendored) mimalloc from source") - # We only use a vendored mimalloc as we want to control its build options. 
- -@@ -1572,6 +1591,11 @@ if(ARROW_MIMALLOC) - add_dependencies(toolchain mimalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) -+else() -+ find_package(mimalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -1971,11 +1995,16 @@ macro(build_lz4) - endmacro() - - if(ARROW_WITH_LZ4) -- resolve_dependency(Lz4) -+ resolve_dependency(lz4) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) -- include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) -+ if(TARGET LZ4::lz4_static) -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) -+ else() -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) -+ endif() -+ include_directories(SYSTEM ${lz4_INCLUDE_DIR}) -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${lz4_LIBRARIES_TARGETS} ) - endif() - - macro(build_zstd) -@@ -2090,10 +2119,10 @@ macro(build_re2) - endmacro() - - if(ARROW_GANDIVA) -- resolve_dependency(RE2) -+ resolve_dependency(re2) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(RE2_INCLUDE_DIR RE2::re2 INTERFACE_INCLUDE_DIRECTORIES) -+ get_target_property(RE2_INCLUDE_DIR re2::re2 INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${RE2_INCLUDE_DIR}) - endif() - -@@ -2541,17 +2570,24 @@ if(ARROW_WITH_GRPC) - endif() - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(GRPC_INCLUDE_DIR gRPC::grpc INTERFACE_INCLUDE_DIRECTORIES) -+ if(grpc_INCLUDE_DIRS_RELEASE) -+ set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_RELEASE}) -+ elseif(grpc_INCLUDE_DIRS_DEBUG) -+ set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_DEBUG}) -+ endif() -+ - include_directories(SYSTEM ${GRPC_INCLUDE_DIR}) -+ include_directories(SYSTEM ${absl_INCLUDE_DIR}) -+ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) - - if(GRPC_VENDORED) - set(GRPCPP_PP_INCLUDE TRUE) - else() - # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp - # depending on the gRPC version. 
-- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") -+ if(EXISTS ${gRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE TRUE) -- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") -+ elseif(EXISTS ${gRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE FALSE) - else() - message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index 2751254..842fc9e 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -307,10 +307,14 @@ set(ARROW_TESTING_SRCS - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() - if(ARROW_MIMALLOC) -- list(APPEND _allocator_dependencies mimalloc_ep) -+ if (TARGET mimalloc-static) -+ list(APPEND _allocator_dependencies mimalloc-static) -+ else() -+ list(APPEND _allocator_dependencies mimalloc) -+ endif() - endif() - - if(_allocator_dependencies) -diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc -index 784bf7b..8f005a5 100644 ---- a/cpp/src/arrow/memory_pool.cc -+++ b/cpp/src/arrow/memory_pool.cc -@@ -31,7 +31,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC diff --git a/ci/conan/all/patches/7.0.0-0006-install-utils.patch b/ci/conan/all/patches/7.0.0-0006-install-utils.patch deleted file mode 100644 index 7674174c8e254..0000000000000 --- a/ci/conan/all/patches/7.0.0-0006-install-utils.patch +++ /dev/null @@ -1,39 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt -index 495018e..f6cee6f 100644 ---- a/cpp/src/arrow/ipc/CMakeLists.txt -+++ b/cpp/src/arrow/ipc/CMakeLists.txt -@@ -61,8 +61,12 @@ endif() - if(ARROW_BUILD_UTILITIES OR ARROW_BUILD_INTEGRATION) - add_executable(arrow-file-to-stream file_to_stream.cc) - target_link_libraries(arrow-file-to-stream ${ARROW_UTIL_LIB}) -+ install(TARGETS arrow-file-to-stream ${INSTALL_IS_OPTIONAL} -+ DESTINATION ${CMAKE_INSTALL_BINDIR}) - add_executable(arrow-stream-to-file stream_to_file.cc) - target_link_libraries(arrow-stream-to-file ${ARROW_UTIL_LIB}) -+ install(TARGETS arrow-stream-to-file ${INSTALL_IS_OPTIONAL} -+ DESTINATION ${CMAKE_INSTALL_BINDIR}) - - if(ARROW_BUILD_INTEGRATION) - add_dependencies(arrow-integration arrow-file-to-stream) diff --git a/ci/conan/all/patches/7.0.0-0007-fix-cmake.patch b/ci/conan/all/patches/7.0.0-0007-fix-cmake.patch deleted file mode 100644 index eb2acb1523fc3..0000000000000 --- a/ci/conan/all/patches/7.0.0-0007-fix-cmake.patch +++ /dev/null @@ -1,369 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
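The install-utils patches above wire the arrow-file-to-stream and arrow-stream-to-file helpers into the install step so that packaged builds can ship them. As a rough sketch of the idiom outside the patch context (assuming, as the hunks do, that ${INSTALL_IS_OPTIONAL} expands to the OPTIONAL keyword or to nothing, so a utility that was not built does not fail the install):

# Sketch only: mirrors the install rule added in the hunks above.
# INSTALL_IS_OPTIONAL and ARROW_UTIL_LIB are assumed to be set by the
# surrounding Arrow build; they are not defined authoritatively here.
include(GNUInstallDirs)  # provides CMAKE_INSTALL_BINDIR
add_executable(arrow-file-to-stream file_to_stream.cc)
target_link_libraries(arrow-file-to-stream ${ARROW_UTIL_LIB})
install(TARGETS arrow-file-to-stream ${INSTALL_IS_OPTIONAL}
        DESTINATION ${CMAKE_INSTALL_BINDIR})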
- -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index 2d7baf1..dff5b1a 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -699,7 +699,7 @@ endif() - - if(ARROW_WITH_BROTLI) - # Order is important for static linking -- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) -+ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) - list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) - list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) - if(Brotli_SOURCE STREQUAL "SYSTEM") -@@ -715,10 +715,17 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) -- if(Lz4_SOURCE STREQUAL "SYSTEM") -- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) -- endif() -+ if (TARGET LZ4::lz4_static) -+ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_static) -+ if(Lz4_SOURCE STREQUAL "SYSTEM") -+ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_static) -+ endif() -+ else() -+ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_shared) -+ if(Lz4_SOURCE STREQUAL "SYSTEM") -+ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_shared) -+ endif() -+endif() - endif() - - if(ARROW_WITH_SNAPPY) -@@ -907,8 +914,13 @@ endif() - - if(ARROW_MIMALLOC) - add_definitions(-DARROW_MIMALLOC) -- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) -+ if (TARGET mimalloc-static) -+ list(APPEND ARROW_LINK_LIBS mimalloc-static) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc-static) -+ else() -+ list(APPEND ARROW_LINK_LIBS mimalloc) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) -+ endif() - endif() - - # ---------------------------------------------------------------------- -diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake -index bc38952..62bf314 100644 ---- a/cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -954,7 +954,7 @@ endif() - - if(ARROW_BOOST_REQUIRED) - resolve_dependency(Boost -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_BOOST_REQUIRED_VERSION} -@@ -965,7 +965,7 @@ if(ARROW_BOOST_REQUIRED) - if(TARGET Boost::system) - set(BOOST_SYSTEM_LIBRARY Boost::system) - set(BOOST_FILESYSTEM_LIBRARY Boost::filesystem) -- elseif(BoostAlt_FOUND) -+ elseif(Boost_FOUND) - set(BOOST_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY}) - set(BOOST_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY}) - else() -@@ -1108,9 +1108,9 @@ macro(build_brotli) - endmacro() - - if(ARROW_WITH_BROTLI) -- resolve_dependency(Brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) -+ resolve_dependency(brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon -+ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) - endif() -@@ -1302,22 +1302,17 @@ endmacro() - if(ARROW_NEED_GFLAGS) - set(ARROW_GFLAGS_REQUIRED_VERSION "2.1.0") - resolve_dependency(gflags -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_GFLAGS_REQUIRED_VERSION} - IS_RUNTIME_DEPENDENCY - FALSE) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) -+ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) - -- if(NOT TARGET ${GFLAGS_LIBRARIES}) -- if(TARGET gflags-shared) -- set(GFLAGS_LIBRARIES gflags-shared) -- 
elseif(TARGET gflags_shared) -- set(GFLAGS_LIBRARIES gflags_shared) -- endif() -- endif() -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${gflags_LIBRARIES_TARGETS}) -+ set(GFLAGS_LIBRARIES gflags::gflags) - endif() - - # ---------------------------------------------------------------------- -@@ -1411,9 +1406,9 @@ if(ARROW_WITH_THRIFT) - thrift) - endif() - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) -+ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) - -- string(REPLACE "." ";" VERSION_LIST ${THRIFT_VERSION}) -+ string(REPLACE "." ";" VERSION_LIST ${Thrift_VERSION}) - list(GET VERSION_LIST 0 THRIFT_VERSION_MAJOR) - list(GET VERSION_LIST 1 THRIFT_VERSION_MINOR) - list(GET VERSION_LIST 2 THRIFT_VERSION_PATCH) -@@ -1528,6 +1523,7 @@ if(ARROW_WITH_PROTOBUF) - set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1") - endif() - resolve_dependency(Protobuf -+ USE_CONFIG - REQUIRED_VERSION - ${ARROW_PROTOBUF_REQUIRED_VERSION} - PC_PACKAGE_NAMES -@@ -1538,7 +1534,7 @@ if(ARROW_WITH_PROTOBUF) - endif() - - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${PROTOBUF_INCLUDE_DIR}) -+ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) - - if(TARGET arrow::protobuf::libprotobuf) - set(ARROW_PROTOBUF_LIBPROTOBUF arrow::protobuf::libprotobuf) -@@ -1547,9 +1543,9 @@ if(ARROW_WITH_PROTOBUF) - if(NOT TARGET protobuf::libprotobuf) - add_library(protobuf::libprotobuf UNKNOWN IMPORTED) - set_target_properties(protobuf::libprotobuf -- PROPERTIES IMPORTED_LOCATION "${PROTOBUF_LIBRARY}" -+ PROPERTIES IMPORTED_LOCATION "${Protobuf_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES -- "${PROTOBUF_INCLUDE_DIR}") -+ "${Protobuf_INCLUDE_DIR}") - endif() - set(ARROW_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) - endif() -@@ -1569,7 +1565,7 @@ if(ARROW_WITH_PROTOBUF) - set_target_properties(protobuf::libprotoc - PROPERTIES IMPORTED_LOCATION "${Protobuf_PROTOC_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES -- "${PROTOBUF_INCLUDE_DIR}") -+ "${Protobuf_INCLUDE_DIR}") - endif() - set(ARROW_PROTOBUF_LIBPROTOC protobuf::libprotoc) - endif() -@@ -1600,6 +1596,7 @@ endif() - # jemalloc - Unix-only high-performance allocator - - if(ARROW_JEMALLOC) -+if(0) - message(STATUS "Building (vendored) jemalloc from source") - # We only use a vendored jemalloc as we want to control its version. - # Also our build of jemalloc is specially prefixed so that it will not -@@ -1665,12 +1662,18 @@ if(ARROW_JEMALLOC) - add_dependencies(jemalloc::jemalloc jemalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) -+else() -+ find_package(jemalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS}) -+endif() - endif() - - # ---------------------------------------------------------------------- - # mimalloc - Cross-platform high-performance allocator, from Microsoft - - if(ARROW_MIMALLOC) -+if(0) - message(STATUS "Building (vendored) mimalloc from source") - # We only use a vendored mimalloc as we want to control its build options. 
- -@@ -1716,6 +1719,11 @@ if(ARROW_MIMALLOC) - add_dependencies(toolchain mimalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) -+else() -+ find_package(mimalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -2001,7 +2009,7 @@ endmacro() - if(ARROW_WITH_RAPIDJSON) - set(ARROW_RAPIDJSON_REQUIRED_VERSION "1.1.0") - resolve_dependency(RapidJSON -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_RAPIDJSON_REQUIRED_VERSION} -@@ -2038,10 +2046,9 @@ endmacro() - - if((NOT ARROW_SIMD_LEVEL STREQUAL "NONE") OR (NOT ARROW_RUNTIME_SIMD_LEVEL STREQUAL "NONE" - )) -- set(xsimd_SOURCE "BUNDLED") - resolve_dependency(xsimd) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${XSIMD_INCLUDE_DIR}) -+ include_directories(SYSTEM ${xsimd_INCLUDE_DIR}) - endif() - - macro(build_zlib) -@@ -2140,10 +2147,14 @@ macro(build_lz4) - endmacro() - - if(ARROW_WITH_LZ4) -- resolve_dependency(Lz4 PC_PACKAGE_NAMES liblz4) -+ resolve_dependency(lz4) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) -+ if (TARGET LZ4::lz4_static) -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) -+ else() -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) -+ endif() - include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) - endif() - -@@ -2274,7 +2285,7 @@ if(ARROW_WITH_RE2) - # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may - # include -std=c++11. It's not compatible with C source and C++ - # source not uses C++ 11. -- resolve_dependency(re2 HAVE_ALT TRUE) -+ resolve_dependency(re2 USE_CONFIG TRUE) - if(${re2_SOURCE} STREQUAL "SYSTEM") - get_target_property(RE2_LIB re2::re2 IMPORTED_LOCATION) - string(APPEND ARROW_PC_LIBS_PRIVATE " ${RE2_LIB}") -@@ -2337,7 +2348,7 @@ endmacro() - if(ARROW_WITH_BZ2) - resolve_dependency(BZip2) - if(${BZip2_SOURCE} STREQUAL "SYSTEM") -- string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZIP2_LIBRARIES}") -+ string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZip2_LIBRARIES}") - endif() - - if(NOT TARGET BZip2::BZip2) -@@ -2346,7 +2357,7 @@ if(ARROW_WITH_BZ2) - PROPERTIES IMPORTED_LOCATION "${BZIP2_LIBRARIES}" - INTERFACE_INCLUDE_DIRECTORIES "${BZIP2_INCLUDE_DIR}") - endif() -- include_directories(SYSTEM "${BZIP2_INCLUDE_DIR}") -+ include_directories(SYSTEM "${BZip2_INCLUDE_DIR}") - endif() - - macro(build_utf8proc) -@@ -3555,7 +3566,7 @@ if(ARROW_WITH_GRPC) - set(gRPC_SOURCE "${Protobuf_SOURCE}") - endif() - resolve_dependency(gRPC -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_GRPC_REQUIRED_VERSION} -@@ -3573,9 +3584,9 @@ if(ARROW_WITH_GRPC) - else() - # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp - # depending on the gRPC version. 
-- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") -+ if(EXISTS ${gRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE TRUE) -- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") -+ elseif(EXISTS ${gPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE FALSE) - else() - message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") -@@ -4097,9 +4108,9 @@ macro(build_opentelemetry) - endmacro() - - if(ARROW_WITH_OPENTELEMETRY) -- set(opentelemetry-cpp_SOURCE "AUTO") -+ set(opentelemetry-cpp_SOURCE "SYSTEM") - resolve_dependency(opentelemetry-cpp) -- get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::api -+ get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::opentelemetry_common - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${OPENTELEMETRY_INCLUDE_DIR}) - message(STATUS "Found OpenTelemetry headers: ${OPENTELEMETRY_INCLUDE_DIR}") -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index b984bc1..2c78cd9 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -323,10 +323,14 @@ set(ARROW_TESTING_SRCS - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() - if(ARROW_MIMALLOC) -- list(APPEND _allocator_dependencies mimalloc_ep) -+ if (TARGET mimalloc-static) -+ list(APPEND _allocator_dependencies mimalloc-static) -+ else() -+ list(APPEND _allocator_dependencies mimalloc) -+ endif() - endif() - - if(_allocator_dependencies) -diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt -index 2cf8c99..90ebb9a 100644 ---- a/cpp/src/arrow/flight/CMakeLists.txt -+++ b/cpp/src/arrow/flight/CMakeLists.txt -@@ -17,6 +17,9 @@ - - add_custom_target(arrow_flight) - -+# TODO: This is a temporary workaround. absl should be LINKED as TARGET. -+include_directories(SYSTEM ${absl_INCLUDE_DIR}) -+ - arrow_install_all_headers("arrow/flight") - - set(ARROW_FLIGHT_LINK_LIBS gRPC::grpc++ ${ARROW_PROTOBUF_LIBPROTOBUF}) -diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc -index 2dcfb01..0394c01 100644 ---- a/cpp/src/arrow/memory_pool.cc -+++ b/cpp/src/arrow/memory_pool.cc -@@ -48,7 +48,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC diff --git a/ci/conan/all/patches/8.0.0-0005-install-utils.patch b/ci/conan/all/patches/8.0.0-0005-install-utils.patch deleted file mode 100644 index 98075913ed109..0000000000000 --- a/ci/conan/all/patches/8.0.0-0005-install-utils.patch +++ /dev/null @@ -1,65 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index aba18c8..bb463d0 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -721,7 +721,7 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) -+ list(APPEND ARROW_STATIC_LINK_LIBS lz4::lz4) - if(Lz4_SOURCE STREQUAL "SYSTEM") - list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) - endif() -@@ -907,8 +907,8 @@ endif() - if(ARROW_JEMALLOC) - add_definitions(-DARROW_JEMALLOC) - add_definitions(-DARROW_JEMALLOC_INCLUDE_DIR=${JEMALLOC_INCLUDE_DIR}) -- list(APPEND ARROW_LINK_LIBS jemalloc::jemalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS jemalloc::jemalloc) -+ list(APPEND ARROW_LINK_LIBS jemalloc) -+ list(APPEND ARROW_STATIC_LINK_LIBS jemalloc) - endif() - - if(ARROW_MIMALLOC) -diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt -index 495018e..3dcb35d 100644 ---- a/cpp/src/arrow/ipc/CMakeLists.txt -+++ b/cpp/src/arrow/ipc/CMakeLists.txt -@@ -61,9 +61,13 @@ endif() - if(ARROW_BUILD_UTILITIES OR ARROW_BUILD_INTEGRATION) - add_executable(arrow-file-to-stream file_to_stream.cc) - target_link_libraries(arrow-file-to-stream ${ARROW_UTIL_LIB}) -+ install(TARGETS arrow-file-to-stream ${INSTALL_IS_OPTIONAL} -+ DESTINATION ${CMAKE_INSTALL_BINDIR}) - add_executable(arrow-stream-to-file stream_to_file.cc) - target_link_libraries(arrow-stream-to-file ${ARROW_UTIL_LIB}) -- -+ install(TARGETS arrow-stream-to-file ${INSTALL_IS_OPTIONAL} -+ DESTINATION ${CMAKE_INSTALL_BINDIR}) -+ - if(ARROW_BUILD_INTEGRATION) - add_dependencies(arrow-integration arrow-file-to-stream) - add_dependencies(arrow-integration arrow-stream-to-file) diff --git a/ci/conan/all/patches/8.0.0-0006-fix-cmake.patch b/ci/conan/all/patches/8.0.0-0006-fix-cmake.patch deleted file mode 100644 index 7be516e1b4855..0000000000000 --- a/ci/conan/all/patches/8.0.0-0006-fix-cmake.patch +++ /dev/null @@ -1,447 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
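A pattern that recurs throughout these fix-cmake patches is that the Conan-generated package defines either a static or a shared imported target (LZ4::lz4_static vs LZ4::lz4_shared, mimalloc-static vs mimalloc), so the build links whichever one actually exists. A condensed sketch of that selection, using the target names shown in the hunks; they follow Conan CMakeDeps conventions and may differ with other toolchains:

# Sketch: link against whichever LZ4 imported target the resolved package
# defines. ARROW_LZ4_TARGET is a local helper variable for this sketch only.
find_package(lz4 REQUIRED CONFIG)
if(TARGET LZ4::lz4_static)
  set(ARROW_LZ4_TARGET LZ4::lz4_static)
else()
  set(ARROW_LZ4_TARGET LZ4::lz4_shared)
endif()
list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_LZ4_TARGET})
get_target_property(LZ4_INCLUDE_DIR ${ARROW_LZ4_TARGET} INTERFACE_INCLUDE_DIRECTORIES)
include_directories(SYSTEM ${LZ4_INCLUDE_DIR})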
- -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index bb463d0..ce2d1df 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -705,7 +705,7 @@ endif() - - if(ARROW_WITH_BROTLI) - # Order is important for static linking -- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) -+ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) - list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) - list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) - if(Brotli_SOURCE STREQUAL "SYSTEM") -@@ -721,11 +721,18 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS lz4::lz4) -- if(Lz4_SOURCE STREQUAL "SYSTEM") -- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) -+ if (TARGET LZ4::lz4_static) -+ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_static) -+ if(Lz4_SOURCE STREQUAL "SYSTEM") -+ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_static) -+ endif() -+ else() -+ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_shared) -+ if(Lz4_SOURCE STREQUAL "SYSTEM") -+ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_shared) - endif() - endif() -+endif() - - if(ARROW_WITH_SNAPPY) - list(APPEND ARROW_STATIC_LINK_LIBS Snappy::snappy) -@@ -913,8 +920,13 @@ endif() - - if(ARROW_MIMALLOC) - add_definitions(-DARROW_MIMALLOC) -- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) -+ if (TARGET mimalloc-static) -+ list(APPEND ARROW_LINK_LIBS mimalloc-static) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc-static) -+ else() -+ list(APPEND ARROW_LINK_LIBS mimalloc) -+ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) -+ endif() - endif() - - # ---------------------------------------------------------------------- -diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake -index f070323..16faf73 100644 ---- a/cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -959,6 +959,7 @@ endif() - # - Tests need Boost at runtime. - # - S3FS and Flight benchmarks need Boost at runtime. 
- if(ARROW_BUILD_INTEGRATION -+ OR ARROW_BOOST_REQUIRED - OR ARROW_BUILD_TESTS - OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) - OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS)) -@@ -975,7 +976,7 @@ endif() - - if(ARROW_BOOST_REQUIRED) - resolve_dependency(Boost -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_BOOST_REQUIRED_VERSION} -@@ -986,7 +987,7 @@ if(ARROW_BOOST_REQUIRED) - if(TARGET Boost::system) - set(BOOST_SYSTEM_LIBRARY Boost::system) - set(BOOST_FILESYSTEM_LIBRARY Boost::filesystem) -- elseif(BoostAlt_FOUND) -+ elseif(Boost_FOUND) - set(BOOST_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY}) - set(BOOST_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY}) - else() -@@ -1129,9 +1130,9 @@ macro(build_brotli) - endmacro() - - if(ARROW_WITH_BROTLI) -- resolve_dependency(Brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) -+ resolve_dependency(brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon -+ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) - endif() -@@ -1323,22 +1324,16 @@ endmacro() - if(ARROW_NEED_GFLAGS) - set(ARROW_GFLAGS_REQUIRED_VERSION "2.1.0") - resolve_dependency(gflags -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_GFLAGS_REQUIRED_VERSION} - IS_RUNTIME_DEPENDENCY - FALSE) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) -- -- if(NOT TARGET ${GFLAGS_LIBRARIES}) -- if(TARGET gflags-shared) -- set(GFLAGS_LIBRARIES gflags-shared) -- elseif(TARGET gflags_shared) -- set(GFLAGS_LIBRARIES gflags_shared) -- endif() -- endif() -+ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${gflags_LIBRARIES_TARGETS}) -+ set(GFLAGS_LIBRARIES gflags::gflags) - endif() - - # ---------------------------------------------------------------------- -@@ -1432,9 +1427,9 @@ if(ARROW_WITH_THRIFT) - thrift) - endif() - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) -+ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) - -- string(REPLACE "." ";" VERSION_LIST ${THRIFT_VERSION}) -+ string(REPLACE "." 
";" VERSION_LIST ${Thrift_VERSION}) - list(GET VERSION_LIST 0 THRIFT_VERSION_MAJOR) - list(GET VERSION_LIST 1 THRIFT_VERSION_MINOR) - list(GET VERSION_LIST 2 THRIFT_VERSION_PATCH) -@@ -1557,6 +1552,7 @@ if(ARROW_WITH_PROTOBUF) - set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1") - endif() - resolve_dependency(Protobuf -+ USE_CONFIG - REQUIRED_VERSION - ${ARROW_PROTOBUF_REQUIRED_VERSION} - PC_PACKAGE_NAMES -@@ -1567,7 +1563,7 @@ if(ARROW_WITH_PROTOBUF) - endif() - - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${PROTOBUF_INCLUDE_DIR}) -+ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) - - if(TARGET arrow::protobuf::libprotobuf) - set(ARROW_PROTOBUF_LIBPROTOBUF arrow::protobuf::libprotobuf) -@@ -1576,9 +1572,9 @@ if(ARROW_WITH_PROTOBUF) - if(NOT TARGET protobuf::libprotobuf) - add_library(protobuf::libprotobuf UNKNOWN IMPORTED) - set_target_properties(protobuf::libprotobuf -- PROPERTIES IMPORTED_LOCATION "${PROTOBUF_LIBRARY}" -+ PROPERTIES IMPORTED_LOCATION "${Protobuf_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES -- "${PROTOBUF_INCLUDE_DIR}") -+ "${Protobuf_INCLUDE_DIR}") - endif() - set(ARROW_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) - endif() -@@ -1598,7 +1594,7 @@ if(ARROW_WITH_PROTOBUF) - set_target_properties(protobuf::libprotoc - PROPERTIES IMPORTED_LOCATION "${Protobuf_PROTOC_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES -- "${PROTOBUF_INCLUDE_DIR}") -+ "${Protobuf_INCLUDE_DIR}") - endif() - set(ARROW_PROTOBUF_LIBPROTOC protobuf::libprotoc) - endif() -@@ -1690,11 +1686,12 @@ macro(build_substrait) - - add_custom_target(substrait_gen ALL DEPENDS ${SUBSTRAIT_PROTO_GEN_ALL}) - -- set(SUBSTRAIT_INCLUDES ${SUBSTRAIT_CPP_DIR} ${PROTOBUF_INCLUDE_DIR}) -+ set(SUBSTRAIT_INCLUDES ${SUBSTRAIT_CPP_DIR} ${protobuf_INCLUDE_DIR}) - - add_library(substrait STATIC ${SUBSTRAIT_SOURCES}) - set_target_properties(substrait PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_include_directories(substrait PUBLIC ${SUBSTRAIT_INCLUDES}) -+ target_include_directories(substrait PUBLIC ${PROTOBUF_INCLUDE_DIR}) - target_link_libraries(substrait INTERFACE ${ARROW_PROTOBUF_LIBPROTOBUF}) - add_dependencies(substrait substrait_gen) - -@@ -1711,6 +1708,7 @@ endif() - # jemalloc - Unix-only high-performance allocator - - if(ARROW_JEMALLOC) -+if(0) - message(STATUS "Building (vendored) jemalloc from source") - # We only use a vendored jemalloc as we want to control its version. - # Also our build of jemalloc is specially prefixed so that it will not -@@ -1780,12 +1778,18 @@ if(ARROW_JEMALLOC) - add_dependencies(jemalloc::jemalloc jemalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) -+else() -+ find_package(jemalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS}) -+endif() - endif() - - # ---------------------------------------------------------------------- - # mimalloc - Cross-platform high-performance allocator, from Microsoft - - if(ARROW_MIMALLOC) -+if(0) - message(STATUS "Building (vendored) mimalloc from source") - # We only use a vendored mimalloc as we want to control its build options. 
- -@@ -1836,6 +1840,11 @@ if(ARROW_MIMALLOC) - add_dependencies(toolchain mimalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) -+else() -+ find_package(mimalloc REQUIRED CONFIG) -+ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") -+ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -2121,7 +2130,7 @@ endmacro() - if(ARROW_WITH_RAPIDJSON) - set(ARROW_RAPIDJSON_REQUIRED_VERSION "1.1.0") - resolve_dependency(RapidJSON -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_RAPIDJSON_REQUIRED_VERSION} -@@ -2158,10 +2167,10 @@ endmacro() - - if((NOT ARROW_SIMD_LEVEL STREQUAL "NONE") OR (NOT ARROW_RUNTIME_SIMD_LEVEL STREQUAL "NONE" - )) -- set(xsimd_SOURCE "BUNDLED") -+ set(xsimd_SOURCE "SYSTEM") - resolve_dependency(xsimd) - # TODO: Don't use global includes but rather target_include_directories -- include_directories(SYSTEM ${XSIMD_INCLUDE_DIR}) -+ include_directories(SYSTEM ${xsimd_INCLUDE_DIR}) - endif() - - macro(build_zlib) -@@ -2260,10 +2269,14 @@ macro(build_lz4) - endmacro() - - if(ARROW_WITH_LZ4) -- resolve_dependency(Lz4 PC_PACKAGE_NAMES liblz4) -+ resolve_dependency(Lz4) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) -+ if (TARGET LZ4::lz4_static) -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) -+ else() -+ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) -+ endif() - include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) - endif() - -@@ -2394,7 +2407,7 @@ if(ARROW_WITH_RE2) - # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may - # include -std=c++11. It's not compatible with C source and C++ - # source not uses C++ 11. -- resolve_dependency(re2 HAVE_ALT TRUE) -+ resolve_dependency(re2 USE_CONFIG TRUE) - if(${re2_SOURCE} STREQUAL "SYSTEM") - get_target_property(RE2_LIB re2::re2 IMPORTED_LOCATION_${UPPERCASE_BUILD_TYPE}) - if(NOT RE2_LIB) -@@ -2464,7 +2477,7 @@ endmacro() - if(ARROW_WITH_BZ2) - resolve_dependency(BZip2) - if(${BZip2_SOURCE} STREQUAL "SYSTEM") -- string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZIP2_LIBRARIES}") -+ string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZip2_LIBRARIES}") - endif() - - if(NOT TARGET BZip2::BZip2) -@@ -2473,7 +2486,7 @@ if(ARROW_WITH_BZ2) - PROPERTIES IMPORTED_LOCATION "${BZIP2_LIBRARIES}" - INTERFACE_INCLUDE_DIRECTORIES "${BZIP2_INCLUDE_DIR}") - endif() -- include_directories(SYSTEM "${BZIP2_INCLUDE_DIR}") -+ include_directories(SYSTEM "${BZip2_INCLUDE_DIR}") - endif() - - macro(build_utf8proc) -@@ -3709,7 +3722,7 @@ if(ARROW_WITH_GRPC) - set(gRPC_SOURCE "${Protobuf_SOURCE}") - endif() - resolve_dependency(gRPC -- HAVE_ALT -+ USE_CONFIG - TRUE - REQUIRED_VERSION - ${ARROW_GRPC_REQUIRED_VERSION} -@@ -3727,9 +3740,9 @@ if(ARROW_WITH_GRPC) - else() - # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp - # depending on the gRPC version. 
-- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") -+ if(EXISTS ${gRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE TRUE) -- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") -+ elseif(EXISTS ${gRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) - set(GRPCPP_PP_INCLUDE FALSE) - else() - message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") -@@ -3937,7 +3950,7 @@ macro(build_google_cloud_cpp_storage) - endmacro() - - if(ARROW_WITH_GOOGLE_CLOUD_CPP) -- resolve_dependency(google_cloud_cpp_storage) -+ resolve_dependency(google_cloud_cpp) - get_target_property(google_cloud_cpp_storage_INCLUDE_DIR google-cloud-cpp::storage - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${google_cloud_cpp_storage_INCLUDE_DIR}) -@@ -4264,9 +4277,9 @@ if(ARROW_WITH_OPENTELEMETRY) - # cURL is required whether we build from source or use an existing installation - # (OTel's cmake files do not call find_curl for you) - find_curl() -- set(opentelemetry-cpp_SOURCE "AUTO") -+ set(opentelemetry-cpp_SOURCE "SYSTEM") - resolve_dependency(opentelemetry-cpp) -- get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::api -+ get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::opentelemetry_common - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${OPENTELEMETRY_INCLUDE_DIR}) - message(STATUS "Found OpenTelemetry headers: ${OPENTELEMETRY_INCLUDE_DIR}") -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index 690c51a..752f3b9 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -326,10 +326,14 @@ set(ARROW_TESTING_SRCS - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() - if(ARROW_MIMALLOC) -- list(APPEND _allocator_dependencies mimalloc_ep) -+ if (TARGET mimalloc-static) -+ list(APPEND _allocator_dependencies mimalloc-static) -+ else() -+ list(APPEND _allocator_dependencies mimalloc) -+ endif() - endif() - - if(_allocator_dependencies) -diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt -index f9d1356..c9bcf79 100644 ---- a/cpp/src/arrow/flight/CMakeLists.txt -+++ b/cpp/src/arrow/flight/CMakeLists.txt -@@ -17,6 +17,9 @@ - - add_custom_target(arrow_flight) - -+# TODO: This is a temporary workaround. absl should be LINKED as TARGET. 
-+include_directories(SYSTEM ${absl_INCLUDE_DIR}) -+ - arrow_install_all_headers("arrow/flight") - - set(ARROW_FLIGHT_LINK_LIBS gRPC::grpc++ ${ARROW_PROTOBUF_LIBPROTOBUF}) -diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc -index ed1c2d8..37a89da 100644 ---- a/cpp/src/arrow/memory_pool.cc -+++ b/cpp/src/arrow/memory_pool.cc -@@ -52,7 +52,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC -diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt -index 71faf9a..3aabea1 100644 ---- a/cpp/src/gandiva/CMakeLists.txt -+++ b/cpp/src/gandiva/CMakeLists.txt -@@ -25,7 +25,7 @@ add_custom_target(gandiva-benchmarks) - - add_dependencies(gandiva-all gandiva gandiva-tests gandiva-benchmarks) - --find_package(LLVMAlt REQUIRED) -+find_package(LLVM REQUIRED) - - if(LLVM_VERSION_MAJOR LESS "10") - set(GANDIVA_CXX_STANDARD ${CMAKE_CXX_STANDARD}) -@@ -40,7 +40,7 @@ endif() - - add_definitions(-DGANDIVA_LLVM_VERSION=${LLVM_VERSION_MAJOR}) - --find_package(OpenSSLAlt REQUIRED) -+find_package(OpenSSL REQUIRED) - - # Set the path where the bitcode file generated, see precompiled/CMakeLists.txt - set(GANDIVA_PRECOMPILED_BC_PATH "${CMAKE_CURRENT_BINARY_DIR}/irhelpers.bc") -@@ -98,10 +98,11 @@ set(SRC_FILES - random_generator_holder.cc - ${GANDIVA_PRECOMPILED_CC_PATH}) - --set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared LLVM::LLVM_INTERFACE -- ${GANDIVA_OPENSSL_LIBS}) -+set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared llvm-core::llvm-core NTERFACE -+ ${GANDIVA_OPENSSL_LIBS}) -+ -+set(GANDIVA_STATIC_LINK_LIBS arrow_static llvm-core::llvm-core ${GANDIVA_OPENSSL_LIBS}) - --set(GANDIVA_STATIC_LINK_LIBS arrow_static LLVM::LLVM_INTERFACE ${GANDIVA_OPENSSL_LIBS}) - - if(ARROW_GANDIVA_STATIC_LIBSTDCPP AND (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX - )) -@@ -139,7 +140,7 @@ add_arrow_lib(gandiva - arrow_dependencies - precompiled - EXTRA_INCLUDES -- $ -+ $ - ${GANDIVA_OPENSSL_INCLUDE_DIR} - ${UTF8PROC_INCLUDE_DIR} - SHARED_LINK_FLAGS diff --git a/ci/conan/all/test_package/CMakeLists.txt b/ci/conan/all/test_package/CMakeLists.txt index 18761d0f52c21..b25c8e889cb84 100644 --- a/ci/conan/all/test_package/CMakeLists.txt +++ b/ci/conan/all/test_package/CMakeLists.txt @@ -26,7 +26,13 @@ project(test_package LANGUAGES CXX) find_package(Arrow REQUIRED CONFIG) add_executable(${PROJECT_NAME} test_package.cpp) -target_link_libraries(${PROJECT_NAME} PRIVATE arrow::arrow) + +if (TARGET Arrow::arrow_shared) + target_link_libraries(${PROJECT_NAME} PRIVATE Arrow::arrow_shared) +else() + target_link_libraries(${PROJECT_NAME} PRIVATE Arrow::arrow_static) +endif() + if (${Arrow_VERSION} VERSION_LESS "10.0.0") target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_11) else() diff --git a/ci/conan/all/test_v1_package/CMakeLists.txt b/ci/conan/all/test_v1_package/CMakeLists.txt deleted file mode 100644 index faf547dec70c2..0000000000000 --- a/ci/conan/all/test_v1_package/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -# MIT License -# -# Copyright (c) 2019 Conan.io -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the 
Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -cmake_minimum_required(VERSION 3.1) - -project(test_package) - -include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake) -conan_basic_setup(TARGETS) - -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../test_package/ - ${CMAKE_CURRENT_BINARY_DIR}/test_package/) diff --git a/ci/conan/all/test_v1_package/conanfile.py b/ci/conan/all/test_v1_package/conanfile.py deleted file mode 100644 index 4f5cc2b61011b..0000000000000 --- a/ci/conan/all/test_v1_package/conanfile.py +++ /dev/null @@ -1,40 +0,0 @@ -# MIT License -# -# Copyright (c) 2019 Conan.io -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -from conans import ConanFile, CMake -from conan.tools.build import cross_building -import os - - -class TestPackageV1Conan(ConanFile): - settings = "os", "arch", "compiler", "build_type" - generators = "cmake", "cmake_find_package_multi" - - def build(self): - cmake = CMake(self) - cmake.configure() - cmake.build() - - def test(self): - if not cross_building(self): - bin_path = os.path.join("bin", "test_package") - self.run(bin_path, run_environment=True) diff --git a/ci/conan/config.yml b/ci/conan/config.yml index 3fa90be6f669a..cbb2fce054738 100644 --- a/ci/conan/config.yml +++ b/ci/conan/config.yml @@ -21,29 +21,15 @@ # SOFTWARE. 
versions: - "15.0.0": - folder: all - "14.0.2": - folder: all - "14.0.1": - folder: all - "14.0.0": - folder: all - "13.0.0": + "18.1.0": folder: all - "12.0.1": + "18.0.0": folder: all - "12.0.0": + "17.0.0": folder: all - "11.0.0": + "16.1.0": folder: all - "10.0.1": - folder: all - "10.0.0": - folder: all - "8.0.1": - folder: all - "8.0.0": - folder: all - "7.0.0": + "15.0.0": folder: all + "14.0.2": + folder: all \ No newline at end of file diff --git a/ci/conan/merge_status.sh b/ci/conan/merge_status.sh index bd99c22def1c9..600385c0e1770 100644 --- a/ci/conan/merge_status.sh +++ b/ci/conan/merge_status.sh @@ -15,4 +15,4 @@ # specific language governing permissions and limitations # under the License. -UPSTREAM_REVISION=681a40adca5f83c80581814fe92316d6298ed96f +UPSTREAM_REVISION=a9b270f9d2052e193ce3c0a6c4e2fda0b0ac5ade diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index f28a24cac8d2d..731b49fa462d4 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -37,6 +37,7 @@ libprotobuf libutf8proc lz4-c make +meson ninja nodejs orc diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 751df9b2f3c01..840577fdd97a4 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -20,6 +20,9 @@ breathe doxygen ipython linkify-it-py +# We can't install linuxdoc by conda. We install linuxdoc by pip in +# ci/dockerfiles/conda-python-pandas.dockerfile. +# linuxdoc myst-parser numpydoc pydata-sphinx-theme=0.14 diff --git a/ci/docker/centos-7-cpp.dockerfile b/ci/docker/centos-7-cpp.dockerfile index 1f30eed694e4e..b012a5abed2e0 100644 --- a/ci/docker/centos-7-cpp.dockerfile +++ b/ci/docker/centos-7-cpp.dockerfile @@ -37,7 +37,6 @@ RUN \ -e 's/mirror\.centos\.org/vault.centos.org/' \ /etc/yum.repos.d/CentOS-SCLo-scl*.repo && \ yum install -y \ - cmake3 \ curl \ devtoolset-8 \ diffutils \ @@ -49,9 +48,13 @@ RUN \ wget \ which +ARG cmake +COPY ci/scripts/install_cmake.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_cmake.sh ${cmake} /usr/local/ + COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN bash /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin ENV \ ARROW_R_DEV=TRUE \ - CMAKE=/usr/bin/cmake3 + CMAKE=/usr/local/bin/cmake diff --git a/ci/docker/conda-python-pandas.dockerfile b/ci/docker/conda-python-pandas.dockerfile index 9ee62cd282d36..4a52ffa8e12bc 100644 --- a/ci/docker/conda-python-pandas.dockerfile +++ b/ci/docker/conda-python-pandas.dockerfile @@ -27,6 +27,8 @@ ARG numpy=latest # so ensure to install doc requirements COPY ci/conda_env_sphinx.txt /arrow/ci/ RUN mamba install -q -y --file arrow/ci/conda_env_sphinx.txt && \ + # We can't install linuxdoc by mamba. We install linuxdoc by pip here. + pip install linuxdoc && \ mamba clean --all COPY ci/scripts/install_pandas.sh /arrow/ci/scripts/ diff --git a/ci/docker/conda-python-substrait.dockerfile b/ci/docker/conda-python-substrait.dockerfile deleted file mode 100644 index 36dd64e51e7ad..0000000000000 --- a/ci/docker/conda-python-substrait.dockerfile +++ /dev/null @@ -1,59 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG repo -ARG arch -ARG python=3.9 - -FROM ${repo}:${arch}-conda-python-${python} - -COPY ci/conda_env_python.txt \ - ci/conda_env_sphinx.txt \ - /arrow/ci/ - -# Note: openjdk is pinned to 17 because the -# substrait repo currently pins to jdk 17. -# Newer jdk versions are currently failing -# due to the recent upgrade to Gradle 8 via -# install_substrait_consumer.sh. -# https://github.com/substrait-io/substrait-java/issues/274 -RUN mamba install -q -y \ - --file arrow/ci/conda_env_python.txt \ - --file arrow/ci/conda_env_sphinx.txt \ - $([ "$python" == "3.9" ] && echo "pickle5") \ - python=${python} \ - openjdk=17 \ - nomkl && \ - mamba clean --all - - -ARG substrait=latest -COPY ci/scripts/install_substrait_consumer.sh /arrow/ci/scripts/ - -RUN /arrow/ci/scripts/install_substrait_consumer.sh - -ENV ARROW_ACERO=ON \ - ARROW_COMPUTE=ON \ - ARROW_CSV=ON \ - ARROW_DATASET=ON \ - ARROW_FILESYSTEM=ON \ - ARROW_FLIGHT=OFF \ - ARROW_FLIGHT_SQL=OFF \ - ARROW_GANDIVA=OFF \ - ARROW_JSON=ON \ - ARROW_SUBSTRAIT=ON \ - ARROW_TESTING=OFF diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile index fe3976248cc86..44c845bb17eff 100644 --- a/ci/docker/debian-12-cpp.dockerfile +++ b/ci/docker/debian-12-cpp.dockerfile @@ -30,13 +30,14 @@ RUN apt-get update -y -q && \ lsb-release \ wget && \ if [ ${llvm} -ge 17 ]; then \ - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | \ - gpg \ - --import - \ - --keyring /usr/share/keyrings/llvm-snapshot.gpg \ - --no-default-keyring && \ - echo "deb[keyring=/usr/share/keyrings/llvm-snapshot.gpg] https://apt.llvm.org/$(lsb_release --codename --short)/ llvm-toolchain-$(lsb_release --codename --short)-${available_llvm} main" > \ - /etc/apt/sources.list.d/llvm.list; \ + wget -O /usr/share/keyrings/llvm-snapshot.asc \ + https://apt.llvm.org/llvm-snapshot.gpg.key && \ + (echo "Types: deb"; \ + echo "URIs: https://apt.llvm.org/$(lsb_release --codename --short)/"; \ + echo "Suites: llvm-toolchain-$(lsb_release --codename --short)-${llvm}"; \ + echo "Components: main"; \ + echo "Signed-By: /usr/share/keyrings/llvm-snapshot.asc") | \ + tee /etc/apt/sources.list.d/llvm.sources; \ fi && \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ diff --git a/ci/docker/java-jni-manylinux-201x.dockerfile b/ci/docker/java-jni-manylinux-201x.dockerfile deleted file mode 100644 index 479f4aa598b18..0000000000000 --- a/ci/docker/java-jni-manylinux-201x.dockerfile +++ /dev/null @@ -1,55 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG base -FROM ${base} - -# Install the libraries required by the Gandiva to run -# Use enable llvm[enable-rtti] in the vcpkg.json to avoid link problems in Gandiva -RUN vcpkg install \ - --clean-after-build \ - --x-install-root=${VCPKG_ROOT}/installed \ - --x-manifest-root=/arrow/ci/vcpkg \ - --x-feature=dev \ - --x-feature=flight \ - --x-feature=gcs \ - --x-feature=json \ - --x-feature=parquet \ - --x-feature=gandiva \ - --x-feature=s3 - -# Install Java -ARG java=11 -ARG maven=3.9.3 -RUN yum install -y java-$java-openjdk-devel && \ - yum clean all && \ - curl \ - --fail \ - --location \ - "https://www.apache.org/dyn/closer.lua?action=download&filename=maven/maven-3/${maven}/binaries/apache-maven-${maven}-bin.tar.gz" | \ - tar xfz - -C /usr/local && \ - ln -s /usr/local/apache-maven-${maven}/bin/mvn /usr/local/bin - -# Install the gcs testbench -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN PYTHON=python /arrow/ci/scripts/install_gcs_testbench.sh default - -# For ci/scripts/{cpp,java}_*.sh -ENV ARROW_HOME=/tmp/local \ - ARROW_JAVA_CDATA=ON \ - ARROW_JAVA_JNI=ON \ - ARROW_USE_CCACHE=ON diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index 0804f3543c283..31435d4989129 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -19,7 +19,6 @@ ARG base FROM ${base} ARG r=4.4 -ARG jdk=11 ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium @@ -67,7 +66,6 @@ RUN apt-get update -y && \ nodejs \ npm \ nvidia-cuda-toolkit \ - openjdk-${jdk}-jdk-headless \ pandoc \ r-recommended=${r}* \ r-base=${r}* \ @@ -80,15 +78,6 @@ RUN apt-get update -y && \ PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ npm install -g yarn @mermaid-js/mermaid-cli -ENV JAVA_HOME=/usr/lib/jvm/java-${jdk}-openjdk-amd64 - -ARG maven=3.8.7 -COPY ci/scripts/util_download_apache.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/util_download_apache.sh \ - "maven/maven-3/${maven}/binaries/apache-maven-${maven}-bin.tar.gz" /opt -ENV PATH=/opt/apache-maven-${maven}/bin:$PATH -RUN mvn -version - COPY c_glib/Gemfile /arrow/c_glib/ RUN gem install --no-document bundler && \ bundle install --gemfile /arrow/c_glib/Gemfile @@ -133,4 +122,5 @@ ENV ARROW_ACERO=ON \ ARROW_S3=ON \ ARROW_USE_GLOG=OFF \ CMAKE_UNITY_BUILD=ON \ + CUDAToolkit_ROOT=/usr \ RETICULATE_PYTHON_ENV=${ARROW_PYTHON_VENV} diff --git a/ci/docker/linux-apt-lint.dockerfile b/ci/docker/linux-apt-lint.dockerfile index 9ec80440a3c21..b73cc585ea74e 100644 --- a/ci/docker/linux-apt-lint.dockerfile +++ b/ci/docker/linux-apt-lint.dockerfile @@ -58,6 +58,7 @@ RUN cat /arrow/ci/etc/rprofile >> $(R RHOME)/etc/Rprofile.site RUN echo "MAKEFLAGS=-j$(R -s -e 'cat(parallel::detectCores())')" >> $(R RHOME)/etc/Renviron.site # We don't need arrow's dependencies, only lintr (and its dependencies) RUN R -e "install.packages('lintr')" +RUN R -e "install.packages('cyclocomp')" # Docker linter COPY --from=hadolint /bin/hadolint /usr/bin/hadolint diff --git a/ci/docker/linux-r.dockerfile b/ci/docker/linux-r.dockerfile index 7b7e989adc0d1..da378eac43028 100644 --- a/ci/docker/linux-r.dockerfile +++ 
b/ci/docker/linux-r.dockerfile @@ -51,6 +51,10 @@ RUN /arrow/ci/scripts/r_docker_configure.sh COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin +ARG cmake +COPY ci/scripts/install_cmake.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_cmake.sh ${cmake} /usr/local/ + COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow diff --git a/ci/docker/python-free-threaded-wheel-windows-test-vs2019.dockerfile b/ci/docker/python-free-threaded-wheel-windows-test-vs2022.dockerfile similarity index 100% rename from ci/docker/python-free-threaded-wheel-windows-test-vs2019.dockerfile rename to ci/docker/python-free-threaded-wheel-windows-test-vs2022.dockerfile diff --git a/ci/docker/python-free-threaded-wheel-windows-vs2019.dockerfile b/ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile similarity index 100% rename from ci/docker/python-free-threaded-wheel-windows-vs2019.dockerfile rename to ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 0b5645285b6e1..ffcaa8c0a0741 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -53,7 +53,7 @@ ENV PATH=/opt/python/${CPYTHON_VERSION}-${CPYTHON_VERSION}/bin:${PATH} # Install CMake ARG cmake=3.29.2 COPY ci/scripts/install_cmake.sh arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_cmake.sh ${arch} linux ${cmake} /usr/local +RUN /arrow/ci/scripts/install_cmake.sh ${cmake} /usr/local # Install Ninja ARG ninja=1.10.2 diff --git a/ci/docker/python-wheel-windows-test-vs2019-base.dockerfile b/ci/docker/python-wheel-windows-test-vs2019-base.dockerfile deleted file mode 100644 index 73a78da30b907..0000000000000 --- a/ci/docker/python-wheel-windows-test-vs2019-base.dockerfile +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# NOTE: You must update PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION in .env -# when you update this file. - -# based on mcr.microsoft.com/windows/servercore:ltsc2019 -# contains choco and vs2019 preinstalled -FROM abrarov/msvc-2019:2.11.0 - -# hadolint shell=cmd.exe - -# Add unix tools to path -RUN setx path "%path%;C:\Program Files\Git\usr\bin" - -# 1. Remove previous installations of Python from the base image -# NOTE: a more recent base image (tried with 2.12.1) comes with Python 3.9.7 -# and the MSI installers are failing to remove pip and tcl/tk "products" making -# the subsequent choco python installation step failing for installing Python -# version 3.9.* due to existing python version -# 2. Install Minio for S3 testing. 
-RUN wmic product where "name like 'python%%'" call uninstall /nointeractive && \ - rm -rf Python* && \ - curl https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z \ - --output "C:\Windows\Minio.exe" - -# Install archiver to extract xz archives (for timezone database). -# Install the GCS testbench using a well-known Python version. -# NOTE: cannot use pipx's `--fetch-missing-python` because of -# https://github.com/pypa/pipx/issues/1521, therefore download Python ourselves. -RUN choco install --no-progress -r -y archiver && \ - choco install -r -y --pre --no-progress python --version=3.11.9 -ENV PIPX_BIN_DIR=C:\\Windows\\ -ENV PIPX_PYTHON="C:\Python311\python.exe" -COPY ci/scripts/install_gcs_testbench.bat C:/arrow/ci/scripts/ -RUN call "C:\arrow\ci\scripts\install_gcs_testbench.bat" && \ - storage-testbench -h diff --git a/ci/docker/python-wheel-windows-test-vs2022-base.dockerfile b/ci/docker/python-wheel-windows-test-vs2022-base.dockerfile new file mode 100644 index 0000000000000..1d1602c03a235 --- /dev/null +++ b/ci/docker/python-wheel-windows-test-vs2022-base.dockerfile @@ -0,0 +1,65 @@ +# escape=` + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# NOTE: You must update PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION in .env +# when you update this file. + +FROM mcr.microsoft.com/windows/servercore:ltsc2022 + +# Ensure we in a command shell and not Powershell +SHELL ["cmd", "/S", "/C"] + +# Install MSVC BuildTools +# +# The set of components below (lines starting with --add) is the most minimal +# set we could find that would still compile Arrow C++. 
+RUN ` + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe ` + && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache ` + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" ` + --add Microsoft.VisualStudio.Component.VC.CoreBuildTools ` + --add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 ` + --add Microsoft.VisualStudio.Component.Windows10SDK.20348 ` + --add Microsoft.VisualStudio.Component.VC.CMake.Project ` + || IF "%ERRORLEVEL%"=="3010" EXIT 0) ` + && del /q vs_buildtools.exe + +# Install choco CLI +# +# We switch into Powershell just for this command and switch back to cmd +# See https://chocolatey.org/install#completely-offline-install +SHELL ["powershell", "-Command", "$ErrorActionPreference = 'Stop'; $ProgressPreference = 'SilentlyContinue';"] +RUN ` + Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) +SHELL ["cmd", "/S", "/C"] + +# Install git, wget, minio +RUN choco install --no-progress -r -y git wget +RUN curl https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z ` + --output "C:\Windows\Minio.exe" + +# Install the GCS testbench using a well-known Python version. +# NOTE: cannot use pipx's `--fetch-missing-python` because of +# https://github.com/pypa/pipx/issues/1521, therefore download Python ourselves. +RUN choco install -r -y --pre --no-progress python --version=3.11.9 +ENV PIPX_BIN_DIR=C:\\Windows\\ +ENV PIPX_PYTHON="C:\Python311\python.exe" +COPY ci/scripts/install_gcs_testbench.bat C:/arrow/ci/scripts/ +RUN call "C:\arrow\ci\scripts\install_gcs_testbench.bat" && ` + storage-testbench -h diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile b/ci/docker/python-wheel-windows-test-vs2022.dockerfile similarity index 100% rename from ci/docker/python-wheel-windows-test-vs2019.dockerfile rename to ci/docker/python-wheel-windows-test-vs2022.dockerfile diff --git a/ci/docker/python-wheel-windows-vs2019-base.dockerfile b/ci/docker/python-wheel-windows-vs2019-base.dockerfile deleted file mode 100644 index bd91f01bf9b6d..0000000000000 --- a/ci/docker/python-wheel-windows-vs2019-base.dockerfile +++ /dev/null @@ -1,79 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# NOTE: You must update PYTHON_WHEEL_WINDOWS_IMAGE_REVISION in .env -# when you update this file. 
- -# based on mcr.microsoft.com/windows/servercore:ltsc2019 -# contains choco and vs2019 preinstalled -FROM abrarov/msvc-2019:2.11.0 - -# Install CMake and Ninja -ARG cmake=3.31.2 -RUN choco install --no-progress -r -y cmake --version=%cmake% --installargs 'ADD_CMAKE_TO_PATH=System' && \ - choco install --no-progress -r -y gzip wget ninja - -# Add unix tools to path -RUN setx path "%path%;C:\Program Files\Git\usr\bin" - -# Install vcpkg -# -# Compiling vcpkg itself from a git tag doesn't work anymore since vcpkg has -# started to ship precompiled binaries for the vcpkg-tool. -ARG vcpkg -COPY ci/vcpkg/*.patch \ - ci/vcpkg/*windows*.cmake \ - arrow/ci/vcpkg/ -COPY ci/scripts/install_vcpkg.sh arrow/ci/scripts/ -ENV VCPKG_ROOT=C:\\vcpkg -RUN bash arrow/ci/scripts/install_vcpkg.sh /c/vcpkg %vcpkg% && \ - setx PATH "%PATH%;%VCPKG_ROOT%" - -# Configure vcpkg and install dependencies -# NOTE: use windows batch environment notation for build arguments in RUN -# statements but bash notation in ENV statements -# VCPKG_FORCE_SYSTEM_BINARIES=1 spare around ~750MB of image size if the system -# cmake's and ninja's versions are recent enough -ARG build_type=release -ENV CMAKE_BUILD_TYPE=${build_type} \ - VCPKG_OVERLAY_TRIPLETS=C:\\arrow\\ci\\vcpkg \ - VCPKG_DEFAULT_TRIPLET=amd64-windows-static-md-${build_type} \ - VCPKG_FEATURE_FLAGS="manifests" -COPY ci/vcpkg/vcpkg.json arrow/ci/vcpkg/ -# cannot use the S3 feature here because while aws-sdk-cpp=1.9.160 contains -# ssl related fixes as well as we can patch the vcpkg portfile to support -# arm machines it hits ARROW-15141 where we would need to fall back to 1.8.186 -# but we cannot patch those portfiles since vcpkg-tool handles the checkout of -# previous versions => use bundled S3 build -RUN vcpkg install \ - --clean-after-build \ - --x-install-root=%VCPKG_ROOT%\installed \ - --x-manifest-root=arrow/ci/vcpkg \ - --x-feature=flight \ - --x-feature=gcs \ - --x-feature=json \ - --x-feature=orc \ - --x-feature=parquet \ - --x-feature=s3 - -# Remove previous installations of Python from the base image -# NOTE: a more recent base image (tried with 2.12.1) comes with Python 3.9.7 -# and the MSI installers are failing to remove pip and tcl/tk "products" making -# the subsequent choco python installation step failing for installing Python -# version 3.9.* due to existing Python version -RUN wmic product where "name like 'python%%'" call uninstall /nointeractive && \ - rm -rf Python* diff --git a/ci/docker/python-wheel-windows-vs2022-base.dockerfile b/ci/docker/python-wheel-windows-vs2022-base.dockerfile new file mode 100644 index 0000000000000..7f683487a8c01 --- /dev/null +++ b/ci/docker/python-wheel-windows-vs2022-base.dockerfile @@ -0,0 +1,132 @@ +# escape=` + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +# NOTE: To build this Dockerfile, you probably need to do the following two +# things: +# +# 1. Increase your container image size to a higher value. +# +# e.g., +# +# Set a custom 'storage-opts' value in your Windows Docker config and restart +# Docker: +# +# "storage-opts": [ +# "size=50GB" +# ] +# +# See +# +# https://learn.microsoft.com/en-us/virtualization/windowscontainers/manage-containers/container-storage#example +# +# for details on this step and +# +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022#troubleshoot-build-tools-containers +# +# for more information. +# +# 2. Increase the memory limit for the build container to at least 4GB. +# +# e.g., +# +# docker build -t sometag -m 4GB --file ` +# .\ci\docker\python-wheel-windows-vs2022-base.dockerfile . + +# NOTE: You must update PYTHON_WHEEL_WINDOWS_IMAGE_REVISION in .env +# when you update this file. + +FROM mcr.microsoft.com/windows/servercore:ltsc2022 + +# Ensure we are in a command shell and not PowerShell +SHELL ["cmd", "/S", "/C"] + +# Install MSVC BuildTools +# +# The set of components below (lines starting with --add) is the minimal +# set we could find that would still compile Arrow C++. +RUN ` + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe ` + && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache ` + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" ` + --add Microsoft.VisualStudio.Component.VC.CoreBuildTools ` + --add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 ` + --add Microsoft.VisualStudio.Component.Windows10SDK.20348 ` + --add Microsoft.VisualStudio.Component.VC.CMake.Project ` + || IF "%ERRORLEVEL%"=="3010" EXIT 0) ` + && del /q vs_buildtools.exe + +# Install choco CLI +# +# Switch into PowerShell just for this command because choco only provides a +# PowerShell installation script. Afterwards, we switch back to cmd. +# +# See https://chocolatey.org/install#completely-offline-install +SHELL ["powershell", "-Command", "$ErrorActionPreference = 'Stop'; $ProgressPreference = 'SilentlyContinue';"] +RUN ` + Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) +SHELL ["cmd", "/S", "/C"] + +# Install CMake and other tools +ARG cmake=3.31.2 +RUN choco install --no-progress -r -y cmake --version=%cmake% --installargs 'ADD_CMAKE_TO_PATH=System' +RUN choco install --no-progress -r -y git gzip ninja wget + +# Add UNIX tools to PATH +RUN setx path "%path%;C:\Program Files\Git\usr\bin" + +# Install vcpkg +# +# Compiling vcpkg itself from a git tag doesn't work anymore since vcpkg has +# started to ship precompiled binaries for the vcpkg-tool.
+ARG vcpkg +COPY ci/vcpkg/*.patch ` + ci/vcpkg/*windows*.cmake ` + arrow/ci/vcpkg/ +COPY ci/scripts/install_vcpkg.sh arrow/ci/scripts/ +ENV VCPKG_ROOT=C:\\vcpkg +RUN bash arrow/ci/scripts/install_vcpkg.sh /c/vcpkg %vcpkg% && ` + setx PATH "%PATH%;%VCPKG_ROOT%" + +# Configure vcpkg and install dependencies +# NOTE: use windows batch environment notation for build arguments in RUN +# statements but bash notation in ENV statements +# VCPKG_FORCE_SYSTEM_BINARIES=1 spares around ~750MB of image size if the system +# CMake and Ninja versions are recent enough +ARG build_type=release +ENV CMAKE_BUILD_TYPE=${build_type} ` + VCPKG_OVERLAY_TRIPLETS=C:\\arrow\\ci\\vcpkg ` + VCPKG_DEFAULT_TRIPLET=amd64-windows-static-md-${build_type} ` + VCPKG_FEATURE_FLAGS="manifests" +COPY ci/vcpkg/vcpkg.json arrow/ci/vcpkg/ +# cannot use the S3 feature here: while aws-sdk-cpp=1.9.160 contains the +# needed SSL-related fixes and we can patch the vcpkg portfile to support +# ARM machines, it hits ARROW-15141, which would require falling back to 1.8.186, +# but we cannot patch those portfiles since vcpkg-tool handles the checkout of +# previous versions => use bundled S3 build +RUN vcpkg install ` + --clean-after-build ` + --x-install-root=%VCPKG_ROOT%\installed ` + --x-manifest-root=arrow/ci/vcpkg ` + --x-feature=flight ` + --x-feature=gcs ` + --x-feature=json ` + --x-feature=orc ` + --x-feature=parquet ` + --x-feature=s3 diff --git a/ci/docker/python-wheel-windows-vs2019.dockerfile b/ci/docker/python-wheel-windows-vs2022.dockerfile similarity index 100% rename from ci/docker/python-wheel-windows-vs2019.dockerfile rename to ci/docker/python-wheel-windows-vs2022.dockerfile diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile deleted file mode 100644 index 1b342df596c9d..0000000000000 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ /dev/null @@ -1,105 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG base=amd64/ubuntu:20.04 -FROM ${base} - -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -RUN echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN apt-get update -y -q && \ - apt-get install -y -q \ - build-essential \ - ccache \ - cmake \ - curl \ - gdb \ - git \ - libssl-dev \ - libcurl4-openssl-dev \ - python3-pip \ - python3-venv \ - tzdata \ - wget && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists* - -# Installs LLVM toolchain, for Gandiva and testing other compilers -# -# Note that this is installed before the base packages to improve iteration -# while debugging package list with docker build.
-ARG llvm -RUN latest_system_llvm=10 && \ - if [ ${llvm} -gt ${latest_system_llvm} ]; then \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - apt-transport-https \ - ca-certificates \ - gnupg \ - lsb-release \ - wget && \ - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - code_name=$(lsb_release --codename --short) && \ - if [ ${llvm} -gt 10 ]; then \ - echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${llvm} main" > \ - /etc/apt/sources.list.d/llvm.list; \ - fi; \ - fi && \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - clang-${llvm} \ - llvm-${llvm}-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists* - -COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh latest /usr/local - -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - -COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin - -ENV ARROW_ACERO=ON \ - ARROW_AZURE=OFF \ - ARROW_BUILD_TESTS=ON \ - ARROW_DATASET=ON \ - ARROW_FLIGHT=ON \ - ARROW_GANDIVA=ON \ - ARROW_GCS=ON \ - ARROW_HDFS=ON \ - ARROW_HOME=/usr/local \ - ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_ORC=ON \ - ARROW_PARQUET=ON \ - ARROW_S3=ON \ - ARROW_USE_CCACHE=ON \ - ARROW_WITH_BROTLI=ON \ - ARROW_WITH_BZ2=ON \ - ARROW_WITH_LZ4=ON \ - ARROW_WITH_OPENTELEMETRY=OFF \ - ARROW_WITH_SNAPPY=ON \ - ARROW_WITH_ZLIB=ON \ - ARROW_WITH_ZSTD=ON \ - CMAKE_GENERATOR="Unix Makefiles" \ - PARQUET_BUILD_EXAMPLES=ON \ - PARQUET_BUILD_EXECUTABLES=ON \ - PATH=/usr/lib/ccache/:$PATH \ - PYTHON=python3 diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile deleted file mode 100644 index 259c5fb77fa41..0000000000000 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ /dev/null @@ -1,194 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG base=amd64/ubuntu:20.04 -FROM ${base} - -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -RUN echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -# Installs LLVM toolchain, for Gandiva and testing other compilers -# -# Note that this is installed before the base packages to improve iteration -# while debugging package list with docker build. 
-ARG clang_tools -ARG llvm -RUN latest_system_llvm=10 && \ - if [ ${llvm} -gt ${latest_system_llvm} -o \ - ${clang_tools} -gt ${latest_system_llvm} ]; then \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - apt-transport-https \ - ca-certificates \ - gnupg \ - lsb-release \ - wget && \ - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - code_name=$(lsb_release --codename --short) && \ - if [ ${llvm} -gt 10 ]; then \ - echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${llvm} main" > \ - /etc/apt/sources.list.d/llvm.list; \ - fi && \ - if [ ${clang_tools} -ne ${llvm} -a \ - ${clang_tools} -gt ${latest_system_llvm} ]; then \ - echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${clang_tools} main" > \ - /etc/apt/sources.list.d/clang-tools.list; \ - fi; \ - fi && \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - clang-${clang_tools} \ - clang-${llvm} \ - clang-format-${clang_tools} \ - clang-tidy-${clang_tools} \ - llvm-${llvm}-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists* - -# Installs C++ toolchain and dependencies -RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - autoconf \ - ca-certificates \ - ccache \ - cmake \ - curl \ - g++ \ - gcc \ - gdb \ - git \ - libbenchmark-dev \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libc-ares-dev \ - libcurl4-openssl-dev \ - libgflags-dev \ - libgoogle-glog-dev \ - libidn2-dev \ - libkrb5-dev \ - libldap-dev \ - liblz4-dev \ - libnghttp2-dev \ - libprotobuf-dev \ - libprotoc-dev \ - libpsl-dev \ - libradospp-dev \ - libre2-dev \ - librtmp-dev \ - libsnappy-dev \ - libssh-dev \ - libssh2-1-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libxml2-dev \ - libzstd-dev \ - lld \ - make \ - ninja-build \ - nlohmann-json3-dev \ - npm \ - patch \ - pkg-config \ - protobuf-compiler \ - python3-dev \ - python3-pip \ - python3-rados \ - python3-venv \ - rados-objclass-dev \ - rapidjson-dev \ - rsync \ - tzdata \ - wget && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists* - -COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh latest /usr/local - -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - -COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_azurite.sh - -COPY ci/scripts/install_ceph.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_ceph.sh - -COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin - -# Prioritize system packages and local installation. -# -# The following dependencies will be downloaded due to missing/invalid packages -# provided by the distribution: -# - Abseil is not packaged -# - libc-ares-dev does not install CMake config files -# - flatbuffer is not packaged -# - libgtest-dev only provide sources -# - libprotobuf-dev only provide sources -# - opentelemetry-cpp-dev is not packaged -# -# ARROW-17051: this build uses static Protobuf, so we must also use -# static Arrow to run Flight/Flight SQL tests. 
-ENV absl_SOURCE=BUNDLED \ - ARROW_ACERO=ON \ - ARROW_AZURE=OFF \ - ARROW_BUILD_STATIC=ON \ - ARROW_BUILD_TESTS=ON \ - ARROW_DEPENDENCY_SOURCE=SYSTEM \ - ARROW_DATASET=ON \ - ARROW_FLIGHT=OFF \ - ARROW_GANDIVA=ON \ - ARROW_GCS=ON \ - ARROW_HDFS=ON \ - ARROW_HOME=/usr/local \ - ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_JEMALLOC=ON \ - ARROW_ORC=ON \ - ARROW_PARQUET=ON \ - ARROW_S3=ON \ - ARROW_SUBSTRAIT=ON \ - ARROW_USE_ASAN=OFF \ - ARROW_USE_CCACHE=ON \ - ARROW_USE_LLD=ON \ - ARROW_USE_UBSAN=OFF \ - ARROW_WITH_BROTLI=ON \ - ARROW_WITH_BZ2=ON \ - ARROW_WITH_LZ4=ON \ - ARROW_WITH_OPENTELEMETRY=ON \ - ARROW_WITH_SNAPPY=ON \ - ARROW_WITH_ZLIB=ON \ - ARROW_WITH_ZSTD=ON \ - ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-${llvm}/bin/llvm-symbolizer \ - AWSSDK_SOURCE=BUNDLED \ - Azure_SOURCE=BUNDLED \ - google_cloud_cpp_storage_SOURCE=BUNDLED \ - gRPC_SOURCE=BUNDLED \ - GTest_SOURCE=BUNDLED \ - opentelemetry_cpp_SOURCE=BUNDLED \ - ORC_SOURCE=BUNDLED \ - PARQUET_BUILD_EXAMPLES=ON \ - PARQUET_BUILD_EXECUTABLES=ON \ - Protobuf_SOURCE=BUNDLED \ - PATH=/usr/lib/ccache/:$PATH \ - PYTHON=python3 \ - xsimd_SOURCE=BUNDLED diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index ce31c457e909e..2a90a5637d4df 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -68,6 +68,10 @@ RUN latest_system_llvm=14 && \ apt-get clean && \ rm -rf /var/lib/apt/lists* +ARG cmake +COPY ci/scripts/install_cmake.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_cmake.sh ${cmake} /usr/local/ + COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 721b37dcae842..8235e72c4ef15 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -41,7 +41,7 @@ RUN latest_system_llvm=14 && \ wget && \ wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ code_name=$(lsb_release --codename --short) && \ - if [ ${llvm} -gt 10 ]; then \ + if [ ${llvm} -gt ${latest_system_llvm} ]; then \ echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${llvm} main" > \ /etc/apt/sources.list.d/llvm.list; \ fi && \ @@ -68,10 +68,6 @@ RUN apt-get update -y -q && \ bzip2 \ ca-certificates \ ccache \ - ceph \ - ceph-fuse \ - ceph-mds \ - cmake \ curl \ gdb \ git \ @@ -168,6 +164,10 @@ RUN if [ "${gcc}" = "" ]; then \ # make sure zlib is cached in the EMSDK folder RUN source ~/emsdk/emsdk_env.sh && embuilder --pic build zlib +ARG cmake +COPY ci/scripts/install_cmake.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_cmake.sh ${cmake} /usr/local/ + COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local @@ -177,6 +177,9 @@ RUN /arrow/ci/scripts/install_gcs_testbench.sh default COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_azurite.sh +COPY ci/scripts/install_ceph.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_ceph.sh + COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin diff --git a/ci/docker/ubuntu-22.04-verify-rc.dockerfile b/ci/docker/ubuntu-22.04-verify-rc.dockerfile index 8bc6f39b67a09..b9f130d24ea94 100644 --- a/ci/docker/ubuntu-22.04-verify-rc.dockerfile +++ b/ci/docker/ubuntu-22.04-verify-rc.dockerfile @@ -24,3 +24,7 @@ RUN /setup-ubuntu.sh && \ rm 
/setup-ubuntu.sh && \ apt-get clean && \ rm -rf /var/lib/apt/lists* + +ARG cmake +COPY ci/scripts/install_cmake.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_cmake.sh ${cmake} /usr/local/ diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 592a9a6a232e5..0347d452d7bfc 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -68,9 +68,6 @@ RUN apt-get update -y -q && \ autoconf \ ca-certificates \ ccache \ - ceph \ - ceph-fuse \ - ceph-mds \ cmake \ curl \ gdb \ @@ -165,6 +162,9 @@ RUN /arrow/ci/scripts/install_gcs_testbench.sh default COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_azurite.sh +COPY ci/scripts/install_ceph.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_ceph.sh + COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin diff --git a/ci/rtools/README.md b/ci/rtools/README.md new file mode 100644 index 0000000000000..08b5ea7f5136e --- /dev/null +++ b/ci/rtools/README.md @@ -0,0 +1,35 @@ + + +# rtools40 patches for AWS SDK and related libs + +The patches in this directory exist solely for building Arrow C++ +under [Rtools40](https://cran.r-project.org/bin/windows/Rtools/rtools40.html) +and are not used elsewhere. Once we've dropped support for Rtools40, we can +consider removing these patches. + +These patches are needed because Rtools provides its own +packages, and its versions of the AWS libraries aren't compatible with CMake +3.25. Our solution was to bundle the AWS libs instead, and these patches were +required to get them building under the Rtools40 environment. + +The patches were added while upgrading the minimum required CMake version to +3.25 in [GH-44950](https://github.com/apache/arrow/issues/44950). Please see the +associated PR, [GH-44989](https://github.com/apache/arrow/pull/44989), for more +context. diff --git a/ci/scripts/install_substrait_consumer.sh b/ci/rtools/aws_c_common_ep.patch old mode 100755 new mode 100644 similarity index 56% rename from ci/scripts/install_substrait_consumer.sh rename to ci/rtools/aws_c_common_ep.patch index 2e6d299f68bf2..94c84d0fe1b5a --- a/ci/scripts/install_substrait_consumer.sh +++ b/ci/rtools/aws_c_common_ep.patch @@ -1,5 +1,3 @@ -#!/usr/bin/env bash -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,20 +15,25 @@ # specific language governing permissions and limitations # under the License.
-set -e - -echo "Install Substrait Consumer Test Suite"; +diff --git a/include/aws/common/byte_order.inl b/include/aws/common/byte_order.inl +index 1204be0..0abd9cb 100644 +--- a/include/aws/common/byte_order.inl ++++ b/include/aws/common/byte_order.inl +@@ -13,7 +13,7 @@ + # include + #else + # include +-#endif /* _MSC_VER */ ++#endif /* _WIN32 */ -git clone https://github.com/substrait-io/consumer-testing.git -cd consumer-testing -# avoid installing pyarrow -grep -v 'pyarrow\|arrow-nightlies' requirements.txt > requirements-no-arrow.txt -pip install -r requirements-no-arrow.txt + AWS_EXTERN_C_BEGIN -pip install -r requirements-build.txt -# setup substrait-java -git submodule init -git submodule update --init -./build-and-copy-isthmus-shadow-jar.sh -# install substrait_consumer library -python setup.py install +@@ -39,7 +39,7 @@ AWS_STATIC_IMPL uint64_t aws_hton64(uint64_t x) { + uint64_t v; + __asm__("bswap %q0" : "=r"(v) : "0"(x)); + return v; +-#elif defined(_MSC_VER) ++#elif defined(_WIN32) + return _byteswap_uint64(x); + #else + uint32_t low = x & UINT32_MAX; diff --git a/ci/rtools/aws_c_io_ep.patch b/ci/rtools/aws_c_io_ep.patch new file mode 100644 index 0000000000000..a15d706ba1238 --- /dev/null +++ b/ci/rtools/aws_c_io_ep.patch @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +diff --git a/source/windows/secure_channel_tls_handler.c b/source/windows/secure_channel_tls_handler.c +index 50caf02..29fe850 100644 +--- a/source/windows/secure_channel_tls_handler.c ++++ b/source/windows/secure_channel_tls_handler.c +@@ -35,6 +36,25 @@ + # pragma warning(disable : 4306) /* Identifier is type cast to a larger pointer. 
*/ + #endif + ++#ifndef SP_PROT_TLS1_0_SERVER ++#define SP_PROT_TLS1_0_SERVER SP_PROT_TLS1_SERVER ++#endif ++#ifndef SP_PROT_TLS1_0_CLIENT ++#define SP_PROT_TLS1_0_CLIENT SP_PROT_TLS1_CLIENT ++#endif ++#ifndef SP_PROT_TLS1_1_SERVER ++#define SP_PROT_TLS1_1_SERVER 0x00000100 ++#endif ++#ifndef SP_PROT_TLS1_1_CLIENT ++#define SP_PROT_TLS1_1_CLIENT 0x00000200 ++#endif ++#ifndef SCH_USE_STRONG_CRYPTO ++#define SCH_USE_STRONG_CRYPTO 0x00400000 ++#endif ++#ifndef SECBUFFER_ALERT ++#define SECBUFFER_ALERT 0x11 ++#endif ++ + #define KB_1 1024 + #define READ_OUT_SIZE (16 * KB_1) + #define READ_IN_SIZE READ_OUT_SIZE +@@ -456,7 +476,7 @@ static int s_fillin_alpn_data( + + *extension_length += sizeof(uint32_t) + sizeof(uint16_t); + +- *extension_name = SecApplicationProtocolNegotiationExt_ALPN; ++ *extension_name = 2; + /*now add the protocols*/ + for (size_t i = 0; i < protocols_count; ++i) { + struct aws_byte_cursor *protocol_ptr = NULL; diff --git a/ci/rtools/awssdk_ep.patch b/ci/rtools/awssdk_ep.patch new file mode 100644 index 0000000000000..bd26f85329090 --- /dev/null +++ b/ci/rtools/awssdk_ep.patch @@ -0,0 +1,181 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +diff --git a/aws-cpp-sdk-core/include/aws/core/utils/Array.h b/aws-cpp-sdk-core/include/aws/core/utils/Array.h +index 2b5bbc566..7cb93bdf0 100644 +--- a/aws-cpp-sdk-core/include/aws/core/utils/Array.h ++++ b/aws-cpp-sdk-core/include/aws/core/utils/Array.h +@@ -54,7 +54,7 @@ namespace Aws + { + m_data.reset(Aws::NewArray(m_size, ARRAY_ALLOCATION_TAG)); + +-#ifdef _WIN32 ++#ifdef _MSC_VER + std::copy(arrayToCopy, arrayToCopy + arraySize, stdext::checked_array_iterator< T * >(m_data.get(), m_size)); + #else + std::copy(arrayToCopy, arrayToCopy + arraySize, m_data.get()); +@@ -82,7 +82,7 @@ namespace Aws + if(arr->m_size > 0 && arr->m_data) + { + size_t arraySize = arr->m_size; +-#ifdef _WIN32 ++#ifdef _MSC_VER + std::copy(arr->m_data.get(), arr->m_data.get() + arraySize, stdext::checked_array_iterator< T * >(m_data.get() + location, m_size)); + #else + std::copy(arr->m_data.get(), arr->m_data.get() + arraySize, m_data.get() + location); +@@ -101,7 +101,7 @@ namespace Aws + { + m_data.reset(Aws::NewArray(m_size, ARRAY_ALLOCATION_TAG)); + +-#ifdef _WIN32 ++#ifdef _MSC_VER + std::copy(other.m_data.get(), other.m_data.get() + other.m_size, stdext::checked_array_iterator< T * >(m_data.get(), m_size)); + #else + std::copy(other.m_data.get(), other.m_data.get() + other.m_size, m_data.get()); +@@ -134,7 +134,7 @@ namespace Aws + { + m_data.reset(Aws::NewArray(m_size, ARRAY_ALLOCATION_TAG)); + +-#ifdef _WIN32 ++#ifdef _MSC_VER + std::copy(other.m_data.get(), other.m_data.get() + other.m_size, stdext::checked_array_iterator< T * >(m_data.get(), m_size)); + #else + std::copy(other.m_data.get(), other.m_data.get() + other.m_size, m_data.get()); +diff --git a/aws-cpp-sdk-core/source/http/windows/WinHttpSyncHttpClient.cpp b/aws-cpp-sdk-core/source/http/windows/WinHttpSyncHttpClient.cpp +index 4dade6489..a0456cf8e 100644 +--- a/aws-cpp-sdk-core/source/http/windows/WinHttpSyncHttpClient.cpp ++++ b/aws-cpp-sdk-core/source/http/windows/WinHttpSyncHttpClient.cpp +@@ -22,6 +22,16 @@ + #include + #include + ++#ifndef WINHTTP_OPTION_WEB_SOCKET_KEEPALIVE_INTERVAL ++#define WINHTTP_OPTION_WEB_SOCKET_KEEPALIVE_INTERVAL 116 ++#endif ++#ifndef WINHTTP_FLAG_SECURE_PROTOCOL_TLS1_1 ++#define WINHTTP_FLAG_SECURE_PROTOCOL_TLS1_1 0x00000200 ++#endif ++#ifndef WINHTTP_FLAG_SECURE_PROTOCOL_TLS1_2 ++#define WINHTTP_FLAG_SECURE_PROTOCOL_TLS1_2 0x00000800 ++#endif ++ + using namespace Aws::Client; + using namespace Aws::Http; + using namespace Aws::Http::Standard; +@@ -272,7 +282,7 @@ bool WinHttpSyncHttpClient::DoQueryHeaders(void* hHttpRequest, std::shared_ptr(dwSize / sizeof(wchar_t))); + + WinHttpQueryHeaders(hHttpRequest, WINHTTP_QUERY_CONTENT_TYPE, nullptr, &contentTypeStr, &dwSize, 0); +- if (contentTypeStr[0] != NULL) ++ if (contentTypeStr[0]) + { + Aws::String contentStr = StringUtils::FromWString(contentTypeStr); + response->SetContentType(contentStr); +@@ -303,7 +313,7 @@ bool WinHttpSyncHttpClient::DoQueryHeaders(void* hHttpRequest, std::shared_ptrSetContentType(contentTypeStr); + AWS_LOGSTREAM_DEBUG(GetLogTag(), "Received content type " << contentTypeStr); +diff --git a/aws-cpp-sdk-core/source/http/windows/WinSyncHttpClient.cpp b/aws-cpp-sdk-core/source/http/windows/WinSyncHttpClient.cpp +index d7513cc3c..e390a8d4e 100644 +--- a/aws-cpp-sdk-core/source/http/windows/WinSyncHttpClient.cpp ++++ b/aws-cpp-sdk-core/source/http/windows/WinSyncHttpClient.cpp +@@ -349,7 +349,7 @@ std::shared_ptr WinSyncHttpClient::MakeRequest(const std::shared_p + } + } + +- if (!success && !IsRequestProcessingEnabled() || 
!ContinueRequest(*request)) ++ if ((!success && !IsRequestProcessingEnabled()) || !ContinueRequest(*request)) + { + response->SetClientErrorType(CoreErrors::USER_CANCELLED); + response->SetClientErrorMessage("Request processing disabled or continuation cancelled by user's continuation handler."); +diff --git a/aws-cpp-sdk-core/source/platform/windows/FileSystem.cpp b/aws-cpp-sdk-core/source/platform/windows/FileSystem.cpp +index 2ea82de6f..bc423441e 100644 +--- a/aws-cpp-sdk-core/source/platform/windows/FileSystem.cpp ++++ b/aws-cpp-sdk-core/source/platform/windows/FileSystem.cpp +@@ -11,7 +11,9 @@ + #include + #include + ++#ifdef _MSC_VER + #pragma warning( disable : 4996) ++#endif + + using namespace Aws::Utils; + namespace Aws +@@ -304,6 +306,9 @@ Aws::String CreateTempFilePath() + { + #ifdef _MSC_VER + #pragma warning(disable: 4996) // _CRT_SECURE_NO_WARNINGS ++#elif !defined(L_tmpnam_s) ++ // Definition from the MSVC stdio.h ++ #define L_tmpnam_s (sizeof("\\") + 16) + #endif + char s_tempName[L_tmpnam_s+1]; + +diff --git a/aws-cpp-sdk-core/source/platform/windows/OSVersionInfo.cpp b/aws-cpp-sdk-core/source/platform/windows/OSVersionInfo.cpp +index 0180f7fbf..3adbab313 100644 +--- a/aws-cpp-sdk-core/source/platform/windows/OSVersionInfo.cpp ++++ b/aws-cpp-sdk-core/source/platform/windows/OSVersionInfo.cpp +@@ -9,7 +9,9 @@ + + #include + ++#ifdef _MSC_VER + #pragma warning(disable: 4996) ++#endif + #include + #include + namespace Aws +diff --git a/aws-cpp-sdk-core/source/utils/crypto/factory/Factories.cpp b/aws-cpp-sdk-core/source/utils/crypto/factory/Factories.cpp +index 2ee517b48..3b0dce665 100644 +--- a/aws-cpp-sdk-core/source/utils/crypto/factory/Factories.cpp ++++ b/aws-cpp-sdk-core/source/utils/crypto/factory/Factories.cpp +@@ -939,7 +939,7 @@ std::shared_ptr Aws::Utils::Crypto::CreateSha256HMACIm + return GetSha256HMACFactory()->CreateImplementation(); + } + +-#ifdef _WIN32 ++#ifdef _MSC_VER + #pragma warning( push ) + #pragma warning( disable : 4702 ) + #endif +@@ -1032,7 +1032,7 @@ std::shared_ptr Aws::Utils::Crypto::CreateAES_KeyWrapImplementa + return GetAES_KeyWrapFactory()->CreateImplementation(key); + } + +-#ifdef _WIN32 ++#ifdef _MSC_VER + #pragma warning(pop) + #endif diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index efeed954006c1..9eac3ef5cb9f2 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,25 +18,22 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=19.0.0.9000 +pkgver=19.0.1.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") url="https://arrow.apache.org/" license=("Apache-2.0") -depends=("${MINGW_PACKAGE_PREFIX}-aws-sdk-cpp" - "${MINGW_PACKAGE_PREFIX}-bzip2" +depends=("${MINGW_PACKAGE_PREFIX}-bzip2" "${MINGW_PACKAGE_PREFIX}-curl" # for google-cloud-cpp bundled build "${MINGW_PACKAGE_PREFIX}-libutf8proc" "${MINGW_PACKAGE_PREFIX}-re2" - "${MINGW_PACKAGE_PREFIX}-thrift" "${MINGW_PACKAGE_PREFIX}-snappy" "${MINGW_PACKAGE_PREFIX}-zlib" "${MINGW_PACKAGE_PREFIX}-lz4" "${MINGW_PACKAGE_PREFIX}-zstd" "${MINGW_PACKAGE_PREFIX}-brotli") makedepends=("${MINGW_PACKAGE_PREFIX}-ccache" - "${MINGW_PACKAGE_PREFIX}-cmake" "${MINGW_PACKAGE_PREFIX}-gcc") options=("staticlibs" "strip" "!buildflags") @@ -82,8 +79,31 @@ build() { # CMAKE_UNITY_BUILD is set to OFF as otherwise some compute functionality # segfaults in tests + # We use the bundled AWS SDK instead of the MINGW one because the upstream + # one on rtools packages is 
unmaintained, uses an old version (1.7.365) + # and does not work with newer versions of CMake. See comments: + # https://github.com/apache/arrow/pull/44989/files#r1901428998 + + # We use the bundled Apache Thrift instead of the MINGW one because + # the upstream one on rtools packages is unmaintained. Apache Thrift + # still has the following problem: + # + # https://github.com/apache/thrift/pull/2725 + # + # The original MSYS2 package has another fix: + # + # https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-thrift/002-fix-pkgconfig-paths.patch + # + # But the one in the rtools packages doesn't have the fix, so we can't use + # the MINGW one. + + # MSYS2_ARG_CONV_EXCL is needed to prevent autoconverting CMAKE_INSTALL_PREFIX + # to Windows paths. See https://www.msys2.org/docs/filesystem-paths/#process-arguments + + # We require the full path to the CMake executable in order to build Arrow; + # it is in the Program Files directory: "/c/Program Files/CMake/bin/cmake" MSYS2_ARG_CONV_EXCL="-DCMAKE_INSTALL_PREFIX=" \ - ${MINGW_PREFIX}/bin/cmake.exe \ + "${PROGRAMFILES}\CMake\bin\cmake.exe" \ ${ARROW_CPP_DIR} \ -G "MSYS Makefiles" \ -DARROW_ACERO=ON \ @@ -116,10 +136,12 @@ build() { -DARROW_WITH_BZ2=ON \ -DARROW_ZSTD_USE_SHARED=OFF \ -DARROW_CXXFLAGS="${CPPFLAGS}" \ + -DAWSSDK_SOURCE=BUNDLED \ -DCMAKE_BUILD_TYPE="release" \ -DCMAKE_INSTALL_PREFIX=${MINGW_PREFIX} \ -DCMAKE_UNITY_BUILD=OFF \ - -DCMAKE_VERBOSE_MAKEFILE=ON + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DThrift_SOURCE=BUNDLED make -j3 popd diff --git a/ci/scripts/conan_build.sh b/ci/scripts/conan_build.sh index 0ea3fc29192dd..03e5cab8426c6 100755 --- a/ci/scripts/conan_build.sh +++ b/ci/scripts/conan_build.sh @@ -25,7 +25,6 @@ build_dir=${1} shift export ARROW_HOME=${source_dir} -export CONAN_HOOK_ERROR_LEVEL=40 conan_args=() conan_args+=(--build=missing) @@ -67,6 +66,7 @@ fi version=$(grep '^set(ARROW_VERSION ' ${ARROW_HOME}/cpp/CMakeLists.txt | \ grep -E -o '([0-9.]*)') +conan_args+=(--version ${version}) rm -rf ~/.conan/data/arrow/ rm -rf ${build_dir}/conan || sudo rm -rf ${build_dir}/conan @@ -78,4 +78,4 @@ else sudo chown -R $(id -u):$(id -g) ${build_dir}/conan/ fi cd ${build_dir}/conan/all -conan create . arrow/${version}@ "${conan_args[@]}" "$@" +conan create . "${conan_args[@]}" "$@" diff --git a/ci/scripts/conan_setup.sh b/ci/scripts/conan_setup.sh index bc56ee296a234..d665ce5436b2b 100755 --- a/ci/scripts/conan_setup.sh +++ b/ci/scripts/conan_setup.sh @@ -19,5 +19,4 @@ set -eux -conan config install https://github.com/conan-io/hooks.git -sf hooks -tf hooks -conan config set hooks.conan-center +conan profile detect diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index c1e7adf6a05e0..9611f94d52209 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -109,7 +109,13 @@ if [ "${ARROW_OFFLINE}" = "ON" ]; then echo > /etc/resolv.conf fi -if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then +if [ "${ARROW_USE_MESON:-OFF}" = "ON" ]; then + meson setup \ + --prefix=${MESON_PREFIX:-${ARROW_HOME}} \ + --buildtype=${ARROW_BUILD_TYPE:-debug} \ + .
\ + ${source_dir} +elif [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then if [ "${UBUNTU}" = "20.04" ]; then echo "arrow emscripten build is not supported on Ubuntu 20.04, run with UBUNTU=22.04" exit -1 @@ -141,7 +147,6 @@ else -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \ -DARROW_BUILD_EXAMPLES=${ARROW_BUILD_EXAMPLES:-OFF} \ -DARROW_BUILD_INTEGRATION=${ARROW_BUILD_INTEGRATION:-OFF} \ - -DARROW_BUILD_OPENMP_BENCHMARKS=${ARROW_BUILD_OPENMP_BENCHMARKS:-OFF} \ -DARROW_BUILD_SHARED=${ARROW_BUILD_SHARED:-ON} \ -DARROW_BUILD_STATIC=${ARROW_BUILD_STATIC:-ON} \ -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-OFF} \ @@ -220,6 +225,7 @@ else -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ + -DCUDAToolkit_ROOT=${CUDAToolkit_ROOT:-} \ -Dgflags_SOURCE=${gflags_SOURCE:-} \ -Dgoogle_cloud_cpp_storage_SOURCE=${google_cloud_cpp_storage_SOURCE:-} \ -DgRPC_SOURCE=${gRPC_SOURCE:-} \ @@ -243,8 +249,12 @@ else ${source_dir} fi -export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:-$[${n_jobs} + 1]} -time cmake --build . --target install +if [ "${ARROW_USE_MESON:-OFF}" = "ON" ]; then + time meson install +else + export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:-$[${n_jobs} + 1]} + time cmake --build . --target install +fi # Save disk space by removing large temporary build products find . -name "*.o" -delete diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 7912bf23e491c..36e09e8936f60 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -89,7 +89,13 @@ pushd ${build_dir} if [ -z "${PYTHON}" ] && ! which python > /dev/null 2>&1; then export PYTHON="${PYTHON:-python3}" fi -ctest \ +if [ "${ARROW_USE_MESON:-OFF}" = "ON" ]; then + ARROW_BUILD_EXAMPLES=OFF # TODO: Remove this + meson test \ + --print-errorlogs \ + "$@" +else + ctest \ --label-regex unittest \ --output-on-failure \ --parallel ${n_jobs} \ @@ -97,6 +103,7 @@ ctest \ --timeout ${ARROW_CTEST_TIMEOUT:-300} \ "${ctest_options[@]}" \ "$@" +fi if [ "${ARROW_BUILD_EXAMPLES}" == "ON" ]; then examples=$(find ${binary_output_dir} -executable -name "*example") diff --git a/ci/scripts/install_cmake.sh b/ci/scripts/install_cmake.sh index 7fdb06d90f02c..d01a7a744dca8 100755 --- a/ci/scripts/install_cmake.sh +++ b/ci/scripts/install_cmake.sh @@ -17,29 +17,44 @@ # specific language governing permissions and limitations # under the License. 
-set -e +set -ex + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi declare -A archs -archs=([amd64]=x86_64 - [arch64]=aarch64 +archs=([x86_64]=x86_64 [arm64]=aarch64 - [arm64v8]=aarch64 - [x86_64]=x86_64) - -declare -A platforms -platforms=([linux]=linux - [macos]=macos - [windows]=windows) + [aarch64]=aarch64) -if [ "$#" -ne 4 ]; then - echo "Usage: $0 " - exit 1 +arch=$(uname -m) +if [ -z ${archs[$arch]} ]; then + echo "Unsupported architecture: ${arch}" + exit 0 fi +arch=${archs[$arch]} + +version=$1 +prefix=$2 -arch=${archs[$1]} -platform=${platforms[$2]} -version=$3 -prefix=$4 +platform=$(uname) +case ${platform} in + Linux) + platform=linux + ;; + Darwin) + platform=macos + ;; + MSYS_NT*|MINGW64_NT*) + platform=windows + ;; + *) + echo "Unsupported platform: ${platform}" + exit 0 + ;; +esac mkdir -p ${prefix} url="https://github.com/Kitware/CMake/releases/download/v${version}/cmake-${version}-${platform}-" diff --git a/ci/scripts/install_python.sh b/ci/scripts/install_python.sh index 0f8a0804691e7..a84d136c0c2c7 100755 --- a/ci/scripts/install_python.sh +++ b/ci/scripts/install_python.sh @@ -28,9 +28,9 @@ declare -A versions versions=([3.9]=3.9.13 [3.10]=3.10.11 [3.11]=3.11.9 - [3.12]=3.12.5 - [3.13]=3.13.0 - [3.13t]=3.13.0) + [3.12]=3.12.9 + [3.13]=3.13.2 + [3.13t]=3.13.2) if [ "$#" -ne 2 ]; then echo "Usage: $0 " @@ -47,17 +47,11 @@ full_version=${versions[$2]} if [ $platform = "macOS" ]; then echo "Downloading Python installer..." - if [ "$version" = "3.13" ] || [ "$version" = "3.13t" ]; + if [ "$(uname -m)" = "x86_64" ] && [ "$version" = "3.9" ]; then - fname="python-${full_version}rc2-macos11.pkg" - elif [ "$(uname -m)" = "arm64" ] || \ - [ "$version" = "3.10" ] || \ - [ "$version" = "3.11" ] || \ - [ "$version" = "3.12" ]; - then - fname="python-${full_version}-macos11.pkg" - else fname="python-${full_version}-macosx10.9.pkg" + else + fname="python-${full_version}-macos11.pkg" fi wget "https://www.python.org/ftp/python/${full_version}/${fname}" diff --git a/ci/scripts/integration_substrait.sh b/ci/scripts/integration_substrait.sh deleted file mode 100755 index 152a8d9440187..0000000000000 --- a/ci/scripts/integration_substrait.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -e - -# check that optional pyarrow modules are available -# because pytest would just skip the substrait tests -echo "Substrait Integration Tests" -echo "Validating imports" -python -c "import pyarrow.substrait" -python -c "from substrait_consumer.consumers.acero_consumer import AceroConsumer" - -echo "Executing pytest" -cd consumer-testing -pytest -r s substrait_consumer/tests/functional/extension_functions/test_boolean_functions.py --producer isthmus --consumer acero diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 833d31059c710..fd6d0591661c1 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -24,7 +24,7 @@ py -0p %PYTHON_CMD% -m sysconfig || exit /B 1 -call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat" +call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvars64.bat" @echo on echo "=== (%PYTHON%) Clear output directories and leftovers ===" @@ -56,7 +56,7 @@ set ARROW_WITH_SNAPPY=ON set ARROW_WITH_ZLIB=ON set ARROW_WITH_ZSTD=ON set CMAKE_UNITY_BUILD=ON -set CMAKE_GENERATOR=Visual Studio 16 2019 +set CMAKE_GENERATOR=Visual Studio 17 2022 set CMAKE_PLATFORM=x64 set VCPKG_ROOT=C:\vcpkg set VCPKG_FEATURE_FLAGS=-manifests diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index ffe8b388f93df..a686215b93dad 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -58,12 +58,5 @@ py -0p @REM Validate wheel contents %PYTHON_CMD% C:\arrow\ci\scripts\python_wheel_validate_contents.py --path C:\arrow\python\repaired_wheels || exit /B 1 -@rem Download IANA Timezone Database for ORC C++ -curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz --output tzdata.tar.xz || exit /B -mkdir %USERPROFILE%\Downloads\test\tzdata -arc unarchive tzdata.tar.xz %USERPROFILE%\Downloads\test\tzdata || exit /B -set TZDIR=%USERPROFILE%\Downloads\test\tzdata\usr\share\zoneinfo -dir %TZDIR% - @REM Execute unittest %PYTHON_CMD% -m pytest -r s --pyargs pyarrow || exit /B 1 diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index d5fd78914755e..67142b66dd7fd 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -91,7 +91,9 @@ export TEXMFVAR=/tmp/texmf-var BEFORE=$(ls -alh ~/) SCRIPT="as_cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true') - if (as_cran) { + # generally will be false, but we can override it by setting SKIP_VIGNETTES=true + skip_vignettes <- identical(tolower(Sys.getenv('SKIP_VIGNETTES')), 'true') + if (as_cran && !skip_vignettes) { args <- '--as-cran' build_args <- character() } else { diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index c9395eb243f76..de92addf08371 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -23,13 +23,8 @@ set -ex # Make sure it is absolute and exported export ARROW_HOME="$(cd "${ARROW_HOME}" && pwd)" -# Uncomment L38-41 if you're testing a new rtools dependency that hasn't yet sync'd to CRAN -# curl https://raw.githubusercontent.com/r-windows/rtools-packages/master/pacman.conf > /etc/pacman.conf -# curl -OSsl "http://repo.msys2.org/msys/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz" -# pacman -U --noconfirm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz && rm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz -# pacman --noconfirm -Scc - pacman --noconfirm -Syy + RWINLIB_LIB_DIR="lib" : 
${MINGW_ARCH:="mingw32 mingw64 ucrt64"} @@ -71,7 +66,7 @@ if [ -d mingw64/lib/ ]; then # Move the 64-bit versions of libarrow into the expected location mv mingw64/lib/*.a $DST_DIR/lib/x64 # These are from https://dl.bintray.com/rtools/mingw{32,64}/ - cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*,nghttp2}.a $DST_DIR/lib/x64 + cp $MSYS_LIB_DIR/mingw64/lib/lib{snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,nghttp2}.a $DST_DIR/lib/x64 fi # Same for the 32-bit versions @@ -79,7 +74,7 @@ if [ -d mingw32/lib/ ]; then ls $MSYS_LIB_DIR/mingw32/lib/ mkdir -p $DST_DIR/lib/i386 mv mingw32/lib/*.a $DST_DIR/lib/i386 - cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*,nghttp2}.a $DST_DIR/lib/i386 + cp $MSYS_LIB_DIR/mingw32/lib/lib{snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,nghttp2}.a $DST_DIR/lib/i386 fi # Do the same also for ucrt64 @@ -87,7 +82,7 @@ if [ -d ucrt64/lib/ ]; then ls $MSYS_LIB_DIR/ucrt64/lib/ mkdir -p $DST_DIR/lib/x64-ucrt mv ucrt64/lib/*.a $DST_DIR/lib/x64-ucrt - cp $MSYS_LIB_DIR/ucrt64/lib/lib{thrift,snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*,nghttp2}.a $DST_DIR/lib/x64-ucrt + cp $MSYS_LIB_DIR/ucrt64/lib/lib{snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,nghttp2}.a $DST_DIR/lib/x64-ucrt fi # Create build artifact diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 67fb2a4a3ea76..39b51874b1c0e 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -29,17 +29,16 @@ index a79c72a59..6b7fa6a66 100644 vcpkg_cmake_install(ADD_BIN_TO_PATH) diff --git a/ports/snappy/portfile.cmake b/ports/snappy/portfile.cmake -index 0c7098082..c603c3653 100644 +index 0312b2ae1..fdb576b5f 100644 --- a/ports/snappy/portfile.cmake +++ b/ports/snappy/portfile.cmake -@@ -10,6 +10,7 @@ vcpkg_from_github( - PATCHES +@@ -8,5 +8,6 @@ vcpkg_from_github( fix_clang-cl_build.patch no-werror.patch + pkgconfig.diff + "snappy-disable-bmi.patch" ) - - vcpkg_cmake_configure( + file(COPY "${CURRENT_PORT_DIR}/snappy.pc.in" DESTINATION "${SOURCE_PATH}") diff --git a/ports/snappy/snappy-disable-bmi.patch b/ports/snappy/snappy-disable-bmi.patch new file mode 100644 index 000000000..e839c93a4 diff --git a/ci/vcpkg/vcpkg.json b/ci/vcpkg/vcpkg.json index a45adbb6a751e..5dfe61a0c6062 100644 --- a/ci/vcpkg/vcpkg.json +++ b/ci/vcpkg/vcpkg.json @@ -75,6 +75,7 @@ { "name": "llvm", "default-features": false, + "version>=": "18.1", "features": [ "clang", "default-targets", diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a7d80c2e96c23..f2500b3a72f40 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.25) message(STATUS "Building using CMake version: ${CMAKE_VERSION}") # https://www.cmake.org/cmake/help/latest/policy/CMP0025.html diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 85febbc5c9a7c..114f79271d282 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -41,7 +41,6 @@ "cacheVariables": { "ARROW_BUILD_BENCHMARKS": "ON", "ARROW_BUILD_BENCHMARKS_REFERENCE": "ON", - "ARROW_BUILD_OPENMP_BENCHMARKS": "ON", "ARROW_BUILD_DETAILED_BENCHMARKS": "OFF", "CMAKE_BUILD_TYPE": "RelWithDebInfo" } diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 43e4e7603cfbf..ee6315f8f0f9a 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -243,9 +243,6 @@ takes precedence over ccache if a storage backend is configured" ON) define_option(ARROW_BUILD_BENCHMARKS_REFERENCE "Build the Arrow micro reference benchmarks" OFF) - define_option(ARROW_BUILD_OPENMP_BENCHMARKS - "Build the Arrow benchmarks that rely on OpenMP" OFF) - define_option(ARROW_BUILD_DETAILED_BENCHMARKS "Build benchmarks that do a longer exploration of performance" OFF) diff --git a/cpp/cmake_modules/FindThriftAlt.cmake b/cpp/cmake_modules/FindThriftAlt.cmake index 98a706deb9919..0c5aed8e4e06c 100644 --- a/cpp/cmake_modules/FindThriftAlt.cmake +++ b/cpp/cmake_modules/FindThriftAlt.cmake @@ -32,35 +32,20 @@ if(ThriftAlt_FOUND) return() endif() -# There are some problems in ThriftConfig.cmake provided by MSYS2 and -# conda on Windows: -# -# * https://github.com/conda-forge/thrift-cpp-feedstock/issues/68 -# * https://github.com/msys2/MINGW-packages/issues/6619#issuecomment-649728718 -# -# We can remove the following "if(NOT WIN32)" condition once the -# followings are fixed and a new version that includes these fixes is -# published by MSYS2 and conda: -# -# * https://github.com/apache/thrift/pull/2725 -# * https://github.com/apache/thrift/pull/2726 -# * https://github.com/conda-forge/thrift-cpp-feedstock/issues/68 -if(NOT WIN32) - set(find_package_args "") - if(ThriftAlt_FIND_VERSION) - list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) - endif() - if(ThriftAlt_FIND_QUIETLY) - list(APPEND find_package_args QUIET) - endif() - find_package(Thrift ${find_package_args}) - if(Thrift_FOUND) - set(ThriftAlt_FOUND TRUE) - add_executable(thrift::compiler IMPORTED) - set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION - "${THRIFT_COMPILER}") - return() - endif() +set(find_package_args "") +if(ThriftAlt_FIND_VERSION) + list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) +endif() +if(ThriftAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) +endif() +find_package(Thrift ${find_package_args}) +if(Thrift_FOUND) + set(ThriftAlt_FOUND TRUE) + add_executable(thrift::compiler IMPORTED) + set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION + "${THRIFT_COMPILER}") + return() endif() function(extract_thrift_version) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index cbc96ce397fd6..ef7b0b008f29f 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -386,14 +386,14 @@ if(ARROW_WITH_OPENTELEMETRY) set(ARROW_WITH_PROTOBUF ON) endif() -if(ARROW_THRIFT) - set(ARROW_WITH_ZLIB ON) -endif() - if(ARROW_PARQUET) set(ARROW_WITH_THRIFT ON) endif() +if(ARROW_WITH_THRIFT) + set(ARROW_WITH_ZLIB ON) +endif() + if(ARROW_FLIGHT) 
set(ARROW_WITH_GRPC ON) endif() @@ -1256,13 +1256,19 @@ endif() # - Gandiva has a compile-time (header-only) dependency on Boost, not runtime. # - Tests need Boost at runtime. # - S3FS and Flight benchmarks need Boost at runtime. +# - arrow_testing uses boost::filesystem. So arrow_testing requires +# the Boost library. (boost::filesystem isn't header-only.) But if we +# use arrow_testing as a static library without +# using arrow::util::Process, we don't need boost::filesystem. if(ARROW_BUILD_INTEGRATION OR ARROW_BUILD_TESTS OR (ARROW_FLIGHT AND (ARROW_TESTING OR ARROW_BUILD_BENCHMARKS)) - OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS)) + OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS) + OR (ARROW_TESTING AND ARROW_BUILD_SHARED)) set(ARROW_USE_BOOST TRUE) set(ARROW_BOOST_REQUIRE_LIBRARY TRUE) elseif(ARROW_GANDIVA + OR ARROW_TESTING OR ARROW_WITH_THRIFT OR (NOT ARROW_USE_NATIVE_INT128)) set(ARROW_USE_BOOST TRUE) @@ -1767,9 +1773,10 @@ macro(build_thrift) if(DEFINED BOOST_ROOT) list(APPEND THRIFT_CMAKE_ARGS "-DBOOST_ROOT=${BOOST_ROOT}") endif() - if(DEFINED Boost_INCLUDE_DIR) - list(APPEND THRIFT_CMAKE_ARGS "-DBoost_INCLUDE_DIR=${Boost_INCLUDE_DIR}") - endif() + list(APPEND + THRIFT_CMAKE_ARGS + "-DBoost_INCLUDE_DIR=$" + ) if(DEFINED Boost_NAMESPACE) list(APPEND THRIFT_CMAKE_ARGS "-DBoost_NAMESPACE=${Boost_NAMESPACE}") endif() @@ -4640,6 +4647,10 @@ function(build_orc) set(ZLIB_HOME ${ZLIB_ROOT} CACHE STRING "" FORCE) + # From CMake 3.21 onwards the set(CACHE) command does not remove any normal + # variable of the same name from the current scope. We have to manually remove + # the variable via unset to avoid ORC not finding the ZLIB_LIBRARY. + unset(ZLIB_LIBRARY) set(ZLIB_LIBRARY ZLIB::ZLIB CACHE STRING "" FORCE) @@ -5044,6 +5055,18 @@ macro(build_awssdk) string(APPEND AWS_C_FLAGS " -Wno-deprecated") string(APPEND AWS_CXX_FLAGS " -Wno-deprecated") endif() + # GH-44950: This is required to build under Rtools40 and we may be able to + # remove it if/when we no longer need to build under Rtools40 + if(WIN32 AND NOT MSVC) + string(APPEND + AWS_C_FLAGS + " -D_WIN32_WINNT=0x0601 -D__USE_MINGW_ANSI_STDIO=1 -Wno-error -Wno-error=format= -Wno-error=format-extra-args -Wno-unused-local-typedefs -Wno-unused-variable" + ) + string(APPEND + AWS_CXX_FLAGS + " -D_WIN32_WINNT=0x0601 -D__USE_MINGW_ANSI_STDIO=1 -Wno-error -Wno-error=format= -Wno-error=format-extra-args -Wno-unused-local-typedefs -Wno-unused-variable" + ) + endif() set(AWSSDK_COMMON_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} @@ -5081,6 +5104,28 @@ macro(build_awssdk) endif() list(APPEND AWSSDK_PATCH_COMMAND ${AWSSDK_UNUSED_DIRECTORIES}) + # Patch parts of the AWSSDK EP so it builds cleanly under Rtools40 + if(WIN32 AND NOT MSVC) + find_program(PATCH patch REQUIRED) + # Patch aws_c_common to build under Rtools40 + set(AWS_C_COMMON_PATCH_COMMAND ${PATCH} -p1 -i + ${CMAKE_SOURCE_DIR}/../ci/rtools/aws_c_common_ep.patch) + message(STATUS "aws-c-common patch command: ${AWS_C_COMMON_PATCH_COMMAND}") + # Patch aws_c_io_ep to build under Rtools40 + set(AWS_C_IO_PATCH_COMMAND ${PATCH} -p1 -i + ${CMAKE_SOURCE_DIR}/../ci/rtools/aws_c_io_ep.patch) + message(STATUS "aws-c-io patch command: ${AWS_C_IO_PATCH_COMMAND}") + # Patch awssdk_ep to build under Rtools40 + list(APPEND + AWSSDK_PATCH_COMMAND + && + ${PATCH} + -p1 + -i + ${CMAKE_SOURCE_DIR}/../ci/rtools/awssdk_ep.patch) + message(STATUS "AWS SDK patch command: ${AWSSDK_PATCH_COMMAND}") + endif() + if(UNIX) # on Linux and macOS curl seems to be required find_curl() @@ -5175,6 +5220,7 @@ macro(build_awssdk) ${EP_COMMON_OPTIONS} URL ${AWS_C_COMMON_SOURCE_URL} URL_HASH
"SHA256=${ARROW_AWS_C_COMMON_BUILD_SHA256_CHECKSUM}" + PATCH_COMMAND ${AWS_C_COMMON_PATCH_COMMAND} CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} BUILD_BYPRODUCTS ${AWS_C_COMMON_STATIC_LIBRARY}) add_dependencies(AWS::aws-c-common aws_c_common_ep) @@ -5270,6 +5316,7 @@ macro(build_awssdk) ${EP_COMMON_OPTIONS} URL ${AWS_C_IO_SOURCE_URL} URL_HASH "SHA256=${ARROW_AWS_C_IO_BUILD_SHA256_CHECKSUM}" + PATCH_COMMAND ${AWS_C_IO_PATCH_COMMAND} CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} BUILD_BYPRODUCTS ${AWS_C_IO_STATIC_LIBRARY} DEPENDS ${AWS_C_IO_DEPENDS}) diff --git a/cpp/examples/minimal_build/minimal.dockerfile b/cpp/examples/minimal_build/minimal.dockerfile index 9361fc5e81d4d..8062e9b698437 100644 --- a/cpp/examples/minimal_build/minimal.dockerfile +++ b/cpp/examples/minimal_build/minimal.dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM ubuntu:focal +FROM ubuntu:24.04 ENV DEBIAN_FRONTEND=noninteractive diff --git a/cpp/examples/minimal_build/system_dependency.dockerfile b/cpp/examples/minimal_build/system_dependency.dockerfile index 926fcaf6f4baa..84a16c4902f3a 100644 --- a/cpp/examples/minimal_build/system_dependency.dockerfile +++ b/cpp/examples/minimal_build/system_dependency.dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM ubuntu:focal +FROM ubuntu:24.04 ENV DEBIAN_FRONTEND=noninteractive diff --git a/cpp/examples/parquet/parquet_arrow/CMakeLists.txt b/cpp/examples/parquet/parquet_arrow/CMakeLists.txt index 0480391e3800e..189d17914d678 100644 --- a/cpp/examples/parquet/parquet_arrow/CMakeLists.txt +++ b/cpp/examples/parquet/parquet_arrow/CMakeLists.txt @@ -16,7 +16,7 @@ # under the License. # Require cmake that supports BYPRODUCTS in add_custom_command, ExternalProject_Add [1]. -cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.25) project(parquet_arrow_example) diff --git a/cpp/examples/tutorial_examples/CMakeLists.txt b/cpp/examples/tutorial_examples/CMakeLists.txt index 8788501484c87..a6f8350c41dfe 100644 --- a/cpp/examples/tutorial_examples/CMakeLists.txt +++ b/cpp/examples/tutorial_examples/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.25) project(ArrowTutorialExamples) diff --git a/cpp/examples/tutorial_examples/tutorial.dockerfile b/cpp/examples/tutorial_examples/tutorial.dockerfile index 9361fc5e81d4d..8062e9b698437 100644 --- a/cpp/examples/tutorial_examples/tutorial.dockerfile +++ b/cpp/examples/tutorial_examples/tutorial.dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM ubuntu:focal +FROM ubuntu:24.04 ENV DEBIAN_FRONTEND=noninteractive diff --git a/cpp/meson.build b/cpp/meson.build new file mode 100644 index 0000000000000..9eca739b82522 --- /dev/null +++ b/cpp/meson.build @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +project( + 'arrow', + 'cpp', + 'c', + version: '20.0.0-SNAPSHOT', + license: 'Apache-2.0', + meson_version: '>=1.3.0', + default_options: [ + 'buildtype=release', + 'c_std=c99', + 'warning_level=2', + 'cpp_std=c++17', + ], +) + +project_args = [ + '-Wno-unused-parameter', + '-Wno-array-bounds', + '-Wno-stringop-overflow', + '-Wno-aggressive-loop-optimizations', + '-Wno-nonnull', +] + +c_compiler = meson.get_compiler('c') +c_args = c_compiler.get_supported_arguments(project_args) +add_project_arguments(c_args, language: 'c') + +cpp_compiler = meson.get_compiler('cpp') +cpp_args = cpp_compiler.get_supported_arguments(project_args) +add_project_arguments(cpp_args, language: 'cpp') + +git_id = get_option('git_id') +if git_id == '' + git_id = run_command('git', 'log', '-n1', '--format=%H', check: false).stdout().strip() +endif + +git_description = get_option('git_description') +if git_description == '' + git_description = run_command('git', 'describe', '--tags', check: false).stdout().strip() +endif + +subdir('src/arrow') diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal-arm64/from b/cpp/meson.options similarity index 71% rename from dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal-arm64/from rename to cpp/meson.options index 52ab48b66f223..1391cd361c691 100644 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal-arm64/from +++ b/cpp/meson.options @@ -6,7 +6,7 @@ # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an @@ -15,4 +15,18 @@ # specific language governing permissions and limitations # under the License. 
-arm64v8/ubuntu:focal +option( + 'git_id', + type: 'string', +) + +option( + 'git_description', + type: 'string', +) + +option( + 'package_kind', + type: 'string', + description: 'Arbitrary string that identifies the kind of package (for informational purposes)', +) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index eb9860b240f16..775e3633aa4a3 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -529,6 +529,7 @@ set(ARROW_UTIL_SRCS util/logger.cc util/logging.cc util/key_value_metadata.cc + util/math_internal.cc util/memory.cc util/mutex.cc util/ree_util.cc @@ -752,10 +753,12 @@ if(ARROW_COMPUTE) ARROW_COMPUTE_SRCS compute/kernels/aggregate_basic.cc compute/kernels/aggregate_mode.cc + compute/kernels/aggregate_pivot.cc compute/kernels/aggregate_quantile.cc compute/kernels/aggregate_tdigest.cc compute/kernels/aggregate_var_std.cc compute/kernels/hash_aggregate.cc + compute/kernels/pivot_internal.cc compute/kernels/scalar_arithmetic.cc compute/kernels/scalar_boolean.cc compute/kernels/scalar_compare.cc diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index 54269f1df0eb6..e6aa0560dfa80 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -221,18 +221,7 @@ if(ARROW_BUILD_BENCHMARKS) add_arrow_acero_benchmark(aggregate_benchmark SOURCES aggregate_benchmark.cc) - if(ARROW_BUILD_OPENMP_BENCHMARKS) - find_package(OpenMP REQUIRED) - add_arrow_acero_benchmark(hash_join_benchmark - EXTRA_LINK_LIBS - OpenMP::OpenMP_CXX - SOURCES - hash_join_benchmark.cc) - if(MSVC) - target_compile_options(arrow-compute-hash-join-benchmark - PRIVATE "-openmp:experimental -openmp:llvm") - endif() - endif() + add_arrow_acero_benchmark(hash_join_benchmark SOURCES hash_join_benchmark.cc) if(ARROW_BUILD_STATIC) target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_static) @@ -240,17 +229,13 @@ if(ARROW_BUILD_BENCHMARKS) target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_static) target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_static) target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_static) - if(ARROW_BUILD_OPENMP_BENCHMARKS) - target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_static) - endif() + target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_static) else() target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_shared) target_link_libraries(arrow-acero-filter-benchmark PUBLIC arrow_acero_shared) target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_shared) target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_shared) target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_shared) - if(ARROW_BUILD_OPENMP_BENCHMARKS) - target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_shared) - endif() + target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_shared) endif() endif() diff --git a/cpp/src/arrow/acero/accumulation_queue.h b/cpp/src/arrow/acero/accumulation_queue.h index 92d62d5d99d16..b0e0b85a4f3d0 100644 --- a/cpp/src/arrow/acero/accumulation_queue.h +++ b/cpp/src/arrow/acero/accumulation_queue.h @@ -34,7 +34,7 @@ using arrow::compute::ExecBatch; /// \brief A container that accumulates batches until they are ready to /// be processed. 
-class AccumulationQueue { +class ARROW_ACERO_EXPORT AccumulationQueue { public: AccumulationQueue() : row_count_(0) {} ~AccumulationQueue() = default; diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index 3ab976e671ccf..bfba3b5e61703 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -1103,7 +1103,7 @@ class AsofJoinNode : public ExecNode { void ProcessThread() { for (;;) { - if (!process_.Pop()) { + if (!process_.WaitAndPop()) { EndFromProcessThread(); return; } diff --git a/cpp/src/arrow/acero/concurrent_queue_internal.h b/cpp/src/arrow/acero/concurrent_queue_internal.h index 20ec2089bee41..a751db70262f9 100644 --- a/cpp/src/arrow/acero/concurrent_queue_internal.h +++ b/cpp/src/arrow/acero/concurrent_queue_internal.h @@ -31,40 +31,40 @@ namespace arrow::acero { template class ConcurrentQueue { public: - // Pops the last item from the queue. Must be called on a non-empty queue - // - T Pop() { + // Pops the last item from the queue but waits if the queue is empty until new items are + // pushed. + T WaitAndPop() { std::unique_lock lock(mutex_); - cond_.wait(lock, [&] { return !queue_.empty(); }); + WaitUntilNonEmpty(lock); return PopUnlocked(); } // Pops the last item from the queue, or returns a nullopt if empty - // std::optional TryPop() { std::unique_lock lock(mutex_); return TryPopUnlocked(); } // Pushes an item to the queue - // void Push(const T& item) { std::unique_lock lock(mutex_); return PushUnlocked(item); } // Clears the queue - // void Clear() { std::unique_lock lock(mutex_); ClearUnlocked(); } + // Checks if the queue is empty bool Empty() const { std::unique_lock lock(mutex_); return queue_.empty(); } + // Returns a reference to the next element in the queue. Must be called on a non-empty + // queue const T& Front() const { // Need to lock the queue because `front()` may be implemented in terms // of `begin()`, which isn't safe with concurrent calls to e.g. `push()`. @@ -78,6 +78,10 @@ class ConcurrentQueue { size_t SizeUnlocked() const { return queue_.size(); } + void WaitUntilNonEmpty(std::unique_lock& lock) { + cond_.wait(lock, [&] { return !queue_.empty(); }); + } + T PopUnlocked() { auto item = queue_.front(); queue_.pop(); @@ -130,28 +134,34 @@ class BackpressureConcurrentQueue : public ConcurrentQueue { explicit BackpressureConcurrentQueue(BackpressureHandler handler) : handler_(std::move(handler)) {} - T Pop() { + // Pops the last item from the queue but waits if the queue is empty until new items are + // pushed. 
+ T WaitAndPop() { std::unique_lock lock(ConcurrentQueue::GetMutex()); + ConcurrentQueue::WaitUntilNonEmpty(lock); DoHandle do_handle(*this); return ConcurrentQueue::PopUnlocked(); } - void Push(const T& item) { + // Pops the last item from the queue, or returns a nullopt if empty + std::optional TryPop() { std::unique_lock lock(ConcurrentQueue::GetMutex()); DoHandle do_handle(*this); - ConcurrentQueue::PushUnlocked(item); + return ConcurrentQueue::TryPopUnlocked(); } - void Clear() { + // Pushes an item to the queue + void Push(const T& item) { std::unique_lock lock(ConcurrentQueue::GetMutex()); DoHandle do_handle(*this); - ConcurrentQueue::ClearUnlocked(); + ConcurrentQueue::PushUnlocked(item); } - std::optional TryPop() { + // Clears the queue + void Clear() { std::unique_lock lock(ConcurrentQueue::GetMutex()); DoHandle do_handle(*this); - return ConcurrentQueue::TryPopUnlocked(); + ConcurrentQueue::ClearUnlocked(); } Status ForceShutdown() { return handler_.ForceShutdown(); } diff --git a/cpp/src/arrow/acero/groupby_aggregate_node.cc b/cpp/src/arrow/acero/groupby_aggregate_node.cc index 06b034ab2d459..2beef360b45d4 100644 --- a/cpp/src/arrow/acero/groupby_aggregate_node.cc +++ b/cpp/src/arrow/acero/groupby_aggregate_node.cc @@ -282,6 +282,11 @@ Status GroupByNode::Merge() { DCHECK(state0->agg_states[span_i]); batch_ctx.SetState(state0->agg_states[span_i].get()); + // XXX this resizes each KernelState (state0->agg_states[span_i]) multiple times. + // An alternative would be a two-pass algorithm: + // 1. Compute all transpositions (one per local state) and the final number of + // groups. + // 2. Process all agg kernels, resizing each KernelState only once. RETURN_NOT_OK( agg_kernels_[span_i]->resize(&batch_ctx, state0->grouper->num_groups())); RETURN_NOT_OK(agg_kernels_[span_i]->merge( diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index 7f4b6dd75272f..347bb962691ea 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -30,16 +31,14 @@ #include "arrow/acero/options.h" #include "arrow/acero/test_util_internal.h" #include "arrow/array.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_primitive.h" #include "arrow/array/concatenate.h" #include "arrow/chunked_array.h" #include "arrow/compute/api_aggregate.h" -#include "arrow/compute/api_scalar.h" -#include "arrow/compute/api_vector.h" #include "arrow/compute/cast.h" #include "arrow/compute/exec.h" #include "arrow/compute/exec_internal.h" -#include "arrow/compute/kernels/aggregate_internal.h" -#include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/grouper.h" #include "arrow/table.h" @@ -50,9 +49,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/async_generator.h" -#include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/int_util_overflow.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/string.h" @@ -64,7 +61,6 @@ using testing::HasSubstr; namespace arrow { -using internal::BitmapReader; using internal::checked_cast; using internal::checked_pointer_cast; using internal::ToChars; @@ -77,9 +73,11 @@ using compute::ExecBatchFromJSON; using compute::ExecSpan; using compute::FunctionOptions; using compute::Grouper; +using compute::PivotWiderOptions; using 
compute::RowSegmenter; using compute::ScalarAggregateOptions; using compute::Segment; +using compute::SkewOptions; using compute::SortIndices; using compute::SortKey; using compute::SortOrder; @@ -565,6 +563,7 @@ class GroupBy : public ::testing::TestWithParam { return acero::GroupByTest(GetParam(), arguments, keys, aggregates, use_threads); } + // This is not named GroupByTest to avoid ambiguities between overloads Result AltGroupBy(const std::vector& arguments, const std::vector& keys, const std::vector& segment_keys, @@ -574,6 +573,70 @@ class GroupBy : public ::testing::TestWithParam { /*naive=*/false); } + Result RunPivot(const std::shared_ptr& key_type, + const std::shared_ptr& value_type, + const PivotWiderOptions& options, + const std::shared_ptr& table, bool use_threads = false) { + Aggregate agg{"hash_pivot_wider", std::make_shared(options), + /*target=*/std::vector{"agg_0", "agg_1"}, /*name=*/"out"}; + ARROW_ASSIGN_OR_RAISE( + Datum aggregated_and_grouped, + AltGroupBy({table->GetColumnByName("key"), table->GetColumnByName("value")}, + {table->GetColumnByName("group_key")}, + /*segment_keys=*/{}, {agg}, use_threads)); + ValidateOutput(aggregated_and_grouped); + return aggregated_and_grouped; + } + + Result RunPivot(const std::shared_ptr& key_type, + const std::shared_ptr& value_type, + const PivotWiderOptions& options, + const std::vector& table_json, + bool use_threads = false) { + auto table = + TableFromJSON(schema({field("group_key", int64()), field("key", key_type), + field("value", value_type)}), + table_json); + return RunPivot(key_type, value_type, options, table, use_threads); + } + + void CheckPivoted(const std::shared_ptr& key_type, + const std::shared_ptr& value_type, + const PivotWiderOptions& options, const Datum& pivoted, + const std::string& expected_json) { + FieldVector pivoted_fields; + for (const auto& key_name : options.key_names) { + pivoted_fields.push_back(field(key_name, value_type)); + } + auto expected_type = struct_({ + field("key_0", int64()), + field("out", struct_(std::move(pivoted_fields))), + }); + auto expected = ArrayFromJSON(expected_type, expected_json); + AssertDatumsEqual(expected, pivoted, /*verbose=*/true); + } + + void TestPivot(const std::shared_ptr& key_type, + const std::shared_ptr& value_type, + const PivotWiderOptions& options, + const std::vector& table_json, + const std::string& expected_json, bool use_threads) { + ASSERT_OK_AND_ASSIGN( + auto pivoted, RunPivot(key_type, value_type, options, table_json, use_threads)); + CheckPivoted(key_type, value_type, options, pivoted, expected_json); + } + + void TestPivot(const std::shared_ptr& key_type, + const std::shared_ptr& value_type, + const PivotWiderOptions& options, + const std::vector& table_json, + const std::string& expected_json) { + for (bool use_threads : {false, true}) { + ARROW_SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); + TestPivot(key_type, value_type, options, table_json, expected_json, use_threads); + } + } + void TestSegmentKey(const std::shared_ptr
& table, Datum output, const std::vector& segment_keys) { return acero::TestSegmentKey(GetParam(), table, output, segment_keys); @@ -1018,9 +1081,67 @@ TEST_P(GroupBy, MeanOverflow) { } } -TEST_P(GroupBy, VarianceAndStddev) { +TEST_P(GroupBy, VarianceStddevSkewKurtosis) { + for (auto value_type : {int32(), float64()}) { + ARROW_SCOPED_TRACE("value_type = ", *value_type); + auto batch = RecordBatchFromJSON( + schema({field("argument", value_type), field("key", int64())}), R"([ + [1, 1], + [null, 1], + [0, 2], + [null, 3], + [4, null], + [3, 1], + [0, 2], + [-1, 2], + [1, null], + [null, 3] + ])"); + + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest( + { + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + {}, + { + {"hash_variance", nullptr}, + {"hash_stddev", nullptr}, + {"hash_skew", nullptr}, + {"hash_kurtosis", nullptr}, + }, + false)); + + auto expected = ArrayFromJSON(struct_({ + field("key_0", int64()), + field("hash_variance", float64()), + field("hash_stddev", float64()), + field("hash_skew", float64()), + field("hash_kurtosis", float64()), + }), + R"([ + [1, 1.0, 1.0, 0.0, -2.0], + [2, 0.22222222222222224, 0.4714045207910317, -0.7071067811865478, -1.5], + [3, null, null, null, null], + [null, 2.25, 1.5, 0.0, -2.0] + ])"); + AssertDatumsApproxEqual(expected, aggregated_and_grouped, + /*verbose=*/true); + } +} + +TEST_P(GroupBy, VarianceAndStddevDdof) { + // Test ddof + auto variance_options = std::make_shared(/*ddof=*/2); + auto batch = RecordBatchFromJSON( - schema({field("argument", int32()), field("key", int64())}), R"([ + schema({field("argument", float64()), field("key", int64())}), R"([ [1, 1], [null, 1], [0, 2], @@ -1032,83 +1153,7 @@ TEST_P(GroupBy, VarianceAndStddev) { [1, null], [null, 3] ])"); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - GroupByTest( - { - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - }, - { - batch->GetColumnByName("key"), - }, - {}, - { - {"hash_variance", nullptr}, - {"hash_stddev", nullptr}, - }, - false)); - - AssertDatumsApproxEqual(ArrayFromJSON(struct_({ - field("key_0", int64()), - field("hash_variance", float64()), - field("hash_stddev", float64()), - }), - R"([ - [1, 1.0, 1.0 ], - [2, 0.22222222222222224, 0.4714045207910317], - [3, null, null ], - [null, 2.25, 1.5 ] - ])"), - aggregated_and_grouped, - /*verbose=*/true); - - batch = RecordBatchFromJSON( - schema({field("argument", float64()), field("key", int64())}), R"([ - [1.0, 1], - [null, 1], - [0.0, 2], - [null, 3], - [4.0, null], - [3.0, 1], - [0.0, 2], - [-1.0, 2], - [1.0, null], - [null, 3] - ])"); - - ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, GroupByTest( - { - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - }, - { - batch->GetColumnByName("key"), - }, - {}, - { - {"hash_variance", nullptr}, - {"hash_stddev", nullptr}, - }, - false)); - - AssertDatumsApproxEqual(ArrayFromJSON(struct_({ - field("key_0", int64()), - field("hash_variance", float64()), - field("hash_stddev", float64()), - }), - R"([ - [1, 1.0, 1.0 ], - [2, 0.22222222222222224, 0.4714045207910317], - [3, null, null ], - [null, 2.25, 1.5 ] - ])"), - aggregated_and_grouped, - /*verbose=*/true); - - // Test ddof - auto variance_options = std::make_shared(/*ddof=*/2); - ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, GroupByTest( { batch->GetColumnByName("argument"), @@ -1139,55 +1184,59 @@ 
TEST_P(GroupBy, VarianceAndStddev) { /*verbose=*/true); } -TEST_P(GroupBy, VarianceAndStddevDecimal) { - auto batch = RecordBatchFromJSON( - schema({field("argument0", decimal128(3, 2)), field("argument1", decimal128(3, 2)), - field("key", int64())}), - R"([ - ["1.00", "1.00", 1], - [null, null, 1], - ["0.00", "0.00", 2], - ["4.00", "4.00", null], - ["3.00", "3.00", 1], - ["0.00", "0.00", 2], - ["-1.00", "-1.00", 2], - ["1.00", "1.00", null] - ])"); +TEST_P(GroupBy, VarianceStddevSkewKurtosisDecimal) { + for (auto value_type : + {decimal32(3, 2), decimal64(3, 2), decimal128(3, 2), decimal256(3, 2)}) { + ARROW_SCOPED_TRACE("value_type = ", *value_type); + auto batch = RecordBatchFromJSON( + schema({field("argument", value_type), field("key", int64())}), + R"([ + ["1.00", 1], + [null, 1], + ["0.00", 2], + ["4.00", null], + ["3.00", 1], + ["0.00", 2], + ["-1.00", 2], + ["1.00", null] + ])"); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - GroupByTest( - { - batch->GetColumnByName("argument0"), - batch->GetColumnByName("argument0"), - batch->GetColumnByName("argument1"), - batch->GetColumnByName("argument1"), - }, - { - batch->GetColumnByName("key"), - }, - {}, - { - {"hash_variance", nullptr}, - {"hash_stddev", nullptr}, - {"hash_variance", nullptr}, - {"hash_stddev", nullptr}, - }, - false)); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest( + { + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + {}, + { + {"hash_variance", nullptr}, + {"hash_stddev", nullptr}, + {"hash_skew", nullptr}, + {"hash_kurtosis", nullptr}, + }, + false)); - AssertDatumsApproxEqual(ArrayFromJSON(struct_({ - field("key_0", int64()), - field("hash_variance", float64()), - field("hash_stddev", float64()), - field("hash_variance", float64()), - field("hash_stddev", float64()), - }), - R"([ - [1, 1.0, 1.0, 1.0, 1.0 ], - [2, 0.22222222222222224, 0.4714045207910317, 0.22222222222222224, 0.4714045207910317], - [null, 2.25, 1.5, 2.25, 1.5 ] - ])"), - aggregated_and_grouped, - /*verbose=*/true); + auto expected = ArrayFromJSON(struct_({ + field("key_0", int64()), + field("hash_variance", float64()), + field("hash_stddev", float64()), + field("hash_skew", float64()), + field("hash_kurtosis", float64()), + }), + R"([ + [1, 1.0, 1.0, 0.0, -2.0], + [2, 0.22222222222222224, 0.4714045207910317, -0.7071067811865478, -1.5], + [null, 2.25, 1.5, 0.0, -2.0] + ])"); + + AssertDatumsApproxEqual(expected, aggregated_and_grouped, + /*verbose=*/true); + } } TEST_P(GroupBy, TDigest) { @@ -1425,7 +1474,7 @@ TEST_P(GroupBy, StddevVarianceTDigestScalar) { } } -TEST_P(GroupBy, VarianceOptions) { +TEST_P(GroupBy, VarianceOptionsAndSkewOptions) { BatchesWithSchema input; input.batches = { ExecBatchFromJSON( @@ -1441,81 +1490,93 @@ TEST_P(GroupBy, VarianceOptions) { "[[null, null, 1]]"), ExecBatchFromJSON({int32(), float32(), int64()}, "[[2, 2.0, 1], [3, 3.0, 2]]"), ExecBatchFromJSON({int32(), float32(), int64()}, "[[4, 4.0, 2], [2, 2.0, 4]]"), - ExecBatchFromJSON({int32(), float32(), int64()}, "[[null, null, 4]]"), + ExecBatchFromJSON({int32(), float32(), int64()}, "[[null, null, 4], [6, 6.0, 3]]"), }; input.schema = schema( {field("argument", int32()), field("argument1", float32()), field("key", int64())}); - auto keep_nulls = std::make_shared(/*ddof=*/0, /*skip_nulls=*/false, - /*min_count=*/0); - auto min_count = + auto var_keep_nulls = + std::make_shared(/*ddof=*/0, 
/*skip_nulls=*/false, + /*min_count=*/0); + auto var_min_count = std::make_shared(/*ddof=*/0, /*skip_nulls=*/true, /*min_count=*/3); - auto keep_nulls_min_count = std::make_shared( + auto var_keep_nulls_min_count = std::make_shared( /*ddof=*/0, /*skip_nulls=*/false, /*min_count=*/3); - for (bool use_threads : {false}) { - SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN( - Datum actual, - RunGroupBy( - input, {"key"}, - { - {"hash_stddev", keep_nulls, "argument", "hash_stddev"}, - {"hash_stddev", min_count, "argument", "hash_stddev"}, - {"hash_stddev", keep_nulls_min_count, "argument", "hash_stddev"}, - {"hash_variance", keep_nulls, "argument", "hash_variance"}, - {"hash_variance", min_count, "argument", "hash_variance"}, - {"hash_variance", keep_nulls_min_count, "argument", "hash_variance"}, - }, - use_threads)); - Datum expected = ArrayFromJSON(struct_({ - field("key", int64()), - field("hash_stddev", float64()), - field("hash_stddev", float64()), - field("hash_stddev", float64()), - field("hash_variance", float64()), - field("hash_variance", float64()), - field("hash_variance", float64()), - }), - R"([ - [1, null, 0.471405, null, null, 0.222222, null ], - [2, 1.29904, 1.29904, 1.29904, 1.6875, 1.6875, 1.6875], - [3, 0.0, null, null, 0.0, null, null ], - [4, null, 0.471405, null, null, 0.222222, null ] - ])"); - ValidateOutput(expected); - AssertDatumsApproxEqual(expected, actual, /*verbose=*/true); + auto skew_keep_nulls = std::make_shared(/*skip_nulls=*/false, + /*min_count=*/0); + auto skew_min_count = + std::make_shared(/*skip_nulls=*/true, /*min_count=*/3); + auto skew_keep_nulls_min_count = std::make_shared( + /*skip_nulls=*/false, /*min_count=*/3); - ASSERT_OK_AND_ASSIGN( - actual, - RunGroupBy( - input, {"key"}, - { - {"hash_stddev", keep_nulls, "argument1", "hash_stddev"}, - {"hash_stddev", min_count, "argument1", "hash_stddev"}, - {"hash_stddev", keep_nulls_min_count, "argument1", "hash_stddev"}, - {"hash_variance", keep_nulls, "argument1", "hash_variance"}, - {"hash_variance", min_count, "argument1", "hash_variance"}, - {"hash_variance", keep_nulls_min_count, "argument1", "hash_variance"}, - }, - use_threads)); - expected = ArrayFromJSON(struct_({ - field("key", int64()), - field("hash_stddev", float64()), - field("hash_stddev", float64()), - field("hash_stddev", float64()), - field("hash_variance", float64()), - field("hash_variance", float64()), - field("hash_variance", float64()), - }), - R"([ + for (std::string value_column : {"argument", "argument1"}) { + for (bool use_threads : {false}) { + SCOPED_TRACE(use_threads ? 
"parallel/merged" : "serial"); + ASSERT_OK_AND_ASSIGN( + Datum actual, + RunGroupBy( + input, {"key"}, + { + {"hash_stddev", var_keep_nulls, value_column, "hash_stddev"}, + {"hash_stddev", var_min_count, value_column, "hash_stddev"}, + {"hash_stddev", var_keep_nulls_min_count, value_column, "hash_stddev"}, + {"hash_variance", var_keep_nulls, value_column, "hash_variance"}, + {"hash_variance", var_min_count, value_column, "hash_variance"}, + {"hash_variance", var_keep_nulls_min_count, value_column, + "hash_variance"}, + }, + use_threads)); + Datum expected = ArrayFromJSON(struct_({ + field("key", int64()), + field("hash_stddev", float64()), + field("hash_stddev", float64()), + field("hash_stddev", float64()), + field("hash_variance", float64()), + field("hash_variance", float64()), + field("hash_variance", float64()), + }), + R"([ [1, null, 0.471405, null, null, 0.222222, null ], [2, 1.29904, 1.29904, 1.29904, 1.6875, 1.6875, 1.6875], - [3, 0.0, null, null, 0.0, null, null ], + [3, 2.5, null, null, 6.25, null, null ], [4, null, 0.471405, null, null, 0.222222, null ] - ])"); - ValidateOutput(expected); - AssertDatumsApproxEqual(expected, actual, /*verbose=*/true); + ])"); + ValidateOutput(actual); + AssertDatumsApproxEqual(expected, actual, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN( + actual, + RunGroupBy( + input, {"key"}, + { + {"hash_skew", skew_keep_nulls, value_column, "hash_skew"}, + {"hash_skew", skew_min_count, value_column, "hash_skew"}, + {"hash_skew", skew_keep_nulls_min_count, value_column, "hash_skew"}, + {"hash_kurtosis", skew_keep_nulls, value_column, "hash_kurtosis"}, + {"hash_kurtosis", skew_min_count, value_column, "hash_kurtosis"}, + {"hash_kurtosis", skew_keep_nulls_min_count, value_column, + "hash_kurtosis"}, + }, + use_threads)); + expected = ArrayFromJSON(struct_({ + field("key", int64()), + field("hash_skew", float64()), + field("hash_skew", float64()), + field("hash_skew", float64()), + field("hash_kurtosis", float64()), + field("hash_kurtosis", float64()), + field("hash_kurtosis", float64()), + }), + R"([ + [1, null, 0.707106, null, null, -1.5, null ], + [2, 0.213833, 0.213833, 0.213833, -1.720164, -1.720164, -1.720164], + [3, 0.0, null, null, -2.0, null, null ], + [4, null, 0.707106, null, null, -1.5, null ] + ])"); + ValidateOutput(actual); + AssertDatumsApproxEqual(expected, actual, /*verbose=*/true); + } } } @@ -4345,6 +4406,566 @@ TEST_P(GroupBy, OnlyKeys) { } } +TEST_P(GroupBy, PivotBasics) { + auto key_type = utf8(); + auto value_type = float32(); + std::vector table_json = {R"([ + [1, "width", 10.5], + [2, "width", 11.5] + ])", + R"([ + [2, "height", 12.5] + ])", + R"([ + [3, "width", 13.5], + [1, "height", 14.5] + ])"}; + std::string expected_json = R"([ + [1, {"height": 14.5, "width": 10.5} ], + [2, {"height": 12.5, "width": 11.5} ], + [3, {"height": null, "width": 13.5} ] + ])"; + for (auto unexpected_key_behavior : + {PivotWiderOptions::kIgnore, PivotWiderOptions::kRaise}) { + PivotWiderOptions options(/*key_names=*/{"height", "width"}, unexpected_key_behavior); + TestPivot(key_type, value_type, options, table_json, expected_json); + } +} + +TEST_P(GroupBy, PivotAllKeyTypes) { + auto value_type = float32(); + std::vector table_json = {R"([ + [1, "width", 10.5], + [2, "width", 11.5] + ])", + R"([ + [2, "height", 12.5], + [3, "width", 13.5], + [1, "height", 14.5] + ])"}; + std::string expected_json = R"([ + [1, {"height": 14.5, "width": 10.5} ], + [2, {"height": 12.5, "width": 11.5} ], + [3, {"height": null, "width": 13.5} ] + ])"; + PivotWiderOptions 
options(/*key_names=*/{"height", "width"}); + + for (const auto& key_type : BaseBinaryTypes()) { + ARROW_SCOPED_TRACE("key_type = ", *key_type); + TestPivot(key_type, value_type, options, table_json, expected_json); + } +} + +TEST_P(GroupBy, PivotNumericValues) { + auto key_type = utf8(); + std::vector table_json = {R"([ + [1, "width", 10], + [2, "width", 11] + ])", + R"([ + [2, "height", 12], + [3, "width", 13], + [1, "height", 14] + ])"}; + std::string expected_json = R"([ + [1, {"height": 14, "width": 10} ], + [2, {"height": 12, "width": 11} ], + [3, {"height": null, "width": 13} ] + ])"; + PivotWiderOptions options(/*key_names=*/{"height", "width"}); + + for (const auto& value_type : NumericTypes()) { + ARROW_SCOPED_TRACE("value_type = ", *value_type); + TestPivot(key_type, value_type, options, table_json, expected_json); + } +} + +TEST_P(GroupBy, PivotBinaryLikeValues) { + auto key_type = utf8(); + std::vector table_json = {R"([ + [1, "name", "Bob"], + [2, "eye_color", "brown"] + ])", + R"([ + [2, "name", "Alice"], + [1, "eye_color", "gray"], + [3, "name", "Mallaury"] + ])"}; + std::string expected_json = R"([ + [1, {"name": "Bob", "eye_color": "gray"} ], + [2, {"name": "Alice", "eye_color": "brown"} ], + [3, {"name": "Mallaury", "eye_color": null} ] + ])"; + PivotWiderOptions options(/*key_names=*/{"name", "eye_color"}); + + for (const auto& value_type : BaseBinaryTypes()) { + ARROW_SCOPED_TRACE("value_type = ", *value_type); + TestPivot(key_type, value_type, options, table_json, expected_json); + } +} + +TEST_P(GroupBy, PivotDecimalValues) { + auto key_type = utf8(); + auto value_type = decimal128(9, 1); + std::vector table_json = {R"([ + [1, "width", "10.1"], + [2, "width", "11.1"] + ])", + R"([ + [2, "height", "12.1"], + [3, "width", "13.1"], + [1, "height", "14.1"] + ])"}; + std::string expected_json = R"([ + [1, {"height": "14.1", "width": "10.1"} ], + [2, {"height": "12.1", "width": "11.1"} ], + [3, {"height": null, "width": "13.1"} ] + ])"; + PivotWiderOptions options(/*key_names=*/{"height", "width"}); + TestPivot(key_type, value_type, options, table_json, expected_json); +} + +TEST_P(GroupBy, PivotStructValues) { + auto key_type = utf8(); + auto value_type = struct_({{"value", float32()}}); + std::vector table_json = {R"([ + [1, "width", [10.1]], + [2, "width", [11.1]] + ])", + R"([ + [2, "height", [12.1]], + [3, "width", [13.1]], + [1, "height", [14.1]] + ])"}; + std::string expected_json = R"([ + [1, {"height": [14.1], "width": [10.1]} ], + [2, {"height": [12.1], "width": [11.1]} ], + [3, {"height": null, "width": [13.1]} ] + ])"; + PivotWiderOptions options(/*key_names=*/{"height", "width"}); + TestPivot(key_type, value_type, options, table_json, expected_json); +} + +TEST_P(GroupBy, PivotListValues) { + auto key_type = utf8(); + auto value_type = list(float32()); + std::vector table_json = {R"([ + [1, "foo", [10.5, 11.5]], + [2, "bar", [12.5]] + ])", + R"([ + [2, "foo", []], + [3, "bar", [13.5]], + [1, "foo", null] + ])"}; + std::string expected_json = R"([ + [1, {"foo": [10.5, 11.5], "bar": null} ], + [2, {"foo": [], "bar": [12.5]} ], + [3, {"foo": null, "bar": [13.5]} ] + ])"; + PivotWiderOptions options(/*key_names=*/{"foo", "bar"}); + TestPivot(key_type, value_type, options, table_json, expected_json); +} + +TEST_P(GroupBy, PivotNullValueType) { + auto key_type = utf8(); + auto value_type = null(); + std::vector table_json = {R"([ + [1, "foo", null], + [2, "bar", null] + ])", + R"([ + [2, "foo", null], + [3, "bar", null], + [1, "foo", null] + ])"}; + std::string 
expected_json = R"([ + [1, {"foo": null, "bar": null} ], + [2, {"foo": null, "bar": null} ], + [3, {"foo": null, "bar": null} ] + ])"; + PivotWiderOptions options(/*key_names=*/{"foo", "bar"}); + TestPivot(key_type, value_type, options, table_json, expected_json); +} + +TEST_P(GroupBy, PivotNullValues) { + auto key_type = utf8(); + auto value_type = float32(); + std::vector table_json = {R"([ + [1, "width", 10.5], + [2, "width", null] + ])", + R"([ + [2, "height", 12.5], + [2, "width", 13.5], + [1, "width", null], + [2, "height", null] + ])", + R"([ + [1, "width", null], + [2, "height", null] + ])"}; + std::string expected_json = R"([ + [1, {"height": null, "width": 10.5} ], + [2, {"height": 12.5, "width": 13.5} ] + ])"; + PivotWiderOptions options(/*key_names=*/{"height", "width"}, PivotWiderOptions::kRaise); + TestPivot(key_type, value_type, options, table_json, expected_json); +} + +TEST_P(GroupBy, PivotScalarKey) { + BatchesWithSchema input; + std::vector types = {int32(), utf8(), float32()}; + std::vector shapes = {ArgShape::ARRAY, ArgShape::SCALAR, ArgShape::ARRAY}; + input.batches = { + ExecBatchFromJSON(types, shapes, + R"([ + [1, "width", 10.5], + [2, "width", 11.5] + ])"), + ExecBatchFromJSON(types, shapes, + R"([ + [2, "width", null] + ])"), + ExecBatchFromJSON(types, shapes, + R"([ + [3, "height", null], + [3, "height", null] + ])"), + ExecBatchFromJSON(types, shapes, + R"([ + [3, "height", 12.5], + [1, "height", 13.5] + ])"), + }; + input.schema = schema({field("group_key", int32()), field("pivot_key", utf8()), + field("pivot_value", float32())}); + Datum expected = ArrayFromJSON( + struct_({field("group_key", int32()), + field("pivoted", + struct_({field("height", float32()), field("width", float32())}))}), + R"([ + [1, {"height": 13.5, "width": 10.5} ], + [2, {"height": null, "width": 11.5} ], + [3, {"height": 12.5, "width": null} ] + ])"); + auto options = std::make_shared( + PivotWiderOptions(/*key_names=*/{"height", "width"})); + Aggregate aggregate{"hash_pivot_wider", options, + std::vector{"pivot_key", "pivot_value"}, "pivoted"}; + for (bool use_threads : {false, true}) { + SCOPED_TRACE(use_threads ? 
"parallel/merged" : "serial"); + ASSERT_OK_AND_ASSIGN(Datum actual, + RunGroupBy(input, {"group_key"}, {aggregate}, use_threads)); + ValidateOutput(actual); + AssertDatumsApproxEqual(expected, actual, /*verbose=*/true); + } +} + +TEST_P(GroupBy, PivotUnusedKeyName) { + auto key_type = utf8(); + auto value_type = float32(); + std::vector table_json = {R"([ + [1, "width", 10.5], + [2, "width", 11.5] + ])", + R"([ + [2, "height", 12.5], + [3, "width", 13.5], + [1, "height", 14.5] + ])"}; + std::string expected_json = R"([ + [1, {"height": 14.5, "depth": null, "width": 10.5} ], + [2, {"height": 12.5, "depth": null, "width": 11.5} ], + [3, {"height": null, "depth": null, "width": 13.5} ] + ])"; + for (auto unexpected_key_behavior : + {PivotWiderOptions::kIgnore, PivotWiderOptions::kRaise}) { + PivotWiderOptions options(/*key_names=*/{"height", "depth", "width"}, + unexpected_key_behavior); + TestPivot(key_type, value_type, options, table_json, expected_json); + } +} + +TEST_P(GroupBy, PivotUnexpectedKeyName) { + auto key_type = utf8(); + auto value_type = float32(); + std::vector table_json = {R"([ + [1, "width", 10.5], + [2, "width", 11.5] + ])", + R"([ + [2, "height", 12.5], + [3, "width", 13.5], + [1, "depth", 15.5], + [1, "height", 14.5] + ])"}; + PivotWiderOptions options(/*key_names=*/{"height", "width"}); + std::string expected_json = R"([ + [1, {"height": 14.5, "width": 10.5} ], + [2, {"height": 12.5, "width": 11.5} ], + [3, {"height": null, "width": 13.5} ] + ])"; + TestPivot(key_type, value_type, options, table_json, expected_json); + options.unexpected_key_behavior = PivotWiderOptions::kRaise; + for (bool use_threads : {false, true}) { + ARROW_SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); + EXPECT_RAISES_WITH_MESSAGE_THAT( + KeyError, HasSubstr("Unexpected pivot key: depth"), + RunPivot(key_type, value_type, options, table_json, use_threads)); + } +} +TEST_P(GroupBy, PivotNullKeys) { + auto key_type = utf8(); + auto value_type = float32(); + std::vector table_json = {R"([ + [1, "width", 10.5], + [2, null, 11.5] + ])"}; + PivotWiderOptions options(/*key_names=*/{"height", "width"}); + for (bool use_threads : {false, true}) { + ARROW_SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); + EXPECT_RAISES_WITH_MESSAGE_THAT( + KeyError, HasSubstr("pivot key name cannot be null"), + RunPivot(key_type, value_type, options, table_json, use_threads)); + } +} + +TEST_P(GroupBy, PivotDuplicateKeys) { + auto key_type = utf8(); + auto value_type = float32(); + std::vector table_json = {R"([])"}; + PivotWiderOptions options(/*key_names=*/{"height", "width", "height"}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + KeyError, HasSubstr("Duplicate key name 'height' in PivotWiderOptions"), + RunPivot(key_type, value_type, options, table_json)); +} + +TEST_P(GroupBy, PivotDuplicateValues) { + auto key_type = utf8(); + auto value_type = float32(); + PivotWiderOptions options(/*key_names=*/{"height", "width"}); + + for (bool use_threads : {false, true}) { + ARROW_SCOPED_TRACE(use_threads ? 
"parallel/merged" : "serial"); + + // Duplicate values in same chunk + std::vector table_json = {R"([ + [1, "width", 10.5], + [2, "width", 11.5], + [1, "width", 11.5] + ])"}; + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + HasSubstr("Encountered more than one non-null value"), + RunPivot(key_type, value_type, options, table_json)); + + // Duplicate values in different chunks + table_json = {R"([ + [1, "width", 10.5], + [2, "width", 11.5] + ])", + R"([ + [1, "width", 11.5] + ])"}; + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + HasSubstr("Encountered more than one non-null value"), + RunPivot(key_type, value_type, options, table_json)); + } +} + +TEST_P(GroupBy, PivotScalarKeyWithDuplicateValues) { + BatchesWithSchema input; + std::vector types = {int32(), utf8(), float32()}; + std::vector shapes = {ArgShape::ARRAY, ArgShape::SCALAR, ArgShape::ARRAY}; + input.schema = schema({field("group_key", int32()), field("pivot_key", utf8()), + field("pivot_value", float32())}); + auto options = std::make_shared( + PivotWiderOptions(/*key_names=*/{"height", "width"})); + Aggregate aggregate{"hash_pivot_wider", options, + std::vector{"pivot_key", "pivot_value"}, "pivoted"}; + + // Duplicate values in same chunk + input.batches = { + ExecBatchFromJSON(types, shapes, + R"([ + [1, "width", 10.5], + [1, "width", 11.5] + ])"), + }; + for (bool use_threads : {false, true}) { + SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, HasSubstr("Encountered more than one non-null value"), + RunGroupBy(input, {"group_key"}, {aggregate}, use_threads)); + } + + // Duplicate values in different chunks + input.batches = { + ExecBatchFromJSON(types, shapes, + R"([ + [1, "width", 10.5], + [2, "width", 11.5] + ])"), + ExecBatchFromJSON(types, shapes, + R"([ + [2, "width", 12.5] + ])"), + }; + for (bool use_threads : {false, true}) { + SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, HasSubstr("Encountered more than one non-null value"), + RunGroupBy(input, {"group_key"}, {aggregate}, use_threads)); + } +} + +struct RandomPivotTestCase { + PivotWiderOptions options; + std::shared_ptr input; + std::shared_ptr expected_output; +}; + +Result MakeRandomPivot(int64_t length) { + constexpr double kKeyPresenceProbability = 0.8; + constexpr double kValueValidityProbability = 0.7; + + const std::vector key_names = {"height", "width", "depth"}; + std::default_random_engine gen(42); + std::uniform_real_distribution value_dist(0.0f, 1.0f); + std::bernoulli_distribution key_presence_dist(kKeyPresenceProbability); + std::bernoulli_distribution value_validity_dist(kValueValidityProbability); + + Int64Builder group_key_builder; + StringBuilder key_builder; + FloatBuilder value_builder; + RETURN_NOT_OK(group_key_builder.Reserve(length)); + RETURN_NOT_OK(key_builder.Reserve(length)); + RETURN_NOT_OK(value_builder.Reserve(length)); + + // The last input key name will not be part of the result + PivotWiderOptions options( + std::vector(key_names.begin(), key_names.end() - 1)); + Int64Builder pivoted_group_builder; + std::vector pivoted_value_builders(options.key_names.size()); + + auto finish_group = [&](int64_t group_key) -> Status { + // First check if *any* pivoted column was populated (otherwise there was + // no valid value at all in this group, and no output row should be generated). 
+ RETURN_NOT_OK(pivoted_group_builder.Append(group_key)); + // Make sure all pivoted columns are populated and in sync with the group key column + for (auto& pivoted_value_builder : pivoted_value_builders) { + if (pivoted_value_builder.length() < pivoted_group_builder.length()) { + RETURN_NOT_OK(pivoted_value_builder.AppendNull()); + } + EXPECT_EQ(pivoted_value_builder.length(), pivoted_group_builder.length()); + } + return Status::OK(); + }; + + int64_t group_key = 1000; + bool group_started = false; + int key_id = 0; + while (group_key_builder.length() < length) { + // For the current group_key and key_id we can either: + // 1. not add a row + // 2. add a row with a null value + // 3. add a row with a non-null value + // 3a. the row will end up in the pivoted data iff the key is part of + // the PivotWiderOptions.key_names + if (key_presence_dist(gen)) { + group_key_builder.UnsafeAppend(group_key); + group_started = true; + RETURN_NOT_OK(key_builder.Append(key_names[key_id])); + if (value_validity_dist(gen)) { + const auto value = value_dist(gen); + value_builder.UnsafeAppend(value); + if (key_id < static_cast(pivoted_value_builders.size())) { + RETURN_NOT_OK(pivoted_value_builders[key_id].Append(value)); + } + } else { + value_builder.UnsafeAppendNull(); + } + } + if (++key_id >= static_cast(key_names.size())) { + // We've considered all keys for this group. + // Emit a pivoted row only if any key was emitted in the input. + if (group_started) { + RETURN_NOT_OK(finish_group(group_key)); + } + // Initiate new group + ++group_key; + group_started = false; + key_id = 0; + } + } + if (group_started) { + // We've started this group, finish it + RETURN_NOT_OK(finish_group(group_key)); + } + ARROW_ASSIGN_OR_RAISE(auto group_keys, group_key_builder.Finish()); + ARROW_ASSIGN_OR_RAISE(auto keys, key_builder.Finish()); + ARROW_ASSIGN_OR_RAISE(auto values, value_builder.Finish()); + auto input_schema = + schema({{"group_key", int64()}, {"key", utf8()}, {"value", float32()}}); + auto input = RecordBatch::Make(input_schema, length, {group_keys, keys, values}); + RETURN_NOT_OK(input->Validate()); + + ARROW_ASSIGN_OR_RAISE(auto pivoted_groups, pivoted_group_builder.Finish()); + ArrayVector pivoted_value_columns; + for (auto& pivoted_value_builder : pivoted_value_builders) { + ARROW_ASSIGN_OR_RAISE(pivoted_value_columns.emplace_back(), + pivoted_value_builder.Finish()); + } + ARROW_ASSIGN_OR_RAISE( + auto pivoted_values, + StructArray::Make(std::move(pivoted_value_columns), options.key_names)); + ARROW_ASSIGN_OR_RAISE(auto output, + StructArray::Make({pivoted_groups, pivoted_values}, + std::vector{"key_0", "out"})); + RETURN_NOT_OK(output->Validate()); + + return RandomPivotTestCase{std::move(options), std::move(input), std::move(output)}; +} + +TEST_P(GroupBy, PivotRandom) { + constexpr int64_t kLength = 900; + // Larger than 256 to exercise take-index dispatch in pivot implementation + constexpr int64_t kChunkLength = 300; + ASSERT_OK_AND_ASSIGN(auto pivot_case, MakeRandomPivot(kLength)); + + for (bool shuffle : {false, true}) { + ARROW_SCOPED_TRACE("shuffle = ", shuffle); + auto input = Datum(pivot_case.input); + if (shuffle) { + // Since the "value" column is random-generated, sorting on it produces + // a random shuffle. 
+ ASSERT_OK_AND_ASSIGN( + auto shuffle_indices, + SortIndices(pivot_case.input, SortOptions({SortKey("value")}))); + ASSERT_OK_AND_ASSIGN(input, Take(input, shuffle_indices)); + } + ASSERT_EQ(input.kind(), Datum::RECORD_BATCH); + RecordBatchVector chunks; + for (int64_t start = 0; start < kLength; start += kChunkLength) { + const auto chunk_length = std::min(kLength - start, kChunkLength); + chunks.push_back(input.record_batch()->Slice(start, chunk_length)); + } + ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatches(chunks)); + + for (bool use_threads : {false, true}) { + ARROW_SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); + ASSERT_OK_AND_ASSIGN(auto pivoted, RunPivot(utf8(), float32(), pivot_case.options, + table, use_threads)); + // XXX For some reason this works even in the shuffled case + // (I would expect the test to require sorting of the output). + // This might depend on implementation details of group id generation + // by the hash-aggregate logic (the pivot implementation implicitly + // orders the output by ascending group id). + AssertDatumsEqual(pivot_case.expected_output, pivoted, /*verbose=*/true); + } + } +} + INSTANTIATE_TEST_SUITE_P(GroupBy, GroupBy, ::testing::Values(RunGroupByImpl)); class SegmentedScalarGroupBy : public GroupBy {}; @@ -4620,6 +5241,101 @@ TEST_P(SegmentedKeyGroupBy, MultiSegmentKeyCombined) { TestMultiSegmentKey(GetParam(), GetMultiSegmentInputAsCombined); } +TEST_P(SegmentedKeyGroupBy, PivotSegmentKey) { + auto group_by = GetParam(); + auto key_type = utf8(); + auto value_type = float32(); + + std::vector table_json = {R"([ + [1, "width", 10.5], + [1, "height", 11.5] + ])", + R"([ + [2, "height", 12.5], + [2, "width", 13.5], + [3, "width", 14.5] + ])", + R"([ + [3, "width", null], + [4, "height", 15.5] + ])"}; + std::vector expected_json = { + R"([[1, {"height": 11.5, "width": 10.5}]])", + R"([[2, {"height": 12.5, "width": 13.5}]])", + R"([[3, {"height": null, "width": 14.5}]])", + R"([[4, {"height": 15.5, "width": null}]])", + }; + + auto table = + TableFromJSON(schema({field("segment_key", int64()), field("pivot_key", key_type), + field("pivot_value", value_type)}), + table_json); + + auto options = std::make_shared( + PivotWiderOptions(/*key_names=*/{"height", "width"})); + Aggregate aggregate{"pivot_wider", options, std::vector{"agg_0", "agg_1"}, + "pivoted"}; + ASSERT_OK_AND_ASSIGN(Datum actual, + group_by( + { + table->GetColumnByName("pivot_key"), + table->GetColumnByName("pivot_value"), + }, + {}, {table->GetColumnByName("segment_key")}, {aggregate}, + /*use_threads=*/false, /*naive=*/false)); + ValidateOutput(actual); + auto expected = ChunkedArrayFromJSON( + struct_({field("key_0", int64()), + field("pivoted", struct_({field("height", value_type), + field("width", value_type)}))}), + expected_json); + AssertDatumsEqual(expected, actual, /*verbose=*/true); +} + +TEST_P(SegmentedKeyGroupBy, PivotSegmentKeyDuplicateValues) { + // NOTE: besides testing "pivot_wider" behavior, this test also checks that errors + // produced when consuming or merging an aggregate don't corrupt + // execution engine internals. 
+ auto group_by = GetParam(); + auto key_type = utf8(); + auto value_type = float32(); + auto options = std::make_shared( + PivotWiderOptions(/*key_names=*/{"height", "width"})); + auto table_schema = schema({field("segment_key", int64()), field("pivot_key", key_type), + field("pivot_value", value_type)}); + + auto test_duplicate_values = [&](const std::vector& table_json) { + auto table = TableFromJSON(table_schema, table_json); + Aggregate aggregate{"pivot_wider", options, std::vector{"agg_0", "agg_1"}, + "pivoted"}; + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + HasSubstr("Encountered more than one non-null value for the same pivot key"), + group_by( + { + table->GetColumnByName("pivot_key"), + table->GetColumnByName("pivot_value"), + }, + {}, {table->GetColumnByName("segment_key")}, {aggregate}, + /*use_threads=*/false, /*naive=*/false)); + }; + + // Duplicate values in the same chunk + test_duplicate_values({R"([ + [1, "width", 10.5], + [2, "width", 11.5], + [2, "width", 12.5] + ])"}); + // Duplicate values in two different chunks + test_duplicate_values({R"([ + [1, "width", 10.5], + [2, "width", 11.5] + ])", + R"([ + [2, "width", 12.5] + ])"}); +} + INSTANTIATE_TEST_SUITE_P(SegmentedScalarGroupBy, SegmentedScalarGroupBy, ::testing::Values(RunSegmentedGroupByImpl)); diff --git a/cpp/src/arrow/acero/hash_join.h b/cpp/src/arrow/acero/hash_join.h index a81ff274e5e3a..c0faacf04baf0 100644 --- a/cpp/src/arrow/acero/hash_join.h +++ b/cpp/src/arrow/acero/hash_join.h @@ -37,7 +37,7 @@ namespace acero { using util::AccumulationQueue; -class HashJoinImpl { +class ARROW_ACERO_EXPORT HashJoinImpl { public: using OutputBatchCallback = std::function; using BuildFinishedCallback = std::function; diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc index 0a56194f2a3c8..c01e8a58933f6 100644 --- a/cpp/src/arrow/acero/hash_join_benchmark.cc +++ b/cpp/src/arrow/acero/hash_join_benchmark.cc @@ -32,8 +32,6 @@ #include #include -#include - namespace arrow { namespace acero { struct BenchmarkSettings { @@ -56,6 +54,8 @@ struct BenchmarkSettings { int var_length_max = 20; // Maximum length of any var length types Expression residual_filter = literal(true); + + bool stats_probe_rows = true; }; class JoinBenchmark { @@ -128,6 +128,7 @@ class JoinBenchmark { for (ExecBatch& batch : r_batches_with_schema.batches) r_batches_.InsertBatch(std::move(batch)); + stats_.num_build_rows = settings.num_build_batches * settings.batch_size; stats_.num_probe_rows = settings.num_probe_batches * settings.batch_size; schema_mgr_ = std::make_unique(); @@ -141,14 +142,9 @@ class JoinBenchmark { join_ = *HashJoinImpl::MakeSwiss(); } - omp_set_num_threads(settings.num_threads); - auto schedule_callback = [](std::function func) -> Status { -#pragma omp task - { DCHECK_OK(func(omp_get_thread_num())); } - return Status::OK(); - }; - scheduler_ = TaskScheduler::Make(); + thread_pool_ = arrow::internal::GetCpuThreadPool(); + DCHECK_OK(thread_pool_->SetCapacity(settings.num_threads)); DCHECK_OK(ctx_.Init(nullptr)); auto register_task_group_callback = [&](std::function task, @@ -157,7 +153,7 @@ class JoinBenchmark { }; auto start_task_group_callback = [&](int task_group_id, int64_t num_tasks) { - return scheduler_->StartTaskGroup(omp_get_thread_num(), task_group_id, num_tasks); + return scheduler_->StartTaskGroup(/*thread_id=*/0, task_group_id, num_tasks); }; DCHECK_OK(join_->Init( @@ -165,7 +161,7 @@ class JoinBenchmark { &(schema_mgr_->proj_maps[1]), std::move(key_cmp), 
settings.residual_filter, std::move(register_task_group_callback), std::move(start_task_group_callback), [](int64_t, ExecBatch) { return Status::OK(); }, - [](int64_t) { return Status::OK(); })); + [&](int64_t) { return Status::OK(); })); task_group_probe_ = scheduler_->RegisterTaskGroup( [this](size_t thread_index, int64_t task_id) -> Status { @@ -178,25 +174,27 @@ class JoinBenchmark { scheduler_->RegisterEnd(); DCHECK_OK(scheduler_->StartScheduling( - 0 /*thread index*/, std::move(schedule_callback), - static_cast(2 * settings.num_threads) /*concurrent tasks*/, - settings.num_threads == 1)); + /*thread_id=*/0, + [&](std::function task) -> Status { + return thread_pool_->Spawn([&, task]() { DCHECK_OK(task(thread_indexer_())); }); + }, + thread_pool_->GetCapacity(), settings.num_threads == 1)); } void RunJoin() { -#pragma omp parallel - { - int tid = omp_get_thread_num(); -#pragma omp single - DCHECK_OK( - join_->BuildHashTable(tid, std::move(r_batches_), [this](size_t thread_index) { - return scheduler_->StartTaskGroup(thread_index, task_group_probe_, - l_batches_.batch_count()); - })); - } + DCHECK_OK(join_->BuildHashTable( + /*thread_id=*/0, std::move(r_batches_), [this](size_t thread_index) { + return scheduler_->StartTaskGroup(thread_index, task_group_probe_, + l_batches_.batch_count()); + })); + + thread_pool_->WaitForIdle(); } std::unique_ptr scheduler_; + ThreadIndexer thread_indexer_; + arrow::internal::ThreadPool* thread_pool_; + AccumulationQueue l_batches_; AccumulationQueue r_batches_; std::unique_ptr schema_mgr_; @@ -205,6 +203,7 @@ class JoinBenchmark { int task_group_probe_; struct { + uint64_t num_build_rows; uint64_t num_probe_rows; } stats_; }; @@ -219,11 +218,13 @@ static void HashJoinBasicBenchmarkImpl(benchmark::State& st, st.ResumeTiming(); bm.RunJoin(); st.PauseTiming(); - total_rows += bm.stats_.num_probe_rows; + total_rows += (settings.stats_probe_rows ? 
bm.stats_.num_probe_rows + : bm.stats_.num_build_rows); } st.ResumeTiming(); } - st.counters["rows/sec"] = benchmark::Counter(total_rows, benchmark::Counter::kIsRate); + st.counters["rows/sec"] = + benchmark::Counter(static_cast(total_rows), benchmark::Counter::kIsRate); } template @@ -302,6 +303,7 @@ static void BM_HashJoinBasic_BuildParallelism(benchmark::State& st) { settings.num_threads = static_cast(st.range(0)); settings.num_build_batches = static_cast(st.range(1)); settings.num_probe_batches = settings.num_threads; + settings.stats_probe_rows = false; HashJoinBasicBenchmarkImpl(st, settings); } diff --git a/cpp/src/arrow/acero/options.h b/cpp/src/arrow/acero/options.h index 2beacfe26baa1..26293725582b1 100644 --- a/cpp/src/arrow/acero/options.h +++ b/cpp/src/arrow/acero/options.h @@ -29,10 +29,8 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/exec.h" #include "arrow/compute/expression.h" -#include "arrow/record_batch.h" #include "arrow/result.h" -#include "arrow/util/async_generator.h" -#include "arrow/util/async_util.h" +#include "arrow/util/future.h" namespace arrow { @@ -55,7 +53,7 @@ namespace acero { /// \brief This must not be used in release-mode struct DebugOptions; -using AsyncExecBatchGenerator = AsyncGenerator>; +using AsyncExecBatchGenerator = std::function>()>; /// \addtogroup acero-nodes /// @{ @@ -103,8 +101,8 @@ class ARROW_ACERO_EXPORT SourceNodeOptions : public ExecNodeOptions { std::shared_ptr output_schema; /// \brief an asynchronous stream of batches ending with std::nullopt std::function>()> generator; - - Ordering ordering = Ordering::Unordered(); + /// \brief the order of the data, defaults to Ordering::Unordered + Ordering ordering; }; /// \brief a node that generates data from a table already loaded in memory diff --git a/cpp/src/arrow/acero/order_by_node_test.cc b/cpp/src/arrow/acero/order_by_node_test.cc index d77b0f3184f1a..37e6862ed0f52 100644 --- a/cpp/src/arrow/acero/order_by_node_test.cc +++ b/cpp/src/arrow/acero/order_by_node_test.cc @@ -42,8 +42,7 @@ static constexpr int kRowsPerBatch = 4; static constexpr int kNumBatches = 32; std::shared_ptr
TestTable() { - return gen::Gen({{"up", gen::Step()}, - {"down", gen::Step(/*start=*/0, /*step=*/-1, /*signed_int=*/true)}}) + return gen::Gen({{"up", gen::Step()}, {"down", gen::Step(/*start=*/0, /*step=*/-1)}}) ->FailOnError() ->Table(kRowsPerBatch, kNumBatches); } diff --git a/cpp/src/arrow/acero/sorted_merge_node.cc b/cpp/src/arrow/acero/sorted_merge_node.cc index c49aca17fb20a..125eb6e3ed0f9 100644 --- a/cpp/src/arrow/acero/sorted_merge_node.cc +++ b/cpp/src/arrow/acero/sorted_merge_node.cc @@ -586,7 +586,7 @@ class SortedMergeNode : public ExecNode { void EmitBatches() { while (true) { // Implementation note: If the queue is empty, we will block here - if (process_queue.Pop() == kPoisonPill) { + if (process_queue.WaitAndPop() == kPoisonPill) { EndFromProcessThread(); } // Either we're out of data or something went wrong diff --git a/cpp/src/arrow/acero/sorted_merge_node_test.cc b/cpp/src/arrow/acero/sorted_merge_node_test.cc index 55446d631d90c..82b630420c4ae 100644 --- a/cpp/src/arrow/acero/sorted_merge_node_test.cc +++ b/cpp/src/arrow/acero/sorted_merge_node_test.cc @@ -36,8 +36,7 @@ namespace arrow::acero { std::shared_ptr
TestTable(int start, int step, int rows_per_batch, int num_batches) { - return gen::Gen({{"timestamp", gen::Step(start, step, /*signed_int=*/true)}, - {"str", gen::Random(utf8())}}) + return gen::Gen({{"timestamp", gen::Step(start, step)}, {"str", gen::Random(utf8())}}) ->FailOnError() ->Table(rows_per_batch, num_batches); } diff --git a/cpp/src/arrow/acero/source_node.cc b/cpp/src/arrow/acero/source_node.cc index ac34e4b6a09fc..2d3e2a1da1735 100644 --- a/cpp/src/arrow/acero/source_node.cc +++ b/cpp/src/arrow/acero/source_node.cc @@ -407,7 +407,7 @@ struct SchemaSourceNode : public SourceNode { struct RecordBatchReaderSourceNode : public SourceNode { RecordBatchReaderSourceNode(ExecPlan* plan, std::shared_ptr schema, arrow::AsyncGenerator> generator) - : SourceNode(plan, schema, generator, Ordering::Implicit()) {} + : SourceNode(plan, schema, generator) {} static Result Make(ExecPlan* plan, std::vector inputs, const ExecNodeOptions& options) { diff --git a/cpp/src/arrow/acero/source_node_test.cc b/cpp/src/arrow/acero/source_node_test.cc index 132dc05e6fd64..79ff5852815c5 100644 --- a/cpp/src/arrow/acero/source_node_test.cc +++ b/cpp/src/arrow/acero/source_node_test.cc @@ -21,6 +21,7 @@ #include "arrow/acero/map_node.h" #include "arrow/acero/options.h" #include "arrow/acero/test_nodes.h" +#include "arrow/record_batch.h" namespace arrow { namespace acero { diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 85e14ac469ce7..b4d89df290214 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -643,37 +643,38 @@ void SwissTableMerge::MergePartition(SwissTable* target, const SwissTable* sourc // int source_group_id_bits = SwissTable::num_groupid_bits_from_log_blocks(source->log_blocks()); - uint64_t source_group_id_mask = ~0ULL >> (64 - source_group_id_bits); - int64_t source_block_bytes = source_group_id_bits + 8; + int source_block_bytes = + SwissTable::num_block_bytes_from_num_groupid_bits(source_group_id_bits); + uint32_t source_group_id_mask = + SwissTable::group_id_mask_from_num_groupid_bits(source_group_id_bits); ARROW_DCHECK(source_block_bytes % sizeof(uint64_t) == 0); // Compute index of the last block in target that corresponds to the given // partition. // ARROW_DCHECK(num_partition_bits <= target->log_blocks()); - int64_t target_max_block_id = + uint32_t target_max_block_id = ((partition_id + 1) << (target->log_blocks() - num_partition_bits)) - 1; overflow_group_ids->clear(); overflow_hashes->clear(); // For each source block... - int64_t source_blocks = 1LL << source->log_blocks(); - for (int64_t block_id = 0; block_id < source_blocks; ++block_id) { - uint8_t* block_bytes = source->blocks() + block_id * source_block_bytes; + uint32_t source_blocks = 1 << source->log_blocks(); + for (uint32_t block_id = 0; block_id < source_blocks; ++block_id) { + const uint8_t* block_bytes = source->block_data(block_id, source_block_bytes); uint64_t block = *reinterpret_cast(block_bytes); // For each non-empty source slot... constexpr uint64_t kHighBitOfEachByte = 0x8080808080808080ULL; - constexpr int kSlotsPerBlock = 8; - int num_full_slots = - kSlotsPerBlock - static_cast(ARROW_POPCOUNT64(block & kHighBitOfEachByte)); + int num_full_slots = SwissTable::kSlotsPerBlock - + static_cast(ARROW_POPCOUNT64(block & kHighBitOfEachByte)); for (int local_slot_id = 0; local_slot_id < num_full_slots; ++local_slot_id) { // Read group id and hash for this slot. 
// - uint64_t group_id = - source->extract_group_id(block_bytes, local_slot_id, source_group_id_mask); - int64_t global_slot_id = block_id * kSlotsPerBlock + local_slot_id; + uint32_t group_id = SwissTable::extract_group_id( + block_bytes, local_slot_id, source_group_id_bits, source_group_id_mask); + uint32_t global_slot_id = SwissTable::global_slot_id(block_id, local_slot_id); uint32_t hash = source->hashes()[global_slot_id]; // Insert partition id into the highest bits of hash, shifting the // remaining hash bits right. @@ -696,17 +697,18 @@ void SwissTableMerge::MergePartition(SwissTable* target, const SwissTable* sourc } } -inline bool SwissTableMerge::InsertNewGroup(SwissTable* target, uint64_t group_id, - uint32_t hash, int64_t max_block_id) { +inline bool SwissTableMerge::InsertNewGroup(SwissTable* target, uint32_t group_id, + uint32_t hash, uint32_t max_block_id) { // Load the first block to visit for this hash // - int64_t block_id = hash >> (SwissTable::bits_hash_ - target->log_blocks()); - int64_t block_id_mask = ((1LL << target->log_blocks()) - 1); + uint32_t block_id = SwissTable::block_id_from_hash(hash, target->log_blocks()); + uint32_t block_id_mask = (1 << target->log_blocks()) - 1; int num_group_id_bits = SwissTable::num_groupid_bits_from_log_blocks(target->log_blocks()); - int64_t num_block_bytes = num_group_id_bits + sizeof(uint64_t); + int num_block_bytes = + SwissTable::num_block_bytes_from_num_groupid_bits(num_group_id_bits); ARROW_DCHECK(num_block_bytes % sizeof(uint64_t) == 0); - uint8_t* block_bytes = target->blocks() + block_id * num_block_bytes; + const uint8_t* block_bytes = target->block_data(block_id, num_block_bytes); uint64_t block = *reinterpret_cast(block_bytes); // Search for the first block with empty slots. @@ -715,25 +717,23 @@ inline bool SwissTableMerge::InsertNewGroup(SwissTable* target, uint64_t group_i constexpr uint64_t kHighBitOfEachByte = 0x8080808080808080ULL; while ((block & kHighBitOfEachByte) == 0 && block_id < max_block_id) { block_id = (block_id + 1) & block_id_mask; - block_bytes = target->blocks() + block_id * num_block_bytes; + block_bytes = target->block_data(block_id, num_block_bytes); block = *reinterpret_cast(block_bytes); } if ((block & kHighBitOfEachByte) == 0) { return false; } - constexpr int kSlotsPerBlock = 8; - int local_slot_id = - kSlotsPerBlock - static_cast(ARROW_POPCOUNT64(block & kHighBitOfEachByte)); - int64_t global_slot_id = block_id * kSlotsPerBlock + local_slot_id; - target->insert_into_empty_slot(static_cast(global_slot_id), hash, - static_cast(group_id)); + int local_slot_id = SwissTable::kSlotsPerBlock - + static_cast(ARROW_POPCOUNT64(block & kHighBitOfEachByte)); + uint32_t global_slot_id = SwissTable::global_slot_id(block_id, local_slot_id); + target->insert_into_empty_slot(global_slot_id, hash, group_id); return true; } void SwissTableMerge::InsertNewGroups(SwissTable* target, const std::vector& group_ids, const std::vector& hashes) { - int64_t num_blocks = 1LL << target->log_blocks(); + uint32_t num_blocks = 1 << target->log_blocks(); for (size_t i = 0; i < group_ids.size(); ++i) { std::ignore = InsertNewGroup(target, group_ids[i], hashes[i], num_blocks); } @@ -1102,7 +1102,8 @@ uint32_t SwissTableForJoin::payload_id_to_key_id(uint32_t payload_id) const { } Status SwissTableForJoinBuild::Init(SwissTableForJoin* target, int dop, int64_t num_rows, - bool reject_duplicate_keys, bool no_payload, + int64_t num_batches, bool reject_duplicate_keys, + bool no_payload, const std::vector& key_types, const 
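The MergePartition and InsertNewGroup hunks replace inline bit twiddling with named SwissTable helpers (`num_block_bytes_from_num_groupid_bits`, `group_id_mask_from_num_groupid_bits`, `block_id_from_hash`, `global_slot_id`). A standalone sketch of the arithmetic those names stand for, reconstructed from the expressions removed above; the helper functions here are local to the example, not the actual SwissTable API:

```cpp
#include <cstdint>

// Each Swiss table block holds 8 slots: 8 status bytes followed by
// 8 packed group ids of `num_groupid_bits` bits each.
constexpr int kSlotsPerBlock = 8;

int num_block_bytes(int num_groupid_bits) {
  // Matches the removed expression: 8 status bytes plus 8 * bits / 8 group-id bytes.
  return num_groupid_bits + 8;
}

uint32_t group_id_mask(int num_groupid_bits) {
  return num_groupid_bits == 32 ? ~uint32_t{0} : (uint32_t{1} << num_groupid_bits) - 1;
}

// The table is addressed by the highest bits of the 32-bit hash.
uint32_t block_id_from_hash(uint32_t hash, int log_blocks) {
  constexpr int kBitsHash = 32;
  if (log_blocks == 0) return 0;
  return hash >> (kBitsHash - log_blocks);
}

uint32_t global_slot_id(uint32_t block_id, int local_slot_id) {
  return block_id * kSlotsPerBlock + local_slot_id;
}
```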
std::vector& payload_types, MemoryPool* pool, int64_t hardware_flags) { @@ -1112,7 +1113,7 @@ Status SwissTableForJoinBuild::Init(SwissTableForJoin* target, int dop, int64_t // Make sure that we do not use many partitions if there are not enough rows. // - constexpr int64_t min_num_rows_per_prtn = 1 << 18; + constexpr int64_t min_num_rows_per_prtn = 1 << 12; log_num_prtns_ = std::min(bit_util::Log2(dop_), bit_util::Log2(bit_util::CeilDiv(num_rows, min_num_rows_per_prtn))); @@ -1123,9 +1124,9 @@ Status SwissTableForJoinBuild::Init(SwissTableForJoin* target, int dop, int64_t pool_ = pool; hardware_flags_ = hardware_flags; + batch_states_.resize(num_batches); prtn_states_.resize(num_prtns_); thread_states_.resize(dop_); - prtn_locks_.Init(dop_, num_prtns_); RowTableMetadata key_row_metadata; key_row_metadata.FromColumnMetadataVector(key_types, @@ -1154,91 +1155,74 @@ Status SwissTableForJoinBuild::Init(SwissTableForJoin* target, int dop, int64_t return Status::OK(); } -Status SwissTableForJoinBuild::PushNextBatch(int64_t thread_id, - const ExecBatch& key_batch, - const ExecBatch* payload_batch_maybe_null, - arrow::util::TempVectorStack* temp_stack) { - ARROW_DCHECK(thread_id < dop_); +Status SwissTableForJoinBuild::PartitionBatch(size_t thread_id, int64_t batch_id, + const ExecBatch& key_batch, + arrow::util::TempVectorStack* temp_stack) { + DCHECK_LT(thread_id, thread_states_.size()); + DCHECK_LT(batch_id, static_cast(batch_states_.size())); ThreadState& locals = thread_states_[thread_id]; + BatchState& batch_state = batch_states_[batch_id]; + uint16_t num_rows = static_cast(key_batch.length); // Compute hash // - locals.batch_hashes.resize(key_batch.length); - RETURN_NOT_OK(Hashing32::HashBatch( - key_batch, locals.batch_hashes.data(), locals.temp_column_arrays, hardware_flags_, - temp_stack, /*start_row=*/0, static_cast(key_batch.length))); + batch_state.hashes.resize(num_rows); + RETURN_NOT_OK(Hashing32::HashBatch(key_batch, batch_state.hashes.data(), + locals.temp_column_arrays, hardware_flags_, + temp_stack, /*start_row=*/0, num_rows)); // Partition on hash // - locals.batch_prtn_row_ids.resize(locals.batch_hashes.size()); - locals.batch_prtn_ranges.resize(num_prtns_ + 1); - int num_rows = static_cast(locals.batch_hashes.size()); + batch_state.prtn_ranges.resize(num_prtns_ + 1); + batch_state.prtn_row_ids.resize(num_rows); if (num_prtns_ == 1) { // We treat single partition case separately to avoid extra checks in row // partitioning implementation for general case. // - locals.batch_prtn_ranges[0] = 0; - locals.batch_prtn_ranges[1] = num_rows; - for (int i = 0; i < num_rows; ++i) { - locals.batch_prtn_row_ids[i] = i; + batch_state.prtn_ranges[0] = 0; + batch_state.prtn_ranges[1] = num_rows; + for (uint16_t i = 0; i < num_rows; ++i) { + batch_state.prtn_row_ids[i] = i; } } else { PartitionSort::Eval( - static_cast(locals.batch_hashes.size()), num_prtns_, - locals.batch_prtn_ranges.data(), - [this, &locals](int64_t i) { + num_rows, num_prtns_, batch_state.prtn_ranges.data(), + [this, &batch_state](int64_t i) { // SwissTable uses the highest bits of the hash for block index. // We want each partition to correspond to a range of block indices, // so we also partition on the highest bits of the hash. 
// - return locals.batch_hashes[i] >> (31 - log_num_prtns_) >> 1; + return batch_state.hashes[i] >> (SwissTable::bits_hash_ - log_num_prtns_); }, - [&locals](int64_t i, int pos) { - locals.batch_prtn_row_ids[pos] = static_cast(i); + [&batch_state](int64_t i, int pos) { + batch_state.prtn_row_ids[pos] = static_cast(i); }); - } - // Update hashes, shifting left to get rid of the bits that were already used - // for partitioning. - // - for (size_t i = 0; i < locals.batch_hashes.size(); ++i) { - locals.batch_hashes[i] <<= log_num_prtns_; + // Update hashes, shifting left to get rid of the bits that were already used + // for partitioning. + // + for (size_t i = 0; i < batch_state.hashes.size(); ++i) { + batch_state.hashes[i] <<= log_num_prtns_; + } } - // For each partition: - // - map keys to unique integers using (this partition's) hash table - // - append payloads (if present) to (this partition's) row array - // - locals.temp_prtn_ids.resize(num_prtns_); - - RETURN_NOT_OK(prtn_locks_.ForEachPartition( - thread_id, locals.temp_prtn_ids.data(), - /*is_prtn_empty_fn=*/ - [&](int prtn_id) { - return locals.batch_prtn_ranges[prtn_id + 1] == locals.batch_prtn_ranges[prtn_id]; - }, - /*process_prtn_fn=*/ - [&](int prtn_id) { - return ProcessPartition(thread_id, key_batch, payload_batch_maybe_null, - temp_stack, prtn_id); - })); - return Status::OK(); } -Status SwissTableForJoinBuild::ProcessPartition(int64_t thread_id, - const ExecBatch& key_batch, - const ExecBatch* payload_batch_maybe_null, - arrow::util::TempVectorStack* temp_stack, - int prtn_id) { - ARROW_DCHECK(thread_id < dop_); +Status SwissTableForJoinBuild::ProcessPartition( + size_t thread_id, int64_t batch_id, int prtn_id, const ExecBatch& key_batch, + const ExecBatch* payload_batch_maybe_null, arrow::util::TempVectorStack* temp_stack) { + DCHECK_LT(thread_id, thread_states_.size()); + DCHECK_LT(batch_id, static_cast(batch_states_.size())); + DCHECK_LT(static_cast(prtn_id), prtn_states_.size()); ThreadState& locals = thread_states_[thread_id]; + BatchState& batch_state = batch_states_[batch_id]; + PartitionState& prtn_state = prtn_states_[prtn_id]; int num_rows_new = - locals.batch_prtn_ranges[prtn_id + 1] - locals.batch_prtn_ranges[prtn_id]; + batch_state.prtn_ranges[prtn_id + 1] - batch_state.prtn_ranges[prtn_id]; const uint16_t* row_ids = - locals.batch_prtn_row_ids.data() + locals.batch_prtn_ranges[prtn_id]; - PartitionState& prtn_state = prtn_states_[prtn_id]; + batch_state.prtn_row_ids.data() + batch_state.prtn_ranges[prtn_id]; size_t num_rows_before = prtn_state.key_ids.size(); // Insert new keys into hash table associated with the current partition // and map existing keys to integer ids. @@ -1247,7 +1231,7 @@ Status SwissTableForJoinBuild::ProcessPartition(int64_t thread_id, SwissTableWithKeys::Input input(&key_batch, num_rows_new, row_ids, temp_stack, &locals.temp_column_arrays, &locals.temp_group_ids); RETURN_NOT_OK(prtn_state.keys.MapWithInserts( - &input, locals.batch_hashes.data(), prtn_state.key_ids.data() + num_rows_before)); + &input, batch_state.hashes.data(), prtn_state.key_ids.data() + num_rows_before)); // Append input batch rows from current partition to an array of payload // rows for this partition. 
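In the new `PartitionBatch` above, each row's partition is taken from the highest `log_num_prtns_` bits of its 32-bit hash, so every partition corresponds to a contiguous range of Swiss table blocks, and those bits are then shifted out before the hash reaches the per-partition table. A self-contained sketch of that step (assumes `log_num_prtns >= 1`, matching the multi-partition branch above):

```cpp
#include <cstdint>
#include <vector>

// Illustration of the partitioning step: the top bits of each hash pick the
// partition, then the used bits are shifted away so the per-partition table
// never sees them.
void PartitionHashes(std::vector<uint32_t>* hashes, int log_num_prtns,
                     std::vector<int>* prtn_ids) {
  constexpr int kBitsHash = 32;
  prtn_ids->resize(hashes->size());
  for (size_t i = 0; i < hashes->size(); ++i) {
    (*prtn_ids)[i] = static_cast<int>((*hashes)[i] >> (kBitsHash - log_num_prtns));
    (*hashes)[i] <<= log_num_prtns;
  }
}
```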
// @@ -2504,6 +2488,13 @@ class SwissJoin : public HashJoinImpl { } void InitTaskGroups() { + task_group_partition_ = register_task_group_callback_( + [this](size_t thread_index, int64_t task_id) -> Status { + return PartitionTask(thread_index, task_id); + }, + [this](size_t thread_index) -> Status { + return PartitionFinished(thread_index); + }); task_group_build_ = register_task_group_callback_( [this](size_t thread_index, int64_t task_id) -> Status { return BuildTask(thread_index, task_id); @@ -2590,18 +2581,19 @@ class SwissJoin : public HashJoinImpl { ColumnMetadataFromDataType(schema->data_type(HashJoinProjection::PAYLOAD, i))); payload_types.push_back(metadata); } - RETURN_NOT_OK(CancelIfNotOK(hash_table_build_.Init( + hash_table_build_ = std::make_unique(); + RETURN_NOT_OK(CancelIfNotOK(hash_table_build_->Init( &hash_table_, num_threads_, build_side_batches_.row_count(), - reject_duplicate_keys, no_payload, key_types, payload_types, pool_, - hardware_flags_))); + build_side_batches_.batch_count(), reject_duplicate_keys, no_payload, key_types, + payload_types, pool_, hardware_flags_))); // Process all input batches // - return CancelIfNotOK( - start_task_group_callback_(task_group_build_, build_side_batches_.batch_count())); + return CancelIfNotOK(start_task_group_callback_(task_group_partition_, + build_side_batches_.batch_count())); } - Status BuildTask(size_t thread_id, int64_t batch_id) { + Status PartitionTask(size_t thread_id, int64_t batch_id) { if (IsCancelled()) { return Status::OK(); } @@ -2609,39 +2601,78 @@ class SwissJoin : public HashJoinImpl { DCHECK_GT(build_side_batches_[batch_id].length, 0); const HashJoinProjectionMaps* schema = schema_[1]; - bool no_payload = hash_table_build_.no_payload(); - ExecBatch input_batch; ARROW_ASSIGN_OR_RAISE( input_batch, KeyPayloadFromInput(/*side=*/1, &build_side_batches_[batch_id])); - // Split batch into key batch and optional payload batch - // - // Input batch is key-payload batch (key columns followed by payload - // columns). We split it into two separate batches. - // - // TODO: Change SwissTableForJoinBuild interface to use key-payload - // batch instead to avoid this operation, which involves increasing - // shared pointer ref counts. 
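`InitTaskGroups` now registers a separate partition task group whose finished-callback kicks off the build group (`PartitionFinished` starts `task_group_build_` with one task per partition). A toy illustration of that chaining pattern with plain `std::function` callbacks; the names and the serial execution are purely illustrative, not Acero's scheduler API:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>

using Task = std::function<void(int64_t task_id)>;
using Continuation = std::function<void()>;

struct TaskGroup {
  Task task;
  Continuation on_finished;
};

// Runs every task in a group, then fires its continuation (serially, for clarity).
void StartTaskGroup(const TaskGroup& group, int64_t num_tasks) {
  for (int64_t i = 0; i < num_tasks; ++i) group.task(i);
  group.on_finished();
}

int main() {
  const int64_t num_batches = 4, num_prtns = 2;
  TaskGroup build{
      [](int64_t prtn_id) { std::cout << "build partition " << prtn_id << "\n"; },
      [] { std::cout << "build finished\n"; }};
  // The partition group's continuation starts the build group, one task per
  // partition, mirroring PartitionFinished() -> start_task_group_callback_(...).
  TaskGroup partition{
      [](int64_t batch_id) { std::cout << "partition batch " << batch_id << "\n"; },
      [&] { StartTaskGroup(build, num_prtns); }};
  StartTaskGroup(partition, num_batches);
  return 0;
}
```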
- // ExecBatch key_batch({}, input_batch.length); key_batch.values.resize(schema->num_cols(HashJoinProjection::KEY)); for (size_t icol = 0; icol < key_batch.values.size(); ++icol) { key_batch.values[icol] = input_batch.values[icol]; } - ExecBatch payload_batch({}, input_batch.length); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; + DCHECK_NE(hash_table_build_, nullptr); + return hash_table_build_->PartitionBatch(static_cast(thread_id), batch_id, + key_batch, temp_stack); + } + + Status PartitionFinished(size_t thread_id) { + RETURN_NOT_OK(status()); + + DCHECK_NE(hash_table_build_, nullptr); + return CancelIfNotOK( + start_task_group_callback_(task_group_build_, hash_table_build_->num_prtns())); + } + + Status BuildTask(size_t thread_id, int64_t prtn_id) { + if (IsCancelled()) { + return Status::OK(); + } + + const HashJoinProjectionMaps* schema = schema_[1]; + DCHECK_NE(hash_table_build_, nullptr); + bool no_payload = hash_table_build_->no_payload(); + ExecBatch key_batch, payload_batch; + auto num_keys = schema->num_cols(HashJoinProjection::KEY); + auto num_payloads = schema->num_cols(HashJoinProjection::PAYLOAD); + key_batch.values.resize(num_keys); if (!no_payload) { - payload_batch.values.resize(schema->num_cols(HashJoinProjection::PAYLOAD)); - for (size_t icol = 0; icol < payload_batch.values.size(); ++icol) { - payload_batch.values[icol] = - input_batch.values[schema->num_cols(HashJoinProjection::KEY) + icol]; - } + payload_batch.values.resize(num_payloads); } arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; - RETURN_NOT_OK(CancelIfNotOK(hash_table_build_.PushNextBatch( - static_cast(thread_id), key_batch, no_payload ? nullptr : &payload_batch, - temp_stack))); + + for (int64_t batch_id = 0; + batch_id < static_cast(build_side_batches_.batch_count()); ++batch_id) { + ExecBatch input_batch; + ARROW_ASSIGN_OR_RAISE( + input_batch, KeyPayloadFromInput(/*side=*/1, &build_side_batches_[batch_id])); + + // Split batch into key batch and optional payload batch + // + // Input batch is key-payload batch (key columns followed by payload + // columns). We split it into two separate batches. + // + // TODO: Change SwissTableForJoinBuild interface to use key-payload + // batch instead to avoid this operation, which involves increasing + // shared pointer ref counts. + // + key_batch.length = input_batch.length; + for (size_t icol = 0; icol < key_batch.values.size(); ++icol) { + key_batch.values[icol] = input_batch.values[icol]; + } + + if (!no_payload) { + payload_batch.length = input_batch.length; + for (size_t icol = 0; icol < payload_batch.values.size(); ++icol) { + payload_batch.values[icol] = input_batch.values[num_keys + icol]; + } + } + + RETURN_NOT_OK(CancelIfNotOK(hash_table_build_->ProcessPartition( + thread_id, batch_id, static_cast(prtn_id), key_batch, + no_payload ? nullptr : &payload_batch, temp_stack))); + } return Status::OK(); } @@ -2654,23 +2685,26 @@ class SwissJoin : public HashJoinImpl { // On a single thread prepare for merging partitions of the resulting hash // table. 
// - RETURN_NOT_OK(CancelIfNotOK(hash_table_build_.PreparePrtnMerge())); + DCHECK_NE(hash_table_build_, nullptr); + RETURN_NOT_OK(CancelIfNotOK(hash_table_build_->PreparePrtnMerge())); return CancelIfNotOK( - start_task_group_callback_(task_group_merge_, hash_table_build_.num_prtns())); + start_task_group_callback_(task_group_merge_, hash_table_build_->num_prtns())); } Status MergeTask(size_t /*thread_id*/, int64_t prtn_id) { if (IsCancelled()) { return Status::OK(); } - hash_table_build_.PrtnMerge(static_cast(prtn_id)); + DCHECK_NE(hash_table_build_, nullptr); + hash_table_build_->PrtnMerge(static_cast(prtn_id)); return Status::OK(); } Status MergeFinished(size_t thread_id) { RETURN_NOT_OK(status()); arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; - hash_table_build_.FinishPrtnMerge(temp_stack); + DCHECK_NE(hash_table_build_, nullptr); + hash_table_build_->FinishPrtnMerge(temp_stack); return CancelIfNotOK(OnBuildHashTableFinished(static_cast(thread_id))); } @@ -2679,6 +2713,9 @@ class SwissJoin : public HashJoinImpl { return status(); } + DCHECK_NE(hash_table_build_, nullptr); + hash_table_build_.reset(); + for (int i = 0; i < num_threads_; ++i) { local_states_[i].materialize.SetBuildSide(hash_table_.keys()->keys(), hash_table_.payloads(), @@ -2888,6 +2925,7 @@ class SwissJoin : public HashJoinImpl { const HashJoinProjectionMaps* schema_[2]; // Task scheduling + int task_group_partition_; int task_group_build_; int task_group_merge_; int task_group_scan_; @@ -2910,7 +2948,8 @@ class SwissJoin : public HashJoinImpl { SwissTableForJoin hash_table_; JoinProbeProcessor probe_processor_; JoinResidualFilter residual_filter_; - SwissTableForJoinBuild hash_table_build_; + // Temporarily used during build phase, and released afterward. + std::unique_ptr hash_table_build_; AccumulationQueue build_side_batches_; // Atomic state flags. diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 85f443b0323c7..365f2917d8eff 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -175,7 +175,7 @@ class RowArrayAccessor { // Read operations (row comparison, column decoding) // can be called by multiple threads concurrently. // -struct RowArray { +struct ARROW_ACERO_EXPORT RowArray { RowArray() : is_initialized_(false), hardware_flags_(0) {} Status InitIfNeeded(MemoryPool* pool, int64_t hardware_flags, const ExecBatch& batch); @@ -380,8 +380,8 @@ class SwissTableMerge { // Max block id value greater or equal to the number of blocks guarantees that // the search will not be stopped. // - static inline bool InsertNewGroup(SwissTable* target, uint64_t group_id, uint32_t hash, - int64_t max_block_id); + static inline bool InsertNewGroup(SwissTable* target, uint32_t group_id, uint32_t hash, + uint32_t max_block_id); }; struct SwissTableWithKeys { @@ -523,19 +523,27 @@ class SwissTableForJoin { // class SwissTableForJoinBuild { public: - Status Init(SwissTableForJoin* target, int dop, int64_t num_rows, + Status Init(SwissTableForJoin* target, int dop, int64_t num_rows, int64_t num_batches, bool reject_duplicate_keys, bool no_payload, const std::vector& key_types, const std::vector& payload_types, MemoryPool* pool, int64_t hardware_flags); - // In the first phase of parallel hash table build, threads pick unprocessed - // exec batches, partition the rows based on hash, and update all of the - // partitions with information related to that batch of rows. 
+ // In the first phase of parallel hash table build, each thread picks unprocessed exec + // batches, hashes the batches and preserve the hashes, then partition the rows based on + // hashes. // - Status PushNextBatch(int64_t thread_id, const ExecBatch& key_batch, - const ExecBatch* payload_batch_maybe_null, - arrow::util::TempVectorStack* temp_stack); + Status PartitionBatch(size_t thread_id, int64_t batch_id, const ExecBatch& key_batch, + arrow::util::TempVectorStack* temp_stack); + + // In the second phase of parallel hash table build, each thread picks the given + // partition of all batches, and updates that particular partition with information + // related to that batch of rows. + // + Status ProcessPartition(size_t thread_id, int64_t batch_id, int prtn_id, + const ExecBatch& key_batch, + const ExecBatch* payload_batch_maybe_null, + arrow::util::TempVectorStack* temp_stack); // Allocate memory and initialize counters required for parallel merging of // hash table partitions. @@ -543,7 +551,7 @@ class SwissTableForJoinBuild { // Status PreparePrtnMerge(); - // Second phase of parallel hash table build. + // Third phase of parallel hash table build. // Each partition can be processed by a different thread. // Parallel step. // @@ -564,9 +572,6 @@ class SwissTableForJoinBuild { private: void InitRowArray(); - Status ProcessPartition(int64_t thread_id, const ExecBatch& key_batch, - const ExecBatch* payload_batch_maybe_null, - arrow::util::TempVectorStack* temp_stack, int prtn_id); SwissTableForJoin* target_; // DOP stands for Degree Of Parallelism - the maximum number of participating @@ -604,6 +609,22 @@ class SwissTableForJoinBuild { MemoryPool* pool_; int64_t hardware_flags_; + // One per batch. + // + // Informations like hashes and partitions of each batch gathered in the partition phase + // and used in the build phase. + // + struct BatchState { + // Hashes for the batch, preserved in the partition phase to avoid recomputation in + // the build phase. One element per row in the batch. + std::vector hashes; + // Accumulative number of rows in each partition for the batch. `num_prtns_` + 1 + // elements. + std::vector prtn_ranges; + // Row ids after partition sorting the batch. One element per row in the batch. + std::vector prtn_row_ids; + }; + // One per partition. // struct PartitionState { @@ -620,17 +641,13 @@ class SwissTableForJoinBuild { // batches. // struct ThreadState { - std::vector batch_hashes; - std::vector batch_prtn_ranges; - std::vector batch_prtn_row_ids; - std::vector temp_prtn_ids; std::vector temp_group_ids; std::vector temp_column_arrays; }; + std::vector batch_states_; std::vector prtn_states_; std::vector thread_states_; - PartitionLocks prtn_locks_; std::vector partition_keys_first_row_id_; std::vector partition_payloads_first_row_id_; diff --git a/cpp/src/arrow/acero/task_util.cc b/cpp/src/arrow/acero/task_util.cc index 85378eaeeb27c..082ec99946e9f 100644 --- a/cpp/src/arrow/acero/task_util.cc +++ b/cpp/src/arrow/acero/task_util.cc @@ -91,11 +91,11 @@ class TaskSchedulerImpl : public TaskScheduler { AbortContinuationImpl abort_cont_impl_; std::vector task_groups_; - bool aborted_; bool register_finished_; std::mutex mutex_; // Mutex protecting task_groups_ (state_ and num_tasks_present_ - // fields), aborted_ flag and register_finished_ flag + // fields) and register_finished_ flag + AtomicWithPadding aborted_; AtomicWithPadding num_tasks_to_schedule_; // If a task group adds tasks it's possible for a thread inside // ScheduleMore to miss this fact. 
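The new per-batch `BatchState` documented above keeps the hashes, a cumulative `prtn_ranges` prefix sum, and the permuted `prtn_row_ids` computed in the partition phase, so a build-phase thread working on one partition can pull its rows out of every batch without re-hashing. A small sketch of how `ProcessPartition`-style code slices that state (types simplified, names local to the example):

```cpp
#include <cstdint>
#include <vector>

struct BatchState {
  std::vector<uint32_t> hashes;        // one per row
  std::vector<uint16_t> prtn_ranges;   // num_prtns + 1 cumulative offsets
  std::vector<uint16_t> prtn_row_ids;  // row ids grouped by partition
};

// Visit the rows of `batch` that fall into partition `prtn_id`.
template <typename Visitor>
void ForEachRowInPartition(const BatchState& batch, int prtn_id, Visitor&& visit) {
  uint16_t begin = batch.prtn_ranges[prtn_id];
  uint16_t end = batch.prtn_ranges[prtn_id + 1];
  for (uint16_t pos = begin; pos < end; ++pos) {
    uint16_t row_id = batch.prtn_row_ids[pos];
    visit(row_id, batch.hashes[row_id]);
  }
}
```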
This serves as a flag to @@ -105,10 +105,8 @@ class TaskSchedulerImpl : public TaskScheduler { }; TaskSchedulerImpl::TaskSchedulerImpl() - : use_sync_execution_(false), - num_concurrent_tasks_(0), - aborted_(false), - register_finished_(false) { + : use_sync_execution_(false), num_concurrent_tasks_(0), register_finished_(false) { + aborted_.value.store(false); num_tasks_to_schedule_.value.store(0); tasks_added_recently_.value.store(false); } @@ -131,13 +129,11 @@ Status TaskSchedulerImpl::StartTaskGroup(size_t thread_id, int group_id, ARROW_DCHECK(group_id >= 0 && group_id < static_cast(task_groups_.size())); TaskGroup& task_group = task_groups_[group_id]; - bool aborted = false; + bool aborted = aborted_.value.load(); bool all_tasks_finished = false; { std::lock_guard lock(mutex_); - aborted = aborted_; - if (task_group.state_ == TaskGroupState::NOT_READY) { task_group.num_tasks_present_ = total_num_tasks; if (total_num_tasks == 0) { @@ -212,7 +208,7 @@ std::vector> TaskSchedulerImpl::PickTasks(int num_tasks, Status TaskSchedulerImpl::ExecuteTask(size_t thread_id, int group_id, int64_t task_id, bool* task_group_finished) { - if (!aborted_) { + if (!aborted_.value.load()) { RETURN_NOT_OK(task_groups_[group_id].task_impl_(thread_id, task_id)); } *task_group_finished = PostExecuteTask(thread_id, group_id); @@ -228,11 +224,10 @@ bool TaskSchedulerImpl::PostExecuteTask(size_t thread_id, int group_id) { Status TaskSchedulerImpl::OnTaskGroupFinished(size_t thread_id, int group_id, bool* all_task_groups_finished) { - bool aborted = false; + bool aborted = aborted_.value.load(); { std::lock_guard lock(mutex_); - aborted = aborted_; TaskGroup& task_group = task_groups_[group_id]; task_group.state_ = TaskGroupState::ALL_TASKS_FINISHED; *all_task_groups_finished = true; @@ -260,7 +255,7 @@ Status TaskSchedulerImpl::ExecuteMore(size_t thread_id, int num_tasks_to_execute int last_id = 0; for (;;) { - if (aborted_) { + if (aborted_.value.load()) { return Status::Cancelled("Scheduler cancelled"); } @@ -278,8 +273,8 @@ Status TaskSchedulerImpl::ExecuteMore(size_t thread_id, int num_tasks_to_execute bool task_group_finished = false; Status status = ExecuteTask(thread_id, group_id, task_id, &task_group_finished); if (!status.ok()) { - // Mark the remaining picked tasks as finished - for (size_t j = i + 1; j < tasks.size(); ++j) { + // Mark the current and remaining picked tasks as finished + for (size_t j = i; j < tasks.size(); ++j) { if (PostExecuteTask(thread_id, tasks[j].first)) { bool all_task_groups_finished = false; RETURN_NOT_OK( @@ -328,7 +323,7 @@ Status TaskSchedulerImpl::StartScheduling(size_t thread_id, ScheduleImpl schedul } Status TaskSchedulerImpl::ScheduleMore(size_t thread_id, int num_tasks_finished) { - if (aborted_) { + if (aborted_.value.load()) { return Status::Cancelled("Scheduler cancelled"); } @@ -369,17 +364,25 @@ Status TaskSchedulerImpl::ScheduleMore(size_t thread_id, int num_tasks_finished) int group_id = tasks[i].first; int64_t task_id = tasks[i].second; RETURN_NOT_OK(schedule_impl_([this, group_id, task_id](size_t thread_id) -> Status { - RETURN_NOT_OK(ScheduleMore(thread_id, 1)); - bool task_group_finished = false; - RETURN_NOT_OK(ExecuteTask(thread_id, group_id, task_id, &task_group_finished)); + // PostExecuteTask must be called later if any error ocurres during task execution + // (including ScheduleMore), so we preserve the status. 
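The task_util.cc hunk moves `aborted_` out of the mutex into a padded atomic so the hot paths (`ExecuteTask`, `ExecuteMore`, `ScheduleMore`) can check it without taking the lock. A minimal sketch of the idea, using `alignas` in place of Acero's `AtomicWithPadding` wrapper:

```cpp
#include <atomic>
#include <mutex>

class Scheduler {
 public:
  // Hot path: lock-free check, as in ExecuteTask()/ExecuteMore() above.
  bool IsAborted() const { return aborted_.value.load(); }

  void Abort() {
    aborted_.value.store(true);
    std::lock_guard<std::mutex> lock(mutex_);  // other state still needs the mutex
    // ... walk task groups, as in TaskSchedulerImpl::Abort() ...
  }

 private:
  // Stand-in for AtomicWithPadding: pad to a cache line to avoid false sharing.
  struct alignas(64) PaddedAtomicBool {
    std::atomic<bool> value{false};
  };

  PaddedAtomicBool aborted_;
  std::mutex mutex_;
};
```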
+ auto status = [&]() { + RETURN_NOT_OK(ScheduleMore(thread_id, 1)); + return ExecuteTask(thread_id, group_id, task_id, &task_group_finished); + }(); + + if (!status.ok()) { + task_group_finished = PostExecuteTask(thread_id, group_id); + } if (task_group_finished) { bool all_task_groups_finished = false; - return OnTaskGroupFinished(thread_id, group_id, &all_task_groups_finished); + RETURN_NOT_OK( + OnTaskGroupFinished(thread_id, group_id, &all_task_groups_finished)); } - return Status::OK(); + return status; })); } @@ -388,31 +391,43 @@ Status TaskSchedulerImpl::ScheduleMore(size_t thread_id, int num_tasks_finished) void TaskSchedulerImpl::Abort(AbortContinuationImpl impl) { bool all_finished = true; + DCHECK_EQ(aborted_.value.load(), false); + aborted_.value.store(true); { std::lock_guard lock(mutex_); - aborted_ = true; abort_cont_impl_ = std::move(impl); if (register_finished_) { for (size_t i = 0; i < task_groups_.size(); ++i) { TaskGroup& task_group = task_groups_[i]; - if (task_group.state_ == TaskGroupState::NOT_READY) { - task_group.state_ = TaskGroupState::ALL_TASKS_FINISHED; - } else if (task_group.state_ == TaskGroupState::READY) { - int64_t expected = task_group.num_tasks_started_.value.load(); - for (;;) { - if (task_group.num_tasks_started_.value.compare_exchange_strong( - expected, task_group.num_tasks_present_)) { - break; + switch (task_group.state_) { + case TaskGroupState::NOT_READY: { + task_group.state_ = TaskGroupState::ALL_TASKS_FINISHED; + break; + } + case TaskGroupState::READY: { + int64_t expected = task_group.num_tasks_started_.value.load(); + for (;;) { + if (task_group.num_tasks_started_.value.compare_exchange_strong( + expected, task_group.num_tasks_present_)) { + break; + } } + int64_t before_add = task_group.num_tasks_finished_.value.fetch_add( + task_group.num_tasks_present_ - expected); + if (before_add >= expected) { + task_group.state_ = TaskGroupState::ALL_TASKS_FINISHED; + } else { + all_finished = false; + task_group.state_ = TaskGroupState::ALL_TASKS_STARTED; + } + break; } - int64_t before_add = task_group.num_tasks_finished_.value.fetch_add( - task_group.num_tasks_present_ - expected); - if (before_add >= expected) { - task_group.state_ = TaskGroupState::ALL_TASKS_FINISHED; - } else { + case TaskGroupState::ALL_TASKS_STARTED: { all_finished = false; - task_group.state_ = TaskGroupState::ALL_TASKS_STARTED; + break; } + default: + break; } } } diff --git a/cpp/src/arrow/acero/task_util_test.cc b/cpp/src/arrow/acero/task_util_test.cc index d5196ad4e0a03..30f80012e5c40 100644 --- a/cpp/src/arrow/acero/task_util_test.cc +++ b/cpp/src/arrow/acero/task_util_test.cc @@ -231,5 +231,97 @@ TEST(TaskScheduler, StressTwo) { } } +TEST(TaskScheduler, AbortContOnTaskErrorSerial) { + constexpr int kNumTasks = 16; + + auto scheduler = TaskScheduler::Make(); + auto task = [&](std::size_t, int64_t task_id) { + if (task_id == kNumTasks / 2) { + return Status::Invalid("Task failed"); + } + return Status::OK(); + }; + + int task_group = + scheduler->RegisterTaskGroup(task, [](std::size_t) { return Status::OK(); }); + scheduler->RegisterEnd(); + + ASSERT_OK(scheduler->StartScheduling( + /*thread_id=*/0, + /*schedule_impl=*/ + [](TaskScheduler::TaskGroupContinuationImpl) { return Status::OK(); }, + /*num_concurrent_tasks=*/1, /*use_sync_execution=*/true)); + ASSERT_RAISES_WITH_MESSAGE( + Invalid, "Invalid: Task failed", + scheduler->StartTaskGroup(/*thread_id=*/0, task_group, kNumTasks)); + + int num_abort_cont_calls = 0; + auto abort_cont = [&]() { 
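In the READY case of the rewritten `Abort()` switch, the scheduler atomically raises `num_tasks_started_` to `num_tasks_present_` so no further tasks are handed out, then uses the finished counter to decide whether the group is already complete. A standalone sketch of that compare-exchange step (toy types; the real counters are `AtomicWithPadding` members):

```cpp
#include <atomic>
#include <cstdint>

struct ToyTaskGroup {
  int64_t num_tasks_present = 0;
  std::atomic<int64_t> num_tasks_started{0};
  std::atomic<int64_t> num_tasks_finished{0};
};

// Returns true if the group can be marked ALL_TASKS_FINISHED immediately.
bool CancelRemainingTasks(ToyTaskGroup* g) {
  int64_t expected = g->num_tasks_started.load();
  // Claim all not-yet-started tasks so no worker picks them up after abort.
  while (!g->num_tasks_started.compare_exchange_strong(expected, g->num_tasks_present)) {
  }
  // Account for the claimed tasks as finished; if everything that had actually
  // started has also finished, the group is done right now.
  int64_t before_add = g->num_tasks_finished.fetch_add(g->num_tasks_present - expected);
  return before_add >= expected;
}
```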
++num_abort_cont_calls; }; + + scheduler->Abort(abort_cont); + + ASSERT_EQ(num_abort_cont_calls, 1); +} + +TEST(TaskScheduler, AbortContOnTaskErrorParallel) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr int kNumThreads = 16; + + ThreadIndexer thread_indexer; + int num_threads = std::min(static_cast(thread_indexer.Capacity()), kNumThreads); + ASSERT_OK_AND_ASSIGN(std::shared_ptr thread_pool, + MakePrimedThreadPool(num_threads)); + TaskScheduler::ScheduleImpl schedule = + [&](TaskScheduler::TaskGroupContinuationImpl task) { + return thread_pool->Spawn([&, task] { + std::size_t thread_id = thread_indexer(); + auto status = task(thread_id); + ASSERT_TRUE(status.ok() || status.IsInvalid() || status.IsCancelled()) + << status; + }); + }; + + for (int num_tasks : + {2, num_threads - 1, num_threads, num_threads + 1, 2 * num_threads}) { + ARROW_SCOPED_TRACE("num_tasks = ", num_tasks); + for (int num_concurrent_tasks : + {1, num_tasks - 1, num_tasks, num_tasks + 1, 2 * num_tasks}) { + ARROW_SCOPED_TRACE("num_concurrent_tasks = ", num_concurrent_tasks); + for (int aborting_task_id = 0; aborting_task_id < num_tasks; ++aborting_task_id) { + ARROW_SCOPED_TRACE("aborting_task_id = ", aborting_task_id); + auto scheduler = TaskScheduler::Make(); + + int num_abort_cont_calls = 0; + auto abort_cont = [&]() { ++num_abort_cont_calls; }; + + auto task = [&](std::size_t, int64_t task_id) { + if (task_id == aborting_task_id) { + scheduler->Abort(abort_cont); + } + if (task_id % 2 == 0) { + return Status::Invalid("Task failed"); + } + return Status::OK(); + }; + + int task_group = + scheduler->RegisterTaskGroup(task, [](std::size_t) { return Status::OK(); }); + scheduler->RegisterEnd(); + + ASSERT_OK(scheduler->StartScheduling(/*thread_id=*/0, schedule, + num_concurrent_tasks, + /*use_sync_execution=*/false)); + ASSERT_OK(scheduler->StartTaskGroup(/*thread_id=*/0, task_group, num_tasks)); + + thread_pool->WaitForIdle(); + + ASSERT_EQ(num_abort_cont_calls, 1); + } + } + } +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/util_test.cc b/cpp/src/arrow/acero/util_test.cc index a291075a0a9a7..8273bae237eea 100644 --- a/cpp/src/arrow/acero/util_test.cc +++ b/cpp/src/arrow/acero/util_test.cc @@ -15,12 +15,13 @@ // specific language governing permissions and limitations // under the License. 
+#include +#include "arrow/acero/concurrent_queue_internal.h" #include "arrow/acero/hash_join_node.h" #include "arrow/acero/schema_util.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" - using testing::Eq; namespace arrow { @@ -184,5 +185,120 @@ TEST(FieldMap, ExtensionTypeHashJoin) { EXPECT_EQ(i.get(0), 0); } +template +void ConcurrentQueueBasicTest(Queue& queue) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading enabled"; +#endif + ASSERT_TRUE(queue.Empty()); + queue.Push(1); + ASSERT_FALSE(queue.Empty()); + ASSERT_EQ(queue.TryPop(), std::make_optional(1)); + ASSERT_TRUE(queue.Empty()); + + auto fut_pop = std::async(std::launch::async, [&]() { return queue.WaitAndPop(); }); + ASSERT_EQ(fut_pop.wait_for(std::chrono::milliseconds(10)), std::future_status::timeout); + queue.Push(2); + queue.Push(3); + queue.Push(4); + ASSERT_EQ(fut_pop.wait_for(std::chrono::milliseconds(10)), std::future_status::ready); + ASSERT_EQ(fut_pop.get(), 2); + fut_pop = std::async(std::launch::async, [&]() { return queue.WaitAndPop(); }); + ASSERT_EQ(fut_pop.wait_for(std::chrono::milliseconds(10)), std::future_status::ready); + ASSERT_EQ(fut_pop.get(), 3); + ASSERT_FALSE(queue.Empty()); + ASSERT_EQ(queue.TryPop(), std::make_optional(4)); + ASSERT_EQ(queue.TryPop(), std::nullopt); + queue.Push(5); + ASSERT_FALSE(queue.Empty()); + ASSERT_EQ(queue.Front(), 5); + ASSERT_FALSE(queue.Empty()); + queue.Clear(); + ASSERT_TRUE(queue.Empty()); +} + +TEST(ConcurrentQueue, BasicTest) { + ConcurrentQueue queue; + ConcurrentQueueBasicTest(queue); +} + +class BackpressureTestExecNode : public ExecNode { + public: + BackpressureTestExecNode() : ExecNode(nullptr, {}, {}, nullptr) {} + const char* kind_name() const override { return "BackpressureTestNode"; } + Status InputReceived(ExecNode* input, ExecBatch batch) override { + return Status::NotImplemented("Test only node"); + } + Status InputFinished(ExecNode* input, int total_batches) override { + return Status::NotImplemented("Test only node"); + } + Status StartProducing() override { return Status::NotImplemented("Test only node"); } + + protected: + Status StopProducingImpl() override { + stopped = true; + return Status::OK(); + } + + public: + void PauseProducing(ExecNode* output, int32_t counter) override { paused = true; } + void ResumeProducing(ExecNode* output, int32_t counter) override { paused = false; } + bool paused{false}; + bool stopped{false}; +}; + +class TestBackpressureControl : public BackpressureControl { + public: + explicit TestBackpressureControl(BackpressureTestExecNode* test_node) + : test_node(test_node) {} + virtual void Pause() { test_node->PauseProducing(nullptr, 0); } + virtual void Resume() { test_node->ResumeProducing(nullptr, 0); } + BackpressureTestExecNode* test_node; +}; + +TEST(BackpressureConcurrentQueue, BasicTest) { + BackpressureTestExecNode dummy_node; + auto ctrl = std::make_unique(&dummy_node); + ASSERT_OK_AND_ASSIGN(auto handler, + BackpressureHandler::Make(&dummy_node, 2, 4, std::move(ctrl))); + BackpressureConcurrentQueue queue(std::move(handler)); + + ConcurrentQueueBasicTest(queue); + ASSERT_FALSE(dummy_node.paused); + ASSERT_FALSE(dummy_node.stopped); +} + +TEST(BackpressureConcurrentQueue, BackpressureTest) { + BackpressureTestExecNode dummy_node; + auto ctrl = std::make_unique(&dummy_node); + ASSERT_OK_AND_ASSIGN(auto handler, + BackpressureHandler::Make(&dummy_node, 2, 4, std::move(ctrl))); + BackpressureConcurrentQueue 
queue(std::move(handler)); + + queue.Push(6); + queue.Push(7); + queue.Push(8); + ASSERT_FALSE(dummy_node.paused); + ASSERT_FALSE(dummy_node.stopped); + queue.Push(9); + ASSERT_TRUE(dummy_node.paused); + ASSERT_FALSE(dummy_node.stopped); + ASSERT_EQ(queue.TryPop(), std::make_optional(6)); + ASSERT_TRUE(dummy_node.paused); + ASSERT_FALSE(dummy_node.stopped); + ASSERT_EQ(queue.TryPop(), std::make_optional(7)); + ASSERT_FALSE(dummy_node.paused); + ASSERT_FALSE(dummy_node.stopped); + queue.Push(10); + ASSERT_FALSE(dummy_node.paused); + ASSERT_FALSE(dummy_node.stopped); + queue.Push(11); + ASSERT_TRUE(dummy_node.paused); + ASSERT_FALSE(dummy_node.stopped); + ASSERT_OK(queue.ForceShutdown()); + ASSERT_FALSE(dummy_node.paused); + ASSERT_TRUE(dummy_node.stopped); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 21faa3f4279ea..fa77f4ff4ed95 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -277,15 +277,15 @@ class ARROW_EXPORT FlatArray : public Array { /// Base class for arrays of fixed-size logical types class ARROW_EXPORT PrimitiveArray : public FlatArray { public: + /// Does not account for any slice offset + const std::shared_ptr& values() const { return data_->buffers[1]; } + + protected: PrimitiveArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); - /// Does not account for any slice offset - const std::shared_ptr& values() const { return data_->buffers[1]; } - - protected: PrimitiveArray() : raw_values_(NULLPTR) {} void SetData(const std::shared_ptr& data) { diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index 545425c264619..77ba2477791bb 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++ b/cpp/src/arrow/array/array_union_test.cc @@ -166,6 +166,36 @@ TEST(TestSparseUnionArray, Validate) { ASSERT_RAISES(Invalid, arr->ValidateFull()); } +TEST(TestSparseUnionArray, Comparison) { + auto ints1 = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]"); + auto ints2 = ArrayFromJSON(int32(), "[1, 2, -3, 4, -5, 6]"); + auto strs1 = ArrayFromJSON(utf8(), R"(["a", "b", "c", "d", "e", "f"])"); + auto strs2 = ArrayFromJSON(utf8(), R"(["a", "*", "c", "d", "e", "*"])"); + std::vector type_codes{8, 42}; + + auto check_equality = [&](const std::string& type_ids_json1, + const std::string& type_ids_json2, bool expected_equals) { + auto type_ids1 = ArrayFromJSON(int8(), type_ids_json1); + auto type_ids2 = ArrayFromJSON(int8(), type_ids_json2); + ASSERT_OK_AND_ASSIGN(auto arr1, + SparseUnionArray::Make(*type_ids1, {ints1, strs1}, type_codes)); + ASSERT_OK_AND_ASSIGN(auto arr2, + SparseUnionArray::Make(*type_ids2, {ints2, strs2}, type_codes)); + ASSERT_EQ(arr1->Equals(arr2), expected_equals); + ASSERT_EQ(arr2->Equals(arr1), expected_equals); + }; + + // Same type ids + check_equality("[8, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 8]", true); + check_equality("[8, 8, 42, 42, 42, 42]", "[8, 8, 42, 42, 42, 42]", false); + check_equality("[8, 8, 8, 42, 42, 8]", "[8, 8, 8, 42, 42, 8]", false); + check_equality("[8, 42, 42, 42, 42, 8]", "[8, 42, 42, 42, 42, 8]", false); + + // Different type ids + check_equality("[42, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 8]", false); + check_equality("[8, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 42]", false); +} + // 
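The `BackpressureConcurrentQueue` test above builds its handler with `BackpressureHandler::Make(&dummy_node, 2, 4, ...)` and then shows the source being paused once the queue holds 4 elements and resumed once it drains back to 2. A toy queue with the same watermark behaviour, standard library only; the real class routes these calls through `BackpressureHandler` and the owning `ExecNode`, and the 2/4 thresholds are simply the values the test uses:

```cpp
#include <cstddef>
#include <deque>
#include <functional>
#include <iostream>

class ToyBackpressureQueue {
 public:
  ToyBackpressureQueue(std::size_t resume_if_below, std::size_t pause_if_above,
                       std::function<void()> pause, std::function<void()> resume)
      : resume_if_below_(resume_if_below),
        pause_if_above_(pause_if_above),
        pause_(std::move(pause)),
        resume_(std::move(resume)) {}

  void Push(int v) {
    queue_.push_back(v);
    if (queue_.size() >= pause_if_above_ && !paused_) {
      paused_ = true;
      pause_();
    }
  }

  int Pop() {
    int v = queue_.front();
    queue_.pop_front();
    if (queue_.size() <= resume_if_below_ && paused_) {
      paused_ = false;
      resume_();
    }
    return v;
  }

 private:
  std::size_t resume_if_below_, pause_if_above_;
  std::function<void()> pause_, resume_;
  std::deque<int> queue_;
  bool paused_ = false;
};

int main() {
  ToyBackpressureQueue q(/*resume_if_below=*/2, /*pause_if_above=*/4,
                         [] { std::cout << "pause\n"; },
                         [] { std::cout << "resume\n"; });
  for (int i = 0; i < 4; ++i) q.Push(i);  // prints "pause" on the 4th push
  q.Pop();                                // size 3: still paused
  q.Pop();                                // size 2: prints "resume"
  return 0;
}
```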
------------------------------------------------------------------------- // Tests for MakeDense and MakeSparse diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 0eb22a9d1553d..9667e1590ea16 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -63,28 +63,28 @@ Status CheckBufferSlice(const Buffer& buffer, int64_t offset) { } // namespace -Result> SliceBufferSafe(const std::shared_ptr& buffer, +Result> SliceBufferSafe(std::shared_ptr buffer, int64_t offset) { RETURN_NOT_OK(CheckBufferSlice(*buffer, offset)); - return SliceBuffer(buffer, offset); + return SliceBuffer(std::move(buffer), offset); } -Result> SliceBufferSafe(const std::shared_ptr& buffer, +Result> SliceBufferSafe(std::shared_ptr buffer, int64_t offset, int64_t length) { RETURN_NOT_OK(CheckBufferSlice(*buffer, offset, length)); - return SliceBuffer(buffer, offset, length); + return SliceBuffer(std::move(buffer), offset, length); } -Result> SliceMutableBufferSafe( - const std::shared_ptr& buffer, int64_t offset) { +Result> SliceMutableBufferSafe(std::shared_ptr buffer, + int64_t offset) { RETURN_NOT_OK(CheckBufferSlice(*buffer, offset)); - return SliceMutableBuffer(buffer, offset); + return SliceMutableBuffer(std::move(buffer), offset); } -Result> SliceMutableBufferSafe( - const std::shared_ptr& buffer, int64_t offset, int64_t length) { +Result> SliceMutableBufferSafe(std::shared_ptr buffer, + int64_t offset, int64_t length) { RETURN_NOT_OK(CheckBufferSlice(*buffer, offset, length)); - return SliceMutableBuffer(buffer, offset, length); + return SliceMutableBuffer(std::move(buffer), offset, length); } std::string Buffer::ToHexString() { @@ -167,9 +167,9 @@ std::shared_ptr Buffer::FromString(std::string data) { return std::make_shared(std::move(data)); } -std::shared_ptr SliceMutableBuffer(const std::shared_ptr& buffer, +std::shared_ptr SliceMutableBuffer(std::shared_ptr buffer, const int64_t offset, const int64_t length) { - return std::make_shared(buffer, offset, length); + return std::make_shared(std::move(buffer), offset, length); } MutableBuffer::MutableBuffer(const std::shared_ptr& parent, const int64_t offset, diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index fbf4a22e350ca..1b546a83ccc4d 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -396,33 +396,33 @@ class ARROW_EXPORT Buffer { /// \brief Construct a view on a buffer at the given offset and length. /// /// This function cannot fail and does not check for errors (except in debug builds) -static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, +static inline std::shared_ptr SliceBuffer(std::shared_ptr buffer, const int64_t offset, const int64_t length) { - return std::make_shared(buffer, offset, length); + return std::make_shared(std::move(buffer), offset, length); } /// \brief Construct a view on a buffer at the given offset, up to the buffer's end. /// /// This function cannot fail and does not check for errors (except in debug builds) -static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, +static inline std::shared_ptr SliceBuffer(std::shared_ptr buffer, const int64_t offset) { int64_t length = buffer->size() - offset; - return SliceBuffer(buffer, offset, length); + return SliceBuffer(std::move(buffer), offset, length); } /// \brief Input-checking version of SliceBuffer /// /// An Invalid Status is returned if the requested slice falls out of bounds. 
ARROW_EXPORT -Result> SliceBufferSafe(const std::shared_ptr& buffer, +Result> SliceBufferSafe(std::shared_ptr buffer, int64_t offset); /// \brief Input-checking version of SliceBuffer /// /// An Invalid Status is returned if the requested slice falls out of bounds. /// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size. ARROW_EXPORT -Result> SliceBufferSafe(const std::shared_ptr& buffer, +Result> SliceBufferSafe(std::shared_ptr buffer, int64_t offset, int64_t length); /// \brief Like SliceBuffer, but construct a mutable buffer slice. @@ -430,32 +430,32 @@ Result> SliceBufferSafe(const std::shared_ptr& b /// If the parent buffer is not mutable, behavior is undefined (it may abort /// in debug builds). ARROW_EXPORT -std::shared_ptr SliceMutableBuffer(const std::shared_ptr& buffer, +std::shared_ptr SliceMutableBuffer(std::shared_ptr buffer, const int64_t offset, const int64_t length); /// \brief Like SliceBuffer, but construct a mutable buffer slice. /// /// If the parent buffer is not mutable, behavior is undefined (it may abort /// in debug builds). -static inline std::shared_ptr SliceMutableBuffer( - const std::shared_ptr& buffer, const int64_t offset) { +static inline std::shared_ptr SliceMutableBuffer(std::shared_ptr buffer, + const int64_t offset) { int64_t length = buffer->size() - offset; - return SliceMutableBuffer(buffer, offset, length); + return SliceMutableBuffer(std::move(buffer), offset, length); } /// \brief Input-checking version of SliceMutableBuffer /// /// An Invalid Status is returned if the requested slice falls out of bounds. ARROW_EXPORT -Result> SliceMutableBufferSafe( - const std::shared_ptr& buffer, int64_t offset); +Result> SliceMutableBufferSafe(std::shared_ptr buffer, + int64_t offset); /// \brief Input-checking version of SliceMutableBuffer /// /// An Invalid Status is returned if the requested slice falls out of bounds. /// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size. ARROW_EXPORT -Result> SliceMutableBufferSafe( - const std::shared_ptr& buffer, int64_t offset, int64_t length); +Result> SliceMutableBufferSafe(std::shared_ptr buffer, + int64_t offset, int64_t length); /// @} diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 23a921cc5a0a4..e0e6d183393a7 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -381,21 +381,49 @@ class RangeDataEqualsImpl { const int8_t* right_codes = right_.GetValues(1); // Unions don't have a null bitmap + int64_t run_start = 0; // Start index of the current run + for (int64_t i = 0; i < range_length_; ++i) { - const auto type_id = left_codes[left_start_idx_ + i]; - if (type_id != right_codes[right_start_idx_ + i]) { + const auto current_type_id = left_codes[left_start_idx_ + i]; + + if (current_type_id != right_codes[right_start_idx_ + i]) { result_ = false; break; } - const auto child_num = child_ids[type_id]; - // XXX can we instead detect runs of same-child union values? 
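The buffer.h/buffer.cc hunks change the slice helpers to take `std::shared_ptr<Buffer>` by value and `std::move` it into the new slice, so a caller passing an rvalue avoids a ref-count increment/decrement pair while lvalue callers behave as before. A generic sketch of the idiom outside Arrow:

```cpp
#include <memory>
#include <utility>

struct Blob { /* payload */ };

struct BlobView {
  explicit BlobView(std::shared_ptr<Blob> parent) : parent_(std::move(parent)) {}
  std::shared_ptr<Blob> parent_;
};

// Take the shared_ptr by value: an lvalue argument is copied once (as before),
// but an rvalue argument is moved all the way through with no ref-count traffic.
BlobView Slice(std::shared_ptr<Blob> parent) { return BlobView(std::move(parent)); }

int main() {
  auto blob = std::make_shared<Blob>();
  BlobView kept = Slice(blob);              // one ref-count bump, blob still usable
  BlobView taken = Slice(std::move(blob));  // no ref-count bump at all
  (void)kept;
  (void)taken;
  return 0;
}
```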
+ // Check if the current element breaks the run + if (i > 0 && current_type_id != left_codes[left_start_idx_ + i - 1]) { + // Compare the previous run + const auto previous_child_num = child_ids[left_codes[left_start_idx_ + i - 1]]; + int64_t run_length = i - run_start; + + RangeDataEqualsImpl impl( + options_, floating_approximate_, *left_.child_data[previous_child_num], + *right_.child_data[previous_child_num], + left_start_idx_ + left_.offset + run_start, + right_start_idx_ + right_.offset + run_start, run_length); + + if (!impl.Compare()) { + result_ = false; + break; + } + + // Start a new run + run_start = i; + } + } + + // Handle the final run + if (result_) { + const auto final_child_num = child_ids[left_codes[left_start_idx_ + run_start]]; + int64_t final_run_length = range_length_ - run_start; + RangeDataEqualsImpl impl( - options_, floating_approximate_, *left_.child_data[child_num], - *right_.child_data[child_num], left_start_idx_ + left_.offset + i, - right_start_idx_ + right_.offset + i, 1); + options_, floating_approximate_, *left_.child_data[final_child_num], + *right_.child_data[final_child_num], left_start_idx_ + left_.offset + run_start, + right_start_idx_ + right_.offset + run_start, final_run_length); + if (!impl.Compare()) { result_ = false; - break; } } return Status::OK(); diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 49d8709660684..20d3ce2faf256 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -24,8 +24,8 @@ #include "arrow/util/logging.h" namespace arrow { - namespace internal { + template <> struct EnumTraits : BasicEnumTraits return ""; } }; + +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "PivotWiderOptions::UnexpectedKeyBehavior"; } + static std::string value_name(compute::PivotWiderOptions::UnexpectedKeyBehavior value) { + switch (value) { + case compute::PivotWiderOptions::kIgnore: + return "kIgnore"; + case compute::PivotWiderOptions::kRaise: + return "kRaise"; + } + return ""; + } +}; + } // namespace internal namespace compute { @@ -91,6 +109,9 @@ static auto kVarianceOptionsType = GetFunctionOptionsType( DataMember("ddof", &VarianceOptions::ddof), DataMember("skip_nulls", &VarianceOptions::skip_nulls), DataMember("min_count", &VarianceOptions::min_count)); +static auto kSkewOptionsType = GetFunctionOptionsType( + DataMember("skip_nulls", &SkewOptions::skip_nulls), + DataMember("min_count", &SkewOptions::min_count)); static auto kQuantileOptionsType = GetFunctionOptionsType( DataMember("q", &QuantileOptions::q), DataMember("interpolation", &QuantileOptions::interpolation), @@ -101,6 +122,9 @@ static auto kTDigestOptionsType = GetFunctionOptionsType( DataMember("buffer_size", &TDigestOptions::buffer_size), DataMember("skip_nulls", &TDigestOptions::skip_nulls), DataMember("min_count", &TDigestOptions::min_count)); +static auto kPivotOptionsType = GetFunctionOptionsType( + DataMember("key_names", &PivotWiderOptions::key_names), + DataMember("unexpected_key_behavior", &PivotWiderOptions::unexpected_key_behavior)); static auto kIndexOptionsType = GetFunctionOptionsType(DataMember("value", &IndexOptions::value)); } // namespace @@ -130,6 +154,11 @@ VarianceOptions::VarianceOptions(int ddof, bool skip_nulls, uint32_t min_count) min_count(min_count) {} constexpr char VarianceOptions::kTypeName[]; +SkewOptions::SkewOptions(bool skip_nulls, uint32_t min_count) + : FunctionOptions(internal::kSkewOptionsType), + 
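The rewritten union comparison above replaces the per-element recursion with run detection: consecutive elements that share a type code are compared as a single child-array slice. A standalone sketch of the run detection over type codes, with the per-run comparison left as a callback (a hypothetical helper, not the `RangeDataEqualsImpl` API):

```cpp
#include <cstdint>
#include <functional>
#include <vector>

// Calls `compare_run(child_code, start, length)` once per maximal run of equal
// consecutive type codes, mirroring the run detection in the hunk above.
// Returns false as soon as a run comparison fails.
bool CompareUnionByRuns(
    const std::vector<int8_t>& type_codes,
    const std::function<bool(int8_t, int64_t, int64_t)>& compare_run) {
  int64_t run_start = 0;
  const int64_t length = static_cast<int64_t>(type_codes.size());
  for (int64_t i = 1; i <= length; ++i) {
    bool run_ends = (i == length) || (type_codes[i] != type_codes[i - 1]);
    if (run_ends) {
      if (!compare_run(type_codes[run_start], run_start, i - run_start)) return false;
      run_start = i;
    }
  }
  return true;
}
```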
skip_nulls(skip_nulls), + min_count(min_count) {} + QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation, bool skip_nulls, uint32_t min_count) : FunctionOptions(internal::kQuantileOptionsType), @@ -164,6 +193,13 @@ TDigestOptions::TDigestOptions(std::vector q, uint32_t delta, min_count{min_count} {} constexpr char TDigestOptions::kTypeName[]; +PivotWiderOptions::PivotWiderOptions(std::vector key_names, + UnexpectedKeyBehavior unexpected_key_behavior) + : FunctionOptions(internal::kPivotOptionsType), + key_names(std::move(key_names)), + unexpected_key_behavior(unexpected_key_behavior) {} +PivotWiderOptions::PivotWiderOptions() : FunctionOptions(internal::kPivotOptionsType) {} + IndexOptions::IndexOptions(std::shared_ptr value) : FunctionOptions(internal::kIndexOptionsType), value{std::move(value)} {} IndexOptions::IndexOptions() : IndexOptions(std::make_shared()) {} @@ -175,8 +211,10 @@ void RegisterAggregateOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kCountOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kModeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kVarianceOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kSkewOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kQuantileOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kTDigestOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kPivotOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kIndexOptionsType)); } } // namespace internal @@ -242,6 +280,14 @@ Result Variance(const Datum& value, const VarianceOptions& options, return CallFunction("variance", {value}, &options, ctx); } +Result Skew(const Datum& value, const SkewOptions& options, ExecContext* ctx) { + return CallFunction("skew", {value}, &options, ctx); +} + +Result Kurtosis(const Datum& value, const SkewOptions& options, ExecContext* ctx) { + return CallFunction("kurtosis", {value}, &options, ctx); +} + Result Quantile(const Datum& value, const QuantileOptions& options, ExecContext* ctx) { return CallFunction("quantile", {value}, &options, ctx); diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 2e5210b073ee4..61bab4cdb86f2 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -114,6 +114,20 @@ class ARROW_EXPORT VarianceOptions : public FunctionOptions { uint32_t min_count; }; +/// \brief Control Skew and Kurtosis kernel behavior +class ARROW_EXPORT SkewOptions : public FunctionOptions { + public: + explicit SkewOptions(bool skip_nulls = true, uint32_t min_count = 0); + static constexpr char const kTypeName[] = "SkewOptions"; + static SkewOptions Defaults() { return SkewOptions{}; } + + /// If true (the default), null values are ignored. Otherwise, if any value is null, + /// emit null. + bool skip_nulls; + /// If less than this many non-null values are observed, emit null. + uint32_t min_count; +}; + /// \brief Control Quantile kernel behavior /// /// By default, returns the median value. @@ -175,6 +189,88 @@ class ARROW_EXPORT TDigestOptions : public FunctionOptions { uint32_t min_count; }; +/// \brief Control Pivot kernel behavior +/// +/// These options apply to the "pivot_wider" and "hash_pivot_wider" functions. +/// +/// Constraints: +/// - The corresponding `Aggregate::target` must have two FieldRef elements; +/// the first one points to the pivot key column, the second points to the +/// pivoted data column. 
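For reference, a hypothetical end-to-end use of the aggregate additions in this hunk: the new `Skew`/`Kurtosis` convenience wrappers with `SkewOptions`, and the new `pivot_wider` kernel with `PivotWiderOptions` (documented in the header just below). This is a sketch that assumes the kernels introduced by this PR are registered; exact output formatting may differ:

```cpp
#include <iostream>
#include <string>
#include <vector>

#include "arrow/api.h"
#include "arrow/compute/api.h"

arrow::Status RunNewAggregates() {
  // Skew / Kurtosis over a small double array.
  arrow::DoubleBuilder dbl_builder;
  ARROW_RETURN_NOT_OK(dbl_builder.AppendValues({1.0, 2.0, 2.0, 3.0, 9.0}));
  ARROW_ASSIGN_OR_RAISE(auto doubles, dbl_builder.Finish());

  auto skew_options = arrow::compute::SkewOptions::Defaults();
  ARROW_ASSIGN_OR_RAISE(auto skew, arrow::compute::Skew(doubles, skew_options));
  ARROW_ASSIGN_OR_RAISE(auto kurtosis, arrow::compute::Kurtosis(doubles, skew_options));
  std::cout << "skew=" << skew.scalar()->ToString()
            << " kurtosis=" << kurtosis.scalar()->ToString() << std::endl;

  // pivot_wider: the first argument is the pivot key column, the second the values.
  arrow::StringBuilder key_builder;
  ARROW_RETURN_NOT_OK(
      key_builder.AppendValues(std::vector<std::string>{"width", "height"}));
  ARROW_ASSIGN_OR_RAISE(auto keys, key_builder.Finish());

  arrow::Int16Builder value_builder;
  ARROW_RETURN_NOT_OK(value_builder.AppendValues(std::vector<int16_t>{11, 13}));
  ARROW_ASSIGN_OR_RAISE(auto values, value_builder.Finish());

  arrow::compute::PivotWiderOptions pivot_options({"height", "width"});
  ARROW_ASSIGN_OR_RAISE(
      auto pivoted,
      arrow::compute::CallFunction("pivot_wider", {keys, values}, &pivot_options));
  // Expected per the header docs below: a struct scalar {"height": 13, "width": 11}.
  std::cout << pivoted.scalar()->ToString() << std::endl;
  return arrow::Status::OK();
}
```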
+/// - The pivot key column must be string-like; its values will be matched +/// against `key_names` in order to dispatch the pivoted data into the +/// output. +/// +/// "pivot_wider" example +/// --------------------- +/// +/// Assuming the following two input columns with types utf8 and int16 (respectively): +/// ``` +/// width | 11 +/// height | 13 +/// ``` +/// and the options `PivotWiderOptions(.key_names = {"height", "width"})` +/// +/// then the output will be a scalar with the type +/// `struct{"height": int16, "width": int16}` +/// and the value `{"height": 13, "width": 11}`. +/// +/// "hash_pivot_wider" example +/// -------------------------- +/// +/// Assuming the following input with schema +/// `{"group": int32, "key": utf8, "value": int16}`: +/// ``` +/// group | key | value +/// ----------------------------- +/// 1 | height | 11 +/// 1 | width | 12 +/// 2 | width | 13 +/// 3 | height | 14 +/// 3 | depth | 15 +/// ``` +/// and the following settings: +/// - a hash grouping key "group" +/// - Aggregate( +/// .function = "hash_pivot_wider", +/// .options = PivotWiderOptions(.key_names = {"height", "width"}), +/// .target = {"key", "value"}, +/// .name = {"properties"}) +/// +/// then the output will have the schema +/// `{"group": int32, "properties": struct{"height": int16, "width": int16}}` +/// and the following value: +/// ``` +/// group | properties +/// | height | width +/// ----------------------------- +/// 1 | 11 | 12 +/// 2 | null | 13 +/// 3 | 14 | null +/// ``` +class ARROW_EXPORT PivotWiderOptions : public FunctionOptions { + public: + /// Configure the behavior of pivot keys not in `key_names` + enum UnexpectedKeyBehavior { + /// Unexpected pivot keys are ignored silently + kIgnore, + /// Unexpected pivot keys return a KeyError + kRaise + }; + + explicit PivotWiderOptions(std::vector key_names, + UnexpectedKeyBehavior unexpected_key_behavior = kIgnore); + // Default constructor for serialization + PivotWiderOptions(); + static constexpr char const kTypeName[] = "PivotWiderOptions"; + static PivotWiderOptions Defaults() { return PivotWiderOptions{}; } + + /// The values expected in the pivot key column + std::vector key_names; + /// The behavior when pivot keys not in `key_names` are encountered + UnexpectedKeyBehavior unexpected_key_behavior = kIgnore; +}; + /// \brief Control Index kernel behavior class ARROW_EXPORT IndexOptions : public FunctionOptions { public: @@ -421,6 +517,34 @@ Result Variance(const Datum& value, const VarianceOptions& options = VarianceOptions::Defaults(), ExecContext* ctx = NULLPTR); +/// \brief Calculate the skewness of a numeric array +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see SkewOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed skewness as a DoubleScalar +/// +/// \since 20.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Skew(const Datum& value, + const SkewOptions& options = SkewOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the kurtosis of a numeric array +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see SkewOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed kurtosis as a DoubleScalar +/// +/// \since 20.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Kurtosis(const Datum& value, + const SkewOptions& options = 
SkewOptions::Defaults(), + ExecContext* ctx = NULLPTR); + /// \brief Calculate the quantiles of a numeric array /// /// \param[in] value input datum, expecting Array or ChunkedArray diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 61335de6ac09a..53ceed1b0893e 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -270,6 +270,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kSelectKOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kRankQuantileOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kListFlattenOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kInversePermutationOptionsType)); diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index e2f3195db5493..e970cd3175add 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/util/config.h" + #include "arrow/compute/expression.h" #include @@ -30,8 +32,10 @@ #include "arrow/compute/function_internal.h" #include "arrow/compute/util.h" #include "arrow/io/memory.h" -#include "arrow/ipc/reader.h" -#include "arrow/ipc/writer.h" +#ifdef ARROW_IPC +# include "arrow/ipc/reader.h" +# include "arrow/ipc/writer.h" +#endif #include "arrow/util/hash_util.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" @@ -1492,6 +1496,7 @@ Result RemoveNamedRefs(Expression src) { // this in the schema of a RecordBatch. Embedded arrays and scalars are stored in its // columns. Finally, the RecordBatch is written to an IPC file. Result> Serialize(const Expression& expr) { +#ifdef ARROW_IPC struct { std::shared_ptr metadata_ = std::make_shared(); ArrayVector columns_; @@ -1567,9 +1572,13 @@ Result> Serialize(const Expression& expr) { RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); RETURN_NOT_OK(writer->Close()); return stream->Finish(); +#else + return Status::NotImplemented("IPC feature isn't enabled"); +#endif } Result Deserialize(std::shared_ptr buffer) { +#ifdef ARROW_IPC io::BufferReader stream(std::move(buffer)); ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream)); ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0)); @@ -1670,6 +1679,9 @@ Result Deserialize(std::shared_ptr buffer) { }; return FromRecordBatch{*batch, 0}.GetOne(); +#else + return Status::NotImplemented("IPC feature isn't enabled"); +#endif } Expression project(std::vector values, std::vector names) { diff --git a/cpp/src/arrow/compute/kernels/aggregate_internal.h b/cpp/src/arrow/compute/kernels/aggregate_internal.h index 9dab049821d5c..23aa20eddc397 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_internal.h @@ -17,6 +17,9 @@ #pragma once +#include +#include + #include "arrow/compute/kernels/util_internal.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -24,9 +27,7 @@ #include "arrow/util/int128_internal.h" #include "arrow/util/logging.h" -namespace arrow { -namespace compute { -namespace internal { +namespace arrow::compute::internal { // Find the largest compatible primitive type for a primitive type. 
template @@ -254,6 +255,4 @@ SumType SumArray(const ArraySpan& data) { data, [](ValueType v) { return static_cast(v); }); } -} // namespace internal -} // namespace compute -} // namespace arrow +} // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/aggregate_pivot.cc b/cpp/src/arrow/compute/kernels/aggregate_pivot.cc new file mode 100644 index 0000000000000..bcc2f53ac1544 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/aggregate_pivot.cc @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/api_aggregate.h" +#include "arrow/compute/kernels/aggregate_internal.h" +#include "arrow/compute/kernels/common_internal.h" +#include "arrow/compute/kernels/pivot_internal.h" +#include "arrow/scalar.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/logging.h" + +namespace arrow::compute::internal { +namespace { + +using arrow::internal::VisitSetBitRunsVoid; +using arrow::util::span; + +struct PivotImpl : public ScalarAggregator { + Status Init(const PivotWiderOptions& options, const std::vector& in_types) { + options_ = &options; + key_type_ = in_types[0].GetSharedPtr(); + auto value_type = in_types[1].GetSharedPtr(); + FieldVector fields; + fields.reserve(options_->key_names.size()); + values_.reserve(options_->key_names.size()); + for (const auto& key_name : options_->key_names) { + fields.push_back(field(key_name, value_type)); + values_.push_back(MakeNullScalar(value_type)); + } + out_type_ = struct_(std::move(fields)); + ARROW_ASSIGN_OR_RAISE(key_mapper_, PivotWiderKeyMapper::Make(*key_type_, options_)); + return Status::OK(); + } + + Status Consume(KernelContext*, const ExecSpan& batch) override { + DCHECK_EQ(batch.num_values(), 2); + if (batch[0].is_array()) { + ARROW_ASSIGN_OR_RAISE(span keys, + key_mapper_->MapKeys(batch[0].array)); + if (batch[1].is_array()) { + // Array keys, array values + auto values = batch[1].array.ToArray(); + for (int64_t i = 0; i < batch.length; ++i) { + PivotWiderKeyIndex key = keys[i]; + if (key != kNullPivotKey && !values->IsNull(i)) { + if (ARROW_PREDICT_FALSE(values_[key]->is_valid)) { + return DuplicateValue(); + } + ARROW_ASSIGN_OR_RAISE(values_[key], values->GetScalar(i)); + DCHECK(values_[key]->is_valid); + } + } + } else { + // Array keys, scalar value + const Scalar* value = batch[1].scalar; + if (value->is_valid) { + for (int64_t i = 0; i < batch.length; ++i) { + PivotWiderKeyIndex key = keys[i]; + if (key != kNullPivotKey) { + if (ARROW_PREDICT_FALSE(values_[key]->is_valid)) { + return DuplicateValue(); + } + values_[key] = value->GetSharedPtr(); + } + } + } + } + } else { + ARROW_ASSIGN_OR_RAISE(PivotWiderKeyIndex key, + key_mapper_->MapKey(*batch[0].scalar)); + if (key != kNullPivotKey) { + if (batch[1].is_array()) { + // 
Scalar key, array values + auto values = batch[1].array.ToArray(); + for (int64_t i = 0; i < batch.length; ++i) { + if (!values->IsNull(i)) { + if (ARROW_PREDICT_FALSE(values_[key]->is_valid)) { + return DuplicateValue(); + } + ARROW_ASSIGN_OR_RAISE(values_[key], values->GetScalar(i)); + DCHECK(values_[key]->is_valid); + } + } + } else { + // Scalar key, scalar value + const Scalar* value = batch[1].scalar; + if (value->is_valid) { + if (batch.length > 1 || values_[key]->is_valid) { + return DuplicateValue(); + } + values_[key] = value->GetSharedPtr(); + } + } + } + } + return Status::OK(); + } + + Status MergeFrom(KernelContext*, KernelState&& src) override { + const auto& other_state = checked_cast(src); + for (int64_t key = 0; key < static_cast(values_.size()); ++key) { + if (other_state.values_[key]->is_valid) { + if (ARROW_PREDICT_FALSE(values_[key]->is_valid)) { + return DuplicateValue(); + } + values_[key] = other_state.values_[key]; + } + } + return Status::OK(); + } + + Status Finalize(KernelContext* ctx, Datum* out) override { + *out = std::make_shared(std::move(values_), out_type_); + return Status::OK(); + } + + Status DuplicateValue() { + return Status::Invalid( + "Encountered more than one non-null value for the same pivot key"); + } + + std::shared_ptr out_type() const { return out_type_; } + + std::shared_ptr key_type_; + std::shared_ptr out_type_; + const PivotWiderOptions* options_; + std::unique_ptr key_mapper_; + ScalarVector values_; +}; + +Result> PivotInit(KernelContext* ctx, + const KernelInitArgs& args) { + const auto& options = checked_cast(*args.options); + DCHECK_EQ(args.inputs.size(), 2); + DCHECK(is_base_binary_like(args.inputs[0].id())); + auto state = std::make_unique(); + RETURN_NOT_OK(state->Init(options, args.inputs)); + return state; +} + +Result ResolveOutputType(KernelContext* ctx, const std::vector&) { + return checked_cast(ctx->state())->out_type(); +} + +const FunctionDoc pivot_doc{ + "Pivot values according to a pivot key column", + ("Output is a struct with as many fields as `PivotWiderOptions.key_names`.\n" + "All output struct fields have the same type as `pivot_values`.\n" + "Each pivot key decides in which output field the corresponding pivot value\n" + "is emitted. 
If a pivot key doesn't appear, null is emitted.\n" + "If more than one non-null value is encountered for a given pivot key,\n" + "Invalid is raised.\n" + "Behavior of unexpected pivot keys is controlled by `unexpected_key_behavior`\n" + "in PivotWiderOptions."), + {"pivot_keys", "pivot_values"}, + "PivotWiderOptions"}; + +} // namespace + +void RegisterScalarAggregatePivot(FunctionRegistry* registry) { + static auto default_pivot_options = PivotWiderOptions::Defaults(); + + auto func = std::make_shared( + "pivot_wider", Arity::Binary(), pivot_doc, &default_pivot_options); + + for (auto key_type : BaseBinaryTypes()) { + auto sig = KernelSignature::Make({key_type->id(), InputType::Any()}, + OutputType(ResolveOutputType)); + AddAggKernel(std::move(sig), PivotInit, func.get()); + } + DCHECK_OK(registry->AddFunction(std::move(func))); +} + +} // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index e6ad915fd5667..d64c740a8e70d 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -42,6 +42,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" +#include "arrow/testing/math.h" #include "arrow/testing/random.h" #include "arrow/util/logging.h" @@ -3386,6 +3387,9 @@ TEST_F(TestVarStdKernelMergeStability, Basics) { #ifndef __MINGW32__ // MinGW has precision issues // XXX: The reference value from numpy is actually wrong due to floating // point limits. The correct result should equals variance(90, 0) = 4050. + // The problem is that the mean is not exactly representable as floating-point, + // and that small inaccuracy produces a large deviation when plugged into the M2 + // calculation. std::vector chunks = {"[40000008000000490]", "[40000008000000400]"}; this->AssertVarStdIs(chunks, options, 3904.0); #endif @@ -3430,12 +3434,21 @@ TEST_F(TestVarStdKernelUInt32, Basics) { this->AssertVarStdIs("[0, 0, 4294967295]", options, 6.148914688373205e+18); } -// https://en.wikipedia.org/wiki/Kahan_summation_algorithm void KahanSum(double& sum, double& adjust, double addend) { - double y = addend - adjust; - double t = sum + y; - adjust = (t - sum) - y; - sum = t; + // Backported enhancement from Neumaier's algorithm: consider case where + // sum is small compared to addend. 
+ // https://en.wikipedia.org/wiki/Kahan_summation_algorithm#Further_enhancements + if (abs(sum) >= abs(addend)) { + double y = addend - adjust; + double t = sum + y; + adjust = (t - sum) - y; + sum = t; + } else { + double y = sum - adjust; + double t = addend + y; + adjust = (t - addend) - y; + sum = t; + } } // Calculate reference variance with Welford's online algorithm + Kahan summation @@ -3534,7 +3547,8 @@ TEST_F(TestVarStdKernelIntegerLength, Basics) { TEST(TestVarStdKernel, Decimal) { // Effectively treated as double, sanity check results here - for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) { + for (const auto& ty : + {decimal32(3, 2), decimal64(3, 2), decimal128(3, 2), decimal256(3, 2)}) { CheckVarStd(ArrayFromJSON(ty, R"(["1.00"])"), VarianceOptions(), 0); CheckVarStd(ArrayFromJSON(ty, R"([null, "1.00", "2.00", "3.00"])"), VarianceOptions(), 0.6666666666666666); @@ -3544,6 +3558,154 @@ TEST(TestVarStdKernel, Decimal) { } } +// +// Skew and Kurtosis +// + +constexpr int kSkewUlps = 3; +constexpr int kKurtosisUlps = 6; + +void CheckSkewKurtosis(const Datum& array, const SkewOptions& options, + double expected_skew, double expected_kurtosis, int n_ulps = -1) { + ARROW_SCOPED_TRACE("type = ", *array.type()); + ASSERT_OK_AND_ASSIGN(Datum out_skew, Skew(array, options)); + ASSERT_OK_AND_ASSIGN(Datum out_kurtosis, Kurtosis(array, options)); + const auto& skew = checked_cast(*out_skew.scalar()); + const auto& kurtosis = checked_cast(*out_kurtosis.scalar()); + ASSERT_TRUE(skew.is_valid && kurtosis.is_valid); + AssertWithinUlp(expected_skew, skew.value, n_ulps >= 0 ? n_ulps : kSkewUlps); + AssertWithinUlp(expected_kurtosis, kurtosis.value, + n_ulps >= 0 ? n_ulps : kKurtosisUlps); +} + +class TestSkewKurtosis : public ::testing::Test { + public: + void AssertSkewKurtosisAre(const Array& array, const SkewOptions& options, + double expected_skew, double expected_kurtosis, + int n_ulps = -1) { + CheckSkewKurtosis(array, options, expected_skew, expected_kurtosis, n_ulps); + } + + void AssertSkewKurtosisAre(const std::shared_ptr& array, + const SkewOptions& options, double expected_skew, + double expected_kurtosis, int n_ulps = -1) { + CheckSkewKurtosis(array, options, expected_skew, expected_kurtosis, n_ulps); + } + + void AssertSkewKurtosisAre(const std::shared_ptr& type, std::string_view json, + const SkewOptions& options, double expected_skew, + double expected_kurtosis, int n_ulps = -1) { + auto array = ArrayFromJSON(type, json); + CheckSkewKurtosis(array, options, expected_skew, expected_kurtosis, n_ulps); + } + + void AssertSkewKurtosisAre(const std::shared_ptr& type, + const std::vector& json, + const SkewOptions& options, double expected_skew, + double expected_kurtosis, int n_ulps = -1) { + auto array = ChunkedArrayFromJSON(type, json); + CheckSkewKurtosis(array, options, expected_skew, expected_kurtosis, n_ulps); + } + + void AssertSkewKurtosisInvalid(const Array& array, const SkewOptions& options) { + AssertSkewKurtosisInvalidInternal(array, options); + } + + void AssertSkewKurtosisInvalid(const std::shared_ptr& array, + const SkewOptions& options) { + AssertSkewKurtosisInvalidInternal(array, options); + } + + void AssertSkewKurtosisInvalid(const std::shared_ptr& type, + std::string_view json, const SkewOptions& options) { + auto array = ArrayFromJSON(type, json); + AssertSkewKurtosisInvalidInternal(array, options); + } + + void AssertSkewKurtosisInvalid(const std::shared_ptr& type, + const std::vector& json, + const SkewOptions& options) { + auto array = 
ChunkedArrayFromJSON(type, json); + AssertSkewKurtosisInvalidInternal(array, options); + } + + private: + void AssertSkewKurtosisInvalidInternal(const Datum& array, const SkewOptions& options) { + ASSERT_OK_AND_ASSIGN(Datum out_skew, Skew(array, options)); + ASSERT_OK_AND_ASSIGN(Datum out_kurtosis, Kurtosis(array, options)); + const auto& skew = checked_cast(*out_skew.scalar()); + const auto& kurtosis = checked_cast(*out_kurtosis.scalar()); + ASSERT_FALSE(skew.is_valid || kurtosis.is_valid); + } +}; + +TEST_F(TestSkewKurtosis, Basics) { + // Test sample from SciPy, with results obtained using numpy.float128 + auto options = SkewOptions::Defaults(); + AssertSkewKurtosisAre(float64(), "[1.165, 0.6268, 0.0751, 0.3516, -0.6965]", options, + -0.29322304336607355496, -0.83411431970273759); + // Results are slightly different because the input doesn't losslessly convert + // to float32. + AssertSkewKurtosisAre(float32(), "[1.165, 0.6268, 0.0751, 0.3516, -0.6965]", options, + -0.2932230870440958164, -0.8341143229437093939); +} + +TEST_F(TestSkewKurtosis, Chunked) { + auto options = SkewOptions::Defaults(); + AssertSkewKurtosisAre(float64(), {"[1.165, 0.6268]", "[]", "[0.0751, 0.3516, -0.6965]"}, + options, -0.29322304336607355496, -0.83411431970273759); + AssertSkewKurtosisAre(float32(), {"[1.165, 0.6268]", "[]", "[0.0751, 0.3516, -0.6965]"}, + options, -0.2932230870440958164, -0.8341143229437093939); +} + +TEST_F(TestSkewKurtosis, Decimal) { + auto options = SkewOptions::Defaults(); + for (auto type : + {decimal32(5, 4), decimal64(5, 4), decimal128(5, 4), decimal256(5, 4)}) { + AssertSkewKurtosisAre(type, R"(["1.1650", "0.6268", "0.0751", "0.3516", "-0.6965"])", + options, -0.29322304336607355496, -0.83411431970273759); + } +} + +TEST_F(TestSkewKurtosis, Integral) { + auto options = SkewOptions::Defaults(); + for (auto type : IntTypes()) { + AssertSkewKurtosisAre(type, "[1, 2, 3, 5]", options, 0.4346507595746657, + -1.1542857142857144); + } +} + +TEST_F(TestSkewKurtosis, SpecialCases) { + auto options = SkewOptions::Defaults(); + for (auto type : {float64(), float32()}) { + AssertSkewKurtosisAre(type, "[0, 1, 2]", options, 0.0, -1.5, /*n_ulps=*/0); + AssertSkewKurtosisAre(type, "[1]", options, std::nan(""), std::nan("")); + AssertSkewKurtosisAre(type, "[1, 1, 1, 1, 1, 1]", options, std::nan(""), + std::nan("")); + } +} + +TEST_F(TestSkewKurtosis, Options) { + for (auto type : {float64(), float32()}) { + auto options = SkewOptions::Defaults(); + AssertSkewKurtosisInvalid(type, "[]", options); + AssertSkewKurtosisInvalid(type, std::vector{}, options); + AssertSkewKurtosisInvalid(type, {"[]", "[]", "[]"}, options); + AssertSkewKurtosisAre(type, "[0, 1, null, 2]", options, 0.0, -1.5); + AssertSkewKurtosisAre(type, {"[0, 1]", "[]", "[null, 2]"}, options, 0.0, -1.5); + options.min_count = 3; + AssertSkewKurtosisAre(type, "[0, 1, null, 2]", options, 0.0, -1.5); + AssertSkewKurtosisAre(type, {"[0, 1]", "[]", "[null, 2]"}, options, 0.0, -1.5); + options.skip_nulls = false; + AssertSkewKurtosisInvalid(type, "[0, 1, null, 2]", options); + AssertSkewKurtosisInvalid(type, {"[0, 1]", "[]", "[null, 2]"}, options); + options.skip_nulls = true; + options.min_count = 4; + AssertSkewKurtosisInvalid(type, "[0, 1, null, 2]", options); + AssertSkewKurtosisInvalid(type, {"[0, 1]", "[]", "[null, 2]"}, options); + } +} + // // Quantile // @@ -4307,5 +4469,294 @@ TEST(TestTDigestKernel, ApproximateMedian) { } } +// +// Pivot +// + +class TestPivotKernel : public ::testing::Test { + public: + void AssertPivot(const Datum& 
keys, const Datum& values, const Scalar& expected, + const PivotWiderOptions& options) { + SCOPED_TRACE(options.ToString()); + ASSERT_OK_AND_ASSIGN(Datum out, + CallFunction("pivot_wider", {keys, values}, &options)); + ValidateOutput(out); + ASSERT_TRUE(out.is_scalar()); + AssertScalarsEqual(expected, *out.scalar(), /*verbose=*/true); + } +}; + +TEST_F(TestPivotKernel, Basics) { + auto key_type = utf8(); + auto value_type = float32(); + + auto keys = ArrayFromJSON(key_type, R"(["width", "height"])"); + auto values = ArrayFromJSON(value_type, "[10.5, 11.5]"); + auto expected = ScalarFromJSON( + struct_({field("height", value_type), field("width", value_type)}), "[11.5, 10.5]"); + AssertPivot(keys, values, *expected, + PivotWiderOptions(/*key_names=*/{"height", "width"})); +} + +TEST_F(TestPivotKernel, AllKeyTypes) { + for (auto key_type : BaseBinaryTypes()) { + auto value_type = float32(); + + auto keys = ArrayFromJSON(key_type, R"(["width", "height"])"); + auto values = ArrayFromJSON(value_type, "[10.5, 11.5]"); + auto expected = + ScalarFromJSON(struct_({field("height", value_type), field("width", value_type)}), + "[11.5, 10.5]"); + AssertPivot(keys, values, *expected, + PivotWiderOptions(/*key_names=*/{"height", "width"})); + } +} + +TEST_F(TestPivotKernel, Numbers) { + auto key_type = utf8(); + for (auto value_type : NumericTypes()) { + auto keys = ArrayFromJSON(key_type, R"(["width", "height"])"); + auto values = ArrayFromJSON(value_type, "[10, 11]"); + auto expected = ScalarFromJSON( + struct_({field("height", value_type), field("width", value_type)}), "[11, 10]"); + AssertPivot(keys, values, *expected, + PivotWiderOptions(/*key_names=*/{"height", "width"})); + } +} + +TEST_F(TestPivotKernel, Binary) { + auto key_type = utf8(); + for (auto value_type : BaseBinaryTypes()) { + auto keys = ArrayFromJSON(key_type, R"(["abc", "def"])"); + auto values = ArrayFromJSON(value_type, R"(["foo", "bar"])"); + auto expected = + ScalarFromJSON(struct_({field("abc", value_type), field("def", value_type)}), + R"(["foo", "bar"])"); + AssertPivot(keys, values, *expected, PivotWiderOptions(/*key_names=*/{"abc", "def"})); + } +} + +TEST_F(TestPivotKernel, NullType) { + auto key_type = utf8(); + auto value_type = null(); + + auto keys = ArrayFromJSON(key_type, R"(["abc", "def"])"); + auto values = ArrayFromJSON(value_type, "[null, null]"); + auto expected = ScalarFromJSON( + struct_({field("abc", value_type), field("def", value_type)}), R"([null, null])"); + AssertPivot(keys, values, *expected, PivotWiderOptions(/*key_names=*/{"abc", "def"})); +} + +TEST_F(TestPivotKernel, NullValues) { + auto key_type = utf8(); + auto value_type = float32(); + + auto keys = ArrayFromJSON(key_type, R"(["width", "height", "height", "width"])"); + auto values = ArrayFromJSON(value_type, "[null, 10.5, null, 11.5]"); + auto expected = ScalarFromJSON( + struct_({field("height", value_type), field("width", value_type)}), "[10.5, 11.5]"); + AssertPivot(keys, values, *expected, + PivotWiderOptions(/*key_names=*/{"height", "width"})); +} + +TEST_F(TestPivotKernel, ChunkedInput) { + auto key_type = utf8(); + auto value_type = float32(); + + auto keys = ChunkedArrayFromJSON(key_type, + {R"(["width"])", R"(["height", "height", "width"])"}); + auto values = ChunkedArrayFromJSON(value_type, {"[null, 10.5]", "[null, 11.5]"}); + auto expected = ScalarFromJSON( + struct_({field("height", value_type), field("width", value_type)}), "[10.5, 11.5]"); + AssertPivot(keys, values, *expected, + PivotWiderOptions(/*key_names=*/{"height", 
"width"})); +} + +TEST_F(TestPivotKernel, AllInputKinds) { + auto key_type = utf8(); + auto value_type = float32(); + + DatumVector key_args = { + ScalarFromJSON(key_type, R"("width")"), + ArrayFromJSON(key_type, R"(["width"])"), + ChunkedArrayFromJSON(key_type, {R"(["width"])"}), + }; + DatumVector value_args = { + ScalarFromJSON(value_type, "11.5"), + ArrayFromJSON(value_type, "[11.5]"), + ChunkedArrayFromJSON(value_type, {"[11.5]"}), + }; + auto expected = ScalarFromJSON( + struct_({field("height", value_type), field("width", value_type)}), "[null, 11.5]"); + + for (const Datum& keys : key_args) { + ARROW_SCOPED_TRACE("keys = ", keys.ToString()); + for (const Datum& values : value_args) { + ARROW_SCOPED_TRACE("values = ", keys.ToString()); + AssertPivot(keys, values, *expected, + PivotWiderOptions(/*key_names=*/{"height", "width"})); + } + } +} + +TEST_F(TestPivotKernel, ScalarKey) { + auto key_type = utf8(); + auto value_type = float32(); + auto expected_type = struct_({field("height", value_type), field("width", value_type)}); + + auto keys = ScalarFromJSON(key_type, R"("width")"); + auto values = ArrayFromJSON(value_type, "[null, 11.5, null]"); + auto expected = ScalarFromJSON(expected_type, "[null, 11.5]"); + AssertPivot(keys, values, *expected, + PivotWiderOptions(/*key_names=*/{"height", "width"})); +} + +TEST_F(TestPivotKernel, ScalarValue) { + auto key_type = utf8(); + auto value_type = float32(); + auto expected_type = struct_({field("height", value_type), field("width", value_type)}); + + auto keys = ArrayFromJSON(key_type, R"(["width", "height"])"); + auto values = ScalarFromJSON(value_type, "11.5"); + auto expected = ScalarFromJSON(expected_type, "[11.5, 11.5]"); + AssertPivot(keys, values, *expected, + PivotWiderOptions(/*key_names=*/{"height", "width"})); +} + +TEST_F(TestPivotKernel, EmptyInput) { + auto key_type = utf8(); + auto value_type = float32(); + auto options = PivotWiderOptions(/*key_names=*/{"height", "width"}); + auto expected_type = struct_({field("height", value_type), field("width", value_type)}); + auto expected = ScalarFromJSON(expected_type, "[null, null]"); + + AssertPivot(ArrayFromJSON(key_type, "[]"), ArrayFromJSON(value_type, "[]"), *expected, + options); + AssertPivot(ChunkedArrayFromJSON(key_type, {}), ChunkedArrayFromJSON(value_type, {}), + *expected, options); +} + +TEST_F(TestPivotKernel, MissingKey) { + auto key_type = utf8(); + auto value_type = float32(); + + auto keys = ArrayFromJSON(key_type, R"(["width", "height"])"); + auto values = ArrayFromJSON(value_type, "[10.5, 11.5]"); + auto options = PivotWiderOptions(/*key_names=*/{"height", "width", "depth"}); + auto expected = + ScalarFromJSON(struct_({field("height", value_type), field("width", value_type), + field("depth", value_type)}), + "[11.5, 10.5, null]"); + AssertPivot(keys, values, *expected, options); +} + +TEST_F(TestPivotKernel, UnexpectedKey) { + auto key_type = utf8(); + auto value_type = float32(); + auto expected_type = struct_({field("height", value_type), field("width", value_type)}); + + auto options = PivotWiderOptions(/*key_names=*/{"height", "width"}); + auto options_raise = + PivotWiderOptions(/*key_names=*/{"height", "width"}, PivotWiderOptions::kRaise); + + { + auto keys = ArrayFromJSON(key_type, R"(["width", "height", "depth"])"); + auto values = ArrayFromJSON(value_type, "[10.5, 11.5, 12.5]"); + auto expected = ScalarFromJSON(expected_type, "[11.5, 10.5]"); + AssertPivot(keys, values, *expected, options); + EXPECT_RAISES_WITH_MESSAGE_THAT( + KeyError, 
::testing::HasSubstr("Unexpected pivot key: depth"), + CallFunction("pivot_wider", {keys, values}, &options_raise)); + } + { + // Scalar key + auto keys = ScalarFromJSON(key_type, R"("depth")"); + auto expected = ScalarFromJSON(expected_type, "[null, null]"); + for (const Datum& values : DatumVector{ArrayFromJSON(value_type, "[10.5]"), + ScalarFromJSON(value_type, "10.5")}) { + AssertPivot(keys, values, *expected, options); + EXPECT_RAISES_WITH_MESSAGE_THAT( + KeyError, ::testing::HasSubstr("Unexpected pivot key: depth"), + CallFunction("pivot_wider", {keys, values}, &options_raise)); + } + } + { + // Scalar value + auto values = ScalarFromJSON(value_type, "10.5"); + auto expected = ScalarFromJSON(expected_type, "[null, null]"); + for (const Datum& keys : DatumVector{ArrayFromJSON(key_type, R"(["depth"])"), + ScalarFromJSON(key_type, R"("depth")")}) { + AssertPivot(keys, values, *expected, options); + EXPECT_RAISES_WITH_MESSAGE_THAT( + KeyError, ::testing::HasSubstr("Unexpected pivot key: depth"), + CallFunction("pivot_wider", {keys, values}, &options_raise)); + } + } +} + +TEST_F(TestPivotKernel, NullKey) { + auto key_type = utf8(); + auto value_type = float32(); + + auto keys = ArrayFromJSON(key_type, R"(["width", null])"); + auto values = ArrayFromJSON(value_type, "[10.5, 11.5]"); + auto options = PivotWiderOptions(/*key_names=*/{"height", "width"}); + EXPECT_RAISES_WITH_MESSAGE_THAT(KeyError, + ::testing::HasSubstr("pivot key name cannot be null"), + CallFunction("pivot_wider", {keys, values}, &options)); +} + +TEST_F(TestPivotKernel, DuplicateKeyNames) { + auto key_type = utf8(); + auto value_type = float32(); + + auto keys = ArrayFromJSON(key_type, "[]"); + auto values = ArrayFromJSON(value_type, "[]"); + auto options = PivotWiderOptions(/*key_names=*/{"height", "height", "width"}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + KeyError, ::testing::HasSubstr("Duplicate key name 'height' in PivotWiderOptions"), + CallFunction("pivot_wider", {keys, values}, &options)); +} + +TEST_F(TestPivotKernel, DuplicateValues) { + auto key_type = utf8(); + auto value_type = float32(); + auto options = PivotWiderOptions(/*key_names=*/{"height", "width"}); + + { + // Duplicate values in the same chunk + auto keys = ArrayFromJSON(key_type, R"(["width", "height", "height"])"); + auto values = ArrayFromJSON(value_type, "[10.5, 11.5, 12.5]"); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Encountered more than one non-null value"), + CallFunction("pivot_wider", {keys, values}, &options)); + } + { + // Duplicate values in different chunks + auto keys = + ChunkedArrayFromJSON(key_type, {R"(["width", "height"])", R"(["height"])"}); + auto values = ChunkedArrayFromJSON(value_type, {"[10.5, 11.5]", "[12.5]"}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Encountered more than one non-null value"), + CallFunction("pivot_wider", {keys, values}, &options)); + } + { + // Duplicate values with scalar key + auto keys = ScalarFromJSON(key_type, R"("width")"); + auto values = ArrayFromJSON(value_type, "[10.5, 11.5]"); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Encountered more than one non-null value"), + CallFunction("pivot_wider", {keys, values}, &options)); + } + { + // Duplicate values with scalar value + auto keys = ArrayFromJSON(key_type, R"(["width", "height", "height"])"); + auto values = ScalarFromJSON(value_type, "10.5"); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Encountered more than one non-null value"), + 
CallFunction("pivot_wider", {keys, values}, &options)); + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc index e4189f9b62b17..8d2da195b09a4 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc @@ -15,18 +15,21 @@ // specific language governing permissions and limitations // under the License. +#include #include +#include #include "arrow/compute/api_aggregate.h" #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/aggregate_var_std_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/util/bit_run_reader.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/int128_internal.h" -namespace arrow { -namespace compute { -namespace internal { +namespace arrow::compute::internal { + +using ::arrow::internal::checked_cast; namespace { @@ -34,13 +37,14 @@ using arrow::internal::int128_t; using arrow::internal::VisitSetBitRunsVoid; template -struct VarStdState { +struct MomentsState { using ArrayType = typename TypeTraits::ArrayType; using CType = typename TypeTraits::CType; - using ThisType = VarStdState; + using SumType = typename internal::GetSumType::SumType; + using ThisType = MomentsState; - explicit VarStdState(int32_t decimal_scale, VarianceOptions options) - : decimal_scale(decimal_scale), options(options) {} + MomentsState(int level, int32_t decimal_scale, bool skip_nulls) + : level(level), decimal_scale(decimal_scale), skip_nulls(skip_nulls) {} template double ToDouble(T value) const { @@ -51,89 +55,90 @@ struct VarStdState { double ToDouble(const Decimal128& value) const { return value.ToDouble(decimal_scale); } double ToDouble(const Decimal256& value) const { return value.ToDouble(decimal_scale); } - // float/double/int64/decimal: calculate `m2` (sum((X-mean)^2)) with `two pass - // algorithm` - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm - template - enable_if_t::value || (sizeof(CType) > 4) || - (!is_integer_type::value && sizeof(CType) == 4)> - Consume(const ArraySpan& array) { + int64_t count() const { return moments.count; } + + void Consume(const ArraySpan& array) { + constexpr bool kCanUseIntArithmetic = std::is_integral_v && sizeof(CType) <= 4; + this->all_valid = array.GetNullCount() == 0; - int64_t count = array.length - array.GetNullCount(); - if (count == 0 || (!this->all_valid && !options.skip_nulls)) { + int64_t valid_count = array.length - array.GetNullCount(); + if (valid_count == 0 || (!this->all_valid && !this->skip_nulls)) { return; } - using SumType = typename internal::GetSumType::SumType; + if constexpr (kCanUseIntArithmetic) { + if (level == 2) { + // int32/16/8: textbook one pass algorithm for M2 with integer arithmetic + + // max number of elements that sum will not overflow int64 (2Gi int32 elements) + // for uint32: 0 <= sum < 2^63 (int64 >= 0) + // for int32: -2^62 <= sum < 2^62 + constexpr int64_t kMaxChunkLength = 1ULL << (63 - sizeof(CType) * 8); + int64_t start_index = 0; + + ArraySpan slice = array; + while (valid_count > 0) { + // process in chunks that overflow will never happen + slice.SetSlice(start_index + array.offset, + std::min(kMaxChunkLength, array.length - start_index)); + const int64_t count = slice.length - slice.GetNullCount(); + start_index += slice.length; + valid_count -= count; + + if (count > 0) { + IntegerVarStd var_std; + const CType* values 
= slice.GetValues(1); + VisitSetBitRunsVoid(slice.buffers[0].data, slice.offset, slice.length, + [&](int64_t pos, int64_t len) { + for (int64_t i = 0; i < len; ++i) { + const auto value = values[pos + i]; + var_std.ConsumeOne(value); + } + }); + + // merge variance + auto slice_moments = Moments(var_std.count, var_std.mean(), var_std.m2()); + this->moments.MergeFrom(level, slice_moments); + } + } + return; + } + } + + // float/double/int64/decimal: calculate each moment in a separate pass. + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm SumType sum = internal::SumArray(array); - const double mean = ToDouble(sum) / count; + const double mean = ToDouble(sum) / valid_count; const double m2 = internal::SumArray( array, [this, mean](CType value) { const double v = ToDouble(value); return (v - mean) * (v - mean); }); - - ThisType state(decimal_scale, options); - state.count = count; - state.mean = mean; - state.m2 = m2; - this->MergeFrom(state); - } - - // int32/16/8: textbook one pass algorithm with integer arithmetic - template - enable_if_t::value && (sizeof(CType) <= 4)> Consume( - const ArraySpan& array) { - // max number of elements that sum will not overflow int64 (2Gi int32 elements) - // for uint32: 0 <= sum < 2^63 (int64 >= 0) - // for int32: -2^62 <= sum < 2^62 - constexpr int64_t max_length = 1ULL << (63 - sizeof(CType) * 8); - - this->all_valid = array.GetNullCount() == 0; - if (!this->all_valid && !options.skip_nulls) return; - int64_t start_index = 0; - int64_t valid_count = array.length - array.GetNullCount(); - - ArraySpan slice = array; - while (valid_count > 0) { - // process in chunks that overflow will never happen - slice.SetSlice(start_index + array.offset, - std::min(max_length, array.length - start_index)); - const int64_t count = slice.length - slice.GetNullCount(); - start_index += slice.length; - valid_count -= count; - - if (count > 0) { - IntegerVarStd var_std; - const CType* values = slice.GetValues(1); - VisitSetBitRunsVoid(slice.buffers[0].data, slice.offset, slice.length, - [&](int64_t pos, int64_t len) { - for (int64_t i = 0; i < len; ++i) { - const auto value = values[pos + i]; - var_std.ConsumeOne(value); - } - }); - - // merge variance - ThisType state(decimal_scale, options); - state.count = var_std.count; - state.mean = var_std.mean(); - state.m2 = var_std.m2(); - this->MergeFrom(state); + double m3 = 0, m4 = 0; + if (level >= 3) { + m3 = internal::SumArray( + array, [this, mean](CType value) { + const double v = ToDouble(value); + return (v - mean) * (v - mean) * (v - mean); + }); + if (level >= 4) { + m4 = internal::SumArray( + array, [this, mean](CType value) { + const double v = ToDouble(value); + return (v - mean) * (v - mean) * (v - mean) * (v - mean); + }); } } + this->moments.MergeFrom(level, Moments(valid_count, mean, m2, m3, m4)); } - // Scalar: textbook algorithm void Consume(const Scalar& scalar, const int64_t count) { - this->m2 = 0; if (scalar.is_valid) { - this->count = count; - this->mean = ToDouble(UnboxScalar::Unbox(scalar)); + double value = ToDouble(UnboxScalar::Unbox(scalar)); + this->moments = Moments::FromScalar(level, value, count); } else { - this->count = 0; - this->mean = 0; + this->moments = Moments(); this->all_valid = false; } } @@ -142,35 +147,38 @@ struct VarStdState { // https://www.emathzone.com/tutorials/basic-statistics/combined-variance.html void MergeFrom(const ThisType& state) { this->all_valid = this->all_valid && state.all_valid; - if (state.count == 0) { - return; - } - if 
(this->count == 0) { - this->count = state.count; - this->mean = state.mean; - this->m2 = state.m2; - return; - } - MergeVarStd(this->count, this->mean, state.count, state.mean, state.m2, &this->count, - &this->mean, &this->m2); + this->moments.MergeFrom(level, state.moments); } + const int level; const int32_t decimal_scale; - const VarianceOptions options; - int64_t count = 0; - double mean = 0; - double m2 = 0; // m2 = count*s2 = sum((X-mean)^2) + const bool skip_nulls; + Moments moments; bool all_valid = true; }; template -struct VarStdImpl : public ScalarAggregator { - using ThisType = VarStdImpl; +struct StatisticImpl : public ScalarAggregator { + using ThisType = StatisticImpl; using ArrayType = typename TypeTraits::ArrayType; - explicit VarStdImpl(int32_t decimal_scale, const std::shared_ptr& out_type, - const VarianceOptions& options, VarOrStd return_type) - : out_type(out_type), state(decimal_scale, options), return_type(return_type) {} + StatisticImpl(StatisticType stat_type, int32_t decimal_scale, + const std::shared_ptr& out_type, const VarianceOptions& options) + : out_type(out_type), + stat_type(stat_type), + skip_nulls(options.skip_nulls), + min_count(options.min_count), + ddof(options.ddof), + state(moments_level_for_statistic(stat_type), decimal_scale, skip_nulls) {} + + StatisticImpl(StatisticType stat_type, int32_t decimal_scale, + const std::shared_ptr& out_type, const SkewOptions& options) + : out_type(out_type), + stat_type(stat_type), + skip_nulls(options.skip_nulls), + min_count(options.min_count), + ddof(0), + state(moments_level_for_statistic(stat_type), decimal_scale, skip_nulls) {} Status Consume(KernelContext*, const ExecSpan& batch) override { if (batch[0].is_array()) { @@ -188,92 +196,90 @@ struct VarStdImpl : public ScalarAggregator { } Status Finalize(KernelContext*, Datum* out) override { - if (state.count <= state.options.ddof || state.count < state.options.min_count || - (!state.all_valid && !state.options.skip_nulls)) { + if (state.count() <= ddof || state.count() < min_count || + (!state.all_valid && !skip_nulls)) { out->value = std::make_shared(); } else { - double var = state.m2 / (state.count - state.options.ddof); - out->value = - std::make_shared(return_type == VarOrStd::Var ? 
var : sqrt(var)); + switch (stat_type) { + case StatisticType::Std: + out->value = std::make_shared(state.moments.Stddev(ddof)); + break; + case StatisticType::Var: + out->value = std::make_shared(state.moments.Variance(ddof)); + break; + case StatisticType::Skew: + out->value = std::make_shared(state.moments.Skew()); + break; + case StatisticType::Kurtosis: + out->value = std::make_shared(state.moments.Kurtosis()); + break; + default: + return Status::NotImplemented("Unsupported statistic type ", + static_cast(stat_type)); + } } return Status::OK(); } std::shared_ptr out_type; - VarStdState state; - VarOrStd return_type; -}; - -struct VarStdInitState { - std::unique_ptr state; - KernelContext* ctx; - const DataType& in_type; - const std::shared_ptr& out_type; - const VarianceOptions& options; - VarOrStd return_type; - - VarStdInitState(KernelContext* ctx, const DataType& in_type, - const std::shared_ptr& out_type, - const VarianceOptions& options, VarOrStd return_type) - : ctx(ctx), - in_type(in_type), - out_type(out_type), - options(options), - return_type(return_type) {} - - Status Visit(const DataType&) { - return Status::NotImplemented("No variance/stddev implemented"); - } - - Status Visit(const HalfFloatType&) { - return Status::NotImplemented("No variance/stddev implemented"); - } - - template - enable_if_number Visit(const Type&) { - state.reset( - new VarStdImpl(/*decimal_scale=*/0, out_type, options, return_type)); - return Status::OK(); - } - - template - enable_if_decimal Visit(const Type&) { - state.reset(new VarStdImpl(checked_cast(in_type).scale(), - out_type, options, return_type)); - return Status::OK(); - } - - Result> Create() { - RETURN_NOT_OK(VisitTypeInline(in_type, this)); - return std::move(state); - } + StatisticType stat_type; + bool skip_nulls; + uint32_t min_count; + int ddof = 0; + MomentsState state; }; -Result> StddevInit(KernelContext* ctx, - const KernelInitArgs& args) { - VarStdInitState visitor( - ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), - static_cast(*args.options), VarOrStd::Std); - return visitor.Create(); +template +Result> StatisticInit( + StatisticType stat_type, const DataType& in_type, + const std::shared_ptr& out_type, const OptionsType& options) { + auto make_kernel_state = [&](auto&& type, int32_t decimal_scale = 0) { + using Type = std::decay_t; + return std::unique_ptr( + new StatisticImpl(stat_type, decimal_scale, out_type, options)); + }; + + auto visit = [&](auto&& type) -> Result> { + using Type = std::decay_t; + // Decimals + if constexpr (is_decimal_type::value) { + return make_kernel_state(type, type.scale()); + } + // Numbers (except half-float) + if constexpr (is_number_type::value && !is_half_float_type::value) { + return make_kernel_state(type); + } + return Status::NotImplemented("No variance/stddev implemented for ", + in_type.ToString()); + }; + return VisitType(in_type, visit); } -Result> VarianceInit(KernelContext* ctx, - const KernelInitArgs& args) { - VarStdInitState visitor( - ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), - static_cast(*args.options), VarOrStd::Var); - return visitor.Create(); +template +Result> StatisticInit(KernelContext* ctx, + const KernelInitArgs& args) { + const DataType& in_type = *args.inputs[0].type; + const std::shared_ptr& out_type = args.kernel->signature->out_type().type(); + const OptionsType& options = checked_cast(*args.options); + + return StatisticInit(kStatType, in_type, out_type, options); } -void AddVarStdKernels(KernelInit init, - 
const std::vector>& types, - ScalarAggregateFunction* func) { +void AddStatisticKernels(KernelInit init, + const std::vector>& types, + ScalarAggregateFunction* func) { for (const auto& ty : types) { auto sig = KernelSignature::Make({InputType(ty->id())}, float64()); AddAggKernel(std::move(sig), init, func); } } +void AddStatisticKernels(KernelInit init, ScalarAggregateFunction* func) { + AddStatisticKernels(init, NumericTypes(), func); + AddStatisticKernels( + init, {decimal32(1, 1), decimal64(1, 1), decimal128(1, 1), decimal256(1, 1)}, func); +} + const FunctionDoc stddev_doc{ "Calculate the standard deviation of a numeric array", ("The number of degrees of freedom can be controlled using VarianceOptions.\n" @@ -292,21 +298,53 @@ const FunctionDoc variance_doc{ {"array"}, "VarianceOptions"}; +const FunctionDoc skew_doc{ + "Calculate the skewness of a numeric array", + ("Nulls are ignored by default. If there are not enough non-null values\n" + "in the array to satisfy `min_count`, null is returned.\n" + "The behavior of nulls and the `min_count` parameter can be changed\n" + "in SkewOptions."), + {"array"}, + "SkewOptions"}; + +const FunctionDoc kurtosis_doc{ + "Calculate the kurtosis of a numeric array", + ("Nulls are ignored by default. If there are not enough non-null values\n" + "in the array to satisfy `min_count`, null is returned.\n" + "The behavior of nulls and the `min_count` parameter can be changed\n" + "in SkewOptions."), + {"array"}, + "SkewOptions"}; + std::shared_ptr AddStddevAggKernels() { - static auto default_std_options = VarianceOptions::Defaults(); + static const auto default_std_options = VarianceOptions::Defaults(); auto func = std::make_shared("stddev", Arity::Unary(), stddev_doc, &default_std_options); - AddVarStdKernels(StddevInit, NumericTypes(), func.get()); - AddVarStdKernels(StddevInit, {decimal128(1, 1), decimal256(1, 1)}, func.get()); + AddStatisticKernels(StatisticInit, func.get()); return func; } std::shared_ptr AddVarianceAggKernels() { - static auto default_var_options = VarianceOptions::Defaults(); + static const auto default_var_options = VarianceOptions::Defaults(); auto func = std::make_shared( "variance", Arity::Unary(), variance_doc, &default_var_options); - AddVarStdKernels(VarianceInit, NumericTypes(), func.get()); - AddVarStdKernels(VarianceInit, {decimal128(1, 1), decimal256(1, 1)}, func.get()); + AddStatisticKernels(StatisticInit, func.get()); + return func; +} + +std::shared_ptr AddSkewAggKernels() { + static const auto default_options = SkewOptions::Defaults(); + auto func = std::make_shared("skew", Arity::Unary(), skew_doc, + &default_options); + AddStatisticKernels(StatisticInit, func.get()); + return func; +} + +std::shared_ptr AddKurtosisAggKernels() { + static const auto default_options = SkewOptions::Defaults(); + auto func = std::make_shared("kurtosis", Arity::Unary(), + kurtosis_doc, &default_options); + AddStatisticKernels(StatisticInit, func.get()); return func; } @@ -315,8 +353,8 @@ std::shared_ptr AddVarianceAggKernels() { void RegisterScalarAggregateVariance(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(AddVarianceAggKernels())); DCHECK_OK(registry->AddFunction(AddStddevAggKernels())); + DCHECK_OK(registry->AddFunction(AddSkewAggKernels())); + DCHECK_OK(registry->AddFunction(AddKurtosisAggKernels())); } -} // namespace internal -} // namespace compute -} // namespace arrow +} // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/aggregate_var_std_internal.h 
b/cpp/src/arrow/compute/kernels/aggregate_var_std_internal.h index 675ebfd91d308..f7c35bea96783 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_var_std_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_var_std_internal.h @@ -18,24 +18,27 @@ #pragma once #include "arrow/util/int128_internal.h" +#include "arrow/util/logging.h" +#include "arrow/util/math_internal.h" -namespace arrow { -namespace compute { -namespace internal { +#include +#include + +namespace arrow::compute::internal { using arrow::internal::int128_t; // Accumulate sum/squared sum (using naive summation) // Shared implementation between scalar/hash aggregate variance/stddev kernels -template struct IntegerVarStd { - using c_type = typename ArrowType::c_type; - int64_t count = 0; int64_t sum = 0; int128_t square_sum = 0; - void ConsumeOne(const c_type value) { + template + void ConsumeOne(Integer value) { + static_assert(std::is_integral_v); + static_assert(sizeof(Integer) <= 4); sum += value; square_sum += static_cast(value) * value; count++; @@ -53,16 +56,87 @@ struct IntegerVarStd { } }; -static inline void MergeVarStd(int64_t count1, double mean1, int64_t count2, double mean2, - double m22, int64_t* out_count, double* out_mean, - double* out_m2) { - double mean = (mean1 * count1 + mean2 * count2) / (count1 + count2); - *out_m2 += m22 + count1 * (mean1 - mean) * (mean1 - mean) + - count2 * (mean2 - mean) * (mean2 - mean); - *out_count += count2; - *out_mean = mean; +enum class StatisticType { Var, Std, Skew, Kurtosis }; + +constexpr int moments_level_for_statistic(StatisticType s) { + switch (s) { + case StatisticType::Skew: + return 3; + case StatisticType::Kurtosis: + return 4; + default: + return 2; + } } -} // namespace internal -} // namespace compute -} // namespace arrow +struct Moments { + int64_t count = 0; + double mean = 0; + double m2 = 0; // m2 = sum((X-mean)^2) + double m3 = 0; // m3 = sum((X-mean)^3) + double m4 = 0; // m4 = sum((X-mean)^4) + + Moments() = default; + Moments(int64_t count, double mean, double m2, double m3 = 0, double m4 = 0) + : count(count), mean(mean), m2(m2), m3(m3), m4(m4) {} + + double Variance(int ddof) const { return m2 / (count - ddof); } + + double Stddev(int ddof) const { return sqrt(Variance(ddof)); } + + double Skew() const { + // This may return NaN for m2 == 0 and m3 == 0, which is expected + return sqrt(count) * m3 / sqrt(m2 * m2 * m2); + } + + double Kurtosis() const { + // This may return NaN for m2 == 0 and m4 == 0, which is expected + return count * m4 / (m2 * m2) - 3; + } + + void MergeFrom(int level, const Moments& other) { *this = Merge(level, *this, other); } + + static Moments Merge(int level, const Moments& a, const Moments& b) { + using ::arrow::internal::NeumaierSum; + + if (a.count == 0) { + return b; + } else if (b.count == 0) { + return a; + } + + // Shorter aliases for readability + const int64_t na = a.count, nb = b.count; + const int64_t n = na + nb; + const double mean = (a.mean * na + b.mean * nb) / n; + // NOTE: there is a more common formula: + // double delta = b.mean - a.mean; + // double m2 = a.m2 + b.m2 + delta * delta * na * nb / n; + // but it gives worse results in TestVarStdKernelMergeStability. 
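+    // Spelled out, with na = a.count, nb = b.count and n = na + nb:
+    //   mean = (na * a.mean + nb * b.mean) / n
+    //   m2   = a.m2 + b.m2 + na * (a.mean - mean)^2 + nb * (b.mean - mean)^2
+    // i.e. each input's m2 is re-centered on the pooled mean, and the four
+    // terms are accumulated with Neumaier summation to limit cancellation.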
+ const double m2 = NeumaierSum({a.m2, b.m2, na * (a.mean - mean) * (a.mean - mean), + nb * (b.mean - mean) * (b.mean - mean)}); + double m3 = 0; + double m4 = 0; + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics + if (level >= 3) { + double delta = b.mean - a.mean; + double delta2 = delta * delta; + m3 = NeumaierSum({a.m3, b.m3, delta2 * delta * na * nb * (na - nb) / (n * n), + 3 * delta * (na * b.m2 - nb * a.m2) / n}); + if (level >= 4) { + m4 = NeumaierSum( + {a.m4, b.m4, + (delta2 * delta2) * na * nb * (na * na - na * nb + nb * nb) / (n * n * n), + 6 * delta2 * (na * na * b.m2 + nb * nb * a.m2) / (n * n), + 4 * delta * (na * b.m3 - nb * a.m3) / n}); + } + } + return Moments(n, mean, m2, m3, m4); + } + + static Moments FromScalar(int level, double value, int64_t count) { + return Moments(count, /*mean=*/value, /*m2=*/0, /*m3=*/0, /*m4=*/0); + } +}; + +} // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 21b7bd9bf6632..e84c3c2dc164d 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -26,6 +26,7 @@ #include "arrow/array/builder_nested.h" #include "arrow/array/builder_primitive.h" +#include "arrow/array/concatenate.h" #include "arrow/buffer_builder.h" #include "arrow/compute/api_aggregate.h" #include "arrow/compute/api_vector.h" @@ -33,6 +34,7 @@ #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/aggregate_var_std_internal.h" #include "arrow/compute/kernels/common_internal.h" +#include "arrow/compute/kernels/pivot_internal.h" #include "arrow/compute/kernels/util_internal.h" #include "arrow/compute/row/grouper.h" #include "arrow/compute/row/row_encoder_internal.h" @@ -40,6 +42,7 @@ #include "arrow/stl_allocator.h" #include "arrow/type_traits.h" #include "arrow/util/bit_run_reader.h" +#include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/bitmap_writer.h" #include "arrow/util/checked_cast.h" @@ -47,6 +50,7 @@ #include "arrow/util/int128_internal.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/ree_util.h" +#include "arrow/util/span.h" #include "arrow/util/task_group.h" #include "arrow/util/tdigest.h" #include "arrow/util/thread_pool.h" @@ -56,6 +60,7 @@ namespace arrow { using internal::checked_cast; using internal::FirstTimeBitmapWriter; +using util::span; namespace compute { namespace internal { @@ -129,10 +134,12 @@ HashAggregateKernel MakeUnaryKernel(KernelInit init) { std::move(init)); } -Status AddHashAggKernels( - const std::vector>& types, - Result make_kernel(const std::shared_ptr&), - HashAggregateFunction* function) { +using HashAggregateKernelFactory = + std::function(const std::shared_ptr&)>; + +Status AddHashAggKernels(const std::vector>& types, + HashAggregateKernelFactory make_kernel, + HashAggregateFunction* function) { for (const auto& ty : types) { ARROW_ASSIGN_OR_RAISE(auto kernel, make_kernel(ty)); RETURN_NOT_OK(function->AddKernel(std::move(kernel))); @@ -840,28 +847,58 @@ using GroupedMeanFactory = using arrow::internal::int128_t; template -struct GroupedVarStdImpl : public GroupedAggregator { +struct GroupedStatisticImpl : public GroupedAggregator { using CType = typename TypeTraits::CType; + using SumType = typename internal::GetSumType::SumType; + // This method is defined solely to make GroupedStatisticImpl instantiable + // in ConsumeImpl below. 
It will be redefined in subclasses. Status Init(ExecContext* ctx, const KernelInitArgs& args) override { - options_ = *checked_cast(args.options); - if (is_decimal_type::value) { - const int32_t scale = + return Status::NotImplemented(""); + } + + // Init helper for hash_variance and hash_stddev + Status InitInternal(ExecContext* ctx, const KernelInitArgs& args, + StatisticType stat_type, const VarianceOptions& options) { + return InitInternal(ctx, args, stat_type, options.ddof, options.skip_nulls, + options.min_count); + } + + // Init helper for hash_skew and hash_kurtosis + Status InitInternal(ExecContext* ctx, const KernelInitArgs& args, + StatisticType stat_type, const SkewOptions& options) { + return InitInternal(ctx, args, stat_type, /*ddof=*/0, options.skip_nulls, + options.min_count); + } + + Status InitInternal(ExecContext* ctx, const KernelInitArgs& args, + StatisticType stat_type, int ddof, bool skip_nulls, + uint32_t min_count) { + if constexpr (is_decimal_type::value) { + int32_t decimal_scale = checked_cast(*args.inputs[0].type).scale(); - return InitInternal(ctx, scale, args.options); + return InitInternal(ctx, stat_type, decimal_scale, ddof, skip_nulls, min_count); + } else { + return InitInternal(ctx, stat_type, /*decimal_scale=*/0, ddof, skip_nulls, + min_count); } - return InitInternal(ctx, 0, args.options); } - Status InitInternal(ExecContext* ctx, int32_t decimal_scale, - const FunctionOptions* options) { - options_ = *checked_cast(options); + Status InitInternal(ExecContext* ctx, StatisticType stat_type, int32_t decimal_scale, + int ddof, bool skip_nulls, uint32_t min_count) { + stat_type_ = stat_type; + moments_level_ = moments_level_for_statistic(stat_type_); decimal_scale_ = decimal_scale; + skip_nulls_ = skip_nulls; + min_count_ = min_count; + ddof_ = ddof; ctx_ = ctx; pool_ = ctx->memory_pool(); counts_ = TypedBufferBuilder(pool_); means_ = TypedBufferBuilder(pool_); m2s_ = TypedBufferBuilder(pool_); + m3s_ = TypedBufferBuilder(pool_); + m4s_ = TypedBufferBuilder(pool_); no_nulls_ = TypedBufferBuilder(pool_); return Status::OK(); } @@ -872,6 +909,12 @@ struct GroupedVarStdImpl : public GroupedAggregator { RETURN_NOT_OK(counts_.Append(added_groups, 0)); RETURN_NOT_OK(means_.Append(added_groups, 0)); RETURN_NOT_OK(m2s_.Append(added_groups, 0)); + if (moments_level_ >= 3) { + RETURN_NOT_OK(m3s_.Append(added_groups, 0)); + if (moments_level_ >= 4) { + RETURN_NOT_OK(m4s_.Append(added_groups, 0)); + } + } RETURN_NOT_OK(no_nulls_.Append(added_groups, true)); return Status::OK(); } @@ -889,27 +932,30 @@ struct GroupedVarStdImpl : public GroupedAggregator { return value.ToDouble(decimal_scale_); } - Status Consume(const ExecSpan& batch) override { return ConsumeImpl(batch); } + Status Consume(const ExecSpan& batch) override { + constexpr bool kCanUseIntArithmetic = std::is_integral_v && sizeof(CType) <= 4; + + if constexpr (kCanUseIntArithmetic) { + if (moments_level_ == 2) { + return ConsumeIntegral(batch); + } + } + return ConsumeGeneric(batch); + } // float/double/int64/decimal: calculate `m2` (sum((X-mean)^2)) with - // `two pass algorithm` (see aggregate_var_std.cc) - template - enable_if_t::value || (sizeof(CType) > 4) || - std::is_same_v, - Status> - ConsumeImpl(const ExecSpan& batch) { - using SumType = typename internal::GetSumType::SumType; - - GroupedVarStdImpl state; - RETURN_NOT_OK(state.InitInternal(ctx_, decimal_scale_, &options_)); + // two pass algorithm (see aggregate_var_std.cc) + Status ConsumeGeneric(const ExecSpan& batch) { + GroupedStatisticImpl 
state; + RETURN_NOT_OK(state.InitInternal(ctx_, stat_type_, decimal_scale_, ddof_, skip_nulls_, + min_count_)); RETURN_NOT_OK(state.Resize(num_groups_)); int64_t* counts = state.counts_.mutable_data(); double* means = state.means_.mutable_data(); - double* m2s = state.m2s_.mutable_data(); uint8_t* no_nulls = state.no_nulls_.mutable_data(); - // XXX this uses naive summation; we should switch to pairwise summation as was - // done for the scalar aggregate kernel in ARROW-11567 + // XXX this uses naive summation; we should switch to pairwise summation + // (as the scalar aggregate kernel does) or Kahan summation. std::vector sums(num_groups_); VisitGroupedValues( batch, @@ -923,27 +969,34 @@ struct GroupedVarStdImpl : public GroupedAggregator { means[i] = ToDouble(sums[i]) / counts[i]; } + double* m2s = state.m2s_mutable_data(); + double* m3s = state.m3s_mutable_data(); + double* m4s = state.m4s_mutable_data(); + // Having distinct VisitGroupedValuesNonNull calls based on moments_level_ + // would increase code generation for relatively little benefit. VisitGroupedValuesNonNull( batch, [&](uint32_t g, typename TypeTraits::CType value) { - const double v = ToDouble(value); - m2s[g] += (v - means[g]) * (v - means[g]); + const double d = ToDouble(value) - means[g]; + const double d2 = d * d; + switch (moments_level_) { + case 4: + m4s[g] += d2 * d2; + [[fallthrough]]; + case 3: + m3s[g] += d2 * d; + [[fallthrough]]; + default: + m2s[g] += d2; + break; + } }); - ARROW_ASSIGN_OR_RAISE(auto mapping, - AllocateBuffer(num_groups_ * sizeof(uint32_t), pool_)); - for (uint32_t i = 0; static_cast(i) < num_groups_; i++) { - mapping->template mutable_data_as()[i] = i; - } - ArrayData group_id_mapping(uint32(), num_groups_, {nullptr, std::move(mapping)}, - /*null_count=*/0); - return this->Merge(std::move(state), group_id_mapping); + return MergeSameGroups(std::move(state)); } - // int32/16/8: textbook one pass algorithm with integer arithmetic (see - // aggregate_var_std.cc) - template - enable_if_t::value && (sizeof(CType) <= 4), Status> ConsumeImpl( - const ExecSpan& batch) { + // int32/16/8: textbook one pass algorithm to compute `m2` with integer arithmetic + // (see aggregate_var_std.cc) + Status ConsumeIntegral(const ExecSpan& batch) { // max number of elements that sum will not overflow int64 (2Gi int32 elements) // for uint32: 0 <= sum < 2^63 (int64 >= 0) // for int32: -2^62 <= sum < 2^62 @@ -958,15 +1011,7 @@ struct GroupedVarStdImpl : public GroupedAggregator { return Status::OK(); } - std::vector> var_std(num_groups_); - - ARROW_ASSIGN_OR_RAISE(auto mapping, - AllocateBuffer(num_groups_ * sizeof(uint32_t), pool_)); - for (uint32_t i = 0; static_cast(i) < num_groups_; i++) { - mapping->template mutable_data_as()[i] = i; - } - ArrayData group_id_mapping(uint32(), num_groups_, {nullptr, std::move(mapping)}, - /*null_count=*/0); + std::vector var_std(num_groups_); for (int64_t start_index = 0; start_index < batch.length; start_index += max_length) { // process in chunks that overflow will never happen @@ -974,12 +1019,13 @@ struct GroupedVarStdImpl : public GroupedAggregator { // reset state var_std.clear(); var_std.resize(num_groups_); - GroupedVarStdImpl state; - RETURN_NOT_OK(state.InitInternal(ctx_, decimal_scale_, &options_)); + GroupedStatisticImpl state; + RETURN_NOT_OK(state.InitInternal(ctx_, stat_type_, decimal_scale_, ddof_, + skip_nulls_, min_count_)); RETURN_NOT_OK(state.Resize(num_groups_)); int64_t* other_counts = state.counts_.mutable_data(); double* other_means = 
state.means_.mutable_data(); - double* other_m2s = state.m2s_.mutable_data(); + double* other_m2s = state.m2s_mutable_data(); uint8_t* other_no_nulls = state.no_nulls_.mutable_data(); if (batch[0].is_array()) { @@ -1028,34 +1074,63 @@ struct GroupedVarStdImpl : public GroupedAggregator { other_means[i] = var_std[i].mean(); other_m2s[i] = var_std[i].m2(); } - RETURN_NOT_OK(this->Merge(std::move(state), group_id_mapping)); + RETURN_NOT_OK(MergeSameGroups(std::move(state))); } return Status::OK(); } Status Merge(GroupedAggregator&& raw_other, const ArrayData& group_id_mapping) override { - // Combine m2 from two chunks (see aggregate_var_std.cc) - auto other = checked_cast(&raw_other); + DCHECK_EQ(group_id_mapping.length, + checked_cast(&raw_other)->num_groups_); + const uint32_t* g = group_id_mapping.GetValues(1); + return MergeInternal(std::move(raw_other), + [g](int64_t other_g) { return g[other_g]; }); + } + + Status MergeSameGroups(GroupedAggregator&& raw_other) { + return MergeInternal(std::move(raw_other), [](int64_t other_g) { return other_g; }); + } + + template + Status MergeInternal(GroupedAggregator&& raw_other, GroupIdMapper&& group_id_mapper) { + // Combine moments from two chunks + auto other = checked_cast(&raw_other); + DCHECK_EQ(moments_level_, other->moments_level_); int64_t* counts = counts_.mutable_data(); double* means = means_.mutable_data(); - double* m2s = m2s_.mutable_data(); + double* m2s = m2s_mutable_data(); + // Moments above the current level will just be ignored. + double* m3s = m3s_mutable_data(); + double* m4s = m4s_mutable_data(); uint8_t* no_nulls = no_nulls_.mutable_data(); const int64_t* other_counts = other->counts_.data(); const double* other_means = other->means_.data(); - const double* other_m2s = other->m2s_.data(); + const double* other_m2s = other->m2s_data(); + const double* other_m3s = other->m3s_data(); + const double* other_m4s = other->m4s_data(); const uint8_t* other_no_nulls = other->no_nulls_.data(); - auto g = group_id_mapping.GetValues(1); - for (int64_t other_g = 0; other_g < group_id_mapping.length; ++other_g, ++g) { + const int64_t num_other_groups = other->num_groups_; + + for (int64_t other_g = 0; other_g < num_other_groups; ++other_g) { + const auto g = group_id_mapper(other_g); if (!bit_util::GetBit(other_no_nulls, other_g)) { - bit_util::ClearBit(no_nulls, *g); + bit_util::ClearBit(no_nulls, g); } if (other_counts[other_g] == 0) continue; - MergeVarStd(counts[*g], means[*g], other_counts[other_g], other_means[other_g], - other_m2s[other_g], &counts[*g], &means[*g], &m2s[*g]); + auto moments = Moments::Merge( + moments_level_, Moments(counts[g], means[g], m2s[g], m3s[g], m4s[g]), + Moments(other_counts[other_g], other_means[other_g], other_m2s[other_g], + other_m3s[other_g], other_m4s[other_g])); + counts[g] = moments.count; + means[g] = moments.mean; + // Fill moments in reverse order, in case m3s or m4s is the same as m2s. 
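+      // (When moments_level_ < 3 or < 4, m3s/m4s alias the m2s_ buffer, as
+      // m3s_mutable_data()/m4s_mutable_data() fall back to m2s_; writing m2
+      // last ensures the genuine m2 value is the one that survives.)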
@@ -1068,11 +1143,30 @@ struct GroupedVarStdImpl : public GroupedAggregator {
     auto* results = values->mutable_data_as<double>();
 
     const int64_t* counts = counts_.data();
-    const double* m2s = m2s_.data();
+    const double* means = means_.data();
+    const double* m2s = m2s_data();
+    const double* m3s = m3s_data();
+    const double* m4s = m4s_data();
     for (int64_t i = 0; i < num_groups_; ++i) {
-      if (counts[i] > options_.ddof && counts[i] >= options_.min_count) {
-        const double variance = m2s[i] / (counts[i] - options_.ddof);
-        results[i] = result_type_ == VarOrStd::Var ? variance : std::sqrt(variance);
+      if (counts[i] > ddof_ && counts[i] >= min_count_) {
+        const auto moments = Moments(counts[i], means[i], m2s[i], m3s[i], m4s[i]);
+        switch (stat_type_) {
+          case StatisticType::Var:
+            results[i] = moments.Variance(ddof_);
+            break;
+          case StatisticType::Std:
+            results[i] = moments.Stddev(ddof_);
+            break;
+          case StatisticType::Skew:
+            results[i] = moments.Skew();
+            break;
+          case StatisticType::Kurtosis:
+            results[i] = moments.Kurtosis();
+            break;
+          default:
+            return Status::NotImplemented("Statistic type ",
                                           static_cast<int>(stat_type_));
+        }
        continue;
      }
 
@@ -1085,7 +1179,7 @@ struct GroupedVarStdImpl : public GroupedAggregator {
        null_count += 1;
        bit_util::SetBitTo(null_bitmap->mutable_data(), i, false);
      }
-    if (!options_.skip_nulls) {
+    if (!skip_nulls_) {
      if (null_bitmap) {
        arrow::internal::BitmapAnd(null_bitmap->data(), 0, no_nulls_.data(), 0,
                                   num_groups_, 0, null_bitmap->mutable_data());
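// [Illustrative sketch, not part of the patch] Finalize above maps the accumulated
// (count, mean, m2, m3, m4) to the requested statistic via the Moments helper. The
// functions below show one common convention: variance with a delta-degrees-of-freedom
// (ddof) correction, and moment-based skewness and excess kurtosis. The exact
// conventions (e.g. bias correction, whether kurtosis is reported as excess) are
// defined by the kernel's Moments implementation, so treat this only as an
// approximation of that logic; no edge-case handling (count <= ddof, m2 == 0) is shown.
#include <cmath>
#include <cstdio>

double Variance(long long count, double m2, int ddof) { return m2 / (count - ddof); }
double Stddev(long long count, double m2, int ddof) {
  return std::sqrt(Variance(count, m2, ddof));
}
double Skew(long long count, double m2, double m3) {
  // g1 = (m3 / n) / (m2 / n)^{3/2}
  return (m3 / count) / std::pow(m2 / count, 1.5);
}
double Kurtosis(long long count, double m2, double m4) {
  // g2 = (m4 / n) / (m2 / n)^2 - 3  (excess kurtosis)
  return (m4 / count) / ((m2 / count) * (m2 / count)) - 3.0;
}

int main() {
  // Central moments of {1, 2, 4, 8}: count=4, m2=28.75, m3=50.625, m4=392.828125
  std::printf("var=%.4f std=%.4f skew=%.4f kurt=%.4f\n",
              Variance(4, 28.75, /*ddof=*/1), Stddev(4, 28.75, 1),
              Skew(4, 28.75, 50.625), Kurtosis(4, 28.75, 392.828125));
}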
@@ -1101,56 +1195,91 @@ struct GroupedVarStdImpl : public GroupedAggregator {
   std::shared_ptr<DataType> out_type() const override { return float64(); }
 
-  VarOrStd result_type_;
+  const double* m2s_data() const { return m2s_.data(); }
+  // If moments_level_ < 3, the values read from m3s_data() will be ignored,
+  // but we still need to point to a valid buffer of the appropriate size.
+  // The trick is to reuse m2s_, which simplifies the code.
+  const double* m3s_data() const {
+    return (moments_level_ >= 3) ? m3s_.data() : m2s_.data();
+  }
+  const double* m4s_data() const {
+    return (moments_level_ >= 4) ? m4s_.data() : m2s_.data();
+  }
+
+  double* m2s_mutable_data() { return m2s_.mutable_data(); }
+  double* m3s_mutable_data() {
+    return (moments_level_ >= 3) ? m3s_.mutable_data() : m2s_.mutable_data();
+  }
+  double* m4s_mutable_data() {
+    return (moments_level_ >= 4) ? m4s_.mutable_data() : m2s_.mutable_data();
+  }
+
+  StatisticType stat_type_;
+  int moments_level_;
   int32_t decimal_scale_;
-  VarianceOptions options_;
+  bool skip_nulls_;
+  uint32_t min_count_;
+  int ddof_;
   int64_t num_groups_ = 0;
   // m2 = count * s2 = sum((X-mean)^2)
   TypedBufferBuilder<int64_t> counts_;
-  TypedBufferBuilder<double> means_, m2s_;
+  TypedBufferBuilder<double> means_, m2s_, m3s_, m4s_;
   TypedBufferBuilder<bool> no_nulls_;
   ExecContext* ctx_;
   MemoryPool* pool_;
 };
 
-template <typename Type, VarOrStd result_type>
-Result<std::unique_ptr<KernelState>> VarStdInit(KernelContext* ctx,
-                                                const KernelInitArgs& args) {
-  auto impl = std::make_unique<GroupedVarStdImpl<Type>>();
-  impl->result_type_ = result_type;
-  RETURN_NOT_OK(impl->Init(ctx->exec_context(), args));
-  // R build with openSUSE155 requires an explicit unique_ptr construction
-  return std::unique_ptr<KernelState>(std::move(impl));
-}
-
-template <VarOrStd result_type>
-struct GroupedVarStdFactory {
-  template <typename T, typename Enable = enable_if_t<is_integer_type<T>::value ||
-                                                      is_floating_type<T>::value ||
-                                                      is_decimal_type<T>::value>>
-  Status Visit(const T&) {
-    kernel = MakeKernel(std::move(argument_type), VarStdInit<T, result_type>);
-    return Status::OK();
-  }
+template
+struct ConcreteGroupedStatisticImpl : public GroupedStatisticImpl {
+  using GroupedStatisticImpl::InitInternal;
 
-  Status Visit(const HalfFloatType& type) {
-    return Status::NotImplemented("Computing variance/stddev of data of type ", type);
+  Status Init(ExecContext* ctx, const KernelInitArgs& args) override {
+    const auto& options = checked_cast(*args.options);
+    return InitInternal(ctx, args, kStatType, options);
   }
+};
 
-  Status Visit(const DataType& type) {
-    return Status::NotImplemented("Computing variance/stddev of data of type ", type);
-  }
+template
+using GroupedVarianceImpl =
+    ConcreteGroupedStatisticImpl;
+template
+using GroupedStddevImpl =
+    ConcreteGroupedStatisticImpl;
+template
+using GroupedSkewImpl =
+    ConcreteGroupedStatisticImpl;
+template
+using GroupedKurtosisImpl =
+    ConcreteGroupedStatisticImpl;
+
+template