Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Iceberg catalog support #95

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ data/iceberg/generated_*
scripts/metastore_db/
scripts/derby.log
scripts/test-script-with-path.sql
*.parquet
src/include/jiceberg_generated/
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@
[submodule "extension-ci-tools"]
path = extension-ci-tools
url = https://github.com/duckdb/extension-ci-tools.git
[submodule "arrow"]
path = arrow
url = https://github.com/apache/arrow.git
40 changes: 33 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,38 @@ cmake_minimum_required(VERSION 2.8.12)
# Name the extension; the CMake project is named after it.
set(TARGET_NAME iceberg)
project(${TARGET_NAME})

# NOTE(review): CMAKE_CXX_STANDARD is assigned twice in a row; the later value
# (17) is the one in effect. This looks like a rendered diff showing the old and
# the new line together - confirm against the real CMakeLists.txt.
# NOTE(review): the hunk header above shows cmake_minimum_required(VERSION
# 2.8.12), which predates CMAKE_CXX_STANDARD support for C++17 - confirm the
# minimum version is still appropriate.
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)

# Name of the static library target: "<TARGET_NAME>_extension".
set(EXTENSION_NAME ${TARGET_NAME}_extension)
# Directory-scoped header search paths: the extension's own headers, plus the
# Arrow sources and the Arrow build tree from the `arrow` directory.
# NOTE(review): src/include is added by both calls (old and new diff lines).
include_directories(src/include)
include_directories(
src/include
arrow/cpp/src
arrow/cpp/build/src
)

# Source files compiled into the extension library.
# NOTE(review): the list is closed after the first iceberg_metadata.cpp entry
# and then continues with a second copy of that entry and the src/storage/
# files - presumably the old and new diff lines rendered together; confirm.
set(EXTENSION_SOURCES
src/iceberg_extension.cpp
src/iceberg_functions.cpp
src/catalog_api.cpp
src/catalog_utils.cpp
src/ic_create_table_as_op.cpp
src/common/utils.cpp
src/common/schema.cpp
src/common/iceberg.cpp
src/iceberg_functions/iceberg_snapshots.cpp
src/iceberg_functions/iceberg_scan.cpp
src/iceberg_functions/iceberg_metadata.cpp)
src/iceberg_functions/iceberg_metadata.cpp
src/storage/ic_catalog.cpp
src/storage/ic_catalog_set.cpp
src/storage/ic_clear_cache.cpp
src/storage/ic_schema_entry.cpp
src/storage/ic_schema_set.cpp
src/storage/ic_table_entry.cpp
src/storage/ic_table_set.cpp
src/storage/ic_transaction.cpp
src/storage/ic_transaction_manager.cpp
)

# Build the extension as a static library from the sources listed above.
add_library(${EXTENSION_NAME} STATIC ${EXTENSION_SOURCES})

Expand Down Expand Up @@ -73,11 +90,20 @@ target_link_libraries(
Snappy::snappy
ZLIB::ZLIB)

# libcurl is mandatory: REQUIRED aborts configuration when it cannot be found.
# It is consumed further down through the imported target CURL::libcurl.
find_package(CURL REQUIRED)

# Locate the prebuilt Arrow, Parquet and jiceberg libraries.
#
# HINTS (rather than PATHS) is used on purpose: PATHS entries are searched only
# after the system locations, so a system-wide libarrow/libparquet would be
# picked up and linked against the headers taken from the arrow/ submodule.
# HINTS are searched first; the default locations remain as a fallback.
# The directories are anchored at CMAKE_CURRENT_SOURCE_DIR so the lookup does
# not depend on how relative search paths are resolved.
find_library(ARROW_LIB arrow
  HINTS "${CMAKE_CURRENT_SOURCE_DIR}/arrow/cpp/build/release")
find_library(PARQUET_LIB parquet
  HINTS "${CMAKE_CURRENT_SOURCE_DIR}/arrow/cpp/build/release")
find_library(FTICEBERGHELPER_LIB jiceberg
  HINTS "${CMAKE_CURRENT_SOURCE_DIR}/jiceberg_lib/app/build/native/nativeCompile")

# Fail early with an actionable message instead of handing a *-NOTFOUND value
# to target_link_libraries(). find_library(... REQUIRED) is avoided because it
# needs a newer CMake than this file declares as its minimum.
foreach(required_lib ARROW_LIB PARQUET_LIB FTICEBERGHELPER_LIB)
  if(NOT ${required_lib})
    message(FATAL_ERROR
      "${required_lib} was not found. Build Arrow/Parquet under "
      "arrow/cpp/build/release and the jiceberg native library under "
      "jiceberg_lib/app/build/native/nativeCompile before configuring.")
  endif()
endforeach()

# Link dependencies into extension
# NOTE(review): the first two calls link only Avro and are repeated by the two
# fuller calls that follow; presumably they are the pre-change lines of the
# diff - confirm before removing them.
target_link_libraries(${EXTENSION_NAME} PUBLIC optimized avro_static_release
debug avro_static_debug)
target_link_libraries(${TARGET_NAME}_loadable_extension optimized
avro_static_release debug avro_static_debug)

# The `optimized` and `debug` keywords qualify ONLY the single item that
# follows them. Each one is therefore placed directly in front of the matching
# Avro library, and every other dependency is listed without a qualifier so
# that it is linked in all build configurations.
target_link_libraries(
  ${EXTENSION_NAME}
  PUBLIC optimized avro_static_release
         debug avro_static_debug
         ${FTICEBERGHELPER_LIB}
         CURL::libcurl
         ${ARROW_LIB}
         ${PARQUET_LIB})

# Previously `optimized` preceded CURL::libcurl here, which restricted libcurl
# to optimized configurations and left avro_static_release unqualified, so
# Debug builds linked both Avro variants and no libcurl.
# The keyword-less signature is kept as in the original call: CMake does not
# allow mixing the plain and the PUBLIC/PRIVATE signatures on one target.
target_link_libraries(
  ${TARGET_NAME}_loadable_extension
  optimized avro_static_release
  debug avro_static_debug
  ${FTICEBERGHELPER_LIB}
  CURL::libcurl
  ${ARROW_LIB}
  ${PARQUET_LIB})

install(
TARGETS ${EXTENSION_NAME} ${TARGET_NAME}_loadable_extension
Expand Down
127 changes: 127 additions & 0 deletions README-ADDENDUM.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# ADDENDUM

This fork adds proof-of-concept functionality to the DuckDB Iceberg extension: it can connect to an Iceberg catalog and write to Iceberg tables as well as read from them.

# Requirements
You will need the following to be able to use this new version of the extension:
1. DuckDB version 1.2.0 or later
2. `httpfs` extension

Since this extension is not official yet, you will need to run duckdb in `unsigned` mode to be able to use it:
```bash
duckdb --unsigned
```

# Installation
The following steps need to be done once:
1. Download the zip from github and unzip it
2. Change directory to the directory where you unzipped the files
3. Install the extension
```sql
INSTALL './iceberg.duckdb_extension';
```
4. If you already have the official `iceberg` extension installed, you will need to force the install
```sql
FORCE INSTALL './iceberg.duckdb_extension';
```
5. Install `httpfs` extension if you don't have it installed already
```sql
INSTALL httpfs;
```

# Usage
## Load `httpfs` and `iceberg` extensions
```sql
LOAD httpfs;
LOAD iceberg;
```
## Create a secret to provide access to an iceberg catalog
```sql
CREATE SECRET (
TYPE ICEBERG,
CLIENT_ID '${CLIENT_ID}',
CLIENT_SECRET '${CLIENT_SECRET}',
ENDPOINT '${ENDPOINT}',
AWS_REGION '${AWS_REGION}'
);
```

## Attach an iceberg catalog
```sql
ATTACH 'my_catalog' AS my_catalog (TYPE ICEBERG);
```

## Read an iceberg table
```sql
SELECT * FROM my_catalog.my_schema.table_1;
```

## Create a new iceberg table
```sql
CREATE TABLE my_catalog.my_schema.new_table (id BIGINT, name VARCHAR);
```
```sql
CREATE TABLE my_catalog.my_schema.new_table_2 AS (SELECT version() AS "version");
```

## Delete an existing iceberg table
```sql
DROP TABLE my_catalog.my_schema.table_1;
```

# How to build extension from source
Requirements:
* A compiler that supports C++17
* CMake version 3.28 or later
```
git clone https://github.com/fivetran/duckdb-iceberg.git
cd duckdb-iceberg
git submodule update --init --recursive
brew install ninja
GEN=ninja make {debug/release}
```

# Roadmap
## 1. Supported SQL commands
### ✅ CREATE SECRET
### ✅ ATTACH
### 🔳 USE
### ✅ SELECT
### ✅ CREATE SCHEMA
### ✅ DROP SCHEMA
### 🔳 CREATE VIEW
### 🔳 DROP VIEW
### ✅ CREATE TABLE
### ✅ CREATE TABLE AS SELECT
### 🔳 ALTER TABLE
### ✅ DROP TABLE
### 🔳 INSERT
### 🔳 UPDATE
### 🔳 DELETE

## 2. Supported [Iceberg data types](https://docs.snowflake.com/en/user-guide/tables-iceberg-data-types) (writing)
### 🔳 boolean
### ✅ string
### 🔳 tinyint
### 🔳 smallint
### ✅ int
### ✅ long
### ✅ double
### 🔳 float
### 🔳 timestamp
### 🔳 timestamptz
### 🔳 binary
### 🔳 date
### 🔳 decimal(prec,scale)
### 🔳 array
### 🔳 map
### 🔳 struct

## 3. Miscellaneous
### 🔳 Bundle `jiceberg` statically into the extension







1 change: 1 addition & 0 deletions arrow
Submodule arrow added at f87237
2 changes: 1 addition & 1 deletion duckdb
Submodule duckdb updated 1095 files
12 changes: 12 additions & 0 deletions jiceberg_lib/.gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#
# https://help.github.com/articles/dealing-with-line-endings/
#
# Linux start script should use lf
/gradlew text eol=lf

# These are Windows script files and should use crlf
*.bat text eol=crlf

# Binary files should be left untouched
*.jar binary

6 changes: 6 additions & 0 deletions jiceberg_lib/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Ignore Gradle project-specific cache directory
.gradle/
.idea/

# Ignore Gradle build output directory
build/
19 changes: 19 additions & 0 deletions jiceberg_lib/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Apache Iceberg does not have a library written in C++. This project uses the Java library
to create helper methods to be used in C++.

# Instructions
* Install [SDKMAN](https://sdkman.io/install/)
* Install GraalVM CE 21
```bash
sdk install java 21.0.2-graalce
```
* Build library
```
gradle nativeBuild
```
* Copy header files
```
./scripts/copy_headers.sh
```
* In `src/include/jiceberg_generated`, replace angle-bracket includes (`#include <...>`) with quoted includes (`#include "..."`) in the header files whose names start with `libjiceberg`.

75 changes: 75 additions & 0 deletions jiceberg_lib/app/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* This file was generated by the Gradle 'init' task.
*
* This generated file contains a sample Java application project to get you started.
* For more details on building Java & JVM projects, please refer to https://docs.gradle.org/8.9/userguide/building_java_projects.html in the Gradle documentation.
*/

// Build plugins: plain Java compilation, the `application` conventions, and
// the GraalVM native build tools (pinned to 0.9.28).
plugins {
    id('java')
    id('application')
    id('org.graalvm.buildtools.native') version '0.9.28'
}

// All dependencies are resolved from Maven Central.
repositories { mavenCentral() }

dependencies {
    // Apache Iceberg modules (all on 1.7.1), plus the Parquet and Hadoop
    // artifacts declared alongside them.
    implementation('org.apache.iceberg:iceberg-core:1.7.1')
    implementation('org.apache.iceberg:iceberg-api:1.7.1')
    implementation('org.apache.iceberg:iceberg-aws:1.7.1')
    implementation('org.apache.iceberg:iceberg-parquet:1.7.1')
    implementation('org.apache.iceberg:iceberg-data:1.7.1')
    implementation('org.apache.parquet:parquet-avro:1.12.3')
    implementation('org.apache.hadoop:hadoop-common:3.3.1')

    // AWS SDK v2 modules (all on 2.30.11).
    implementation('software.amazon.awssdk:s3:2.30.11')
    implementation('software.amazon.awssdk:sts:2.30.11')
    implementation('software.amazon.awssdk:auth:2.30.11')
    implementation('software.amazon.awssdk:regions:2.30.11')

    // GraalVM SDK is only needed at compile time.
    compileOnly('org.graalvm.sdk:graal-sdk:22.0.0')

    // Logging: the no-op SLF4J binding plus the log4j-over-slf4j bridge.
    implementation('org.slf4j:slf4j-nop:2.0.9')
    implementation('org.slf4j:log4j-over-slf4j:2.0.9')
}

// Entry point used by the `application` plugin.
application {
    mainClass.set('com.fivetran.iceberg.App')
}

// GraalVM native-image configuration for the `main` binary.
graalvmNative {
    binaries {
        main {
            // Base name of the generated image; `--shared` below makes it a
            // shared library rather than an executable.
            imageName.set('libjiceberg')
            mainClass.set('com.fivetran.iceberg.App')
            fallback.set(false)

            // Arguments are passed to native-image in exactly this order.
            buildArgs.addAll([
                '--shared',
                '--initialize-at-run-time=org.apache.log4j',
                '--initialize-at-run-time=org.slf4j',
                '--initialize-at-run-time=com.github.benmanes.caffeine.cache.LocalLoadingCache',
                '--initialize-at-run-time=org.apache.iceberg.util.Pair',
                '--initialize-at-run-time=org.apache.logging.log4j.util.ProviderUtil',
                '--initialize-at-run-time=org.apache.logging.log4j.core.impl.Log4jContextFactory',
                '--initialize-at-run-time=org.apache.commons.logging.LogFactory',
                '--initialize-at-build-time=org.slf4j,org.apache.logging.slf4j',
                '-H:+ReportExceptionStackTraces',
                '--enable-https',
            ])

            metadataRepository {
                enabled = true
            }

            // Additional native-image configuration files kept in the source
            // tree under META-INF/native-image-config.
            configurationFileDirectories.from(file('src/main/resources/META-INF/native-image-config'))
        }
    }
}

// NOTE(review): empty script-level method that is not called anywhere in this
// file. It looks like an IDE-generated stub (possibly added to silence an
// unresolved `main { ... }` reference) - confirm it is unused before removing.
def void main(Closure<Boolean> booleanClosure) {}

// Runs com.fivetran.iceberg.App on the JVM with the native-image agent
// attached; the agent is told to write its configuration output into
// src/main/resources/META-INF/native-image-config.
tasks.register('runWithAgent', JavaExec) { agentRun ->
    agentRun.mainClass = 'com.fivetran.iceberg.App'
    agentRun.classpath = sourceSets.main.runtimeClasspath
    agentRun.jvmArgs = [
        '-agentlib:native-image-agent=config-output-dir=src/main/resources/META-INF/native-image-config'
    ]
}
Loading