From 8b3136bea69476c8229b8272474f82388fefb554 Mon Sep 17 00:00:00 2001 From: libailin Date: Wed, 28 Aug 2024 09:50:34 +0800 Subject: [PATCH] [Feature-#1918][s3] Add support for reading all types of documents supported by Apache Tika --- .../chunjun-connector-format-base/pom.xml | 56 ++++++ .../format/base/common/TikaData.java | 37 ++++ .../format/base/config/TikaReadConfig.java | 47 +++++ .../format/base/options/TikaOptions.java | 43 +++++ .../format/base/source/TikaInputFormat.java | 119 ++++++++++++ .../base/source/TikaReaderExecutor.java | 178 ++++++++++++++++++ .../chunjun-connector-s3/pom.xml | 5 + .../chunjun/connector/s3/config/S3Config.java | 18 ++ .../connector/s3/sink/S3DynamicTableSink.java | 2 +- .../connector/s3/sink/S3OutputFormat.java | 76 ++++++-- .../connector/s3/source/S3InputFormat.java | 55 +++++- .../s3/table/S3DynamicTableFactory.java | 24 +++ .../connector/s3/table/options/S3Options.java | 27 +++ .../chunjun/connector/s3/util/S3Util.java | 25 ++- chunjun-connectors/pom.xml | 1 + 15 files changed, 695 insertions(+), 18 deletions(-) create mode 100644 chunjun-connectors/chunjun-connector-format-base/pom.xml create mode 100644 chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/common/TikaData.java create mode 100644 chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/config/TikaReadConfig.java create mode 100644 chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/options/TikaOptions.java create mode 100644 chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/source/TikaInputFormat.java create mode 100644 chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/source/TikaReaderExecutor.java diff --git a/chunjun-connectors/chunjun-connector-format-base/pom.xml b/chunjun-connectors/chunjun-connector-format-base/pom.xml new file mode 100644 index 0000000000..314ff8899a --- /dev/null +++ b/chunjun-connectors/chunjun-connector-format-base/pom.xml @@ -0,0 +1,56 @@ + + + + + + chunjun-connectors + com.dtstack.chunjun + ${revision} + + 4.0.0 + + chunjun-connector-format-base + ChunJun : Connector : Format Base + + + 2.8.0 + + + + + + org.apache.tika + tika-core + ${tika.version} + provided + + + + + org.apache.tika + tika-parsers-standard-package + ${tika.version} + provided + + + + diff --git a/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/common/TikaData.java b/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/common/TikaData.java new file mode 100644 index 0000000000..a0ca4012f9 --- /dev/null +++ b/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/common/TikaData.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.dtstack.chunjun.connector.format.base.common; + +import lombok.Data; + +@Data +public class TikaData { + + private String[] data; + private boolean end; + + public TikaData(String[] data, boolean end) { + this.data = data; + this.end = end; + } + + public TikaData(String[] data) { + this.data = data; + } +} diff --git a/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/config/TikaReadConfig.java b/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/config/TikaReadConfig.java new file mode 100644 index 0000000000..f5d5850c12 --- /dev/null +++ b/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/config/TikaReadConfig.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.dtstack.chunjun.connector.format.base.config; + +import lombok.Data; + +import java.io.Serializable; + +@Data +public class TikaReadConfig implements Serializable { + + public static final String ORIGINAL_FILENAME = "_ORIGINAL_FILENAME"; + + private static final long serialVersionUID = 9142075335239994317L; + + /** 是否启用tika提取 */ + private boolean useExtract = false; + + /** 内容重合度比例值 0-100 */ + private int overlapRatio = 0; + + /** 是否启动分块 */ + private boolean enableChunk = false; + + /** 分块大小 */ + private int chunkSize = -1; + + public boolean isEnableChunk() { + return chunkSize > 0 ? true : false; + } +} diff --git a/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/options/TikaOptions.java b/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/options/TikaOptions.java new file mode 100644 index 0000000000..5927411112 --- /dev/null +++ b/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/options/TikaOptions.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.dtstack.chunjun.connector.format.base.options; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; + +public class TikaOptions { + + public static final ConfigOption USE_EXTRACT = + ConfigOptions.key("tika-use-extract") + .booleanType() + .defaultValue(false) + .withDescription("use tika extract"); + + public static final ConfigOption OVERLAP_RATIO = + ConfigOptions.key("tika-overlap-ratio") + .intType() + .defaultValue(0) + .withDescription("content overlap ratio"); + + public static final ConfigOption CHUNK_SIZE = + ConfigOptions.key("tika-chunk-size") + .intType() + .defaultValue(-1) + .withDescription("chunk size"); +} diff --git a/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/source/TikaInputFormat.java b/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/source/TikaInputFormat.java new file mode 100644 index 0000000000..c063f25e08 --- /dev/null +++ b/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/source/TikaInputFormat.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.dtstack.chunjun.connector.format.base.source; + +import com.dtstack.chunjun.connector.format.base.common.TikaData; +import com.dtstack.chunjun.connector.format.base.config.TikaReadConfig; + +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.concurrent.BasicThreadFactory; + +import java.io.Closeable; +import java.io.InputStream; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; + +import static java.util.concurrent.TimeUnit.NANOSECONDS; + +@Slf4j +public class TikaInputFormat implements Closeable { + private ThreadPoolExecutor executorService; + private final BlockingQueue queue = new LinkedBlockingQueue<>(4096); + private TikaReadConfig tikaReadConfig; + private TikaData row; + private int fieldCount; + + public TikaInputFormat(TikaReadConfig tikaReadConfig, int fieldCount) { + this.tikaReadConfig = tikaReadConfig; + this.fieldCount = fieldCount; + } + + public void open(InputStream inputStream, String originalFilename) { + this.executorService = + new ThreadPoolExecutor( + 1, + 1, + 0, + NANOSECONDS, + new LinkedBlockingDeque<>(2), + new BasicThreadFactory.Builder() + .namingPattern("tika-schedule-pool-%d") + .daemon(false) + .build()); + TikaReaderExecutor executor = + new TikaReaderExecutor(tikaReadConfig, queue, inputStream, originalFilename); + executorService.execute(executor); + } + + public boolean hasNext() { + try { + row = queue.poll(3000L, TimeUnit.MILLISECONDS); + // 如果没有数据,则继续等待 + if (row == null) { + log.warn("Waiting for queue get tika data"); + hasNext(); + } + if (row != null && row.isEnd()) { + return false; + } + return true; + } catch (InterruptedException e) { + throw new RuntimeException( + "cannot get data from the queue because the current thread is interrupted.", e); + } + } + + /** 根据声明的字段个数,对数据进行补全 */ + public String[] nextRecord() { + String[] data = row.getData(); + if (fieldCount == data.length) { + return data; + } + if (fieldCount < data.length) { + fieldCount = data.length; + } + return formatValue(data); + } + + private String[] formatValue(String[] data) { + String[] record = initDataContainer(fieldCount, ""); + // because fieldCount is always >= data.length + System.arraycopy(data, 0, record, 0, data.length); + return record; + } + + private String[] initDataContainer(int capacity, String defValue) { + String[] container = new String[capacity]; + for (int i = 0; i < capacity; i++) { + container[i] = defValue; + } + return container; + } + + @Override + public void close() { + if (executorService != null) { + executorService.shutdown(); + queue.clear(); + } + } +} diff --git a/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/source/TikaReaderExecutor.java b/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/source/TikaReaderExecutor.java new file mode 100644 index 0000000000..1e48190745 --- /dev/null +++ b/chunjun-connectors/chunjun-connector-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/source/TikaReaderExecutor.java @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.dtstack.chunjun.connector.format.base.source; + +import com.dtstack.chunjun.connector.format.base.common.TikaData; +import com.dtstack.chunjun.connector.format.base.config.TikaReadConfig; +import com.dtstack.chunjun.util.GsonUtil; + +import org.apache.commons.lang3.StringUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ContentHandlerDecorator; +import org.xml.sax.ContentHandler; + +import java.io.BufferedInputStream; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.BlockingQueue; + +import static com.dtstack.chunjun.connector.format.base.config.TikaReadConfig.ORIGINAL_FILENAME; + +public class TikaReaderExecutor implements Runnable { + + private TikaReadConfig tikaReadConfig; + private BlockingQueue queue; + private Map metaData = new HashMap<>(); + private String metaDataString; + private String originalFilename; + private InputStream in; + + public TikaReaderExecutor( + TikaReadConfig tikaReadConfig, + BlockingQueue queue, + InputStream in, + String originalFilename) { + this.tikaReadConfig = tikaReadConfig; + this.queue = queue; + this.in = in; + this.originalFilename = originalFilename; + } + + @Override + public void run() { + // 抽取文档内容 + Parser parser = new AutoDetectParser(); + // 内容处理器,用来收集结果,Tika可以将解析结果包装成XHTML SAX + // event进行分发,通过ContentHandler处理这些event就可以得到文本内容和其他有用的信息 + ContentHandler contentHandler = new BodyContentHandler(-1); + // 元数据,既是输入也是输出,可以将文件名或者可能的文件类型传入,tika解析时可以根据这些信息判断文件类型, + // 再调用相应的解析器进行处理;另外,tika也会将一些额外的信息保存到Metadata中,如文件修改日期,作者,编辑工具等 + Metadata metadata = new Metadata(); + // 解析上下文,用来控制解析过程,比如是否提取Office文档里面的宏等 + ParseContext context = new ParseContext(); + + // tika官方文档提供的分块处理思路, 但是测试发现比如同类型word(doc)两个文件,有的可以正常分块,有的不能分块。 + // 还有txt类型文件未能分块读取, pdf文件暂时测试。 + // 因此暂时不建议使用 + final List chunks = new ArrayList<>(); + chunks.add(""); + ContentHandlerDecorator trunkHandler = + new ContentHandlerDecorator() { + @Override + public void characters(char[] ch, int start, int length) { + String chunkContent = ""; + String lastChunk = chunks.get(chunks.size() - 1); + String thisStr = new String(ch, start, length); + if (lastChunk.length() + length > tikaReadConfig.getChunkSize()) { + chunks.add(thisStr); + chunkContent = thisStr; + } else { + String chunkString = lastChunk + thisStr; + chunks.set(chunks.size() - 1, chunkString); + if (StringUtils.isNotBlank(chunkString)) { + chunkContent = chunkString; + } + } + if (metaData.isEmpty()) { + for (String name : metadata.names()) { + metaData.put(name, metadata.get(name)); + } + metaData.put(ORIGINAL_FILENAME, originalFilename); + metaDataString = GsonUtil.GSON.toJson(metaData); + } + if (StringUtils.isNotBlank(chunkContent)) { + try { + queue.put( + new TikaData( + new String[] {chunkContent, metaDataString}, + false)); + } catch (InterruptedException e) { + throw new RuntimeException( + "because the current thread was interrupted, adding data to the queue failed", + e); + } + } + } + }; + + // InputStream in 待解析的文档,以字节流形式传入,可以避免tika占用太多内存 + try (BufferedInputStream bufferedInputStream = new BufferedInputStream(in)) { + // 如何想要使用官方的分块处理方式,需要将contentHandler替换成trunkHandler + parser.parse(bufferedInputStream, contentHandler, metadata, context); + String content = contentHandler.toString(); + for (String name : metadata.names()) { + metaData.put(name, metadata.get(name)); + } + metaData.put(ORIGINAL_FILENAME, originalFilename); + metaDataString = GsonUtil.GSON.toJson(metaData); + if (tikaReadConfig.getChunkSize() > 0) { + // 对整个抽取出来的内容进行分块、内容重复度处理 + List chunkList = + getChunkList( + content, + tikaReadConfig.getChunkSize(), + tikaReadConfig.getOverlapRatio()); + for (String chunk : chunkList) { + queue.put(new TikaData(new String[] {chunk, metaDataString}, false)); + } + } else { + queue.put(new TikaData(new String[] {content, metaDataString}, false)); + } + queue.put(new TikaData(null, true)); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public List getChunkList(String content, int chunkSize, int overlapRatio) { + List chunks = new ArrayList<>(); + int length = content.length(); + int startIndex = 0; + int step = chunkSize; + int endIndex = startIndex + step; + int increment = step * overlapRatio / 100; + if (step >= length) { + chunks.add(content); + } else { + while (endIndex <= length) { + // 确保截取的字符串不会超过原始字符串的长度 + if (startIndex + step > length) { + endIndex = length; + } + String substring = content.substring(startIndex, endIndex); + chunks.add(substring); + // 更新起始和结束位置 + startIndex = endIndex - increment; + endIndex = startIndex + step; + } + if (endIndex > length && startIndex + increment < length) { + String substring = content.substring(startIndex, length); + chunks.add(substring); + } + } + return chunks; + } +} diff --git a/chunjun-connectors/chunjun-connector-s3/pom.xml b/chunjun-connectors/chunjun-connector-s3/pom.xml index 51faa129a1..8214236468 100644 --- a/chunjun-connectors/chunjun-connector-s3/pom.xml +++ b/chunjun-connectors/chunjun-connector-s3/pom.xml @@ -68,6 +68,11 @@ 1.11-8 test + + com.dtstack.chunjun + chunjun-connector-format-base + ${project.version} + diff --git a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/config/S3Config.java b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/config/S3Config.java index 2fe73532da..7f2ecbb338 100644 --- a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/config/S3Config.java +++ b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/config/S3Config.java @@ -19,6 +19,7 @@ package com.dtstack.chunjun.connector.s3.config; import com.dtstack.chunjun.config.CommonConfig; +import com.dtstack.chunjun.connector.format.base.config.TikaReadConfig; import com.amazonaws.regions.Regions; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; @@ -86,4 +87,21 @@ public class S3Config extends CommonConfig implements Serializable { /** 生成的文件名后缀 */ private String suffix; + + /** 对象匹配规则 */ + private String objectsRegex; + + /** 是否使用文本限定符 */ + private boolean useTextQualifier = true; + + /** 是否开启每条记录生成一个对应的文件 */ + private boolean enableWriteSingleRecordAsFile = false; + + /** 保留原始文件名 */ + private boolean keepOriginalFilename = false; + + /** 禁用 Bucket 名称注入到 endpoint 前缀 */ + private boolean disableBucketNameInEndpoint = false; + + private TikaReadConfig tikaReadConfig = new TikaReadConfig(); } diff --git a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/sink/S3DynamicTableSink.java b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/sink/S3DynamicTableSink.java index e50a2e1125..bea54bc953 100644 --- a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/sink/S3DynamicTableSink.java +++ b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/sink/S3DynamicTableSink.java @@ -84,6 +84,6 @@ public DynamicTableSink copy() { @Override public String asSummaryString() { - return "StreamDynamicTableSink"; + return S3DynamicTableSink.class.getName(); } } diff --git a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/sink/S3OutputFormat.java b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/sink/S3OutputFormat.java index 7abd7d4c87..2ea943b4d0 100644 --- a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/sink/S3OutputFormat.java +++ b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/sink/S3OutputFormat.java @@ -27,6 +27,7 @@ import com.dtstack.chunjun.sink.format.BaseRichOutputFormat; import com.dtstack.chunjun.throwable.ChunJunRuntimeException; import com.dtstack.chunjun.throwable.WriteRecordException; +import com.dtstack.chunjun.util.GsonUtil; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.table.data.RowData; @@ -34,15 +35,20 @@ import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.PartETag; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.Objects; +import java.util.UUID; import java.util.stream.Collectors; +import static com.dtstack.chunjun.connector.format.base.config.TikaReadConfig.ORIGINAL_FILENAME; + /** The OutputFormat Implementation which write data to Amazon S3. */ @Slf4j public class S3OutputFormat extends BaseRichOutputFormat { @@ -137,7 +143,8 @@ private void checkOutputDir() { amazonS3, s3Config.getBucket(), s3Config.getObject(), - s3Config.getFetchSize()); + s3Config.getFetchSize(), + s3Config.getObjectsRegex()); } else { subObjects = S3Util.listObjectsByv1( @@ -166,11 +173,17 @@ private void nextBlock() { sw = new StringWriter(); } this.writerUtil = new WriterUtil(sw, s3Config.getFieldDelimiter()); + if (!s3Config.isUseTextQualifier()) { + writerUtil.setUseTextQualifier(false); + } this.currentPartNumber = this.currentPartNumber + 1; } /** Create file multipart upload ID */ private void createActionFinishedTag() { + if (s3Config.isEnableWriteSingleRecordAsFile()) { + return; + } if (!StringUtils.isNotBlank(currentUploadId)) { this.currentUploadId = S3Util.initiateMultipartUploadAndGetId( @@ -193,8 +206,11 @@ private void beforeWriteRecords() { } protected void flushDataInternal() { + if (sw == null) { + return; + } StringBuffer sb = sw.getBuffer(); - if (sb.length() > MIN_SIZE || willClose) { + if (sb.length() > MIN_SIZE || willClose || s3Config.isEnableWriteSingleRecordAsFile()) { byte[] byteArray; try { byteArray = sb.toString().getBytes(s3Config.getEncoding()); @@ -202,17 +218,23 @@ protected void flushDataInternal() { throw new ChunJunRuntimeException(e); } log.info("Upload part size:" + byteArray.length); - PartETag partETag = - S3Util.uploadPart( - amazonS3, - s3Config.getBucket(), - s3Config.getObject(), - this.currentUploadId, - this.currentPartNumber, - byteArray); - - MyPartETag myPartETag = new MyPartETag(partETag); - myPartETags.add(myPartETag); + + if (s3Config.isEnableWriteSingleRecordAsFile()) { + S3Util.putStringObject( + amazonS3, s3Config.getBucket(), s3Config.getObject(), sb.toString()); + } else { + PartETag partETag = + S3Util.uploadPart( + amazonS3, + s3Config.getBucket(), + s3Config.getObject(), + this.currentUploadId, + this.currentPartNumber, + byteArray); + + MyPartETag myPartETag = new MyPartETag(partETag); + myPartETags.add(myPartETag); + } log.debug( "task-{} upload etag:[{}]", @@ -225,6 +247,9 @@ protected void flushDataInternal() { } private void completeMultipartUploadFile() { + if (s3Config.isEnableWriteSingleRecordAsFile()) { + return; + } if (this.currentPartNumber > 10000) { throw new IllegalArgumentException("part can not bigger than 10000"); } @@ -282,7 +307,11 @@ protected void writeSingleRecordInternal(RowData rowData) throws WriteRecordExce // convert row to string stringRecord = (String[]) rowConverter.toExternal(rowData, stringRecord); try { - for (int i = 0; i < columnNameList.size(); ++i) { + int columnSize = columnNameList.size(); + if (s3Config.isEnableWriteSingleRecordAsFile()) { + columnSize = 1; + } + for (int i = 0; i < columnSize; ++i) { String column = stringRecord[i]; @@ -292,6 +321,25 @@ protected void writeSingleRecordInternal(RowData rowData) throws WriteRecordExce writerUtil.write(column); } writerUtil.endRecord(); + + if (s3Config.isEnableWriteSingleRecordAsFile()) { + Map metadataMap = + GsonUtil.GSON.fromJson(stringRecord[1], Map.class); + String key = FilenameUtils.getPath(s3Config.getObject()); + // 是否保留原始文件名 + if (s3Config.isKeepOriginalFilename()) { + key += metadataMap.get(ORIGINAL_FILENAME) + getExtension(); + } else { + key += + jobId + + "_" + + taskNumber + + "_" + + UUID.randomUUID().toString() + + getExtension(); + } + s3Config.setObject(key); + } flushDataInternal(); } catch (Exception ex) { String msg = "RowData2string error RowData(" + rowData + ")"; diff --git a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/source/S3InputFormat.java b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/source/S3InputFormat.java index c450c96854..6660bb63cf 100644 --- a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/source/S3InputFormat.java +++ b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/source/S3InputFormat.java @@ -19,6 +19,8 @@ package com.dtstack.chunjun.connector.s3.source; import com.dtstack.chunjun.config.RestoreConfig; +import com.dtstack.chunjun.connector.format.base.common.TikaData; +import com.dtstack.chunjun.connector.format.base.source.TikaInputFormat; import com.dtstack.chunjun.connector.s3.config.S3Config; import com.dtstack.chunjun.connector.s3.enums.CompressType; import com.dtstack.chunjun.connector.s3.util.ReaderUtil; @@ -38,6 +40,7 @@ import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectInputStream; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; import java.io.IOException; @@ -71,6 +74,9 @@ public class S3InputFormat extends BaseRichInputFormat { private RestoreConfig restoreConf; + private transient TikaData tikaData; + private TikaInputFormat tikaInputFormat; + @Override public void openInputFormat() throws IOException { super.openInputFormat(); @@ -137,7 +143,10 @@ protected InputSplit[] createInputSplitsInternal(int minNumSplits) { protected RowData nextRecordInternal(RowData rowData) throws ReadRecordException { String[] fields; try { - fields = readerUtil.getValues(); + fields = + s3Config.getTikaReadConfig().isUseExtract() && tikaData != null + ? tikaData.getData() + : readerUtil.getValues(); rowData = rowConverter.toInternal(fields); } catch (IOException e) { throw new ChunJunRuntimeException(e); @@ -164,9 +173,47 @@ protected void closeInternal() { @Override public boolean reachedEnd() throws IOException { + if (s3Config.getTikaReadConfig().isUseExtract()) { + tikaData = getTikaData(); + return tikaData == null || tikaData.getData() == null; + } return reachedEndWithoutCheckState(); } + public TikaData getTikaData() { + if (tikaInputFormat == null) { + nextTikaDataStream(); + } + if (tikaInputFormat != null) { + if (!tikaInputFormat.hasNext()) { + tikaInputFormat.close(); + tikaInputFormat = null; + return getTikaData(); + } + String[] record = tikaInputFormat.nextRecord(); + return new TikaData(record); + } else { + return null; + } + } + + private void nextTikaDataStream() { + if (splits.hasNext()) { + currentObject = splits.next(); + GetObjectRequest rangeObjectRequest = + new GetObjectRequest(s3Config.getBucket(), currentObject); + log.info("Current read file {}", currentObject); + S3Object o = amazonS3.getObject(rangeObjectRequest); + S3ObjectInputStream s3is = o.getObjectContent(); + tikaInputFormat = + new TikaInputFormat( + s3Config.getTikaReadConfig(), s3Config.getFieldNameList().size()); + tikaInputFormat.open(s3is, FilenameUtils.getName(currentObject)); + } else { + tikaInputFormat = null; + } + } + public boolean reachedEndWithoutCheckState() throws IOException { // br is empty, indicating that a new file needs to be read if (readerUtil == null) { @@ -259,7 +306,11 @@ public List resolveObjects() { if (s3Config.isUseV2()) { subObjects = S3Util.listObjectsKeyByPrefix( - amazonS3, bucket, prefix, s3Config.getFetchSize()); + amazonS3, + bucket, + prefix, + s3Config.getFetchSize(), + s3Config.getObjectsRegex()); } else { subObjects = S3Util.listObjectsByv1( diff --git a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/table/S3DynamicTableFactory.java b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/table/S3DynamicTableFactory.java index fa05abaf7f..c3c873982a 100644 --- a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/table/S3DynamicTableFactory.java +++ b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/table/S3DynamicTableFactory.java @@ -18,6 +18,8 @@ package com.dtstack.chunjun.connector.s3.table; +import com.dtstack.chunjun.connector.format.base.config.TikaReadConfig; +import com.dtstack.chunjun.connector.format.base.options.TikaOptions; import com.dtstack.chunjun.connector.s3.config.S3Config; import com.dtstack.chunjun.connector.s3.sink.S3DynamicTableSink; import com.dtstack.chunjun.connector.s3.source.S3DynamicTableSource; @@ -61,6 +63,14 @@ public DynamicTableSource createDynamicTableSource(Context context) { s3Config.setFirstLineHeader(options.get(S3Options.IS_FIRST_LINE_HEADER)); s3Config.setEndpoint(options.get(S3Options.ENDPOINT)); s3Config.setCompress(options.get(S3Options.COMPRESS)); + s3Config.setObjectsRegex(options.get(S3Options.OBJECTS_REGEX)); + s3Config.setDisableBucketNameInEndpoint( + options.get(S3Options.DISABLE_BUCKET_NAME_IN_ENDPOINT)); + TikaReadConfig tikaReadConfig = new TikaReadConfig(); + tikaReadConfig.setUseExtract(options.get(TikaOptions.USE_EXTRACT)); + tikaReadConfig.setOverlapRatio(options.get(TikaOptions.OVERLAP_RATIO)); + tikaReadConfig.setChunkSize(options.get(TikaOptions.CHUNK_SIZE)); + s3Config.setTikaReadConfig(tikaReadConfig); return new S3DynamicTableSource(context.getCatalogTable().getResolvedSchema(), s3Config); } @@ -94,6 +104,14 @@ public Set> optionalOptions() { options.add(S3Options.SUFFIX); options.add(SinkOptions.SINK_PARALLELISM); options.add(S3Options.WRITE_MODE); + options.add(S3Options.OBJECTS_REGEX); + options.add(S3Options.USE_TEXT_QUALIFIER); + options.add(S3Options.ENABLE_WRITE_SINGLE_RECORD_AS_FILE); + options.add(S3Options.KEEP_ORIGINAL_FILENAME); + options.add(S3Options.DISABLE_BUCKET_NAME_IN_ENDPOINT); + options.add(TikaOptions.USE_EXTRACT); + options.add(TikaOptions.CHUNK_SIZE); + options.add(TikaOptions.OVERLAP_RATIO); return options; } @@ -121,6 +139,12 @@ public DynamicTableSink createDynamicTableSink(Context context) { s3Config.setSuffix(options.get(S3Options.SUFFIX)); s3Config.setParallelism(options.get(SinkOptions.SINK_PARALLELISM)); s3Config.setWriteMode(options.get(S3Options.WRITE_MODE)); + s3Config.setUseTextQualifier(options.get(S3Options.USE_TEXT_QUALIFIER)); + s3Config.setEnableWriteSingleRecordAsFile( + options.get(S3Options.ENABLE_WRITE_SINGLE_RECORD_AS_FILE)); + s3Config.setKeepOriginalFilename(options.get(S3Options.KEEP_ORIGINAL_FILENAME)); + s3Config.setDisableBucketNameInEndpoint( + options.get(S3Options.DISABLE_BUCKET_NAME_IN_ENDPOINT)); return new S3DynamicTableSink(context.getCatalogTable().getResolvedSchema(), s3Config); } diff --git a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/table/options/S3Options.java b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/table/options/S3Options.java index 1f6236438f..a8fff32659 100644 --- a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/table/options/S3Options.java +++ b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/table/options/S3Options.java @@ -95,4 +95,31 @@ public class S3Options { public static final ConfigOption WRITE_MODE = key("writeMode").stringType().defaultValue("overwrite").withDescription("writeMode"); + + public static final ConfigOption OBJECTS_REGEX = + key("objectsRegex").stringType().noDefaultValue().withDescription("objects regex rule"); + + public static final ConfigOption USE_TEXT_QUALIFIER = + key("useTextQualifier") + .booleanType() + .defaultValue(true) + .withDescription("use text qualifier"); + + public static final ConfigOption ENABLE_WRITE_SINGLE_RECORD_AS_FILE = + key("enableWriteSingleRecordAsFile") + .booleanType() + .defaultValue(false) + .withDescription("enable write single record as each file"); + + public static final ConfigOption KEEP_ORIGINAL_FILENAME = + key("keepOriginalFilename") + .booleanType() + .defaultValue(false) + .withDescription("keep original filename"); + + public static final ConfigOption DISABLE_BUCKET_NAME_IN_ENDPOINT = + key("disableBucketNameInEndpoint") + .booleanType() + .defaultValue(false) + .withDescription("disable Bucket Name In Endpoint"); } diff --git a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/util/S3Util.java b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/util/S3Util.java index a7d849a4a0..02d708f17e 100644 --- a/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/util/S3Util.java +++ b/chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/util/S3Util.java @@ -30,6 +30,7 @@ import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3Client; import com.amazonaws.services.s3.AmazonS3ClientBuilder; +import com.amazonaws.services.s3.S3ClientOptions; import com.amazonaws.services.s3.model.AbortMultipartUploadRequest; import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest; import com.amazonaws.services.s3.model.DeleteObjectsRequest; @@ -46,12 +47,14 @@ import com.amazonaws.services.s3.model.UploadPartRequest; import com.amazonaws.services.s3.model.UploadPartResult; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.util.ArrayList; import java.util.List; +import java.util.regex.Pattern; @Slf4j public class S3Util { @@ -75,6 +78,10 @@ public static AmazonS3 getS3Client(S3Config s3Config) { } else { builder = builder.withRegion(clientRegion.getName()); } + // 禁用 Bucket 名称注入到 endpoint 前缀 + if (s3Config.isDisableBucketNameInEndpoint()) { + builder = builder.withPathStyleAccessEnabled(true); + } return builder.build(); } else { @@ -89,6 +96,11 @@ public static AmazonS3 getS3Client(S3Config s3Config) { } AmazonS3Client client = new AmazonS3Client(cred, ccfg); client.setEndpoint(s3Config.getEndpoint()); + // 禁用 Bucket 名称注入到 endpoint 前缀 + if (s3Config.isDisableBucketNameInEndpoint()) { + client.setS3ClientOptions( + S3ClientOptions.builder().setPathStyleAccess(true).build()); + } return client; } } else { @@ -103,18 +115,29 @@ public static PutObjectResult putStringObject( } public static List listObjectsKeyByPrefix( - AmazonS3 s3Client, String bucketName, String prefix, int fetchSize) { + AmazonS3 s3Client, String bucketName, String prefix, int fetchSize, String regex) { List objects = new ArrayList<>(fetchSize); ListObjectsV2Request req = new ListObjectsV2Request().withBucketName(bucketName).withMaxKeys(fetchSize); if (StringUtils.isNotBlank(prefix)) { req.setPrefix(prefix); } + // 定义正则表达式 + Pattern pattern = null; + if (StringUtils.isNotBlank(regex)) { + pattern = Pattern.compile(regex); + } + ListObjectsV2Result result; do { result = s3Client.listObjectsV2(req); for (S3ObjectSummary objectSummary : result.getObjectSummaries()) { + // 如果对象键与正则表达式匹配,则进行相应处理 + if (pattern != null + && !pattern.matcher(FilenameUtils.getName(objectSummary.getKey())).find()) { + continue; + } objects.add(objectSummary.getKey()); } String token = result.getNextContinuationToken(); diff --git a/chunjun-connectors/pom.xml b/chunjun-connectors/pom.xml index a91f7cfe07..059f7379cc 100755 --- a/chunjun-connectors/pom.xml +++ b/chunjun-connectors/pom.xml @@ -110,6 +110,7 @@ chunjun-connector-nebula chunjun-connector-kingbase chunjun-connector-hudi + chunjun-connector-format-base