From bad351f36487acbcf1c2b515f1eb08e914f1544f Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 7 Jun 2022 17:55:09 -0700 Subject: [PATCH 001/137] add payloads and its basic operations in DataStore and DataPack --- forte/data/data_pack.py | 9 +++++ forte/data/data_store.py | 67 +++++++++++++++++++++++++++++++++++++- forte/data/ontology/top.py | 36 ++++++++++++++++++++ 3 files changed, 111 insertions(+), 1 deletion(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 9e0de82aa..00de5ecc0 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -53,6 +53,7 @@ AudioAnnotation, ImageAnnotation, Grids, + Payload, ) from forte.data.span import Span from forte.data.types import ReplaceOperationsType, DataRequest @@ -2013,6 +2014,14 @@ def save_entry_object( tid=entry.tid, allow_duplicate=allow_duplicate, ) + elif isinstance(entry, Payload): + data_store_ref.add_payload_raw( + type_name=entry.entry_type(), + payload_idx=entry.payload_idx, + modality=entry.modality, + tid=entry.tid, + allow_duplicate=allow_duplicate, + ) elif isinstance(entry, Grids): data_store_ref.add_grid_raw( type_name=entry.entry_type(), diff --git a/forte/data/data_store.py b/forte/data/data_store.py index d0180d507..9533df571 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -29,6 +29,7 @@ ImageAnnotation, Link, Generics, + Payload, ) from forte.data.ontology.core import Entry, FList, FDict from forte.common import constants @@ -686,6 +687,21 @@ def _new_grid( return entry + def _new_payload( + self, + type_name: str, + payload_idx: int, + modality: str, + tid: Optional[int] = None, + ) -> List: + tid: int = self._new_tid() if tid is None else tid + entry: List[Any] + + entry = [payload_idx, modality, tid, type_name] + entry += self._default_attributes_for_type(type_name) + + return entry + def _new_link( self, type_name: str, @@ -917,7 +933,14 @@ def _add_entry_raw( except KeyError: self.__elements[type_name] = SortedList(key=sorting_fn) 
self.__elements[type_name].add(entry) - elif entry_type in [Link, Group, Generics, ImageAnnotation, Grids]: + elif entry_type in [ + Link, + Group, + Generics, + ImageAnnotation, + Grids, + Payload, + ]: try: self.__elements[type_name].append(entry) except KeyError: @@ -1079,6 +1102,48 @@ def add_image_annotation_raw( return tid_search_result return self._add_entry_raw(AudioAnnotation, type_name, entry) + def add_payload_raw( + self, + type_name: str, + payload_idx: int, + modality: str, + tid: Optional[int] = None, + allow_duplicate=True, + ) -> int: + + r""" + This function adds a payload entry with ``payload_idx`` + and modality to current data store object. Returns the ``tid`` for the + inserted entry. + + Args: + type_name: The fully qualified type name of the new Payload. + payload_idx: the index of the payload. + modality: the payload modality which can be text, audio, image. + tid: ``tid`` of the Payload entry that is being added. + It's optional, and it will be auto-assigned if not given. + allow_duplicate: Whether we allow duplicate in the DataStore. When + it's set to False, the function will return the ``tid`` of + existing entry if a duplicate is found. Default value is True. + + Returns: + ``tid`` of the entry. + """ + # We should create the `entry data` with the format + # [begin, end, tid, type_id, None, ...]. + # A helper function _new_annotation() can be used to generate a + # annotation type entry data with default fields. + # A reference to the entry should be store in both self.__elements and + # self.__tid_ref_dict. 
+ entry = self._new_payload(type_name, payload_idx, modality, tid) + + if not allow_duplicate: + tid_search_result = self._get_existing_ann_entry_tid(entry) + # if found existing entry + if tid_search_result != -1: + return tid_search_result + return self._add_entry_raw(Payload, type_name, entry) + def add_grid_raw( self, type_name: str, diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index c74bb6819..9cc9a660b 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1168,6 +1168,41 @@ def __init__( ) +class Payload(Entry): + def __init__( + self, + pack: PackType, + modality: str, + ): + super().__init__(pack, modality) + self.modality = modality + self.cache = [] + + +class TextPayload(Payload): + def __init__( + self, + pack: PackType, + ): + super().__init__(pack, "text") + + +class AudioPayload(Payload): + def __init__( + self, + pack: PackType, + ): + super().__init__(pack, "audio") + + +class ImagePayload(Payload): + def __init__( + self, + pack: PackType, + ): + super().__init__(pack, "image") + + SinglePackEntries = ( Link, Group, @@ -1175,5 +1210,6 @@ def __init__( Generics, AudioAnnotation, ImageAnnotation, + Payload, ) MultiPackEntries = (MultiPackLink, MultiPackGroup, MultiPackGeneric) From 99640296bb43fd41137f46f242d01b38cb0754ea Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 7 Jun 2022 18:11:48 -0700 Subject: [PATCH 002/137] docstring fix --- forte/data/data_store.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 9533df571..b494586e4 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -1088,8 +1088,8 @@ def add_image_annotation_raw( ``tid`` of the entry. """ # We should create the `entry data` with the format - # [begin, end, tid, type_id, None, ...]. - # A helper function _new_annotation() can be used to generate a + # [image_payload_idx, None, tid, type_id, None, ...]. 
+ # A helper function _new_image_annotation() can be used to generate a # annotation type entry data with default fields. # A reference to the entry should be store in both self.__elements and # self.__tid_ref_dict. @@ -1130,9 +1130,9 @@ def add_payload_raw( ``tid`` of the entry. """ # We should create the `entry data` with the format - # [begin, end, tid, type_id, None, ...]. - # A helper function _new_annotation() can be used to generate a - # annotation type entry data with default fields. + # [payload_idx, modality, tid, type_id, None, ...]. + # A helper function _new_payload() can be used to generate a + # payload type entry data with default fields. # A reference to the entry should be store in both self.__elements and # self.__tid_ref_dict. entry = self._new_payload(type_name, payload_idx, modality, tid) From 41a1d446bdde0bd440dd189aa43eff4f1c9d9828 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 7 Jun 2022 23:55:07 -0700 Subject: [PATCH 003/137] add implementation of payload with interface for users writing their own loading method --- forte/data/ontology/top.py | 40 +++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 9cc9a660b..04a2a1d53 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from abc import abstractmethod from dataclasses import dataclass from functools import total_ordering from typing import Optional, Tuple, Type, Any, Dict, Union, Iterable, List @@ -1168,39 +1169,68 @@ def __init__( ) -class Payload(Entry): +from abc import ABC, abstractmethod + + +class Payload(Entry, ABC): def __init__( self, pack: PackType, modality: str, + payload_idx: int, ): - super().__init__(pack, modality) + self.payload_idx = payload_idx self.modality = modality + super().__init__(pack) self.cache = [] + self.meta = {} + + def load(self, path): + data, data_meta = self.loading_method(path) + self.cache.append(data) + self.meta = {**self.meta, **data_meta} + self.pack.pack_name = path + + def offload_cache(self, cache_idx): + self.cache.pop(cache_idx) + + def offload_all_cache(self): + self.cache.clear() + + @property + def payload_index(self): + return self.payload_idx + + @abstractmethod + def loading_method(self, path): + pass class TextPayload(Payload): def __init__( self, pack: PackType, + payload_idx: int, ): - super().__init__(pack, "text") + super().__init__(pack, "text", payload_idx) class AudioPayload(Payload): def __init__( self, pack: PackType, + payload_idx: int, ): - super().__init__(pack, "audio") + super().__init__(pack, "audio", payload_idx) class ImagePayload(Payload): def __init__( self, pack: PackType, + payload_idx: int, ): - super().__init__(pack, "image") + super().__init__(pack, "image", payload_idx) SinglePackEntries = ( From c72bf6031679eeddb9809a4f1bbba64f869d0376 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 8 Jun 2022 00:25:53 -0700 Subject: [PATCH 004/137] minor fix --- forte/data/data_pack.py | 2 +- forte/data/ontology/top.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 00de5ecc0..635e45603 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -2017,7 +2017,7 @@ def save_entry_object( elif isinstance(entry, Payload): 
data_store_ref.add_payload_raw( type_name=entry.entry_type(), - payload_idx=entry.payload_idx, + payload_idx=entry.payload_index, modality=entry.modality, tid=entry.tid, allow_duplicate=allow_duplicate, diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 04a2a1d53..b3d01b69b 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -15,6 +15,7 @@ from dataclasses import dataclass from functools import total_ordering from typing import Optional, Tuple, Type, Any, Dict, Union, Iterable, List +from abc import ABC, abstractmethod import numpy as np @@ -1169,10 +1170,7 @@ def __init__( ) -from abc import ABC, abstractmethod - - -class Payload(Entry, ABC): +class Payload(Entry): def __init__( self, pack: PackType, From 0f9993c41ee7753599962784a6954343aea766f5 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 8 Jun 2022 09:54:46 -0700 Subject: [PATCH 005/137] add audio payload test example --- tests/forte/payload_test.py | 55 +++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 tests/forte/payload_test.py diff --git a/tests/forte/payload_test.py b/tests/forte/payload_test.py new file mode 100644 index 000000000..c4ac1621d --- /dev/null +++ b/tests/forte/payload_test.py @@ -0,0 +1,55 @@ +# Copyright 2022 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for Payload. 
+""" +import os +import unittest +import numpy as np +from typing import Dict + +from numpy import array_equal +from forte.data.ontology.top import Payload, AudioPayload +from forte.data.data_pack import DataPack + + +class PayloadTest(unittest.TestCase): + """ + Test Payload related ontologies like audio. + """ + + def setUp(self): + self.datapack = DataPack("payload test") + + def test_audio_payload(self): + class SoundfileAudioPayload(AudioPayload): + def loading_method(self, path): + try: + import soundfile # pylint: disable=import-outside-toplevel + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + "AudioReader requires 'soundfile' package to be installed." + " You can refer to [extra modules to install]('pip install" + " forte['audio_ext']) or 'pip install forte" + ". Note that additional steps might apply to Linux" + " users (refer to " + "https://pysoundfile.readthedocs.io/en/latest/#installation)." + ) from e + audio, sample_rate = soundfile.read(file=path) + audio_data_meta = {"sample_rate": sample_rate} + return audio, audio_data_meta + + self.datapack.add_entry( + SoundfileAudioPayload(self.datapack, payload_idx=0) + ) From a3b42ddfcbf9a8ac28238deee8561f224e3e498a Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 8 Jun 2022 09:58:16 -0700 Subject: [PATCH 006/137] move SoundfileAudioPayload out --- tests/forte/payload_test.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/forte/payload_test.py b/tests/forte/payload_test.py index c4ac1621d..a8276b52c 100644 --- a/tests/forte/payload_test.py +++ b/tests/forte/payload_test.py @@ -24,6 +24,24 @@ from forte.data.data_pack import DataPack +class SoundfileAudioPayload(AudioPayload): + def loading_method(self, path): + try: + import soundfile # pylint: disable=import-outside-toplevel + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + "AudioReader requires 'soundfile' package to be installed." 
+ " You can refer to [extra modules to install]('pip install" + " forte['audio_ext']) or 'pip install forte" + ". Note that additional steps might apply to Linux" + " users (refer to " + "https://pysoundfile.readthedocs.io/en/latest/#installation)." + ) from e + audio, sample_rate = soundfile.read(file=path) + audio_data_meta = {"sample_rate": sample_rate} + return audio, audio_data_meta + + class PayloadTest(unittest.TestCase): """ Test Payload related ontologies like audio. @@ -33,22 +51,6 @@ def setUp(self): self.datapack = DataPack("payload test") def test_audio_payload(self): - class SoundfileAudioPayload(AudioPayload): - def loading_method(self, path): - try: - import soundfile # pylint: disable=import-outside-toplevel - except ModuleNotFoundError as e: - raise ModuleNotFoundError( - "AudioReader requires 'soundfile' package to be installed." - " You can refer to [extra modules to install]('pip install" - " forte['audio_ext']) or 'pip install forte" - ". Note that additional steps might apply to Linux" - " users (refer to " - "https://pysoundfile.readthedocs.io/en/latest/#installation)." 
- ) from e - audio, sample_rate = soundfile.read(file=path) - audio_data_meta = {"sample_rate": sample_rate} - return audio, audio_data_meta self.datapack.add_entry( SoundfileAudioPayload(self.datapack, payload_idx=0) From 84e2d62908e02a88dcfed9d08b9e9972d133eabf Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 09:18:54 -0700 Subject: [PATCH 007/137] add Meta --- forte/data/ontology/top.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index b3d01b69b..73c2cfc29 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1231,6 +1231,11 @@ def __init__( super().__init__(pack, "image", payload_idx) +class Meta(Entry): + def __init__(self, pack: PackType): + super().__init__(pack) + + SinglePackEntries = ( Link, Group, From e26560f5f60b7a75f140a094ba2e640d3afd0b24 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 14:21:10 -0700 Subject: [PATCH 008/137] add the logic of adding meta --- forte/data/data_pack.py | 8 ++++++++ forte/data/data_store.py | 2 ++ 2 files changed, 10 insertions(+) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 635e45603..efdad72b5 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -54,6 +54,7 @@ ImageAnnotation, Grids, Payload, + Meta, ) from forte.data.span import Span from forte.data.types import ReplaceOperationsType, DataRequest @@ -2029,6 +2030,13 @@ def save_entry_object( tid=entry.tid, allow_duplicate=allow_duplicate, ) + elif isinstance(entry, Meta): + data_store_ref.add_meta_raw( + type_name=entry.entry_type(), + image_payload_idx=entry.meta_name, + tid=entry.tid, + allow_duplicate=allow_duplicate, + ) else: raise ValueError( f"Invalid entry type {type(entry)}. 
A valid entry " diff --git a/forte/data/data_store.py b/forte/data/data_store.py index b494586e4..272e77f20 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -30,6 +30,7 @@ Link, Generics, Payload, + Meta, ) from forte.data.ontology.core import Entry, FList, FDict from forte.common import constants @@ -940,6 +941,7 @@ def _add_entry_raw( ImageAnnotation, Grids, Payload, + Meta, ]: try: self.__elements[type_name].append(entry) From 7fa2428e9dd49da72d8c44cf353fa2a7d973002c Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 14:38:41 -0700 Subject: [PATCH 009/137] add more meta adding in DataStore --- forte/data/data_store.py | 67 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 272e77f20..ac58687f4 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -688,6 +688,30 @@ def _new_grid( return entry + def _new_meta( + self, type_name: str, meta_name: str, tid: Optional[int] = None + ) -> List: + r"""This function generates a new meta entry with default fields. + Called by add_meta_raw() to create a new meta entry + with ``type_name``, ``meta_name`` and optional ``tid``. + + + Args: + type_name: The fully qualified type name of the new entry. + meta_name: The name of the Meta entry. + + Returns: + A list representing a new meta type entry data. + """ + + tid: int = self._new_tid() if tid is None else tid + entry: List[Any] + + entry = [meta_name, None, tid, type_name] + entry += self._default_attributes_for_type(type_name) + + return entry + def _new_payload( self, type_name: str, @@ -1155,7 +1179,7 @@ def add_grid_raw( ) -> int: r""" - This function adds an image annotation entry with ``image_payload_idx`` + This function adds a grid entry with ``image_payload_idx`` indices to current data store object. Returns the ``tid`` for the inserted entry. 
@@ -1187,6 +1211,47 @@ def add_grid_raw( return tid_search_result return self._add_entry_raw(Grids, type_name, entry) + def add_meta_raw( + self, + type_name: str, + meta_name: str, + tid: Optional[int] = None, + allow_duplicate=True, + ) -> int: + + r""" + This function adds a meta entry with ``meta_name`` + to current data store object. Returns the ``tid`` for the + inserted entry. + + Args: + type_name: The fully qualified type name of the new Meta entry. + meta_name: the name of the Meta entry. + tid: ``tid`` of the Meta entry that is being added. + It's optional, and it will be + auto-assigned if not given. + allow_duplicate: Whether we allow duplicate in the DataStore. When + it's set to False, the function will return the ``tid`` of + existing entry if a duplicate is found. Default value is True. + + Returns: + ``tid`` of the entry. + """ + # We should create the `entry data` with the format + # [meta_name, None, tid, type_id, None, ...]. + # A helper function _new_meta() can be used to generate a + # meta type entry data with default fields. + # A reference to the entry should be store in both self.__elements and + # self.__tid_ref_dict. + entry = self._new_meta(type_name, meta_name, tid) + + if not allow_duplicate: + tid_search_result = self._get_existing_ann_entry_tid(entry) + # if found existing entry + if tid_search_result != -1: + return tid_search_result + return self._add_entry_raw(Meta, type_name, entry) + def _get_existing_ann_entry_tid(self, entry: List[Any]): r""" This function searches for tid for existing annotation-like entry tid. 
From 520163b532e7ee904358decdb32758a9773061fd Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 17:54:57 -0700 Subject: [PATCH 010/137] rewrite audio reader and processor to adapt the new Payload design --- forte/data/readers/audio_reader.py | 19 ++++++----- tests/forte/data/readers/audio_reader_test.py | 32 +++++++++++++------ 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index b462a93b9..647d4f3d8 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -20,6 +20,7 @@ from forte.data.data_pack import DataPack from forte.data.data_utils_io import dataset_path_iterator from forte.data.base_reader import PackReader +from forte.data.ontology.top import AudioPayload __all__ = [ "AudioReader", @@ -29,7 +30,7 @@ class AudioReader(PackReader): r""":class:`AudioReader` is designed to read in audio files.""" - def __init__(self): + def __init__(self, audio_processing_meta): super().__init__() try: import soundfile # pylint: disable=import-outside-toplevel @@ -43,8 +44,9 @@ def __init__(self): "https://pysoundfile.readthedocs.io/en/latest/#installation)." ) from e self.soundfile = soundfile + self.audio_processing_meta = audio_processing_meta - def _collect(self, audio_directory) -> Iterator[Any]: # type: ignore + def _collect(self) -> Iterator[Any]: # type: ignore r"""Should be called with param ``audio_directory`` which is a path to a folder containing audio files. 
@@ -53,7 +55,10 @@ def _collect(self, audio_directory) -> Iterator[Any]: # type: ignore Returns: Iterator over paths to audio files """ - return dataset_path_iterator(audio_directory, self.configs.file_ext) + return dataset_path_iterator( + self.audio_processing_meta.audio_path, + self.audio_processing_meta.file_ext, + ) def _cache_key_function(self, audio_file: str) -> str: return os.path.basename(audio_file) @@ -62,12 +67,10 @@ def _parse_pack(self, file_path: str) -> Iterator[DataPack]: pack: DataPack = DataPack() # Read in audio data and store in DataPack - audio, sample_rate = self.soundfile.read( - file=file_path, **(self.configs.read_kwargs or {}) - ) - pack.set_audio(audio=audio, sample_rate=sample_rate) + # add audio payload into DataPack.payloads + ap = AudioPayload(pack, file_path, 0) + ap.set_loading_method(self.soundfile.read) pack.pack_name = file_path - yield pack @classmethod diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index 23a8c91ce..bef87df86 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -27,12 +27,14 @@ from forte.data.readers import AudioReader from forte.pipeline import Pipeline from forte.processors.base.pack_processor import PackProcessor +from forte.data.ontology.top import AudioProcessingMeta, AudioPayload class TestASRProcessor(PackProcessor): """ An audio processor for automatic speech recognition. 
""" + def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) @@ -42,8 +44,15 @@ def initialize(self, resources: Resources, configs: Config): self._model = Wav2Vec2ForCTC.from_pretrained(pretrained_model) def _process(self, input_pack: DataPack): + # it follows the logic of loading while using + # load audio using AudioPayload + for audio_payload in input_pack.get(AudioPayload): + audio_data, sample_rate = audio_payload.load( + audio_payload.loading_path + ) + required_sample_rate: int = 16000 - if input_pack.sample_rate != required_sample_rate: + if sample_rate != required_sample_rate: raise ProcessFlowException( f"A sample rate of {required_sample_rate} Hz is requied by the" " pretrained model." @@ -51,7 +60,7 @@ def _process(self, input_pack: DataPack): # tokenize input_values = self._tokenizer( - input_pack.audio, return_tensors="pt", padding="longest" + audio_data, return_tensors="pt", padding="longest" ).input_values # Batch size 1 # take argmax and decode @@ -75,28 +84,31 @@ def setUp(self): os.pardir, os.pardir, os.pardir, - "data_samples/audio_reader_test" + "data_samples/audio_reader_test", ) ) - + datapack = DataPack("payload test") + audio_processing_meta = AudioProcessingMeta(datapack, meta_name="audio") + audio_processing_meta.audio_path = self._test_audio_path # Define and config the Pipeline self._pipeline = Pipeline[DataPack]() - self._pipeline.set_reader(AudioReader()) + self._pipeline.set_reader(AudioReader(audio_processing_meta)) self._pipeline.add(TestASRProcessor()) self._pipeline.initialize() def test_asr_pipeline(self): target_transcription: Dict[str, str] = { - self._test_audio_path + "/test_audio_0.flac": - "A MAN SAID TO THE UNIVERSE SIR I EXIST", - self._test_audio_path + "/test_audio_1.flac": ( + self._test_audio_path + + "/test_audio_0.flac": "A MAN SAID TO THE UNIVERSE SIR I EXIST", + self._test_audio_path + + "/test_audio_1.flac": ( "NOR IS MISTER QUILTER'S MANNER LESS INTERESTING " "THAN 
HIS MATTER" - ) + ), } # Verify the ASR result of each datapack - for pack in self._pipeline.process_dataset(self._test_audio_path): + for pack in self._pipeline.process_dataset(): self.assertEqual(pack.text, target_transcription[pack.pack_name]) From 9e67de241a4e733d253f713c8ea5fc8ec181aac6 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 18:02:15 -0700 Subject: [PATCH 011/137] new Payload stores loading_function and loading_path and add Meta class and its subclasses to store data processing related metadata --- forte/data/ontology/top.py | 101 ++++++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 13 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 73c2cfc29..76f97abe3 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -14,8 +14,9 @@ from abc import abstractmethod from dataclasses import dataclass from functools import total_ordering -from typing import Optional, Tuple, Type, Any, Dict, Union, Iterable, List +from typing import Optional, Set, Tuple, Type, Any, Dict, Union, Iterable, List from abc import ABC, abstractmethod +import uuid import numpy as np @@ -53,6 +54,13 @@ "Region", "Box", "BoundingBox", + "Payload", + "TextPayload", + "ImagePayload", + "AudioPayload", + "Meta", + "ImageProcessingMeta", + "AudioProcessingMeta", ] QueryType = Union[Dict[str, Any], np.ndarray] @@ -1175,19 +1183,19 @@ def __init__( self, pack: PackType, modality: str, + path: str, payload_idx: int, ): self.payload_idx = payload_idx self.modality = modality + self._path = path super().__init__(pack) self.cache = [] self.meta = {} + self._loading_method = None - def load(self, path): - data, data_meta = self.loading_method(path) - self.cache.append(data) - self.meta = {**self.meta, **data_meta} - self.pack.pack_name = path + def set_loading_method(self, fn): + self._loading_method = fn def offload_cache(self, cache_idx): self.cache.pop(cache_idx) @@ -1199,42 +1207,108 @@ def 
offload_all_cache(self): def payload_index(self): return self.payload_idx - @abstractmethod - def loading_method(self, path): - pass + @property + def loading_path(self): + return self._path + + @property + def load(self): + return self._loading_method class TextPayload(Payload): def __init__( self, pack: PackType, + path: str, payload_idx: int, ): - super().__init__(pack, "text", payload_idx) + super().__init__(pack, "text", path, payload_idx) class AudioPayload(Payload): def __init__( self, pack: PackType, + path: str, payload_idx: int, ): - super().__init__(pack, "audio", payload_idx) + super().__init__(pack, "audio", path, payload_idx=payload_idx) class ImagePayload(Payload): def __init__( self, pack: PackType, + path: str, payload_idx: int, ): - super().__init__(pack, "image", payload_idx) + super().__init__(pack, "image", path, payload_idx) class Meta(Entry): - def __init__(self, pack: PackType): + """ + a Meta entry defines metadata related to data processing + about reading from data source, loading data to cache, and writing to + a target file. + + Args: + Entry (_type_): _description_ + """ + + def __init__(self, pack: PackType, meta_name): + self._meta_name: Optional[str] = meta_name super().__init__(pack) + @property + def meta_name(self): + return self._meta_name + + +class ImageProcessingMeta(Meta): + def __init__(self, pack: PackType, meta_name: Optional[str] = None): + if meta_name is None: + meta_name = "jpg" + super().__init__(pack, meta_name) + self.data_source_type = "disk" + self.pipeline_data_type = "nparray" + self.save_format = None + self.type_code = "jpg" + + +class AudioProcessingMeta(Meta): + """ + an AudioProcessingMeta entry defines metadata related to audio processing + about reading from data source, loading data to cache, and writing to + a target file. + + Args: + pack (PackType): The container that this AudioProcessingMeta will + be added to. + meta_name (Optional[str], optional): the name for the audio metadata. 
+ Defaults to "flac". + """ + + # a Meta data entry object that define metadata related to image processing + # both reading from data source, loaded format and writing format + # for example, we might want to read a high resolution png image + # and load it as a numpy array and write it into jpg format. + + # it determines what third-party packages to use to convert image to target + # data format + + # payload meta defines data source and user need to write a + # reader for the data source. + # def __init__(self, pack: PackType, meta_name: Optional[str] = None): + def __init__(self, pack: PackType, meta_name: Optional[str] = None): + if meta_name is None: + meta_name = "flac" + super().__init__(pack, meta_name=meta_name) + self.data_source_type = "disk" + self.pipeline_data_type = "nparray" + self.save_format = None + self.file_ext = "flac" + SinglePackEntries = ( Link, @@ -1244,5 +1318,6 @@ def __init__(self, pack: PackType): AudioAnnotation, ImageAnnotation, Payload, + Meta, ) MultiPackEntries = (MultiPackLink, MultiPackGroup, MultiPackGeneric) From 3bb940e5ed4dfcbe4c10f1bf15112787b564ace1 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 18:03:49 -0700 Subject: [PATCH 012/137] temporarily import Meta inside the class as there is another Meta class inside the script --- forte/data/data_pack.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index efdad72b5..a3c3e147e 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -1972,6 +1972,7 @@ def save_entry_object( except KeyError: # The entry is not found in DataStore pass + from forte.data.ontology.top import Meta # Create a new registry in DataStore based on entry's type if isinstance(entry, Annotation): @@ -2033,7 +2034,7 @@ def save_entry_object( elif isinstance(entry, Meta): data_store_ref.add_meta_raw( type_name=entry.entry_type(), - image_payload_idx=entry.meta_name, + meta_name=entry.meta_name, 
tid=entry.tid, allow_duplicate=allow_duplicate, ) @@ -2041,7 +2042,8 @@ def save_entry_object( raise ValueError( f"Invalid entry type {type(entry)}. A valid entry " f"should be an instance of Annotation, Link, Group, Generics " - "or AudioAnnotation." + "AudioAnnotation, ImageAnnotation, Payload, Grids, Meta" + " or AudioAnnotation." ) # Store all the dataclass attributes to DataStore From 4fcc72b0e57381bb3741771790955a7dd5173f38 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 18:06:06 -0700 Subject: [PATCH 013/137] remove import Meta inside DataPack class --- forte/data/data_pack.py | 1 - 1 file changed, 1 deletion(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index a3c3e147e..3df0c0ebd 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -1972,7 +1972,6 @@ def save_entry_object( except KeyError: # The entry is not found in DataStore pass - from forte.data.ontology.top import Meta # Create a new registry in DataStore based on entry's type if isinstance(entry, Annotation): From 8b647e90b7751d3dd108196d53330d7d3e16869a Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 18:07:03 -0700 Subject: [PATCH 014/137] no changes --- forte/data/data_store.py | 1 - 1 file changed, 1 deletion(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index ac58687f4..84de44ca6 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -1244,7 +1244,6 @@ def add_meta_raw( # A reference to the entry should be store in both self.__elements and # self.__tid_ref_dict. 
entry = self._new_meta(type_name, meta_name, tid) - if not allow_duplicate: tid_search_result = self._get_existing_ann_entry_tid(entry) # if found existing entry From 3a9e0c7923ce38b7001c212e40edc7463a3c5726 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 18:07:54 -0700 Subject: [PATCH 015/137] allow users to pass None to pack so that Entry won't be added to DataPack --- forte/data/ontology/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/forte/data/ontology/core.py b/forte/data/ontology/core.py index 72a9c9899..5dde5863d 100644 --- a/forte/data/ontology/core.py +++ b/forte/data/ontology/core.py @@ -189,8 +189,9 @@ def __init__(self, pack: ContainerType): self.__pack: ContainerType = pack self._tid: int = uuid.uuid4().int self._embedding: np.ndarray = np.empty(0) - self.pack._validate(self) - self.pack.on_entry_creation(self) + if pack is not None: + self.pack._validate(self) + self.pack.on_entry_creation(self) def __getstate__(self): r"""In serialization, the pack is not serialize, and it will be set From a30fb8158c28b570417be0584119532cde2fedd7 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 18:08:53 -0700 Subject: [PATCH 016/137] pass None as pack to AudioProcessingMeta so that AudioProcessingMeta doesn't need to be added into any DataPack --- tests/forte/data/readers/audio_reader_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index bef87df86..9f09bdedd 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -88,7 +88,7 @@ def setUp(self): ) ) datapack = DataPack("payload test") - audio_processing_meta = AudioProcessingMeta(datapack, meta_name="audio") + audio_processing_meta = AudioProcessingMeta(None, meta_name="audio") audio_processing_meta.audio_path = self._test_audio_path # Define and config the Pipeline self._pipeline 
= Pipeline[DataPack]() From 4eafd164d2c128ec7a3dc778f4e1f3dce78df0a3 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 18:24:22 -0700 Subject: [PATCH 017/137] remove Meta importing --- forte/data/data_pack.py | 1 - 1 file changed, 1 deletion(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 3df0c0ebd..376d750db 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -54,7 +54,6 @@ ImageAnnotation, Grids, Payload, - Meta, ) from forte.data.span import Span from forte.data.types import ReplaceOperationsType, DataRequest From d0f0939c3ef3b5c7fc108b633547e586b659656a Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 18:24:44 -0700 Subject: [PATCH 018/137] pylint fix: fix imports --- forte/data/ontology/top.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 76f97abe3..c5318ad5d 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -11,12 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from abc import abstractmethod from dataclasses import dataclass from functools import total_ordering -from typing import Optional, Set, Tuple, Type, Any, Dict, Union, Iterable, List -from abc import ABC, abstractmethod -import uuid +from typing import Optional, Tuple, Type, Any, Dict, Union, Iterable, List import numpy as np From 35a97d37eb486f0f6e7a7bb3f437087bfaf9dd93 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 20:18:41 -0700 Subject: [PATCH 019/137] make Payload class variables private --- forte/data/ontology/top.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index c5318ad5d..7b330a139 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1183,22 +1183,26 @@ def __init__( path: str, payload_idx: int, ): - self.payload_idx = payload_idx - self.modality = modality + self._payload_idx = payload_idx + self._modality = modality self._path = path super().__init__(pack) - self.cache = [] - self.meta = {} + self._cache = [] + self._meta = {} self._loading_method = None def set_loading_method(self, fn): self._loading_method = fn def offload_cache(self, cache_idx): - self.cache.pop(cache_idx) + self._cache.pop(cache_idx) def offload_all_cache(self): - self.cache.clear() + self._cache.clear() + + @property + def modality(self): + return self.modality @property def payload_index(self): From 19afb61cd48d166254b430cae1e3c4ba871ba0d9 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 9 Jun 2022 20:19:37 -0700 Subject: [PATCH 020/137] remove initialized DataPack --- tests/forte/data/readers/audio_reader_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index 9f09bdedd..ee89e4c03 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -87,7 +87,6 @@ def setUp(self): 
"data_samples/audio_reader_test", ) ) - datapack = DataPack("payload test") audio_processing_meta = AudioProcessingMeta(None, meta_name="audio") audio_processing_meta.audio_path = self._test_audio_path # Define and config the Pipeline From 7a92b285f8649c251b87175eddac4b049f7ca941 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 10 Jun 2022 00:45:16 -0700 Subject: [PATCH 021/137] Delete payload_test.py --- tests/forte/payload_test.py | 57 ------------------------------------- 1 file changed, 57 deletions(-) delete mode 100644 tests/forte/payload_test.py diff --git a/tests/forte/payload_test.py b/tests/forte/payload_test.py deleted file mode 100644 index a8276b52c..000000000 --- a/tests/forte/payload_test.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright 2022 The Forte Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Unit tests for Payload. -""" -import os -import unittest -import numpy as np -from typing import Dict - -from numpy import array_equal -from forte.data.ontology.top import Payload, AudioPayload -from forte.data.data_pack import DataPack - - -class SoundfileAudioPayload(AudioPayload): - def loading_method(self, path): - try: - import soundfile # pylint: disable=import-outside-toplevel - except ModuleNotFoundError as e: - raise ModuleNotFoundError( - "AudioReader requires 'soundfile' package to be installed." - " You can refer to [extra modules to install]('pip install" - " forte['audio_ext']) or 'pip install forte" - ". 
Note that additional steps might apply to Linux" - " users (refer to " - "https://pysoundfile.readthedocs.io/en/latest/#installation)." - ) from e - audio, sample_rate = soundfile.read(file=path) - audio_data_meta = {"sample_rate": sample_rate} - return audio, audio_data_meta - - -class PayloadTest(unittest.TestCase): - """ - Test Payload related ontologies like audio. - """ - - def setUp(self): - self.datapack = DataPack("payload test") - - def test_audio_payload(self): - - self.datapack.add_entry( - SoundfileAudioPayload(self.datapack, payload_idx=0) - ) From cec1a22a98fb5ac053a180e272154c25dc9ce509 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 13 Jun 2022 09:17:59 -0700 Subject: [PATCH 022/137] new cache and meta for payload --- forte/data/ontology/top.py | 39 ++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 7b330a139..a00fbb8b4 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1186,35 +1186,46 @@ def __init__( self._payload_idx = payload_idx self._modality = modality self._path = path + super().__init__(pack) - self._cache = [] + self._cache = None self._meta = {} - self._loading_method = None - - def set_loading_method(self, fn): - self._loading_method = fn - def offload_cache(self, cache_idx): - self._cache.pop(cache_idx) + def get_data(self): + return self.offload_cache() - def offload_all_cache(self): - self._cache.clear() + def offload_cache(self, f_name=None): + cache = self._cache + self._cache = None + if f_name is not None: + with open(f_name, "wb") as f: + np.save(f, cache) + return cache @property def modality(self): - return self.modality + return self._modality @property def payload_index(self): - return self.payload_idx + return self._payload_idx @property def loading_path(self): return self._path - @property - def load(self): - return self._loading_method + def set_cache(self, data): + self._cache = 
data + + def load_cache(self, f_name): + with open(f_name, "rb") as f: + self.cache = np.load(f) + + def set_meta(self, key, value): + self._meta[key] = value + + def get_meta(self, key): + return self._meta[key] class TextPayload(Payload): From 15009fd033a63870d25f3e9c4db8eaa7679f7a06 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 13 Jun 2022 09:19:06 -0700 Subject: [PATCH 023/137] new reader test based on code changes --- tests/forte/data/readers/audio_reader_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index ee89e4c03..6aff427ab 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -14,6 +14,7 @@ """ Unit tests for AudioReader. """ +from email.mime import audio import os import unittest from typing import Dict @@ -47,9 +48,8 @@ def _process(self, input_pack: DataPack): # it follows the logic of loaidng while using # load audio using AudioPayload for audio_payload in input_pack.get(AudioPayload): - audio_data, sample_rate = audio_payload.load( - audio_payload.loading_path - ) + sample_rate = audio_payload.get_meta("sample_rate") + audio_data = audio_payload.offload_cache() required_sample_rate: int = 16000 if sample_rate != required_sample_rate: From c2b61dbed7e4b4868ad875d70e92cc5200c62ff1 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 13 Jun 2022 15:19:46 -0700 Subject: [PATCH 024/137] pass reader configs(reading method) to AudioPayload and AudioReadingMeta --- forte/data/readers/audio_reader.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index 647d4f3d8..34ec0d77f 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -20,7 +20,7 @@ from forte.data.data_pack import DataPack from forte.data.data_utils_io 
import dataset_path_iterator from forte.data.base_reader import PackReader -from forte.data.ontology.top import AudioPayload +from forte.data.ontology.top import AudioPayload, AudioReadingMeta __all__ = [ "AudioReader", @@ -30,7 +30,7 @@ class AudioReader(PackReader): r""":class:`AudioReader` is designed to read in audio files.""" - def __init__(self, audio_processing_meta): + def __init__(self): super().__init__() try: import soundfile # pylint: disable=import-outside-toplevel @@ -44,9 +44,8 @@ def __init__(self, audio_processing_meta): "https://pysoundfile.readthedocs.io/en/latest/#installation)." ) from e self.soundfile = soundfile - self.audio_processing_meta = audio_processing_meta - def _collect(self) -> Iterator[Any]: # type: ignore + def _collect(self, audio_directory) -> Iterator[Any]: # type: ignore r"""Should be called with param ``audio_directory`` which is a path to a folder containing audio files. @@ -55,9 +54,10 @@ def _collect(self) -> Iterator[Any]: # type: ignore Returns: Iterator over paths to audio files """ + # construct ImageMeta and store it in DataPack return dataset_path_iterator( - self.audio_processing_meta.audio_path, - self.audio_processing_meta.file_ext, + audio_directory, + self.configs.file_ext, ) def _cache_key_function(self, audio_file: str) -> str: @@ -66,10 +66,18 @@ def _cache_key_function(self, audio_file: str) -> str: def _parse_pack(self, file_path: str) -> Iterator[DataPack]: pack: DataPack = DataPack() + payload_idx = 0 # Read in audio data and store in DataPack # add audio payload into DataPack.payloads - ap = AudioPayload(pack, file_path, 0) - ap.set_loading_method(self.soundfile.read) + ap = AudioPayload(pack, file_path, payload_idx) + # audio_data, sample_rate = self.soundfile.read(file_path) + # ap.set_cache(audio_data) + # ap.set_meta("sample_rate", sample_rate) + for k, v in self.configs: + ap.set_meta(k, v) + meta = AudioReadingMeta(pack, payload_idx) + meta._module = self.configs.read_kwargs.module + 
meta._reading_method = self.configs.read_kwargs.method pack.pack_name = file_path yield pack From e2d8a7a2617a3992708b478a4dfdcfda84aa3f9c Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 13 Jun 2022 15:20:45 -0700 Subject: [PATCH 025/137] parse reading configs into a reading function --- tests/forte/data/readers/audio_reader_test.py | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index 6aff427ab..ee339bb1c 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -15,6 +15,7 @@ Unit tests for AudioReader. """ from email.mime import audio +import importlib import os import unittest from typing import Dict @@ -28,7 +29,7 @@ from forte.data.readers import AudioReader from forte.pipeline import Pipeline from forte.processors.base.pack_processor import PackProcessor -from forte.data.ontology.top import AudioProcessingMeta, AudioPayload +from forte.data.ontology.top import AudioReadingMeta, AudioPayload class TestASRProcessor(PackProcessor): @@ -45,11 +46,18 @@ def initialize(self, resources: Resources, configs: Config): self._model = Wav2Vec2ForCTC.from_pretrained(pretrained_model) def _process(self, input_pack: DataPack): + # it follows the logic of loaidng while using # load audio using AudioPayload - for audio_payload in input_pack.get(AudioPayload): - sample_rate = audio_payload.get_meta("sample_rate") - audio_data = audio_payload.offload_cache() + for audio_payload, audio_reading_meta in zip( + input_pack.get(AudioPayload), input_pack.get(AudioReadingMeta) + ): + audio_reading_meta + module = importlib.import_module(audio_reading_meta.module) + reading_method = getattr(module, audio_reading_meta.reading_method) + audio_data, sample_rate = reading_method(audio_payload.reading_path) + # sample_rate = audio_payload.get_meta("sample_rate") + # audio_data = 
audio_payload.offload_cache() required_sample_rate: int = 16000 if sample_rate != required_sample_rate: @@ -87,11 +95,12 @@ def setUp(self): "data_samples/audio_reader_test", ) ) - audio_processing_meta = AudioProcessingMeta(None, meta_name="audio") - audio_processing_meta.audio_path = self._test_audio_path # Define and config the Pipeline self._pipeline = Pipeline[DataPack]() - self._pipeline.set_reader(AudioReader(audio_processing_meta)) + self._pipeline.set_reader( + AudioReader(), + config={"read_kwargs": {"module": "soundfile", "method": "read"}}, + ) self._pipeline.add(TestASRProcessor()) self._pipeline.initialize() @@ -107,7 +116,7 @@ def test_asr_pipeline(self): } # Verify the ASR result of each datapack - for pack in self._pipeline.process_dataset(): + for pack in self._pipeline.process_dataset(self._test_audio_path): self.assertEqual(pack.text, target_transcription[pack.pack_name]) From 309c10262342286da045c0b3bb8712519075b7b6 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 13 Jun 2022 15:31:05 -0700 Subject: [PATCH 026/137] reconstruct reading meta which is bound to payload --- forte/data/ontology/top.py | 63 +++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index a00fbb8b4..ddfa8e43e 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -55,9 +55,9 @@ "TextPayload", "ImagePayload", "AudioPayload", - "Meta", - "ImageProcessingMeta", - "AudioProcessingMeta", + "ReadingMeta", + "ImageReadingMeta", + "AudioReadingMeta", ] QueryType = Union[Dict[str, Any], np.ndarray] @@ -1211,7 +1211,7 @@ def payload_index(self): return self._payload_idx @property - def loading_path(self): + def reading_path(self): return self._path def set_cache(self, data): @@ -1258,7 +1258,7 @@ def __init__( super().__init__(pack, "image", path, payload_idx) -class Meta(Entry): +class ReadingMeta(Entry): """ a Meta entry defines metadata related to data 
processing about reading from data source, loading data to cache, and writing to @@ -1268,16 +1268,31 @@ class Meta(Entry): Entry (_type_): _description_ """ - def __init__(self, pack: PackType, meta_name): + def __init__(self, pack: PackType, payload_index: int, meta_name): self._meta_name: Optional[str] = meta_name + self._payload_index = payload_index super().__init__(pack) + self._module = None + self._reading_method = None + + @property + def payload_index(self): + return self._payload_index @property def meta_name(self): return self._meta_name + @property + def module(self): + return self._module + + @property + def reading_method(self): + return self._reading_method + -class ImageProcessingMeta(Meta): +class ImageReadingMeta(ReadingMeta): def __init__(self, pack: PackType, meta_name: Optional[str] = None): if meta_name is None: meta_name = "jpg" @@ -1288,14 +1303,15 @@ def __init__(self, pack: PackType, meta_name: Optional[str] = None): self.type_code = "jpg" -class AudioProcessingMeta(Meta): +class AudioReadingMeta(ReadingMeta): """ - an AudioProcessingMeta entry defines metadata related to audio processing - about reading from data source, loading data to cache, and writing to - a target file. + An AudioReadingMeta entry defines metadata related to reading raw audio + from data source. It can be efficiently serialized and deserialized within + DataPack, and it can be further converted to loading method by using + loading method registry. It's bound to one payload. Args: - pack (PackType): The container that this AudioProcessingMeta will + pack (PackType): The container that this AudioReadingMeta will be added to. meta_name (Optional[str], optional): the name for the audio metadata. Defaults to "flac". @@ -1312,14 +1328,19 @@ class AudioProcessingMeta(Meta): # payload meta defines data source and user need to write a # reader for the data source. 
# def __init__(self, pack: PackType, meta_name: Optional[str] = None): - def __init__(self, pack: PackType, meta_name: Optional[str] = None): - if meta_name is None: - meta_name = "flac" - super().__init__(pack, meta_name=meta_name) - self.data_source_type = "disk" - self.pipeline_data_type = "nparray" - self.save_format = None - self.file_ext = "flac" + def __init__( + self, + pack: PackType, + payload_index: int, + meta_name: Optional[str] = "audio", + ): + + super().__init__(pack, payload_index, meta_name=meta_name) + # self.data_source_type = "disk" + # self.save_format = None + # self.file_ext = "flac" + self._module = None + self._reading_method = None SinglePackEntries = ( @@ -1330,6 +1351,6 @@ def __init__(self, pack: PackType, meta_name: Optional[str] = None): AudioAnnotation, ImageAnnotation, Payload, - Meta, + ReadingMeta, ) MultiPackEntries = (MultiPackLink, MultiPackGroup, MultiPackGeneric) From 1777c9e1d144983a2def41f898c908e49394b242 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 13 Jun 2022 18:18:37 -0700 Subject: [PATCH 027/137] rewrite set text function and set audio function --- forte/data/data_pack.py | 99 +++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 34 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 376d750db..acf13c0ab 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -46,14 +46,17 @@ from forte.data.ontology.core import EntryType from forte.data.ontology.top import ( Annotation, + AudioPayload, Link, Group, + ReadingMeta, SinglePackEntries, Generics, AudioAnnotation, ImageAnnotation, Grids, Payload, + TextPayload, ) from forte.data.span import Span from forte.data.types import ReplaceOperationsType, DataRequest @@ -245,14 +248,14 @@ def _validate(self, entry: EntryType) -> bool: return isinstance(entry, SinglePackEntries) @property - def text(self) -> str: + def text(self, text_payload_index=0) -> str: r"""Return the text of the data pack""" - return 
self._text + return self.get_single(TextPayload, text_payload_index).cache @property - def audio(self) -> Optional[np.ndarray]: + def audio(self, audio_payload_index=0) -> Optional[np.ndarray]: r"""Return the audio of the data pack""" - return self._audio + return self.get_single(AudioPayload, audio_payload_index).cache @property def sample_rate(self) -> Optional[int]: @@ -454,7 +457,7 @@ def groups(self): def groups(self, val): self._groups = val - def get_span_text(self, begin: int, end: int) -> str: + def get_span_text(self, begin: int, end: int, text_payload_index=0) -> str: r"""Get the text in the data pack contained in the span. Args: @@ -464,7 +467,7 @@ def get_span_text(self, begin: int, end: int) -> str: Returns: The text within this span. """ - return self._text[begin:end] + return self.get_single(TextPayload, text_payload_index).cache[begin:end] def get_span_audio(self, begin: int, end: int) -> np.ndarray: r"""Get the audio in the data pack contained in the span. @@ -479,12 +482,20 @@ def get_span_audio(self, begin: int, end: int) -> np.ndarray: Returns: The audio within this span. """ - if self._audio is None: + audio_payload_entries = list(self.get(AudioPayload)) + # if self.pack.get(AudioPayload) is None: + # raise ProcessExecutionException( + # "The audio payload of this DataPack is not set. Please call" + # " method `set_audio` before running `get_span_audio`." + # ) + if len(audio_payload_entries) == 0: raise ProcessExecutionException( "The audio payload of this DataPack is not set. Please call" " method `set_audio` before running `get_span_audio`." 
) - return self._audio[begin:end] + + audio = audio_payload_entries[0].cache + return audio[begin:end] def get_image_array(self, image_payload_idx: int): if image_payload_idx >= len(self.payloads): @@ -498,29 +509,35 @@ def set_text( self, text: str, replace_func: Optional[Callable[[str], ReplaceOperationsType]] = None, + text_payload_index: Optional[int] = None, ): - - if len(text) < len(self._text): - raise ProcessExecutionException( - "The new text is overwriting the original one with shorter " - "length, which might cause unexpected behavior." - ) - - if len(self._text): - logging.warning( - "Need to be cautious when changing the text of a " - "data pack, existing entries may get affected. " - ) + # if len(text) < len(self._text): + # raise ProcessExecutionException( + # "The new text is overwriting the original one with shorter " + # "length, which might cause unexpected behavior." + # ) + + # if len(self._text): + # logging.warning( + # "Need to be cautious when changing the text of a " + # "data pack, existing entries may get affected. " + # ) span_ops = [] if replace_func is None else replace_func(text) # The spans should be mutually exclusive ( - self._text, - self.__replace_back_operations, - self.__processed_original_spans, - self.__orig_text_len, + text, + replace_back_operations, + processed_original_spans, + orig_text_len, ) = data_utils_io.modify_text_and_track_ops(text, span_ops) + tp = TextPayload(self, 0) + tp.set_cache(text) + + tp.set_meta("replace_back_operations", replace_back_operations) + tp.set_meta("processed_original_spans", processed_original_spans) + tp.set_meta("orig_text_len", orig_text_len) def set_audio(self, audio: np.ndarray, sample_rate: int): r"""Set the audio payload and sample rate of the :class:`~forte.data.data_pack.DataPack` @@ -530,18 +547,20 @@ def set_audio(self, audio: np.ndarray, sample_rate: int): audio: A numpy array storing the audio waveform. sample_rate: An integer specifying the sample rate. 
""" - self._audio = audio - self.set_meta(sample_rate=sample_rate) + ap = AudioPayload(self, 0) + ap.set_cache(audio) + ap.set_meta("sample_rate", sample_rate) - def get_original_text(self): + def get_original_text(self, text_payload_index=0): r"""Get original unmodified text from the :class:`~forte.data.data_pack.DataPack` object. Returns: Original text after applying the `replace_back_operations` of :class:`~forte.data.data_pack.DataPack` object to the modified text """ + tp = self.get_single(TextPayload, text_payload_index) original_text, _, _, _ = data_utils_io.modify_text_and_track_ops( - self._text, self.__replace_back_operations + tp.cache, tp.get_meta("replace_back_operations") ) return original_text @@ -810,6 +829,7 @@ def get_data( context_type: Union[str, Type[Annotation], Type[AudioAnnotation]], request: Optional[DataRequest] = None, skip_k: int = 0, + payload_index=0, ) -> Iterator[Dict[str, Any]]: r"""Fetch data from entries in the data_pack of type `context_type`. Data includes `"span"`, annotation-specific @@ -972,7 +992,7 @@ def get_annotation_list( " [Annotation, AudioAnnotation]." ) - def get_context_data(c_type, context): + def get_context_data(c_type, context, payload_index): r"""Get context-specific data of a given context type and context. @@ -991,9 +1011,13 @@ def get_context_data(c_type, context): str: context data. 
""" if issubclass(c_type, Annotation): - return self.text[context.begin : context.end] + return self.get_single(TextPayload, payload_index).cache[ + context.begin : context.end + ] elif issubclass(c_type, AudioAnnotation): - return self.audio[context.begin : context.end] + return self.get_single(AudioPayload, payload_index).cache[ + context.begin : context.end + ] else: raise NotImplementedError( f"Context type is set to {context_type}" @@ -1011,7 +1035,9 @@ def get_context_data(c_type, context): skipped += 1 continue data: Dict[str, Any] = {} - data["context"] = get_context_data(context_type_, context) + data["context"] = get_context_data( + context_type_, context, payload_index + ) data["offset"] = context.begin for field in context_fields: @@ -2029,14 +2055,19 @@ def save_entry_object( tid=entry.tid, allow_duplicate=allow_duplicate, ) - elif isinstance(entry, Meta): - data_store_ref.add_meta_raw( + elif isinstance(entry, ReadingMeta): + data_store_ref.add_reading_meta_raw( type_name=entry.entry_type(), meta_name=entry.meta_name, tid=entry.tid, allow_duplicate=allow_duplicate, ) else: + import pdb + + pdb.set_trace() + print("") + raise ValueError( f"Invalid entry type {type(entry)}. 
A valid entry " f"should be an instance of Annotation, Link, Group, Generics " From 0f0fd7566c2d4309b89571841d8ccb364c6a9f9a Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 14 Jun 2022 17:11:01 -0700 Subject: [PATCH 028/137] clean up unnecessary class variables and add TextReadingMeta --- forte/data/ontology/top.py | 106 +++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 51 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index ddfa8e43e..84e939d11 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -58,6 +58,7 @@ "ReadingMeta", "ImageReadingMeta", "AudioReadingMeta", + "TextReadingMeta", ] QueryType = Union[Dict[str, Any], np.ndarray] @@ -1180,27 +1181,32 @@ def __init__( self, pack: PackType, modality: str, - path: str, payload_idx: int, + uri: str = None, ): + supported_modality = ("text", "audio", "image") + if modality not in supported_modality: + raise ValueError( + f"The given modality {modality} is not supported. " + f"Currently we only support {supported_modality}" + ) self._payload_idx = payload_idx self._modality = modality - self._path = path + self._uri = uri super().__init__(pack) self._cache = None - self._meta = {} - - def get_data(self): - return self.offload_cache() + self.meta = None - def offload_cache(self, f_name=None): - cache = self._cache - self._cache = None - if f_name is not None: - with open(f_name, "wb") as f: - np.save(f, cache) - return cache + @property + def cache(self): + if self._cache is None: + raise ValueError( + "Payload doesn't have a cache." 
+ "Please set the reader config `lazy_read` to False" + "or manually load it by set_cache() " + ) + return self._cache @property def modality(self): @@ -1211,41 +1217,36 @@ def payload_index(self): return self._payload_idx @property - def reading_path(self): - return self._path + def uri(self): + return self._uri def set_cache(self, data): self._cache = data - def load_cache(self, f_name): - with open(f_name, "rb") as f: - self.cache = np.load(f) - - def set_meta(self, key, value): - self._meta[key] = value - - def get_meta(self, key): - return self._meta[key] - class TextPayload(Payload): def __init__( self, pack: PackType, - path: str, payload_idx: int, + path: str = None, ): - super().__init__(pack, "text", path, payload_idx) + + super().__init__(pack, "text", payload_idx, path) class AudioPayload(Payload): def __init__( self, pack: PackType, - path: str, payload_idx: int, + path: str = None, ): - super().__init__(pack, "audio", path, payload_idx=payload_idx) + + super().__init__(pack, "audio", payload_idx, path) + + def audio_len(self): + return len(self._cache) class ImagePayload(Payload): @@ -1255,7 +1256,7 @@ def __init__( path: str, payload_idx: int, ): - super().__init__(pack, "image", path, payload_idx) + super().__init__(pack, "image", payload_idx, path) class ReadingMeta(Entry): @@ -1268,28 +1269,32 @@ class ReadingMeta(Entry): Entry (_type_): _description_ """ - def __init__(self, pack: PackType, payload_index: int, meta_name): + def __init__(self, pack: PackType, meta_name): self._meta_name: Optional[str] = meta_name - self._payload_index = payload_index super().__init__(pack) - self._module = None - self._reading_method = None - - @property - def payload_index(self): - return self._payload_index @property def meta_name(self): return self._meta_name - @property - def module(self): - return self._module - @property - def reading_method(self): - return self._reading_method +class TextReadingMeta(ReadingMeta): + """ + a text meta entry defines metadata 
related to text data reading from + data source. + + Args: + Entry (_type_): _description_ + + Returns: + _type_: _description_ + """ + + def __init__(self, pack: PackType, meta_name: Optional[str] = None): + super().__init__(pack, meta_name=meta_name) + self.replace_back_operations = None + self.processed_original_spans = None + self.orig_text_len = None class ImageReadingMeta(ReadingMeta): @@ -1331,16 +1336,15 @@ class AudioReadingMeta(ReadingMeta): def __init__( self, pack: PackType, - payload_index: int, + sample_rate: Optional[int] = None, meta_name: Optional[str] = "audio", ): + super().__init__(pack, meta_name=meta_name) + self._sample_rate = sample_rate - super().__init__(pack, payload_index, meta_name=meta_name) - # self.data_source_type = "disk" - # self.save_format = None - # self.file_ext = "flac" - self._module = None - self._reading_method = None + @property + def sample_rate(self): + return self._sample_rate SinglePackEntries = ( From 9ded0d7a3043009512d4e5ac330e4dcd39127b32 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 14 Jun 2022 20:57:20 -0700 Subject: [PATCH 029/137] test ImagePayload --- tests/forte/image_annotation_test.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/tests/forte/image_annotation_test.py b/tests/forte/image_annotation_test.py index 7d2233498..c5f846c5c 100644 --- a/tests/forte/image_annotation_test.py +++ b/tests/forte/image_annotation_test.py @@ -20,8 +20,17 @@ from typing import Dict from numpy import array_equal -from forte.data.ontology.top import ImageAnnotation +from forte.data.ontology.top import ( + ImageAnnotation, + ImagePayload, + ImageReadingMeta, + Payload, +) from forte.data.data_pack import DataPack +import importlib +import os +import unittest +from typing import Dict class ImageAnnotationTest(unittest.TestCase): @@ -35,16 +44,17 @@ def setUp(self): self.line[2, 2] = 1 self.line[3, 3] = 1 self.line[4, 4] = 1 - self.datapack.payloads.append(self.line) - 
self.datapack.image_annotations.append( - ImageAnnotation(self.datapack, 0) - ) + ip = ImagePayload(self.datapack, 0) + ip.set_cache(self.line) + ImageAnnotation(self.datapack, 0) def test_image_annotation(self): self.assertEqual( - self.datapack.image_annotations[0].image_payload_idx, 0 + self.datapack.get_single(ImageAnnotation, 0).image_payload_idx, 0 ) self.assertTrue( - array_equal(self.datapack.image_annotations[0].image, self.line) + array_equal( + self.datapack.get_single(ImagePayload, 0).cache, self.line + ) ) From 96acb05856e537f386546573d78e97ff42c626f0 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 09:12:59 -0700 Subject: [PATCH 030/137] get_single with payload index --- forte/data/base_pack.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 94081acca..89c89ace2 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -468,7 +468,9 @@ def get( """ raise NotImplementedError - def get_single(self, entry_type: Union[str, Type[EntryType]]) -> EntryType: + def get_single( + self, entry_type: Union[str, Type[EntryType]], payload_index=0 + ) -> EntryType: r"""Take a single entry of type :attr:`~forte.data.data_pack.DataPack.entry_type` from this data pack. This is useful when the target entry type appears only one @@ -481,9 +483,19 @@ def get_single(self, entry_type: Union[str, Type[EntryType]]) -> EntryType: Returns: A single data entry. """ - for a in self.get(entry_type): - return a - + idx = -1 + for idx, a in enumerate(self.get(entry_type)): + if idx == payload_index: + return a + if idx < payload_index: + if idx == -1: + raise EntryNotFoundError( + f"There is no {entry_type} in the provided pack." + ) + raise EntryNotFoundError( + f"The payload index {payload_index} is larger than maximum" + f" {entry_type} index {idx} in the provided pack." 
+ ) raise EntryNotFoundError( f"The entry {entry_type} is not found in the provided pack." ) From e9ac30552b9fdbe592ec5618b7f7c4dd2ae6aff3 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 09:15:09 -0700 Subject: [PATCH 031/137] keep reading meta in the payload --- forte/data/data_pack.py | 102 +++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index acf13c0ab..b35f1e8ec 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -34,6 +34,7 @@ from sortedcontainers import SortedList from forte.common.exception import ( + EntryNotFoundError, ProcessExecutionException, UnknownOntologyClassException, ) @@ -47,6 +48,8 @@ from forte.data.ontology.top import ( Annotation, AudioPayload, + AudioReadingMeta, + ImagePayload, Link, Group, ReadingMeta, @@ -57,6 +60,7 @@ Grids, Payload, TextPayload, + TextReadingMeta, ) from forte.data.span import Span from forte.data.types import ReplaceOperationsType, DataRequest @@ -469,7 +473,9 @@ def get_span_text(self, begin: int, end: int, text_payload_index=0) -> str: """ return self.get_single(TextPayload, text_payload_index).cache[begin:end] - def get_span_audio(self, begin: int, end: int) -> np.ndarray: + def get_span_audio( + self, begin: int, end: int, audio_payload_index=0 + ) -> np.ndarray: r"""Get the audio in the data pack contained in the span. `begin` and `end` represent the starting and ending indices of the span in audio payload respectively. Each index corresponds to one sample in @@ -482,47 +488,54 @@ def get_span_audio(self, begin: int, end: int) -> np.ndarray: Returns: The audio within this span. """ - audio_payload_entries = list(self.get(AudioPayload)) - # if self.pack.get(AudioPayload) is None: - # raise ProcessExecutionException( - # "The audio payload of this DataPack is not set. Please call" - # " method `set_audio` before running `get_span_audio`." 
- # ) - if len(audio_payload_entries) == 0: + try: + audio_payload_entry = self.get_single( + AudioPayload, audio_payload_index + ) + except EntryNotFoundError: raise ProcessExecutionException( - "The audio payload of this DataPack is not set. Please call" - " method `set_audio` before running `get_span_audio`." + "The audio payload of this DataPack at index" + f"({audio_payload_index}) is not set. Please add" + " more AudioPayload in this DataPack" ) - - audio = audio_payload_entries[0].cache - return audio[begin:end] + return audio_payload_entry.cache[begin:end] def get_image_array(self, image_payload_idx: int): - if image_payload_idx >= len(self.payloads): - raise ValueError( - f"The input image payload index{(image_payload_idx)}" - f" out of range. It should be less than {len(self.payloads)}" + """ + Get the image data in the data pack's payload with specificied image + payload index. + + Args: + image_payload_idx: the index of the image payload. + + Returns: + a numpy array representing the image. + """ + try: + image_arr = self.get_single(ImagePayload, image_payload_idx).cache + except EntryNotFoundError: + raise ProcessExecutionException( + "The image payload of this DataPack at index" + f"({image_payload_idx}) is not set. Please add" + " more ImagePayload in this DataPack" ) - return self.payloads[image_payload_idx] + return image_arr def set_text( self, text: str, replace_func: Optional[Callable[[str], ReplaceOperationsType]] = None, - text_payload_index: Optional[int] = None, + text_payload_index: int = 0, ): - # if len(text) < len(self._text): - # raise ProcessExecutionException( - # "The new text is overwriting the original one with shorter " - # "length, which might cause unexpected behavior." - # ) - - # if len(self._text): - # logging.warning( - # "Need to be cautious when changing the text of a " - # "data pack, existing entries may get affected. " - # ) + """ + Set text for TextPayload at a specified index. + Args: + text: a str text. 
+ replace_func: function that replace text. Defaults to None. + text_payload_index (int, optional): TextPayload index in the + DataPack. Defaults to 0. + """ span_ops = [] if replace_func is None else replace_func(text) # The spans should be mutually exclusive @@ -532,14 +545,22 @@ def set_text( processed_original_spans, orig_text_len, ) = data_utils_io.modify_text_and_track_ops(text, span_ops) - tp = TextPayload(self, 0) + + tp = TextPayload(self, text_payload_index) + tp.set_cache(text) + tp.meta = TextReadingMeta(self) - tp.set_meta("replace_back_operations", replace_back_operations) - tp.set_meta("processed_original_spans", processed_original_spans) - tp.set_meta("orig_text_len", orig_text_len) + tp.meta.replace_back_operations = replace_back_operations + tp.meta.processed_original_spans = processed_original_spans + tp.meta.orig_text_len = orig_text_len - def set_audio(self, audio: np.ndarray, sample_rate: int): + def set_audio( + self, + audio: np.ndarray, + sample_rate: int, + audio_payload_index: int = 0, + ): r"""Set the audio payload and sample rate of the :class:`~forte.data.data_pack.DataPack` object. @@ -547,9 +568,10 @@ def set_audio(self, audio: np.ndarray, sample_rate: int): audio: A numpy array storing the audio waveform. sample_rate: An integer specifying the sample rate. """ - ap = AudioPayload(self, 0) + ap = AudioPayload(self, audio_payload_index) ap.set_cache(audio) - ap.set_meta("sample_rate", sample_rate) + ap.meta = AudioReadingMeta(self) + ap.meta.sample_rate = sample_rate def get_original_text(self, text_payload_index=0): r"""Get original unmodified text from the :class:`~forte.data.data_pack.DataPack` object. 
@@ -2058,16 +2080,10 @@ def save_entry_object( elif isinstance(entry, ReadingMeta): data_store_ref.add_reading_meta_raw( type_name=entry.entry_type(), - meta_name=entry.meta_name, tid=entry.tid, allow_duplicate=allow_duplicate, ) else: - import pdb - - pdb.set_trace() - print("") - raise ValueError( f"Invalid entry type {type(entry)}. A valid entry " f"should be an instance of Annotation, Link, Group, Generics " From 8e6e8a5818085df57620e91c9e4f3a64a0a4f8fe Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 09:17:38 -0700 Subject: [PATCH 032/137] initialize metadata in payload --- forte/data/readers/audio_reader.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index 34ec0d77f..28cd05ddd 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -65,19 +65,14 @@ def _cache_key_function(self, audio_file: str) -> str: def _parse_pack(self, file_path: str) -> Iterator[DataPack]: pack: DataPack = DataPack() - payload_idx = 0 # Read in audio data and store in DataPack # add audio payload into DataPack.payloads - ap = AudioPayload(pack, file_path, payload_idx) - # audio_data, sample_rate = self.soundfile.read(file_path) - # ap.set_cache(audio_data) - # ap.set_meta("sample_rate", sample_rate) - for k, v in self.configs: - ap.set_meta(k, v) - meta = AudioReadingMeta(pack, payload_idx) - meta._module = self.configs.read_kwargs.module - meta._reading_method = self.configs.read_kwargs.method + ap = AudioPayload(pack, payload_idx, file_path) + if not self.configs.lazy_read: + audio_data, sample_rate = self.soundfile.read(file_path) + ap.set_cache(audio_data) + ap.meta = AudioReadingMeta(pack, sample_rate) pack.pack_name = file_path yield pack @@ -97,4 +92,4 @@ def default_configs(cls): Returns: The default configuration of audio reader. 
""" - return {"file_ext": ".flac", "read_kwargs": None} + return {"file_ext": ".flac", "lazy_read": False, "read_kwargs": None} From 528a789ceb6e3e6ac66bfc0b6c7ee83ea9667a02 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 09:18:23 -0700 Subject: [PATCH 033/137] ReadingMeta datastore --- forte/data/data_store.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 84de44ca6..54a495ff2 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -30,7 +30,7 @@ Link, Generics, Payload, - Meta, + ReadingMeta, ) from forte.data.ontology.core import Entry, FList, FDict from forte.common import constants @@ -688,17 +688,13 @@ def _new_grid( return entry - def _new_meta( - self, type_name: str, meta_name: str, tid: Optional[int] = None - ) -> List: + def _new_meta(self, type_name: str, tid: Optional[int] = None) -> List: r"""This function generates a new grid with default fields. Called by add_grid_raw() to create a new grid - with ``type_name``, ``meta_name`` and optional ``tid``. - + with ``type_name``, and optional ``tid``. Args: type_name: The fully qualified type name of the new entry. - meta_name: The name of the Meta entry. Returns: A list representing a new grid type entry data. 
@@ -707,7 +703,7 @@ def _new_meta( tid: int = self._new_tid() if tid is None else tid entry: List[Any] - entry = [meta_name, None, tid, type_name] + entry = [None, None, tid, type_name] entry += self._default_attributes_for_type(type_name) return entry @@ -965,7 +961,7 @@ def _add_entry_raw( ImageAnnotation, Grids, Payload, - Meta, + ReadingMeta, ]: try: self.__elements[type_name].append(entry) @@ -1211,10 +1207,9 @@ def add_grid_raw( return tid_search_result return self._add_entry_raw(Grids, type_name, entry) - def add_meta_raw( + def add_reading_meta_raw( self, type_name: str, - meta_name: str, tid: Optional[int] = None, allow_duplicate=True, ) -> int: @@ -1243,13 +1238,13 @@ def add_meta_raw( # annotation type entry data with default fields. # A reference to the entry should be store in both self.__elements and # self.__tid_ref_dict. - entry = self._new_meta(type_name, meta_name, tid) + entry = self._new_meta(type_name, tid) if not allow_duplicate: tid_search_result = self._get_existing_ann_entry_tid(entry) # if found existing entry if tid_search_result != -1: return tid_search_result - return self._add_entry_raw(Meta, type_name, entry) + return self._add_entry_raw(ReadingMeta, type_name, entry) def _get_existing_ann_entry_tid(self, entry: List[Any]): r""" From 0dbba6600a82a609e648af0b128e0ddab6074d98 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 09:19:20 -0700 Subject: [PATCH 034/137] remove meta_name --- forte/data/ontology/top.py | 58 ++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 84e939d11..7c58ae878 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1177,6 +1177,20 @@ def __init__( class Payload(Entry): + """ + A payload class that holds data cache of one modality and its data source uri. + + Args: + pack: The container that this `Payload` will + be added to. 
+ modality: modality of the payload such as text, audio and image. + payload_idx: the index of the payload. + uri: universal resource identifier of the data source. Defaults to None. + + Raises: + ValueError: raised when the modality is not supported. + """ + def __init__( self, pack: PackType, @@ -1229,9 +1243,8 @@ def __init__( self, pack: PackType, payload_idx: int, - path: str = None, + path: Optional[str] = None, ): - super().__init__(pack, "text", payload_idx, path) @@ -1240,9 +1253,8 @@ def __init__( self, pack: PackType, payload_idx: int, - path: str = None, + path: Optional[str] = None, ): - super().__init__(pack, "audio", payload_idx, path) def audio_len(self): @@ -1253,8 +1265,8 @@ class ImagePayload(Payload): def __init__( self, pack: PackType, - path: str, payload_idx: int, + path: Optional[str] = None, ): super().__init__(pack, "image", payload_idx, path) @@ -1266,17 +1278,13 @@ class ReadingMeta(Entry): a target file. Args: - Entry (_type_): _description_ + pack: The container that this `ReadingMeta` will + be added to. 
""" - def __init__(self, pack: PackType, meta_name): - self._meta_name: Optional[str] = meta_name + def __init__(self, pack: PackType): super().__init__(pack) - @property - def meta_name(self): - return self._meta_name - class TextReadingMeta(ReadingMeta): """ @@ -1290,18 +1298,16 @@ class TextReadingMeta(ReadingMeta): _type_: _description_ """ - def __init__(self, pack: PackType, meta_name: Optional[str] = None): - super().__init__(pack, meta_name=meta_name) + def __init__(self, pack: PackType): + super().__init__(pack) self.replace_back_operations = None self.processed_original_spans = None self.orig_text_len = None class ImageReadingMeta(ReadingMeta): - def __init__(self, pack: PackType, meta_name: Optional[str] = None): - if meta_name is None: - meta_name = "jpg" - super().__init__(pack, meta_name) + def __init__(self, pack: PackType): + super().__init__(pack) self.data_source_type = "disk" self.pipeline_data_type = "nparray" self.save_format = None @@ -1318,28 +1324,14 @@ class AudioReadingMeta(ReadingMeta): Args: pack (PackType): The container that this AudioReadingMeta will be added to. - meta_name (Optional[str], optional): the name for the audio metadata. - Defaults to "flac". """ - # a Meta data entry object that define metadata related to image processing - # both reading from data source, loaded format and writing format - # for example, we might want to read a high resolution png image - # and load it as a numpy array and write it into jpg format. - - # it determines what third-party packages to use to convert image to target - # data format - - # payload meta defines data source and user need to write a - # reader for the data source. 
- # def __init__(self, pack: PackType, meta_name: Optional[str] = None): def __init__( self, pack: PackType, sample_rate: Optional[int] = None, - meta_name: Optional[str] = "audio", ): - super().__init__(pack, meta_name=meta_name) + super().__init__(pack) self._sample_rate = sample_rate @property From 595a6b4f7f6dda37dc33009f7ab54c84b2b62a5f Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 15:12:35 -0700 Subject: [PATCH 035/137] payload_index -> entry_index --- forte/data/base_pack.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 89c89ace2..1045424fc 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -469,13 +469,12 @@ def get( raise NotImplementedError def get_single( - self, entry_type: Union[str, Type[EntryType]], payload_index=0 + self, entry_type: Union[str, Type[EntryType]], entry_index=0 ) -> EntryType: r"""Take a single entry of type :attr:`~forte.data.data_pack.DataPack.entry_type` from this data - pack. This is useful when the target entry type appears only one - time in the :class:`~forte.data.data_pack.DataPack` for e.g., a Document entry. Or you just - intended to take the first one. + pack. This is useful when you want to take an entry at a specific index or the target entry type appears only one + time in the :class:`~forte.data.data_pack.DataPack` for e.g., a Document entry. Args: entry_type: The entry type to be retrieved. @@ -485,16 +484,17 @@ def get_single( """ idx = -1 for idx, a in enumerate(self.get(entry_type)): - if idx == payload_index: + if idx == entry_index: return a - if idx < payload_index: - if idx == -1: - raise EntryNotFoundError( - f"There is no {entry_type} in the provided pack." - ) + + if idx == -1: + raise EntryNotFoundError( + f"There is no {entry_type} in the provided pack." 
+ ) + if idx < entry_index: raise EntryNotFoundError( - f"The payload index {payload_index} is larger than maximum" - f" {entry_type} index {idx} in the provided pack." + f"The entry index {entry_index} is larger than maximum" + f" {entry_type} index ({idx}) in the provided pack." ) raise EntryNotFoundError( f"The entry {entry_type} is not found in the provided pack." From 9644b64a8225df5c8e4be1f905a24591859302de Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 15:13:47 -0700 Subject: [PATCH 036/137] docstring for get_single --- forte/data/base_pack.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 1045424fc..6eb14f312 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -469,7 +469,7 @@ def get( raise NotImplementedError def get_single( - self, entry_type: Union[str, Type[EntryType]], entry_index=0 + self, entry_type: Union[str, Type[EntryType]], entry_index: int = 0 ) -> EntryType: r"""Take a single entry of type :attr:`~forte.data.data_pack.DataPack.entry_type` from this data @@ -478,6 +478,7 @@ def get_single( Args: entry_type: The entry type to be retrieved. + entry_index: the index of the entry requested to get. Returns: A single data entry. 
From 55d4e0ac5e4f39c2216349cb71f6191b6d8279dd Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 15:30:58 -0700 Subject: [PATCH 037/137] add payload ontology --- forte/ontology_specs/payload_ontology.json | 61 ++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 forte/ontology_specs/payload_ontology.json diff --git a/forte/ontology_specs/payload_ontology.json b/forte/ontology_specs/payload_ontology.json new file mode 100644 index 000000000..e102a7de5 --- /dev/null +++ b/forte/ontology_specs/payload_ontology.json @@ -0,0 +1,61 @@ +{ + "name": "payload_ontology", + "definitions": [ + { + "entry_name": "ft.onto.payload_ontology.AudioPayload", + "parent_entry": "forte.data.ontology.top.Payload", + "description": "A payload that caches audio data", + "attributes":[] + }, + { + "entry_name": "ft.onto.payload_ontology.TextPayload", + "parent_entry": "forte.data.ontology.top.Payload", + "description": "A payload that caches text data", + "attributes": [ + { + "name": "replace_back_operations", + "type": "List", + "item_type": "typing.Tuple" + }, + { + "name": "processed_original_spans", + "type": "List", + "item_type": "typing.Tuple" + }, + { + "name": "orig_text_len", + "type": "int" + } + ] + }, + { + "entry_name": "ft.onto.payload_ontology.ImagePayload", + "parent_entry": "forte.data.ontology.top.Payload", + "description": "A payload that caches image data", + "attributes":[] + }, + { + "entry_name": "ft.onto.payload_ontology.TextReadingMeta", + "parent_entry": "forte.data.ontology.top.ReadingMeta", + "description": "A reading meta entry data that records meta data related to reading text", + "attributes": [] + }, + { + "entry_name": "ft.onto.payload_ontology.ImageReadingMeta", + "parent_entry": "forte.data.ontology.top.ReadingMeta", + "description": "A ImageReadingMeta entry data that records meta data related to reading image from data source.", + "attributes":[] + }, + { + "entry_name": "ft.onto.payload_ontology.AudioReadingMeta", 
+ "parent_entry": "forte.data.ontology.top.ReadingMeta", + "description": "An AudioReadingMeta entry defines metadata related to reading raw audio from data source. It can be efficiently serialized and deserialized within DataPack, and it can be further converted to loading method by using loading method registry. It's bound to one payload.", + "attributes": [ + { + "name": "sample_rate", + "type": "int" + } + ] + } + ] +} From d456417b0d8748c601b3f453352702592c2c9eb7 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 21:02:54 -0700 Subject: [PATCH 038/137] remote text, audio property and get_image_array function and remove ReadingMeta --- forte/data/data_pack.py | 89 ++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index b35f1e8ec..3da47af5d 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -48,11 +48,9 @@ from forte.data.ontology.top import ( Annotation, AudioPayload, - AudioReadingMeta, ImagePayload, Link, Group, - ReadingMeta, SinglePackEntries, Generics, AudioAnnotation, @@ -60,8 +58,8 @@ Grids, Payload, TextPayload, - TextReadingMeta, ) + from forte.data.span import Span from forte.data.types import ReplaceOperationsType, DataRequest from forte.utils import get_class, get_full_module_name @@ -251,21 +249,6 @@ def _init_meta(self, pack_name: Optional[str] = None) -> Meta: def _validate(self, entry: EntryType) -> bool: return isinstance(entry, SinglePackEntries) - @property - def text(self, text_payload_index=0) -> str: - r"""Return the text of the data pack""" - return self.get_single(TextPayload, text_payload_index).cache - - @property - def audio(self, audio_payload_index=0) -> Optional[np.ndarray]: - r"""Return the audio of the data pack""" - return self.get_single(AudioPayload, audio_payload_index).cache - - @property - def sample_rate(self) -> Optional[int]: - r"""Return the sample rate of the audio data""" - return 
getattr(self._meta, "sample_rate") - @property def all_annotations(self) -> Iterator[Annotation]: """ @@ -461,12 +444,45 @@ def groups(self): def groups(self, val): self._groups = val - def get_span_text(self, begin: int, end: int, text_payload_index=0) -> str: + def get_payload_at( + self, modality: str, payload_index: int + ) -> Union[str, np.ndarray]: + """ + Get Payload of requested modality at the requested payload index. + + Args: + modality: data modality among "text", "audio", "image" + payload_index (int): the index of the requested payload + + Raises: + ValueError: raised when the requested modality is not supported. + + Returns: + str data for text data or numpy array for image and audio data. + """ + supported_modality = ("text", "audio", "image") + if modality == "text": + return self.get_single(TextPayload, payload_index) + elif modality == "audio": + return self.get_single(AudioPayload, payload_index) + elif modality == "image": + return self.get_single(ImagePayload, payload_index) + else: + raise ValueError( + f"Provided modality {modality} is not supported." + "Please provide one of modality among" + f" {supported_modality}." + ) + + def get_span_text( + self, begin: int, end: int, text_payload_index: int = 0 + ) -> str: r"""Get the text in the data pack contained in the span. Args: begin: begin index to query. end: end index to query. + text_payload_index: entry index of text payload in this DataPack. Returns: The text within this span. @@ -484,6 +500,7 @@ def get_span_audio( Args: begin: begin index to query. end: end index to query. + audio_payload_index: entry index of audio payload in this DataPack. Returns: The audio within this span. @@ -500,27 +517,6 @@ def get_span_audio( ) return audio_payload_entry.cache[begin:end] - def get_image_array(self, image_payload_idx: int): - """ - Get the image data in the data pack's payload with specificied image - payload index. - - Args: - image_payload_idx: the index of the image payload. 
- - Returns: - a numpy array representing the image. - """ - try: - image_arr = self.get_single(ImagePayload, image_payload_idx).cache - except EntryNotFoundError: - raise ProcessExecutionException( - "The image payload of this DataPack at index" - f"({image_payload_idx}) is not set. Please add" - " more ImagePayload in this DataPack" - ) - return image_arr - def set_text( self, text: str, @@ -549,11 +545,11 @@ def set_text( tp = TextPayload(self, text_payload_index) tp.set_cache(text) - tp.meta = TextReadingMeta(self) + tp.meta = Generics(self) - tp.meta.replace_back_operations = replace_back_operations - tp.meta.processed_original_spans = processed_original_spans - tp.meta.orig_text_len = orig_text_len + tp.replace_back_operations = replace_back_operations + tp.processed_original_spans = processed_original_spans + tp.orig_text_len = orig_text_len def set_audio( self, @@ -573,9 +569,12 @@ def set_audio( ap.meta = AudioReadingMeta(self) ap.meta.sample_rate = sample_rate - def get_original_text(self, text_payload_index=0): + def get_original_text(self, text_payload_index: int = 0): r"""Get original unmodified text from the :class:`~forte.data.data_pack.DataPack` object. 
+ Args: + text + Returns: Original text after applying the `replace_back_operations` of :class:`~forte.data.data_pack.DataPack` object to the modified text From 9196099a028e2af1b5909a53a2557cd96b2ebc11 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 21:03:43 -0700 Subject: [PATCH 039/137] remove payload ontology --- forte/ontology_specs/payload_ontology.json | 61 ---------------------- 1 file changed, 61 deletions(-) delete mode 100644 forte/ontology_specs/payload_ontology.json diff --git a/forte/ontology_specs/payload_ontology.json b/forte/ontology_specs/payload_ontology.json deleted file mode 100644 index e102a7de5..000000000 --- a/forte/ontology_specs/payload_ontology.json +++ /dev/null @@ -1,61 +0,0 @@ -{ - "name": "payload_ontology", - "definitions": [ - { - "entry_name": "ft.onto.payload_ontology.AudioPayload", - "parent_entry": "forte.data.ontology.top.Payload", - "description": "A payload that caches audio data", - "attributes":[] - }, - { - "entry_name": "ft.onto.payload_ontology.TextPayload", - "parent_entry": "forte.data.ontology.top.Payload", - "description": "A payload that caches text data", - "attributes": [ - { - "name": "replace_back_operations", - "type": "List", - "item_type": "typing.Tuple" - }, - { - "name": "processed_original_spans", - "type": "List", - "item_type": "typing.Tuple" - }, - { - "name": "orig_text_len", - "type": "int" - } - ] - }, - { - "entry_name": "ft.onto.payload_ontology.ImagePayload", - "parent_entry": "forte.data.ontology.top.Payload", - "description": "A payload that caches image data", - "attributes":[] - }, - { - "entry_name": "ft.onto.payload_ontology.TextReadingMeta", - "parent_entry": "forte.data.ontology.top.ReadingMeta", - "description": "A reading meta entry data that records meta data related to reading text", - "attributes": [] - }, - { - "entry_name": "ft.onto.payload_ontology.ImageReadingMeta", - "parent_entry": "forte.data.ontology.top.ReadingMeta", - "description": "A ImageReadingMeta entry 
data that records meta data related to reading image from data source.", - "attributes":[] - }, - { - "entry_name": "ft.onto.payload_ontology.AudioReadingMeta", - "parent_entry": "forte.data.ontology.top.ReadingMeta", - "description": "An AudioReadingMeta entry defines metadata related to reading raw audio from data source. It can be efficiently serialized and deserialized within DataPack, and it can be further converted to loading method by using loading method registry. It's bound to one payload.", - "attributes": [ - { - "name": "sample_rate", - "type": "int" - } - ] - } - ] -} From 4bc7917f3c0a5f933eaaba9b741882dc55b4ba92 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 21:05:55 -0700 Subject: [PATCH 040/137] remove ReadingMeta --- forte/data/ontology/top.py | 73 -------------------------------------- 1 file changed, 73 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 7c58ae878..b820d2377 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -55,10 +55,6 @@ "TextPayload", "ImagePayload", "AudioPayload", - "ReadingMeta", - "ImageReadingMeta", - "AudioReadingMeta", - "TextReadingMeta", ] QueryType = Union[Dict[str, Any], np.ndarray] @@ -1271,74 +1267,6 @@ def __init__( super().__init__(pack, "image", payload_idx, path) -class ReadingMeta(Entry): - """ - a Meta entry defines metadata related to data processing - about reading from data source, loading data to cache, and writing to - a target file. - - Args: - pack: The container that this `ReadingMeta` will - be added to. - """ - - def __init__(self, pack: PackType): - super().__init__(pack) - - -class TextReadingMeta(ReadingMeta): - """ - a text meta entry defines metadata related to text data reading from - data source. 
- - Args: - Entry (_type_): _description_ - - Returns: - _type_: _description_ - """ - - def __init__(self, pack: PackType): - super().__init__(pack) - self.replace_back_operations = None - self.processed_original_spans = None - self.orig_text_len = None - - -class ImageReadingMeta(ReadingMeta): - def __init__(self, pack: PackType): - super().__init__(pack) - self.data_source_type = "disk" - self.pipeline_data_type = "nparray" - self.save_format = None - self.type_code = "jpg" - - -class AudioReadingMeta(ReadingMeta): - """ - An AudioReadingMeta entry defines metadata related to reading raw audio - from data source. It can be efficiently serialized and deserialized within - DataPack, and it can be further converted to loading method by using - loading method registry. It's bound to one payload. - - Args: - pack (PackType): The container that this AudioReadingMeta will - be added to. - """ - - def __init__( - self, - pack: PackType, - sample_rate: Optional[int] = None, - ): - super().__init__(pack) - self._sample_rate = sample_rate - - @property - def sample_rate(self): - return self._sample_rate - - SinglePackEntries = ( Link, Group, @@ -1347,6 +1275,5 @@ def sample_rate(self): AudioAnnotation, ImageAnnotation, Payload, - ReadingMeta, ) MultiPackEntries = (MultiPackLink, MultiPackGroup, MultiPackGeneric) From a0d30c295528417081c33c543aec490958863eef Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 21:31:45 -0700 Subject: [PATCH 041/137] remove reading meta --- forte/data/data_store.py | 41 ---------------------------------------- 1 file changed, 41 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 54a495ff2..9cf5ecd43 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -30,7 +30,6 @@ Link, Generics, Payload, - ReadingMeta, ) from forte.data.ontology.core import Entry, FList, FDict from forte.common import constants @@ -961,7 +960,6 @@ def _add_entry_raw( ImageAnnotation, Grids, Payload, - 
ReadingMeta, ]: try: self.__elements[type_name].append(entry) @@ -1207,45 +1205,6 @@ def add_grid_raw( return tid_search_result return self._add_entry_raw(Grids, type_name, entry) - def add_reading_meta_raw( - self, - type_name: str, - tid: Optional[int] = None, - allow_duplicate=True, - ) -> int: - - r""" - This function adds an image annotation entry with ``image_payload_idx`` - indices to current data store object. Returns the ``tid`` for the - inserted entry. - - Args: - type_name: The fully qualified type name of the new grid. - image_payload_idx: the index of the image payload. - tid: ``tid`` of the Annotation entry that is being added. - It's optional, and it will be - auto-assigned if not given. - allow_duplicate: Whether we allow duplicate in the DataStore. When - it's set to False, the function will return the ``tid`` of - existing entry if a duplicate is found. Default value is True. - - Returns: - ``tid`` of the entry. - """ - # We should create the `entry data` with the format - # [begin, end, tid, type_id, None, ...]. - # A helper function _new_annotation() can be used to generate a - # annotation type entry data with default fields. - # A reference to the entry should be store in both self.__elements and - # self.__tid_ref_dict. - entry = self._new_meta(type_name, tid) - if not allow_duplicate: - tid_search_result = self._get_existing_ann_entry_tid(entry) - # if found existing entry - if tid_search_result != -1: - return tid_search_result - return self._add_entry_raw(ReadingMeta, type_name, entry) - def _get_existing_ann_entry_tid(self, entry: List[Any]): r""" This function searches for tid for existing annotation-like entry tid. 
From e83a633897580932770e855bed958fc677d4984f Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 21:40:15 -0700 Subject: [PATCH 042/137] add enum modality --- forte/data/ontology/top.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index b820d2377..b5708045f 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass +from enum import Enum from functools import total_ordering from typing import Optional, Tuple, Type, Any, Dict, Union, Iterable, List @@ -1190,12 +1191,12 @@ class Payload(Entry): def __init__( self, pack: PackType, - modality: str, + modality: Enum, payload_idx: int, uri: str = None, ): supported_modality = ("text", "audio", "image") - if modality not in supported_modality: + if modality.name not in supported_modality: raise ValueError( f"The given modality {modality} is not supported. 
" f"Currently we only support {supported_modality}" @@ -1208,6 +1209,12 @@ def __init__( self._cache = None self.meta = None + def get_type(self): + return type(self) + + def get_modality(self): + return self._modality + @property def cache(self): if self._cache is None: @@ -1241,7 +1248,7 @@ def __init__( payload_idx: int, path: Optional[str] = None, ): - super().__init__(pack, "text", payload_idx, path) + super().__init__(pack, Modality.text, payload_idx, path) class AudioPayload(Payload): @@ -1251,7 +1258,7 @@ def __init__( payload_idx: int, path: Optional[str] = None, ): - super().__init__(pack, "audio", payload_idx, path) + super().__init__(pack, Modality.audio, payload_idx, path) def audio_len(self): return len(self._cache) @@ -1264,8 +1271,10 @@ def __init__( payload_idx: int, path: Optional[str] = None, ): - super().__init__(pack, "image", payload_idx, path) + super().__init__(pack, Modality.image, payload_idx, path) + +Modality = Enum("modality", "text audio image") SinglePackEntries = ( Link, From 34f5f3a1bce6285ecc96a442d40ee785f4abbf68 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 21:45:52 -0700 Subject: [PATCH 043/137] payload cache --- forte/data/ontology/top.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index b5708045f..046e399a1 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1210,10 +1210,22 @@ def __init__( self.meta = None def get_type(self): + """ + Get the type of the payload class. + + Returns: + the type of the payload class. + """ return type(self) def get_modality(self): - return self._modality + """ + Get the modality of the payload class. + + Returns: + the modality of the payload class in str format. 
+ """ + return self._modality.name @property def cache(self): @@ -1237,7 +1249,13 @@ def payload_index(self): def uri(self): return self._uri - def set_cache(self, data): + def set_cache(self, data: Union[str, np.ndarray]): + """ + Load cache data into the payload. + + Args: + data: data to be set in the payload. + """ self._cache = data From 6b88c3928992aae2828c035309c29fc977acf33d Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 22:14:27 -0700 Subject: [PATCH 044/137] add docstring for payload_index --- forte/data/data_pack.py | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 3da47af5d..00fd68d2d 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -482,7 +482,8 @@ def get_span_text( Args: begin: begin index to query. end: end index to query. - text_payload_index: entry index of text payload in this DataPack. + text_payload_index: the zero-based index of the TextPayload + in this DataPack's TextPayload entries. Defaults to 0. Returns: The text within this span. @@ -500,7 +501,8 @@ def get_span_audio( Args: begin: begin index to query. end: end index to query. - audio_payload_index: entry index of audio payload in this DataPack. + audio_payload_index: the zero-based index of the AudioPayload + in this DataPack's AudioPayload entries. Defaults to 0. Returns: The audio within this span. @@ -529,8 +531,8 @@ def set_text( Args: text: a str text. replace_func: function that replace text. Defaults to None. - text_payload_index (int, optional): TextPayload index in the - DataPack. Defaults to 0. + text_payload_index: the zero-based index of the TextPayload + in this DataPack's TextPayload entries. Defaults to 0. 
""" span_ops = [] if replace_func is None else replace_func(text) @@ -543,7 +545,6 @@ def set_text( ) = data_utils_io.modify_text_and_track_ops(text, span_ops) tp = TextPayload(self, text_payload_index) - tp.set_cache(text) tp.meta = Generics(self) @@ -563,17 +564,20 @@ def set_audio( Args: audio: A numpy array storing the audio waveform. sample_rate: An integer specifying the sample rate. + audio_payload_index: the zero-based index of the AudioPayload + in this DataPack's AudioPayload entries. Defaults to 0. """ ap = AudioPayload(self, audio_payload_index) ap.set_cache(audio) - ap.meta = AudioReadingMeta(self) + ap.meta = Generics(self) ap.meta.sample_rate = sample_rate def get_original_text(self, text_payload_index: int = 0): r"""Get original unmodified text from the :class:`~forte.data.data_pack.DataPack` object. Args: - text + text_payload_index: the zero-based index of the TextPayload + in this DataPack's TextPayload entries. Defaults to 0. Returns: Original text after applying the `replace_back_operations` of @@ -784,7 +788,7 @@ def __add_entry_with_check(self, entry: Union[EntryType, int]) -> EntryType: f"is not a valid begin." ) - if end > len(self.text): + if end > len(self.get_payload_at("text", 0).cache): if len(self.text) == 0: raise ValueError( f"The end {end} of span is greater than the text " @@ -850,7 +854,7 @@ def get_data( context_type: Union[str, Type[Annotation], Type[AudioAnnotation]], request: Optional[DataRequest] = None, skip_k: int = 0, - payload_index=0, + payload_index: int = 0, ) -> Iterator[Dict[str, Any]]: r"""Fetch data from entries in the data_pack of type `context_type`. Data includes `"span"`, annotation-specific @@ -1013,9 +1017,13 @@ def get_annotation_list( " [Annotation, AudioAnnotation]." ) - def get_context_data(c_type, context, payload_index): - r"""Get context-specific data of a given context type and - context. 
+ def get_context_data( + c_type: Union[Type[Annotation], Type[AudioAnnotation]], + context: Union[Annotation, AudioAnnotation], + payload_index: int, + ): + r""" + Get context-specific data of a given context type and context. Args: c_type: @@ -1023,6 +1031,7 @@ def get_context_data(c_type, context, payload_index): could be any :class:`~forte.data.ontology.top.Annotation` type. context: context that contains data to be extracted. + payload_index: the index of the payload of requrested modality. Raises: NotImplementedError: raised when the given context type is @@ -2076,12 +2085,6 @@ def save_entry_object( tid=entry.tid, allow_duplicate=allow_duplicate, ) - elif isinstance(entry, ReadingMeta): - data_store_ref.add_reading_meta_raw( - type_name=entry.entry_type(), - tid=entry.tid, - allow_duplicate=allow_duplicate, - ) else: raise ValueError( f"Invalid entry type {type(entry)}. A valid entry " From 9dacad24f81e9011655b84af543cdd670c44d9ff Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 22:26:08 -0700 Subject: [PATCH 045/137] payload_index docstring --- forte/data/data_pack.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 00fd68d2d..db939e375 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -452,7 +452,8 @@ def get_payload_at( Args: modality: data modality among "text", "audio", "image" - payload_index (int): the index of the requested payload + payload_index (int): the zero-based index of the Payload + in this DataPack's Payload entries of the requested modality. Raises: ValueError: raised when the requested modality is not supported. @@ -929,6 +930,9 @@ def get_data( returned by default. skip_k: Will skip the first `skip_k` instances and generate data from the (`offset` + 1)th instance. + payload_index: the zero-based index of the Payload + in this DataPack's Payload entries of a particular modality. 
+ The modality is depedent on ``context_type``. Defaults to 0. Returns: A data generator, which generates one piece of data (a dict @@ -1031,7 +1035,10 @@ def get_context_data( could be any :class:`~forte.data.ontology.top.Annotation` type. context: context that contains data to be extracted. - payload_index: the index of the payload of requrested modality. + payload_index: the zero-based index of the Payload + in this DataPack's Payload entries of a particular modality. + The modality is depedent on ``c_type``. + Defaults to 0. Raises: NotImplementedError: raised when the given context type is From d18b8b43dc72fc5889c2b346dec7a4c5d161edb3 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 22:33:15 -0700 Subject: [PATCH 046/137] add docstring for payload related functions --- forte/data/data_store.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 9cf5ecd43..65960d979 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -12,6 +12,7 @@ # limitations under the License. +from enum import Enum from typing import Dict, List, Iterator, Tuple, Optional, Any, Type import uuid import logging @@ -711,13 +712,29 @@ def _new_payload( self, type_name: str, payload_idx: int, - modality: str, + modality: Enum, tid: Optional[int] = None, ) -> List: + r"""This function generates a new payload with default fields. + Called by add_payload_raw() to create a new payload with ``type_name``, + ``payload_idx``, and ``modality``. + + Args: + type_name: The fully qualified type name of the new entry. + payload_idx: the zero-based index of the TextPayload + in this DataPack's TextPayload entries. + modality (Enum): an ``Enum`` object that represents the payload + modality. + tid (Optional[int], optional): _description_. Defaults to None. + + Returns: + A list representing new payload raw data. 
+ """ + tid: int = self._new_tid() if tid is None else tid entry: List[Any] - entry = [payload_idx, modality, tid, type_name] + entry = [payload_idx, modality.name, tid, type_name] entry += self._default_attributes_for_type(type_name) return entry @@ -1126,7 +1143,7 @@ def add_payload_raw( self, type_name: str, payload_idx: int, - modality: str, + modality: Enum, tid: Optional[int] = None, allow_duplicate=True, ) -> int: @@ -1138,7 +1155,8 @@ def add_payload_raw( Args: type_name: The fully qualified type name of the new Payload. - payload_idx: the index of the payload. + payload_idx: the zero-based index of the Payload + in this DataPack's Payload entries of the requested modality. modality: the payload modality which can be text, audio, image. tid: ``tid`` of the Payload entry that is being added. It's optional, and it will be auto-assigned if not given. From 20e943fa0c985b559bf779956950ed4a386bd832 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 15 Jun 2022 22:34:35 -0700 Subject: [PATCH 047/137] remove unused method --- forte/data/ontology/top.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 046e399a1..a33f47ae2 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1278,9 +1278,6 @@ def __init__( ): super().__init__(pack, Modality.audio, payload_idx, path) - def audio_len(self): - return len(self._cache) - class ImagePayload(Payload): def __init__( From 406b659ccef1d51d5b828dd15043405b8ad65f14 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 16 Jun 2022 20:05:47 -0700 Subject: [PATCH 048/137] get_payload_at() and DataPack.payloads of three modalities --- forte/data/data_pack.py | 111 ++++++++++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 32 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index db939e375..ab4bd7781 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -34,8 +34,6 @@ from 
sortedcontainers import SortedList from forte.common.exception import ( - EntryNotFoundError, - ProcessExecutionException, UnknownOntologyClassException, ) from forte.common.constants import TID_INDEX @@ -47,8 +45,6 @@ from forte.data.ontology.core import EntryType from forte.data.ontology.top import ( Annotation, - AudioPayload, - ImagePayload, Link, Group, SinglePackEntries, @@ -57,7 +53,6 @@ ImageAnnotation, Grids, Payload, - TextPayload, ) from forte.data.span import Span @@ -177,7 +172,10 @@ def __init__(self, pack_name: Optional[str] = None): self._entry_converter: EntryConverter = EntryConverter() self.image_annotations: List[ImageAnnotation] = [] self.grids: List[Grids] = [] - self.payloads: List[np.ndarray] = [] + + self.text_payloads: List[Payload] = [] + self.audio_payloads: List[Payload] = [] + self.image_payloads: List[Payload] = [] self.__replace_back_operations: ReplaceOperationsType = [] self.__processed_original_spans: List[Tuple[Span, Span]] = [] @@ -249,6 +247,24 @@ def _init_meta(self, pack_name: Optional[str] = None) -> Meta: def _validate(self, entry: EntryType) -> bool: return isinstance(entry, SinglePackEntries) + @property + def text(self, text_payload_index: int = 0) -> str: + """ + Get text from a text payload at an index. + + Args: + text_payload_index: the index of the text payload. Defaults to 0. + + Raises: + ValueError: raised when the index is out of bound of the text + payload list. + + Returns: + text data in the text payload. + """ + tp = self.get_payload_at("text", text_payload_index) + return tp.cache + @property def all_annotations(self) -> Iterator[Annotation]: """ @@ -459,21 +475,53 @@ def get_payload_at( ValueError: raised when the requested modality is not supported. Returns: - str data for text data or numpy array for image and audio data. + Payload entry containing text data or numpy array for image and + audio data. 
""" supported_modality = ("text", "audio", "image") - if modality == "text": - return self.get_single(TextPayload, payload_index) - elif modality == "audio": - return self.get_single(AudioPayload, payload_index) - elif modality == "image": - return self.get_single(ImagePayload, payload_index) - else: + try: + if modality == "text": + payloads_length = len(self.text_payloads) + payload = self.text_payloads[payload_index] + elif modality == "audio": + payloads_length = len(self.audio_payloads) + payload = self.audio_payloads[payload_index] + elif modality == "image": + payloads_length = len(self.image_payloads) + payload = self.image_payloads[payload_index] + else: + raise ValueError( + f"Provided modality {modality} is not supported." + "Please provide one of modality among" + f" {supported_modality}." + ) + except IndexError: raise ValueError( - f"Provided modality {modality} is not supported." - "Please provide one of modality among" - f" {supported_modality}." + f"payload index ({payload_index}) " + f"is larger or equal to {modality} payload list" + f" length ({payloads_length}). " + f"Please input a {modality} payload index less than it." ) + return payload + + def get_payload_data_at( + self, modality: str, payload_index: int + ) -> Union[str, np.ndarray]: + """ + Get Payload of requested modality at the requested payload index. + + Args: + modality: data modality among "text", "audio", "image" + payload_index (int): the zero-based index of the Payload + in this DataPack's Payload entries of the requested modality. + + Raises: + ValueError: raised when the requested modality is not supported. + + Returns: + str data for text data or numpy array for image and audio data. + """ + return self.get_payload_at(modality, payload_index).cache def get_span_text( self, begin: int, end: int, text_payload_index: int = 0 @@ -489,7 +537,7 @@ def get_span_text( Returns: The text within this span. 
""" - return self.get_single(TextPayload, text_payload_index).cache[begin:end] + return self.text_payloads[text_payload_index].cache[begin:end] def get_span_audio( self, begin: int, end: int, audio_payload_index=0 @@ -508,16 +556,7 @@ def get_span_audio( Returns: The audio within this span. """ - try: - audio_payload_entry = self.get_single( - AudioPayload, audio_payload_index - ) - except EntryNotFoundError: - raise ProcessExecutionException( - "The audio payload of this DataPack at index" - f"({audio_payload_index}) is not set. Please add" - " more AudioPayload in this DataPack" - ) + audio_payload_entry = self.get_payload_at("audio", audio_payload_index) return audio_payload_entry.cache[begin:end] def set_text( @@ -544,8 +583,8 @@ def set_text( processed_original_spans, orig_text_len, ) = data_utils_io.modify_text_and_track_ops(text, span_ops) + tp = self.get_payload_at("text", text_payload_index) - tp = TextPayload(self, text_payload_index) tp.set_cache(text) tp.meta = Generics(self) @@ -584,7 +623,7 @@ def get_original_text(self, text_payload_index: int = 0): Original text after applying the `replace_back_operations` of :class:`~forte.data.data_pack.DataPack` object to the modified text """ - tp = self.get_single(TextPayload, text_payload_index) + tp = self.get_payload_at("text", text_payload_index) original_text, _, _, _ = data_utils_io.modify_text_and_track_ops( tp.cache, tp.get_meta("replace_back_operations") ) @@ -1048,11 +1087,11 @@ def get_context_data( str: context data. 
""" if issubclass(c_type, Annotation): - return self.get_single(TextPayload, payload_index).cache[ + return self.get_payload_at("text", payload_index).cache[ context.begin : context.end ] elif issubclass(c_type, AudioAnnotation): - return self.get_single(AudioPayload, payload_index).cache[ + return self.get_payload_at("audio", payload_index).cache[ context.begin : context.end ] else: @@ -1632,6 +1671,14 @@ def entry_setter(cls: Entry, value: Any, attr_name: str, field_type): # Record that this entry hasn't been added to the index yet. self._pending_entries[entry.tid] = entry.tid, c + if isinstance(entry, Payload): + if entry.get_modality() == "text": + self.text_payloads.append(entry) + elif entry.get_modality() == "audio": + self.audio_payloads.append(entry) + elif entry.get_modality() == "image": + self.image_payloads.append(entry) + def __del__(self): super().__del__() # Remove all the remaining tids in _pending_entries. From c854b46c881667acc16a19ac8d8030c3307c4c83 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 16 Jun 2022 20:07:35 -0700 Subject: [PATCH 049/137] remove payloads of different modalities --- forte/data/data_store.py | 111 ++----------------------------------- forte/data/ontology/top.py | 39 +------------ 2 files changed, 7 insertions(+), 143 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 65960d979..d0180d507 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -12,7 +12,6 @@ # limitations under the License. -from enum import Enum from typing import Dict, List, Iterator, Tuple, Optional, Any, Type import uuid import logging @@ -30,7 +29,6 @@ ImageAnnotation, Link, Generics, - Payload, ) from forte.data.ontology.core import Entry, FList, FDict from forte.common import constants @@ -688,57 +686,6 @@ def _new_grid( return entry - def _new_meta(self, type_name: str, tid: Optional[int] = None) -> List: - r"""This function generates a new grid with default fields. 
- Called by add_grid_raw() to create a new grid - with ``type_name``, and optional ``tid``. - - Args: - type_name: The fully qualified type name of the new entry. - - Returns: - A list representing a new grid type entry data. - """ - - tid: int = self._new_tid() if tid is None else tid - entry: List[Any] - - entry = [None, None, tid, type_name] - entry += self._default_attributes_for_type(type_name) - - return entry - - def _new_payload( - self, - type_name: str, - payload_idx: int, - modality: Enum, - tid: Optional[int] = None, - ) -> List: - r"""This function generates a new payload with default fields. - Called by add_payload_raw() to create a new payload with ``type_name``, - ``payload_idx``, and ``modality``. - - Args: - type_name: The fully qualified type name of the new entry. - payload_idx: the zero-based index of the TextPayload - in this DataPack's TextPayload entries. - modality (Enum): an ``Enum`` object that represents the payload - modality. - tid (Optional[int], optional): _description_. Defaults to None. - - Returns: - A list representing new payload raw data. - """ - - tid: int = self._new_tid() if tid is None else tid - entry: List[Any] - - entry = [payload_idx, modality.name, tid, type_name] - entry += self._default_attributes_for_type(type_name) - - return entry - def _new_link( self, type_name: str, @@ -970,14 +917,7 @@ def _add_entry_raw( except KeyError: self.__elements[type_name] = SortedList(key=sorting_fn) self.__elements[type_name].add(entry) - elif entry_type in [ - Link, - Group, - Generics, - ImageAnnotation, - Grids, - Payload, - ]: + elif entry_type in [Link, Group, Generics, ImageAnnotation, Grids]: try: self.__elements[type_name].append(entry) except KeyError: @@ -1125,8 +1065,8 @@ def add_image_annotation_raw( ``tid`` of the entry. """ # We should create the `entry data` with the format - # [image_payload_idx, None, tid, type_id, None, ...]. 
- # A helper function _new_image_annotation() can be used to generate a + # [begin, end, tid, type_id, None, ...]. + # A helper function _new_annotation() can be used to generate a # annotation type entry data with default fields. # A reference to the entry should be store in both self.__elements and # self.__tid_ref_dict. @@ -1139,49 +1079,6 @@ def add_image_annotation_raw( return tid_search_result return self._add_entry_raw(AudioAnnotation, type_name, entry) - def add_payload_raw( - self, - type_name: str, - payload_idx: int, - modality: Enum, - tid: Optional[int] = None, - allow_duplicate=True, - ) -> int: - - r""" - This function adds an payload entry with ``payload_idx`` - and modality to current data store object. Returns the ``tid`` for the - inserted entry. - - Args: - type_name: The fully qualified type name of the new Payload. - payload_idx: the zero-based index of the Payload - in this DataPack's Payload entries of the requested modality. - modality: the payload modality which can be text, audio, image. - tid: ``tid`` of the Payload entry that is being added. - It's optional, and it will be auto-assigned if not given. - allow_duplicate: Whether we allow duplicate in the DataStore. When - it's set to False, the function will return the ``tid`` of - existing entry if a duplicate is found. Default value is True. - - Returns: - ``tid`` of the entry. - """ - # We should create the `entry data` with the format - # [payload_idx, modality, tid, type_id, None, ...]. - # A helper function _new_payload() can be used to generate a - # payload type entry data with default fields. - # A reference to the entry should be store in both self.__elements and - # self.__tid_ref_dict. 
- entry = self._new_payload(type_name, payload_idx, modality, tid) - - if not allow_duplicate: - tid_search_result = self._get_existing_ann_entry_tid(entry) - # if found existing entry - if tid_search_result != -1: - return tid_search_result - return self._add_entry_raw(Payload, type_name, entry) - def add_grid_raw( self, type_name: str, @@ -1191,7 +1088,7 @@ def add_grid_raw( ) -> int: r""" - This function adds a grid entry with ``image_payload_idx`` + This function adds an image annotation entry with ``image_payload_idx`` indices to current data store object. Returns the ``tid`` for the inserted entry. diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index a33f47ae2..3a960b588 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from enum import Enum +from enum import IntEnum from functools import total_ordering from typing import Optional, Tuple, Type, Any, Dict, Union, Iterable, List @@ -53,9 +53,6 @@ "Box", "BoundingBox", "Payload", - "TextPayload", - "ImagePayload", - "AudioPayload", ] QueryType = Union[Dict[str, Any], np.ndarray] @@ -1191,7 +1188,7 @@ class Payload(Entry): def __init__( self, pack: PackType, - modality: Enum, + modality: IntEnum, payload_idx: int, uri: str = None, ): @@ -1259,37 +1256,7 @@ def set_cache(self, data: Union[str, np.ndarray]): self._cache = data -class TextPayload(Payload): - def __init__( - self, - pack: PackType, - payload_idx: int, - path: Optional[str] = None, - ): - super().__init__(pack, Modality.text, payload_idx, path) - - -class AudioPayload(Payload): - def __init__( - self, - pack: PackType, - payload_idx: int, - path: Optional[str] = None, - ): - super().__init__(pack, Modality.audio, payload_idx, path) - - -class ImagePayload(Payload): - def __init__( - self, - pack: PackType, - payload_idx: int, - path: Optional[str] = 
None, - ): - super().__init__(pack, Modality.image, payload_idx, path) - - -Modality = Enum("modality", "text audio image") +Modality = IntEnum("modality", "text audio image") SinglePackEntries = ( Link, From 5d225f8bde7eed6ad8f5300b5a19f03a794e5180 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 16 Jun 2022 20:08:19 -0700 Subject: [PATCH 050/137] AudioReadingMeta -> Generics --- forte/data/readers/audio_reader.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index 28cd05ddd..8fbc41ab3 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -20,7 +20,8 @@ from forte.data.data_pack import DataPack from forte.data.data_utils_io import dataset_path_iterator from forte.data.base_reader import PackReader -from forte.data.ontology.top import AudioPayload, AudioReadingMeta +from forte.data.ontology.top import Generics, Modality +from ft.onto.base_ontology import AudioPayload __all__ = [ "AudioReader", @@ -68,11 +69,13 @@ def _parse_pack(self, file_path: str) -> Iterator[DataPack]: payload_idx = 0 # Read in audio data and store in DataPack # add audio payload into DataPack.payloads - ap = AudioPayload(pack, payload_idx, file_path) + + ap = AudioPayload(pack, Modality.audio, payload_idx, file_path) if not self.configs.lazy_read: audio_data, sample_rate = self.soundfile.read(file_path) ap.set_cache(audio_data) - ap.meta = AudioReadingMeta(pack, sample_rate) + ap.meta = Generics(pack) + ap.meta.sample_rate = sample_rate pack.pack_name = file_path yield pack From 3e13c9ddcd77ceee2c4fce2c74b261d4c16ef30b Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 16 Jun 2022 20:08:47 -0700 Subject: [PATCH 051/137] add payload ontologies of three modalities --- forte/ontology_specs/base_ontology.json | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/forte/ontology_specs/base_ontology.json 
b/forte/ontology_specs/base_ontology.json index 8b78809f7..8833388c1 100644 --- a/forte/ontology_specs/base_ontology.json +++ b/forte/ontology_specs/base_ontology.json @@ -444,6 +444,29 @@ "type": "str" } ] + }, + { + "entry_name": "ft.onto.base_ontology.AudioPayload", + "parent_entry": "forte.data.ontology.top.Payload", + "description": "A payload that caches audio data", + "attributes":[ + { + "name": "speaker", + "type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.TextPayload", + "parent_entry": "forte.data.ontology.top.Payload", + "description": "A payload that caches text data", + "attributes": [] + }, + { + "entry_name": "ft.onto.base_ontology.ImagePayload", + "parent_entry": "forte.data.ontology.top.Payload", + "description": "A payload that caches image data", + "attributes":[] } ] } From aa275e38cfafa08ca0c22b465db141d70eb78abe Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 16 Jun 2022 20:09:37 -0700 Subject: [PATCH 052/137] update generated base ontology classes --- ft/onto/base_ontology.py | 118 ++++++++++++++++++++++++++++++++------- 1 file changed, 99 insertions(+), 19 deletions(-) diff --git a/ft/onto/base_ontology.py b/ft/onto/base_ontology.py index 0be5956d7..7e7d53396 100644 --- a/ft/onto/base_ontology.py +++ b/ft/onto/base_ontology.py @@ -8,7 +8,7 @@ """ from dataclasses import dataclass -from forte.data.base_pack import PackType +from enum import Enum from forte.data.data_pack import DataPack from forte.data.multi_pack import MultiPack from forte.data.ontology.core import Entry @@ -20,6 +20,7 @@ from forte.data.ontology.top import Group from forte.data.ontology.top import Link from forte.data.ontology.top import MultiPackLink +from forte.data.ontology.top import Payload from typing import Dict from typing import Iterable from typing import List @@ -54,6 +55,9 @@ "MRCQuestion", "Recording", "AudioUtterance", + "AudioPayload", + "TextPayload", + "ImagePayload", ] @@ -308,7 +312,12 @@ class PredicateLink(Link): ParentType = 
PredicateMention ChildType = PredicateArgument - def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): + def __init__( + self, + pack: DataPack, + parent: Optional[Entry] = None, + child: Optional[Entry] = None, + ): super().__init__(pack, parent, child) self.arg_type: Optional[str] = None @@ -328,7 +337,12 @@ class Dependency(Link): ParentType = Token ChildType = Token - def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): + def __init__( + self, + pack: DataPack, + parent: Optional[Entry] = None, + child: Optional[Entry] = None, + ): super().__init__(pack, parent, child) self.dep_label: Optional[str] = None self.rel_type: Optional[str] = None @@ -337,9 +351,8 @@ def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Option @dataclass class EnhancedDependency(Link): """ - A `Link` type entry which represent a `enhanced dependency - `_. - + A `Link` type entry which represent a enhanced dependency: + https://universaldependencies.org/u/overview/enhanced-syntax.html Attributes: dep_label (Optional[str]): The enhanced dependency label in Universal Dependency. 
""" @@ -349,7 +362,12 @@ class EnhancedDependency(Link): ParentType = Token ChildType = Token - def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): + def __init__( + self, + pack: DataPack, + parent: Optional[Entry] = None, + child: Optional[Entry] = None, + ): super().__init__(pack, parent, child) self.dep_label: Optional[str] = None @@ -367,7 +385,12 @@ class RelationLink(Link): ParentType = EntityMention ChildType = EntityMention - def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): + def __init__( + self, + pack: DataPack, + parent: Optional[Entry] = None, + child: Optional[Entry] = None, + ): super().__init__(pack, parent, child) self.rel_type: Optional[str] = None @@ -385,7 +408,12 @@ class CrossDocEntityRelation(MultiPackLink): ParentType = EntityMention ChildType = EntityMention - def __init__(self, pack: MultiPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): + def __init__( + self, + pack: MultiPack, + parent: Optional[Entry] = None, + child: Optional[Entry] = None, + ): super().__init__(pack, parent, child) self.rel_type: Optional[str] = None @@ -398,7 +426,9 @@ class CoreferenceGroup(Group): MemberType = EntityMention - def __init__(self, pack: DataPack, members: Optional[Iterable[Entry]] = None): + def __init__( + self, pack: DataPack, members: Optional[Iterable[Entry]] = None + ): super().__init__(pack, members) @@ -415,7 +445,12 @@ class EventRelation(Link): ParentType = EventMention ChildType = EventMention - def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): + def __init__( + self, + pack: DataPack, + parent: Optional[Entry] = None, + child: Optional[Entry] = None, + ): super().__init__(pack, parent, child) self.rel_type: Optional[str] = None @@ -433,7 +468,12 @@ class CrossDocEventRelation(MultiPackLink): ParentType = EventMention ChildType = EventMention - def __init__(self, pack: 
MultiPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): + def __init__( + self, + pack: MultiPack, + parent: Optional[Entry] = None, + child: Optional[Entry] = None, + ): super().__init__(pack, parent, child) self.rel_type: Optional[str] = None @@ -455,8 +495,8 @@ class ConstituentNode(Annotation): sentiment: Dict[str, float] is_root: Optional[bool] is_leaf: Optional[bool] - parent_node: Optional['ConstituentNode'] - children_nodes: FList['ConstituentNode'] + parent_node: Optional["ConstituentNode"] + children_nodes: FList["ConstituentNode"] def __init__(self, pack: DataPack, begin: int, end: int): super().__init__(pack, begin, end) @@ -464,8 +504,8 @@ def __init__(self, pack: DataPack, begin: int, end: int): self.sentiment: Dict[str, float] = dict() self.is_root: Optional[bool] = None self.is_leaf: Optional[bool] = None - self.parent_node: Optional['ConstituentNode'] = None - self.children_nodes: FList['ConstituentNode'] = FList(self) + self.parent_node: Optional["ConstituentNode"] = None + self.children_nodes: FList["ConstituentNode"] = FList(self) @dataclass @@ -490,7 +530,6 @@ def __init__(self, pack: DataPack, begin: int, end: int): @dataclass class MCOption(Annotation): - def __init__(self, pack: DataPack, begin: int, end: int): super().__init__(pack, begin, end) @@ -540,7 +579,7 @@ class Recording(AudioAnnotation): recording_class: List[str] - def __init__(self, pack: PackType, begin: int, end: int): + def __init__(self, pack: DataPack, begin: int, end: int): super().__init__(pack, begin, end) self.recording_class: List[str] = [] @@ -555,6 +594,47 @@ class AudioUtterance(AudioAnnotation): speaker: Optional[str] - def __init__(self, pack: PackType, begin: int, end: int): + def __init__(self, pack: DataPack, begin: int, end: int): super().__init__(pack, begin, end) self.speaker: Optional[str] = None + + +@dataclass +class AudioPayload(Payload): + """ + A payload that caches audio data + Attributes: + speaker (Optional[str]): + """ + + 
speaker: Optional[str] + + def __init__( + self, pack: DataPack, modality: Enum, payload_idx: int, uri: str = None + ): + super().__init__(pack, modality, payload_idx, uri) + self.speaker: Optional[str] = None + + +@dataclass +class TextPayload(Payload): + """ + A payload that caches text data + """ + + def __init__( + self, pack: DataPack, modality: Enum, payload_idx: int, uri: str = None + ): + super().__init__(pack, modality, payload_idx, uri) + + +@dataclass +class ImagePayload(Payload): + """ + A payload that caches image data + """ + + def __init__( + self, pack: DataPack, modality: Enum, payload_idx: int, uri: str = None + ): + super().__init__(pack, modality, payload_idx, uri) From e0bb7b7e2915fcb8d2ab9d38d91d0a42eb8856ad Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 16 Jun 2022 20:10:39 -0700 Subject: [PATCH 053/137] DataPack.audio -> pack.get_payload_data_at(audio, 0) --- tests/forte/data/audio_annotation_test.py | 193 ++++++++++++++-------- 1 file changed, 128 insertions(+), 65 deletions(-) diff --git a/tests/forte/data/audio_annotation_test.py b/tests/forte/data/audio_annotation_test.py index 357201f33..bd9bebfde 100644 --- a/tests/forte/data/audio_annotation_test.py +++ b/tests/forte/data/audio_annotation_test.py @@ -27,18 +27,31 @@ from forte.data.data_pack import DataPack from forte.data.readers import AudioReader from forte.data.ontology.top import ( - Annotation, AudioAnnotation, Generics, Group, Link + Annotation, + AudioAnnotation, + Generics, + Group, + Link, + Modality, +) +from ft.onto.base_ontology import ( + Recording, + AudioUtterance, + TextPayload, + Utterance, ) -from ft.onto.base_ontology import Recording, AudioUtterance, Utterance class RecordingProcessor(PackProcessor): """ A processor to add a Recording ontology to the whole audio data. 
""" + def _process(self, input_pack: DataPack): Recording( - pack=input_pack, begin=0, end=len(input_pack.audio) + pack=input_pack, + begin=0, + end=len(input_pack.get_payload_data_at("audio", 0)), ) @@ -68,7 +81,8 @@ class TextUtteranceProcessor(PackProcessor): """ def _process(self, input_pack: DataPack): - input_pack.set_text("test text") + tp = TextPayload(input_pack, Modality.text, 0) + tp.set_cache("test text") Utterance(pack=input_pack, begin=0, end=len(input_pack.text)) @@ -77,11 +91,10 @@ class AudioUtteranceProcessor(PackProcessor): A processor to add an AudioUtterance annotation to the specified span of audio payload. """ + def _process(self, input_pack: DataPack): audio_utter: AudioUtterance = AudioUtterance( - pack=input_pack, - begin=self.configs.begin, - end=self.configs.end + pack=input_pack, begin=self.configs.begin, end=self.configs.end ) audio_utter.speaker = self.configs.speaker @@ -109,20 +122,20 @@ def setUp(self): os.pardir, os.pardir, os.pardir, - "data_samples/audio_reader_test" + "data_samples/audio_reader_test", ) ) self._test_configs = { "Alice": {"begin": 200, "end": 35000}, - "Bob": {"begin": 35200, "end": 72000} + "Bob": {"begin": 35200, "end": 72000}, } # Define and config the Pipeline self._pipeline = Pipeline[DataPack]() - self._pipeline.set_reader(AudioReader(), config={ - "read_kwargs": {"always_2d": "True"} - }) + self._pipeline.set_reader( + AudioReader(), config={"read_kwargs": {"always_2d": "True"}} + ) self._pipeline.add(RecordingProcessor()) for speaker, span in self._test_configs.items(): self._pipeline.add( @@ -131,51 +144,75 @@ def setUp(self): self._pipeline.add(TextUtteranceProcessor()) self._pipeline.initialize() - def test_audio_annotation(self): - # Test `DataPack.get_span_audio()` with None audio payload - with self.assertRaises(ProcessExecutionException): + with self.assertRaises(ValueError): pack: DataPack = DataPack() - pack.set_text("test text") + tp = TextPayload(pack, Modality.text, 0) + tp.set_cache("test 
text") pack.get_span_audio(begin=0, end=1) # Verify the annotations of each datapack for pack in self._pipeline.process_dataset(self._test_audio_path): # test get all audio annotation # test get selective fields data from subclass of AudioAnnotation - raw_data_generator = pack.get_data(AudioAnnotation, - {Recording: - {"fields": ["recording_class"]}, - AudioUtterance: - {"fields": ["speaker"]}} - ) + raw_data_generator = pack.get_data( + AudioAnnotation, + { + Recording: {"fields": ["recording_class"]}, + AudioUtterance: {"fields": ["speaker"]}, + }, + ) for data_instance in pack.get(AudioAnnotation): raw_data = next(raw_data_generator) - - self.assertTrue('Recording' in raw_data.keys() and - "recording_class" in raw_data['Recording']) - self.assertTrue('AudioUtterance' in raw_data.keys() and - "speaker" in raw_data['AudioUtterance']) + + self.assertTrue( + "Recording" in raw_data.keys() + and "recording_class" in raw_data["Recording"] + ) + self.assertTrue( + "AudioUtterance" in raw_data.keys() + and "speaker" in raw_data["AudioUtterance"] + ) # test grouped data if isinstance(data_instance, Recording): - self.assertTrue(array_equal(np.array([data_instance.audio]), raw_data['Recording']['audio'])) - self.assertTrue(data_instance.recording_class ==np.squeeze(raw_data['Recording']['recording_class']).tolist()) + self.assertTrue( + array_equal( + np.array([data_instance.audio]), + raw_data["Recording"]["audio"], + ) + ) + self.assertTrue( + data_instance.recording_class + == np.squeeze( + raw_data["Recording"]["recording_class"] + ).tolist() + ) elif isinstance(data_instance, AudioUtterance): - self.assertTrue(array_equal(np.array([data_instance.audio]), raw_data['AudioUtterance']['audio'])) - self.assertTrue(data_instance.speaker - ==raw_data['AudioUtterance']['speaker'][0]) + self.assertTrue( + array_equal( + np.array([data_instance.audio]), + raw_data["AudioUtterance"]["audio"], + ) + ) + self.assertTrue( + data_instance.speaker + == 
raw_data["AudioUtterance"]["speaker"][0] + ) # check non-existence of non-requested data fields raw_data_generator = pack.get_data(AudioAnnotation) for raw_data in raw_data_generator: self.assertFalse("Recording" in raw_data) self.assertFalse("AudioUtterance" in raw_data) - + # Check Recording recordings = list(pack.get(Recording)) self.assertEqual(len(recordings), 1) - self.assertTrue(array_equal(recordings[0].audio, pack.audio)) - + self.assertTrue( + array_equal( + recordings[0].audio, pack.get_payload_data_at("audio", 0) + ) + ) # Check serialization/deserialization of AudioAnnotation new_pack = DataPack.from_string(pack.to_string()) self.assertEqual(new_pack.audio_annotations, pack.audio_annotations) @@ -192,10 +229,14 @@ def test_audio_annotation(self): for audio_utter in audio_utters: configs: Dict = self._test_configs[audio_utter.speaker] - self.assertTrue(array_equal( - audio_utter.audio, - pack.audio[configs["begin"]:configs["end"]] - )) + self.assertTrue( + array_equal( + audio_utter.audio, + pack.get_payload_data_at("audio", 0)[ + configs["begin"] : configs["end"] + ], + ) + ) # Check `AudioAnnotation.get(Group/Link/Generics)`. Note that only # `DummyGroup` and `DummyLink` entries can be retrieved because @@ -203,9 +244,10 @@ def test_audio_annotation(self): for entry_type in (Group, Link): self.assertEqual( len(list(recordings[0].get(entry_type))), - len(self._test_configs) + len(self._test_configs), ) - self.assertEqual(len(list(recordings[0].get(Generics))), 0) + # we have one generics meta data + self.assertEqual(len(list(recordings[0].get(Generics))), 1) # Check operations with mixing types of entries. self.assertEqual(len(list(pack.get(Utterance))), 1) @@ -216,12 +258,27 @@ def test_audio_annotation(self): # Verify the new conditional branches in DataPack.get() when dealing # with empty annotation/audio_annotation list. 
empty_pack: DataPack = DataPack() - self.assertEqual(len(list(empty_pack.get( - entry_type=Annotation, range_annotation=utter - ))), 0) - self.assertEqual(len(list(empty_pack.get( - entry_type=AudioAnnotation, range_annotation=recordings[0] - ))), 0) + self.assertEqual( + len( + list( + empty_pack.get( + entry_type=Annotation, range_annotation=utter + ) + ) + ), + 0, + ) + self.assertEqual( + len( + list( + empty_pack.get( + entry_type=AudioAnnotation, + range_annotation=recordings[0], + ) + ) + ), + 0, + ) # Check `DataPack.delete_entry(AudioAnnotation)` for audio_annotation in list(pack.get(AudioAnnotation)): @@ -241,8 +298,7 @@ def test_build_coverage_for(self): # Add coverage index for (Recording, AudioUtterance) pack.build_coverage_for( - context_type=Recording, - covered_type=AudioUtterance + context_type=Recording, covered_type=AudioUtterance ) self.assertTrue(pack._index.coverage_index_is_valid) self.assertEqual( @@ -250,24 +306,32 @@ def test_build_coverage_for(self): ) # Check DataIndex.get_covered() - self.assertTrue(pack.covers( - context_entry=recording, covered_entry=audio_utters[0] - )) - self.assertFalse(pack.covers( - context_entry=audio_utters[0], covered_entry=recording - )) + self.assertTrue( + pack.covers( + context_entry=recording, covered_entry=audio_utters[0] + ) + ) + self.assertFalse( + pack.covers( + context_entry=audio_utters[0], covered_entry=recording + ) + ) # Check DataIndex.coverage_index_is_valid flag pack._index.deactivate_coverage_index() - self.assertTrue(pack._index.coverage_index( - outer_type=Recording, - inner_type=AudioUtterance - ) is None) + self.assertTrue( + pack._index.coverage_index( + outer_type=Recording, inner_type=AudioUtterance + ) + is None + ) pack._index.activate_coverage_index() - self.assertFalse(pack._index.coverage_index( - outer_type=Recording, - inner_type=AudioUtterance - ) is None) + self.assertFalse( + pack._index.coverage_index( + outer_type=Recording, inner_type=AudioUtterance + ) + is None + ) # 
Check DataIndex.have_overlap() with self.assertRaises(TypeError): @@ -286,8 +350,7 @@ def test_build_coverage_for(self): # Check coverage index when inner and outer entries are the same pack._index.deactivate_coverage_index() pack.build_coverage_for( - context_type=Utterance, - covered_type=Utterance + context_type=Utterance, covered_type=Utterance ) self.assertEqual(len(pack._index._coverage_index), 1) utter = pack.get_single(Utterance) From 841f318b10fec658d57a3605ab2fec035cedbe6b Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 16 Jun 2022 20:11:38 -0700 Subject: [PATCH 054/137] DataPack.audio -> input_pack.get_payload_at(audio, 0) --- tests/forte/data/readers/audio_reader_test.py | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index ee339bb1c..447a7095f 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -17,8 +17,10 @@ from email.mime import audio import importlib import os +from sunau import AUDIO_FILE_ENCODING_ADPCM_G721 import unittest from typing import Dict +from forte.data.ontology.top import Modality from torch import argmax from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC @@ -29,7 +31,7 @@ from forte.data.readers import AudioReader from forte.pipeline import Pipeline from forte.processors.base.pack_processor import PackProcessor -from forte.data.ontology.top import AudioReadingMeta, AudioPayload +from ft.onto.base_ontology import AudioPayload, TextPayload class TestASRProcessor(PackProcessor): @@ -46,19 +48,9 @@ def initialize(self, resources: Resources, configs: Config): self._model = Wav2Vec2ForCTC.from_pretrained(pretrained_model) def _process(self, input_pack: DataPack): - - # it follows the logic of loaidng while using - # load audio using AudioPayload - for audio_payload, audio_reading_meta in zip( - input_pack.get(AudioPayload), 
input_pack.get(AudioReadingMeta) - ): - audio_reading_meta - module = importlib.import_module(audio_reading_meta.module) - reading_method = getattr(module, audio_reading_meta.reading_method) - audio_data, sample_rate = reading_method(audio_payload.reading_path) - # sample_rate = audio_payload.get_meta("sample_rate") - # audio_data = audio_payload.offload_cache() - + ap = input_pack.get_payload_at("audio", 0) + sample_rate = ap.meta.sample_rate + audio_data = ap.cache required_sample_rate: int = 16000 if sample_rate != required_sample_rate: raise ProcessFlowException( @@ -76,7 +68,9 @@ def _process(self, input_pack: DataPack): argmax(self._model(input_values).logits, dim=-1) ) - input_pack.set_text(text=transcription[0]) + tp = TextPayload(input_pack, Modality.text, 0) + tp.set_cache(transcription[0]) + # input_pack.set_text(text=transcription[0]) class AudioReaderPipelineTest(unittest.TestCase): From 0389a59d766ce442d27418f273a9e271015c2a1a Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 16 Jun 2022 20:14:52 -0700 Subject: [PATCH 055/137] adapt Modality parameter and audio -> get_payload_at(audio) in tests/forte/image_annotation_test.py --- tests/forte/image_annotation_test.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tests/forte/image_annotation_test.py b/tests/forte/image_annotation_test.py index c5f846c5c..3ba37c4c3 100644 --- a/tests/forte/image_annotation_test.py +++ b/tests/forte/image_annotation_test.py @@ -14,23 +14,16 @@ """ Unit tests for ImageAnnotation. 
""" -import os import unittest import numpy as np -from typing import Dict from numpy import array_equal -from forte.data.ontology.top import ( - ImageAnnotation, - ImagePayload, - ImageReadingMeta, - Payload, -) +from forte.data.ontology.top import ImageAnnotation, Modality + +from ft.onto.base_ontology import ImagePayload + from forte.data.data_pack import DataPack -import importlib -import os import unittest -from typing import Dict class ImageAnnotationTest(unittest.TestCase): @@ -44,7 +37,7 @@ def setUp(self): self.line[2, 2] = 1 self.line[3, 3] = 1 self.line[4, 4] = 1 - ip = ImagePayload(self.datapack, 0) + ip = ImagePayload(self.datapack, Modality.image, 0) ip.set_cache(self.line) ImageAnnotation(self.datapack, 0) @@ -55,6 +48,11 @@ def test_image_annotation(self): self.assertTrue( array_equal( - self.datapack.get_single(ImagePayload, 0).cache, self.line + self.datapack.get_payload_at("image", 0).cache, self.line ) ) + self.datapack.image_payloads[0]._modality = None + new_pack = DataPack.from_string(self.datapack.to_string()) + self.assertEqual( + new_pack.audio_annotations, self.datapack.audio_annotations + ) From dd8b5931e2c8cbeeeb302950658318c492bb36b7 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Thu, 16 Jun 2022 21:27:59 -0700 Subject: [PATCH 056/137] add payload operations in data store --- forte/data/data_store.py | 73 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index d0180d507..50bbe9f5b 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -12,6 +12,7 @@ # limitations under the License. 
+from enum import IntEnum from typing import Dict, List, Iterator, Tuple, Optional, Any, Type import uuid import logging @@ -29,6 +30,7 @@ ImageAnnotation, Link, Generics, + Payload, ) from forte.data.ontology.core import Entry, FList, FDict from forte.common import constants @@ -759,6 +761,35 @@ def _new_generics(self, type_name: str, tid: Optional[int] = None): return entry + def _new_payload( + self, + type_name: str, + payload_idx: int, + modality: IntEnum, + tid: Optional[int] = None, + ) -> List: + r"""This function generates a new payload with default fields. + Called by add_payload_raw() to create a new payload with ``type_name``, + ``payload_idx``, and ``modality``. + Args: + type_name: The fully qualified type name of the new entry. + payload_idx: the zero-based index of the TextPayload + in this DataPack's TextPayload entries. + modality: an ``IntEnum`` object that represents the payload + modality. + tid (Optional[int], optional): _description_. Defaults to None. + Returns: + A list representing new payload raw data. + """ + + tid: int = self._new_tid() if tid is None else tid + entry: List[Any] + + entry = [payload_idx, modality.name, tid, type_name] + entry += self._default_attributes_for_type(type_name) + + return entry + def _is_subclass( self, type_name: str, cls, no_dynamic_subclass: bool = False ) -> bool: @@ -1120,6 +1151,48 @@ def add_grid_raw( return tid_search_result return self._add_entry_raw(Grids, type_name, entry) + def add_payload_raw( + self, + type_name: str, + payload_idx: int, + modality: IntEnum, + tid: Optional[int] = None, + allow_duplicate=True, + ) -> int: + + r""" + This function adds an payload entry with ``payload_idx`` + and modality to current data store object. + Returns the ``tid`` for the inserted entry. + Args: + type_name: The fully qualified type name of the new Payload. + payload_idx: the zero-based index of the Payload + in this DataPack's Payload entries of the requested modality. 
+ modality: an ``IntEnum`` object that represents the payload + modality. + tid: ``tid`` of the Payload entry that is being added. + It's optional, and it will be auto-assigned if not given. + allow_duplicate: Whether we allow duplicate in the DataStore. When + it's set to False, the function will return the ``tid`` of + existing entry if a duplicate is found. Default value is True. + Returns: + ``tid`` of the entry. + """ + # We should create the `entry data` with the format + # [payload_idx, modality, tid, type_id, None, ...]. + # A helper function _new_payload() can be used to generate a + # payload type entry data with default fields. + # A reference to the entry should be store in both self.__elements and + # self.__tid_ref_dict. + entry = self._new_payload(type_name, payload_idx, modality, tid) + + if not allow_duplicate: + tid_search_result = self._get_existing_ann_entry_tid(entry) + # if found existing entry + if tid_search_result != -1: + return tid_search_result + return self._add_entry_raw(Payload, type_name, entry) + def _get_existing_ann_entry_tid(self, entry: List[Any]): r""" This function searches for tid for existing annotation-like entry tid. 
From 64ae4d8396102244325403f4dc05e856d7d5b932 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 10:57:45 -0700 Subject: [PATCH 057/137] add enum --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 34dda65d1..c9eba44dd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ dataclasses~=0.7; python_version <'3.7' importlib-resources==5.1.4;python_version<'3.7' packaging~=21.2 asyml-utilities +enum From 6b4c54a354c82af62bff2c8b254d417b17678ce4 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:45:21 -0700 Subject: [PATCH 058/137] get around an no-key-found issue with conditional popping state key --- forte/data/ontology/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/forte/data/ontology/core.py b/forte/data/ontology/core.py index 5dde5863d..d30d3f3c6 100644 --- a/forte/data/ontology/core.py +++ b/forte/data/ontology/core.py @@ -147,8 +147,8 @@ def get_state_func(instance): _pointer_keys[key] = True else: _pointer_keys[key] = False - - state.pop("_Entry__pack") + if "_Entry__pack" in state: + state.pop("_Entry__pack") return state From 156a924500472cedebb0db328d7dfc0a23efeb0d Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:46:03 -0700 Subject: [PATCH 059/137] add modality imports in forte.data init --- forte/data/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/forte/data/__init__.py b/forte/data/__init__.py index 01858ebca..12b458e39 100644 --- a/forte/data/__init__.py +++ b/forte/data/__init__.py @@ -20,3 +20,4 @@ from forte.data.data_store import * from forte.data.selector import * from forte.data.index import * +from forte.data.modality import * From e2ac2f7c6230f270a82fa95c90acc7a4f2cea4c6 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:46:44 -0700 Subject: [PATCH 060/137] import IntEnum --- forte/data/base_pack.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/forte/data/base_pack.py b/forte/data/base_pack.py index 6eb14f312..dafc78e3d 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -13,6 +13,7 @@ # limitations under the License. import copy +from enum import IntEnum import gzip import pickle import uuid @@ -127,6 +128,7 @@ def __getstate__(self): def __setstate__(self, state): super().__setstate__(state) + if "meta" in self.__dict__: self._meta = self.__dict__.pop("meta") self.__control_component = None From 7ce6676c7bbaa44af4c1125c356cfd75f300dc54 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:50:16 -0700 Subject: [PATCH 061/137] update text operation based on TextPayload --- forte/data/data_pack.py | 62 +++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index ab4bd7781..dc9841482 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -28,6 +28,7 @@ Tuple, ) from functools import partial + from typing_inspect import get_origin from packaging.version import Version import numpy as np @@ -55,6 +56,7 @@ Payload, ) +from forte.data.modality import Modality from forte.data.span import Span from forte.data.types import ReplaceOperationsType, DataRequest from forte.utils import get_class, get_full_module_name @@ -165,7 +167,6 @@ class DataPack(BasePack[Entry, Link, Group]): def __init__(self, pack_name: Optional[str] = None): super().__init__(pack_name) - self._text = "" self._audio: Optional[np.ndarray] = None self._data_store: DataStore = DataStore() @@ -177,11 +178,6 @@ def __init__(self, pack_name: Optional[str] = None): self.audio_payloads: List[Payload] = [] self.image_payloads: List[Payload] = [] - self.__replace_back_operations: ReplaceOperationsType = [] - self.__processed_original_spans: List[Tuple[Span, Span]] = [] - - self.__orig_text_len: int = 0 - self._index: DataIndex = DataIndex() def __getstate__(self): @@ -248,7 +244,7 @@ def _validate(self, 
entry: EntryType) -> bool: return isinstance(entry, SinglePackEntries) @property - def text(self, text_payload_index: int = 0) -> str: + def text(self) -> str: """ Get text from a text payload at an index. @@ -262,7 +258,12 @@ def text(self, text_payload_index: int = 0) -> str: Returns: text data in the text payload. """ - tp = self.get_payload_at("text", text_payload_index) + try: + tp = self.get_payload_at("text", 0) + except ValueError: + # backward compatibility, there might be case there is + # not payloads + return "" return tp.cache @property @@ -537,7 +538,7 @@ def get_span_text( Returns: The text within this span. """ - return self.text_payloads[text_payload_index].cache[begin:end] + return self.get_payload_data_at("text", text_payload_index)[begin:end] def get_span_audio( self, begin: int, end: int, audio_payload_index=0 @@ -556,8 +557,7 @@ def get_span_audio( Returns: The audio within this span. """ - audio_payload_entry = self.get_payload_at("audio", audio_payload_index) - return audio_payload_entry.cache[begin:end] + return self.get_payload_data_at("audio", audio_payload_index)[begin:end] def set_text( self, @@ -574,8 +574,9 @@ def set_text( text_payload_index: the zero-based index of the TextPayload in this DataPack's TextPayload entries. Defaults to 0. 
""" - span_ops = [] if replace_func is None else replace_func(text) + # Temporary imports + span_ops = [] if replace_func is None else replace_func(text) # The spans should be mutually exclusive ( text, @@ -583,7 +584,14 @@ def set_text( processed_original_spans, orig_text_len, ) = data_utils_io.modify_text_and_track_ops(text, span_ops) - tp = self.get_payload_at("text", text_payload_index) + # temporary solution for backward compatibility + # past API use this method to add a single text in the datapack + if len(self.text_payloads) == 0 and text_payload_index == 0: + from ft.onto.base_ontology import TextPayload + + tp = TextPayload(self, Modality.text) + else: + tp = self.get_payload_at("text", text_payload_index) tp.set_cache(text) tp.meta = Generics(self) @@ -607,7 +615,15 @@ def set_audio( audio_payload_index: the zero-based index of the AudioPayload in this DataPack's AudioPayload entries. Defaults to 0. """ - ap = AudioPayload(self, audio_payload_index) + # temporary solution for backward compatibility + # past API use this method to add a single audio in the datapack + if len(self.audio_payloads) == 0 and audio_payload_index == 0: + from ft.onto.base_ontology import AudioPayload + + ap = AudioPayload(self, Modality.audio) + else: + ap = self.get_payload_at("audio", audio_payload_index) + ap.set_cache(audio) ap.meta = Generics(self) ap.meta.sample_rate = sample_rate @@ -625,7 +641,7 @@ def get_original_text(self, text_payload_index: int = 0): """ tp = self.get_payload_at("text", text_payload_index) original_text, _, _, _ = data_utils_io.modify_text_and_track_ops( - tp.cache, tp.get_meta("replace_back_operations") + tp.cache, tp.replace_back_operations ) return original_text @@ -705,16 +721,19 @@ def get_original_index( Returns: Original index that aligns with input_index """ - if len(self.__processed_original_spans) == 0: + processed_original_spans = self.get_payload_at( + "text", 0 + ).processed_original_spans + if len(processed_original_spans) == 0: return 
input_index - len_processed_text = len(self._text) + len_processed_text = len(self.get_payload_data_at("text", 0)) orig_index = None prev_end = 0 for ( inverse_span, original_span, - ) in self.__processed_original_spans: + ) in processed_original_spans: # check if the input_index lies between one of the unprocessed # spans if prev_end <= input_index < inverse_span.begin: @@ -743,9 +762,7 @@ def get_original_index( if orig_index is None: # check if the input_index lies between the last unprocessed # span - inverse_span, original_span = self.__processed_original_spans[ - -1 - ] + inverse_span, original_span = processed_original_spans[-1] if inverse_span.end <= input_index < len_processed_text: increment = original_span.end - inverse_span.end orig_index = input_index + increment @@ -1673,10 +1690,13 @@ def entry_setter(cls: Entry, value: Any, attr_name: str, field_type): if isinstance(entry, Payload): if entry.get_modality() == "text": + entry.set_payload_index(len(self.text_payloads)) self.text_payloads.append(entry) elif entry.get_modality() == "audio": + entry.set_payload_index(len(self.audio_payloads)) self.audio_payloads.append(entry) elif entry.get_modality() == "image": + entry.set_payload_index(len(self.image_payloads)) self.image_payloads.append(entry) def __del__(self): From 0baeb1b103d8f457ceab0177dfb845947d3bf48e Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:53:20 -0700 Subject: [PATCH 062/137] Payload serialization function --- forte/data/ontology/top.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 3a960b588..27631bb32 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -15,6 +15,7 @@ from enum import IntEnum from functools import total_ordering from typing import Optional, Tuple, Type, Any, Dict, Union, Iterable, List +from forte.data.modality import Modality import numpy as np @@ -872,7 
+873,9 @@ def __init__( self._width = width self._image_payload_idx = image_payload_idx super().__init__(pack) - self.img_arr = self.pack.get_image_array(self._image_payload_idx) + self.img_arr = self.pack.get_payload_data_at( + "image", self._image_payload_idx + ) self.c_h, self.c_w = ( self.img_arr.shape[0] // self._height, self.img_arr.shape[1] // self._width, @@ -1189,7 +1192,7 @@ def __init__( self, pack: PackType, modality: IntEnum, - payload_idx: int, + payload_idx: Optional[int] = None, uri: str = None, ): supported_modality = ("text", "audio", "image") @@ -1251,12 +1254,35 @@ def set_cache(self, data: Union[str, np.ndarray]): Load cache data into the payload. Args: - data: data to be set in the payload. + data: data to be set in the payload. It can be str for text data or + numpy array for audio or image data. """ self._cache = data + def set_payload_index(self, payload_index: int): + """ + Set payload index for the DataPack. + + Args: + payload_index: _description_ + """ + self._payload_idx = payload_index + + def __getstate__(self): + r""" + Convert ``_modality`` ``Enum`` object to str format for serialization. + """ + state = super().__getstate__() + state["_modality"] = self._modality.name + return state + + def __setstate__(self, state): + r""" + Convert ``_modality`` string to ``Enum`` object for deserialization. 
+ """ + super().__setstate__(state) + self._modality = getattr(Modality, state["_modality"]) -Modality = IntEnum("modality", "text audio image") SinglePackEntries = ( Link, From 39d663c8c5e409ae26e72d2f89ba1544af5f3555 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:53:45 -0700 Subject: [PATCH 063/137] Payload DataStore --- forte/data/data_store.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 50bbe9f5b..2b7c7c518 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -948,7 +948,14 @@ def _add_entry_raw( except KeyError: self.__elements[type_name] = SortedList(key=sorting_fn) self.__elements[type_name].add(entry) - elif entry_type in [Link, Group, Generics, ImageAnnotation, Grids]: + elif entry_type in [ + Link, + Group, + Generics, + ImageAnnotation, + Grids, + Payload, + ]: try: self.__elements[type_name].append(entry) except KeyError: From 10a942fb1329f17157892ce684ebdeaf6a8d2043 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:54:11 -0700 Subject: [PATCH 064/137] import Modality --- forte/data/readers/audio_reader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index 8fbc41ab3..18b72a8d0 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -20,7 +20,8 @@ from forte.data.data_pack import DataPack from forte.data.data_utils_io import dataset_path_iterator from forte.data.base_reader import PackReader -from forte.data.ontology.top import Generics, Modality +from forte.data.modality import Modality +from forte.data.ontology.top import Generics from ft.onto.base_ontology import AudioPayload __all__ = [ From 2277b175318c2a16def9e6120d5b97a2d8411646 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:54:51 -0700 Subject: [PATCH 065/137] payload ontology json --- 
forte/ontology_specs/base_ontology.json | 4 ---- 1 file changed, 4 deletions(-) diff --git a/forte/ontology_specs/base_ontology.json b/forte/ontology_specs/base_ontology.json index 8833388c1..d8b5092a3 100644 --- a/forte/ontology_specs/base_ontology.json +++ b/forte/ontology_specs/base_ontology.json @@ -450,10 +450,6 @@ "parent_entry": "forte.data.ontology.top.Payload", "description": "A payload that caches audio data", "attributes":[ - { - "name": "speaker", - "type": "str" - } ] }, { From 72ce2e1d0e6220beaed6378edc875a51d18215bf Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:55:12 -0700 Subject: [PATCH 066/137] payload ontology classes --- ft/onto/base_ontology.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/ft/onto/base_ontology.py b/ft/onto/base_ontology.py index 7e7d53396..5f05e9ece 100644 --- a/ft/onto/base_ontology.py +++ b/ft/onto/base_ontology.py @@ -8,7 +8,7 @@ """ from dataclasses import dataclass -from enum import Enum +from enum import IntEnum from forte.data.data_pack import DataPack from forte.data.multi_pack import MultiPack from forte.data.ontology.core import Entry @@ -603,17 +603,16 @@ def __init__(self, pack: DataPack, begin: int, end: int): class AudioPayload(Payload): """ A payload that caches audio data - Attributes: - speaker (Optional[str]): """ - speaker: Optional[str] - def __init__( - self, pack: DataPack, modality: Enum, payload_idx: int, uri: str = None + self, + pack: DataPack, + modality: IntEnum, + payload_idx: Optional[int] = None, + uri: str = None, ): super().__init__(pack, modality, payload_idx, uri) - self.speaker: Optional[str] = None @dataclass @@ -623,7 +622,11 @@ class TextPayload(Payload): """ def __init__( - self, pack: DataPack, modality: Enum, payload_idx: int, uri: str = None + self, + pack: DataPack, + modality: IntEnum, + payload_idx: Optional[int] = None, + uri: str = None, ): super().__init__(pack, modality, payload_idx, uri) @@ -635,6 +638,10 @@ 
class ImagePayload(Payload): """ def __init__( - self, pack: DataPack, modality: Enum, payload_idx: int, uri: str = None + self, + pack: DataPack, + modality: IntEnum, + payload_idx: Optional[int] = None, + uri: str = None, ): super().__init__(pack, modality, payload_idx, uri) From 960699bc9ad889b89176d953cadbada1836f2b99 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:56:01 -0700 Subject: [PATCH 067/137] remove Modality import --- tests/forte/data/audio_annotation_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/forte/data/audio_annotation_test.py b/tests/forte/data/audio_annotation_test.py index bd9bebfde..ae9a8f35f 100644 --- a/tests/forte/data/audio_annotation_test.py +++ b/tests/forte/data/audio_annotation_test.py @@ -16,6 +16,7 @@ """ import os import unittest +from forte.data.modality import Modality import numpy as np from typing import Dict, List @@ -32,7 +33,6 @@ Generics, Group, Link, - Modality, ) from ft.onto.base_ontology import ( Recording, From 1609a7b7ce3cbb165254f9338de9df03c2ff041b Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:56:41 -0700 Subject: [PATCH 068/137] revert changes in tests/forte/data/readers/audio_reader_test.py --- tests/forte/data/readers/audio_reader_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index 447a7095f..0bdb17981 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -20,7 +20,7 @@ from sunau import AUDIO_FILE_ENCODING_ADPCM_G721 import unittest from typing import Dict -from forte.data.ontology.top import Modality +from forte.data import Modality from torch import argmax from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC @@ -70,7 +70,7 @@ def _process(self, input_pack: DataPack): tp = TextPayload(input_pack, Modality.text, 0) tp.set_cache(transcription[0]) - # 
input_pack.set_text(text=transcription[0]) + input_pack.set_text(text=transcription[0]) class AudioReaderPipelineTest(unittest.TestCase): From 069dd766b3dfd7f0e776170acc186919fc532b24 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:57:13 -0700 Subject: [PATCH 069/137] update grid tests based on the new Payload --- tests/forte/grids_test.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/forte/grids_test.py b/tests/forte/grids_test.py index 24fd9c3c0..2c764ff79 100644 --- a/tests/forte/grids_test.py +++ b/tests/forte/grids_test.py @@ -14,7 +14,10 @@ """ Unit tests for Grids. """ +from tkinter import image_types import unittest +from forte.data.modality import Modality +from ft.onto.base_ontology import ImagePayload import numpy as np from numpy import array_equal @@ -34,7 +37,8 @@ def setUp(self): line[2, 2] = 1 line[3, 3] = 1 line[4, 4] = 1 - self.datapack.payloads.append(line) + ip = ImagePayload(self.datapack, Modality.image) + ip.set_cache(line) self.datapack.image_annotations.append( ImageAnnotation(self.datapack, 0) ) @@ -45,7 +49,11 @@ def setUp(self): self.zeros = np.zeros((6, 12)) self.ref_arr = np.zeros((6, 12)) self.ref_arr[2, 2] = 1 - self.datapack.payloads.append(self.ref_arr) + ip = ImagePayload(self.datapack, Modality.image) + ip.set_cache(self.ref_arr) + self.datapack.image_annotations.append( + ImageAnnotation(self.datapack, 0) + ) def test_grids(self): From 1a47ef68196994f101336b6762f41116238b9d19 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:57:42 -0700 Subject: [PATCH 070/137] update test based on new Payload for tests/forte/image_annotation_test.py --- tests/forte/image_annotation_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/forte/image_annotation_test.py b/tests/forte/image_annotation_test.py index 3ba37c4c3..874f89a4a 100644 --- a/tests/forte/image_annotation_test.py +++ b/tests/forte/image_annotation_test.py @@ -15,10 +15,11 
@@ Unit tests for ImageAnnotation. """ import unittest +from forte.data.modality import Modality import numpy as np from numpy import array_equal -from forte.data.ontology.top import ImageAnnotation, Modality +from forte.data.ontology.top import ImageAnnotation from ft.onto.base_ontology import ImagePayload @@ -51,7 +52,6 @@ def test_image_annotation(self): self.datapack.get_payload_at("image", 0).cache, self.line ) ) - self.datapack.image_payloads[0]._modality = None new_pack = DataPack.from_string(self.datapack.to_string()) self.assertEqual( new_pack.audio_annotations, self.datapack.audio_annotations From 451e08570bd153b4b0b10e958f7dccb65e0a18a9 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:58:16 -0700 Subject: [PATCH 071/137] update generics count due to its function as Payload Meta data --- tests/forte/pipeline_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/forte/pipeline_test.py b/tests/forte/pipeline_test.py index e16aaeda9..101db97c5 100644 --- a/tests/forte/pipeline_test.py +++ b/tests/forte/pipeline_test.py @@ -1055,7 +1055,7 @@ def test_empty_selector(self): ) ): # Because no packs are selected, we do not have any entries added. - self.assertTrue(pack.get_pack("pack").num_generics_entries == 0) + self.assertTrue(pack.get_pack("pack").num_generics_entries == 1) def test_caster_all_selector(self): """ From e26b37a175bd3f6e0a8835c6d4032659940c2ae3 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 16:58:36 -0700 Subject: [PATCH 072/137] modality class --- forte/data/modality.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 forte/data/modality.py diff --git a/forte/data/modality.py b/forte/data/modality.py new file mode 100644 index 000000000..6b3845e7f --- /dev/null +++ b/forte/data/modality.py @@ -0,0 +1,16 @@ +# Copyright 2022 The Forte Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from enum import IntEnum + +Modality = IntEnum("modality", "text audio image") From 97d39253b6d8f696643852d68025849109fa0c2d Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 17:05:16 -0700 Subject: [PATCH 073/137] revert changes in get_single() --- forte/data/base_pack.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index dafc78e3d..d4572cdf5 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -470,14 +470,12 @@ def get( """ raise NotImplementedError - def get_single( - self, entry_type: Union[str, Type[EntryType]], entry_index: int = 0 - ) -> EntryType: + def get_single(self, entry_type: Union[str, Type[EntryType]]) -> EntryType: r"""Take a single entry of type :attr:`~forte.data.data_pack.DataPack.entry_type` from this data - pack. This is useful when you want to take an entry at a specific index or the target entry type appears only one - time in the :class:`~forte.data.data_pack.DataPack` for e.g., a Document entry. - + pack. This is useful when the target entry type appears only one + time in the :class:`~forte.data.data_pack.DataPack` for e.g., a Document entry. Or you just + intended to take the first one. Args: entry_type: The entry type to be retrieved. entry_index: the index of the entry requested to get. 
From f5d44c2e7f4803ffa55d8c68597610192a57c4fb Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 17 Jun 2022 17:05:45 -0700 Subject: [PATCH 074/137] revert changes in get_single() --- forte/data/base_pack.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index d4572cdf5..4b7b212ad 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -478,25 +478,12 @@ def get_single(self, entry_type: Union[str, Type[EntryType]]) -> EntryType: intended to take the first one. Args: entry_type: The entry type to be retrieved. - entry_index: the index of the entry requested to get. - Returns: A single data entry. """ - idx = -1 - for idx, a in enumerate(self.get(entry_type)): - if idx == entry_index: - return a + for a in self.get(entry_type): + return a - if idx == -1: - raise EntryNotFoundError( - f"There is no {entry_type} in the provided pack." - ) - if idx < entry_index: - raise EntryNotFoundError( - f"The entry index {entry_index} is larger than maximum" - f" {entry_type} index ({idx}) in the provided pack." - ) raise EntryNotFoundError( f"The entry {entry_type} is not found in the provided pack." ) From f8a0ef63ec310e6a9a6859828a21cf35e7181792 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 08:12:18 -0700 Subject: [PATCH 075/137] pylint fix --- forte/data/base_pack.py | 1 - forte/data/data_pack.py | 24 ++++++++---------------- forte/data/ontology/top.py | 3 +-- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 4b7b212ad..4705cd842 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -13,7 +13,6 @@ # limitations under the License. 
import copy -from enum import IntEnum import gzip import pickle import uuid diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index dc9841482..6559ca79a 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -215,18 +215,6 @@ def __setstate__(self, state): self._entry_converter = EntryConverter() super().__setstate__(state) - # For backward compatibility. - if "replace_back_operations" in self.__dict__: - self.__replace_back_operations = self.__dict__.pop( - "replace_back_operations" - ) - if "processed_original_spans" in self.__dict__: - self.__processed_original_spans = self.__dict__.pop( - "processed_original_spans" - ) - if "orig_text_len" in self.__dict__: - self.__orig_text_len = self.__dict__.pop("orig_text_len") - self._index = DataIndex() self._index.update_basic_index(list(iter(self))) @@ -496,13 +484,13 @@ def get_payload_at( "Please provide one of modality among" f" {supported_modality}." ) - except IndexError: + except IndexError as e: raise ValueError( f"payload index ({payload_index}) " f"is larger or equal to {modality} payload list" f" length ({payloads_length}). " f"Please input a {modality} payload index less than it." 
- ) + ) from e return payload def get_payload_data_at( @@ -587,7 +575,9 @@ def set_text( # temporary solution for backward compatibility # past API use this method to add a single text in the datapack if len(self.text_payloads) == 0 and text_payload_index == 0: - from ft.onto.base_ontology import TextPayload + from ft.onto.base_ontology import ( + TextPayload, + ) # pylint: disable=import-outside-toplevel tp = TextPayload(self, Modality.text) else: @@ -618,7 +608,9 @@ def set_audio( # temporary solution for backward compatibility # past API use this method to add a single audio in the datapack if len(self.audio_payloads) == 0 and audio_payload_index == 0: - from ft.onto.base_ontology import AudioPayload + from ft.onto.base_ontology import ( + AudioPayload, + ) # pylint: disable=import-outside-toplevel ap = AudioPayload(self, Modality.audio) else: diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 27631bb32..5f0e1a610 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -15,10 +15,9 @@ from enum import IntEnum from functools import total_ordering from typing import Optional, Tuple, Type, Any, Dict, Union, Iterable, List -from forte.data.modality import Modality - import numpy as np +from forte.data.modality import Modality from forte.data.base_pack import PackType from forte.data.ontology.core import ( Entry, From cd26081f22ebac470131a8a861d41970a5cd8c38 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 08:16:12 -0700 Subject: [PATCH 076/137] pylint --- forte/data/data_pack.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 6559ca79a..180b6a5d2 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -575,9 +575,9 @@ def set_text( # temporary solution for backward compatibility # past API use this method to add a single text in the datapack if len(self.text_payloads) == 0 and text_payload_index == 0: - from 
ft.onto.base_ontology import ( + from ft.onto.base_ontology import ( # pylint: disable=import-outside-toplevel TextPayload, - ) # pylint: disable=import-outside-toplevel + ) tp = TextPayload(self, Modality.text) else: @@ -608,9 +608,9 @@ def set_audio( # temporary solution for backward compatibility # past API use this method to add a single audio in the datapack if len(self.audio_payloads) == 0 and audio_payload_index == 0: - from ft.onto.base_ontology import ( + from ft.onto.base_ontology import ( # pylint: disable=import-outside-toplevel AudioPayload, - ) # pylint: disable=import-outside-toplevel + ) ap = AudioPayload(self, Modality.audio) else: From 544a808db97d8ca07c9dd9bb2dba16e823c12b7b Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 09:42:16 -0700 Subject: [PATCH 077/137] pylint --- forte/data/data_pack.py | 8 +++----- forte/data/ontology/top.py | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 180b6a5d2..4bc77691c 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -449,9 +449,7 @@ def groups(self): def groups(self, val): self._groups = val - def get_payload_at( - self, modality: str, payload_index: int - ) -> Union[str, np.ndarray]: + def get_payload_at(self, modality: str, payload_index: int) -> Payload: """ Get Payload of requested modality at the requested payload index. @@ -1096,11 +1094,11 @@ def get_context_data( str: context data. 
""" if issubclass(c_type, Annotation): - return self.get_payload_at("text", payload_index).cache[ + return self.get_payload_data_at("text", payload_index)[ context.begin : context.end ] elif issubclass(c_type, AudioAnnotation): - return self.get_payload_at("audio", payload_index).cache[ + return self.get_payload_data_at("audio", payload_index)[ context.begin : context.end ] else: diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 5f0e1a610..4b02159b8 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1192,7 +1192,7 @@ def __init__( pack: PackType, modality: IntEnum, payload_idx: Optional[int] = None, - uri: str = None, + uri: Optional[str] = None, ): supported_modality = ("text", "audio", "image") if modality.name not in supported_modality: @@ -1200,15 +1200,15 @@ def __init__( f"The given modality {modality} is not supported. " f"Currently we only support {supported_modality}" ) - self._payload_idx = payload_idx - self._modality = modality - self._uri = uri + self._payload_idx: int = payload_idx + self._modality: IntEnum = modality + self._uri: str = uri super().__init__(pack) - self._cache = None - self.meta = None + self._cache: Optional[Union[str, np.ndarray]] = None + self.meta: Optional[Generics] = None - def get_type(self): + def get_type(self) -> type: """ Get the type of the payload class. @@ -1217,7 +1217,7 @@ def get_type(self): """ return type(self) - def get_modality(self): + def get_modality(self) -> str: """ Get the modality of the payload class. @@ -1227,7 +1227,7 @@ def get_modality(self): return self._modality.name @property - def cache(self): + def cache(self) -> Optional[Union[str, np.ndarray]]: if self._cache is None: raise ValueError( "Payload doesn't have a cache." 
@@ -1237,15 +1237,15 @@ def cache(self): return self._cache @property - def modality(self): + def modality(self) -> IntEnum: return self._modality @property - def payload_index(self): + def payload_index(self) -> int: return self._payload_idx @property - def uri(self): + def uri(self) -> str: return self._uri def set_cache(self, data: Union[str, np.ndarray]): From e71686855a4ee12cec6b493774648ae3105a3978 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 10:30:37 -0700 Subject: [PATCH 078/137] pylint --- forte/ontology_specs/base_ontology.json | 24 +++++++++++++++++++++--- ft/onto/base_ontology.py | 21 ++++++++++++++++++--- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/forte/ontology_specs/base_ontology.json b/forte/ontology_specs/base_ontology.json index d8b5092a3..85f9fbb08 100644 --- a/forte/ontology_specs/base_ontology.json +++ b/forte/ontology_specs/base_ontology.json @@ -449,14 +449,32 @@ "entry_name": "ft.onto.base_ontology.AudioPayload", "parent_entry": "forte.data.ontology.top.Payload", "description": "A payload that caches audio data", - "attributes":[ - ] + "attributes":[] }, { "entry_name": "ft.onto.base_ontology.TextPayload", "parent_entry": "forte.data.ontology.top.Payload", "description": "A payload that caches text data", - "attributes": [] + "attributes": [ + { + "name": "_cache", + "type": "str" + }, + { + "name": "replace_back_operations", + "type": "List", + "item_type": "typing.Tuple" + }, + { + "name": "processed_original_spans", + "type": "List", + "item_type": "typing.Tuple" + }, + { + "name": "orig_text_len", + "type": "int" + } + ] }, { "entry_name": "ft.onto.base_ontology.ImagePayload", diff --git a/ft/onto/base_ontology.py b/ft/onto/base_ontology.py index 5f05e9ece..45f7a0f3b 100644 --- a/ft/onto/base_ontology.py +++ b/ft/onto/base_ontology.py @@ -25,6 +25,7 @@ from typing import Iterable from typing import List from typing import Optional +from typing import Tuple __all__ = [ "Token", @@ -610,7 +611,7 @@ 
def __init__( pack: DataPack, modality: IntEnum, payload_idx: Optional[int] = None, - uri: str = None, + uri: Optional[str] = None, ): super().__init__(pack, modality, payload_idx, uri) @@ -619,16 +620,30 @@ def __init__( class TextPayload(Payload): """ A payload that caches text data + Attributes: + _cache (Optional[str]): + replace_back_operations (FList[Tuple]): + processed_original_spans (FList[Tuple]): + orig_text_len (Optional[int]): """ + _cache: Optional[str] + replace_back_operations: FList[Tuple] + processed_original_spans: FList[Tuple] + orig_text_len: Optional[int] + def __init__( self, pack: DataPack, modality: IntEnum, payload_idx: Optional[int] = None, - uri: str = None, + uri: Optional[str] = None, ): super().__init__(pack, modality, payload_idx, uri) + self._cache: Optional[str] = None + self.replace_back_operations: FList[Tuple] = FList(self) + self.processed_original_spans: FList[Tuple] = FList(self) + self.orig_text_len: Optional[int] = None @dataclass @@ -642,6 +657,6 @@ def __init__( pack: DataPack, modality: IntEnum, payload_idx: Optional[int] = None, - uri: str = None, + uri: Optional[str] = None, ): super().__init__(pack, modality, payload_idx, uri) From 1496b40683bc3dc4c7c84162899d26756eb24ffe Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 10:56:19 -0700 Subject: [PATCH 079/137] pylint --- forte/data/ontology/core.py | 3 +-- forte/ontology_specs/base_ontology.json | 4 ++-- ft/onto/base_ontology.py | 13 ++++++------- tests/forte/image_annotation_test.py | 4 ++-- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/forte/data/ontology/core.py b/forte/data/ontology/core.py index d30d3f3c6..c20889138 100644 --- a/forte/data/ontology/core.py +++ b/forte/data/ontology/core.py @@ -147,8 +147,7 @@ def get_state_func(instance): _pointer_keys[key] = True else: _pointer_keys[key] = False - if "_Entry__pack" in state: - state.pop("_Entry__pack") + # state.pop("_Entry__pack") return state diff --git 
a/forte/ontology_specs/base_ontology.json b/forte/ontology_specs/base_ontology.json index 85f9fbb08..05c260971 100644 --- a/forte/ontology_specs/base_ontology.json +++ b/forte/ontology_specs/base_ontology.json @@ -462,12 +462,12 @@ }, { "name": "replace_back_operations", - "type": "List", + "type": "typing.List", "item_type": "typing.Tuple" }, { "name": "processed_original_spans", - "type": "List", + "type": "typing.List", "item_type": "typing.Tuple" }, { diff --git a/ft/onto/base_ontology.py b/ft/onto/base_ontology.py index 45f7a0f3b..7275e69d5 100644 --- a/ft/onto/base_ontology.py +++ b/ft/onto/base_ontology.py @@ -25,7 +25,6 @@ from typing import Iterable from typing import List from typing import Optional -from typing import Tuple __all__ = [ "Token", @@ -622,14 +621,14 @@ class TextPayload(Payload): A payload that caches text data Attributes: _cache (Optional[str]): - replace_back_operations (FList[Tuple]): - processed_original_spans (FList[Tuple]): + replace_back_operations (Optional[List]): + processed_original_spans (Optional[List]): orig_text_len (Optional[int]): """ _cache: Optional[str] - replace_back_operations: FList[Tuple] - processed_original_spans: FList[Tuple] + replace_back_operations: Optional[List] + processed_original_spans: Optional[List] orig_text_len: Optional[int] def __init__( @@ -641,8 +640,8 @@ def __init__( ): super().__init__(pack, modality, payload_idx, uri) self._cache: Optional[str] = None - self.replace_back_operations: FList[Tuple] = FList(self) - self.processed_original_spans: FList[Tuple] = FList(self) + self.replace_back_operations: Optional[List] = None + self.processed_original_spans: Optional[List] = None self.orig_text_len: Optional[int] = None diff --git a/tests/forte/image_annotation_test.py b/tests/forte/image_annotation_test.py index 874f89a4a..e7a192f9c 100644 --- a/tests/forte/image_annotation_test.py +++ b/tests/forte/image_annotation_test.py @@ -40,11 +40,11 @@ def setUp(self): self.line[4, 4] = 1 ip = 
ImagePayload(self.datapack, Modality.image, 0) ip.set_cache(self.line) - ImageAnnotation(self.datapack, 0) + ImageAnnotation(self.datapack) def test_image_annotation(self): self.assertEqual( - self.datapack.get_single(ImageAnnotation, 0).image_payload_idx, 0 + self.datapack.get_single(ImageAnnotation).image_payload_idx, 0 ) self.assertTrue( From 32e770b984d3a32c65e6a10fec09c5c886fc19b7 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 11:45:30 -0700 Subject: [PATCH 080/137] pylint --- forte/data/data_pack.py | 10 +++++++--- forte/data/ontology/top.py | 6 +++--- forte/data/readers/audio_reader.py | 2 +- forte/ontology_specs/base_ontology.json | 7 ++++++- forte/processors/ir/bert/bert_based_query_creator.py | 8 ++++---- ft/onto/base_ontology.py | 5 +++++ tests/forte/data/readers/audio_reader_test.py | 2 +- .../processors/ir/bert_based_query_creator_test.py | 6 +++--- 8 files changed, 30 insertions(+), 16 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 4bc77691c..44d5a9e72 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -28,6 +28,8 @@ Tuple, ) from functools import partial +import typing + from typing_inspect import get_origin from packaging.version import Version @@ -232,7 +234,7 @@ def _validate(self, entry: EntryType) -> bool: return isinstance(entry, SinglePackEntries) @property - def text(self) -> str: + def text(self) -> Union[str, Any, None]: """ Get text from a text payload at an index. @@ -449,7 +451,9 @@ def groups(self): def groups(self, val): self._groups = val - def get_payload_at(self, modality: str, payload_index: int) -> Payload: + def get_payload_at( + self, modality: str, payload_index: int + ): # -> Union[TextPayload, AudioPayload, ImagePayload]: """ Get Payload of requested modality at the requested payload index. 
@@ -616,7 +620,7 @@ def set_audio( ap.set_cache(audio) ap.meta = Generics(self) - ap.meta.sample_rate = sample_rate + ap.sample_rate = sample_rate def get_original_text(self, text_payload_index: int = 0): r"""Get original unmodified text from the :class:`~forte.data.data_pack.DataPack` object. diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 4b02159b8..9993ec367 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1200,9 +1200,9 @@ def __init__( f"The given modality {modality} is not supported. " f"Currently we only support {supported_modality}" ) - self._payload_idx: int = payload_idx + self._payload_idx: Optional[int] = payload_idx self._modality: IntEnum = modality - self._uri: str = uri + self._uri: Optional[str] = uri super().__init__(pack) self._cache: Optional[Union[str, np.ndarray]] = None @@ -1227,7 +1227,7 @@ def get_modality(self) -> str: return self._modality.name @property - def cache(self) -> Optional[Union[str, np.ndarray]]: + def cache(self) -> str: # Union[str, np.ndarray]: if self._cache is None: raise ValueError( "Payload doesn't have a cache." 
diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index 18b72a8d0..69483b92c 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -76,7 +76,7 @@ def _parse_pack(self, file_path: str) -> Iterator[DataPack]: audio_data, sample_rate = self.soundfile.read(file_path) ap.set_cache(audio_data) ap.meta = Generics(pack) - ap.meta.sample_rate = sample_rate + ap.sample_rate = sample_rate pack.pack_name = file_path yield pack diff --git a/forte/ontology_specs/base_ontology.json b/forte/ontology_specs/base_ontology.json index 05c260971..7ad72a8a8 100644 --- a/forte/ontology_specs/base_ontology.json +++ b/forte/ontology_specs/base_ontology.json @@ -449,7 +449,12 @@ "entry_name": "ft.onto.base_ontology.AudioPayload", "parent_entry": "forte.data.ontology.top.Payload", "description": "A payload that caches audio data", - "attributes":[] + "attributes":[ + { + "name": "sample_rate", + "type": "int" + } + ] }, { "entry_name": "ft.onto.base_ontology.TextPayload", diff --git a/forte/processors/ir/bert/bert_based_query_creator.py b/forte/processors/ir/bert/bert_based_query_creator.py index cb73b6113..24caae675 100644 --- a/forte/processors/ir/bert/bert_based_query_creator.py +++ b/forte/processors/ir/bert/bert_based_query_creator.py @@ -45,10 +45,10 @@ def __init__(self) -> None: def initialize(self, resources: Resources, configs: Config): self.resource = resources self.config = configs - - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu" - ) + self.device = "cpu" + # self.device = torch.device( + # "cuda" if torch.cuda.is_available() else "cpu" + # ) try: from texar.torch.data import ( # pylint: disable=import-outside-toplevel diff --git a/ft/onto/base_ontology.py b/ft/onto/base_ontology.py index 7275e69d5..456b88e8f 100644 --- a/ft/onto/base_ontology.py +++ b/ft/onto/base_ontology.py @@ -603,8 +603,12 @@ def __init__(self, pack: DataPack, begin: int, end: int): class 
AudioPayload(Payload): """ A payload that caches audio data + Attributes: + sample_rate (Optional[int]): """ + sample_rate: Optional[int] + def __init__( self, pack: DataPack, @@ -613,6 +617,7 @@ def __init__( uri: Optional[str] = None, ): super().__init__(pack, modality, payload_idx, uri) + self.sample_rate: Optional[int] = None @dataclass diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index 0bdb17981..05d722c09 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -49,7 +49,7 @@ def initialize(self, resources: Resources, configs: Config): def _process(self, input_pack: DataPack): ap = input_pack.get_payload_at("audio", 0) - sample_rate = ap.meta.sample_rate + sample_rate = ap.sample_rate audio_data = ap.cache required_sample_rate: int = 16000 if sample_rate != required_sample_rate: diff --git a/tests/forte/processors/ir/bert_based_query_creator_test.py b/tests/forte/processors/ir/bert_based_query_creator_test.py index dc722f4af..318a046d5 100644 --- a/tests/forte/processors/ir/bert_based_query_creator_test.py +++ b/tests/forte/processors/ir/bert_based_query_creator_test.py @@ -63,7 +63,7 @@ def test_pipeline(self, texts): for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)): query_pack: DataPack = m_pack.get_pack("query") - self.assertEqual(query_pack.num_generics_entries, 1) - self.assertIsInstance(query_pack.generics[0], Query) - query = query_pack.generics[0].value + self.assertEqual(query_pack.num_generics_entries, 2) + self.assertIsInstance(query_pack.generics[1], Query) + query = query_pack.generics[1].value self.assertEqual(query.shape, (1, 768)) From f71782d0ede999df493b580ebc8ceeb38780d649 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 11:54:12 -0700 Subject: [PATCH 081/137] pylint --- forte/data/data_pack.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/forte/data/data_pack.py 
b/forte/data/data_pack.py index 44d5a9e72..c69e6f758 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -28,8 +28,6 @@ Tuple, ) from functools import partial -import typing - from typing_inspect import get_origin from packaging.version import Version From 07a4ab2a7ad5cd141838b241ffbadfbbbdc7b4b3 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 16:18:34 -0700 Subject: [PATCH 082/137] mypy --- forte/data/data_pack.py | 14 ++++++++++---- forte/data/ontology/top.py | 11 ++++++----- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index c69e6f758..f6c0d92f2 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -26,6 +26,7 @@ Set, Callable, Tuple, + cast, ) from functools import partial @@ -232,7 +233,7 @@ def _validate(self, entry: EntryType) -> bool: return isinstance(entry, SinglePackEntries) @property - def text(self) -> Union[str, Any, None]: + def text(self) -> str: """ Get text from a text payload at an index. @@ -495,7 +496,7 @@ def get_payload_at( def get_payload_data_at( self, modality: str, payload_index: int - ) -> Union[str, np.ndarray]: + ) -> Union[str, np.ndarray[Any, Any]]: """ Get Payload of requested modality at the requested payload index. @@ -526,7 +527,9 @@ def get_span_text( Returns: The text within this span. """ - return self.get_payload_data_at("text", text_payload_index)[begin:end] + return cast(str, self.get_payload_data_at("text", text_payload_index))[ + begin:end + ] def get_span_audio( self, begin: int, end: int, audio_payload_index=0 @@ -545,7 +548,10 @@ def get_span_audio( Returns: The audio within this span. 
""" - return self.get_payload_data_at("audio", audio_payload_index)[begin:end] + return cast( + np.ndarray, + self.get_payload_data_at("audio", audio_payload_index)[begin:end], + ) def set_text( self, diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 9993ec367..97883c938 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1191,7 +1191,7 @@ def __init__( self, pack: PackType, modality: IntEnum, - payload_idx: Optional[int] = None, + payload_idx: int = 0, uri: Optional[str] = None, ): supported_modality = ("text", "audio", "image") @@ -1200,12 +1200,13 @@ def __init__( f"The given modality {modality} is not supported. " f"Currently we only support {supported_modality}" ) - self._payload_idx: Optional[int] = payload_idx + self._payload_idx: int = payload_idx self._modality: IntEnum = modality self._uri: Optional[str] = uri super().__init__(pack) - self._cache: Optional[Union[str, np.ndarray]] = None + # self._cache: Optional[Union[str, np.ndarray]] = None + self._cache: Union[str, np.ndarray] = "" self.meta: Optional[Generics] = None def get_type(self) -> type: @@ -1227,7 +1228,7 @@ def get_modality(self) -> str: return self._modality.name @property - def cache(self) -> str: # Union[str, np.ndarray]: + def cache(self) -> Union[str, np.ndarray]: if self._cache is None: raise ValueError( "Payload doesn't have a cache." 
@@ -1245,7 +1246,7 @@ def payload_index(self) -> int: return self._payload_idx @property - def uri(self) -> str: + def uri(self) -> Optional[str]: return self._uri def set_cache(self, data: Union[str, np.ndarray]): From 4e7cac54a7de381b6163bf2cbee0cf4082b20234 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 16:26:19 -0700 Subject: [PATCH 083/137] update base ontology --- ft/onto/base_ontology.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ft/onto/base_ontology.py b/ft/onto/base_ontology.py index 456b88e8f..8d3e3bd41 100644 --- a/ft/onto/base_ontology.py +++ b/ft/onto/base_ontology.py @@ -613,7 +613,7 @@ def __init__( self, pack: DataPack, modality: IntEnum, - payload_idx: Optional[int] = None, + payload_idx: int = 0, uri: Optional[str] = None, ): super().__init__(pack, modality, payload_idx, uri) @@ -640,7 +640,7 @@ def __init__( self, pack: DataPack, modality: IntEnum, - payload_idx: Optional[int] = None, + payload_idx: int = 0, uri: Optional[str] = None, ): super().__init__(pack, modality, payload_idx, uri) @@ -660,7 +660,7 @@ def __init__( self, pack: DataPack, modality: IntEnum, - payload_idx: Optional[int] = None, + payload_idx: int = 0, uri: Optional[str] = None, ): super().__init__(pack, modality, payload_idx, uri) From 90c66f104daf3e3663b2a73950f131af6cdbe819 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 16:49:35 -0700 Subject: [PATCH 084/137] pylint --- forte/data/data_pack.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index f6c0d92f2..363023c28 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -496,13 +496,13 @@ def get_payload_at( def get_payload_data_at( self, modality: str, payload_index: int - ) -> Union[str, np.ndarray[Any, Any]]: + ) -> Union[str, np.ndarray]: """ Get Payload of requested modality at the requested payload index. 
Args: modality: data modality among "text", "audio", "image" - payload_index (int): the zero-based index of the Payload + payload_index: the zero-based index of the Payload in this DataPack's Payload entries of the requested modality. Raises: From 10f2b1477cc9ed21eee54bb818f921ff491eb7c7 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 17:22:18 -0700 Subject: [PATCH 085/137] remove enum installation as it's built-in --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c9eba44dd..34dda65d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,3 @@ dataclasses~=0.7; python_version <'3.7' importlib-resources==5.1.4;python_version<'3.7' packaging~=21.2 asyml-utilities -enum From 94f180f49fb93f02b799f6ba7a941c9b8ee839e0 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 18:05:44 -0700 Subject: [PATCH 086/137] fix doc build --- forte/data/data_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 2b7c7c518..e4e485126 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -771,6 +771,7 @@ def _new_payload( r"""This function generates a new payload with default fields. Called by add_payload_raw() to create a new payload with ``type_name``, ``payload_idx``, and ``modality``. + Args: type_name: The fully qualified type name of the new entry. payload_idx: the zero-based index of the TextPayload @@ -1166,11 +1167,11 @@ def add_payload_raw( tid: Optional[int] = None, allow_duplicate=True, ) -> int: - r""" This function adds an payload entry with ``payload_idx`` and modality to current data store object. Returns the ``tid`` for the inserted entry. + Args: type_name: The fully qualified type name of the new Payload. 
payload_idx: the zero-based index of the Payload From 259a9a58f44f0ce58adb9acbc94fd9a94a4953c5 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 20 Jun 2022 18:14:31 -0700 Subject: [PATCH 087/137] fix spelling --- forte/data/base_pack.py | 1 + forte/data/data_pack.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 4705cd842..a8fb1fa62 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -475,6 +475,7 @@ def get_single(self, entry_type: Union[str, Type[EntryType]]) -> EntryType: pack. This is useful when the target entry type appears only one time in the :class:`~forte.data.data_pack.DataPack` for e.g., a Document entry. Or you just intended to take the first one. + Args: entry_type: The entry type to be retrieved. Returns: diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 363023c28..ecd551c7c 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -986,7 +986,7 @@ def get_data( data from the (`offset` + 1)th instance. payload_index: the zero-based index of the Payload in this DataPack's Payload entries of a particular modality. - The modality is depedent on ``context_type``. Defaults to 0. + The modality is dependent on ``context_type``. Defaults to 0. Returns: A data generator, which generates one piece of data (a dict @@ -1091,7 +1091,7 @@ def get_context_data( contains data to be extracted. payload_index: the zero-based index of the Payload in this DataPack's Payload entries of a particular modality. - The modality is depedent on ``c_type``. + The modality is dependent on ``c_type``. Defaults to 0. 
Raises: From 871e02c89e439a4cd25264d6159c2d1f4a32952b Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 21 Jun 2022 14:43:20 -0700 Subject: [PATCH 088/137] add enum34 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 34dda65d1..c1a9c73e7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ dataclasses~=0.7; python_version <'3.7' importlib-resources==5.1.4;python_version<'3.7' packaging~=21.2 asyml-utilities +enum34 From eff96fce7e63fa2b09dfcd2f032c710927d27c9d Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 21 Jun 2022 14:44:15 -0700 Subject: [PATCH 089/137] remove meta data for payloads --- forte/data/data_pack.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index ecd551c7c..8ba4b7872 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -585,12 +585,11 @@ def set_text( TextPayload, ) - tp = TextPayload(self, Modality.text) + tp = TextPayload(self, Modality.text, text_payload_index) else: tp = self.get_payload_at("text", text_payload_index) tp.set_cache(text) - tp.meta = Generics(self) tp.replace_back_operations = replace_back_operations tp.processed_original_spans = processed_original_spans @@ -623,7 +622,6 @@ def set_audio( ap = self.get_payload_at("audio", audio_payload_index) ap.set_cache(audio) - ap.meta = Generics(self) ap.sample_rate = sample_rate def get_original_text(self, text_payload_index: int = 0): @@ -631,7 +629,7 @@ def get_original_text(self, text_payload_index: int = 0): Args: text_payload_index: the zero-based index of the TextPayload - in this DataPack's TextPayload entries. Defaults to 0. + in this DataPack's entries. Defaults to 0. 
Returns: Original text after applying the `replace_back_operations` of From 8ba49f7009d2f52a3f122694d156fe375c8d9a00 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 21 Jun 2022 14:45:53 -0700 Subject: [PATCH 090/137] revert empty ontology initialization change --- forte/data/ontology/core.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/forte/data/ontology/core.py b/forte/data/ontology/core.py index c20889138..a7516cb0f 100644 --- a/forte/data/ontology/core.py +++ b/forte/data/ontology/core.py @@ -147,7 +147,7 @@ def get_state_func(instance): _pointer_keys[key] = True else: _pointer_keys[key] = False - # state.pop("_Entry__pack") + state.pop("_Entry__pack") return state @@ -188,9 +188,8 @@ def __init__(self, pack: ContainerType): self.__pack: ContainerType = pack self._tid: int = uuid.uuid4().int self._embedding: np.ndarray = np.empty(0) - if pack is not None: - self.pack._validate(self) - self.pack.on_entry_creation(self) + self.pack._validate(self) + self.pack.on_entry_creation(self) def __getstate__(self): r"""In serialization, the pack is not serialize, and it will be set From 628887bf38760058382bc5746d29eb87d8d4399a Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 21 Jun 2022 14:48:10 -0700 Subject: [PATCH 091/137] adjust generics count and testing state.pop(_entry__pack) --- forte/data/ontology/top.py | 5 +++-- forte/data/readers/audio_reader.py | 1 - forte/ontology_specs/base_ontology.json | 21 +------------------ ft/onto/base_ontology.py | 14 ------------- tests/forte/data/audio_annotation_test.py | 2 +- tests/forte/pipeline_test.py | 2 +- .../ir/bert_based_query_creator_test.py | 6 +++--- 7 files changed, 9 insertions(+), 42 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 97883c938..e363a06b6 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1205,9 +1205,10 @@ def __init__( self._uri: Optional[str] = uri super().__init__(pack) - # self._cache: 
Optional[Union[str, np.ndarray]] = None self._cache: Union[str, np.ndarray] = "" - self.meta: Optional[Generics] = None + self.replace_back_operations: List[Tuple] = [] + self.processed_original_spans: List[Tuple] = [] + self.orig_text_len: int = 0 def get_type(self) -> type: """ diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index 69483b92c..cfbe32393 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -75,7 +75,6 @@ def _parse_pack(self, file_path: str) -> Iterator[DataPack]: if not self.configs.lazy_read: audio_data, sample_rate = self.soundfile.read(file_path) ap.set_cache(audio_data) - ap.meta = Generics(pack) ap.sample_rate = sample_rate pack.pack_name = file_path yield pack diff --git a/forte/ontology_specs/base_ontology.json b/forte/ontology_specs/base_ontology.json index 7ad72a8a8..f3d854301 100644 --- a/forte/ontology_specs/base_ontology.json +++ b/forte/ontology_specs/base_ontology.json @@ -460,26 +460,7 @@ "entry_name": "ft.onto.base_ontology.TextPayload", "parent_entry": "forte.data.ontology.top.Payload", "description": "A payload that caches text data", - "attributes": [ - { - "name": "_cache", - "type": "str" - }, - { - "name": "replace_back_operations", - "type": "typing.List", - "item_type": "typing.Tuple" - }, - { - "name": "processed_original_spans", - "type": "typing.List", - "item_type": "typing.Tuple" - }, - { - "name": "orig_text_len", - "type": "int" - } - ] + "attributes": [] }, { "entry_name": "ft.onto.base_ontology.ImagePayload", diff --git a/ft/onto/base_ontology.py b/ft/onto/base_ontology.py index 8d3e3bd41..281dfd82f 100644 --- a/ft/onto/base_ontology.py +++ b/ft/onto/base_ontology.py @@ -624,18 +624,8 @@ def __init__( class TextPayload(Payload): """ A payload that caches text data - Attributes: - _cache (Optional[str]): - replace_back_operations (Optional[List]): - processed_original_spans (Optional[List]): - orig_text_len (Optional[int]): """ - _cache: 
Optional[str] - replace_back_operations: Optional[List] - processed_original_spans: Optional[List] - orig_text_len: Optional[int] - def __init__( self, pack: DataPack, @@ -644,10 +634,6 @@ def __init__( uri: Optional[str] = None, ): super().__init__(pack, modality, payload_idx, uri) - self._cache: Optional[str] = None - self.replace_back_operations: Optional[List] = None - self.processed_original_spans: Optional[List] = None - self.orig_text_len: Optional[int] = None @dataclass diff --git a/tests/forte/data/audio_annotation_test.py b/tests/forte/data/audio_annotation_test.py index ae9a8f35f..8871bfe8e 100644 --- a/tests/forte/data/audio_annotation_test.py +++ b/tests/forte/data/audio_annotation_test.py @@ -247,7 +247,7 @@ def test_audio_annotation(self): len(self._test_configs), ) # we have one generics meta data - self.assertEqual(len(list(recordings[0].get(Generics))), 1) + self.assertEqual(len(list(recordings[0].get(Generics))), 0) # Check operations with mixing types of entries. self.assertEqual(len(list(pack.get(Utterance))), 1) diff --git a/tests/forte/pipeline_test.py b/tests/forte/pipeline_test.py index 101db97c5..e16aaeda9 100644 --- a/tests/forte/pipeline_test.py +++ b/tests/forte/pipeline_test.py @@ -1055,7 +1055,7 @@ def test_empty_selector(self): ) ): # Because no packs are selected, we do not have any entries added. 
- self.assertTrue(pack.get_pack("pack").num_generics_entries == 1) + self.assertTrue(pack.get_pack("pack").num_generics_entries == 0) def test_caster_all_selector(self): """ diff --git a/tests/forte/processors/ir/bert_based_query_creator_test.py b/tests/forte/processors/ir/bert_based_query_creator_test.py index 318a046d5..dc722f4af 100644 --- a/tests/forte/processors/ir/bert_based_query_creator_test.py +++ b/tests/forte/processors/ir/bert_based_query_creator_test.py @@ -63,7 +63,7 @@ def test_pipeline(self, texts): for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)): query_pack: DataPack = m_pack.get_pack("query") - self.assertEqual(query_pack.num_generics_entries, 2) - self.assertIsInstance(query_pack.generics[1], Query) - query = query_pack.generics[1].value + self.assertEqual(query_pack.num_generics_entries, 1) + self.assertIsInstance(query_pack.generics[0], Query) + query = query_pack.generics[0].value self.assertEqual(query.shape, (1, 768)) From 0fc42be1ba2f497981b52bdb968868e156e5b77f Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 22 Jun 2022 12:49:37 -0700 Subject: [PATCH 092/137] temporary fix --- forte/data/data_pack.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 8ba4b7872..14bf711a2 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -215,6 +215,10 @@ def __setstate__(self, state): self._entry_converter = EntryConverter() super().__setstate__(state) + for payload in ( + self.text_payloads + self.audio_payloads + self.image_payloads + ): + payload.set_pack(self) self._index = DataIndex() self._index.update_basic_index(list(iter(self))) From c62f6e7611eebc492ce296d8f52331892b63b066 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 22 Jun 2022 12:54:49 -0700 Subject: [PATCH 093/137] remove unused Generics --- forte/data/readers/audio_reader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/forte/data/readers/audio_reader.py 
b/forte/data/readers/audio_reader.py index cfbe32393..ad26169b0 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -21,7 +21,6 @@ from forte.data.data_utils_io import dataset_path_iterator from forte.data.base_reader import PackReader from forte.data.modality import Modality -from forte.data.ontology.top import Generics from ft.onto.base_ontology import AudioPayload __all__ = [ From f6c3863595a8aa57862aa7c52d2d3da4322cece8 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 24 Jun 2022 08:59:19 -0700 Subject: [PATCH 094/137] capitalize first character in Enum --- forte/data/modality.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forte/data/modality.py b/forte/data/modality.py index 6b3845e7f..e2a682066 100644 --- a/forte/data/modality.py +++ b/forte/data/modality.py @@ -13,4 +13,4 @@ # limitations under the License. from enum import IntEnum -Modality = IntEnum("modality", "text audio image") +Modality = IntEnum("modality", "Text Audio Image") From 90aa217eb9e85844e4563307ab44ee43bf7b4278 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 24 Jun 2022 10:33:00 -0700 Subject: [PATCH 095/137] fix docstring for text() and change modality from str format to IntEnum --- forte/data/data_pack.py | 82 ++++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 38 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 14bf711a2..f37690764 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from enum import IntEnum import logging from pathlib import Path from typing import ( @@ -239,7 +240,7 @@ def _validate(self, entry: EntryType) -> bool: @property def text(self) -> str: """ - Get text from a text payload at an index. + Get the first text data stored in the DataPack. Args: text_payload_index: the index of the text payload. 
Defaults to 0. @@ -251,13 +252,7 @@ def text(self) -> str: Returns: text data in the text payload. """ - try: - tp = self.get_payload_at("text", 0) - except ValueError: - # backward compatibility, there might be case there is - # not payloads - return "" - return tp.cache + return self.get_payload_data_at(Modality.Text, 0) @property def all_annotations(self) -> Iterator[Annotation]: @@ -455,51 +450,55 @@ def groups(self, val): self._groups = val def get_payload_at( - self, modality: str, payload_index: int + self, modality: IntEnum, payload_index: int ): # -> Union[TextPayload, AudioPayload, ImagePayload]: """ Get Payload of requested modality at the requested payload index. Args: modality: data modality among "text", "audio", "image" - payload_index (int): the zero-based index of the Payload + payload_index: the zero-based index of the Payload in this DataPack's Payload entries of the requested modality. Raises: ValueError: raised when the requested modality is not supported. Returns: - Payload entry containing text data or numpy array for image and - audio data. + Payload entry containing text data, image or audio data. + """ - supported_modality = ("text", "audio", "image") + supported_modality = ("Text", "Audio", "Image") + try: - if modality == "text": + # if modality.name == "text": + if modality == Modality.Text: payloads_length = len(self.text_payloads) payload = self.text_payloads[payload_index] - elif modality == "audio": + # elif modality.name == "audio": + elif modality == Modality.Audio: payloads_length = len(self.audio_payloads) payload = self.audio_payloads[payload_index] - elif modality == "image": + # elif modality.name == "image": + elif modality == Modality.Image: payloads_length = len(self.image_payloads) payload = self.image_payloads[payload_index] else: raise ValueError( - f"Provided modality {modality} is not supported." + f"Provided modality {modality.name} is not supported." "Please provide one of modality among" f" {supported_modality}." 
) except IndexError as e: raise ValueError( f"payload index ({payload_index}) " - f"is larger or equal to {modality} payload list" + f"is larger or equal to {modality.name} payload list" f" length ({payloads_length}). " - f"Please input a {modality} payload index less than it." + f"Please input a {modality.name} payload index less than it." ) from e return payload def get_payload_data_at( - self, modality: str, payload_index: int + self, modality: IntEnum, payload_index: int ) -> Union[str, np.ndarray]: """ Get Payload of requested modality at the requested payload index. @@ -513,7 +512,12 @@ def get_payload_data_at( ValueError: raised when the requested modality is not supported. Returns: - str data for text data or numpy array for image and audio data. + different data types for different data modalities. + + 1. str data for text data. + + 2. Numpy array for image and audio data. + """ return self.get_payload_at(modality, payload_index).cache @@ -531,9 +535,9 @@ def get_span_text( Returns: The text within this span. 
""" - return cast(str, self.get_payload_data_at("text", text_payload_index))[ - begin:end - ] + return cast( + str, self.get_payload_data_at(Modality.Text, text_payload_index) + )[begin:end] def get_span_audio( self, begin: int, end: int, audio_payload_index=0 @@ -554,7 +558,9 @@ def get_span_audio( """ return cast( np.ndarray, - self.get_payload_data_at("audio", audio_payload_index)[begin:end], + self.get_payload_data_at(Modality.Audio, audio_payload_index)[ + begin:end + ], ) def set_text( @@ -589,9 +595,9 @@ def set_text( TextPayload, ) - tp = TextPayload(self, Modality.text, text_payload_index) + tp = TextPayload(self, Modality.Text, text_payload_index) else: - tp = self.get_payload_at("text", text_payload_index) + tp = self.get_payload_at(Modality.Text, text_payload_index) tp.set_cache(text) @@ -621,9 +627,9 @@ def set_audio( AudioPayload, ) - ap = AudioPayload(self, Modality.audio) + ap = AudioPayload(self, Modality.Audio) else: - ap = self.get_payload_at("audio", audio_payload_index) + ap = self.get_payload_at(Modality.Audio, audio_payload_index) ap.set_cache(audio) ap.sample_rate = sample_rate @@ -639,7 +645,7 @@ def get_original_text(self, text_payload_index: int = 0): Original text after applying the `replace_back_operations` of :class:`~forte.data.data_pack.DataPack` object to the modified text """ - tp = self.get_payload_at("text", text_payload_index) + tp = self.get_payload_at(Modality.Text, text_payload_index) original_text, _, _, _ = data_utils_io.modify_text_and_track_ops( tp.cache, tp.replace_back_operations ) @@ -722,12 +728,12 @@ def get_original_index( Original index that aligns with input_index """ processed_original_spans = self.get_payload_at( - "text", 0 + Modality.Text, 0 ).processed_original_spans if len(processed_original_spans) == 0: return input_index - len_processed_text = len(self.get_payload_data_at("text", 0)) + len_processed_text = len(self.get_payload_data_at(Modality.Text, 0)) orig_index = None prev_end = 0 for ( @@ -845,7 +851,7 
@@ def __add_entry_with_check(self, entry: Union[EntryType, int]) -> EntryType: f"is not a valid begin." ) - if end > len(self.get_payload_at("text", 0).cache): + if end > len(self.get_payload_data_at(Modality.Text, 0)): if len(self.text) == 0: raise ValueError( f"The end {end} of span is greater than the text " @@ -1104,11 +1110,11 @@ def get_context_data( str: context data. """ if issubclass(c_type, Annotation): - return self.get_payload_data_at("text", payload_index)[ + return self.get_payload_data_at(Modality.Text, payload_index)[ context.begin : context.end ] elif issubclass(c_type, AudioAnnotation): - return self.get_payload_data_at("audio", payload_index)[ + return self.get_payload_data_at(Modality.Audio, payload_index)[ context.begin : context.end ] else: @@ -1689,13 +1695,13 @@ def entry_setter(cls: Entry, value: Any, attr_name: str, field_type): self._pending_entries[entry.tid] = entry.tid, c if isinstance(entry, Payload): - if entry.get_modality() == "text": + if entry.get_modality() == Modality.Text: entry.set_payload_index(len(self.text_payloads)) self.text_payloads.append(entry) - elif entry.get_modality() == "audio": + elif entry.get_modality() == Modality.Audio: entry.set_payload_index(len(self.audio_payloads)) self.audio_payloads.append(entry) - elif entry.get_modality() == "image": + elif entry.get_modality() == Modality.Image: entry.set_payload_index(len(self.image_payloads)) self.image_payloads.append(entry) From 1596a6f7629fac5014bf9bfb64c9baa507a73493 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 24 Jun 2022 10:33:35 -0700 Subject: [PATCH 096/137] change modality from str format to IntEnum --- forte/data/ontology/top.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index e363a06b6..b7d6d7af5 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -873,7 +873,7 @@ def __init__( self._image_payload_idx = image_payload_idx 
super().__init__(pack) self.img_arr = self.pack.get_payload_data_at( - "image", self._image_payload_idx + Modality.Image, self._image_payload_idx ) self.c_h, self.c_w = ( self.img_arr.shape[0] // self._height, @@ -1194,7 +1194,7 @@ def __init__( payload_idx: int = 0, uri: Optional[str] = None, ): - supported_modality = ("text", "audio", "image") + supported_modality = ("Text", "Audio", "Image") if modality.name not in supported_modality: raise ValueError( f"The given modality {modality} is not supported. " @@ -1219,7 +1219,16 @@ def get_type(self) -> type: """ return type(self) - def get_modality(self) -> str: + def get_modality(self) -> IntEnum: + """ + Get the modality of the payload class. + + Returns: + the modality of the payload class as an IntEnum object. + """ + return self._modality + + def get_modality_name(self) -> str: """ Get the modality of the payload class. From 16a05dfcb969b461703f6ad4cdd1b74ee5c14bed Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 24 Jun 2022 10:34:00 -0700 Subject: [PATCH 097/137] change modality from str format to IntEnum --- forte/data/readers/audio_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index ad26169b0..2e616a283 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -70,7 +70,7 @@ def _parse_pack(self, file_path: str) -> Iterator[DataPack]: # Read in audio data and store in DataPack # add audio payload into DataPack.payloads - ap = AudioPayload(pack, Modality.audio, payload_idx, file_path) + ap = AudioPayload(pack, Modality.Audio, payload_idx, file_path) if not self.configs.lazy_read: audio_data, sample_rate = self.soundfile.read(file_path) ap.set_cache(audio_data) From 95f2e666bf18e9f26f8ed02562efe6b04ef0b05d Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 24 Jun 2022 10:34:53 -0700 Subject: [PATCH 098/137] change modality from str format to IntEnum --- 
tests/forte/data/audio_annotation_test.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/forte/data/audio_annotation_test.py b/tests/forte/data/audio_annotation_test.py index 8871bfe8e..194daeedc 100644 --- a/tests/forte/data/audio_annotation_test.py +++ b/tests/forte/data/audio_annotation_test.py @@ -51,7 +51,7 @@ def _process(self, input_pack: DataPack): Recording( pack=input_pack, begin=0, - end=len(input_pack.get_payload_data_at("audio", 0)), + end=len(input_pack.get_payload_data_at(Modality.Audio, 0)), ) @@ -81,7 +81,7 @@ class TextUtteranceProcessor(PackProcessor): """ def _process(self, input_pack: DataPack): - tp = TextPayload(input_pack, Modality.text, 0) + tp = TextPayload(input_pack, Modality.Text, 0) tp.set_cache("test text") Utterance(pack=input_pack, begin=0, end=len(input_pack.text)) @@ -148,7 +148,7 @@ def test_audio_annotation(self): # Test `DataPack.get_span_audio()` with None audio payload with self.assertRaises(ValueError): pack: DataPack = DataPack() - tp = TextPayload(pack, Modality.text, 0) + tp = TextPayload(pack, Modality.Text, 0) tp.set_cache("test text") pack.get_span_audio(begin=0, end=1) # Verify the annotations of each datapack @@ -210,7 +210,8 @@ def test_audio_annotation(self): self.assertEqual(len(recordings), 1) self.assertTrue( array_equal( - recordings[0].audio, pack.get_payload_data_at("audio", 0) + recordings[0].audio, + pack.get_payload_data_at(Modality.Audio, 0), ) ) # Check serialization/deserialization of AudioAnnotation @@ -232,7 +233,7 @@ def test_audio_annotation(self): self.assertTrue( array_equal( audio_utter.audio, - pack.get_payload_data_at("audio", 0)[ + pack.get_payload_data_at(Modality.Audio, 0)[ configs["begin"] : configs["end"] ], ) From fd7558bd44c441b3bddf611ab1329bd83254548c Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 24 Jun 2022 10:35:10 -0700 Subject: [PATCH 099/137] change modality from str format to IntEnum --- tests/forte/data/readers/audio_reader_test.py | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index 05d722c09..3d92f3327 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -48,7 +48,7 @@ def initialize(self, resources: Resources, configs: Config): self._model = Wav2Vec2ForCTC.from_pretrained(pretrained_model) def _process(self, input_pack: DataPack): - ap = input_pack.get_payload_at("audio", 0) + ap = input_pack.get_payload_at(Modality.Audio, 0) sample_rate = ap.sample_rate audio_data = ap.cache required_sample_rate: int = 16000 @@ -68,7 +68,7 @@ def _process(self, input_pack: DataPack): argmax(self._model(input_values).logits, dim=-1) ) - tp = TextPayload(input_pack, Modality.text, 0) + tp = TextPayload(input_pack, Modality.Text, 0) tp.set_cache(transcription[0]) input_pack.set_text(text=transcription[0]) From b6f7bdeb1a23ac29a010956d69175dd484589b4a Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 24 Jun 2022 10:35:32 -0700 Subject: [PATCH 100/137] change modality from str format to IntEnum --- tests/forte/grids_test.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/forte/grids_test.py b/tests/forte/grids_test.py index 2c764ff79..7af88c45e 100644 --- a/tests/forte/grids_test.py +++ b/tests/forte/grids_test.py @@ -14,7 +14,6 @@ """ Unit tests for Grids. 
""" -from tkinter import image_types import unittest from forte.data.modality import Modality from ft.onto.base_ontology import ImagePayload @@ -37,7 +36,7 @@ def setUp(self): line[2, 2] = 1 line[3, 3] = 1 line[4, 4] = 1 - ip = ImagePayload(self.datapack, Modality.image) + ip = ImagePayload(self.datapack, Modality.Image) ip.set_cache(line) self.datapack.image_annotations.append( ImageAnnotation(self.datapack, 0) @@ -49,7 +48,7 @@ def setUp(self): self.zeros = np.zeros((6, 12)) self.ref_arr = np.zeros((6, 12)) self.ref_arr[2, 2] = 1 - ip = ImagePayload(self.datapack, Modality.image) + ip = ImagePayload(self.datapack, Modality.Image) ip.set_cache(self.ref_arr) self.datapack.image_annotations.append( ImageAnnotation(self.datapack, 0) From 9e89f065b81fb4eb5251417f903473b86612a570 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 24 Jun 2022 10:35:45 -0700 Subject: [PATCH 101/137] change modality from str format to IntEnum --- tests/forte/image_annotation_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/forte/image_annotation_test.py b/tests/forte/image_annotation_test.py index e7a192f9c..40fde96de 100644 --- a/tests/forte/image_annotation_test.py +++ b/tests/forte/image_annotation_test.py @@ -38,7 +38,7 @@ def setUp(self): self.line[2, 2] = 1 self.line[3, 3] = 1 self.line[4, 4] = 1 - ip = ImagePayload(self.datapack, Modality.image, 0) + ip = ImagePayload(self.datapack, Modality.Image, 0) ip.set_cache(self.line) ImageAnnotation(self.datapack) @@ -49,7 +49,7 @@ def test_image_annotation(self): self.assertTrue( array_equal( - self.datapack.get_payload_at("image", 0).cache, self.line + self.datapack.get_payload_at(Modality.Image, 0).cache, self.line ) ) new_pack = DataPack.from_string(self.datapack.to_string()) From 6e4aa819ab5789741ecf5e06df2315d95ff04578 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 24 Jun 2022 11:07:06 -0700 Subject: [PATCH 102/137] correct text function --- forte/data/data_pack.py | 7 ++++++- 1 file changed, 
6 insertions(+), 1 deletion(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index f37690764..5303cbe63 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -241,6 +241,8 @@ def _validate(self, entry: EntryType) -> bool: def text(self) -> str: """ Get the first text data stored in the DataPack. + If there is no text payload in the DataPack, it will return empty + string. Args: text_payload_index: the index of the text payload. Defaults to 0. @@ -252,7 +254,10 @@ def text(self) -> str: Returns: text data in the text payload. """ - return self.get_payload_data_at(Modality.Text, 0) + if len(self.text_payloads) > 0: + return self.get_payload_data_at(Modality.Text, 0) + else: + return "" @property def all_annotations(self) -> Iterator[Annotation]: From 05956763cd2bb72a92643ea91780b038e88e20fe Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 24 Jun 2022 13:47:46 -0700 Subject: [PATCH 103/137] entry converter with payload support --- forte/data/entry_converter.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/forte/data/entry_converter.py b/forte/data/entry_converter.py index b4dfed83e..f31b0e1cf 100644 --- a/forte/data/entry_converter.py +++ b/forte/data/entry_converter.py @@ -28,6 +28,7 @@ MultiPackGeneric, MultiPackGroup, MultiPackLink, + Payload, SinglePackEntries, MultiPackEntries, ) @@ -109,6 +110,14 @@ def save_entry_object( image_payload_idx=entry.image_payload_idx, # type: ignore tid=entry.tid, ) + elif data_store_ref._is_subclass(entry.entry_type(), Payload): + data_store_ref.add_payload_raw( + type_name=entry.entry_type(), + payload_idx=entry.payload_index, + modality=entry.modality, + tid=entry.tid, + allow_duplicate=allow_duplicate, + ) elif data_store_ref._is_subclass(entry.entry_type(), Grids): # Will be deprecated in future data_store_ref.add_grid_raw( # type: ignore From 861f37c34965661f569942373c8f2551804db46b Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Fri, 24 Jun 2022 13:48:07 -0700 Subject: [PATCH 
104/137] cutomized payload serialization function --- forte/data/ontology/top.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 8764870a1..0b8d9f75d 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1321,7 +1321,9 @@ def __getstate__(self): r""" Convert ``_modality`` ``Enum`` object to str format for serialization. """ - state = super().__getstate__() + # TODO: this function will be removed since + # Entry store is being integrated into DataStore + state = self.__dict__.copy() state["_modality"] = self._modality.name return state @@ -1329,7 +1331,9 @@ def __setstate__(self, state): r""" Convert ``_modality`` string to ``Enum`` object for deserialization. """ - super().__setstate__(state) + # TODO: this function will be removed since + # Entry store is being integrated into DataStore + self.__dict__.update(state) self._modality = getattr(Modality, state["_modality"]) From 101111f76abfaceba0b6f4207e991bd525bf96bc Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 07:49:12 -0700 Subject: [PATCH 105/137] add_payload_raw --- forte/data/base_store.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/forte/data/base_store.py b/forte/data/base_store.py index f2dab0d34..4a013f19b 100644 --- a/forte/data/base_store.py +++ b/forte/data/base_store.py @@ -13,6 +13,7 @@ # limitations under the License. from abc import abstractmethod +from enum import IntEnum from typing import List, Iterator, Tuple, Any, Optional, Dict import json @@ -270,6 +271,36 @@ def add_image_annotation_raw( """ raise NotImplementedError + @abstractmethod + def add_payload_raw( + self, + type_name: str, + payload_idx: int, + modality: IntEnum, + tid: Optional[int] = None, + allow_duplicate: bool = True, + ) -> int: + r""" + This function adds an payload entry with ``modality`` and + indices to current data store object. 
Returns the ``tid`` for the + inserted entry. + + Args: + type_name: The fully qualified type name of the new AudioAnnotation. + payload_idx: the index of the payload. + modality: The modality object + tid: ``tid`` of the Annotation entry that is being added. + It's optional, and it will be + auto-assigned if not given. + allow_duplicate: Whether we allow duplicate in the DataStore. When + it's set to False, the function will return the ``tid`` of + existing entry if a duplicate is found. Default value is True. + + Returns: + ``tid`` of the entry. + """ + raise NotImplementedError + @abstractmethod def add_multipack_generic_raw( self, type_name: str, tid: Optional[int] = None From 79c062fcb74478de61f1e6d5fc58b9446880832c Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 07:49:54 -0700 Subject: [PATCH 106/137] remove extra parameter Modality --- forte/data/data_pack.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 50db9ca4f..0ca1ca644 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -34,6 +34,7 @@ from sortedcontainers import SortedList from forte.common.exception import ( + ProcessExecutionException, UnknownOntologyClassException, ) from forte.common.constants import TID_INDEX @@ -237,7 +238,7 @@ def text(self) -> str: text data in the text payload. """ if len(self.text_payloads) > 0: - return self.get_payload_data_at(Modality.Text, 0) + return str(self.get_payload_data_at(Modality.Text, 0)) else: return "" @@ -462,7 +463,7 @@ def get_payload_at( f" {supported_modality}." ) except IndexError as e: - raise ValueError( + raise ProcessExecutionException( f"payload index ({payload_index}) " f"is larger or equal to {modality.name} payload list" f" length ({payloads_length}). 
" @@ -568,7 +569,7 @@ def set_text( TextPayload, ) - tp = TextPayload(self, Modality.Text, text_payload_index) + tp = TextPayload(self, text_payload_index) else: tp = self.get_payload_at(Modality.Text, text_payload_index) @@ -600,7 +601,7 @@ def set_audio( AudioPayload, ) - ap = AudioPayload(self, Modality.Audio) + ap = AudioPayload(self) else: ap = self.get_payload_at(Modality.Audio, audio_payload_index) From 369e2c29daeeb4f6f27be2e0e50274bc04e25bb4 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 07:50:25 -0700 Subject: [PATCH 107/137] cast entry to Payload --- forte/data/entry_converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/forte/data/entry_converter.py b/forte/data/entry_converter.py index f31b0e1cf..36b2924c6 100644 --- a/forte/data/entry_converter.py +++ b/forte/data/entry_converter.py @@ -13,7 +13,7 @@ # limitations under the License. import logging -from typing import Dict, Optional +from typing import Dict, Optional, cast from forte.data.base_pack import PackType from forte.data.ontology.core import Entry, FList, FDict from forte.data.ontology.core import EntryType @@ -111,6 +111,7 @@ def save_entry_object( tid=entry.tid, ) elif data_store_ref._is_subclass(entry.entry_type(), Payload): + entry = cast(Payload, entry) data_store_ref.add_payload_raw( type_name=entry.entry_type(), payload_idx=entry.payload_index, From 808ada096cc05ff5ed6d984a1f3d58862df7673e Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 07:52:38 -0700 Subject: [PATCH 108/137] list to sequence --- forte/data/ontology/top.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 0b8d9f75d..e558db463 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -14,7 +14,17 @@ from dataclasses import dataclass from enum import IntEnum from functools import total_ordering -from typing import Optional, Tuple, Type, Any, Dict, 
Union, Iterable, List +from typing import ( + Optional, + Sequence, + Tuple, + Type, + Any, + Dict, + Union, + Iterable, + List, +) import numpy as np from forte.data.modality import Modality @@ -1245,8 +1255,8 @@ def __init__( super().__init__(pack) self._cache: Union[str, np.ndarray] = "" - self.replace_back_operations: List[Tuple] = [] - self.processed_original_spans: List[Tuple] = [] + self.replace_back_operations: Sequence[Tuple] = [] + self.processed_original_spans: Sequence[Tuple] = [] self.orig_text_len: int = 0 def get_type(self) -> type: From 4d1982c2808fcd90cd068f28206705d841599ef8 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 07:53:30 -0700 Subject: [PATCH 109/137] remove extra paramater Modality --- forte/data/readers/audio_reader.py | 2 +- ft/onto/base_ontology.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index 2e616a283..a230b0951 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -70,7 +70,7 @@ def _parse_pack(self, file_path: str) -> Iterator[DataPack]: # Read in audio data and store in DataPack # add audio payload into DataPack.payloads - ap = AudioPayload(pack, Modality.Audio, payload_idx, file_path) + ap = AudioPayload(pack, payload_idx, file_path) if not self.configs.lazy_read: audio_data, sample_rate = self.soundfile.read(file_path) ap.set_cache(audio_data) diff --git a/ft/onto/base_ontology.py b/ft/onto/base_ontology.py index 281dfd82f..d49f3cb15 100644 --- a/ft/onto/base_ontology.py +++ b/ft/onto/base_ontology.py @@ -10,6 +10,7 @@ from dataclasses import dataclass from enum import IntEnum from forte.data.data_pack import DataPack +from forte.data.modality import Modality from forte.data.multi_pack import MultiPack from forte.data.ontology.core import Entry from forte.data.ontology.core import FDict @@ -612,11 +613,10 @@ class AudioPayload(Payload): def __init__( self, pack: 
DataPack, - modality: IntEnum, payload_idx: int = 0, uri: Optional[str] = None, ): - super().__init__(pack, modality, payload_idx, uri) + super().__init__(pack, Modality.Audio, payload_idx, uri) self.sample_rate: Optional[int] = None @@ -629,11 +629,10 @@ class TextPayload(Payload): def __init__( self, pack: DataPack, - modality: IntEnum, payload_idx: int = 0, uri: Optional[str] = None, ): - super().__init__(pack, modality, payload_idx, uri) + super().__init__(pack, Modality.Text, payload_idx, uri) @dataclass @@ -645,8 +644,7 @@ class ImagePayload(Payload): def __init__( self, pack: DataPack, - modality: IntEnum, payload_idx: int = 0, uri: Optional[str] = None, ): - super().__init__(pack, modality, payload_idx, uri) + super().__init__(pack, Modality.Image, payload_idx, uri) From b514beeaf66ac906bf33878a89db45efb6c01270 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 07:53:55 -0700 Subject: [PATCH 110/137] datapack.set_Text --- tests/forte/data/audio_annotation_test.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/forte/data/audio_annotation_test.py b/tests/forte/data/audio_annotation_test.py index 194daeedc..25b4bc30e 100644 --- a/tests/forte/data/audio_annotation_test.py +++ b/tests/forte/data/audio_annotation_test.py @@ -81,8 +81,7 @@ class TextUtteranceProcessor(PackProcessor): """ def _process(self, input_pack: DataPack): - tp = TextPayload(input_pack, Modality.Text, 0) - tp.set_cache("test text") + input_pack.set_text("test text") Utterance(pack=input_pack, begin=0, end=len(input_pack.text)) @@ -146,10 +145,9 @@ def setUp(self): def test_audio_annotation(self): # Test `DataPack.get_span_audio()` with None audio payload - with self.assertRaises(ValueError): + with self.assertRaises(ProcessExecutionException): pack: DataPack = DataPack() - tp = TextPayload(pack, Modality.Text, 0) - tp.set_cache("test text") + pack.set_text("test text") pack.get_span_audio(begin=0, end=1) # Verify the annotations of each datapack 
for pack in self._pipeline.process_dataset(self._test_audio_path): From 25cdeaa26315ec4e88fe9f41e9abe7b31e57a596 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 07:54:21 -0700 Subject: [PATCH 111/137] remove extra imports --- tests/forte/data/readers/audio_reader_test.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index 3d92f3327..f3e90995b 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -14,10 +14,7 @@ """ Unit tests for AudioReader. """ -from email.mime import audio -import importlib import os -from sunau import AUDIO_FILE_ENCODING_ADPCM_G721 import unittest from typing import Dict from forte.data import Modality @@ -31,7 +28,7 @@ from forte.data.readers import AudioReader from forte.pipeline import Pipeline from forte.processors.base.pack_processor import PackProcessor -from ft.onto.base_ontology import AudioPayload, TextPayload +from ft.onto.base_ontology import TextPayload class TestASRProcessor(PackProcessor): @@ -68,7 +65,7 @@ def _process(self, input_pack: DataPack): argmax(self._model(input_values).logits, dim=-1) ) - tp = TextPayload(input_pack, Modality.Text, 0) + tp = TextPayload(input_pack, 0) tp.set_cache(transcription[0]) input_pack.set_text(text=transcription[0]) @@ -91,10 +88,7 @@ def setUp(self): ) # Define and config the Pipeline self._pipeline = Pipeline[DataPack]() - self._pipeline.set_reader( - AudioReader(), - config={"read_kwargs": {"module": "soundfile", "method": "read"}}, - ) + self._pipeline.set_reader(AudioReader()) self._pipeline.add(TestASRProcessor()) self._pipeline.initialize() From 85b52fc986bbf8296ce54e16e1500cca9534899b Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 07:56:49 -0700 Subject: [PATCH 112/137] remove extra paramter modality --- tests/forte/grids_test.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/tests/forte/grids_test.py b/tests/forte/grids_test.py index 7af88c45e..df54a28a8 100644 --- a/tests/forte/grids_test.py +++ b/tests/forte/grids_test.py @@ -36,7 +36,7 @@ def setUp(self): line[2, 2] = 1 line[3, 3] = 1 line[4, 4] = 1 - ip = ImagePayload(self.datapack, Modality.Image) + ip = ImagePayload(self.datapack) ip.set_cache(line) self.datapack.image_annotations.append( ImageAnnotation(self.datapack, 0) @@ -48,7 +48,7 @@ def setUp(self): self.zeros = np.zeros((6, 12)) self.ref_arr = np.zeros((6, 12)) self.ref_arr[2, 2] = 1 - ip = ImagePayload(self.datapack, Modality.Image) + ip = ImagePayload(self.datapack) ip.set_cache(self.ref_arr) self.datapack.image_annotations.append( ImageAnnotation(self.datapack, 0) From 0c40f8bc24c50c0fb177c6e7d16f5ab78694eade Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 08:00:21 -0700 Subject: [PATCH 113/137] remove modality paramter --- tests/forte/image_annotation_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/forte/image_annotation_test.py b/tests/forte/image_annotation_test.py index 40fde96de..35f523c80 100644 --- a/tests/forte/image_annotation_test.py +++ b/tests/forte/image_annotation_test.py @@ -38,7 +38,7 @@ def setUp(self): self.line[2, 2] = 1 self.line[3, 3] = 1 self.line[4, 4] = 1 - ip = ImagePayload(self.datapack, Modality.Image, 0) + ip = ImagePayload(self.datapack, 0) ip.set_cache(self.line) ImageAnnotation(self.datapack) From 2eac83f582f472243eefe32f6f1baf79231d30ad Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 08:10:53 -0700 Subject: [PATCH 114/137] remove duplication check for payload --- forte/data/data_store.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 4337dbdcc..0857cfa55 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -1180,12 +1180,6 @@ def add_payload_raw( # A reference to the entry should be store in both 
self.__elements and # self.__tid_ref_dict. entry = self._new_payload(type_name, payload_idx, modality, tid) - - if not allow_duplicate: - tid_search_result = self._get_existing_ann_entry_tid(entry) - # if found existing entry - if tid_search_result != -1: - return tid_search_result return self._add_entry_raw(Payload, type_name, entry) def _get_existing_ann_entry_tid(self, entry: List[Any]): From be4c98cfa1613e533786e97ea75466ec9a8bf8d4 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 08:16:06 -0700 Subject: [PATCH 115/137] improve docstring for payload --- forte/data/ontology/top.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index e558db463..600fed359 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -860,8 +860,10 @@ def __init__(self, pack: PackType, image_payload_idx: int = 0): Args: pack: The container that this image annotation will be added to. - image_payload_idx: the index of the image payload. If it's not set, - it defaults to 0 which means it will load the first image payload. + image_payload_idx: the index of the image payload in the DataPack's + image payload list. + If it's not set, it defaults to 0 which means it will load the + first image payload. """ self._image_payload_idx = image_payload_idx super().__init__(pack) @@ -901,7 +903,9 @@ class Grids(Entry): pack: The container that this grids will be added to. height: the number of grid cell per column, the unit is one grid cell. width: the number of grid cell per row, the unit is one grid cell. - image_payload_idx: the index of the image payload. If it's not set, + image_payload_idx: the index of the image payload in the DataPack's + image payload list. + If it's not set, it defaults to 0 which meaning it will load the first image payload. 
""" @@ -1035,7 +1039,9 @@ class Region(ImageAnnotation): Args: pack: the container that this ``Region`` will be added to. - image_payload_idx: the index of the image payload. If it's not set, + image_payload_idx: the index of the image payload in the DataPack's + image payload list. + If it's not set, it defaults to 0 which meaning it will load the first image payload. """ @@ -1061,7 +1067,8 @@ class Box(Region): Args: pack: the container that this ``Box`` will be added to. - image_payload_idx: the index of the image payload. If it's not set, + image_payload_idx: the index of the image payload in the DataPack's + image payload list. If it's not set, it defaults to 0 which meaning it will load the first image payload. cy: the row index of the box center in the image array, the unit is one image array entry. @@ -1184,7 +1191,8 @@ class BoundingBox(Box): Args: pack: The container that this BoundingBox will be added to. - image_payload_idx: the index of the image payload. If it's not set, + image_payload_idx: the index of the image payload in the DataPack's + image payload list. If it's not set, it defaults to 0 which means it will load the first image payload. height: the height of the bounding box, the unit is one image array entry. @@ -1229,7 +1237,9 @@ class Payload(Entry): pack: The container that this `Payload` will be added to. modality: modality of the payload such as text, audio and image. - payload_idx: the index of the payload. + payload_idx: the index of the payload in the DataPack's + image payload list of the same modality. For example, if we instantiate a ``TextPayload`` inherited from ``Payload``, we assign + the payload index in DataPack's text payload list. uri: universal resource identifier of the data source. Defaults to None. Raises: @@ -1261,7 +1271,8 @@ def __init__( def get_type(self) -> type: """ - Get the type of the payload class. + Get the class type of the payload class. 
For example, suppose a ``TextPayload`` inherits this ``Payload`` class, ``TextPayload`` will be + returned. Returns: the type of the payload class. @@ -1288,12 +1299,6 @@ def get_modality_name(self) -> str: @property def cache(self) -> Union[str, np.ndarray]: - if self._cache is None: - raise ValueError( - "Payload doesn't have a cache." - "Please set the reader config `lazy_read` to False" - "or manually load it by set_cache() " - ) return self._cache @property @@ -1323,7 +1328,7 @@ def set_payload_index(self, payload_index: int): Set payload index for the DataPack. Args: - payload_index: _description_ + payload_index: a new payload index to be set. """ self._payload_idx = payload_index From 6f4055792d97f15b65ccbb23d62f7f384be22683 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 10:36:14 -0700 Subject: [PATCH 116/137] new ontology --- forte/data/data_pack.py | 4 ++-- ft/onto/base_ontology.py | 23 ++++++----------------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 0ca1ca644..13c870c60 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -441,7 +441,7 @@ def get_payload_at( Payload entry containing text data, image or audio data. 
""" - supported_modality = ("Text", "Audio", "Image") + supported_modality = [enum.name for enum in Modality] try: # if modality.name == "text": @@ -707,7 +707,7 @@ def get_original_index( if len(processed_original_spans) == 0: return input_index - len_processed_text = len(self.get_payload_data_at(Modality.Text, 0)) + len_processed_text = len(self.text) orig_index = None prev_end = 0 for ( diff --git a/ft/onto/base_ontology.py b/ft/onto/base_ontology.py index d49f3cb15..041fe73ae 100644 --- a/ft/onto/base_ontology.py +++ b/ft/onto/base_ontology.py @@ -8,9 +8,7 @@ """ from dataclasses import dataclass -from enum import IntEnum from forte.data.data_pack import DataPack -from forte.data.modality import Modality from forte.data.multi_pack import MultiPack from forte.data.ontology.core import Entry from forte.data.ontology.core import FDict @@ -611,12 +609,9 @@ class AudioPayload(Payload): sample_rate: Optional[int] def __init__( - self, - pack: DataPack, - payload_idx: int = 0, - uri: Optional[str] = None, + self, pack: DataPack, payload_idx: int = 0, uri: Optional[str] = None ): - super().__init__(pack, Modality.Audio, payload_idx, uri) + super().__init__(pack, payload_idx, uri) self.sample_rate: Optional[int] = None @@ -627,12 +622,9 @@ class TextPayload(Payload): """ def __init__( - self, - pack: DataPack, - payload_idx: int = 0, - uri: Optional[str] = None, + self, pack: DataPack, payload_idx: int = 0, uri: Optional[str] = None ): - super().__init__(pack, Modality.Text, payload_idx, uri) + super().__init__(pack, payload_idx, uri) @dataclass @@ -642,9 +634,6 @@ class ImagePayload(Payload): """ def __init__( - self, - pack: DataPack, - payload_idx: int = 0, - uri: Optional[str] = None, + self, pack: DataPack, payload_idx: int = 0, uri: Optional[str] = None ): - super().__init__(pack, Modality.Image, payload_idx, uri) + super().__init__(pack, payload_idx, uri) From 801b666e7061cdd58c7fc0cd6d99c720e732a7f2 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 
2022 10:39:04 -0700 Subject: [PATCH 117/137] set Payload._modality based on Payload class type --- forte/data/ontology/top.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 600fed359..87a320e91 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1249,18 +1249,28 @@ class Payload(Entry): def __init__( self, pack: PackType, - modality: IntEnum, payload_idx: int = 0, uri: Optional[str] = None, ): - supported_modality = ("Text", "Audio", "Image") - if modality.name not in supported_modality: + from ft.onto.base_ontology import ( + TextPayload, + AudioPayload, + ImagePayload, + ) + + if isinstance(self, TextPayload): + self._modality = Modality.Text + elif isinstance(self, AudioPayload): + self._modality = Modality.Audio + elif isinstance(self, ImagePayload): + self._modality = Modality.Image + else: + supported_modality = [enum.name for enum in Modality] raise ValueError( f"The given modality {modality} is not supported. 
" f"Currently we only support {supported_modality}" ) self._payload_idx: int = payload_idx - self._modality: IntEnum = modality self._uri: Optional[str] = uri super().__init__(pack) From 8ddfd63b9333c73ccfbad12ba7ce685916947793 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 10:39:39 -0700 Subject: [PATCH 118/137] remove set_cache not needed --- tests/forte/data/readers/audio_reader_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index f3e90995b..47f880e53 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -66,7 +66,6 @@ def _process(self, input_pack: DataPack): ) tp = TextPayload(input_pack, 0) - tp.set_cache(transcription[0]) input_pack.set_text(text=transcription[0]) From 98df2017275fefab8b4c8c359e56101d2610a329 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 10:47:46 -0700 Subject: [PATCH 119/137] set_cache -> set_audio --- forte/data/readers/audio_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index a230b0951..488364c3a 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -73,7 +73,7 @@ def _parse_pack(self, file_path: str) -> Iterator[DataPack]: ap = AudioPayload(pack, payload_idx, file_path) if not self.configs.lazy_read: audio_data, sample_rate = self.soundfile.read(file_path) - ap.set_cache(audio_data) + pack.set_audio(audio_data) ap.sample_rate = sample_rate pack.pack_name = file_path yield pack From 2c8e7d969450265dde1698cf9f16e6b6baf84c5d Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 10:52:32 -0700 Subject: [PATCH 120/137] change self.device back --- forte/processors/ir/bert/bert_based_query_creator.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git 
a/forte/processors/ir/bert/bert_based_query_creator.py b/forte/processors/ir/bert/bert_based_query_creator.py index 24caae675..6296618f2 100644 --- a/forte/processors/ir/bert/bert_based_query_creator.py +++ b/forte/processors/ir/bert/bert_based_query_creator.py @@ -45,10 +45,9 @@ def __init__(self) -> None: def initialize(self, resources: Resources, configs: Config): self.resource = resources self.config = configs - self.device = "cpu" - # self.device = torch.device( - # "cuda" if torch.cuda.is_available() else "cpu" - # ) + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu" + ) try: from texar.torch.data import ( # pylint: disable=import-outside-toplevel From 41bfd87caf1e3e73ab2d95077c162b0003889a11 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 11:09:12 -0700 Subject: [PATCH 121/137] set_audio(audio_data, sample_rate) --- forte/data/readers/audio_reader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index 488364c3a..1952e5c2a 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -20,7 +20,6 @@ from forte.data.data_pack import DataPack from forte.data.data_utils_io import dataset_path_iterator from forte.data.base_reader import PackReader -from forte.data.modality import Modality from ft.onto.base_ontology import AudioPayload __all__ = [ @@ -73,7 +72,7 @@ def _parse_pack(self, file_path: str) -> Iterator[DataPack]: ap = AudioPayload(pack, payload_idx, file_path) if not self.configs.lazy_read: audio_data, sample_rate = self.soundfile.read(file_path) - pack.set_audio(audio_data) + pack.set_audio(audio_data, sample_rate) ap.sample_rate = sample_rate pack.pack_name = file_path yield pack From e65587ce6bf00328e62c559b64ddcd67fd9e00af Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 11:12:47 -0700 Subject: [PATCH 122/137] get_payload_data -> self.text --- forte/data/data_pack.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 13c870c60..27c92d1dd 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -825,7 +825,7 @@ def __add_entry_with_check(self, entry: Union[EntryType, int]) -> EntryType: f"is not a valid begin." ) - if end > len(self.get_payload_data_at(Modality.Text, 0)): + if end > len(self.text): if len(self.text) == 0: raise ValueError( f"The end {end} of span is greater than the text " From dd9839db3e938c7b9935620019c35d5ce59050d6 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Mon, 27 Jun 2022 11:26:17 -0700 Subject: [PATCH 123/137] fix modality --- forte/data/ontology/top.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 87a320e91..967685117 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1238,7 +1238,8 @@ class Payload(Entry): be added to. modality: modality of the payload such as text, audio and image. payload_idx: the index of the payload in the DataPack's - image payload list of the same modality. For example, if we instantiate a ``TextPayload`` inherited from ``Payload``, we assign + image payload list of the same modality. For example, if we + instantiate a ``TextPayload`` inherited from ``Payload``, we assign the payload index in DataPack's text payload list. uri: universal resource identifier of the data source. Defaults to None. @@ -1252,7 +1253,7 @@ def __init__( payload_idx: int = 0, uri: Optional[str] = None, ): - from ft.onto.base_ontology import ( + from ft.onto.base_ontology import ( # pylint: disable=import-outside-toplevel TextPayload, AudioPayload, ImagePayload, @@ -1267,7 +1268,7 @@ def __init__( else: supported_modality = [enum.name for enum in Modality] raise ValueError( - f"The given modality {modality} is not supported. " + f"The given modality {self._modality.name} is not supported. 
" f"Currently we only support {supported_modality}" ) self._payload_idx: int = payload_idx @@ -1281,7 +1282,8 @@ def __init__( def get_type(self) -> type: """ - Get the class type of the payload class. For example, suppose a ``TextPayload`` inherits this ``Payload`` class, ``TextPayload`` will be + Get the class type of the payload class. For example, suppose a + ``TextPayload`` inherits this ``Payload`` class, ``TextPayload`` will be returned. Returns: From 42928d880a40fa5b8cea9b0456b5d68b61b7fead Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 28 Jun 2022 19:33:37 -0700 Subject: [PATCH 124/137] get_modality -> modality property --- forte/data/data_pack.py | 6 +++--- forte/data/ontology/top.py | 25 +++++++++++++------------ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 27c92d1dd..913f280b7 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -1550,13 +1550,13 @@ def _save_entry_to_data_store(self, entry: Entry): self._entry_converter.save_entry_object(entry=entry, pack=self) if isinstance(entry, Payload): - if entry.get_modality() == Modality.Text: + if entry.modality == Modality.Text: entry.set_payload_index(len(self.text_payloads)) self.text_payloads.append(entry) - elif entry.get_modality() == Modality.Audio: + elif entry.modality == Modality.Audio: entry.set_payload_index(len(self.audio_payloads)) self.audio_payloads.append(entry) - elif entry.get_modality() == Modality.Image: + elif entry.modality == Modality.Image: entry.set_payload_index(len(self.image_payloads)) self.image_payloads.append(entry) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 967685117..39688b906 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1259,6 +1259,9 @@ def __init__( ImagePayload, ) + # since we cannot pass different modality from generated ontology, and + # we don't want to import base ontology in the header of the file + # we 
import it here. if isinstance(self, TextPayload): self._modality = Modality.Text elif isinstance(self, AudioPayload): @@ -1291,32 +1294,30 @@ def get_type(self) -> type: """ return type(self) - def get_modality(self) -> IntEnum: + @property + def cache(self) -> Union[str, np.ndarray]: + return self._cache + + @property + def modality(self) -> IntEnum: """ Get the modality of the payload class. Returns: - the modality of the payload class as an IntEnum object. + the modality of the payload class in ``IntEnum`` format. """ return self._modality - def get_modality_name(self) -> str: + @property + def modality_name(self) -> str: """ - Get the modality of the payload class. + Get the modality of the payload class in str format. Returns: the modality of the payload class in str format. """ return self._modality.name - @property - def cache(self) -> Union[str, np.ndarray]: - return self._cache - - @property - def modality(self) -> IntEnum: - return self._modality - @property def payload_index(self) -> int: return self._payload_idx From 1ae728273b76c4e15c7517ebb2ffa787a6414895 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 28 Jun 2022 19:33:57 -0700 Subject: [PATCH 125/137] add_entry_raw for Payload --- forte/data/entry_converter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/forte/data/entry_converter.py b/forte/data/entry_converter.py index e8f3f38b3..343d6693e 100644 --- a/forte/data/entry_converter.py +++ b/forte/data/entry_converter.py @@ -117,10 +117,10 @@ def save_entry_object( ) elif data_store_ref._is_subclass(entry.entry_type(), Payload): entry = cast(Payload, entry) - data_store_ref.add_payload_raw( + data_store_ref.add_entry_raw( type_name=entry.entry_type(), - payload_idx=entry.payload_index, - modality=entry.modality, + attribute_data=[entry.payload_index, entry.modality_name], + base_class=Payload, tid=entry.tid, allow_duplicate=allow_duplicate, ) From 53bc5b11ccd14d086b7ae8830d1cc7a81bc4a43b Mon Sep 17 00:00:00 2001 From: 
Pengfei He Date: Tue, 28 Jun 2022 19:34:49 -0700 Subject: [PATCH 126/137] remove unused imports --- forte/data/base_store.py | 1 - forte/data/data_store.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/forte/data/base_store.py b/forte/data/base_store.py index 23c29d187..d56385120 100644 --- a/forte/data/base_store.py +++ b/forte/data/base_store.py @@ -13,7 +13,6 @@ # limitations under the License. from abc import abstractmethod -from enum import IntEnum from typing import List, Iterator, Tuple, Any, Optional, Dict, Type import json from forte.data.ontology.core import Entry diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 9612d3657..8671b68e8 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -11,7 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from enum import IntEnum import json from typing import Dict, List, Iterator, Tuple, Optional, Any, Type @@ -893,7 +892,6 @@ def add_entry_raw( return self._add_entry_raw(base_class, type_name, new_entry) - def _get_existing_ann_entry_tid(self, entry: List[Any]): r""" This function searches for tid for existing annotation-like entry tid. From 5aa6be02fbed57d0d6639bbe62065b7fee5aef52 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 28 Jun 2022 19:36:35 -0700 Subject: [PATCH 127/137] remove modality parameter in ImagePayload --- forte/data/readers/image_reader.py | 92 ++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 forte/data/readers/image_reader.py diff --git a/forte/data/readers/image_reader.py b/forte/data/readers/image_reader.py new file mode 100644 index 000000000..dc66bd3c1 --- /dev/null +++ b/forte/data/readers/image_reader.py @@ -0,0 +1,92 @@ +# Copyright 2022 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The reader that reads image files into Datapacks. +""" +import os +from typing import Any, Iterator + +from forte.data.data_pack import DataPack +from forte.data.data_utils_io import dataset_path_iterator +from forte.data.base_reader import PackReader +from forte.data.modality import Modality +from ft.onto.base_ontology import ImagePayload + +__all__ = [ + "ImageReader", +] + + +class ImageReader(PackReader): + r""":class:`ImageReader` is designed to read in Image files.""" + + def __init__(self): + super().__init__() + try: + import matplotlib.pyplot as plt # pylint: disable=import-outside-toplevel + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + "ImageReader requires 'matplotlib' package to be installed." + " You can refer to [extra modules to install]('pip install" + " forte['image_ext']) or 'pip install matplotlib'." + ) from e + self.plt = plt + + def _collect(self, image_directory) -> Iterator[Any]: # type: ignore + r"""Should be called with param ``Image_directory`` which is a path to a + folder containing Image files. + + Args: + Image_directory: Image directory containing the files. 
+ + Returns: Iterator over paths to Image files + """ + # construct ImageMeta and store it in DataPack + return dataset_path_iterator( + image_directory, + self.configs.file_ext, + ) + + def _cache_key_function(self, image_file: str) -> str: + return os.path.basename(image_file) + + def _parse_pack(self, file_path: str) -> Iterator[DataPack]: + pack: DataPack = DataPack() + payload_idx = 0 + # Read in Image data and store in DataPack + # add Image payload into DataPack.payloads + ip = ImagePayload(pack, payload_idx) + if not self.configs.lazy_read: + image_data = self.plt.imread(file_path) + ip.set_cache(image_data) + pack.pack_name = file_path + yield pack + + @classmethod + def default_configs(cls): + r"""This defines a basic configuration structure for Image reader. + + Here: + + - file_ext (str): The file extension to find the target Image files + under a specific directory path. Default value is ".png". + + - read_kwargs (dict): A dictionary containing all the keyword + arguments for `soundfile.read` method. For details, refer to + https://pysoundfile.readthedocs.io/en/latest/#soundfile.read. + Default value is None. + + Returns: The default configuration of Image reader. + """ + return {"file_ext": ".png", "lazy_read": False, "read_kwargs": None} From c225b428f08124edd5ed9ced51aadb0444926af3 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 28 Jun 2022 19:37:09 -0700 Subject: [PATCH 128/137] Revert "remove modality parameter in ImagePayload" This reverts commit 5aa6be02fbed57d0d6639bbe62065b7fee5aef52. --- forte/data/readers/image_reader.py | 92 ------------------------------ 1 file changed, 92 deletions(-) delete mode 100644 forte/data/readers/image_reader.py diff --git a/forte/data/readers/image_reader.py b/forte/data/readers/image_reader.py deleted file mode 100644 index dc66bd3c1..000000000 --- a/forte/data/readers/image_reader.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright 2022 The Forte Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -The reader that reads image files into Datapacks. -""" -import os -from typing import Any, Iterator - -from forte.data.data_pack import DataPack -from forte.data.data_utils_io import dataset_path_iterator -from forte.data.base_reader import PackReader -from forte.data.modality import Modality -from ft.onto.base_ontology import ImagePayload - -__all__ = [ - "ImageReader", -] - - -class ImageReader(PackReader): - r""":class:`ImageReader` is designed to read in Image files.""" - - def __init__(self): - super().__init__() - try: - import matplotlib.pyplot as plt # pylint: disable=import-outside-toplevel - except ModuleNotFoundError as e: - raise ModuleNotFoundError( - "ImageReader requires 'matplotlib' package to be installed." - " You can refer to [extra modules to install]('pip install" - " forte['image_ext']) or 'pip install matplotlib'." - ) from e - self.plt = plt - - def _collect(self, image_directory) -> Iterator[Any]: # type: ignore - r"""Should be called with param ``Image_directory`` which is a path to a - folder containing Image files. - - Args: - Image_directory: Image directory containing the files. 
- - Returns: Iterator over paths to Image files - """ - # construct ImageMeta and store it in DataPack - return dataset_path_iterator( - image_directory, - self.configs.file_ext, - ) - - def _cache_key_function(self, image_file: str) -> str: - return os.path.basename(image_file) - - def _parse_pack(self, file_path: str) -> Iterator[DataPack]: - pack: DataPack = DataPack() - payload_idx = 0 - # Read in Image data and store in DataPack - # add Image payload into DataPack.payloads - ip = ImagePayload(pack, payload_idx) - if not self.configs.lazy_read: - image_data = self.plt.imread(file_path) - ip.set_cache(image_data) - pack.pack_name = file_path - yield pack - - @classmethod - def default_configs(cls): - r"""This defines a basic configuration structure for Image reader. - - Here: - - - file_ext (str): The file extension to find the target Image files - under a specific directory path. Default value is ".png". - - - read_kwargs (dict): A dictionary containing all the keyword - arguments for `soundfile.read` method. For details, refer to - https://pysoundfile.readthedocs.io/en/latest/#soundfile.read. - Default value is None. - - Returns: The default configuration of Image reader. 
- """ - return {"file_ext": ".png", "lazy_read": False, "read_kwargs": None} From 10ff586959ee73ebab01d1fd9b4cd3f60c633d05 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Tue, 28 Jun 2022 23:35:55 -0700 Subject: [PATCH 129/137] correct ontology files --- ft/onto/base_ontology.py | 76 +++++++++------------------------------- ft/onto/metric.py | 2 +- 2 files changed, 18 insertions(+), 60 deletions(-) diff --git a/ft/onto/base_ontology.py b/ft/onto/base_ontology.py index 041fe73ae..678b4c63e 100644 --- a/ft/onto/base_ontology.py +++ b/ft/onto/base_ontology.py @@ -311,12 +311,7 @@ class PredicateLink(Link): ParentType = PredicateMention ChildType = PredicateArgument - def __init__( - self, - pack: DataPack, - parent: Optional[Entry] = None, - child: Optional[Entry] = None, - ): + def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): super().__init__(pack, parent, child) self.arg_type: Optional[str] = None @@ -336,12 +331,7 @@ class Dependency(Link): ParentType = Token ChildType = Token - def __init__( - self, - pack: DataPack, - parent: Optional[Entry] = None, - child: Optional[Entry] = None, - ): + def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): super().__init__(pack, parent, child) self.dep_label: Optional[str] = None self.rel_type: Optional[str] = None @@ -350,7 +340,7 @@ def __init__( @dataclass class EnhancedDependency(Link): """ - A `Link` type entry which represent a enhanced dependency: + A `Link` type entry which represent a enhanced dependency: https://universaldependencies.org/u/overview/enhanced-syntax.html Attributes: dep_label (Optional[str]): The enhanced dependency label in Universal Dependency. 
@@ -361,12 +351,7 @@ class EnhancedDependency(Link): ParentType = Token ChildType = Token - def __init__( - self, - pack: DataPack, - parent: Optional[Entry] = None, - child: Optional[Entry] = None, - ): + def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): super().__init__(pack, parent, child) self.dep_label: Optional[str] = None @@ -384,12 +369,7 @@ class RelationLink(Link): ParentType = EntityMention ChildType = EntityMention - def __init__( - self, - pack: DataPack, - parent: Optional[Entry] = None, - child: Optional[Entry] = None, - ): + def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): super().__init__(pack, parent, child) self.rel_type: Optional[str] = None @@ -407,12 +387,7 @@ class CrossDocEntityRelation(MultiPackLink): ParentType = EntityMention ChildType = EntityMention - def __init__( - self, - pack: MultiPack, - parent: Optional[Entry] = None, - child: Optional[Entry] = None, - ): + def __init__(self, pack: MultiPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): super().__init__(pack, parent, child) self.rel_type: Optional[str] = None @@ -425,9 +400,7 @@ class CoreferenceGroup(Group): MemberType = EntityMention - def __init__( - self, pack: DataPack, members: Optional[Iterable[Entry]] = None - ): + def __init__(self, pack: DataPack, members: Optional[Iterable[Entry]] = None): super().__init__(pack, members) @@ -444,12 +417,7 @@ class EventRelation(Link): ParentType = EventMention ChildType = EventMention - def __init__( - self, - pack: DataPack, - parent: Optional[Entry] = None, - child: Optional[Entry] = None, - ): + def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): super().__init__(pack, parent, child) self.rel_type: Optional[str] = None @@ -467,12 +435,7 @@ class CrossDocEventRelation(MultiPackLink): ParentType = EventMention ChildType = EventMention - def __init__( - self, - pack: 
MultiPack, - parent: Optional[Entry] = None, - child: Optional[Entry] = None, - ): + def __init__(self, pack: MultiPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): super().__init__(pack, parent, child) self.rel_type: Optional[str] = None @@ -494,8 +457,8 @@ class ConstituentNode(Annotation): sentiment: Dict[str, float] is_root: Optional[bool] is_leaf: Optional[bool] - parent_node: Optional["ConstituentNode"] - children_nodes: FList["ConstituentNode"] + parent_node: Optional['ConstituentNode'] + children_nodes: FList['ConstituentNode'] def __init__(self, pack: DataPack, begin: int, end: int): super().__init__(pack, begin, end) @@ -503,8 +466,8 @@ def __init__(self, pack: DataPack, begin: int, end: int): self.sentiment: Dict[str, float] = dict() self.is_root: Optional[bool] = None self.is_leaf: Optional[bool] = None - self.parent_node: Optional["ConstituentNode"] = None - self.children_nodes: FList["ConstituentNode"] = FList(self) + self.parent_node: Optional['ConstituentNode'] = None + self.children_nodes: FList['ConstituentNode'] = FList(self) @dataclass @@ -529,6 +492,7 @@ def __init__(self, pack: DataPack, begin: int, end: int): @dataclass class MCOption(Annotation): + def __init__(self, pack: DataPack, begin: int, end: int): super().__init__(pack, begin, end) @@ -608,9 +572,7 @@ class AudioPayload(Payload): sample_rate: Optional[int] - def __init__( - self, pack: DataPack, payload_idx: int = 0, uri: Optional[str] = None - ): + def __init__(self, pack: DataPack, payload_idx: int = 0, uri: Optional[str] = None): super().__init__(pack, payload_idx, uri) self.sample_rate: Optional[int] = None @@ -621,9 +583,7 @@ class TextPayload(Payload): A payload that caches text data """ - def __init__( - self, pack: DataPack, payload_idx: int = 0, uri: Optional[str] = None - ): + def __init__(self, pack: DataPack, payload_idx: int = 0, uri: Optional[str] = None): super().__init__(pack, payload_idx, uri) @@ -633,7 +593,5 @@ class ImagePayload(Payload): A 
payload that caches image data """ - def __init__( - self, pack: DataPack, payload_idx: int = 0, uri: Optional[str] = None - ): + def __init__(self, pack: DataPack, payload_idx: int = 0, uri: Optional[str] = None): super().__init__(pack, payload_idx, uri) diff --git a/ft/onto/metric.py b/ft/onto/metric.py index b1188061a..af7a44d94 100644 --- a/ft/onto/metric.py +++ b/ft/onto/metric.py @@ -1,5 +1,5 @@ # ***automatically_generated*** -# ***source json:../testing/metric.json*** +# ***source json:forte/ontology_specs/metric.json*** # flake8: noqa # mypy: ignore-errors # pylint: skip-file From a68445d3366285f261cb677cf8ca4b0d9af313a7 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 29 Jun 2022 15:31:45 -0700 Subject: [PATCH 130/137] revert base back changes --- forte/data/base_pack.py | 230 +++++----------------------------------- 1 file changed, 27 insertions(+), 203 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 813cfd29d..94081acca 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import logging import copy import gzip import pickle @@ -28,33 +27,19 @@ Union, Iterator, Dict, + Tuple, Any, Iterable, ) -from functools import partial -from typing_inspect import get_origin -from packaging.version import Version + + import jsonpickle from forte.common import ProcessExecutionException, EntryNotFoundError -from forte.data.index import BaseIndex -from forte.data.base_store import BaseStore from forte.data.container import EntryContainer -from forte.data.ontology.core import ( - Entry, - EntryType, - GroupType, - LinkType, - FList, - FDict, -) -from forte.version import ( - PACK_VERSION, - DEFAULT_PACK_VERSION, - PACK_ID_COMPATIBLE_VERSION, -) - -logger = logging.getLogger(__name__) +from forte.data.index import BaseIndex +from forte.data.ontology.core import Entry, EntryType, GroupType, LinkType +from forte.version import PACK_VERSION, DEFAULT_PACK_VERSION __all__ = ["BasePack", "BaseMeta", "PackType"] @@ -112,19 +97,26 @@ class BasePack(EntryContainer[EntryType, LinkType, GroupType]): # pylint: disable=too-many-public-methods def __init__(self, pack_name: Optional[str] = None): super().__init__() + self.links: List[LinkType] = [] + self.groups: List[GroupType] = [] self.pack_version: str = PACK_VERSION self._meta: BaseMeta = self._init_meta(pack_name) self._index: BaseIndex = BaseIndex() - self._data_store: BaseStore - self.__control_component: Optional[str] = None - # This Dict maintains a mapping from entry's tid to the component + # This Dict maintains a mapping from entry's tid to the Entry object + # itself (for MultiPack) or entry's tid (for DataPack) and the component # name associated with the entry. # The component name is used for tracking the "creator" of this entry. - self._pending_entries: Dict[int, Optional[str]] = {} + # TODO: Will need to unify the format for MultiPack and DataPack after + # DataStore is integrated with MultiPack and MultiPack entries. 
In + # future we should only maintain a mapping from entry's tid to the + # corresponding component, i.e., Dict[int, Optional[str]]. + self._pending_entries: Dict[ + int, Tuple[Union[int, Entry], Optional[str]] + ] = {} def __getstate__(self): state = self.__dict__.copy() @@ -134,22 +126,7 @@ def __getstate__(self): return state def __setstate__(self, state): - # Pack version checking. We will no longer provide support for - # serialized Pack whose "pack_version" is less than - # PACK_ID_COMPATIBLE_VERSION. - pack_version: str = ( - state["pack_version"] - if "pack_version" in state - else DEFAULT_PACK_VERSION - ) - if Version(pack_version) < Version(PACK_ID_COMPATIBLE_VERSION): - raise ValueError( - "The pack cannot be deserialized because its version " - f"{pack_version} is outdated. We only support pack with " - f"version greater or equal to {PACK_ID_COMPATIBLE_VERSION}" - ) super().__setstate__(state) - if "meta" in self.__dict__: self._meta = self.__dict__.pop("meta") self.__control_component = None @@ -159,6 +136,9 @@ def __setstate__(self, state): def _init_meta(self, pack_name: Optional[str] = None) -> BaseMeta: raise NotImplementedError + def get_control_component(self): + return self.__control_component + def set_meta(self, **kwargs): for k, v in kwargs.items(): if not hasattr(self._meta, k): @@ -244,6 +224,7 @@ def from_string(cls, data_content: str) -> "BasePack": return pack + @abstractmethod def delete_entry(self, entry: EntryType): r"""Remove the entry from the pack. 
@@ -253,14 +234,7 @@ def delete_entry(self, entry: EntryType): Returns: None """ - self._data_store.delete_entry(tid=entry.tid) - - # update basic index - self._index.remove_entry(entry) - - # set other index invalid - self._index.turn_link_index_switch(on=False) - self._index.turn_group_index_switch(on=False) + raise NotImplementedError def add_entry( self, entry: Union[Entry, int], component_name: Optional[str] = None @@ -307,7 +281,7 @@ def add_all_remaining_entries(self, component: Optional[str] = None): Returns: None """ - for entry, c in list(self._pending_entries.items()): + for entry, c in list(self._pending_entries.values()): c_ = component if component else c self.add_entry(entry, c_) self._pending_entries.clear() @@ -456,171 +430,20 @@ def on_entry_creation( # Use the auto-inferred control component. c = self.__control_component - def entry_getter(cls: Entry, attr_name: str, field_type): - """A getter function for dataclass fields of entry object. - When the field contains ``tid``s, we will convert them to entry - object on the fly. - - Args: - cls: An ``Entry`` class object. - attr_name: The name of the attribute. - field_type: The type of the attribute. - """ - data_store_ref = ( - cls.pack._data_store # pylint: disable=protected-access - ) - attr_val = data_store_ref.get_attribute( - tid=cls.tid, attr_name=attr_name - ) - if field_type in (FList, FDict): - # Generate FList/FDict object on the fly - return field_type(parent_entry=cls, data=attr_val) - try: - # TODO: Find a better solution to determine if a field is Entry - # Will be addressed by https://github.com/asyml/forte/issues/835 - # Convert tid to entry object on the fly - if isinstance(attr_val, int): - # Single pack entry - return cls.pack.get_entry(tid=attr_val) - # The condition below is to check whether the attribute's value - # is a pair of integers - `(pack_id, tid)`. If so we may have - # encountered a `tid` that can only be resolved by - # `MultiPack.get_subentry`. 
- elif ( - isinstance(attr_val, tuple) - and len(attr_val) == 2 - and all(isinstance(element, int) for element in attr_val) - and hasattr(cls.pack, "get_subentry") - ): - # Multi pack entry - return cls.pack.get_subentry(*attr_val) - except KeyError: - pass - return attr_val - - def entry_setter(cls: Entry, value: Any, attr_name: str, field_type): - """A setter function for dataclass fields of entry object. - When the value contains entry objects, we will convert them into - ``tid``s before storing to ``DataStore``. - - Args: - cls: An ``Entry`` class object. - value: The value to be assigned to the attribute. - attr_name: The name of the attribute. - field_type: The type of the attribute. - """ - attr_value: Any - data_store_ref = ( - cls.pack._data_store # pylint: disable=protected-access - ) - # Assumption: Users will not assign value to a FList/FDict field. - # Only internal methods can set the FList/FDict field, and value's - # type has to be Iterator[Entry]/Dict[Any, Entry]. - if field_type is FList: - try: - attr_value = [entry.tid for entry in value] - except AttributeError as e: - raise ValueError( - "You are trying to assign value to a `FList` field, " - "which can only accept an iterator of `Entry` objects." - ) from e - elif field_type is FDict: - try: - attr_value = { - key: entry.tid for key, entry in value.items() - } - except AttributeError as e: - raise ValueError( - "You are trying to assign value to a `FDict` field, " - "which can only accept a mapping whose values are " - "`Entry` objects." - ) from e - elif isinstance(value, Entry): - attr_value = ( - value.tid - if value.pack.pack_id == cls.pack.pack_id - # When value's pack and cls's pack are not the same, we - # assume that cls.pack is a MultiPack, which will resolve - # value.tid using MultiPack.get_subentry(pack_id, tid). - # In this case, both pack_id and tid should be stored. 
- else (value.pack.pack_id, value.tid) - ) - else: - attr_value = value - data_store_ref.set_attribute( - tid=cls.tid, attr_name=attr_name, attr_value=attr_value - ) - - # Save the input entry object in DataStore - self._save_entry_to_data_store(entry=entry) - - # Register property functions for all dataclass fields. - for name, field in entry.__dataclass_fields__.items(): - # Convert the typing annotation to the original class. - # This will be used to determine if a field is FList/FDict. - field_type = get_origin(field.type) - setattr( - type(entry), - name, - # property(fget, fset) will register a conversion layer - # that specifies how to retrieve/assign value of this field. - property( - # We need to bound the attribute name and field type here - # for the getter and setter of each field. - fget=partial( - entry_getter, attr_name=name, field_type=field_type - ), - fset=partial( - entry_setter, attr_name=name, field_type=field_type - ), - ), - ) - # Record that this entry hasn't been added to the index yet. - self._pending_entries[entry.tid] = c + self._pending_entries[entry.tid] = entry, c # TODO: how to make this return the precise type here? def get_entry(self, tid: int) -> EntryType: - r"""Look up the entry_index with ``tid``. Specific implementation + r"""Look up the entry_index with key ``ptr``. 
Specific implementation depends on the actual class.""" - try: - # Try to find entry in DataIndex - entry: EntryType = self._index.get_entry(tid) - except KeyError: - # Find entry in DataStore - entry = self._get_entry_from_data_store(tid=tid) + entry: EntryType = self._index.get_entry(tid) if entry is None: raise KeyError( f"There is no entry with tid '{tid}'' in this datapack" ) return entry - def get_entry_raw(self, tid: int) -> List: - r"""Retrieve the raw entry data in list format from DataStore.""" - return self._data_store.get_entry(tid=tid)[0] - - @abstractmethod - def _save_entry_to_data_store(self, entry: Entry): - r"""Save an existing entry object into DataStore""" - raise NotImplementedError - - @abstractmethod - def _get_entry_from_data_store(self, tid: int) -> EntryType: - r"""Generate a class object from entry data in DataStore""" - raise NotImplementedError - - @property - @abstractmethod - def links(self): - r"""A List container of all links in this data pack.""" - raise NotImplementedError - - @property - @abstractmethod - def groups(self): - r"""A List container of all groups in this pack.""" - raise NotImplementedError - @abstractmethod def get_data( self, context_type, request, skip_k @@ -654,6 +477,7 @@ def get_single(self, entry_type: Union[str, Type[EntryType]]) -> EntryType: Args: entry_type: The entry type to be retrieved. + Returns: A single data entry. """ From 7102b3ec863813f64b0737437c084731bb1778ed Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 29 Jun 2022 15:39:19 -0700 Subject: [PATCH 131/137] kept base_pack.py unchanged --- forte/data/base_pack.py | 228 +++++++++++++++++++++++++++++++++++----- 1 file changed, 202 insertions(+), 26 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 94081acca..a519ab7fd 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import logging import copy import gzip import pickle @@ -27,19 +28,33 @@ Union, Iterator, Dict, - Tuple, Any, Iterable, ) - - +from functools import partial +from typing_inspect import get_origin +from packaging.version import Version import jsonpickle from forte.common import ProcessExecutionException, EntryNotFoundError -from forte.data.container import EntryContainer from forte.data.index import BaseIndex -from forte.data.ontology.core import Entry, EntryType, GroupType, LinkType -from forte.version import PACK_VERSION, DEFAULT_PACK_VERSION +from forte.data.base_store import BaseStore +from forte.data.container import EntryContainer +from forte.data.ontology.core import ( + Entry, + EntryType, + GroupType, + LinkType, + FList, + FDict, +) +from forte.version import ( + PACK_VERSION, + DEFAULT_PACK_VERSION, + PACK_ID_COMPATIBLE_VERSION, +) + +logger = logging.getLogger(__name__) __all__ = ["BasePack", "BaseMeta", "PackType"] @@ -97,26 +112,19 @@ class BasePack(EntryContainer[EntryType, LinkType, GroupType]): # pylint: disable=too-many-public-methods def __init__(self, pack_name: Optional[str] = None): super().__init__() - self.links: List[LinkType] = [] - self.groups: List[GroupType] = [] self.pack_version: str = PACK_VERSION self._meta: BaseMeta = self._init_meta(pack_name) self._index: BaseIndex = BaseIndex() + self._data_store: BaseStore + self.__control_component: Optional[str] = None - # This Dict maintains a mapping from entry's tid to the Entry object - # itself (for MultiPack) or entry's tid (for DataPack) and the component + # This Dict maintains a mapping from entry's tid to the component # name associated with the entry. # The component name is used for tracking the "creator" of this entry. - # TODO: Will need to unify the format for MultiPack and DataPack after - # DataStore is integrated with MultiPack and MultiPack entries. 
In - # future we should only maintain a mapping from entry's tid to the - # corresponding component, i.e., Dict[int, Optional[str]]. - self._pending_entries: Dict[ - int, Tuple[Union[int, Entry], Optional[str]] - ] = {} + self._pending_entries: Dict[int, Optional[str]] = {} def __getstate__(self): state = self.__dict__.copy() @@ -126,6 +134,20 @@ def __getstate__(self): return state def __setstate__(self, state): + # Pack version checking. We will no longer provide support for + # serialized Pack whose "pack_version" is less than + # PACK_ID_COMPATIBLE_VERSION. + pack_version: str = ( + state["pack_version"] + if "pack_version" in state + else DEFAULT_PACK_VERSION + ) + if Version(pack_version) < Version(PACK_ID_COMPATIBLE_VERSION): + raise ValueError( + "The pack cannot be deserialized because its version " + f"{pack_version} is outdated. We only support pack with " + f"version greater or equal to {PACK_ID_COMPATIBLE_VERSION}" + ) super().__setstate__(state) if "meta" in self.__dict__: self._meta = self.__dict__.pop("meta") @@ -136,9 +158,6 @@ def __setstate__(self, state): def _init_meta(self, pack_name: Optional[str] = None) -> BaseMeta: raise NotImplementedError - def get_control_component(self): - return self.__control_component - def set_meta(self, **kwargs): for k, v in kwargs.items(): if not hasattr(self._meta, k): @@ -224,7 +243,6 @@ def from_string(cls, data_content: str) -> "BasePack": return pack - @abstractmethod def delete_entry(self, entry: EntryType): r"""Remove the entry from the pack. 
@@ -234,7 +252,14 @@ def delete_entry(self, entry: EntryType): Returns: None """ - raise NotImplementedError + self._data_store.delete_entry(tid=entry.tid) + + # update basic index + self._index.remove_entry(entry) + + # set other index invalid + self._index.turn_link_index_switch(on=False) + self._index.turn_group_index_switch(on=False) def add_entry( self, entry: Union[Entry, int], component_name: Optional[str] = None @@ -281,7 +306,7 @@ def add_all_remaining_entries(self, component: Optional[str] = None): Returns: None """ - for entry, c in list(self._pending_entries.values()): + for entry, c in list(self._pending_entries.items()): c_ = component if component else c self.add_entry(entry, c_) self._pending_entries.clear() @@ -430,20 +455,171 @@ def on_entry_creation( # Use the auto-inferred control component. c = self.__control_component + def entry_getter(cls: Entry, attr_name: str, field_type): + """A getter function for dataclass fields of entry object. + When the field contains ``tid``s, we will convert them to entry + object on the fly. + + Args: + cls: An ``Entry`` class object. + attr_name: The name of the attribute. + field_type: The type of the attribute. + """ + data_store_ref = ( + cls.pack._data_store # pylint: disable=protected-access + ) + attr_val = data_store_ref.get_attribute( + tid=cls.tid, attr_name=attr_name + ) + if field_type in (FList, FDict): + # Generate FList/FDict object on the fly + return field_type(parent_entry=cls, data=attr_val) + try: + # TODO: Find a better solution to determine if a field is Entry + # Will be addressed by https://github.com/asyml/forte/issues/835 + # Convert tid to entry object on the fly + if isinstance(attr_val, int): + # Single pack entry + return cls.pack.get_entry(tid=attr_val) + # The condition below is to check whether the attribute's value + # is a pair of integers - `(pack_id, tid)`. If so we may have + # encountered a `tid` that can only be resolved by + # `MultiPack.get_subentry`. 
+ elif ( + isinstance(attr_val, tuple) + and len(attr_val) == 2 + and all(isinstance(element, int) for element in attr_val) + and hasattr(cls.pack, "get_subentry") + ): + # Multi pack entry + return cls.pack.get_subentry(*attr_val) + except KeyError: + pass + return attr_val + + def entry_setter(cls: Entry, value: Any, attr_name: str, field_type): + """A setter function for dataclass fields of entry object. + When the value contains entry objects, we will convert them into + ``tid``s before storing to ``DataStore``. + + Args: + cls: An ``Entry`` class object. + value: The value to be assigned to the attribute. + attr_name: The name of the attribute. + field_type: The type of the attribute. + """ + attr_value: Any + data_store_ref = ( + cls.pack._data_store # pylint: disable=protected-access + ) + # Assumption: Users will not assign value to a FList/FDict field. + # Only internal methods can set the FList/FDict field, and value's + # type has to be Iterator[Entry]/Dict[Any, Entry]. + if field_type is FList: + try: + attr_value = [entry.tid for entry in value] + except AttributeError as e: + raise ValueError( + "You are trying to assign value to a `FList` field, " + "which can only accept an iterator of `Entry` objects." + ) from e + elif field_type is FDict: + try: + attr_value = { + key: entry.tid for key, entry in value.items() + } + except AttributeError as e: + raise ValueError( + "You are trying to assign value to a `FDict` field, " + "which can only accept a mapping whose values are " + "`Entry` objects." + ) from e + elif isinstance(value, Entry): + attr_value = ( + value.tid + if value.pack.pack_id == cls.pack.pack_id + # When value's pack and cls's pack are not the same, we + # assume that cls.pack is a MultiPack, which will resolve + # value.tid using MultiPack.get_subentry(pack_id, tid). + # In this case, both pack_id and tid should be stored. 
+ else (value.pack.pack_id, value.tid) + ) + else: + attr_value = value + data_store_ref.set_attribute( + tid=cls.tid, attr_name=attr_name, attr_value=attr_value + ) + + # Save the input entry object in DataStore + self._save_entry_to_data_store(entry=entry) + + # Register property functions for all dataclass fields. + for name, field in entry.__dataclass_fields__.items(): + # Convert the typing annotation to the original class. + # This will be used to determine if a field is FList/FDict. + field_type = get_origin(field.type) + setattr( + type(entry), + name, + # property(fget, fset) will register a conversion layer + # that specifies how to retrieve/assign value of this field. + property( + # We need to bound the attribute name and field type here + # for the getter and setter of each field. + fget=partial( + entry_getter, attr_name=name, field_type=field_type + ), + fset=partial( + entry_setter, attr_name=name, field_type=field_type + ), + ), + ) + # Record that this entry hasn't been added to the index yet. - self._pending_entries[entry.tid] = entry, c + self._pending_entries[entry.tid] = c # TODO: how to make this return the precise type here? def get_entry(self, tid: int) -> EntryType: - r"""Look up the entry_index with key ``ptr``. Specific implementation + r"""Look up the entry_index with ``tid``. 
Specific implementation depends on the actual class.""" - entry: EntryType = self._index.get_entry(tid) + try: + # Try to find entry in DataIndex + entry: EntryType = self._index.get_entry(tid) + except KeyError: + # Find entry in DataStore + entry = self._get_entry_from_data_store(tid=tid) if entry is None: raise KeyError( f"There is no entry with tid '{tid}'' in this datapack" ) return entry + def get_entry_raw(self, tid: int) -> List: + r"""Retrieve the raw entry data in list format from DataStore.""" + return self._data_store.get_entry(tid=tid)[0] + + @abstractmethod + def _save_entry_to_data_store(self, entry: Entry): + r"""Save an existing entry object into DataStore""" + raise NotImplementedError + + @abstractmethod + def _get_entry_from_data_store(self, tid: int) -> EntryType: + r"""Generate a class object from entry data in DataStore""" + raise NotImplementedError + + @property + @abstractmethod + def links(self): + r"""A List container of all links in this data pack.""" + raise NotImplementedError + + @property + @abstractmethod + def groups(self): + r"""A List container of all groups in this pack.""" + raise NotImplementedError + @abstractmethod def get_data( self, context_type, request, skip_k From 2a920234a8ef8d4909a06ff913da6da5b6d72bb5 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 29 Jun 2022 15:43:15 -0700 Subject: [PATCH 132/137] correct docstring --- forte/data/data_pack.py | 1 - 1 file changed, 1 deletion(-) diff --git a/forte/data/data_pack.py b/forte/data/data_pack.py index 913f280b7..fd548832b 100644 --- a/forte/data/data_pack.py +++ b/forte/data/data_pack.py @@ -1067,7 +1067,6 @@ def get_context_data( payload_index: the zero-based index of the Payload in this DataPack's Payload entries of a particular modality. The modality is dependent on ``c_type``. - Defaults to 0. 
Raises: NotImplementedError: raised when the given context type is From 69676ca1ddcbf3d81a32f9f66956fffc52b4dd8a Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 29 Jun 2022 15:43:48 -0700 Subject: [PATCH 133/137] revert changes in unrelevant file --- forte/processors/ir/bert/bert_based_query_creator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/forte/processors/ir/bert/bert_based_query_creator.py b/forte/processors/ir/bert/bert_based_query_creator.py index 6296618f2..cb73b6113 100644 --- a/forte/processors/ir/bert/bert_based_query_creator.py +++ b/forte/processors/ir/bert/bert_based_query_creator.py @@ -45,6 +45,7 @@ def __init__(self) -> None: def initialize(self, resources: Resources, configs: Config): self.resource = resources self.config = configs + self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) From 1f8f8ed7f50acebb4564792032afe11453f60b62 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 29 Jun 2022 16:05:47 -0700 Subject: [PATCH 134/137] revert metric.py changes --- ft/onto/metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ft/onto/metric.py b/ft/onto/metric.py index af7a44d94..b1188061a 100644 --- a/ft/onto/metric.py +++ b/ft/onto/metric.py @@ -1,5 +1,5 @@ # ***automatically_generated*** -# ***source json:forte/ontology_specs/metric.json*** +# ***source json:../testing/metric.json*** # flake8: noqa # mypy: ignore-errors # pylint: skip-file From 453dcc0208ef72e71ee6defda3423287898de832 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 29 Jun 2022 16:06:25 -0700 Subject: [PATCH 135/137] remove unused import --- tests/forte/data/audio_annotation_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/forte/data/audio_annotation_test.py b/tests/forte/data/audio_annotation_test.py index 25b4bc30e..b06963e04 100644 --- a/tests/forte/data/audio_annotation_test.py +++ b/tests/forte/data/audio_annotation_test.py @@ -37,7 +37,6 @@ from ft.onto.base_ontology import ( Recording, 
AudioUtterance, - TextPayload, Utterance, ) From e1fe2bfc6a9004951a3f599fbedc436e09d25c0c Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 29 Jun 2022 16:06:55 -0700 Subject: [PATCH 136/137] remove unused textpayload --- tests/forte/data/readers/audio_reader_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/forte/data/readers/audio_reader_test.py b/tests/forte/data/readers/audio_reader_test.py index 47f880e53..419976370 100644 --- a/tests/forte/data/readers/audio_reader_test.py +++ b/tests/forte/data/readers/audio_reader_test.py @@ -65,7 +65,6 @@ def _process(self, input_pack: DataPack): argmax(self._model(input_values).logits, dim=-1) ) - tp = TextPayload(input_pack, 0) input_pack.set_text(text=transcription[0]) From 6758f76c7a746326a5607bc387453f092a39d0c7 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Wed, 29 Jun 2022 16:09:54 -0700 Subject: [PATCH 137/137] revert changes in audio reader --- forte/data/readers/audio_reader.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/forte/data/readers/audio_reader.py b/forte/data/readers/audio_reader.py index 1952e5c2a..b462a93b9 100644 --- a/forte/data/readers/audio_reader.py +++ b/forte/data/readers/audio_reader.py @@ -20,7 +20,6 @@ from forte.data.data_pack import DataPack from forte.data.data_utils_io import dataset_path_iterator from forte.data.base_reader import PackReader -from ft.onto.base_ontology import AudioPayload __all__ = [ "AudioReader", @@ -54,27 +53,21 @@ def _collect(self, audio_directory) -> Iterator[Any]: # type: ignore Returns: Iterator over paths to audio files """ - # construct ImageMeta and store it in DataPack - return dataset_path_iterator( - audio_directory, - self.configs.file_ext, - ) + return dataset_path_iterator(audio_directory, self.configs.file_ext) def _cache_key_function(self, audio_file: str) -> str: return os.path.basename(audio_file) def _parse_pack(self, file_path: str) -> Iterator[DataPack]: pack: DataPack = DataPack() - 
payload_idx = 0 - # Read in audio data and store in DataPack - # add audio payload into DataPack.payloads - ap = AudioPayload(pack, payload_idx, file_path) - if not self.configs.lazy_read: - audio_data, sample_rate = self.soundfile.read(file_path) - pack.set_audio(audio_data, sample_rate) - ap.sample_rate = sample_rate + # Read in audio data and store in DataPack + audio, sample_rate = self.soundfile.read( + file=file_path, **(self.configs.read_kwargs or {}) + ) + pack.set_audio(audio=audio, sample_rate=sample_rate) pack.pack_name = file_path + yield pack @classmethod @@ -93,4 +86,4 @@ def default_configs(cls): Returns: The default configuration of audio reader. """ - return {"file_ext": ".flac", "lazy_read": False, "read_kwargs": None} + return {"file_ext": ".flac", "read_kwargs": None}