Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/chipper repetitions #314

Closed
wants to merge 41 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
8032818
New stop criterion and logits processor for tables
ajjimeno Nov 22, 2023
6fc6a3a
Revised repetition setting
ajjimeno Nov 23, 2023
239d463
Adding table processing
ajjimeno Nov 23, 2023
e09548a
Hyperparameter selection
ajjimeno Nov 23, 2023
78b255c
Merge branch 'main' into feat/chipper-repetitions
ajjimeno Nov 23, 2023
2795e54
Moving cleaning in post-processing
ajjimeno Nov 26, 2023
312b264
Moving cleaning in post-processing
ajjimeno Nov 26, 2023
93cfbba
Updated version
ajjimeno Nov 26, 2023
3dc9003
Added docstrings
ajjimeno Nov 26, 2023
91606d4
IoU union_area == 0.0
ajjimeno Nov 26, 2023
1dfc3d7
Linting
ajjimeno Nov 26, 2023
c571014
Linting
ajjimeno Nov 26, 2023
5446658
Fixed bbox == None in iou
ajjimeno Nov 26, 2023
a7de901
Linting
ajjimeno Nov 26, 2023
1422f99
Merge branch 'main' into feat/chipper-repetitions
ajjimeno Nov 29, 2023
a4c791e
Merge branch 'main' into feat/chipper-repetitions
ajjimeno Dec 20, 2023
2811001
Revised after merging with main
ajjimeno Dec 20, 2023
380d92c
Revised version
ajjimeno Dec 20, 2023
6845489
Additional tests
ajjimeno Dec 20, 2023
420202a
Fixing model selection
ajjimeno Dec 20, 2023
787af10
Moving element to cpu
ajjimeno Dec 20, 2023
0f5321f
Not remove nested list parent/children
ajjimeno Dec 20, 2023
c7b7faa
Fixed bug on repetition removal
ajjimeno Jan 2, 2024
d984279
Fixed bbox errors
ajjimeno Jan 2, 2024
e3b2123
Fixed bbox errors
ajjimeno Jan 2, 2024
491ec7d
Merge branch 'feat/chipper-repetitions' of github.com:Unstructured-IO…
ajjimeno Jan 2, 2024
081d5f9
Merge branch 'main' into feat/chipper-repetitions
ajjimeno Jan 2, 2024
4f56cbf
Do not sort Chipper elements
ajjimeno Jan 3, 2024
6fdc6b4
Updated version and fixed tests
ajjimeno Jan 3, 2024
36bfd3e
Update unstructured_inference/models/chipper.py
ajjimeno Jan 31, 2024
584955b
Update unstructured_inference/models/chipper.py
ajjimeno Jan 31, 2024
a965f70
Revised Chipper
ajjimeno Jan 31, 2024
ca82299
Merge branch 'main' into feat/chipper-repetitions
ajjimeno Jan 31, 2024
7a32f28
Fixed issue with repetitions and linting over LongTensor vs Tensor
ajjimeno Feb 1, 2024
9c19756
Fixes possible issue with repeated picture elements
ajjimeno Feb 1, 2024
35dbbe5
Fix issue with repeated images
ajjimeno Feb 1, 2024
2ea363b
Updated LongTensor casting
ajjimeno Feb 4, 2024
5123b8b
Removed unused line
ajjimeno Feb 4, 2024
68f1bbb
Linting
ajjimeno Feb 4, 2024
ab3b6f6
Removed LongTensor casting
ajjimeno Feb 4, 2024
61ad2c7
Fixed mypy LongTensor error
ajjimeno Feb 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.7.24

* Revised repetitions for Chipper

## 0.7.23

* fix: added handling in `UnstructuredTableTransformerModel` for if `recognize` returns an empty
Expand Down
7 changes: 2 additions & 5 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,17 +214,14 @@ def points(self):

class MockPageLayout(layout.PageLayout):
def __init__(
self,
number=1,
image=None,
model=None,
detection_model=None,
self, number=1, image=None, model=None, detection_model=None, element_extraction_model=None
):
self.image = image
self.layout = layout
self.model = model
self.number = number
self.detection_model = detection_model
self.element_extraction_model = element_extraction_model


@pytest.mark.parametrize(
Expand Down
59 changes: 53 additions & 6 deletions test_unstructured_inference/models/test_chippermodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def test_no_repeat_ngram_logits():

no_repeat_ngram_size = 2

logitsProcessor = chipper.NoRepeatNGramLogitsProcessor(ngram_size=2)
logitsProcessor = chipper.NoRepeatNGramLogitsProcessor(ngram_size=2, context_length=10)
output = logitsProcessor(input_ids=input_ids, scores=logits)

assert (
Expand Down Expand Up @@ -194,20 +194,64 @@ def test_ngram_repetiton_stopping_criteria():
logits = torch.tensor([[0.1, -0.3, -0.5, 0, 1.0, -0.9]])

stoppingCriteria = chipper.NGramRepetitonStoppingCriteria(
repetition_window=2, skip_tokens={0, 1, 2, 3, 4}
ngram_size=2, skip_tokens={0, 1, 2, 3, 4}
)

output = stoppingCriteria(input_ids=input_ids, scores=logits)

assert output is False

stoppingCriteria = chipper.NGramRepetitonStoppingCriteria(
repetition_window=2, skip_tokens={1, 2, 3, 4}
ngram_size=2, skip_tokens={1, 2, 3, 4}
)
output = stoppingCriteria(input_ids=input_ids, scores=logits)
assert output is True


def test_no_repeat_group_ngram_logits_processor():
    """Check NoRepeatGroupNGramLogitsProcessor on two token sequences.

    The processor is built with ``ngram_size=3`` and ``token_group=[1, 2]``.
    For the first sequence the scores are expected to come back unchanged;
    for the second, the scores at positions 1 and 2 are expected to be
    ``-inf`` while the remaining entries keep their values.
    """

    def count_matching(actual, expected):
        # Number of positions where the two tensors hold the same value.
        return int(torch.sum(actual == expected))

    processor = chipper.NoRepeatGroupNGramLogitsProcessor(ngram_size=3, token_group=[1, 2])

    # Case 1: every one of the six scores must equal its input value.
    first_ids = torch.tensor([[1, 2, 3, 4, 0, 1, 2, 3, 4]])
    first_scores = torch.tensor([[0.1, -0.3, -0.5, 0, 1.0, -0.9]])
    first_expected = torch.tensor([[0.1000, -0.3000, -0.5000, 0.0000, 1.0000, -0.9000]])
    first_output = processor(input_ids=first_ids, scores=first_scores)
    assert count_matching(first_output, first_expected) == 6

    # Case 2: a fresh score tensor is passed so this call does not depend on
    # whatever the first call did to its input.
    second_ids = torch.tensor([[1, 1, 2, 1, 2, 1, 2, 1, 2]])
    second_scores = torch.tensor([[0.1, -0.3, -0.5, 0, 1.0, -0.9]])
    second_expected = torch.tensor(
        [[0.1000, -float("inf"), -float("inf"), 0.0000, 1.0000, -0.9000]],
    )
    second_output = processor(input_ids=second_ids, scores=second_scores)
    assert count_matching(second_output, second_expected) == 6


def test_target_token_id_stopping_criterion():
    """Check that TargetTokenIdStoppingCriterion(1) returns True for ids [1, 2, 3]."""
    criterion = chipper.TargetTokenIdStoppingCriterion(1)

    token_ids = torch.tensor([1, 2, 3])
    token_scores = torch.tensor([0.1, 0.2, 0.3])

    # The result must be the literal True, not merely a truthy value.
    should_stop = criterion(input_ids=token_ids, scores=token_scores)
    assert should_stop is True


@pytest.mark.parametrize(
("decoded_str", "expected_classes"),
[
Expand Down Expand Up @@ -259,7 +303,7 @@ def test_predict_tokens_beam_indices():
model = get_model("chipper")
model.stopping_criteria = [
chipper.NGramRepetitonStoppingCriteria(
repetition_window=1,
ngram_size=1,
skip_tokens={},
),
]
Expand Down Expand Up @@ -294,9 +338,12 @@ def test_deduplicate_detected_elements():
assert len(output) == 2


def test_norepeatnGramlogitsprocessor_exception():
def test_logitsprocessor_exception():
with pytest.raises(ValueError):
chipper.NoRepeatNGramLogitsProcessor(ngram_size="", context_length=10)

with pytest.raises(ValueError):
chipper.NoRepeatNGramLogitsProcessor(ngram_size="")
chipper.NoRepeatGroupNGramLogitsProcessor(ngram_size="", token_group={})


def test_run_chipper_v3():
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.23" # pragma: no cover
__version__ = "0.7.24" # pragma: no cover
4 changes: 3 additions & 1 deletion unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,9 @@ def get_elements_from_layout(

# If the model is a chipper model, we don't want to order the
# elements, as they are already ordered
order_elements = not isinstance(self.detection_model, UnstructuredChipperModel)
order_elements = not isinstance(
self.element_extraction_model, UnstructuredChipperModel
) and not isinstance(self.detection_model, UnstructuredChipperModel)
if order_elements:
layout = order_layout(layout)

Expand Down
Loading
Loading