This repository has been archived by the owner on Mar 26, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathtry_iter_data.py
118 lines (90 loc) · 4.17 KB
/
try_iter_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
Python protocols listing.
"""
# pylint: disable=too-few-public-methods, protected-access, not-an-iterable
from typing import Sequence
class DoNotKnowLenDataset:
"""
Just a class with some state.
"""
def __init__(self, data: Sequence[tuple[str, str, int, int]]) -> None:
self._data = data
class HaveLengthDataset(DoNotKnowLenDataset):
"""
A class with some state that knows its length.
"""
def __len__(self) -> int:
return len(self._data)
class IterableDataset(DoNotKnowLenDataset):
"""
A class with some state that knows its length and can be a part of for loop.
"""
def __getitem__(self, item: int) -> tuple[str, str, int, int]:
return self._data[item]
def main() -> None:
"""
Entrypoint for the listing.
"""
# 1. Data is taken from assets/danetqa_example.csv
raw_data = (
(
'Есть ли вода на марсе?',
'Гидросфера Марса — это совокупность водных запасов планеты Марс',
0,
1
),
(
'Состоит ли англия в евросоюзе?',
'В полночь с 31 января на 1 февраля 2020 года по центральноевропейскому времени '
'после 47 лет членства Великобритания официально покинула Евросоюз.',
1,
0
),
(
'Действительно ли в ссср не было адвокатов?',
'Семён Львович Ария — советский и российский юрист, один из '
'крупнейших советских адвокатов',
2,
0
)
)
# 2. Instantiate dataset
plain_dataset = DoNotKnowLenDataset(raw_data)
###############################################################################################
# Problems:
# - dataset does not know its length
# - dataset does not know how to be iterable for "for" loop
###############################################################################################
# Problem 1. Automatic length collection via len() does not work
try:
print(f'Number of samples in CannotIterateDataset dataset: {len(plain_dataset)}')
except TypeError:
print('len() method is not supported for class CannotIterateDataset')
# Workaround for Problem 1. To get length we need to know internal structure of the object:
# how it stores data, access via protected attribute, which violates public API
print('Number of samples in CannotIterateDataset dataset '
f'(WORKAROUND): {len(plain_dataset._data)}')
# Solution for Problem 1. To get length we use built-in len() that works
# on top of implemented __len__ protocol. Public API is not violated
len_dataset = HaveLengthDataset(raw_data)
print(f'Number of samples in HaveLengthDataset dataset (LEN): {len(len_dataset)}')
###############################################################################################
# Problem 2. Iterate over dataset samples
try:
for question, context, sample_id, answer in len_dataset:
print(question, context, sample_id, answer)
except TypeError:
print('HaveLengthDataset is not iterable!')
# Workaround for Problem 2. To iterate we need to know internal structure of the object:
# how it stores data, access via protected attribute, which violates public API
print('Iterating over HaveLengthDataset (WORKAROUND)')
for question, context, sample_id, answer in len_dataset._data:
print('\t', question, context, sample_id, answer)
# Solution for Problem 2. To loop over dataset with "for" loop
# need to implement __getitem__ protocol. Public API is not violated
iter_dataset = IterableDataset(raw_data)
print('Iterating over HaveLengthDataset (GET_ITEM)')
for question, context, sample_id, answer in iter_dataset:
print('\t', question, context, sample_id, answer)
if __name__ == '__main__':
main()