Skip to content

Commit

Permalink
add test for xml external file
Browse files Browse the repository at this point in the history
lukavdplas committed Apr 25, 2024
1 parent b773091 commit 56ae944
Showing 3 changed files with 66 additions and 1 deletion.
3 changes: 2 additions & 1 deletion ianalyzer_readers/extract.py
Original file line number Diff line number Diff line change
@@ -301,14 +301,15 @@ class XML(Extractor):
with two keys: `'tag'` gives the name of the sibling tag. The other key can be
`'exact'`, which gives a string to match, or `'match'`, which gives the name of
a metadata field against which to match the content. If this field has
`external_file=True`, then `'match'` can also give the name of another ifeld in
`external_file=True`, then `'match'` can also give the name of another field in
the reader, which has `external_file=False`.
external_file: This property can be set to look through a secondary XML file
(usually one containing metadata). It requires that the passed metadata have an
`'external_file'` key that specifies the path to the file. This parameter
specifies the toplevel tag and entry level tag for that file; if set, the
extractor will extract this field from the external file instead of the current
source file.
The value for `entry_tag` is only used if the extractor does not have a `se
transform_soup_func: A function to transform the soup directly after the tag is
selected, before further processing (attributes, flattening, etc) to extract
the value from it. Keep in mind that the soup passed could be `None` if no
2 changes: 2 additions & 0 deletions ianalyzer_readers/readers/xml.py
Original file line number Diff line number Diff line change
@@ -230,6 +230,8 @@ def _bowl_from_soup(self, soup, toplevel_tag=None, entry_tag=None, metadata = {}
'''
if toplevel_tag == None:
toplevel_tag = self._get_tag_requirements(self.tag_toplevel, metadata)
else:
toplevel_tag = self._get_tag_requirements(toplevel_tag, metadata)

return soup.find(**toplevel_tag) if toplevel_tag else soup

62 changes: 62 additions & 0 deletions tests/xml/test_xml_extraction.py
Original file line number Diff line number Diff line change
@@ -214,7 +214,69 @@ def test_xml_toplevel_tag_dict(tmpdir):
</play>
'''


def test_xml_secondary_tag(tmpdir):
    """Check that `secondary_tag` with `'exact'` selects the right `l` tag.

    The extractor targets `l` tags and is restricted by a sibling
    `character` tag whose content must be exactly `'GHOST'`; the expected
    extracted value is `'Mark me.'`.
    """
    ghost_line_extractor = XML(
        'l',
        secondary_tag={'tag': 'character', 'exact': 'GHOST'},
    )
    reader = make_test_reader(
        ghost_line_extractor, 'play', 'scene', doc_longer, tmpdir
    )
    assert_extractor_output(reader, 'Mark me.')

# Source document fixture: a single play with a <title> and one <lines> entry.
# Used by test_xml_external_file as the main XML file being read.
doc_with_title = '''
<?xml version="1.0" encoding="UTF-8"?>
<play>
<title>Hamlet</title>
<lines>
<character>HAMLET</character>
<l>Whither wilt thou lead me? Speak, I'll go no further.</l>
</lines>
</play>
'''

# External metadata fixture: a bibliography listing several plays, each with
# a <title> and an <author>. Used by test_xml_external_file as the secondary
# XML file from which the author is looked up.
external_doc = '''
<?xml version="1.0" encoding="UTF-8"?>
<bibliography>
<play>
<title>Doctor Faustus</title>
<author>Christopher Marlowe</author>
</play>
<play>
<title>Hamlet</title>
<author>William Shakespeare</author>
</play>
</bibliography>
'''

def test_xml_external_file(tmpdir):
    """Check that an XML extractor can pull a value from an external file.

    Two files are written to the temporary directory: the source document
    (`doc_with_title`) and a bibliography (`external_doc`). The reader yields
    the source file with metadata pointing at the bibliography through the
    `'external_file'` key. The `author` field is configured with
    `external_file` and a `secondary_tag` that matches on the reader's
    `title` field, so the expected author is the one listed next to the
    'Hamlet' title in the bibliography.
    """
    source_path = os.path.join(tmpdir, 'test.xml')
    external_path = os.path.join(tmpdir, 'metadata.xml')

    # Write both fixtures to disk before the reader is instantiated.
    fixtures = (
        (source_path, doc_with_title),
        (external_path, external_doc),
    )
    for target, content in fixtures:
        with open(target, 'w') as handle:
            handle.write(content)

    # Looked up in the external file, matched against the `title` field.
    author_extractor = XML(
        'author',
        secondary_tag={'tag': 'title', 'match': 'title'},
        external_file={'xml_tag_toplevel': 'bibliography', 'xml_tag_entry': None}
    )
    # Read from the source document itself.
    title_extractor = XML('title', toplevel=True)

    class TestReader(XMLReader):
        data_directory = tmpdir
        tag_toplevel = 'play'
        tag_entry = 'lines'

        fields = [
            Field(name='author', extractor=author_extractor),
            Field(name='title', extractor=title_extractor),
        ]

        def sources(self, *args, **kwargs):
            # Single source; the metadata tells the extractor where the
            # external XML file lives.
            yield source_path, {'external_file': external_path}

    reader = TestReader()
    first_document = next(reader.documents())

    assert first_document['author'] == 'William Shakespeare'

0 comments on commit 56ae944

Please sign in to comment.