add test for xml external file

CentreForDigitalHumanities · Apr 25, 2024 · 56ae944 · 56ae944
1 parent b773091
commit 56ae944
Showing 3 changed files with 66 additions and 1 deletion.
diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py
@@ -301,14 +301,15 @@ class XML(Extractor):
             with two keys: `'tag'` gives the name of the sibling tag. The other key can be
             `'exact'`, which gives a string to match, or `'match'`, which gives the name of
             a metadata field against which to match the content. If this field has
-            `external_file=True`, then `'match'` can also give the name of another ifeld in
+            `external_file=True`, then `'match'` can also give the name of another field in
             the reader, which has `external_file=False`.
         external_file: This property can be set to look through a secondary XML file
             (usually one containing metadata). It requires that the passed metadata have an
             `'external_file'` key that specifies the path to the file. This parameter
             specifies the toplevel tag and entry level tag for that file; if set, the
             extractor will extract this field from the external file instead of the current
             source file.
+            The value for `entry_tag` is only used if the extractor does not have a `se
         transform_soup_func: A function to transform the soup directly after the tag is
             selected, before further processing (attributes, flattening, etc) to extract
             the value from it. Keep in mind that the soup passed could be `None` if no

diff --git a/ianalyzer_readers/readers/xml.py b/ianalyzer_readers/readers/xml.py
@@ -230,6 +230,8 @@ def _bowl_from_soup(self, soup, toplevel_tag=None, entry_tag=None, metadata = {}
         '''
         if toplevel_tag == None:
             toplevel_tag = self._get_tag_requirements(self.tag_toplevel, metadata)
+        else:
+            toplevel_tag = self._get_tag_requirements(toplevel_tag, metadata)
 
         return soup.find(**toplevel_tag) if toplevel_tag else soup
 

diff --git a/tests/xml/test_xml_extraction.py b/tests/xml/test_xml_extraction.py
@@ -214,7 +214,69 @@ def test_xml_toplevel_tag_dict(tmpdir):
 </play>
 '''
 
+
 def test_xml_secondary_tag(tmpdir):
     extractor = XML('l', secondary_tag={'tag': 'character', 'exact': 'GHOST'})
     reader = make_test_reader(extractor, 'play', 'scene', doc_longer, tmpdir)
     assert_extractor_output(reader, 'Mark me.')
+
+doc_with_title = '''
+<?xml version="1.0" encoding="UTF-8"?>
+<play>
+    <title>Hamlet</title>
+    <lines>
+        <character>HAMLET</character>
+        <l>Whither wilt thou lead me? Speak, I'll go no further.</l>
+    </lines>
+</play>
+'''
+
+external_doc = '''
+<?xml version="1.0" encoding="UTF-8"?>
+<bibliography>
+    <play>        
+        <title>Doctor Faustus</title>
+        <author>Christopher Marlowe</author>
+    </play>
+    <play>
+        <title>Hamlet</title>
+        <author>William Shakespeare</author>
+    </play>
+</bibliography>
+'''
+
+def test_xml_external_file(tmpdir):
+    path = os.path.join(tmpdir, 'test.xml')
+    with open(path, 'w') as f:
+        f.write(doc_with_title)
+    external_path = os.path.join(tmpdir, 'metadata.xml')
+    with open(external_path, 'w') as f:
+        f.write(external_doc)
+
+    class TestReader(XMLReader):
+        data_directory = tmpdir
+        tag_toplevel = 'play'
+        tag_entry = 'lines'
+
+        def sources(self, *args, **kwargs):
+            yield path, {'external_file': external_path}
+
+        fields = [
+            Field(
+                name='author',
+                extractor=XML(
+                    'author',
+                    secondary_tag={'tag': 'title', 'match': 'title'},
+                    external_file={'xml_tag_toplevel': 'bibliography', 'xml_tag_entry': None}
+                )
+            ),
+            Field(
+                name='title',
+                extractor=XML('title', toplevel=True)
+            )
+        ]
+
+    reader = TestReader()
+    doc = next(reader.documents())
+
+    assert doc['author'] == 'William Shakespeare'