diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index f85923c..bec74b7 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -301,7 +301,7 @@ class XML(Extractor): with two keys: `'tag'` gives the name of the sibling tag. The other key can be `'exact'`, which gives a string to match, or `'match'`, which gives the name of a metadata field against which to match the content. If this field has - `external_file=True`, then `'match'` can also give the name of another ifeld in + `external_file=True`, then `'match'` can also give the name of another field in the reader, which as `external_file=False`. external_file: This property can be set to look through a secondary XML file (usually one containing metadata). It requires that the passed metadata have an @@ -309,6 +309,7 @@ class XML(Extractor): specifies the toplevel tag and entry level tag for that file; if set, the extractor will extract this field from the external file instead of the current source file. + The value for `entry_tag` is only used if the extractor does not have a `se transform_soup_func: A function to transform the soup directly after the tag is selected, before further processing (attributes, flattening, etc) to extract the value from it. Keep in mind that the soup passed could be `None` if no diff --git a/ianalyzer_readers/readers/xml.py b/ianalyzer_readers/readers/xml.py index 1260b70..7fd46c0 100644 --- a/ianalyzer_readers/readers/xml.py +++ b/ianalyzer_readers/readers/xml.py @@ -230,6 +230,8 @@ def _bowl_from_soup(self, soup, toplevel_tag=None, entry_tag=None, metadata = {} ''' if toplevel_tag == None: toplevel_tag = self._get_tag_requirements(self.tag_toplevel, metadata) + else: + toplevel_tag = self._get_tag_requirements(toplevel_tag, metadata) return soup.find(**toplevel_tag) if toplevel_tag else soup diff --git a/tests/xml/test_xml_extraction.py b/tests/xml/test_xml_extraction.py index a3f27d4..ca0fc35 100644 --- a/tests/xml/test_xml_extraction.py +++ b/tests/xml/test_xml_extraction.py @@ -214,7 +214,69 @@ def test_xml_toplevel_tag_dict(tmpdir): ''' + def test_xml_secondary_tag(tmpdir): extractor = XML('l', secondary_tag={'tag': 'character', 'exact': 'GHOST'}) reader = make_test_reader(extractor, 'play', 'scene', doc_longer, tmpdir) assert_extractor_output(reader, 'Mark me.') + +doc_with_title = ''' + + + Hamlet + + HAMLET + Whither wilt thou lead me? Speak, I'll go no further. + + +''' + +external_doc = ''' + + + + Doctor Faustus + Christopher Marlowe + + + Hamlet + William Shakespeare + + +''' + +def test_xml_external_file(tmpdir): + path = os.path.join(tmpdir, 'test.xml') + with open(path, 'w') as f: + f.write(doc_with_title) + external_path = os.path.join(tmpdir, 'metadata.xml') + with open(external_path, 'w') as f: + f.write(external_doc) + + class TestReader(XMLReader): + data_directory = tmpdir + tag_toplevel = 'play' + tag_entry = 'lines' + + def sources(self, *args, **kwargs): + yield path, {'external_file': external_path} + + fields = [ + Field( + name='author', + extractor=XML( + 'author', + secondary_tag={'tag': 'title', 'match': 'title'}, + external_file={'xml_tag_toplevel': 'bibliography', 'xml_tag_entry': None} + ) + ), + Field( + name='title', + extractor=XML('title', toplevel=True) + ) + ] + + reader = TestReader() + doc = next(reader.documents()) + + assert doc['author'] == 'William Shakespeare'