From 5edb8bbcd2332ebb81d0b8a26655f85ada6f4427 Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Wed, 24 Apr 2024 15:18:17 +0200 Subject: [PATCH 1/3] #125 Create exception for empty figures and parse figure subpoints. Further, create one test for every parsed field of a figure. --- pubmed_parser/pubmed_oa_parser.py | 14 +++++++++- tests/test_pubmed_oa_parser.py | 46 ++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/pubmed_parser/pubmed_oa_parser.py b/pubmed_parser/pubmed_oa_parser.py index 925165e..0e0d162 100644 --- a/pubmed_parser/pubmed_oa_parser.py +++ b/pubmed_parser/pubmed_oa_parser.py @@ -428,19 +428,31 @@ def parse_pubmed_caption(path): if figs is not None: for fig in figs: fig_id = fig.attrib["id"] - fig_label = stringify_children(fig.find("label")) + try: + fig_label = stringify_children(fig.find("label")) + except AttributeError: + continue fig_captions = fig.find("caption").getchildren() caption = " ".join([stringify_children(c) for c in fig_captions]) graphic = fig.find("graphic") graphic_ref = None if graphic is not None: graphic_ref = graphic.attrib.values()[0] + list_items = fig.findall(".//list-item") + + fig_subpoints = [] + for list_item in list_items: + sub_label = stringify_children(list_item.find("label")) + sub_text = stringify_children(list_item.find("p")) + fig_subpoints.append((sub_label, sub_text)) + dict_caption = { "pmid": pmid, "pmc": pmc, "fig_caption": caption, "fig_id": fig_id, "fig_label": fig_label, + "fig_subpoints": fig_subpoints, "graphic_ref": graphic_ref, } dict_captions.append(dict_caption) diff --git a/tests/test_pubmed_oa_parser.py b/tests/test_pubmed_oa_parser.py index a0c276a..f936a43 100644 --- a/tests/test_pubmed_oa_parser.py +++ b/tests/test_pubmed_oa_parser.py @@ -17,9 +17,16 @@ def fetch_pubmed_xml(db_dir): return content # Get up-to-date pubmed online article -pubmed_dir = {"3460867": "00/00/PMC3460867", "28298962": "8e/71/PMC5334499"} +pubmed_dir = {"3460867": "00/00/PMC3460867", + "28298962": "8e/71/PMC5334499", + "9539395": "51/b3/PMC9539395" + } pubmed_xml_3460867 = fetch_pubmed_xml(pubmed_dir['3460867']) +pubmed_xml_9539395 = fetch_pubmed_xml(pubmed_dir['9539395']) +captions_9539395 = pp.parse_pubmed_caption(pubmed_xml_9539395) +captions_9539395_fig_1 = captions_9539395[0] + def test_parse_pubmed_xml(): """ @@ -68,3 +75,40 @@ def test_parse_pubmed_caption(): assert ( len(captions) == 4 ), "Expected number of figures/captions to have a length of 4" + + +def test_caption_fig_caption(): + """This is a test for the fig_caption field.""" + fig_caption = 'Aerosol delivery of sACE22.v2.4‐IgG1 alleviates lung injury and improves survival of SARS‐CoV‐2 gamma variant infected K18‐hACE2 transgenic mice \n\n' + assert captions_9539395_fig_1['fig_caption'] == fig_caption + + +def test_caption_fig_id(): + """This is a test for the fig_id field.""" + assert captions_9539395_fig_1['fig_id'] == 'emmm202216109-fig-0001' + + +def test_caption_fig_label(): + """This is a test for the fig_label field.""" + assert captions_9539395_fig_1['fig_label'] == 'Figure 1' + + +def test_caption_fig_subpoints(): + """This is a test for the fig_subpoints field.""" + fig_subpoints = [('A', 'K18‐hACE2 transgenic mice were inoculated with SARS‐CoV‐2 isolate /Japan/TY7‐503/2021 (gamma variant) at 1\u2009×\u2009104 PFU. sACE22.v2.4‐IgG1 (7.5\u2009ml at 8.3\u2009mg/ml in PBS) was delivered to the mice by a nebulizer in 25\u2009min at 12\u2009h, 48\u2009h, and 84\u2009h postinoculation. PBS was aerosol delivered as control.'), ('B, C', 'Survival (B) and weight loss (C). N\u2009=\u200910 mice for each group. The P‐value of the survival curve by the Gehan–Breslow–Wilcoxon test is shown. Error bars for mouse weight are centered on the mean and show SEM.'), ('D', "Viral load in the lung was measured by RT–qPCR on Day 7. The mRNA expression levels of SARS‐CoV‐2 Spike, Nsp, and Rdrp are normalized to the housekeeping gene peptidylprolyl isomerase A (Ppia). Data are presented as mean\u2009±\u2009SEM, N\u2009=\u20094 mice per group. *P\u2009<\u20090.05 by the unpaired Student's t‐test with two‐sided."), ('E', "Cytokine expression levels of Tnfa, Ifng, Il1a, and Il1b were measured by RT–qPCR normalized by Ppia. Data are presented as mean\u2009±\u2009SEM, N\u2009=\u20094 mice per group. *P\u2009<\u20090.05 by the unpaired Student's t‐test with two‐sided."), ('F, G', 'Representative H&E staining of lung sections on Day 7 postinoculation for control PBS group (F) and inhalation of the sACE22.v2.4‐IgG1 group (G). Images at left are low magnifications. Boxed regions (black) are shown at higher magnification on the right. Lungs from 4 independent mice were sectioned, stained, and imaged.')] + assert captions_9539395_fig_1['fig_subpoints'] == fig_subpoints + + +def test_caption_graphic_ref(): + """This is a test for the graphic_ref field.""" + assert captions_9539395_fig_1['graphic_ref'] == 'EMMM-14-e16109-g008' + + +def test_caption_pmc(): + """This is a test for the pmc field.""" + assert captions_9539395_fig_1['pmc'] == '9539395' + + +def test_caption_pmid(): + """This is a test for the pmid field.""" + assert captions_9539395_fig_1['pmid'] == '36094679' From 428cbc628abefe9eb1e865cc6a39f48eb1e6b206 Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Wed, 24 Apr 2024 15:20:56 +0200 Subject: [PATCH 2/3] #125 Minor change --- pubmed_parser/pubmed_oa_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pubmed_parser/pubmed_oa_parser.py b/pubmed_parser/pubmed_oa_parser.py index 0e0d162..9eee248 100644 --- a/pubmed_parser/pubmed_oa_parser.py +++ b/pubmed_parser/pubmed_oa_parser.py @@ -438,8 +438,8 @@ def parse_pubmed_caption(path): graphic_ref = None if graphic is not None: graphic_ref = graphic.attrib.values()[0] - list_items = fig.findall(".//list-item") + list_items = fig.findall(".//list-item") fig_subpoints = [] for list_item in list_items: sub_label = stringify_children(list_item.find("label")) From b47f499f331b647981f5f5b670c60024488470b6 Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Thu, 16 May 2024 10:52:26 +0200 Subject: [PATCH 3/3] #125 Requested changes --- pubmed_parser/pubmed_oa_parser.py | 26 +++++++++-------- tests/test_pubmed_oa_parser.py | 47 ++++++++----------------------- 2 files changed, 27 insertions(+), 46 deletions(-) diff --git a/pubmed_parser/pubmed_oa_parser.py b/pubmed_parser/pubmed_oa_parser.py index 9eee248..5cfd0d9 100644 --- a/pubmed_parser/pubmed_oa_parser.py +++ b/pubmed_parser/pubmed_oa_parser.py @@ -428,23 +428,27 @@ def parse_pubmed_caption(path): if figs is not None: for fig in figs: fig_id = fig.attrib["id"] - try: - fig_label = stringify_children(fig.find("label")) - except AttributeError: - continue - fig_captions = fig.find("caption").getchildren() - caption = " ".join([stringify_children(c) for c in fig_captions]) + + fig_label = fig.find("label") + if fig_label is not None: + fig_label = stringify_children(fig_label) + + fig_captions = fig.find("caption") + if fig_captions is not None: + fig_captions = fig_captions.getchildren() + caption = " ".join([stringify_children(c) for c in fig_captions]) + graphic = fig.find("graphic") graphic_ref = None if graphic is not None: graphic_ref = graphic.attrib.values()[0] list_items = fig.findall(".//list-item") - fig_subpoints = [] + fig_list_items = [] for list_item in list_items: - sub_label = stringify_children(list_item.find("label")) - sub_text = stringify_children(list_item.find("p")) - fig_subpoints.append((sub_label, sub_text)) + item_label = stringify_children(list_item.find("label")) + item_text = stringify_children(list_item.find("p")) + fig_list_items.append((item_label, item_text)) dict_caption = { "pmid": pmid, @@ -452,7 +456,7 @@ def parse_pubmed_caption(path): "fig_caption": caption, "fig_id": fig_id, "fig_label": fig_label, - "fig_subpoints": fig_subpoints, + "fig_list-items": fig_list_items, "graphic_ref": graphic_ref, } dict_captions.append(dict_caption) diff --git a/tests/test_pubmed_oa_parser.py b/tests/test_pubmed_oa_parser.py index f936a43..db5e158 100644 --- a/tests/test_pubmed_oa_parser.py +++ b/tests/test_pubmed_oa_parser.py @@ -25,7 +25,6 @@ def fetch_pubmed_xml(db_dir): pubmed_xml_9539395 = fetch_pubmed_xml(pubmed_dir['9539395']) captions_9539395 = pp.parse_pubmed_caption(pubmed_xml_9539395) -captions_9539395_fig_1 = captions_9539395[0] def test_parse_pubmed_xml(): @@ -77,38 +76,16 @@ def test_parse_pubmed_caption(): ), "Expected number of figures/captions to have a length of 4" -def test_caption_fig_caption(): - """This is a test for the fig_caption field.""" +def test_parse_pubmed_caption_content(): + """This is a test for the caption content.""" fig_caption = 'Aerosol delivery of sACE22.v2.4‐IgG1 alleviates lung injury and improves survival of SARS‐CoV‐2 gamma variant infected K18‐hACE2 transgenic mice \n\n' - assert captions_9539395_fig_1['fig_caption'] == fig_caption - - -def test_caption_fig_id(): - """This is a test for the fig_id field.""" - assert captions_9539395_fig_1['fig_id'] == 'emmm202216109-fig-0001' - - -def test_caption_fig_label(): - """This is a test for the fig_label field.""" - assert captions_9539395_fig_1['fig_label'] == 'Figure 1' - - -def test_caption_fig_subpoints(): - """This is a test for the fig_subpoints field.""" - fig_subpoints = [('A', 'K18‐hACE2 transgenic mice were inoculated with SARS‐CoV‐2 isolate /Japan/TY7‐503/2021 (gamma variant) at 1\u2009×\u2009104 PFU. sACE22.v2.4‐IgG1 (7.5\u2009ml at 8.3\u2009mg/ml in PBS) was delivered to the mice by a nebulizer in 25\u2009min at 12\u2009h, 48\u2009h, and 84\u2009h postinoculation. PBS was aerosol delivered as control.'), ('B, C', 'Survival (B) and weight loss (C). N\u2009=\u200910 mice for each group. The P‐value of the survival curve by the Gehan–Breslow–Wilcoxon test is shown. Error bars for mouse weight are centered on the mean and show SEM.'), ('D', "Viral load in the lung was measured by RT–qPCR on Day 7. The mRNA expression levels of SARS‐CoV‐2 Spike, Nsp, and Rdrp are normalized to the housekeeping gene peptidylprolyl isomerase A (Ppia). Data are presented as mean\u2009±\u2009SEM, N\u2009=\u20094 mice per group. *P\u2009<\u20090.05 by the unpaired Student's t‐test with two‐sided."), ('E', "Cytokine expression levels of Tnfa, Ifng, Il1a, and Il1b were measured by RT–qPCR normalized by Ppia. Data are presented as mean\u2009±\u2009SEM, N\u2009=\u20094 mice per group. *P\u2009<\u20090.05 by the unpaired Student's t‐test with two‐sided."), ('F, G', 'Representative H&E staining of lung sections on Day 7 postinoculation for control PBS group (F) and inhalation of the sACE22.v2.4‐IgG1 group (G). Images at left are low magnifications. Boxed regions (black) are shown at higher magnification on the right. Lungs from 4 independent mice were sectioned, stained, and imaged.')] - assert captions_9539395_fig_1['fig_subpoints'] == fig_subpoints - - -def test_caption_graphic_ref(): - """This is a test for the graphic_ref field.""" - assert captions_9539395_fig_1['graphic_ref'] == 'EMMM-14-e16109-g008' - - -def test_caption_pmc(): - """This is a test for the pmc field.""" - assert captions_9539395_fig_1['pmc'] == '9539395' - - -def test_caption_pmid(): - """This is a test for the pmid field.""" - assert captions_9539395_fig_1['pmid'] == '36094679' + assert captions_9539395[0]['fig_caption'] == fig_caption + assert captions_9539395[0]['fig_id'] == 'emmm202216109-fig-0001' + assert captions_9539395[0]['fig_label'] == 'Figure 1' + assert captions_9539395[8]['fig_label'] is None + fig_list_items = [('A', 'K18‐hACE2 transgenic mice were inoculated with SARS‐CoV‐2 isolate /Japan/TY7‐503/2021 (gamma variant) at 1\u2009×\u2009104 PFU. sACE22.v2.4‐IgG1 (7.5\u2009ml at 8.3\u2009mg/ml in PBS) was delivered to the mice by a nebulizer in 25\u2009min at 12\u2009h, 48\u2009h, and 84\u2009h postinoculation. PBS was aerosol delivered as control.'), ('B, C', 'Survival (B) and weight loss (C). N\u2009=\u200910 mice for each group. The P‐value of the survival curve by the Gehan–Breslow–Wilcoxon test is shown. Error bars for mouse weight are centered on the mean and show SEM.'), ('D', "Viral load in the lung was measured by RT–qPCR on Day 7. The mRNA expression levels of SARS‐CoV‐2 Spike, Nsp, and Rdrp are normalized to the housekeeping gene peptidylprolyl isomerase A (Ppia). Data are presented as mean\u2009±\u2009SEM, N\u2009=\u20094 mice per group. *P\u2009<\u20090.05 by the unpaired Student's t‐test with two‐sided."), ('E', "Cytokine expression levels of Tnfa, Ifng, Il1a, and Il1b were measured by RT–qPCR normalized by Ppia. Data are presented as mean\u2009±\u2009SEM, N\u2009=\u20094 mice per group. *P\u2009<\u20090.05 by the unpaired Student's t‐test with two‐sided."), ('F, G', 'Representative H&E staining of lung sections on Day 7 postinoculation for control PBS group (F) and inhalation of the sACE22.v2.4‐IgG1 group (G). Images at left are low magnifications. Boxed regions (black) are shown at higher magnification on the right. Lungs from 4 independent mice were sectioned, stained, and imaged.')] + assert captions_9539395[0]['fig_list-items'] == fig_list_items + assert captions_9539395[0]['graphic_ref'] == 'EMMM-14-e16109-g008' + assert captions_9539395[8]['graphic_ref'] is None + assert captions_9539395[0]['pmc'] == '9539395' + assert captions_9539395[0]['pmid'] == '36094679'