From 3ea59563e1314f7accb5712b2f4cd7ebf9e3c920 Mon Sep 17 00:00:00 2001 From: Nils Herrmann <nils18@live.com.mx> Date: Mon, 20 May 2024 12:23:17 +0200 Subject: [PATCH 1/3] #119 Modify XPath to retrieve tables and references. --- pubmed_parser/pubmed_oa_parser.py | 13 ++++++++----- tests/test_pubmed_oa_parser.py | 13 +++++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/pubmed_parser/pubmed_oa_parser.py b/pubmed_parser/pubmed_oa_parser.py index 5cfd0d9..f3ca9d8 100644 --- a/pubmed_parser/pubmed_oa_parser.py +++ b/pubmed_parser/pubmed_oa_parser.py @@ -161,11 +161,11 @@ def parse_pubmed_xml(path, include_path=False, nxml=False): journal = "" dict_article_meta = parse_article_meta(tree) - pub_year_node = tree.find(".//pub-date/year") + pub_year_node = tree.find(".//pub-date[@pub-type='epub']/year") pub_year = pub_year_node.text if pub_year_node is not None else "" - pub_month_node = tree.find(".//pub-date/month") + pub_month_node = tree.find(".//pub-date[@pub-type='epub']/month") pub_month = pub_month_node.text if pub_month_node is not None else "01" - pub_day_node = tree.find(".//pub-date/day") + pub_day_node = tree.find(".//pub-date[@pub-type='epub']/day") pub_day = pub_day_node.text if pub_day_node is not None else "01" subjects_node = tree.findall(".//article-categories//subj-group/subject") @@ -264,11 +264,14 @@ def parse_pubmed_references(path): ref = reference.find("mixed-citation") elif reference.find("element-citation") is not None: ref = reference.find("element-citation") + elif reference.find("citation") is not None: + ref = reference.find("citation") else: ref = None if ref is not None: - if "publication-type" in ref.attrib.keys() and ref is not None: + ref_types = ["citation-type", "publication-type"] + if any(ref_type in ref_types for ref_type in ref.attrib.keys()): if ref.attrib.values() is not None: journal_type = ref.attrib.values()[0] else: @@ -529,7 +532,7 @@ def parse_pubmed_table(path, return_xml=True): pmc = 
dict_article_meta["pmc"] # parse table - tables = tree.xpath(".//body.//sec.//table-wrap") + tables = tree.xpath(".//body//table-wrap") table_dicts = list() for table in tables: if table.find("label") is not None: diff --git a/tests/test_pubmed_oa_parser.py b/tests/test_pubmed_oa_parser.py index db5e158..bae0963 100644 --- a/tests/test_pubmed_oa_parser.py +++ b/tests/test_pubmed_oa_parser.py @@ -39,6 +39,7 @@ def test_parse_pubmed_xml(): assert parsed_xml.get("doi") == "10.1371/journal.pone.0046493" assert parsed_xml.get("subjects") == "Research Article; Biology; Biochemistry; Enzymes; Enzyme Metabolism; Lipids; Fatty Acids; Glycerides; Lipid Metabolism; Neutral Lipids; Metabolism; Lipid Metabolism; Proteins; Globular Proteins; Protein Classes; Recombinant Proteins; Biotechnology; Microbiology; Bacterial Pathogens; Bacteriology; Emerging Infectious Diseases; Host-Pathogen Interaction; Microbial Growth and Development; Microbial Metabolism; Microbial Pathogens; Microbial Physiology; Proteomics; Sequence Analysis; Spectrometric Identification of Proteins" # noqa assert "Competing Interests: " in parsed_xml.get("coi_statement") + assert parsed_xml.get('publication_date') == '28-9-2012' def test_parse_pubmed_paragraph(): @@ -63,6 +64,18 @@ def test_parse_pubmed_references(): assert isinstance(references[0], dict) assert len(references) == 58, "Expected references to have length of 29" + references_9539395 = pp.parse_pubmed_references(pubmed_xml_9539395) + assert references_9539395[0].get('pmid') == '36094679' + + +def test_parse_pubmed_table(): + """ + Test parsing table from PubMed XML file + """ + table_9539395 = pp.parse_pubmed_table(pubmed_xml_9539395) + expected_cols = ['Gene', 'Uninfected and untreated', 'Day 7 postinoculation', 'PBS', 'sACE22.v2.4-IgG1'] + assert table_9539395[0].get('table_columns') == expected_cols + def test_parse_pubmed_caption(): """ From 0bd12bff5a7458b27c9f5a445a1905d95ad3694b Mon Sep 17 00:00:00 2001 From: Nils Herrmann 
<nils18@live.com.mx> Date: Mon, 20 May 2024 12:33:54 +0200 Subject: [PATCH 2/3] #119 Revert error. Some changes from #112 were mistakenly in this PR --- pubmed_parser/pubmed_oa_parser.py | 6 +++--- tests/test_pubmed_oa_parser.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pubmed_parser/pubmed_oa_parser.py b/pubmed_parser/pubmed_oa_parser.py index f3ca9d8..b6c619f 100644 --- a/pubmed_parser/pubmed_oa_parser.py +++ b/pubmed_parser/pubmed_oa_parser.py @@ -161,11 +161,11 @@ def parse_pubmed_xml(path, include_path=False, nxml=False): journal = "" dict_article_meta = parse_article_meta(tree) - pub_year_node = tree.find(".//pub-date[@pub-type='epub']/year") + pub_year_node = tree.find(".//pub-date/year") pub_year = pub_year_node.text if pub_year_node is not None else "" - pub_month_node = tree.find(".//pub-date[@pub-type='epub']/month") + pub_month_node = tree.find(".//pub-date/month") pub_month = pub_month_node.text if pub_month_node is not None else "01" - pub_day_node = tree.find(".//pub-date[@pub-type='epub']/day") + pub_day_node = tree.find(".//pub-date/day") pub_day = pub_day_node.text if pub_day_node is not None else "01" subjects_node = tree.findall(".//article-categories//subj-group/subject") diff --git a/tests/test_pubmed_oa_parser.py b/tests/test_pubmed_oa_parser.py index bae0963..4bff96e 100644 --- a/tests/test_pubmed_oa_parser.py +++ b/tests/test_pubmed_oa_parser.py @@ -39,7 +39,6 @@ def test_parse_pubmed_xml(): assert parsed_xml.get("doi") == "10.1371/journal.pone.0046493" assert parsed_xml.get("subjects") == "Research Article; Biology; Biochemistry; Enzymes; Enzyme Metabolism; Lipids; Fatty Acids; Glycerides; Lipid Metabolism; Neutral Lipids; Metabolism; Lipid Metabolism; Proteins; Globular Proteins; Protein Classes; Recombinant Proteins; Biotechnology; Microbiology; Bacterial Pathogens; Bacteriology; Emerging Infectious Diseases; Host-Pathogen Interaction; Microbial Growth and Development; Microbial Metabolism; Microbial Pathogens; 
Microbial Physiology; Proteomics; Sequence Analysis; Spectrometric Identification of Proteins" # noqa assert "Competing Interests: " in parsed_xml.get("coi_statement") - assert parsed_xml.get('publication_date') == '28-9-2012' def test_parse_pubmed_paragraph(): From a73fa7a0b42824b5721756d15d8f754e258e04ea Mon Sep 17 00:00:00 2001 From: Nils Herrmann <nils18@live.com.mx> Date: Thu, 23 May 2024 11:55:56 +0200 Subject: [PATCH 3/3] #119 Requested changes: Clean up the elif convolute --- pubmed_parser/pubmed_oa_parser.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pubmed_parser/pubmed_oa_parser.py b/pubmed_parser/pubmed_oa_parser.py index b6c619f..d769d0d 100644 --- a/pubmed_parser/pubmed_oa_parser.py +++ b/pubmed_parser/pubmed_oa_parser.py @@ -235,6 +235,15 @@ def parse_pubmed_xml(path, include_path=False, nxml=False): return dict_out +def get_reference(reference): + """Get reference from one of the three possible positions.""" + for tag in ["mixed-citation", "element-citation", "citation"]: + ref = reference.find(tag) + if ref is not None: + return ref + return None + + def parse_pubmed_references(path): """ Given path to xml file, parse references articles @@ -260,15 +269,7 @@ def parse_pubmed_references(path): for reference in references: ref_id = reference.attrib["id"] - if reference.find("mixed-citation") is not None: - ref = reference.find("mixed-citation") - elif reference.find("element-citation") is not None: - ref = reference.find("element-citation") - elif reference.find("citation") is not None: - ref = reference.find("citation") - else: - ref = None - + ref = get_reference(reference) if ref is not None: ref_types = ["citation-type", "publication-type"] if any(ref_type in ref_types for ref_type in ref.attrib.keys()):