From 3ea59563e1314f7accb5712b2f4cd7ebf9e3c920 Mon Sep 17 00:00:00 2001
From: Nils Herrmann <nils18@live.com.mx>
Date: Mon, 20 May 2024 12:23:17 +0200
Subject: [PATCH 1/3] #119 Modify XPath to retrieve tables and references.

---
 pubmed_parser/pubmed_oa_parser.py | 13 ++++++++-----
 tests/test_pubmed_oa_parser.py    | 13 +++++++++++++
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/pubmed_parser/pubmed_oa_parser.py b/pubmed_parser/pubmed_oa_parser.py
index 5cfd0d9..f3ca9d8 100644
--- a/pubmed_parser/pubmed_oa_parser.py
+++ b/pubmed_parser/pubmed_oa_parser.py
@@ -161,11 +161,11 @@ def parse_pubmed_xml(path, include_path=False, nxml=False):
         journal = ""
 
     dict_article_meta = parse_article_meta(tree)
-    pub_year_node = tree.find(".//pub-date/year")
+    pub_year_node = tree.find(".//pub-date[@pub-type='epub']/year")
     pub_year = pub_year_node.text if pub_year_node is not None else ""
-    pub_month_node = tree.find(".//pub-date/month")
+    pub_month_node = tree.find(".//pub-date[@pub-type='epub']/month")
     pub_month = pub_month_node.text if pub_month_node is not None else "01"
-    pub_day_node = tree.find(".//pub-date/day")
+    pub_day_node = tree.find(".//pub-date[@pub-type='epub']/day")
     pub_day = pub_day_node.text if pub_day_node is not None else "01"
 
     subjects_node = tree.findall(".//article-categories//subj-group/subject")
@@ -264,11 +264,14 @@ def parse_pubmed_references(path):
             ref = reference.find("mixed-citation")
         elif reference.find("element-citation") is not None:
             ref = reference.find("element-citation")
+        elif reference.find("citation") is not None:
+            ref = reference.find("citation")
         else:
             ref = None
 
         if ref is not None:
-            if "publication-type" in ref.attrib.keys() and ref is not None:
+            ref_types = ["citation-type", "publication-type"]
+            if any(ref_type in ref_types for ref_type in ref.attrib.keys()):
                 if ref.attrib.values() is not None:
                     journal_type = ref.attrib.values()[0]
                 else:
@@ -529,7 +532,7 @@ def parse_pubmed_table(path, return_xml=True):
     pmc = dict_article_meta["pmc"]
 
     # parse table
-    tables = tree.xpath(".//body.//sec.//table-wrap")
+    tables = tree.xpath(".//body//table-wrap")
     table_dicts = list()
     for table in tables:
         if table.find("label") is not None:
diff --git a/tests/test_pubmed_oa_parser.py b/tests/test_pubmed_oa_parser.py
index db5e158..bae0963 100644
--- a/tests/test_pubmed_oa_parser.py
+++ b/tests/test_pubmed_oa_parser.py
@@ -39,6 +39,7 @@ def test_parse_pubmed_xml():
     assert parsed_xml.get("doi") == "10.1371/journal.pone.0046493"
     assert parsed_xml.get("subjects") == "Research Article; Biology; Biochemistry; Enzymes; Enzyme Metabolism; Lipids; Fatty Acids; Glycerides; Lipid Metabolism; Neutral Lipids; Metabolism; Lipid Metabolism; Proteins; Globular Proteins; Protein Classes; Recombinant Proteins; Biotechnology; Microbiology; Bacterial Pathogens; Bacteriology; Emerging Infectious Diseases; Host-Pathogen Interaction; Microbial Growth and Development; Microbial Metabolism; Microbial Pathogens; Microbial Physiology; Proteomics; Sequence Analysis; Spectrometric Identification of Proteins"  # noqa
     assert "Competing Interests: " in parsed_xml.get("coi_statement")
+    assert parsed_xml.get('publication_date') == '28-9-2012'
 
 
 def test_parse_pubmed_paragraph():
@@ -63,6 +64,18 @@ def test_parse_pubmed_references():
     assert isinstance(references[0], dict)
     assert len(references) == 58, "Expected references to have length of 29"
 
+    references_9539395 = pp.parse_pubmed_references(pubmed_xml_9539395)
+    assert references_9539395[0].get('pmid') == '36094679'
+
+
+def test_parse_pubmed_table():
+    """
+    Test parsing table from PubMed XML file
+    """
+    table_9539395 = pp.parse_pubmed_table(pubmed_xml_9539395)
+    expected_cols = ['Gene', 'Uninfected and untreated', 'Day 7 postinoculation', 'PBS', 'sACE22.v2.4-IgG1']
+    assert table_9539395[0].get('table_columns') == expected_cols
+
 
 def test_parse_pubmed_caption():
     """

From 0bd12bff5a7458b27c9f5a445a1905d95ad3694b Mon Sep 17 00:00:00 2001
From: Nils Herrmann <nils18@live.com.mx>
Date: Mon, 20 May 2024 12:33:54 +0200
Subject: [PATCH 2/3] #119 Revert error. Some changes from #112 were mistakenly
 included in this PR

---
 pubmed_parser/pubmed_oa_parser.py | 6 +++---
 tests/test_pubmed_oa_parser.py    | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/pubmed_parser/pubmed_oa_parser.py b/pubmed_parser/pubmed_oa_parser.py
index f3ca9d8..b6c619f 100644
--- a/pubmed_parser/pubmed_oa_parser.py
+++ b/pubmed_parser/pubmed_oa_parser.py
@@ -161,11 +161,11 @@ def parse_pubmed_xml(path, include_path=False, nxml=False):
         journal = ""
 
     dict_article_meta = parse_article_meta(tree)
-    pub_year_node = tree.find(".//pub-date[@pub-type='epub']/year")
+    pub_year_node = tree.find(".//pub-date/year")
     pub_year = pub_year_node.text if pub_year_node is not None else ""
-    pub_month_node = tree.find(".//pub-date[@pub-type='epub']/month")
+    pub_month_node = tree.find(".//pub-date/month")
     pub_month = pub_month_node.text if pub_month_node is not None else "01"
-    pub_day_node = tree.find(".//pub-date[@pub-type='epub']/day")
+    pub_day_node = tree.find(".//pub-date/day")
     pub_day = pub_day_node.text if pub_day_node is not None else "01"
 
     subjects_node = tree.findall(".//article-categories//subj-group/subject")
diff --git a/tests/test_pubmed_oa_parser.py b/tests/test_pubmed_oa_parser.py
index bae0963..4bff96e 100644
--- a/tests/test_pubmed_oa_parser.py
+++ b/tests/test_pubmed_oa_parser.py
@@ -39,7 +39,6 @@ def test_parse_pubmed_xml():
     assert parsed_xml.get("doi") == "10.1371/journal.pone.0046493"
     assert parsed_xml.get("subjects") == "Research Article; Biology; Biochemistry; Enzymes; Enzyme Metabolism; Lipids; Fatty Acids; Glycerides; Lipid Metabolism; Neutral Lipids; Metabolism; Lipid Metabolism; Proteins; Globular Proteins; Protein Classes; Recombinant Proteins; Biotechnology; Microbiology; Bacterial Pathogens; Bacteriology; Emerging Infectious Diseases; Host-Pathogen Interaction; Microbial Growth and Development; Microbial Metabolism; Microbial Pathogens; Microbial Physiology; Proteomics; Sequence Analysis; Spectrometric Identification of Proteins"  # noqa
     assert "Competing Interests: " in parsed_xml.get("coi_statement")
-    assert parsed_xml.get('publication_date') == '28-9-2012'
 
 
 def test_parse_pubmed_paragraph():

From a73fa7a0b42824b5721756d15d8f754e258e04ea Mon Sep 17 00:00:00 2001
From: Nils Herrmann <nils18@live.com.mx>
Date: Thu, 23 May 2024 11:55:56 +0200
Subject: [PATCH 3/3] #119 Requested changes: Clean up the convoluted elif chain

---
 pubmed_parser/pubmed_oa_parser.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/pubmed_parser/pubmed_oa_parser.py b/pubmed_parser/pubmed_oa_parser.py
index b6c619f..d769d0d 100644
--- a/pubmed_parser/pubmed_oa_parser.py
+++ b/pubmed_parser/pubmed_oa_parser.py
@@ -235,6 +235,15 @@ def parse_pubmed_xml(path, include_path=False, nxml=False):
     return dict_out
 
 
+def get_reference(reference):
+    """Get reference from one of the three possible positions."""
+    for tag in ["mixed-citation", "element-citation", "citation"]:
+        ref = reference.find(tag)
+        if ref is not None:
+            return ref
+    return None
+
+
 def parse_pubmed_references(path):
     """
     Given path to xml file, parse references articles
@@ -260,15 +269,7 @@ def parse_pubmed_references(path):
     for reference in references:
         ref_id = reference.attrib["id"]
 
-        if reference.find("mixed-citation") is not None:
-            ref = reference.find("mixed-citation")
-        elif reference.find("element-citation") is not None:
-            ref = reference.find("element-citation")
-        elif reference.find("citation") is not None:
-            ref = reference.find("citation")
-        else:
-            ref = None
-
+        ref = get_reference(reference)
         if ref is not None:
             ref_types = ["citation-type", "publication-type"]
             if any(ref_type in ref_types for ref_type in ref.attrib.keys()):