From 2399e9de4638a7703c6959701dcaa61eb015509a Mon Sep 17 00:00:00 2001 From: Heather Piwowar Date: Fri, 25 Sep 2015 08:04:22 -0700 Subject: [PATCH] handle square brackets better --- models/byline.py | 28 +++++++++++++++++++--------- test/test_byline.py | 11 ++++++++++- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/models/byline.py b/models/byline.py index dd71a954..e957a163 100644 --- a/models/byline.py +++ b/models/byline.py @@ -23,8 +23,6 @@ def _clean_byline(self): clean_byline = clean_byline.replace("\n", " ") remove_patterns = [ - "\(.*?\)", - "\[.*?\]", "with.*$", "assistance.*$", "derived from.*$", @@ -39,34 +37,46 @@ def _clean_byline(self): ] for pattern in remove_patterns: clean_byline = re.sub(pattern, "", clean_byline, re.IGNORECASE) - # print pattern, all_authors clean_byline = clean_byline.replace(" & ", ",") clean_byline = re.sub(" and ", ",", clean_byline, re.IGNORECASE) - clean_byline.strip(" .") self.clean_byline = clean_byline return clean_byline def author_email_pairs(self): + print "start:", self.raw_byline clean_byline = self._clean_byline() if not clean_byline: return None responses = [] - for one_author in clean_byline.split(","): + for author_clause in clean_byline.split(","): + author_name = None author_email = None - if "<" in one_author: - (author_name, author_email) = one_author.split("<", 1) + + clause_replace_patterns = [ + "\(.*?\)", + "\[.*?\]", + "\[.*?$" + ] + for pattern in clause_replace_patterns: + author_clause = re.sub(pattern, "", author_clause, re.IGNORECASE) + + if not author_clause or (len(author_clause) < 6): + return None + + if "<" in author_clause: + (author_name, author_email) = author_clause.split("<", 1) author_email = re.sub("(>.*)", "", author_email) if not validate_email(author_email): author_email = None else: - author_name = one_author + author_name = author_clause if author_name: - author_name = author_name.strip(" .'") + author_name = author_name.strip("\t .'") author_name = author_name.strip('"') if author_name or author_email: diff --git a/test/test_byline.py b/test/test_byline.py index 4a677963..54ad3c94 100644 --- a/test/test_byline.py +++ b/test/test_byline.py @@ -25,7 +25,7 @@ class TestByline(unittest.TestCase): def test_clean_byline_string(self): byline = Byline(self.test_bylines[2]) - expected = " Corentin M Barbu , Sebastian Gibb " + expected = ' Corentin M Barbu [aut, cre], Sebastian Gibb [ctb]' assert_equals(byline._clean_byline(), expected) def test_author_email_pairs(self): @@ -41,4 +41,13 @@ def test_author_halt(self): expected = None assert_equals(response, expected) + def test_author_square_brackets(self): + test_string = """Jie (Kate) Hu [aut, cre], + Norman Breslow [aut], + Gary Chan [aut]""" + byline = Byline(test_string) + response = byline.author_email_pairs() + expected = None + assert_equals(response, expected) +