Skip to content

Commit

Permalink
handle square brackets better
Browse files Browse the repository at this point in the history
  • Loading branch information
hpiwowar committed Sep 25, 2015
1 parent f171b1a commit 2399e9d
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 10 deletions.
28 changes: 19 additions & 9 deletions models/byline.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ def _clean_byline(self):
clean_byline = clean_byline.replace("\n", " ")

remove_patterns = [
"\(.*?\)",
"\[.*?\]",
"with.*$",
"assistance.*$",
"derived from.*$",
Expand All @@ -39,34 +37,46 @@ def _clean_byline(self):
]
for pattern in remove_patterns:
clean_byline = re.sub(pattern, "", clean_byline, re.IGNORECASE)
# print pattern, all_authors

clean_byline = clean_byline.replace(" & ", ",")
clean_byline = re.sub(" and ", ",", clean_byline, re.IGNORECASE)
clean_byline.strip(" .")
self.clean_byline = clean_byline
return clean_byline



def author_email_pairs(self):
print "start:", self.raw_byline
clean_byline = self._clean_byline()
if not clean_byline:
return None

responses = []
for one_author in clean_byline.split(","):
for author_clause in clean_byline.split(","):
author_name = None
author_email = None
if "<" in one_author:
(author_name, author_email) = one_author.split("<", 1)

clause_replace_patterns = [
"\(.*?\)",
"\[.*?\]",
"\[.*?$"
]
for pattern in clause_replace_patterns:
author_clause = re.sub(pattern, "", author_clause, re.IGNORECASE)

if not author_clause or (len(author_clause) < 6):
return None

if "<" in author_clause:
(author_name, author_email) = author_clause.split("<", 1)
author_email = re.sub("(>.*)", "", author_email)
if not validate_email(author_email):
author_email = None
else:
author_name = one_author
author_name = author_clause

if author_name:
author_name = author_name.strip(" .'")
author_name = author_name.strip("\t .'")
author_name = author_name.strip('"')

if author_name or author_email:
Expand Down
11 changes: 10 additions & 1 deletion test/test_byline.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class TestByline(unittest.TestCase):

def test_clean_byline_string(self):
# Checks the exact string returned by Byline._clean_byline() for the third
# fixture in self.test_bylines.
byline = Byline(self.test_bylines[2])
# NOTE(review): this value is reassigned on the next line before it is ever
# read, so only the second string takes part in the comparison. It looks like
# the earlier expectation (bracketed role tags stripped) left next to the
# current one -- confirm which of the two is meant to be live.
expected = " Corentin M Barbu , Sebastian Gibb "
# The value actually asserted: the "[aut, cre]" / "[ctb]" tags are still present.
expected = ' Corentin M Barbu [aut, cre], Sebastian Gibb [ctb]'
assert_equals(byline._clean_byline(), expected)

def test_author_email_pairs(self):
Expand All @@ -41,4 +41,13 @@ def test_author_halt(self):
expected = None
assert_equals(response, expected)

def test_author_square_brackets(self):
    """A byline tagged with square-bracket roles yields no author/email pairs.

    The fixture spans three lines and combines a parenthesised nickname with
    "[aut, cre]" / "[aut]" tags; author_email_pairs() is expected to return
    None for it.
    """
    raw_byline = """Jie (Kate) Hu [aut, cre],
Norman Breslow [aut],
Gary Chan [aut]"""
    pairs = Byline(raw_byline).author_email_pairs()
    assert_equals(pairs, None)


0 comments on commit 2399e9d

Please sign in to comment.