-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtitle_article_input.py
155 lines (138 loc) · 6.65 KB
/
title_article_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env LANG=en_UK.UTF-8 /usr/local/bin/python3
'''
Multiple language title article splitting script
Takes arguments of full title, and title language as an ISO 639-1 two-letter language code.
Handles the potential for title entries to be upper case or lower case.
Loads TITLE_ARTICLES dictionary, which returns a list of title articles relevant to the supplied language.
Script matches the title article (if present) against the language key's value list;
if there is no match it returns the original title and an empty string.
This script accepts titles and ISO codes supplied on the command line:
python3 title_article_input.py 'Title of film here' 'en'
Joanna White 2021
'''
import sys
# Dictionary of ISO 639-1 language codes and the title articles for each language.
# These contents may have first originated from AACR2 documentation, with additions from BFI staff through the years
# Entry format matters to splitter():
#   - a trailing space marks a free-standing article ("The Lighthouse")
#   - a trailing hyphen or apostrophe marks an article attached to the first word ("El-Haimoune", "L'avventura")
TITLE_ARTICLES = {
    'af': ["Die ", "'N "],
    'sq': ["Nji ", "Një "],
    'ar': ["El-", "Ad-", "Ag-", "Ak-", "An-", "Ar-", "As-", "At-", "Az-"],
    'da': ["Den ", "Det ", "De ", "En ", "Et "],
    'nl': ["De ", "Het ", "'S ", "Een ", "Eene ", "'N "],
    'en': ["The ", "A ", "An "],
    'fr': ["Le ", "La ", "L'", "Les ", "Un ", "Une "],
    'de': ["Der ", "Die ", "Das ", "Ein ", "Eine "],
    'el': ["Ho ", "He ", "To ", "Hoi ", "Hai ", "Ta ", "Henas ", "Heis ", "Mia ", "Hena "],
    'he': ["Ha-", "Ho-"],
    'hu': ["A ", "Az ", "Egy "],
    'is': ["Hinn ", "Hin ", "Hid ", "Hinir ", "Hinar "],
    'it': ["Il ", "La ", "Lo ", "I ", "Gli ", "Gl'", "Le ", "L'", "Un ", "Uno ", "Una ", "Un'"],
    'nb': ["Den ", "Det ", "De ", "En ", "Et "],
    # "Den " was previously misspelt "Dent "; it only ever matched through substring comparison
    'nn': ["Den ", "Det ", "Dei ", "Ein ", "Ei ", "Eit "],
    'pt': ["O ", "A ", "Os ", "As ", "Um ", "Uma "],
    'ro': ["Un ", "Una ", "O "],
    'es': ["El ", "La ", "Lo ", "Los ", "Las ", "Un ", "Una "],
    'sv': ["Den ", "Det ", "De ", "En ", "Ett "],
    'tr': ["Bir "],
    'cy': ["Y ", "Yr "],
    'yi': ["Der ", "Di ", "Die ", "Dos ", "Das ", "A ", "An ", "Eyn ", "Eyne "],
}


def _match_article(first_word, has_more_words, articles):
    '''
    Finds the first article in articles that opens first_word.
    Returns (article, remainder) where article is the dictionary spelling
    without its trailing space or hyphen, and remainder is whatever is left
    of first_word once the article is removed. Returns None when nothing matches.
    '''
    lowered = first_word.lower()
    for article in articles:
        if article.endswith(' '):
            # Free-standing article: must be the entire first word and must
            # be followed by at least one more word to act as the title
            if has_more_words and lowered == article.rstrip().lower():
                return article.rstrip(), ''
        elif first_word[:len(article)].lower() == article.lower():
            # Attached article: cut exactly the article's length so later
            # hyphens/apostrophes in the word are preserved
            return article.rstrip('-'), first_word[len(article):]
    return None


def splitter(title_supplied, language):
    '''
    Splits a leading article from a title, for the language supplied.

    title_supplied: full title string, any letter case
    language: ISO 639-1 code used as key into TITLE_ARTICLES (case insensitive)

    Returns a tuple (title, title_art):
      - where an article for the language opens the title, title is the
        remainder with its first letter upper cased and title_art is the
        article as spelt in TITLE_ARTICLES (no trailing space or hyphen,
        apostrophe retained)
      - otherwise title is title_supplied and title_art is ''

    An all upper case title_supplied is converted to title case before any
    matching, so it is returned title cased in both situations.
    Articles are compared as whole words (or whole attached prefixes) and
    case insensitively, so a first word that merely shares letters with an
    article is left in place.
    '''
    # Manage title appearing all upper case
    if title_supplied.isupper():
        title_supplied = title_supplied.title()

    articles = TITLE_ARTICLES.get(language.strip().lower())
    words = title_supplied.split()
    # Unknown language, or empty / whitespace only title: nothing to split
    if not articles or not words:
        return title_supplied, ''

    match = _match_article(words[0], len(words) > 1, articles)
    if match is None:
        return title_supplied, ''

    title_art, remainder = match
    title_words = words[1:]
    if remainder:
        title_words.insert(0, remainder)
    title = ' '.join(title_words)
    # An article with nothing after it (for example "L'") is treated as the title itself
    if not title:
        return title_supplied, ''

    title = title[0].upper() + title[1:]
    # Manage remaining title appearing all upper case
    if title.isupper():
        title = title.title()
    return title, title_art
def main():
    '''
    Command line entry point.
    Reads the title and its language code from sys.argv, hands both to
    splitter() and prints the tuple it returns. When fewer than two
    arguments follow the script name a usage reminder is printed instead.
    '''
    arguments = sys.argv[1:]
    if len(arguments) >= 2:
        supplied_title, language_code = arguments[:2]
        print(splitter(supplied_title, language_code))
        return
    print('Please supply two arguments, title and language for the script to work, for example:\npython3 title_article_input.py "The Lighthouse" "en"')


# Only run when executed as a script, not when imported
if __name__ == '__main__':
    main()