BU-Spark · trgardos · Dec 11, 2024
diff --git a/modules/deed_preprocessing/read_all_tiffs.py b/modules/deed_preprocessing/read_all_tiffs.py
@@ -46,10 +46,10 @@
                         #search for words in the bigotry_dict, can update with additional words
                         found = False
                         words = re.split(r'[\n ]+', corrected_text)
-                        for identifier in bigotry_dict:
+                        for identifier in bigotry_dict.keys():
                             if not found:
                                 for word in words:
-                                    similarity_ratio = SequenceMatcher(None, word, identifier).ratio()
+                                    similarity_ratio = SequenceMatcher(None, word.lower(), identifier.lower()).ratio()
                                     if similarity_ratio >= 0.9:
                                         found = True
                                         break

diff --git a/modules/deed_preprocessing/reset_racist_dir.py b/modules/deed_preprocessing/reset_racist_dir.py
@@ -57,7 +57,7 @@
                             if not found:
                                 for identifier in bigotry_dict.keys():
                                     if not found:
-                                        similarity_ratio = SequenceMatcher(None, words[i], identifier).ratio()
+                                        similarity_ratio = SequenceMatcher(None, words[i].lower(), identifier.lower()).ratio()
                                         if similarity_ratio >= 0.9:
                                             #figure out how to move this file into other directory
                                             found = True

diff --git a/modules/last_year/bigotry_dict.py b/modules/last_year/bigotry_dict.py
@@ -1,102 +1,108 @@
+# Keys are stored in lowercase; call .lower() on any string before comparing it to a key.
 bigotry_dict = {
-    "Irishman": True,
     "irishman": True,
-    "Greek": True,
     "greek": True,
-    "Portugese": True,
     "portugese": True,
-    "Mulatto": True,
-    "mutatto": True,
-    "Quadroon": True,
+    "mulatto": True,
     "quadroon": True,
-    "Chinaman": True,
     "chinaman": True,
-    "Jap": True,
     "jap": True,
     "japs": True,
-    "Japs": True,
-    "Hebrew": True,
     "hebrew": True,
-    "Pole": True,
     "pole": True,
-    "French Canadian": True,
-    "Canadien": True,
-    "Quebecois": True,
-    "Quebecker": True,
-    "Arab": True,
-    "Arabs": True,
-    "Truk": True,
-    "Turks": True,
-    "Frenchman": True,
-    "German": True,
+    "french canadian": True,
+    "canadien": True,
+    "quebecois": True,
+    "quebecker": True,
+    "arab": True,
+    "arabs": True,
+    "turk": True,
+    "turks": True,
+    "frenchman": True,
     "german": True,
-    "Germans": True,
     "germans": True,
-    "Spaniard": True,
     "spaniard": True,
-    "Spaniards": True,
     "spaniards": True,
-    "Slav": True,
     "slav": True,
-    "Slavs": True,
     "slavs": True,
-    "Russian": True,
     "russian": True,
-    "Russians": True,
     "russians": True,
-    "Persian": True,
     "persian": True,
-    "Persians": True,
     "persians": True,
-    "Korean": True,
     "korean": True,
-    "Koreans": True,
     "koreans": True,
-    "Negro": True,
     "negro": True,
-    "Colored": True,
     "colored": True,
-    "Polander": True,
     "polander": True,
-    "Polish": True,
     "polish": True,
-    "Italian": True,
     "italian": True,
-    "African": True,
     "african": True,
-    "Africans": True,
     "africans": True,
-    "Hindu": True,
     "hindu": True,
-    "Japanese": True,
     "japanese": True,
-    "Chinese": True,
     "chinese": True,
-    "Catholic": True,
     "catholic": True,
-    "Jew": True,
     "jew": True,
-    "Jewish": True,
     "jewish": True,
-    "shall not be resold": True,
-    "shall not be re-sold": True,
-    "shall not be sold": True,
-    "white": True,
-    "White": True,
-    "Whites": True,
-    "whites": True,
-    "Aryan": True,
-    "Aryans": True,
     "aryan": True,
     "aryans": True,
-    "Caucasian": True,
     "caucasian": True,
-    "Caucasians": True,
     "caucasians": True,
     "race": True,
-    "Race": True,
     "races": True,
-    "Races": True,
-    "Semetic": True,
     "semetic": True,
-}
+    "mongolian": True,
+    "mongoloid": True,
+    "gypsy": True,
+    "gypsies": True,
+    "latin": True,
+    "latins": True,
+    "asian": True,
+    "asians": True,
+    "indian": True,
+    "indians": True,
+    "eskimo": True,
+    "eskimos": True,
+    "laplander": True,
+    "laplanders": True,
+    "oriental": True,
+    "orientals": True,
+    "swede": True,
+    "swedes": True,
+    "scandinavian": True,
+    "scandinavians": True,
+    "dane": True,
+    "danes": True,
+    "norwegian": True,
+    "norwegians": True,
+    "hungarian": True,
+    "hungarians": True,
+    "tartar": True,
+    "tartars": True,
+    "sioux": True,
+    "cherokee": True,
+    "creek": True,
+    "choctaw": True,
+    "navajo": True,
+    "navajos": True,
+    "apache": True,
+    "apaches": True,
+    "seminole": True,
+    "zulu": True,
+    "zulus": True,
+    "boer": True,
+    "boers": True,
+    "kelt": True,
+    "kelts": True,
+    "welsh": True,
+    "balkan": True,
+    "balkans": True,
+    "slavic": True,
+    "slavonic": True,
+    "serb": True,
+    "serbs": True,
+    "croat": True,
+    "croats": True,
+    "bosnian": True,
+    "bosnians": True
+}
diff --git a/modules/last_year/search_keywords.py b/modules/last_year/search_keywords.py
@@ -28,7 +28,7 @@
                             if not found:
                                 for identifier in bigotry_dict.keys():
                                     if not found:
-                                        similarity_ratio = SequenceMatcher(None, words[i], identifier).ratio()
+                                        similarity_ratio = SequenceMatcher(None, words[i].lower(), identifier.lower()).ratio()
                                         if similarity_ratio >= 0.9:
                                             # Collect the surrounding words
                                             context = words[max(0, i-10):min(len(words),i+10)]