standupmaths · utkarshgupta137 · Aug 16, 2022
diff --git a/fiveletterwords.py b/fiveletterwords.py
@@ -1,110 +1,96 @@
-import time
-start_time = time.time()
-
-filestub = '/Users/mattparker/Dropbox/python/five_letter_words/'
-
-def load_words():
-    words_txt = '/Users/mattparker/Dropbox/python/five_letter_words/words_alpha.txt'
-    with open(words_txt) as word_file:
-        valid_words = list(word_file.read().split())
-    return valid_words
-
-word_length = 5
-
-word_length2 = word_length*2
-word_length4 = word_length2*2
-word_length5 = word_length4 + word_length
-
-# number of scanA increases per progress report
-stepgap = 1000
-
-# Yes, that is the alphabet. In the default order python makes a list in. Weird.
-alphabet = ['f', 'g', 'o', 'q', 't', 'b', 'y', 'h', 'r', 'u', 'j', 'w', 'i', 'p', 's', 'd', 'l', 'e', 'k', 'm', 'n', 'v', 'z', 'c', 'a', 'x']
-
-# I could be clever and write this to be dynamic
-# but for now I'll hard code everything assuming five words
-number_of_sets = 5
-
-english_words = load_words()
-
-print(f"{len(english_words)} words in total")
-
-fl_words = []
-
-for w in english_words:
-    if len(w) == word_length:
-        fl_words.append(w)
-
-print(f"{len(fl_words)} words have {word_length} letters")
-
-
-word_sets = []
-
-unique_fl_words = []
-for w in fl_words:
-    unique_letters = set(w)
-    if len(unique_letters) == word_length:
-        if unique_letters not in word_sets:
-            word_sets.append(unique_letters)
-            unique_fl_words.append(w)
-
-number_of_words = len(unique_fl_words)
-
-print(f"{number_of_words} words have a unique set of {word_length} letters")
-
-doubleword_sets = []
-doubleword_words = []
-
-scanA = 0
-while scanA < number_of_words-1:
-    scanB = scanA + 1
-    while scanB < number_of_words:
-        give_it_a_try = word_sets[scanA] | word_sets[scanB]
-        if len(give_it_a_try) == word_length2:
-            doubleword_sets.append(give_it_a_try)
-            doubleword_words.append([unique_fl_words[scanA], unique_fl_words[scanB]])
-        scanB += 1
-    scanA += 1
-
-number_of_doublewords = len(doubleword_sets)
-
-print(f"we found {number_of_doublewords} combos")
-
-counter = 0
-
-success_found = []
-
-scanA = 0
-print(f"starting at position {scanA}")
-
-while scanA < number_of_doublewords-1:
-    if scanA % stepgap == 0:
-        print(f"Up to {scanA} after {time.time() - start_time} seconds.")
-
-    scanB = scanA + 1
-    while scanB < number_of_doublewords:
-        give_it_a_try = doubleword_sets[scanA] | doubleword_sets[scanB]
-        if len(give_it_a_try) == word_length4:
-            scanC = 0
-            while scanC < number_of_words:
-                final_go = give_it_a_try | word_sets[scanC]
-                if len(final_go) == word_length5:
-                    success = doubleword_words[scanA] + doubleword_words[scanB]
-                    success.append(unique_fl_words[scanC])
-                    success.sort()
-                    if success not in success_found:
-                        success_found.append(success)
-                        print(success)
-                scanC += 1
-            counter += 1
-        scanB += 1
-    scanA += 1
-
-print(f"Damn, we had {len(success_found)} successful finds!")
-print(f"That took {time.time() - start_time} seconds")
-
-print("Here they all are:")
-for i in success_found:
-    print(i)
-
-print("DONE")
+import functools
+import string
+
+# Read all words; split() skips blank lines so they are not counted as words
+with open("words_alpha.txt") as f:
+    words = f.read().split()
+
+print(f"Total words: {len(words):,}")
+
+# Keep only words with length 5
+words_len5 = [word for word in words if len(word) == 5]
+
+print(f"Words with length 5: {len(words_len5):,}")
+
+# Drop words in which any letter repeats (5 distinct letters are required)
+words_len5_dedup = [word for word in words_len5 if len(set(word)) == 5]
+
+print(f"Words with length 5 without repeating alphabets: {len(words_len5_dedup):,}")
+
+# Remove anagrams: keep only the first word seen for each sorted-letter key
+words_len5_alpha_set = set()
+words_len5_filtered = set()
+for word in words_len5_dedup:
+    letters_key = str(sorted(word))
+    if letters_key in words_len5_alpha_set:
+        continue
+    words_len5_alpha_set.add(letters_key)
+    words_len5_filtered.add(word)
+
+anagram_free = len(words_len5_filtered)
+print(f"Words with length 5 without repeating alphabets or anagrams: {anagram_free:,}")
+
+# Index the words by letter: alphabet_words[c] is the set of words containing c.
+# Every lowercase letter gets an entry, even if no word uses it.
+alphabet_words = {alphabet: set() for alphabet in string.ascii_lowercase}
+for word in words_len5_filtered:
+    for alphabet in word:
+        alphabet_words[alphabet].add(word)
+
+# (word count, letter) pairs, least frequent letter first; equal counts fall
+# back to the order of the letters because tuples compare element-wise.
+alphabets_sorted = sorted(
+    (len(matches), letter) for letter, matches in alphabet_words.items()
+)
+
+for count, alphabet in alphabets_sorted:
+    print(alphabet, f"{count:,}")
+
+
+@functools.lru_cache(maxsize=1024)
+def find_combos(alphabets, words):
+    """Return the combinations of letter-disjoint words from `words` covering `alphabets`.
+
+    alphabets: frozenset of letters not used till now.
+    words: frozenset of candidate words; callers pass only words spelled from
+        `alphabets`, which is what makes the 5-letter base case below valid.
+    Returns a list of word lists. lru_cache hands the same list object back on a
+    cache hit, so callers must not mutate it. Reads the module-level
+    `alphabets_sorted` and `alphabet_words`.
+    """
+    if not words or not alphabets:
+        return []
+    if len(alphabets) == 5:
+        # 5 letters left, there can be at most 1 word since anagrams were removed
+        return [[word] for word in words]
+
+    # Branch on the least frequent unused letter: a combo that uses every letter
+    # must contain a word with it, so only those words are tried at this level.
+    pivot = next((letter for _, letter in alphabets_sorted if letter in alphabets), None)
+    ret = []
+    for word in alphabet_words.get(pivot, ()):
+        if word not in words:
+            continue
+        # Drop every candidate sharing a letter with `word` (including `word` itself)
+        rem = words.difference(*(alphabet_words[letter] for letter in word))
+        # Recurse on the letters and candidates that are left
+        tails = find_combos(frozenset(alphabets - set(word)), rem)
+        ret += [[word] + rest for rest in tails]
+
+    return ret
+
+
+# find_combos gets 25 of the 26 letters, so every combo omits exactly one
+# letter: solve once per omitted letter and pool the results.
+combos = []
+all_letters = frozenset(string.ascii_lowercase)
+for alphabet in string.ascii_lowercase:
+    candidates = frozenset(words_len5_filtered - alphabet_words[alphabet])
+    ret = find_combos(all_letters - {alphabet}, candidates)
+    combos.extend(ret)
+    print(f"Number of combos without {alphabet}: {len(ret):,}")
+total = len(combos)
+print(f"Total combos: {total:,}")
+print("\nCombos:")
+for combo in combos:
+    print(combo)