from collections import defaultdict
def read_wordlist(file_path):
    """Read one word per line, skipping blank lines, and lowercase everything."""
    words = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            word = line.strip()
            if word:
                words.append(word.lower())
    return words
def find_conflict_groups(words, max_prefix_length=5):
    """Group words by their first max_prefix_length characters and return
    only the groups where two or more words share the same prefix."""
    prefix_to_words = defaultdict(list)
    for word in words:
        prefix = word[:max_prefix_length]
        prefix_to_words[prefix].append(word)
    return {
        prefix: word_list
        for prefix, word_list in prefix_to_words.items()
        if len(word_list) > 1
    }
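# A quick illustration with hypothetical words (not from GOLD_WORDLIST.txt):
#   find_conflict_groups(["transit", "transform", "trance"], 5)
#   -> {"trans": ["transit", "transform"]}   # "trance" truncates to "tranc"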
def calculate_min_prefix_lengths(words):
    """For each word, find the length of the shortest prefix that no other
    word shares. Brute force (quadratic in the number of words), which is
    acceptable for a dictionary of a few thousand entries."""
    word_to_min_prefix = {}
    for word in words:
        min_length = 1
        while min_length <= len(word):
            prefix = word[:min_length]
            conflicts = [w for w in words if w != word and w.startswith(prefix)]
            if not conflicts:
                word_to_min_prefix[word] = min_length
                break
            min_length += 1
        if word not in word_to_min_prefix:
            # The word is a full prefix of another word, so no prefix of it
            # is unique; fall back to its own length.
            word_to_min_prefix[word] = len(word)
    return word_to_min_prefix
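# A faster alternative sketch (assumption: same inputs and outputs as above,
# just O(n log n) instead of quadratic). After sorting, the word sharing the
# longest prefix with any given word is adjacent to it, so the minimal unique
# prefix is one character longer than the longest common prefix with either
# neighbor.
def calculate_min_prefix_lengths_sorted(words):
    def lcp(a, b):
        # Length of the longest common prefix of strings a and b.
        n = 0
        for x, y in zip(a, b):
            if x != y:
                break
            n += 1
        return n

    ordered = sorted(set(words))
    result = {}
    for i, word in enumerate(ordered):
        longest = 0
        if i > 0:
            longest = max(longest, lcp(word, ordered[i - 1]))
        if i + 1 < len(ordered):
            longest = max(longest, lcp(word, ordered[i + 1]))
        # Go one character past the shared prefix, capped at the word's own
        # length for words that are full prefixes of another word.
        result[word] = min(longest + 1, len(word))
    return result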
def get_proper_noun_candidates():
    """Return a set of lowercase proper noun candidates, grouped by category.

    Note: "jordan" (country/river) and "amazon" (river/company) appear in two
    categories below; the set collapses them, so len() counts each once.
    """
    proper_nouns = {
        # Countries
        "albania", "andorra", "armenia", "austria", "bahrain", "belarus",
        "belgium", "bolivia", "botswana", "bulgaria", "cambodia", "cameroon",
        "croatia", "cyprus", "czechia", "denmark", "ecuador", "estonia",
        "finland", "georgia", "germany", "hungary", "iceland", "ireland",
        "jamaica", "jordan", "kosovo", "latvia", "lebanon", "liberia",
        "lithuania", "luxembourg", "malta", "moldova", "monaco", "montenegro",
        "morocco", "namibia", "nepal", "nigeria", "norway", "panama",
        "poland", "portugal", "romania", "serbia", "slovakia", "slovenia",
        "sweden", "switzerland", "tunisia", "turkey", "ukraine", "uruguay",
        "zambia",
        # Cities
        "adelaide", "amsterdam", "athens", "atlanta", "baghdad", "bangalore",
        "barcelona", "beijing", "belgrade", "berlin", "bogota", "boston",
        "brisbane", "brussels", "budapest", "buenos", "cairo", "calgary",
        "canberra", "caracas", "chicago", "cologne", "copenhagen", "dallas",
        "delhi", "detroit", "dublin", "durban", "edinburgh", "florence",
        "frankfurt", "geneva", "glasgow", "hamburg", "helsinki", "houston",
        "istanbul", "jakarta", "karachi", "kiev", "lagos", "leipzig",
        "lisbon", "london", "madrid", "manila", "marseille", "melbourne",
        "miami", "milan", "montreal", "moscow", "mumbai", "munich",
        "nairobi", "naples", "oslo", "ottawa", "paris", "perth",
        "prague", "quebec", "riyadh", "rome", "seattle", "seoul",
        "shanghai", "stockholm", "sydney", "tehran", "tokyo", "toronto",
        "tunis", "valencia", "vancouver", "venice", "vienna", "warsaw",
        "zurich",
        # Personal names
        "abdul", "ahmed", "akira", "alejandro", "alexander", "alfonso",
        "andrea", "antonio", "beatrice", "carlos", "catherine", "christine",
        "christopher", "claudia", "dmitri", "eduardo", "elena", "francisco",
        "giovanni", "giuseppe", "gonzalez", "helena", "ibrahim", "jacques",
        "jessica", "johannes", "jonathan", "leonardo", "lorenzo", "mahmoud",
        "margaret", "maria", "martinez", "maxime", "miguel", "mohammed",
        "natasha", "nicolas", "olivier", "patricia", "philippe", "rafael",
        "ricardo", "roberto", "rodrigo", "sebastian", "stephanie", "theodore",
        "valentina", "valerie", "victoria", "vladimir", "wolfgang", "xavier",
        # Rivers
        "amazon", "brahmaputra", "colorado", "danube", "euphrates", "ganges",
        "hudson", "indus", "jordan", "mackenzie", "mekong", "murray",
        "niger", "nile", "rhine", "thames", "volga", "yangtze", "yukon",
        # Mountains and other landforms
        "alps", "andes", "everest", "fuji", "himalaya", "kilimanjaro",
        "mckinley", "rockies", "sahara", "ural",
        # Historical figures
        "aristotle", "beethoven", "churchill", "columbus", "confucius",
        "darwin", "edison", "einstein", "galileo", "gandhi", "homer",
        "jefferson", "lincoln", "mozart", "napoleon", "newton", "plato",
        "shakespeare", "socrates", "tesla", "washington",
        # Companies
        "adobe", "amazon", "apple", "boeing", "canon", "cisco", "dell",
        "fedex", "google", "honda", "intel", "microsoft", "nike", "nokia",
        "oracle", "pepsi", "samsung", "sony", "toyota", "walmart",
        # Mythology
        "apollo", "athena", "buddha", "diana", "hercules", "jupiter",
        "mars", "mercury", "neptune", "pluto", "saturn", "thor",
        "venus", "zeus",
    }
    return proper_nouns
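# The analysis below tests each candidate's 5-char prefix against the current
# dictionary. For example, "albania"[:5] is "alban": if no existing word and
# no other candidate starts with "alban", the noun is a safe replacement.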
def analyze_proper_noun_strategy():
    """Analyze replacing conflict-prone dictionary words with proper nouns."""
    words = read_wordlist("GOLD_WORDLIST.txt")
    conflict_groups = find_conflict_groups(words, 5)
    word_to_min_prefix = calculate_min_prefix_lengths(words)
    proper_nouns = get_proper_noun_candidates()
    problematic_words = [
        word for word, min_len in word_to_min_prefix.items() if min_len > 5
    ]
    print("=" * 80)
    print("PROPER NOUN REPLACEMENT STRATEGY ANALYSIS")
    print("=" * 80)
    print()
    print("CURRENT SITUATION:")
    print(f"Total dictionary size: {len(words)} words")
    print(f"Words requiring 6+ characters: {len(problematic_words)} words")
    print(f"Conflict groups (5-char prefixes): {len(conflict_groups)} groups")
    print(f"Available proper noun candidates: {len(proper_nouns)} words")
    print()
    # Classify each candidate by whether its 5-char prefix is already taken,
    # either by an existing dictionary word or by another candidate noun.
    # Iterate in sorted order so results are reproducible across runs.
    unique_proper_nouns = []
    conflicting_proper_nouns = []
    for noun in sorted(proper_nouns):
        noun_prefix = noun[:5]
        conflicts_with_existing = [
            w for w in words if w.startswith(noun_prefix) and w != noun
        ]
        conflicts_with_other_nouns = [
            n for n in proper_nouns if n != noun and n.startswith(noun_prefix)
        ]
        if not conflicts_with_existing and not conflicts_with_other_nouns:
            unique_proper_nouns.append(noun)
        else:
            conflicting_proper_nouns.append(
                (noun, conflicts_with_existing + conflicts_with_other_nouns)
            )
print("PROPER NOUN ANALYSIS:")
print(f"Proper nouns with unique 5-char prefixes: {len(unique_proper_nouns)}")
print(f"Proper nouns that would still conflict: {len(conflicting_proper_nouns)}")
print()
print("SAMPLE UNIQUE PROPER NOUNS BY CATEGORY:")
countries = [
n
for n in unique_proper_nouns
if n
in {
"albania",
"andorra",
"armenia",
"austria",
"bahrain",
"belarus",
"belgium",
"bolivia",
"botswana",
"bulgaria",
"cambodia",
"cameroon",
"croatia",
}
]
cities = [
n
for n in unique_proper_nouns
if n
in {
"adelaide",
"athens",
"atlanta",
"baghdad",
"bangalore",
"barcelona",
"belgrade",
"bogota",
"brisbane",
"brussels",
"budapest",
}
]
names = [
n
for n in unique_proper_nouns
if n in {"abdul", "ahmed", "akira", "alejandro", "alfonso", "beatrice"}
]
print(f"Countries ({len(countries)}): {', '.join(countries[:10])}")
print(f"Cities ({len(cities)}): {', '.join(cities[:10])}")
print(f"Names ({len(names)}): {', '.join(names[:10])}")
print()
print("REPLACEMENT STRATEGY:")
print("1. MAINTAIN 4096 DICTIONARY SIZE")
print("2. Replace problematic common words with unique proper nouns")
print("3. Prioritize replacements that resolve the most conflicts")
print()
optimal_replacements = []
remaining_problematic = set(problematic_words)
available_nouns = list(unique_proper_nouns)
sorted_conflicts = sorted(
conflict_groups.items(), key=lambda x: len(x[1]), reverse=True
)
for prefix, conflict_words in sorted_conflicts:
if available_nouns and any(w in remaining_problematic for w in conflict_words):
group_problematic = [
w for w in conflict_words if w in remaining_problematic
]
if group_problematic:
word_to_replace = max(group_problematic, key=len)
replacement_noun = available_nouns.pop(0)
optimal_replacements.append(
(word_to_replace, replacement_noun, prefix, len(conflict_words))
)
remaining_problematic.discard(word_to_replace)
print("TOP REPLACEMENT RECOMMENDATIONS:")
print("Replace Word → Proper Noun | Prefix | Conflicts Resolved")
print("-" * 75)
for old_word, new_word, prefix, conflicts in optimal_replacements[:20]:
print(f"{old_word:20s} → {new_word:12s} | {prefix:6s} | {conflicts:6d}")
print(f"... and {len(optimal_replacements) - 20} more optimal replacements")
    print()
    print("ESTIMATED IMPACT:")
    print(f"Words that can be optimally replaced: {len(optimal_replacements)}")
    print(f"Remaining problematic words: {len(remaining_problematic)}")
    print(f"Available proper nouns remaining: {len(available_nouns)}")
    print("Dictionary size after replacement: 4096 (maintained)")
    print(
        f"Estimated 5-char uniqueness: {((4096 - len(remaining_problematic)) / 4096 * 100):.1f}%"
    )
if __name__ == "__main__":
    analyze_proper_noun_strategy()