"""Analyze a wordlist for prefix uniqueness.

For each word, compute the minimum prefix length needed to identify it
uniquely, group words that collide within a fixed prefix length, and
suggest strategies for replacing words that cannot be disambiguated.
"""
import sys
from collections import defaultdict
from pathlib import Path


def read_wordlist(file_path):
    """Read one word per line, lowercased; blank lines are skipped."""
    words = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            word = line.strip()
            if word:
                words.append(word.lower())
    return words
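
# Illustration only ("words.txt" is a hypothetical path, one word per line):
#   words = read_wordlist("words.txt")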


def calculate_min_prefix_lengths(words):
    """Map each word to the length of its shortest uniquely identifying prefix."""
    word_to_min_prefix = {}
    for word in words:
        min_length = 1
        while min_length <= len(word):
            prefix = word[:min_length]
            conflicts = [w for w in words if w != word and w.startswith(prefix)]
            if not conflicts:
                word_to_min_prefix[word] = min_length
                break
            min_length += 1
        if word not in word_to_min_prefix:
            # No prefix is unique (the word is itself a prefix of another
            # word), so the whole word is required.
            word_to_min_prefix[word] = len(word)
    return word_to_min_prefix
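

# Optional faster variant (a sketch, not used by the script): the quadratic
# scan above compares every word against every other. Because the longest
# common prefix a word shares with any other word is always achieved by one
# of its neighbours in sorted order, sorting once gives the same answer in
# O(n log n). The function names below are illustrative additions.
def _shared_prefix_len(a, b):
    # Length of the common prefix of two strings.
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n


def calculate_min_prefix_lengths_fast(words):
    # Should match calculate_min_prefix_lengths on the same input.
    ordered = sorted(set(words))
    result = {}
    for i, word in enumerate(ordered):
        shared = 0
        if i > 0:
            shared = max(shared, _shared_prefix_len(word, ordered[i - 1]))
        if i + 1 < len(ordered):
            shared = max(shared, _shared_prefix_len(word, ordered[i + 1]))
        # One character past the longest shared prefix, capped at word length.
        result[word] = min(shared + 1, len(word))
    return result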


def find_conflict_groups(words, max_prefix_length=5):
    """Group words by their first max_prefix_length characters; keep collisions."""
    prefix_to_words = defaultdict(list)
    for word in words:
        prefix = word[:max_prefix_length]
        prefix_to_words[prefix].append(word)
    conflict_groups = {
        prefix: word_list
        for prefix, word_list in prefix_to_words.items()
        if len(word_list) > 1
    }
    return conflict_groups
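
# Worked example on a tiny hypothetical list (illustration only):
#   find_conflict_groups(["chart", "charm", "dog"], max_prefix_length=4)
#   -> {"char": ["chart", "charm"]}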


def analyze_replacement_candidates(words, word_to_min_prefix, max_prefix_length=5):
    """Report words whose unique prefix exceeds max_prefix_length; return findings."""
    problematic_words = [
        (word, min_len)
        for word, min_len in word_to_min_prefix.items()
        if min_len > max_prefix_length
    ]
print("=" * 80)
print(
f"WORDS REQUIRING MORE THAN {max_prefix_length} CHARACTERS FOR UNIQUE IDENTIFICATION"
)
print("=" * 80)
print(
f"Total problematic words: {len(problematic_words)} out of {len(words)} ({len(problematic_words)/len(words)*100:.1f}%)"
)
print()
by_length = defaultdict(list)
for word, min_len in problematic_words:
by_length[min_len].append(word)
print("BREAKDOWN BY REQUIRED LENGTH:")
for length in sorted(by_length.keys()):
count = len(by_length[length])
print(f"Length {length:2d}: {count:4d} words ({count/len(words)*100:.1f}%)")
print()
    conflict_groups = find_conflict_groups(words, max_prefix_length)
    print(
        f"TOP CONFLICT GROUPS (words sharing the same {max_prefix_length}-character prefix):"
    )
    print("Prefix | Count | Words")
    print("-" * 60)
    sorted_conflicts = sorted(
        conflict_groups.items(), key=lambda x: len(x[1]), reverse=True
    )
    for prefix, word_list in sorted_conflicts[:20]:
        words_str = ", ".join(word_list[:8])
        if len(word_list) > 8:
            words_str += f" ... (+{len(word_list) - 8} more)"
        print(f"{prefix:10s} | {len(word_list):5d} | {words_str}")
    print()
print("REPLACEMENT STRATEGY ANALYSIS:")
print("-" * 40)
removal_candidates = []
keep_shorter_variants = []
for prefix, word_list in conflict_groups.items():
if len(word_list) == 2:
shorter, longer = sorted(word_list, key=len)
if longer.startswith(shorter):
removal_candidates.append((longer, shorter, "longer variant"))
keep_shorter_variants.append(shorter)
print(f"1. REMOVE LONGER VARIANTS ({len(removal_candidates)} candidates):")
print(" Remove Word | Keep Word | Reason")
print(" " + "-" * 50)
for remove, keep, reason in removal_candidates[:15]:
print(f" {remove:18s} | {keep:12s} | {reason}")
if len(removal_candidates) > 15:
print(f" ... and {len(removal_candidates) - 15} more")
print()
    semantic_groups = defaultdict(list)
    common_roots = [
        "admin",
        "repre",
        "const",
        "commu",
        "conce",
        "chara",
        "agric",
        "insti",
    ]
    for root in common_roots:
        matching_words = [word for word in words if word.startswith(root)]
        if len(matching_words) > 1:
            semantic_groups[root] = matching_words
    print("2. SEMANTIC GROUPS FOR POTENTIAL CONSOLIDATION:")
    for root, word_list in semantic_groups.items():
        if len(word_list) > 2:
            print(f" {root}: {', '.join(word_list[:6])}")
            if len(word_list) > 6:
                print(f" ... and {len(word_list) - 6} more")
    print()
    unrelated_conflicts = []
    for prefix, word_list in conflict_groups.items():
        if len(word_list) <= 4:
            # Treat the group as unrelated unless some word contains another,
            # which would indicate variants of the same stem.
            unrelated = True
            for i, word1 in enumerate(word_list):
                for word2 in word_list[i + 1 :]:
                    if word1 in word2 or word2 in word1:
                        unrelated = False
                        break
                if not unrelated:
                    break
            if unrelated:
                unrelated_conflicts.append((prefix, word_list))
    print("3. UNRELATED WORD CONFLICTS (may need synonym replacement):")
    for prefix, word_list in unrelated_conflicts[:10]:
        print(f" {prefix}: {', '.join(word_list)}")
    print()
    total_removals_needed = len(problematic_words)
    easy_removals = len(removal_candidates)
    difficult_cases = total_removals_needed - easy_removals
    print("SUMMARY:")
    print(f"Total words needing replacement: {total_removals_needed}")
    print(f"Easy removals (longer variants): {easy_removals}")
    print(f"Difficult cases (need synonyms): {difficult_cases}")
    print(f"Dictionary reduction needed: {total_removals_needed/len(words)*100:.1f}%")
    return {
        "problematic_words": problematic_words,
        "conflict_groups": conflict_groups,
        "removal_candidates": removal_candidates,
        "semantic_groups": dict(semantic_groups),
        "unrelated_conflicts": unrelated_conflicts,
    }
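
# The returned dict mirrors the printed report; callers can post-process it,
# e.g. analysis["removal_candidates"] yields (remove, keep, reason) tuples.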


def main():
    wordlist_path = Path("GOLD_WORDLIST.txt")
    if not wordlist_path.exists():
        print(f"Error: {wordlist_path} not found!")
        sys.exit(1)
    print("Reading wordlist...")
    words = read_wordlist(wordlist_path)
    print(f"Loaded {len(words)} words")
    print("\nCalculating minimum prefix lengths...")
    word_to_min_prefix = calculate_min_prefix_lengths(words)
    print("\nAnalyzing replacement candidates...")
    analysis = analyze_replacement_candidates(words, word_to_min_prefix, 5)
    print("\n" + "=" * 80)
    print("ANALYSIS COMPLETE")
    print("=" * 80)


if __name__ == "__main__":
    main()
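
# Typical invocation (the script file name here is illustrative):
#   $ python analyze_wordlist.py
# Exits with status 1 if GOLD_WORDLIST.txt is not in the working directory.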