import re
def load_word_list(filepath):
    """Load a newline-delimited word list into a set of lowercase words.

    Every line is stripped of surrounding whitespace and lowercased.
    Blank or whitespace-only lines are skipped so that an empty string
    never ends up in the result and inflates the word count.

    Args:
        filepath: Path to a UTF-8 encoded text file with one word per line.

    Returns:
        A set containing the distinct lowercase words found in the file.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        # The `if word` filter drops lines that were empty after stripping.
        return {word.lower() for word in stripped_lines if word}
def load_cmudict_words(filepath):
    """Collect the distinct headwords from a CMU Pronouncing Dictionary file.

    Lines beginning with ``;;;`` are treated as comments and ignored, as
    are blank lines. For every other line the first whitespace-delimited
    token is taken as the headword, a trailing ``(N)`` marker (e.g.
    ``ABANDON(1)``) is removed, and the result is lowercased.

    Args:
        filepath: Path to the dictionary file; it is read as Latin-1.

    Returns:
        A set of the distinct lowercase headwords.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    # Compiled once so the pattern lookup is not repeated for every line.
    variant_suffix = re.compile(r'\(\d+\)$')
    words = set()
    with open(filepath, 'r', encoding='latin-1') as f:
        for line in f:
            if line.startswith(';;;'):
                continue
            # Splitting on arbitrary whitespace (instead of a single space)
            # keeps tabs and trailing newlines out of the headword and
            # yields an empty list for blank lines, which are skipped.
            fields = line.split(None, 1)
            if not fields:
                continue
            words.add(variant_suffix.sub('', fields[0]).lower())
    return words
def analyze_readability():
    """Report what share of the generated word list appears in CMUdict.

    Loads the generated word list and the CMU Pronouncing Dictionary from
    their fixed paths under ``data/``, intersects the two sets, and prints
    the number of shared words together with the percentage of generated
    words that were found in the dictionary (the "readability score").

    If either file is missing, or the generated list contains no words,
    an error message is printed and the function returns without
    computing a score.

    Returns:
        None. All results are written to standard output.
    """
    generated_list_path = 'data/my_word_list_with_suffixes.txt'
    cmudict_path = 'data/cmudict-0.7b.txt'

    # Only the two loads can raise FileNotFoundError, so keep the try small.
    try:
        generated_words = load_word_list(generated_list_path)
        cmudict_words = load_cmudict_words(cmudict_path)
    except FileNotFoundError as e:
        print(f"Error: {e}. Please make sure the word list files are in the correct path.")
        return

    print(f"Loaded {len(generated_words)} words from our generated list.")
    print(f"Loaded {len(cmudict_words)} unique words from the CMU Pronouncing Dictionary.")

    # An empty list would make the percentage below divide by zero.
    if not generated_words:
        print("Error: the generated word list is empty, so no readability score can be computed.")
        return

    common_words = generated_words & cmudict_words
    percentage_in_cmudict = (len(common_words) / len(generated_words)) * 100

    print("\nReadability Analysis:")
    print("---------------------")
    print(f"{len(common_words)} out of {len(generated_words)} words from our list were found in the CMU Pronouncing Dictionary.")
    print(f"Readability Score: {percentage_in_cmudict:.2f}%")

    if percentage_in_cmudict < 80:
        print("\nNote: A lower score doesn't necessarily mean the words are unreadable,")
        print("but it indicates that a significant portion are not standard dictionary words")
        print("or are very obscure.")
    else:
        print("\nThis is a high score, suggesting the majority of words are standard and pronounceable.")
# Run the analysis only when this file is executed directly as a script;
# importing it as a module triggers nothing.
if __name__ == "__main__":
    analyze_readability()