import pandas as pd
from collections import defaultdict
# Irregular verbs mapped to their hand-listed inflections and derived nouns.
# A base word found here bypasses every suffix rule below.
_IRREGULAR_VERBS = {
    'be': ['am', 'is', 'are', 'was', 'were', 'been', 'being'],
    'have': ['has', 'had', 'having'],
    'do': ['does', 'did', 'done', 'doing'],
    'go': ['goes', 'went', 'gone', 'going'],
    'make': ['makes', 'made', 'making', 'maker', 'makers'],
    'take': ['takes', 'took', 'taken', 'taking', 'taker', 'takers'],
    'come': ['comes', 'came', 'coming', 'comer', 'comers'],
    'see': ['sees', 'saw', 'seen', 'seeing', 'seer', 'seers'],
    'get': ['gets', 'got', 'gotten', 'getting', 'getter', 'getters'],
    'give': ['gives', 'gave', 'given', 'giving', 'giver', 'givers'],
    'know': ['knows', 'knew', 'known', 'knowing', 'knower', 'knowers'],
    'think': ['thinks', 'thought', 'thinking', 'thinker', 'thinkers'],
    'say': ['says', 'said', 'saying', 'sayer', 'sayers'],
    'tell': ['tells', 'told', 'telling', 'teller', 'tellers'],
    'find': ['finds', 'found', 'finding', 'finder', 'finders'],
    'leave': ['leaves', 'left', 'leaving', 'leaver', 'leavers'],
    'feel': ['feels', 'felt', 'feeling', 'feeler', 'feelers'],
    'bring': ['brings', 'brought', 'bringing', 'bringer', 'bringers'],
    'begin': ['begins', 'began', 'begun', 'beginning', 'beginner', 'beginners'],
    'keep': ['keeps', 'kept', 'keeping', 'keeper', 'keepers'],
    'hold': ['holds', 'held', 'holding', 'holder', 'holders'],
    'write': ['writes', 'wrote', 'written', 'writing', 'writer', 'writers'],
    'stand': ['stands', 'stood', 'standing', 'stander', 'standers'],
    'hear': ['hears', 'heard', 'hearing', 'hearer', 'hearers'],
    'run': ['runs', 'ran', 'running', 'runner', 'runners'],
    'pay': ['pays', 'paid', 'paying', 'payer', 'payers', 'payment', 'payments'],
    'sit': ['sits', 'sat', 'sitting', 'sitter', 'sitters'],
    'speak': ['speaks', 'spoke', 'spoken', 'speaking', 'speaker', 'speakers'],
    'read': ['reads', 'reading', 'reader', 'readers'],
    'grow': ['grows', 'grew', 'grown', 'growing', 'grower', 'growers', 'growth'],
    'send': ['sends', 'sent', 'sending', 'sender', 'senders'],
    'build': ['builds', 'built', 'building', 'builder', 'builders'],
    'break': ['breaks', 'broke', 'broken', 'breaking', 'breaker', 'breakers'],
    'spend': ['spends', 'spent', 'spending', 'spender', 'spenders'],
    'drive': ['drives', 'drove', 'driven', 'driving', 'driver', 'drivers'],
    'buy': ['buys', 'bought', 'buying', 'buyer', 'buyers'],
    'sell': ['sells', 'sold', 'selling', 'seller', 'sellers'],
    'teach': ['teaches', 'taught', 'teaching', 'teacher', 'teachers'],
    'catch': ['catches', 'caught', 'catching', 'catcher', 'catchers'],
    'fight': ['fights', 'fought', 'fighting', 'fighter', 'fighters'],
    'choose': ['chooses', 'chose', 'chosen', 'choosing', 'chooser', 'choosers'],
    'win': ['wins', 'won', 'winning', 'winner', 'winners'],
    'lose': ['loses', 'lost', 'losing', 'loser', 'losers'],
    'meet': ['meets', 'met', 'meeting', 'meeter', 'meeters'],
    'lead': ['leads', 'led', 'leading', 'leader', 'leaders'],
    'understand': ['understands', 'understood', 'understanding'],
    'eat': ['eats', 'ate', 'eaten', 'eating', 'eater', 'eaters'],
    'drink': ['drinks', 'drank', 'drunk', 'drinking', 'drinker', 'drinkers'],
    'sleep': ['sleeps', 'slept', 'sleeping', 'sleeper', 'sleepers'],
    'swim': ['swims', 'swam', 'swum', 'swimming', 'swimmer', 'swimmers'],
    'sing': ['sings', 'sang', 'sung', 'singing', 'singer', 'singers'],
    'ring': ['rings', 'rang', 'rung', 'ringing', 'ringer', 'ringers'],
    'fly': ['flies', 'flew', 'flown', 'flying', 'flyer', 'flyers'],
    'draw': ['draws', 'drew', 'drawn', 'drawing', 'drawer', 'drawers'],
    'throw': ['throws', 'threw', 'thrown', 'throwing', 'thrower', 'throwers'],
    'blow': ['blows', 'blew', 'blown', 'blowing', 'blower', 'blowers'],
    'wear': ['wears', 'wore', 'worn', 'wearing', 'wearer', 'wearers'],
    'tear': ['tears', 'tore', 'torn', 'tearing'],
    'rise': ['rises', 'rose', 'risen', 'rising', 'riser', 'risers'],
    'fall': ['falls', 'fell', 'fallen', 'falling', 'faller', 'fallers'],
    'cut': ['cuts', 'cutting', 'cutter', 'cutters'],
    'hit': ['hits', 'hitting', 'hitter', 'hitters'],
    'put': ['puts', 'putting', 'putter', 'putters'],
    'set': ['sets', 'setting', 'setter', 'setters'],
    'let': ['lets', 'letting'],
    'shut': ['shuts', 'shutting', 'shutter', 'shutters'],
    'cost': ['costs', 'costing'],
    'hurt': ['hurts', 'hurting'],
    'quit': ['quits', 'quitting', 'quitter', 'quitters'],
}

# Nouns whose listed plural is added in addition to the rule-based plural.
_IRREGULAR_PLURALS = {
    'child': 'children', 'man': 'men', 'woman': 'women', 'person': 'people',
    'tooth': 'teeth', 'foot': 'feet', 'mouse': 'mice', 'goose': 'geese',
    'leaf': 'leaves', 'life': 'lives', 'knife': 'knives', 'wife': 'wives',
    'half': 'halves', 'self': 'selves', 'loaf': 'loaves', 'thief': 'thieves',
    'sheep': 'sheep', 'deer': 'deer', 'fish': 'fish', 'series': 'series',
    'species': 'species', 'crisis': 'crises', 'analysis': 'analyses',
    'basis': 'bases', 'thesis': 'theses', 'datum': 'data', 'phenomenon': 'phenomena',
    'criterion': 'criteria', 'bacterium': 'bacteria', 'medium': 'media',
    'formula': 'formulae', 'index': 'indices', 'matrix': 'matrices',
    'vertex': 'vertices', 'appendix': 'appendices', 'ox': 'oxen',
    'brother': 'brothers', 'sister': 'sisters', 'mother': 'mothers',
    'father': 'fathers', 'daughter': 'daughters', 'son': 'sons',
}

_FORM_VOWELS = 'aeiou'
_DOUBLING_CONSONANTS = 'bcdgklmnprstvz'
_SIBILANT_ENDINGS = ('s', 'x', 'z', 'ch', 'sh')
_PAST_LIKE_ENDINGS = ('ed', 'en', 'wn', 'ne')
_NOUN_SUFFIX_ENDINGS = ('ness', 'ment', 'tion', 'sion', 'ity', 'ance', 'ence')
_MIN_FORM_LENGTH = 2
_MAX_FORM_LENGTH = 12


def _ends_in_consonant_y(word):
    """Return True for words longer than two letters ending in consonant + 'y'."""
    return word.endswith('y') and len(word) > 2 and word[-2] not in _FORM_VOWELS


def _doubles_final_consonant(word):
    """Return True when the word ends consonant-vowel-consonant (e.g. 'stop')."""
    return (
        len(word) >= 3
        and word[-1] in _DOUBLING_CONSONANTS
        and word[-2] in _FORM_VOWELS
        and word[-3] not in _FORM_VOWELS
    )


def _attach_e_suffix(word, suffix):
    """Attach a suffix starting with 'e' ('ed', 'er', 'ers', 'est') to word."""
    if word.endswith('e'):
        # The stem already supplies the 'e': bake -> baked / baker / bakest.
        return word + suffix[1:]
    if _ends_in_consonant_y(word):
        return word[:-1] + 'i' + suffix
    if _doubles_final_consonant(word):
        return word + word[-1] + suffix
    return word + suffix


def generate_all_forms(base_word):
    """Return candidate inflected and derived forms of ``base_word``.

    A base word listed in the irregular-verb table yields exactly the base
    word plus its hand-listed forms; that result is returned without the
    length/character filter applied to rule-generated forms.

    Every other word is run through mechanical English suffix rules: plural,
    -ing, past tense, -er/-ers, -est, -ly, -ness, -ment/-ments, -ful, -less,
    -able and -ish. The rules are applied without checking the result against
    any dictionary, so many returned strings are not real words. Rule-based
    results are kept only if they are 2-12 characters long, purely alphabetic
    and entirely lower case; the function does not lower-case its input, so a
    base word containing an upper-case letter yields an empty set.

    Args:
        base_word: The word to expand.

    Returns:
        A set of candidate word forms. Empty for an empty ``base_word``.
    """
    if not base_word:
        # Without a stem the rules below would emit bare suffixes ("ing", "ed").
        return set()

    if base_word in _IRREGULAR_VERBS:
        return {base_word, *_IRREGULAR_VERBS[base_word]}

    forms = {base_word}

    # Plural / third-person singular.
    if base_word in _IRREGULAR_PLURALS:
        forms.add(_IRREGULAR_PLURALS[base_word])
    if base_word.endswith(_SIBILANT_ENDINGS):
        # Sibilant endings take -es (box -> boxes); previously they got no plural.
        forms.add(base_word + 'es')
    elif _ends_in_consonant_y(base_word):
        forms.add(base_word[:-1] + 'ies')
    elif base_word.endswith('o'):
        forms.add(base_word + 'es')
    else:
        forms.add(base_word + 's')

    # Present participle.
    if base_word.endswith('ie'):
        forms.add(base_word[:-2] + 'ying')
    elif base_word.endswith('e') and not base_word.endswith(('ee', 'oe', 'ye')):
        forms.add(base_word[:-1] + 'ing')
    elif _doubles_final_consonant(base_word):
        forms.add(base_word + base_word[-1] + 'ing')
    else:
        forms.add(base_word + 'ing')

    # Past tense, skipped for words that already end like a past form.
    if not base_word.endswith(_PAST_LIKE_ENDINGS):
        forms.add(_attach_e_suffix(base_word, 'ed'))

    # Agent noun / comparative and its plural, then superlative.
    forms.add(_attach_e_suffix(base_word, 'er'))
    forms.add(_attach_e_suffix(base_word, 'ers'))
    forms.add(_attach_e_suffix(base_word, 'est'))

    # Adverb. Any final 'y' becomes 'ily' here, regardless of the letter before it.
    if base_word.endswith('y'):
        forms.add(base_word[:-1] + 'ily')
    elif base_word.endswith('le'):
        forms.add(base_word[:-1] + 'y')
    elif base_word.endswith('ic'):
        forms.add(base_word + 'ally')
    else:
        forms.add(base_word + 'ly')

    # Abstract nouns, skipped for words that already carry a noun suffix.
    if not base_word.endswith(_NOUN_SUFFIX_ENDINGS):
        if _ends_in_consonant_y(base_word):
            forms.add(base_word[:-1] + 'iness')
        else:
            forms.add(base_word + 'ness')
        if not base_word.endswith('e'):
            forms.add(base_word + 'ment')
            forms.add(base_word + 'ments')

    # Adjective suffixes.
    forms.add(base_word + 'ful')
    forms.add(base_word + 'less')
    if base_word.endswith('e'):
        forms.add(base_word[:-1] + 'able')
    else:
        forms.add(base_word + 'able')
    forms.add(base_word + 'ish')

    return {
        form
        for form in forms
        if _MIN_FORM_LENGTH <= len(form) <= _MAX_FORM_LENGTH
        and form.isalpha()
        and form.lower() == form
    }
_TARGET_WORD_COUNT = 65536
_MAX_WORD_LENGTH = 12
_MAX_BASE_WORDS = 1000
_SCORES_CSV_PATH = 'data/word_readability_scores.csv'
_OUTPUT_PATH = "data/best_readable_word_list_65k.txt"

# Base words used when the scored-word CSV cannot be loaded.
_DEFAULT_TOP_WORDS = [
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
    "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
    "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
    "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
    "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
    "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
    "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
    "than", "then", "now", "look", "only", "come", "its", "over", "think", "also",
    "back", "after", "use", "two", "how", "our", "work", "first", "well", "way",
    "even", "new", "want", "because", "any", "these", "give", "day", "most", "us",
]

# Stems for the last-resort numbered placeholders ("abc0000", "def0000", ...).
_PLACEHOLDER_STEMS = [
    "abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "xyz",
    "bat", "cat", "dog", "fox", "got", "hot", "jot", "lot", "not",
    "pat", "rat", "sat", "bat", "mat", "hat", "fat", "vat", "tat",
]


def _load_top_words():
    """Return base words scoring above 0.7 in the CSV, or the default list."""
    try:
        df = pd.read_csv(_SCORES_CSV_PATH)
        print(f"Loaded {len(df)} scored words")
        top_words = df[df['total_score'] > 0.7]['word'].tolist()
        print(f"Found {len(top_words)} highly readable base words")
        return top_words
    except (OSError, KeyError, ValueError, TypeError) as exc:
        # Missing/unreadable file, malformed CSV, missing column or a
        # non-numeric score column: fall back instead of aborting the run.
        print(f"Could not use scored words: {exc!r}")
        print("Using default word list...")
        return list(_DEFAULT_TOP_WORDS)


def _cross(firsts, seconds):
    """Return every first + second concatenation, in nested-loop order."""
    return [first + second for first in firsts for second in seconds]


def _compound_words():
    """Return the candidate compound words (unfiltered, may contain repeats)."""
    tech_prefixes = ["web", "net", "app", "tech", "cyber", "digital", "smart", "auto", "self", "multi"]
    tech_suffixes = ["site", "page", "link", "mail", "cast", "book", "chat", "call", "text", "code"]
    time_words = ["sun", "moon", "day", "night", "morning", "evening", "week", "month", "year", "time"]
    time_suffixes = ["rise", "set", "fall", "break", "light", "time", "long", "end", "start", "work"]
    colors = ["red", "blue", "green", "black", "white", "yellow", "pink", "gray", "brown", "gold"]
    color_objects = ["bird", "fish", "book", "door", "car", "box", "bag", "hat", "cup", "pen"]
    action_prefixes = ["over", "under", "out", "up", "down", "back", "fore", "pre", "post", "re"]
    action_bases = ["look", "come", "take", "run", "load", "flow", "cast", "turn", "work", "play"]
    nature_pairs = [
        ("rain", "drop"), ("rain", "fall"), ("rain", "storm"), ("rain", "water"),
        ("snow", "fall"), ("snow", "flake"), ("snow", "storm"), ("snow", "ball"),
        ("sun", "shine"), ("sun", "light"), ("sun", "beam"), ("sun", "spot"),
        ("moon", "light"), ("moon", "beam"), ("moon", "shine"), ("moon", "rise"),
        ("star", "light"), ("star", "shine"), ("star", "fish"), ("star", "dust"),
        ("fire", "fly"), ("fire", "place"), ("fire", "work"), ("fire", "ball"),
        ("water", "fall"), ("water", "way"), ("water", "side"), ("water", "front"),
        ("tree", "top"), ("tree", "line"), ("tree", "house"), ("tree", "frog"),
        ("sea", "side"), ("sea", "shore"), ("sea", "food"), ("sea", "bird"),
        ("sky", "line"), ("sky", "light"), ("sky", "way"), ("sky", "high"),
    ]
    everyday_compounds = [
        "baseball", "basketball", "football", "softball", "handball",
        "bedroom", "bathroom", "classroom", "workroom", "lunchroom",
        "notebook", "textbook", "cookbook", "handbook", "yearbook",
        "birthday", "someday", "today", "sunday", "monday",
        "airplane", "airport", "airline", "airmail", "aircraft",
        "anyone", "anything", "anywhere", "anybody", "anytime",
        "everyone", "everything", "everywhere", "everybody", "everyday",
        "someone", "something", "somewhere", "somebody", "sometime",
        "nobody", "nothing", "nowhere", "myself", "yourself",
        "himself", "herself", "itself", "ourselves", "themselves",
        "inside", "outside", "beside", "alongside", "upside",
        "downtown", "uptown", "hometown", "midtown", "newtown",
        "highway", "railway", "subway", "pathway", "walkway",
        "weekend", "weekday", "weeknight", "midnight", "midday",
        "cannot", "into", "onto", "upon", "without",
        "maybe", "however", "moreover", "therefore", "nevertheless",
    ]

    compounds = _cross(tech_prefixes, tech_suffixes)
    compounds.extend(_cross(time_words, time_suffixes))
    compounds.extend(_cross(colors, color_objects))
    compounds.extend(_cross(action_prefixes, action_bases))
    compounds.extend(first + second for first, second in nature_pairs)
    compounds.extend(everyday_compounds)
    return compounds


def _short_words():
    """Return the hand-picked two- and three-letter words."""
    two_letter_words = [
        "am", "an", "as", "at", "be", "by", "do", "go", "he", "hi",
        "if", "in", "is", "it", "me", "my", "no", "of", "on", "or",
        "so", "to", "up", "us", "we", "ah", "oh", "ok", "id", "ad",
    ]
    three_letter = [
        "the", "and", "for", "are", "but", "not", "you", "all", "can", "had",
        "her", "was", "one", "our", "out", "day", "get", "has", "him", "his",
        "how", "its", "may", "new", "now", "old", "see", "two", "way", "who",
        "boy", "did", "did", "let", "put", "say", "she", "too", "use", "big",
        "dog", "cat", "man", "run", "sun", "fun", "hot", "red", "yes", "top",
        "win", "got", "job", "lot", "buy", "car", "cut", "far", "fix", "own",
    ]
    return two_letter_words + three_letter


def _number_words():
    """Return spelled-out numbers: 0-19, the tens, N-hundred and tens+ones."""
    ones = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
             "sixteen", "seventeen", "eighteen", "nineteen"]
    tens = ["twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
    words = ones + teens + tens
    words.extend(num + "hundred" for num in ones[1:])
    words.extend(_cross(tens, ones[1:]))
    return words


def _adjective_noun_words():
    """Return adjective+noun concatenations no longer than the word limit."""
    simple_adj = ["big", "small", "old", "new", "good", "bad", "hot", "cold", "fast", "slow",
                  "hard", "soft", "high", "low", "long", "short", "wide", "thin", "deep", "flat"]
    simple_nouns = ["box", "bag", "cup", "pen", "book", "desk", "door", "wall", "road", "path",
                    "hill", "tree", "lake", "rock", "bird", "fish", "bear", "wolf", "ship", "boat"]
    return [word for word in _cross(simple_adj, simple_nouns) if len(word) <= _MAX_WORD_LENGTH]


def _syllable_words():
    """Yield synthetic CVC, C-V-C-C (same consonant) and CVCC letter strings."""
    vowels = 'aeiou'
    consonants = 'bdfgklmnprst'
    for c1 in consonants:
        for v in vowels:
            for c2 in consonants:
                yield c1 + v + c2
    for c in consonants:
        for v in vowels:
            yield c + v + c + c
    for c1 in consonants:
        for v in vowels:
            for c2 in consonants:
                for c3 in consonants:
                    yield c1 + v + c2 + c3


def _placeholder_words():
    """Yield an endless stream of stem + zero-padded counter placeholders."""
    stem_count = len(_PLACEHOLDER_STEMS)
    idx = 0
    while True:
        yield f"{_PLACEHOLDER_STEMS[idx % stem_count]}{idx // stem_count:04d}"
        idx += 1


def _fill_unique(word_list, seen, candidates, limit):
    """Append unseen candidates to ``word_list`` in order, up to ``limit`` words.

    ``seen`` is updated in place so that later fill stages cannot re-add a
    word; this is what keeps the final list free of duplicates.
    """
    if len(word_list) >= limit:
        return
    for word in candidates:
        if word in seen:
            continue
        seen.add(word)
        word_list.append(word)
        if len(word_list) >= limit:
            return


def main():
    """Build a 65,536-entry word list and write it to the output file.

    Steps, in order:
      1. Load base words from the scored-word CSV (falling back to a built-in
         list) and expand the first 1000 with ``generate_all_forms``.
      2. Add compound words and hand-picked two/three-letter words.
      3. If the list is still short, add number words and adjective+noun
         pairs, then append synthetic consonant/vowel strings.
      4. Truncate to 65,536 entries, or pad with numbered placeholders such
         as "abc0000" until that size is reached.
      5. Write one word per line and print a length distribution plus a few
         sample positions.

    The list is sorted before step 3's synthetic strings are appended; words
    appended after that point keep their generation order.
    """
    import os  # local import: only needed to create the output directory

    print("Creating best readable dictionary from scored words...")
    print("=" * 60)
    top_words = _load_top_words()

    all_words = set()
    for base in top_words[:_MAX_BASE_WORDS]:
        # pandas reads cells such as "null" or "nan" as float NaN; skip them.
        if not isinstance(base, str):
            continue
        all_words.update(generate_all_forms(base.lower()))
    print(f"Generated {len(all_words)} word forms")

    print("Adding compound words...")
    all_words.update(
        compound
        for compound in _compound_words()
        if len(compound) <= _MAX_WORD_LENGTH and compound.isalpha()
    )

    print("Adding pronounceable short words...")
    all_words.update(_short_words())

    word_list = sorted(all_words)
    print(f"Total unique words: {len(word_list)}")

    if len(word_list) < _TARGET_WORD_COUNT:
        needed = _TARGET_WORD_COUNT - len(word_list)
        print(f"Generating {needed} additional readable words...")
        all_words.update(_number_words())
        all_words.update(_adjective_noun_words())
        word_list = sorted(all_words)
        _fill_unique(word_list, all_words, _syllable_words(), _TARGET_WORD_COUNT)

    if len(word_list) > _TARGET_WORD_COUNT:
        word_list = word_list[:_TARGET_WORD_COUNT]
    else:
        # NOTE: these placeholders contain digits, unlike every earlier entry.
        _fill_unique(word_list, set(word_list), _placeholder_words(), _TARGET_WORD_COUNT)

    output_dir = os.path.dirname(_OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(_OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.write('\n'.join(word_list))
    print(f"\n✓ Saved {len(word_list)} words to data/best_readable_word_list_65k.txt")

    length_dist = defaultdict(int)
    for word in word_list:
        length_dist[len(word)] += 1
    print("\nWord length distribution:")
    percent_divisor = _TARGET_WORD_COUNT / 100  # the list holds exactly the target count here
    for length in sorted(length_dist):
        count = length_dist[length]
        print(f" {length:2d} chars: {count:5d} words ({count/percent_divisor:.1f}%)")

    print("\nSample words from different positions:")
    sample_positions = [0, 100, 1000, 5000, 10000, 20000, 30000, 40000, 50000, 60000, 65000, 65535]
    for pos in sample_positions:
        if pos < len(word_list):
            print(f" Position {pos:5d}: {word_list[pos]}")

    print("\n✓ Dictionary generation complete!")
    print("All words are readable and suitable for human use.")
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()