import os
import re
import pandas as pd
import textstat
import nltk
import requests
from collections import defaultdict
from typing import Dict, List, Tuple, Set
import json
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
try:
nltk.data.find('corpora/cmudict')
except LookupError:
nltk.download('cmudict')
class ReadabilityScorer:
def __init__(self):
self.cmu_dict = nltk.corpus.cmudict.dict()
self.offensive_words = self.load_offensive_words()
self.dale_chall_words = set() self.oxford_words = {}
def load_offensive_words(self) -> Set[str]:
offensive = {
"fuck", "shit", "damn", "hell", "ass", "dick", "cunt", "bitch",
"pussy", "cock", "bastard", "piss", "fag", "dyke", "nigger", "nigga",
"retard", "rape", "nazi", "hitler", "whore", "slut"
}
return offensive
def download_dale_chall_list(self):
print("Downloading Dale-Chall 3000 word list...")
try:
url = "https://gist.githubusercontent.com/e00edcc6f508640fe24f263f5836a7dc/raw/1b1427e45cf3d476c4b9a21731c51b59e7fb7bad/dale-chall-3000-words.txt"
response = requests.get(url)
if response.status_code == 200:
words = response.text.strip().split('\n')
self.dale_chall_words = set(word.strip().lower() for word in words if word.strip())
print(f"Loaded {len(self.dale_chall_words)} Dale-Chall words")
else:
print(f"Failed to download Dale-Chall list: {response.status_code}")
except Exception as e:
print(f"Error downloading Dale-Chall list: {e}")
self.dale_chall_words = {
"a", "able", "about", "above", "across", "act", "add", "afraid", "after", "again",
"against", "age", "ago", "agree", "air", "all", "allow", "almost", "alone", "along",
"already", "also", "always", "am", "among", "an", "and", "angry", "animal", "another",
"answer", "any", "anyone", "anything", "appear", "apple", "are", "area", "arm", "around",
"arrive", "art", "as", "ask", "at", "ate", "attack", "aunt", "away", "baby", "back",
"bad", "bag", "ball", "bank", "bar", "base", "basket", "bat", "be", "bean", "bear",
"beat", "beautiful", "became", "because", "become", "bed", "been", "before", "began",
"begin", "behind", "being", "believe", "bell", "belong", "below", "beside", "best",
"better", "between", "big", "bill", "bird", "birthday", "bit", "bite", "black", "blew",
"block", "blood", "blow", "blue", "board", "boat", "body", "bone", "book", "born",
"both", "bottle", "bottom", "bought", "bow", "bowl", "box", "boy", "branch", "brave",
"bread", "break", "breakfast", "breath", "bring", "broke", "brother", "brought", "brown",
"build", "burn", "bus", "business", "busy", "but", "butter", "buy", "by", "cabin",
"cake", "call", "came", "camp", "can", "cap", "captain", "car", "card", "care",
"careful", "carry", "case", "cat", "catch", "cattle", "caught", "cause", "cent",
"center", "century", "certain", "chair", "chance", "change", "chapter", "charge",
"chase", "cheap", "check", "cheese", "chicken", "chief", "child", "children", "choice",
"choose", "chose", "church", "circle", "circus", "city", "class", "clean", "clear",
"climb", "clock", "close", "cloth", "clothes", "cloud", "club", "coast", "coat",
"coffee", "cold", "collect", "college", "color", "come", "comfort", "coming", "company",
"compare", "complete", "cook", "cool", "copy", "corn", "corner", "correct", "cost",
"cotton", "could", "count", "country", "course", "cousin", "cover", "cow", "crack",
"cream", "creek", "cried", "crop", "cross", "crowd", "crown", "cry", "cup", "cut",
"dad", "dance", "danger", "dark", "date", "daughter", "day", "dead", "dear", "death",
"decide", "deep", "deer", "desk", "did", "die", "difference", "different", "dig",
"dinner", "direction", "discover", "distance", "do", "doctor", "does", "dog", "dollar",
"done", "door", "dot", "double", "down", "dozen", "drag", "draw", "dream", "dress",
"drew", "drink", "drive", "drop", "drove", "dry", "duck", "during", "dust", "each",
"ear", "early", "earn", "earth", "east", "easy", "eat", "edge", "egg", "eight",
"either", "elephant", "else", "empty", "end", "enemy", "enjoy", "enough", "enter",
"equal", "escape", "even", "evening", "ever", "every", "everyone", "everything",
"exact", "except", "exchange", "excite", "excuse", "expect", "explain", "eye", "face",
"fact", "fair", "fall", "family", "famous", "far", "farm", "farmer", "fast", "fat",
"father", "fear", "feather", "fed", "feed", "feel", "feet", "fell", "fellow", "felt",
"fence", "few", "field", "fight", "figure", "fill", "film", "find", "fine", "finger",
"finish", "fire", "first", "fish", "fit", "five", "fix", "flag", "flat", "flew",
"floor", "flower", "fly", "follow", "food", "foot", "for", "force", "forest", "forget",
"form", "fort", "forward", "fought", "found", "four", "fox", "frame", "free", "freedom",
"fresh", "friend", "frighten", "from", "front", "fruit", "full", "fun", "funny", "fur",
"further", "future", "game", "garden", "gas", "gate", "gather", "gave", "general",
"gentle", "get", "gift", "girl", "give", "glad", "glass", "go", "goat", "god", "goes",
"gold", "golden", "gone", "good", "got", "government", "grab", "grade", "grain", "grand",
"grandfather", "grandmother", "grass", "grave", "gray", "great", "green", "grew", "ground",
"group", "grow", "guard", "guess", "guest", "guide", "gun", "had", "hair", "half", "hall",
"hand", "handle", "hang", "happen", "happy", "hard", "has", "hat", "hate", "have", "he",
"head", "health", "hear", "heard", "heart", "heat", "heavy", "held", "hello", "help",
"hen", "her", "here", "herself", "hid", "hide", "high", "hill", "him", "himself", "his",
"hit", "hold", "hole", "holiday", "home", "honest", "hope", "horn", "horse", "hospital",
"hot", "hour", "house", "how", "however", "huge", "human", "hundred", "hung", "hungry",
"hunt", "hurry", "hurt", "husband", "I", "ice", "idea", "if", "ill", "imagine",
"important", "in", "inch", "include", "indeed", "indian", "inside", "instead", "interest",
"into", "iron", "is", "island", "it", "its", "itself", "jack", "jail", "jar", "jet",
"job", "join", "joke", "journey", "joy", "judge", "jump", "just", "keep", "kept", "key",
"kick", "kid", "kill", "kind", "king", "kiss", "kitchen", "knee", "knew", "knife",
"knock", "know", "lack", "ladder", "lady", "laid", "lake", "lamp", "land", "language",
"large", "last", "late", "laugh", "law", "lay", "lazy", "lead", "leader", "leaf", "lean",
"learn", "least", "leather", "leave", "led", "left", "leg", "lend", "length", "less",
"lesson", "let", "letter", "level", "liberty", "library", "lie", "life", "lift", "light",
"like", "likely", "line", "lion", "lip", "list", "listen", "little", "live", "load",
"loaf", "lock", "log", "lone", "long", "look", "loose", "lord", "lose", "loss", "lost",
"lot", "loud", "love", "lovely", "low", "lower", "luck", "lunch", "machine", "mad",
"made", "magic", "mail", "main", "major", "make", "man", "manner", "many", "map",
"march", "mark", "market", "marry", "mass", "master", "mat", "match", "matter", "may",
"maybe", "me", "meal", "mean", "measure", "meat", "medicine", "meet", "member", "men",
"met", "metal", "method", "middle", "might", "mile", "milk", "mill", "mind", "mine",
"minute", "miss", "mistake", "mix", "moment", "money", "monkey", "month", "moon", "more",
"morning", "most", "mother", "motion", "mountain", "mouse", "mouth", "move", "movie",
"Mr.", "Mrs.", "much", "mud", "music", "must", "my", "myself", "nail", "name", "narrow",
"nation", "nature", "near", "nearly", "necessary", "neck", "need", "needle", "neighbor",
"neither", "nerve", "nest", "net", "never", "new", "news", "newspaper", "next", "nice",
"night", "nine", "no", "nobody", "noise", "none", "noon", "nor", "north", "nose", "not",
"note", "nothing", "notice", "now", "number", "nut", "object", "ocean", "of", "off",
"offer", "office", "officer", "often", "oh", "oil", "old", "on", "once", "one", "only",
"onto", "open", "opposite", "or", "orange", "order", "other", "ought", "our", "out",
"outside", "over", "own", "owner", "pack", "package", "page", "paid", "pain", "paint",
"pair", "palace", "pan", "paper", "parent", "park", "part", "party", "pass", "passage",
"past", "pasture", "path", "pattern", "pay", "peace", "pen", "pencil", "penny", "people",
"perfect", "perhaps", "person", "pet", "phone", "piano", "pick", "picnic", "picture",
"pie", "piece", "pig", "pile", "pilot", "pin", "pink", "pipe", "place", "plain", "plan",
"plane", "plant", "plate", "play", "pleasant", "please", "pleasure", "plenty", "plow",
"pocket", "point", "poison", "pole", "police", "policeman", "pond", "pony", "pool",
"poor", "pop", "popular", "porch", "position", "possible", "post", "pot", "potato",
"pound", "pour", "powder", "power", "practice", "prepare", "present", "president",
"press", "pretty", "prevent", "price", "prince", "princess", "print", "prison",
"private", "prize", "probably", "problem", "produce", "promise", "proper", "protect",
"proud", "prove", "public", "pull", "pump", "pupil", "puppy", "purchase", "pure",
"purple", "purpose", "push", "put", "puzzle", "quarter", "queen", "question", "quick",
"quiet", "quit", "quite", "rabbit", "race", "radio", "rag", "rail", "railroad", "rain",
"raise", "ran", "ranch", "rang", "rapid", "rate", "rather", "raw", "reach", "read",
"ready", "real", "reason", "receive", "record", "red", "refuse", "remain", "remember",
"remove", "rent", "repair", "repeat", "reply", "report", "represent", "require", "rest",
"result", "return", "reveal", "reward", "ribbon", "rice", "rich", "rid", "ride", "rider",
"ridge", "riding", "right", "ring", "rise", "river", "road", "roar", "rob", "rock",
"rode", "roll", "roof", "room", "root", "rope", "rose", "rough", "round", "route", "row",
"royal", "rub", "rubber", "rule", "run", "rush", "sad", "saddle", "safe", "safety",
"said", "sail", "sailor", "sake", "salad", "sale", "salt", "same", "sand", "sang",
"sat", "save", "saw", "say", "scale", "scare", "school", "science", "scissors", "score",
"sea", "search", "season", "seat", "second", "secret", "section", "see", "seed", "seek",
"seem", "seen", "seldom", "select", "self", "sell", "send", "sense", "sent", "sentence",
"separate", "serve", "service", "set", "settle", "seven", "several", "sew", "shade",
"shadow", "shake", "shall", "shame", "shape", "share", "sharp", "she", "sheep", "sheet",
"shelf", "shell", "shelter", "shine", "ship", "shirt", "shock", "shoe", "shone", "shook",
"shoot", "shop", "shore", "short", "shot", "should", "shoulder", "shout", "show", "shut",
"sick", "side", "sight", "sign", "signal", "silence", "silent", "silk", "silver",
"similar", "simple", "since", "sing", "single", "sink", "sir", "sister", "sit", "six",
"size", "skate", "ski", "skill", "skin", "skirt", "sky", "slave", "sleep", "slept",
"slide", "slip", "slow", "small", "smart", "smell", "smile", "smoke", "smooth", "snake",
"snow", "so", "soap", "social", "society", "sock", "soft", "soil", "sold", "soldier",
"solid", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes",
"somewhere", "son", "song", "soon", "sore", "sorry", "sort", "sound", "soup", "south",
"space", "speak", "special", "speech", "speed", "spell", "spend", "spent", "spider",
"spin", "spirit", "spite", "split", "spoke", "spoon", "sport", "spot", "spread",
"spring", "square", "stable", "stack", "stage", "stair", "stamp", "stand", "star",
"stare", "start", "state", "station", "stay", "steady", "steak", "steal", "steam",
"steel", "steep", "stem", "step", "stick", "stiff", "still", "stock", "stole", "stomach",
"stone", "stood", "stool", "stop", "store", "storm", "story", "stove", "straight",
"strange", "stranger", "stream", "street", "strength", "stretch", "strike", "string",
"strip", "strong", "struggle", "stuck", "student", "study", "stuff", "subject", "such",
"sudden", "suffer", "sugar", "suggest", "suit", "sum", "summer", "sun", "Sunday",
"supper", "supply", "support", "suppose", "sure", "surface", "surprise", "swallow",
"swam", "swear", "sweep", "sweet", "swept", "swift", "swim", "swing", "switch", "sword",
"swore", "table", "tail", "take", "taken", "talk", "tall", "tank", "tap", "tape", "task",
"taste", "taught", "tax", "tea", "teach", "teacher", "team", "tear", "telephone", "tell",
"temperature", "ten", "tent", "term", "terrible", "test", "than", "thank", "that", "the",
"theater", "thee", "their", "them", "themselves", "then", "there", "therefore", "these",
"they", "thick", "thin", "thing", "think", "third", "this", "those", "though", "thought",
"thousand", "thread", "three", "threw", "throat", "through", "throughout", "throw",
"thrown", "thumb", "thunder", "thus", "thy", "ticket", "tide", "tie", "tiger", "tight",
"till", "time", "tin", "tiny", "tip", "tire", "title", "to", "tobacco", "today", "toe",
"together", "told", "tomorrow", "ton", "tone", "tongue", "tonight", "too", "took", "tool",
"tooth", "top", "tore", "torn", "total", "touch", "toward", "tower", "town", "toy",
"track", "trade", "traffic", "trail", "train", "trap", "travel", "tray", "treasure",
"treat", "tree", "trick", "tried", "trim", "trip", "troop", "tropical", "trouble",
"truck", "true", "trunk", "trust", "truth", "try", "tube", "tune", "turn", "twelve",
"twenty", "twice", "two", "type", "ugly", "uncle", "under", "understand", "unhappy",
"union", "unit", "united", "universe", "university", "unless", "until", "up", "upon",
"upper", "upward", "us", "use", "useful", "usual", "valley", "valuable", "value", "variety",
"various", "vast", "vegetable", "verb", "very", "vessel", "victory", "view", "village",
"visit", "visitor", "voice", "vote", "vowel", "voyage", "wagon", "wait", "wake", "walk",
"wall", "want", "war", "warm", "warn", "was", "wash", "waste", "watch", "water", "wave",
"way", "we", "weak", "wealth", "weapon", "wear", "weather", "web", "wedding", "week",
"weigh", "weight", "welcome", "well", "went", "were", "west", "wet", "whale", "what",
"wheat", "wheel", "when", "whenever", "where", "whether", "which", "while", "whip",
"whisper", "whistle", "white", "who", "whole", "whom", "whose", "why", "wicked", "wide",
"wife", "wild", "will", "willing", "win", "wind", "window", "wine", "wing", "winter",
"wipe", "wire", "wise", "wish", "with", "within", "without", "woke", "wolf", "woman",
"women", "won", "wonder", "wonderful", "wood", "wooden", "wool", "word", "wore", "work",
"worker", "world", "worm", "worn", "worry", "worse", "worst", "worth", "would", "wound",
"wrap", "wreck", "wrist", "write", "written", "wrong", "wrote", "yard", "year", "yellow",
"yes", "yesterday", "yet", "you", "young", "youngster", "your", "yourself", "youth", "zero"
}
print(f"Using fallback Dale-Chall words: {len(self.dale_chall_words)} words")
def load_oxford_pdfs(self):
print("Note: Oxford 3000/5000 PDFs need manual download from:")
print("- https://www.oxfordlearnersdictionaries.com/external/pdf/wordlists/oxford-3000-5000/American_Oxford_3000.pdf")
print("- https://www.oxfordlearnersdictionaries.com/external/pdf/wordlists/oxford-3000-5000/American_Oxford_5000.pdf")
self.oxford_words = {
"the": "A1", "be": "A1", "have": "A1", "do": "A1", "say": "A1",
"go": "A1", "can": "A1", "get": "A1", "make": "A1", "know": "A1",
"think": "A1", "take": "A1", "see": "A1", "come": "A1", "want": "A1",
"use": "A1", "find": "A1", "give": "A1", "tell": "A1", "work": "A1",
"call": "A1", "try": "A1", "ask": "A1", "need": "A1", "feel": "A1",
"become": "A1", "leave": "A1", "put": "A1", "mean": "A1", "keep": "A1",
"begin": "A2", "seem": "A2", "help": "A2", "show": "A2", "hear": "A2",
"play": "A2", "run": "A2", "move": "A2", "live": "A2", "believe": "A2",
"include": "B1", "continue": "B1", "set": "B1", "learn": "B1", "change": "B1",
"develop": "B2", "consider": "B2", "appear": "B2", "involve": "B2", "require": "B2",
}
def count_syllables(self, word: str) -> int:
word_lower = word.lower()
if word_lower in self.cmu_dict:
pronunciation = self.cmu_dict[word_lower][0] return len([ph for ph in pronunciation if ph[-1].isdigit()])
else:
return textstat.syllable_count(word)
def is_phonetically_regular(self, word: str) -> bool:
return word.lower() in self.cmu_dict
def get_word_frequency_score(self, word: str, position_in_list: int) -> float:
if position_in_list < 1000:
return 1.0
elif position_in_list < 5000:
return 0.8
elif position_in_list < 10000:
return 0.6
elif position_in_list < 20000:
return 0.4
else:
return 0.2
def get_cefr_score(self, word: str) -> float:
cefr_scores = {
"A1": 1.0,
"A2": 0.9,
"B1": 0.7,
"B2": 0.5,
"C1": 0.3,
"C2": 0.1
}
return cefr_scores.get(self.oxford_words.get(word.lower(), "C2"), 0.1)
def calculate_readability_score(self, word: str, position: int = 50000) -> Dict[str, float]:
scores = {}
syllables = self.count_syllables(word)
if syllables == 1:
scores['syllable'] = 1.0
elif syllables == 2:
scores['syllable'] = 0.9
elif syllables == 3:
scores['syllable'] = 0.6
else:
scores['syllable'] = 0.2
length = len(word)
if 3 <= length <= 5:
scores['length'] = 1.0
elif 6 <= length <= 7:
scores['length'] = 0.8
elif length == 2 or length == 8:
scores['length'] = 0.6
elif length == 9:
scores['length'] = 0.4
else:
scores['length'] = 0.2
scores['dale_chall'] = 1.0 if word.lower() in self.dale_chall_words else 0.3
scores['cefr'] = self.get_cefr_score(word)
scores['frequency'] = self.get_word_frequency_score(word, position)
scores['phonetic'] = 1.0 if self.is_phonetically_regular(word) else 0.5
scores['clean'] = 1.0 if word.isalpha() else 0.0
scores['appropriate'] = 0.0 if word.lower() in self.offensive_words else 1.0
weights = {
'syllable': 0.20,
'length': 0.10,
'dale_chall': 0.20,
'cefr': 0.15,
'frequency': 0.20,
'phonetic': 0.10,
'clean': 0.03,
'appropriate': 0.02
}
scores['total'] = sum(scores[k] * weights[k] for k in weights)
return scores
def load_existing_word_lists() -> List[Tuple[str, int]]:
words_with_position = []
if os.path.exists("data/google-10000-english.txt"):
print("Loading Google 10k list...")
with open("data/google-10000-english.txt", 'r') as f:
for position, line in enumerate(f):
word = line.strip()
if word and 2 <= len(word) <= 12:
words_with_position.append((word, position))
if os.path.exists("data/words_alpha.txt"):
print("Loading words_alpha.txt...")
with open("data/words_alpha.txt", 'r') as f:
for position, line in enumerate(f):
word = line.strip()
if word and 2 <= len(word) <= 10 and word.isalpha():
words_with_position.append((word, position + 10000))
return words_with_position
def generate_word_forms(base_words: Set[str]) -> Set[str]:
word_forms = set(base_words)
for word in list(base_words):
if not word.endswith('s'):
word_forms.add(word + 's')
if word.endswith('y') and len(word) > 2 and word[-2] not in 'aeiou':
word_forms.add(word[:-1] + 'ies')
if word.endswith('e') and len(word) > 2:
word_forms.add(word[:-1] + 'ing')
else:
word_forms.add(word + 'ing')
if word.endswith('e'):
word_forms.add(word + 'd')
elif word.endswith('y') and len(word) > 2 and word[-2] not in 'aeiou':
word_forms.add(word[:-1] + 'ied')
else:
word_forms.add(word + 'ed')
if word.endswith('e'):
word_forms.add(word + 'r')
else:
word_forms.add(word + 'er')
if word.endswith('e'):
word_forms.add(word + 'st')
else:
word_forms.add(word + 'est')
return word_forms
def main():
print("Creating human-readable dictionary for three-word networking...")
print("=" * 60)
scorer = ReadabilityScorer()
scorer.download_dale_chall_list()
scorer.load_oxford_pdfs()
print("\nLoading word sources...")
words_with_position = load_existing_word_lists()
print(f"Loaded {len(words_with_position)} candidate words")
print("\nScoring words for readability...")
scored_words = []
for i, (word, position) in enumerate(words_with_position):
if i % 10000 == 0:
print(f"Processed {i}/{len(words_with_position)} words...")
scores = scorer.calculate_readability_score(word, position)
if scores['appropriate'] > 0 and scores['clean'] > 0: scored_words.append({
'word': word,
'position': position,
'total_score': scores['total'],
**scores
})
df = pd.DataFrame(scored_words)
df = df.sort_values('total_score', ascending=False)
top_words = df.head(65536)
print("\n" + "=" * 60)
print("DICTIONARY STATISTICS:")
print(f"Total words selected: {len(top_words)}")
print(f"Average readability score: {top_words['total_score'].mean():.3f}")
print(f"Average syllables: {top_words['syllable'].mean():.2f}")
print(f"Words in Dale-Chall list: {(top_words['dale_chall'] == 1.0).sum()}")
print("\nTop 50 words by readability:")
for i, row in top_words.head(50).iterrows():
print(f"{row['word']:15} (score: {row['total_score']:.3f})")
output_words = top_words['word'].tolist()
while len(output_words) < 65536:
output_words.append(f"word{len(output_words):05d}")
with open("data/human_readable_word_list_65k.txt", 'w') as f:
f.write('\n'.join(output_words))
print(f"\n✓ Saved {len(output_words)} words to data/human_readable_word_list_65k.txt")
top_words.to_csv("data/word_readability_scores.csv", index=False)
print("✓ Saved detailed scores to data/word_readability_scores.csv")
if __name__ == "__main__":
main()