from __future__ import annotations

import argparse
import json
import os
import re
import sys
import time
from collections import defaultdict
from pathlib import Path
from typing import List, Set

import openai
from dotenv import load_dotenv
from tqdm import tqdm
from wordfreq import zipf_frequency

# OpenAIError moved between openai SDK versions; support both layouts.
try:
    from openai.error import OpenAIError
except (ImportError, AttributeError):
    from openai import OpenAIError
def parse_cli() -> argparse.Namespace:
    """Parse command-line options for the letter-wordlist generator.

    Returns:
        argparse.Namespace carrying all generator settings; ``--output``
        is the only required option.
    """
    p = argparse.ArgumentParser("letter_wordlist_generator")
    p.add_argument("--output", required=True, type=Path)
    p.add_argument("--target-size", type=int, default=0,
                   help="Stop when word count >= this (0 = no global target)")
    p.add_argument("--max-cycles", type=int, default=20,
                   help="Safety limit on alphabet passes when chasing target size")
    p.add_argument("--model", default="gpt-4o-mini")
    p.add_argument("--freq-threshold", type=float, default=3.0)
    p.add_argument("--temp", type=float, default=0.25)
    p.add_argument("--max-retries", type=int, default=5)
    p.add_argument("--per-letter-max", type=int, default=6000)
    p.add_argument("--min-length", type=int, default=4)
    p.add_argument("--max-length", type=int, default=7)
    p.add_argument("--allow-proper-nouns", action="store_true")
    p.add_argument("--banned-file", type=Path)
    p.add_argument("--auto-loosen", action="store_true")
    p.add_argument("--loosen-step", type=float, default=0.25)
    p.add_argument("--min-freq", type=float, default=1.0)
    # ArgumentParser has no "parse_cli_args" method; the hasattr() dance always
    # fell through to parse_args(), so call it directly.
    return p.parse_args()
def make_word_re(min_len: int, max_len: int) -> re.Pattern[str]:
    """Compile a pattern matching lower-case ASCII words of min_len..max_len letters.

    The original pattern read ``{min_len},12,552`` — a garbled quantifier that
    ignored ``max_len`` entirely; ``{min_len},{max_len}`` is the intended bound.
    Doubled braces escape the literal ``{m,n}`` quantifier inside the f-string.
    """
    return re.compile(rf"^[a-z]{{{min_len},{max_len}}}$")
# Profanity excluded from output regardless of frequency; extendable at
# runtime via --banned-file (see parse_cli / main).
DEFAULT_BANNED: Set[str] = {
    "cunt", "damn", "shit", "fuck", "dick", "twat", "piss", "arse", "crap",
    "bitch", "bastard", "bollock", "bollocks", "bugger", "wank", "prick",
}
def is_valid(
    word: str,
    freq_threshold: float,
    banned: Set[str],
    re_word: re.Pattern[str],
    allow_proper: bool,
) -> bool:
    """Return True when *word* passes every filter.

    Filters, in order: not banned, matches the length/charset pattern,
    Zipf frequency at or above *freq_threshold*, and (unless *allow_proper*)
    not capitalised.
    """
    # Cheap rejections first: blocklist, then shape.
    if word in banned or not re_word.fullmatch(word):
        return False
    # Frequency gate — only consulted for well-formed, non-banned words.
    if zipf_frequency(word, "en") < freq_threshold:
        return False
    # Proper-noun gate (no-op when capitalised words already fail the regex).
    return allow_proper or not word[0].isupper()
def build_prompt(
    letter: str,
    min_len: int,
    max_len: int,
    allow_proper: bool,
    recent: List[str],
) -> str:
    """Compose the chat prompt asking for words starting with *letter*.

    Args:
        letter: Single starting letter to request.
        min_len / max_len: Inclusive word-length bounds quoted in the prompt.
        allow_proper: When True, permits common names / place names.
        recent: Words already collected; the model is told not to repeat them.

    Returns:
        The full prompt string, one instruction block.
    """
    # The original text hard-coded a garbled "12,552" where the max_len
    # parameter was clearly intended — interpolate it properly.
    base = (
        f"List as many LOWER-CASE English words as you can, {min_len} to {max_len} letters each,\n"
        f"that start with the letter '{letter}'.\n"
        "Words must be readable and commonly understood."
    )
    if allow_proper:
        base += " Common given names and place names are allowed."
    base += " Letters only; no abbreviations or foreign terms.\n"
    base += "Output ONE word per line."
    if recent:
        base += "\n\nDo NOT repeat: " + ", ".join(recent)
    return base
def query_model(model: str, prompt: str, temp: float, retries: int) -> str:
    """Send one chat prompt to *model*, retrying on API errors.

    Backoff doubles per attempt (2s, 4s, 8s, ...).  Returns the stripped
    message text on success.

    Raises:
        RuntimeError: after *retries* consecutive OpenAI failures.
    """
    attempt = 0
    while attempt < retries:
        attempt += 1
        try:
            resp = openai.ChatCompletion.create(
                model=model,
                temperature=temp,
                messages=[{"role": "user", "content": prompt}],
            )
        except OpenAIError as e:
            wait = 2 ** attempt
            print(f"⚠️ OpenAI error: {e} – retry {attempt}/{retries} in {wait}s", file=sys.stderr)
            time.sleep(wait)
        else:
            return resp.choices[0].message.content.strip()
    raise RuntimeError("OpenAI API failed too many times")
def alphabet_pass(
    *,
    letters: str,
    final_words: Set[str],
    freq_threshold: float,
    opts: argparse.Namespace,
    re_word: re.Pattern[str],
    banned: Set[str],
    out_fh,
) -> int:
    """Run one pass over *letters*, querying the model repeatedly per letter.

    For each letter, keeps asking until ``opts.per_letter_max`` words exist for
    that letter or the model yields no new valid words.  Accepted words are
    added to *final_words* (mutated in place) and appended to *out_fh*.

    Returns:
        Number of new words added during this pass.
    """
    added_this_pass = 0
    for letter in letters:
        # Words already collected for this letter (supports resuming a run).
        current_letter_words = {w for w in final_words if w.startswith(letter)}
        seen = set(current_letter_words)
        if len(seen) >= opts.per_letter_max:
            continue
        while len(seen) < opts.per_letter_max:
            # NOTE(review): `seen` is a set, so list(seen)[-150:] is an
            # arbitrary 150-word sample, not the most recent words — confirm
            # whether true recency was intended.
            prompt = build_prompt(letter, opts.min_length, opts.max_length, opts.allow_proper_nouns, list(seen)[-150:])
            text = query_model(opts.model, prompt, opts.temp, opts.max_retries)
            if not text:
                break
            new_batch: List[str] = []
            for line in text.splitlines():
                w = line.strip().lower()
                # Accept only unseen words that pass every filter.
                if (
                    w
                    and w not in final_words
                    and w not in seen
                    and is_valid(w, freq_threshold, banned, re_word, opts.allow_proper_nouns)
                ):
                    new_batch.append(w)
            if not new_batch:
                # Model produced nothing new for this letter; move to the next.
                break
            for w in new_batch:
                final_words.add(w)
                seen.add(w)
                out_fh.write(w + "\n")
            # Flush after each batch so progress survives an interrupt.
            out_fh.flush()
            added_this_pass += len(new_batch)
    return added_this_pass
def trim_to_target(words: Set[str], target: int, min_len: int, max_len: int) -> List[str]:
    """Keep the *target* most frequent words that satisfy the length bounds.

    Words failing the length/charset pattern are dropped outright; the rest
    are ranked by descending Zipf frequency and truncated to *target*.

    Returns:
        The kept words (unordered relative to the caller's needs; `main`
        sorts them before writing).
    """
    re_word = make_word_re(min_len, max_len)
    scored = [(zipf_frequency(w, "en"), w) for w in words if re_word.fullmatch(w)]
    # Highest frequency first; ties broken by reverse-alphabetical word.
    scored.sort(reverse=True)
    return [w for _, w in scored[:target]]
def main():
    """Entry point: collect words letter-by-letter until the target is reached
    or the run stalls, then trim any surplus back down to the target size."""
    opts = parse_cli()
    load_dotenv()
    # Accept a key either already set on the module or via the environment.
    if not (openai.api_key or os.getenv("OPENAI_API_KEY")):
        sys.exit("❌ OPENAI_API_KEY missing.")
    re_word = make_word_re(opts.min_length, opts.max_length)
    banned: Set[str] = set(DEFAULT_BANNED)
    if opts.banned_file and opts.banned_file.exists():
        banned.update(w.strip().lower() for w in opts.banned_file.read_text().splitlines())
    final_words: Set[str] = set()
    if opts.output.exists():
        # Resume support: seed from words already present in the output file.
        final_words.update(w.strip() for w in opts.output.read_text().splitlines() if w.strip())
        print(f"🔄 Resuming – {len(final_words)} words present")
    # Append mode so resumed runs extend the existing file.
    with opts.output.open("a", encoding="utf-8") as out_fh:
        freq_current = opts.freq_threshold
        cycle = 0
        while True:
            cycle += 1
            if opts.max_cycles and cycle > opts.max_cycles:
                print("❌ Reached max_cycles without hitting target – aborting.")
                break
            print(f"— Cycle {cycle} (freq ≥ {freq_current:.2f}) —")
            added = alphabet_pass(
                letters="abcdefghijklmnopqrstuvwxyz",
                final_words=final_words,
                freq_threshold=freq_current,
                opts=opts,
                re_word=re_word,
                banned=banned,
                out_fh=out_fh,
            )
            print(f" Added {added} new words; total = {len(final_words)}")
            if opts.target_size and len(final_words) >= opts.target_size:
                print("🎯 Target reached or exceeded.")
                break
            if added == 0:
                # Stalled: loosen the frequency threshold if allowed, else stop.
                if not opts.auto_loosen or freq_current - opts.loosen_step < opts.min_freq:
                    print("⚠️ Stalled and cannot loosen further. Stop.")
                    break
                freq_current -= opts.loosen_step
                continue
    # Overshoot is possible because a whole alphabet pass lands at once;
    # rewrite the file with only the top `target_size` words.
    if opts.target_size and len(final_words) > opts.target_size:
        print(f"✂️ Trimming surplus {len(final_words) - opts.target_size} words …")
        kept = trim_to_target(final_words, opts.target_size, opts.min_length, opts.max_length)
        opts.output.write_text("\n".join(sorted(kept)) + "\n", encoding="utf-8")
        final_words = set(kept)
    print(f"✅ Finished with {len(final_words)} words → {opts.output}")
# Standard script entry guard: only run when executed directly.
if __name__ == "__main__":
    main()