#[doc(hidden)]
pub mod coda_groups;
#[doc(hidden)]
pub mod compare;
#[doc(hidden)]
pub mod dict;
#[doc(hidden)]
pub mod distance;
#[doc(hidden)]
pub mod meter;
#[doc(hidden)]
pub mod near_index;
#[doc(hidden)]
pub mod rhyme;
#[doc(hidden)]
pub mod rhyme_index;
#[doc(hidden)]
pub mod rhymemap;
#[doc(hidden)]
pub mod slant_index;
#[doc(hidden)]
pub mod stress;
#[doc(hidden)]
pub mod syllable;
pub mod phoneme;
#[cfg(feature = "server")]
pub mod server;
use std::sync::Arc;
use serde::{Deserialize, Serialize};
use crate::dict::CmuDict;
pub use crate::stress::StressMode;
/// Facade over the pronunciation dictionary and the lookup/analysis
/// components that `Phonetik::new` builds from it.
///
/// Every field is held in an `Arc`, so a clone shares the same underlying
/// data rather than rebuilding it.
pub struct Phonetik {
/// Pronunciation dictionary; `new` hands a clone of this handle to each component below.
dict: Arc<CmuDict>,
/// Backs `perfect_rhymes`.
rhyme_index: Arc<rhyme_index::RhymeIndex>,
/// Backs `slant_rhymes`; constructed with the shared coda map.
slant_index: Arc<slant_index::SlantIndex>,
/// Backs `near_rhymes`; constructed with the same coda map as `slant_index`.
near_index: Arc<near_index::NearIndex>,
/// Produces the per-line stress analysis used by `scan*` and `analyze_document`.
stress_analyzer: Arc<stress::StressAnalyzer>,
/// Backs `rhyme_map`.
rhyme_mapper: Arc<rhymemap::RhymeMapAnalyzer>,
}
impl Clone for Phonetik {
fn clone(&self) -> Self {
Self {
dict: self.dict.clone(),
rhyme_index: self.rhyme_index.clone(),
slant_index: self.slant_index.clone(),
near_index: self.near_index.clone(),
stress_analyzer: self.stress_analyzer.clone(),
rhyme_mapper: self.rhyme_mapper.clone(),
}
}
}
impl Phonetik {
pub fn new() -> Self {
let dict = Arc::new(CmuDict::load());
let rhyme_idx = rhyme_index::RhymeIndex::new(dict.clone());
let (coda_map, _) = coda_groups::build(&dict);
let shared_codas = Arc::new(coda_map);
let slant_idx = slant_index::SlantIndex::new(dict.clone(), shared_codas.clone());
let near_idx = near_index::NearIndex::new(dict.clone(), shared_codas);
let stress_a = stress::StressAnalyzer::new(dict.clone());
let rhyme_m = rhymemap::RhymeMapAnalyzer::new(dict.clone());
Self {
dict,
rhyme_index: Arc::new(rhyme_idx),
slant_index: Arc::new(slant_idx),
near_index: Arc::new(near_idx),
stress_analyzer: Arc::new(stress_a),
rhyme_mapper: Arc::new(rhyme_m),
}
}
/// Looks `word` up in the dictionary and describes its first pronunciation.
///
/// Returns `None` when the dictionary has no entry for `word`, or when the
/// entry carries no pronunciation variants. All phoneme-derived fields
/// (`phonemes`, `syllable_count`, `stress_pattern`) come from the first
/// variant; `variant_count` reports how many variants the entry has.
pub fn lookup(&self, word: &str) -> Option<WordInfo> {
    let variants = self.dict.lookup(word)?;
    // `first()` rather than `[0]`: an entry without pronunciations yields
    // `None` instead of panicking.
    let encoded = variants.first()?;

    // Normalise only after the lookup succeeded so a miss does no extra work.
    let normalized = CmuDict::normalize(word);
    let count = phoneme::count_syllables(encoded);
    let stresses = phoneme::extract_stresses(encoded);
    let syllables = syllable::SyllableSplitter::split(&normalized, count);
    let stress_display = syllable::SyllableSplitter::stress_display(&normalized, &stresses);

    Some(WordInfo {
        word: normalized,
        phonemes: phoneme::decode_to_strings(encoded),
        syllable_count: count,
        syllables,
        stress_pattern: stresses,
        stress_display,
        variant_count: variants.len(),
    })
}
/// Returns the syllable count of `word`.
///
/// Uses the first dictionary pronunciation when one exists. Words that are
/// missing from the dictionary, or whose entry has no pronunciation variants,
/// fall back to `estimate_syllable_count`, so this never panics and never
/// returns less than that estimator does.
pub fn syllable_count(&self, word: &str) -> usize {
    self.dict
        .lookup(word)
        // `first()` rather than `[0]`: an empty variant list falls through to
        // the estimator instead of panicking.
        .and_then(|variants| {
            variants
                .first()
                .map(|encoded| phoneme::count_syllables(encoded))
        })
        .unwrap_or_else(|| estimate_syllable_count(word))
}
/// Counts syllables word by word for each line.
///
/// Each line is split with `tokenize_words`; every token is counted with
/// `syllable_count`, and `total` is the sum of the per-word counts. The
/// result has one entry per input line, in order.
pub fn syllable_counts(&self, lines: &[&str]) -> Vec<LineSyllableCount> {
    let mut report = Vec::with_capacity(lines.len());
    for line in lines {
        let words: Vec<WordSyllableCount> = tokenize_words(line)
            .into_iter()
            .map(|word| {
                let syllables = self.syllable_count(&word);
                WordSyllableCount { word, syllables }
            })
            .collect();
        let total: usize = words.iter().map(|entry| entry.syllables).sum();
        report.push(LineSyllableCount { words, total });
    }
    report
}
/// Returns up to `limit` rhymes for `word` (capped at 500), best tier first.
///
/// Perfect rhymes come first, then slant rhymes, then near rhymes. The slant
/// and near tiers are only queried while there is room left, and a word that
/// already appeared in an earlier tier is not repeated.
pub fn rhymes(&self, word: &str, limit: usize) -> Vec<RhymeMatch> {
    let cap = limit.min(500);
    let mut seen = std::collections::HashSet::new();
    let mut collected: Vec<RhymeMatch> = Vec::new();

    // Tier 1: perfect rhymes (not limited at the source; trimmed at the end).
    collected.extend(
        self.perfect_rhymes(word)
            .into_iter()
            .filter(|m| seen.insert(m.word.clone())),
    );

    // Tier 2: slant rhymes fill whatever room remains.
    let room = cap.saturating_sub(collected.len());
    if room > 0 {
        collected.extend(
            self.slant_rhymes(word, room)
                .into_iter()
                .filter(|m| seen.insert(m.word.clone())),
        );
    }

    // Tier 3: near rhymes fill the rest.
    let room = cap.saturating_sub(collected.len());
    if room > 0 {
        collected.extend(
            self.near_rhymes(word, room)
                .into_iter()
                .filter(|m| seen.insert(m.word.clone())),
        );
    }

    collected.truncate(cap);
    collected
}
/// Returns every match the rhyme index holds for `word`, tagged
/// `RhymeType::Perfect` with confidence `1.0`.
///
/// Yields an empty vector when the index lookup produces nothing.
pub fn perfect_rhymes(&self, word: &str) -> Vec<RhymeMatch> {
    let found = self.rhyme_index.lookup(word);
    found
        .map(|hit| {
            hit.matches
                .into_iter()
                .map(|entry| RhymeMatch {
                    rhyme_type: RhymeType::Perfect,
                    confidence: 1.0,
                    word: entry.word,
                    phonemes: entry.phonemes,
                    syllables: entry.syllables,
                })
                .collect()
        })
        .unwrap_or_default()
}
/// Returns slant rhymes for `word`, asking the slant index for at most
/// `limit` results (capped at 500).
///
/// Each match is tagged `RhymeType::Slant` and keeps the confidence reported
/// by the index. Yields an empty vector when the lookup produces nothing.
pub fn slant_rhymes(&self, word: &str, limit: usize) -> Vec<RhymeMatch> {
    let cap = limit.min(500);
    let found = self.slant_index.lookup(word, cap);
    found
        .map(|hit| {
            hit.matches
                .into_iter()
                .map(|entry| RhymeMatch {
                    rhyme_type: RhymeType::Slant,
                    confidence: entry.confidence,
                    word: entry.word,
                    phonemes: entry.phonemes,
                    syllables: entry.syllables,
                })
                .collect()
        })
        .unwrap_or_default()
}
/// Returns near rhymes for `word`, asking the near-rhyme index for at most
/// `limit` results (capped at 500).
///
/// Each match is tagged `RhymeType::Near` with a fixed confidence of `0.5`.
/// Yields an empty vector when the lookup produces nothing.
pub fn near_rhymes(&self, word: &str, limit: usize) -> Vec<RhymeMatch> {
    let cap = limit.min(500);
    let found = self.near_index.lookup(word, cap);
    found
        .map(|hit| {
            hit.matches
                .into_iter()
                .map(|entry| RhymeMatch {
                    rhyme_type: RhymeType::Near,
                    confidence: 0.5,
                    word: entry.word,
                    phonemes: entry.phonemes,
                    syllables: entry.syllables,
                })
                .collect()
        })
        .unwrap_or_default()
}
/// Scans `line` with `StressMode::Spoken`; shorthand for `scan_with_mode`.
pub fn scan(&self, line: &str) -> LineScan {
    let mode = StressMode::Spoken;
    self.scan_with_mode(line, mode)
}
pub fn scan_with_mode(&self, line: &str, mode: StressMode) -> LineScan {
let analysis = self.stress_analyzer.analyze_line_with_mode(line, mode);
Self::build_line_scan(line, &analysis)
}
/// Compares two dictionary words phonetically.
///
/// Returns `None` if either word is missing from the dictionary. Otherwise
/// reports the best similarity between their pronunciations (rounded to four
/// decimal places), the phonemes of the best-matching pair, and the rhyme
/// classification. The analyzer's `"identity"` label is reported as
/// `RhymeType::Perfect`; unrecognised labels become `RhymeType::None`.
pub fn compare(&self, word1: &str, word2: &str) -> Option<Comparison> {
    let first = self.dict.lookup(word1)?;
    let second = self.dict.lookup(word2)?;

    let (raw_score, variant_a, variant_b) =
        compare::PhoneticComparer::best_similarity(&first, &second);
    let verdict = rhyme::RhymeAnalyzer::best_rhyme(&first, &second);

    let rhyme_type = match verdict.rhyme_type.as_str() {
        "perfect" | "identity" => RhymeType::Perfect,
        "slant" => RhymeType::Slant,
        "near" => RhymeType::Near,
        _ => RhymeType::None,
    };
    let similarity = (raw_score * 10000.0).round() / 10000.0;

    Some(Comparison {
        word1: CmuDict::normalize(word1),
        word2: CmuDict::normalize(word2),
        similarity,
        rhyme_type,
        confidence: verdict.confidence,
        phonemes1: phoneme::decode_to_strings(variant_a),
        phonemes2: phoneme::decode_to_strings(variant_b),
    })
}
/// Runs the rhyme-map analyzer over `lines` with default
/// `RhymeMapOptions`.
pub fn rhyme_map(&self, lines: &[&str]) -> rhymemap::RhymeMapResult {
    let options = rhymemap::RhymeMapOptions::default();
    // The analyzer is handed owned strings, so copy each line once.
    let mut text: Vec<String> = Vec::with_capacity(lines.len());
    text.extend(lines.iter().map(|&line| line.to_owned()));
    self.rhyme_mapper.analyze(&text, &options)
}
/// Scans every line of a document and aggregates the results.
///
/// For each line this records the scan, token counts and a prosody
/// fingerprint; the summary holds totals, dictionary coverage, mean syllables
/// per line (over *all* lines, blank ones included), mean meter regularity
/// (over lines with at least one syllable) and the dominant meter. Ratios are
/// rounded to four decimal places.
///
/// `options.stress_mode` falls back to `StressMode::default()` when unset;
/// the rhyme map is only computed when `options.include_rhyme_map` is true.
/// An empty `lines` slice yields an all-zero summary with coverage `1.0` and
/// a dominant meter of `"none"`.
pub fn analyze_document(&self, lines: &[&str], options: &DocumentAnalyzeOptions) -> DocumentMetadata {
    use std::collections::HashMap;

    // Rounds a ratio to four decimal places.
    let round4 = |value: f64| (value * 10000.0).round() / 10000.0;

    let line_count = lines.len();
    if line_count == 0 {
        // Handled up front: the means below would otherwise divide by zero.
        return DocumentMetadata {
            version: DOCUMENT_METADATA_VERSION,
            summary: DocumentSummary {
                line_count: 0,
                non_empty_line_count: 0,
                total_syllables: 0,
                total_word_tokens: 0,
                dictionary_word_tokens: 0,
                dictionary_coverage: 1.0,
                mean_syllables_per_line: 0.0,
                mean_meter_regularity: 0.0,
                dominant_meter: DocumentDominantMeter {
                    meter_name: "none".into(),
                    foot_type: "none".into(),
                    foot_count: 0,
                    supporting_line_count: 0,
                },
            },
            lines: Vec::new(),
            rhyme_map: None,
        };
    }

    let mode = options.stress_mode.unwrap_or_default();
    let rhyme_map = options.include_rhyme_map.then(|| self.rhyme_map(lines));
    let non_empty_line_count = lines
        .iter()
        .map(|line| line.trim())
        .filter(|trimmed| !trimmed.is_empty())
        .count();

    let mut per_line = Vec::with_capacity(line_count);
    let mut syllables_total = 0usize;
    let mut tokens_total = 0usize;
    let mut known_total = 0usize;
    let mut regularity_acc = 0f64;
    let mut scanned_lines = 0usize;
    let mut votes: HashMap<String, usize> = HashMap::new();

    for (index, &text) in lines.iter().enumerate() {
        let stress = self.stress_analyzer.analyze_line_with_mode(text, mode);
        let scan = Self::build_line_scan(text, &stress);

        let word_tokens = stress.words.len();
        let dictionary_words = stress.words.iter().filter(|w| w.in_dictionary).count();
        let estimated_words = word_tokens.saturating_sub(dictionary_words);

        syllables_total += scan.syllable_count;
        tokens_total += word_tokens;
        known_total += dictionary_words;

        // Lines without syllables neither vote for a meter nor affect the
        // regularity mean.
        if scan.syllable_count > 0 {
            regularity_acc += scan.meter.regularity;
            scanned_lines += 1;
            *votes.entry(scan.meter.name.clone()).or_default() += 1;
        }

        let prosody_fingerprint = compute_prosody_fingerprint(&scan);
        per_line.push(DocumentLineMetadata {
            index,
            text: text.to_owned(),
            word_tokens,
            dictionary_words,
            estimated_words,
            prosody_fingerprint,
            scan,
        });
    }

    let dominant_meter = dominant_from_votes(&votes, &per_line);

    let dictionary_coverage = if tokens_total == 0 {
        1.0
    } else {
        round4(known_total as f64 / tokens_total as f64)
    };
    // Kept as multiply-then-divide (not `round4`) so the floating-point
    // result is bit-for-bit what it has always been.
    let mean_syllables_per_line =
        (syllables_total as f64 * 10000.0 / line_count as f64).round() / 10000.0;
    let mean_meter_regularity = if scanned_lines == 0 {
        0.0
    } else {
        round4(regularity_acc / scanned_lines as f64)
    };

    DocumentMetadata {
        version: DOCUMENT_METADATA_VERSION,
        summary: DocumentSummary {
            line_count,
            non_empty_line_count,
            total_syllables: syllables_total,
            total_word_tokens: tokens_total,
            dictionary_word_tokens: known_total,
            dictionary_coverage,
            mean_syllables_per_line,
            mean_meter_regularity,
            dominant_meter,
        },
        lines: per_line,
        rhyme_map,
    }
}
/// Assembles a `LineScan` from a line and its stress analysis.
///
/// Meter is detected from the analysis' binary pattern, which is also
/// rendered into the `visual` string; the remaining fields are copied from
/// `analysis`.
fn build_line_scan(line: &str, analysis: &stress::LineStress) -> LineScan {
    let detected = meter::MeterDetector::detect(&analysis.binary_pattern);
    let meter_info = MeterInfo {
        name: detected.meter_name,
        foot_type: detected.foot_type,
        foot_count: detected.foot_count,
        regularity: detected.regularity,
    };

    LineScan {
        text: line.to_owned(),
        stressed_display: analysis.stressed_display.clone(),
        stress_pattern: analysis.stress_pattern.clone(),
        binary_pattern: analysis.binary_pattern.clone(),
        syllable_count: analysis.syllable_count,
        visual: format_stress_visual(&analysis.binary_pattern),
        meter: meter_info,
    }
}
pub fn contains(&self, word: &str) -> bool {
self.dict.lookup(word).is_some()
}
pub fn word_count(&self) -> usize {
self.dict.entry_count()
}
}
impl Default for Phonetik {
fn default() -> Self {
Self::new()
}
}
/// Dictionary information about a single word, as returned by
/// `Phonetik::lookup`. Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct WordInfo {
/// The looked-up word after `CmuDict::normalize`.
pub word: String,
/// Phonemes of the first pronunciation variant, decoded to strings.
pub phonemes: Vec<String>,
/// Syllable count of the first pronunciation variant.
pub syllable_count: usize,
/// Result of splitting the normalized word into `syllable_count` parts.
pub syllables: Vec<String>,
/// Stress values extracted from the first pronunciation variant.
pub stress_pattern: Vec<i32>,
/// Display string built from the normalized word and `stress_pattern`.
pub stress_display: String,
/// Number of pronunciation variants the dictionary holds for the word.
pub variant_count: usize,
}
/// One rhyme candidate returned by the `Phonetik` rhyme queries.
/// Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct RhymeMatch {
/// The rhyming word, as stored in the index that produced it.
pub word: String,
/// Phonemes of the rhyming word, as stored in the index.
pub phonemes: Vec<String>,
/// Syllable count of the rhyming word, as stored in the index.
pub syllables: usize,
/// Which tier produced this match.
pub rhyme_type: RhymeType,
/// `1.0` for perfect rhymes, `0.5` for near rhymes, and the index-reported
/// value for slant rhymes.
pub confidence: f64,
}
/// Rhyme classification. Serialized as the lowercase variant name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum RhymeType {
/// Produced by `perfect_rhymes`; `compare` also maps the analyzer labels
/// `"perfect"` and `"identity"` to this variant.
Perfect,
/// Produced by `slant_rhymes`, or the analyzer label `"slant"`.
Slant,
/// Produced by `near_rhymes`, or the analyzer label `"near"`.
Near,
/// Used by `compare` for any other analyzer label.
None,
}
/// Scansion of a single line, as produced by `Phonetik::scan*` and
/// `analyze_document`. Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct LineScan {
/// The input line, unchanged.
pub text: String,
/// Copied from the stress analysis of the line.
pub stressed_display: String,
/// Copied from the stress analysis of the line.
pub stress_pattern: Vec<i32>,
/// Copied from the stress analysis; drives meter detection and `visual`.
pub binary_pattern: Vec<i32>,
/// Copied from the stress analysis of the line.
pub syllable_count: usize,
/// `binary_pattern` rendered as space-separated marks: `/` for `1`, `x`
/// for anything else.
pub visual: String,
/// Meter detected from `binary_pattern`.
pub meter: MeterInfo,
}
/// Meter detection result for one line, copied field by field from the
/// meter detector's output. Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct MeterInfo {
/// Detector's meter name; also the key used when voting for the
/// document's dominant meter.
pub name: String,
/// Detector's foot type.
pub foot_type: String,
/// Detector's foot count.
pub foot_count: usize,
/// Detector's regularity score; averaged into the document summary.
pub regularity: f64,
}
/// Result of `Phonetik::compare` for a pair of dictionary words.
/// Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Comparison {
/// First word after `CmuDict::normalize`.
pub word1: String,
/// Second word after `CmuDict::normalize`.
pub word2: String,
/// Best similarity between the two words' pronunciations, rounded to
/// four decimal places.
pub similarity: f64,
/// Rhyme classification mapped from the rhyme analyzer's label.
pub rhyme_type: RhymeType,
/// Confidence reported by the rhyme analyzer.
pub confidence: f64,
/// Decoded phonemes of the first word's best-matching pronunciation.
pub phonemes1: Vec<String>,
/// Decoded phonemes of the second word's best-matching pronunciation.
pub phonemes2: Vec<String>,
}
/// Syllable count for one token of a line (see `Phonetik::syllable_counts`).
/// Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct WordSyllableCount {
/// The token exactly as produced by `tokenize_words`.
pub word: String,
/// Result of `Phonetik::syllable_count` for that token.
pub syllables: usize,
}
/// Per-line result of `Phonetik::syllable_counts`.
/// Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct LineSyllableCount {
/// One entry per token of the line, in order.
pub words: Vec<WordSyllableCount>,
/// Sum of `syllables` over `words`.
pub total: usize,
}
/// Value written to the `version` field of every `DocumentMetadata` that
/// `Phonetik::analyze_document` returns.
pub const DOCUMENT_METADATA_VERSION: u32 = 1;
/// Options for `Phonetik::analyze_document`. Deserialized from camelCase
/// keys; both fields may be omitted.
#[derive(Debug, Clone, Default, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct DocumentAnalyzeOptions {
/// Stress mode used for every line; `None` means `StressMode::default()`.
#[serde(default)]
pub stress_mode: Option<StressMode>,
/// When true, the rhyme map is computed and included in the result.
#[serde(default)]
pub include_rhyme_map: bool,
}
/// Full result of `Phonetik::analyze_document`. Serialized with camelCase
/// keys.
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DocumentMetadata {
/// Always `DOCUMENT_METADATA_VERSION`.
pub version: u32,
/// Document-wide aggregates.
pub summary: DocumentSummary,
/// One entry per input line, in input order.
pub lines: Vec<DocumentLineMetadata>,
/// Present only when the rhyme map was requested; omitted from the
/// serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub rhyme_map: Option<rhymemap::RhymeMapResult>,
}
/// Document-wide aggregates computed by `Phonetik::analyze_document`.
/// Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DocumentSummary {
/// Number of input lines, blank lines included.
pub line_count: usize,
/// Number of lines that are non-empty after trimming whitespace.
pub non_empty_line_count: usize,
/// Sum of the per-line scan syllable counts.
pub total_syllables: usize,
/// Total number of word tokens found by the stress analyzer.
pub total_word_tokens: usize,
/// How many of those tokens were flagged as found in the dictionary.
pub dictionary_word_tokens: usize,
/// `dictionary_word_tokens / total_word_tokens`, rounded to four decimal
/// places; `1.0` when there are no tokens.
pub dictionary_coverage: f64,
/// `total_syllables / line_count` (blank lines count), rounded to four
/// decimal places; `0.0` for an empty document.
pub mean_syllables_per_line: f64,
/// Mean meter regularity over lines with at least one syllable, rounded
/// to four decimal places; `0.0` when there are none.
pub mean_meter_regularity: f64,
/// The meter that most lines were scanned as.
pub dominant_meter: DocumentDominantMeter,
}
/// The meter shared by the most lines of a document. Serialized with
/// camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DocumentDominantMeter {
/// Meter name with the most supporting lines; when counts tie, the name
/// that compares greatest wins. `"none"` when no line had syllables.
pub meter_name: String,
/// Foot type of the first line scanned as `meter_name`; `"none"` when
/// there is no dominant meter, `"unknown"` if no such line is found.
pub foot_type: String,
/// Foot count of that same line; `0` when there is none.
pub foot_count: usize,
/// Number of lines (with at least one syllable) scanned as `meter_name`.
pub supporting_line_count: usize,
}
/// Per-line entry of a `DocumentMetadata`. Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DocumentLineMetadata {
/// Zero-based position of the line in the input slice.
pub index: usize,
/// The input line, unchanged.
pub text: String,
/// Number of word tokens the stress analyzer found on the line.
pub word_tokens: usize,
/// How many of those tokens were flagged as found in the dictionary.
pub dictionary_words: usize,
/// `word_tokens - dictionary_words` (saturating at zero).
pub estimated_words: usize,
/// `"{syllable_count}:{foot_type}:{foot_count}:{bits}"`, where `bits` is
/// the binary stress pattern written as `0`/`1` characters.
pub prosody_fingerprint: String,
/// Full scan of the line.
pub scan: LineScan,
}
/// Picks the dominant meter from per-meter line counts.
///
/// The meter with the most votes wins; equal counts are broken in favour of
/// the name that compares greatest, so the result does not depend on hash-map
/// iteration order. Foot type and count are taken from the first line (with
/// at least one syllable) that was scanned as the winning meter, falling back
/// to `"unknown"` / `0` if no such line exists. With no votes at all the
/// result is the `"none"` placeholder.
fn dominant_from_votes(
    votes: &std::collections::HashMap<String, usize>,
    line_metas: &[DocumentLineMetadata],
) -> DocumentDominantMeter {
    let winner = votes.iter().max_by(|&(name_a, count_a), &(name_b, count_b)| {
        count_a.cmp(count_b).then_with(|| name_a.cmp(name_b))
    });

    let (meter_name, supporting_line_count) = match winner {
        Some((name, count)) => (name.clone(), *count),
        None => {
            return DocumentDominantMeter {
                meter_name: "none".into(),
                foot_type: "none".into(),
                foot_count: 0,
                supporting_line_count: 0,
            }
        }
    };

    let (foot_type, foot_count) = line_metas
        .iter()
        .map(|line| &line.scan)
        .find(|scan| scan.syllable_count > 0 && scan.meter.name == meter_name)
        .map(|scan| (scan.meter.foot_type.clone(), scan.meter.foot_count))
        .unwrap_or_else(|| ("unknown".into(), 0));

    DocumentDominantMeter {
        meter_name,
        foot_type,
        foot_count,
        supporting_line_count,
    }
}
/// Builds a compact textual fingerprint of a scanned line:
/// `"{syllable_count}:{foot_type}:{foot_count}:{bits}"`, where `bits` writes
/// each binary-pattern entry as `1` (entry equals 1) or `0` (anything else).
fn compute_prosody_fingerprint(scan: &LineScan) -> String {
    let mut bits = String::with_capacity(scan.binary_pattern.len());
    for &flag in &scan.binary_pattern {
        bits.push(if flag == 1 { '1' } else { '0' });
    }

    let meter = &scan.meter;
    format!(
        "{}:{}:{}:{}",
        scan.syllable_count, meter.foot_type, meter.foot_count, bits
    )
}
/// Renders a binary stress pattern as space-separated marks: `/` for an
/// entry equal to 1, `x` for anything else. An empty pattern gives an empty
/// string.
fn format_stress_visual(binary: &[i32]) -> String {
    let marks: Vec<&str> = binary
        .iter()
        .map(|&beat| if beat == 1 { "/" } else { "x" })
        .collect();
    marks.join(" ")
}
/// Splits a line into word tokens.
///
/// A token is a maximal run of alphabetic characters, apostrophes (`'`) and
/// hyphens (`-`); every other character is a separator. Runs that contain no
/// alphabetic character at all (a free-standing dash, `--`, a lone quote) are
/// discarded so that punctuation is never reported as a word. Apostrophes
/// and hyphens attached to letters are kept as-is (`don't`, `'tis`,
/// `mother-in-law`).
fn tokenize_words(line: &str) -> Vec<String> {
    /// Moves the pending token into `tokens` if it contains a letter;
    /// otherwise drops it. `current` is left empty either way.
    fn flush(current: &mut String, tokens: &mut Vec<String>) {
        if current.chars().any(char::is_alphabetic) {
            tokens.push(std::mem::take(current));
        } else {
            current.clear();
        }
    }

    let mut tokens = Vec::new();
    let mut current = String::new();
    for c in line.chars() {
        if c.is_alphabetic() || c == '\'' || c == '-' {
            current.push(c);
        } else {
            flush(&mut current, &mut tokens);
        }
    }
    flush(&mut current, &mut tokens);
    tokens
}
/// Rough syllable estimate for words that are not in the dictionary.
///
/// Counts maximal runs of the ASCII vowels `a e i o u y` (either case) and
/// never returns less than 1.
fn estimate_syllable_count(word: &str) -> usize {
    let (groups, _) = word.chars().fold((0usize, false), |(groups, prev_vowel), ch| {
        let vowel = matches!(ch.to_ascii_lowercase(), 'a' | 'e' | 'i' | 'o' | 'u' | 'y');
        // A new group starts whenever a vowel follows a non-vowel (or the start).
        (groups + usize::from(vowel && !prev_vowel), vowel)
    });
    groups.max(1)
}
// Unit tests for the `Phonetik` facade and the private helpers in this file.
// Most tests build a full `Phonetik` and therefore depend on the contents of
// the loaded dictionary.
#[cfg(test)]
mod tests {
use super::*;
// Shared fixture: builds a fresh `Phonetik` via `Phonetik::new()`.
fn ph() -> Phonetik {
Phonetik::new()
}
// A clone must answer queries exactly like the original.
#[test]
fn clone_shares_data() {
let a = ph();
let b = a.clone();
assert_eq!(a.word_count(), b.word_count());
assert!(a.lookup("cat").is_some());
assert!(b.lookup("cat").is_some());
}
// `lookup` reports the normalized (upper-case) word and non-empty details.
#[test]
fn lookup_returns_word_info() {
let p = ph();
let info = p.lookup("extraordinary").unwrap();
assert_eq!(info.word, "EXTRAORDINARY");
assert!(info.syllable_count >= 5);
assert!(!info.phonemes.is_empty());
assert!(!info.syllables.is_empty());
assert!(info.variant_count >= 1);
}
// Words missing from the dictionary yield `None`.
#[test]
fn lookup_unknown_word() {
let p = ph();
assert!(p.lookup("xyzzyplugh").is_none());
}
#[test]
fn syllable_count_known_word() {
let p = ph();
assert_eq!(p.syllable_count("cat"), 1);
assert_eq!(p.syllable_count("hello"), 2);
}
// Unknown words go through the estimator, which never returns 0.
#[test]
fn syllable_count_unknown_falls_back_to_estimate() {
let p = ph();
let count = p.syllable_count("xyzzyplugh");
assert!(count >= 1);
}
// One result per input line, each with a plausible total.
#[test]
fn syllable_counts_batch() {
let p = ph();
let result = p.syllable_counts(&["hello world", "the cat"]);
assert_eq!(result.len(), 2);
assert!(result[0].total >= 3);
assert!(result[1].total >= 2);
}
// The combined query lists the perfect tier first.
#[test]
fn rhymes_returns_perfect_first() {
let p = ph();
let results = p.rhymes("cat", 20);
assert!(!results.is_empty());
assert_eq!(results[0].rhyme_type, RhymeType::Perfect);
}
#[test]
fn perfect_rhymes_known_pair() {
let p = ph();
let results = p.perfect_rhymes("cat");
let words: Vec<&str> = results.iter().map(|r| r.word.as_str()).collect();
assert!(words.contains(&"BAT"));
}
// Every slant result carries the `Slant` tag.
#[test]
fn slant_rhymes_returns_results() {
let p = ph();
let results = p.slant_rhymes("love", 20);
assert!(!results.is_empty());
for r in &results {
assert_eq!(r.rhyme_type, RhymeType::Slant);
}
}
// Every near result carries the `Near` tag.
#[test]
fn near_rhymes_returns_results() {
let p = ph();
let results = p.near_rhymes("night", 20);
assert!(!results.is_empty());
for r in &results {
assert_eq!(r.rhyme_type, RhymeType::Near);
}
}
#[test]
fn rhymes_respects_limit() {
let p = ph();
let results = p.rhymes("the", 5);
assert!(results.len() <= 5);
}
// No word may appear twice across the perfect/slant/near tiers.
#[test]
fn rhymes_deduplicates() {
let p = ph();
let results = p.rhymes("cat", 200);
let mut words: Vec<&str> = results.iter().map(|r| r.word.as_str()).collect();
let len_before = words.len();
words.sort();
words.dedup();
assert_eq!(words.len(), len_before, "duplicates found in rhyme results");
}
// A ten-syllable line that should be detected as iambic.
#[test]
fn scan_iambic_pentameter() {
let p = ph();
let scan = p.scan("uneasy lies the head that wears the crown");
assert_eq!(scan.syllable_count, 10);
assert!(scan.meter.name.contains("iambic"));
assert!(!scan.visual.is_empty());
}
#[test]
fn scan_empty_line() {
let p = ph();
let scan = p.scan("");
assert_eq!(scan.syllable_count, 0);
}
#[test]
fn compare_rhyming_pair() {
let p = ph();
let cmp = p.compare("cat", "bat").unwrap();
assert!(cmp.similarity > 0.5);
assert_eq!(cmp.rhyme_type, RhymeType::Perfect);
}
// `compare` needs both words in the dictionary.
#[test]
fn compare_unknown_word_returns_none() {
let p = ph();
assert!(p.compare("cat", "xyzzyplugh").is_none());
}
#[test]
fn rhyme_map_finds_patterns() {
let p = ph();
let result = p.rhyme_map(&["the cat sat on the mat", "the bat sat on the hat"]);
assert!(!result.patterns.is_empty());
}
#[test]
fn contains_known_and_unknown() {
let p = ph();
assert!(p.contains("hello"));
assert!(!p.contains("xyzzyplugh"));
}
// Sanity check that the full dictionary was loaded.
#[test]
fn word_count_is_substantial() {
let p = ph();
assert!(p.word_count() > 100_000);
}
// Helper tests below do not need a `Phonetik` instance.
#[test]
fn format_stress_visual_fn() {
assert_eq!(format_stress_visual(&[0, 1, 0, 1]), "x / x /");
assert_eq!(format_stress_visual(&[]), "");
}
// Punctuation separates tokens; apostrophes stay inside them.
#[test]
fn tokenize_words_fn() {
assert_eq!(tokenize_words("hello, world!"), vec!["hello", "world"]);
assert_eq!(tokenize_words("don't stop"), vec!["don't", "stop"]);
assert!(tokenize_words("").is_empty());
}
// "brr" has no vowels and exercises the minimum-of-one rule.
#[test]
fn estimate_syllable_count_fn() {
assert_eq!(estimate_syllable_count("cat"), 1);
assert_eq!(estimate_syllable_count("hello"), 2);
assert_eq!(estimate_syllable_count("brr"), 1); }
#[test]
fn default_impl_works() {
let p = Phonetik::default();
assert!(p.word_count() > 100_000);
}
}