mod lang_metrics;
mod lexicon;
mod metrics;
mod passive;
mod pipeline;
mod profile;
mod segment;
mod store;
pub(crate) mod violations;
pub(crate) use pipeline::refresh_book;
pub(crate) use profile::{VoiceProfile, VoiceScope};
pub(crate) use store::ProseStore;
use std::collections::{HashMap, HashSet};
/// Language a piece of prose is written in.
///
/// The five named variants are the languages for which `is_supported`
/// returns `true`; every other label is carried verbatim (trimmed and
/// lowercased) inside `Other`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum ProseLanguage {
    En,
    Ru,
    De,
    Fr,
    Es,
    Other(String),
}

impl ProseLanguage {
    /// Parses a free-form language label.
    ///
    /// The label is trimmed and lowercased before matching, so `" English "`
    /// and `"en"` both yield `En`. An empty label also yields `En`. Labels
    /// that match none of the known aliases become `Other` holding the
    /// normalized label.
    pub(crate) fn from_label(s: &str) -> ProseLanguage {
        let normalized = s.trim().to_lowercase();
        match normalized.as_str() {
            "" | "en" | "eng" | "english" => Self::En,
            "ru" | "rus" | "russian" | "русский" => Self::Ru,
            "de" | "ger" | "german" | "deutsch" => Self::De,
            "fr" | "fre" | "french" | "français" | "francais" => Self::Fr,
            "es" | "spa" | "spanish" | "español" | "espanol" | "castellano" => Self::Es,
            unknown => Self::Other(unknown.to_owned()),
        }
    }

    /// Returns the two-letter code of the language.
    ///
    /// Every `Other` value reports the fixed string `"other"`, regardless of
    /// the label it carries.
    pub(crate) fn as_code(&self) -> &str {
        match self {
            Self::Other(_) => "other",
            Self::En => "en",
            Self::Ru => "ru",
            Self::De => "de",
            Self::Fr => "fr",
            Self::Es => "es",
        }
    }

    /// Returns `true` for every variant except `Other`.
    pub(crate) fn is_supported(&self) -> bool {
        match self {
            Self::Other(_) => false,
            Self::En | Self::Ru | Self::De | Self::Fr | Self::Es => true,
        }
    }
}
/// Decides which prose language to analyse with, plus an optional
/// human-readable note about the decision.
///
/// Resolution order:
/// 1. `explicit`, when it is present and not blank after trimming;
/// 2. `project_language`, when it is not blank after trimming;
/// 3. `ProseLanguage::En` as the fallback, always accompanied by a note.
///
/// For steps 1 and 2 a note is returned only when the resolved language is
/// not supported (i.e. it parsed to `ProseLanguage::Other`).
pub(crate) fn resolve_prose_language(
    explicit: Option<&str>,
    project_language: &str,
) -> (ProseLanguage, Option<String>) {
    // An explicit override wins, unless it is blank.
    match explicit.map(str::trim) {
        Some(code) if !code.is_empty() => {
            let lang = ProseLanguage::from_label(code);
            let note = if lang.is_supported() {
                None
            } else {
                Some(format!(
                    "prose.language `{code}` is not an embedded language; \
                     Tier-1 rhythm metrics only"
                ))
            };
            return (lang, note);
        }
        _ => {}
    }

    let proj = project_language.trim();
    if proj.is_empty() {
        // Nothing configured anywhere: fall back to English and say so.
        let note = String::from(
            "prose_language not set; using EN word lists for \
             language-sensitive metrics",
        );
        return (ProseLanguage::En, Some(note));
    }

    let lang = ProseLanguage::from_label(proj);
    let note = if lang.is_supported() {
        None
    } else {
        Some(format!(
            "project language `{proj}` is not an embedded language; \
             language-sensitive metrics unavailable (Tier-1 rhythm metrics still computed)"
        ))
    };
    (lang, note)
}
/// Sensory channel that a lexicon word can be tagged with.
///
/// `CompiledLexicon::sensory_channel` maps a token to one of these values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) enum SensoryChannel {
/// Sight.
Visual,
/// Hearing.
Auditory,
/// Smell.
Olfactory,
/// Touch.
Tactile,
/// Bodily movement.
Kinesthetic,
}
/// Splits `text` into lowercase word tokens.
///
/// The text is split on whitespace; each piece has leading and trailing
/// non-alphanumeric characters stripped (interior punctuation such as
/// hyphens and apostrophes is kept) and is then lowercased. Pieces that end
/// up empty, e.g. a lone dash, are dropped.
pub(crate) fn tokenize(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    for piece in text.split_whitespace() {
        let core = piece.trim_matches(|c: char| !c.is_alphanumeric());
        let lowered = core.to_lowercase();
        if !lowered.is_empty() {
            tokens.push(lowered);
        }
    }
    tokens
}
/// Word lists for one language, arranged for token-level lookups.
///
/// Built by `CompiledLexicon::for_language_with` from the lists returned by
/// `lexicon::lexicon` plus caller-supplied extra phrases.
pub(crate) struct CompiledLexicon {
/// Single-token modal markers, matched against individual tokens.
modal_unigrams: HashSet<&'static str>,
/// Two-token modal phrases, matched against adjacent token pairs.
modal_bigrams: Vec<[&'static str; 2]>,
/// Three-token modal phrases, matched against adjacent token triples.
modal_trigrams: Vec<[&'static str; 3]>,
/// Interiority phrases, each stored as its sequence of tokens.
interiority: Vec<Vec<&'static str>>,
/// Particles counted by `erlebte_particle_count`.
/// NOTE(review): presumably German "erlebte Rede" particles — confirm against `lexicon`.
erlebte: HashSet<&'static str>,
/// Token to sensory-channel mapping.
sensory: HashMap<&'static str, SensoryChannel>,
/// Tokens reported by `is_passive_exception`.
passive_exceptions: HashSet<&'static str>,
}
/// Returns a `'static` lowercase copy of `s`, interned process-wide.
///
/// The lexicon stores `&'static str` so that caller-supplied phrases can sit
/// next to the built-in static word lists. Each distinct lowercased word is
/// leaked at most once: later calls with an equal word return the same
/// allocation, so rebuilding a lexicon with the same extra phrases does not
/// grow memory.
fn leak(s: &str) -> &'static str {
    // `HashSet::new` is not `const`, hence the `Option` that is filled lazily.
    static INTERNED: std::sync::Mutex<Option<HashSet<&'static str>>> =
        std::sync::Mutex::new(None);

    let lowered = s.to_lowercase();
    // A poisoned lock is still usable: the only mutation is a single
    // `insert`, which cannot leave the set in an inconsistent state.
    let mut guard = INTERNED
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    let pool = guard.get_or_insert_with(HashSet::new);

    if let Some(&existing) = pool.get(lowered.as_str()) {
        return existing;
    }
    let fresh: &'static str = Box::leak(lowered.into_boxed_str());
    pool.insert(fresh);
    fresh
}
impl CompiledLexicon {
#[cfg(test)]
pub(crate) fn for_language(lang: &ProseLanguage) -> CompiledLexicon {
Self::for_language_with(lang, &[], &[])
}
pub(crate) fn for_language_with(
lang: &ProseLanguage,
extra_modal: &[String],
extra_interiority: &[String],
) -> CompiledLexicon {
let lx = lexicon::lexicon(lang);
let mut modal_unigrams: HashSet<&'static str> =
lx.modal_unigrams.iter().copied().collect();
let mut modal_bigrams: Vec<[&'static str; 2]> = lx.modal_bigrams.to_vec();
let mut modal_trigrams: Vec<[&'static str; 3]> = lx.modal_trigrams.to_vec();
for raw in extra_modal {
let w: Vec<&str> = raw.split_whitespace().collect();
match w.len() {
1 => {
modal_unigrams.insert(leak(w[0]));
}
2 => modal_bigrams.push([leak(w[0]), leak(w[1])]),
3 => modal_trigrams.push([leak(w[0]), leak(w[1]), leak(w[2])]),
_ => {}
}
}
let mut interiority: Vec<Vec<&'static str>> = lx
.interiority
.iter()
.map(|p| p.split_whitespace().collect())
.collect();
for raw in extra_interiority {
let toks: Vec<&'static str> = raw.split_whitespace().map(leak).collect();
if !toks.is_empty() {
interiority.push(toks);
}
}
CompiledLexicon {
modal_unigrams,
modal_bigrams,
modal_trigrams,
interiority,
erlebte: lx.erlebte_particles.iter().copied().collect(),
sensory: lx.sensory.iter().copied().collect(),
passive_exceptions: lx.passive_exceptions.iter().copied().collect(),
}
}
pub(crate) fn count_modal_tokens(&self, tokens: &[&str]) -> usize {
let mut hits = 0;
for i in 0..tokens.len() {
if self.modal_unigrams.contains(tokens[i]) {
hits += 1;
}
if i + 1 < tokens.len() {
for bi in &self.modal_bigrams {
if tokens[i] == bi[0] && tokens[i + 1] == bi[1] {
hits += 1;
}
}
}
if i + 2 < tokens.len() {
for tri in &self.modal_trigrams {
if tokens[i] == tri[0] && tokens[i + 1] == tri[1] && tokens[i + 2] == tri[2] {
hits += 1;
}
}
}
}
hits
}
pub(crate) fn sentence_has_interiority(&self, tokens: &[&str]) -> bool {
self.interiority
.iter()
.any(|phrase| contains_subsequence(tokens, phrase))
}
pub(crate) fn erlebte_particle_count(&self, tokens: &[&str]) -> usize {
tokens.iter().filter(|t| self.erlebte.contains(*t)).count()
}
pub(crate) fn sensory_channel(&self, token: &str) -> Option<SensoryChannel> {
self.sensory.get(token).copied()
}
pub(crate) fn is_passive_exception(&self, token: &str) -> bool {
self.passive_exceptions.contains(token)
}
}
/// Returns `true` when `needle` occurs in `haystack` as a contiguous run of
/// equal tokens.
///
/// An empty `needle` never matches, and a `needle` longer than `haystack`
/// cannot match.
fn contains_subsequence(haystack: &[&str], needle: &[&str]) -> bool {
    // The emptiness check must come first: `windows(0)` panics.
    !needle.is_empty()
        && haystack
            .windows(needle.len())
            .any(|window| window == needle)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the production tokenizer.
    fn toks(text: &str) -> Vec<String> {
        tokenize(text)
    }

    /// Borrows owned tokens as the `&str` slice the lexicon methods take.
    fn refs(v: &[String]) -> Vec<&str> {
        v.iter().map(String::as_str).collect()
    }

    /// Tokenizes `sentence` and counts modal hits with the lexicon for `lang`.
    fn modal_hits(lang: ProseLanguage, sentence: &str) -> usize {
        let compiled = CompiledLexicon::for_language(&lang);
        let owned = toks(sentence);
        compiled.count_modal_tokens(&refs(&owned))
    }

    /// Tokenizes `sentence` and checks it for interiority phrases.
    fn has_interiority(compiled: &CompiledLexicon, sentence: &str) -> bool {
        let owned = toks(sentence);
        compiled.sentence_has_interiority(&refs(&owned))
    }

    #[test]
    fn resolution_chain() {
        // An explicit setting beats the project language.
        let (from_explicit, _) = resolve_prose_language(Some("de"), "english");
        assert_eq!(from_explicit, ProseLanguage::De);

        // Without an explicit setting, the project language decides.
        let (from_project, _) = resolve_prose_language(None, "russian");
        assert_eq!(from_project, ProseLanguage::Ru);
        let (from_project, _) = resolve_prose_language(None, "Français");
        assert_eq!(from_project, ProseLanguage::Fr);

        // Nothing set at all: English plus an explanatory note.
        let (lang, note) = resolve_prose_language(None, "");
        assert_eq!(lang, ProseLanguage::En);
        assert!(note.unwrap().contains("not set"));

        // Unknown explicit language: carried as `Other`, with a note.
        let (lang, note) = resolve_prose_language(Some("italian"), "english");
        assert_eq!(lang, ProseLanguage::Other("italian".into()));
        assert!(!lang.is_supported());
        assert!(note.unwrap().contains("rhythm"));
    }

    #[test]
    fn codes_round_trip() {
        let expectations = [
            ("English", "en"),
            ("ru", "ru"),
            ("Deutsch", "de"),
            ("francais", "fr"),
            ("español", "es"),
        ];
        for (label, code) in expectations {
            assert_eq!(ProseLanguage::from_label(label).as_code(), code);
        }
        assert_eq!(ProseLanguage::Other("x".into()).as_code(), "other");
    }

    #[test]
    fn tokenizer_strips_edges_keeps_internal() {
        assert_eq!(toks("«Hello,» she — said…"), ["hello", "she", "said"]);
        assert_eq!(
            toks("По-видимому, semble-t-il É"),
            ["по-видимому", "semble-t-il", "é"]
        );
    }

    #[test]
    fn modal_en_unigrams() {
        let hits = modal_hits(
            ProseLanguage::En,
            "She might have known, perhaps, but could not be sure.",
        );
        assert_eq!(hits, 3);
    }

    #[test]
    fn modal_ru_bigram_and_trigram() {
        let hits = modal_hits(
            ProseLanguage::Ru,
            "Это, вероятно, должно быть так, судя по всему.",
        );
        assert_eq!(hits, 3);
    }

    #[test]
    fn modal_de_inflected_forms() {
        let hits = modal_hits(
            ProseLanguage::De,
            "Sie könnten es wohl vermutlich gewusst haben.",
        );
        assert_eq!(hits, 3);
    }

    #[test]
    fn modal_fr_bigram_and_trigram() {
        let hits = modal_hits(
            ProseLanguage::Fr,
            "Apparemment, sans doute, on aurait dit un rêve.",
        );
        assert_eq!(hits, 3);
    }

    #[test]
    fn modal_es_bigram_and_trigram() {
        let hits = modal_hits(ProseLanguage::Es, "A lo mejor, tal vez, quizás era cierto.");
        assert_eq!(hits, 3);
    }

    #[test]
    fn interiority_token_level_no_false_substring() {
        let en = CompiledLexicon::for_language(&ProseLanguage::En);
        assert!(has_interiority(&en, "she thought it was over"));
        assert!(has_interiority(&en, "she knew the truth"));
        assert!(!has_interiority(&en, "the wind was cold"));
    }

    #[test]
    fn interiority_other_languages() {
        let samples = [
            (ProseLanguage::Ru, "ей казалось, что всё кончено"),
            (ProseLanguage::De, "sie dachte an den See"),
            (ProseLanguage::Fr, "elle pensait à lui"),
            (ProseLanguage::Es, "ella pensaba en voz baja"),
        ];
        for (lang, sentence) in samples {
            let compiled = CompiledLexicon::for_language(&lang);
            assert!(has_interiority(&compiled, sentence), "{}", lang.as_code());
        }
    }

    #[test]
    fn de_erlebte_particles_only_for_de() {
        let de = CompiledLexicon::for_language(&ProseLanguage::De);
        let german = toks("das war ja doch wohl klar");
        assert_eq!(de.erlebte_particle_count(&refs(&german)), 3);

        let en = CompiledLexicon::for_language(&ProseLanguage::En);
        let english = toks("yes indeed of course");
        assert_eq!(en.erlebte_particle_count(&refs(&english)), 0);
    }

    #[test]
    fn sensory_channels_per_language() {
        let table = [
            (ProseLanguage::En, "shadow", SensoryChannel::Visual),
            (ProseLanguage::En, "murmur", SensoryChannel::Auditory),
            (ProseLanguage::Ru, "запах", SensoryChannel::Olfactory),
            (ProseLanguage::De, "kalt", SensoryChannel::Tactile),
            (ProseLanguage::Fr, "tremblement", SensoryChannel::Kinesthetic),
            (ProseLanguage::Es, "silencio", SensoryChannel::Auditory),
        ];
        for (lang, word, channel) in table {
            let compiled = CompiledLexicon::for_language(&lang);
            assert_eq!(compiled.sensory_channel(word), Some(channel), "{}", word);
            // A nonsense token is never tagged.
            assert_eq!(compiled.sensory_channel("zzqq"), None);
        }
    }

    #[test]
    fn passive_exceptions_loaded() {
        let en = CompiledLexicon::for_language(&ProseLanguage::En);
        assert!(en.is_passive_exception("thought"));

        let ru = CompiledLexicon::for_language(&ProseLanguage::Ru);
        assert!(ru.is_passive_exception("казалось"));

        let de = CompiledLexicon::for_language(&ProseLanguage::De);
        assert!(de.is_passive_exception("klar"));
    }
}