// inkhaven 1.4.13
// Inkhaven — TUI literary work editor for Typst books

//! NARR-1 — narrative-voice (`prose`) profiling. Deterministic, zero-AI,
//! zero-runtime-dep statistical voice metrics over a book's chapters, in five
//! embedded languages (EN/RU/DE/FR/ES).
//!
//! This phase (N-P0) lands the language model: the [`ProseLanguage`] key, its
//! resolution chain, the [`SensoryChannel`] enum, the embedded word lists
//! (`lexicon`), and the lookup primitives ([`CompiledLexicon`]) + tokenizer the
//! metric passes build on. Metric formulas land in N-P1/N-P2.

mod lang_metrics;
mod lexicon;
mod metrics;
mod passive;
mod pipeline;
mod profile;
mod segment;
mod store;
pub(crate) mod violations;

// Public (in-crate) surface for the `inkhaven prose` CLI / Bund / TUI layer.
pub(crate) use pipeline::refresh_book;
pub(crate) use profile::{VoiceProfile, VoiceScope};
pub(crate) use store::ProseStore;

use std::collections::{HashMap, HashSet};

/// The language key every language-sensitive metric is dispatched on. `Other`
/// books still get the language-agnostic Tier-1 rhythm metrics; language-keyed
/// metrics are reported as unavailable.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum ProseLanguage {
    /// English (also the result for an empty label).
    En,
    /// Russian.
    Ru,
    /// German.
    De,
    /// French.
    Fr,
    /// Spanish.
    Es,
    /// Any label that is not one of the five embedded languages. When built
    /// by [`ProseLanguage::from_label`] the payload is the trimmed, lowercased
    /// label.
    Other(String),
}

impl ProseLanguage {
    /// Map a config label (code or full name, any case, surrounding whitespace
    /// ignored) to a language. Empty → English. Unknown → `Other` carrying the
    /// normalized label.
    ///
    /// Accepted spellings per language are the ISO 639-1 code, the ISO 639-2
    /// codes (both the bibliographic and terminologic forms where they differ:
    /// `ger`/`deu`, `fre`/`fra`), the English name, and the native name.
    pub(crate) fn from_label(s: &str) -> ProseLanguage {
        match s.trim().to_lowercase().as_str() {
            "" | "en" | "eng" | "english" => ProseLanguage::En,
            "ru" | "rus" | "russian" | "русский" => ProseLanguage::Ru,
            "de" | "ger" | "deu" | "german" | "deutsch" => ProseLanguage::De,
            "fr" | "fre" | "fra" | "french" | "français" | "francais" => ProseLanguage::Fr,
            "es" | "spa" | "spanish" | "español" | "espanol" | "castellano" => {
                ProseLanguage::Es
            }
            other => ProseLanguage::Other(other.to_string()),
        }
    }

    /// Stable storage code (`en`/`ru`/`de`/`fr`/`es`/`other`). Every unsupported
    /// language collapses to `other`; the original label is not recoverable
    /// from the code. The result is a literal, so it does not borrow `self`.
    pub(crate) fn as_code(&self) -> &'static str {
        match self {
            ProseLanguage::En => "en",
            ProseLanguage::Ru => "ru",
            ProseLanguage::De => "de",
            ProseLanguage::Fr => "fr",
            ProseLanguage::Es => "es",
            ProseLanguage::Other(_) => "other",
        }
    }

    /// Whether the language has embedded word lists (i.e. language-sensitive
    /// metrics can be computed). True for everything except `Other`.
    pub(crate) fn is_supported(&self) -> bool {
        !matches!(self, ProseLanguage::Other(_))
    }
}

/// Pick the language the prose metrics run under.
///
/// Precedence: a non-blank explicit `prose.language` override, then the
/// project language, then English as the last resort. The second tuple element
/// is an optional human-readable note for the `prose profile` header; it is
/// present when the chosen label is not an embedded language, or when nothing
/// was configured and English was assumed.
pub(crate) fn resolve_prose_language(
    explicit: Option<&str>,
    project_language: &str,
) -> (ProseLanguage, Option<String>) {
    // A blank override is treated exactly like no override at all.
    let override_label = explicit.map(str::trim).filter(|label| !label.is_empty());

    match (override_label, project_language.trim()) {
        // Explicit override wins regardless of the project language.
        (Some(code), _) => {
            let lang = ProseLanguage::from_label(code);
            let note = if lang.is_supported() {
                None
            } else {
                Some(format!(
                    "prose.language `{code}` is not an embedded language; \
                     Tier-1 rhythm metrics only"
                ))
            };
            (lang, note)
        }
        // Nothing configured anywhere: assume English and say so.
        (None, "") => (
            ProseLanguage::En,
            Some(String::from(
                "prose_language not set; using EN word lists for \
                 language-sensitive metrics",
            )),
        ),
        // Fall back to the project language.
        (None, proj) => {
            let lang = ProseLanguage::from_label(proj);
            let note = if lang.is_supported() {
                None
            } else {
                Some(format!(
                    "project language `{proj}` is not an embedded language; \
                     language-sensitive metrics unavailable (Tier-1 rhythm metrics still computed)"
                ))
            };
            (lang, note)
        }
    }
}

/// The five sensory vocabularies for Tier-2 channel balance. A token maps to at
/// most one channel (see [`CompiledLexicon::sensory_channel`]); the type is
/// `Copy` + `Hash`, so it can be used directly as a map key or counter index.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) enum SensoryChannel {
    /// Sight vocabulary (the tests in this file expect EN `shadow` here).
    Visual,
    /// Sound vocabulary (EN `murmur`, ES `silencio` in the tests).
    Auditory,
    /// Smell vocabulary (RU `запах` in the tests).
    Olfactory,
    /// Touch / temperature vocabulary (DE `kalt` in the tests).
    Tactile,
    /// Bodily-movement vocabulary (FR `tremblement` in the tests).
    Kinesthetic,
}

/// Lowercase, punctuation-trimmed word tokens. Unicode-aware.
///
/// Text is split on whitespace and on sentence-level dashes and ellipses
/// (`‒ – — ―` and `…`), so an unspaced em dash (`hesitated—perhaps`) separates
/// two words instead of gluing them into a single token that no lexicon entry
/// can ever match. Internal hyphens and apostrophes are preserved
/// (`по-видимому`, `semble-t-il`, `l'ombre`); edge punctuation and quotes are
/// stripped, and pieces with no alphanumeric character are dropped.
pub(crate) fn tokenize(text: &str) -> Vec<String> {
    // Word breaks besides whitespace: U+2012 figure dash, U+2013 en dash,
    // U+2014 em dash, U+2015 horizontal bar, and U+2026 ellipsis. The ASCII
    // hyphen-minus and U+2010/U+2011 hyphens are deliberately NOT breaks, so
    // hyphenated words stay whole.
    let is_break =
        |c: char| c.is_whitespace() || matches!(c, '\u{2012}'..='\u{2015}' | '\u{2026}');

    text.split(is_break)
        .map(|piece| piece.trim_matches(|c: char| !c.is_alphanumeric()))
        .filter(|word| !word.is_empty())
        .map(str::to_lowercase)
        .collect()
}

/// Runtime lookup tables built once per profile computation from the embedded
/// [`lexicon::Lexicon`] for a language. Hash sets give O(1) membership; the tiny
/// bigram/trigram arrays are scanned linearly.
pub(crate) struct CompiledLexicon {
    /// Single-token hedging words; one hit per matching token.
    modal_unigrams: HashSet<&'static str>,
    /// Two-token hedging phrases, matched against consecutive tokens.
    modal_bigrams: Vec<[&'static str; 2]>,
    /// Three-token hedging phrases, matched against consecutive tokens.
    modal_trigrams: Vec<[&'static str; 3]>,
    /// FID marker phrases pre-split into token sequences.
    interiority: Vec<Vec<&'static str>>,
    /// *Erlebte Rede* modal particles, copied from the lexicon's
    /// `erlebte_particles` list.
    erlebte: HashSet<&'static str>,
    /// Token → sensory channel lookup.
    sensory: HashMap<&'static str, SensoryChannel>,
    /// Tokens excluded from passive-voice detection.
    passive_exceptions: HashSet<&'static str>,
}

/// Lowercase a config-supplied token and hand it out as `&'static str`.
///
/// The lexicon tables store `&'static str` so the embedded word lists need no
/// lifetime threaded through every lookup; config extras get the same shape by
/// being leaked. Lexicons are rebuilt repeatedly during a run, so the leaked
/// strings are interned: each distinct lowercased token is allocated and
/// leaked once, and later calls return the same slice. Total leaked memory is
/// therefore bounded by the set of distinct tokens, not by the number of
/// lexicon builds.
fn leak(s: &str) -> &'static str {
    static INTERNED: std::sync::OnceLock<std::sync::Mutex<HashSet<&'static str>>> =
        std::sync::OnceLock::new();

    let lowered = s.to_lowercase();
    let mut pool = INTERNED
        .get_or_init(|| std::sync::Mutex::new(HashSet::new()))
        .lock()
        // The pool only ever holds plain `&'static str` values, so it is still
        // consistent after another thread panicked while holding the lock.
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    if let Some(&existing) = pool.get(lowered.as_str()) {
        return existing;
    }
    let leaked: &'static str = Box::leak(lowered.into_boxed_str());
    pool.insert(leaked);
    leaked
}

impl CompiledLexicon {
    /// Default lexicon (no config extras). Test-only — production builds the
    /// lexicon via [`CompiledLexicon::for_language_with`] with `prose.extra_*`.
    #[cfg(test)]
    pub(crate) fn for_language(lang: &ProseLanguage) -> CompiledLexicon {
        Self::for_language_with(lang, &[], &[])
    }

    /// As [`for_language`], with config-supplied extra modal tokens and
    /// interiority phrases appended to the active language's lists (NARR-1
    /// `prose.extra_modal_tokens` / `prose.extra_interiority_phrases`).
    pub(crate) fn for_language_with(
        lang: &ProseLanguage,
        extra_modal: &[String],
        extra_interiority: &[String],
    ) -> CompiledLexicon {
        let lx = lexicon::lexicon(lang);
        let mut modal_unigrams: HashSet<&'static str> =
            lx.modal_unigrams.iter().copied().collect();
        let mut modal_bigrams: Vec<[&'static str; 2]> = lx.modal_bigrams.to_vec();
        let mut modal_trigrams: Vec<[&'static str; 3]> = lx.modal_trigrams.to_vec();
        for raw in extra_modal {
            let w: Vec<&str> = raw.split_whitespace().collect();
            match w.len() {
                1 => {
                    modal_unigrams.insert(leak(w[0]));
                }
                2 => modal_bigrams.push([leak(w[0]), leak(w[1])]),
                3 => modal_trigrams.push([leak(w[0]), leak(w[1]), leak(w[2])]),
                _ => {}
            }
        }
        let mut interiority: Vec<Vec<&'static str>> = lx
            .interiority
            .iter()
            .map(|p| p.split_whitespace().collect())
            .collect();
        for raw in extra_interiority {
            let toks: Vec<&'static str> = raw.split_whitespace().map(leak).collect();
            if !toks.is_empty() {
                interiority.push(toks);
            }
        }
        CompiledLexicon {
            modal_unigrams,
            modal_bigrams,
            modal_trigrams,
            interiority,
            erlebte: lx.erlebte_particles.iter().copied().collect(),
            sensory: lx.sensory.iter().copied().collect(),
            passive_exceptions: lx.passive_exceptions.iter().copied().collect(),
        }
    }

    /// Count epistemic-hedging hits over a token list: each unigram match, plus
    /// each bigram / trigram phrase whose tokens appear consecutively. A phrase
    /// counts once per occurrence (overlaps are intentionally allowed — they
    /// feed a density ratio, not a deduplicated list).
    pub(crate) fn count_modal_tokens(&self, tokens: &[&str]) -> usize {
        let mut hits = 0;
        for i in 0..tokens.len() {
            if self.modal_unigrams.contains(tokens[i]) {
                hits += 1;
            }
            if i + 1 < tokens.len() {
                for bi in &self.modal_bigrams {
                    if tokens[i] == bi[0] && tokens[i + 1] == bi[1] {
                        hits += 1;
                    }
                }
            }
            if i + 2 < tokens.len() {
                for tri in &self.modal_trigrams {
                    if tokens[i] == tri[0] && tokens[i + 1] == tri[1] && tokens[i + 2] == tri[2] {
                        hits += 1;
                    }
                }
            }
        }
        hits
    }

    /// Whether a sentence's tokens contain any FID marker phrase as a
    /// consecutive run. Token-level (not substring), so `she knew` never matches
    /// inside `he knew`.
    pub(crate) fn sentence_has_interiority(&self, tokens: &[&str]) -> bool {
        self.interiority
            .iter()
            .any(|phrase| contains_subsequence(tokens, phrase))
    }

    /// German *erlebte Rede* modal-particle hits (0 for non-DE lexicons, whose
    /// particle list is empty). The caller weights and gates these (only in
    /// non-interrogative third-person sentences) in N-P2.
    pub(crate) fn erlebte_particle_count(&self, tokens: &[&str]) -> usize {
        tokens.iter().filter(|t| self.erlebte.contains(*t)).count()
    }

    /// The sensory channel a token belongs to, if any.
    pub(crate) fn sensory_channel(&self, token: &str) -> Option<SensoryChannel> {
        self.sensory.get(token).copied()
    }

    /// Whether a token is on the language's passive-detection exclusion list
    /// (reflexive-only verbs in RU, `sein`+adjective collocations in DE, …).
    pub(crate) fn is_passive_exception(&self, token: &str) -> bool {
        self.passive_exceptions.contains(token)
    }
}

/// Reports whether `needle` occurs in `haystack` as an unbroken run of equal
/// tokens. An empty `needle` never matches, and neither does one longer than
/// `haystack`.
fn contains_subsequence(haystack: &[&str], needle: &[&str]) -> bool {
    let width = needle.len();
    // `windows(0)` panics, so the empty needle must be rejected up front.
    if width == 0 {
        return false;
    }
    // A needle longer than the haystack produces no windows and so no match.
    haystack.windows(width).any(|run| run == needle)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `text` into owned tokens.
    fn toks(text: &str) -> Vec<String> {
        tokenize(text)
    }
    /// Borrow owned tokens as the `&[&str]` shape the lexicon lookups take.
    fn refs(v: &[String]) -> Vec<&str> {
        v.iter().map(String::as_str).collect()
    }

    // ── language resolution ──────────────────────────────────────────────
    #[test]
    fn resolution_chain() {
        // Explicit override wins.
        assert_eq!(resolve_prose_language(Some("de"), "english").0, ProseLanguage::De);
        // Falls through to project language.
        assert_eq!(resolve_prose_language(None, "russian").0, ProseLanguage::Ru);
        assert_eq!(resolve_prose_language(None, "Français").0, ProseLanguage::Fr);
        // Empty everything → En + note.
        let (lang, note) = resolve_prose_language(None, "");
        assert_eq!(lang, ProseLanguage::En);
        assert!(note.unwrap().contains("not set"));
        // Unknown → Other + note; rhythm-only.
        let (lang, note) = resolve_prose_language(Some("italian"), "english");
        assert_eq!(lang, ProseLanguage::Other("italian".into()));
        assert!(!lang.is_supported());
        assert!(note.unwrap().contains("rhythm"));
    }

    #[test]
    fn codes_round_trip() {
        for (label, code) in [
            ("English", "en"), ("ru", "ru"), ("Deutsch", "de"),
            ("francais", "fr"), ("español", "es"),
        ] {
            assert_eq!(ProseLanguage::from_label(label).as_code(), code);
        }
        assert_eq!(ProseLanguage::Other("x".into()).as_code(), "other");
    }

    // ── tokenizer ────────────────────────────────────────────────────────
    #[test]
    fn tokenizer_strips_edges_keeps_internal() {
        assert_eq!(toks("«Hello,» she — said…"), ["hello", "she", "said"]);
        // Internal hyphen / accents / Cyrillic preserved + lowercased.
        assert_eq!(toks("По-видимому, semble-t-il É"), ["по-видимому", "semble-t-il", "é"]);
    }

    // ── modal density scan, per language ─────────────────────────────────
    #[test]
    fn modal_en_unigrams() {
        let lx = CompiledLexicon::for_language(&ProseLanguage::En);
        let t = toks("She might have known, perhaps, but could not be sure.");
        // might + perhaps + could
        assert_eq!(lx.count_modal_tokens(&refs(&t)), 3);
    }

    #[test]
    fn modal_ru_bigram_and_trigram() {
        let lx = CompiledLexicon::for_language(&ProseLanguage::Ru);
        // bigram "должно быть" + unigram "вероятно" + trigram "судя по всему"
        let t = toks("Это, вероятно, должно быть так, судя по всему.");
        assert_eq!(lx.count_modal_tokens(&refs(&t)), 3);
    }

    #[test]
    fn modal_de_inflected_forms() {
        let lx = CompiledLexicon::for_language(&ProseLanguage::De);
        let t = toks("Sie könnten es wohl vermutlich gewusst haben.");
        // könnten + wohl + vermutlich
        assert_eq!(lx.count_modal_tokens(&refs(&t)), 3);
    }

    #[test]
    fn modal_fr_bigram_and_trigram() {
        let lx = CompiledLexicon::for_language(&ProseLanguage::Fr);
        // "sans doute" (bigram) + "on aurait dit" (trigram) + "apparemment" (unigram)
        let t = toks("Apparemment, sans doute, on aurait dit un rêve.");
        assert_eq!(lx.count_modal_tokens(&refs(&t)), 3);
    }

    #[test]
    fn modal_es_bigram_and_trigram() {
        let lx = CompiledLexicon::for_language(&ProseLanguage::Es);
        // "a lo mejor" (trigram) + "tal vez" (bigram) + "quizás" (unigram)
        let t = toks("A lo mejor, tal vez, quizás era cierto.");
        assert_eq!(lx.count_modal_tokens(&refs(&t)), 3);
    }

    // ── config extras (the production construction path) ─────────────────
    #[test]
    fn config_extras_extend_modal_and_interiority() {
        // Nonsense words so the result cannot depend on embedded list contents.
        let lx = CompiledLexicon::for_language_with(
            &ProseLanguage::En,
            &["Zzqq".to_string(), "zzqq yyxx".to_string()],
            &["Wwvv uutt".to_string()],
        );
        // unigram "zzqq" + bigram "zzqq yyxx"
        assert_eq!(lx.count_modal_tokens(&refs(&toks("zzqq yyxx"))), 2);
        assert!(lx.sentence_has_interiority(&refs(&toks("then wwvv uutt again"))));
        // Phrase tokens must be adjacent and in order.
        assert!(!lx.sentence_has_interiority(&refs(&toks("uutt wwvv"))));
    }

    // ── interiority, per language + the substring trap ───────────────────
    #[test]
    fn interiority_token_level_no_false_substring() {
        let lx = CompiledLexicon::for_language(&ProseLanguage::En);
        assert!(lx.sentence_has_interiority(&refs(&toks("she thought it was over"))));
        assert!(lx.sentence_has_interiority(&refs(&toks("she knew the truth"))));
        assert!(!lx.sentence_has_interiority(&refs(&toks("the wind was cold"))));
        // The substring trap itself, checked independently of what the
        // embedded lists contain: "he knew" is a textual substring of
        // "she knew" but must not match, because comparison is per token.
        assert!(!contains_subsequence(&["she", "knew", "the", "truth"], &["he", "knew"]));
        assert!(contains_subsequence(&["and", "he", "knew", "it"], &["he", "knew"]));
    }

    #[test]
    fn interiority_other_languages() {
        for (lang, sent) in [
            (ProseLanguage::Ru, "ей казалось, что всё кончено"),
            (ProseLanguage::De, "sie dachte an den See"),
            (ProseLanguage::Fr, "elle pensait à lui"),
            (ProseLanguage::Es, "ella pensaba en voz baja"),
        ] {
            let lx = CompiledLexicon::for_language(&lang);
            assert!(
                lx.sentence_has_interiority(&refs(&toks(sent))),
                "{}",
                lang.as_code()
            );
        }
    }

    #[test]
    fn de_erlebte_particles_only_for_de() {
        let de = CompiledLexicon::for_language(&ProseLanguage::De);
        assert_eq!(de.erlebte_particle_count(&refs(&toks("das war ja doch wohl klar"))), 3);
        // Other languages have no particle list.
        let en = CompiledLexicon::for_language(&ProseLanguage::En);
        assert_eq!(en.erlebte_particle_count(&refs(&toks("yes indeed of course"))), 0);
    }

    // ── sensory channels, per language spot-check ────────────────────────
    #[test]
    fn sensory_channels_per_language() {
        for cases in [
            (ProseLanguage::En, "shadow", SensoryChannel::Visual),
            (ProseLanguage::En, "murmur", SensoryChannel::Auditory),
            (ProseLanguage::Ru, "запах", SensoryChannel::Olfactory),
            (ProseLanguage::De, "kalt", SensoryChannel::Tactile),
            (ProseLanguage::Fr, "tremblement", SensoryChannel::Kinesthetic),
            (ProseLanguage::Es, "silencio", SensoryChannel::Auditory),
        ] {
            let lx = CompiledLexicon::for_language(&cases.0);
            assert_eq!(lx.sensory_channel(cases.1), Some(cases.2), "{}", cases.1);
            assert_eq!(lx.sensory_channel("zzqq"), None);
        }
    }

    // ── passive exceptions present per language that needs them ───────────
    #[test]
    fn passive_exceptions_loaded() {
        let en = CompiledLexicon::for_language(&ProseLanguage::En);
        assert!(en.is_passive_exception("thought")); // not a passive despite -t
        let ru = CompiledLexicon::for_language(&ProseLanguage::Ru);
        assert!(ru.is_passive_exception("казалось")); // reflexive, not passive
        let de = CompiledLexicon::for_language(&ProseLanguage::De);
        assert!(de.is_passive_exception("klar")); // "war klar" not Zustandspassiv
    }
}