inkhaven 1.2.4

Inkhaven — TUI literary work editor for Typst books
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
//! Place/Character name lexicon for editor highlighting.
//!
//! After every save (and on project open) we walk the Places and Characters
//! system books, collect every paragraph title nested under them, and stem
//! each title's words using the Snowball algorithms configured in
//! `editor.stemming.languages`. The editor renderer then overlays cyan
//! (Places) and yellow (Characters) on each match.
//!
//! Match semantics:
//! * **Word-sequence over stems.** Each name is split into Unicode words,
//!   each word is stemmed with every configured language, and matching is
//!   done by sliding over the buffer's words. So "Москва" with the Russian
//!   stemmer also lights up "Москве", "Москвою", etc.
//! * **Case-insensitive** at the stemming layer (we lowercase before
//!   stemming on both sides).
//! * **Phrase** — multi-word titles ("King's Landing", "Серая Гавань")
//!   match as a contiguous sequence of word-tokens; punctuation between
//!   them is allowed.
//! * **Longest-first** — when two names overlap at the same start position
//!   (one being a prefix sequence of another), the longer wins.
//! * **No stemming → exact lowercase comparison.** If `languages` is empty
//!   the lexicon is still useful, just without inflection support.
//!
//! Performance: at literary scale (a few hundred names, lines well under
//! 1k chars), per-line `row_hits` is fine. Stems are precomputed once at
//! `build()` time.

use std::collections::HashMap;

use rust_stemmers::{Algorithm, Stemmer};
use unicode_segmentation::UnicodeSegmentation;
use uuid::Uuid;

use crate::store::hierarchy::Hierarchy;
use crate::store::node::NodeKind;

/// Highlight category for a lexicon hit. Drives both the editor
/// overlay style and the per-category status messages.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexCategory {
    Place,
    Character,
    Note,
    Artefact,
}

/// One match on a row of editor text, in character (not byte) coordinates.
#[derive(Debug, Clone, Copy)]
pub struct LexHit {
    pub col_start: usize,
    pub col_end: usize,
    pub category: LexCategory,
}

/// A single name's compiled form: the lowercased stem sequence we'll compare
/// against the editor buffer's stem sequence. `stems_per_word` outer index
/// is "which word in the name", inner Vec is "all stem candidates for that
/// word across configured languages" (lowercased originals included so a
/// name with zero matching stemmers still works).
#[derive(Debug)]
struct CompiledName {
    /// One entry per word in the name. Each entry is the set of acceptable
    /// stems for that word position (any inner stem matching the buffer
    /// word's stem set counts as a hit).
    stems_per_word: Vec<Vec<String>>,
    category: LexCategory,
}

#[derive(Default, Debug)]
pub struct Lexicon {
    names: Vec<CompiledName>,
    /// Snowball algorithms used to expand both sides of the match.
    algos: Vec<Algorithm>,
}

impl Lexicon {
    /// Walk the hierarchy, collect paragraph titles under each
    /// supplied book, and stem them with every configured Snowball
    /// algorithm. `books` is a list of `(book_id, category)` pairs;
    /// duplicate titles across categories are deduplicated case-
    /// insensitively but the FIRST entry's category wins, so put the
    /// higher-priority category first.
    pub fn build(
        hierarchy: &Hierarchy,
        books: &[(Uuid, LexCategory)],
        algorithms: Vec<Algorithm>,
    ) -> Self {
        let mut names: Vec<CompiledName> = Vec::new();
        let mut seen: HashMap<String, ()> = HashMap::new();
        for (book_id, category) in books {
            collect_names_into(
                hierarchy,
                *book_id,
                *category,
                &algorithms,
                &mut names,
                &mut seen,
            );
        }
        // Longer phrases first so we prefer "King's Landing" over "King".
        names.sort_by(|a, b| b.stems_per_word.len().cmp(&a.stems_per_word.len()));
        Self {
            names,
            algos: algorithms,
        }
    }

    pub fn is_empty(&self) -> bool {
        self.names.is_empty()
    }

    /// All hits on a single editor row. Returns non-overlapping hits in
    /// left-to-right order; on overlap, the longer phrase wins.
    pub fn row_hits(&self, line: &str) -> Vec<LexHit> {
        if self.is_empty() {
            return Vec::new();
        }
        let tokens = tokenize_with_offsets(line);
        if tokens.is_empty() {
            return Vec::new();
        }
        // Pre-stem every buffer word once.
        let token_stems: Vec<Vec<String>> = tokens
            .iter()
            .map(|t| stems_for(&t.text, &self.algos))
            .collect();

        let mut out: Vec<LexHit> = Vec::new();
        let mut i = 0usize;
        while i < tokens.len() {
            // Pick the longest name whose stems match starting at i.
            let mut matched: Option<(usize, LexCategory)> = None; // (len_in_tokens, cat)
            for name in &self.names {
                let n = name.stems_per_word.len();
                if i + n > tokens.len() {
                    continue;
                }
                let mut ok = true;
                for k in 0..n {
                    let accept = &name.stems_per_word[k];
                    let cand = &token_stems[i + k];
                    if !cand.iter().any(|s| accept.iter().any(|a| a == s)) {
                        ok = false;
                        break;
                    }
                }
                if ok {
                    if matched.map_or(true, |(prev_n, _)| n > prev_n) {
                        matched = Some((n, name.category));
                    }
                }
            }
            if let Some((n, category)) = matched {
                let start_char = tokens[i].char_start;
                let end_char = tokens[i + n - 1].char_end;
                out.push(LexHit {
                    col_start: start_char,
                    col_end: end_char,
                    category,
                });
                i += n; // skip past the matched span so overlaps don't duplicate.
            } else {
                i += 1;
            }
        }
        out
    }
}

fn collect_names_into(
    hierarchy: &Hierarchy,
    root: Uuid,
    category: LexCategory,
    algos: &[Algorithm],
    out: &mut Vec<CompiledName>,
    seen: &mut HashMap<String, ()>,
) {
    for id in hierarchy.collect_subtree(root) {
        if id == root {
            continue;
        }
        let Some(node) = hierarchy.get(id) else {
            continue;
        };
        if node.kind != NodeKind::Paragraph {
            continue;
        }
        let trimmed = node.title.trim();
        if trimmed.is_empty() {
            continue;
        }
        let key = trimmed.to_lowercase();
        if seen.insert(key, ()).is_some() {
            continue;
        }
        let words = tokenize_words(trimmed);
        if words.is_empty() {
            continue;
        }
        let stems_per_word: Vec<Vec<String>> = words
            .iter()
            .map(|w| stems_for(w, algos))
            .collect();
        out.push(CompiledName {
            stems_per_word,
            category,
        });
    }
}

/// Lowercased word tokens of `s`, with no positional info. Uses Unicode
/// word segmentation so Cyrillic, CJK punctuation, etc. all behave.
fn tokenize_words(s: &str) -> Vec<String> {
    s.unicode_words()
        .map(|w| w.to_lowercase())
        .filter(|w| !w.is_empty())
        .collect()
}

#[derive(Debug, Clone)]
struct Token {
    text: String,
    /// Character offset (not byte) where this token begins in the source.
    char_start: usize,
    /// Character offset (not byte) one past the end.
    char_end: usize,
}

/// Same as `tokenize_words` but retains source positions. Positions are in
/// CHARACTER coordinates so they line up with the editor's char-based
/// cursor and span builder.
fn tokenize_with_offsets(s: &str) -> Vec<Token> {
    let mut out = Vec::new();
    // unicode-segmentation gives byte indices; convert each to char index
    // by precomputing the cumulative char count up to each byte boundary.
    // We keep a running counter as we iterate the source.
    let mut byte_to_char: Vec<usize> = Vec::with_capacity(s.len() + 1);
    {
        let mut c = 0usize;
        for (b, _) in s.char_indices() {
            // Pad entries between prior byte and this byte (multi-byte char)
            while byte_to_char.len() < b {
                byte_to_char.push(c);
            }
            byte_to_char.push(c);
            c += 1;
        }
        // Final sentinel for the end-of-string boundary.
        while byte_to_char.len() <= s.len() {
            byte_to_char.push(c);
        }
    }
    for (b, w) in s.unicode_word_indices() {
        let start = byte_to_char[b];
        let end = byte_to_char[b + w.len()];
        let text = w.to_lowercase();
        if !text.is_empty() {
            out.push(Token {
                text,
                char_start: start,
                char_end: end,
            });
        }
    }
    out
}

/// All acceptable stems for `word` across every configured algorithm. The
/// lowercased original is always included so an empty algorithm list still
/// matches the exact word.
fn stems_for(word: &str, algos: &[Algorithm]) -> Vec<String> {
    let lc = word.to_lowercase();
    let mut out: Vec<String> = Vec::with_capacity(1 + algos.len());
    out.push(lc.clone());
    for a in algos {
        let s = Stemmer::create(*a).stem(&lc).into_owned();
        if !out.contains(&s) {
            out.push(s);
        }
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;
    use rust_stemmers::Algorithm;

    /// Test-only convenience: build a Lexicon directly from name lists
    /// instead of walking a real Hierarchy.
    fn build_test_lex(
        places: &[&str],
        characters: &[&str],
        algos: Vec<Algorithm>,
    ) -> Lexicon {
        build_test_lex_cats(&[
            (LexCategory::Place, places),
            (LexCategory::Character, characters),
        ], algos)
    }

    fn build_test_lex_cats(
        groups: &[(LexCategory, &[&str])],
        algos: Vec<Algorithm>,
    ) -> Lexicon {
        let mut names: Vec<CompiledName> = Vec::new();
        for (cat, list) in groups {
            for n in *list {
                let words = tokenize_words(n);
                if words.is_empty() {
                    continue;
                }
                names.push(CompiledName {
                    stems_per_word: words.iter().map(|w| stems_for(w, &algos)).collect(),
                    category: *cat,
                });
            }
        }
        names.sort_by(|a, b| b.stems_per_word.len().cmp(&a.stems_per_word.len()));
        Lexicon { names, algos }
    }

    #[test]
    fn notes_and_artefacts_hit_with_distinct_categories() {
        let lex = build_test_lex_cats(
            &[
                (LexCategory::Note, &["dragonglass"]),
                (LexCategory::Artefact, &["valyrian steel"]),
            ],
            vec![Algorithm::English],
        );
        let hits = lex.row_hits("She wielded the Valyrian Steel against the Dragonglass.");
        let cats: Vec<LexCategory> = hits.iter().map(|h| h.category).collect();
        assert!(cats.contains(&LexCategory::Note), "got: {hits:?}");
        assert!(cats.contains(&LexCategory::Artefact), "got: {hits:?}");
    }

    #[test]
    fn english_inflections_match() {
        // "city" → "citi", "cities" → "citi" → both stem identically.
        let lex = build_test_lex(&["cities"], &[], vec![Algorithm::English]);
        let hits = lex.row_hits("She walked through the city at dawn.");
        assert_eq!(hits.len(), 1, "expected one hit, got {hits:?}");
    }

    #[test]
    fn russian_inflections_of_moscow() {
        let lex = build_test_lex(&[], &["Москва"], vec![Algorithm::Russian]);
        let line = "Из Москвы в Москве и снова в Москвою.";
        let hits = lex.row_hits(line);
        assert!(
            hits.len() >= 2,
            "expected at least two Москва forms, got {hits:?}"
        );
        for h in &hits {
            assert_eq!(h.category, LexCategory::Character);
        }
    }

    #[test]
    fn whole_word_via_segmentation() {
        // "Tomorrow" must not match "Tom".
        let lex = build_test_lex(&[], &["Tom"], vec![Algorithm::English]);
        let hits = lex.row_hits("Tomorrow is another day.");
        assert!(hits.is_empty(), "got unexpected hits: {hits:?}");
        let hits2 = lex.row_hits("Then Tom arrived.");
        assert_eq!(hits2.len(), 1);
    }

    #[test]
    fn multi_word_phrase() {
        let lex = build_test_lex(&["King's Landing"], &[], vec![Algorithm::English]);
        let line = "She returned to King's Landing alone.";
        let hits = lex.row_hits(line);
        assert_eq!(hits.len(), 1);
        let matched: String = line
            .chars()
            .skip(hits[0].col_start)
            .take(hits[0].col_end - hits[0].col_start)
            .collect();
        assert!(matched.contains("King") && matched.contains("Landing"));
    }

    #[test]
    fn longest_match_wins() {
        let lex = build_test_lex(
            &[],
            &["Robb", "Robb Stark"],
            vec![Algorithm::English],
        );
        let hits = lex.row_hits("Robb Stark rode north.");
        assert_eq!(hits.len(), 1);
        // The match must cover both words, not just "Robb".
        assert!(hits[0].col_end - hits[0].col_start >= "Robb Stark".len());
    }

    #[test]
    fn empty_lexicon_no_hits() {
        let lex = Lexicon::default();
        assert!(lex.is_empty());
        assert!(lex.row_hits("anything").is_empty());
    }

    #[test]
    fn empty_algos_falls_back_to_exact() {
        let lex = build_test_lex(&["Rohan"], &[], vec![]);
        let hits = lex.row_hits("Rohan rides.");
        assert_eq!(hits.len(), 1);
        // No stemmer → "Rohans" wouldn't match. Verify.
        let none = lex.row_hits("Rohans ride.");
        assert!(none.is_empty());
    }
}