formal-ai 0.186.0

Formal symbolic AI implementation with OpenAI-compatible APIs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
//! Language-independent *meaning* lexicon (issue #386).
//!
//! The solver must never recognise a user prompt by matching a hardcoded list
//! of words in one language. Instead it references **meanings** — concepts that
//! exist independently of any language and are *self-describing*: every meaning
//! is `defined_by` other meanings (a closed, mutually-referential graph in the
//! spirit of <https://github.com/link-foundation/relative-meta-logic>) and can
//! be anchored to real lexical data via source ids.
//!
//! A meaning declares the semantic `role`s it can play when read in a prompt
//! (e.g. a sort is both a `program_artifact` a follow-up can refer to and a
//! `program_modification` a follow-up can request). Its `lexeme` blocks list
//! the surface words that *evidence* it, per language. Those surfaces are
//! stored as lexeme-style ids with optional `codepoints` metadata; compatibility
//! descriptions are generated by the parser instead of authored in the seed.
//! Recognition code asks the lexicon "which words evidence role X?" and stays
//! free of hardcoded natural-language text — the words live once, here in the
//! data.

use std::collections::BTreeSet;
use std::sync::OnceLock;

use super::parser::{decode_codepoints, parse_lino, LinoNode};
use super::roles::{ROLE_ONTOLOGY_CATEGORY, ROLE_ONTOLOGY_ROOT, ROLE_ONTOLOGY_TYPE};
use super::MEANING_FILES;

/// Where a surface form positions the variable subject of a templated prompt.
///
/// A meaning's surface text may be a fixed phrase or a template with one open
/// slot — the position a user fills with the concrete subject ("how does *X*
/// work"). The slot is marked in the data with a single ellipsis `…` (U+2026,
/// serializer-safe: not a quote or backslash), and its position classifies the
/// form:
///
/// * [`Slot::Bare`] — no `…`: a fixed phrase carrying no subject ("how it works").
/// * [`Slot::Prefix`] — trailing `…`: the literal precedes the subject, which
///   follows ("how does …" → subject after).
/// * [`Slot::Suffix`] — leading `…`: the subject precedes the literal ("… как
///   работает" → subject before).
/// * [`Slot::Circumfix`] — middle `…`: the subject sits between two literals
///   ("how … works").
///
/// This lets recognition code derive an affix-matching strategy from the data
/// rather than from a hardcoded per-language list (issue #386).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Slot {
    /// A fixed phrase with no open subject slot — the text carries no `…`
    /// marker (e.g. "how it works").
    Bare,
    /// The literal precedes the subject — the `…` marker is trailing
    /// (e.g. "how does …").
    Prefix,
    /// The subject precedes the literal — the `…` marker is leading
    /// (e.g. "… как работает").
    Suffix,
    /// The subject sits between two literals — the `…` marker is in the
    /// middle (e.g. "how … works").
    Circumfix,
}

/// A single surface form together with generated compatibility metadata.
///
/// The data must "not just list" the word (issue #386): each form exposes
/// semantic facets for its notation and denotation. Recognition code still
/// matches on [`WordForm::text`]; [`WordForm::description`] is generated from
/// the structure for legacy diagnostics, not stored as free text in the seed.
///
/// A form may also carry an `action` — the canonical, language-independent
/// operation it names when it stands in for a verb (e.g. the procedural surface
/// "как сделать …" names the `do` action). Empty when the form does not fix an
/// action (the operation is then read from the matched subject instead).
///
/// Issue #398 adds the recursive bridge away from English-only descriptions:
/// a word form exposes generic semantic facets, just like a meaning. The parser
/// derives notation and denotation facets from the seed structure, while
/// authored facets can add part of speech or richer lexical metadata.
#[derive(Debug, Clone)]
pub struct WordForm {
    /// The literal surface text recognition code matches on. May contain a
    /// single `…` (U+2026) slot marker; see [`WordForm::slot`].
    pub text: String,
    /// Generated compatibility description of this form, used for legacy
    /// diagnostics (read back through [`Meaning::describe_word`]).
    pub description: String,
    /// The canonical, language-independent action this form names; empty when
    /// the form does not fix an action.
    pub action: String,
    /// Semantic facets attached to this form; each facet's targets are meaning
    /// slugs (see [`WordForm::semantic_facet_targets`]).
    pub semantic_facets: Vec<SemanticFacet>,
}

impl WordForm {
    /// The subject-slot marker: a single horizontal ellipsis `…` (U+2026).
    ///
    /// Spelled as a Unicode escape so the literal cannot be silently dropped by
    /// tooling that mishandles non-ASCII source text.
    const SLOT_MARKER: char = '\u{2026}';

    /// Split [`WordForm::text`] at the first slot marker, yielding the literal
    /// text on either side of it, or `None` when the text carries no marker.
    fn split_at_slot(&self) -> Option<(&str, &str)> {
        self.text.split_once(Self::SLOT_MARKER)
    }

    /// How this form positions its subject slot — derived from the position of
    /// the `…` (U+2026) marker in [`WordForm::text`]. A form with no marker is
    /// [`Slot::Bare`]; see [`Slot`] for the full classification.
    ///
    /// Only the first marker is considered. A text consisting of the marker
    /// alone has no literal on either side and is classified as [`Slot::Bare`].
    #[must_use]
    pub fn slot(&self) -> Slot {
        match self.split_at_slot() {
            None => Slot::Bare,
            Some((before, after)) => match (before.is_empty(), after.is_empty()) {
                (false, false) => Slot::Circumfix,
                (false, true) => Slot::Prefix,
                (true, false) => Slot::Suffix,
                // Marker with no literal around it: nothing to anchor on.
                (true, true) => Slot::Bare,
            },
        }
    }

    /// The literal text before the `…` slot marker (the whole text when there is
    /// no marker). For a [`Slot::Prefix`] form this is the matchable prefix.
    #[must_use]
    pub fn before_slot(&self) -> &str {
        self.split_at_slot()
            .map_or(self.text.as_str(), |(before, _)| before)
    }

    /// The literal text after the `…` slot marker (empty when there is no
    /// marker). For a [`Slot::Suffix`] form this is the matchable suffix.
    #[must_use]
    pub fn after_slot(&self) -> &str {
        self.split_at_slot().map_or("", |(_, after)| after)
    }

    /// Meaning slugs linked through the word-form semantic facet `kind`.
    ///
    /// The form itself is still the literal notation seen in text, but its
    /// denotation, part of speech, and other lexical annotations can now point
    /// at seed meanings rather than only at English prose. Targets are yielded
    /// in declaration order across every facet whose kind equals `kind`.
    pub fn semantic_facet_targets<'a>(&'a self, kind: &'a str) -> impl Iterator<Item = &'a str> {
        self.semantic_facets
            .iter()
            .filter(move |facet| facet.kind == kind)
            .flat_map(|facet| facet.meanings.iter().map(String::as_str))
    }
}

/// Surface forms that evidence a meaning in one language.
#[derive(Debug, Clone)]
pub struct Lexeme {
    /// The language code these surface forms belong to; compared by exact
    /// string equality against the codes callers pass in.
    pub language: String,
    /// The surface forms in declaration order. The first one is what
    /// [`Meaning::word_in`] returns for this language.
    pub words: Vec<WordForm>,
}

/// A semantic facet attached to a meaning.
///
/// The facet `kind` is itself a meaning slug (`notation`, `annotation`,
/// `denotation`, `connotation`, or a future facet), and every target is another
/// meaning slug. This keeps the meta-language recursive: code only knows the
/// generic `facet` container, while the vocabulary of facets lives in the seed.
#[derive(Debug, Clone)]
pub struct SemanticFacet {
    /// The facet kind — itself a meaning slug (e.g. `notation`, `denotation`).
    pub kind: String,
    /// The meaning slugs this facet points at, in declaration order.
    pub meanings: Vec<String>,
}

/// A language-independent meaning grounded in real lexical data.
#[derive(Debug, Clone)]
pub struct Meaning {
    /// The identifier of this meaning; [`Lexicon::meaning`] looks meanings up
    /// by exact equality on it.
    pub slug: String,
    /// Generated compatibility text derived from the meaning id and definition
    /// links. The migrated seed no longer stores free-text glosses.
    pub gloss: String,
    /// NOTE(review): presumably the Wiktionary source id anchoring this meaning
    /// to lexical data — nothing in this section reads it; confirm against the
    /// parser.
    pub wiktionary: String,
    /// The Wikidata entity (`Q…`) or property (`P…`) id this meaning is rooted
    /// in, when it corresponds to one. Empty for meanings that have no Wikidata
    /// anchor. Lets the formalizer resolve language-independent ids from the
    /// seed instead of hardcoded tables (issue #386).
    pub wikidata: String,
    /// Slugs of the meanings this one is defined by; these are the edges
    /// [`Lexicon::reaches_root`] follows.
    pub defined_by: Vec<String>,
    /// The semantic roles this meaning can play; tested by
    /// [`Meaning::has_role`].
    pub roles: Vec<String>,
    /// Semantic facets attached to the meaning itself (as opposed to one of its
    /// word forms); see [`Meaning::semantic_facet_targets`].
    pub semantic_facets: Vec<SemanticFacet>,
    /// The surface forms that evidence this meaning, grouped per language, in
    /// declaration order.
    pub lexemes: Vec<Lexeme>,
}

impl Meaning {
    /// Does this meaning declare the semantic role `role` (exact match)?
    #[must_use]
    pub fn has_role(&self, role: &str) -> bool {
        self.roles.iter().any(|r| r == role)
    }

    /// Meaning slugs linked through the semantic facet `kind`.
    ///
    /// Both `kind` and the returned targets are meaning identifiers from the
    /// seed. For example, the `link` root can declare a `notation` facet that
    /// points at `links_notation_format`, and callers can resolve that slug
    /// through [`Lexicon::semantic_facet_meanings`].
    pub fn semantic_facet_targets<'a>(&'a self, kind: &'a str) -> impl Iterator<Item = &'a str> {
        self.semantic_facets
            .iter()
            .filter(move |facet| facet.kind == kind)
            .flat_map(|facet| facet.meanings.iter().map(String::as_str))
    }

    /// Every surface word across every language this meaning lexicalises.
    pub fn words(&self) -> impl Iterator<Item = &str> {
        self.word_forms().map(|form| form.text.as_str())
    }

    /// Every surface form (text plus its self-describing note) across every
    /// language this meaning lexicalises, in declaration order.
    pub fn word_forms(&self) -> impl Iterator<Item = &WordForm> {
        self.lexemes.iter().flat_map(|lexeme| lexeme.words.iter())
    }

    /// Is this meaning evidenced in `normalized` — does any of its surface
    /// words (in any language) appear as a whole token or phrase? Matching is
    /// not language-gated: an English proper noun (e.g. `python`) is evidence
    /// in a prompt written in any language.
    #[must_use]
    pub fn evidenced_in(&self, normalized: &str) -> bool {
        self.words().any(|word| surface_present(normalized, word))
    }

    /// The first surface word this meaning lexicalises in `language`, if any.
    /// Used to render a concept in a chosen language (e.g. a dimension label).
    ///
    /// Only the first lexeme declared for `language` is consulted; `None` is
    /// returned when there is no such lexeme or it lists no words.
    #[must_use]
    pub fn word_in(&self, language: &str) -> Option<&str> {
        self.lexemes
            .iter()
            .find(|lexeme| lexeme.language == language)
            .and_then(|lexeme| lexeme.words.first().map(|w| w.text.as_str()))
    }

    /// The self-describing note for `word` (matched case-insensitively against
    /// the stored surface text) in any language, if recorded. This is the live
    /// reader that makes [`WordForm::description`] usable — the data describes
    /// each form rather than merely listing it (issue #386).
    ///
    /// Case folding is Unicode-aware, so a non-ASCII surface (e.g. a Cyrillic
    /// word) is found regardless of the case it is queried in. The first
    /// matching form in declaration order wins.
    #[must_use]
    pub fn describe_word(&self, word: &str) -> Option<&str> {
        // Lower-case the query once rather than once per candidate form.
        let wanted = word.to_lowercase();
        self.word_forms()
            .find(|form| {
                // The ASCII comparison is allocation-free and keeps every match
                // the ASCII-only implementation produced; the lower-cased
                // comparison extends matching to non-ASCII scripts.
                form.text.eq_ignore_ascii_case(word) || form.text.to_lowercase() == wanted
            })
            .map(|form| form.description.as_str())
    }

    /// Languages this meaning is lexicalised in (used by coverage tests).
    #[must_use]
    pub fn languages(&self) -> BTreeSet<String> {
        self.lexemes.iter().map(|l| l.language.clone()).collect()
    }

    /// Does any surface form this meaning lexicalises in one of `languages`
    /// appear in `normalized` as a raw substring (`str::contains`)?
    ///
    /// The language-restricted, raw-substring sibling of [`Meaning::evidenced_in`].
    /// Feature-capability recognition matches a feature's multilingual aliases by
    /// raw substring — punctuation is preserved, so whole-token boundaries do not
    /// hold — and only in the prompt's own language plus English, so it queries
    /// this rather than the token-bounded [`evidenced_in`](Self::evidenced_in).
    /// The surface words stay in the data; only the language codes (the legitimate
    /// code-resident bridge) and the raw-substring contract live in the caller.
    /// Empty surface forms are skipped, since they would match every prompt.
    #[must_use]
    pub fn mentions_in_languages_raw(&self, normalized: &str, languages: &[&str]) -> bool {
        self.lexemes
            .iter()
            .filter(|lexeme| languages.contains(&lexeme.language.as_str()))
            .flat_map(|lexeme| lexeme.words.iter())
            .any(|word| !word.text.is_empty() && normalized.contains(word.text.as_str()))
    }

    /// Every surface form listed by the lexemes declared for `language`, in
    /// declaration order. Shared by the exact-surface lookups below.
    fn forms_in<'a>(&'a self, language: &'a str) -> impl Iterator<Item = &'a WordForm> {
        self.lexemes
            .iter()
            .filter(move |lexeme| lexeme.language == language)
            .flat_map(|lexeme| lexeme.words.iter())
    }

    /// Does this meaning lexicalise `surface` as a whole surface form in
    /// `language` (exact, case-sensitive match)? The compositional translator
    /// resolves a normalized source word to the concept that lists it through
    /// this, so the per-word table stays in the data (issue #386).
    fn lexeme_lists(&self, language: &str, surface: &str) -> bool {
        self.forms_in(language).any(|word| word.text == surface)
    }

    /// Like [`Meaning::lexeme_lists`] but the matched form must also carry
    /// `action` — the per-form grammatical tag (e.g. a genitive inflection). Lets
    /// the compositional translator pick a single inflected form out of a meaning
    /// without naming it in code (issue #386).
    fn lexeme_lists_action(&self, language: &str, surface: &str, action: &str) -> bool {
        self.forms_in(language)
            .any(|word| word.text == surface && word.action == action)
    }
}

/// A spelled-surface → value-surface rewrite table.
///
/// Each entry is a `(spelled, value)` pair: `spelled` is a spelled surface —
/// a single word in the token table, a multi-word string in the phrase table —
/// and `value` is the value surface of its meaning, i.e. the numeral or
/// operator symbol carrying no alphabetic character. Both halves of the
/// arithmetic normalization mapping returned by
/// [`Lexicon::arithmetic_normalization_tables`] share this shape.
pub type WordValueTable = Vec<(String, String)>;

/// The parsed set of meanings.
#[derive(Debug, Clone, Default)]
pub struct Lexicon {
    /// Every parsed meaning, in declaration order. The order is significant:
    /// the "first match" queries on [`Lexicon`] treat it as priority.
    pub meanings: Vec<Meaning>,
}

impl Lexicon {
    /// The meaning whose slug equals `slug`, or `None` when the lexicon holds
    /// no such meaning. The first match in declaration order is returned.
    #[must_use]
    pub fn meaning(&self, slug: &str) -> Option<&Meaning> {
        self.meanings.iter().find(|m| m.slug == slug)
    }

    /// Resolved meanings attached to `slug` through the semantic facet `kind`.
    ///
    /// Returns an empty vector when `slug` is unknown; facet targets that do
    /// not resolve to a meaning are silently dropped.
    #[must_use]
    pub fn semantic_facet_meanings(&self, slug: &str, kind: &str) -> Vec<&Meaning> {
        let Some(meaning) = self.meaning(slug) else {
            return Vec::new();
        };
        meaning
            .semantic_facet_targets(kind)
            .filter_map(|target| self.meaning(target))
            .collect()
    }

    /// The meaning rooted in the Wikidata entity or property `id` (e.g. `Q89`,
    /// `P31`), if the seed carries one. Lets the formalizer resolve a
    /// language-independent id back to its canonical label and surfaces without
    /// a hardcoded table (issue #386).
    #[must_use]
    pub fn meaning_by_wikidata(&self, id: &str) -> Option<&Meaning> {
        self.meanings.iter().find(|m| m.wikidata == id)
    }

    /// Every meaning carrying `role`, in declaration order. Lets recognition
    /// code walk a semantic category (e.g. every measurement unit) without ever
    /// naming the surface words — those live in the data.
    pub fn meanings_with_role<'a>(&'a self, role: &'a str) -> impl Iterator<Item = &'a Meaning> {
        self.meanings.iter().filter(move |m| m.has_role(role))
    }

    /// Every surface *form* (text, description, action, slot) contributed by
    /// every meaning carrying `role`, in declaration order. Unlike
    /// [`Lexicon::words_for_role`] this preserves each form's slot marker and
    /// action, so a handler can derive an affix-matching strategy from the data:
    /// it walks the forms, buckets them by [`WordForm::slot`], and matches each
    /// against the prompt — never naming a surface word itself (issue #386).
    #[must_use]
    pub fn role_word_forms<'a>(&'a self, role: &str) -> Vec<&'a WordForm> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .flat_map(Meaning::word_forms)
            .collect()
    }

    /// Translate `surface` from `source` to `target` through the meaning carrying
    /// `role` that lexicalises it.
    ///
    /// Finds the first meaning (declaration order) carrying `role` whose `source`
    /// lexeme lists `surface`, then returns its first `target`-language form. The
    /// compositional ru→en fallback resolves a lemma or fixed phrase to English
    /// through this — naming the semantic role and the language codes, never the
    /// surface words, which live in `data/seed/meanings-translation.lino` (#386).
    #[must_use]
    pub fn role_surface_translation<'a>(
        &'a self,
        role: &str,
        source: &str,
        target: &str,
        surface: &str,
    ) -> Option<&'a str> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.lexeme_lists(source, surface))
            .and_then(|meaning| meaning.word_in(target))
    }

    /// Does any meaning carrying `role` lexicalise `surface` in `language`?
    ///
    /// Lets the compositional translator test a structural property of a source
    /// word — e.g. whether it is a genitive-governing head — by role rather than
    /// by naming the word in code (issue #386).
    #[must_use]
    pub fn role_lists_surface(&self, role: &str, language: &str, surface: &str) -> bool {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .any(|meaning| meaning.lexeme_lists(language, surface))
    }

    /// Like [`Lexicon::role_surface_translation`] but the `source` form must also
    /// carry `action`.
    ///
    /// The per-form grammatical tag selects one inflected surface out of a
    /// meaning's lexeme, so the compositional translator resolves a
    /// genitive-inflected complement to its English lemma while leaving the single
    /// tagged form in the data (issue #386).
    #[must_use]
    pub fn role_action_surface_translation<'a>(
        &'a self,
        role: &str,
        action: &str,
        source: &str,
        target: &str,
        surface: &str,
    ) -> Option<&'a str> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.lexeme_lists_action(source, surface, action))
            .and_then(|meaning| meaning.word_in(target))
    }

    /// Distinct surface words contributed by every meaning carrying `role`,
    /// in declaration order. Useful for diagnostics and tests.
    #[must_use]
    pub fn words_for_role(&self, role: &str) -> Vec<String> {
        let mut out: Vec<String> = Vec::new();
        for meaning in self.meanings.iter().filter(|m| m.has_role(role)) {
            for word in meaning.words() {
                // Linear de-duplication keeps first-seen (declaration) order.
                if !out.iter().any(|existing| existing == word) {
                    out.push(word.to_string());
                }
            }
        }
        out
    }

    /// Does `normalized` mention any surface word of any meaning in `role`?
    ///
    /// Mirrors the CJK-substring vs. whitespace-token contract used across the
    /// solver: CJK scripts have no inter-word spaces, so a CJK surface word is
    /// matched as a substring, while space-delimited scripts match a whole
    /// whitespace token or phrase (see [`crate::coding::contains_cjk`]).
    #[must_use]
    pub fn mentions_role(&self, role: &str, normalized: &str) -> bool {
        self.meanings_with_role(role)
            .any(|meaning| meaning.evidenced_in(normalized))
    }

    /// Like [`mentions_role`](Self::mentions_role) but ignores a meaning's
    /// script-independent *value surfaces* — word forms that carry no alphabetic
    /// character, such as the operator symbol "+" or the numeral "10".
    ///
    /// Those forms exist so the arithmetic normalizer can read a meaning's
    /// machine value (see
    /// [`arithmetic_normalization_tables`](Self::arithmetic_normalization_tables));
    /// they are not spelled words. Operator-*word* detection must therefore skip
    /// them so a bare "+" is recognised as an operator *symbol* by the symbol
    /// scan, not double-counted as a spelled word operator. This mirrors the
    /// pure-numeral skip already applied to spelled-number detection.
    #[must_use]
    pub fn mentions_role_spelled(&self, role: &str, normalized: &str) -> bool {
        self.meanings_with_role(role).any(|meaning| {
            meaning
                .words()
                .filter(|word| word.chars().any(char::is_alphabetic))
                .any(|word| surface_present(normalized, word))
        })
    }

    /// Does `normalized` contain any surface word of any meaning in `role` as a
    /// raw substring (`str::contains`), ignoring whitespace-token boundaries?
    ///
    /// This is the deliberately *looser* sibling of
    /// [`mentions_role`](Self::mentions_role). Many legacy recognisers matched an
    /// inflectable stem — `правил` to catch `правила`/`правило`/`правил`,
    /// `расчёт` to catch `при расчёте` — by raw substring. Those stems are not
    /// whole tokens, so the token-bounded contract of `mentions_role` would miss
    /// them. A meaning whose surface forms are such stems (recorded as
    /// [`Slot::Bare`] words) is queried through this method instead, preserving
    /// the original byte-faithful substring match while still keeping the
    /// surface words in the data, not the code. Slot markers are not stripped,
    /// so author stem roles as bare forms.
    ///
    /// Empty surface forms never match: `str::contains("")` is always true, so
    /// without the guard one empty word would make every prompt mention the
    /// role. This matches the guard in [`Meaning::mentions_in_languages_raw`].
    #[must_use]
    pub fn mentions_role_raw(&self, role: &str, normalized: &str) -> bool {
        self.meanings_with_role(role).any(|meaning| {
            meaning
                .words()
                .any(|word| !word.is_empty() && normalized.contains(word))
        })
    }

    /// Build the word→value tables the arithmetic evaluator uses to rewrite a
    /// spelled expression into its symbolic form before tokenizing.
    ///
    /// "two plus three" becomes "2 + 3"; "пять умножить на два" becomes
    /// "5 * 2". Returns `(tokens, phrases)`. Each entry maps a spelled surface to
    /// the *value surface* of its meaning — the word form carrying no alphabetic
    /// character: the numeral "2" for the cardinal two, the symbol "+" for the
    /// addition operator. `tokens` are single words, applied after whitespace
    /// tokenization; `phrases` are multi-word surfaces, applied (and so replaced)
    /// before tokenization and ordered longest first so a phrase is rewritten
    /// before any shorter phrase it contains — "разделить на" before "делить на".
    /// Both lists are sorted deterministically so the generated mirror is stable.
    ///
    /// This is the single source of truth behind the generated `no_std` table in
    /// `src/arithmetic_word_tables.rs`: the evaluator is compiled into the wasm
    /// worker, which cannot reach the seed at runtime, so the table is
    /// materialized at build time by `examples/issue_386_gen_arith_table.rs` and
    /// checked against this builder by the `arithmetic_word_tables_match_seed`
    /// test in `src/calculation.rs`.
    #[must_use]
    pub fn arithmetic_normalization_tables(&self) -> (WordValueTable, WordValueTable) {
        let is_value_surface = |word: &str| !word.chars().any(char::is_alphabetic);
        let mut tokens: WordValueTable = Vec::new();
        let mut phrases: WordValueTable = Vec::new();
        for role in [
            super::roles::ROLE_CARDINAL_NUMBER_WORD,
            super::roles::ROLE_ARITHMETIC_OPERATOR_WORD,
        ] {
            for meaning in self.meanings_with_role(role) {
                // The value surface is the first word form with no alphabetic
                // character: the numeral for a cardinal, the symbol for an
                // operator. Spelled surfaces in every language map onto it.
                // Meanings without one contribute nothing.
                let Some(value) = meaning.words().find(|&word| is_value_surface(word)) else {
                    continue;
                };
                for word in meaning.words() {
                    // Value surfaces (including `value` itself) are targets of
                    // the rewrite, never sources.
                    if is_value_surface(word) {
                        continue;
                    }
                    let entry = (word.to_string(), value.to_string());
                    if word.chars().any(char::is_whitespace) {
                        phrases.push(entry);
                    } else {
                        tokens.push(entry);
                    }
                }
            }
        }
        tokens.sort();
        tokens.dedup();
        // Longest phrase first (by character count), ties broken by the spelled
        // surface; identical spelled surfaces stay adjacent, so `dedup` removes
        // exact duplicates.
        phrases.sort_by(|a, b| {
            b.0.chars()
                .count()
                .cmp(&a.0.chars().count())
                .then_with(|| a.0.cmp(&b.0))
        });
        phrases.dedup();
        (tokens, phrases)
    }

    /// Distinct surface words contributed by every meaning carrying `role`,
    /// limited to the given `languages`, in declaration order.
    ///
    /// Lets a handler partition a role's vocabulary by linguistic typology — for
    /// the translation request-gate, the head-initial English/Russian command
    /// stems (matched clause-initially) versus the head-final Hindi/Chinese stems
    /// (matched anywhere, gated by a target marker) — while keeping every surface
    /// word in the data. Language codes are the legitimate code-resident bridge
    /// (see [`crate::translation::language_markers`]); the words stay in the seed.
    #[must_use]
    pub fn words_for_role_in_languages(&self, role: &str, languages: &[&str]) -> Vec<String> {
        let mut out: Vec<String> = Vec::new();
        for meaning in self.meanings_with_role(role) {
            for lexeme in &meaning.lexemes {
                if !languages.contains(&lexeme.language.as_str()) {
                    continue;
                }
                for word in &lexeme.words {
                    if !out.iter().any(|existing| existing == &word.text) {
                        out.push(word.text.clone());
                    }
                }
            }
        }
        out
    }

    /// The first language in `priority` whose surface word for `role` appears in
    /// `normalized` (raw substring), or `None` when none is present.
    ///
    /// Answers "which language did the user issue this command in?" — the
    /// source-inferencer reads a translation command's verb language as the
    /// language of the prompt itself. Priority order resolves ties (a prompt that
    /// happens to carry stems from several languages takes the first listed).
    /// Language codes are the legitimate code-resident bridge; the surface words
    /// stay in the data.
    ///
    /// Empty surface forms are ignored (they would otherwise make the first
    /// priority language match every prompt), via the guard in
    /// [`Meaning::mentions_in_languages_raw`].
    #[must_use]
    pub fn first_role_language(
        &self,
        role: &str,
        normalized: &str,
        priority: &[&'static str],
    ) -> Option<&'static str> {
        priority.iter().copied().find(|&lang| {
            self.meanings_with_role(role)
                .any(|meaning| meaning.mentions_in_languages_raw(normalized, &[lang]))
        })
    }

    /// The first meaning carrying `role`, in declaration order, that is
    /// evidenced in `normalized` — or `None`. Declaration order therefore
    /// encodes priority (e.g. the first matching delivery mode wins).
    #[must_use]
    pub fn first_role_match(&self, role: &str, normalized: &str) -> Option<&Meaning> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.evidenced_in(normalized))
    }

    /// The first meaning carrying `role`, in declaration order, that mentions one
    /// of its `languages` surface forms in `normalized` as a raw substring — or
    /// `None`.
    ///
    /// The raw-substring, language-restricted sibling of [`first_role_match`](Self::first_role_match).
    /// Declaration order encodes priority, so the feature-capability recogniser
    /// lists its alias meanings in the legacy table order and takes the first hit,
    /// querying the prompt's own language plus English without ever naming a
    /// surface word in code.
    #[must_use]
    pub fn first_role_match_in_languages_raw(
        &self,
        role: &str,
        normalized: &str,
        languages: &[&str],
    ) -> Option<&Meaning> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.mentions_in_languages_raw(normalized, languages))
    }

    /// Does any meaning carrying `role` mention one of its `languages` surface
    /// forms in `normalized` as a raw substring?
    ///
    /// The boolean, language-restricted sibling of [`mentions_role_raw`](Self::mentions_role_raw).
    /// The feature-capability question gate uses it to check each language's
    /// interrogative cues only against prompts detected in that language.
    #[must_use]
    pub fn mentions_role_in_languages_raw(
        &self,
        role: &str,
        normalized: &str,
        languages: &[&str],
    ) -> bool {
        self.meanings_with_role(role)
            .any(|meaning| meaning.mentions_in_languages_raw(normalized, languages))
    }

    /// The single meaning that roots the merged ontology — the one carrying
    /// [`ROLE_ONTOLOGY_ROOT`] (the `link` meaning), or `None` if absent.
    #[must_use]
    pub fn ontology_root(&self) -> Option<&Meaning> {
        self.meanings
            .iter()
            .find(|m| m.has_role(ROLE_ONTOLOGY_ROOT))
    }

    /// Does `slug` reach the ontology root by following `defined_by` edges?
    ///
    /// A depth-first walk of the `defined_by` graph (an explicit stack) that
    /// expands each meaning at most once (cycles are expected). Every meaning
    /// must reach the root, so the data forms one connected ontology rather than
    /// disjoint islands of vocabulary — the universal "everything reduces to a
    /// link" stance.
    ///
    /// Returns `false` when the lexicon has no root. The root's own slug
    /// trivially reaches the root; `defined_by` targets that name no known
    /// meaning are dead ends.
    #[must_use]
    pub fn reaches_root(&self, slug: &str) -> bool {
        let Some(root) = self.ontology_root() else {
            return false;
        };
        let mut seen: BTreeSet<&str> = BTreeSet::new();
        let mut stack: Vec<&str> = vec![slug];
        while let Some(current) = stack.pop() {
            if current == root.slug {
                return true;
            }
            // `insert` returns false for an already-visited slug: skip it so a
            // cycle in `defined_by` cannot loop forever.
            if !seen.insert(current) {
                continue;
            }
            if let Some(meaning) = self.meaning(current) {
                for target in &meaning.defined_by {
                    stack.push(target.as_str());
                }
            }
        }
        false
    }

    /// The type-system sub-root of the ontology — the meaning carrying
    /// [`ROLE_ONTOLOGY_TYPE`] (the `type` meaning), or `None` if absent.
    ///
    /// A distinguished node directly under the [`ontology_root`](Self::ontology_root):
    /// the broadest classifications descend from it, so a reasoner can ask "what
    /// kind of thing is this?" by walking up to the type sub-root.
    #[must_use]
    pub fn ontology_type_root(&self) -> Option<&Meaning> {
        self.meanings
            .iter()
            .find(|m| m.has_role(ROLE_ONTOLOGY_TYPE))
    }

    /// The top-level ontological categories — every meaning carrying
    /// [`ROLE_ONTOLOGY_CATEGORY`] (entity, concept, relation, action, property).
    ///
    /// These are the genera each domain cluster roots in, so generic reasoning
    /// can classify any meaning into a small, fixed set of categories rather
    /// than special-casing each domain.
    pub fn ontology_categories(&self) -> impl Iterator<Item = &Meaning> {
        self.meanings_with_role(ROLE_ONTOLOGY_CATEGORY)
    }
}

/// Does the surface word/phrase `expected` appear in `normalized`?
///
/// CJK surfaces have no inter-word spaces, so they match as substrings.
/// Space-delimited scripts match on whole-token boundaries — equal to the
/// whole string, or bounded by spaces — so a multi-word phrase ("each step")
/// matches as a unit and a short word ("api") never matches inside a longer
/// one ("напиши"). An empty surface never matches.
///
/// The boundary checks work on slices of `normalized`, so no temporary
/// strings are allocated per call.
fn surface_present(normalized: &str, expected: &str) -> bool {
    if expected.is_empty() {
        return false;
    }
    if crate::coding::contains_cjk(expected) {
        return normalized.contains(expected);
    }
    // True when `tail` begins with `expected` and that occurrence is closed by
    // a space or by the end of the string.
    let token_starts = |tail: &str| {
        tail.strip_prefix(expected)
            .is_some_and(|rest| rest.is_empty() || rest.starts_with(' '))
    };
    // A token may start at the very beginning or right after any space.
    // Checking every space (rather than every occurrence of `expected`) keeps
    // overlapping candidates such as "a a" inside "xa a a" from being missed.
    token_starts(normalized)
        || normalized
            .match_indices(' ')
            .any(|(space, _)| token_starts(&normalized[space + 1..]))
}

fn parse_lexicon(text: &str) -> Lexicon {
    let root = parse_lino(text);
    // The lexicon is split across several files (program, units, …), each
    // wrapping its records under a top-level `meanings` node. When the files
    // are concatenated the document therefore holds one-or-more `meanings`
    // containers; collect the records from every one. If none is present the
    // records sit at the document root (kept for robustness).
    let mut meanings = Vec::new();
    let containers: Vec<&LinoNode> = root
        .children
        .iter()
        .filter(|c| c.name == "meanings")
        .collect();
    let sources: Vec<&LinoNode> = if containers.is_empty() {
        vec![&root]
    } else {
        containers
    };
    for container in sources {
        for node in container
            .children
            .iter()
            .filter(|c| c.name == "meaning" || c.name != "meanings")
        {
            meanings.push(parse_meaning(node));
        }
    }
    Lexicon { meanings }
}

/// Builds one [`Meaning`] from its record node.
///
/// The slug comes from [`meaning_slug`]. When the record is not written as a
/// `meaning` node and has a non-empty id, that id contributes the first
/// definition targets. The children then add further `defined_by` targets,
/// roles, the wikidata grounding (the last one wins), and lexemes; unknown
/// children are ignored. A stored `gloss` is used when present, otherwise a
/// description is generated from the slug and its definition targets.
fn parse_meaning(node: &LinoNode) -> Meaning {
    let slug = meaning_slug(node);
    let semantic_facets = parse_semantic_facets(node);

    let mut defined_by: Vec<String> = Vec::new();
    let is_wrapper = node.name == "meaning";
    if !is_wrapper && !node.id.is_empty() {
        defined_by.extend(definition_targets(&node.id));
    }

    let mut roles = Vec::new();
    let mut lexemes = Vec::new();
    let mut wikidata = String::new();
    for entry in &node.children {
        match entry.name.as_str() {
            "role" => roles.push(entry.id.clone()),
            "wikidata" | "grounded-in" => wikidata.clone_from(&entry.id),
            "defined-by" | "defined_by" => defined_by.extend(definition_targets(&entry.id)),
            // A bare `surface` child is a one-word lexeme of its own.
            "surface" => lexemes.push(Lexeme {
                language: entry.find_child_value("language").to_string(),
                words: vec![parse_word_form(&slug, entry)],
            }),
            "lexeme" => {
                let words = entry
                    .children
                    .iter()
                    .filter(|form| matches!(form.name.as_str(), "word" | "surface"))
                    .map(|form| parse_word_form(&slug, form))
                    .collect();
                lexemes.push(Lexeme {
                    language: lexeme_language(entry),
                    words,
                });
            }
            _ => {}
        }
    }

    let gloss = generated_meaning_description(&slug, &defined_by, node.find_child_value("gloss"));
    let wiktionary = node.find_child_value("wiktionary").to_string();
    Meaning {
        gloss,
        slug,
        wiktionary,
        wikidata,
        defined_by,
        roles,
        semantic_facets,
        lexemes,
    }
}

fn definition_targets(raw: &str) -> impl Iterator<Item = String> + '_ {
    raw.split(|character: char| {
        character.is_whitespace() || matches!(character, '(' | ')' | '[' | ']' | ',')
    })
    .filter(|target| !target.is_empty())
    .map(canonical_definition_target)
}

/// Maps a legacy underscore-spelled definition target to its hyphenated slug.
///
/// Only the spellings listed in the table are rewritten; every other target,
/// including ones that contain underscores, is returned unchanged.
fn canonical_definition_target(target: &str) -> String {
    const LEGACY_SPELLINGS: &[(&str, &str)] = &[
        ("reference_action", "reference-action"),
        ("link_action", "link-action"),
        ("any_of_reference", "any-of-reference"),
        ("any_of_link", "any-of-link"),
        ("repeatable_from_zero", "repeatable-from-zero"),
        ("zero_or_more", "zero-or-more"),
        ("point_at", "point-at"),
        ("or_else", "or-else"),
        ("is_identity", "is-identity"),
        ("is_a_kind_of", "is-a-kind-of"),
        ("held_by", "held-by"),
        ("together_with", "together-with"),
        ("self_equation", "self-equation"),
        ("one_symbol_one_meaning", "one-symbol-one-meaning"),
        ("sense_split", "sense-split"),
        ("bank_river", "bank-river"),
        ("bank_money", "bank-money"),
    ];

    let canonical = LEGACY_SPELLINGS
        .iter()
        .find(|(legacy, _)| *legacy == target)
        .map_or(target, |(_, hyphenated)| *hyphenated);
    canonical.to_string()
}

/// Builds the [`WordForm`] for one word/surface node nested under
/// `parent_meaning`.
///
/// Besides the facets written on the node, the result always carries a
/// `notation` facet targeting `word_surface` and a `denotation` facet
/// targeting the parent meaning.
fn parse_word_form(parent_meaning: &str, node: &LinoNode) -> WordForm {
    let text = surface_text(node);
    let description = generated_word_description(parent_meaning, node);
    let action = node.find_child_value("action").to_string();

    // Nesting a literal surface under a meaning already asserts that the
    // surface denotes it. Recording that as facets lets consumers rely on data
    // instead of reading an authored free-text field.
    let mut semantic_facets = parse_semantic_facets(node);
    for (kind, target) in [("notation", "word_surface"), ("denotation", parent_meaning)] {
        ensure_semantic_facet_target(&mut semantic_facets, kind, target);
    }

    WordForm {
        text,
        description,
        action,
        semantic_facets,
    }
}

/// Guarantees that the facet named `kind` lists `target`.
///
/// An existing facet of that kind gains the target unless it already has it;
/// otherwise a new facet holding just `target` is appended.
fn ensure_semantic_facet_target(facets: &mut Vec<SemanticFacet>, kind: &str, target: &str) {
    let slot = facets.iter().position(|facet| facet.kind == kind);
    match slot {
        Some(index) => {
            let meanings = &mut facets[index].meanings;
            let already_listed = meanings.iter().any(|meaning| meaning == target);
            if !already_listed {
                meanings.push(target.to_string());
            }
        }
        None => facets.push(SemanticFacet {
            kind: kind.to_string(),
            meanings: vec![target.to_string()],
        }),
    }
}

/// The closed facet vocabulary. A semantic facet is written either as the
/// native `subject predicate` line (`notation word_surface`) or, for backward
/// compatibility, as a `facet <kind>` wrapper. The two forms are equivalent;
/// `scripts/migrate-empty-facet-fields.rs` rewrites the wrapper into the line
/// form so the seed never carries an empty `word_surface:` colon redefinition.
///
/// Only the native line form is checked against this list:
/// `parse_semantic_facets` treats a child node as a facet line when its name
/// appears here and its id is non-empty. The legacy `facet <kind>` wrapper
/// takes its kind from the wrapper's id and is not validated against the list.
const FACET_KINDS: &[&str] = &[
    "notation",
    "annotation",
    "denotation",
    "connotation",
    "part_of_speech",
    "self-equation",
];

/// Collects the semantic facets written directly under `node`.
///
/// Two spellings are recognised and merged into one facet per kind:
/// the legacy `facet <kind>` wrapper whose children name the targets, and the
/// native `<kind> <target>` line for kinds listed in [`FACET_KINDS`]. A native
/// line with an empty target is ignored.
fn parse_semantic_facets(node: &LinoNode) -> Vec<SemanticFacet> {
    let mut collected: Vec<SemanticFacet> = Vec::new();
    for entry in &node.children {
        let name = entry.name.as_str();
        if name == "facet" {
            // Legacy wrapper: the kind is the wrapper's id, targets are nested.
            let nested = entry.children.iter().filter_map(semantic_facet_target);
            merge_facet_targets(&mut collected, &entry.id, nested);
            continue;
        }
        // Native subject-predicate line: the node name is the kind.
        let is_native_line = !entry.id.is_empty() && FACET_KINDS.contains(&name);
        if is_native_line {
            merge_facet_targets(&mut collected, name, std::iter::once(entry.id.clone()));
        }
    }
    collected
}

/// Adds `targets` to the facet named `kind`, so the wrapper and line spellings
/// of the same facet end up as a single entry.
///
/// The facet is created (with no meanings) when it does not exist yet, even if
/// `targets` turns out to be empty. Targets already listed are skipped.
fn merge_facet_targets(
    facets: &mut Vec<SemanticFacet>,
    kind: &str,
    targets: impl Iterator<Item = String>,
) {
    let index = match facets.iter().position(|facet| facet.kind == kind) {
        Some(found) => found,
        None => {
            facets.push(SemanticFacet {
                kind: kind.to_string(),
                meanings: Vec::new(),
            });
            facets.len() - 1
        }
    };
    let meanings = &mut facets[index].meanings;
    for target in targets {
        if !meanings.contains(&target) {
            meanings.push(target);
        }
    }
}

/// The slug of a meaning record: the id of a `meaning <slug>` node, or the
/// node's own name for any other record.
fn meaning_slug(node: &LinoNode) -> String {
    let source = if node.name == "meaning" {
        &node.id
    } else {
        &node.name
    };
    source.clone()
}

/// The language of a lexeme node: its `language` child value when that is
/// non-empty, otherwise the node's id.
fn lexeme_language(node: &LinoNode) -> String {
    let declared = node.find_child_value("language");
    if !declared.is_empty() {
        return declared.to_string();
    }
    node.id.clone()
}

/// The literal surface text of a word node.
///
/// Sources are tried in order: a non-empty `text` child, then a non-empty
/// `codepoints` child (decoded with `decode_codepoints`), then the node's id.
fn surface_text(node: &LinoNode) -> String {
    let literal = node.find_child_value("text");
    if literal.is_empty() {
        // Backward compatibility with the historical `codepoints <ints>` encoding.
        let encoded = node.find_child_value("codepoints");
        if encoded.is_empty() {
            node.id.clone()
        } else {
            decode_codepoints(encoded)
        }
    } else {
        literal.to_string()
    }
}

/// The description shown for a meaning.
///
/// A non-empty `stored` text is returned as is. Otherwise the description is
/// generated: the bare `slug` when there are no definition targets, or
/// `"<slug> defined by <targets joined by spaces>"` when there are.
fn generated_meaning_description(slug: &str, defined_by: &[String], stored: &str) -> String {
    match (stored.is_empty(), defined_by.is_empty()) {
        (false, _) => stored.to_string(),
        (true, true) => slug.to_string(),
        (true, false) => format!("{} defined by {}", slug, defined_by.join(" ")),
    }
}

/// The description shown for a word form.
///
/// A non-empty `description` child is returned as is. Otherwise the text is
/// generated from the surface: `"<surface> denotes <parent_meaning>"`, or just
/// the parent meaning when the surface is empty.
fn generated_word_description(parent_meaning: &str, node: &LinoNode) -> String {
    let authored = node.find_child_value("description");
    if authored.is_empty() {
        let surface = surface_text(node);
        if surface.is_empty() {
            return parent_meaning.to_string();
        }
        return format!("{surface} denotes {parent_meaning}");
    }
    authored.to_string()
}

/// The target named by one child of a legacy `facet <kind>` wrapper.
///
/// A `meaning`, `target` or `facet-target` child yields its id verbatim, even
/// when that id is empty. Any other child yields its id when non-empty, else
/// its name when non-empty, else `None`.
fn semantic_facet_target(node: &LinoNode) -> Option<String> {
    let is_wrapper = matches!(node.name.as_str(), "meaning" | "target" | "facet-target");
    if is_wrapper || !node.id.is_empty() {
        Some(node.id.clone())
    } else if !node.name.is_empty() {
        Some(node.name.clone())
    } else {
        None
    }
}

/// Returns the process-wide parsed meaning lexicon.
///
/// The embedded data cannot change at runtime, so the files in
/// `MEANING_FILES` are joined with newlines and parsed on first use only;
/// every later call returns the same cached value.
#[must_use]
pub fn lexicon() -> &'static Lexicon {
    static PARSED: OnceLock<Lexicon> = OnceLock::new();
    PARSED.get_or_init(|| {
        let combined = MEANING_FILES.join("\n");
        parse_lexicon(&combined)
    })
}