formal-ai 0.154.0

Formal symbolic AI implementation with OpenAI-compatible APIs
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
//! Shared symbolic engine primitives reused by the CLI, the HTTP server, and
//! the browser worker via Rust→WASM.
//!
//! Issue #133 (R194) wants every non-UI primitive — language detection,
//! prompt normalization, arithmetic evaluation — to live in Rust and be
//! exposed to the browser through the WASM bridge. JavaScript is reserved
//! for UI, transport, and orchestration; data processing happens in this
//! module so the offline trace and the live answer agree byte-for-byte.
//!
//! The module is `no_std` + `alloc` compatible so the WASM worker can
//! `#[path]`-include it without pulling in the standard library. The
//! sibling modules `language` and `arithmetic` are reached through `super::`
//! so the same source file compiles inside both the host crate (where the
//! modules live at `crate::language` / `crate::arithmetic`) and the
//! wasm-worker crate (which mounts them via `#[path]`).

#![allow(clippy::module_name_repetitions)]

use alloc::format;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::convert::TryFrom;

pub use super::arithmetic::evaluate_fallback_formatted;
pub use super::language::{detect as detect_language, Language};

/// Reduce a free-form prompt to lowercase words separated by single spaces.
///
/// This is the Rust counterpart of the legacy JavaScript
/// `normalizePrompt(prompt)` helper from the browser worker; keeping one
/// implementation in Rust avoids the trace drift reported in #133.
///
/// Behaviour:
///   * Characters accepted by `is_unicode_letter_or_digit` (ASCII
///     alphanumerics plus the Latin-extended, Greek, Cyrillic, Devanagari and
///     CJK/kana/Hangul ranges it whitelists) are kept.
///   * Kept characters are lowercased with `char::to_lowercase`, which is
///     locale-agnostic and may expand one character into several.
///   * Any run of other characters turns into exactly one space.
///   * The result never starts or ends with a space.
#[must_use]
pub fn normalize_prompt(prompt: &str) -> String {
    let mut normalized = String::with_capacity(prompt.len());
    // Set once a separator run follows at least one kept character. The space
    // itself is only written when another kept character shows up, so leading
    // and trailing separators never reach the output.
    let mut gap_pending = false;
    for ch in prompt.chars() {
        if !is_unicode_letter_or_digit(ch) {
            gap_pending = !normalized.is_empty();
            continue;
        }
        if gap_pending {
            normalized.push(' ');
            gap_pending = false;
        }
        normalized.extend(ch.to_lowercase());
    }
    normalized
}

/// Split a prompt into its normalized tokens.
///
/// The prompt is first passed through [`normalize_prompt`], then cut on single
/// spaces; empty pieces are dropped, so an empty or punctuation-only prompt
/// yields an empty vector. Intended for the JS worker's intent matchers so the
/// JS and Rust paths share one tokenization rule.
#[must_use]
pub fn tokenize_prompt(prompt: &str) -> Vec<String> {
    let normalized = normalize_prompt(prompt);
    let mut tokens = Vec::new();
    for piece in normalized.split(' ') {
        if !piece.is_empty() {
            tokens.push(String::from(piece));
        }
    }
    tokens
}

/// Evaluate an arithmetic expression and hand back the rendered result.
///
/// Evaluation is delegated to `evaluate_fallback_formatted`; this wrapper only
/// flattens the error into a `String` so the value can cross the WASM bridge.
/// Word-form operators (`plus`, `minus`, `плюс`, `умножить на`, …) are
/// accepted like in the legacy JS path, so "two plus two" evaluates to "4".
///
/// # Errors
///
/// Returns `Err` with the evaluator's error rendered through `to_string()`
/// (its `Display` text) when the expression cannot be evaluated.
pub fn evaluate_arithmetic_expression(expression: &str) -> Result<String, String> {
    match evaluate_fallback_formatted(expression) {
        Ok(rendered) => Ok(rendered),
        Err(reason) => Err(reason.to_string()),
    }
}

/// Build a stable identifier of the form `{prefix}_{hash}`.
///
/// `hash` is the 64-bit FNV-1a digest of the UTF-8 bytes of `text`, printed as
/// 16 zero-padded lowercase hex digits. Only `text` is hashed; `prefix` is
/// copied verbatim.
///
/// JavaScript strings are UTF-16 internally, so the browser worker must call
/// this WASM export (or hash explicit UTF-8 bytes itself): hashing UTF-16 code
/// units yields different ids for non-ASCII text and breaks parity for
/// multilingual prompts.
#[must_use]
pub fn stable_id(prefix: &str, text: &str) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;

    // FNV-1a: XOR the byte in first, then multiply (wrapping) by the prime.
    let digest = text.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
        (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });

    format!("{prefix}_{digest:016x}")
}

/// English pool of "unknown answer" openers. It is also the fallback pool that
/// `unknown_openers_for` returns for any language code it does not list.
/// Entry order is significant: `select_unknown_opener` indexes into the pool.
const UNKNOWN_OPENERS_EN: &[&str] = &[
    "I don't know how to answer that yet.",
    "I didn't understand you.",
    "I'm not sure how to respond to that yet.",
    "I haven't learned to answer that yet.",
    "That one is new to me.",
];
/// Russian pool, returned by `unknown_openers_for` for the code `"ru"`.
const UNKNOWN_OPENERS_RU: &[&str] = &[
    "Я пока не знаю, как ответить на это.",
    "Я тебя не понял.",
    "Я не уверен, как на это ответить.",
    "Я ещё не научился отвечать на это.",
    "Это для меня новое.",
];
/// Hindi pool, returned by `unknown_openers_for` for the code `"hi"`.
const UNKNOWN_OPENERS_HI: &[&str] = &[
    "मुझे अभी इसका उत्तर देना नहीं आता।",
    "मैं समझ नहीं पाया।",
    "मुझे यकीन नहीं है कि कैसे उत्तर दूँ।",
    "मैंने अभी तक यह उत्तर देना नहीं सीखा।",
    "यह मेरे लिए नया है।",
];
/// Chinese pool, returned by `unknown_openers_for` for the code `"zh"`.
const UNKNOWN_OPENERS_ZH: &[&str] = &[
    "我还不知道如何回答这个问题。",
    "我不太明白你说的意思。",
    "我不确定该如何回答。",
    "我还没有学会回答这个问题。",
    "这对我来说是新的。",
];

/// Return the pool of "unknown answer" openers for a language code.
///
/// The codes `"ru"`, `"hi"` and `"zh"` select their dedicated pools; every
/// other value (including the empty string) falls back to the English pool.
/// The comparison is exact and case-sensitive.
#[must_use]
pub fn unknown_openers_for(language: &str) -> &'static [&'static str] {
    const LOCALIZED_POOLS: &[(&str, &[&str])] = &[
        ("ru", UNKNOWN_OPENERS_RU),
        ("hi", UNKNOWN_OPENERS_HI),
        ("zh", UNKNOWN_OPENERS_ZH),
    ];

    LOCALIZED_POOLS
        .iter()
        .find(|(code, _)| *code == language)
        .map_or(UNKNOWN_OPENERS_EN, |(_, pool)| *pool)
}

/// Choose the unknown-answer opener for a prompt/language pair.
///
/// The choice is deterministic: the trimmed prompt is hashed with
/// `stable_id("unknown_opener", …)` and the hash, taken modulo the pool
/// length, indexes into the pool returned by `unknown_openers_for(language)`.
/// A prompt that is empty after trimming always gets the first opener.
#[must_use]
pub fn select_unknown_opener(prompt: &str, language: &str) -> &'static str {
    let openers = unknown_openers_for(language);
    debug_assert!(!openers.is_empty(), "unknown opener pool must be non-empty");

    let key = prompt.trim();
    if key.is_empty() {
        return openers[0];
    }

    // The id looks like `unknown_opener_<16 hex digits>`; everything after the
    // last underscore is the hash. Going through the rendered id keeps this
    // selection tied to exactly what `stable_id` emits.
    let id = stable_id("unknown_opener", key);
    let digest_hex = id.rfind('_').map_or(id.as_str(), |pos| &id[pos + 1..]);
    let digest = u64::from_str_radix(digest_hex, 16).unwrap_or(0);

    let slot = digest % (openers.len() as u64);
    openers[usize::try_from(slot).unwrap_or(0)]
}

/// Match a prompt against intent-route fields using the browser/Rust route
/// semantics: exact keyword/phrase match, token match, or all-token combo.
#[must_use]
pub fn matches_intent_route_parts(
    normalized_prompt: &str,
    raw_prompt: &str,
    keywords: &[String],
    phrases: &[String],
    tokens: &[String],
    combos: &[Vec<String>],
) -> bool {
    if keywords
        .iter()
        .any(|keyword| normalized_prompt == keyword || raw_prompt == keyword)
    {
        return true;
    }
    if phrases
        .iter()
        .any(|phrase| normalized_prompt == phrase || raw_prompt == phrase)
    {
        return true;
    }
    if tokens
        .iter()
        .any(|token| contains_route_token(normalized_prompt, token))
    {
        return true;
    }
    combos.iter().any(|combo| {
        !combo.is_empty()
            && combo
                .iter()
                .all(|token| contains_route_token(normalized_prompt, token))
    })
}

/// Decode the JS→WASM route-matcher payload and report whether it matches.
///
/// Format:
/// `normalized\nraw\nK\tkeyword\nP\tphrase\nT\ttoken\nC\ttoken1\ttoken2...`
///
/// The first line is the already-normalized prompt and the second the raw
/// prompt, which is canonicalised with `normalize_route_raw_prompt`. Every
/// following line is a tab-separated record whose first field is a tag:
///   * `K`, `P`, `T` — the second field is a keyword, phrase or token; any
///     further fields are ignored;
///   * `C` — all non-empty remaining fields form one combo; a combo with no
///     fields is dropped;
///   * anything else is skipped.
///
/// The collected parts are evaluated by `matches_intent_route_parts`.
#[must_use]
pub fn matches_intent_route_payload(payload: &str) -> bool {
    let mut rows = payload.lines();
    let normalized = rows.next().unwrap_or("");
    let raw = normalize_route_raw_prompt(rows.next().unwrap_or(""));

    let mut keywords: Vec<String> = Vec::new();
    let mut phrases: Vec<String> = Vec::new();
    let mut tokens: Vec<String> = Vec::new();
    let mut combos: Vec<Vec<String>> = Vec::new();

    for row in rows {
        // A row without a tab carries a tag but no values: nothing to record.
        let Some((tag, rest)) = row.split_once('\t') else {
            continue;
        };
        let mut values = rest.split('\t');
        match tag {
            "K" => keywords.extend(values.next().map(String::from)),
            "P" => phrases.extend(values.next().map(String::from)),
            "T" => tokens.extend(values.next().map(String::from)),
            "C" => {
                let combo: Vec<String> = values
                    .filter(|value| !value.is_empty())
                    .map(String::from)
                    .collect();
                if !combo.is_empty() {
                    combos.push(combo);
                }
            }
            _ => {}
        }
    }

    matches_intent_route_parts(normalized, &raw, &keywords, &phrases, &tokens, &combos)
}

/// Report whether `expected` occurs as a whole whitespace-separated word of
/// `normalized_prompt`. Substrings of a word do not count, and an empty
/// `expected` never matches because splitting yields no empty words.
fn contains_route_token(normalized_prompt: &str, expected: &str) -> bool {
    for word in normalized_prompt.split_whitespace() {
        if word == expected {
            return true;
        }
    }
    false
}

/// Canonicalise the raw prompt line of a route payload for exact matching.
///
/// Steps, in order:
///   1. lowercase every character with `char::to_lowercase`;
///   2. trim surrounding whitespace;
///   3. strip any run of trailing sentence punctuation;
///   4. trim again, because stripping can expose whitespace ("wait ... ").
///
/// Inner punctuation such as the apostrophe in "what's" is preserved, which
/// lets route phrases written in raw form keep matching.
fn normalize_route_raw_prompt(prompt: &str) -> String {
    // Trailing punctuation that is removed. U+FF1F is the fullwidth question
    // mark used in CJK text; it is written as an escape so it cannot be
    // confused with (or silently lost next to) the ASCII `?`.
    // NOTE(review): the second entry was an empty, non-compiling char literal
    // in the extracted source; U+FF1F is the reconstruction — confirm against
    // the repository.
    const TRAILING_PUNCTUATION: [char; 7] = ['?', '\u{FF1F}', '.', '!', ',', ';', ':'];

    let lowered: String = prompt.chars().flat_map(char::to_lowercase).collect();
    lowered
        .trim()
        .trim_end_matches(TRAILING_PUNCTUATION)
        .trim()
        .to_string()
}

/// Decide whether `ch` counts as part of a word for prompt normalization.
///
/// ASCII letters and digits always qualify. Beyond ASCII the decision is made
/// from a whitelist of Unicode blocks (Latin extended, Greek, Cyrillic,
/// Devanagari, kana, Bopomofo, CJK ideographs, Hangul syllables), which keeps
/// combining signs such as the Devanagari virama attached to their word.
///
/// A handful of code points inside those blocks are punctuation or math
/// symbols rather than letters; they are rejected explicitly so they act as
/// separators like any other punctuation (for example, a Hindi sentence
/// ending in a danda no longer keeps the danda glued to its last word).
fn is_unicode_letter_or_digit(ch: char) -> bool {
    match ch {
        _ if ch.is_ascii_alphanumeric() => true,

        // Non-letters that live inside the whitelisted blocks below.
        '\u{00D7}' // × multiplication sign
        | '\u{00F7}' // ÷ division sign
        | '\u{037E}' // Greek question mark
        | '\u{0387}' // Greek ano teleia
        | '\u{0964}' // Devanagari danda
        | '\u{0965}' // Devanagari double danda
        | '\u{0970}' // Devanagari abbreviation sign
        | '\u{30A0}' // Katakana-Hiragana double hyphen
        | '\u{30FB}' // Katakana middle dot
        => false,

        // Latin-1 letters and Latin Extended-A/B (Á, ñ, ü, …), then Greek.
        '\u{00C0}'..='\u{024F}' | '\u{0370}'..='\u{03FF}' => true,
        // Cyrillic block (basic + supplement).
        '\u{0400}'..='\u{052F}' => true,
        // Devanagari block.
        '\u{0900}'..='\u{097F}' => true,
        // Hiragana/Katakana, Bopomofo, CJK ideographs (Extension A, Unified,
        // Compatibility) and Hangul syllables.
        '\u{3040}'..='\u{30FF}'
        | '\u{3100}'..='\u{312F}'
        | '\u{3400}'..='\u{4DBF}'
        | '\u{4E00}'..='\u{9FFF}'
        | '\u{AC00}'..='\u{D7AF}'
        | '\u{F900}'..='\u{FAFF}' => true,

        _ => false,
    }
}

/// Unit tests for the shared engine primitives: normalization, tokenization,
/// language detection, arithmetic, stable ids, opener selection and intent
/// route matching.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    #[test]
    fn normalize_collapses_punctuation_to_single_space() {
        assert_eq!(normalize_prompt("Hello,  world!"), "hello world");
        assert_eq!(normalize_prompt("  what's 2+2?"), "what s 2 2");
    }

    #[test]
    fn normalize_keeps_cjk_codepoints() {
        // Every ideograph of the input must survive normalization.
        let out = normalize_prompt("你好,世界!");
        assert!(out.contains('你'));
        assert!(out.contains('好'));
        assert!(out.contains('世'));
        assert!(out.contains('界'));
    }

    #[test]
    fn normalize_handles_devanagari() {
        // The first letter of each word survives; the comma does not.
        let out = normalize_prompt("नमस्ते, दुनिया!");
        assert!(out.contains('न'));
        assert!(out.contains('द'));
        assert!(!out.contains(','));
    }

    #[test]
    fn normalize_lowercases_cyrillic() {
        // `char::to_lowercase` handles Cyrillic correctly.
        let out = normalize_prompt("ПРИВЕТ, МИР!");
        assert!(out.contains("привет"));
        assert!(out.contains("мир"));
    }

    #[test]
    fn tokenize_returns_individual_words() {
        assert_eq!(
            tokenize_prompt("  Hello,  world  again!"),
            vec![
                "hello".to_string(),
                "world".to_string(),
                "again".to_string()
            ],
        );
    }

    #[test]
    fn detect_language_matches_existing_rules() {
        assert_eq!(detect_language("Hello"), Language::English);
        assert_eq!(detect_language("Привет"), Language::Russian);
        assert_eq!(detect_language("नमस्ते"), Language::Hindi);
        assert_eq!(detect_language("你好"), Language::Chinese);
    }

    #[test]
    fn evaluate_arithmetic_handles_word_operators() {
        assert_eq!(
            evaluate_arithmetic_expression("two plus two"),
            Ok("4".to_string())
        );
        assert_eq!(
            evaluate_arithmetic_expression("3 multiplied by 4"),
            Ok("12".to_string())
        );
    }

    #[test]
    fn evaluate_arithmetic_handles_percent_of_word_problems() {
        // Issue #334 step 2: the WASM worker must evaluate the reduced
        // "55 * 8% of 500" word problem to 2200 (8% of 500 = 40, 55 * 40).
        assert_eq!(
            evaluate_arithmetic_expression("55 * 8% of 500"),
            Ok("2200".to_string())
        );
        assert_eq!(
            evaluate_arithmetic_expression("8% of 500"),
            Ok("40".to_string())
        );
        // A bare `%` not followed by `of` still means modulo.
        assert_eq!(
            evaluate_arithmetic_expression("10 % 3"),
            Ok("1".to_string())
        );
    }

    #[test]
    fn evaluate_arithmetic_returns_localizable_errors() {
        assert!(evaluate_arithmetic_expression("1 / 0").is_err());
        assert!(evaluate_arithmetic_expression("").is_err());
    }

    #[test]
    fn stable_id_hashes_utf8_bytes_for_non_ascii_prompts() {
        assert_eq!(
            stable_id("unknown_opener", "неведомослово"),
            "unknown_opener_3f0af77ee5085861"
        );
    }

    #[test]
    fn unknown_opener_selection_matches_native_solver_for_russian() {
        assert_eq!(
            select_unknown_opener("неведомослово", "ru"),
            "Я ещё не научился отвечать на это."
        );
    }

    #[test]
    fn unknown_opener_uses_first_english_entry_for_blank_prompt() {
        // An unlisted language code falls back to the English pool, and a
        // prompt that is blank after trimming always picks the first entry.
        assert_eq!(
            select_unknown_opener("   ", "xx"),
            "I don't know how to answer that yet."
        );
    }

    #[test]
    fn route_parts_match_keywords_tokens_and_combos() {
        let keywords = vec!["hello".to_string()];
        let phrases = vec!["what s your name".to_string()];
        let tokens = vec!["greet".to_string()];
        let combos = vec![vec!["who".to_string(), "you".to_string()]];

        // Exact keyword match.
        assert!(matches_intent_route_parts(
            "hello", "hello", &keywords, &phrases, &tokens, &combos
        ));
        // Single-token match.
        assert!(matches_intent_route_parts(
            "please greet",
            "please greet",
            &keywords,
            &phrases,
            &tokens,
            &combos
        ));
        // All-token combo match.
        assert!(matches_intent_route_parts(
            "who are you",
            "who are you",
            &keywords,
            &phrases,
            &tokens,
            &combos
        ));
        // Nothing matches.
        assert!(!matches_intent_route_parts(
            "world", "world", &keywords, &phrases, &tokens, &combos
        ));
    }

    #[test]
    fn route_payload_parser_preserves_raw_phrase_compatibility() {
        // The phrase is written in raw form (with the apostrophe), so it can
        // only match through the canonicalised raw prompt line.
        let payload = "what s your name\nWhat's your name?\nP\twhat's your name";
        assert!(matches_intent_route_payload(payload));
    }
}