// formal-ai 0.188.0 — Formal symbolic AI implementation with OpenAI-compatible APIs

//! Natural-language web-search intent recognition.
//!
//! Every surface cue this recogniser reasons about — the explicit command
//! prefixes, the action verbs, the source/signal nouns, the topic connectives,
//! the query noise, the follow-up instruction verbs and clause boundaries, and
//! the research/enumeration vocabulary — is sourced from the language-independent
//! meaning lexicon (`data/seed/meanings-web-search*.lino`,
//! `meanings-web-research.lino`, `meanings-web-followup.lino`). The handler
//! references those meanings by their semantic *role* (e.g.
//! [`ROLE_WEB_SEARCH_EXPLICIT_PREFIX`], [`ROLE_FOLLOWUP_INSTRUCTION_VERB`]) and
//! by the *slot* each word form occupies (prefix / suffix / bare), never by raw
//! words baked into the code. Adding a language or a synonym is therefore a pure
//! data edit: drop a `word`/`description` into the relevant meaning and this
//! handler reasons about it automatically. The follow-up truncation in
//! particular is a universal boundary algorithm — a follow-up clause is detected
//! structurally (an instruction verb immediately preceded by sentence
//! punctuation or a chained clause-continuation marker), not by memorising the
//! handful of `". compare"`-style fragments the prompts happen to use.

use std::sync::OnceLock;

use crate::coding::contains_cjk;
use crate::engine::normalize_prompt;
use crate::seed::{
    self, Slot, WordForm, ROLE_CLAUSE_CONTINUATION_MARKER, ROLE_ENUMERATION_CONSTRAINT,
    ROLE_ENUMERATION_REQUEST_OPENER, ROLE_FOLLOWUP_INSTRUCTION_VERB,
    ROLE_RESEARCH_EVALUATION_DOMAIN, ROLE_RESEARCH_EVIDENCE_DOMAIN, ROLE_RESEARCH_QUESTION_OPENER,
    ROLE_RESEARCH_SUPERLATIVE_MODIFIER, ROLE_WEB_SEARCH_ACTION, ROLE_WEB_SEARCH_EXPLICIT_PREFIX,
    ROLE_WEB_SEARCH_IMPERATIVE_LEAD, ROLE_WEB_SEARCH_NEWS_RECENCY, ROLE_WEB_SEARCH_NEWS_SUBJECT,
    ROLE_WEB_SEARCH_QUERY_LEADING_NOISE, ROLE_WEB_SEARCH_QUERY_TRAILING_NOISE,
    ROLE_WEB_SEARCH_SIGNAL, ROLE_WEB_SEARCH_SOURCE_ONLY, ROLE_WEB_SEARCH_STRONG_ACTION,
    ROLE_WEB_SEARCH_TOPIC_MARKER,
};

use super::web_requests::normalize_url_candidate;

/// Which recognition path classified a prompt as a web-search request.
///
/// There is one variant per extraction strategy attempted by
/// [`extract_web_search_request`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) enum WebSearchQueryKind {
    /// The prompt began with one of the explicit search-command prefixes.
    ExplicitPrefix,
    /// A search action verb plus a topic/lead marker yielded the query.
    SemanticAction,
    /// The prompt combined a news subject marker with a recency marker.
    LatestNews,
    /// A research-style question opener with a researchable modifier/domain.
    ImplicitResearchQuestion,
    /// An enumeration opener followed by a constrained, multi-word query.
    EnumerationResearchRequest,
}

impl WebSearchQueryKind {
    /// The fixed `snake_case` label of this kind.
    pub(super) const fn as_str(self) -> &'static str {
        match self {
            Self::ExplicitPrefix => "explicit_prefix",
            Self::SemanticAction => "semantic_action",
            Self::LatestNews => "latest_news",
            Self::ImplicitResearchQuestion => "implicit_research_question",
            Self::EnumerationResearchRequest => "enumeration_research_request",
        }
    }
}

/// A recognised web-search request: the cleaned query text together with the
/// recognition path that produced it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(super) struct WebSearchRequest {
    /// The search query after cleaning and validation.
    pub(super) query: String,
    /// Which extraction strategy matched the prompt.
    pub(super) kind: WebSearchQueryKind,
}

pub(super) fn extract_web_search_request(
    prompt: &str,
    normalized: &str,
) -> Option<WebSearchRequest> {
    let normalized_words = normalize_prompt(prompt);
    if normalized_words.starts_with("search conversations ")
        || normalized_words.starts_with("search my conversations ")
        || normalized_words.starts_with("search my chats ")
        || is_personal_fact_filter_request(&normalized_words)
    {
        return None;
    }
    // Try the punctuation-preserving `normalized` first so the follow-up
    // truncation downstream can see sentence boundaries (`normalize_prompt`
    // strips punctuation, which would hide the period in
    // "… Thomas Edison. Compare …"); fall back to the punctuation-stripped,
    // whitespace-collapsed `normalized_words` for prompts whose leading layout
    // only `normalize_prompt` cleans up.
    for &prefix in &markers().explicit_prefixes {
        if let Some(query) = normalized.strip_prefix(prefix) {
            if let Some(query) = valid_search_query(query) {
                return Some(WebSearchRequest {
                    query,
                    kind: WebSearchQueryKind::ExplicitPrefix,
                });
            }
        }
        if let Some(query) = normalized_words.strip_prefix(prefix) {
            if let Some(query) = valid_search_query(query) {
                return Some(WebSearchRequest {
                    query,
                    kind: WebSearchQueryKind::ExplicitPrefix,
                });
            }
        }
    }
    if let Some(query) = extract_semantic_web_search_query(&normalized_words) {
        return Some(WebSearchRequest {
            query,
            kind: WebSearchQueryKind::SemanticAction,
        });
    }
    if let Some(query) = extract_latest_news_search_request(&normalized_words) {
        return Some(WebSearchRequest {
            query,
            kind: WebSearchQueryKind::LatestNews,
        });
    }
    if let Some(query) = extract_enumeration_research_request(&normalized_words) {
        return Some(WebSearchRequest {
            query,
            kind: WebSearchQueryKind::EnumerationResearchRequest,
        });
    }
    extract_implicit_research_question(&normalized_words).map(|query| WebSearchRequest {
        query,
        kind: WebSearchQueryKind::ImplicitResearchQuestion,
    })
}

/// Whether the normalised prompt asks about the user's own contributed facts
/// (any of a fixed set of phrases appears anywhere in it).
fn is_personal_fact_filter_request(normalized: &str) -> bool {
    const PERSONAL_FACT_PHRASES: [&str; 4] = [
        "facts i have contributed",
        "facts ive contributed",
        "facts i contributed",
        "my facts",
    ];
    PERSONAL_FACT_PHRASES
        .iter()
        .any(|phrase| normalized.contains(*phrase))
}

/// Tidy a raw query: trim surrounding whitespace, strip wrapper punctuation
/// from both ends, strip trailing sentence punctuation, then collapse every
/// run of whitespace to a single space.
fn clean_search_query(value: &str) -> String {
    let unwrapped = value.trim().trim_matches(is_url_wrapper_punctuation);
    let core = unwrapped.trim_end_matches(is_url_trailing_punctuation);

    let mut cleaned = String::with_capacity(core.len());
    for word in core.split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}

/// Whether `character` is a bracket or quote mark that can wrap a query or
/// URL: round/square/curly/angle brackets, guillemets, and the ASCII quotes
/// and backtick.
const fn is_url_wrapper_punctuation(character: char) -> bool {
    matches!(
        character,
        '(' | ')' | '[' | ']' | '{' | '}' | '<' | '>' | '«' | '»' | '"' | '\'' | '`'
    )
}

/// Whether `character` is sentence punctuation that may trail a query or URL
/// and should be stripped from its end.
///
/// NOTE(review): the final alternative was an empty (invalid) character
/// literal in the copy this was restored from; a CJK character appears to have
/// been lost. It is restored here as U+3002 IDEOGRAPHIC FULL STOP, written as
/// an escape so it survives encoding round-trips — confirm against upstream.
const fn is_url_trailing_punctuation(character: char) -> bool {
    matches!(
        character,
        '.' | ',' | '!' | '?' | ';' | ':' | '\u{3002}'
    )
}

/// Sentence-ending punctuation that can introduce a follow-up instruction
/// clause. Universal across the supported languages — the ASCII marks plus the
/// fullwidth/ideographic forms a CJK prompt would use.
///
/// NOTE(review): the five non-ASCII alternatives were empty (invalid)
/// character literals in the copy this was restored from. They are restored as
/// the fullwidth/ideographic counterparts of the five ASCII marks, written as
/// escapes so they survive encoding round-trips — confirm against upstream.
const fn is_sentence_boundary(character: char) -> bool {
    matches!(
        character,
        '.' | '?' | '!' | ';' | ':'
            // U+3002 ideographic full stop, then fullwidth ? ! ; :
            | '\u{3002}'
            | '\u{FF1F}'
            | '\u{FF01}'
            | '\u{FF1B}'
            | '\u{FF1A}'
    )
}

/// Every surface cue the web-search recogniser reasons about, projected out of
/// the meaning lexicon by role and slot. Built once and cached: because
/// [`seed::lexicon`] returns a `'static` reference, the projected literals are
/// themselves `'static` and need no allocation beyond the backing vectors.
///
/// Each field holds the literals of one role, filtered to one slot kind
/// (prefix / suffix / bare); see [`markers`] for the exact role behind each
/// field. `source_only` is the one owned field: its entries are trimmed,
/// lowercased keys rather than borrowed lexicon text.
struct WebSearchMarkers {
    /// Lead-ins of an explicit "search X for …" command (prefix slot).
    explicit_prefixes: Vec<&'static str>,
    /// Bare search verbs that signal an action is requested.
    action_markers: Vec<&'static str>,
    /// The subset of action verbs strong enough to stand without a source noun.
    strong_action_markers: Vec<&'static str>,
    /// Source/topic nouns that corroborate a weak action verb.
    signal_markers: Vec<&'static str>,
    /// Topic connectives whose object follows them ("about …", "о …").
    topic_after_markers: Vec<&'static str>,
    /// Topic connectives whose object precedes them ("… के बारे में").
    topic_before_markers: Vec<&'static str>,
    /// Imperative search leads whose query follows them ("search for …").
    imperative_lead_markers: Vec<&'static str>,
    /// Politeness / determiner noise stripped from the front of a query.
    leading_noise: Vec<&'static str>,
    /// Source/medium noise stripped from the end of a query.
    trailing_noise: Vec<&'static str>,
    /// Bare source words that are not, on their own, a valid query. Stored as
    /// trimmed lowercase keys and compared against the lowercased query.
    source_only: Vec<String>,
    /// News/headline subject markers for bare latest-news requests.
    news_subject_markers: Vec<&'static str>,
    /// Freshness markers that pair with news/headline subjects.
    news_recency_markers: Vec<&'static str>,
    /// Verbs that open a follow-up instruction clause ("compare", "summarize").
    followup_verbs: Vec<&'static str>,
    /// Conjunctions/adverbs that, like punctuation, mark a clause boundary.
    continuation_markers: Vec<&'static str>,
    /// Question openers of an implicit research request ("what is the …").
    research_question_prefixes: Vec<&'static str>,
    /// Superlative/recency modifiers that make a question researchable.
    research_modifiers: Vec<&'static str>,
    /// Evidence nouns (dataset, benchmark, paper …) of a research question.
    research_evidence_domains: Vec<&'static str>,
    /// Evaluation nouns (validation, quality, comparison …) of a question.
    research_evaluation_domains: Vec<&'static str>,
    /// Openers of an enumeration research request ("list all …").
    enumeration_prefixes: Vec<&'static str>,
    /// Constraint connectives that make an enumeration researchable.
    enumeration_constraint_markers: Vec<&'static str>,
}

/// The process-wide marker projection, built from the meaning lexicon on first
/// use and shared by reference afterwards.
fn markers() -> &'static WebSearchMarkers {
    static PROJECTION: OnceLock<WebSearchMarkers> = OnceLock::new();

    /// Project every role the recogniser uses into its literal list.
    fn project() -> WebSearchMarkers {
        WebSearchMarkers {
            explicit_prefixes: prefix_literals(ROLE_WEB_SEARCH_EXPLICIT_PREFIX),
            action_markers: bare_literals(ROLE_WEB_SEARCH_ACTION),
            strong_action_markers: bare_literals(ROLE_WEB_SEARCH_STRONG_ACTION),
            signal_markers: bare_literals(ROLE_WEB_SEARCH_SIGNAL),
            // One role, two slots: objects after the marker vs. before it.
            topic_after_markers: prefix_literals(ROLE_WEB_SEARCH_TOPIC_MARKER),
            topic_before_markers: suffix_literals(ROLE_WEB_SEARCH_TOPIC_MARKER),
            imperative_lead_markers: prefix_literals(ROLE_WEB_SEARCH_IMPERATIVE_LEAD),
            leading_noise: prefix_literals(ROLE_WEB_SEARCH_QUERY_LEADING_NOISE),
            trailing_noise: suffix_literals(ROLE_WEB_SEARCH_QUERY_TRAILING_NOISE),
            source_only: source_literals(ROLE_WEB_SEARCH_SOURCE_ONLY),
            news_subject_markers: bare_literals(ROLE_WEB_SEARCH_NEWS_SUBJECT),
            news_recency_markers: bare_literals(ROLE_WEB_SEARCH_NEWS_RECENCY),
            followup_verbs: bare_literals(ROLE_FOLLOWUP_INSTRUCTION_VERB),
            continuation_markers: bare_literals(ROLE_CLAUSE_CONTINUATION_MARKER),
            research_question_prefixes: prefix_literals(ROLE_RESEARCH_QUESTION_OPENER),
            research_modifiers: bare_literals(ROLE_RESEARCH_SUPERLATIVE_MODIFIER),
            research_evidence_domains: bare_literals(ROLE_RESEARCH_EVIDENCE_DOMAIN),
            research_evaluation_domains: bare_literals(ROLE_RESEARCH_EVALUATION_DOMAIN),
            enumeration_prefixes: prefix_literals(ROLE_ENUMERATION_REQUEST_OPENER),
            enumeration_constraint_markers: bare_literals(ROLE_ENUMERATION_CONSTRAINT),
        }
    }

    PROJECTION.get_or_init(project)
}

/// For every prefix-slot word form of `role`, the literal text that precedes
/// the `…` slot, kept in lexicon declaration order.
fn prefix_literals(role: &str) -> Vec<&'static str> {
    let mut lead_ins = Vec::new();
    for form in seed::lexicon().role_word_forms(role) {
        if form.slot() == Slot::Prefix {
            lead_ins.push(form.before_slot());
        }
    }
    lead_ins
}

/// For every suffix-slot word form of `role`, the literal text that follows
/// the `…` slot, kept in lexicon declaration order.
fn suffix_literals(role: &str) -> Vec<&'static str> {
    let mut tails = Vec::new();
    for form in seed::lexicon().role_word_forms(role) {
        if form.slot() == Slot::Suffix {
            tails.push(form.after_slot());
        }
    }
    tails
}

/// The surface text of each bare-slot word form of `role`, in lexicon
/// declaration order. Roles attach to a meaning as a whole, so a role's forms
/// can include prefix/suffix surfaces too; only the bare detection tokens are
/// kept here.
fn bare_literals(role: &str) -> Vec<&'static str> {
    let mut tokens = Vec::new();
    for form in seed::lexicon().role_word_forms(role) {
        if form.slot() == Slot::Bare {
            tokens.push(form.text.as_str());
        }
    }
    tokens
}

/// The surface words the lexicon lists for `role`, each turned into a trimmed,
/// lowercased key suitable for equality comparison against a cleaned query.
fn source_literals(role: &str) -> Vec<String> {
    let words = seed::lexicon().words_for_role(role);
    words
        .iter()
        .map(|word| word.trim().to_lowercase())
        .collect()
}

/// Extract the query of a free-form search request ("find articles about X").
///
/// Requires an action marker, and either a strong action marker or a
/// corroborating signal marker. The query is then taken, in priority order,
/// from the text after a topic marker, the text before a trailing topic
/// marker, or the text after an imperative lead; the first candidate that
/// validates wins.
fn extract_semantic_web_search_query(normalized: &str) -> Option<String> {
    let lexicon = markers();
    let mentions = |set: &[&str]| contains_any_search_marker(normalized, set);

    if !mentions(&lexicon.action_markers) {
        return None;
    }
    if !(mentions(&lexicon.strong_action_markers) || mentions(&lexicon.signal_markers)) {
        return None;
    }

    // Query is whatever follows the first occurrence of `marker`.
    let query_after = |marker: &str| -> Option<String> {
        let index = normalized.find(marker)?;
        valid_search_query(&normalized[index + marker.len()..])
    };
    // Query is whatever precedes the first occurrence of `marker`.
    let query_before = |marker: &str| -> Option<String> {
        let index = normalized.find(marker)?;
        valid_search_query(&normalized[..index])
    };

    lexicon
        .topic_after_markers
        .iter()
        .copied()
        .find_map(&query_after)
        .or_else(|| {
            lexicon
                .topic_before_markers
                .iter()
                .copied()
                .find_map(&query_before)
        })
        .or_else(|| {
            lexicon
                .imperative_lead_markers
                .iter()
                .copied()
                .find_map(&query_after)
        })
}

/// Treat the whole prompt as a news query when it mentions both a news subject
/// marker and a recency marker; otherwise `None`.
fn extract_latest_news_search_request(normalized: &str) -> Option<String> {
    let lexicon = markers();
    let is_fresh_news = contains_any_search_marker(normalized, &lexicon.news_subject_markers)
        && contains_any_search_marker(normalized, &lexicon.news_recency_markers);
    if is_fresh_news {
        valid_news_search_query(normalized)
    } else {
        None
    }
}

/// Recognise a question that implicitly calls for research.
///
/// The prompt must start with a research question opener and contain either a
/// research modifier, or both an evidence-domain and an evaluation-domain
/// marker. The opener is stripped and the remainder validated as the query.
fn extract_implicit_research_question(normalized: &str) -> Option<String> {
    let lexicon = markers();
    if !starts_with_any(normalized, &lexicon.research_question_prefixes) {
        return None;
    }

    // Pad with spaces so space-delimited markers also match at either end.
    let padded = format!(" {normalized} ");
    let mentions = |set: &[&str]| set.iter().any(|marker| padded.contains(*marker));

    let has_modifier = mentions(&lexicon.research_modifiers);
    let has_evidence_domain = mentions(&lexicon.research_evidence_domains);
    let has_evaluation_domain = mentions(&lexicon.research_evaluation_domains);

    let researchable = has_modifier || (has_evidence_domain && has_evaluation_domain);
    if !researchable {
        return None;
    }
    valid_search_query(strip_implicit_research_prefix(normalized))
}

fn extract_enumeration_research_request(normalized: &str) -> Option<String> {
    let query = strip_enumeration_research_prefix(normalized)?;
    if !looks_like_enumeration_research_query(query) {
        return None;
    }
    valid_search_query(query)
}

/// Whether `value` begins with at least one of `prefixes`.
fn starts_with_any(value: &str, prefixes: &[&str]) -> bool {
    for prefix in prefixes.iter().copied() {
        if value.starts_with(prefix) {
            return true;
        }
    }
    false
}

/// Remove the first matching research question opener from the front of
/// `value`; returns `value` unchanged when none matches.
fn strip_implicit_research_prefix(value: &str) -> &str {
    markers()
        .research_question_prefixes
        .iter()
        .find_map(|prefix| value.strip_prefix(*prefix))
        .unwrap_or(value)
}

/// Remove the first matching enumeration opener from the front of `value`;
/// `None` when `value` starts with no such opener.
fn strip_enumeration_research_prefix(value: &str) -> Option<&str> {
    markers()
        .enumeration_prefixes
        .iter()
        .find_map(|prefix| value.strip_prefix(*prefix))
}

/// Whether `query` has at least three whitespace-separated words and contains
/// an enumeration constraint marker.
fn looks_like_enumeration_research_query(query: &str) -> bool {
    // A third word exists exactly when the word count is at least three.
    let has_three_words = query.split_whitespace().nth(2).is_some();
    has_three_words
        && contains_any_search_marker(query, &markers().enumeration_constraint_markers)
}

/// Whether `normalized` contains at least one of `markers`, using the
/// boundary-aware matching of [`contains_search_marker`].
fn contains_any_search_marker(normalized: &str, markers: &[&str]) -> bool {
    for marker in markers.iter().copied() {
        if contains_search_marker(normalized, marker) {
            return true;
        }
    }
    false
}

/// Whether `normalized` contains `marker`.
///
/// A marker written with a leading or trailing space is searched for in a
/// space-padded copy of the text, so it can also match at the very start or
/// end; any other marker is a plain substring search.
fn contains_search_marker(normalized: &str, marker: &str) -> bool {
    let space_delimited = marker.starts_with(' ') || marker.ends_with(' ');
    if !space_delimited {
        return normalized.contains(marker);
    }
    format!(" {normalized} ").contains(marker)
}

/// Clean `value` as a semantic search query (follow-up tail and noise
/// removed) and return it only if it passes validation.
fn valid_search_query(value: &str) -> Option<String> {
    valid_clean_search_query(clean_semantic_search_query(value))
}

/// Clean `value` as a news query — follow-up tail removed and punctuation
/// tidied, but no noise stripping — and return it only if it validates.
fn valid_news_search_query(value: &str) -> Option<String> {
    let without_followup = truncate_search_instruction_tail(value);
    let cleaned = clean_search_query(without_followup);
    valid_clean_search_query(cleaned)
}

/// Accept an already-cleaned query unless it is empty, is nothing but a bare
/// source word (compared case-insensitively), or is itself a URL candidate.
fn valid_clean_search_query(query: String) -> Option<String> {
    if query.is_empty() {
        return None;
    }
    let key = query.to_lowercase();
    let is_bare_source = markers().source_only.contains(&key);
    if is_bare_source || normalize_url_candidate(&query).is_some() {
        return None;
    }
    Some(query)
}

/// Drop a trailing follow-up instruction clause ("… and summarize who won",
/// "… . Compare their patents") from a query.
///
/// This is a universal boundary algorithm, not a list of memorised fragments: a
/// follow-up clause is one of the lexicon's [`ROLE_FOLLOWUP_INSTRUCTION_VERB`]
/// surfaces sitting immediately after a *boundary* — either sentence
/// punctuation ([`is_sentence_boundary`]) or a run of
/// [`ROLE_CLAUSE_CONTINUATION_MARKER`] words (and / then / and then, walked back
/// so the compound needs no stored surface). The query is cut at the start of
/// the earliest such boundary. A bare verb with no boundary before it is part of
/// the topic and left untouched.
///
/// Returns the (trimmed) part of `value` before the cut, or all of `value`
/// trimmed when no follow-up clause is found.
fn truncate_search_instruction_tail(value: &str) -> &str {
    let markers = markers();
    // ASCII-lowercase keeps byte offsets identical to `value` (it only folds
    // A–Z), so indices computed here slice `value` safely; the non-ASCII verbs
    // are already lowercase in the lexicon and unaffected by the fold.
    let lower = value.to_ascii_lowercase();
    let mut cut = value.len();
    for &verb in &markers.followup_verbs {
        // An empty surface matches at offset 0 of every remainder without ever
        // advancing `from`, so the scan below would never terminate. The verbs
        // are lexicon data, so guard against a bad entry rather than hang.
        if verb.is_empty() {
            continue;
        }
        let cjk = contains_cjk(verb);
        let mut from = 0;
        while let Some(relative) = lower[from..].find(verb) {
            let start = from + relative;
            let end = start + verb.len();
            // Resume after this occurrence whether or not it qualifies.
            from = end;
            // Space-delimited scripts require a whole-token match; CJK verbs have
            // no word boundaries and match as bare substrings.
            if !cjk && (!is_token_start(&lower, start) || !is_token_end(&lower, end)) {
                continue;
            }
            // Keep the earliest boundary across all verbs and occurrences.
            if let Some(boundary) = boundary_before(&lower, start, markers) {
                cut = cut.min(boundary);
            }
        }
    }
    value[..cut].trim()
}

/// Whether a token can start at byte offset `index` of `text`: true when the
/// character just before `index` is not alphanumeric, or when `index` is the
/// very start of the text.
fn is_token_start(text: &str, index: usize) -> bool {
    let preceding = &text[..index];
    !preceding.ends_with(char::is_alphanumeric)
}

/// Whether a token can end at byte offset `index` of `text`: true when the
/// character at `index` is not alphanumeric, or when `index` is the very end
/// of the text.
fn is_token_end(text: &str, index: usize) -> bool {
    let following = &text[index..];
    !following.starts_with(char::is_alphanumeric)
}

/// Locate the follow-up boundary that ends right before `verb_start`.
///
/// Returns the byte offset in `text` at which to cut — just after the text
/// preceding sentence punctuation's trailing whitespace, or at the start of a
/// run of clause-continuation markers — or `None` when the verb is not
/// preceded by a boundary (including when it opens the text).
fn boundary_before(text: &str, verb_start: usize, markers: &WebSearchMarkers) -> Option<usize> {
    let preceding = text[..verb_start].trim_end();
    if preceding.is_empty() {
        // Nothing comes before the verb, so there is no clause to split off.
        return None;
    }
    if preceding.ends_with(is_sentence_boundary) {
        return Some(preceding.len());
    }

    // Peel continuation markers ("and", "then", "and then") off the end one at
    // a time; whatever remains marks where the run — and so the cut — begins.
    let mut remaining = preceding;
    let mut stripped_any = false;
    loop {
        let candidate = remaining.trim_end();
        let Some(marker) = markers
            .continuation_markers
            .iter()
            .copied()
            .find(|marker| ends_with_token(candidate, marker))
        else {
            break;
        };
        remaining = &candidate[..candidate.len() - marker.len()];
        stripped_any = true;
    }

    if stripped_any {
        Some(remaining.trim_end().len())
    } else {
        None
    }
}

/// Whether `haystack` ends with `marker` as a whole token. A CJK marker only
/// has to be a suffix; any other marker must additionally be preceded by
/// whitespace or make up the entire haystack.
fn ends_with_token(haystack: &str, marker: &str) -> bool {
    match haystack.strip_suffix(marker) {
        None => false,
        Some(_) if contains_cjk(marker) => true,
        // An empty head means the haystack is exactly the marker.
        Some(head) => head.is_empty() || head.ends_with(char::is_whitespace),
    }
}

/// Reduce `value` to its core query: cut any follow-up instruction tail, tidy
/// punctuation and whitespace, then repeatedly strip leading and trailing
/// noise phrases until a full pass changes nothing.
fn clean_semantic_search_query(value: &str) -> String {
    let lexicon = markers();
    let mut current = clean_search_query(truncate_search_instruction_tail(value));
    loop {
        let snapshot = current.clone();
        for noise in lexicon.leading_noise.iter().copied() {
            if let Some(rest) = current.strip_prefix(noise) {
                current = clean_search_query(rest);
            }
        }
        for noise in lexicon.trailing_noise.iter().copied() {
            if let Some(rest) = current.strip_suffix(noise) {
                current = clean_search_query(rest);
            }
        }
        // Fixed point reached: stripping one layer can expose another, so only
        // stop once a whole pass leaves the query untouched.
        if current == snapshot {
            break current;
        }
    }
}