1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
//! Unit-incompatibility handler extracted from `solver_handlers` to keep each
//! source file under the 1000-line cap enforced by `scripts/check-file-size.rs`.
use crate::engine::SymbolicAnswer;
use crate::event_log::EventLog;
use crate::seed::{lexicon, Lexicon, Meaning, ROLE_MEASUREMENT_UNIT, ROLE_PHYSICAL_DIMENSION};
use crate::solver_handlers::finalize_simple;
/// Answer prompts that ask for a conversion between units belonging to
/// different physical dimensions.
///
/// A meter is a length and a kilobyte is an amount of data storage; no
/// conversion factor relates the two. Instead of letting such a prompt fall
/// through to `intent:unknown`, the handler states the mismatch explicitly.
///
/// Nothing about specific units is hardcoded in this function. The units,
/// the dimensions they measure, and their surface words in every supported
/// language come from the meaning lexicon
/// (`data/seed/meanings-units.lino`), in which each unit meaning is
/// `defined_by` the dimension it measures (issue #386). The code only knows
/// the *concepts* "measurement unit" and "physical dimension".
///
/// # Parameters
/// - `prompt`: the prompt text, forwarded unchanged to `finalize_simple`.
/// - `normalized`: the normalized prompt text that is scanned for unit words.
/// - `log`: event log; receives one `unit_incompatibility` entry on a hit and
///   is then handed on to `finalize_simple`.
///
/// # Returns
/// `None` when `normalized` does not mention units from two distinct
/// dimensions; otherwise the answer built by `finalize_simple`.
pub fn try_incompatible_units(
    prompt: &str,
    normalized: &str,
    log: &mut EventLog,
) -> Option<SymbolicAnswer> {
    // The same label is used for the log entry and for `finalize_simple`.
    const KIND: &str = "unit_incompatibility";

    let pair = detect_incompatible_unit_pair(normalized)?;
    let (unit_a, dim_a, unit_b, dim_b) = pair;

    // Record which unit/dimension pairs triggered the refusal.
    let witness = format!("{unit_a}:{dim_a} vs {unit_b}:{dim_b}");
    log.append(KIND, witness);

    let body = format!(
        "{unit_a} measures {dim_a}; {unit_b} measures {dim_b}. \
         These are different physical dimensions and cannot be converted into each other. \
         The incompatibility is recorded as a `unit_incompatibility` link in the network."
    );
    // NOTE(review): the trailing `1.0` is presumably a confidence score —
    // confirm against `finalize_simple`.
    let answer = finalize_simple(prompt, log, KIND, "response:unit_incompatibility", &body, 1.0);
    Some(answer)
}
/// Look up the English name of the physical dimension that `unit` measures.
///
/// The answer comes from the unit meaning's `defined_by` links: each linked
/// slug is resolved through the lexicon, and the first resolved meaning that
/// carries the [`ROLE_PHYSICAL_DIMENSION`] role is taken as the dimension
/// (for example `meter` is `defined_by "length"` and `defined_by "unit"`, and
/// only `length` carries the dimension role). The returned label is that
/// dimension's English (`"en"`) word, so the text lives in the data rather
/// than in this code.
///
/// Returns `None` when no `defined_by` target resolves to a dimension, or
/// when the first such dimension has no English word.
fn dimension_label<'a>(lex: &'a Lexicon, unit: &'a Meaning) -> Option<&'a str> {
    for slug in unit.defined_by.iter() {
        // Slugs that do not resolve to a known meaning are skipped.
        let Some(target) = lex.meaning(slug) else {
            continue;
        };
        if target.has_role(ROLE_PHYSICAL_DIMENSION) {
            // Only the first dimension-role target is consulted; if it has
            // no English word the result is `None`.
            return target.word_in("en");
        }
    }
    None
}
/// Reports whether `unit` occurs in `normalized` as a standalone token rather
/// than as a fragment of a longer word.
///
/// Issue #334: a plain `normalized.contains(unit)` matched "mb" inside
/// "nu**mb**er" and "gram" inside "pro**gram**", so the coding prompt "Write a
/// program that computes the 10th Fibonacci number" was misread as a
/// length/mass conversion and answered with a unit-incompatibility refusal. An
/// occurrence only counts when both neighbouring characters are non-alphabetic
/// (string edge, whitespace, punctuation, or a digit such as the "500" in
/// "500mb"), so genuine units still match while embedded fragments do not.
///
/// Every occurrence is examined, including ones that overlap an earlier
/// rejected occurrence. An empty `unit` never matches.
fn contains_unit_word(normalized: &str, unit: &str) -> bool {
    // An empty unit cannot be a meaningful token. It must also be rejected up
    // front because `find("")` always succeeds with a zero-length span: a
    // failed boundary check would then never advance the scan below.
    let Some(first_char) = unit.chars().next() else {
        return false;
    };

    // Inflected alphabetic scripts (Russian "килобайт" -> "килобайте", Hindi
    // "किलोबाइट") attach suffixes directly to the unit, so a strict word boundary
    // would reject legitimate forms — those keep the permissive substring match.
    //
    // CJK is different: it has no inter-word spaces, so its ideographs glue into
    // unrelated compounds. A bare-substring match read the day unit "天" inside
    // "天气" (weather) and the gram unit "克" inside the transliteration "弗拉克斯",
    // turning a units-free prompt into a phantom time-vs-mass incompatibility
    // (issue #386). Ideographs are alphabetic to `char::is_alphabetic`, so the
    // same boundary rule used for ASCII rejects a unit glued inside a larger
    // compound while still matching one next to a digit ("7天", "5千克") or at a
    // token edge. The boundary check therefore applies to ASCII units and to
    // units that `contains_cjk` accepts; everything else takes the permissive
    // path.
    if !unit.is_ascii() && !crate::coding::contains_cjk(unit) {
        return normalized.contains(unit);
    }

    let boundary_ok = |ch: Option<char>| ch.is_none_or(|c| !c.is_alphabetic());
    // Byte width of the unit's first character: the smallest step that keeps
    // `search_from` on a char boundary after a rejected occurrence.
    let step = first_char.len_utf8();

    let mut search_from = 0;
    while let Some(offset) = normalized[search_from..].find(unit) {
        let start = search_from + offset;
        let end = start + unit.len();
        let before = normalized[..start].chars().next_back();
        let after = normalized[end..].chars().next();
        if boundary_ok(before) && boundary_ok(after) {
            return true;
        }
        // Resume just past the first character of this occurrence, not past
        // the whole occurrence, so an overlapping occurrence (possible when
        // the unit contains non-alphabetic characters) is still examined.
        // `start + step` is a char boundary because the unit's first
        // character occupies exactly `start..start + step`; a bare
        // `start + 1` could land inside a multi-byte character and panic.
        search_from = start + step;
    }
    false
}
/// Return the first matched unit token for each of two distinct physical
/// dimensions, together with their dimension labels, or `None` if `normalized`
/// does not mention units from at least two different dimensions.
///
/// Walks every meaning that plays the [`ROLE_MEASUREMENT_UNIT`] role in the
/// order `meanings_with_role` yields them, keeping the first matched unit per
/// dimension. Two units that measure different dimensions cannot be converted
/// into one another — that is the incompatibility the caller reports. The
/// walk stops as soon as a second dimension has been witnessed, because only
/// the first two are ever reported.
///
/// The result tuple is `(unit_a, dim_a, unit_b, dim_b)`: each `unit_*` is the
/// surface word that matched in `normalized`, and each `dim_*` is the label
/// produced by `dimension_label`.
// The flat 4-tuple of string slices is the shape the caller destructures;
// a dedicated struct would add ceremony for a single private call site.
#[allow(clippy::type_complexity)]
fn detect_incompatible_unit_pair(
    normalized: &str,
) -> Option<(&'static str, &'static str, &'static str, &'static str)> {
    let lex = lexicon();
    // (matched surface word, dimension label) of the first dimension seen.
    let mut first: Option<(&'static str, &'static str)> = None;

    for unit in lex.meanings_with_role(ROLE_MEASUREMENT_UNIT) {
        // Units whose dimension cannot be resolved are ignored.
        let Some(dim) = dimension_label(lex, unit) else {
            continue;
        };
        if first.is_some_and(|(_, seen)| seen == dim) {
            continue; // already have a unit witnessing this dimension
        }

        // First surface word of this unit that appears as a standalone token.
        let mut matched: Option<&'static str> = None;
        for word in unit.words() {
            if contains_unit_word(normalized, word) {
                matched = Some(word);
                break;
            }
        }
        let Some(word) = matched else {
            continue;
        };

        match first {
            None => first = Some((word, dim)),
            // A second, different dimension: the pair is complete, so the
            // rest of the lexicon does not need to be scanned.
            Some((unit_a, dim_a)) => return Some((unit_a, dim_a, word, dim)),
        }
    }
    None
}