inputx_scoring/lib.rs
1//! `inputx-scoring` — probability-native candidate scoring primitive.
2//!
3//! # The schema
4//!
5//! Every candidate carries two log-space scalars and a typed match
6//! classification:
7//!
8//! ```text
9//! score(W | i) = log_prior(W) + log_likelihood(i | W)
10//! ```
11//!
12//! This is the Bayesian decomposition `P(W|i) ∝ P(i|W) · P(W)` rendered
13//! in log space — addition replaces multiplication, comparison stays
14//! monotone, and the two factors can be assigned independently by
15//! producers (dictionaries, n-grams, fuzzy matchers, …) without
16//! coordinating a single global formula.
17//!
18//! Both terms are `i32` Q4 fixed-point. One log unit = [`Q4`] (= 16)
19//! integer steps. The Q4 choice trades resolution for headroom: scores
20//! fit comfortably in `i32` even for very rare or very common words
21//! while staying precise enough that ranking-relevant gaps (~0.0625 in
22//! log space) survive quantization.
23//!
24//! # Public surface
25//!
26//! - [`Source`] — which engine produced the candidate
27//! - [`MatchType`] — how the typed input maps to the candidate
28//! - [`Candidate`] — a (word, log_prior, log_likelihood, match_type, source) tuple
29//! - [`score`] — `log_prior + log_likelihood`, the sort key
30//! - [`Q4`] — log-to-integer scale (= 16)
31//!
32//! Scoring policy lives in the consumer (IME engine cement); this
33//! crate provides only the schema and the additive sort key.
34
35#![cfg_attr(not(feature = "std"), no_std)]
36
37/// Fixed-point scale for log-space scalars. `Q4 = 16` means every
38/// integer step is 1/16 of a log unit (≈ 0.0625). At this resolution,
39/// `i32` covers a dynamic range of ~ ±67 million log units — far more
40/// than any realistic candidate score needs.
41pub const Q4: i32 = 16;
42
43/// Engine that produced a candidate. Numeric representation is stable
44/// across versions so it can cross the FFI boundary as `u8` without
45/// translation. Mirrors `inputx_core::composite::merge::Source`.
46#[repr(u8)]
47#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
48pub enum Source {
49 Wubi = 0,
50 Pinyin = 1,
51 Japanese = 2,
52}
53
54/// Classification of how the typed input `i` maps to a candidate `W`.
55/// Carried alongside the (log_prior, log_likelihood) pair so the
56/// downstream merger / probe / UI can render context without
57/// re-deriving the match shape.
58///
59/// The numeric payloads (`u16` milli scales for proximity / cost,
60/// `u8` link count for composed) are intentionally narrow — they're
61/// classification metadata, not the score itself. Scoring lives in
62/// `log_likelihood`.
63#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
64pub enum MatchType {
65 /// Typed input matches the candidate's full code exactly.
66 Exact,
67 /// Typed input is a prefix of the candidate's full code.
68 /// Payload: `proximity_milli` = `typed_len * 1000 / full_len`,
69 /// `1000` meaning "fully typed" (which collapses to `Exact` for
70 /// producers that wish to distinguish).
71 Prefix(u16),
72 /// Typed input differs from the candidate by a phonetic edit
73 /// distance. Payload: `edit_cost_milli` ∈ `[0, 1000]`, scaled
74 /// against `inputx_phonetic_edit`'s max-cost convention.
75 Fuzzy(u16),
76 /// Typed input was decomposed into a chain of dict segments by a
77 /// Viterbi / DP composer. Payload: `bigram_links` = `chain_len − 1`,
78 /// the number of segment-to-segment bigram joins.
79 Composed { bigram_links: u8 },
80}
81
82/// One scored candidate. Word lifetime is `'a` so consumers can pass
83/// borrowed references through the merge pipeline; the merger clones
84/// only the survivors.
85///
86/// Equality intentionally compares `word + source + match_type`, not
87/// the score — score is a sort key, not part of identity. (Two
88/// producers that emit the same word with the same `match_type` and
89/// `source` are duplicates regardless of how they scored it.)
90#[derive(Copy, Clone, Debug)]
91pub struct Candidate<'a> {
92 pub word: &'a str,
93 /// `Q4 · log(prior probability)`. Producers derive from corpus
94 /// frequency (`Q4 · log(1 + freq)` is the canonical baseline) plus
95 /// any user-level boost (L0 pin, recency, etc.). Non-negative by
96 /// convention but the schema accepts negative for explicit
97 /// down-weights.
98 pub log_prior: i32,
99 /// `Q4 · log(match likelihood)`. Producers derive from the match
100 /// shape: per-type base (e.g. `LIKELIHOOD_JP_JUKUGO_BASE`),
101 /// proximity decay for prefix matches (`Q4 · K · log(proximity)`),
102 /// edit-cost penalty for fuzzy matches, demote/promote factors
103 /// (TC demote, full-match promote, …) folded in additively in log
104 /// space.
105 pub log_likelihood: i32,
106 pub match_type: MatchType,
107 pub source: Source,
108}
109
110impl<'a> PartialEq for Candidate<'a> {
111 fn eq(&self, other: &Self) -> bool {
112 self.word == other.word
113 && self.source == other.source
114 && self.match_type == other.match_type
115 }
116}
117
118impl<'a> Eq for Candidate<'a> {}
119
120/// Bayesian sort key: `log_prior + log_likelihood`. The single source
121/// of truth for ranking. Higher = better.
122///
123/// Producers that wish to expose `score(c)` over the FFI boundary as a
124/// `f64` may convert via `(score(c) as f64) / (Q4 as f64)` to recover
125/// nats / bits in natural log space.
126#[inline]
127pub fn score(c: &Candidate<'_>) -> i32 {
128 c.log_prior + c.log_likelihood
129}
130
131// ─── Derivation helpers (std-only) ─────────────────────────────────────
132//
133// The schema above is `no_std` clean — consumers in `no_std`
134// environments may fill `log_prior` / `log_likelihood` from their own
135// fixed-point log tables or precomputed values. The helpers below offer
136// a default Q4 conversion for the common cases, gated behind `std` so
137// they can use `f64::ln`.
138
139/// Q4 log of natural priors derived from a corpus frequency. Returns
140/// `Q4 · ln(1 + freq)` rounded to the nearest integer. Zero-freq
141/// candidates map to `0`; freq-1 to ~`11`; freq-1000 to ~`110`.
142///
143/// This is the canonical baseline for the `log_prior` term. Producers
144/// add user-level boosts (L0 pin, recency) on top by passing a larger
145/// `freq` (e.g. `freq * 1000` for a pinned word) or by adding directly
146/// to the returned value.
147#[cfg(feature = "std")]
148#[inline]
149pub fn log_prior_from_freq(freq: u64) -> i32 {
150 let ln = ((freq as f64) + 1.0).ln();
151 (ln * (Q4 as f64)).round() as i32
152}
153
154/// Q4 log-likelihood derived from a match-type classification.
155///
156/// `base_log_q4` is the producer-chosen per-engine / per-kind likelihood
157/// floor (already in Q4 log-space) — e.g. `Q4·ln(LIKELIHOOD_JP_JUKUGO_BASE)`.
158/// The match-type decay is applied on top:
159///
160/// - `Exact` → `base_log_q4`
161/// - `Prefix(prox_milli)` → `base_log_q4 + K · Q4·ln(prox/1000)` with `K=3`
162/// - `Fuzzy(cost_milli)` → `base_log_q4 + Q4·ln(1 − cost/1000)` (cost-capped at 999 to avoid `-inf`)
163/// - `Composed { links }` → `base_log_q4 + (links − 1) · Q4·ln(0.7)` for `links ≥ 1`, else `base_log_q4`
164///
165/// All decays are non-positive (multiplicative factors ≤ 1.0 in linear
166/// space), so the returned value is always `≤ base_log_q4`. This is the
167/// log-space equivalent of the v1.3 multiplicative chain
168/// `base · proximity^K · (1 − cost) · 0.7^(links − 1)`.
169#[cfg(feature = "std")]
170pub fn derive_log_likelihood(base_log_q4: i32, mt: MatchType) -> i32 {
171 /// `LIKELIHOOD_PREDICT_PROXIMITY_K` from the legacy scoring module —
172 /// pinyin / wubi / JP prefix-prediction all share `K = 3`.
173 const K: f64 = 3.0;
174 /// Per-extra-bigram-link composition decay (mirrors the linear-space
175 /// `0.7` factor pinyin_adapter applies per additional Viterbi join).
176 const LN_COMPOSED_PER_LINK: f64 = -0.356_674_943_938_732_4; // f64::ln(0.7)
177 let q4 = Q4 as f64;
178 match mt {
179 MatchType::Exact => base_log_q4,
180 MatchType::Prefix(prox_milli) => {
181 // Producers that emit `Prefix(1000)` mean "fully typed";
182 // the decay is 0 (ln 1 = 0), recovering the Exact case.
183 let prox = (prox_milli.max(1) as f64) / 1000.0;
184 let decay_q4 = (K * prox.ln() * q4).round() as i32;
185 base_log_q4 + decay_q4
186 }
187 MatchType::Fuzzy(cost_milli) => {
188 // Cap at 999 so the implied `1 − cost/1000` stays > 0 and
189 // `ln` is finite; a cost of 1000 would mean "no match",
190 // which producers should classify as a drop, not a fuzzy
191 // hit.
192 let cost = cost_milli.min(999) as f64 / 1000.0;
193 let decay_q4 = ((1.0 - cost).ln() * q4).round() as i32;
194 base_log_q4 + decay_q4
195 }
196 MatchType::Composed { bigram_links } => {
197 let extra_links = bigram_links.saturating_sub(1) as f64;
198 let decay_q4 = (extra_links * LN_COMPOSED_PER_LINK * q4).round() as i32;
199 base_log_q4 + decay_q4
200 }
201 }
202}
203
204#[cfg(test)]
205mod tests {
206 use super::*;
207
208 #[test]
209 fn q4_anchored_to_16() {
210 assert_eq!(Q4, 16);
211 }
212
213 #[test]
214 fn source_repr_u8_stable() {
215 assert_eq!(Source::Wubi as u8, 0);
216 assert_eq!(Source::Pinyin as u8, 1);
217 assert_eq!(Source::Japanese as u8, 2);
218 }
219
220 #[test]
221 fn bayes_additive_in_log_space() {
222 let cases: &[(i32, i32)] = &[
223 (0, 0),
224 (10 * Q4, 5 * Q4),
225 (100 * Q4, -30 * Q4),
226 (-50 * Q4, 100 * Q4),
227 (i32::MAX / 4, i32::MIN / 4),
228 ];
229 for &(lp, ll) in cases {
230 let c = Candidate {
231 word: "x",
232 log_prior: lp,
233 log_likelihood: ll,
234 match_type: MatchType::Exact,
235 source: Source::Pinyin,
236 };
237 assert_eq!(score(&c), lp.saturating_add(ll));
238 }
239 }
240
241 #[test]
242 fn match_type_round_trip_variants() {
243 let _exact = MatchType::Exact;
244 let prefix = MatchType::Prefix(875);
245 let fuzzy = MatchType::Fuzzy(300);
246 let composed = MatchType::Composed { bigram_links: 2 };
247 assert_eq!(prefix, MatchType::Prefix(875));
248 assert_eq!(fuzzy, MatchType::Fuzzy(300));
249 assert_eq!(composed, MatchType::Composed { bigram_links: 2 });
250 assert_ne!(prefix, MatchType::Prefix(874));
251 assert_ne!(composed, MatchType::Composed { bigram_links: 1 });
252 }
253
254 #[test]
255 fn candidate_equality_ignores_score() {
256 let a = Candidate {
257 word: "继续",
258 log_prior: 100 * Q4,
259 log_likelihood: 50 * Q4,
260 match_type: MatchType::Exact,
261 source: Source::Pinyin,
262 };
263 let b = Candidate {
264 word: "继续",
265 log_prior: 999 * Q4,
266 log_likelihood: -10 * Q4,
267 match_type: MatchType::Exact,
268 source: Source::Pinyin,
269 };
270 let c_word_diff = Candidate { word: "继续。", ..a };
271 let c_source_diff = Candidate { source: Source::Wubi, ..a };
272 let c_mt_diff = Candidate { match_type: MatchType::Prefix(500), ..a };
273 assert_eq!(a, b, "score should not affect identity");
274 assert_ne!(a, c_word_diff);
275 assert_ne!(a, c_source_diff);
276 assert_ne!(a, c_mt_diff);
277 }
278
279 #[cfg(feature = "std")]
280 #[test]
281 fn log_prior_from_freq_baseline_values() {
282 assert_eq!(log_prior_from_freq(0), 0);
283 let f1 = log_prior_from_freq(1);
284 assert!((10..=12).contains(&f1), "ln(2)·16 ≈ 11; got {f1}");
285 let f1000 = log_prior_from_freq(1000);
286 assert!((110..=112).contains(&f1000), "ln(1001)·16 ≈ 110; got {f1000}");
287 let f50000 = log_prior_from_freq(50_000);
288 assert!(f50000 > log_prior_from_freq(1000), "monotone in freq");
289 }
290
291 #[cfg(feature = "std")]
292 #[test]
293 fn derive_log_likelihood_exact_is_base() {
294 for base in [0, Q4, -10 * Q4, 100 * Q4] {
295 assert_eq!(derive_log_likelihood(base, MatchType::Exact), base);
296 }
297 }
298
299 #[cfg(feature = "std")]
300 #[test]
301 fn derive_log_likelihood_prefix_monotonic() {
302 let base = 100 * Q4;
303 let high = derive_log_likelihood(base, MatchType::Prefix(875));
304 let mid = derive_log_likelihood(base, MatchType::Prefix(500));
305 let low = derive_log_likelihood(base, MatchType::Prefix(100));
306 let full = derive_log_likelihood(base, MatchType::Prefix(1000));
307 assert_eq!(full, base, "prox=1000 collapses to base (ln 1 = 0)");
308 assert!(high < base && mid < high && low < mid,
309 "prefix decay must be monotone-decreasing in proximity; got full={full} high={high} mid={mid} low={low}");
310 }
311
312 #[cfg(feature = "std")]
313 #[test]
314 fn derive_log_likelihood_fuzzy_monotonic_in_cost() {
315 let base = 100 * Q4;
316 let cheap = derive_log_likelihood(base, MatchType::Fuzzy(100));
317 let expensive = derive_log_likelihood(base, MatchType::Fuzzy(700));
318 let zero = derive_log_likelihood(base, MatchType::Fuzzy(0));
319 assert_eq!(zero, base, "cost=0 collapses to base (ln 1 = 0)");
320 assert!(expensive < cheap && cheap < base,
321 "fuzzy decay must be monotone-decreasing in cost; got zero={zero} cheap={cheap} expensive={expensive}");
322 }
323
324 #[cfg(feature = "std")]
325 #[test]
326 fn derive_log_likelihood_composed_per_link_decay() {
327 let base = 200 * Q4;
328 let one = derive_log_likelihood(base, MatchType::Composed { bigram_links: 1 });
329 let two = derive_log_likelihood(base, MatchType::Composed { bigram_links: 2 });
330 let three = derive_log_likelihood(base, MatchType::Composed { bigram_links: 3 });
331 let zero = derive_log_likelihood(base, MatchType::Composed { bigram_links: 0 });
332 assert_eq!(zero, base, "0 links — no chain — no decay");
333 assert_eq!(one, base, "1 link is the first segment; (links − 1) = 0, no decay");
334 assert!(two < one && three < two,
335 "composed decay must drop per extra link; got one={one} two={two} three={three}");
336 }
337
338 #[cfg(feature = "std")]
339 #[test]
340 fn derive_log_likelihood_bayes_additivity_round_trip() {
341 // The whole point: caller can compute (log_prior, log_likelihood)
342 // independently and sum to a stable rank. Spot-check that
343 // `score()` of a candidate built from `log_prior_from_freq` +
344 // `derive_log_likelihood` recovers a sensible sort order across
345 // a synthetic 4-tuple covering Exact / Prefix / Fuzzy / Composed.
346 let base = 160 * Q4;
347 let make = |freq: u64, mt: MatchType| Candidate {
348 word: "x",
349 log_prior: log_prior_from_freq(freq),
350 log_likelihood: derive_log_likelihood(base, mt),
351 match_type: mt,
352 source: Source::Pinyin,
353 };
354 let exact_common = make(50_000, MatchType::Exact);
355 let exact_rare = make(10, MatchType::Exact);
356 let prefix_common = make(50_000, MatchType::Prefix(875));
357 let fuzzy_common = make(50_000, MatchType::Fuzzy(300));
358 // The common exact wins; common prefix beats rare exact (because
359 // prior contribution dominates); common fuzzy sits between common
360 // prefix and rare exact (rough chain, ranking-only check).
361 assert!(score(&exact_common) > score(&prefix_common));
362 assert!(score(&prefix_common) > score(&exact_rare),
363 "common prefix prior wins rare exact; got pref={} rare={}",
364 score(&prefix_common), score(&exact_rare));
365 assert!(score(&exact_common) > score(&fuzzy_common));
366 }
367
368 #[test]
369 fn score_orders_candidates_by_log_sum_desc() {
370 let mk = |lp: i32, ll: i32| Candidate {
371 word: "w",
372 log_prior: lp,
373 log_likelihood: ll,
374 match_type: MatchType::Exact,
375 source: Source::Pinyin,
376 };
377 let mut cands = [mk(10, 20), mk(50, -10), mk(0, 0), mk(100, 100)];
378 cands.sort_by(|a, b| score(b).cmp(&score(a)));
379 assert_eq!(score(&cands[0]), 200);
380 assert_eq!(score(&cands[1]), 40);
381 assert_eq!(score(&cands[2]), 30);
382 assert_eq!(score(&cands[3]), 0);
383 }
384}