Skip to main content

inputx_scoring/
lib.rs

1//! `inputx-scoring` — probability-native candidate scoring primitive.
2//!
3//! # The schema
4//!
5//! Every candidate carries two log-space scalars and a typed match
6//! classification:
7//!
8//! ```text
9//!     score(W | i) = log_prior(W) + log_likelihood(i | W)
10//! ```
11//!
12//! This is the Bayesian decomposition `P(W|i) ∝ P(i|W) · P(W)` rendered
13//! in log space — addition replaces multiplication, comparison stays
14//! monotone, and the two factors can be assigned independently by
15//! producers (dictionaries, n-grams, fuzzy matchers, …) without
16//! coordinating a single global formula.
17//!
18//! Both terms are `i32` Q4 fixed-point. One log unit = [`Q4`] (= 16)
19//! integer steps. The Q4 choice trades resolution for headroom: scores
20//! fit comfortably in `i32` even for very rare or very common words
21//! while staying precise enough that ranking-relevant gaps (~0.0625 in
22//! log space) survive quantization.
23//!
24//! # Public surface
25//!
26//! - [`Source`]  — which engine produced the candidate
27//! - [`MatchType`] — how the typed input maps to the candidate
28//! - [`Candidate`] — a (word, log_prior, log_likelihood, match_type, source) tuple
29//! - [`score`] — `log_prior + log_likelihood`, the sort key
30//! - [`Q4`] — log-to-integer scale (= 16)
31//!
32//! Scoring policy lives in the consumer (IME engine cement); this
33//! crate provides only the schema and the additive sort key.
34
35#![cfg_attr(not(feature = "std"), no_std)]
36
37/// Fixed-point scale for log-space scalars. `Q4 = 16` means every
38/// integer step is 1/16 of a log unit (≈ 0.0625). At this resolution,
39/// `i32` covers a dynamic range of ~ ±67 million log units — far more
40/// than any realistic candidate score needs.
41pub const Q4: i32 = 16;
42
43/// Engine that produced a candidate. Numeric representation is stable
44/// across versions so it can cross the FFI boundary as `u8` without
45/// translation. Mirrors `inputx_core::composite::merge::Source`.
46#[repr(u8)]
47#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
48pub enum Source {
49    Wubi = 0,
50    Pinyin = 1,
51    Japanese = 2,
52}
53
54/// Classification of how the typed input `i` maps to a candidate `W`.
55/// Carried alongside the (log_prior, log_likelihood) pair so the
56/// downstream merger / probe / UI can render context without
57/// re-deriving the match shape.
58///
59/// The numeric payloads (`u16` milli scales for proximity / cost,
60/// `u8` link count for composed) are intentionally narrow — they're
61/// classification metadata, not the score itself. Scoring lives in
62/// `log_likelihood`.
63#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
64pub enum MatchType {
65    /// Typed input matches the candidate's full code exactly.
66    Exact,
67    /// Typed input is a prefix of the candidate's full code.
68    /// Payload: `proximity_milli` = `typed_len * 1000 / full_len`,
69    /// `1000` meaning "fully typed" (which collapses to `Exact` for
70    /// producers that wish to distinguish).
71    Prefix(u16),
72    /// Typed input differs from the candidate by a phonetic edit
73    /// distance. Payload: `edit_cost_milli` ∈ `[0, 1000]`, scaled
74    /// against `inputx_phonetic_edit`'s max-cost convention.
75    Fuzzy(u16),
76    /// Typed input was decomposed into a chain of dict segments by a
77    /// Viterbi / DP composer. Payload: `bigram_links` = `chain_len − 1`,
78    /// the number of segment-to-segment bigram joins.
79    Composed { bigram_links: u8 },
80}
81
82/// One scored candidate. Word lifetime is `'a` so consumers can pass
83/// borrowed references through the merge pipeline; the merger clones
84/// only the survivors.
85///
86/// Equality intentionally compares `word + source + match_type`, not
87/// the score — score is a sort key, not part of identity. (Two
88/// producers that emit the same word with the same `match_type` and
89/// `source` are duplicates regardless of how they scored it.)
90#[derive(Copy, Clone, Debug)]
91pub struct Candidate<'a> {
92    pub word: &'a str,
93    /// `Q4 · log(prior probability)`. Producers derive from corpus
94    /// frequency (`Q4 · log(1 + freq)` is the canonical baseline) plus
95    /// any user-level boost (L0 pin, recency, etc.). Non-negative by
96    /// convention but the schema accepts negative for explicit
97    /// down-weights.
98    pub log_prior: i32,
99    /// `Q4 · log(match likelihood)`. Producers derive from the match
100    /// shape: per-type base (e.g. `LIKELIHOOD_JP_JUKUGO_BASE`),
101    /// proximity decay for prefix matches (`Q4 · K · log(proximity)`),
102    /// edit-cost penalty for fuzzy matches, demote/promote factors
103    /// (TC demote, full-match promote, …) folded in additively in log
104    /// space.
105    pub log_likelihood: i32,
106    pub match_type: MatchType,
107    pub source: Source,
108}
109
110impl<'a> PartialEq for Candidate<'a> {
111    fn eq(&self, other: &Self) -> bool {
112        self.word == other.word
113            && self.source == other.source
114            && self.match_type == other.match_type
115    }
116}
117
118impl<'a> Eq for Candidate<'a> {}
119
120/// Bayesian sort key: `log_prior + log_likelihood`. The single source
121/// of truth for ranking. Higher = better.
122///
123/// Producers that wish to expose `score(c)` over the FFI boundary as a
124/// `f64` may convert via `(score(c) as f64) / (Q4 as f64)` to recover
125/// nats / bits in natural log space.
126#[inline]
127pub fn score(c: &Candidate<'_>) -> i32 {
128    c.log_prior + c.log_likelihood
129}
130
131// ─── Derivation helpers (std-only) ─────────────────────────────────────
132//
133// The schema above is `no_std` clean — consumers in `no_std`
134// environments may fill `log_prior` / `log_likelihood` from their own
135// fixed-point log tables or precomputed values. The helpers below offer
136// a default Q4 conversion for the common cases, gated behind `std` so
137// they can use `f64::ln`.
138
139/// Q4 log of natural priors derived from a corpus frequency. Returns
140/// `Q4 · ln(1 + freq)` rounded to the nearest integer. Zero-freq
141/// candidates map to `0`; freq-1 to ~`11`; freq-1000 to ~`110`.
142///
143/// This is the canonical baseline for the `log_prior` term. Producers
144/// add user-level boosts (L0 pin, recency) on top by passing a larger
145/// `freq` (e.g. `freq * 1000` for a pinned word) or by adding directly
146/// to the returned value.
147#[cfg(feature = "std")]
148#[inline]
149pub fn log_prior_from_freq(freq: u64) -> i32 {
150    let ln = ((freq as f64) + 1.0).ln();
151    (ln * (Q4 as f64)).round() as i32
152}
153
154/// Q4 log-likelihood derived from a match-type classification.
155///
156/// `base_log_q4` is the producer-chosen per-engine / per-kind likelihood
157/// floor (already in Q4 log-space) — e.g. `Q4·ln(LIKELIHOOD_JP_JUKUGO_BASE)`.
158/// The match-type decay is applied on top:
159///
160/// - `Exact`              → `base_log_q4`
161/// - `Prefix(prox_milli)` → `base_log_q4 + K · Q4·ln(prox/1000)` with `K=3`
162/// - `Fuzzy(cost_milli)`  → `base_log_q4 + Q4·ln(1 − cost/1000)` (cost-capped at 999 to avoid `-inf`)
163/// - `Composed { links }` → `base_log_q4 + (links − 1) · Q4·ln(0.7)` for `links ≥ 1`, else `base_log_q4`
164///
165/// All decays are non-positive (multiplicative factors ≤ 1.0 in linear
166/// space), so the returned value is always `≤ base_log_q4`. This is the
167/// log-space equivalent of the v1.3 multiplicative chain
168/// `base · proximity^K · (1 − cost) · 0.7^(links − 1)`.
169#[cfg(feature = "std")]
170pub fn derive_log_likelihood(base_log_q4: i32, mt: MatchType) -> i32 {
171    /// `LIKELIHOOD_PREDICT_PROXIMITY_K` from the legacy scoring module —
172    /// pinyin / wubi / JP prefix-prediction all share `K = 3`.
173    const K: f64 = 3.0;
174    /// Per-extra-bigram-link composition decay (mirrors the linear-space
175    /// `0.7` factor pinyin_adapter applies per additional Viterbi join).
176    const LN_COMPOSED_PER_LINK: f64 = -0.356_674_943_938_732_4; // f64::ln(0.7)
177    let q4 = Q4 as f64;
178    match mt {
179        MatchType::Exact => base_log_q4,
180        MatchType::Prefix(prox_milli) => {
181            // Producers that emit `Prefix(1000)` mean "fully typed";
182            // the decay is 0 (ln 1 = 0), recovering the Exact case.
183            let prox = (prox_milli.max(1) as f64) / 1000.0;
184            let decay_q4 = (K * prox.ln() * q4).round() as i32;
185            base_log_q4 + decay_q4
186        }
187        MatchType::Fuzzy(cost_milli) => {
188            // Cap at 999 so the implied `1 − cost/1000` stays > 0 and
189            // `ln` is finite; a cost of 1000 would mean "no match",
190            // which producers should classify as a drop, not a fuzzy
191            // hit.
192            let cost = cost_milli.min(999) as f64 / 1000.0;
193            let decay_q4 = ((1.0 - cost).ln() * q4).round() as i32;
194            base_log_q4 + decay_q4
195        }
196        MatchType::Composed { bigram_links } => {
197            let extra_links = bigram_links.saturating_sub(1) as f64;
198            let decay_q4 = (extra_links * LN_COMPOSED_PER_LINK * q4).round() as i32;
199            base_log_q4 + decay_q4
200        }
201    }
202}
203
204#[cfg(test)]
205mod tests {
206    use super::*;
207
208    #[test]
209    fn q4_anchored_to_16() {
210        assert_eq!(Q4, 16);
211    }
212
213    #[test]
214    fn source_repr_u8_stable() {
215        assert_eq!(Source::Wubi as u8, 0);
216        assert_eq!(Source::Pinyin as u8, 1);
217        assert_eq!(Source::Japanese as u8, 2);
218    }
219
220    #[test]
221    fn bayes_additive_in_log_space() {
222        let cases: &[(i32, i32)] = &[
223            (0, 0),
224            (10 * Q4, 5 * Q4),
225            (100 * Q4, -30 * Q4),
226            (-50 * Q4, 100 * Q4),
227            (i32::MAX / 4, i32::MIN / 4),
228        ];
229        for &(lp, ll) in cases {
230            let c = Candidate {
231                word: "x",
232                log_prior: lp,
233                log_likelihood: ll,
234                match_type: MatchType::Exact,
235                source: Source::Pinyin,
236            };
237            assert_eq!(score(&c), lp.saturating_add(ll));
238        }
239    }
240
241    #[test]
242    fn match_type_round_trip_variants() {
243        let _exact = MatchType::Exact;
244        let prefix = MatchType::Prefix(875);
245        let fuzzy = MatchType::Fuzzy(300);
246        let composed = MatchType::Composed { bigram_links: 2 };
247        assert_eq!(prefix, MatchType::Prefix(875));
248        assert_eq!(fuzzy, MatchType::Fuzzy(300));
249        assert_eq!(composed, MatchType::Composed { bigram_links: 2 });
250        assert_ne!(prefix, MatchType::Prefix(874));
251        assert_ne!(composed, MatchType::Composed { bigram_links: 1 });
252    }
253
254    #[test]
255    fn candidate_equality_ignores_score() {
256        let a = Candidate {
257            word: "继续",
258            log_prior: 100 * Q4,
259            log_likelihood: 50 * Q4,
260            match_type: MatchType::Exact,
261            source: Source::Pinyin,
262        };
263        let b = Candidate {
264            word: "继续",
265            log_prior: 999 * Q4,
266            log_likelihood: -10 * Q4,
267            match_type: MatchType::Exact,
268            source: Source::Pinyin,
269        };
270        let c_word_diff = Candidate { word: "继续。", ..a };
271        let c_source_diff = Candidate { source: Source::Wubi, ..a };
272        let c_mt_diff = Candidate { match_type: MatchType::Prefix(500), ..a };
273        assert_eq!(a, b, "score should not affect identity");
274        assert_ne!(a, c_word_diff);
275        assert_ne!(a, c_source_diff);
276        assert_ne!(a, c_mt_diff);
277    }
278
279    #[cfg(feature = "std")]
280    #[test]
281    fn log_prior_from_freq_baseline_values() {
282        assert_eq!(log_prior_from_freq(0), 0);
283        let f1 = log_prior_from_freq(1);
284        assert!((10..=12).contains(&f1), "ln(2)·16 ≈ 11; got {f1}");
285        let f1000 = log_prior_from_freq(1000);
286        assert!((110..=112).contains(&f1000), "ln(1001)·16 ≈ 110; got {f1000}");
287        let f50000 = log_prior_from_freq(50_000);
288        assert!(f50000 > log_prior_from_freq(1000), "monotone in freq");
289    }
290
291    #[cfg(feature = "std")]
292    #[test]
293    fn derive_log_likelihood_exact_is_base() {
294        for base in [0, Q4, -10 * Q4, 100 * Q4] {
295            assert_eq!(derive_log_likelihood(base, MatchType::Exact), base);
296        }
297    }
298
299    #[cfg(feature = "std")]
300    #[test]
301    fn derive_log_likelihood_prefix_monotonic() {
302        let base = 100 * Q4;
303        let high = derive_log_likelihood(base, MatchType::Prefix(875));
304        let mid = derive_log_likelihood(base, MatchType::Prefix(500));
305        let low = derive_log_likelihood(base, MatchType::Prefix(100));
306        let full = derive_log_likelihood(base, MatchType::Prefix(1000));
307        assert_eq!(full, base, "prox=1000 collapses to base (ln 1 = 0)");
308        assert!(high < base && mid < high && low < mid,
309            "prefix decay must be monotone-decreasing in proximity; got full={full} high={high} mid={mid} low={low}");
310    }
311
312    #[cfg(feature = "std")]
313    #[test]
314    fn derive_log_likelihood_fuzzy_monotonic_in_cost() {
315        let base = 100 * Q4;
316        let cheap = derive_log_likelihood(base, MatchType::Fuzzy(100));
317        let expensive = derive_log_likelihood(base, MatchType::Fuzzy(700));
318        let zero = derive_log_likelihood(base, MatchType::Fuzzy(0));
319        assert_eq!(zero, base, "cost=0 collapses to base (ln 1 = 0)");
320        assert!(expensive < cheap && cheap < base,
321            "fuzzy decay must be monotone-decreasing in cost; got zero={zero} cheap={cheap} expensive={expensive}");
322    }
323
324    #[cfg(feature = "std")]
325    #[test]
326    fn derive_log_likelihood_composed_per_link_decay() {
327        let base = 200 * Q4;
328        let one = derive_log_likelihood(base, MatchType::Composed { bigram_links: 1 });
329        let two = derive_log_likelihood(base, MatchType::Composed { bigram_links: 2 });
330        let three = derive_log_likelihood(base, MatchType::Composed { bigram_links: 3 });
331        let zero = derive_log_likelihood(base, MatchType::Composed { bigram_links: 0 });
332        assert_eq!(zero, base, "0 links — no chain — no decay");
333        assert_eq!(one, base, "1 link is the first segment; (links − 1) = 0, no decay");
334        assert!(two < one && three < two,
335            "composed decay must drop per extra link; got one={one} two={two} three={three}");
336    }
337
338    #[cfg(feature = "std")]
339    #[test]
340    fn derive_log_likelihood_bayes_additivity_round_trip() {
341        // The whole point: caller can compute (log_prior, log_likelihood)
342        // independently and sum to a stable rank. Spot-check that
343        // `score()` of a candidate built from `log_prior_from_freq` +
344        // `derive_log_likelihood` recovers a sensible sort order across
345        // a synthetic 4-tuple covering Exact / Prefix / Fuzzy / Composed.
346        let base = 160 * Q4;
347        let make = |freq: u64, mt: MatchType| Candidate {
348            word: "x",
349            log_prior: log_prior_from_freq(freq),
350            log_likelihood: derive_log_likelihood(base, mt),
351            match_type: mt,
352            source: Source::Pinyin,
353        };
354        let exact_common = make(50_000, MatchType::Exact);
355        let exact_rare = make(10, MatchType::Exact);
356        let prefix_common = make(50_000, MatchType::Prefix(875));
357        let fuzzy_common = make(50_000, MatchType::Fuzzy(300));
358        // The common exact wins; common prefix beats rare exact (because
359        // prior contribution dominates); common fuzzy sits between common
360        // prefix and rare exact (rough chain, ranking-only check).
361        assert!(score(&exact_common) > score(&prefix_common));
362        assert!(score(&prefix_common) > score(&exact_rare),
363            "common prefix prior wins rare exact; got pref={} rare={}",
364            score(&prefix_common), score(&exact_rare));
365        assert!(score(&exact_common) > score(&fuzzy_common));
366    }
367
368    #[test]
369    fn score_orders_candidates_by_log_sum_desc() {
370        let mk = |lp: i32, ll: i32| Candidate {
371            word: "w",
372            log_prior: lp,
373            log_likelihood: ll,
374            match_type: MatchType::Exact,
375            source: Source::Pinyin,
376        };
377        let mut cands = [mk(10, 20), mk(50, -10), mk(0, 0), mk(100, 100)];
378        cands.sort_by(|a, b| score(b).cmp(&score(a)));
379        assert_eq!(score(&cands[0]), 200);
380        assert_eq!(score(&cands[1]), 40);
381        assert_eq!(score(&cands[2]), 30);
382        assert_eq!(score(&cands[3]), 0);
383    }
384}