// every_other_token/transforms.rs

//! Token transform pipeline.
//!
//! This module defines the [`Transform`] enum and associated helpers used to
//! mutate individual tokens in the live LLM stream.  Transforms can be stacked
//! via [`Transform::Chain`] or selected randomly via [`Transform::Chaos`].
//!
//! ## Available transforms
//!
//! | Name | Effect |
//! |------|--------|
//! | `reverse` | Reverses the characters of the token |
//! | `uppercase` | Converts the token to uppercase |
//! | `mock` | Applies alternating lower/upper case per character |
//! | `noise` | Appends a random symbol from `* + ~ @ # $ %` |
//! | `chaos` | Randomly selects one of the above per call |
//! | `scramble` | Fisher-Yates shuffles the token's characters |
//! | `delete` | Replaces the token with the empty string |
//! | `synonym` | Substitutes the token with a static synonym, if known |
//! | `delay:N` | Passes the token through after an N-millisecond pause |
//! | `a,b,...` (or `chain:a,b,...`) | Applies the listed transforms in sequence |
20
use colored::*;
use once_cell::sync::Lazy;
use rand::Rng;
use std::collections::HashMap;
use std::sync::{LazyLock, Mutex};
26
/// Symbols from which the `Noise` transform draws its appended character.
const NOISE_CHARS: [char; 7] = ['*', '+', '~', '@', '#', '$', '%'];
28
/// Built-in word → synonym table consulted by the `Synonym` transform.
///
/// Keys are lowercase; lookups should lowercase the token first (see
/// `synonym_lookup`). Uses the standard library's `LazyLock` instead of the
/// third-party `once_cell::sync::Lazy`, and a single data table instead of
/// ~200 imperative `insert` calls.
static SYNONYM_MAP: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    HashMap::from([
        // Original 30 entries
        ("good", "great"), ("bad", "poor"), ("fast", "quick"), ("slow", "gradual"),
        ("big", "large"), ("small", "tiny"), ("happy", "glad"), ("sad", "unhappy"),
        ("smart", "clever"), ("old", "aged"), ("new", "fresh"), ("hot", "warm"),
        ("cold", "cool"), ("hard", "tough"), ("easy", "simple"), ("start", "begin"),
        ("end", "finish"), ("make", "create"), ("get", "obtain"), ("use", "employ"),
        ("say", "state"), ("go", "proceed"), ("see", "observe"), ("know", "understand"),
        ("think", "believe"), ("come", "arrive"), ("take", "acquire"), ("give", "provide"),
        ("find", "locate"), ("tell", "inform"),
        // Adjectives
        ("bright", "vivid"), ("dark", "dim"), ("clean", "pure"), ("dirty", "grimy"),
        ("strong", "powerful"), ("weak", "frail"), ("rich", "wealthy"), ("young", "youthful"),
        ("pretty", "beautiful"), ("ugly", "hideous"), ("loud", "noisy"), ("quiet", "silent"),
        ("angry", "furious"), ("calm", "serene"), ("brave", "courageous"), ("scared", "frightened"),
        ("funny", "amusing"), ("serious", "solemn"), ("kind", "gentle"), ("cruel", "harsh"),
        ("empty", "hollow"), ("full", "packed"), ("rough", "coarse"), ("smooth", "sleek"),
        ("sharp", "keen"), ("dull", "blunt"), ("deep", "profound"), ("shallow", "superficial"),
        ("wide", "broad"), ("narrow", "slim"), ("long", "lengthy"), ("short", "brief"),
        ("heavy", "weighty"), ("light", "featherweight"), ("warm", "heated"), ("frozen", "icy"),
        ("luminous", "bright"), ("gloomy", "dreary"), ("lively", "energetic"), ("tired", "weary"),
        ("healthy", "robust"), ("sick", "ill"), ("safe", "secure"), ("dangerous", "hazardous"),
        ("important", "crucial"), ("trivial", "minor"), ("simple", "plain"), ("complex", "intricate"),
        ("rare", "scarce"), ("common", "ordinary"), ("strange", "peculiar"), ("normal", "typical"),
        ("ancient", "archaic"), ("modern", "contemporary"), ("local", "regional"), ("distant", "remote"),
        // Verbs
        ("walk", "stroll"), ("run", "sprint"), ("eat", "consume"), ("drink", "sip"),
        ("write", "compose"), ("read", "peruse"), ("speak", "articulate"), ("listen", "hear"),
        ("look", "glance"), ("touch", "feel"), ("help", "assist"), ("stop", "halt"),
        ("try", "attempt"), ("fail", "falter"), ("win", "triumph"), ("forfeit", "lose"),
        ("buy", "purchase"), ("sell", "trade"), ("build", "construct"), ("break", "shatter"),
        ("fix", "repair"), ("cut", "slice"), ("push", "shove"), ("pull", "tug"),
        ("throw", "toss"), ("catch", "grab"), ("jump", "leap"), ("fall", "plunge"),
        ("rise", "ascend"), ("drop", "descend"), ("open", "unlock"), ("close", "shut"),
        ("move", "shift"), ("stay", "remain"), ("change", "alter"), ("grow", "expand"),
        ("shrink", "diminish"), ("show", "display"), ("hide", "conceal"), ("choose", "select"),
        ("allow", "permit"), ("prevent", "hinder"), ("need", "require"), ("want", "desire"),
        ("like", "enjoy"), ("hate", "despise"), ("fear", "dread"), ("love", "adore"),
        ("send", "dispatch"), ("receive", "accept"), ("keep", "retain"), ("misplace", "lose"),
        ("follow", "pursue"), ("lead", "guide"), ("wait", "linger"), ("hurry", "rush"),
        ("agree", "concur"), ("refuse", "decline"),
        // Nouns
        ("house", "dwelling"), ("car", "vehicle"), ("book", "volume"), ("friend", "companion"),
        ("work", "labor"), ("time", "duration"), ("way", "method"), ("place", "location"),
        ("thing", "object"), ("part", "component"), ("life", "existence"), ("day", "period"),
        ("man", "person"), ("woman", "individual"), ("child", "youth"), ("world", "realm"),
        ("school", "institution"), ("country", "nation"), ("city", "metropolis"), ("family", "household"),
        ("group", "collective"), ("system", "framework"), ("problem", "issue"), ("idea", "concept"),
        ("question", "inquiry"), ("result", "outcome"), ("road", "path"), ("tree", "plant"),
        ("water", "liquid"), ("fire", "flame"), ("glow", "light"), ("sound", "noise"),
        ("food", "nourishment"), ("money", "currency"), ("power", "strength"), ("mind", "intellect"),
        ("heart", "soul"), ("hand", "palm"), ("eye", "gaze"), ("word", "term"),
        ("story", "tale"), ("truth", "fact"), ("dream", "vision"), ("goal", "objective"),
        ("plan", "strategy"), ("step", "stage"), ("rule", "law"), ("right", "privilege"),
        ("choice", "option"), ("chance", "opportunity"),
    ])
});
231
/// Runtime synonym overrides, consulted before `SYNONYM_MAP` at lookup time.
/// Populated via [`set_synonym_overrides`] or [`load_synonym_overrides`];
/// starts empty. Uses the standard library's `LazyLock` instead of the
/// third-party `once_cell::sync::Lazy`.
static SYNONYM_OVERRIDES: LazyLock<Mutex<HashMap<String, String>>> =
    LazyLock::new(|| Mutex::new(HashMap::new()));
236
237/// Load synonym pairs from a file and register them as runtime overrides.
238///
239/// Supports two line formats:
240/// - TSV: `word\treplacement`
241/// - Key-value: `word = replacement`
242///
243/// Lines starting with `#` are treated as comments and skipped.
244///
245/// # Errors
246/// Returns an error if the file cannot be read. Individual malformed lines are silently skipped.
247pub fn load_synonym_overrides(path: &str) -> Result<(), Box<dyn std::error::Error>> {
248    let content = std::fs::read_to_string(path)?;
249    let mut overrides = SYNONYM_OVERRIDES.lock().unwrap_or_else(|e| e.into_inner());
250    for (line_num, raw_line) in content.lines().enumerate() {
251        let line = raw_line.trim();
252        if line.is_empty() || line.starts_with('#') {
253            continue;
254        }
255        if let Some((k, v)) = line.split_once('\t') {
256            overrides.insert(k.trim().to_lowercase(), v.trim().to_string());
257        } else if let Some((k, v)) = line.split_once('=') {
258            overrides.insert(k.trim().to_lowercase(), v.trim().to_string());
259        } else {
260            eprintln!("[eot] synonym file line {}: parse error — {:?}", line_num + 1, line);
261        }
262    }
263    Ok(())
264}
265
266/// Replace the current runtime synonym overrides with the given map.
267pub fn set_synonym_overrides(map: HashMap<String, String>) {
268    let mut overrides = SYNONYM_OVERRIDES.lock().unwrap_or_else(|e| e.into_inner());
269    *overrides = map;
270}
271
272/// Look up a token in the synonym map, checking runtime overrides first.
273fn synonym_lookup(token: &str) -> Option<String> {
274    let lower = token.to_lowercase();
275    {
276        let overrides = SYNONYM_OVERRIDES.lock().unwrap_or_else(|e| e.into_inner());
277        if let Some(v) = overrides.get(lower.as_str()) {
278            return Some(v.clone());
279        }
280    }
281    SYNONYM_MAP.get(lower.as_str()).map(|s| s.to_string())
282}
283
/// The set of token mutation strategies available at the interception layer.
///
/// Each variant describes a different way to perturb a token in the stream.
/// The transform is applied only at odd-indexed positions (i.e., every other
/// token, starting from index 1) unless the caller overrides the rate.
///
/// ## Strategies
///
/// | Variant | Behaviour |
/// |---------|-----------|
/// | `Reverse` | Reverses the Unicode characters of the token: `"hello"` -> `"olleh"`. |
/// | `Uppercase` | Uppercases every character: `"hello"` -> `"HELLO"`. |
/// | `Mock` | Alternates lowercase/uppercase per character position: `"hello"` -> `"hElLo"`. |
/// | `Noise` | Appends one random symbol from `* + ~ @ # $ %`: `"hello"` -> `"hello*"`. |
/// | `Chaos` | Randomly picks one of Reverse, Uppercase, Mock, or Noise per token. |
/// | `Scramble` | Fisher-Yates shuffles the characters: same characters, random order. |
/// | `Synonym` | Replaces the token via the runtime overrides or the built-in synonym map; passes through unchanged if no entry exists. |
/// | `Delete` | Drops the token entirely, returning an empty string. |
/// | `Delay(ms)` | Returns the token unmodified. `apply` itself does not sleep; the pause is presumably enforced by the streaming layer — confirm. |
/// | `Chain(vec)` | Applies a sequence of transforms in order; label is the individual labels joined by `+`. |
#[derive(Debug, Clone)]
pub enum Transform {
    /// Reverse the Unicode characters of the token.
    Reverse,
    /// Uppercase every character of the token.
    Uppercase,
    /// Alternate lowercase/uppercase per character position (sPoNgEbOb case).
    Mock,
    /// Append one random symbol from the noise character set.
    Noise,
    /// Randomly select one of Reverse, Uppercase, Mock, or Noise for each token.
    Chaos,
    /// Shuffle the characters of the token using Fisher-Yates.
    Scramble,
    /// Return an empty string, effectively deleting the token from the stream.
    Delete,
    /// Replace the token with a known synonym; pass through unchanged if not found.
    Synonym,
    /// Return the token unchanged. NOTE(review): no sleep happens in `apply`;
    /// the `ms` delay is presumably applied upstream — confirm.
    Delay(u64),
    /// Apply a sequence of transforms in order, chaining their effects.
    Chain(Vec<Transform>),
}
327
328impl Transform {
329    /// Parse a transform name (case-insensitive) or a comma-separated chain.
330    ///
331    /// Recognised single names: `reverse`, `uppercase`, `mock`, `noise`, `chaos`,
332    /// `scramble`, `delete`, `synonym`, `delay`, `delay:N` (where N is milliseconds).
333    ///
334    /// Comma-separated input like `"reverse,uppercase"` produces a `Chain` variant.
335    /// A single-element comma-separated string is unwrapped to the plain variant.
336    ///
337    /// # Errors
338    ///
339    /// Returns `Err(String)` if any component name is unrecognised.
340    pub fn from_str_loose(s: &str) -> Result<Self, String> {
341        // Handle "chain:reverse,uppercase" prefix syntax as an alias for "reverse,uppercase"
342        let s = if let Some(rest) = s.strip_prefix("chain:") {
343            rest
344        } else {
345            s
346        };
347        // Handle comma-separated chain: "reverse,uppercase"
348        if s.contains(',') {
349            let parts: Result<Vec<Transform>, String> = s
350                .split(',')
351                .map(|part| Transform::from_str_single(part.trim()))
352                .collect();
353            let transforms = parts?;
354            if transforms.len() == 1 {
355                // len == 1 is checked above, so into_iter().next() is always Some.
356                return transforms
357                    .into_iter()
358                    .next()
359                    .ok_or_else(|| "internal: empty transform list".to_string());
360            }
361            return Ok(Transform::Chain(transforms));
362        }
363        Transform::from_str_single(s)
364    }
365
366    fn from_str_single(s: &str) -> Result<Self, String> {
367        let lower = s.to_lowercase();
368        // Handle "delay:NNN" or "delay" forms
369        if lower.starts_with("delay:") {
370            let ms: u64 = lower
371                .strip_prefix("delay:")
372                .and_then(|n| n.parse().ok())
373                .unwrap_or(100);
374            return Ok(Transform::Delay(ms));
375        }
376        match lower.as_str() {
377            "reverse" => Ok(Transform::Reverse),
378            "uppercase" => Ok(Transform::Uppercase),
379            "mock" => Ok(Transform::Mock),
380            "noise" => Ok(Transform::Noise),
381            "chaos" => Ok(Transform::Chaos),
382            "scramble" => Ok(Transform::Scramble),
383            "delete" => Ok(Transform::Delete),
384            "synonym" => Ok(Transform::Synonym),
385            "delay" => Ok(Transform::Delay(100)),
386            _ => Err(format!("Unknown transform: {}", s)),
387        }
388    }
389
390    /// Apply the transform using the provided RNG and return `(result, label)`.
391    /// For Chaos, the sub-transform is chosen via `rng`; for others the label
392    /// equals the transform name.  Prefer this over `apply_with_label` in hot
393    /// paths to avoid per-call `thread_rng()` TLS lookups.
394    pub fn apply_with_label_rng<R: Rng>(&self, token: &str, rng: &mut R) -> (String, String) {
395        match self {
396            Transform::Reverse => (token.chars().rev().collect(), "reverse".to_string()),
397            Transform::Uppercase => (token.to_uppercase(), "uppercase".to_string()),
398            Transform::Mock => (apply_mock(token), "mock".to_string()),
399            Transform::Noise => {
400                let noise_char = NOISE_CHARS[rng.gen_range(0..NOISE_CHARS.len())];
401                (format!("{}{}", token, noise_char), "noise".to_string())
402            }
403            Transform::Scramble => {
404                let mut chars: Vec<char> = token.chars().collect();
405                // Fisher-Yates shuffle
406                let n = chars.len();
407                for i in (1..n).rev() {
408                    let j = rng.gen_range(0..=i);
409                    chars.swap(i, j);
410                }
411                (chars.into_iter().collect(), "scramble".to_string())
412            }
413            Transform::Delete => (String::new(), "delete".to_string()),
414            Transform::Synonym => {
415                let result = synonym_lookup(token).unwrap_or_else(|| token.to_string());
416                (result, "synonym".to_string())
417            }
418            Transform::Delay(_) => (token.to_string(), "delay".to_string()),
419            Transform::Chaos => match rng.gen_range(0u8..4) {
420                0 => (token.chars().rev().collect(), "reverse".to_string()),
421                1 => (token.to_uppercase(), "uppercase".to_string()),
422                2 => (apply_mock(token), "mock".to_string()),
423                _ => {
424                    let noise_char = NOISE_CHARS[rng.gen_range(0..NOISE_CHARS.len())];
425                    (format!("{}{}", token, noise_char), "noise".to_string())
426                }
427            },
428            Transform::Chain(transforms) => {
429                let mut current = token.to_string();
430                let mut labels: Vec<String> = Vec::new();
431                for t in transforms {
432                    let (next, label) = t.apply_with_label_rng(&current, rng);
433                    current = next;
434                    labels.push(label);
435                }
436                (current, labels.join("+"))
437            }
438        }
439    }
440
441    /// Apply the transform using the provided RNG.
442    pub fn apply_rng<R: Rng>(&self, token: &str, rng: &mut R) -> String {
443        self.apply_with_label_rng(token, rng).0
444    }
445
446    /// Apply the transform and return `(result, label)`.  Creates a one-shot
447    /// `thread_rng()`; use `apply_with_label_rng` in hot paths.
448    pub fn apply_with_label(&self, token: &str) -> (String, String) {
449        self.apply_with_label_rng(token, &mut rand::thread_rng())
450    }
451
452    /// Apply the transform to `token` at the given `rate`.
453    /// `rate` must be in `[0.0, 1.0]`.
454    pub fn apply_at_rate(&self, token: &str, rate: f64) -> String {
455        debug_assert!(rate >= 0.0 && rate <= 1.0, "rate must be in [0, 1]");
456        let _ = rate;
457        self.apply(token)
458    }
459
460    /// Apply the transform and return only the resulting string.
461    ///
462    /// Convenience wrapper around [`apply_with_label`](Self::apply_with_label).
463    pub fn apply(&self, token: &str) -> String {
464        self.apply_with_label(token).0
465    }
466
467    /// Apply the transform to `token`, asserting that `rate` is valid.
468    pub fn apply_with_rate_check(&self, token: &str, rate: f64) -> String {
469        debug_assert!(rate >= 0.0 && rate <= 1.0, "rate must be in [0, 1]");
470        let _ = rate;
471        self.apply(token)
472    }
473}
474
/// Shared mock-case helper: lowercase at even indices, uppercase at odd
/// indices (sPoNgEbOb case). Multi-char case mappings are truncated to their
/// first character, so each input char maps to exactly one output char.
fn apply_mock(token: &str) -> String {
    let mut mocked = String::with_capacity(token.len());
    for (idx, ch) in token.chars().enumerate() {
        let cased = if idx % 2 == 1 {
            ch.to_uppercase().next().unwrap_or(ch)
        } else {
            ch.to_lowercase().next().unwrap_or(ch)
        };
        mocked.push(cased);
    }
    mocked
}
489
/// Returns true if `ch` falls in one of the CJK-related Unicode blocks that
/// should be emitted as its own single-character token.
///
/// NOTE(review): besides ideographs the ranges also include CJK punctuation
/// (U+3000–U+303F) and halfwidth/fullwidth forms (U+FF00–U+FFEF), so e.g.
/// fullwidth Latin letters are split one per token — confirm intended.
fn is_cjk(ch: char) -> bool {
    matches!(ch,
        '\u{4E00}'..='\u{9FFF}'     // CJK Unified Ideographs
        | '\u{3400}'..='\u{4DBF}'   // CJK Extension A
        | '\u{20000}'..='\u{2A6DF}' // CJK Extension B
        | '\u{F900}'..='\u{FAFF}'   // CJK Compatibility Ideographs
        | '\u{3000}'..='\u{303F}'   // CJK Symbols and Punctuation
        | '\u{FF00}'..='\u{FFEF}'   // Halfwidth/Fullwidth Forms
    )
}

/// Returns true if `ch` should be treated as a word-boundary punctuation
/// character (split out as a single-char token, just like ASCII punctuation).
fn is_word_boundary_punct(ch: char) -> bool {
    ch.is_ascii_punctuation()
        || matches!(
            ch,
            '\u{2014}' // em dash —
            | '\u{2013}' // en dash –
            | '\u{2026}' // ellipsis …
            | '«'
            | '»'
            | '\u{201C}' // left double quote
            | '\u{201D}' // right double quote
            | '\u{2018}' // left single quote
            | '\u{2019}' // right single quote
            | '„'
            | '‹'
            | '›'
            | '·'
        )
}

/// Split text into tokens: words, punctuation (one char per token),
/// whitespace (one char per token), and CJK characters (one char per token,
/// since no spaces separate them).
///
/// The concatenation of the returned tokens reproduces the input exactly.
pub fn tokenize(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut word = String::new();

    for ch in text.chars() {
        // Whitespace, boundary punctuation, and CJK chars all end the current
        // word and are emitted as their own one-char token. (The previous code
        // pushed `ch` via two complementary `is_whitespace` branches, which
        // was a redundant duplicated conditional — both always pushed once.)
        if ch.is_whitespace() || is_word_boundary_punct(ch) || is_cjk(ch) {
            if !word.is_empty() {
                tokens.push(std::mem::take(&mut word));
            }
            tokens.push(ch.to_string());
        } else {
            word.push(ch);
        }
    }

    if !word.is_empty() {
        tokens.push(word);
    }

    tokens
}
559
560/// Calculate simulated token importance (0.0 to 1.0) using a caller-supplied RNG.
561/// Identical to `calculate_token_importance` but takes an explicit RNG parameter
562/// for deterministic/seeded use.
563pub fn calculate_token_importance_rng<R: rand::Rng>(
564    token: &str,
565    position: usize,
566    rng: &mut R,
567) -> f64 {
568    let mut importance = 0.0;
569
570    importance += (token.len() as f64 / 20.0).min(0.3);
571
572    let position_factor = if !(5..=50).contains(&position) {
573        0.3
574    } else {
575        0.1
576    };
577    importance += position_factor;
578
579    if token.chars().any(|c| c.is_uppercase()) {
580        importance += 0.2;
581    }
582
583    let important_patterns = [
584        "the",
585        "and",
586        "or",
587        "but",
588        "if",
589        "when",
590        "where",
591        "how",
592        "why",
593        "what",
594        "robot",
595        "AI",
596        "technology",
597        "system",
598        "data",
599        "algorithm",
600        "model",
601        "create",
602        "build",
603        "develop",
604        "analyze",
605        "process",
606        "generate",
607    ];
608
609    let lower_token = token.to_lowercase();
610    if important_patterns
611        .iter()
612        .any(|&pattern| lower_token.contains(pattern))
613    {
614        importance += 0.3;
615    }
616
617    if token.chars().all(|c| c.is_ascii_punctuation()) {
618        importance *= 0.1;
619    }
620
621    importance += rng.gen_range(-0.1..0.1);
622
623    importance.clamp(0.0, 1.0)
624}
625
626/// Calculate simulated token importance (0.0 to 1.0) based on length,
627/// position, content type, and random jitter.
628/// Uses `thread_rng()` for jitter; for deterministic output use
629/// `calculate_token_importance_rng`.
630pub fn calculate_token_importance(token: &str, position: usize) -> f64 {
631    calculate_token_importance_rng(token, position, &mut rand::thread_rng())
632}
633
634/// Map an importance score to a terminal heatmap color.
635pub fn apply_heatmap_color(token: &str, importance: f64) -> String {
636    match importance {
637        i if i >= 0.8 => token.on_bright_red().bright_white().to_string(),
638        i if i >= 0.6 => token.on_red().bright_white().to_string(),
639        i if i >= 0.4 => token.on_yellow().black().to_string(),
640        i if i >= 0.2 => token.on_blue().bright_white().to_string(),
641        _ => token.normal().to_string(),
642    }
643}
644
645#[cfg(test)]
646mod tests {
647    use super::*;
648
649    // -- Transform apply tests --
650
651    #[test]
652    fn test_transform_reverse() {
653        assert_eq!(Transform::Reverse.apply("hello"), "olleh");
654        assert_eq!(Transform::Reverse.apply("world"), "dlrow");
655    }
656
657    #[test]
658    fn test_transform_uppercase() {
659        assert_eq!(Transform::Uppercase.apply("hello"), "HELLO");
660        assert_eq!(Transform::Uppercase.apply("world"), "WORLD");
661    }
662
663    #[test]
664    fn test_transform_mock() {
665        assert_eq!(Transform::Mock.apply("hello"), "hElLo");
666        assert_eq!(Transform::Mock.apply("world"), "wOrLd");
667    }
668
669    #[test]
670    fn test_transform_noise() {
671        let result = Transform::Noise.apply("hello");
672        assert!(result.starts_with("hello"));
673        assert!(result.len() > 5);
674    }
675
676    #[test]
677    fn test_transform_from_str_valid() {
678        assert!(matches!(
679            Transform::from_str_loose("reverse"),
680            Ok(Transform::Reverse)
681        ));
682        assert!(matches!(
683            Transform::from_str_loose("uppercase"),
684            Ok(Transform::Uppercase)
685        ));
686        assert!(matches!(
687            Transform::from_str_loose("mock"),
688            Ok(Transform::Mock)
689        ));
690        assert!(matches!(
691            Transform::from_str_loose("noise"),
692            Ok(Transform::Noise)
693        ));
694    }
695
696    #[test]
697    fn test_transform_from_str_invalid() {
698        assert!(Transform::from_str_loose("invalid").is_err());
699        assert!(Transform::from_str_loose("").is_err());
700        assert!(Transform::from_str_loose("foo").is_err());
701        assert!(Transform::from_str_loose("REVERSED").is_err());
702    }
703
704    #[test]
705    fn test_transform_from_str_case_insensitive() {
706        assert!(matches!(
707            Transform::from_str_loose("REVERSE"),
708            Ok(Transform::Reverse)
709        ));
710        assert!(matches!(
711            Transform::from_str_loose("Uppercase"),
712            Ok(Transform::Uppercase)
713        ));
714        assert!(matches!(
715            Transform::from_str_loose("MoCk"),
716            Ok(Transform::Mock)
717        ));
718    }
719
720    #[test]
721    fn test_transform_empty_inputs() {
722        assert_eq!(Transform::Reverse.apply(""), "");
723        assert_eq!(Transform::Uppercase.apply(""), "");
724        assert_eq!(Transform::Mock.apply(""), "");
725        assert_eq!(Transform::Noise.apply("").len(), 1);
726    }
727
728    #[test]
729    fn test_transform_single_char() {
730        assert_eq!(Transform::Reverse.apply("a"), "a");
731        assert_eq!(Transform::Mock.apply("a"), "a");
732        assert_eq!(Transform::Mock.apply("A"), "a");
733    }
734
735    #[test]
736    fn test_transform_mock_two_chars() {
737        assert_eq!(Transform::Mock.apply("ab"), "aB");
738        assert_eq!(Transform::Mock.apply("AB"), "aB");
739    }
740
741    #[test]
742    fn test_transform_preserves_length() {
743        let inputs = ["hello", "a", "ab", "abcdefghij", ""];
744        for input in &inputs {
745            assert_eq!(Transform::Reverse.apply(input).len(), input.len());
746            assert_eq!(Transform::Uppercase.apply(input).len(), input.len());
747            assert_eq!(Transform::Mock.apply(input).len(), input.len());
748        }
749    }
750
751    #[test]
752    fn test_transform_noise_appends_one_char() {
753        for _ in 0..20 {
754            let result = Transform::Noise.apply("test");
755            assert_eq!(result.len(), 5);
756            assert!(result.starts_with("test"));
757        }
758    }
759
760    #[test]
761    fn test_transform_noise_char_from_set() {
762        let noise_set = ['*', '+', '~', '@', '#', '$', '%'];
763        for _ in 0..50 {
764            let result = Transform::Noise.apply("x");
765            let noise_char = result.chars().last().expect("should have noise char");
766            assert!(
767                noise_set.contains(&noise_char),
768                "unexpected: {}",
769                noise_char
770            );
771        }
772    }
773
774    #[test]
775    fn test_reverse_is_involution() {
776        let token = "hello";
777        assert_eq!(
778            Transform::Reverse.apply(&Transform::Reverse.apply(token)),
779            token
780        );
781    }
782
783    #[test]
784    fn test_uppercase_is_idempotent() {
785        let once = Transform::Uppercase.apply("hello");
786        assert_eq!(Transform::Uppercase.apply(&once), once);
787    }
788
789    #[test]
790    fn test_noise_length_always_plus_one() {
791        for token in &["a", "hello", "test123", ""] {
792            assert_eq!(Transform::Noise.apply(token).len(), token.len() + 1);
793        }
794    }
795
796    #[test]
797    fn test_all_transforms_produce_different_results() {
798        let results: Vec<String> = [Transform::Reverse, Transform::Uppercase, Transform::Mock]
799            .iter()
800            .map(|t| t.apply("hello"))
801            .collect();
802        assert_ne!(results[0], results[1]);
803        assert_ne!(results[1], results[2]);
804        assert_ne!(results[0], results[2]);
805    }
806
807    #[test]
808    fn test_uppercase_already_upper() {
809        assert_eq!(Transform::Uppercase.apply("HELLO"), "HELLO");
810    }
811
812    #[test]
813    fn test_uppercase_with_numbers() {
814        assert_eq!(Transform::Uppercase.apply("test123"), "TEST123");
815    }
816
817    #[test]
818    fn test_reverse_with_numbers() {
819        assert_eq!(Transform::Reverse.apply("abc123"), "321cba");
820    }
821
822    #[test]
823    fn test_mock_longer_string() {
824        assert_eq!(Transform::Mock.apply("abcdef"), "aBcDeF");
825    }
826
827    // -- Tokenizer tests --
828
829    #[test]
830    fn test_tokenize_simple_sentence() {
831        let tokens = tokenize("hello world");
832        assert!(tokens.contains(&"hello".to_string()));
833        assert!(tokens.contains(&"world".to_string()));
834    }
835
836    #[test]
837    fn test_tokenize_with_punctuation() {
838        let tokens = tokenize("hello, world!");
839        assert!(tokens.contains(&"hello".to_string()));
840        assert!(tokens.contains(&",".to_string()));
841        assert!(tokens.contains(&"world".to_string()));
842        assert!(tokens.contains(&"!".to_string()));
843    }
844
845    #[test]
846    fn test_tokenize_empty() {
847        assert!(tokenize("").is_empty());
848    }
849
850    #[test]
851    fn test_tokenize_single_word() {
852        assert_eq!(tokenize("hello"), vec!["hello"]);
853    }
854
855    #[test]
856    fn test_tokenize_only_whitespace() {
857        assert!(tokenize("   ").iter().all(|t| t.trim().is_empty()));
858    }
859
860    #[test]
861    fn test_tokenize_only_punctuation() {
862        assert_eq!(tokenize("..."), vec![".", ".", "."]);
863    }
864
865    #[test]
866    fn test_tokenize_mixed() {
867        let tokens = tokenize("hello,world");
868        assert_eq!(tokens, vec!["hello", ",", "world"]);
869    }
870
871    #[test]
872    fn test_tokenize_preserves_all_chars() {
873        let input = "hello, world! foo";
874        assert_eq!(tokenize(input).join(""), input);
875    }
876
877    #[test]
878    fn test_tokenize_multiple_spaces() {
879        let tokens = tokenize("a  b");
880        assert!(tokens.contains(&"a".to_string()));
881        assert!(tokens.contains(&"b".to_string()));
882    }
883
884    #[test]
885    fn test_tokenize_leading_trailing_space() {
886        assert!(tokenize(" hello ").iter().any(|t| t == "hello"));
887    }
888
889    #[test]
890    fn test_tokenize_numbers() {
891        let tokens = tokenize("42 is the answer");
892        assert!(tokens.contains(&"42".to_string()));
893    }
894
895    // -- Importance scoring tests --
896
897    #[test]
898    fn test_importance_clamped() {
899        for pos in 0..100 {
900            let imp = calculate_token_importance("test", pos);
901            assert!(imp >= 0.0 && imp <= 1.0);
902        }
903    }
904
905    #[test]
906    fn test_punctuation_low_importance() {
907        let mut total = 0.0;
908        for _ in 0..100 {
909            total += calculate_token_importance(".", 25);
910        }
911        assert!(total / 100.0 < 0.3);
912    }
913
914    #[test]
915    fn test_importance_early_position_boost() {
916        let early: f64 = (0..5)
917            .map(|p| calculate_token_importance("word", p))
918            .sum::<f64>()
919            / 5.0;
920        let mid: f64 = (10..15)
921            .map(|p| calculate_token_importance("word", p))
922            .sum::<f64>()
923            / 5.0;
924        assert!(early > mid - 0.2);
925    }
926
927    #[test]
928    fn test_importance_uppercase_boost() {
929        let n = 200;
930        let upper: f64 = (0..n)
931            .map(|_| calculate_token_importance("AI", 25))
932            .sum::<f64>()
933            / n as f64;
934        let lower: f64 = (0..n)
935            .map(|_| calculate_token_importance("ai", 25))
936            .sum::<f64>()
937            / n as f64;
938        assert!(upper > lower);
939    }
940
941    #[test]
942    fn test_importance_keyword_boost() {
943        let n = 200;
944        let kw: f64 = (0..n)
945            .map(|_| calculate_token_importance("algorithm", 25))
946            .sum::<f64>()
947            / n as f64;
948        let plain: f64 = (0..n)
949            .map(|_| calculate_token_importance("xyz", 25))
950            .sum::<f64>()
951            / n as f64;
952        assert!(kw > plain);
953    }
954
955    #[test]
956    fn test_importance_long_token_boost() {
957        let n = 200;
958        let long: f64 = (0..n)
959            .map(|_| calculate_token_importance("supercalifragilistic", 25))
960            .sum::<f64>()
961            / n as f64;
962        let short: f64 = (0..n)
963            .map(|_| calculate_token_importance("a", 25))
964            .sum::<f64>()
965            / n as f64;
966        assert!(long > short);
967    }
968
969    #[test]
970    fn test_importance_all_tokens_in_range() {
971        let tokens = [
972            ".",
973            ",",
974            "!",
975            "?",
976            "a",
977            "AI",
978            "algorithm",
979            "the",
980            "superlongtoken",
981        ];
982        for token in &tokens {
983            for pos in [0, 1, 5, 25, 50, 100] {
984                let imp = calculate_token_importance(token, pos);
985                assert!(imp >= 0.0 && imp <= 1.0);
986            }
987        }
988    }
989
990    // -- Heatmap color tests --
991
992    #[test]
993    fn test_heatmap_color_nonempty() {
994        for level in [0.0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0] {
995            assert!(!apply_heatmap_color("test", level).is_empty());
996        }
997    }
998
999    #[test]
1000    fn test_heatmap_color_contains_text() {
1001        assert!(apply_heatmap_color("mytoken", 0.5).contains("mytoken"));
1002    }
1003
1004    // -- Chaos transform tests --
1005
1006    #[test]
1007    fn test_transform_chaos_from_str() {
1008        assert!(matches!(
1009            Transform::from_str_loose("chaos"),
1010            Ok(Transform::Chaos)
1011        ));
1012    }
1013
1014    #[test]
1015    fn test_transform_chaos_from_str_case_insensitive() {
1016        assert!(matches!(
1017            Transform::from_str_loose("CHAOS"),
1018            Ok(Transform::Chaos)
1019        ));
1020        assert!(matches!(
1021            Transform::from_str_loose("Chaos"),
1022            Ok(Transform::Chaos)
1023        ));
1024    }
1025
1026    #[test]
1027    fn test_transform_chaos_apply_nonempty() {
1028        for _ in 0..20 {
1029            let result = Transform::Chaos.apply("hello");
1030            assert!(!result.is_empty());
1031        }
1032    }
1033
1034    #[test]
1035    fn test_transform_chaos_apply_with_label_returns_known_label() {
1036        let known = ["reverse", "uppercase", "mock", "noise"];
1037        for _ in 0..50 {
1038            let (_text, label) = Transform::Chaos.apply_with_label("hello");
1039            assert!(
1040                known.contains(&label.as_str()),
1041                "unexpected label: {}",
1042                label
1043            );
1044        }
1045    }
1046
1047    #[test]
1048    fn test_transform_chaos_apply_with_label_text_nonempty() {
1049        for _ in 0..20 {
1050            let (text, _label) = Transform::Chaos.apply_with_label("world");
1051            assert!(!text.is_empty());
1052        }
1053    }
1054
1055    #[test]
1056    fn test_transform_chaos_empty_input() {
1057        // Noise appends 1 char, others keep length 0; either way no panic
1058        let (_text, label) = Transform::Chaos.apply_with_label("");
1059        let known = ["reverse", "uppercase", "mock", "noise"];
1060        assert!(known.contains(&label.as_str()));
1061    }
1062
1063    #[test]
1064    fn test_apply_with_label_non_chaos_label_matches_name() {
1065        assert_eq!(Transform::Reverse.apply_with_label("hi").1, "reverse");
1066        assert_eq!(Transform::Uppercase.apply_with_label("hi").1, "uppercase");
1067        assert_eq!(Transform::Mock.apply_with_label("hi").1, "mock");
1068        assert_eq!(Transform::Noise.apply_with_label("hi").1, "noise");
1069    }
1070
1071    #[test]
1072    fn test_apply_with_label_text_matches_apply() {
1073        let inputs = ["hello", "world", "test", ""];
1074        for input in &inputs {
1075            // Deterministic transforms only (not Chaos/Noise which are random)
1076            assert_eq!(
1077                Transform::Reverse.apply_with_label(input).0,
1078                Transform::Reverse.apply(input)
1079            );
1080            assert_eq!(
1081                Transform::Uppercase.apply_with_label(input).0,
1082                Transform::Uppercase.apply(input)
1083            );
1084            assert_eq!(
1085                Transform::Mock.apply_with_label(input).0,
1086                Transform::Mock.apply(input)
1087            );
1088        }
1089    }
1090
1091    #[test]
1092    fn test_transform_chaos_produces_variety_over_many_calls() {
1093        // Over 100 calls, Chaos should produce at least 2 distinct results
1094        let mut results: std::collections::HashSet<String> = std::collections::HashSet::new();
1095        for _ in 0..100 {
1096            results.insert(Transform::Chaos.apply("hello"));
1097        }
1098        assert!(results.len() >= 2, "Chaos should produce varied results");
1099    }
1100
1101    #[test]
1102    fn test_transform_scramble_same_chars() {
1103        let input = "hello";
1104        for _ in 0..20 {
1105            let result = Transform::Scramble.apply(input);
1106            let mut orig_sorted: Vec<char> = input.chars().collect();
1107            let mut res_sorted: Vec<char> = result.chars().collect();
1108            orig_sorted.sort();
1109            res_sorted.sort();
1110            assert_eq!(
1111                orig_sorted, res_sorted,
1112                "Scramble should produce same chars"
1113            );
1114        }
1115    }
1116
1117    #[test]
1118    fn test_transform_scramble_label() {
1119        let (_, label) = Transform::Scramble.apply_with_label("hi");
1120        assert_eq!(label, "scramble");
1121    }
1122
1123    #[test]
1124    fn test_transform_delete_empty() {
1125        assert_eq!(Transform::Delete.apply("hello"), "");
1126        assert_eq!(Transform::Delete.apply(""), "");
1127    }
1128
1129    #[test]
1130    fn test_transform_delete_label() {
1131        let (text, label) = Transform::Delete.apply_with_label("foo");
1132        assert_eq!(text, "");
1133        assert_eq!(label, "delete");
1134    }
1135
1136    #[test]
1137    fn test_transform_synonym_known() {
1138        assert_eq!(Transform::Synonym.apply("good"), "great");
1139        assert_eq!(Transform::Synonym.apply("bad"), "poor");
1140        assert_eq!(Transform::Synonym.apply("fast"), "quick");
1141    }
1142
1143    #[test]
1144    fn test_transform_synonym_unknown_passthrough() {
1145        assert_eq!(Transform::Synonym.apply("xyzzy"), "xyzzy");
1146    }
1147
1148    #[test]
1149    fn test_transform_synonym_label() {
1150        let (_, label) = Transform::Synonym.apply_with_label("good");
1151        assert_eq!(label, "synonym");
1152    }
1153
1154    #[test]
1155    fn test_transform_from_str_delay_colon() {
1156        assert!(matches!(
1157            Transform::from_str_loose("delay:200"),
1158            Ok(Transform::Delay(200))
1159        ));
1160    }
1161
1162    #[test]
1163    fn test_transform_from_str_delay_default() {
1164        assert!(matches!(
1165            Transform::from_str_loose("delay"),
1166            Ok(Transform::Delay(100))
1167        ));
1168    }
1169
1170    #[test]
1171    fn test_transform_delay_passthrough() {
1172        assert_eq!(Transform::Delay(50).apply("hello"), "hello");
1173    }
1174
1175    #[test]
1176    fn test_transform_from_str_scramble() {
1177        assert!(matches!(
1178            Transform::from_str_loose("scramble"),
1179            Ok(Transform::Scramble)
1180        ));
1181    }
1182
1183    #[test]
1184    fn test_transform_from_str_delete() {
1185        assert!(matches!(
1186            Transform::from_str_loose("delete"),
1187            Ok(Transform::Delete)
1188        ));
1189    }
1190
1191    #[test]
1192    fn test_transform_from_str_synonym() {
1193        assert!(matches!(
1194            Transform::from_str_loose("synonym"),
1195            Ok(Transform::Synonym)
1196        ));
1197    }
1198
1199    // -- Chain transform tests (Change 1) --
1200
1201    #[test]
1202    fn test_chain_reverse_uppercase() {
1203        let chain = Transform::Chain(vec![Transform::Reverse, Transform::Uppercase]);
1204        assert_eq!(chain.apply("hello"), "OLLEH");
1205    }
1206
1207    #[test]
1208    fn test_chain_mock_noise_label() {
1209        let chain = Transform::Chain(vec![Transform::Mock, Transform::Noise]);
1210        let (result, label) = chain.apply_with_label("hello");
1211        assert!(
1212            result.starts_with("hElLo"),
1213            "expected mock applied: {}",
1214            result
1215        );
1216        assert_eq!(label, "mock+noise");
1217    }
1218
1219    #[test]
1220    fn test_chain_from_str_loose_two() {
1221        let t = Transform::from_str_loose("reverse,uppercase").expect("parse ok");
1222        assert!(matches!(t, Transform::Chain(_)));
1223        assert_eq!(t.apply("hello"), "OLLEH");
1224    }
1225
1226    #[test]
1227    fn test_chain_from_str_loose_single_no_chain() {
1228        let t = Transform::from_str_loose("reverse").expect("parse ok");
1229        assert!(matches!(t, Transform::Reverse));
1230    }
1231
1232    #[test]
1233    fn test_chain_label_joined_with_plus() {
1234        let chain = Transform::Chain(vec![Transform::Reverse, Transform::Uppercase]);
1235        let (_, label) = chain.apply_with_label("hi");
1236        assert_eq!(label, "reverse+uppercase");
1237    }
1238
1239    // -- Unicode punctuation tokenize tests (Change 2) --
1240
1241    #[test]
1242    fn test_tokenize_em_dash() {
1243        let tokens = tokenize("word\u{2014}another");
1244        assert!(tokens.contains(&"word".to_string()));
1245        assert!(tokens.contains(&"\u{2014}".to_string()));
1246        assert!(tokens.contains(&"another".to_string()));
1247    }
1248
1249    #[test]
1250    fn test_tokenize_smart_quotes() {
1251        let tokens = tokenize("\u{201C}hello\u{201D}");
1252        assert!(tokens.contains(&"\u{201C}".to_string()));
1253        assert!(tokens.contains(&"hello".to_string()));
1254        assert!(tokens.contains(&"\u{201D}".to_string()));
1255    }
1256
1257    #[test]
1258    fn test_tokenize_ellipsis_unicode() {
1259        let tokens = tokenize("wait\u{2026}done");
1260        assert!(tokens.contains(&"\u{2026}".to_string()));
1261        assert!(tokens.contains(&"wait".to_string()));
1262        assert!(tokens.contains(&"done".to_string()));
1263    }
1264
1265    #[test]
1266    fn test_tokenize_en_dash() {
1267        let tokens = tokenize("2020\u{2013}2021");
1268        assert!(tokens.contains(&"\u{2013}".to_string()));
1269    }
1270
1271    #[test]
1272    fn test_tokenize_unicode_punct_preserves_all_chars() {
1273        let input = "hello\u{2014}world";
1274        assert_eq!(tokenize(input).join(""), input);
1275    }
1276
1277    // -- Seeded RNG importance tests (Change 3) --
1278
1279    #[test]
1280    fn test_importance_rng_same_seed_same_output() {
1281        use rand::SeedableRng;
1282        let mut rng1 = rand::rngs::StdRng::seed_from_u64(42);
1283        let mut rng2 = rand::rngs::StdRng::seed_from_u64(42);
1284        let v1 = calculate_token_importance_rng("hello", 10, &mut rng1);
1285        let v2 = calculate_token_importance_rng("hello", 10, &mut rng2);
1286        assert_eq!(v1, v2, "same seed must produce same result");
1287    }
1288
1289    #[test]
1290    fn test_importance_rng_different_seeds_differ() {
1291        use rand::SeedableRng;
1292        let mut results: std::collections::HashSet<u64> = std::collections::HashSet::new();
1293        for seed in 0u64..50 {
1294            let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
1295            let v = calculate_token_importance_rng("test", 10, &mut rng);
1296            results.insert(v.to_bits());
1297        }
1298        assert!(results.len() > 1, "different seeds should sometimes differ");
1299    }
1300
1301    #[test]
1302    fn test_importance_rng_in_range() {
1303        use rand::SeedableRng;
1304        let mut rng = rand::rngs::StdRng::seed_from_u64(123);
1305        let v = calculate_token_importance_rng("algorithm", 5, &mut rng);
1306        assert!(v >= 0.0 && v <= 1.0);
1307    }
1308
1309    // -- Scramble and Delete extended tests (Improvement 7) --
1310
1311    #[test]
1312    fn test_scramble_empty_string() {
1313        assert_eq!(Transform::Scramble.apply(""), "");
1314    }
1315
1316    #[test]
1317    fn test_scramble_single_char() {
1318        for _ in 0..10 {
1319            assert_eq!(Transform::Scramble.apply("a"), "a");
1320        }
1321    }
1322
1323    #[test]
1324    fn test_scramble_preserves_chars() {
1325        let input = "hello";
1326        for _ in 0..20 {
1327            let result = Transform::Scramble.apply(input);
1328            let mut orig: Vec<char> = input.chars().collect();
1329            let mut res: Vec<char> = result.chars().collect();
1330            orig.sort();
1331            res.sort();
1332            assert_eq!(orig, res, "scramble should preserve the same characters");
1333        }
1334    }
1335
1336    #[test]
1337    fn test_scramble_produces_variety() {
1338        let mut results = std::collections::HashSet::new();
1339        for _ in 0..50 {
1340            results.insert(Transform::Scramble.apply("hello"));
1341        }
1342        assert!(
1343            results.len() >= 2,
1344            "scramble should produce different orderings"
1345        );
1346    }
1347
1348    #[test]
1349    fn test_delete_always_returns_empty() {
1350        for input in &["hello", "world", "test", "a", "abc123", ""] {
1351            assert_eq!(Transform::Delete.apply(input), "");
1352        }
1353    }
1354
1355    #[test]
1356    fn test_scramble_two_chars_both_permutations() {
1357        let mut seen = std::collections::HashSet::new();
1358        for _ in 0..200 {
1359            seen.insert(Transform::Scramble.apply("ab"));
1360        }
1361        assert!(seen.len() >= 1, "scramble of two chars should work");
1362    }
1363
1364    // ---- rstest parameterized tests ----
1365
1366    mod param_tests {
1367        use super::super::Transform;
1368        use rstest::rstest;
1369
1370        #[rstest]
1371        #[case("reverse", "olleh")]
1372        #[case("uppercase", "HELLO")]
1373        #[case("mock", "hElLo")]
1374        #[case("delete", "")]
1375        fn test_deterministic_transforms(#[case] name: &str, #[case] expected: &str) {
1376            let t = Transform::from_str_loose(name).expect("valid transform");
1377            assert_eq!(t.apply("hello"), expected, "transform={name}");
1378        }
1379
1380        #[rstest]
1381        #[case("reverse")]
1382        #[case("uppercase")]
1383        #[case("mock")]
1384        #[case("noise")]
1385        #[case("chaos")]
1386        #[case("scramble")]
1387        #[case("delete")]
1388        #[case("synonym")]
1389        #[case("delay")]
1390        fn test_all_transforms_parse(#[case] name: &str) {
1391            assert!(
1392                Transform::from_str_loose(name).is_ok(),
1393                "expected '{name}' to parse"
1394            );
1395        }
1396
1397        #[rstest]
1398        #[case("REVERSE")]
1399        #[case("Uppercase")]
1400        #[case("MOCK")]
1401        #[case("NOISE")]
1402        fn test_case_insensitive_parse(#[case] name: &str) {
1403            assert!(
1404                Transform::from_str_loose(name).is_ok(),
1405                "expected '{name}' to parse case-insensitively"
1406            );
1407        }
1408
1409        #[rstest]
1410        #[case("")]
1411        #[case("invalid")]
1412        #[case("REVERSED")]
1413        #[case("upper case")]
1414        fn test_invalid_transforms_error(#[case] name: &str) {
1415            assert!(
1416                Transform::from_str_loose(name).is_err(),
1417                "expected '{name}' to fail"
1418            );
1419        }
1420    }
1421
1422    // ---- chain: prefix syntax (item 12) ----
1423
1424    #[test]
1425    fn test_chain_prefix_two_transforms() {
1426        let t = Transform::from_str_loose("chain:reverse,uppercase").unwrap();
1427        assert!(matches!(t, Transform::Chain(_)));
1428    }
1429
1430    #[test]
1431    fn test_chain_prefix_single_unwraps() {
1432        let t = Transform::from_str_loose("chain:reverse").unwrap();
1433        assert!(matches!(t, Transform::Reverse));
1434    }
1435
1436    #[test]
1437    fn test_chain_prefix_invalid_propagates_err() {
1438        assert!(Transform::from_str_loose("chain:notreal,reverse").is_err());
1439    }
1440
1441    #[test]
1442    fn test_chain_prefix_equivalent_to_comma() {
1443        let with_prefix = Transform::from_str_loose("chain:reverse,uppercase").unwrap();
1444        let without_prefix = Transform::from_str_loose("reverse,uppercase").unwrap();
1445        // Both should be Chain variants with same length
1446        match (with_prefix, without_prefix) {
1447            (Transform::Chain(a), Transform::Chain(b)) => assert_eq!(a.len(), b.len()),
1448            _ => panic!("both should be Chain variants"),
1449        }
1450    }
1451
1452    // ---- CJK tokenization (item 7) ----
1453
1454    #[test]
1455    fn test_tokenize_cjk_individual_chars() {
1456        let tokens = tokenize("你好");
1457        assert_eq!(tokens, vec!["你", "好"], "each CJK char should be its own token");
1458    }
1459
1460    #[test]
1461    fn test_tokenize_cjk_mixed_with_latin() {
1462        let tokens = tokenize("hello你好world");
1463        assert!(tokens.contains(&"你".to_string()));
1464        assert!(tokens.contains(&"好".to_string()));
1465        assert!(tokens.contains(&"hello".to_string()));
1466        assert!(tokens.contains(&"world".to_string()));
1467    }
1468
1469    #[test]
1470    fn test_tokenize_cjk_with_spaces() {
1471        let tokens = tokenize("你 好");
1472        // CJK chars with space between them: "你", " ", "好"
1473        assert!(tokens.contains(&"你".to_string()));
1474        assert!(tokens.contains(&"好".to_string()));
1475    }
1476
1477    // ---- synonym file loading / runtime overrides (item 18) ----
1478    // All override tests are in a single test to avoid race conditions on the
1479    // global SYNONYM_OVERRIDES state when tests run in parallel.
1480
1481    #[test]
1482    fn test_synonym_overrides_all() {
1483        // Builtin fallback still works (no overrides set yet — cleared at end)
1484        assert_eq!(Transform::Synonym.apply("bad"), "poor");
1485
1486        // Runtime override takes priority over built-in
1487        set_synonym_overrides(
1488            vec![("good".to_string(), "fantastic".to_string())]
1489                .into_iter()
1490                .collect(),
1491        );
1492        assert_eq!(Transform::Synonym.apply("good"), "fantastic");
1493        set_synonym_overrides(std::collections::HashMap::new());
1494
1495        // TSV file format
1496        let tmp_tsv = std::env::temp_dir().join("synonyms_test_seq.tsv");
1497        std::fs::write(&tmp_tsv, "zephyr\tbreeze\n# comment line\n").expect("write");
1498        load_synonym_overrides(tmp_tsv.to_str().unwrap()).expect("load tsv");
1499        assert_eq!(Transform::Synonym.apply("zephyr"), "breeze");
1500        std::fs::remove_file(&tmp_tsv).ok();
1501        set_synonym_overrides(std::collections::HashMap::new());
1502
1503        // Key-value file format
1504        let tmp_kv = std::env::temp_dir().join("synonyms_kv_seq.txt");
1505        std::fs::write(&tmp_kv, "crimson = scarlet\n").expect("write");
1506        load_synonym_overrides(tmp_kv.to_str().unwrap()).expect("load kv");
1507        assert_eq!(Transform::Synonym.apply("crimson"), "scarlet");
1508        std::fs::remove_file(&tmp_kv).ok();
1509        set_synonym_overrides(std::collections::HashMap::new());
1510
1511        // After clearing, built-in map still works
1512        assert_eq!(Transform::Synonym.apply("fast"), "quick");
1513    }
1514
1515    // ---- proptest property-based tests — kept in a nested sub-module ----
1516}
1517
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        // Applying Reverse twice must return the original string.
        #[test]
        fn reverse_is_involution(s in "\\PC{0,50}") {
            let (once, _) = Transform::Reverse.apply_with_label(&s);
            let (twice, _) = Transform::Reverse.apply_with_label(&once);
            prop_assert_eq!(twice, s);
        }

        // Uppercasing an already-uppercased string changes nothing.
        #[test]
        fn uppercase_is_idempotent(s in "\\PC{0,50}") {
            let (once, _) = Transform::Uppercase.apply_with_label(&s);
            let (twice, _) = Transform::Uppercase.apply_with_label(&once);
            prop_assert_eq!(twice, once);
        }

        // Noise appends a symbol, so output length should be input length + 1 (in chars)
        #[test]
        fn noise_appends_one_char(s in "[a-z]{1,20}") {
            let (out, _) = Transform::Noise.apply_with_label(&s);
            prop_assert_eq!(out.chars().count(), s.chars().count() + 1);
        }

        // Delete maps every input, including empty, to the empty string.
        #[test]
        fn delete_always_empty(s in "\\PC{0,50}") {
            let (out, _) = Transform::Delete.apply_with_label(&s);
            prop_assert_eq!(out.as_str(), "");
        }

        // Chain([Uppercase, Reverse]) should uppercase then reverse
        #[test]
        fn chain_applies_in_order(s in "[a-z]{5,20}") {
            let chain = Transform::Chain(vec![Transform::Uppercase, Transform::Reverse]);
            let (out, _) = chain.apply_with_label(&s);
            let expected: String = s.to_uppercase().chars().rev().collect();
            prop_assert_eq!(out, expected);
        }
    }
}
1571
// Rust does not allow a second `mod tests` block in the same scope, so the
// configurable-confidence-threshold tests (and the later additions below)
// cannot be appended to the `mod tests` module defined earlier in this file.
// They live in their own `mod confidence_tests` sub-module instead.
#[cfg(test)]
mod confidence_tests {
    use super::*;

    // ---- configurable confidence thresholds (item 11) ----

    #[test]
    fn test_confidence_thresholds_custom() {
        use crate::render::{ConfidenceBand, ConfidenceThresholds};
        // Custom thresholds shift the High/Mid/Low band boundaries.
        let t = ConfidenceThresholds { high: 0.9, mid: 0.6 };
        assert_eq!(ConfidenceBand::from_confidence_with_thresholds(0.95, &t), ConfidenceBand::High);
        assert_eq!(ConfidenceBand::from_confidence_with_thresholds(0.75, &t), ConfidenceBand::Mid);
        assert_eq!(ConfidenceBand::from_confidence_with_thresholds(0.3, &t), ConfidenceBand::Low);
    }

    #[test]
    fn test_confidence_thresholds_default_unchanged() {
        use crate::render::{ConfidenceBand, ConfidenceThresholds};
        // Defaults are 0.7/0.4; boundary values land in the higher band.
        let t = ConfidenceThresholds::default();
        assert_eq!(t.high, 0.7);
        assert_eq!(t.mid, 0.4);
        assert_eq!(ConfidenceBand::from_confidence_with_thresholds(0.7, &t), ConfidenceBand::High);
        assert_eq!(ConfidenceBand::from_confidence_with_thresholds(0.4, &t), ConfidenceBand::Mid);
        assert_eq!(ConfidenceBand::from_confidence_with_thresholds(0.39, &t), ConfidenceBand::Low);
    }

    // -- Item 20: Unicode grapheme cluster tests --

    /// Known behavior: Reverse on a token with combining marks does chars().rev()
    /// which splits combining marks from their base characters.
    /// e.g. "e\u{0301}" reversed becomes "\u{0301}e" (combining accent before base).
    /// This is documented as a known limitation — proper grapheme-aware reversal
    /// would require a Unicode segmentation library.
    #[test]
    fn test_reverse_combining_marks() {
        let input = "e\u{0301}"; // e + combining acute accent = é
        let transform = Transform::Reverse;
        let (result, _) = transform.apply_with_label(input);
        // Result must be valid UTF-8 (no mojibake) — we just check it doesn't panic
        // and is a valid string. The combining mark may be reordered (known behavior).
        assert!(std::str::from_utf8(result.as_bytes()).is_ok(),
            "result must be valid UTF-8");
        assert!(!result.is_empty(), "result must not be empty");
    }

    #[test]
    fn test_scramble_preserves_length() {
        // Shuffling reorders chars but never adds or drops any.
        let input = "hello world";
        let transform = Transform::Scramble;
        let mut rng = rand::thread_rng();
        let (result, _) = transform.apply_with_label_rng(input, &mut rng);
        assert_eq!(result.chars().count(), input.chars().count(),
            "scramble should preserve char count");
    }

    #[test]
    fn test_mock_preserves_grapheme_count() {
        // CJK chars have no case variants, so per-char casing keeps the count.
        let input = "你好世界";
        let transform = Transform::Mock;
        let (result, _) = transform.apply_with_label(input);
        assert_eq!(result.chars().count(), input.chars().count(),
            "mock should preserve CJK char count");
    }

    // -- Item 11: dry-run chain shows steps --
    #[test]
    fn test_dry_run_chain_shows_steps() {
        let chain = Transform::Chain(vec![Transform::Reverse, Transform::Uppercase]);
        let input = "hello world";
        // Apply reverse then uppercase manually
        let (after_reverse, _) = Transform::Reverse.apply_with_label(input);
        assert_eq!(after_reverse, "dlrow olleh");
        let (after_uppercase, _) = Transform::Uppercase.apply_with_label(&after_reverse);
        assert_eq!(after_uppercase, "DLROW OLLEH");
        // Verify the chain produces the same result
        let (chain_result, _) = chain.apply_with_label(input);
        assert_eq!(chain_result, "DLROW OLLEH");
    }

    // -- Item 13: synonym load error includes line number --
    #[test]
    fn test_synonym_load_error_includes_line_number() {
        use std::io::Write;
        // Suffix the fixture name with the process id: the previous fixed
        // name ("eot_synonym_test_bad.tsv") collides when two test binaries
        // run concurrently against the same shared temp dir.
        let tmp = std::env::temp_dir()
            .join(format!("eot_synonym_test_bad_{}.tsv", std::process::id()));
        let mut f = std::fs::File::create(&tmp).unwrap();
        writeln!(f, "good\tgreat").unwrap();
        writeln!(f, "bad_line_no_separator").unwrap(); // bad line #2
        writeln!(f, "fast\tquick").unwrap();
        drop(f);
        // Capture stderr is not easy in tests, but we verify no panic occurs
        // and the function succeeds (bad lines are skipped with eprintln).
        let result = load_synonym_overrides(tmp.to_str().unwrap());
        assert!(result.is_ok(), "load_synonym_overrides should not fail on bad lines");
        let _ = std::fs::remove_file(&tmp);
    }
}