wafrift-wafmodel 0.3.1

Active-learning WAF decompiler: reconstruct a WAF's decision boundary as an executable symbolic automaton, mine bypasses offline, and prove hole-closure.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
//! Origin-normalization fingerprinting — *measure* which decode/normalize
//! stages a target's origin applies, so the P2 solver TARGETS its preimage to
//! the real pipeline instead of speculatively trying every canonical sink.
//!
//! [`solve`](crate::solve) inverts a *given* sink `Pipeline`. The open
//! question on a real target is *which* pipeline the origin is. This module
//! answers it from behaviour: send a marker that carries exactly one reversible
//! transform, observe the value that reaches the sink (the reflection), and
//! admit the stage **only on an exact, unambiguous fold** — the folded ASCII
//! marker appears and the sent (homoglyph/encoded) form does not survive. The
//! returned `Vec<Stage>` plugs straight into
//! [`solve_bypass`](crate::solve::solve_bypass) /
//! [`norm_mismatch_members`](crate::norm_mismatch_members) as the sink.
//!
//! The markers are **data-driven**: their homoglyph forms come from the same
//! [`nfkc_preimage`]/[`bestfit`] engines the solver inverts (no hand-listed
//! confusables), so the probe set and the bypass generator can never drift.
//!
//! Soundness contract: a stage is proposed, not trusted. `solve_bypass`
//! re-verifies that its preimage reconstructs the attack through the proposed
//! pipeline, so a mis-detected or mis-ordered stage can only fail to produce a
//! bypass — never fabricate one.

use crate::error::Result;
use crate::transduce::Stage;
use wafrift_grammar::grammar::{bestfit, nfkc_preimage};

/// A reflection oracle: returns the bytes that reached the **sink** for a given
/// input — what the origin's decode/normalize pipeline produced. On a live
/// target this is "send `?q=<input>`, read the value reflected into the
/// response". Distinct from [`WafOracle`](crate::oracle::WafOracle) (block/pass
/// verdict); here we observe the *transformed value*.
///
/// The scanners in this module only ever *substring-search* the returned bytes
/// for the sent and folded forms of a probe, so an implementation may hand
/// back more than the isolated value (e.g. a whole response body).
pub trait ReflectionOracle {
    /// Reflect `input` back through the origin.
    ///
    /// `input` is raw bytes rather than `&str` because some carriers (the
    /// overlong-UTF-8 probe) are invalid UTF-8 by construction.
    ///
    /// # Errors
    ///
    /// Errors are transport-style (retryable), not a signal about
    /// normalization. The scanners in this module propagate the first error
    /// with `?` and abandon the remaining probes.
    fn reflect(&mut self, input: &[u8]) -> Result<Vec<u8>>;
}

/// Adapter that turns any `FnMut(&[u8]) -> Result<Vec<u8>>` closure into a
/// [`ReflectionOracle`]. This is the seam a live HTTP probe (scald / the wafrift
/// CLI client) plugs into, so no HTTP stack has to be pulled into this crate.
/// Mirrors [`FnOracle`](crate::oracle::FnOracle).
pub struct FnReflector<F>(pub F);

impl<F: FnMut(&[u8]) -> Result<Vec<u8>>> ReflectionOracle for FnReflector<F> {
    /// Forwards `input` to the wrapped closure and returns its result untouched
    /// (both the reflected bytes and any error).
    fn reflect(&mut self, input: &[u8]) -> Result<Vec<u8>> {
        let callback = &mut self.0;
        callback(input)
    }
}

/// A unique marker made of lowercase alphanumerics only, chosen so that the
/// marker itself is not what a probe's stage acts on — only the *carrier*
/// transform wrapped around it moves it. (Whole-value decoders are the stated
/// exception: per `run_probes`, base64/hex origins transform every value they
/// receive, which is why reflection is judged from the probes themselves
/// rather than from a verbatim echo.)
///
/// High-entropy on purpose: for the byte/whole-value probes (base64, hex,
/// overlong, NUL-strip) the *fold* is the bare marker, so a live target whose
/// page happens to contain the marker for any unrelated reason would make those
/// stages spuriously fire. A 16-char random-looking token makes ambient
/// collision astronomically unlikely, and [`scan_origin`]'s differential
/// baseline rejects the residual case explicitly rather than trusting luck.
const MARKER: &str = "wz7qx4k9mfp2r8td";

/// A second neutral token used only for the baseline control request in
/// [`scan_origin`]: distinct from `MARKER` (neither is a substring of the
/// other), so `MARKER` showing up in the response to `CONTROL` can only be
/// ambient page content, never a fold of what was sent. Lowercase alphanumerics,
/// like `MARKER`.
///
/// Note that [`scan_origin`] inspects the baseline for `MARKER` only; it does
/// not require `CONTROL` itself to be echoed back. Whether the channel reflects
/// at all is decided from the probe battery.
const CONTROL: &str = "ctl8b3n6haje5wq1";

/// One normalization probe: the stage it tests, the marker as sent on the wire
/// (carrying the homoglyph/encoded form), and the ASCII the origin reflects iff
/// it applied that stage.
struct Probe {
    /// The stage proposed for the sink pipeline when this probe folds exactly
    /// (folded form present, sent form gone).
    stage: Stage,
    /// Sent on the wire — `Vec<u8>` (not `String`) because the overlong-UTF-8
    /// carrier is *invalid* UTF-8 by construction.
    sent: Vec<u8>,
    /// The bytes expected in the reflection iff the origin applied `stage`.
    /// Matched as a substring of the reflected bytes, not as the whole value.
    folded: Vec<u8>,
}

/// Re-encode every ASCII byte of `s` as its non-canonical two-byte UTF-8 form
/// (`110000xx 10xxxxxx`); bytes above `0x7F` are copied through unchanged. This
/// is the carrier the overlong-decode probe sends (mirror of
/// `solve::overlong_encode`, inlined here to avoid widening that module's API
/// for a tiny helper).
fn overlong_bytes(s: &[u8]) -> Vec<u8> {
    // Worst case (all ASCII) doubles the length, so reserve that up front.
    s.iter()
        .fold(Vec::with_capacity(s.len() * 2), |mut encoded, &byte| {
            if byte.is_ascii() {
                // Lead byte carries the top bit(s), continuation the low six.
                encoded.extend_from_slice(&[0xC0 | (byte >> 6), 0x80 | (byte & 0x3F)]);
            } else {
                encoded.push(byte);
            }
            encoded
        })
}

/// The probe battery, in canonical pipeline order (decode before normalize:
/// `wire → … → decode → normalize → sink`). Order only sets the proposed
/// pipeline order; `solve_bypass` re-verifies reconstruction regardless.
fn probes() -> Vec<Probe> {
    let marker = MARKER.as_bytes().to_vec();
    let mut out = Vec::new();

    // ── Byte-level decodes (earliest in the pipeline) ──

    // URL-decode: `%2D` → `-`. `-` is the only literal here (it is the
    // definition of percent-encoding, not a confusables list).
    out.push(Probe {
        stage: Stage::UrlDecode {
            plus_is_space: false,
        },
        sent: format!("{MARKER}%2D").into_bytes(),
        folded: format!("{MARKER}-").into_bytes(),
    });

    // Double URL-decode: `%252D` survives one pass as `%2D` and only a *second*
    // pass yields `-` — the exact asymmetry the double-encode bypass exploits.
    // A double-decoding origin folds this; a single-decoding one does not, so
    // the two are distinguished. (A double-decoder also folds the single `%2D`
    // probe above and so reports both; `run_probes` drops the subsumed single.)
    out.push(Probe {
        stage: Stage::DoubleUrlDecode,
        sent: format!("{MARKER}%252D").into_bytes(),
        folded: format!("{MARKER}-").into_bytes(),
    });

    // Base64: the marker base64-encoded; a decoding origin yields the marker.
    {
        use base64::Engine;
        out.push(Probe {
            stage: Stage::Base64Decode,
            sent: base64::engine::general_purpose::STANDARD
                .encode(&marker)
                .into_bytes(),
            folded: marker.clone(),
        });
    }

    // Hex: the marker hex-encoded; a hex-decoding origin yields the marker.
    out.push(Probe {
        stage: Stage::HexDecode,
        sent: hex::encode(&marker).into_bytes(),
        folded: marker.clone(),
    });

    // Overlong UTF-8: the marker encoded in the non-canonical 2-byte form.
    out.push(Probe {
        stage: Stage::OverlongUtf8Decode,
        sent: overlong_bytes(&marker),
        folded: marker.clone(),
    });

    // NUL-strip: a marker with an embedded NUL the origin drops.
    let mut nul_sent = marker.clone();
    nul_sent.insert(2, 0);
    out.push(Probe {
        stage: Stage::StripNulls,
        sent: nul_sent,
        folded: marker.clone(),
    });

    // ── Framework string decodes (after byte decodes, before normalize) ──

    // HTML entity decode: `&#x2d;` → `-` (framework templating / browser). The
    // numeric-hex entity form is exactly the solver's `html_entity_encode`
    // inverse, so probe and bypass generator can never drift.
    out.push(Probe {
        stage: Stage::HtmlEntityDecode,
        sent: format!("{MARKER}&#x2d;").into_bytes(),
        folded: format!("{MARKER}-").into_bytes(),
    });

    // JSON string unescape: the `-` escape → `-` (what a JSON body parser
    // hands the app). Disjoint escape syntax from the URL and HTML carriers, so
    // no cross-detection — each origin reports only its own decode.
    out.push(Probe {
        stage: Stage::JsonUnescape,
        sent: format!("{MARKER}\\u002d").into_bytes(),
        folded: format!("{MARKER}-").into_bytes(),
    });

    // ── Character-level normalizers (later in the pipeline) ──

    // NFKC: a fully-homoglyph form of the marker, generated by the engine the
    // solver inverts. `normalize(sent) == MARKER` holds by the engine's gate.
    if let Some(h) = nfkc_preimage::variants(MARKER, 1).into_iter().next() {
        debug_assert_eq!(nfkc_preimage::normalize(&h), MARKER);
        out.push(Probe {
            stage: Stage::NfkcNormalize,
            sent: h.into_bytes(),
            folded: marker.clone(),
        });
    }

    // Best-fit: a marker carrying a curly quote the engine coerces to `'`.
    let bf_ascii = format!("{MARKER}'");
    if let Some(h) = bestfit::variants(&bf_ascii, 1).into_iter().next() {
        debug_assert_eq!(bestfit::normalize(&h), bf_ascii);
        out.push(Probe {
            stage: Stage::BestFitDownconvert,
            sent: h.into_bytes(),
            folded: bf_ascii.into_bytes(),
        });
    }

    out
}

/// Run the probe battery once. Returns `(reflection_observed, stages)`:
///
/// * `reflection_observed` is true if ANY probe's folded or sent form came back
///   — i.e. the channel demonstrably echoes our input (possibly transformed).
///   This is derived from the probes themselves, NOT a verbatim control echo,
///   so it stays correct even for whole-value origins (base64/hex) that
///   transform every value including a control token.
/// * `stages` are the admitted stages, in canonical order. When `suppress` is
///   set (ambient marker collision) no stage is admitted — fail-closed.
fn run_probes(oracle: &mut dyn ReflectionOracle, suppress: bool) -> Result<(bool, Vec<Stage>)> {
    let mut stages = Vec::new();
    let mut reflection_observed = false;
    for p in probes() {
        let reflected = oracle.reflect(&p.sent)?;
        let folded_seen = contains(&reflected, &p.folded);
        let sent_survived = contains(&reflected, &p.sent);
        if folded_seen || sent_survived {
            reflection_observed = true;
        }
        // Admit ONLY on an exact fold: the decoded/folded marker is present and
        // the carrier form did not survive. Anything else (unchanged, partial,
        // mangled) leaves the stage out — fail-closed, never a guess.
        if !suppress && folded_seen && !sent_survived {
            stages.push(p.stage);
        }
    }
    // A double-URL-decoding origin folds BOTH the single `%2D` and the double
    // `%252D` probe, so it admits UrlDecode *and* DoubleUrlDecode. These are not
    // independent stages to chain: the double subsumes the single, and chaining
    // both in the sink pipeline would over-decode (a third pass). Keep only the
    // deeper one so the pipeline handed to the solver matches the real origin.
    if stages.iter().any(|s| matches!(s, Stage::DoubleUrlDecode)) {
        stages.retain(|s| !matches!(s, Stage::UrlDecode { .. }));
    }
    Ok((reflection_observed, stages))
}

/// Fingerprint the origin's normalization pipeline by reflection and return the
/// stages it demonstrably applies, in canonical order — ready to be used as the
/// sink `Pipeline` handed to the solver.
///
/// An empty result means no stage was admitted. On a faithful reflection channel
/// that is a non-normalizing origin (the solver will then find no homoglyph
/// bypass rather than fabricate one), but this function cannot tell that apart
/// from a channel that never reflected the probes at all.
///
/// For a *live* target prefer [`scan_origin`], which adds the differential
/// baseline that distinguishes "non-normalizing" from "never observed the
/// reflection" and rejects ambient-marker false positives.
///
/// # Errors
///
/// Returns the first error reported by `oracle`.
pub fn detect_origin_normalization(oracle: &mut dyn ReflectionOracle) -> Result<Vec<Stage>> {
    // No baseline here, so nothing is suppressed; the reflection flag is unused.
    let (_reflection_observed, stages) = run_probes(oracle, false)?;
    Ok(stages)
}

/// Byte-substring test: does `needle` occur contiguously inside `haystack`?
/// An empty `needle` is never considered present.
fn contains(haystack: &[u8], needle: &[u8]) -> bool {
    match needle.len() {
        // Also keeps `windows` from being called with a zero width.
        0 => false,
        width => haystack.windows(width).any(|window| window == needle),
    }
}

/// The outcome of a live origin scan, as produced by [`scan_origin`]. Separates
/// the *measurement was valid* question from the *what did it find* answer — a
/// distinction [`detect_origin_normalization`] alone cannot express (an empty
/// `Vec` there means either "non-normalizing origin" or "we never saw the
/// reflection", and on a real target those demand opposite operator actions).
///
/// Read the two flags before `stages`: the stage list is only a finding when
/// `reflection_observed && !marker_collision`.
#[derive(Debug, Clone, PartialEq)]
pub struct OriginScan {
    /// Our probe content (folded or raw) was observed coming back — the channel
    /// demonstrably echoes, so a negative result is trustworthy. When `false`,
    /// the scan is inconclusive (wrong parameter, no reflection, or a transform
    /// that ate every probe) and `stages` is empty by construction, NOT a clean
    /// bill of health.
    pub reflection_observed: bool,
    /// The fold marker was already present in the baseline response (ambient
    /// page content collided with our marker). When `true` the byte/whole-value
    /// probes cannot be trusted, so `stages` is empty — fail-closed rather than
    /// report luck-driven detections.
    pub marker_collision: bool,
    /// Detected origin stages, in canonical order. Meaningful only when
    /// `reflection_observed && !marker_collision`.
    pub stages: Vec<Stage>,
}

/// Live origin scan with a **differential baseline** that makes the result
/// trustworthy on a real target — the production entry point the CLI uses.
///
/// Two phases, in this order:
///
/// 1. One control request carrying the neutral `CONTROL` token, which is not a
///    carrier for any probe. If `MARKER` appears in that response it is ambient
///    page content rather than a fold, so the scan is flagged
///    `marker_collision` and no stage will be admitted.
/// 2. The probe battery. `reflection_observed` is taken from whether probe
///    content (folded or raw) came back, which stays meaningful even for
///    whole-value origins that also transform the control token.
///
/// An empty `stages` with `reflection_observed = true` and no collision is
/// therefore a real "this origin normalizes nothing" rather than "we pointed at
/// the wrong parameter".
///
/// # Errors
///
/// Returns the first error reported by `oracle`, from either phase.
pub fn scan_origin(oracle: &mut dyn ReflectionOracle) -> Result<OriginScan> {
    // CONTROL can never *fold* to MARKER, so MARKER in the control response
    // means the page already contains it.
    let marker_collision = {
        let control_response = oracle.reflect(CONTROL.as_bytes())?;
        contains(&control_response, MARKER.as_bytes())
    };
    // The battery runs even under a collision: stages are suppressed, but the
    // reflection verdict is still measured.
    run_probes(oracle, marker_collision).map(|(reflection_observed, stages)| OriginScan {
        reflection_observed,
        marker_collision,
        stages,
    })
}

// Unit tests for the fingerprinter. Three groups:
//   1. per-stage recall/precision against `FakeOrigin` (an origin that applies
//      a declared `Pipeline`),
//   2. end-to-end: a detected pipeline fed to `solve_bypass`,
//   3. `scan_origin` baseline behaviour against hand-written oracles.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::transduce::Pipeline;

    /// A faithful origin double: it reflects exactly what its declared sink
    /// pipeline produces — the inverse role of `SimRegexWaf` for `WafOracle`.
    struct FakeOrigin(Pipeline);
    impl ReflectionOracle for FakeOrigin {
        fn reflect(&mut self, input: &[u8]) -> Result<Vec<u8>> {
            Ok(self.0.apply(input))
        }
    }

    /// Fingerprint a `FakeOrigin` built from `stages` and return what
    /// `detect_origin_normalization` reports. Panics if the oracle errors
    /// (`FakeOrigin` never returns `Err`).
    fn detect(stages: Vec<Stage>) -> Vec<Stage> {
        let mut o = FakeOrigin(Pipeline(stages));
        detect_origin_normalization(&mut o).unwrap()
    }

    #[test]
    fn identity_origin_detects_nothing() {
        // Baseline precision: an origin that changes nothing admits no stage.
        assert!(detect(vec![Stage::Identity]).is_empty());
    }

    #[test]
    fn nfkc_origin_is_detected() {
        // Recall: an NFKC-normalizing origin reports exactly NfkcNormalize.
        assert_eq!(
            detect(vec![Stage::NfkcNormalize]),
            vec![Stage::NfkcNormalize]
        );
    }

    #[test]
    fn bestfit_origin_is_detected() {
        // Recall: a best-fit origin reports exactly BestFitDownconvert.
        assert_eq!(
            detect(vec![Stage::BestFitDownconvert]),
            vec![Stage::BestFitDownconvert]
        );
    }

    #[test]
    fn url_decoding_origin_is_detected() {
        // Recall: a single-pass URL decoder reports exactly UrlDecode.
        assert_eq!(
            detect(vec![Stage::UrlDecode {
                plus_is_space: false
            }]),
            vec![Stage::UrlDecode {
                plus_is_space: false
            }]
        );
    }

    #[test]
    fn null_stripping_origin_is_detected() {
        // Recall: a NUL-stripping origin reports exactly StripNulls.
        assert_eq!(detect(vec![Stage::StripNulls]), vec![Stage::StripNulls]);
    }

    #[test]
    fn overlong_utf8_decoding_origin_is_detected() {
        // Recall: an overlong-decoding origin reports exactly OverlongUtf8Decode.
        assert_eq!(
            detect(vec![Stage::OverlongUtf8Decode]),
            vec![Stage::OverlongUtf8Decode]
        );
    }

    #[test]
    fn base64_decoding_origin_is_detected() {
        // Recall: a base64-decoding origin reports exactly Base64Decode.
        assert_eq!(detect(vec![Stage::Base64Decode]), vec![Stage::Base64Decode]);
    }

    #[test]
    fn hex_decoding_origin_is_detected() {
        // Recall: a hex-decoding origin reports exactly HexDecode.
        assert_eq!(detect(vec![Stage::HexDecode]), vec![Stage::HexDecode]);
    }

    #[test]
    fn base64_and_hex_do_not_cross_report() {
        // Both are whole-value text decodes over overlapping alphabets; assert
        // each origin reports ONLY itself (the marker's hex is not valid base64
        // of the marker, and vice versa).
        // NOTE(review): these are the same two assertions as the single-stage
        // base64/hex tests above; kept as a named precision guard.
        assert_eq!(detect(vec![Stage::Base64Decode]), vec![Stage::Base64Decode]);
        assert_eq!(detect(vec![Stage::HexDecode]), vec![Stage::HexDecode]);
    }

    #[test]
    fn base64_origin_does_not_falsely_report_other_decodes() {
        // Precision: a base64-decoding origin must report ONLY base64 — the
        // url/overlong/null markers are not valid base64 of those folds.
        // NOTE(review): identical assertion to `base64_decoding_origin_is_detected`.
        assert_eq!(detect(vec![Stage::Base64Decode]), vec![Stage::Base64Decode]);
    }

    #[test]
    fn byte_decodes_are_independent_no_cross_detection() {
        // Precision: a NUL-stripping origin must NOT report overlong-decode (or
        // url/normalize), and vice versa — the probes are mutually exclusive.
        assert_eq!(detect(vec![Stage::StripNulls]), vec![Stage::StripNulls]);
        assert_eq!(
            detect(vec![Stage::OverlongUtf8Decode]),
            vec![Stage::OverlongUtf8Decode]
        );
    }

    #[test]
    fn json_unescaping_origin_is_detected() {
        // Recall: a JSON-unescaping origin reports exactly JsonUnescape.
        assert_eq!(detect(vec![Stage::JsonUnescape]), vec![Stage::JsonUnescape]);
    }

    #[test]
    fn html_entity_decoding_origin_is_detected() {
        // Recall: an HTML-entity-decoding origin reports exactly HtmlEntityDecode.
        assert_eq!(
            detect(vec![Stage::HtmlEntityDecode]),
            vec![Stage::HtmlEntityDecode]
        );
    }

    #[test]
    fn double_url_decoding_origin_is_detected_and_subsumes_single() {
        // A double-decoder folds both the single and the double probe; detection
        // must report ONLY DoubleUrlDecode — chaining UrlDecode too would
        // over-decode the solver's preimage with a spurious third pass.
        let d = detect(vec![Stage::DoubleUrlDecode]);
        assert_eq!(d, vec![Stage::DoubleUrlDecode], "got {d:?}");
        assert!(!d.contains(&Stage::UrlDecode {
            plus_is_space: false
        }));
    }

    #[test]
    fn single_url_decode_is_not_reported_as_double() {
        // Precision twin: a single-decoding origin must NOT trip the double
        // probe — `%252D` survives one pass as `%2D` and never folds to `-`.
        let d = detect(vec![Stage::UrlDecode {
            plus_is_space: false,
        }]);
        assert_eq!(
            d,
            vec![Stage::UrlDecode {
                plus_is_space: false
            }]
        );
        assert!(!d.contains(&Stage::DoubleUrlDecode));
    }

    #[test]
    fn json_and_html_decodes_do_not_cross_report() {
        // Disjoint escape syntaxes: a JSON-unescaping origin must not report
        // HTML-entity-decode, and vice versa (neither carrier folds the other).
        let j = detect(vec![Stage::JsonUnescape]);
        assert_eq!(j, vec![Stage::JsonUnescape]);
        assert!(!j.contains(&Stage::HtmlEntityDecode));
        let h = detect(vec![Stage::HtmlEntityDecode]);
        assert_eq!(h, vec![Stage::HtmlEntityDecode]);
        assert!(!h.contains(&Stage::JsonUnescape));
    }

    #[test]
    fn framework_decodes_do_not_falsely_report_url_or_base64() {
        // Precision: the HTML/JSON carriers are not valid percent-encoding,
        // base64, or hex of the fold, so those origins must report only
        // themselves — never a spurious byte-decode.
        for st in [Stage::HtmlEntityDecode, Stage::JsonUnescape] {
            let d = detect(vec![st.clone()]);
            assert_eq!(d, vec![st.clone()], "stage {st:?} reported {d:?}");
        }
    }

    #[test]
    fn every_invertible_solver_stage_has_a_detection_probe() {
        // Anti-drift guard — this is the test that catches a stage gaining a
        // non-identity `solve::stage_inverse` without a fingerprint probe here.
        // When that happens the offline solver can bypass an origin class the
        // LIVE decompiler is blind to (exactly the gap this commit closes for
        // DoubleUrlDecode/JsonUnescape/HtmlEntityDecode). The list mirrors the
        // non-identity arms of `stage_inverse`; keep them in lockstep.
        use std::collections::HashSet;
        use std::mem::discriminant;
        // Compare by enum discriminant so variant payloads (e.g.
        // `plus_is_space`) do not affect the match.
        let probed: HashSet<_> = probes().iter().map(|p| discriminant(&p.stage)).collect();
        let invertible = [
            Stage::UrlDecode {
                plus_is_space: false,
            },
            Stage::DoubleUrlDecode,
            Stage::JsonUnescape,
            Stage::HtmlEntityDecode,
            Stage::NfkcNormalize,
            Stage::BestFitDownconvert,
            Stage::StripNulls,
            Stage::OverlongUtf8Decode,
            Stage::Base64Decode,
            Stage::HexDecode,
        ];
        for st in &invertible {
            assert!(
                probed.contains(&discriminant(st)),
                "invertible solver stage {st:?} has no detection probe in probes() \
                 — the live fingerprinter is blind to an origin the solver can bypass"
            );
        }
    }

    #[test]
    fn composite_url_then_nfkc_origin_detects_both_in_order() {
        // A framework that url-decodes then NFKC-normalizes: both probes fold,
        // and the canonical order (decode before normalize) is returned.
        let detected = detect(vec![
            Stage::UrlDecode {
                plus_is_space: false,
            },
            Stage::NfkcNormalize,
        ]);
        assert_eq!(
            detected,
            vec![
                Stage::UrlDecode {
                    plus_is_space: false
                },
                Stage::NfkcNormalize
            ]
        );
    }

    #[test]
    fn nfkc_normalizing_origin_does_not_falsely_report_bestfit() {
        // Precision twin: NFKC does NOT fold the curly quote, so the best-fit
        // probe must stay unfolded and best-fit must be absent.
        let detected = detect(vec![Stage::NfkcNormalize]);
        assert!(!detected.contains(&Stage::BestFitDownconvert));
        assert!(detected.contains(&Stage::NfkcNormalize));
    }

    #[test]
    fn detected_pipeline_drives_the_solver_to_a_targeted_bypass() {
        // The payoff: fingerprint the origin, then feed the detected sink to
        // the SAME solver — and it lands a homoglyph bypass with no sink guess.
        use crate::canon::Channel;
        use crate::normalize::Transform;
        use crate::oracle::{ChannelSet, Rule, SimRegexWaf};
        use crate::{Outcome, WafOracle, solve_bypass};
        use wafrift_types::Request;

        let detected = detect(vec![Stage::NfkcNormalize]);
        assert_eq!(detected, vec![Stage::NfkcNormalize]);
        let sink = Pipeline(detected);

        let attack = b"<script>";
        // Simulated WAF: one body rule matching `<script` after url-decode and
        // lowercasing, blocking at score 5.
        let mut waf = SimRegexWaf::new(
            vec![Rule {
                id: "941".into(),
                channels: ChannelSet::none().with(Channel::Body),
                transforms: vec![Transform::UrlDecodeUni, Transform::Lowercase],
                pattern: regex::bytes::Regex::new("<script").unwrap(),
                score: 5,
            }],
            5,
        );
        let build =
            |b: &[u8]| Request::post("https://h/p", b.to_vec()).header("Content-Type", "text/html");

        let sol = solve_bypass(attack, &sink, &mut waf, &build)
            .unwrap()
            .expect("a fingerprinted NFKC origin must yield a targeted homoglyph bypass");
        // The solved input must not carry the raw angle brackets.
        assert!(!sol.input.contains(&b'<') && !sol.input.contains(&b'>'));
        // Replay against a fresh, identically configured WAF: it must pass.
        let mut replay = SimRegexWaf::new(
            vec![Rule {
                id: "941".into(),
                channels: ChannelSet::none().with(Channel::Body),
                transforms: vec![Transform::UrlDecodeUni, Transform::Lowercase],
                pattern: regex::bytes::Regex::new("<script").unwrap(),
                score: 5,
            }],
            5,
        );
        assert_eq!(replay.classify(&build(&sol.input)).unwrap(), Outcome::Pass);
    }

    #[test]
    fn detected_double_decode_origin_drives_the_classic_double_encode_bypass() {
        // The headline trick, closed on the live path: fingerprint an origin
        // that URL-decodes TWICE, hand the detected pipeline to the solver, and
        // it derives the double-encoded payload — `%253Cscript` survives the
        // WAF's single decode as `%3Cscript` (inert) but the origin's second
        // pass reconstitutes `<script`. Before this commit the detector could
        // not see DoubleUrlDecode at all, so this end-to-end was unreachable.
        use crate::canon::Channel;
        use crate::normalize::Transform;
        use crate::oracle::{ChannelSet, Rule, SimRegexWaf};
        use crate::{Outcome, WafOracle, solve_bypass};
        use wafrift_types::Request;

        let detected = detect(vec![Stage::DoubleUrlDecode]);
        assert_eq!(detected, vec![Stage::DoubleUrlDecode], "got {detected:?}");
        let sink = Pipeline(detected);

        let attack = b"<script";
        // WAF decodes ONCE (urlDecodeUni is single-pass) — the asymmetry the
        // double-decode origin exploits.
        let rule = || Rule {
            id: "941".into(),
            channels: ChannelSet::none().with(Channel::Body),
            transforms: vec![Transform::UrlDecodeUni, Transform::Lowercase],
            pattern: regex::bytes::Regex::new("<script").unwrap(),
            score: 5,
        };
        let mut waf = SimRegexWaf::new(vec![rule()], 5);
        let build =
            |b: &[u8]| Request::post("https://h/p", b.to_vec()).header("Content-Type", "text/html");

        let sol = solve_bypass(attack, &sink, &mut waf, &build)
            .unwrap()
            .expect("a fingerprinted double-decoding origin must yield a double-encoded bypass");
        // The solved input carries no raw `<` (the WAF would catch it); the
        // bypass lives entirely in the second percent layer.
        assert!(
            !sol.input.contains(&b'<'),
            "solved input must not contain raw '<': {:?}",
            String::from_utf8_lossy(&sol.input)
        );
        // The origin's pipeline reconstructs the literal attack.
        assert!(
            sink.apply(&sol.input)
                .windows(attack.len())
                .any(|w| w == attack)
        );
        // Replay against a fresh WAF built from the same rule: it must pass.
        let mut replay = SimRegexWaf::new(vec![rule()], 5);
        assert_eq!(replay.classify(&build(&sol.input)).unwrap(), Outcome::Pass);
    }

    // ── scan_origin: differential-baseline robustness on live-style oracles ──

    /// Reflects exactly the input it is sent (a perfectly echoing, otherwise
    /// non-normalizing origin). Used to prove the baseline confirms reflection.
    struct EchoOrigin;
    impl ReflectionOracle for EchoOrigin {
        fn reflect(&mut self, input: &[u8]) -> Result<Vec<u8>> {
            Ok(input.to_vec())
        }
    }

    /// Always returns a fixed body, ignoring the input — models a target that
    /// does NOT reflect the probed parameter at all (wrong param / no echo).
    struct ConstOrigin(Vec<u8>);
    impl ReflectionOracle for ConstOrigin {
        fn reflect(&mut self, _input: &[u8]) -> Result<Vec<u8>> {
            Ok(self.0.clone())
        }
    }

    /// Echoes the input but ALSO injects the bare marker into every response —
    /// models a live page whose ambient content collides with our fold marker.
    struct MarkerInjectOrigin;
    impl ReflectionOracle for MarkerInjectOrigin {
        fn reflect(&mut self, input: &[u8]) -> Result<Vec<u8>> {
            let mut out = input.to_vec();
            out.extend_from_slice(MARKER.as_bytes());
            Ok(out)
        }
    }

    #[test]
    fn scan_confirms_reflection_on_an_echoing_identity_origin() {
        // The control reflects, the marker is not ambient ⇒ a trustworthy
        // "this origin normalizes nothing" (empty stages, reflection observed).
        let scan = scan_origin(&mut EchoOrigin).unwrap();
        assert!(scan.reflection_observed, "echoing origin must be observed");
        assert!(!scan.marker_collision);
        assert!(scan.stages.is_empty(), "identity origin has no stages");
    }

    #[test]
    fn scan_reports_no_reflection_when_the_channel_does_not_echo() {
        // A non-reflecting target ⇒ reflection_observed=false. The empty stage
        // list is explicitly NOT a clean bill of health (the whole point of the
        // baseline: distinguish "no echo" from "no normalization").
        let scan = scan_origin(&mut ConstOrigin(b"static page, no echo".to_vec())).unwrap();
        assert!(
            !scan.reflection_observed,
            "a non-echoing channel must not be reported as observed"
        );
        assert!(scan.stages.is_empty());
    }

    #[test]
    fn scan_fails_closed_on_ambient_marker_collision() {
        // The dangerous live false positive: the page already contains the fold
        // marker, so base64/hex/overlong/nul probes would all spuriously fire.
        // The baseline detects the collision and refuses to report ANY stage.
        let scan = scan_origin(&mut MarkerInjectOrigin).unwrap();
        // Every probe's sent form is echoed verbatim, so reflection is real.
        assert!(scan.reflection_observed, "the echo channel still works");
        assert!(
            scan.marker_collision,
            "ambient marker must be detected at baseline"
        );
        assert!(
            scan.stages.is_empty(),
            "marker collision must yield NO detections (fail-closed), got {:?}",
            scan.stages
        );
    }

    #[test]
    fn scan_still_detects_a_real_stage_through_the_baseline() {
        // The baseline must not suppress a genuine detection: a base64-decoding
        // origin still fingerprints as Base64Decode after the control passes.
        let mut o = FakeOrigin(Pipeline(vec![Stage::Base64Decode]));
        let scan = scan_origin(&mut o).unwrap();
        assert!(scan.reflection_observed);
        assert!(!scan.marker_collision);
        assert_eq!(scan.stages, vec![Stage::Base64Decode]);
    }

    #[test]
    fn marker_and_control_are_distinct_and_non_overlapping() {
        // The differential baseline relies on CONTROL reflecting while MARKER
        // is absent — which only works if neither token contains the other.
        assert_ne!(MARKER, CONTROL);
        assert!(!CONTROL.contains(MARKER));
        assert!(!MARKER.contains(CONTROL));
        // Both must be normalization-neutral (lowercase alnum) so no stage
        // moves them — otherwise the baseline itself could fold.
        for tok in [MARKER, CONTROL] {
            assert!(
                tok.bytes()
                    .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit()),
                "{tok} must be lowercase-alnum (normalization-neutral)"
            );
        }
    }
}