fionn_simd/skip/
mod.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2//! Skip strategies for JSON container traversal
3//!
4//! Provides multiple implementations for skipping JSON values, objects, and arrays.
5//! See `docs/research/skip-strategies.md` for algorithm attribution.
6//!
7//! Scalar implementations are in this module. SIMD implementations (AVX2, NEON)
8//! are in the unified `simd` module for architecture-specific acceleration.
9//!
10//! # Parallel Processing
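//! For processing multiple documents, use [`skip_objects_parallel`],
//! [`skip_arrays_parallel`], or [`skip_values_parallel`] (or [`ParallelSkipper`] to pick
//! the strategy explicitly); these distribute work across all available CPU cores using rayon.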
13
14mod arena_jsonski;
15mod jsonski;
16mod langdale;
17mod scalar;
18
19use rayon::prelude::*;
20
21pub use arena_jsonski::ArenaJsonSkiSkip;
22pub use jsonski::JsonSkiSkip;
23pub use langdale::LangdaleSkip;
24pub use scalar::ScalarSkip;
25
26// Re-export SIMD implementations from the x86 module
27#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
28pub use crate::x86::skip::Avx2Skip;
29
/// Result of a skip operation
///
/// Returned (wrapped in `Option`) by every method of the [`Skip`] trait.
///
/// Besides the comparison traits, the type implements `Hash` (so results can be
/// used in hashed collections, consistent with its `Eq` impl) and `Default`
/// (zero bytes consumed, no escapes).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub struct SkipResult {
    /// Bytes consumed (offset past closing delimiter)
    pub consumed: usize,
    /// Whether escape sequences were encountered
    pub has_escapes: bool,
}
38
39/// Strategy for skipping JSON values
40pub trait Skip {
41    /// Skip an object starting after the opening `{`
42    fn skip_object(&self, input: &[u8]) -> Option<SkipResult>;
43
44    /// Skip an array starting after the opening `[`
45    fn skip_array(&self, input: &[u8]) -> Option<SkipResult>;
46
47    /// Skip a string starting after the opening `"`
48    fn skip_string(&self, input: &[u8]) -> Option<SkipResult>;
49
50    /// Skip any JSON value (auto-detects type from first non-whitespace byte)
51    fn skip_value(&self, input: &[u8]) -> Option<SkipResult>;
52}
53
/// Runtime selection of skip strategy
///
/// The enum is a plain `Copy` tag; call [`SkipStrategy::skipper`] to obtain the
/// implementation it names. It is comparable (`PartialEq`/`Eq`) and hashable so it
/// can be tested with `==` and used as a map or set key.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub enum SkipStrategy {
    /// Scalar byte-by-byte (baseline)
    Scalar,
    /// Langdale-Lemire XOR prefix with branchless escape
    Langdale,
    /// `JSONSki` bracket counting (default - best general performance)
    #[default]
    JsonSki,
    /// AVX2 SIMD (only compiled on `x86` / `x86_64` targets)
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    Avx2,
}
68
69impl std::fmt::Display for SkipStrategy {
70    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71        let name = match self {
72            Self::Scalar => "Scalar",
73            Self::Langdale => "Langdale",
74            Self::JsonSki => "JsonSki",
75            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
76            Self::Avx2 => "Avx2",
77        };
78        write!(f, "{name}")
79    }
80}
81
82impl SkipStrategy {
83    /// Create a skipper for this strategy
84    #[must_use]
85    pub fn skipper(&self) -> Box<dyn Skip + Send + Sync> {
86        match self {
87            Self::Scalar => Box::new(ScalarSkip),
88            Self::Langdale => Box::new(LangdaleSkip::new()),
89            Self::JsonSki => Box::new(JsonSkiSkip::new()),
90            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
91            Self::Avx2 => Box::new(Avx2Skip::new()),
92        }
93    }
94
95    /// Get the best available SIMD strategy for this platform
96    #[must_use]
97    pub fn best_simd() -> Self {
98        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
99        {
100            if Avx2Skip::is_available() {
101                return Self::Avx2;
102            }
103        }
104        Self::JsonSki
105    }
106
107    /// Get all available strategies for benchmarking
108    #[must_use]
109    pub fn all_strategies() -> Vec<Self> {
110        let mut strategies = vec![Self::Scalar, Self::Langdale, Self::JsonSki];
111        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
112        {
113            if Avx2Skip::is_available() {
114                strategies.push(Self::Avx2);
115            }
116        }
117        strategies
118    }
119}
120
121/// Skip multiple JSON documents in parallel using all available CPU cores
122///
123/// Each document is processed independently using the best available SIMD strategy.
124/// This is ideal for batch processing of JSON lines (JSONL) or arrays of documents.
125///
126/// # Arguments
127/// * `documents` - Slice of byte slices, each containing a JSON document starting with `{` or `[`
128///
129/// # Returns
130/// Vector of skip results in the same order as input documents
131///
132/// # Example
133/// ```
134/// use fionn_simd::skip::skip_objects_parallel;
135///
136/// let docs: Vec<&[u8]> = vec![
137///     br#""key": "value"}"#,
138///     br#""name": "test"}"#,
139/// ];
140/// let results = skip_objects_parallel(&docs);
141/// ```
142#[must_use]
143pub fn skip_objects_parallel(documents: &[&[u8]]) -> Vec<Option<SkipResult>> {
144    documents
145        .par_iter()
146        .map(|doc| {
147            let skipper = SkipStrategy::best_simd().skipper();
148            skipper.skip_object(doc)
149        })
150        .collect()
151}
152
153/// Skip multiple JSON arrays in parallel using all available CPU cores
154#[must_use]
155pub fn skip_arrays_parallel(documents: &[&[u8]]) -> Vec<Option<SkipResult>> {
156    documents
157        .par_iter()
158        .map(|doc| {
159            let skipper = SkipStrategy::best_simd().skipper();
160            skipper.skip_array(doc)
161        })
162        .collect()
163}
164
165/// Skip multiple JSON values in parallel using all available CPU cores
166/// Auto-detects type (object, array, string, etc.) for each document
167#[must_use]
168pub fn skip_values_parallel(documents: &[&[u8]]) -> Vec<Option<SkipResult>> {
169    documents
170        .par_iter()
171        .map(|doc| {
172            let skipper = SkipStrategy::best_simd().skipper();
173            skipper.skip_value(doc)
174        })
175        .collect()
176}
177
178/// Parallel batch skipper with configurable strategy
179///
180/// Use this when you need more control over the skip strategy or
181/// want to reuse the same skipper across multiple batches.
182#[derive(Debug, Clone, Copy)]
183pub struct ParallelSkipper {
184    strategy: SkipStrategy,
185}
186
187impl std::fmt::Display for ParallelSkipper {
188    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
189        write!(f, "ParallelSkipper(strategy={})", self.strategy)
190    }
191}
192
193impl ParallelSkipper {
194    /// Create a new parallel skipper with the given strategy
195    #[must_use]
196    pub const fn new(strategy: SkipStrategy) -> Self {
197        Self { strategy }
198    }
199
200    /// Create a parallel skipper using the best available SIMD strategy
201    #[must_use]
202    pub fn best_simd() -> Self {
203        Self::new(SkipStrategy::best_simd())
204    }
205
206    /// Skip objects in parallel
207    #[must_use]
208    pub fn skip_objects(&self, documents: &[&[u8]]) -> Vec<Option<SkipResult>> {
209        let strategy = self.strategy;
210        documents
211            .par_iter()
212            .map(|doc| {
213                let skipper = strategy.skipper();
214                skipper.skip_object(doc)
215            })
216            .collect()
217    }
218
219    /// Skip arrays in parallel
220    #[must_use]
221    pub fn skip_arrays(&self, documents: &[&[u8]]) -> Vec<Option<SkipResult>> {
222        let strategy = self.strategy;
223        documents
224            .par_iter()
225            .map(|doc| {
226                let skipper = strategy.skipper();
227                skipper.skip_array(doc)
228            })
229            .collect()
230    }
231
232    /// Skip values in parallel (auto-detects type)
233    #[must_use]
234    pub fn skip_values(&self, documents: &[&[u8]]) -> Vec<Option<SkipResult>> {
235        let strategy = self.strategy;
236        documents
237            .par_iter()
238            .map(|doc| {
239                let skipper = strategy.skipper();
240                skipper.skip_value(doc)
241            })
242            .collect()
243    }
244
245    /// Benchmark skip operations and return timing data.
246    ///
247    /// Returns `(total_bytes_processed, elapsed_duration)`.
248    /// Caller can compute throughput as: `total_bytes as f64 / elapsed.as_secs_f64()`
249    #[must_use]
250    pub fn benchmark(
251        &self,
252        documents: &[&[u8]],
253        iterations: usize,
254    ) -> (usize, std::time::Duration) {
255        let total_bytes: usize = documents.iter().map(|d| d.len()).sum();
256        let start = std::time::Instant::now();
257
258        for _ in 0..iterations {
259            let _ = self.skip_objects(documents);
260        }
261
262        (total_bytes.saturating_mul(iterations), start.elapsed())
263    }
264}
265
#[cfg(test)]
mod tests {
    use super::*;

    // =========================================================================
    // SkipResult Tests
    // =========================================================================

    #[test]
    fn test_skip_result_debug() {
        let result = SkipResult {
            consumed: 10,
            has_escapes: true,
        };
        let debug_str = format!("{result:?}");
        assert!(debug_str.contains("consumed"));
        assert!(debug_str.contains("10"));
        assert!(debug_str.contains("has_escapes"));
    }

    #[test]
    fn test_skip_result_clone() {
        let result = SkipResult {
            consumed: 42,
            has_escapes: false,
        };
        let cloned = result;
        assert_eq!(result, cloned);
    }

    #[test]
    fn test_skip_result_copy() {
        let result = SkipResult {
            consumed: 100,
            has_escapes: true,
        };
        let copied = result;
        assert_eq!(result.consumed, copied.consumed);
        assert_eq!(result.has_escapes, copied.has_escapes);
    }

    #[test]
    fn test_skip_result_equality() {
        let a = SkipResult {
            consumed: 5,
            has_escapes: false,
        };
        let b = SkipResult {
            consumed: 5,
            has_escapes: false,
        };
        let c = SkipResult {
            consumed: 5,
            has_escapes: true,
        };
        let d = SkipResult {
            consumed: 6,
            has_escapes: false,
        };

        assert_eq!(a, b);
        assert_ne!(a, c);
        assert_ne!(a, d);
    }

    // =========================================================================
    // SkipStrategy Tests
    // =========================================================================

    #[test]
    fn test_skip_strategy_default() {
        let strategy = SkipStrategy::default();
        // `matches!` only yields a bool; it must be asserted to test anything.
        assert!(matches!(strategy, SkipStrategy::JsonSki));
    }

    #[test]
    fn test_skip_strategy_debug() {
        let strategy = SkipStrategy::Scalar;
        let debug_str = format!("{strategy:?}");
        assert_eq!(debug_str, "Scalar");
    }

    #[test]
    fn test_skip_strategy_display() {
        assert_eq!(SkipStrategy::Scalar.to_string(), "Scalar");
        assert_eq!(SkipStrategy::Langdale.to_string(), "Langdale");
        assert_eq!(SkipStrategy::JsonSki.to_string(), "JsonSki");
    }

    #[test]
    fn test_skip_strategy_clone() {
        let strategy = SkipStrategy::Langdale;
        let cloned = strategy;
        assert!(matches!(cloned, SkipStrategy::Langdale));
        // The original stays usable because the type is `Copy`.
        assert!(matches!(strategy, SkipStrategy::Langdale));
    }

    #[test]
    fn test_skip_strategy_skipper_scalar() {
        let strategy = SkipStrategy::Scalar;
        let skipper = strategy.skipper();
        let result = skipper.skip_object(b"}");
        assert!(result.is_some());
    }

    #[test]
    fn test_skip_strategy_skipper_langdale() {
        let strategy = SkipStrategy::Langdale;
        let skipper = strategy.skipper();
        let result = skipper.skip_object(b"}");
        assert!(result.is_some());
    }

    #[test]
    fn test_skip_strategy_skipper_jsonski() {
        let strategy = SkipStrategy::JsonSki;
        let skipper = strategy.skipper();
        let result = skipper.skip_object(b"}");
        assert!(result.is_some());
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[test]
    fn test_skip_strategy_skipper_avx2() {
        // Mirror `all_strategies`: only exercise AVX2 where it is reported available.
        if !Avx2Skip::is_available() {
            return;
        }
        let strategy = SkipStrategy::Avx2;
        let skipper = strategy.skipper();
        let result = skipper.skip_object(b"}");
        assert!(result.is_some());
    }

    #[test]
    fn test_skip_strategy_best_simd() {
        let strategy = SkipStrategy::best_simd();
        // Should return a valid strategy for the current platform
        let skipper = strategy.skipper();
        let result = skipper.skip_object(b"}");
        assert!(result.is_some());
    }

    #[test]
    fn test_skip_strategy_all_strategies() {
        let strategies = SkipStrategy::all_strategies();
        // Should have at least 3 strategies (Scalar, Langdale, JsonSki)
        assert!(strategies.len() >= 3);
        assert!(strategies.iter().any(|s| matches!(s, SkipStrategy::Scalar)));
        assert!(
            strategies
                .iter()
                .any(|s| matches!(s, SkipStrategy::Langdale))
        );
        assert!(
            strategies
                .iter()
                .any(|s| matches!(s, SkipStrategy::JsonSki))
        );
    }

    // =========================================================================
    // Parallel Skip Functions Tests
    // =========================================================================

    #[test]
    fn test_skip_objects_parallel_empty() {
        let documents: Vec<&[u8]> = vec![];
        let results = skip_objects_parallel(&documents);
        assert!(results.is_empty());
    }

    #[test]
    fn test_skip_objects_parallel_single() {
        let documents: Vec<&[u8]> = vec![b"}"];
        let results = skip_objects_parallel(&documents);
        assert_eq!(results.len(), 1);
        assert!(results[0].is_some());
    }

    #[test]
    fn test_skip_objects_parallel_multiple() {
        let documents: Vec<&[u8]> = vec![
            br#""key": "value"}"#,
            br#""name": "test"}"#,
            br#""a": 1, "b": 2}"#,
        ];
        let results = skip_objects_parallel(&documents);
        assert_eq!(results.len(), 3);
        for result in &results {
            assert!(result.is_some());
        }
    }

    #[test]
    fn test_skip_arrays_parallel_empty() {
        let documents: Vec<&[u8]> = vec![];
        let results = skip_arrays_parallel(&documents);
        assert!(results.is_empty());
    }

    #[test]
    fn test_skip_arrays_parallel_single() {
        let documents: Vec<&[u8]> = vec![b"]"];
        let results = skip_arrays_parallel(&documents);
        assert_eq!(results.len(), 1);
        assert!(results[0].is_some());
    }

    #[test]
    fn test_skip_arrays_parallel_multiple() {
        let documents: Vec<&[u8]> = vec![b"1, 2, 3]", b"\"a\", \"b\"]", b"]"];
        let results = skip_arrays_parallel(&documents);
        assert_eq!(results.len(), 3);
        for result in &results {
            assert!(result.is_some());
        }
    }

    #[test]
    fn test_skip_values_parallel_empty() {
        let documents: Vec<&[u8]> = vec![];
        let results = skip_values_parallel(&documents);
        assert!(results.is_empty());
    }

    #[test]
    fn test_skip_values_parallel_mixed() {
        let documents: Vec<&[u8]> = vec![
            b"{}",      // object
            b"[]",      // array
            b"\"str\"", // string
            b"123",     // number
            b"true",    // boolean
            b"null",    // null
        ];
        let results = skip_values_parallel(&documents);
        assert_eq!(results.len(), 6);
    }

    // =========================================================================
    // ParallelSkipper Tests
    // =========================================================================

    #[test]
    fn test_parallel_skipper_new() {
        let skipper = ParallelSkipper::new(SkipStrategy::Scalar);
        let docs: Vec<&[u8]> = vec![b"}"];
        let results = skipper.skip_objects(&docs);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_parallel_skipper_best_simd() {
        let skipper = ParallelSkipper::best_simd();
        let docs: Vec<&[u8]> = vec![b"}"];
        let results = skipper.skip_objects(&docs);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_parallel_skipper_debug() {
        let skipper = ParallelSkipper::new(SkipStrategy::JsonSki);
        let debug_str = format!("{skipper:?}");
        assert!(debug_str.contains("ParallelSkipper"));
        assert!(debug_str.contains("JsonSki"));
    }

    #[test]
    fn test_parallel_skipper_display() {
        let skipper = ParallelSkipper::new(SkipStrategy::JsonSki);
        assert_eq!(skipper.to_string(), "ParallelSkipper(strategy=JsonSki)");
    }

    #[test]
    fn test_parallel_skipper_clone() {
        let skipper = ParallelSkipper::new(SkipStrategy::Langdale);
        let cloned = skipper;
        let docs: Vec<&[u8]> = vec![b"}"];
        let results1 = skipper.skip_objects(&docs);
        let results2 = cloned.skip_objects(&docs);
        assert_eq!(results1.len(), results2.len());
    }

    #[test]
    fn test_parallel_skipper_skip_objects() {
        let skipper = ParallelSkipper::new(SkipStrategy::JsonSki);
        let docs: Vec<&[u8]> = vec![br#""a": 1}"#, br#""b": 2}"#];
        let results = skipper.skip_objects(&docs);
        assert_eq!(results.len(), 2);
        assert!(results.iter().all(std::option::Option::is_some));
    }

    #[test]
    fn test_parallel_skipper_skip_arrays() {
        let skipper = ParallelSkipper::new(SkipStrategy::JsonSki);
        let docs: Vec<&[u8]> = vec![b"1, 2]", b"3, 4]"];
        let results = skipper.skip_arrays(&docs);
        assert_eq!(results.len(), 2);
        assert!(results.iter().all(std::option::Option::is_some));
    }

    #[test]
    fn test_parallel_skipper_skip_values() {
        let skipper = ParallelSkipper::new(SkipStrategy::JsonSki);
        let docs: Vec<&[u8]> = vec![b"{}", b"[]", b"42"];
        let results = skipper.skip_values(&docs);
        assert_eq!(results.len(), 3);
    }

    #[test]
    fn test_parallel_skipper_benchmark() {
        let skipper = ParallelSkipper::new(SkipStrategy::Scalar);
        let docs: Vec<&[u8]> = vec![br#""key": "value"}"#];
        let (bytes, duration) = skipper.benchmark(&docs, 1);
        // One iteration over one document processes exactly that document's bytes
        assert_eq!(bytes, docs[0].len());
        // Duration may legitimately be tiny; just make sure it was produced
        let _ = duration;
    }

    // =========================================================================
    // Cross-Strategy Equivalence Tests
    // =========================================================================

    #[test]
    fn test_all_strategies_produce_same_results() {
        // NOTE(review): the last two inputs are array-shaped but are still fed to
        // `skip_object` below, so they only check that all strategies agree on that
        // input — confirm whether `skip_array` was intended for them.
        let test_cases: Vec<&[u8]> = vec![
            b"}",                  // empty object
            br#""key": "value"}"#, // simple object
            br#""a": {"b": 1}}"#,  // nested object
            b"]",                  // empty array
            b"1, 2, 3]",           // simple array
        ];

        let strategies = SkipStrategy::all_strategies();

        for input in &test_cases {
            let results: Vec<Option<SkipResult>> = strategies
                .iter()
                .map(|s| s.skipper().skip_object(input))
                .collect();

            // All strategies should return same consumed count
            let first = results[0].as_ref();
            for (i, result) in results.iter().enumerate().skip(1) {
                assert_eq!(
                    first.map(|r| r.consumed),
                    result.as_ref().map(|r| r.consumed),
                    "Strategy {i} differs from first for input {input:?}"
                );
            }
        }
    }
}