fionn_simd/
lib.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2//! SIMD-accelerated utilities for fionn
3//!
4//! This crate provides SIMD-accelerated utilities for JSON processing:
5//! - Skip strategies for fast JSON value skipping (Scalar, Langdale, `JsonSki`, AVX2)
6//! - Line boundary detection for JSONL files
7//! - Character classification
8//!
9//! # Skip Strategies
10//!
11//! The [`skip`] module provides multiple implementations for skipping JSON values:
12//!
13//! - [`ScalarSkip`](skip::ScalarSkip) - Byte-by-byte baseline
14//! - [`LangdaleSkip`](skip::LangdaleSkip) - Langdale-Lemire XOR prefix algorithm
15//! - [`JsonSkiSkip`](skip::JsonSkiSkip) - `JSONSki` bracket counting (default)
16//! - [`Avx2Skip`](x86::skip::Avx2Skip) - AVX2 SIMD acceleration (`x86_64`)
17//!
18//! Use [`SkipStrategy`](skip::SkipStrategy) for runtime selection of the best strategy.
19
20pub mod skip;
21
22#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
23pub mod x86;
24
25// Re-export key types from skip module
26pub use skip::{
27    JsonSkiSkip, LangdaleSkip, ParallelSkipper, ScalarSkip, Skip, SkipResult, SkipStrategy,
28    skip_arrays_parallel, skip_objects_parallel, skip_values_parallel,
29};
30
31// Re-export SIMD skip implementations
32#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
33pub use x86::skip::Avx2Skip;
34
35/// SIMD-accelerated line separator detection for JSONL
36#[derive(Debug, Clone, Default)]
37pub struct SimdLineSeparator;
38
39impl SimdLineSeparator {
40    /// Create a new SIMD line separator detector
41    #[must_use]
42    pub const fn new() -> Self {
43        Self {}
44    }
45
46    /// Detect line boundaries in a data chunk using SIMD
47    ///
48    /// Returns a vector of positions marking the end of each line (position after newline).
49    /// If the data doesn't end with a newline, the final position is the end of the data.
50    #[must_use]
51    pub fn find_line_boundaries(&self, data: &[u8]) -> Vec<usize> {
52        // Use memchr iterator which exploits SIMD for finding byte occurrences
53        let mut boundaries: Vec<usize> = memchr::memchr_iter(b'\n', data)
54            .map(|pos| pos + 1) // Position after the \n
55            .collect();
56
57        // If data doesn't end with \n, add the end position
58        if !data.is_empty() && data[data.len() - 1] != b'\n' {
59            boundaries.push(data.len());
60        }
61
62        boundaries
63    }
64}
65
66/// SIMD-accelerated structural filtering for JSONL documents
67#[derive(Debug, Clone, Default)]
68pub struct SimdStructuralFilter;
69
70impl SimdStructuralFilter {
71    /// Create a new SIMD structural filter
72    #[must_use]
73    pub const fn new() -> Self {
74        Self {}
75    }
76
77    /// Check if a JSON line contains required schema fields using SIMD
78    #[must_use]
79    pub fn matches_schema(&self, line: &[u8], required_fields: &[String]) -> bool {
80        if line.is_empty() {
81            return false;
82        }
83
84        // Fast pre-filter using memchr::memmem to check for required fields in the raw bytes
85        for field in required_fields {
86            let needle = format!("\"{field}\"");
87            if memchr::memmem::find(line, needle.as_bytes()).is_none() {
88                return false;
89            }
90        }
91        true
92    }
93}
94
#[cfg(test)]
mod tests {
    use super::*;

    // --- SimdLineSeparator -------------------------------------------------

    /// Empty input produces no boundaries at all.
    #[test]
    fn test_line_separator_empty() {
        let found = SimdLineSeparator::new().find_line_boundaries(b"");
        assert!(found.is_empty());
    }

    /// A lone unterminated line ends at the end of the data.
    #[test]
    fn test_line_separator_single_line_no_newline() {
        let found = SimdLineSeparator::new().find_line_boundaries(b"hello");
        assert_eq!(found, vec![5]);
    }

    /// A lone terminated line ends just past its newline.
    #[test]
    fn test_line_separator_single_line_with_newline() {
        let found = SimdLineSeparator::new().find_line_boundaries(b"hello\n");
        assert_eq!(found, vec![6]);
    }

    /// Every terminated line contributes exactly one boundary.
    #[test]
    fn test_line_separator_multiple_lines() {
        let input = b"line1\nline2\nline3\n";
        let found = SimdLineSeparator::new().find_line_boundaries(input);
        assert_eq!(found, vec![6, 12, 18]);
    }

    /// The final unterminated line is reported at `data.len()`.
    #[test]
    fn test_line_separator_multiple_lines_no_trailing() {
        let input = b"line1\nline2\nline3";
        let found = SimdLineSeparator::new().find_line_boundaries(input);
        assert_eq!(found, vec![6, 12, 17]);
    }

    // --- SimdStructuralFilter ----------------------------------------------

    /// An empty line is rejected even with no required fields.
    #[test]
    fn test_structural_filter_empty() {
        let no_fields: [String; 0] = [];
        let accepted = SimdStructuralFilter::new().matches_schema(b"", &no_fields);
        assert!(!accepted);
    }

    /// Fields present in the line match individually and together.
    #[test]
    fn test_structural_filter_match() {
        let checker = SimdStructuralFilter::new();
        let record = br#"{"name": "Alice", "age": 30}"#;
        let name = String::from("name");
        let age = String::from("age");

        assert!(checker.matches_schema(record, &[name.clone()]));
        assert!(checker.matches_schema(record, &[age.clone()]));
        assert!(checker.matches_schema(record, &[name, age]));
    }

    /// A field absent from the line causes rejection.
    #[test]
    fn test_structural_filter_no_match() {
        let checker = SimdStructuralFilter::new();
        let record = br#"{"name": "Alice"}"#;
        let wanted = [String::from("missing")];
        assert!(!checker.matches_schema(record, &wanted));
    }

    // --- Re-exported skip API ----------------------------------------------

    /// The default strategy is the `JsonSki` variant.
    #[test]
    fn test_skip_strategy_default() {
        let chosen = SkipStrategy::default();
        assert!(matches!(chosen, SkipStrategy::JsonSki));
    }

    /// `ScalarSkip` yields a result for input that is just a closing brace.
    #[test]
    fn test_skip_trait_object() {
        let scalar = ScalarSkip;
        let outcome = scalar.skip_object(b"}");
        assert!(outcome.is_some());
    }
}