Skip to main content

fionn_stream/skiptape/
schema.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2//! Schema compilation for SIMD-JSONL Skip Tape
3//!
4//! This module handles the compilation of schema definitions into SIMD-friendly
5//! patterns that can be used for fast filtering during parsing.
6
7use crate::skiptape::error::{Result, SkipTapeError};
8use crate::skiptape::simd_ops::SimdStringOps;
9
10/// Compiled schema for SIMD-accelerated filtering
11#[derive(Debug)]
12pub struct CompiledSchema {
13    /// Include patterns (paths that should be kept)
14    pub include_patterns: Vec<SchemaPattern>,
15    /// Exclude patterns (paths that should be skipped)
16    pub exclude_patterns: Vec<SchemaPattern>,
17    /// Maximum parsing depth
18    pub max_depth: usize,
19    /// SIMD-friendly hash table for fast lookups
20    pub pattern_hashes: Vec<u64>,
21}
22
23impl CompiledSchema {
24    /// Compile a list of field paths into a SIMD-friendly schema
25    ///
26    /// # Errors
27    /// Returns an error if any path pattern is invalid
28    pub fn compile(paths: &[String]) -> Result<Self> {
29        let mut include_patterns = Vec::new();
30        let mut pattern_hashes = Vec::new();
31
32        for path in paths {
33            let pattern = SchemaPattern::compile(path)?;
34            include_patterns.push(pattern);
35
36            // Pre-compute hashes for SIMD comparison
37            let hash = SimdStringOps::hash_field_name(path.as_bytes());
38            pattern_hashes.push(hash);
39        }
40
41        Ok(Self {
42            include_patterns,
43            exclude_patterns: Vec::new(),
44            max_depth: 10, // Default max depth
45            pattern_hashes,
46        })
47    }
48
49    /// Compile a schema with both include and exclude patterns
50    ///
51    /// # Errors
52    /// Returns an error if any path pattern is invalid
53    pub fn compile_with_excludes(
54        include_paths: &[String],
55        exclude_paths: &[String],
56    ) -> Result<Self> {
57        let mut include_patterns = Vec::new();
58        let mut exclude_patterns = Vec::new();
59        let mut pattern_hashes = Vec::new();
60
61        for path in include_paths {
62            let pattern = SchemaPattern::compile(path)?;
63            include_patterns.push(pattern);
64            let hash = SimdStringOps::hash_field_name(path.as_bytes());
65            pattern_hashes.push(hash);
66        }
67
68        for path in exclude_paths {
69            let pattern = SchemaPattern::compile(path)?;
70            exclude_patterns.push(pattern);
71        }
72
73        Ok(Self {
74            include_patterns,
75            exclude_patterns,
76            max_depth: 10,
77            pattern_hashes,
78        })
79    }
80
81    /// Get the field paths that this schema includes
82    #[must_use]
83    pub fn field_paths(&self) -> Vec<String> {
84        self.include_patterns
85            .iter()
86            .map(|pattern| pattern.path.clone())
87            .collect()
88    }
89
90    /// Check if a path matches the schema (includes but not excludes)
91    #[must_use]
92    pub fn matches_path(&self, path: &str) -> bool {
93        // First check if path is excluded
94        for exclude_pattern in &self.exclude_patterns {
95            if exclude_pattern.matches(path) {
96                return false;
97            }
98        }
99
100        // Fast SIMD hash-based lookup for includes
101        let path_hash = SimdStringOps::hash_field_name(path.as_bytes());
102
103        // Check if hash matches any pattern (fast rejection)
104        if !self.pattern_hashes.contains(&path_hash) {
105            return false;
106        }
107
108        // Full pattern matching for hash collisions
109        for pattern in &self.include_patterns {
110            if pattern.matches(path) {
111                return true;
112            }
113        }
114
115        false
116    }
117
118    /// Check if a path is explicitly excluded
119    #[must_use]
120    pub fn is_excluded(&self, path: &str) -> bool {
121        for exclude_pattern in &self.exclude_patterns {
122            if exclude_pattern.matches(path) {
123                return true;
124            }
125        }
126        false
127    }
128
129    /// Check if we should include an object at the given path
130    #[must_use]
131    pub fn should_include_object(&self, path: &str) -> bool {
132        // First check if path is excluded
133        if self.is_excluded(path) {
134            return false;
135        }
136
137        // Check if any child paths would match
138        for pattern in &self.include_patterns {
139            if pattern.could_match_children(path) {
140                // Also verify the children aren't all excluded
141                return true;
142            }
143        }
144        false
145    }
146}
147
148/// Individual schema pattern for path matching
149#[derive(Debug)]
150pub struct SchemaPattern {
151    /// Original path string
152    pub path: String,
153    /// Path components for structured matching
154    pub components: Vec<String>,
155    /// Match type (exact, prefix, wildcard)
156    pub match_type: MatchType,
157    /// Pre-compiled regex for complex patterns
158    pub regex: Option<regex::Regex>,
159}
160
161impl SchemaPattern {
162    /// Compile a path pattern
163    ///
164    /// # Errors
165    /// Returns an error if the pattern contains an invalid regex
166    pub fn compile(path: &str) -> Result<Self> {
167        let components: Vec<String> = path
168            .split('.')
169            .map(std::string::ToString::to_string)
170            .collect();
171
172        let match_type = if path.contains('*') {
173            MatchType::Wildcard
174        } else if components.len() > 1 {
175            MatchType::Prefix
176        } else {
177            MatchType::Exact
178        };
179
180        let regex =
181            if matches!(match_type, MatchType::Wildcard) {
182                // Convert glob pattern to regex
183                let regex_pattern = Self::glob_to_regex(path);
184                Some(regex::Regex::new(&regex_pattern).map_err(|e| {
185                    SkipTapeError::SchemaError(format!("Invalid regex pattern: {e}"))
186                })?)
187            } else {
188                None
189            };
190
191        Ok(Self {
192            path: path.to_string(),
193            components,
194            match_type,
195            regex,
196        })
197    }
198
199    /// Check if this pattern matches a path
200    #[must_use]
201    pub fn matches(&self, path: &str) -> bool {
202        match self.match_type {
203            MatchType::Exact => self.path == path,
204            MatchType::Prefix => path.starts_with(&self.path),
205            MatchType::Wildcard => self
206                .regex
207                .as_ref()
208                .is_some_and(|regex| regex.is_match(path)),
209        }
210    }
211
212    /// Check if this pattern could match children of the given path
213    #[must_use]
214    pub fn could_match_children(&self, path: &str) -> bool {
215        match self.match_type {
216            MatchType::Exact => self.path.starts_with(&format!("{path}.")),
217            MatchType::Prefix => {
218                self.path.starts_with(&format!("{path}."))
219                    || path.starts_with(&format!("{}.", self.path))
220            }
221            MatchType::Wildcard => {
222                // For wildcards, check if the pattern could match deeper paths
223                let test_path = format!("{path}.test");
224                self.regex
225                    .as_ref()
226                    .is_some_and(|regex| regex.is_match(&test_path))
227            }
228        }
229    }
230
231    /// Convert glob pattern to regex
232    fn glob_to_regex(pattern: &str) -> String {
233        let mut regex = String::from("^");
234        let mut chars = pattern.chars().peekable();
235
236        while let Some(ch) = chars.next() {
237            match ch {
238                '*' => {
239                    if chars.peek() == Some(&'*') {
240                        // ** matches any character sequence including dots
241                        chars.next(); // consume second *
242                        regex.push_str(".*");
243                    } else {
244                        // * matches any character sequence except dots
245                        regex.push_str("[^.]*");
246                    }
247                }
248                '.' => regex.push_str("\\."),
249                '?' => regex.push('.'),
250                '[' => regex.push_str("\\["),
251                ']' => regex.push_str("\\]"),
252                '{' => regex.push_str("\\{"),
253                '}' => regex.push_str("\\}"),
254                '(' => regex.push_str("\\("),
255                ')' => regex.push_str("\\)"),
256                '+' => regex.push_str("\\+"),
257                '^' => regex.push_str("\\^"),
258                '$' => regex.push_str("\\$"),
259                '|' => regex.push_str("\\|"),
260                '\\' => regex.push_str("\\\\"),
261                other => regex.push(other),
262            }
263        }
264
265        regex.push('$');
266        regex
267    }
268}
269
/// Type of pattern matching
///
/// Derives `PartialEq`/`Eq` so values can be compared directly with `==`
/// and `assert_eq!` in addition to `matches!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MatchType {
    /// Exact path match only
    Exact,
    /// Prefix match (path starts with pattern)
    Prefix,
    /// Wildcard/glob pattern matching
    Wildcard,
}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Turn string literals into the owned paths the constructors expect.
    fn owned(paths: &[&str]) -> Vec<String> {
        paths.iter().map(|p| (*p).to_string()).collect()
    }

    /// Build an include-only schema, panicking if compilation fails.
    fn include_only(paths: &[&str]) -> CompiledSchema {
        CompiledSchema::compile(&owned(paths)).expect("include-only schema should compile")
    }

    /// Build a schema with include and exclude paths, panicking on failure.
    fn with_excludes(include: &[&str], exclude: &[&str]) -> CompiledSchema {
        CompiledSchema::compile_with_excludes(&owned(include), &owned(exclude))
            .expect("schema with excludes should compile")
    }

    /// Compile a single pattern, panicking on failure.
    fn pattern(path: &str) -> SchemaPattern {
        SchemaPattern::compile(path).expect("pattern should compile")
    }

    /// Hand-built wildcard pattern that deliberately carries no regex.
    fn wildcard_without_regex() -> SchemaPattern {
        SchemaPattern {
            path: "test.*".to_string(),
            components: vec!["test".to_string(), "*".to_string()],
            match_type: MatchType::Wildcard,
            regex: None,
        }
    }

    #[test]
    fn test_compiled_schema_compile() {
        let compiled = CompiledSchema::compile(&owned(&["name", "age"]));
        assert!(compiled.is_ok());
        assert_eq!(compiled.unwrap().include_patterns.len(), 2);
    }

    #[test]
    fn test_compiled_schema_compile_empty() {
        let compiled = CompiledSchema::compile(&[]);
        assert!(compiled.is_ok());
        assert!(compiled.unwrap().include_patterns.is_empty());
    }

    #[test]
    fn test_compiled_schema_with_excludes() {
        let compiled = CompiledSchema::compile_with_excludes(&owned(&["name"]), &owned(&["age"]));
        assert!(compiled.is_ok());
        let compiled = compiled.unwrap();
        assert_eq!(compiled.include_patterns.len(), 1);
        assert_eq!(compiled.exclude_patterns.len(), 1);
    }

    #[test]
    fn test_compiled_schema_field_paths() {
        let listed = include_only(&["name", "age"]).field_paths();
        assert!(listed.iter().any(|p| p == "name"));
        assert!(listed.iter().any(|p| p == "age"));
    }

    #[test]
    fn test_compiled_schema_matches_path_exact() {
        let schema = include_only(&["name"]);
        assert!(schema.matches_path("name"));
        assert!(!schema.matches_path("age"));
    }

    #[test]
    fn test_compiled_schema_is_excluded() {
        let schema = with_excludes(&["*"], &["secret"]);
        assert!(schema.is_excluded("secret"));
        assert!(!schema.is_excluded("name"));
    }

    #[test]
    fn test_compiled_schema_should_include_object() {
        assert!(include_only(&["user.name"]).should_include_object("user"));
    }

    #[test]
    fn test_compiled_schema_should_not_include_excluded() {
        let schema = with_excludes(&["user.name"], &["user"]);
        assert!(!schema.should_include_object("user"));
    }

    #[test]
    fn test_schema_pattern_exact() {
        let compiled = pattern("name");
        assert!(matches!(compiled.match_type, MatchType::Exact));
        assert!(compiled.matches("name"));
        assert!(!compiled.matches("age"));
    }

    #[test]
    fn test_schema_pattern_prefix() {
        let compiled = pattern("user.name");
        assert!(matches!(compiled.match_type, MatchType::Prefix));
        assert!(compiled.matches("user.name"));
        assert!(compiled.matches("user.name.first"));
    }

    #[test]
    fn test_schema_pattern_wildcard() {
        let compiled = pattern("user.*");
        assert!(matches!(compiled.match_type, MatchType::Wildcard));
        assert!(compiled.matches("user.name"));
        assert!(compiled.matches("user.age"));
        assert!(!compiled.matches("name"));
    }

    #[test]
    fn test_schema_pattern_double_wildcard() {
        assert!(pattern("user.**").matches("user.profile.name"));
    }

    #[test]
    fn test_schema_pattern_could_match_children() {
        assert!(pattern("user.name").could_match_children("user"));
    }

    #[test]
    fn test_schema_pattern_could_match_children_wildcard() {
        assert!(pattern("user.*").could_match_children("user"));
    }

    #[test]
    fn test_glob_to_regex_special_chars() {
        // Brackets without `*` stay a literal pattern.
        assert!(pattern("test[0]").matches("test[0]"));
    }

    #[test]
    fn test_glob_to_regex_question_mark() {
        // `?` only takes effect inside a wildcard pattern, hence the trailing `*`.
        let compiled = pattern("test?*");
        assert!(compiled.matches("test1abc"));
        assert!(compiled.matches("testAbc"));
    }

    #[test]
    fn test_match_type_debug() {
        let rendered = format!("{:?}", MatchType::Exact);
        assert!(rendered.contains("Exact"));
    }

    #[test]
    fn test_match_type_clone() {
        let original = MatchType::Wildcard;
        let duplicate = original;
        assert!(matches!(duplicate, MatchType::Wildcard));
    }

    #[test]
    fn test_schema_pattern_debug() {
        let rendered = format!("{:?}", pattern("name"));
        assert!(rendered.contains("name"));
    }

    #[test]
    fn test_compiled_schema_debug() {
        let rendered = format!("{:?}", include_only(&["name"]));
        assert!(!rendered.is_empty());
    }

    #[test]
    fn test_schema_max_depth() {
        assert_eq!(include_only(&["name"]).max_depth, 10);
    }

    #[test]
    fn test_schema_pattern_hashes() {
        assert_eq!(include_only(&["name"]).pattern_hashes.len(), 1);
    }

    #[test]
    fn test_glob_to_regex_curly_braces() {
        assert!(pattern("test{0}*").matches("test{0}"));
    }

    #[test]
    fn test_glob_to_regex_parentheses() {
        assert!(pattern("test(1)*").matches("test(1)"));
    }

    #[test]
    fn test_glob_to_regex_plus() {
        assert!(pattern("test+1*").matches("test+1"));
    }

    #[test]
    fn test_glob_to_regex_caret() {
        assert!(pattern("test^1*").matches("test^1"));
    }

    #[test]
    fn test_glob_to_regex_dollar() {
        assert!(pattern("test$1*").matches("test$1"));
    }

    #[test]
    fn test_glob_to_regex_pipe() {
        assert!(pattern("test|1*").matches("test|1"));
    }

    #[test]
    fn test_glob_to_regex_backslash() {
        assert!(pattern("test\\1*").matches("test\\1"));
    }

    #[test]
    fn test_matches_path_hash_collision() {
        // Two listed paths are accepted; an unlisted sibling is rejected.
        let schema = include_only(&["user.name", "user.email"]);
        assert!(schema.matches_path("user.name"));
        assert!(schema.matches_path("user.email"));
        assert!(!schema.matches_path("user.age"));
    }

    #[test]
    fn test_matches_path_with_exclusion() {
        let schema = with_excludes(&["user.name"], &["user.secret"]);
        assert!(schema.matches_path("user.name"));
        // The excluded sibling is reported as such.
        assert!(schema.is_excluded("user.secret"));
    }

    #[test]
    fn test_could_match_children_exact_no_match() {
        // A single-component pattern has nothing beneath an unrelated object.
        assert!(!pattern("name").could_match_children("other"));
    }

    #[test]
    fn test_could_match_children_prefix_reverse() {
        // The queried path already lies below the pattern.
        assert!(pattern("user.name").could_match_children("user.name.first"));
    }

    #[test]
    fn test_could_match_children_wildcard_no_regex() {
        // compile() never produces this shape, but the fields are public.
        assert!(!wildcard_without_regex().could_match_children("test"));
    }

    #[test]
    fn test_matches_wildcard_no_regex() {
        assert!(!wildcard_without_regex().matches("test.abc"));
    }

    #[test]
    fn test_should_include_object_no_match() {
        // "user" is not an ancestor of "other.field".
        assert!(!include_only(&["other.field"]).should_include_object("user"));
    }

    #[test]
    fn test_exclude_patterns_empty() {
        assert!(include_only(&["name"]).exclude_patterns.is_empty());
    }

    #[test]
    fn test_match_type_copy() {
        let original = MatchType::Prefix;
        let duplicate = original;
        assert!(matches!(duplicate, MatchType::Prefix));
    }

    #[test]
    fn test_compiled_schema_multiple_patterns() {
        let schema = include_only(&["user.name", "user.email", "address.city"]);

        assert!(schema.matches_path("user.name"));
        assert!(schema.matches_path("address.city"));
        assert!(!schema.matches_path("phone"));
    }

    #[test]
    fn test_compiled_schema_excludes_with_wildcards() {
        let schema = with_excludes(&["**"], &["secret.*"]);

        assert!(schema.is_excluded("secret.key"));
        assert!(!schema.is_excluded("public.key"));
    }

    #[test]
    fn test_schema_pattern_components() {
        let compiled = pattern("user.profile.name");
        assert_eq!(compiled.components.len(), 3);
        assert_eq!(compiled.components, ["user", "profile", "name"]);
    }

    #[test]
    fn test_could_match_children_wildcard_match() {
        // A double wildcard reaches any depth below "user".
        assert!(pattern("user.**").could_match_children("user"));
    }

    #[test]
    fn test_could_match_children_no_match() {
        // "other" is not an ancestor of "user.name".
        assert!(!pattern("user.name").could_match_children("other"));
    }

    #[test]
    fn test_matches_path_no_hash_match() {
        // A path unrelated to every include pattern is rejected.
        assert!(!include_only(&["specific.field"]).matches_path("different.field"));
    }
}