genson_core/schema/
core.rs

1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use std::collections::HashMap;
4
5#[derive(Debug, Clone, Serialize, Deserialize)]
6pub struct SchemaInferenceConfig {
7    /// Whether to treat top-level arrays as streams of objects
8    pub ignore_outer_array: bool,
9    /// Delimiter for NDJSON format (None for regular JSON)
10    pub delimiter: Option<u8>,
11    /// Schema URI to use ("AUTO" for auto-detection)
12    pub schema_uri: Option<String>,
13    /// Threshold above which non-fixed keys are treated as a map
14    pub map_threshold: usize,
15    /// Maximum number of required keys a Map can have. If None, no gating based on required keys.
16    /// If Some(n), objects with more than n required keys will be forced to Record type.
17    pub map_max_required_keys: Option<usize>,
18    /// Enable unification of compatible but non-homogeneous record schemas into maps
19    pub unify_maps: bool,
20    /// Fields whose keys should not be merged during record unification
21    pub no_unify: std::collections::HashSet<String>,
22    /// Force override of field treatment, e.g. {"labels": "map"}
23    pub force_field_types: HashMap<String, String>,
24    /// Force parent objects containing these fields to remain as records, preventing map inference.
25    /// e.g. {"mainsnak": "record"} prevents any object containing a "mainsnak" field from being
26    /// converted to a map, ensuring homogeneity across array items.
27    pub force_parent_field_types: HashMap<String, String>,
28    /// Set of field names that should always be promoted to wrapped scalars,
29    /// even when they appear as simple scalars (not in type unions). This ensures
30    /// schema stability for fields known to have heterogeneous types across schematised files.
31    pub force_scalar_promotion: std::collections::HashSet<String>,
32    /// Whether to promote scalar values to wrapped objects when they collide with record values
33    /// during unification. If `true`, scalars are promoted under a synthetic property name derived from
34    /// the parent field and the scalar type (e.g. "foo__string"). If `false`, don't unify on conflicts.
35    pub wrap_scalars: bool,
36    /// Wrap the inferred top-level schema under a single required field with this name.
37    /// Example: wrap_root = Some("labels") turns `{...}` into
38    /// `{"type":"object","properties":{"labels":{...}},"required":["labels"]}`.
39    pub wrap_root: Option<String>,
40    /// Prevent the document root from becoming a map type, even if it meets map inference criteria
41    pub no_root_map: bool,
42    /// Maximum number of schema builders to create in parallel at once
43    /// Lower values reduce peak memory usage during schema inference
44    /// None: process all strings at once
45    pub max_builders: Option<usize>,
46    /// Whether to output Avro schema rather than regular JSON Schema.
47    #[cfg(feature = "avro")]
48    pub avro: bool,
49    /// Enable debug output. When `true`, prints detailed information about schema inference
50    /// processes including field unification, map detection, and scalar wrapping decisions.
51    pub debug: bool,
52    /// Enable profiling output. When `true`, prints detailed information about timing.
53    pub profile: bool,
54    /// Controls the verbosity level of debug output
55    pub verbosity: DebugVerbosity,
56}
57
58#[derive(Default, Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
59pub enum DebugVerbosity {
60    /// Show important unification decisions and failures  
61    #[default]
62    Normal,
63    /// Show all debug information including field introductions
64    Verbose,
65}
66
67impl SchemaInferenceConfig {
68    pub(crate) fn profile(&self, args: std::fmt::Arguments) {
69        if self.profile {
70            let message = format!("{}", args);
71            anstream::eprintln!("{}", message);
72        }
73    }
74
75    pub(crate) fn profile_verbose(&self, args: std::fmt::Arguments) {
76        if self.profile && matches!(self.verbosity, DebugVerbosity::Verbose) {
77            let message = format!("{}", args);
78            anstream::eprintln!("{}", message);
79        }
80    }
81
82    pub(crate) fn debug(&self, args: std::fmt::Arguments) {
83        if self.debug {
84            let message = format!("{}", args);
85            anstream::eprintln!("{}", self.maybe_truncate(message));
86        }
87    }
88
89    pub(crate) fn debug_verbose(&self, args: std::fmt::Arguments) {
90        if self.debug && matches!(self.verbosity, DebugVerbosity::Verbose) {
91            let message = format!("{}", args);
92            anstream::eprintln!("{}", self.maybe_truncate(message));
93        }
94    }
95
96    fn maybe_truncate(&self, message: String) -> String {
97        let lines: Vec<&str> = message.lines().collect();
98
99        if lines.len() > 20 && self.verbosity == DebugVerbosity::Normal {
100            let mut truncated = String::new();
101
102            // First 10 lines
103            for line in lines.iter().take(10) {
104                truncated.push_str(line);
105                truncated.push('\n');
106            }
107
108            truncated.push_str(&format!("... ({} lines truncated) ...\n", lines.len() - 15));
109
110            // Last 5 lines
111            for line in lines.iter().skip(lines.len() - 5) {
112                truncated.push_str(line);
113                truncated.push('\n');
114            }
115
116            truncated
117        } else {
118            message
119        }
120    }
121}
122
123impl Default for SchemaInferenceConfig {
124    fn default() -> Self {
125        Self {
126            ignore_outer_array: true,
127            delimiter: None,
128            schema_uri: Some("AUTO".to_string()),
129            map_threshold: 20,
130            map_max_required_keys: None,
131            unify_maps: false,
132            no_unify: std::collections::HashSet::new(),
133            force_field_types: std::collections::HashMap::new(),
134            force_parent_field_types: std::collections::HashMap::new(),
135            force_scalar_promotion: std::collections::HashSet::new(),
136            wrap_scalars: true,
137            wrap_root: None,
138            no_root_map: true,
139            max_builders: None,
140            #[cfg(feature = "avro")]
141            avro: false,
142            debug: false,
143            profile: false,
144            verbosity: DebugVerbosity::default(),
145        }
146    }
147}
148
/// Forward a format string and its arguments to the `profile` method of a config.
///
/// The first argument is the receiver expression; everything after the comma is
/// handed to `format_args!` and the result is passed to `.profile(...)`.
#[macro_export]
macro_rules! profile {
    ($config:expr, $($fmt:tt)*) => (
        $config.profile(format_args!($($fmt)*))
    );
}
155
/// Forward a format string and its arguments to the `profile_verbose` method of a config.
///
/// The first argument is the receiver expression; everything after the comma is
/// handed to `format_args!` and the result is passed to `.profile_verbose(...)`.
#[macro_export]
macro_rules! profile_verbose {
    ($config:expr, $($fmt:tt)*) => (
        $config.profile_verbose(format_args!($($fmt)*))
    );
}
162
/// Forward a format string and its arguments to the `debug` method of a config.
///
/// The first argument is the receiver expression; everything after the comma is
/// handed to `format_args!` and the result is passed to `.debug(...)`.
#[macro_export]
macro_rules! debug {
    ($config:expr, $($fmt:tt)*) => (
        $config.debug(format_args!($($fmt)*))
    );
}
169
/// Forward a format string and its arguments to the `debug_verbose` method of a config.
///
/// The first argument is the receiver expression; everything after the comma is
/// handed to `format_args!` and the result is passed to `.debug_verbose(...)`.
#[macro_export]
macro_rules! debug_verbose {
    ($config:expr, $($fmt:tt)*) => (
        $config.debug_verbose(format_args!($($fmt)*))
    );
}
176
177#[derive(Debug, Clone, Serialize, Deserialize)]
178pub struct SchemaInferenceResult {
179    pub schema: Value,
180    pub processed_count: usize,
181}
182
#[cfg(feature = "avro")]
impl SchemaInferenceResult {
    /// Convert the stored schema by calling `avrotize::converter::jsons_to_avro`.
    ///
    /// `namespace` and `split_top_level` are passed through unchanged. When
    /// `utility_namespace` is `None` an empty string is used, and when `base_uri`
    /// is `None` the string `"genson-core"` is used.
    pub fn to_avro_schema(
        &self,
        namespace: &str,
        utility_namespace: Option<&str>,
        base_uri: Option<&str>,
        split_top_level: bool,
    ) -> Value {
        // Resolve the optional arguments to their fallbacks before the call.
        let utility_ns = utility_namespace.unwrap_or_default();
        let uri = base_uri.unwrap_or("genson-core");

        avrotize::converter::jsons_to_avro(
            &self.schema,
            namespace,
            utility_ns,
            uri,
            split_top_level,
        )
    }
}
201
/// Build the key name used for a promoted scalar value.
///
/// The result is `field_prefix` and `scalar_type` joined by a double underscore,
/// i.e. `{field_prefix}__{scalar_type}`. Such keys name scalar values that are
/// promoted to object fields during schema unification or normalisation.
pub fn make_promoted_scalar_key(field_prefix: &str, scalar_type: &str) -> String {
    // The separator is fixed for now; it could be driven by config in future.
    [field_prefix, scalar_type].join("__")
}