pjson_rs/
semantic.rs

1//! Semantic type hints for automatic optimization
2
3use serde::{Deserialize, Serialize};
4use smallvec::SmallVec;
5
6/// Semantic type hints that enable automatic optimization
7#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
8pub enum SemanticType {
9    /// Array of homogeneous numeric data (SIMD-friendly)
10    NumericArray {
11        /// Data type of array elements
12        dtype: NumericDType,
13        /// Number of elements (if known)
14        length: Option<usize>,
15    },
16
17    /// Time series data with timestamp and values
18    TimeSeries {
19        /// Field name containing timestamps
20        timestamp_field: String,
21        /// Field names containing values
22        value_fields: SmallVec<[String; 4]>,
23        /// Optional sampling interval hint
24        interval_ms: Option<u64>,
25    },
26
27    /// Tabular data (columnar processing friendly)
28    Table {
29        /// Column metadata
30        columns: Box<SmallVec<[ColumnMeta; 16]>>,
31        /// Estimated row count
32        row_count: Option<usize>,
33    },
34
35    /// Graph/tree structure
36    Graph {
37        /// Node type identifier
38        node_type: String,
39        /// Edge type identifier
40        edge_type: String,
41        /// Estimated node count
42        node_count: Option<usize>,
43    },
44
45    /// Geospatial data
46    Geospatial {
47        /// Coordinate system (e.g., "WGS84", "UTM")
48        coordinate_system: String,
49        /// Geometry type (Point, LineString, Polygon, etc.)
50        geometry_type: String,
51    },
52
53    /// Image/matrix data
54    Matrix {
55        /// Matrix dimensions
56        dimensions: SmallVec<[usize; 4]>,
57        /// Element data type
58        dtype: NumericDType,
59    },
60
61    /// Generic JSON (no specific optimization)
62    Generic,
63}
64
65/// Numeric data types for type-specific optimization
66#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
67pub enum NumericDType {
68    /// 64-bit float
69    F64,
70    /// 32-bit float
71    F32,
72    /// 64-bit signed integer
73    I64,
74    /// 32-bit signed integer
75    I32,
76    /// 16-bit signed integer
77    I16,
78    /// 8-bit signed integer
79    I8,
80    /// 64-bit unsigned integer
81    U64,
82    /// 32-bit unsigned integer
83    U32,
84    /// 16-bit unsigned integer
85    U16,
86    /// 8-bit unsigned integer
87    U8,
88}
89
90/// Column metadata for tabular data
91#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
92pub struct ColumnMeta {
93    /// Column name
94    pub name: String,
95    /// Column data type
96    pub dtype: ColumnType,
97    /// Whether column allows null values
98    pub nullable: bool,
99}
100
101/// Column data types
102#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
103pub enum ColumnType {
104    /// Numeric column
105    Numeric(NumericDType),
106    /// String/text column
107    String,
108    /// Boolean column
109    Boolean,
110    /// Timestamp column
111    Timestamp,
112    /// JSON object column
113    Json,
114    /// Array column with element type
115    Array(Box<ColumnType>),
116}
117
118/// Complete semantic metadata for a frame
119#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
120pub struct SemanticMeta {
121    /// Primary semantic type
122    pub semantic_type: SemanticType,
123    /// Optional secondary types for mixed data
124    pub secondary_types: SmallVec<[SemanticType; 2]>,
125    /// Processing hints
126    pub hints: ProcessingHints,
127}
128
129/// Processing hints for optimization
130#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
131pub struct ProcessingHints {
132    /// Prefer SIMD processing
133    pub prefer_simd: bool,
134    /// Prefer GPU processing
135    pub prefer_gpu: bool,
136    /// Prefer parallel processing
137    pub prefer_parallel: bool,
138    /// Memory access pattern hint
139    pub access_pattern: AccessPattern,
140    /// Compression hint
141    pub compression_hint: CompressionHint,
142}
143
144/// Memory access pattern hints
145#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
146pub enum AccessPattern {
147    /// Sequential access
148    Sequential,
149    /// Random access
150    Random,
151    /// Streaming (read-once)
152    Streaming,
153}
154
155/// Compression strategy hints
156#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
157pub enum CompressionHint {
158    /// No compression preferred
159    None,
160    /// Fast compression (LZ4)
161    Fast,
162    /// Balanced compression
163    Balanced,
164    /// Maximum compression
165    Maximum,
166}
167
168impl SemanticType {
169    /// Get the primary numeric data type if applicable
170    pub fn numeric_dtype(&self) -> Option<NumericDType> {
171        match self {
172            Self::NumericArray { dtype, .. } => Some(*dtype),
173            Self::Matrix { dtype, .. } => Some(*dtype),
174            _ => None,
175        }
176    }
177
178    /// Check if type is suitable for SIMD processing
179    pub fn is_simd_friendly(&self) -> bool {
180        matches!(self, Self::NumericArray { .. } | Self::Matrix { .. })
181    }
182
183    /// Check if type is suitable for columnar processing
184    pub fn is_columnar(&self) -> bool {
185        matches!(self, Self::Table { .. } | Self::TimeSeries { .. })
186    }
187
188    /// Get estimated data size hint
189    pub fn size_hint(&self) -> Option<usize> {
190        match self {
191            Self::NumericArray {
192                dtype,
193                length: Some(len),
194            } => Some(len * dtype.size()),
195            Self::Table {
196                row_count: Some(rows),
197                columns,
198            } => {
199                Some(rows * columns.len() * 8) // Rough estimate
200            }
201            Self::Matrix { dimensions, dtype } => {
202                Some(dimensions.iter().product::<usize>() * dtype.size())
203            }
204            _ => None,
205        }
206    }
207}
208
209impl NumericDType {
210    /// Get size in bytes
211    pub fn size(self) -> usize {
212        match self {
213            Self::F64 | Self::I64 | Self::U64 => 8,
214            Self::F32 | Self::I32 | Self::U32 => 4,
215            Self::I16 | Self::U16 => 2,
216            Self::I8 | Self::U8 => 1,
217        }
218    }
219
220    /// Check if type is floating-point
221    pub fn is_float(self) -> bool {
222        matches!(self, Self::F32 | Self::F64)
223    }
224
225    /// Check if type is signed
226    pub fn is_signed(self) -> bool {
227        matches!(
228            self,
229            Self::I8 | Self::I16 | Self::I32 | Self::I64 | Self::F32 | Self::F64
230        )
231    }
232}
233
234impl Default for ProcessingHints {
235    fn default() -> Self {
236        Self {
237            prefer_simd: false,
238            prefer_gpu: false,
239            prefer_parallel: true,
240            access_pattern: AccessPattern::Sequential,
241            compression_hint: CompressionHint::Balanced,
242        }
243    }
244}
245
246impl SemanticMeta {
247    /// Create new semantic metadata
248    pub fn new(semantic_type: SemanticType) -> Self {
249        Self {
250            semantic_type,
251            secondary_types: SmallVec::new(),
252            hints: ProcessingHints::default(),
253        }
254    }
255
256    /// Create with explicit hints
257    pub fn with_hints(semantic_type: SemanticType, hints: ProcessingHints) -> Self {
258        Self {
259            semantic_type,
260            secondary_types: SmallVec::new(),
261            hints,
262        }
263    }
264
265    /// Add secondary semantic type
266    pub fn with_secondary(mut self, secondary_type: SemanticType) -> Self {
267        self.secondary_types.push(secondary_type);
268        self
269    }
270
271    /// Get the best processing strategy based on semantics
272    pub fn processing_strategy(&self) -> ProcessingStrategy {
273        // Prefer explicit hints first
274        if self.hints.prefer_gpu {
275            return ProcessingStrategy::Gpu;
276        }
277
278        if self.hints.prefer_simd && self.semantic_type.is_simd_friendly() {
279            return ProcessingStrategy::Simd;
280        }
281
282        // Auto-select based on semantic type
283        match &self.semantic_type {
284            SemanticType::NumericArray {
285                length: Some(len), ..
286            } if *len > 1000 => ProcessingStrategy::Simd,
287            SemanticType::Table {
288                row_count: Some(rows),
289                ..
290            } if *rows > 10000 => ProcessingStrategy::Columnar,
291            SemanticType::TimeSeries { .. } => ProcessingStrategy::Streaming,
292            _ => ProcessingStrategy::Generic,
293        }
294    }
295}
296
/// Processing strategy recommendation
///
/// Fieldless, so it derives `Eq` and `Hash` alongside `PartialEq`; strategies
/// can then be used as `HashMap`/`HashSet` keys (e.g. for per-strategy counters).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ProcessingStrategy {
    /// Use SIMD-optimized parsing
    Simd,
    /// Use GPU acceleration
    Gpu,
    /// Use columnar processing
    Columnar,
    /// Use streaming processing
    Streaming,
    /// Use generic processing
    Generic,
}
311
#[cfg(test)]
mod tests {
    use super::*;

    /// A known-length `F64` array is SIMD-friendly and reports dtype and byte size.
    #[test]
    fn test_semantic_type_creation() {
        let dtype = NumericDType::F64;
        let array = SemanticType::NumericArray {
            dtype,
            length: Some(1000),
        };

        assert!(array.is_simd_friendly());
        assert_eq!(array.numeric_dtype(), Some(NumericDType::F64));
        // 1000 elements * 8 bytes each
        assert_eq!(array.size_hint(), Some(8000));
    }

    /// With default hints, a numeric array above the size threshold selects SIMD.
    #[test]
    fn test_processing_strategy() {
        let large_array = SemanticType::NumericArray {
            dtype: NumericDType::F32,
            length: Some(2000),
        };
        let strategy = SemanticMeta::new(large_array).processing_strategy();

        assert_eq!(strategy, ProcessingStrategy::Simd);
    }

    /// Column metadata keeps the values it was constructed with.
    #[test]
    fn test_column_meta() {
        let ColumnMeta { name, nullable, .. } = ColumnMeta {
            name: "value".to_string(),
            dtype: ColumnType::Numeric(NumericDType::F64),
            nullable: false,
        };

        assert_eq!(name, "value");
        assert!(!nullable);
    }
}