wow_cdbc/
schema_discovery.rs

1//! Schema discovery functionality to auto-detect field types in DBC files.
2
3use crate::{DbcHeader, Error, FieldType, Result, Schema, SchemaField, StringBlock, StringRef};
4use std::collections::HashSet;
5use std::io::{Cursor, Read, Seek, SeekFrom};
6
/// How certain the discoverer is about the type it assigned to a field.
///
/// Variants are declared from least to most certain, so the derived ordering
/// satisfies `Low < Medium < High`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Confidence {
    /// Weak evidence (nominally 50-70%).
    Low,
    /// Reasonable evidence (nominally 70-90%).
    Medium,
    /// Strong evidence (nominally 90-100%).
    High,
}
17
18/// Represents a discovered field type with confidence level
19#[derive(Debug, Clone)]
20pub struct DiscoveredField {
21    /// The field type
22    pub field_type: FieldType,
23    /// Confidence level in the detection
24    pub confidence: Confidence,
25    /// Whether the field is potentially a key field
26    pub is_key_candidate: bool,
27    /// Whether the field is an array
28    pub is_array: bool,
29    /// Size of the array, if the field is an array
30    pub array_size: Option<usize>,
31    /// Sample values (for validation and debugging)
32    pub sample_values: Vec<u32>,
33}
34
35/// Represents a complete discovered schema
36#[derive(Debug, Clone)]
37pub struct DiscoveredSchema {
38    /// The discovered fields
39    pub fields: Vec<DiscoveredField>,
40    /// Key field index, if detected
41    pub key_field_index: Option<usize>,
42    /// Validation status of the schema
43    pub is_valid: bool,
44    /// Validation message, if any
45    pub validation_message: Option<String>,
46}
47
48impl DiscoveredSchema {
49    /// Convert a discovered schema to a regular schema
50    pub fn to_schema(&self, name: &str) -> Schema {
51        let mut schema = Schema::new(name);
52
53        for (i, field) in self.fields.iter().enumerate() {
54            let field_name = format!("field_{i}");
55
56            if field.is_array {
57                schema.add_field(SchemaField::new_array(
58                    field_name,
59                    field.field_type,
60                    field.array_size.unwrap_or(0),
61                ));
62            } else {
63                schema.add_field(SchemaField::new(field_name, field.field_type));
64            }
65        }
66
67        if let Some(key_index) = self.key_field_index {
68            schema.set_key_field_index(key_index);
69        }
70
71        schema
72    }
73}
74
75/// Schema discoverer for DBC files
76#[derive(Debug)]
77pub struct SchemaDiscoverer<'a> {
78    /// The DBC header
79    header: &'a DbcHeader,
80    /// The raw data of the DBC file
81    data: &'a [u8],
82    /// The string block
83    string_block: &'a StringBlock,
84    /// Maximum number of records to analyze (0 = all)
85    max_records: u32,
86    /// Whether to validate string references
87    validate_strings: bool,
88    /// Whether to detect arrays
89    detect_arrays: bool,
90    /// Whether to detect the key field
91    detect_key: bool,
92}
93
94impl<'a> SchemaDiscoverer<'a> {
95    /// Create a new schema discoverer
96    pub fn new(header: &'a DbcHeader, data: &'a [u8], string_block: &'a StringBlock) -> Self {
97        Self {
98            header,
99            data,
100            string_block,
101            max_records: 100, // Default sample size
102            validate_strings: true,
103            detect_arrays: true,
104            detect_key: true,
105        }
106    }
107
108    /// Set the maximum number of records to analyze
109    pub fn with_max_records(mut self, max_records: u32) -> Self {
110        self.max_records = max_records;
111        self
112    }
113
114    /// Set whether to validate string references
115    pub fn with_validate_strings(mut self, validate_strings: bool) -> Self {
116        self.validate_strings = validate_strings;
117        self
118    }
119
120    /// Set whether to detect arrays
121    pub fn with_detect_arrays(mut self, detect_arrays: bool) -> Self {
122        self.detect_arrays = detect_arrays;
123        self
124    }
125
126    /// Set whether to detect the key field
127    pub fn with_detect_key(mut self, detect_key: bool) -> Self {
128        self.detect_key = detect_key;
129        self
130    }
131
132    /// Discover the schema of the DBC file
133    pub fn discover(&self) -> Result<DiscoveredSchema> {
134        // Determine how many records to analyze
135        let records_to_analyze =
136            if self.max_records == 0 || self.max_records > self.header.record_count {
137                self.header.record_count
138            } else {
139                self.max_records
140            };
141
142        // Skip the header
143        let mut cursor = Cursor::new(self.data);
144        cursor.seek(SeekFrom::Start(DbcHeader::SIZE as u64))?;
145
146        // Fetch raw record data for analysis
147        let mut record_data = Vec::with_capacity(records_to_analyze as usize);
148        for _ in 0..records_to_analyze {
149            let mut record = Vec::with_capacity(self.header.record_size as usize);
150            let mut buffer = vec![0u8; self.header.record_size as usize];
151            cursor.read_exact(&mut buffer)?;
152
153            // Parse into u32 values (most DBC fields are 4 bytes)
154            let mut record_cursor = Cursor::new(&buffer);
155            for _ in 0..self.header.field_count {
156                let mut buf = [0u8; 4];
157                record_cursor.read_exact(&mut buf)?;
158                let value = u32::from_le_bytes(buf);
159                record.push(value);
160            }
161
162            record_data.push(record);
163        }
164
165        // Analyze the record data to discover field types
166        let discovered_fields = self.analyze_fields(&record_data)?;
167
168        // Detect the key field
169        let key_field_index = if self.detect_key {
170            self.detect_key_field(&record_data, &discovered_fields)
171        } else {
172            None
173        };
174
175        // Validate the discovered schema
176        let (is_valid, validation_message) = self.validate_schema(&discovered_fields)?;
177
178        Ok(DiscoveredSchema {
179            fields: discovered_fields,
180            key_field_index,
181            is_valid,
182            validation_message,
183        })
184    }
185
186    /// Analyze all fields to determine their types
187    fn analyze_fields(&self, record_data: &[Vec<u32>]) -> Result<Vec<DiscoveredField>> {
188        let mut discovered_fields = Vec::with_capacity(self.header.field_count as usize);
189
190        // If no records to analyze, return empty fields
191        if record_data.is_empty() {
192            return Ok(discovered_fields);
193        }
194
195        // Analyze each field
196        for field_index in 0..self.header.field_count as usize {
197            // Extract values for this field from all analyzed records
198            let field_values: Vec<u32> = record_data
199                .iter()
200                .map(|record| record[field_index])
201                .collect();
202
203            // Analyze field values to determine type
204            let discovered_field = self.analyze_field(field_index, &field_values)?;
205            discovered_fields.push(discovered_field);
206        }
207
208        // Detect arrays if configured
209        if self.detect_arrays {
210            self.detect_array_fields(&mut discovered_fields);
211        }
212
213        Ok(discovered_fields)
214    }
215
216    /// Analyze a single field to determine its type
217    fn analyze_field(&self, _field_index: usize, values: &[u32]) -> Result<DiscoveredField> {
218        // Check if all values are 0 or 1 (boolean)
219        let is_bool = values.iter().all(|&value| value == 0 || value == 1);
220
221        // Check if any values are in the string block range
222        let possible_string_refs = values
223            .iter()
224            .filter(|&&value| value > 0 && value < self.string_block.size() as u32)
225            .count();
226
227        let is_string_ref = possible_string_refs > 0 && possible_string_refs >= values.len() / 2; // At least half of values should be potential strings
228
229        // Validate string references if configured
230        let is_valid_string_ref = if self.validate_strings && is_string_ref {
231            // Check if the string references point to valid strings
232            let valid_strings = values
233                .iter()
234                .filter(|&&value| {
235                    if value == 0 {
236                        // Empty string is valid
237                        return true;
238                    }
239
240                    // Check if the value points to a valid string
241                    self.string_block.get_string(StringRef::new(value)).is_ok()
242                })
243                .count();
244
245            valid_strings >= values.len() * 3 / 4 // At least 75% of values should be valid strings
246        } else {
247            false
248        };
249
250        // Check for potential key field
251        let is_key_candidate = self.is_potential_key(values);
252
253        // Check if the values fit in different integer ranges
254        let min_value = values.iter().copied().min().unwrap_or(0);
255        let max_value = values.iter().copied().max().unwrap_or(0);
256
257        let fits_uint8 = max_value <= 0xFF;
258        let fits_int8 = min_value >= 0x80 && max_value <= 0x7F;
259        let fits_uint16 = max_value <= 0xFFFF;
260        let fits_int16 = min_value >= 0x8000 && max_value <= 0x7FFF;
261
262        // Check if the values could be floating point
263        let could_be_float = values.iter().any(|&value| {
264            // Check if the bit pattern could represent a reasonable float
265            let float_val = f32::from_bits(value);
266            float_val.is_finite()
267                && !float_val.is_subnormal()
268                && (float_val.abs() < 0.00001 || float_val.abs() > 0.00001)
269        });
270
271        // Determine the most likely field type
272        let (field_type, confidence) = if is_valid_string_ref {
273            (FieldType::String, Confidence::High)
274        } else if is_string_ref {
275            (FieldType::String, Confidence::Medium)
276        } else if is_bool {
277            (FieldType::Bool, Confidence::High)
278        } else if fits_uint8 {
279            (FieldType::UInt8, Confidence::Medium)
280        } else if fits_int8 {
281            (FieldType::Int8, Confidence::Medium)
282        } else if fits_uint16 {
283            (FieldType::UInt16, Confidence::Medium)
284        } else if fits_int16 {
285            (FieldType::Int16, Confidence::Medium)
286        } else if could_be_float {
287            (FieldType::Float32, Confidence::Medium)
288        } else if values.iter().any(|&v| v > 0x7FFFFFFF) {
289            // If any value is larger than i32::MAX, it's probably unsigned
290            (FieldType::UInt32, Confidence::High)
291        } else {
292            // Default to Int32
293            (FieldType::Int32, Confidence::Low)
294        };
295
296        // Collect sample values for validation and debugging
297        let sample_values = values.iter().take(10).copied().collect();
298
299        Ok(DiscoveredField {
300            field_type,
301            confidence,
302            is_key_candidate,
303            is_array: false,  // Will be set later if detected
304            array_size: None, // Will be set later if detected
305            sample_values,
306        })
307    }
308
309    /// Check if a field could be a key field
310    fn is_potential_key(&self, values: &[u32]) -> bool {
311        // A key field should have unique, non-zero values
312        if values.is_empty() {
313            return false;
314        }
315
316        // Check if all values are unique
317        let unique_values: HashSet<u32> = values.iter().copied().collect();
318        if unique_values.len() != values.len() {
319            return false;
320        }
321
322        // Check if all values are non-zero
323        if values.contains(&0) {
324            return false;
325        }
326
327        // Check if values are sequential or mostly sequential
328        let min_value = *values.iter().min().unwrap();
329        let max_value = *values.iter().max().unwrap();
330
331        // Sequential or nearly sequential values are good candidates
332        let range = max_value - min_value + 1;
333        if range as usize <= values.len() * 2 {
334            return true;
335        }
336
337        // Check if values are reasonably dense in their range
338        let density = values.len() as f32 / range as f32;
339        density > 0.2 // At least 20% of the range is filled
340    }
341
342    /// Detect array fields based on patterns in field types
343    fn detect_array_fields(&self, fields: &mut Vec<DiscoveredField>) {
344        if fields.len() <= 1 {
345            return; // No arrays possible with one or zero fields
346        }
347
348        // Look for repeating patterns of field types
349        for array_size in 2..=10 {
350            // Try different array sizes
351            if fields.len() % array_size != 0 {
352                continue; // Fields must divide evenly by array size
353            }
354
355            let potential_arrays = fields.len() / array_size;
356            let mut is_array_pattern = true;
357
358            for a in 0..potential_arrays {
359                let base_type = fields[a * array_size].field_type;
360
361                // Check if all fields in the potential array have the same type
362                for i in 1..array_size {
363                    if fields[a * array_size + i].field_type != base_type {
364                        is_array_pattern = false;
365                        break;
366                    }
367                }
368
369                if !is_array_pattern {
370                    break;
371                }
372            }
373
374            if is_array_pattern {
375                // Mark fields as array elements
376                let mut new_fields = Vec::with_capacity(potential_arrays);
377
378                for a in 0..potential_arrays {
379                    let mut base_field = fields[a * array_size].clone();
380                    base_field.is_array = true;
381                    base_field.array_size = Some(array_size);
382                    new_fields.push(base_field);
383                }
384
385                *fields = new_fields;
386                return; // Successfully detected arrays
387            }
388        }
389    }
390
391    /// Detect the key field
392    fn detect_key_field(
393        &self,
394        record_data: &[Vec<u32>],
395        fields: &[DiscoveredField],
396    ) -> Option<usize> {
397        // Find candidates based on field analysis
398        let mut candidates: Vec<usize> = fields
399            .iter()
400            .enumerate()
401            .filter(|(_, field)| field.is_key_candidate)
402            .map(|(i, _)| i)
403            .collect();
404
405        // If no candidates, check for fields with ascending values
406        if candidates.is_empty() {
407            for (field_index, field) in fields.iter().enumerate() {
408                if field.field_type != FieldType::UInt32 && field.field_type != FieldType::Int32 {
409                    continue;
410                }
411
412                // Get values for this field
413                let values: Vec<u32> = record_data
414                    .iter()
415                    .map(|record| record[field_index])
416                    .collect();
417
418                // Check if values are always increasing
419                let mut is_increasing = true;
420                for i in 1..values.len() {
421                    if values[i] <= values[i - 1] {
422                        is_increasing = false;
423                        break;
424                    }
425                }
426
427                if is_increasing {
428                    candidates.push(field_index);
429                }
430            }
431        }
432
433        // If still no candidates, pick the first UInt32 field
434        if candidates.is_empty() {
435            for (field_index, field) in fields.iter().enumerate() {
436                if field.field_type == FieldType::UInt32 {
437                    candidates.push(field_index);
438                    break;
439                }
440            }
441        }
442
443        // If only one candidate, return it
444        if candidates.len() == 1 {
445            return Some(candidates[0]);
446        }
447
448        // If multiple candidates, prefer the first field
449        candidates.sort();
450        candidates.first().copied()
451    }
452
453    /// Validate the discovered schema
454    fn validate_schema(&self, fields: &[DiscoveredField]) -> Result<(bool, Option<String>)> {
455        // Check if the field count matches
456        let field_count = if fields.iter().any(|f| f.is_array) {
457            fields
458                .iter()
459                .map(|f| {
460                    if f.is_array {
461                        f.array_size.unwrap_or(0)
462                    } else {
463                        1
464                    }
465                })
466                .sum::<usize>() as u32
467        } else {
468            fields.len() as u32
469        };
470
471        if field_count != self.header.field_count {
472            return Ok((
473                false,
474                Some(format!(
475                    "Field count mismatch: schema has {} fields, but DBC has {} fields",
476                    field_count, self.header.field_count
477                )),
478            ));
479        }
480
481        // Calculate the record size based on field types
482        let record_size = fields
483            .iter()
484            .map(|f| {
485                if f.is_array {
486                    f.field_type.size() * f.array_size.unwrap_or(0)
487                } else {
488                    f.field_type.size()
489                }
490            })
491            .sum::<usize>() as u32;
492
493        // Check if the record size matches
494        if record_size != self.header.record_size {
495            return Ok((
496                false,
497                Some(format!(
498                    "Record size mismatch: schema defines {} bytes, but DBC has {} bytes per record",
499                    record_size, self.header.record_size
500                )),
501            ));
502        }
503
504        Ok((true, None))
505    }
506
507    /// Generate a schema from the discovered fields with automatic field naming
508    pub fn generate_schema(&self, name: &str) -> Result<Schema> {
509        let discovered = self.discover()?;
510        if !discovered.is_valid {
511            return Err(Error::SchemaValidation(
512                discovered
513                    .validation_message
514                    .unwrap_or_else(|| "Invalid discovered schema".to_string()),
515            ));
516        }
517
518        let mut schema = Schema::new(name);
519
520        // Add fields with meaningful names based on type and position
521        for (i, field) in discovered.fields.iter().enumerate() {
522            // Use field index as a base for field names
523            let field_name = if field.is_key_candidate {
524                "ID".to_string()
525            } else {
526                match field.field_type {
527                    FieldType::String => format!("String_{i}"),
528                    FieldType::Float32 => format!("Float_{i}"),
529                    FieldType::Bool => format!("Flag_{i}"),
530                    FieldType::UInt32 | FieldType::Int32 => format!("Value_{i}"),
531                    FieldType::UInt8 | FieldType::Int8 => format!("Byte_{i}"),
532                    FieldType::UInt16 | FieldType::Int16 => format!("Short_{i}"),
533                }
534            };
535
536            if field.is_array {
537                schema.add_field(SchemaField::new_array(
538                    field_name,
539                    field.field_type,
540                    field.array_size.unwrap_or(0),
541                ));
542            } else {
543                schema.add_field(SchemaField::new(field_name, field.field_type));
544            }
545        }
546
547        // Set the key field if detected
548        if let Some(key_index) = discovered.key_field_index {
549            schema.set_key_field_index(key_index);
550        }
551
552        Ok(schema)
553    }
554}