wow_cdbc/
schema_discovery.rs

1//! Schema discovery functionality to auto-detect field types in DBC files.
2
3use crate::{DbcHeader, Error, FieldType, Result, Schema, SchemaField, StringBlock, StringRef};
4use std::collections::HashSet;
5use std::io::{Cursor, Read, Seek, SeekFrom};
6
/// Confidence level for a field type detection.
///
/// Variants are declared from least to most confident, so the derived
/// `PartialOrd`/`Ord` implementations order them `Low < Medium < High`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Confidence {
    /// Low confidence (50-70%)
    Low,
    /// Medium confidence (70-90%)
    Medium,
    /// High confidence (90-100%)
    High,
}
17
/// Represents a discovered field type with confidence level.
///
/// One value is produced per analyzed column; the array and locstring
/// members start out unset and are filled in by the later detection passes.
#[derive(Debug, Clone)]
pub struct DiscoveredField {
    /// The field type
    pub field_type: FieldType,
    /// Confidence level in the detection
    pub confidence: Confidence,
    /// Whether the field is potentially a key field
    /// (sampled values are unique, non-zero and reasonably dense in their range)
    pub is_key_candidate: bool,
    /// Whether the field is an array
    pub is_array: bool,
    /// Size of the array, if the field is an array
    pub array_size: Option<usize>,
    /// Whether this field is part of a localized string (locstring)
    /// Classic WoW locstrings have 8 string refs (one per locale) + 1 flags field
    pub is_locstring: bool,
    /// Locale index within a locstring (0-7 for string refs, 8 for flags field)
    pub locstring_index: Option<u8>,
    /// Sample values (for validation and debugging): the raw `u32` values of
    /// up to the first 10 analyzed records
    pub sample_values: Vec<u32>,
}
39
/// Represents a complete discovered schema, as returned by
/// `SchemaDiscoverer::discover`.
#[derive(Debug, Clone)]
pub struct DiscoveredSchema {
    /// The discovered fields
    pub fields: Vec<DiscoveredField>,
    /// Key field index (an index into `fields`), if detected
    pub key_field_index: Option<usize>,
    /// Validation status of the schema: `true` when the discovered field count
    /// and record size both match the DBC header
    pub is_valid: bool,
    /// Validation message, if any; describes the mismatch when `is_valid` is `false`
    pub validation_message: Option<String>,
}
52
53impl DiscoveredSchema {
54    /// Convert a discovered schema to a regular schema
55    pub fn to_schema(&self, name: &str) -> Schema {
56        let mut schema = Schema::new(name);
57
58        for (i, field) in self.fields.iter().enumerate() {
59            let field_name = format!("field_{i}");
60
61            if field.is_array {
62                schema.add_field(SchemaField::new_array(
63                    field_name,
64                    field.field_type,
65                    field.array_size.unwrap_or(0),
66                ));
67            } else {
68                schema.add_field(SchemaField::new(field_name, field.field_type));
69            }
70        }
71
72        if let Some(key_index) = self.key_field_index {
73            schema.set_key_field_index(key_index);
74        }
75
76        schema
77    }
78}
79
/// Schema discoverer for DBC files.
///
/// Borrows the parsed header, the raw file bytes and the string block, and
/// infers field types by sampling records. Behaviour is tuned through the
/// `with_*` builder methods.
#[derive(Debug)]
pub struct SchemaDiscoverer<'a> {
    /// The DBC header
    header: &'a DbcHeader,
    /// The raw data of the DBC file (record data is read starting at
    /// `DbcHeader::SIZE` bytes into this slice)
    data: &'a [u8],
    /// The string block
    string_block: &'a StringBlock,
    /// Maximum number of records to analyze (0 = all)
    max_records: u32,
    /// Whether to validate string references
    validate_strings: bool,
    /// Whether to detect arrays
    detect_arrays: bool,
    /// Whether to detect the key field
    detect_key: bool,
}
98
99impl<'a> SchemaDiscoverer<'a> {
100    /// Create a new schema discoverer
101    pub fn new(header: &'a DbcHeader, data: &'a [u8], string_block: &'a StringBlock) -> Self {
102        Self {
103            header,
104            data,
105            string_block,
106            max_records: 100, // Default sample size
107            validate_strings: true,
108            detect_arrays: true,
109            detect_key: true,
110        }
111    }
112
113    /// Set the maximum number of records to analyze
114    pub fn with_max_records(mut self, max_records: u32) -> Self {
115        self.max_records = max_records;
116        self
117    }
118
119    /// Set whether to validate string references
120    pub fn with_validate_strings(mut self, validate_strings: bool) -> Self {
121        self.validate_strings = validate_strings;
122        self
123    }
124
125    /// Set whether to detect arrays
126    pub fn with_detect_arrays(mut self, detect_arrays: bool) -> Self {
127        self.detect_arrays = detect_arrays;
128        self
129    }
130
131    /// Set whether to detect the key field
132    pub fn with_detect_key(mut self, detect_key: bool) -> Self {
133        self.detect_key = detect_key;
134        self
135    }
136
137    /// Discover the schema of the DBC file
138    pub fn discover(&self) -> Result<DiscoveredSchema> {
139        // Determine how many records to analyze
140        let records_to_analyze =
141            if self.max_records == 0 || self.max_records > self.header.record_count {
142                self.header.record_count
143            } else {
144                self.max_records
145            };
146
147        // Skip the header
148        let mut cursor = Cursor::new(self.data);
149        cursor.seek(SeekFrom::Start(DbcHeader::SIZE as u64))?;
150
151        // Fetch raw record data for analysis
152        let mut record_data = Vec::with_capacity(records_to_analyze as usize);
153        for _ in 0..records_to_analyze {
154            let mut record = Vec::with_capacity(self.header.record_size as usize);
155            let mut buffer = vec![0u8; self.header.record_size as usize];
156            cursor.read_exact(&mut buffer)?;
157
158            // Parse into u32 values (most DBC fields are 4 bytes)
159            let mut record_cursor = Cursor::new(&buffer);
160            for _ in 0..self.header.field_count {
161                let mut buf = [0u8; 4];
162                record_cursor.read_exact(&mut buf)?;
163                let value = u32::from_le_bytes(buf);
164                record.push(value);
165            }
166
167            record_data.push(record);
168        }
169
170        // Analyze the record data to discover field types
171        let discovered_fields = self.analyze_fields(&record_data)?;
172
173        // Detect the key field
174        let key_field_index = if self.detect_key {
175            self.detect_key_field(&record_data, &discovered_fields)
176        } else {
177            None
178        };
179
180        // Validate the discovered schema
181        let (is_valid, validation_message) = self.validate_schema(&discovered_fields)?;
182
183        Ok(DiscoveredSchema {
184            fields: discovered_fields,
185            key_field_index,
186            is_valid,
187            validation_message,
188        })
189    }
190
191    /// Analyze all fields to determine their types
192    fn analyze_fields(&self, record_data: &[Vec<u32>]) -> Result<Vec<DiscoveredField>> {
193        let mut discovered_fields = Vec::with_capacity(self.header.field_count as usize);
194
195        // If no records to analyze, return empty fields
196        if record_data.is_empty() {
197            return Ok(discovered_fields);
198        }
199
200        // Analyze each field
201        for field_index in 0..self.header.field_count as usize {
202            // Extract values for this field from all analyzed records
203            let field_values: Vec<u32> = record_data
204                .iter()
205                .map(|record| record[field_index])
206                .collect();
207
208            // Analyze field values to determine type
209            let discovered_field = self.analyze_field(field_index, &field_values)?;
210            discovered_fields.push(discovered_field);
211        }
212
213        // Detect localized strings (locstrings) - 8 string refs + 1 flags field
214        // This must run before array detection to properly classify fields
215        self.detect_locstrings(&mut discovered_fields);
216
217        // Detect arrays if configured
218        if self.detect_arrays {
219            self.detect_array_fields(&mut discovered_fields);
220        }
221
222        Ok(discovered_fields)
223    }
224
225    /// Analyze a single field to determine its type
226    fn analyze_field(&self, _field_index: usize, values: &[u32]) -> Result<DiscoveredField> {
227        // Check if all values are 0 or 1 (boolean)
228        let is_bool = values.iter().all(|&value| value == 0 || value == 1);
229
230        // Check if any values are in the string block range
231        let possible_string_refs = values
232            .iter()
233            .filter(|&&value| value > 0 && value < self.string_block.size() as u32)
234            .count();
235
236        let is_string_ref = possible_string_refs > 0 && possible_string_refs >= values.len() / 2; // At least half of values should be potential strings
237
238        // Validate string references if configured
239        let is_valid_string_ref = if self.validate_strings && is_string_ref {
240            // Check if the string references point to the START of valid strings
241            // This eliminates false positives where integer values happen to fall
242            // within the string block range but don't point to actual string starts
243            let valid_strings = values
244                .iter()
245                .filter(|&&value| {
246                    if value == 0 {
247                        // Empty string (offset 0) is valid
248                        return true;
249                    }
250
251                    // Check if the value points to the start of a string
252                    // A string start is at offset 0 or immediately after a NUL byte
253                    if !self.string_block.is_string_start(value) {
254                        return false;
255                    }
256
257                    // Also verify the string at that offset is valid UTF-8
258                    self.string_block.get_string(StringRef::new(value)).is_ok()
259                })
260                .count();
261
262            valid_strings >= values.len() * 3 / 4 // At least 75% of values should be valid strings
263        } else {
264            false
265        };
266
267        // Check for potential key field
268        let is_key_candidate = self.is_potential_key(values);
269
270        // Check if the values could be floating point using better heuristics
271        // Key insight: small integers (0-65535) as u32 reinterpret as tiny denormals
272        // when viewed as f32, while actual floats like 1.0f32 have u32 value 0x3F800000
273        let is_float_like = |value: u32| -> bool {
274            // Small integers (< 65536) are almost never stored as floats
275            // because float 1.0 = 0x3F800000 = 1065353216, not 1
276            // A u32 of 100 reinterpreted as float is ~1.4e-43 (denormal)
277            if value < 65536 {
278                return false;
279            }
280
281            let float_val = f32::from_bits(value);
282
283            // Must be finite and not subnormal
284            if !float_val.is_finite() || float_val.is_subnormal() {
285                return false;
286            }
287
288            // Check if float is in reasonable game data range
289            // Most game floats are: normalized (0-1), percentages (0-100),
290            // coordinates (-10000 to 10000), scales (0.001 to 1000)
291            let abs_val = float_val.abs();
292            (1e-6..=1e7).contains(&abs_val)
293        };
294
295        // Count non-zero values and how many look like floats
296        let non_zero_values: Vec<u32> = values.iter().copied().filter(|&v| v != 0).collect();
297        let float_like_count = non_zero_values
298            .iter()
299            .filter(|&&v| is_float_like(v))
300            .count();
301
302        // Require majority (>= 75%) of non-zero values to look like floats
303        // Also require at least one float-like value (handles edge case where
304        // integer division of small counts could yield 0)
305        let could_be_float =
306            float_like_count > 0 && float_like_count >= (non_zero_values.len() * 3 / 4).max(1);
307
308        // Determine the most likely field type
309        // NOTE: DBC files always store 4 bytes per field, so we only detect 4-byte types.
310        // Smaller types (UInt8, Int8, UInt16, Int16) are not used because they would
311        // cause incorrect size calculations during schema validation.
312        let (field_type, confidence) = if is_valid_string_ref {
313            (FieldType::String, Confidence::High)
314        } else if is_string_ref && !self.validate_strings {
315            // Only use unvalidated string detection when validation is disabled
316            (FieldType::String, Confidence::Medium)
317        } else if is_bool {
318            (FieldType::Bool, Confidence::High)
319        } else if could_be_float {
320            (FieldType::Float32, Confidence::Medium)
321        } else if values.iter().any(|&v| v > 0x7FFFFFFF) {
322            // If any value is larger than i32::MAX, it's probably unsigned
323            (FieldType::UInt32, Confidence::High)
324        } else {
325            // Default to Int32
326            (FieldType::Int32, Confidence::Low)
327        };
328
329        // Collect sample values for validation and debugging
330        let sample_values = values.iter().take(10).copied().collect();
331
332        Ok(DiscoveredField {
333            field_type,
334            confidence,
335            is_key_candidate,
336            is_array: false,       // Will be set later if detected
337            array_size: None,      // Will be set later if detected
338            is_locstring: false,   // Will be set later if detected
339            locstring_index: None, // Will be set later if detected
340            sample_values,
341        })
342    }
343
344    /// Check if a field could be a key field
345    fn is_potential_key(&self, values: &[u32]) -> bool {
346        // A key field should have unique, non-zero values
347        if values.is_empty() {
348            return false;
349        }
350
351        // Check if all values are unique
352        let unique_values: HashSet<u32> = values.iter().copied().collect();
353        if unique_values.len() != values.len() {
354            return false;
355        }
356
357        // Check if all values are non-zero
358        if values.contains(&0) {
359            return false;
360        }
361
362        // Check if values are sequential or mostly sequential
363        let min_value = *values.iter().min().unwrap();
364        let max_value = *values.iter().max().unwrap();
365
366        // Sequential or nearly sequential values are good candidates
367        let range = max_value - min_value + 1;
368        if range as usize <= values.len() * 2 {
369            return true;
370        }
371
372        // Check if values are reasonably dense in their range
373        let density = values.len() as f32 / range as f32;
374        density > 0.2 // At least 20% of the range is filled
375    }
376
377    /// Detect array fields based on patterns in field types
378    fn detect_array_fields(&self, fields: &mut Vec<DiscoveredField>) {
379        if fields.len() <= 1 {
380            return; // No arrays possible with one or zero fields
381        }
382
383        // Look for repeating patterns of field types
384        for array_size in 2..=10 {
385            // Try different array sizes
386            if !fields.len().is_multiple_of(array_size) {
387                continue; // Fields must divide evenly by array size
388            }
389
390            let potential_arrays = fields.len() / array_size;
391            let mut is_array_pattern = true;
392
393            for a in 0..potential_arrays {
394                let base_type = fields[a * array_size].field_type;
395
396                // Check if all fields in the potential array have the same type
397                for i in 1..array_size {
398                    if fields[a * array_size + i].field_type != base_type {
399                        is_array_pattern = false;
400                        break;
401                    }
402                }
403
404                if !is_array_pattern {
405                    break;
406                }
407            }
408
409            if is_array_pattern {
410                // Mark fields as array elements
411                let mut new_fields = Vec::with_capacity(potential_arrays);
412
413                for a in 0..potential_arrays {
414                    let mut base_field = fields[a * array_size].clone();
415                    base_field.is_array = true;
416                    base_field.array_size = Some(array_size);
417                    new_fields.push(base_field);
418                }
419
420                *fields = new_fields;
421                return; // Successfully detected arrays
422            }
423        }
424    }
425
426    /// Detect localized string (locstring) patterns in fields
427    ///
428    /// Classic WoW locstrings consist of 9 consecutive fields:
429    /// - 8 string references (one per locale: enUS, koKR, frFR, deDE, zhCN, zhTW, esES, esMX)
430    /// - 1 flags field (u32)
431    ///
432    /// In non-English clients or files, most locale fields are empty (offset 0),
433    /// which causes them to be detected as Bool. This method identifies this pattern
434    /// and reclassifies those fields as String.
435    fn detect_locstrings(&self, fields: &mut [DiscoveredField]) {
436        // Need at least 9 fields for a locstring
437        if fields.len() < 9 {
438            return;
439        }
440
441        let mut i = 0;
442        while i + 8 < fields.len() {
443            // Look for a String field with High confidence as the start
444            if fields[i].field_type != FieldType::String || fields[i].confidence != Confidence::High
445            {
446                i += 1;
447                continue;
448            }
449
450            // Check if the next 7 fields are either String or "faux Bool" (all zeros)
451            let mut is_locstring_pattern = true;
452            for j in 1..8 {
453                let field = &fields[i + j];
454                let is_string = field.field_type == FieldType::String;
455                let is_empty_string_ref = field.field_type == FieldType::Bool
456                    && field.sample_values.iter().all(|&v| v == 0);
457
458                if !is_string && !is_empty_string_ref {
459                    is_locstring_pattern = false;
460                    break;
461                }
462            }
463
464            if !is_locstring_pattern {
465                i += 1;
466                continue;
467            }
468
469            // Check the 9th field - it should be an integer (flags field)
470            // The flags field is typically 0 or a small bitmask
471            let flags_field = &fields[i + 8];
472            let is_valid_flags = matches!(
473                flags_field.field_type,
474                FieldType::Int32 | FieldType::UInt32 | FieldType::Bool
475            );
476
477            if !is_valid_flags {
478                i += 1;
479                continue;
480            }
481
482            // Found a locstring pattern! Mark all 9 fields
483            for j in 0..8 {
484                fields[i + j].is_locstring = true;
485                fields[i + j].locstring_index = Some(j as u8);
486                // Reclassify Bool fields as String (they're empty string refs)
487                if fields[i + j].field_type == FieldType::Bool {
488                    fields[i + j].field_type = FieldType::String;
489                    fields[i + j].confidence = Confidence::Medium;
490                }
491            }
492
493            // Mark the flags field
494            fields[i + 8].is_locstring = true;
495            fields[i + 8].locstring_index = Some(8);
496            // Reclassify Bool as Int32 for the flags field
497            if fields[i + 8].field_type == FieldType::Bool {
498                fields[i + 8].field_type = FieldType::Int32;
499                fields[i + 8].confidence = Confidence::Medium;
500            }
501
502            // Skip past this locstring
503            i += 9;
504        }
505    }
506
507    /// Detect the key field
508    fn detect_key_field(
509        &self,
510        record_data: &[Vec<u32>],
511        fields: &[DiscoveredField],
512    ) -> Option<usize> {
513        // Find candidates based on field analysis
514        let mut candidates: Vec<usize> = fields
515            .iter()
516            .enumerate()
517            .filter(|(_, field)| field.is_key_candidate)
518            .map(|(i, _)| i)
519            .collect();
520
521        // If no candidates, check for fields with ascending values
522        if candidates.is_empty() {
523            for (field_index, field) in fields.iter().enumerate() {
524                if field.field_type != FieldType::UInt32 && field.field_type != FieldType::Int32 {
525                    continue;
526                }
527
528                // Get values for this field
529                let values: Vec<u32> = record_data
530                    .iter()
531                    .map(|record| record[field_index])
532                    .collect();
533
534                // Check if values are always increasing
535                let mut is_increasing = true;
536                for i in 1..values.len() {
537                    if values[i] <= values[i - 1] {
538                        is_increasing = false;
539                        break;
540                    }
541                }
542
543                if is_increasing {
544                    candidates.push(field_index);
545                }
546            }
547        }
548
549        // If still no candidates, pick the first UInt32 field
550        if candidates.is_empty() {
551            for (field_index, field) in fields.iter().enumerate() {
552                if field.field_type == FieldType::UInt32 {
553                    candidates.push(field_index);
554                    break;
555                }
556            }
557        }
558
559        // If only one candidate, return it
560        if candidates.len() == 1 {
561            return Some(candidates[0]);
562        }
563
564        // If multiple candidates, prefer the first field
565        candidates.sort();
566        candidates.first().copied()
567    }
568
569    /// Validate the discovered schema
570    fn validate_schema(&self, fields: &[DiscoveredField]) -> Result<(bool, Option<String>)> {
571        // Check if the field count matches
572        let field_count = if fields.iter().any(|f| f.is_array) {
573            fields
574                .iter()
575                .map(|f| {
576                    if f.is_array {
577                        f.array_size.unwrap_or(0)
578                    } else {
579                        1
580                    }
581                })
582                .sum::<usize>() as u32
583        } else {
584            fields.len() as u32
585        };
586
587        if field_count != self.header.field_count {
588            return Ok((
589                false,
590                Some(format!(
591                    "Field count mismatch: schema has {} fields, but DBC has {} fields",
592                    field_count, self.header.field_count
593                )),
594            ));
595        }
596
597        // Calculate the record size based on field types
598        let record_size = fields
599            .iter()
600            .map(|f| {
601                if f.is_array {
602                    f.field_type.size() * f.array_size.unwrap_or(0)
603                } else {
604                    f.field_type.size()
605                }
606            })
607            .sum::<usize>() as u32;
608
609        // Check if the record size matches
610        if record_size != self.header.record_size {
611            return Ok((
612                false,
613                Some(format!(
614                    "Record size mismatch: schema defines {} bytes, but DBC has {} bytes per record",
615                    record_size, self.header.record_size
616                )),
617            ));
618        }
619
620        Ok((true, None))
621    }
622
623    /// Generate a schema from the discovered fields with automatic field naming
624    pub fn generate_schema(&self, name: &str) -> Result<Schema> {
625        let discovered = self.discover()?;
626        if !discovered.is_valid {
627            return Err(Error::SchemaValidation(
628                discovered
629                    .validation_message
630                    .unwrap_or_else(|| "Invalid discovered schema".to_string()),
631            ));
632        }
633
634        let mut schema = Schema::new(name);
635
636        // Add fields with meaningful names based on type and position
637        for (i, field) in discovered.fields.iter().enumerate() {
638            // Use field index as a base for field names
639            let field_name = if field.is_key_candidate {
640                "ID".to_string()
641            } else {
642                match field.field_type {
643                    FieldType::String => format!("String_{i}"),
644                    FieldType::Float32 => format!("Float_{i}"),
645                    FieldType::Bool => format!("Flag_{i}"),
646                    FieldType::UInt32 | FieldType::Int32 => format!("Value_{i}"),
647                    FieldType::UInt8 | FieldType::Int8 => format!("Byte_{i}"),
648                    FieldType::UInt16 | FieldType::Int16 => format!("Short_{i}"),
649                }
650            };
651
652            if field.is_array {
653                schema.add_field(SchemaField::new_array(
654                    field_name,
655                    field.field_type,
656                    field.array_size.unwrap_or(0),
657                ));
658            } else {
659                schema.add_field(SchemaField::new(field_name, field.field_type));
660            }
661        }
662
663        // Set the key field if detected
664        if let Some(key_index) = discovered.key_field_index {
665            schema.set_key_field_index(key_index);
666        }
667
668        Ok(schema)
669    }
670}