1use crate::{DbcHeader, Error, FieldType, Result, Schema, SchemaField, StringBlock, StringRef};
4use std::collections::HashSet;
5use std::io::{Cursor, Read, Seek, SeekFrom};
6
/// How strongly the sampled data supports an inferred field type.
///
/// Variants are declared from weakest to strongest, so the derived ordering
/// satisfies `Low < Medium < High`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Confidence {
    /// Fallback guess with little supporting evidence.
    Low,
    /// Plausible, but the evidence is heuristic.
    Medium,
    /// The sampled values strongly support the inferred type.
    High,
}
17
18#[derive(Debug, Clone)]
20pub struct DiscoveredField {
21 pub field_type: FieldType,
23 pub confidence: Confidence,
25 pub is_key_candidate: bool,
27 pub is_array: bool,
29 pub array_size: Option<usize>,
31 pub is_locstring: bool,
34 pub locstring_index: Option<u8>,
36 pub sample_values: Vec<u32>,
38}
39
40#[derive(Debug, Clone)]
42pub struct DiscoveredSchema {
43 pub fields: Vec<DiscoveredField>,
45 pub key_field_index: Option<usize>,
47 pub is_valid: bool,
49 pub validation_message: Option<String>,
51}
52
53impl DiscoveredSchema {
54 pub fn to_schema(&self, name: &str) -> Schema {
56 let mut schema = Schema::new(name);
57
58 for (i, field) in self.fields.iter().enumerate() {
59 let field_name = format!("field_{i}");
60
61 if field.is_array {
62 schema.add_field(SchemaField::new_array(
63 field_name,
64 field.field_type,
65 field.array_size.unwrap_or(0),
66 ));
67 } else {
68 schema.add_field(SchemaField::new(field_name, field.field_type));
69 }
70 }
71
72 if let Some(key_index) = self.key_field_index {
73 schema.set_key_field_index(key_index);
74 }
75
76 schema
77 }
78}
79
80#[derive(Debug)]
82pub struct SchemaDiscoverer<'a> {
83 header: &'a DbcHeader,
85 data: &'a [u8],
87 string_block: &'a StringBlock,
89 max_records: u32,
91 validate_strings: bool,
93 detect_arrays: bool,
95 detect_key: bool,
97}
98
99impl<'a> SchemaDiscoverer<'a> {
100 pub fn new(header: &'a DbcHeader, data: &'a [u8], string_block: &'a StringBlock) -> Self {
102 Self {
103 header,
104 data,
105 string_block,
106 max_records: 100, validate_strings: true,
108 detect_arrays: true,
109 detect_key: true,
110 }
111 }
112
113 pub fn with_max_records(mut self, max_records: u32) -> Self {
115 self.max_records = max_records;
116 self
117 }
118
119 pub fn with_validate_strings(mut self, validate_strings: bool) -> Self {
121 self.validate_strings = validate_strings;
122 self
123 }
124
125 pub fn with_detect_arrays(mut self, detect_arrays: bool) -> Self {
127 self.detect_arrays = detect_arrays;
128 self
129 }
130
131 pub fn with_detect_key(mut self, detect_key: bool) -> Self {
133 self.detect_key = detect_key;
134 self
135 }
136
137 pub fn discover(&self) -> Result<DiscoveredSchema> {
139 let records_to_analyze =
141 if self.max_records == 0 || self.max_records > self.header.record_count {
142 self.header.record_count
143 } else {
144 self.max_records
145 };
146
147 let mut cursor = Cursor::new(self.data);
149 cursor.seek(SeekFrom::Start(DbcHeader::SIZE as u64))?;
150
151 let mut record_data = Vec::with_capacity(records_to_analyze as usize);
153 for _ in 0..records_to_analyze {
154 let mut record = Vec::with_capacity(self.header.record_size as usize);
155 let mut buffer = vec![0u8; self.header.record_size as usize];
156 cursor.read_exact(&mut buffer)?;
157
158 let mut record_cursor = Cursor::new(&buffer);
160 for _ in 0..self.header.field_count {
161 let mut buf = [0u8; 4];
162 record_cursor.read_exact(&mut buf)?;
163 let value = u32::from_le_bytes(buf);
164 record.push(value);
165 }
166
167 record_data.push(record);
168 }
169
170 let discovered_fields = self.analyze_fields(&record_data)?;
172
173 let key_field_index = if self.detect_key {
175 self.detect_key_field(&record_data, &discovered_fields)
176 } else {
177 None
178 };
179
180 let (is_valid, validation_message) = self.validate_schema(&discovered_fields)?;
182
183 Ok(DiscoveredSchema {
184 fields: discovered_fields,
185 key_field_index,
186 is_valid,
187 validation_message,
188 })
189 }
190
191 fn analyze_fields(&self, record_data: &[Vec<u32>]) -> Result<Vec<DiscoveredField>> {
193 let mut discovered_fields = Vec::with_capacity(self.header.field_count as usize);
194
195 if record_data.is_empty() {
197 return Ok(discovered_fields);
198 }
199
200 for field_index in 0..self.header.field_count as usize {
202 let field_values: Vec<u32> = record_data
204 .iter()
205 .map(|record| record[field_index])
206 .collect();
207
208 let discovered_field = self.analyze_field(field_index, &field_values)?;
210 discovered_fields.push(discovered_field);
211 }
212
213 self.detect_locstrings(&mut discovered_fields);
216
217 if self.detect_arrays {
219 self.detect_array_fields(&mut discovered_fields);
220 }
221
222 Ok(discovered_fields)
223 }
224
225 fn analyze_field(&self, _field_index: usize, values: &[u32]) -> Result<DiscoveredField> {
227 let is_bool = values.iter().all(|&value| value == 0 || value == 1);
229
230 let possible_string_refs = values
232 .iter()
233 .filter(|&&value| value > 0 && value < self.string_block.size() as u32)
234 .count();
235
236 let is_string_ref = possible_string_refs > 0 && possible_string_refs >= values.len() / 2; let is_valid_string_ref = if self.validate_strings && is_string_ref {
240 let valid_strings = values
244 .iter()
245 .filter(|&&value| {
246 if value == 0 {
247 return true;
249 }
250
251 if !self.string_block.is_string_start(value) {
254 return false;
255 }
256
257 self.string_block.get_string(StringRef::new(value)).is_ok()
259 })
260 .count();
261
262 valid_strings >= values.len() * 3 / 4 } else {
264 false
265 };
266
267 let is_key_candidate = self.is_potential_key(values);
269
270 let is_float_like = |value: u32| -> bool {
274 if value < 65536 {
278 return false;
279 }
280
281 let float_val = f32::from_bits(value);
282
283 if !float_val.is_finite() || float_val.is_subnormal() {
285 return false;
286 }
287
288 let abs_val = float_val.abs();
292 (1e-6..=1e7).contains(&abs_val)
293 };
294
295 let non_zero_values: Vec<u32> = values.iter().copied().filter(|&v| v != 0).collect();
297 let float_like_count = non_zero_values
298 .iter()
299 .filter(|&&v| is_float_like(v))
300 .count();
301
302 let could_be_float =
306 float_like_count > 0 && float_like_count >= (non_zero_values.len() * 3 / 4).max(1);
307
308 let (field_type, confidence) = if is_valid_string_ref {
313 (FieldType::String, Confidence::High)
314 } else if is_string_ref && !self.validate_strings {
315 (FieldType::String, Confidence::Medium)
317 } else if is_bool {
318 (FieldType::Bool, Confidence::High)
319 } else if could_be_float {
320 (FieldType::Float32, Confidence::Medium)
321 } else if values.iter().any(|&v| v > 0x7FFFFFFF) {
322 (FieldType::UInt32, Confidence::High)
324 } else {
325 (FieldType::Int32, Confidence::Low)
327 };
328
329 let sample_values = values.iter().take(10).copied().collect();
331
332 Ok(DiscoveredField {
333 field_type,
334 confidence,
335 is_key_candidate,
336 is_array: false, array_size: None, is_locstring: false, locstring_index: None, sample_values,
341 })
342 }
343
344 fn is_potential_key(&self, values: &[u32]) -> bool {
346 if values.is_empty() {
348 return false;
349 }
350
351 let unique_values: HashSet<u32> = values.iter().copied().collect();
353 if unique_values.len() != values.len() {
354 return false;
355 }
356
357 if values.contains(&0) {
359 return false;
360 }
361
362 let min_value = *values.iter().min().unwrap();
364 let max_value = *values.iter().max().unwrap();
365
366 let range = max_value - min_value + 1;
368 if range as usize <= values.len() * 2 {
369 return true;
370 }
371
372 let density = values.len() as f32 / range as f32;
374 density > 0.2 }
376
377 fn detect_array_fields(&self, fields: &mut Vec<DiscoveredField>) {
379 if fields.len() <= 1 {
380 return; }
382
383 for array_size in 2..=10 {
385 if !fields.len().is_multiple_of(array_size) {
387 continue; }
389
390 let potential_arrays = fields.len() / array_size;
391 let mut is_array_pattern = true;
392
393 for a in 0..potential_arrays {
394 let base_type = fields[a * array_size].field_type;
395
396 for i in 1..array_size {
398 if fields[a * array_size + i].field_type != base_type {
399 is_array_pattern = false;
400 break;
401 }
402 }
403
404 if !is_array_pattern {
405 break;
406 }
407 }
408
409 if is_array_pattern {
410 let mut new_fields = Vec::with_capacity(potential_arrays);
412
413 for a in 0..potential_arrays {
414 let mut base_field = fields[a * array_size].clone();
415 base_field.is_array = true;
416 base_field.array_size = Some(array_size);
417 new_fields.push(base_field);
418 }
419
420 *fields = new_fields;
421 return; }
423 }
424 }
425
426 fn detect_locstrings(&self, fields: &mut [DiscoveredField]) {
436 if fields.len() < 9 {
438 return;
439 }
440
441 let mut i = 0;
442 while i + 8 < fields.len() {
443 if fields[i].field_type != FieldType::String || fields[i].confidence != Confidence::High
445 {
446 i += 1;
447 continue;
448 }
449
450 let mut is_locstring_pattern = true;
452 for j in 1..8 {
453 let field = &fields[i + j];
454 let is_string = field.field_type == FieldType::String;
455 let is_empty_string_ref = field.field_type == FieldType::Bool
456 && field.sample_values.iter().all(|&v| v == 0);
457
458 if !is_string && !is_empty_string_ref {
459 is_locstring_pattern = false;
460 break;
461 }
462 }
463
464 if !is_locstring_pattern {
465 i += 1;
466 continue;
467 }
468
469 let flags_field = &fields[i + 8];
472 let is_valid_flags = matches!(
473 flags_field.field_type,
474 FieldType::Int32 | FieldType::UInt32 | FieldType::Bool
475 );
476
477 if !is_valid_flags {
478 i += 1;
479 continue;
480 }
481
482 for j in 0..8 {
484 fields[i + j].is_locstring = true;
485 fields[i + j].locstring_index = Some(j as u8);
486 if fields[i + j].field_type == FieldType::Bool {
488 fields[i + j].field_type = FieldType::String;
489 fields[i + j].confidence = Confidence::Medium;
490 }
491 }
492
493 fields[i + 8].is_locstring = true;
495 fields[i + 8].locstring_index = Some(8);
496 if fields[i + 8].field_type == FieldType::Bool {
498 fields[i + 8].field_type = FieldType::Int32;
499 fields[i + 8].confidence = Confidence::Medium;
500 }
501
502 i += 9;
504 }
505 }
506
507 fn detect_key_field(
509 &self,
510 record_data: &[Vec<u32>],
511 fields: &[DiscoveredField],
512 ) -> Option<usize> {
513 let mut candidates: Vec<usize> = fields
515 .iter()
516 .enumerate()
517 .filter(|(_, field)| field.is_key_candidate)
518 .map(|(i, _)| i)
519 .collect();
520
521 if candidates.is_empty() {
523 for (field_index, field) in fields.iter().enumerate() {
524 if field.field_type != FieldType::UInt32 && field.field_type != FieldType::Int32 {
525 continue;
526 }
527
528 let values: Vec<u32> = record_data
530 .iter()
531 .map(|record| record[field_index])
532 .collect();
533
534 let mut is_increasing = true;
536 for i in 1..values.len() {
537 if values[i] <= values[i - 1] {
538 is_increasing = false;
539 break;
540 }
541 }
542
543 if is_increasing {
544 candidates.push(field_index);
545 }
546 }
547 }
548
549 if candidates.is_empty() {
551 for (field_index, field) in fields.iter().enumerate() {
552 if field.field_type == FieldType::UInt32 {
553 candidates.push(field_index);
554 break;
555 }
556 }
557 }
558
559 if candidates.len() == 1 {
561 return Some(candidates[0]);
562 }
563
564 candidates.sort();
566 candidates.first().copied()
567 }
568
569 fn validate_schema(&self, fields: &[DiscoveredField]) -> Result<(bool, Option<String>)> {
571 let field_count = if fields.iter().any(|f| f.is_array) {
573 fields
574 .iter()
575 .map(|f| {
576 if f.is_array {
577 f.array_size.unwrap_or(0)
578 } else {
579 1
580 }
581 })
582 .sum::<usize>() as u32
583 } else {
584 fields.len() as u32
585 };
586
587 if field_count != self.header.field_count {
588 return Ok((
589 false,
590 Some(format!(
591 "Field count mismatch: schema has {} fields, but DBC has {} fields",
592 field_count, self.header.field_count
593 )),
594 ));
595 }
596
597 let record_size = fields
599 .iter()
600 .map(|f| {
601 if f.is_array {
602 f.field_type.size() * f.array_size.unwrap_or(0)
603 } else {
604 f.field_type.size()
605 }
606 })
607 .sum::<usize>() as u32;
608
609 if record_size != self.header.record_size {
611 return Ok((
612 false,
613 Some(format!(
614 "Record size mismatch: schema defines {} bytes, but DBC has {} bytes per record",
615 record_size, self.header.record_size
616 )),
617 ));
618 }
619
620 Ok((true, None))
621 }
622
623 pub fn generate_schema(&self, name: &str) -> Result<Schema> {
625 let discovered = self.discover()?;
626 if !discovered.is_valid {
627 return Err(Error::SchemaValidation(
628 discovered
629 .validation_message
630 .unwrap_or_else(|| "Invalid discovered schema".to_string()),
631 ));
632 }
633
634 let mut schema = Schema::new(name);
635
636 for (i, field) in discovered.fields.iter().enumerate() {
638 let field_name = if field.is_key_candidate {
640 "ID".to_string()
641 } else {
642 match field.field_type {
643 FieldType::String => format!("String_{i}"),
644 FieldType::Float32 => format!("Float_{i}"),
645 FieldType::Bool => format!("Flag_{i}"),
646 FieldType::UInt32 | FieldType::Int32 => format!("Value_{i}"),
647 FieldType::UInt8 | FieldType::Int8 => format!("Byte_{i}"),
648 FieldType::UInt16 | FieldType::Int16 => format!("Short_{i}"),
649 }
650 };
651
652 if field.is_array {
653 schema.add_field(SchemaField::new_array(
654 field_name,
655 field.field_type,
656 field.array_size.unwrap_or(0),
657 ));
658 } else {
659 schema.add_field(SchemaField::new(field_name, field.field_type));
660 }
661 }
662
663 if let Some(key_index) = discovered.key_field_index {
665 schema.set_key_field_index(key_index);
666 }
667
668 Ok(schema)
669 }
670}