1use crate::{DbcHeader, Error, FieldType, Result, Schema, SchemaField, StringBlock, StringRef};
4use std::collections::HashSet;
5use std::io::{Cursor, Read, Seek, SeekFrom};
6
/// How much trust to place in an automatically inferred field type.
///
/// The variants are declared from least to most certain, so the derived
/// ordering satisfies `Low < Medium < High`.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub enum Confidence {
    /// The type is a fallback guess.
    Low,
    /// The type is consistent with the sampled values.
    Medium,
    /// The sampled values strongly indicate the type.
    High,
}
17
18#[derive(Debug, Clone)]
20pub struct DiscoveredField {
21 pub field_type: FieldType,
23 pub confidence: Confidence,
25 pub is_key_candidate: bool,
27 pub is_array: bool,
29 pub array_size: Option<usize>,
31 pub sample_values: Vec<u32>,
33}
34
35#[derive(Debug, Clone)]
37pub struct DiscoveredSchema {
38 pub fields: Vec<DiscoveredField>,
40 pub key_field_index: Option<usize>,
42 pub is_valid: bool,
44 pub validation_message: Option<String>,
46}
47
48impl DiscoveredSchema {
49 pub fn to_schema(&self, name: &str) -> Schema {
51 let mut schema = Schema::new(name);
52
53 for (i, field) in self.fields.iter().enumerate() {
54 let field_name = format!("field_{i}");
55
56 if field.is_array {
57 schema.add_field(SchemaField::new_array(
58 field_name,
59 field.field_type,
60 field.array_size.unwrap_or(0),
61 ));
62 } else {
63 schema.add_field(SchemaField::new(field_name, field.field_type));
64 }
65 }
66
67 if let Some(key_index) = self.key_field_index {
68 schema.set_key_field_index(key_index);
69 }
70
71 schema
72 }
73}
74
75#[derive(Debug)]
77pub struct SchemaDiscoverer<'a> {
78 header: &'a DbcHeader,
80 data: &'a [u8],
82 string_block: &'a StringBlock,
84 max_records: u32,
86 validate_strings: bool,
88 detect_arrays: bool,
90 detect_key: bool,
92}
93
94impl<'a> SchemaDiscoverer<'a> {
95 pub fn new(header: &'a DbcHeader, data: &'a [u8], string_block: &'a StringBlock) -> Self {
97 Self {
98 header,
99 data,
100 string_block,
101 max_records: 100, validate_strings: true,
103 detect_arrays: true,
104 detect_key: true,
105 }
106 }
107
108 pub fn with_max_records(mut self, max_records: u32) -> Self {
110 self.max_records = max_records;
111 self
112 }
113
114 pub fn with_validate_strings(mut self, validate_strings: bool) -> Self {
116 self.validate_strings = validate_strings;
117 self
118 }
119
120 pub fn with_detect_arrays(mut self, detect_arrays: bool) -> Self {
122 self.detect_arrays = detect_arrays;
123 self
124 }
125
126 pub fn with_detect_key(mut self, detect_key: bool) -> Self {
128 self.detect_key = detect_key;
129 self
130 }
131
132 pub fn discover(&self) -> Result<DiscoveredSchema> {
134 let records_to_analyze =
136 if self.max_records == 0 || self.max_records > self.header.record_count {
137 self.header.record_count
138 } else {
139 self.max_records
140 };
141
142 let mut cursor = Cursor::new(self.data);
144 cursor.seek(SeekFrom::Start(DbcHeader::SIZE as u64))?;
145
146 let mut record_data = Vec::with_capacity(records_to_analyze as usize);
148 for _ in 0..records_to_analyze {
149 let mut record = Vec::with_capacity(self.header.record_size as usize);
150 let mut buffer = vec![0u8; self.header.record_size as usize];
151 cursor.read_exact(&mut buffer)?;
152
153 let mut record_cursor = Cursor::new(&buffer);
155 for _ in 0..self.header.field_count {
156 let mut buf = [0u8; 4];
157 record_cursor.read_exact(&mut buf)?;
158 let value = u32::from_le_bytes(buf);
159 record.push(value);
160 }
161
162 record_data.push(record);
163 }
164
165 let discovered_fields = self.analyze_fields(&record_data)?;
167
168 let key_field_index = if self.detect_key {
170 self.detect_key_field(&record_data, &discovered_fields)
171 } else {
172 None
173 };
174
175 let (is_valid, validation_message) = self.validate_schema(&discovered_fields)?;
177
178 Ok(DiscoveredSchema {
179 fields: discovered_fields,
180 key_field_index,
181 is_valid,
182 validation_message,
183 })
184 }
185
186 fn analyze_fields(&self, record_data: &[Vec<u32>]) -> Result<Vec<DiscoveredField>> {
188 let mut discovered_fields = Vec::with_capacity(self.header.field_count as usize);
189
190 if record_data.is_empty() {
192 return Ok(discovered_fields);
193 }
194
195 for field_index in 0..self.header.field_count as usize {
197 let field_values: Vec<u32> = record_data
199 .iter()
200 .map(|record| record[field_index])
201 .collect();
202
203 let discovered_field = self.analyze_field(field_index, &field_values)?;
205 discovered_fields.push(discovered_field);
206 }
207
208 if self.detect_arrays {
210 self.detect_array_fields(&mut discovered_fields);
211 }
212
213 Ok(discovered_fields)
214 }
215
216 fn analyze_field(&self, _field_index: usize, values: &[u32]) -> Result<DiscoveredField> {
218 let is_bool = values.iter().all(|&value| value == 0 || value == 1);
220
221 let possible_string_refs = values
223 .iter()
224 .filter(|&&value| value > 0 && value < self.string_block.size() as u32)
225 .count();
226
227 let is_string_ref = possible_string_refs > 0 && possible_string_refs >= values.len() / 2; let is_valid_string_ref = if self.validate_strings && is_string_ref {
231 let valid_strings = values
233 .iter()
234 .filter(|&&value| {
235 if value == 0 {
236 return true;
238 }
239
240 self.string_block.get_string(StringRef::new(value)).is_ok()
242 })
243 .count();
244
245 valid_strings >= values.len() * 3 / 4 } else {
247 false
248 };
249
250 let is_key_candidate = self.is_potential_key(values);
252
253 let min_value = values.iter().copied().min().unwrap_or(0);
255 let max_value = values.iter().copied().max().unwrap_or(0);
256
257 let fits_uint8 = max_value <= 0xFF;
258 let fits_int8 = min_value >= 0x80 && max_value <= 0x7F;
259 let fits_uint16 = max_value <= 0xFFFF;
260 let fits_int16 = min_value >= 0x8000 && max_value <= 0x7FFF;
261
262 let could_be_float = values.iter().any(|&value| {
264 let float_val = f32::from_bits(value);
266 float_val.is_finite()
267 && !float_val.is_subnormal()
268 && (float_val.abs() < 0.00001 || float_val.abs() > 0.00001)
269 });
270
271 let (field_type, confidence) = if is_valid_string_ref {
273 (FieldType::String, Confidence::High)
274 } else if is_string_ref {
275 (FieldType::String, Confidence::Medium)
276 } else if is_bool {
277 (FieldType::Bool, Confidence::High)
278 } else if fits_uint8 {
279 (FieldType::UInt8, Confidence::Medium)
280 } else if fits_int8 {
281 (FieldType::Int8, Confidence::Medium)
282 } else if fits_uint16 {
283 (FieldType::UInt16, Confidence::Medium)
284 } else if fits_int16 {
285 (FieldType::Int16, Confidence::Medium)
286 } else if could_be_float {
287 (FieldType::Float32, Confidence::Medium)
288 } else if values.iter().any(|&v| v > 0x7FFFFFFF) {
289 (FieldType::UInt32, Confidence::High)
291 } else {
292 (FieldType::Int32, Confidence::Low)
294 };
295
296 let sample_values = values.iter().take(10).copied().collect();
298
299 Ok(DiscoveredField {
300 field_type,
301 confidence,
302 is_key_candidate,
303 is_array: false, array_size: None, sample_values,
306 })
307 }
308
309 fn is_potential_key(&self, values: &[u32]) -> bool {
311 if values.is_empty() {
313 return false;
314 }
315
316 let unique_values: HashSet<u32> = values.iter().copied().collect();
318 if unique_values.len() != values.len() {
319 return false;
320 }
321
322 if values.contains(&0) {
324 return false;
325 }
326
327 let min_value = *values.iter().min().unwrap();
329 let max_value = *values.iter().max().unwrap();
330
331 let range = max_value - min_value + 1;
333 if range as usize <= values.len() * 2 {
334 return true;
335 }
336
337 let density = values.len() as f32 / range as f32;
339 density > 0.2 }
341
342 fn detect_array_fields(&self, fields: &mut Vec<DiscoveredField>) {
344 if fields.len() <= 1 {
345 return; }
347
348 for array_size in 2..=10 {
350 if fields.len() % array_size != 0 {
352 continue; }
354
355 let potential_arrays = fields.len() / array_size;
356 let mut is_array_pattern = true;
357
358 for a in 0..potential_arrays {
359 let base_type = fields[a * array_size].field_type;
360
361 for i in 1..array_size {
363 if fields[a * array_size + i].field_type != base_type {
364 is_array_pattern = false;
365 break;
366 }
367 }
368
369 if !is_array_pattern {
370 break;
371 }
372 }
373
374 if is_array_pattern {
375 let mut new_fields = Vec::with_capacity(potential_arrays);
377
378 for a in 0..potential_arrays {
379 let mut base_field = fields[a * array_size].clone();
380 base_field.is_array = true;
381 base_field.array_size = Some(array_size);
382 new_fields.push(base_field);
383 }
384
385 *fields = new_fields;
386 return; }
388 }
389 }
390
391 fn detect_key_field(
393 &self,
394 record_data: &[Vec<u32>],
395 fields: &[DiscoveredField],
396 ) -> Option<usize> {
397 let mut candidates: Vec<usize> = fields
399 .iter()
400 .enumerate()
401 .filter(|(_, field)| field.is_key_candidate)
402 .map(|(i, _)| i)
403 .collect();
404
405 if candidates.is_empty() {
407 for (field_index, field) in fields.iter().enumerate() {
408 if field.field_type != FieldType::UInt32 && field.field_type != FieldType::Int32 {
409 continue;
410 }
411
412 let values: Vec<u32> = record_data
414 .iter()
415 .map(|record| record[field_index])
416 .collect();
417
418 let mut is_increasing = true;
420 for i in 1..values.len() {
421 if values[i] <= values[i - 1] {
422 is_increasing = false;
423 break;
424 }
425 }
426
427 if is_increasing {
428 candidates.push(field_index);
429 }
430 }
431 }
432
433 if candidates.is_empty() {
435 for (field_index, field) in fields.iter().enumerate() {
436 if field.field_type == FieldType::UInt32 {
437 candidates.push(field_index);
438 break;
439 }
440 }
441 }
442
443 if candidates.len() == 1 {
445 return Some(candidates[0]);
446 }
447
448 candidates.sort();
450 candidates.first().copied()
451 }
452
453 fn validate_schema(&self, fields: &[DiscoveredField]) -> Result<(bool, Option<String>)> {
455 let field_count = if fields.iter().any(|f| f.is_array) {
457 fields
458 .iter()
459 .map(|f| {
460 if f.is_array {
461 f.array_size.unwrap_or(0)
462 } else {
463 1
464 }
465 })
466 .sum::<usize>() as u32
467 } else {
468 fields.len() as u32
469 };
470
471 if field_count != self.header.field_count {
472 return Ok((
473 false,
474 Some(format!(
475 "Field count mismatch: schema has {} fields, but DBC has {} fields",
476 field_count, self.header.field_count
477 )),
478 ));
479 }
480
481 let record_size = fields
483 .iter()
484 .map(|f| {
485 if f.is_array {
486 f.field_type.size() * f.array_size.unwrap_or(0)
487 } else {
488 f.field_type.size()
489 }
490 })
491 .sum::<usize>() as u32;
492
493 if record_size != self.header.record_size {
495 return Ok((
496 false,
497 Some(format!(
498 "Record size mismatch: schema defines {} bytes, but DBC has {} bytes per record",
499 record_size, self.header.record_size
500 )),
501 ));
502 }
503
504 Ok((true, None))
505 }
506
507 pub fn generate_schema(&self, name: &str) -> Result<Schema> {
509 let discovered = self.discover()?;
510 if !discovered.is_valid {
511 return Err(Error::SchemaValidation(
512 discovered
513 .validation_message
514 .unwrap_or_else(|| "Invalid discovered schema".to_string()),
515 ));
516 }
517
518 let mut schema = Schema::new(name);
519
520 for (i, field) in discovered.fields.iter().enumerate() {
522 let field_name = if field.is_key_candidate {
524 "ID".to_string()
525 } else {
526 match field.field_type {
527 FieldType::String => format!("String_{i}"),
528 FieldType::Float32 => format!("Float_{i}"),
529 FieldType::Bool => format!("Flag_{i}"),
530 FieldType::UInt32 | FieldType::Int32 => format!("Value_{i}"),
531 FieldType::UInt8 | FieldType::Int8 => format!("Byte_{i}"),
532 FieldType::UInt16 | FieldType::Int16 => format!("Short_{i}"),
533 }
534 };
535
536 if field.is_array {
537 schema.add_field(SchemaField::new_array(
538 field_name,
539 field.field_type,
540 field.array_size.unwrap_or(0),
541 ));
542 } else {
543 schema.add_field(SchemaField::new(field_name, field.field_type));
544 }
545 }
546
547 if let Some(key_index) = discovered.key_field_index {
549 schema.set_key_field_index(key_index);
550 }
551
552 Ok(schema)
553 }
554}