1use anyhow::Result;
6use std::collections::HashMap;
7
/// Data formats the detector can report.
///
/// The enum is fieldless, so it also derives `Eq` and `Hash`; this lets a
/// format be used directly as a `HashMap`/`HashSet` key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum DataFormat {
    /// HTML markup.
    HTML,
    /// Generic XML markup.
    XML,
    /// A JSON document.
    JSON,
    /// JSON Lines: several JSON values separated by newlines.
    JSONL,
    /// Comma-separated values.
    CSV,
    /// Tab-separated values.
    TSV,
    /// Markdown text.
    Markdown,
    /// Text with no recognised structure.
    PlainText,
    /// No detection has been performed yet.
    Unknown,
}
20
21#[derive(Debug, Clone)]
22pub struct StructuralPattern {
23 pub depth: usize, pub max_depth: usize, pub char_frequencies: HashMap<char, usize>,
26 pub token_counts: HashMap<String, usize>, pub line_patterns: Vec<LinePattern>,
28 pub block_sizes: Vec<usize>, pub average_spacing: f32, }
31
/// Per-line measurements recorded during structure analysis.
#[derive(Debug, Clone)]
pub struct LinePattern {
    /// Nesting depth once the whole line has been processed.
    pub depth: usize,
    /// Number of opening tags / brackets found on the line.
    pub opener_count: usize,
    /// Number of closing tags / brackets found on the line.
    pub closer_count: usize,
    /// Length of the line in bytes (line terminator excluded).
    pub text_length: usize,
    /// Number of `' '` characters on the line.
    pub space_count: usize,
    /// Whether the line contains a `':'`.
    pub has_colon: bool,
    /// Whether the line contains an `'='`.
    pub has_equals: bool,
    /// Number of `','` characters on the line.
    pub comma_count: usize,
}
43
/// A blank-line-delimited run of lines, labelled with a guessed speaker.
#[derive(Debug, Clone)]
pub struct ConversationBlock {
    /// Zero-based index of the block's first line.
    pub start_line: usize,
    /// `start_line` plus the number of lines in the block, i.e. one past the
    /// block's last line.
    pub end_line: usize,
    /// Depth of the most recently recorded line when the block was emitted.
    pub depth: usize,
    /// Speaker label derived from the block's first line, or a fallback
    /// category when no speaker marker is present.
    pub participant: String,
    /// Size in bytes of the block's lines joined with `'\n'`.
    pub content_size: usize,
    /// Signature of the form `d{depth}_s{content_size}_l{line_count}`.
    pub pattern_signature: String,
}
53
54pub struct UniversalFormatDetector {
55 pattern: StructuralPattern,
56 format: DataFormat,
57 conversations: Vec<ConversationBlock>,
58 participant_patterns: HashMap<String, usize>, }
60
61impl Default for UniversalFormatDetector {
62 fn default() -> Self {
63 Self::new()
64 }
65}
66
67impl UniversalFormatDetector {
68 pub fn new() -> Self {
69 Self {
70 pattern: StructuralPattern {
71 depth: 0,
72 max_depth: 0,
73 char_frequencies: HashMap::new(),
74 token_counts: HashMap::new(),
75 line_patterns: Vec::new(),
76 block_sizes: Vec::new(),
77 average_spacing: 0.0,
78 },
79 format: DataFormat::Unknown,
80 conversations: Vec::new(),
81 participant_patterns: HashMap::new(),
82 }
83 }
84
85 pub fn detect_format(&mut self, content: &str) -> DataFormat {
87 for ch in content.chars() {
89 *self.pattern.char_frequencies.entry(ch).or_default() += 1;
90 }
91
92 let angle_brackets = self.pattern.char_frequencies.get(&'<').unwrap_or(&0)
93 + self.pattern.char_frequencies.get(&'>').unwrap_or(&0);
94 let curly_braces = self.pattern.char_frequencies.get(&'{').unwrap_or(&0)
95 + self.pattern.char_frequencies.get(&'}').unwrap_or(&0);
96 let commas = self.pattern.char_frequencies.get(&',').unwrap_or(&0);
97 let newlines = self.pattern.char_frequencies.get(&'\n').unwrap_or(&0);
98
99 let total_chars = content.len();
101
102 let lower_content = content.to_lowercase();
104 if lower_content.contains("<html")
105 || lower_content.contains("<!doctype")
106 || lower_content.contains("<div")
107 || lower_content.contains("<span")
108 || lower_content.contains("<p>")
109 || lower_content.contains("<br")
110 {
111 self.format = DataFormat::HTML;
112 } else if angle_brackets > total_chars / 20 {
113 self.format = DataFormat::XML;
115 } else if curly_braces > total_chars / 30 {
116 if newlines > &0 && curly_braces / newlines > 1 {
119 self.format = DataFormat::JSONL;
120 } else {
121 self.format = DataFormat::JSON;
122 }
123 } else if *commas > total_chars / 15 && *newlines > 0 {
124 let tabs = self.pattern.char_frequencies.get(&'\t').unwrap_or(&0);
126 if *tabs > commas / 2 {
127 self.format = DataFormat::TSV;
128 } else {
129 self.format = DataFormat::CSV;
130 }
131 } else if content.contains("```") || content.contains("##") {
132 self.format = DataFormat::Markdown;
133 } else {
134 self.format = DataFormat::PlainText;
135 }
136
137 self.format.clone()
138 }
139
140 pub fn analyze_structure(&mut self, content: &str) -> Result<()> {
142 let mut current_depth = 0;
143 let mut total_spaces = 0;
144 let mut line_count = 0;
145 let mut current_block = Vec::new();
146
147 for (line_num, line) in content.lines().enumerate() {
148 let mut line_pattern = LinePattern {
149 depth: current_depth,
150 opener_count: 0,
151 closer_count: 0,
152 text_length: line.len(),
153 space_count: line.chars().filter(|&c| c == ' ').count(),
154 has_colon: line.contains(':'),
155 has_equals: line.contains('='),
156 comma_count: line.chars().filter(|&c| c == ',').count(),
157 };
158
159 match self.format {
161 DataFormat::HTML | DataFormat::XML => {
162 let chars: Vec<char> = line.chars().collect();
164 let mut i = 0;
165 while i < chars.len() {
166 if chars[i] == '<' {
167 if i + 1 < chars.len() && chars[i + 1] == '/' {
169 line_pattern.closer_count += 1;
170 current_depth = current_depth.saturating_sub(1);
171 while i < chars.len() && chars[i] != '>' {
173 i += 1;
174 }
175 } else {
176 let mut self_closing = false;
178 let mut j = i + 1;
179 while j < chars.len() && chars[j] != '>' {
180 j += 1;
181 }
182 if j > 0 && chars[j.saturating_sub(1)] == '/' {
183 self_closing = true;
184 }
185 if self_closing {
186 i = j;
188 } else {
189 line_pattern.opener_count += 1;
191 current_depth += 1;
192 self.pattern.max_depth =
194 self.pattern.max_depth.max(current_depth);
195 i = j;
196 }
197 }
198 }
199 i += 1;
200 }
201 }
202 DataFormat::JSON | DataFormat::JSONL => {
203 for ch in line.chars() {
205 match ch {
206 '{' | '[' => {
207 line_pattern.opener_count += 1;
208 current_depth += 1;
209 }
210 '}' | ']' => {
211 line_pattern.closer_count += 1;
212 current_depth = current_depth.saturating_sub(1);
213 }
214 _ => {}
215 }
216 }
217 }
218 DataFormat::CSV | DataFormat::TSV => {
219 current_depth = 0;
221 }
222 _ => {}
223 }
224
225 line_pattern.depth = current_depth;
226 self.pattern.max_depth = self.pattern.max_depth.max(current_depth);
227
228 if line.trim().is_empty() {
230 if !current_block.is_empty() {
231 self.pattern.block_sizes.push(current_block.len());
232
233 self.detect_conversation_block(¤t_block, line_num - current_block.len());
235 current_block.clear();
236 }
237 } else {
238 current_block.push(line.to_string());
239 }
240
241 total_spaces += line_pattern.space_count;
242 line_count += 1;
243
244 self.pattern.line_patterns.push(line_pattern);
245 }
246
247 if !current_block.is_empty() {
249 self.pattern.block_sizes.push(current_block.len());
250 self.detect_conversation_block(¤t_block, line_count - current_block.len());
251 }
252
253 self.pattern.average_spacing = if line_count > 0 {
254 total_spaces as f32 / line_count as f32
255 } else {
256 0.0
257 };
258
259 Ok(())
260 }
261
262 fn detect_conversation_block(&mut self, block: &[String], start_line: usize) {
264 let first_line = &block[0];
266 let block_text = block.join("\n");
267
268 let participant = if first_line.contains("user:") || first_line.contains("User:") {
270 "User"
271 } else if first_line.contains("assistant:") || first_line.contains("Assistant:") {
272 "Assistant"
273 } else if first_line.contains("human:") || first_line.contains("Human:") {
274 "Human"
275 } else if first_line.contains("ai:") || first_line.contains("AI:") {
276 "AI"
277 } else if first_line.contains("claude:") || first_line.contains("Claude:") {
278 "Claude"
279 } else if first_line.contains("gpt:") || first_line.contains("GPT:") {
280 "GPT"
281 } else {
282 if block.len() > 3 && self.pattern.average_spacing > 10.0 {
284 "Content" } else {
286 "Metadata"
287 }
288 };
289
290 let signature = format!(
292 "d{}_s{}_l{}",
293 self.pattern
294 .line_patterns
295 .last()
296 .map(|p| p.depth)
297 .unwrap_or(0),
298 block_text.len(),
299 block.len()
300 );
301
302 *self
303 .participant_patterns
304 .entry(signature.clone())
305 .or_default() += 1;
306
307 self.conversations.push(ConversationBlock {
308 start_line,
309 end_line: start_line + block.len(),
310 depth: self
311 .pattern
312 .line_patterns
313 .last()
314 .map(|p| p.depth)
315 .unwrap_or(0),
316 participant: participant.to_string(),
317 content_size: block_text.len(),
318 pattern_signature: signature,
319 });
320 }
321
322 pub fn tokenize_structure(&mut self) -> HashMap<String, u8> {
324 let mut tokens = HashMap::new();
325 let mut next_token: u8 = 0x90; let mut pattern_freq: Vec<(String, usize)> = self
329 .participant_patterns
330 .iter()
331 .map(|(k, v)| (k.clone(), *v))
332 .collect();
333 pattern_freq.sort_by_key(|(_, count)| std::cmp::Reverse(*count));
334
335 for (pattern, count) in pattern_freq.iter().take(30) {
337 if *count > 2 {
338 tokens.insert(pattern.clone(), next_token);
340 next_token += 1;
341 }
342 }
343
344 for line in &self.pattern.line_patterns {
346 if line.has_colon || line.has_equals {
347 }
350 }
351
352 tokens
353 }
354
355 pub fn get_conversation_summary(&self) -> String {
357 let mut summary = String::new();
358
359 summary.push_str(&format!("Format: {:?}\n", self.format));
360 summary.push_str(&format!("Max depth: {}\n", self.pattern.max_depth));
361 summary.push_str(&format!(
362 "Average spacing: {:.1}\n",
363 self.pattern.average_spacing
364 ));
365 summary.push_str(&format!("Total blocks: {}\n", self.conversations.len()));
366
367 let mut participant_counts: HashMap<String, usize> = HashMap::new();
369 for conv in &self.conversations {
370 *participant_counts
371 .entry(conv.participant.clone())
372 .or_default() += 1;
373 }
374
375 summary.push_str("\nParticipants:\n");
376 for (participant, count) in participant_counts {
377 summary.push_str(&format!(" {}: {} blocks\n", participant, count));
378 }
379
380 let mut largest_blocks = self.conversations.clone();
382 largest_blocks.sort_by_key(|b| std::cmp::Reverse(b.content_size));
383
384 summary.push_str("\nLargest conversation blocks:\n");
385 for block in largest_blocks.iter().take(3) {
386 summary.push_str(&format!(
387 " Line {}-{}: {} ({} bytes)\n",
388 block.start_line, block.end_line, block.participant, block.content_size
389 ));
390 }
391
392 summary
393 }
394
395 pub fn get_dominant_speaker(&self) -> Option<(String, usize)> {
397 let mut speaker_bytes: HashMap<String, usize> = HashMap::new();
398
399 for conv in &self.conversations {
400 *speaker_bytes.entry(conv.participant.clone()).or_default() += conv.content_size;
401 }
402
403 speaker_bytes.into_iter().max_by_key(|(_, bytes)| *bytes)
404 }
405}
406
407pub fn demo_format_detection() -> Result<()> {
409 println!("🔍 Universal Format Detector Demo\n");
410 println!("{}\n", "=".repeat(60));
411
412 let test_cases = vec![
414 (
415 "XML Chat",
416 r#"<conversation>
417 <message>
418 <user>Human</user>
419 <text>Hello, can you help me?</text>
420 </message>
421 <message>
422 <user>Assistant</user>
423 <text>Of course! What do you need help with?</text>
424 </message>
425</conversation>"#,
426 ),
427 (
428 "JSON Chat",
429 r#"{
430 "messages": [
431 {
432 "role": "user",
433 "content": "What's the weather?"
434 },
435 {
436 "role": "assistant",
437 "content": "I don't have access to weather data."
438 }
439 ]
440}"#,
441 ),
442 (
443 "Plain Text Chat",
444 r#"User: How do I implement a binary search?
445
446Assistant: Here's how to implement binary search:
4471. Start with sorted array
4482. Find middle element
4493. Compare with target
4504. Narrow search range
451
452User: Can you show me code?
453
454Assistant: Sure! Here's a Python example..."#,
455 ),
456 ];
457
458 for (name, content) in test_cases {
459 println!("Testing: {}\n", name);
460
461 let mut detector = UniversalFormatDetector::new();
462 let format = detector.detect_format(content);
463 detector.analyze_structure(content)?;
464
465 println!("Detected format: {:?}", format);
466 println!("{}", detector.get_conversation_summary());
467
468 if let Some((speaker, bytes)) = detector.get_dominant_speaker() {
469 println!("Dominant speaker: {} ({} bytes)\n", speaker, bytes);
470 }
471
472 let tokens = detector.tokenize_structure();
473 if !tokens.is_empty() {
474 println!("Structural tokens discovered:");
475 for (pattern, token) in tokens.iter().take(5) {
476 println!(" 0x{:02X} = {}", token, pattern);
477 }
478 }
479
480 println!("{}\n", "-".repeat(40));
481 }
482
483 Ok(())
484}
485
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs format detection on a fresh detector so cases cannot influence
    /// each other through accumulated character counts.
    fn detect(content: &str) -> DataFormat {
        UniversalFormatDetector::new().detect_format(content)
    }

    #[test]
    fn test_format_detection() {
        assert_eq!(detect("<root><child>data</child></root>"), DataFormat::XML);
        assert_eq!(
            detect(r#"{"key": "value", "nested": {"item": 1}}"#),
            DataFormat::JSON
        );
        assert_eq!(
            detect("name,age,city\nAlice,30,NYC\nBob,25,LA"),
            DataFormat::CSV
        );
    }

    #[test]
    fn test_format_detection_other_formats() {
        assert_eq!(
            detect("<html><body><p>hi</p></body></html>"),
            DataFormat::HTML
        );
        assert_eq!(
            detect("{\"a\":1}\n{\"a\":2}\n{\"a\":3}"),
            DataFormat::JSONL
        );
        assert_eq!(detect("## Title\n\nSome text here"), DataFormat::Markdown);
        assert_eq!(detect("just some words"), DataFormat::PlainText);
        assert_eq!(detect(""), DataFormat::PlainText);
    }

    #[test]
    fn test_depth_tracking() {
        // NOTE(review): this test has historically been skipped on CI; the
        // reason is not recorded here. The guard is kept as-is — confirm
        // whether it is still needed.
        if std::env::var("CI").is_ok() || std::env::var("GITHUB_ACTIONS").is_ok() {
            println!("Skipping depth tracking test in CI environment");
            return;
        }

        let mut detector = UniversalFormatDetector::new();
        detector.format = DataFormat::XML;

        detector
            .analyze_structure("<a><b><c>deep</c></b></a>")
            .expect("analysing an in-memory string should not fail");

        // Three nested elements: <a>, <b>, <c>.
        assert_eq!(detector.pattern.max_depth, 3);
    }
}