1use crate::{data::AnnotatedDocument, exceptions::LangExtractResult};
4use serde_json::{json, Value};
5use std::collections::HashMap;
6use crate::Extraction;
/// Output formats that `export_document` can produce.
///
/// The enum is fieldless, so it derives `Eq` and `Hash` in addition to
/// `PartialEq`; this lets callers use it as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ExportFormat {
    /// Plain-text report (rendered by `visualize_text`).
    Text,
    /// Standalone HTML page (rendered by `export_html`).
    Html,
    /// Markdown document (rendered by `export_markdown`).
    Markdown,
    /// Pretty-printed JSON (rendered by `export_json`).
    Json,
    /// Comma-separated values, one row per extraction (rendered by `export_csv`).
    Csv,
}
21
22#[derive(Debug, Clone)]
24pub struct ExportConfig {
25 pub format: ExportFormat,
27 pub show_char_intervals: bool,
29 pub include_text: bool,
31 pub highlight_extractions: bool,
33 pub include_statistics: bool,
35 pub custom_css: Option<String>,
37 pub title: Option<String>,
39}
40
41impl Default for ExportConfig {
42 fn default() -> Self {
43 Self {
44 format: ExportFormat::Text,
45 show_char_intervals: false,
46 include_text: true,
47 highlight_extractions: true,
48 include_statistics: true,
49 custom_css: None,
50 title: None,
51 }
52 }
53}
54
55pub fn export_document(
57 annotated_document: &AnnotatedDocument,
58 config: &ExportConfig,
59) -> LangExtractResult<String> {
60 match config.format {
61 ExportFormat::Text => visualize_text(annotated_document, config.show_char_intervals),
62 ExportFormat::Html => export_html(annotated_document, config),
63 ExportFormat::Markdown => export_markdown(annotated_document, config),
64 ExportFormat::Json => export_json(annotated_document, config),
65 ExportFormat::Csv => export_csv(annotated_document, config),
66 }
67}
68
69pub fn visualize(
71 annotated_document: &AnnotatedDocument,
72 show_char_intervals: bool,
73) -> LangExtractResult<String> {
74 visualize_text(annotated_document, show_char_intervals)
75}
76
77fn visualize_text(
79 annotated_document: &AnnotatedDocument,
80 show_char_intervals: bool,
81) -> LangExtractResult<String> {
82 let mut result = String::new();
83
84 result.push_str("đ EXTRACTION VISUALIZATION\n");
85 result.push_str("=" .repeat(50).as_str());
86 result.push('\n');
87
88 let text = annotated_document.text.as_deref().unwrap_or("No text");
90 result.push_str(&format!("đ Document Text ({} chars):\n", text.len()));
91 result.push_str(&format!(" {}\n\n", text));
92
93 if let Some(extractions) = &annotated_document.extractions {
95 result.push_str(&format!("đ¯ Found {} Extractions:\n", extractions.len()));
96 result.push_str("-".repeat(30).as_str());
97 result.push('\n');
98
99 for (i, extraction) in extractions.iter().enumerate() {
100 result.push_str(&format!("{}. [{}] {}\n",
101 i + 1,
102 extraction.extraction_class,
103 extraction.extraction_text
104 ));
105
106 if show_char_intervals {
107 if let Some(interval) = &extraction.char_interval {
108 result.push_str(&format!(" Position: {:?}\n", interval));
109 }
110 }
111
112 if let Some(description) = &extraction.description {
113 result.push_str(&format!(" Description: {}\n", description));
114 }
115
116 result.push('\n');
117 }
118 } else {
119 result.push_str("âšī¸ No extractions found\n");
120 }
121
122 result.push_str("đ Statistics:\n");
124 result.push_str("-".repeat(15).as_str());
125 result.push('\n');
126 result.push_str(&format!(" Document ID: {}\n",
127 annotated_document.document_id.as_deref().unwrap_or("None")));
128 result.push_str(&format!(" Text Length: {} characters\n", text.len()));
129 result.push_str(&format!(" Total Extractions: {}\n", annotated_document.extraction_count()));
130
131 if let Some(extractions) = &annotated_document.extractions {
132 let mut class_counts = std::collections::HashMap::new();
134 for extraction in extractions {
135 *class_counts.entry(&extraction.extraction_class).or_insert(0) += 1;
136 }
137
138 result.push_str(" Extraction Classes:\n");
139 for (class, count) in class_counts {
140 result.push_str(&format!(" âĸ {}: {} instance(s)\n", class, count));
141 }
142 }
143
144 Ok(result)
145}
146
147fn export_html(
149 annotated_document: &AnnotatedDocument,
150 config: &ExportConfig,
151) -> LangExtractResult<String> {
152 let title = config.title.as_deref().unwrap_or("LangExtract Results");
153 let text = annotated_document.text.as_deref().unwrap_or("No text");
154
155 let mut html = String::new();
156
157 html.push_str(&format!(r#"<!DOCTYPE html>
159<html lang="en">
160<head>
161 <meta charset="UTF-8">
162 <meta name="viewport" content="width=device-width, initial-scale=1.0">
163 <title>{}</title>
164 <style>
165 body {{
166 font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
167 max-width: 1200px;
168 margin: 0 auto;
169 padding: 20px;
170 background: #f8fafc;
171 color: #334155;
172 }}
173 .container {{
174 background: white;
175 border-radius: 12px;
176 box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
177 overflow: hidden;
178 }}
179 .header {{
180 background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
181 color: white;
182 padding: 30px;
183 text-align: center;
184 }}
185 .header h1 {{
186 margin: 0;
187 font-size: 2.5em;
188 font-weight: 300;
189 }}
190 .content {{
191 padding: 30px;
192 }}
193 .section {{
194 margin-bottom: 40px;
195 }}
196 .section h2 {{
197 color: #1e293b;
198 border-bottom: 2px solid #e2e8f0;
199 padding-bottom: 10px;
200 margin-bottom: 20px;
201 }}
202 .document-text {{
203 background: #f1f5f9;
204 border-radius: 8px;
205 padding: 20px;
206 font-family: 'Monaco', 'Menlo', monospace;
207 line-height: 1.6;
208 white-space: pre-wrap;
209 position: relative;
210 margin-bottom: 20px;
211 }}
212 .extraction-highlight {{
213 background: rgba(59, 130, 246, 0.2);
214 border: 1px solid rgba(59, 130, 246, 0.4);
215 border-radius: 3px;
216 padding: 1px 2px;
217 cursor: pointer;
218 transition: all 0.2s ease;
219 }}
220 .extraction-highlight:hover {{
221 background: rgba(59, 130, 246, 0.3);
222 transform: translateY(-1px);
223 }}
224 .extractions-grid {{
225 display: grid;
226 grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
227 gap: 20px;
228 margin-bottom: 30px;
229 }}
230 .extraction-card {{
231 background: #f8fafc;
232 border: 1px solid #e2e8f0;
233 border-radius: 8px;
234 padding: 15px;
235 transition: all 0.2s ease;
236 }}
237 .extraction-card:hover {{
238 border-color: #3b82f6;
239 box-shadow: 0 4px 12px rgba(59, 130, 246, 0.15);
240 }}
241 .extraction-class {{
242 background: #3b82f6;
243 color: white;
244 padding: 4px 8px;
245 border-radius: 4px;
246 font-size: 0.8em;
247 font-weight: 600;
248 display: inline-block;
249 margin-bottom: 8px;
250 }}
251 .extraction-text {{
252 font-weight: 600;
253 color: #1e293b;
254 margin-bottom: 8px;
255 }}
256 .extraction-meta {{
257 font-size: 0.9em;
258 color: #64748b;
259 }}
260 .stats-grid {{
261 display: grid;
262 grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
263 gap: 20px;
264 }}
265 .stat-card {{
266 background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
267 color: white;
268 padding: 20px;
269 border-radius: 8px;
270 text-align: center;
271 }}
272 .stat-number {{
273 font-size: 2em;
274 font-weight: bold;
275 margin-bottom: 5px;
276 }}
277 .stat-label {{
278 opacity: 0.9;
279 font-size: 0.9em;
280 }}
281 .class-counts {{
282 background: #f1f5f9;
283 border-radius: 8px;
284 padding: 20px;
285 }}
286 .class-count-item {{
287 display: flex;
288 justify-content: space-between;
289 align-items: center;
290 padding: 8px 0;
291 border-bottom: 1px solid #e2e8f0;
292 }}
293 .class-count-item:last-child {{
294 border-bottom: none;
295 }}
296 .class-badge {{
297 background: #10b981;
298 color: white;
299 padding: 2px 6px;
300 border-radius: 12px;
301 font-size: 0.8em;
302 font-weight: 600;
303 }}
304 {}
305 </style>
306</head>
307<body>
308"#, title, config.custom_css.as_deref().unwrap_or("")));
309
310 html.push_str(&format!(r#" <div class="container">
312 <div class="header">
313 <h1>{}</h1>
314 </div>
315 <div class="content">
316"#, title));
317
318 if config.include_text {
320 html.push_str(r#" <div class="section">
321 <h2>đ Document Text</h2>
322 <div class="document-text">"#);
323
324 if config.highlight_extractions {
325 html.push_str(&highlight_text_html(text, annotated_document)?);
326 } else {
327 html.push_str(&html_escape(text));
328 }
329
330 html.push_str("</div>\n </div>\n");
331 }
332
333 if let Some(extractions) = &annotated_document.extractions {
335 html.push_str(&format!(r#" <div class="section">
336 <h2>đ¯ Extractions ({} found)</h2>
337 <div class="extractions-grid">
338"#, extractions.len()));
339
340 for extraction in extractions {
341 html.push_str(&format!(r#" <div class="extraction-card">
342 <div class="extraction-class">{}</div>
343 <div class="extraction-text">{}</div>
344"#, html_escape(&extraction.extraction_class), html_escape(&extraction.extraction_text)));
345
346 if config.show_char_intervals {
347 if let Some(interval) = &extraction.char_interval {
348 html.push_str(&format!(r#" <div class="extraction-meta">Position: {}-{}</div>
349"#, interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
350 }
351 }
352
353 if let Some(description) = &extraction.description {
354 html.push_str(&format!(r#" <div class="extraction-meta">Description: {}</div>
355"#, html_escape(description)));
356 }
357
358 html.push_str(" </div>\n");
359 }
360
361 html.push_str(" </div>\n </div>\n");
362 }
363
364 if config.include_statistics {
366 html.push_str(r#" <div class="section">
367 <h2>đ Statistics</h2>
368 <div class="stats-grid">
369"#);
370
371 let extraction_count = annotated_document.extraction_count();
372 html.push_str(&format!(r#" <div class="stat-card">
373 <div class="stat-number">{}</div>
374 <div class="stat-label">Total Extractions</div>
375 </div>
376 <div class="stat-card">
377 <div class="stat-number">{}</div>
378 <div class="stat-label">Characters</div>
379 </div>
380"#, extraction_count, text.len()));
381
382 if let Some(extractions) = &annotated_document.extractions {
383 let class_counts = count_extraction_classes(extractions);
384 html.push_str(&format!(r#" <div class="stat-card">
385 <div class="stat-number">{}</div>
386 <div class="stat-label">Unique Classes</div>
387 </div>
388"#, class_counts.len()));
389
390 html.push_str(" </div>\n");
391
392 html.push_str(r#" <h3>Extraction Classes</h3>
394 <div class="class-counts">
395"#);
396
397 for (class, count) in class_counts {
398 html.push_str(&format!(r#" <div class="class-count-item">
399 <span>{}</span>
400 <span class="class-badge">{}</span>
401 </div>
402"#, html_escape(class), count));
403 }
404
405 html.push_str(" </div>\n");
406 } else {
407 html.push_str(" </div>\n");
408 }
409
410 html.push_str(" </div>\n");
411 }
412
413 html.push_str(r#" </div>
415 </div>
416
417 <script>
418 // Add interactivity for extraction highlights
419 document.querySelectorAll('.extraction-highlight').forEach(element => {
420 element.addEventListener('click', function() {
421 const className = this.getAttribute('data-class');
422 const text = this.getAttribute('data-text');
423 alert(`Extraction: ${className}\nText: ${text}`);
424 });
425 });
426 </script>
427</body>
428</html>"#);
429
430 Ok(html)
431}
432
/// Escapes the five HTML-significant characters so `text` can be placed in
/// element content or inside a double- or single-quoted attribute value.
///
/// | input | output   |
/// |-------|----------|
/// | `&`   | `&amp;`  |
/// | `<`   | `&lt;`   |
/// | `>`   | `&gt;`   |
/// | `"`   | `&quot;` |
/// | `'`   | `&#x27;` |
///
/// The input is scanned once, so an `&` produced by an earlier replacement
/// can never be escaped a second time.
fn html_escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }
    escaped
}
441
/// Returns the largest UTF-8 char boundary of `text` that is `<= index`.
///
/// An `index` at or past the end of `text` yields `text.len()`. Offset 0 is
/// always a boundary, so the backwards search cannot fail.
fn find_char_boundary(text: &str, index: usize) -> usize {
    let upper = index.min(text.len());
    (0..=upper)
        .rev()
        .find(|&candidate| text.is_char_boundary(candidate))
        .unwrap_or(0)
}
461
462fn highlight_text_html(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
464 if let Some(extractions) = &annotated_document.extractions {
465 let mut intervals: Vec<(usize, usize, &Extraction)> = Vec::new();
467
468 for extraction in extractions {
469 if let Some(interval) = &extraction.char_interval {
470 if let (Some(start), Some(end)) = (interval.start_pos, interval.end_pos) {
471 if start < end && end <= text.len() {
472 intervals.push((start, end, extraction));
473 }
474 }
475 }
476 }
477
478 intervals.sort_by_key(|(start, _, _)| *start);
480
481 let mut filtered_intervals = Vec::new();
483 let mut last_end = 0;
484
485 for (start, end, extraction) in intervals {
486 if start >= last_end {
487 filtered_intervals.push((start, end, extraction));
488 last_end = end;
489 } else {
490 log::debug!("Skipping overlapping extraction: '{}' at {}-{} (overlaps with previous ending at {})",
492 extraction.extraction_text, start, end, last_end);
493 }
494 }
495
496 let mut result = String::new();
498 let mut last_pos = 0;
499
500 for (start, end, extraction) in filtered_intervals {
501 let safe_start = find_char_boundary(text, start);
503 let safe_end = find_char_boundary(text, end);
504
505 if safe_start > last_pos {
507 let safe_last_pos = find_char_boundary(text, last_pos);
508 if safe_last_pos < safe_start {
509 result.push_str(&html_escape(&text[safe_last_pos..safe_start]));
510 }
511 }
512
513 if safe_start < safe_end && safe_end <= text.len() {
515 let actual_text = &text[safe_start..safe_end];
516 result.push_str(&format!(
517 r#"<span class="extraction-highlight" data-class="{}" data-text="{}">{}</span>"#,
518 html_escape(&extraction.extraction_class),
519 html_escape(&extraction.extraction_text),
520 html_escape(actual_text)
521 ));
522 last_pos = safe_end;
523 } else {
524 log::debug!("Skipping extraction with invalid UTF-8 boundaries: '{}' at {}-{}",
526 extraction.extraction_text, start, end);
527 }
528 }
529
530 if last_pos < text.len() {
532 let safe_last_pos = find_char_boundary(text, last_pos);
533 if safe_last_pos < text.len() {
534 result.push_str(&html_escape(&text[safe_last_pos..]));
535 }
536 }
537
538 Ok(result)
539 } else {
540 Ok(html_escape(text))
541 }
542}
543
544fn count_extraction_classes(extractions: &[crate::data::Extraction]) -> HashMap<&str, usize> {
546 let mut class_counts = HashMap::new();
547 for extraction in extractions {
548 *class_counts.entry(extraction.extraction_class.as_str()).or_insert(0) += 1;
549 }
550 class_counts
551}
552
553fn export_markdown(
555 annotated_document: &AnnotatedDocument,
556 config: &ExportConfig,
557) -> LangExtractResult<String> {
558 let title = config.title.as_deref().unwrap_or("LangExtract Results");
559 let text = annotated_document.text.as_deref().unwrap_or("No text");
560
561 let mut md = String::new();
562
563 md.push_str(&format!("# {}\n\n", title));
565
566 if config.include_text {
568 md.push_str("## đ Document Text\n\n");
569
570 if config.highlight_extractions {
571 md.push_str(&highlight_text_markdown(text, annotated_document)?);
572 } else {
573 md.push_str(&format!("```\n{}\n```\n", text));
574 }
575
576 md.push_str("\n");
577 }
578
579 if let Some(extractions) = &annotated_document.extractions {
581 md.push_str(&format!("## đ¯ Extractions ({} found)\n\n", extractions.len()));
582
583 for (i, extraction) in extractions.iter().enumerate() {
584 md.push_str(&format!("### {}. {}\n\n", i + 1, extraction.extraction_class));
585 md.push_str(&format!("**Text:** {}\n\n", extraction.extraction_text));
586
587 if config.show_char_intervals {
588 if let Some(interval) = &extraction.char_interval {
589 md.push_str(&format!("**Position:** {}-{}\n\n", interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
590 }
591 }
592
593 if let Some(description) = &extraction.description {
594 md.push_str(&format!("**Description:** {}\n\n", description));
595 }
596 }
597 }
598
599 if config.include_statistics {
601 md.push_str("## đ Statistics\n\n");
602
603 let extraction_count = annotated_document.extraction_count();
604 md.push_str(&format!("- **Total Extractions:** {}\n", extraction_count));
605 md.push_str(&format!("- **Text Length:** {} characters\n", text.len()));
606
607 if let Some(extractions) = &annotated_document.extractions {
608 let class_counts = count_extraction_classes(extractions);
609 md.push_str(&format!("- **Unique Classes:** {}\n\n", class_counts.len()));
610
611 md.push_str("### Extraction Classes\n\n");
612 md.push_str("| Class | Count |\n");
613 md.push_str("|-------|-------|\n");
614
615 for (class, count) in class_counts {
616 md.push_str(&format!("| {} | {} |\n", class, count));
617 }
618 }
619
620 md.push_str("\n");
621 }
622
623 Ok(md)
624}
625
626fn highlight_text_markdown(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
628 if let Some(extractions) = &annotated_document.extractions {
629 let mut result = String::new();
630 let mut last_pos = 0;
631
632 let mut sorted_extractions: Vec<_> = extractions.iter().collect();
634 sorted_extractions.sort_by_key(|e| {
635 e.char_interval.as_ref().and_then(|i| i.start_pos).unwrap_or(usize::MAX)
636 });
637
638 result.push_str("```\n");
639
640 for extraction in sorted_extractions {
641 if let Some(interval) = &extraction.char_interval {
642 if interval.start_pos.unwrap_or(0) > last_pos && interval.start_pos.unwrap_or(0) <= text.len() {
644 result.push_str(&text[last_pos..interval.start_pos.unwrap_or(0)]);
645 }
646
647 if interval.end_pos.unwrap_or(0) <= text.len() && interval.start_pos.unwrap_or(0) < interval.end_pos.unwrap_or(0) {
649 let extraction_text = &text[interval.start_pos.unwrap_or(0)..interval.end_pos.unwrap_or(0)];
650 result.push_str(&format!("**{}**", extraction_text));
651 last_pos = interval.end_pos.unwrap_or(0);
652 }
653 }
654 }
655
656 if last_pos < text.len() {
658 result.push_str(&text[last_pos..]);
659 }
660
661 result.push_str("\n```\n");
662 Ok(result)
663 } else {
664 Ok(format!("```\n{}\n```\n", text))
665 }
666}
667
668fn export_json(
670 annotated_document: &AnnotatedDocument,
671 config: &ExportConfig,
672) -> LangExtractResult<String> {
673 let mut json_data = json!({
674 "document_id": annotated_document.document_id,
675 "export_config": {
676 "format": "json",
677 "show_char_intervals": config.show_char_intervals,
678 "include_text": config.include_text,
679 "include_statistics": config.include_statistics,
680 "title": config.title
681 }
682 });
683
684 if config.include_text {
686 json_data["text"] = json!(annotated_document.text);
687 }
688
689 if let Some(extractions) = &annotated_document.extractions {
691 let extractions_json: Vec<Value> = extractions.iter().map(|extraction| {
692 let mut ext_json = json!({
693 "extraction_class": extraction.extraction_class,
694 "extraction_text": extraction.extraction_text,
695 "description": extraction.description
696 });
697
698 if config.show_char_intervals {
699 if let Some(interval) = &extraction.char_interval {
700 ext_json["char_interval"] = json!({
701 "start_char": interval.start_pos.unwrap_or(0),
702 "end_char": interval.end_pos.unwrap_or(0),
703 "alignment_status": extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string())
704 });
705 }
706 }
707
708 if let Some(group_index) = extraction.group_index {
709 ext_json["group_index"] = json!(group_index);
710 }
711
712 ext_json
713 }).collect();
714
715 json_data["extractions"] = json!(extractions_json);
716 }
717
718 if config.include_statistics {
720 let text = annotated_document.text.as_deref().unwrap_or("");
721 let mut stats = json!({
722 "total_extractions": annotated_document.extraction_count(),
723 "text_length": text.len()
724 });
725
726 if let Some(extractions) = &annotated_document.extractions {
727 let class_counts = count_extraction_classes(extractions);
728 stats["unique_classes"] = json!(class_counts.len());
729 stats["extraction_classes"] = json!(class_counts);
730 }
731
732 json_data["statistics"] = stats;
733 }
734
735 Ok(serde_json::to_string_pretty(&json_data)?)
736}
737
738fn export_csv(
740 annotated_document: &AnnotatedDocument,
741 config: &ExportConfig,
742) -> LangExtractResult<String> {
743 let mut csv = String::new();
744
745 if config.show_char_intervals {
747 csv.push_str("extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index\n");
748 } else {
749 csv.push_str("extraction_class,extraction_text,description,group_index\n");
750 }
751
752 if let Some(extractions) = &annotated_document.extractions {
754 for extraction in extractions {
755 let class = csv_escape(&extraction.extraction_class);
756 let text = csv_escape(&extraction.extraction_text);
757 let description = extraction.description.as_ref().map(|d| csv_escape(d)).unwrap_or_else(|| "".to_string());
758 let group_index = extraction.group_index.map(|i| i.to_string()).unwrap_or_else(|| "".to_string());
759
760 if config.show_char_intervals {
761 if let Some(interval) = &extraction.char_interval {
762 csv.push_str(&format!("{},{},{},{},{},{:?},{}\n",
763 class, text, description,
764 interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0),
765 extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string()), group_index));
766 } else {
767 csv.push_str(&format!("{},{},{},,,None,{}\n",
768 class, text, description, group_index));
769 }
770 } else {
771 csv.push_str(&format!("{},{},{},{}\n",
772 class, text, description, group_index));
773 }
774 }
775 }
776
777 Ok(csv)
778}
779
/// Quotes `text` for use as a single CSV field.
///
/// A field containing a comma, a double quote, a line feed or a carriage
/// return is wrapped in double quotes, with embedded double quotes doubled
/// (RFC 4180). Any other field is returned unchanged.
fn csv_escape(text: &str) -> String {
    let needs_quoting = text.contains(|c: char| matches!(c, ',' | '"' | '\n' | '\r'));
    if needs_quoting {
        format!("\"{}\"", text.replace('"', "\"\""))
    } else {
        text.to_string()
    }
}
788
#[cfg(test)]
mod tests {
    use super::*;
    use crate::data::{AlignmentStatus, CharInterval, Extraction};
    use std::collections::HashMap;

    /// Extraction with only class and text set; every optional field is
    /// `None` except `attributes`, which is an empty map.
    fn bare_extraction(class: &str, text: &str) -> Extraction {
        Extraction {
            extraction_class: class.to_string(),
            extraction_text: text.to_string(),
            char_interval: None,
            alignment_status: None,
            extraction_index: None,
            group_index: None,
            description: None,
            attributes: Some(HashMap::new()),
            token_interval: None,
        }
    }

    /// Three-extraction fixture (person, company, salary) over an ASCII sentence.
    fn create_sample_document() -> AnnotatedDocument {
        let text = "John Smith works at TechCorp and earns $50,000.";
        let extractions = vec![
            Extraction {
                char_interval: Some(CharInterval::new(Some(0), Some(10))),
                alignment_status: Some(AlignmentStatus::MatchExact),
                extraction_index: Some(0),
                group_index: Some(0),
                description: Some("Person name".to_string()),
                ..bare_extraction("person", "John Smith")
            },
            Extraction {
                char_interval: Some(CharInterval::new(Some(20), Some(28))),
                alignment_status: Some(AlignmentStatus::MatchExact),
                extraction_index: Some(1),
                group_index: Some(0),
                ..bare_extraction("company", "TechCorp")
            },
            Extraction {
                char_interval: Some(CharInterval::new(Some(39), Some(46))),
                alignment_status: Some(AlignmentStatus::MatchFuzzy),
                extraction_index: Some(2),
                group_index: Some(0),
                description: Some("Annual salary".to_string()),
                ..bare_extraction("salary", "$50,000")
            },
        ];

        AnnotatedDocument {
            document_id: Some("test_doc".to_string()),
            text: Some(text.to_string()),
            extractions: Some(extractions),
        }
    }

    #[test]
    fn test_text_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Text,
            show_char_intervals: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("EXTRACTION VISUALIZATION"));
        assert!(result.contains("John Smith"));
        assert!(result.contains("TechCorp"));
        assert!(result.contains("$50,000"));
        assert!(result.contains("Position:"));
        assert!(result.contains("Statistics:"));
    }

    #[test]
    fn test_html_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Html,
            title: Some("Test HTML Export".to_string()),
            highlight_extractions: true,
            show_char_intervals: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("<!DOCTYPE html>"));
        assert!(result.contains("<title>Test HTML Export</title>"));
        assert!(result.contains("extraction-highlight"));
        assert!(result.contains("John Smith"));
        assert!(result.contains("TechCorp"));
        assert!(result.contains("extraction-card"));
        assert!(result.contains("stats-grid"));
        assert!(result.contains("</html>"));
    }

    #[test]
    fn test_html_export_with_custom_css() {
        let document = create_sample_document();
        let custom_css = "body { background: red; }";
        let config = ExportConfig {
            format: ExportFormat::Html,
            custom_css: Some(custom_css.to_string()),
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        assert!(result.contains(custom_css));
    }

    #[test]
    fn test_markdown_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Markdown,
            title: Some("Test Markdown".to_string()),
            show_char_intervals: true,
            highlight_extractions: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        // Heading assertions match on the wording only, so they do not depend
        // on the decorative glyph in front of each heading.
        assert!(result.starts_with("# Test Markdown"));
        assert!(result.contains("Document Text"));
        assert!(result.contains("Extractions (3 found)"));
        assert!(result.contains("### 1. person"));
        assert!(result.contains("**Text:** John Smith"));
        assert!(result.contains("**Position:** 0-10"));
        assert!(result.contains("| Class | Count |"));
        assert!(result.contains("| person | 1 |"));
    }

    #[test]
    fn test_json_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Json,
            show_char_intervals: true,
            include_text: true,
            include_statistics: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();

        assert_eq!(parsed["document_id"], "test_doc");
        assert!(parsed["text"].is_string());
        assert!(parsed["extractions"].is_array());
        assert!(parsed["statistics"].is_object());

        let extractions = parsed["extractions"].as_array().unwrap();
        assert_eq!(extractions.len(), 3);

        let first_extraction = &extractions[0];
        assert_eq!(first_extraction["extraction_class"], "person");
        assert_eq!(first_extraction["extraction_text"], "John Smith");
        assert!(first_extraction["char_interval"].is_object());

        let stats = &parsed["statistics"];
        assert_eq!(stats["total_extractions"], 3);
        assert_eq!(stats["unique_classes"], 3);
    }

    #[test]
    fn test_csv_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Csv,
            show_char_intervals: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();
        let lines: Vec<&str> = result.lines().collect();

        assert_eq!(
            lines[0],
            "extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index"
        );

        // Header plus one row per extraction.
        assert_eq!(lines.len(), 4);
        assert!(lines[1].contains("person,John Smith"));
        assert!(lines[2].contains("company,TechCorp"));
        assert!(lines[3].contains("salary,\"$50,000\""));
        assert!(lines[1].contains("MatchExact"));
        assert!(lines[3].contains("MatchFuzzy"));
    }

    #[test]
    fn test_csv_export_without_intervals() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Csv,
            show_char_intervals: false,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();
        let lines: Vec<&str> = result.lines().collect();

        assert_eq!(lines[0], "extraction_class,extraction_text,description,group_index");

        assert!(!result.contains("start_char"));
        assert!(!result.contains("end_char"));
    }

    #[test]
    fn test_csv_escape() {
        assert_eq!(csv_escape("simple"), "simple");
        assert_eq!(csv_escape("has,comma"), "\"has,comma\"");
        assert_eq!(csv_escape("has\"quote"), "\"has\"\"quote\"");
        assert_eq!(csv_escape("has\nnewline"), "\"has\nnewline\"");
        assert_eq!(csv_escape("has,comma\"and quote"), "\"has,comma\"\"and quote\"");
    }

    #[test]
    fn test_html_escape() {
        assert_eq!(html_escape("simple"), "simple");
        assert_eq!(html_escape("has<tag>"), "has&lt;tag&gt;");
        assert_eq!(html_escape("has\"quote"), "has&quot;quote");
        assert_eq!(html_escape("has'apostrophe"), "has&#x27;apostrophe");
        assert_eq!(html_escape("has&ampersand"), "has&amp;ampersand");
    }

    #[test]
    fn test_find_char_boundary() {
        // "héllo": the two-byte 'é' occupies bytes 1..3.
        let text = "h\u{e9}llo";
        assert_eq!(find_char_boundary(text, 1), 1);
        assert_eq!(find_char_boundary(text, 2), 1);
        assert_eq!(find_char_boundary(text, 3), 3);
        assert_eq!(find_char_boundary(text, 99), text.len());
        assert_eq!(find_char_boundary("", 0), 0);
    }

    #[test]
    fn test_export_config_defaults() {
        let config = ExportConfig::default();
        assert_eq!(config.format, ExportFormat::Text);
        assert!(!config.show_char_intervals);
        assert!(config.include_text);
        assert!(config.highlight_extractions);
        assert!(config.include_statistics);
        assert!(config.custom_css.is_none());
        assert!(config.title.is_none());
    }

    #[test]
    fn test_empty_document() {
        let document = AnnotatedDocument {
            document_id: Some("empty".to_string()),
            text: Some("".to_string()),
            extractions: None,
        };

        let config = ExportConfig::default();
        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("No extractions found"));
    }

    #[test]
    fn test_document_without_text() {
        let document = AnnotatedDocument {
            document_id: Some("no_text".to_string()),
            text: None,
            extractions: None,
        };

        let config = ExportConfig::default();
        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("No text"));
    }

    #[test]
    fn test_export_format_variants() {
        let document = create_sample_document();

        for format in [
            ExportFormat::Text,
            ExportFormat::Html,
            ExportFormat::Markdown,
            ExportFormat::Json,
            ExportFormat::Csv,
        ] {
            let config = ExportConfig {
                format,
                ..Default::default()
            };
            let result = export_document(&document, &config);
            assert!(result.is_ok(), "Format {:?} failed", format);
        }
    }

    #[test]
    fn test_highlight_text_html() {
        let document = create_sample_document();
        let text = document.text.as_ref().unwrap();

        let result = highlight_text_html(text, &document).unwrap();

        assert!(result.contains("extraction-highlight"));
        assert!(result.contains("data-class=\"person\""));
        assert!(result.contains("data-text=\"John Smith\""));
        assert!(result.contains("John Smith"));
    }

    #[test]
    fn test_count_extraction_classes() {
        let extractions = vec![
            bare_extraction("person", "John"),
            bare_extraction("person", "Jane"),
            bare_extraction("company", "TechCorp"),
        ];

        let counts = count_extraction_classes(&extractions);

        assert_eq!(counts.get("person"), Some(&2));
        assert_eq!(counts.get("company"), Some(&1));
        assert_eq!(counts.len(), 2);
    }
}