1use crate::{data::AnnotatedDocument, exceptions::LangExtractResult};
4use serde_json::{json, Value};
5use std::collections::HashMap;
6use crate::Extraction;
/// Output formats that [`export_document`] can produce.
///
/// The enum is a plain, field-less tag, so it derives `Eq` and `Hash` in
/// addition to `PartialEq`; this lets callers use it as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "cli", derive(clap::ValueEnum))]
pub enum ExportFormat {
    /// Plain-text report.
    Text,
    /// Standalone HTML page.
    Html,
    /// Markdown document.
    Markdown,
    /// Pretty-printed JSON.
    Json,
    /// Comma-separated values, one row per extraction.
    Csv,
}
22
/// Options controlling how [`export_document`] renders a document.
#[derive(Debug, Clone)]
pub struct ExportConfig {
    /// Output format to produce.
    pub format: ExportFormat,
    /// Whether each extraction's character interval (position) is included.
    pub show_char_intervals: bool,
    /// Whether the document text is included (read by the HTML, Markdown and JSON exports).
    pub include_text: bool,
    /// Whether extractions are marked inside the document text (HTML and Markdown exports).
    pub highlight_extractions: bool,
    /// Whether the statistics section is included (HTML, Markdown and JSON exports).
    pub include_statistics: bool,
    /// Extra CSS appended after the built-in stylesheet of the HTML export.
    pub custom_css: Option<String>,
    /// Title for the HTML and Markdown exports; a default title is used when `None`.
    pub title: Option<String>,
}
41
42impl Default for ExportConfig {
43 fn default() -> Self {
44 Self {
45 format: ExportFormat::Text,
46 show_char_intervals: false,
47 include_text: true,
48 highlight_extractions: true,
49 include_statistics: true,
50 custom_css: None,
51 title: None,
52 }
53 }
54}
55
56pub fn export_document(
58 annotated_document: &AnnotatedDocument,
59 config: &ExportConfig,
60) -> LangExtractResult<String> {
61 match config.format {
62 ExportFormat::Text => visualize_text(annotated_document, config.show_char_intervals),
63 ExportFormat::Html => export_html(annotated_document, config),
64 ExportFormat::Markdown => export_markdown(annotated_document, config),
65 ExportFormat::Json => export_json(annotated_document, config),
66 ExportFormat::Csv => export_csv(annotated_document, config),
67 }
68}
69
70pub fn visualize(
72 annotated_document: &AnnotatedDocument,
73 show_char_intervals: bool,
74) -> LangExtractResult<String> {
75 visualize_text(annotated_document, show_char_intervals)
76}
77
78fn visualize_text(
80 annotated_document: &AnnotatedDocument,
81 show_char_intervals: bool,
82) -> LangExtractResult<String> {
83 let mut result = String::new();
84
85 result.push_str("đ EXTRACTION VISUALIZATION\n");
86 result.push_str("=" .repeat(50).as_str());
87 result.push('\n');
88
89 let text = annotated_document.text.as_deref().unwrap_or("No text");
91 result.push_str(&format!("đ Document Text ({} chars):\n", text.len()));
92 result.push_str(&format!(" {}\n\n", text));
93
94 if let Some(extractions) = &annotated_document.extractions {
96 result.push_str(&format!("đ¯ Found {} Extractions:\n", extractions.len()));
97 result.push_str("-".repeat(30).as_str());
98 result.push('\n');
99
100 for (i, extraction) in extractions.iter().enumerate() {
101 result.push_str(&format!("{}. [{}] {}\n",
102 i + 1,
103 extraction.extraction_class,
104 extraction.extraction_text
105 ));
106
107 if show_char_intervals {
108 if let Some(interval) = &extraction.char_interval {
109 result.push_str(&format!(" Position: {:?}\n", interval));
110 }
111 }
112
113 if let Some(description) = &extraction.description {
114 result.push_str(&format!(" Description: {}\n", description));
115 }
116
117 result.push('\n');
118 }
119 } else {
120 result.push_str("âšī¸ No extractions found\n");
121 }
122
123 result.push_str("đ Statistics:\n");
125 result.push_str("-".repeat(15).as_str());
126 result.push('\n');
127 result.push_str(&format!(" Document ID: {}\n",
128 annotated_document.document_id.as_deref().unwrap_or("None")));
129 result.push_str(&format!(" Text Length: {} characters\n", text.len()));
130 result.push_str(&format!(" Total Extractions: {}\n", annotated_document.extraction_count()));
131
132 if let Some(extractions) = &annotated_document.extractions {
133 let mut class_counts = std::collections::HashMap::new();
135 for extraction in extractions {
136 *class_counts.entry(&extraction.extraction_class).or_insert(0) += 1;
137 }
138
139 result.push_str(" Extraction Classes:\n");
140 for (class, count) in class_counts {
141 result.push_str(&format!(" âĸ {}: {} instance(s)\n", class, count));
142 }
143 }
144
145 Ok(result)
146}
147
148fn export_html(
150 annotated_document: &AnnotatedDocument,
151 config: &ExportConfig,
152) -> LangExtractResult<String> {
153 let title = config.title.as_deref().unwrap_or("LangExtract Results");
154 let text = annotated_document.text.as_deref().unwrap_or("No text");
155
156 let mut html = String::new();
157
158 html.push_str(&format!(r#"<!DOCTYPE html>
160<html lang="en">
161<head>
162 <meta charset="UTF-8">
163 <meta name="viewport" content="width=device-width, initial-scale=1.0">
164 <title>{}</title>
165 <style>
166 body {{
167 font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
168 max-width: 1200px;
169 margin: 0 auto;
170 padding: 20px;
171 background: #f8fafc;
172 color: #334155;
173 }}
174 .container {{
175 background: white;
176 border-radius: 12px;
177 box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
178 overflow: hidden;
179 }}
180 .header {{
181 background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
182 color: white;
183 padding: 30px;
184 text-align: center;
185 }}
186 .header h1 {{
187 margin: 0;
188 font-size: 2.5em;
189 font-weight: 300;
190 }}
191 .content {{
192 padding: 30px;
193 }}
194 .section {{
195 margin-bottom: 40px;
196 }}
197 .section h2 {{
198 color: #1e293b;
199 border-bottom: 2px solid #e2e8f0;
200 padding-bottom: 10px;
201 margin-bottom: 20px;
202 }}
203 .document-text {{
204 background: #f1f5f9;
205 border-radius: 8px;
206 padding: 20px;
207 font-family: 'Monaco', 'Menlo', monospace;
208 line-height: 1.6;
209 white-space: pre-wrap;
210 position: relative;
211 margin-bottom: 20px;
212 }}
213 .extraction-highlight {{
214 background: rgba(59, 130, 246, 0.2);
215 border: 1px solid rgba(59, 130, 246, 0.4);
216 border-radius: 3px;
217 padding: 1px 2px;
218 cursor: pointer;
219 transition: all 0.2s ease;
220 }}
221 .extraction-highlight:hover {{
222 background: rgba(59, 130, 246, 0.3);
223 transform: translateY(-1px);
224 }}
225 .extractions-grid {{
226 display: grid;
227 grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
228 gap: 20px;
229 margin-bottom: 30px;
230 }}
231 .extraction-card {{
232 background: #f8fafc;
233 border: 1px solid #e2e8f0;
234 border-radius: 8px;
235 padding: 15px;
236 transition: all 0.2s ease;
237 }}
238 .extraction-card:hover {{
239 border-color: #3b82f6;
240 box-shadow: 0 4px 12px rgba(59, 130, 246, 0.15);
241 }}
242 .extraction-class {{
243 background: #3b82f6;
244 color: white;
245 padding: 4px 8px;
246 border-radius: 4px;
247 font-size: 0.8em;
248 font-weight: 600;
249 display: inline-block;
250 margin-bottom: 8px;
251 }}
252 .extraction-text {{
253 font-weight: 600;
254 color: #1e293b;
255 margin-bottom: 8px;
256 }}
257 .extraction-meta {{
258 font-size: 0.9em;
259 color: #64748b;
260 }}
261 .stats-grid {{
262 display: grid;
263 grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
264 gap: 20px;
265 }}
266 .stat-card {{
267 background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
268 color: white;
269 padding: 20px;
270 border-radius: 8px;
271 text-align: center;
272 }}
273 .stat-number {{
274 font-size: 2em;
275 font-weight: bold;
276 margin-bottom: 5px;
277 }}
278 .stat-label {{
279 opacity: 0.9;
280 font-size: 0.9em;
281 }}
282 .class-counts {{
283 background: #f1f5f9;
284 border-radius: 8px;
285 padding: 20px;
286 }}
287 .class-count-item {{
288 display: flex;
289 justify-content: space-between;
290 align-items: center;
291 padding: 8px 0;
292 border-bottom: 1px solid #e2e8f0;
293 }}
294 .class-count-item:last-child {{
295 border-bottom: none;
296 }}
297 .class-badge {{
298 background: #10b981;
299 color: white;
300 padding: 2px 6px;
301 border-radius: 12px;
302 font-size: 0.8em;
303 font-weight: 600;
304 }}
305 {}
306 </style>
307</head>
308<body>
309"#, title, config.custom_css.as_deref().unwrap_or("")));
310
311 html.push_str(&format!(r#" <div class="container">
313 <div class="header">
314 <h1>{}</h1>
315 </div>
316 <div class="content">
317"#, title));
318
319 if config.include_text {
321 html.push_str(r#" <div class="section">
322 <h2>đ Document Text</h2>
323 <div class="document-text">"#);
324
325 if config.highlight_extractions {
326 html.push_str(&highlight_text_html(text, annotated_document)?);
327 } else {
328 html.push_str(&html_escape(text));
329 }
330
331 html.push_str("</div>\n </div>\n");
332 }
333
334 if let Some(extractions) = &annotated_document.extractions {
336 html.push_str(&format!(r#" <div class="section">
337 <h2>đ¯ Extractions ({} found)</h2>
338 <div class="extractions-grid">
339"#, extractions.len()));
340
341 for extraction in extractions {
342 html.push_str(&format!(r#" <div class="extraction-card">
343 <div class="extraction-class">{}</div>
344 <div class="extraction-text">{}</div>
345"#, html_escape(&extraction.extraction_class), html_escape(&extraction.extraction_text)));
346
347 if config.show_char_intervals {
348 if let Some(interval) = &extraction.char_interval {
349 html.push_str(&format!(r#" <div class="extraction-meta">Position: {}-{}</div>
350"#, interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
351 }
352 }
353
354 if let Some(description) = &extraction.description {
355 html.push_str(&format!(r#" <div class="extraction-meta">Description: {}</div>
356"#, html_escape(description)));
357 }
358
359 html.push_str(" </div>\n");
360 }
361
362 html.push_str(" </div>\n </div>\n");
363 }
364
365 if config.include_statistics {
367 html.push_str(r#" <div class="section">
368 <h2>đ Statistics</h2>
369 <div class="stats-grid">
370"#);
371
372 let extraction_count = annotated_document.extraction_count();
373 html.push_str(&format!(r#" <div class="stat-card">
374 <div class="stat-number">{}</div>
375 <div class="stat-label">Total Extractions</div>
376 </div>
377 <div class="stat-card">
378 <div class="stat-number">{}</div>
379 <div class="stat-label">Characters</div>
380 </div>
381"#, extraction_count, text.len()));
382
383 if let Some(extractions) = &annotated_document.extractions {
384 let class_counts = count_extraction_classes(extractions);
385 html.push_str(&format!(r#" <div class="stat-card">
386 <div class="stat-number">{}</div>
387 <div class="stat-label">Unique Classes</div>
388 </div>
389"#, class_counts.len()));
390
391 html.push_str(" </div>\n");
392
393 html.push_str(r#" <h3>Extraction Classes</h3>
395 <div class="class-counts">
396"#);
397
398 for (class, count) in class_counts {
399 html.push_str(&format!(r#" <div class="class-count-item">
400 <span>{}</span>
401 <span class="class-badge">{}</span>
402 </div>
403"#, html_escape(class), count));
404 }
405
406 html.push_str(" </div>\n");
407 } else {
408 html.push_str(" </div>\n");
409 }
410
411 html.push_str(" </div>\n");
412 }
413
414 html.push_str(r#" </div>
416 </div>
417
418 <script>
419 // Add interactivity for extraction highlights
420 document.querySelectorAll('.extraction-highlight').forEach(element => {
421 element.addEventListener('click', function() {
422 const className = this.getAttribute('data-class');
423 const text = this.getAttribute('data-text');
424 alert(`Extraction: ${className}\nText: ${text}`);
425 });
426 });
427 </script>
428</body>
429</html>"#);
430
431 Ok(html)
432}
433
/// Escapes the five HTML-significant characters so `text` can be placed
/// safely in element content or inside a double- or single-quoted attribute.
///
/// `&` becomes `&amp;`, `<` becomes `&lt;`, `>` becomes `&gt;`, `"` becomes
/// `&quot;` and `'` becomes `&#x27;`. The input is scanned once, so entities
/// produced for one character are never re-escaped.
fn html_escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }
    escaped
}
442
/// Returns the largest UTF-8 character boundary of `text` that is not greater
/// than `index`.
///
/// Indices at or past the end of `text` are clamped to `text.len()`.
fn find_char_boundary(text: &str, index: usize) -> usize {
    if index >= text.len() {
        return text.len();
    }

    // Scan downwards from `index`; offset 0 is always a boundary, so the
    // search cannot come up empty.
    (0..=index)
        .rev()
        .find(|&candidate| text.is_char_boundary(candidate))
        .unwrap_or(0)
}
462
463fn highlight_text_html(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
465 if let Some(extractions) = &annotated_document.extractions {
466 let mut intervals: Vec<(usize, usize, &Extraction)> = Vec::new();
468
469 for extraction in extractions {
470 if let Some(interval) = &extraction.char_interval {
471 if let (Some(start), Some(end)) = (interval.start_pos, interval.end_pos) {
472 if start < end && end <= text.len() {
473 intervals.push((start, end, extraction));
474 }
475 }
476 }
477 }
478
479 intervals.sort_by_key(|(start, _, _)| *start);
481
482 let mut filtered_intervals = Vec::new();
484 let mut last_end = 0;
485
486 for (start, end, extraction) in intervals {
487 if start >= last_end {
488 filtered_intervals.push((start, end, extraction));
489 last_end = end;
490 } else {
491 log::debug!("Skipping overlapping extraction: '{}' at {}-{} (overlaps with previous ending at {})",
493 extraction.extraction_text, start, end, last_end);
494 }
495 }
496
497 let mut result = String::new();
499 let mut last_pos = 0;
500
501 for (start, end, extraction) in filtered_intervals {
502 let safe_start = find_char_boundary(text, start);
504 let safe_end = find_char_boundary(text, end);
505
506 if safe_start > last_pos {
508 let safe_last_pos = find_char_boundary(text, last_pos);
509 if safe_last_pos < safe_start {
510 result.push_str(&html_escape(&text[safe_last_pos..safe_start]));
511 }
512 }
513
514 if safe_start < safe_end && safe_end <= text.len() {
516 let actual_text = &text[safe_start..safe_end];
517 result.push_str(&format!(
518 r#"<span class="extraction-highlight" data-class="{}" data-text="{}">{}</span>"#,
519 html_escape(&extraction.extraction_class),
520 html_escape(&extraction.extraction_text),
521 html_escape(actual_text)
522 ));
523 last_pos = safe_end;
524 } else {
525 log::debug!("Skipping extraction with invalid UTF-8 boundaries: '{}' at {}-{}",
527 extraction.extraction_text, start, end);
528 }
529 }
530
531 if last_pos < text.len() {
533 let safe_last_pos = find_char_boundary(text, last_pos);
534 if safe_last_pos < text.len() {
535 result.push_str(&html_escape(&text[safe_last_pos..]));
536 }
537 }
538
539 Ok(result)
540 } else {
541 Ok(html_escape(text))
542 }
543}
544
545fn count_extraction_classes(extractions: &[crate::data::Extraction]) -> HashMap<&str, usize> {
547 let mut class_counts = HashMap::new();
548 for extraction in extractions {
549 *class_counts.entry(extraction.extraction_class.as_str()).or_insert(0) += 1;
550 }
551 class_counts
552}
553
554fn export_markdown(
556 annotated_document: &AnnotatedDocument,
557 config: &ExportConfig,
558) -> LangExtractResult<String> {
559 let title = config.title.as_deref().unwrap_or("LangExtract Results");
560 let text = annotated_document.text.as_deref().unwrap_or("No text");
561
562 let mut md = String::new();
563
564 md.push_str(&format!("# {}\n\n", title));
566
567 if config.include_text {
569 md.push_str("## đ Document Text\n\n");
570
571 if config.highlight_extractions {
572 md.push_str(&highlight_text_markdown(text, annotated_document)?);
573 } else {
574 md.push_str(&format!("```\n{}\n```\n", text));
575 }
576
577 md.push_str("\n");
578 }
579
580 if let Some(extractions) = &annotated_document.extractions {
582 md.push_str(&format!("## đ¯ Extractions ({} found)\n\n", extractions.len()));
583
584 for (i, extraction) in extractions.iter().enumerate() {
585 md.push_str(&format!("### {}. {}\n\n", i + 1, extraction.extraction_class));
586 md.push_str(&format!("**Text:** {}\n\n", extraction.extraction_text));
587
588 if config.show_char_intervals {
589 if let Some(interval) = &extraction.char_interval {
590 md.push_str(&format!("**Position:** {}-{}\n\n", interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
591 }
592 }
593
594 if let Some(description) = &extraction.description {
595 md.push_str(&format!("**Description:** {}\n\n", description));
596 }
597 }
598 }
599
600 if config.include_statistics {
602 md.push_str("## đ Statistics\n\n");
603
604 let extraction_count = annotated_document.extraction_count();
605 md.push_str(&format!("- **Total Extractions:** {}\n", extraction_count));
606 md.push_str(&format!("- **Text Length:** {} characters\n", text.len()));
607
608 if let Some(extractions) = &annotated_document.extractions {
609 let class_counts = count_extraction_classes(extractions);
610 md.push_str(&format!("- **Unique Classes:** {}\n\n", class_counts.len()));
611
612 md.push_str("### Extraction Classes\n\n");
613 md.push_str("| Class | Count |\n");
614 md.push_str("|-------|-------|\n");
615
616 for (class, count) in class_counts {
617 md.push_str(&format!("| {} | {} |\n", class, count));
618 }
619 }
620
621 md.push_str("\n");
622 }
623
624 Ok(md)
625}
626
627fn highlight_text_markdown(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
629 if let Some(extractions) = &annotated_document.extractions {
630 let mut result = String::new();
631 let mut last_pos = 0;
632
633 let mut sorted_extractions: Vec<_> = extractions.iter().collect();
635 sorted_extractions.sort_by_key(|e| {
636 e.char_interval.as_ref().and_then(|i| i.start_pos).unwrap_or(usize::MAX)
637 });
638
639 result.push_str("```\n");
640
641 for extraction in sorted_extractions {
642 if let Some(interval) = &extraction.char_interval {
643 if interval.start_pos.unwrap_or(0) > last_pos && interval.start_pos.unwrap_or(0) <= text.len() {
645 result.push_str(&text[last_pos..interval.start_pos.unwrap_or(0)]);
646 }
647
648 if interval.end_pos.unwrap_or(0) <= text.len() && interval.start_pos.unwrap_or(0) < interval.end_pos.unwrap_or(0) {
650 let extraction_text = &text[interval.start_pos.unwrap_or(0)..interval.end_pos.unwrap_or(0)];
651 result.push_str(&format!("**{}**", extraction_text));
652 last_pos = interval.end_pos.unwrap_or(0);
653 }
654 }
655 }
656
657 if last_pos < text.len() {
659 result.push_str(&text[last_pos..]);
660 }
661
662 result.push_str("\n```\n");
663 Ok(result)
664 } else {
665 Ok(format!("```\n{}\n```\n", text))
666 }
667}
668
669fn export_json(
671 annotated_document: &AnnotatedDocument,
672 config: &ExportConfig,
673) -> LangExtractResult<String> {
674 let mut json_data = json!({
675 "document_id": annotated_document.document_id,
676 "export_config": {
677 "format": "json",
678 "show_char_intervals": config.show_char_intervals,
679 "include_text": config.include_text,
680 "include_statistics": config.include_statistics,
681 "title": config.title
682 }
683 });
684
685 if config.include_text {
687 json_data["text"] = json!(annotated_document.text);
688 }
689
690 if let Some(extractions) = &annotated_document.extractions {
692 let extractions_json: Vec<Value> = extractions.iter().map(|extraction| {
693 let mut ext_json = json!({
694 "extraction_class": extraction.extraction_class,
695 "extraction_text": extraction.extraction_text,
696 "description": extraction.description
697 });
698
699 if config.show_char_intervals {
700 if let Some(interval) = &extraction.char_interval {
701 ext_json["char_interval"] = json!({
702 "start_char": interval.start_pos.unwrap_or(0),
703 "end_char": interval.end_pos.unwrap_or(0),
704 "alignment_status": extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string())
705 });
706 }
707 }
708
709 if let Some(group_index) = extraction.group_index {
710 ext_json["group_index"] = json!(group_index);
711 }
712
713 ext_json
714 }).collect();
715
716 json_data["extractions"] = json!(extractions_json);
717 }
718
719 if config.include_statistics {
721 let text = annotated_document.text.as_deref().unwrap_or("");
722 let mut stats = json!({
723 "total_extractions": annotated_document.extraction_count(),
724 "text_length": text.len()
725 });
726
727 if let Some(extractions) = &annotated_document.extractions {
728 let class_counts = count_extraction_classes(extractions);
729 stats["unique_classes"] = json!(class_counts.len());
730 stats["extraction_classes"] = json!(class_counts);
731 }
732
733 json_data["statistics"] = stats;
734 }
735
736 Ok(serde_json::to_string_pretty(&json_data)?)
737}
738
739fn export_csv(
741 annotated_document: &AnnotatedDocument,
742 config: &ExportConfig,
743) -> LangExtractResult<String> {
744 let mut csv = String::new();
745
746 if config.show_char_intervals {
748 csv.push_str("extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index\n");
749 } else {
750 csv.push_str("extraction_class,extraction_text,description,group_index\n");
751 }
752
753 if let Some(extractions) = &annotated_document.extractions {
755 for extraction in extractions {
756 let class = csv_escape(&extraction.extraction_class);
757 let text = csv_escape(&extraction.extraction_text);
758 let description = extraction.description.as_ref().map(|d| csv_escape(d)).unwrap_or_else(|| "".to_string());
759 let group_index = extraction.group_index.map(|i| i.to_string()).unwrap_or_else(|| "".to_string());
760
761 if config.show_char_intervals {
762 if let Some(interval) = &extraction.char_interval {
763 csv.push_str(&format!("{},{},{},{},{},{:?},{}\n",
764 class, text, description,
765 interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0),
766 extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string()), group_index));
767 } else {
768 csv.push_str(&format!("{},{},{},,,None,{}\n",
769 class, text, description, group_index));
770 }
771 } else {
772 csv.push_str(&format!("{},{},{},{}\n",
773 class, text, description, group_index));
774 }
775 }
776 }
777
778 Ok(csv)
779}
780
/// Quotes `text` for use as a single CSV field when necessary.
///
/// A field containing a comma, a double quote, a line feed or a carriage
/// return is wrapped in double quotes, with embedded double quotes doubled.
/// Any other field is returned unchanged.
fn csv_escape(text: &str) -> String {
    // A bare `\r` is a line break for many CSV readers, so it must force
    // quoting just like `\n`.
    let needs_quoting = text
        .chars()
        .any(|c| matches!(c, ',' | '"' | '\n' | '\r'));

    if needs_quoting {
        format!("\"{}\"", text.replace('"', "\"\""))
    } else {
        text.to_string()
    }
}
789
#[cfg(test)]
mod tests {
    use super::*;
    use crate::data::{AlignmentStatus, CharInterval, Extraction};
    use std::collections::HashMap;

    /// Three-extraction fixture over a short ASCII sentence.
    fn create_sample_document() -> AnnotatedDocument {
        let text = "John Smith works at TechCorp and earns $50,000.";
        let extractions = vec![
            Extraction {
                extraction_class: "person".to_string(),
                extraction_text: "John Smith".to_string(),
                char_interval: Some(CharInterval::new(Some(0), Some(10))),
                alignment_status: Some(AlignmentStatus::MatchExact),
                extraction_index: Some(0),
                group_index: Some(0),
                description: Some("Person name".to_string()),
                attributes: Some(HashMap::new()),
                token_interval: None,
            },
            Extraction {
                extraction_class: "company".to_string(),
                extraction_text: "TechCorp".to_string(),
                char_interval: Some(CharInterval::new(Some(20), Some(28))),
                alignment_status: Some(AlignmentStatus::MatchExact),
                extraction_index: Some(1),
                group_index: Some(0),
                description: None,
                attributes: Some(HashMap::new()),
                token_interval: None,
            },
            Extraction {
                extraction_class: "salary".to_string(),
                extraction_text: "$50,000".to_string(),
                char_interval: Some(CharInterval::new(Some(39), Some(46))),
                alignment_status: Some(AlignmentStatus::MatchFuzzy),
                extraction_index: Some(2),
                group_index: Some(0),
                description: Some("Annual salary".to_string()),
                attributes: Some(HashMap::new()),
                token_interval: None,
            },
        ];

        AnnotatedDocument {
            document_id: Some("test_doc".to_string()),
            text: Some(text.to_string()),
            extractions: Some(extractions),
        }
    }

    /// Extraction with only its class and text set.
    fn bare_extraction(class: &str, text: &str) -> Extraction {
        Extraction {
            extraction_class: class.to_string(),
            extraction_text: text.to_string(),
            char_interval: None,
            alignment_status: None,
            extraction_index: None,
            group_index: None,
            description: None,
            attributes: Some(HashMap::new()),
            token_interval: None,
        }
    }

    #[test]
    fn test_text_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Text,
            show_char_intervals: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("EXTRACTION VISUALIZATION"));
        assert!(result.contains("John Smith"));
        assert!(result.contains("TechCorp"));
        assert!(result.contains("$50,000"));
        assert!(result.contains("Position:"));
        assert!(result.contains("Statistics:"));
    }

    #[test]
    fn test_html_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Html,
            title: Some("Test HTML Export".to_string()),
            highlight_extractions: true,
            show_char_intervals: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("<!DOCTYPE html>"));
        assert!(result.contains("<title>Test HTML Export</title>"));
        assert!(result.contains("extraction-highlight"));
        assert!(result.contains("John Smith"));
        assert!(result.contains("TechCorp"));
        assert!(result.contains("extraction-card"));
        assert!(result.contains("stats-grid"));
        assert!(result.contains("</html>"));
    }

    #[test]
    fn test_html_export_with_custom_css() {
        let document = create_sample_document();
        let custom_css = "body { background: red; }";
        let config = ExportConfig {
            format: ExportFormat::Html,
            custom_css: Some(custom_css.to_string()),
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        assert!(result.contains(custom_css));
    }

    #[test]
    fn test_markdown_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Markdown,
            title: Some("Test Markdown".to_string()),
            show_char_intervals: true,
            highlight_extractions: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        assert!(result.starts_with("# Test Markdown"));
        assert!(result.contains("## đ Document Text"));
        assert!(result.contains("## đ¯ Extractions"));
        assert!(result.contains("### 1. person"));
        assert!(result.contains("**Text:** John Smith"));
        assert!(result.contains("**Position:** 0-10"));
        assert!(result.contains("| Class | Count |"));
        assert!(result.contains("| person | 1 |"));
    }

    #[test]
    fn test_json_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Json,
            show_char_intervals: true,
            include_text: true,
            include_statistics: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();

        assert_eq!(parsed["document_id"], "test_doc");
        assert!(parsed["text"].is_string());
        assert!(parsed["extractions"].is_array());
        assert!(parsed["statistics"].is_object());

        let extractions = parsed["extractions"].as_array().unwrap();
        assert_eq!(extractions.len(), 3);

        let first_extraction = &extractions[0];
        assert_eq!(first_extraction["extraction_class"], "person");
        assert_eq!(first_extraction["extraction_text"], "John Smith");
        assert!(first_extraction["char_interval"].is_object());

        let stats = &parsed["statistics"];
        assert_eq!(stats["total_extractions"], 3);
        assert_eq!(stats["unique_classes"], 3);
    }

    #[test]
    fn test_csv_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Csv,
            show_char_intervals: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();
        let lines: Vec<&str> = result.lines().collect();

        assert_eq!(lines[0], "extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index");

        // Header plus one row per extraction.
        assert_eq!(lines.len(), 4);
        assert!(lines[1].contains("person,John Smith"));
        assert!(lines[2].contains("company,TechCorp"));
        assert!(lines[3].contains("salary,\"$50,000\""));
        assert!(lines[1].contains("MatchExact"));
        assert!(lines[3].contains("MatchFuzzy"));
    }

    #[test]
    fn test_csv_export_without_intervals() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Csv,
            show_char_intervals: false,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();
        let lines: Vec<&str> = result.lines().collect();

        assert_eq!(lines[0], "extraction_class,extraction_text,description,group_index");

        assert!(!result.contains("start_char"));
        assert!(!result.contains("end_char"));
    }

    #[test]
    fn test_csv_escape() {
        assert_eq!(csv_escape("simple"), "simple");
        assert_eq!(csv_escape("has,comma"), "\"has,comma\"");
        assert_eq!(csv_escape("has\"quote"), "\"has\"\"quote\"");
        assert_eq!(csv_escape("has\nnewline"), "\"has\nnewline\"");
        assert_eq!(csv_escape("has,comma\"and quote"), "\"has,comma\"\"and quote\"");
    }

    #[test]
    fn test_html_escape() {
        // Expected values are the HTML entities, not the raw characters.
        assert_eq!(html_escape("simple"), "simple");
        assert_eq!(html_escape("has<tag>"), "has&lt;tag&gt;");
        assert_eq!(html_escape("has\"quote"), "has&quot;quote");
        assert_eq!(html_escape("has'apostrophe"), "has&#x27;apostrophe");
        assert_eq!(html_escape("has&ampersand"), "has&amp;ampersand");
    }

    #[test]
    fn test_export_config_defaults() {
        let config = ExportConfig::default();
        assert_eq!(config.format, ExportFormat::Text);
        assert!(!config.show_char_intervals);
        assert!(config.include_text);
        assert!(config.highlight_extractions);
        assert!(config.include_statistics);
        assert!(config.custom_css.is_none());
        assert!(config.title.is_none());
    }

    #[test]
    fn test_empty_document() {
        let document = AnnotatedDocument {
            document_id: Some("empty".to_string()),
            text: Some("".to_string()),
            extractions: None,
        };

        let config = ExportConfig::default();
        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("No extractions found"));
    }

    #[test]
    fn test_document_without_text() {
        let document = AnnotatedDocument {
            document_id: Some("no_text".to_string()),
            text: None,
            extractions: None,
        };

        let config = ExportConfig::default();
        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("No text"));
    }

    #[test]
    fn test_export_format_variants() {
        let document = create_sample_document();

        for format in [
            ExportFormat::Text,
            ExportFormat::Html,
            ExportFormat::Markdown,
            ExportFormat::Json,
            ExportFormat::Csv,
        ] {
            let config = ExportConfig {
                format,
                ..Default::default()
            };
            let result = export_document(&document, &config);
            assert!(result.is_ok(), "Format {:?} failed", format);
        }
    }

    #[test]
    fn test_highlight_text_html() {
        let document = create_sample_document();
        let text = document.text.as_ref().unwrap();

        let result = highlight_text_html(text, &document).unwrap();

        assert!(result.contains("extraction-highlight"));
        assert!(result.contains("data-class=\"person\""));
        assert!(result.contains("data-text=\"John Smith\""));
        assert!(result.contains("John Smith"));
    }

    #[test]
    fn test_count_extraction_classes() {
        let extractions = vec![
            bare_extraction("person", "John"),
            bare_extraction("person", "Jane"),
            bare_extraction("company", "TechCorp"),
        ];

        let counts = count_extraction_classes(&extractions);

        assert_eq!(counts.get("person"), Some(&2));
        assert_eq!(counts.get("company"), Some(&1));
        assert_eq!(counts.len(), 2);
    }
}