langextract_rust/
visualization.rs

1//! Visualization utilities for annotated documents.
2
3use crate::{data::AnnotatedDocument, exceptions::LangExtractResult};
4use serde_json::{json, Value};
5use std::collections::HashMap;
6use crate::Extraction;
/// Export format options for visualization.
///
/// The variant stored in `ExportConfig::format` selects which renderer
/// `export_document` dispatches to. The enum is a plain, field-less tag, so it
/// is `Copy` and can be compared, hashed and used as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ExportFormat {
    /// Simple text format (existing functionality)
    Text,
    /// Rich HTML with highlighting and interactivity
    Html,
    /// Structured markdown with summaries
    Markdown,
    /// Raw JSON export for analysis
    Json,
    /// CSV export for spreadsheet analysis
    Csv,
}
21
/// Configuration for visualization exports.
///
/// One value of this struct is passed to `export_document`; each exporter
/// reads only the fields that apply to its format.
#[derive(Debug, Clone)]
pub struct ExportConfig {
    /// Export format to use; decides which exporter `export_document` calls.
    pub format: ExportFormat,
    /// Show character intervals in output (a `Position` line in text, HTML and
    /// Markdown; extra columns in CSV; a `char_interval` object in JSON).
    pub show_char_intervals: bool,
    /// Include original text in export (read by the HTML, Markdown and JSON
    /// exporters; the text exporter is only handed `show_char_intervals`).
    pub include_text: bool,
    /// Highlight extractions in text (for HTML/Markdown)
    pub highlight_extractions: bool,
    /// Include extraction statistics (read by the HTML, Markdown and JSON exporters).
    pub include_statistics: bool,
    /// Custom CSS for HTML export; inserted at the end of the built-in
    /// `<style>` block.
    pub custom_css: Option<String>,
    /// Title for the export; the HTML and Markdown exporters fall back to
    /// "LangExtract Results" when this is `None`.
    pub title: Option<String>,
}
40
41impl Default for ExportConfig {
42    fn default() -> Self {
43        Self {
44            format: ExportFormat::Text,
45            show_char_intervals: false,
46            include_text: true,
47            highlight_extractions: true,
48            include_statistics: true,
49            custom_css: None,
50            title: None,
51        }
52    }
53}
54
55/// Export an annotated document in the specified format
56pub fn export_document(
57    annotated_document: &AnnotatedDocument,
58    config: &ExportConfig,
59) -> LangExtractResult<String> {
60    match config.format {
61        ExportFormat::Text => visualize_text(annotated_document, config.show_char_intervals),
62        ExportFormat::Html => export_html(annotated_document, config),
63        ExportFormat::Markdown => export_markdown(annotated_document, config),
64        ExportFormat::Json => export_json(annotated_document, config),
65        ExportFormat::Csv => export_csv(annotated_document, config),
66    }
67}
68
/// Visualize an annotated document (legacy function for backward compatibility).
///
/// Delegates directly to `visualize_text`, so the result is the same
/// plain-text report that `export_document` produces for `ExportFormat::Text`.
///
/// * `annotated_document` - document whose text and extractions are rendered.
/// * `show_char_intervals` - when `true`, every extraction that carries a
///   `char_interval` gets an additional `Position:` line.
pub fn visualize(
    annotated_document: &AnnotatedDocument,
    show_char_intervals: bool,
) -> LangExtractResult<String> {
    visualize_text(annotated_document, show_char_intervals)
}
76
77/// Export as simple text format (original implementation)
78fn visualize_text(
79    annotated_document: &AnnotatedDocument,
80    show_char_intervals: bool,
81) -> LangExtractResult<String> {
82    let mut result = String::new();
83    
84    result.push_str("📄 EXTRACTION VISUALIZATION\n");
85    result.push_str("=" .repeat(50).as_str());
86    result.push('\n');
87    
88    // Show document text
89    let text = annotated_document.text.as_deref().unwrap_or("No text");
90    result.push_str(&format!("📝 Document Text ({} chars):\n", text.len()));
91    result.push_str(&format!("   {}\n\n", text));
92    
93    // Show extractions
94    if let Some(extractions) = &annotated_document.extractions {
95        result.push_str(&format!("đŸŽ¯ Found {} Extractions:\n", extractions.len()));
96        result.push_str("-".repeat(30).as_str());
97        result.push('\n');
98        
99        for (i, extraction) in extractions.iter().enumerate() {
100            result.push_str(&format!("{}. [{}] {}\n", 
101                i + 1, 
102                extraction.extraction_class, 
103                extraction.extraction_text
104            ));
105            
106            if show_char_intervals {
107                if let Some(interval) = &extraction.char_interval {
108                    result.push_str(&format!("   Position: {:?}\n", interval));
109                }
110            }
111            
112            if let Some(description) = &extraction.description {
113                result.push_str(&format!("   Description: {}\n", description));
114            }
115            
116            result.push('\n');
117        }
118    } else {
119        result.push_str("â„šī¸  No extractions found\n");
120    }
121    
122    // Show statistics
123    result.push_str("📊 Statistics:\n");
124    result.push_str("-".repeat(15).as_str());
125    result.push('\n');
126    result.push_str(&format!("   Document ID: {}\n", 
127        annotated_document.document_id.as_deref().unwrap_or("None")));
128    result.push_str(&format!("   Text Length: {} characters\n", text.len()));
129    result.push_str(&format!("   Total Extractions: {}\n", annotated_document.extraction_count()));
130    
131    if let Some(extractions) = &annotated_document.extractions {
132        // Count unique extraction classes
133        let mut class_counts = std::collections::HashMap::new();
134        for extraction in extractions {
135            *class_counts.entry(&extraction.extraction_class).or_insert(0) += 1;
136        }
137        
138        result.push_str("   Extraction Classes:\n");
139        for (class, count) in class_counts {
140            result.push_str(&format!("     â€ĸ {}: {} instance(s)\n", class, count));
141        }
142    }
143    
144    Ok(result)
145}
146
147/// Export as rich HTML with highlighting and interactivity
148fn export_html(
149    annotated_document: &AnnotatedDocument,
150    config: &ExportConfig,
151) -> LangExtractResult<String> {
152    let title = config.title.as_deref().unwrap_or("LangExtract Results");
153    let text = annotated_document.text.as_deref().unwrap_or("No text");
154    
155    let mut html = String::new();
156    
157    // HTML Header
158    html.push_str(&format!(r#"<!DOCTYPE html>
159<html lang="en">
160<head>
161    <meta charset="UTF-8">
162    <meta name="viewport" content="width=device-width, initial-scale=1.0">
163    <title>{}</title>
164    <style>
165        body {{
166            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
167            max-width: 1200px;
168            margin: 0 auto;
169            padding: 20px;
170            background: #f8fafc;
171            color: #334155;
172        }}
173        .container {{
174            background: white;
175            border-radius: 12px;
176            box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
177            overflow: hidden;
178        }}
179        .header {{
180            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
181            color: white;
182            padding: 30px;
183            text-align: center;
184        }}
185        .header h1 {{
186            margin: 0;
187            font-size: 2.5em;
188            font-weight: 300;
189        }}
190        .content {{
191            padding: 30px;
192        }}
193        .section {{
194            margin-bottom: 40px;
195        }}
196        .section h2 {{
197            color: #1e293b;
198            border-bottom: 2px solid #e2e8f0;
199            padding-bottom: 10px;
200            margin-bottom: 20px;
201        }}
202        .document-text {{
203            background: #f1f5f9;
204            border-radius: 8px;
205            padding: 20px;
206            font-family: 'Monaco', 'Menlo', monospace;
207            line-height: 1.6;
208            white-space: pre-wrap;
209            position: relative;
210            margin-bottom: 20px;
211        }}
212        .extraction-highlight {{
213            background: rgba(59, 130, 246, 0.2);
214            border: 1px solid rgba(59, 130, 246, 0.4);
215            border-radius: 3px;
216            padding: 1px 2px;
217            cursor: pointer;
218            transition: all 0.2s ease;
219        }}
220        .extraction-highlight:hover {{
221            background: rgba(59, 130, 246, 0.3);
222            transform: translateY(-1px);
223        }}
224        .extractions-grid {{
225            display: grid;
226            grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
227            gap: 20px;
228            margin-bottom: 30px;
229        }}
230        .extraction-card {{
231            background: #f8fafc;
232            border: 1px solid #e2e8f0;
233            border-radius: 8px;
234            padding: 15px;
235            transition: all 0.2s ease;
236        }}
237        .extraction-card:hover {{
238            border-color: #3b82f6;
239            box-shadow: 0 4px 12px rgba(59, 130, 246, 0.15);
240        }}
241        .extraction-class {{
242            background: #3b82f6;
243            color: white;
244            padding: 4px 8px;
245            border-radius: 4px;
246            font-size: 0.8em;
247            font-weight: 600;
248            display: inline-block;
249            margin-bottom: 8px;
250        }}
251        .extraction-text {{
252            font-weight: 600;
253            color: #1e293b;
254            margin-bottom: 8px;
255        }}
256        .extraction-meta {{
257            font-size: 0.9em;
258            color: #64748b;
259        }}
260        .stats-grid {{
261            display: grid;
262            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
263            gap: 20px;
264        }}
265        .stat-card {{
266            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
267            color: white;
268            padding: 20px;
269            border-radius: 8px;
270            text-align: center;
271        }}
272        .stat-number {{
273            font-size: 2em;
274            font-weight: bold;
275            margin-bottom: 5px;
276        }}
277        .stat-label {{
278            opacity: 0.9;
279            font-size: 0.9em;
280        }}
281        .class-counts {{
282            background: #f1f5f9;
283            border-radius: 8px;
284            padding: 20px;
285        }}
286        .class-count-item {{
287            display: flex;
288            justify-content: space-between;
289            align-items: center;
290            padding: 8px 0;
291            border-bottom: 1px solid #e2e8f0;
292        }}
293        .class-count-item:last-child {{
294            border-bottom: none;
295        }}
296        .class-badge {{
297            background: #10b981;
298            color: white;
299            padding: 2px 6px;
300            border-radius: 12px;
301            font-size: 0.8em;
302            font-weight: 600;
303        }}
304        {}
305    </style>
306</head>
307<body>
308"#, title, config.custom_css.as_deref().unwrap_or("")));
309
310    // Header
311    html.push_str(&format!(r#"    <div class="container">
312        <div class="header">
313            <h1>{}</h1>
314        </div>
315        <div class="content">
316"#, title));
317
318    // Document text section (with highlighting if enabled)
319    if config.include_text {
320        html.push_str(r#"            <div class="section">
321                <h2>📄 Document Text</h2>
322                <div class="document-text">"#);
323        
324        if config.highlight_extractions {
325            html.push_str(&highlight_text_html(text, annotated_document)?);
326        } else {
327            html.push_str(&html_escape(text));
328        }
329        
330        html.push_str("</div>\n            </div>\n");
331    }
332
333    // Extractions section
334    if let Some(extractions) = &annotated_document.extractions {
335        html.push_str(&format!(r#"            <div class="section">
336                <h2>đŸŽ¯ Extractions ({} found)</h2>
337                <div class="extractions-grid">
338"#, extractions.len()));
339
340        for extraction in extractions {
341            html.push_str(&format!(r#"                    <div class="extraction-card">
342                        <div class="extraction-class">{}</div>
343                        <div class="extraction-text">{}</div>
344"#, html_escape(&extraction.extraction_class), html_escape(&extraction.extraction_text)));
345
346            if config.show_char_intervals {
347                if let Some(interval) = &extraction.char_interval {
348                    html.push_str(&format!(r#"                        <div class="extraction-meta">Position: {}-{}</div>
349"#, interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
350                }
351            }
352
353            if let Some(description) = &extraction.description {
354                html.push_str(&format!(r#"                        <div class="extraction-meta">Description: {}</div>
355"#, html_escape(description)));
356            }
357
358            html.push_str("                    </div>\n");
359        }
360        
361        html.push_str("                </div>\n            </div>\n");
362    }
363
364    // Statistics section
365    if config.include_statistics {
366        html.push_str(r#"            <div class="section">
367                <h2>📊 Statistics</h2>
368                <div class="stats-grid">
369"#);
370
371        let extraction_count = annotated_document.extraction_count();
372        html.push_str(&format!(r#"                    <div class="stat-card">
373                        <div class="stat-number">{}</div>
374                        <div class="stat-label">Total Extractions</div>
375                    </div>
376                    <div class="stat-card">
377                        <div class="stat-number">{}</div>
378                        <div class="stat-label">Characters</div>
379                    </div>
380"#, extraction_count, text.len()));
381
382        if let Some(extractions) = &annotated_document.extractions {
383            let class_counts = count_extraction_classes(extractions);
384            html.push_str(&format!(r#"                    <div class="stat-card">
385                        <div class="stat-number">{}</div>
386                        <div class="stat-label">Unique Classes</div>
387                    </div>
388"#, class_counts.len()));
389
390            html.push_str("                </div>\n");
391            
392            // Class breakdown
393            html.push_str(r#"                <h3>Extraction Classes</h3>
394                <div class="class-counts">
395"#);
396            
397            for (class, count) in class_counts {
398                html.push_str(&format!(r#"                    <div class="class-count-item">
399                        <span>{}</span>
400                        <span class="class-badge">{}</span>
401                    </div>
402"#, html_escape(class), count));
403            }
404            
405            html.push_str("                </div>\n");
406        } else {
407            html.push_str("                </div>\n");
408        }
409        
410        html.push_str("            </div>\n");
411    }
412
413    // Footer
414    html.push_str(r#"        </div>
415    </div>
416    
417    <script>
418        // Add interactivity for extraction highlights
419        document.querySelectorAll('.extraction-highlight').forEach(element => {
420            element.addEventListener('click', function() {
421                const className = this.getAttribute('data-class');
422                const text = this.getAttribute('data-text');
423                alert(`Extraction: ${className}\nText: ${text}`);
424            });
425        });
426    </script>
427</body>
428</html>"#);
429
430    Ok(html)
431}
432
/// Replace the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`)
/// with their entity equivalents and return the result as a new `String`.
///
/// Every other character is copied through untouched.
fn html_escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());

    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }

    escaped
}
441
/// Snap a byte offset to a valid UTF-8 character boundary of `text`.
///
/// Offsets at or past the end of the string yield `text.len()`. An offset
/// that already sits on a boundary is returned unchanged; otherwise the
/// closest boundary *before* it is returned.
fn find_char_boundary(text: &str, index: usize) -> usize {
    let upper = index.min(text.len());

    // Walk downwards from the clamped offset; offset 0 is always a boundary,
    // so the search cannot come up empty.
    (0..=upper)
        .rev()
        .find(|&candidate| text.is_char_boundary(candidate))
        .unwrap_or(0)
}
461
462/// Helper function to highlight extractions in text
463fn highlight_text_html(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
464    if let Some(extractions) = &annotated_document.extractions {
465        // Collect all valid intervals with their extraction info
466        let mut intervals: Vec<(usize, usize, &Extraction)> = Vec::new();
467        
468        for extraction in extractions {
469            if let Some(interval) = &extraction.char_interval {
470                if let (Some(start), Some(end)) = (interval.start_pos, interval.end_pos) {
471                    if start < end && end <= text.len() {
472                        intervals.push((start, end, extraction));
473                    }
474                }
475            }
476        }
477        
478        // Sort by start position
479        intervals.sort_by_key(|(start, _, _)| *start);
480        
481        // Remove overlapping intervals - keep the first one when intervals overlap
482        let mut filtered_intervals = Vec::new();
483        let mut last_end = 0;
484        
485        for (start, end, extraction) in intervals {
486            if start >= last_end {
487                filtered_intervals.push((start, end, extraction));
488                last_end = end;
489            } else {
490                // Skip overlapping interval, but log it for debugging
491                log::debug!("Skipping overlapping extraction: '{}' at {}-{} (overlaps with previous ending at {})", 
492                    extraction.extraction_text, start, end, last_end);
493            }
494        }
495        
496        // Now build the HTML with non-overlapping intervals
497        let mut result = String::new();
498        let mut last_pos = 0;
499        
500        for (start, end, extraction) in filtered_intervals {
501            // Ensure we're at valid UTF-8 boundaries
502            let safe_start = find_char_boundary(text, start);
503            let safe_end = find_char_boundary(text, end);
504            
505            // Add text before this extraction
506            if safe_start > last_pos {
507                let safe_last_pos = find_char_boundary(text, last_pos);
508                if safe_last_pos < safe_start {
509                    result.push_str(&html_escape(&text[safe_last_pos..safe_start]));
510                }
511            }
512            
513            // Add the highlighted extraction (only if we have valid boundaries)
514            if safe_start < safe_end && safe_end <= text.len() {
515                let actual_text = &text[safe_start..safe_end];
516                result.push_str(&format!(
517                    r#"<span class="extraction-highlight" data-class="{}" data-text="{}">{}</span>"#,
518                    html_escape(&extraction.extraction_class),
519                    html_escape(&extraction.extraction_text),
520                    html_escape(actual_text)
521                ));
522                last_pos = safe_end;
523            } else {
524                // Skip invalid boundaries but log for debugging
525                log::debug!("Skipping extraction with invalid UTF-8 boundaries: '{}' at {}-{}", 
526                    extraction.extraction_text, start, end);
527            }
528        }
529        
530        // Add remaining text
531        if last_pos < text.len() {
532            let safe_last_pos = find_char_boundary(text, last_pos);
533            if safe_last_pos < text.len() {
534                result.push_str(&html_escape(&text[safe_last_pos..]));
535            }
536        }
537        
538        Ok(result)
539    } else {
540        Ok(html_escape(text))
541    }
542}
543
544/// Helper function to count extraction classes
545fn count_extraction_classes(extractions: &[crate::data::Extraction]) -> HashMap<&str, usize> {
546    let mut class_counts = HashMap::new();
547    for extraction in extractions {
548        *class_counts.entry(extraction.extraction_class.as_str()).or_insert(0) += 1;
549    }
550    class_counts
551}
552
553/// Export as structured markdown with extraction summaries
554fn export_markdown(
555    annotated_document: &AnnotatedDocument,
556    config: &ExportConfig,
557) -> LangExtractResult<String> {
558    let title = config.title.as_deref().unwrap_or("LangExtract Results");
559    let text = annotated_document.text.as_deref().unwrap_or("No text");
560    
561    let mut md = String::new();
562    
563    // Title
564    md.push_str(&format!("# {}\n\n", title));
565    
566    // Document text section
567    if config.include_text {
568        md.push_str("## 📄 Document Text\n\n");
569        
570        if config.highlight_extractions {
571            md.push_str(&highlight_text_markdown(text, annotated_document)?);
572        } else {
573            md.push_str(&format!("```\n{}\n```\n", text));
574        }
575        
576        md.push_str("\n");
577    }
578    
579    // Extractions section
580    if let Some(extractions) = &annotated_document.extractions {
581        md.push_str(&format!("## đŸŽ¯ Extractions ({} found)\n\n", extractions.len()));
582        
583        for (i, extraction) in extractions.iter().enumerate() {
584            md.push_str(&format!("### {}. {}\n\n", i + 1, extraction.extraction_class));
585            md.push_str(&format!("**Text:** {}\n\n", extraction.extraction_text));
586            
587            if config.show_char_intervals {
588                if let Some(interval) = &extraction.char_interval {
589                    md.push_str(&format!("**Position:** {}-{}\n\n", interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
590                }
591            }
592            
593            if let Some(description) = &extraction.description {
594                md.push_str(&format!("**Description:** {}\n\n", description));
595            }
596        }
597    }
598    
599    // Statistics section
600    if config.include_statistics {
601        md.push_str("## 📊 Statistics\n\n");
602        
603        let extraction_count = annotated_document.extraction_count();
604        md.push_str(&format!("- **Total Extractions:** {}\n", extraction_count));
605        md.push_str(&format!("- **Text Length:** {} characters\n", text.len()));
606        
607        if let Some(extractions) = &annotated_document.extractions {
608            let class_counts = count_extraction_classes(extractions);
609            md.push_str(&format!("- **Unique Classes:** {}\n\n", class_counts.len()));
610            
611            md.push_str("### Extraction Classes\n\n");
612            md.push_str("| Class | Count |\n");
613            md.push_str("|-------|-------|\n");
614            
615            for (class, count) in class_counts {
616                md.push_str(&format!("| {} | {} |\n", class, count));
617            }
618        }
619        
620        md.push_str("\n");
621    }
622    
623    Ok(md)
624}
625
626/// Helper function to highlight extractions in markdown
627fn highlight_text_markdown(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
628    if let Some(extractions) = &annotated_document.extractions {
629        let mut result = String::new();
630        let mut last_pos = 0;
631        
632        // Sort extractions by start position
633        let mut sorted_extractions: Vec<_> = extractions.iter().collect();
634        sorted_extractions.sort_by_key(|e| {
635            e.char_interval.as_ref().and_then(|i| i.start_pos).unwrap_or(usize::MAX)
636        });
637        
638        result.push_str("```\n");
639        
640        for extraction in sorted_extractions {
641            if let Some(interval) = &extraction.char_interval {
642                // Add text before the extraction
643                if interval.start_pos.unwrap_or(0) > last_pos && interval.start_pos.unwrap_or(0) <= text.len() {
644                    result.push_str(&text[last_pos..interval.start_pos.unwrap_or(0)]);
645                }
646                
647                // Add highlighted extraction with markdown bold
648                if interval.end_pos.unwrap_or(0) <= text.len() && interval.start_pos.unwrap_or(0) < interval.end_pos.unwrap_or(0) {
649                    let extraction_text = &text[interval.start_pos.unwrap_or(0)..interval.end_pos.unwrap_or(0)];
650                    result.push_str(&format!("**{}**", extraction_text));
651                    last_pos = interval.end_pos.unwrap_or(0);
652                }
653            }
654        }
655        
656        // Add remaining text
657        if last_pos < text.len() {
658            result.push_str(&text[last_pos..]);
659        }
660        
661        result.push_str("\n```\n");
662        Ok(result)
663    } else {
664        Ok(format!("```\n{}\n```\n", text))
665    }
666}
667
668/// Export as JSON for analysis
669fn export_json(
670    annotated_document: &AnnotatedDocument,
671    config: &ExportConfig,
672) -> LangExtractResult<String> {
673    let mut json_data = json!({
674        "document_id": annotated_document.document_id,
675        "export_config": {
676            "format": "json",
677            "show_char_intervals": config.show_char_intervals,
678            "include_text": config.include_text,
679            "include_statistics": config.include_statistics,
680            "title": config.title
681        }
682    });
683    
684    // Add text if requested
685    if config.include_text {
686        json_data["text"] = json!(annotated_document.text);
687    }
688    
689    // Add extractions
690    if let Some(extractions) = &annotated_document.extractions {
691        let extractions_json: Vec<Value> = extractions.iter().map(|extraction| {
692            let mut ext_json = json!({
693                "extraction_class": extraction.extraction_class,
694                "extraction_text": extraction.extraction_text,
695                "description": extraction.description
696            });
697            
698            if config.show_char_intervals {
699                if let Some(interval) = &extraction.char_interval {
700                    ext_json["char_interval"] = json!({
701                        "start_char": interval.start_pos.unwrap_or(0),
702                        "end_char": interval.end_pos.unwrap_or(0),
703                        "alignment_status": extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string())
704                    });
705                }
706            }
707            
708            if let Some(group_index) = extraction.group_index {
709                ext_json["group_index"] = json!(group_index);
710            }
711            
712            ext_json
713        }).collect();
714        
715        json_data["extractions"] = json!(extractions_json);
716    }
717    
718    // Add statistics if requested
719    if config.include_statistics {
720        let text = annotated_document.text.as_deref().unwrap_or("");
721        let mut stats = json!({
722            "total_extractions": annotated_document.extraction_count(),
723            "text_length": text.len()
724        });
725        
726        if let Some(extractions) = &annotated_document.extractions {
727            let class_counts = count_extraction_classes(extractions);
728            stats["unique_classes"] = json!(class_counts.len());
729            stats["extraction_classes"] = json!(class_counts);
730        }
731        
732        json_data["statistics"] = stats;
733    }
734    
735    Ok(serde_json::to_string_pretty(&json_data)?)
736}
737
738/// Export as CSV for spreadsheet analysis
739fn export_csv(
740    annotated_document: &AnnotatedDocument,
741    config: &ExportConfig,
742) -> LangExtractResult<String> {
743    let mut csv = String::new();
744    
745    // CSV Header
746    if config.show_char_intervals {
747        csv.push_str("extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index\n");
748    } else {
749        csv.push_str("extraction_class,extraction_text,description,group_index\n");
750    }
751    
752    // CSV Rows
753    if let Some(extractions) = &annotated_document.extractions {
754        for extraction in extractions {
755            let class = csv_escape(&extraction.extraction_class);
756            let text = csv_escape(&extraction.extraction_text);
757            let description = extraction.description.as_ref().map(|d| csv_escape(d)).unwrap_or_else(|| "".to_string());
758            let group_index = extraction.group_index.map(|i| i.to_string()).unwrap_or_else(|| "".to_string());
759            
760            if config.show_char_intervals {
761                if let Some(interval) = &extraction.char_interval {
762                    csv.push_str(&format!("{},{},{},{},{},{:?},{}\n",
763                        class, text, description,
764                        interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0),
765                        extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string()), group_index));
766                } else {
767                    csv.push_str(&format!("{},{},{},,,None,{}\n",
768                        class, text, description, group_index));
769                }
770            } else {
771                csv.push_str(&format!("{},{},{},{}\n",
772                    class, text, description, group_index));
773            }
774        }
775    }
776    
777    Ok(csv)
778}
779
/// Helper function to escape CSV values.
///
/// A value containing a comma, a double quote, a line feed or a carriage
/// return is wrapped in double quotes, with embedded quotes doubled. Any
/// other value is returned unchanged.
///
/// Carriage returns are included because a bare `\r` is a line-break
/// character as well; left unquoted it can split the row in CSV readers.
fn csv_escape(text: &str) -> String {
    let needs_quoting = text
        .chars()
        .any(|c| matches!(c, ',' | '"' | '\n' | '\r'));

    if needs_quoting {
        format!("\"{}\"", text.replace('"', "\"\""))
    } else {
        text.to_string()
    }
}
788
#[cfg(test)]
mod tests {
    // Unit tests for the export entry point (`export_document`) across every
    // `ExportFormat`, plus the escaping and counting helpers of this module.
    use super::*;
    use crate::data::{AlignmentStatus, CharInterval, Extraction};
    use std::collections::HashMap;

    /// Build the shared fixture: one sentence with three aligned extractions
    /// (a person, a company and a salary), all in group 0.
    ///
    /// The character intervals match the positions of the extraction texts
    /// inside the sentence (end offsets are exclusive).
    fn create_sample_document() -> AnnotatedDocument {
        let text = "John Smith works at TechCorp and earns $50,000.";
        let extractions = vec![
            Extraction {
                extraction_class: "person".to_string(),
                extraction_text: "John Smith".to_string(),
                char_interval: Some(CharInterval::new(Some(0), Some(10))),
                alignment_status: Some(AlignmentStatus::MatchExact),
                extraction_index: Some(0),
                group_index: Some(0),
                description: Some("Person name".to_string()),
                attributes: Some(HashMap::new()),
                token_interval: None,
            },
            Extraction {
                extraction_class: "company".to_string(),
                extraction_text: "TechCorp".to_string(),
                char_interval: Some(CharInterval::new(Some(20), Some(28))),
                alignment_status: Some(AlignmentStatus::MatchExact),
                extraction_index: Some(1),
                group_index: Some(0),
                description: None,
                attributes: Some(HashMap::new()),
                token_interval: None,
            },
            Extraction {
                extraction_class: "salary".to_string(),
                extraction_text: "$50,000".to_string(),
                char_interval: Some(CharInterval::new(Some(39), Some(46))),
                alignment_status: Some(AlignmentStatus::MatchFuzzy),
                extraction_index: Some(2),
                group_index: Some(0),
                description: Some("Annual salary".to_string()),
                attributes: Some(HashMap::new()),
                token_interval: None,
            },
        ];

        AnnotatedDocument {
            document_id: Some("test_doc".to_string()),
            text: Some(text.to_string()),
            extractions: Some(extractions),
        }
    }

    /// Build an extraction that carries only a class and a text: every
    /// optional field is `None` except `attributes`, which is an empty map.
    fn bare_extraction(class: &str, text: &str) -> Extraction {
        Extraction {
            extraction_class: class.to_string(),
            extraction_text: text.to_string(),
            char_interval: None,
            alignment_status: None,
            extraction_index: None,
            group_index: None,
            description: None,
            attributes: Some(HashMap::new()),
            token_interval: None,
        }
    }

    /// Text export lists every extraction, its position and the statistics.
    #[test]
    fn test_text_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Text,
            show_char_intervals: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("EXTRACTION VISUALIZATION"));
        assert!(result.contains("John Smith"));
        assert!(result.contains("TechCorp"));
        assert!(result.contains("$50,000"));
        assert!(result.contains("Position:"));
        assert!(result.contains("Statistics:"));
    }

    /// HTML export produces a complete page with title, highlights, cards
    /// and the statistics grid.
    #[test]
    fn test_html_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Html,
            title: Some("Test HTML Export".to_string()),
            highlight_extractions: true,
            show_char_intervals: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("<!DOCTYPE html>"));
        assert!(result.contains("<title>Test HTML Export</title>"));
        assert!(result.contains("extraction-highlight"));
        assert!(result.contains("John Smith"));
        assert!(result.contains("TechCorp"));
        assert!(result.contains("extraction-card"));
        assert!(result.contains("stats-grid"));
        assert!(result.contains("</html>"));
    }

    /// Custom CSS supplied in the config appears verbatim in the HTML output.
    #[test]
    fn test_html_export_with_custom_css() {
        let document = create_sample_document();
        let custom_css = "body { background: red; }";
        let config = ExportConfig {
            format: ExportFormat::Html,
            custom_css: Some(custom_css.to_string()),
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        assert!(result.contains(custom_css));
    }

    /// Markdown export starts with the title and contains the document,
    /// extraction and per-class count sections.
    #[test]
    fn test_markdown_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Markdown,
            title: Some("Test Markdown".to_string()),
            show_char_intervals: true,
            highlight_extractions: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();

        assert!(result.starts_with("# Test Markdown"));
        assert!(result.contains("## 📄 Document Text"));
        // NOTE(review): this literal previously held the mis-decoded UTF-8
        // bytes of U+1F3AF; restored to the emoji itself. Confirm it matches
        // the heading emitted by the Markdown exporter.
        assert!(result.contains("## 🎯 Extractions"));
        assert!(result.contains("### 1. person"));
        assert!(result.contains("**Text:** John Smith"));
        assert!(result.contains("**Position:** 0-10"));
        assert!(result.contains("| Class | Count |"));
        assert!(result.contains("| person | 1 |"));
    }

    /// JSON export parses back into an object with the document id, text,
    /// extraction list and statistics.
    #[test]
    fn test_json_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Json,
            show_char_intervals: true,
            include_text: true,
            include_statistics: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();

        assert_eq!(parsed["document_id"], "test_doc");
        assert!(parsed["text"].is_string());
        assert!(parsed["extractions"].is_array());
        assert!(parsed["statistics"].is_object());

        let extractions = parsed["extractions"].as_array().unwrap();
        assert_eq!(extractions.len(), 3);

        let first_extraction = &extractions[0];
        assert_eq!(first_extraction["extraction_class"], "person");
        assert_eq!(first_extraction["extraction_text"], "John Smith");
        assert!(first_extraction["char_interval"].is_object());

        let stats = &parsed["statistics"];
        assert_eq!(stats["total_extractions"], 3);
        assert_eq!(stats["unique_classes"], 3);
    }

    /// CSV export with intervals emits the full header and one row per
    /// extraction, quoting the field that contains a comma.
    #[test]
    fn test_csv_export() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Csv,
            show_char_intervals: true,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();
        let lines: Vec<&str> = result.lines().collect();

        // Check the row count first so a short output fails with a clear
        // message instead of an out-of-bounds index below.
        assert_eq!(lines.len(), 4); // Header + 3 data rows

        // Check header
        assert_eq!(lines[0], "extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index");

        // Check data rows
        assert!(lines[1].contains("person,John Smith"));
        assert!(lines[2].contains("company,TechCorp"));
        assert!(lines[3].contains("salary,\"$50,000\""));
        assert!(lines[1].contains("MatchExact"));
        assert!(lines[3].contains("MatchFuzzy"));
    }

    /// CSV export without intervals omits the position columns.
    #[test]
    fn test_csv_export_without_intervals() {
        let document = create_sample_document();
        let config = ExportConfig {
            format: ExportFormat::Csv,
            show_char_intervals: false,
            ..Default::default()
        };

        let result = export_document(&document, &config).unwrap();
        let lines: Vec<&str> = result.lines().collect();

        // Check header
        assert_eq!(lines[0], "extraction_class,extraction_text,description,group_index");

        // Should not contain position columns
        assert!(!result.contains("start_char"));
        assert!(!result.contains("end_char"));
    }

    /// `csv_escape` quotes fields with commas, quotes or newlines and
    /// doubles embedded quotes.
    #[test]
    fn test_csv_escape() {
        assert_eq!(csv_escape("simple"), "simple");
        assert_eq!(csv_escape("has,comma"), "\"has,comma\"");
        assert_eq!(csv_escape("has\"quote"), "\"has\"\"quote\"");
        assert_eq!(csv_escape("has\nnewline"), "\"has\nnewline\"");
        assert_eq!(csv_escape("has,comma\"and quote"), "\"has,comma\"\"and quote\"");
    }

    /// `html_escape` replaces the five HTML-significant characters with
    /// their entities.
    #[test]
    fn test_html_escape() {
        assert_eq!(html_escape("simple"), "simple");
        assert_eq!(html_escape("has<tag>"), "has&lt;tag&gt;");
        assert_eq!(html_escape("has\"quote"), "has&quot;quote");
        assert_eq!(html_escape("has'apostrophe"), "has&#x27;apostrophe");
        assert_eq!(html_escape("has&ampersand"), "has&amp;ampersand");
    }

    /// `ExportConfig::default()` yields the documented default values.
    #[test]
    fn test_export_config_defaults() {
        let config = ExportConfig::default();
        assert_eq!(config.format, ExportFormat::Text);
        assert!(!config.show_char_intervals);
        assert!(config.include_text);
        assert!(config.highlight_extractions);
        assert!(config.include_statistics);
        assert!(config.custom_css.is_none());
        assert!(config.title.is_none());
    }

    /// A document with empty text and no extractions reports that nothing
    /// was found.
    #[test]
    fn test_empty_document() {
        let document = AnnotatedDocument {
            document_id: Some("empty".to_string()),
            text: Some("".to_string()),
            extractions: None,
        };

        let config = ExportConfig::default();
        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("No extractions found"));
    }

    /// A document without any text still exports and says so.
    #[test]
    fn test_document_without_text() {
        let document = AnnotatedDocument {
            document_id: Some("no_text".to_string()),
            text: None,
            extractions: None,
        };

        let config = ExportConfig::default();
        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("No text"));
    }

    /// Every export format succeeds on the sample document with otherwise
    /// default settings.
    #[test]
    fn test_export_format_variants() {
        let document = create_sample_document();

        // Test all export formats don't panic
        for format in [
            ExportFormat::Text,
            ExportFormat::Html,
            ExportFormat::Markdown,
            ExportFormat::Json,
            ExportFormat::Csv,
        ] {
            let config = ExportConfig {
                format,
                ..Default::default()
            };
            let result = export_document(&document, &config);
            assert!(result.is_ok(), "Format {:?} failed", format);
        }
    }

    /// `highlight_text_html` wraps extractions in highlight markup carrying
    /// their class and text as data attributes.
    #[test]
    fn test_highlight_text_html() {
        let document = create_sample_document();
        let text = document.text.as_ref().unwrap();

        let result = highlight_text_html(text, &document).unwrap();

        assert!(result.contains("extraction-highlight"));
        assert!(result.contains("data-class=\"person\""));
        assert!(result.contains("data-text=\"John Smith\""));
        assert!(result.contains("John Smith"));
    }

    /// `count_extraction_classes` tallies extractions per class name.
    #[test]
    fn test_count_extraction_classes() {
        let extractions = vec![
            bare_extraction("person", "John"),
            bare_extraction("person", "Jane"),
            bare_extraction("company", "TechCorp"),
        ];

        let counts = count_extraction_classes(&extractions);

        assert_eq!(counts.get("person"), Some(&2));
        assert_eq!(counts.get("company"), Some(&1));
        assert_eq!(counts.len(), 2);
    }
}