langextract_rust/
visualization.rs

1//! Visualization utilities for annotated documents.
2
3use crate::{data::AnnotatedDocument, exceptions::LangExtractResult};
4use serde_json::{json, Value};
5use std::collections::HashMap;
6use crate::Extraction;
/// Export format options for visualization.
///
/// This is a field-less enum, so besides being `Copy` it also implements
/// `Eq` and `Hash` and can be used as a set member or map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "cli", derive(clap::ValueEnum))]
pub enum ExportFormat {
    /// Simple text format (existing functionality)
    Text,
    /// Rich HTML with highlighting and interactivity
    Html,
    /// Structured markdown with summaries
    Markdown,
    /// Raw JSON export for analysis
    Json,
    /// CSV export for spreadsheet analysis
    Csv,
}
22
23/// Configuration for visualization exports
24#[derive(Debug, Clone)]
25pub struct ExportConfig {
26    /// Export format to use
27    pub format: ExportFormat,
28    /// Show character intervals in output
29    pub show_char_intervals: bool,
30    /// Include original text in export
31    pub include_text: bool,
32    /// Highlight extractions in text (for HTML/Markdown)
33    pub highlight_extractions: bool,
34    /// Include extraction statistics
35    pub include_statistics: bool,
36    /// Custom CSS for HTML export
37    pub custom_css: Option<String>,
38    /// Title for the export
39    pub title: Option<String>,
40}
41
42impl Default for ExportConfig {
43    fn default() -> Self {
44        Self {
45            format: ExportFormat::Text,
46            show_char_intervals: false,
47            include_text: true,
48            highlight_extractions: true,
49            include_statistics: true,
50            custom_css: None,
51            title: None,
52        }
53    }
54}
55
56/// Export an annotated document in the specified format
57pub fn export_document(
58    annotated_document: &AnnotatedDocument,
59    config: &ExportConfig,
60) -> LangExtractResult<String> {
61    match config.format {
62        ExportFormat::Text => visualize_text(annotated_document, config.show_char_intervals),
63        ExportFormat::Html => export_html(annotated_document, config),
64        ExportFormat::Markdown => export_markdown(annotated_document, config),
65        ExportFormat::Json => export_json(annotated_document, config),
66        ExportFormat::Csv => export_csv(annotated_document, config),
67    }
68}
69
70/// Visualize an annotated document (legacy function for backward compatibility)
71pub fn visualize(
72    annotated_document: &AnnotatedDocument,
73    show_char_intervals: bool,
74) -> LangExtractResult<String> {
75    visualize_text(annotated_document, show_char_intervals)
76}
77
78/// Export as simple text format (original implementation)
79fn visualize_text(
80    annotated_document: &AnnotatedDocument,
81    show_char_intervals: bool,
82) -> LangExtractResult<String> {
83    let mut result = String::new();
84    
85    result.push_str("📄 EXTRACTION VISUALIZATION\n");
86    result.push_str("=" .repeat(50).as_str());
87    result.push('\n');
88    
89    // Show document text
90    let text = annotated_document.text.as_deref().unwrap_or("No text");
91    result.push_str(&format!("📝 Document Text ({} chars):\n", text.len()));
92    result.push_str(&format!("   {}\n\n", text));
93    
94    // Show extractions
95    if let Some(extractions) = &annotated_document.extractions {
96        result.push_str(&format!("🎯 Found {} Extractions:\n", extractions.len()));
97        result.push_str("-".repeat(30).as_str());
98        result.push('\n');
99        
100        for (i, extraction) in extractions.iter().enumerate() {
101            result.push_str(&format!("{}. [{}] {}\n", 
102                i + 1, 
103                extraction.extraction_class, 
104                extraction.extraction_text
105            ));
106            
107            if show_char_intervals {
108                if let Some(interval) = &extraction.char_interval {
109                    result.push_str(&format!("   Position: {:?}\n", interval));
110                }
111            }
112            
113            if let Some(description) = &extraction.description {
114                result.push_str(&format!("   Description: {}\n", description));
115            }
116            
117            result.push('\n');
118        }
119    } else {
120        result.push_str("ℹ️  No extractions found\n");
121    }
122    
123    // Show statistics
124    result.push_str("📊 Statistics:\n");
125    result.push_str("-".repeat(15).as_str());
126    result.push('\n');
127    result.push_str(&format!("   Document ID: {}\n", 
128        annotated_document.document_id.as_deref().unwrap_or("None")));
129    result.push_str(&format!("   Text Length: {} characters\n", text.len()));
130    result.push_str(&format!("   Total Extractions: {}\n", annotated_document.extraction_count()));
131    
132    if let Some(extractions) = &annotated_document.extractions {
133        // Count unique extraction classes
134        let mut class_counts = std::collections::HashMap::new();
135        for extraction in extractions {
136            *class_counts.entry(&extraction.extraction_class).or_insert(0) += 1;
137        }
138        
139        result.push_str("   Extraction Classes:\n");
140        for (class, count) in class_counts {
141            result.push_str(&format!("     • {}: {} instance(s)\n", class, count));
142        }
143    }
144    
145    Ok(result)
146}
147
148/// Export as rich HTML with highlighting and interactivity
149fn export_html(
150    annotated_document: &AnnotatedDocument,
151    config: &ExportConfig,
152) -> LangExtractResult<String> {
153    let title = config.title.as_deref().unwrap_or("LangExtract Results");
154    let text = annotated_document.text.as_deref().unwrap_or("No text");
155    
156    let mut html = String::new();
157    
158    // HTML Header
159    html.push_str(&format!(r#"<!DOCTYPE html>
160<html lang="en">
161<head>
162    <meta charset="UTF-8">
163    <meta name="viewport" content="width=device-width, initial-scale=1.0">
164    <title>{}</title>
165    <style>
166        body {{
167            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
168            max-width: 1200px;
169            margin: 0 auto;
170            padding: 20px;
171            background: #f8fafc;
172            color: #334155;
173        }}
174        .container {{
175            background: white;
176            border-radius: 12px;
177            box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
178            overflow: hidden;
179        }}
180        .header {{
181            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
182            color: white;
183            padding: 30px;
184            text-align: center;
185        }}
186        .header h1 {{
187            margin: 0;
188            font-size: 2.5em;
189            font-weight: 300;
190        }}
191        .content {{
192            padding: 30px;
193        }}
194        .section {{
195            margin-bottom: 40px;
196        }}
197        .section h2 {{
198            color: #1e293b;
199            border-bottom: 2px solid #e2e8f0;
200            padding-bottom: 10px;
201            margin-bottom: 20px;
202        }}
203        .document-text {{
204            background: #f1f5f9;
205            border-radius: 8px;
206            padding: 20px;
207            font-family: 'Monaco', 'Menlo', monospace;
208            line-height: 1.6;
209            white-space: pre-wrap;
210            position: relative;
211            margin-bottom: 20px;
212        }}
213        .extraction-highlight {{
214            background: rgba(59, 130, 246, 0.2);
215            border: 1px solid rgba(59, 130, 246, 0.4);
216            border-radius: 3px;
217            padding: 1px 2px;
218            cursor: pointer;
219            transition: all 0.2s ease;
220        }}
221        .extraction-highlight:hover {{
222            background: rgba(59, 130, 246, 0.3);
223            transform: translateY(-1px);
224        }}
225        .extractions-grid {{
226            display: grid;
227            grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
228            gap: 20px;
229            margin-bottom: 30px;
230        }}
231        .extraction-card {{
232            background: #f8fafc;
233            border: 1px solid #e2e8f0;
234            border-radius: 8px;
235            padding: 15px;
236            transition: all 0.2s ease;
237        }}
238        .extraction-card:hover {{
239            border-color: #3b82f6;
240            box-shadow: 0 4px 12px rgba(59, 130, 246, 0.15);
241        }}
242        .extraction-class {{
243            background: #3b82f6;
244            color: white;
245            padding: 4px 8px;
246            border-radius: 4px;
247            font-size: 0.8em;
248            font-weight: 600;
249            display: inline-block;
250            margin-bottom: 8px;
251        }}
252        .extraction-text {{
253            font-weight: 600;
254            color: #1e293b;
255            margin-bottom: 8px;
256        }}
257        .extraction-meta {{
258            font-size: 0.9em;
259            color: #64748b;
260        }}
261        .stats-grid {{
262            display: grid;
263            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
264            gap: 20px;
265        }}
266        .stat-card {{
267            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
268            color: white;
269            padding: 20px;
270            border-radius: 8px;
271            text-align: center;
272        }}
273        .stat-number {{
274            font-size: 2em;
275            font-weight: bold;
276            margin-bottom: 5px;
277        }}
278        .stat-label {{
279            opacity: 0.9;
280            font-size: 0.9em;
281        }}
282        .class-counts {{
283            background: #f1f5f9;
284            border-radius: 8px;
285            padding: 20px;
286        }}
287        .class-count-item {{
288            display: flex;
289            justify-content: space-between;
290            align-items: center;
291            padding: 8px 0;
292            border-bottom: 1px solid #e2e8f0;
293        }}
294        .class-count-item:last-child {{
295            border-bottom: none;
296        }}
297        .class-badge {{
298            background: #10b981;
299            color: white;
300            padding: 2px 6px;
301            border-radius: 12px;
302            font-size: 0.8em;
303            font-weight: 600;
304        }}
305        {}
306    </style>
307</head>
308<body>
309"#, title, config.custom_css.as_deref().unwrap_or("")));
310
311    // Header
312    html.push_str(&format!(r#"    <div class="container">
313        <div class="header">
314            <h1>{}</h1>
315        </div>
316        <div class="content">
317"#, title));
318
319    // Document text section (with highlighting if enabled)
320    if config.include_text {
321        html.push_str(r#"            <div class="section">
322                <h2>📄 Document Text</h2>
323                <div class="document-text">"#);
324        
325        if config.highlight_extractions {
326            html.push_str(&highlight_text_html(text, annotated_document)?);
327        } else {
328            html.push_str(&html_escape(text));
329        }
330        
331        html.push_str("</div>\n            </div>\n");
332    }
333
334    // Extractions section
335    if let Some(extractions) = &annotated_document.extractions {
336        html.push_str(&format!(r#"            <div class="section">
337                <h2>🎯 Extractions ({} found)</h2>
338                <div class="extractions-grid">
339"#, extractions.len()));
340
341        for extraction in extractions {
342            html.push_str(&format!(r#"                    <div class="extraction-card">
343                        <div class="extraction-class">{}</div>
344                        <div class="extraction-text">{}</div>
345"#, html_escape(&extraction.extraction_class), html_escape(&extraction.extraction_text)));
346
347            if config.show_char_intervals {
348                if let Some(interval) = &extraction.char_interval {
349                    html.push_str(&format!(r#"                        <div class="extraction-meta">Position: {}-{}</div>
350"#, interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
351                }
352            }
353
354            if let Some(description) = &extraction.description {
355                html.push_str(&format!(r#"                        <div class="extraction-meta">Description: {}</div>
356"#, html_escape(description)));
357            }
358
359            html.push_str("                    </div>\n");
360        }
361        
362        html.push_str("                </div>\n            </div>\n");
363    }
364
365    // Statistics section
366    if config.include_statistics {
367        html.push_str(r#"            <div class="section">
368                <h2>📊 Statistics</h2>
369                <div class="stats-grid">
370"#);
371
372        let extraction_count = annotated_document.extraction_count();
373        html.push_str(&format!(r#"                    <div class="stat-card">
374                        <div class="stat-number">{}</div>
375                        <div class="stat-label">Total Extractions</div>
376                    </div>
377                    <div class="stat-card">
378                        <div class="stat-number">{}</div>
379                        <div class="stat-label">Characters</div>
380                    </div>
381"#, extraction_count, text.len()));
382
383        if let Some(extractions) = &annotated_document.extractions {
384            let class_counts = count_extraction_classes(extractions);
385            html.push_str(&format!(r#"                    <div class="stat-card">
386                        <div class="stat-number">{}</div>
387                        <div class="stat-label">Unique Classes</div>
388                    </div>
389"#, class_counts.len()));
390
391            html.push_str("                </div>\n");
392            
393            // Class breakdown
394            html.push_str(r#"                <h3>Extraction Classes</h3>
395                <div class="class-counts">
396"#);
397            
398            for (class, count) in class_counts {
399                html.push_str(&format!(r#"                    <div class="class-count-item">
400                        <span>{}</span>
401                        <span class="class-badge">{}</span>
402                    </div>
403"#, html_escape(class), count));
404            }
405            
406            html.push_str("                </div>\n");
407        } else {
408            html.push_str("                </div>\n");
409        }
410        
411        html.push_str("            </div>\n");
412    }
413
414    // Footer
415    html.push_str(r#"        </div>
416    </div>
417    
418    <script>
419        // Add interactivity for extraction highlights
420        document.querySelectorAll('.extraction-highlight').forEach(element => {
421            element.addEventListener('click', function() {
422                const className = this.getAttribute('data-class');
423                const text = this.getAttribute('data-text');
424                alert(`Extraction: ${className}\nText: ${text}`);
425            });
426        });
427    </script>
428</body>
429</html>"#);
430
431    Ok(html)
432}
433
/// Escape the characters that are special in HTML text and attribute values.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&#x27;` respectively; every other character is copied as is.
fn html_escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }
    escaped
}
442
/// Snap a byte offset to a valid UTF-8 character boundary of `text`.
///
/// Offsets at or past the end of `text` are clamped to `text.len()`. An
/// offset that falls inside a multi-byte character is moved backwards to the
/// start of that character; offsets already on a boundary are returned as is.
fn find_char_boundary(text: &str, index: usize) -> usize {
    if index >= text.len() {
        return text.len();
    }

    // Walk back from `index`; offset 0 is always a boundary, so the search
    // cannot come up empty.
    (0..=index)
        .rev()
        .find(|&candidate| text.is_char_boundary(candidate))
        .unwrap_or(0)
}
462
463/// Helper function to highlight extractions in text
464fn highlight_text_html(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
465    if let Some(extractions) = &annotated_document.extractions {
466        // Collect all valid intervals with their extraction info
467        let mut intervals: Vec<(usize, usize, &Extraction)> = Vec::new();
468        
469        for extraction in extractions {
470            if let Some(interval) = &extraction.char_interval {
471                if let (Some(start), Some(end)) = (interval.start_pos, interval.end_pos) {
472                    if start < end && end <= text.len() {
473                        intervals.push((start, end, extraction));
474                    }
475                }
476            }
477        }
478        
479        // Sort by start position
480        intervals.sort_by_key(|(start, _, _)| *start);
481        
482        // Remove overlapping intervals - keep the first one when intervals overlap
483        let mut filtered_intervals = Vec::new();
484        let mut last_end = 0;
485        
486        for (start, end, extraction) in intervals {
487            if start >= last_end {
488                filtered_intervals.push((start, end, extraction));
489                last_end = end;
490            } else {
491                // Skip overlapping interval, but log it for debugging
492                log::debug!("Skipping overlapping extraction: '{}' at {}-{} (overlaps with previous ending at {})", 
493                    extraction.extraction_text, start, end, last_end);
494            }
495        }
496        
497        // Now build the HTML with non-overlapping intervals
498        let mut result = String::new();
499        let mut last_pos = 0;
500        
501        for (start, end, extraction) in filtered_intervals {
502            // Ensure we're at valid UTF-8 boundaries
503            let safe_start = find_char_boundary(text, start);
504            let safe_end = find_char_boundary(text, end);
505            
506            // Add text before this extraction
507            if safe_start > last_pos {
508                let safe_last_pos = find_char_boundary(text, last_pos);
509                if safe_last_pos < safe_start {
510                    result.push_str(&html_escape(&text[safe_last_pos..safe_start]));
511                }
512            }
513            
514            // Add the highlighted extraction (only if we have valid boundaries)
515            if safe_start < safe_end && safe_end <= text.len() {
516                let actual_text = &text[safe_start..safe_end];
517                result.push_str(&format!(
518                    r#"<span class="extraction-highlight" data-class="{}" data-text="{}">{}</span>"#,
519                    html_escape(&extraction.extraction_class),
520                    html_escape(&extraction.extraction_text),
521                    html_escape(actual_text)
522                ));
523                last_pos = safe_end;
524            } else {
525                // Skip invalid boundaries but log for debugging
526                log::debug!("Skipping extraction with invalid UTF-8 boundaries: '{}' at {}-{}", 
527                    extraction.extraction_text, start, end);
528            }
529        }
530        
531        // Add remaining text
532        if last_pos < text.len() {
533            let safe_last_pos = find_char_boundary(text, last_pos);
534            if safe_last_pos < text.len() {
535                result.push_str(&html_escape(&text[safe_last_pos..]));
536            }
537        }
538        
539        Ok(result)
540    } else {
541        Ok(html_escape(text))
542    }
543}
544
545/// Helper function to count extraction classes
546fn count_extraction_classes(extractions: &[crate::data::Extraction]) -> HashMap<&str, usize> {
547    let mut class_counts = HashMap::new();
548    for extraction in extractions {
549        *class_counts.entry(extraction.extraction_class.as_str()).or_insert(0) += 1;
550    }
551    class_counts
552}
553
554/// Export as structured markdown with extraction summaries
555fn export_markdown(
556    annotated_document: &AnnotatedDocument,
557    config: &ExportConfig,
558) -> LangExtractResult<String> {
559    let title = config.title.as_deref().unwrap_or("LangExtract Results");
560    let text = annotated_document.text.as_deref().unwrap_or("No text");
561    
562    let mut md = String::new();
563    
564    // Title
565    md.push_str(&format!("# {}\n\n", title));
566    
567    // Document text section
568    if config.include_text {
569        md.push_str("## 📄 Document Text\n\n");
570        
571        if config.highlight_extractions {
572            md.push_str(&highlight_text_markdown(text, annotated_document)?);
573        } else {
574            md.push_str(&format!("```\n{}\n```\n", text));
575        }
576        
577        md.push_str("\n");
578    }
579    
580    // Extractions section
581    if let Some(extractions) = &annotated_document.extractions {
582        md.push_str(&format!("## 🎯 Extractions ({} found)\n\n", extractions.len()));
583        
584        for (i, extraction) in extractions.iter().enumerate() {
585            md.push_str(&format!("### {}. {}\n\n", i + 1, extraction.extraction_class));
586            md.push_str(&format!("**Text:** {}\n\n", extraction.extraction_text));
587            
588            if config.show_char_intervals {
589                if let Some(interval) = &extraction.char_interval {
590                    md.push_str(&format!("**Position:** {}-{}\n\n", interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
591                }
592            }
593            
594            if let Some(description) = &extraction.description {
595                md.push_str(&format!("**Description:** {}\n\n", description));
596            }
597        }
598    }
599    
600    // Statistics section
601    if config.include_statistics {
602        md.push_str("## 📊 Statistics\n\n");
603        
604        let extraction_count = annotated_document.extraction_count();
605        md.push_str(&format!("- **Total Extractions:** {}\n", extraction_count));
606        md.push_str(&format!("- **Text Length:** {} characters\n", text.len()));
607        
608        if let Some(extractions) = &annotated_document.extractions {
609            let class_counts = count_extraction_classes(extractions);
610            md.push_str(&format!("- **Unique Classes:** {}\n\n", class_counts.len()));
611            
612            md.push_str("### Extraction Classes\n\n");
613            md.push_str("| Class | Count |\n");
614            md.push_str("|-------|-------|\n");
615            
616            for (class, count) in class_counts {
617                md.push_str(&format!("| {} | {} |\n", class, count));
618            }
619        }
620        
621        md.push_str("\n");
622    }
623    
624    Ok(md)
625}
626
627/// Helper function to highlight extractions in markdown
628fn highlight_text_markdown(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
629    if let Some(extractions) = &annotated_document.extractions {
630        let mut result = String::new();
631        let mut last_pos = 0;
632        
633        // Sort extractions by start position
634        let mut sorted_extractions: Vec<_> = extractions.iter().collect();
635        sorted_extractions.sort_by_key(|e| {
636            e.char_interval.as_ref().and_then(|i| i.start_pos).unwrap_or(usize::MAX)
637        });
638        
639        result.push_str("```\n");
640        
641        for extraction in sorted_extractions {
642            if let Some(interval) = &extraction.char_interval {
643                // Add text before the extraction
644                if interval.start_pos.unwrap_or(0) > last_pos && interval.start_pos.unwrap_or(0) <= text.len() {
645                    result.push_str(&text[last_pos..interval.start_pos.unwrap_or(0)]);
646                }
647                
648                // Add highlighted extraction with markdown bold
649                if interval.end_pos.unwrap_or(0) <= text.len() && interval.start_pos.unwrap_or(0) < interval.end_pos.unwrap_or(0) {
650                    let extraction_text = &text[interval.start_pos.unwrap_or(0)..interval.end_pos.unwrap_or(0)];
651                    result.push_str(&format!("**{}**", extraction_text));
652                    last_pos = interval.end_pos.unwrap_or(0);
653                }
654            }
655        }
656        
657        // Add remaining text
658        if last_pos < text.len() {
659            result.push_str(&text[last_pos..]);
660        }
661        
662        result.push_str("\n```\n");
663        Ok(result)
664    } else {
665        Ok(format!("```\n{}\n```\n", text))
666    }
667}
668
669/// Export as JSON for analysis
670fn export_json(
671    annotated_document: &AnnotatedDocument,
672    config: &ExportConfig,
673) -> LangExtractResult<String> {
674    let mut json_data = json!({
675        "document_id": annotated_document.document_id,
676        "export_config": {
677            "format": "json",
678            "show_char_intervals": config.show_char_intervals,
679            "include_text": config.include_text,
680            "include_statistics": config.include_statistics,
681            "title": config.title
682        }
683    });
684    
685    // Add text if requested
686    if config.include_text {
687        json_data["text"] = json!(annotated_document.text);
688    }
689    
690    // Add extractions
691    if let Some(extractions) = &annotated_document.extractions {
692        let extractions_json: Vec<Value> = extractions.iter().map(|extraction| {
693            let mut ext_json = json!({
694                "extraction_class": extraction.extraction_class,
695                "extraction_text": extraction.extraction_text,
696                "description": extraction.description
697            });
698            
699            if config.show_char_intervals {
700                if let Some(interval) = &extraction.char_interval {
701                    ext_json["char_interval"] = json!({
702                        "start_char": interval.start_pos.unwrap_or(0),
703                        "end_char": interval.end_pos.unwrap_or(0),
704                        "alignment_status": extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string())
705                    });
706                }
707            }
708            
709            if let Some(group_index) = extraction.group_index {
710                ext_json["group_index"] = json!(group_index);
711            }
712            
713            ext_json
714        }).collect();
715        
716        json_data["extractions"] = json!(extractions_json);
717    }
718    
719    // Add statistics if requested
720    if config.include_statistics {
721        let text = annotated_document.text.as_deref().unwrap_or("");
722        let mut stats = json!({
723            "total_extractions": annotated_document.extraction_count(),
724            "text_length": text.len()
725        });
726        
727        if let Some(extractions) = &annotated_document.extractions {
728            let class_counts = count_extraction_classes(extractions);
729            stats["unique_classes"] = json!(class_counts.len());
730            stats["extraction_classes"] = json!(class_counts);
731        }
732        
733        json_data["statistics"] = stats;
734    }
735    
736    Ok(serde_json::to_string_pretty(&json_data)?)
737}
738
739/// Export as CSV for spreadsheet analysis
740fn export_csv(
741    annotated_document: &AnnotatedDocument,
742    config: &ExportConfig,
743) -> LangExtractResult<String> {
744    let mut csv = String::new();
745    
746    // CSV Header
747    if config.show_char_intervals {
748        csv.push_str("extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index\n");
749    } else {
750        csv.push_str("extraction_class,extraction_text,description,group_index\n");
751    }
752    
753    // CSV Rows
754    if let Some(extractions) = &annotated_document.extractions {
755        for extraction in extractions {
756            let class = csv_escape(&extraction.extraction_class);
757            let text = csv_escape(&extraction.extraction_text);
758            let description = extraction.description.as_ref().map(|d| csv_escape(d)).unwrap_or_else(|| "".to_string());
759            let group_index = extraction.group_index.map(|i| i.to_string()).unwrap_or_else(|| "".to_string());
760            
761            if config.show_char_intervals {
762                if let Some(interval) = &extraction.char_interval {
763                    csv.push_str(&format!("{},{},{},{},{},{:?},{}\n",
764                        class, text, description,
765                        interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0),
766                        extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string()), group_index));
767                } else {
768                    csv.push_str(&format!("{},{},{},,,None,{}\n",
769                        class, text, description, group_index));
770                }
771            } else {
772                csv.push_str(&format!("{},{},{},{}\n",
773                    class, text, description, group_index));
774            }
775        }
776    }
777    
778    Ok(csv)
779}
780
/// Helper function to escape CSV values.
///
/// A value containing a comma, a double quote, a line feed or a carriage
/// return is wrapped in double quotes, with every embedded double quote
/// doubled. Any other value is returned unchanged.
fn csv_escape(text: &str) -> String {
    // A bare carriage return also ends a record for many CSV readers, so it
    // has to trigger quoting just like a line feed does.
    const NEEDS_QUOTING: &[char] = &[',', '"', '\n', '\r'];

    if text.contains(NEEDS_QUOTING) {
        format!("\"{}\"", text.replace('"', "\"\""))
    } else {
        text.to_string()
    }
}
789
#[cfg(test)]
mod tests {
    use super::*;
    use crate::data::{AlignmentStatus, CharInterval, Extraction};
    use std::collections::HashMap;

    /// Asserts that every fragment in `needles` occurs somewhere in `haystack`.
    fn assert_contains_all(haystack: &str, needles: &[&str]) {
        for needle in needles {
            assert!(haystack.contains(needle));
        }
    }

    /// Builds the shared fixture: one sentence annotated with a person, a
    /// company and a salary extraction, all in group 0.
    fn create_sample_document() -> AnnotatedDocument {
        let person = Extraction {
            extraction_index: Some(0),
            group_index: Some(0),
            extraction_class: String::from("person"),
            extraction_text: String::from("John Smith"),
            description: Some(String::from("Person name")),
            char_interval: Some(CharInterval::new(Some(0), Some(10))),
            alignment_status: Some(AlignmentStatus::MatchExact),
            attributes: Some(HashMap::new()),
            token_interval: None,
        };

        let company = Extraction {
            extraction_index: Some(1),
            group_index: Some(0),
            extraction_class: String::from("company"),
            extraction_text: String::from("TechCorp"),
            description: None,
            char_interval: Some(CharInterval::new(Some(20), Some(28))),
            alignment_status: Some(AlignmentStatus::MatchExact),
            attributes: Some(HashMap::new()),
            token_interval: None,
        };

        let salary = Extraction {
            extraction_index: Some(2),
            group_index: Some(0),
            extraction_class: String::from("salary"),
            extraction_text: String::from("$50,000"),
            description: Some(String::from("Annual salary")),
            char_interval: Some(CharInterval::new(Some(39), Some(46))),
            alignment_status: Some(AlignmentStatus::MatchFuzzy),
            attributes: Some(HashMap::new()),
            token_interval: None,
        };

        AnnotatedDocument {
            document_id: Some(String::from("test_doc")),
            text: Some(String::from(
                "John Smith works at TechCorp and earns $50,000.",
            )),
            extractions: Some(vec![person, company, salary]),
        }
    }

    /// Text export shows the banner, each extraction, positions and statistics.
    #[test]
    fn test_text_export() {
        let config = ExportConfig {
            show_char_intervals: true,
            format: ExportFormat::Text,
            ..Default::default()
        };
        let sample = create_sample_document();

        let rendered = export_document(&sample, &config).unwrap();

        assert_contains_all(
            &rendered,
            &[
                "EXTRACTION VISUALIZATION",
                "John Smith",
                "TechCorp",
                "$50,000",
                "Position:",
                "Statistics:",
            ],
        );
    }

    /// HTML export yields a full page with title, highlights, cards and stats.
    #[test]
    fn test_html_export() {
        let config = ExportConfig {
            show_char_intervals: true,
            highlight_extractions: true,
            title: Some(String::from("Test HTML Export")),
            format: ExportFormat::Html,
            ..Default::default()
        };
        let sample = create_sample_document();

        let page = export_document(&sample, &config).unwrap();

        assert_contains_all(
            &page,
            &[
                "<!DOCTYPE html>",
                "<title>Test HTML Export</title>",
                "extraction-highlight",
                "John Smith",
                "TechCorp",
                "extraction-card",
                "stats-grid",
                "</html>",
            ],
        );
    }

    /// Custom CSS supplied in the config is embedded verbatim in the HTML.
    #[test]
    fn test_html_export_with_custom_css() {
        let stylesheet = "body { background: red; }";
        let config = ExportConfig {
            custom_css: Some(String::from(stylesheet)),
            format: ExportFormat::Html,
            ..Default::default()
        };
        let sample = create_sample_document();

        let page = export_document(&sample, &config).unwrap();

        assert!(page.contains(stylesheet));
    }

    /// Markdown export starts with the title and carries all expected sections.
    #[test]
    fn test_markdown_export() {
        let config = ExportConfig {
            highlight_extractions: true,
            show_char_intervals: true,
            title: Some(String::from("Test Markdown")),
            format: ExportFormat::Markdown,
            ..Default::default()
        };
        let sample = create_sample_document();

        let markdown = export_document(&sample, &config).unwrap();

        assert!(markdown.starts_with("# Test Markdown"));
        assert_contains_all(
            &markdown,
            &[
                "## 📄 Document Text",
                "## 🎯 Extractions",
                "### 1. person",
                "**Text:** John Smith",
                "**Position:** 0-10",
                "| Class | Count |",
                "| person | 1 |",
            ],
        );
    }

    /// JSON export parses back into the expected document structure.
    #[test]
    fn test_json_export() {
        let config = ExportConfig {
            include_statistics: true,
            include_text: true,
            show_char_intervals: true,
            format: ExportFormat::Json,
            ..Default::default()
        };
        let sample = create_sample_document();

        let raw = export_document(&sample, &config).unwrap();
        let root: serde_json::Value = serde_json::from_str(&raw).unwrap();

        // Top-level shape.
        assert_eq!(root["document_id"], "test_doc");
        assert!(root["text"].is_string());
        assert!(root["extractions"].is_array());
        assert!(root["statistics"].is_object());

        // Extraction list and its first entry.
        let items = root["extractions"].as_array().unwrap();
        assert_eq!(items.len(), 3);

        let head = &items[0];
        assert_eq!(head["extraction_class"], "person");
        assert_eq!(head["extraction_text"], "John Smith");
        assert!(head["char_interval"].is_object());

        // Aggregate statistics.
        let statistics = &root["statistics"];
        assert_eq!(statistics["total_extractions"], 3);
        assert_eq!(statistics["unique_classes"], 3);
    }

    /// CSV export with intervals emits the full header and one row per extraction.
    #[test]
    fn test_csv_export() {
        let config = ExportConfig {
            show_char_intervals: true,
            format: ExportFormat::Csv,
            ..Default::default()
        };
        let sample = create_sample_document();

        let csv = export_document(&sample, &config).unwrap();
        let rows: Vec<&str> = csv.lines().collect();

        // Header row.
        assert_eq!(rows[0], "extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index");

        // One header line followed by three data rows.
        assert_eq!(rows.len(), 4);
        assert!(rows[1].contains("person,John Smith"));
        assert!(rows[2].contains("company,TechCorp"));
        assert!(rows[3].contains("salary,\"$50,000\""));
        assert!(rows[1].contains("MatchExact"));
        assert!(rows[3].contains("MatchFuzzy"));
    }

    /// CSV export without intervals drops the position columns from the header.
    #[test]
    fn test_csv_export_without_intervals() {
        let config = ExportConfig {
            show_char_intervals: false,
            format: ExportFormat::Csv,
            ..Default::default()
        };
        let sample = create_sample_document();

        let csv = export_document(&sample, &config).unwrap();
        let rows: Vec<&str> = csv.lines().collect();

        // Header row.
        assert_eq!(rows[0], "extraction_class,extraction_text,description,group_index");

        // Position columns must be absent everywhere in the output.
        for column in ["start_char", "end_char"] {
            assert!(!csv.contains(column));
        }
    }

    /// `csv_escape` quotes only fields that need it and doubles embedded quotes.
    #[test]
    fn test_csv_escape() {
        let cases = [
            ("simple", "simple"),
            ("has,comma", "\"has,comma\""),
            ("has\"quote", "\"has\"\"quote\""),
            ("has\nnewline", "\"has\nnewline\""),
            ("has,comma\"and quote", "\"has,comma\"\"and quote\""),
        ];

        for (input, expected) in cases {
            assert_eq!(csv_escape(input), expected);
        }
    }

    /// `html_escape` replaces the five HTML-significant characters with entities.
    #[test]
    fn test_html_escape() {
        let cases = [
            ("simple", "simple"),
            ("has<tag>", "has&lt;tag&gt;"),
            ("has\"quote", "has&quot;quote"),
            ("has'apostrophe", "has&#x27;apostrophe"),
            ("has&ampersand", "has&amp;ampersand"),
        ];

        for (input, expected) in cases {
            assert_eq!(html_escape(input), expected);
        }
    }

    /// The default configuration matches the documented defaults field by field.
    #[test]
    fn test_export_config_defaults() {
        let ExportConfig {
            format,
            show_char_intervals,
            include_text,
            highlight_extractions,
            include_statistics,
            custom_css,
            title,
        } = ExportConfig::default();

        assert_eq!(format, ExportFormat::Text);
        assert!(!show_char_intervals);
        assert!(include_text);
        assert!(highlight_extractions);
        assert!(include_statistics);
        assert!(custom_css.is_none());
        assert!(title.is_none());
    }

    /// A document with empty text and no extractions reports that none were found.
    #[test]
    fn test_empty_document() {
        let blank = AnnotatedDocument {
            document_id: Some(String::from("empty")),
            text: Some(String::new()),
            extractions: None,
        };

        let rendered = export_document(&blank, &ExportConfig::default()).unwrap();

        assert!(rendered.contains("No extractions found"));
    }

    /// A document with no text at all still exports and says so.
    #[test]
    fn test_document_without_text() {
        let textless = AnnotatedDocument {
            document_id: Some(String::from("no_text")),
            text: None,
            extractions: None,
        };

        let rendered = export_document(&textless, &ExportConfig::default()).unwrap();

        assert!(rendered.contains("No text"));
    }

    /// Every export format succeeds on the sample document.
    #[test]
    fn test_export_format_variants() {
        let sample = create_sample_document();
        let all_formats = [
            ExportFormat::Text,
            ExportFormat::Html,
            ExportFormat::Markdown,
            ExportFormat::Json,
            ExportFormat::Csv,
        ];

        for format in all_formats {
            let config = ExportConfig {
                format,
                ..Default::default()
            };
            let outcome = export_document(&sample, &config);
            assert!(outcome.is_ok(), "Format {:?} failed", format);
        }
    }

    /// Highlighting wraps extractions in spans carrying class and text attributes.
    #[test]
    fn test_highlight_text_html() {
        let sample = create_sample_document();
        let body = sample.text.as_ref().unwrap();

        let highlighted = highlight_text_html(body, &sample).unwrap();

        assert_contains_all(
            &highlighted,
            &[
                "extraction-highlight",
                "data-class=\"person\"",
                "data-text=\"John Smith\"",
                "John Smith",
            ],
        );
    }

    /// Class counting groups extractions by `extraction_class`.
    #[test]
    fn test_count_extraction_classes() {
        // Only class and text vary; every other field is left empty.
        let extractions: Vec<Extraction> = [
            ("person", "John"),
            ("person", "Jane"),
            ("company", "TechCorp"),
        ]
        .iter()
        .map(|&(class, text)| Extraction {
            extraction_class: String::from(class),
            extraction_text: String::from(text),
            char_interval: None,
            alignment_status: None,
            extraction_index: None,
            group_index: None,
            description: None,
            attributes: Some(HashMap::new()),
            token_interval: None,
        })
        .collect();

        let tally = count_extraction_classes(&extractions);

        assert_eq!(tally.len(), 2);
        assert_eq!(tally.get("person"), Some(&2));
        assert_eq!(tally.get("company"), Some(&1));
    }
}
langextract_rust/visualization.rs

langextract_rust/
visualization.rs