langextract_rust/
visualization.rs

1//! Visualization utilities for annotated documents.
2
3use crate::{data::AnnotatedDocument, exceptions::LangExtractResult};
4use crate::pipeline::PipelineResult;
5use serde_json::{json, Value};
6use std::collections::HashMap;
7use crate::Extraction;
/// Export format options for visualization.
///
/// The enum is a plain, field-less tag, so it is `Copy` and fully comparable
/// and hashable (usable as a map or set key).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "cli", derive(clap::ValueEnum))]
pub enum ExportFormat {
    /// Simple text format (existing functionality)
    Text,
    /// Rich HTML with highlighting and interactivity
    Html,
    /// Structured markdown with summaries
    Markdown,
    /// Raw JSON export for analysis
    Json,
    /// CSV export for spreadsheet analysis
    Csv,
}
23
/// Configuration for visualization exports.
///
/// One value of this struct is passed to every exporter; each exporter reads
/// only the fields that are relevant to its output format.
#[derive(Debug, Clone)]
pub struct ExportConfig {
    /// Export format to use; `export_document` dispatches on this value
    pub format: ExportFormat,
    /// Show character intervals in output
    pub show_char_intervals: bool,
    /// Include original text in export
    pub include_text: bool,
    /// Highlight extractions in text (for HTML/Markdown)
    pub highlight_extractions: bool,
    /// Include extraction statistics
    pub include_statistics: bool,
    /// Custom CSS for HTML export
    pub custom_css: Option<String>,
    /// Title for the export; the HTML exporters fall back to a built-in title when `None`
    pub title: Option<String>,
    /// Aggregate highlights across pipeline steps (pipeline HTML export)
    pub aggregate_pipeline_highlights: bool,
    /// Expand nested JSON extraction_text into atomic extractions when possible
    pub expand_nested_json: bool,
    /// Allow overlapping highlights (layered rendering)
    pub allow_overlapping_highlights: bool,
    /// Show legend for pipeline steps/colors
    pub show_pipeline_legend: bool,
}
50
51impl Default for ExportConfig {
52    fn default() -> Self {
53        Self {
54            format: ExportFormat::Text,
55            show_char_intervals: false,
56            include_text: true,
57            highlight_extractions: true,
58            include_statistics: true,
59            custom_css: None,
60            title: None,
61            aggregate_pipeline_highlights: false,
62            expand_nested_json: false,
63            allow_overlapping_highlights: false,
64            show_pipeline_legend: true,
65        }
66    }
67}
68
69/// Export an annotated document in the specified format
70pub fn export_document(
71    annotated_document: &AnnotatedDocument,
72    config: &ExportConfig,
73) -> LangExtractResult<String> {
74    match config.format {
75        ExportFormat::Text => visualize_text(annotated_document, config.show_char_intervals),
76        ExportFormat::Html => export_html(annotated_document, config),
77        ExportFormat::Markdown => export_markdown(annotated_document, config),
78        ExportFormat::Json => export_json(annotated_document, config),
79        ExportFormat::Csv => export_csv(annotated_document, config),
80    }
81}
82
83/// Export a pipeline result as rich HTML with layered highlights per step
84pub fn export_pipeline_html(
85    pipeline_result: &PipelineResult,
86    original_text: &str,
87    config: &ExportConfig,
88) -> LangExtractResult<String> {
89    let title = config.title.as_deref().unwrap_or("LangExtract Pipeline Results");
90
91    // Build layered spans from pipeline results, remapping to absolute intervals if needed
92    let mut spans: Vec<LayeredSpan> = build_layered_spans(pipeline_result, original_text, config.expand_nested_json);
93    spans.sort_by_key(|s| (s.start, s.end));
94
95    let mut html = String::new();
96    html.push_str("<!DOCTYPE html>\n");
97    html.push_str("<html lang=\"en\">\n");
98    html.push_str("<head>\n");
99    html.push_str("    <meta charset=\"UTF-8\">\n");
100    html.push_str("    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n");
101    html.push_str(&format!("    <title>{}</title>\n", title));
102    html.push_str("    <style>\n");
103    html.push_str("        body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; background: #f8fafc; color: #334155; }\n");
104    html.push_str("        .container { background: white; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1); overflow: hidden; }\n");
105    html.push_str("        .header { background: linear-gradient(135deg, #0ea5e9 0%, #6366f1 100%); color: white; padding: 30px; text-align: center; }\n");
106    html.push_str("        .header h1 { margin: 0; font-size: 2.2em; font-weight: 400; }\n");
107    html.push_str("        .content { padding: 30px; }\n");
108    html.push_str("        .section { margin-bottom: 32px; }\n");
109    html.push_str("        .section h2 { color: #1e293b; border-bottom: 2px solid #e2e8f0; padding-bottom: 10px; margin-bottom: 16px; }\n");
110    html.push_str("        .document-text { background: #f1f5f9; border-radius: 8px; padding: 16px; font-family: 'Monaco', 'Menlo', monospace; line-height: 1.6; white-space: pre-wrap; position: relative; }\n");
111    html.push_str("        .legend { display: flex; gap: 12px; flex-wrap: wrap; margin-bottom: 12px; }\n");
112    html.push_str("        .legend-item { display: inline-flex; align-items: center; gap: 8px; padding: 6px 10px; border: 1px solid #e2e8f0; border-radius: 6px; background: #fff; }\n");
113    html.push_str("        .badge { width: 12px; height: 12px; border-radius: 3px; display: inline-block; }\n");
114    html.push_str("        .extraction-highlight { border-radius: 3px; padding: 1px 2px; cursor: pointer; }\n");
115    html.push_str("        .step-0 { background: rgba(59, 130, 246, 0.2); border: 1px solid rgba(59, 130, 246, 0.4); }\n");
116    html.push_str("        .step-1 { background: rgba(16, 185, 129, 0.2); border: 1px solid rgba(16, 185, 129, 0.4); }\n");
117    html.push_str("        .step-2 { background: rgba(234, 179, 8, 0.2); border: 1px solid rgba(234, 179, 8, 0.5); }\n");
118    html.push_str("        .step-3 { background: rgba(244, 63, 94, 0.2); border: 1px solid rgba(244, 63, 94, 0.4); }\n");
119    html.push_str("        .step-4 { background: rgba(99, 102, 241, 0.2); border: 1px solid rgba(99, 102, 241, 0.4); }\n");
120    html.push_str("    </style>\n");
121    html.push_str("</head>\n");
122    html.push_str("<body>\n");
123    html.push_str("    <div class=\"container\">\n");
124    html.push_str("        <div class=\"header\">\n");
125    html.push_str(&format!("            <h1>{}</h1>\n", title));
126    html.push_str("        </div>\n");
127    html.push_str("        <div class=\"content\">\n");
128    html.push_str("            <div class=\"section\">\n");
129    html.push_str("                <h2>📄 Document Text</h2>\n");
130    if config.show_pipeline_legend {
131        html.push_str(&build_legend_html(pipeline_result));
132    }
133    html.push_str("                <div class=\"document-text\">");
134    html.push_str(&highlight_text_html_with_layers(original_text, &spans, config.allow_overlapping_highlights)?);
135    html.push_str("</div>\n");
136    html.push_str("            </div>\n");
137    html.push_str("            <div class=\"section\">\n");
138    html.push_str("                <h2>🎯 Extractions by Step</h2>\n");
139    html.push_str("                <div>\n");
140    html.push_str(&build_extractions_list_html(&spans));
141    html.push_str("                </div>\n");
142    html.push_str("            </div>\n");
143    html.push_str("        </div>\n");
144    html.push_str("    </div>\n");
145    html.push_str("</body>\n");
146    html.push_str("</html>\n");
147
148    Ok(html)
149}
150
151/// Export a flattened JSON view of pipeline results (one item per atomic extraction)
152pub fn export_pipeline_flattened_json(
153    pipeline_result: &PipelineResult,
154    original_text: &str,
155    expand_nested_json: bool,
156) -> LangExtractResult<String> {
157    // Helper to push one flattened item
158    fn push_item(
159        items: &mut Vec<Value>,
160        class_name: &str,
161        text: &str,
162        step_id: &str,
163        step_name: &str,
164        start: Option<usize>,
165        end: Option<usize>,
166        parent_attrs: Option<&std::collections::HashMap<String, Value>>,
167    ) {
168        let mut obj = serde_json::Map::new();
169        obj.insert("extraction_class".to_string(), Value::String(class_name.to_string()));
170        obj.insert("extraction_text".to_string(), Value::String(text.to_string()));
171        obj.insert("step_id".to_string(), Value::String(step_id.to_string()));
172        obj.insert("step_name".to_string(), Value::String(step_name.to_string()));
173        if let (Some(s), Some(e)) = (start, end) {
174            let mut ci = serde_json::Map::new();
175            ci.insert("start_pos".to_string(), Value::Number(serde_json::Number::from(s as u64)));
176            ci.insert("end_pos".to_string(), Value::Number(serde_json::Number::from(e as u64)));
177            obj.insert("char_interval".to_string(), Value::Object(ci));
178        }
179        if let Some(attrs) = parent_attrs {
180            if let Some(ps) = attrs.get("parent_step_id") {
181                obj.insert("parent_step_id".to_string(), ps.clone());
182            }
183            if let Some(ps) = attrs.get("parent_start") {
184                obj.insert("parent_start".to_string(), ps.clone());
185            }
186            if let Some(pe) = attrs.get("parent_end") {
187                obj.insert("parent_end".to_string(), pe.clone());
188            }
189        }
190        items.push(Value::Object(obj));
191    }
192
193    let mut items: Vec<Value> = Vec::new();
194
195    // Map step id to name
196    let mut step_id_to_name: std::collections::HashMap<&str, &str> = std::collections::HashMap::new();
197    for s in &pipeline_result.config.steps {
198        step_id_to_name.insert(s.id.as_str(), s.name.as_str());
199    }
200
201    for step_res in &pipeline_result.step_results {
202        let step_name = step_id_to_name.get(step_res.step_id.as_str()).copied().unwrap_or("");
203        for e in &step_res.extractions {
204            // Determine absolute positions; fall back to exact match
205            let (mut start, mut end) = (e.char_interval.as_ref().and_then(|ci| ci.start_pos), e.char_interval.as_ref().and_then(|ci| ci.end_pos));
206            if start.is_none() || end.is_none() {
207                if let Some(found) = original_text.find(&e.extraction_text) {
208                    start = Some(found);
209                    end = Some(found + e.extraction_text.len());
210                }
211            }
212
213            // Push the main extraction
214            push_item(
215                &mut items,
216                &e.extraction_class,
217                &e.extraction_text,
218                &step_res.step_id,
219                step_name,
220                start,
221                end,
222                e.attributes.as_ref(),
223            );
224
225            // Optionally expand nested JSON inside extraction_text
226            if expand_nested_json {
227                if let Ok(json_val) = serde_json::from_str::<Value>(&e.extraction_text) {
228                    // Depth-first walk and collect leaf strings
229                    fn collect(prefix: &str, val: &Value, out: &mut Vec<(String, String)>) {
230                        match val {
231                            Value::String(s) => out.push((prefix.to_string(), s.clone())),
232                            Value::Object(map) => {
233                                for (k, v) in map {
234                                    let p = if prefix.is_empty() { k.clone() } else { format!("{}:{}", prefix, k) };
235                                    collect(&p, v, out);
236                                }
237                            }
238                            Value::Array(arr) => {
239                                for (i, v) in arr.iter().enumerate() {
240                                    let p = if prefix.is_empty() { format!("[{}]", i) } else { format!("{}:[{}]", prefix, i) };
241                                    collect(&p, v, out);
242                                }
243                            }
244                            _ => {}
245                        }
246                    }
247
248                    let mut leafs: Vec<(String, String)> = Vec::new();
249                    collect(&e.extraction_class, &json_val, &mut leafs);
250
251                    for (cls, s) in leafs {
252                        if s.is_empty() { continue; }
253                        let (mut ls, mut le) = (None, None);
254                        if let Some(found) = original_text.find(&s) {
255                            ls = Some(found);
256                            le = Some(found + s.len());
257                        }
258                        push_item(
259                            &mut items,
260                            &cls,
261                            &s,
262                            &step_res.step_id,
263                            step_name,
264                            ls,
265                            le,
266                            e.attributes.as_ref(),
267                        );
268                    }
269                }
270            }
271        }
272    }
273
274    let mut root = serde_json::Map::new();
275    root.insert("extractions".to_string(), Value::Array(items));
276    let mut meta = serde_json::Map::new();
277    meta.insert("steps".to_string(), Value::Number(serde_json::Number::from(pipeline_result.config.steps.len() as u64)));
278    meta.insert("total_time_ms".to_string(), Value::Number(serde_json::Number::from(pipeline_result.total_time_ms)));
279    meta.insert("expand_nested_json".to_string(), Value::Bool(expand_nested_json));
280    root.insert("metadata".to_string(), Value::Object(meta));
281
282    Ok(serde_json::to_string_pretty(&Value::Object(root))?)
283}
284
/// One highlightable region of the original text, tagged with the pipeline
/// step that produced it.
#[derive(Debug, Clone)]
struct LayeredSpan {
    /// Byte offset in the original text where the span begins (used to slice the text).
    start: usize,
    /// Byte offset in the original text just past the end of the span.
    end: usize,
    /// Extraction class, or the `:`-joined path label for a span expanded from nested JSON.
    class_name: String,
    /// Extracted text this span represents.
    text: String,
    /// Index of the producing step in the pipeline config; selects the `step-N` CSS class.
    step_index: usize,
    /// Value of the extraction's `parent_step_id` attribute, when it is a string.
    parent_step_id: Option<String>,
    /// Value of the extraction's `parent_class` attribute, when it is a string.
    parent_class: Option<String>,
    /// Value of the extraction's `parent_text` attribute, when it is a string.
    parent_text: Option<String>,
}
296
297fn build_layered_spans(pipeline_result: &PipelineResult, original_text: &str, expand_nested_json: bool) -> Vec<LayeredSpan> {
298    // Map step_id to step index for stable coloring
299    let mut step_id_to_index: std::collections::HashMap<&str, usize> = std::collections::HashMap::new();
300    for (i, s) in pipeline_result.config.steps.iter().enumerate() {
301        step_id_to_index.insert(s.id.as_str(), i);
302    }
303
304    let mut spans = Vec::new();
305    for step_res in &pipeline_result.step_results {
306        let step_index = *step_id_to_index.get(step_res.step_id.as_str()).unwrap_or(&0);
307        for e in &step_res.extractions {
308            let mut added = false;
309            if let Some(interval) = &e.char_interval {
310                if let (Some(start), Some(end)) = (interval.start_pos, interval.end_pos) {
311                    if start < end && end <= original_text.len() {
312                        spans.push(LayeredSpan {
313                            start,
314                            end,
315                            class_name: e.extraction_class.clone(),
316                            text: e.extraction_text.clone(),
317                            step_index,
318                            parent_step_id: e.attributes.as_ref().and_then(|m| m.get("parent_step_id")).and_then(|v| v.as_str()).map(|s| s.to_string()),
319                            parent_class: e.attributes.as_ref().and_then(|m| m.get("parent_class")).and_then(|v| v.as_str()).map(|s| s.to_string()),
320                            parent_text: e.attributes.as_ref().and_then(|m| m.get("parent_text")).and_then(|v| v.as_str()).map(|s| s.to_string()),
321                        });
322                        added = true;
323                    }
324                }
325            }
326            if !added {
327                // Attempt exact match search in original text
328                if let Some(found) = original_text.find(&e.extraction_text) {
329                    let start = found;
330                    let end = start + e.extraction_text.len();
331                    if end <= original_text.len() {
332                        spans.push(LayeredSpan {
333                            start,
334                            end,
335                            class_name: e.extraction_class.clone(),
336                            text: e.extraction_text.clone(),
337                            step_index,
338                            parent_step_id: e.attributes.as_ref().and_then(|m| m.get("parent_step_id")).and_then(|v| v.as_str()).map(|s| s.to_string()),
339                            parent_class: e.attributes.as_ref().and_then(|m| m.get("parent_class")).and_then(|v| v.as_str()).map(|s| s.to_string()),
340                            parent_text: e.attributes.as_ref().and_then(|m| m.get("parent_text")).and_then(|v| v.as_str()).map(|s| s.to_string()),
341                        });
342                    }
343                }
344            }
345
346            // Optional nested JSON expansion: create child spans for string values found in the original text
347            if expand_nested_json {
348                if let Ok(json_val) = serde_json::from_str::<Value>(&e.extraction_text) {
349                    // Collect (class_name, text) pairs
350                    fn collect_strings(prefix: &str, val: &Value, out: &mut Vec<(String, String)>) {
351                        match val {
352                            Value::String(s) => {
353                                out.push((prefix.to_string(), s.clone()));
354                            }
355                            Value::Object(map) => {
356                                for (k, v) in map {
357                                    let new_prefix = if prefix.is_empty() { k.clone() } else { format!("{}:{}", prefix, k) };
358                                    collect_strings(&new_prefix, v, out);
359                                }
360                            }
361                            Value::Array(arr) => {
362                                for (i, v) in arr.iter().enumerate() {
363                                    let new_prefix = if prefix.is_empty() { format!("[{}]", i) } else { format!("{}:[{}]", prefix, i) };
364                                    collect_strings(&new_prefix, v, out);
365                                }
366                            }
367                            _ => {}
368                        }
369                    }
370
371                    let mut pairs: Vec<(String, String)> = Vec::new();
372                    collect_strings(&e.extraction_class, &json_val, &mut pairs);
373
374                    let parent_step_id = e.attributes.as_ref().and_then(|m| m.get("parent_step_id")).and_then(|v| v.as_str()).map(|s| s.to_string());
375
376                    for (class_name, s) in pairs {
377                        if !s.is_empty() {
378                            if let Some(found) = original_text.find(&s) {
379                                let start = found;
380                                let end = start + s.len();
381                                if end <= original_text.len() {
382                                    spans.push(LayeredSpan {
383                                        start,
384                                        end,
385                                        class_name: class_name.clone(),
386                                        text: s.clone(),
387                                        step_index,
388                                        parent_step_id: parent_step_id.clone(),
389                                        parent_class: e.attributes.as_ref().and_then(|m| m.get("parent_class")).and_then(|v| v.as_str()).map(|s| s.to_string()),
390                                        parent_text: e.attributes.as_ref().and_then(|m| m.get("parent_text")).and_then(|v| v.as_str()).map(|s| s.to_string()),
391                                    });
392                                }
393                            }
394                        }
395                    }
396                }
397            }
398        }
399    }
400    spans
401}
402
403fn build_legend_html(pipeline_result: &PipelineResult) -> String {
404    let mut step_id_to_index: std::collections::HashMap<&str, usize> = std::collections::HashMap::new();
405    for (i, s) in pipeline_result.config.steps.iter().enumerate() {
406        step_id_to_index.insert(s.id.as_str(), i);
407    }
408    let mut items = String::new();
409    for step in &pipeline_result.config.steps {
410        let idx = *step_id_to_index.get(step.id.as_str()).unwrap_or(&0);
411        items.push_str(&format!(r#"<span class="legend-item"><span class="badge step-{}"></span>Step {}: {}</span>"#, idx, idx + 1, html_escape(&step.name)));
412    }
413    format!(r#"<div class="legend">{}</div>"#, items)
414}
415
416fn build_extractions_list_html(spans: &[LayeredSpan]) -> String {
417    let mut grouped: std::collections::BTreeMap<usize, Vec<&LayeredSpan>> = std::collections::BTreeMap::new();
418    for s in spans {
419        grouped.entry(s.step_index).or_default().push(s);
420    }
421    let mut html = String::new();
422    for (step_idx, list) in grouped {
423        html.push_str(&format!(r#"<h3>Step {}</h3>"#, step_idx + 1));
424        html.push_str("<ul>");
425        for s in list {
426            let parent_info = match (&s.parent_class, &s.parent_text) {
427                (Some(pc), Some(pt)) if !pc.is_empty() && !pt.is_empty() => format!(" (parent: [{}] {})", html_escape(pc), html_escape(pt)),
428                _ => String::new(),
429            };
430            html.push_str(&format!(r#"<li><span class=\"step-{} extraction-highlight\">[{}] {}{}</span></li>"#, step_idx, html_escape(&s.class_name), html_escape(&s.text), parent_info));
431        }
432        html.push_str("</ul>");
433    }
434    html
435}
436
437/// Build HTML of text with layered spans. Currently uses non-overlapping simplification.
438fn highlight_text_html_with_layers(
439    text: &str,
440    spans: &[LayeredSpan],
441    allow_overlaps: bool,
442) -> LangExtractResult<String> {
443    if !allow_overlaps {
444        let mut intervals: Vec<(usize, usize, usize)> = spans
445            .iter()
446            .enumerate()
447            .filter_map(|(i, s)| if s.start < s.end && s.end <= text.len() { Some((s.start, s.end, i)) } else { None })
448            .collect();
449        intervals.sort_by_key(|(start, end, _)| (*start, *end));
450
451        let mut result = String::new();
452        let mut last_pos = 0usize;
453        let mut last_end = 0usize;
454        for (start, end, idx) in intervals {
455            if start < last_end { continue; }
456            let safe_start = find_char_boundary(text, start);
457            let safe_end = find_char_boundary(text, end);
458            if safe_start > last_pos {
459                result.push_str(&html_escape(&text[last_pos..safe_start]));
460            }
461            if safe_start < safe_end {
462                let s = &spans[idx];
463                let seg = &text[safe_start..safe_end];
464                result.push_str(&format!(
465                    r#"<span class="extraction-highlight step-{}" data-class="{}" data-text="{}" data-parent-class="{}">{}</span>"#,
466                    s.step_index,
467                    html_escape(&s.class_name),
468                    html_escape(&s.text),
469                    html_escape(s.parent_class.as_deref().unwrap_or("")),
470                    html_escape(seg)
471                ));
472                last_pos = safe_end;
473                last_end = safe_end;
474            }
475        }
476        if last_pos < text.len() {
477            result.push_str(&html_escape(&text[last_pos..]));
478        }
479        return Ok(result);
480    }
481
482    let mut events: Vec<(usize, bool, usize)> = Vec::new();
483    for (i, s) in spans.iter().enumerate() {
484        if s.start < s.end && s.end <= text.len() {
485            events.push((s.start, true, i));
486            events.push((s.end, false, i));
487        }
488    }
489    events.sort_by_key(|(pos, is_start, idx)| (*pos, !*is_start, spans[*idx].step_index));
490
491    let mut result = String::new();
492    let mut cursor = 0usize;
493    let mut open: Vec<usize> = Vec::new();
494
495    let mut push_plain = |from: usize, to: usize, out: &mut String| {
496        if to > from {
497            out.push_str(&html_escape(&text[from..to]));
498        }
499    };
500
501    for (pos, is_start, idx) in events {
502        let safe_pos = find_char_boundary(text, pos);
503        if is_start {
504            push_plain(cursor, safe_pos, &mut result);
505            let s = &spans[idx];
506            result.push_str(&format!(
507                r#"<span class="extraction-highlight step-{}" data-class="{}" data-text="{}" data-parent-class="{}">{}</span>"#,
508                s.step_index,
509                html_escape(&s.class_name),
510                html_escape(&s.text),
511                html_escape(s.parent_class.as_deref().unwrap_or("")),
512                html_escape(&text[cursor..safe_pos])
513            ));
514            open.push(idx);
515            cursor = safe_pos;
516        } else {
517            push_plain(cursor, safe_pos, &mut result);
518            cursor = safe_pos;
519            if let Some(pos_in_open) = open.iter().rposition(|&j| j == idx) {
520                for _ in pos_in_open..open.len() {
521                    result.push_str("</span>");
522                }
523                open.remove(pos_in_open);
524                for j in open.iter().copied() {
525                    let s = &spans[j];
526                    result.push_str(&format!(
527                        r#"<span class="extraction-highlight step-{}" data-class="{}" data-text="{}" data-parent-class="{}">{}</span>"#,
528                        s.step_index,
529                        html_escape(&s.class_name),
530                        html_escape(&s.text),
531                        html_escape(s.parent_class.as_deref().unwrap_or("")),
532                        html_escape(&text[cursor..safe_pos])
533                    ));
534                }
535            }
536        }
537    }
538    push_plain(cursor, text.len(), &mut result);
539    for _ in 0..open.len() { result.push_str("</span>"); }
540    Ok(result)
541}
542
543/// Visualize an annotated document (legacy function for backward compatibility)
544pub fn visualize(
545    annotated_document: &AnnotatedDocument,
546    show_char_intervals: bool,
547) -> LangExtractResult<String> {
548    visualize_text(annotated_document, show_char_intervals)
549}
550
551/// Export as simple text format (original implementation)
552fn visualize_text(
553    annotated_document: &AnnotatedDocument,
554    show_char_intervals: bool,
555) -> LangExtractResult<String> {
556    let mut result = String::new();
557    
558    result.push_str("📄 EXTRACTION VISUALIZATION\n");
559    result.push_str("=" .repeat(50).as_str());
560    result.push('\n');
561    
562    // Show document text
563    let text = annotated_document.text.as_deref().unwrap_or("No text");
564    result.push_str(&format!("📝 Document Text ({} chars):\n", text.len()));
565    result.push_str(&format!("   {}\n\n", text));
566    
567    // Show extractions
568    if let Some(extractions) = &annotated_document.extractions {
569        result.push_str(&format!("🎯 Found {} Extractions:\n", extractions.len()));
570        result.push_str("-".repeat(30).as_str());
571        result.push('\n');
572        
573        for (i, extraction) in extractions.iter().enumerate() {
574            result.push_str(&format!("{}. [{}] {}\n", 
575                i + 1, 
576                extraction.extraction_class, 
577                extraction.extraction_text
578            ));
579            
580            if show_char_intervals {
581                if let Some(interval) = &extraction.char_interval {
582                    result.push_str(&format!("   Position: {:?}\n", interval));
583                }
584            }
585            
586            if let Some(description) = &extraction.description {
587                result.push_str(&format!("   Description: {}\n", description));
588            }
589            
590            result.push('\n');
591        }
592    } else {
593        result.push_str("ℹ️  No extractions found\n");
594    }
595    
596    // Show statistics
597    result.push_str("📊 Statistics:\n");
598    result.push_str("-".repeat(15).as_str());
599    result.push('\n');
600    result.push_str(&format!("   Document ID: {}\n", 
601        annotated_document.document_id.as_deref().unwrap_or("None")));
602    result.push_str(&format!("   Text Length: {} characters\n", text.len()));
603    result.push_str(&format!("   Total Extractions: {}\n", annotated_document.extraction_count()));
604    
605    if let Some(extractions) = &annotated_document.extractions {
606        // Count unique extraction classes
607        let mut class_counts = std::collections::HashMap::new();
608        for extraction in extractions {
609            *class_counts.entry(&extraction.extraction_class).or_insert(0) += 1;
610        }
611        
612        result.push_str("   Extraction Classes:\n");
613        for (class, count) in class_counts {
614            result.push_str(&format!("     • {}: {} instance(s)\n", class, count));
615        }
616    }
617    
618    Ok(result)
619}
620
621/// Export as rich HTML with highlighting and interactivity
622fn export_html(
623    annotated_document: &AnnotatedDocument,
624    config: &ExportConfig,
625) -> LangExtractResult<String> {
626    let title = config.title.as_deref().unwrap_or("LangExtract Results");
627    let text = annotated_document.text.as_deref().unwrap_or("No text");
628    
629    let mut html = String::new();
630    
631    // HTML Header
632    html.push_str(&format!(r#"<!DOCTYPE html>
633<html lang="en">
634<head>
635    <meta charset="UTF-8">
636    <meta name="viewport" content="width=device-width, initial-scale=1.0">
637    <title>{}</title>
638    <style>
639        body {{
640            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
641            max-width: 1200px;
642            margin: 0 auto;
643            padding: 20px;
644            background: #f8fafc;
645            color: #334155;
646        }}
647        .container {{
648            background: white;
649            border-radius: 12px;
650            box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
651            overflow: hidden;
652        }}
653        .header {{
654            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
655            color: white;
656            padding: 30px;
657            text-align: center;
658        }}
659        .header h1 {{
660            margin: 0;
661            font-size: 2.5em;
662            font-weight: 300;
663        }}
664        .content {{
665            padding: 30px;
666        }}
667        .section {{
668            margin-bottom: 40px;
669        }}
670        .section h2 {{
671            color: #1e293b;
672            border-bottom: 2px solid #e2e8f0;
673            padding-bottom: 10px;
674            margin-bottom: 20px;
675        }}
676        .document-text {{
677            background: #f1f5f9;
678            border-radius: 8px;
679            padding: 20px;
680            font-family: 'Monaco', 'Menlo', monospace;
681            line-height: 1.6;
682            white-space: pre-wrap;
683            position: relative;
684            margin-bottom: 20px;
685        }}
686        .extraction-highlight {{
687            background: rgba(59, 130, 246, 0.2);
688            border: 1px solid rgba(59, 130, 246, 0.4);
689            border-radius: 3px;
690            padding: 1px 2px;
691            cursor: pointer;
692            transition: all 0.2s ease;
693        }}
694        .extraction-highlight:hover {{
695            background: rgba(59, 130, 246, 0.3);
696            transform: translateY(-1px);
697        }}
698        .extractions-grid {{
699            display: grid;
700            grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
701            gap: 20px;
702            margin-bottom: 30px;
703        }}
704        .extraction-card {{
705            background: #f8fafc;
706            border: 1px solid #e2e8f0;
707            border-radius: 8px;
708            padding: 15px;
709            transition: all 0.2s ease;
710        }}
711        .extraction-card:hover {{
712            border-color: #3b82f6;
713            box-shadow: 0 4px 12px rgba(59, 130, 246, 0.15);
714        }}
715        .extraction-class {{
716            background: #3b82f6;
717            color: white;
718            padding: 4px 8px;
719            border-radius: 4px;
720            font-size: 0.8em;
721            font-weight: 600;
722            display: inline-block;
723            margin-bottom: 8px;
724        }}
725        .extraction-text {{
726            font-weight: 600;
727            color: #1e293b;
728            margin-bottom: 8px;
729        }}
730        .extraction-meta {{
731            font-size: 0.9em;
732            color: #64748b;
733        }}
734        .stats-grid {{
735            display: grid;
736            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
737            gap: 20px;
738        }}
739        .stat-card {{
740            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
741            color: white;
742            padding: 20px;
743            border-radius: 8px;
744            text-align: center;
745        }}
746        .stat-number {{
747            font-size: 2em;
748            font-weight: bold;
749            margin-bottom: 5px;
750        }}
751        .stat-label {{
752            opacity: 0.9;
753            font-size: 0.9em;
754        }}
755        .class-counts {{
756            background: #f1f5f9;
757            border-radius: 8px;
758            padding: 20px;
759        }}
760        .class-count-item {{
761            display: flex;
762            justify-content: space-between;
763            align-items: center;
764            padding: 8px 0;
765            border-bottom: 1px solid #e2e8f0;
766        }}
767        .class-count-item:last-child {{
768            border-bottom: none;
769        }}
770        .class-badge {{
771            background: #10b981;
772            color: white;
773            padding: 2px 6px;
774            border-radius: 12px;
775            font-size: 0.8em;
776            font-weight: 600;
777        }}
778        {}
779    </style>
780</head>
781<body>
782"#, title, config.custom_css.as_deref().unwrap_or("")));
783
784    // Header
785    html.push_str(&format!(r#"    <div class="container">
786        <div class="header">
787            <h1>{}</h1>
788        </div>
789        <div class="content">
790"#, title));
791
792    // Document text section (with highlighting if enabled)
793    if config.include_text {
794        html.push_str(r#"            <div class="section">
795                <h2>📄 Document Text</h2>
796                <div class="document-text">"#);
797        
798        if config.highlight_extractions {
799            html.push_str(&highlight_text_html(text, annotated_document)?);
800        } else {
801            html.push_str(&html_escape(text));
802        }
803        
804        html.push_str("</div>\n            </div>\n");
805    }
806
807    // Extractions section
808    if let Some(extractions) = &annotated_document.extractions {
809        html.push_str(&format!(r#"            <div class="section">
810                <h2>🎯 Extractions ({} found)</h2>
811                <div class="extractions-grid">
812"#, extractions.len()));
813
814        for extraction in extractions {
815            html.push_str(&format!(r#"                    <div class="extraction-card">
816                        <div class="extraction-class">{}</div>
817                        <div class="extraction-text">{}</div>
818"#, html_escape(&extraction.extraction_class), html_escape(&extraction.extraction_text)));
819
820            if config.show_char_intervals {
821                if let Some(interval) = &extraction.char_interval {
822                    html.push_str(&format!(r#"                        <div class="extraction-meta">Position: {}-{}</div>
823"#, interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
824                }
825            }
826
827            if let Some(description) = &extraction.description {
828                html.push_str(&format!(r#"                        <div class="extraction-meta">Description: {}</div>
829"#, html_escape(description)));
830            }
831
832            html.push_str("                    </div>\n");
833        }
834        
835        html.push_str("                </div>\n            </div>\n");
836    }
837
838    // Statistics section
839    if config.include_statistics {
840        html.push_str(r#"            <div class="section">
841                <h2>📊 Statistics</h2>
842                <div class="stats-grid">
843"#);
844
845        let extraction_count = annotated_document.extraction_count();
846        html.push_str(&format!(r#"                    <div class="stat-card">
847                        <div class="stat-number">{}</div>
848                        <div class="stat-label">Total Extractions</div>
849                    </div>
850                    <div class="stat-card">
851                        <div class="stat-number">{}</div>
852                        <div class="stat-label">Characters</div>
853                    </div>
854"#, extraction_count, text.len()));
855
856        if let Some(extractions) = &annotated_document.extractions {
857            let class_counts = count_extraction_classes(extractions);
858            html.push_str(&format!(r#"                    <div class="stat-card">
859                        <div class="stat-number">{}</div>
860                        <div class="stat-label">Unique Classes</div>
861                    </div>
862"#, class_counts.len()));
863
864            html.push_str("                </div>\n");
865            
866            // Class breakdown
867            html.push_str(r#"                <h3>Extraction Classes</h3>
868                <div class="class-counts">
869"#);
870            
871            for (class, count) in class_counts {
872                html.push_str(&format!(r#"                    <div class="class-count-item">
873                        <span>{}</span>
874                        <span class="class-badge">{}</span>
875                    </div>
876"#, html_escape(class), count));
877            }
878            
879            html.push_str("                </div>\n");
880        } else {
881            html.push_str("                </div>\n");
882        }
883        
884        html.push_str("            </div>\n");
885    }
886
887    // Footer
888    html.push_str(r#"        </div>
889    </div>
890    
891    <script>
892        // Add interactivity for extraction highlights
893        document.querySelectorAll('.extraction-highlight').forEach(element => {
894            element.addEventListener('click', function() {
895                const className = this.getAttribute('data-class');
896                const text = this.getAttribute('data-text');
897                alert(`Extraction: ${className}\nText: ${text}`);
898            });
899        });
900    </script>
901</body>
902</html>"#);
903
904    Ok(html)
905}
906
/// Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`)
/// so the result can be placed in element content or a quoted attribute.
///
/// Every other character is copied through unchanged.
fn html_escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }
    escaped
}
915
/// Snaps a byte offset to a valid UTF-8 character boundary of `text`.
///
/// Offsets at or beyond the end of `text` are clamped to `text.len()`.
/// An offset that falls inside a multi-byte character is moved *backwards*
/// to the start of that character; an offset already on a boundary is
/// returned unchanged.
fn find_char_boundary(text: &str, index: usize) -> usize {
    if index >= text.len() {
        return text.len();
    }

    // Offset 0 is always a boundary, so the search cannot come up empty;
    // the fallback only exists to avoid an unwrap.
    (0..=index)
        .rev()
        .find(|&candidate| text.is_char_boundary(candidate))
        .unwrap_or(0)
}
935
936/// Helper function to highlight extractions in text
937fn highlight_text_html(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
938    if let Some(extractions) = &annotated_document.extractions {
939        // Collect all valid intervals with their extraction info
940        let mut intervals: Vec<(usize, usize, &Extraction)> = Vec::new();
941        
942        for extraction in extractions {
943            if let Some(interval) = &extraction.char_interval {
944                if let (Some(start), Some(end)) = (interval.start_pos, interval.end_pos) {
945                    if start < end && end <= text.len() {
946                        intervals.push((start, end, extraction));
947                    }
948                }
949            }
950        }
951        
952        // Sort by start position
953        intervals.sort_by_key(|(start, _, _)| *start);
954        
955        // Remove overlapping intervals - keep the first one when intervals overlap
956        let mut filtered_intervals = Vec::new();
957        let mut last_end = 0;
958        
959        for (start, end, extraction) in intervals {
960            if start >= last_end {
961                filtered_intervals.push((start, end, extraction));
962                last_end = end;
963            } else {
964                // Skip overlapping interval, but log it for debugging
965                log::debug!("Skipping overlapping extraction: '{}' at {}-{} (overlaps with previous ending at {})", 
966                    extraction.extraction_text, start, end, last_end);
967            }
968        }
969        
970        // Now build the HTML with non-overlapping intervals
971        let mut result = String::new();
972        let mut last_pos = 0;
973        
974        for (start, end, extraction) in filtered_intervals {
975            // Ensure we're at valid UTF-8 boundaries
976            let safe_start = find_char_boundary(text, start);
977            let safe_end = find_char_boundary(text, end);
978            
979            // Add text before this extraction
980            if safe_start > last_pos {
981                let safe_last_pos = find_char_boundary(text, last_pos);
982                if safe_last_pos < safe_start {
983                    result.push_str(&html_escape(&text[safe_last_pos..safe_start]));
984                }
985            }
986            
987            // Add the highlighted extraction (only if we have valid boundaries)
988            if safe_start < safe_end && safe_end <= text.len() {
989                let actual_text = &text[safe_start..safe_end];
990                result.push_str(&format!(
991                    r#"<span class="extraction-highlight" data-class="{}" data-text="{}">{}</span>"#,
992                    html_escape(&extraction.extraction_class),
993                    html_escape(&extraction.extraction_text),
994                    html_escape(actual_text)
995                ));
996                last_pos = safe_end;
997            } else {
998                // Skip invalid boundaries but log for debugging
999                log::debug!("Skipping extraction with invalid UTF-8 boundaries: '{}' at {}-{}", 
1000                    extraction.extraction_text, start, end);
1001            }
1002        }
1003        
1004        // Add remaining text
1005        if last_pos < text.len() {
1006            let safe_last_pos = find_char_boundary(text, last_pos);
1007            if safe_last_pos < text.len() {
1008                result.push_str(&html_escape(&text[safe_last_pos..]));
1009            }
1010        }
1011        
1012        Ok(result)
1013    } else {
1014        Ok(html_escape(text))
1015    }
1016}
1017
1018/// Helper function to count extraction classes
1019fn count_extraction_classes(extractions: &[crate::data::Extraction]) -> HashMap<&str, usize> {
1020    let mut class_counts = HashMap::new();
1021    for extraction in extractions {
1022        *class_counts.entry(extraction.extraction_class.as_str()).or_insert(0) += 1;
1023    }
1024    class_counts
1025}
1026
1027/// Export as structured markdown with extraction summaries
1028fn export_markdown(
1029    annotated_document: &AnnotatedDocument,
1030    config: &ExportConfig,
1031) -> LangExtractResult<String> {
1032    let title = config.title.as_deref().unwrap_or("LangExtract Results");
1033    let text = annotated_document.text.as_deref().unwrap_or("No text");
1034    
1035    let mut md = String::new();
1036    
1037    // Title
1038    md.push_str(&format!("# {}\n\n", title));
1039    
1040    // Document text section
1041    if config.include_text {
1042        md.push_str("## 📄 Document Text\n\n");
1043        
1044        if config.highlight_extractions {
1045            md.push_str(&highlight_text_markdown(text, annotated_document)?);
1046        } else {
1047            md.push_str(&format!("```\n{}\n```\n", text));
1048        }
1049        
1050        md.push_str("\n");
1051    }
1052    
1053    // Extractions section
1054    if let Some(extractions) = &annotated_document.extractions {
1055        md.push_str(&format!("## 🎯 Extractions ({} found)\n\n", extractions.len()));
1056        
1057        for (i, extraction) in extractions.iter().enumerate() {
1058            md.push_str(&format!("### {}. {}\n\n", i + 1, extraction.extraction_class));
1059            md.push_str(&format!("**Text:** {}\n\n", extraction.extraction_text));
1060            
1061            if config.show_char_intervals {
1062                if let Some(interval) = &extraction.char_interval {
1063                    md.push_str(&format!("**Position:** {}-{}\n\n", interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
1064                }
1065            }
1066            
1067            if let Some(description) = &extraction.description {
1068                md.push_str(&format!("**Description:** {}\n\n", description));
1069            }
1070        }
1071    }
1072    
1073    // Statistics section
1074    if config.include_statistics {
1075        md.push_str("## 📊 Statistics\n\n");
1076        
1077        let extraction_count = annotated_document.extraction_count();
1078        md.push_str(&format!("- **Total Extractions:** {}\n", extraction_count));
1079        md.push_str(&format!("- **Text Length:** {} characters\n", text.len()));
1080        
1081        if let Some(extractions) = &annotated_document.extractions {
1082            let class_counts = count_extraction_classes(extractions);
1083            md.push_str(&format!("- **Unique Classes:** {}\n\n", class_counts.len()));
1084            
1085            md.push_str("### Extraction Classes\n\n");
1086            md.push_str("| Class | Count |\n");
1087            md.push_str("|-------|-------|\n");
1088            
1089            for (class, count) in class_counts {
1090                md.push_str(&format!("| {} | {} |\n", class, count));
1091            }
1092        }
1093        
1094        md.push_str("\n");
1095    }
1096    
1097    Ok(md)
1098}
1099
1100/// Helper function to highlight extractions in markdown
1101fn highlight_text_markdown(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
1102    if let Some(extractions) = &annotated_document.extractions {
1103        let mut result = String::new();
1104        let mut last_pos = 0;
1105        
1106        // Sort extractions by start position
1107        let mut sorted_extractions: Vec<_> = extractions.iter().collect();
1108        sorted_extractions.sort_by_key(|e| {
1109            e.char_interval.as_ref().and_then(|i| i.start_pos).unwrap_or(usize::MAX)
1110        });
1111        
1112        result.push_str("```\n");
1113        
1114        for extraction in sorted_extractions {
1115            if let Some(interval) = &extraction.char_interval {
1116                // Add text before the extraction
1117                if interval.start_pos.unwrap_or(0) > last_pos && interval.start_pos.unwrap_or(0) <= text.len() {
1118                    result.push_str(&text[last_pos..interval.start_pos.unwrap_or(0)]);
1119                }
1120                
1121                // Add highlighted extraction with markdown bold
1122                if interval.end_pos.unwrap_or(0) <= text.len() && interval.start_pos.unwrap_or(0) < interval.end_pos.unwrap_or(0) {
1123                    let extraction_text = &text[interval.start_pos.unwrap_or(0)..interval.end_pos.unwrap_or(0)];
1124                    result.push_str(&format!("**{}**", extraction_text));
1125                    last_pos = interval.end_pos.unwrap_or(0);
1126                }
1127            }
1128        }
1129        
1130        // Add remaining text
1131        if last_pos < text.len() {
1132            result.push_str(&text[last_pos..]);
1133        }
1134        
1135        result.push_str("\n```\n");
1136        Ok(result)
1137    } else {
1138        Ok(format!("```\n{}\n```\n", text))
1139    }
1140}
1141
1142/// Export as JSON for analysis
1143fn export_json(
1144    annotated_document: &AnnotatedDocument,
1145    config: &ExportConfig,
1146) -> LangExtractResult<String> {
1147    let mut json_data = json!({
1148        "document_id": annotated_document.document_id,
1149        "export_config": {
1150            "format": "json",
1151            "show_char_intervals": config.show_char_intervals,
1152            "include_text": config.include_text,
1153            "include_statistics": config.include_statistics,
1154            "title": config.title
1155        }
1156    });
1157    
1158    // Add text if requested
1159    if config.include_text {
1160        json_data["text"] = json!(annotated_document.text);
1161    }
1162    
1163    // Add extractions
1164    if let Some(extractions) = &annotated_document.extractions {
1165        let extractions_json: Vec<Value> = extractions.iter().map(|extraction| {
1166            let mut ext_json = json!({
1167                "extraction_class": extraction.extraction_class,
1168                "extraction_text": extraction.extraction_text,
1169                "description": extraction.description
1170            });
1171            
1172            if config.show_char_intervals {
1173                if let Some(interval) = &extraction.char_interval {
1174                    ext_json["char_interval"] = json!({
1175                        "start_char": interval.start_pos.unwrap_or(0),
1176                        "end_char": interval.end_pos.unwrap_or(0),
1177                        "alignment_status": extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string())
1178                    });
1179                }
1180            }
1181            
1182            if let Some(group_index) = extraction.group_index {
1183                ext_json["group_index"] = json!(group_index);
1184            }
1185            
1186            ext_json
1187        }).collect();
1188        
1189        json_data["extractions"] = json!(extractions_json);
1190    }
1191    
1192    // Add statistics if requested
1193    if config.include_statistics {
1194        let text = annotated_document.text.as_deref().unwrap_or("");
1195        let mut stats = json!({
1196            "total_extractions": annotated_document.extraction_count(),
1197            "text_length": text.len()
1198        });
1199        
1200        if let Some(extractions) = &annotated_document.extractions {
1201            let class_counts = count_extraction_classes(extractions);
1202            stats["unique_classes"] = json!(class_counts.len());
1203            stats["extraction_classes"] = json!(class_counts);
1204        }
1205        
1206        json_data["statistics"] = stats;
1207    }
1208    
1209    Ok(serde_json::to_string_pretty(&json_data)?)
1210}
1211
1212/// Export as CSV for spreadsheet analysis
1213fn export_csv(
1214    annotated_document: &AnnotatedDocument,
1215    config: &ExportConfig,
1216) -> LangExtractResult<String> {
1217    let mut csv = String::new();
1218    
1219    // CSV Header
1220    if config.show_char_intervals {
1221        csv.push_str("extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index\n");
1222    } else {
1223        csv.push_str("extraction_class,extraction_text,description,group_index\n");
1224    }
1225    
1226    // CSV Rows
1227    if let Some(extractions) = &annotated_document.extractions {
1228        for extraction in extractions {
1229            let class = csv_escape(&extraction.extraction_class);
1230            let text = csv_escape(&extraction.extraction_text);
1231            let description = extraction.description.as_ref().map(|d| csv_escape(d)).unwrap_or_else(|| "".to_string());
1232            let group_index = extraction.group_index.map(|i| i.to_string()).unwrap_or_else(|| "".to_string());
1233            
1234            if config.show_char_intervals {
1235                if let Some(interval) = &extraction.char_interval {
1236                    csv.push_str(&format!("{},{},{},{},{},{:?},{}\n",
1237                        class, text, description,
1238                        interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0),
1239                        extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string()), group_index));
1240                } else {
1241                    csv.push_str(&format!("{},{},{},,,None,{}\n",
1242                        class, text, description, group_index));
1243                }
1244            } else {
1245                csv.push_str(&format!("{},{},{},{}\n",
1246                    class, text, description, group_index));
1247            }
1248        }
1249    }
1250    
1251    Ok(csv)
1252}
1253
/// Escapes a single CSV field.
///
/// A field containing a comma, a double quote, or a line break (`\n` or `\r`)
/// is wrapped in double quotes with embedded quotes doubled; any other field
/// is returned unchanged.
fn csv_escape(text: &str) -> String {
    // '\r' must trigger quoting too: a bare carriage return inside an unquoted
    // field is read as (part of) a record separator by CSV parsers.
    let needs_quoting = text
        .chars()
        .any(|c| matches!(c, ',' | '"' | '\n' | '\r'));

    if needs_quoting {
        format!("\"{}\"", text.replace('"', "\"\""))
    } else {
        text.to_string()
    }
}
1262
1263#[cfg(test)]
1264mod tests {
1265    use super::*;
1266    use crate::data::{AlignmentStatus, CharInterval, Extraction};
1267    use std::collections::HashMap;
1268    use crate::pipeline::{PipelineConfig, PipelineStep, StepResult, PipelineResult};
1269    use crate::ExtractConfig as LibExtractConfig;
1270
1271    fn create_sample_document() -> AnnotatedDocument {
1272        let text = "John Smith works at TechCorp and earns $50,000.";
1273        let extractions = vec![
1274            Extraction {
1275                extraction_class: "person".to_string(),
1276                extraction_text: "John Smith".to_string(),
1277                char_interval: Some(CharInterval::new(Some(0), Some(10))),
1278                alignment_status: Some(AlignmentStatus::MatchExact),
1279                extraction_index: Some(0),
1280                group_index: Some(0),
1281                description: Some("Person name".to_string()),
1282                attributes: Some(HashMap::new()),
1283                token_interval: None,
1284            },
1285            Extraction {
1286                extraction_class: "company".to_string(),
1287                extraction_text: "TechCorp".to_string(),
1288                char_interval: Some(CharInterval::new(Some(20), Some(28))),
1289                alignment_status: Some(AlignmentStatus::MatchExact),
1290                extraction_index: Some(1),
1291                group_index: Some(0),
1292                description: None,
1293                attributes: Some(HashMap::new()),
1294                token_interval: None,
1295            },
1296            Extraction {
1297                extraction_class: "salary".to_string(),
1298                extraction_text: "$50,000".to_string(),
1299                char_interval: Some(CharInterval::new(Some(39), Some(46))),
1300                alignment_status: Some(AlignmentStatus::MatchFuzzy),
1301                extraction_index: Some(2),
1302                group_index: Some(0),
1303                description: Some("Annual salary".to_string()),
1304                attributes: Some(HashMap::new()),
1305                token_interval: None,
1306            },
1307        ];
1308
1309        AnnotatedDocument {
1310            document_id: Some("test_doc".to_string()),
1311            text: Some(text.to_string()),
1312            extractions: Some(extractions),
1313        }
1314    }
1315
1316    #[test]
1317    fn test_text_export() {
1318        let document = create_sample_document();
1319        let config = ExportConfig {
1320            format: ExportFormat::Text,
1321            show_char_intervals: true,
1322            ..Default::default()
1323        };
1324
1325        let result = export_document(&document, &config).unwrap();
1326        
1327        assert!(result.contains("EXTRACTION VISUALIZATION"));
1328        assert!(result.contains("John Smith"));
1329        assert!(result.contains("TechCorp"));
1330        assert!(result.contains("$50,000"));
1331        assert!(result.contains("Position:"));
1332        assert!(result.contains("Statistics:"));
1333    }
1334
1335    #[test]
1336    fn test_html_export() {
1337        let document = create_sample_document();
1338        let config = ExportConfig {
1339            format: ExportFormat::Html,
1340            title: Some("Test HTML Export".to_string()),
1341            highlight_extractions: true,
1342            show_char_intervals: true,
1343            ..Default::default()
1344        };
1345
1346        let result = export_document(&document, &config).unwrap();
1347        
1348        assert!(result.contains("<!DOCTYPE html>"));
1349        assert!(result.contains("<title>Test HTML Export</title>"));
1350        assert!(result.contains("extraction-highlight"));
1351        assert!(result.contains("John Smith"));
1352        assert!(result.contains("TechCorp"));
1353        assert!(result.contains("extraction-card"));
1354        assert!(result.contains("stats-grid"));
1355        assert!(result.contains("</html>"));
1356    }
1357
1358    #[test]
1359    fn test_html_export_with_custom_css() {
1360        let document = create_sample_document();
1361        let custom_css = "body { background: red; }";
1362        let config = ExportConfig {
1363            format: ExportFormat::Html,
1364            custom_css: Some(custom_css.to_string()),
1365            ..Default::default()
1366        };
1367
1368        let result = export_document(&document, &config).unwrap();
1369        
1370        assert!(result.contains(custom_css));
1371    }
1372
1373    #[test]
1374    fn test_markdown_export() {
1375        let document = create_sample_document();
1376        let config = ExportConfig {
1377            format: ExportFormat::Markdown,
1378            title: Some("Test Markdown".to_string()),
1379            show_char_intervals: true,
1380            highlight_extractions: true,
1381            ..Default::default()
1382        };
1383
1384        let result = export_document(&document, &config).unwrap();
1385        
1386        assert!(result.starts_with("# Test Markdown"));
1387        assert!(result.contains("## 📄 Document Text"));
1388        assert!(result.contains("## 🎯 Extractions"));
1389        assert!(result.contains("### 1. person"));
1390        assert!(result.contains("**Text:** John Smith"));
1391        assert!(result.contains("**Position:** 0-10"));
1392        assert!(result.contains("| Class | Count |"));
1393        assert!(result.contains("| person | 1 |"));
1394    }
1395
1396    #[test]
1397    fn test_json_export() {
1398        let document = create_sample_document();
1399        let config = ExportConfig {
1400            format: ExportFormat::Json,
1401            show_char_intervals: true,
1402            include_text: true,
1403            include_statistics: true,
1404            ..Default::default()
1405        };
1406
1407        let result = export_document(&document, &config).unwrap();
1408        let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
1409        
1410        assert_eq!(parsed["document_id"], "test_doc");
1411        assert!(parsed["text"].is_string());
1412        assert!(parsed["extractions"].is_array());
1413        assert!(parsed["statistics"].is_object());
1414        
1415        let extractions = parsed["extractions"].as_array().unwrap();
1416        assert_eq!(extractions.len(), 3);
1417        
1418        let first_extraction = &extractions[0];
1419        assert_eq!(first_extraction["extraction_class"], "person");
1420        assert_eq!(first_extraction["extraction_text"], "John Smith");
1421        assert!(first_extraction["char_interval"].is_object());
1422        
1423        let stats = &parsed["statistics"];
1424        assert_eq!(stats["total_extractions"], 3);
1425        assert_eq!(stats["unique_classes"], 3);
1426    }
1427
1428    #[test]
1429    fn test_csv_export() {
1430        let document = create_sample_document();
1431        let config = ExportConfig {
1432            format: ExportFormat::Csv,
1433            show_char_intervals: true,
1434            ..Default::default()
1435        };
1436
1437        let result = export_document(&document, &config).unwrap();
1438        let lines: Vec<&str> = result.lines().collect();
1439        
1440        // Check header
1441        assert_eq!(lines[0], "extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index");
1442        
1443        // Check data rows
1444        assert_eq!(lines.len(), 4); // Header + 3 data rows
1445        assert!(lines[1].contains("person,John Smith"));
1446        assert!(lines[2].contains("company,TechCorp"));
1447        assert!(lines[3].contains("salary,\"$50,000\""));
1448        assert!(lines[1].contains("MatchExact"));
1449        assert!(lines[3].contains("MatchFuzzy"));
1450    }
1451
1452    #[test]
1453    fn test_csv_export_without_intervals() {
1454        let document = create_sample_document();
1455        let config = ExportConfig {
1456            format: ExportFormat::Csv,
1457            show_char_intervals: false,
1458            ..Default::default()
1459        };
1460
1461        let result = export_document(&document, &config).unwrap();
1462        let lines: Vec<&str> = result.lines().collect();
1463        
1464        // Check header
1465        assert_eq!(lines[0], "extraction_class,extraction_text,description,group_index");
1466        
1467        // Should not contain position columns
1468        assert!(!result.contains("start_char"));
1469        assert!(!result.contains("end_char"));
1470    }
1471
1472    #[test]
1473    fn test_csv_escape() {
1474        assert_eq!(csv_escape("simple"), "simple");
1475        assert_eq!(csv_escape("has,comma"), "\"has,comma\"");
1476        assert_eq!(csv_escape("has\"quote"), "\"has\"\"quote\"");
1477        assert_eq!(csv_escape("has\nnewline"), "\"has\nnewline\"");
1478        assert_eq!(csv_escape("has,comma\"and quote"), "\"has,comma\"\"and quote\"");
1479    }
1480
1481    #[test]
1482    fn test_html_escape() {
1483        assert_eq!(html_escape("simple"), "simple");
1484        assert_eq!(html_escape("has<tag>"), "has&lt;tag&gt;");
1485        assert_eq!(html_escape("has\"quote"), "has&quot;quote");
1486        assert_eq!(html_escape("has'apostrophe"), "has&#x27;apostrophe");
1487        assert_eq!(html_escape("has&ampersand"), "has&amp;ampersand");
1488    }
1489
1490    #[test]
1491    fn test_export_config_defaults() {
1492        let config = ExportConfig::default();
1493        assert_eq!(config.format, ExportFormat::Text);
1494        assert!(!config.show_char_intervals);
1495        assert!(config.include_text);
1496        assert!(config.highlight_extractions);
1497        assert!(config.include_statistics);
1498        assert!(config.custom_css.is_none());
1499        assert!(config.title.is_none());
1500    }
1501
1502    #[test]
1503    fn test_empty_document() {
1504        let document = AnnotatedDocument {
1505            document_id: Some("empty".to_string()),
1506            text: Some("".to_string()),
1507            extractions: None,
1508        };
1509
1510        let config = ExportConfig::default();
1511        let result = export_document(&document, &config).unwrap();
1512        
1513        assert!(result.contains("No extractions found"));
1514    }
1515
1516    #[test]
1517    fn test_document_without_text() {
1518        let document = AnnotatedDocument {
1519            document_id: Some("no_text".to_string()),
1520            text: None,
1521            extractions: None,
1522        };
1523
1524        let config = ExportConfig::default();
1525        let result = export_document(&document, &config).unwrap();
1526        
1527        assert!(result.contains("No text"));
1528    }
1529
1530    #[test]
1531    fn test_export_format_variants() {
1532        let document = create_sample_document();
1533        
1534        // Test all export formats don't panic
1535        for format in [ExportFormat::Text, ExportFormat::Html, ExportFormat::Markdown, ExportFormat::Json, ExportFormat::Csv] {
1536            let config = ExportConfig {
1537                format,
1538                ..Default::default()
1539            };
1540            let result = export_document(&document, &config);
1541            assert!(result.is_ok(), "Format {:?} failed", format);
1542        }
1543    }
1544
1545    #[test]
1546    fn test_highlight_text_html() {
1547        let document = create_sample_document();
1548        let text = document.text.as_ref().unwrap();
1549        
1550        let result = highlight_text_html(text, &document).unwrap();
1551        
1552        assert!(result.contains("extraction-highlight"));
1553        assert!(result.contains("data-class=\"person\""));
1554        assert!(result.contains("data-text=\"John Smith\""));
1555        assert!(result.contains("John Smith"));
1556    }
1557
1558    #[test]
1559    fn test_count_extraction_classes() {
1560        let extractions = vec![
1561            Extraction {
1562                extraction_class: "person".to_string(),
1563                extraction_text: "John".to_string(),
1564                char_interval: None,
1565                alignment_status: None,
1566                extraction_index: None,
1567                group_index: None,
1568                description: None,
1569                attributes: Some(HashMap::new()),
1570                token_interval: None,
1571            },
1572            Extraction {
1573                extraction_class: "person".to_string(),
1574                extraction_text: "Jane".to_string(),
1575                char_interval: None,
1576                alignment_status: None,
1577                extraction_index: None,
1578                group_index: None,
1579                description: None,
1580                attributes: Some(HashMap::new()),
1581                token_interval: None,
1582            },
1583            Extraction {
1584                extraction_class: "company".to_string(),
1585                extraction_text: "TechCorp".to_string(),
1586                char_interval: None,
1587                alignment_status: None,
1588                extraction_index: None,
1589                group_index: None,
1590                description: None,
1591                attributes: Some(HashMap::new()),
1592                token_interval: None,
1593            },
1594        ];
1595
1596        let counts = count_extraction_classes(&extractions);
1597        
1598        assert_eq!(counts.get("person"), Some(&2));
1599        assert_eq!(counts.get("company"), Some(&1));
1600        assert_eq!(counts.len(), 2);
1601    }
1602
1603    #[test]
1604    fn test_export_pipeline_html_renders_layers() {
1605        // Original text
1606        let text = "The system shall process 100 transactions per second.";
1607
1608        // Step definitions
1609        let steps = vec![
1610            PipelineStep {
1611                id: "s1".to_string(),
1612                name: "Extract Requirements".to_string(),
1613                description: "".to_string(),
1614                examples: vec![],
1615                prompt: "".to_string(),
1616                output_field: "requirements".to_string(),
1617                filter: None,
1618                depends_on: vec![],
1619            },
1620            PipelineStep {
1621                id: "s2".to_string(),
1622                name: "Extract Values".to_string(),
1623                description: "".to_string(),
1624                examples: vec![],
1625                prompt: "".to_string(),
1626                output_field: "values".to_string(),
1627                filter: None,
1628                depends_on: vec!["s1".to_string()],
1629            },
1630        ];
1631
1632        let cfg = PipelineConfig {
1633            name: "Test".to_string(),
1634            description: "".to_string(),
1635            version: "0.0.0".to_string(),
1636            steps: steps.clone(),
1637            global_config: LibExtractConfig::default(),
1638            enable_parallel_execution: false,
1639        };
1640
1641        // Compute positions
1642        let parent_start = 0usize;
1643        let parent_end = text.len();
1644        let hundred_idx = text.find("100").unwrap();
1645        let unit_idx = text.find("transactions per second").unwrap();
1646
1647        let step1_res = StepResult {
1648            step_id: "s1".to_string(),
1649            step_name: "Extract Requirements".to_string(),
1650            extractions: vec![Extraction {
1651                extraction_class: "requirement".to_string(),
1652                extraction_text: text.to_string(),
1653                char_interval: Some(CharInterval::new(Some(parent_start), Some(parent_end))),
1654                alignment_status: Some(AlignmentStatus::MatchExact),
1655                extraction_index: Some(0),
1656                group_index: None,
1657                description: None,
1658                attributes: Some(HashMap::new()),
1659                token_interval: None,
1660            }],
1661            processing_time_ms: 1,
1662            input_count: 1,
1663            success: true,
1664            error_message: None,
1665        };
1666
1667        let step2_res = StepResult {
1668            step_id: "s2".to_string(),
1669            step_name: "Extract Values".to_string(),
1670            extractions: vec![
1671                Extraction {
1672                    extraction_class: "value".to_string(),
1673                    extraction_text: "100".to_string(),
1674                    char_interval: Some(CharInterval::new(Some(hundred_idx), Some(hundred_idx + 3))),
1675                    alignment_status: Some(AlignmentStatus::MatchExact),
1676                    extraction_index: Some(0),
1677                    group_index: None,
1678                    description: None,
1679                    attributes: Some(HashMap::new()),
1680                    token_interval: None,
1681                },
1682                Extraction {
1683                    extraction_class: "unit".to_string(),
1684                    extraction_text: "transactions per second".to_string(),
1685                    char_interval: Some(CharInterval::new(Some(unit_idx), Some(unit_idx + "transactions per second".len()))),
1686                    alignment_status: Some(AlignmentStatus::MatchExact),
1687                    extraction_index: Some(1),
1688                    group_index: None,
1689                    description: None,
1690                    attributes: Some(HashMap::new()),
1691                    token_interval: None,
1692                }
1693            ],
1694            processing_time_ms: 1,
1695            input_count: 1,
1696            success: true,
1697            error_message: None,
1698        };
1699
1700        let pr = PipelineResult {
1701            config: cfg,
1702            step_results: vec![step1_res, step2_res],
1703            nested_output: serde_json::json!({}),
1704            total_time_ms: 2,
1705            success: true,
1706            error_message: None,
1707        };
1708
1709        let config = ExportConfig { format: ExportFormat::Html, ..Default::default() };
1710        let html = export_pipeline_html(&pr, text, &config).unwrap();
1711        assert!(html.contains("step-0"), "Should render step-0 (parent)");
1712        assert!(html.contains("step-1"), "Should render step-1 (child)");
1713        assert!(html.contains("100"));
1714        assert!(html.contains("transactions per second"));
1715    }
1716
1717    #[test]
1718    fn test_export_pipeline_html_exact_match_fallback() {
1719        let text = "System uptime must be 99.9% for availability.";
1720
1721        let steps = vec![
1722            PipelineStep { id: "s1".to_string(), name: "Req".to_string(), description: "".to_string(), examples: vec![], prompt: "".to_string(), output_field: "req".to_string(), filter: None, depends_on: vec![] },
1723            PipelineStep { id: "s2".to_string(), name: "Vals".to_string(), description: "".to_string(), examples: vec![], prompt: "".to_string(), output_field: "vals".to_string(), filter: None, depends_on: vec!["s1".to_string()] },
1724        ];
1725        let cfg = PipelineConfig { name: "T".to_string(), description: "".to_string(), version: "0".to_string(), steps, global_config: LibExtractConfig::default(), enable_parallel_execution: false };
1726
1727        let step1_res = StepResult {
1728            step_id: "s1".to_string(),
1729            step_name: "Req".to_string(),
1730            extractions: vec![Extraction {
1731                extraction_class: "requirement".to_string(),
1732                extraction_text: text.to_string(),
1733                char_interval: None, // Not strictly needed for this test
1734                alignment_status: None,
1735                extraction_index: None,
1736                group_index: None,
1737                description: None,
1738                attributes: Some(HashMap::new()),
1739                token_interval: None,
1740            }],
1741            processing_time_ms: 1,
1742            input_count: 1,
1743            success: true,
1744            error_message: None,
1745        };
1746
1747        let step2_res = StepResult {
1748            step_id: "s2".to_string(),
1749            step_name: "Vals".to_string(),
1750            extractions: vec![Extraction {
1751                extraction_class: "uptime".to_string(),
1752                extraction_text: "99.9%".to_string(),
1753                char_interval: None, // Force fallback
1754                alignment_status: None,
1755                extraction_index: None,
1756                group_index: None,
1757                description: None,
1758                attributes: Some(HashMap::new()),
1759                token_interval: None,
1760            }],
1761            processing_time_ms: 1,
1762            input_count: 1,
1763            success: true,
1764            error_message: None,
1765        };
1766
1767        let pr = PipelineResult { config: cfg, step_results: vec![step1_res, step2_res], nested_output: serde_json::json!({}), total_time_ms: 2, success: true, error_message: None };
1768
1769        let config = ExportConfig { format: ExportFormat::Html, ..Default::default() };
1770        let html = export_pipeline_html(&pr, text, &config).unwrap();
1771        assert!(html.contains("99.9%"), "Fallback should highlight exact match in original text");
1772    }
1773
1774    #[test]
1775    fn test_export_pipeline_html_overlap_rendering() {
1776        // Overlapping child spans inside a parent requirement
1777        let text = "The system shall support 10 users concurrently.";
1778
1779        let steps = vec![
1780            PipelineStep { id: "s1".to_string(), name: "Req".to_string(), description: "".to_string(), examples: vec![], prompt: "".to_string(), output_field: "req".to_string(), filter: None, depends_on: vec![] },
1781            PipelineStep { id: "s2".to_string(), name: "Vals".to_string(), description: "".to_string(), examples: vec![], prompt: "".to_string(), output_field: "vals".to_string(), filter: None, depends_on: vec!["s1".to_string()] },
1782        ];
1783        let cfg = PipelineConfig { name: "T".to_string(), description: "".to_string(), version: "0".to_string(), steps, global_config: LibExtractConfig::default(), enable_parallel_execution: false };
1784
1785        let parent_start = 0usize;
1786        let parent_end = text.len();
1787        let ten_idx = text.find("10").unwrap();
1788        let users_idx = text.find("10 users").unwrap();
1789
1790        let step1_res = StepResult {
1791            step_id: "s1".to_string(),
1792            step_name: "Req".to_string(),
1793            extractions: vec![Extraction {
1794                extraction_class: "requirement".to_string(),
1795                extraction_text: text.to_string(),
1796                char_interval: Some(CharInterval::new(Some(parent_start), Some(parent_end))),
1797                alignment_status: None,
1798                extraction_index: None,
1799                group_index: None,
1800                description: None,
1801                attributes: Some(HashMap::new()),
1802                token_interval: None,
1803            }],
1804            processing_time_ms: 1,
1805            input_count: 1,
1806            success: true,
1807            error_message: None,
1808        };
1809
1810        let step2_res = StepResult {
1811            step_id: "s2".to_string(),
1812            step_name: "Vals".to_string(),
1813            extractions: vec![
1814                Extraction {
1815                    extraction_class: "value".to_string(),
1816                    extraction_text: "10".to_string(),
1817                    char_interval: Some(CharInterval::new(Some(ten_idx), Some(ten_idx + 2))),
1818                    alignment_status: None,
1819                    extraction_index: None,
1820                    group_index: None,
1821                    description: None,
1822                    attributes: Some(HashMap::new()),
1823                    token_interval: None,
1824                },
1825                Extraction {
1826                    extraction_class: "phrase".to_string(),
1827                    extraction_text: "10 users".to_string(),
1828                    char_interval: Some(CharInterval::new(Some(users_idx), Some(users_idx + "10 users".len()))),
1829                    alignment_status: None,
1830                    extraction_index: None,
1831                    group_index: None,
1832                    description: None,
1833                    attributes: Some(HashMap::new()),
1834                    token_interval: None,
1835                },
1836            ],
1837            processing_time_ms: 1,
1838            input_count: 1,
1839            success: true,
1840            error_message: None,
1841        };
1842
1843        let pr = PipelineResult { config: cfg, step_results: vec![step1_res, step2_res], nested_output: serde_json::json!({}), total_time_ms: 2, success: true, error_message: None };
1844
1845        let mut config = ExportConfig { format: ExportFormat::Html, ..Default::default() };
1846        config.allow_overlapping_highlights = true;
1847        let html = export_pipeline_html(&pr, text, &config).unwrap();
1848        // Should include both occurrences
1849        assert!(html.contains("10"));
1850        assert!(html.contains("10 users"));
1851    }
1852}
langextract_rust/visualization.rs

langextract_rust/
visualization.rs