1use crate::{data::AnnotatedDocument, exceptions::LangExtractResult};
4use crate::pipeline::PipelineResult;
5use serde_json::{json, Value};
6use std::collections::HashMap;
7use crate::Extraction;
/// Output formats understood by [`export_document`].
///
/// The enum is a plain tag without payload, so it also derives `Eq` and
/// `Hash`; this lets callers use it as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "cli", derive(clap::ValueEnum))]
pub enum ExportFormat {
    /// Plain-text report.
    Text,
    /// Standalone HTML page.
    Html,
    /// Markdown document.
    Markdown,
    /// JSON output.
    Json,
    /// CSV output.
    Csv,
}
23
24#[derive(Debug, Clone)]
26pub struct ExportConfig {
27 pub format: ExportFormat,
29 pub show_char_intervals: bool,
31 pub include_text: bool,
33 pub highlight_extractions: bool,
35 pub include_statistics: bool,
37 pub custom_css: Option<String>,
39 pub title: Option<String>,
41 pub aggregate_pipeline_highlights: bool,
43 pub expand_nested_json: bool,
45 pub allow_overlapping_highlights: bool,
47 pub show_pipeline_legend: bool,
49}
50
51impl Default for ExportConfig {
52 fn default() -> Self {
53 Self {
54 format: ExportFormat::Text,
55 show_char_intervals: false,
56 include_text: true,
57 highlight_extractions: true,
58 include_statistics: true,
59 custom_css: None,
60 title: None,
61 aggregate_pipeline_highlights: false,
62 expand_nested_json: false,
63 allow_overlapping_highlights: false,
64 show_pipeline_legend: true,
65 }
66 }
67}
68
69pub fn export_document(
71 annotated_document: &AnnotatedDocument,
72 config: &ExportConfig,
73) -> LangExtractResult<String> {
74 match config.format {
75 ExportFormat::Text => visualize_text(annotated_document, config.show_char_intervals),
76 ExportFormat::Html => export_html(annotated_document, config),
77 ExportFormat::Markdown => export_markdown(annotated_document, config),
78 ExportFormat::Json => export_json(annotated_document, config),
79 ExportFormat::Csv => export_csv(annotated_document, config),
80 }
81}
82
83pub fn export_pipeline_html(
85 pipeline_result: &PipelineResult,
86 original_text: &str,
87 config: &ExportConfig,
88) -> LangExtractResult<String> {
89 let title = config.title.as_deref().unwrap_or("LangExtract Pipeline Results");
90
91 let mut spans: Vec<LayeredSpan> = build_layered_spans(pipeline_result, original_text, config.expand_nested_json);
93 spans.sort_by_key(|s| (s.start, s.end));
94
95 let mut html = String::new();
96 html.push_str("<!DOCTYPE html>\n");
97 html.push_str("<html lang=\"en\">\n");
98 html.push_str("<head>\n");
99 html.push_str(" <meta charset=\"UTF-8\">\n");
100 html.push_str(" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n");
101 html.push_str(&format!(" <title>{}</title>\n", title));
102 html.push_str(" <style>\n");
103 html.push_str(" body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; background: #f8fafc; color: #334155; }\n");
104 html.push_str(" .container { background: white; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1); overflow: hidden; }\n");
105 html.push_str(" .header { background: linear-gradient(135deg, #0ea5e9 0%, #6366f1 100%); color: white; padding: 30px; text-align: center; }\n");
106 html.push_str(" .header h1 { margin: 0; font-size: 2.2em; font-weight: 400; }\n");
107 html.push_str(" .content { padding: 30px; }\n");
108 html.push_str(" .section { margin-bottom: 32px; }\n");
109 html.push_str(" .section h2 { color: #1e293b; border-bottom: 2px solid #e2e8f0; padding-bottom: 10px; margin-bottom: 16px; }\n");
110 html.push_str(" .document-text { background: #f1f5f9; border-radius: 8px; padding: 16px; font-family: 'Monaco', 'Menlo', monospace; line-height: 1.6; white-space: pre-wrap; position: relative; }\n");
111 html.push_str(" .legend { display: flex; gap: 12px; flex-wrap: wrap; margin-bottom: 12px; }\n");
112 html.push_str(" .legend-item { display: inline-flex; align-items: center; gap: 8px; padding: 6px 10px; border: 1px solid #e2e8f0; border-radius: 6px; background: #fff; }\n");
113 html.push_str(" .badge { width: 12px; height: 12px; border-radius: 3px; display: inline-block; }\n");
114 html.push_str(" .extraction-highlight { border-radius: 3px; padding: 1px 2px; cursor: pointer; }\n");
115 html.push_str(" .step-0 { background: rgba(59, 130, 246, 0.2); border: 1px solid rgba(59, 130, 246, 0.4); }\n");
116 html.push_str(" .step-1 { background: rgba(16, 185, 129, 0.2); border: 1px solid rgba(16, 185, 129, 0.4); }\n");
117 html.push_str(" .step-2 { background: rgba(234, 179, 8, 0.2); border: 1px solid rgba(234, 179, 8, 0.5); }\n");
118 html.push_str(" .step-3 { background: rgba(244, 63, 94, 0.2); border: 1px solid rgba(244, 63, 94, 0.4); }\n");
119 html.push_str(" .step-4 { background: rgba(99, 102, 241, 0.2); border: 1px solid rgba(99, 102, 241, 0.4); }\n");
120 html.push_str(" </style>\n");
121 html.push_str("</head>\n");
122 html.push_str("<body>\n");
123 html.push_str(" <div class=\"container\">\n");
124 html.push_str(" <div class=\"header\">\n");
125 html.push_str(&format!(" <h1>{}</h1>\n", title));
126 html.push_str(" </div>\n");
127 html.push_str(" <div class=\"content\">\n");
128 html.push_str(" <div class=\"section\">\n");
129 html.push_str(" <h2>📄 Document Text</h2>\n");
130 if config.show_pipeline_legend {
131 html.push_str(&build_legend_html(pipeline_result));
132 }
133 html.push_str(" <div class=\"document-text\">");
134 html.push_str(&highlight_text_html_with_layers(original_text, &spans, config.allow_overlapping_highlights)?);
135 html.push_str("</div>\n");
136 html.push_str(" </div>\n");
137 html.push_str(" <div class=\"section\">\n");
138 html.push_str(" <h2>🎯 Extractions by Step</h2>\n");
139 html.push_str(" <div>\n");
140 html.push_str(&build_extractions_list_html(&spans));
141 html.push_str(" </div>\n");
142 html.push_str(" </div>\n");
143 html.push_str(" </div>\n");
144 html.push_str(" </div>\n");
145 html.push_str("</body>\n");
146 html.push_str("</html>\n");
147
148 Ok(html)
149}
150
151pub fn export_pipeline_flattened_json(
153 pipeline_result: &PipelineResult,
154 original_text: &str,
155 expand_nested_json: bool,
156) -> LangExtractResult<String> {
157 fn push_item(
159 items: &mut Vec<Value>,
160 class_name: &str,
161 text: &str,
162 step_id: &str,
163 step_name: &str,
164 start: Option<usize>,
165 end: Option<usize>,
166 parent_attrs: Option<&std::collections::HashMap<String, Value>>,
167 ) {
168 let mut obj = serde_json::Map::new();
169 obj.insert("extraction_class".to_string(), Value::String(class_name.to_string()));
170 obj.insert("extraction_text".to_string(), Value::String(text.to_string()));
171 obj.insert("step_id".to_string(), Value::String(step_id.to_string()));
172 obj.insert("step_name".to_string(), Value::String(step_name.to_string()));
173 if let (Some(s), Some(e)) = (start, end) {
174 let mut ci = serde_json::Map::new();
175 ci.insert("start_pos".to_string(), Value::Number(serde_json::Number::from(s as u64)));
176 ci.insert("end_pos".to_string(), Value::Number(serde_json::Number::from(e as u64)));
177 obj.insert("char_interval".to_string(), Value::Object(ci));
178 }
179 if let Some(attrs) = parent_attrs {
180 if let Some(ps) = attrs.get("parent_step_id") {
181 obj.insert("parent_step_id".to_string(), ps.clone());
182 }
183 if let Some(ps) = attrs.get("parent_start") {
184 obj.insert("parent_start".to_string(), ps.clone());
185 }
186 if let Some(pe) = attrs.get("parent_end") {
187 obj.insert("parent_end".to_string(), pe.clone());
188 }
189 }
190 items.push(Value::Object(obj));
191 }
192
193 let mut items: Vec<Value> = Vec::new();
194
195 let mut step_id_to_name: std::collections::HashMap<&str, &str> = std::collections::HashMap::new();
197 for s in &pipeline_result.config.steps {
198 step_id_to_name.insert(s.id.as_str(), s.name.as_str());
199 }
200
201 for step_res in &pipeline_result.step_results {
202 let step_name = step_id_to_name.get(step_res.step_id.as_str()).copied().unwrap_or("");
203 for e in &step_res.extractions {
204 let (mut start, mut end) = (e.char_interval.as_ref().and_then(|ci| ci.start_pos), e.char_interval.as_ref().and_then(|ci| ci.end_pos));
206 if start.is_none() || end.is_none() {
207 if let Some(found) = original_text.find(&e.extraction_text) {
208 start = Some(found);
209 end = Some(found + e.extraction_text.len());
210 }
211 }
212
213 push_item(
215 &mut items,
216 &e.extraction_class,
217 &e.extraction_text,
218 &step_res.step_id,
219 step_name,
220 start,
221 end,
222 e.attributes.as_ref(),
223 );
224
225 if expand_nested_json {
227 if let Ok(json_val) = serde_json::from_str::<Value>(&e.extraction_text) {
228 fn collect(prefix: &str, val: &Value, out: &mut Vec<(String, String)>) {
230 match val {
231 Value::String(s) => out.push((prefix.to_string(), s.clone())),
232 Value::Object(map) => {
233 for (k, v) in map {
234 let p = if prefix.is_empty() { k.clone() } else { format!("{}:{}", prefix, k) };
235 collect(&p, v, out);
236 }
237 }
238 Value::Array(arr) => {
239 for (i, v) in arr.iter().enumerate() {
240 let p = if prefix.is_empty() { format!("[{}]", i) } else { format!("{}:[{}]", prefix, i) };
241 collect(&p, v, out);
242 }
243 }
244 _ => {}
245 }
246 }
247
248 let mut leafs: Vec<(String, String)> = Vec::new();
249 collect(&e.extraction_class, &json_val, &mut leafs);
250
251 for (cls, s) in leafs {
252 if s.is_empty() { continue; }
253 let (mut ls, mut le) = (None, None);
254 if let Some(found) = original_text.find(&s) {
255 ls = Some(found);
256 le = Some(found + s.len());
257 }
258 push_item(
259 &mut items,
260 &cls,
261 &s,
262 &step_res.step_id,
263 step_name,
264 ls,
265 le,
266 e.attributes.as_ref(),
267 );
268 }
269 }
270 }
271 }
272 }
273
274 let mut root = serde_json::Map::new();
275 root.insert("extractions".to_string(), Value::Array(items));
276 let mut meta = serde_json::Map::new();
277 meta.insert("steps".to_string(), Value::Number(serde_json::Number::from(pipeline_result.config.steps.len() as u64)));
278 meta.insert("total_time_ms".to_string(), Value::Number(serde_json::Number::from(pipeline_result.total_time_ms)));
279 meta.insert("expand_nested_json".to_string(), Value::Bool(expand_nested_json));
280 root.insert("metadata".to_string(), Value::Object(meta));
281
282 Ok(serde_json::to_string_pretty(&Value::Object(root))?)
283}
284
/// One highlightable region of the original text, tagged with the pipeline
/// step that produced it.
#[derive(Debug, Clone)]
struct LayeredSpan {
    /// Start of the region; used as a byte offset into the original text.
    start: usize,
    /// End of the region (exclusive); used as a byte offset into the original text.
    end: usize,
    /// Extraction class, or the `:`-joined path for nested JSON leaves.
    class_name: String,
    /// Extracted text the span stands for.
    text: String,
    /// Index of the producing step within the pipeline configuration.
    step_index: usize,
    /// `parent_step_id` attribute of the extraction, when it is a string.
    parent_step_id: Option<String>,
    /// `parent_class` attribute of the extraction, when it is a string.
    parent_class: Option<String>,
    /// `parent_text` attribute of the extraction, when it is a string.
    parent_text: Option<String>,
}
296
297fn build_layered_spans(pipeline_result: &PipelineResult, original_text: &str, expand_nested_json: bool) -> Vec<LayeredSpan> {
298 let mut step_id_to_index: std::collections::HashMap<&str, usize> = std::collections::HashMap::new();
300 for (i, s) in pipeline_result.config.steps.iter().enumerate() {
301 step_id_to_index.insert(s.id.as_str(), i);
302 }
303
304 let mut spans = Vec::new();
305 for step_res in &pipeline_result.step_results {
306 let step_index = *step_id_to_index.get(step_res.step_id.as_str()).unwrap_or(&0);
307 for e in &step_res.extractions {
308 let mut added = false;
309 if let Some(interval) = &e.char_interval {
310 if let (Some(start), Some(end)) = (interval.start_pos, interval.end_pos) {
311 if start < end && end <= original_text.len() {
312 spans.push(LayeredSpan {
313 start,
314 end,
315 class_name: e.extraction_class.clone(),
316 text: e.extraction_text.clone(),
317 step_index,
318 parent_step_id: e.attributes.as_ref().and_then(|m| m.get("parent_step_id")).and_then(|v| v.as_str()).map(|s| s.to_string()),
319 parent_class: e.attributes.as_ref().and_then(|m| m.get("parent_class")).and_then(|v| v.as_str()).map(|s| s.to_string()),
320 parent_text: e.attributes.as_ref().and_then(|m| m.get("parent_text")).and_then(|v| v.as_str()).map(|s| s.to_string()),
321 });
322 added = true;
323 }
324 }
325 }
326 if !added {
327 if let Some(found) = original_text.find(&e.extraction_text) {
329 let start = found;
330 let end = start + e.extraction_text.len();
331 if end <= original_text.len() {
332 spans.push(LayeredSpan {
333 start,
334 end,
335 class_name: e.extraction_class.clone(),
336 text: e.extraction_text.clone(),
337 step_index,
338 parent_step_id: e.attributes.as_ref().and_then(|m| m.get("parent_step_id")).and_then(|v| v.as_str()).map(|s| s.to_string()),
339 parent_class: e.attributes.as_ref().and_then(|m| m.get("parent_class")).and_then(|v| v.as_str()).map(|s| s.to_string()),
340 parent_text: e.attributes.as_ref().and_then(|m| m.get("parent_text")).and_then(|v| v.as_str()).map(|s| s.to_string()),
341 });
342 }
343 }
344 }
345
346 if expand_nested_json {
348 if let Ok(json_val) = serde_json::from_str::<Value>(&e.extraction_text) {
349 fn collect_strings(prefix: &str, val: &Value, out: &mut Vec<(String, String)>) {
351 match val {
352 Value::String(s) => {
353 out.push((prefix.to_string(), s.clone()));
354 }
355 Value::Object(map) => {
356 for (k, v) in map {
357 let new_prefix = if prefix.is_empty() { k.clone() } else { format!("{}:{}", prefix, k) };
358 collect_strings(&new_prefix, v, out);
359 }
360 }
361 Value::Array(arr) => {
362 for (i, v) in arr.iter().enumerate() {
363 let new_prefix = if prefix.is_empty() { format!("[{}]", i) } else { format!("{}:[{}]", prefix, i) };
364 collect_strings(&new_prefix, v, out);
365 }
366 }
367 _ => {}
368 }
369 }
370
371 let mut pairs: Vec<(String, String)> = Vec::new();
372 collect_strings(&e.extraction_class, &json_val, &mut pairs);
373
374 let parent_step_id = e.attributes.as_ref().and_then(|m| m.get("parent_step_id")).and_then(|v| v.as_str()).map(|s| s.to_string());
375
376 for (class_name, s) in pairs {
377 if !s.is_empty() {
378 if let Some(found) = original_text.find(&s) {
379 let start = found;
380 let end = start + s.len();
381 if end <= original_text.len() {
382 spans.push(LayeredSpan {
383 start,
384 end,
385 class_name: class_name.clone(),
386 text: s.clone(),
387 step_index,
388 parent_step_id: parent_step_id.clone(),
389 parent_class: e.attributes.as_ref().and_then(|m| m.get("parent_class")).and_then(|v| v.as_str()).map(|s| s.to_string()),
390 parent_text: e.attributes.as_ref().and_then(|m| m.get("parent_text")).and_then(|v| v.as_str()).map(|s| s.to_string()),
391 });
392 }
393 }
394 }
395 }
396 }
397 }
398 }
399 }
400 spans
401}
402
403fn build_legend_html(pipeline_result: &PipelineResult) -> String {
404 let mut step_id_to_index: std::collections::HashMap<&str, usize> = std::collections::HashMap::new();
405 for (i, s) in pipeline_result.config.steps.iter().enumerate() {
406 step_id_to_index.insert(s.id.as_str(), i);
407 }
408 let mut items = String::new();
409 for step in &pipeline_result.config.steps {
410 let idx = *step_id_to_index.get(step.id.as_str()).unwrap_or(&0);
411 items.push_str(&format!(r#"<span class="legend-item"><span class="badge step-{}"></span>Step {}: {}</span>"#, idx, idx + 1, html_escape(&step.name)));
412 }
413 format!(r#"<div class="legend">{}</div>"#, items)
414}
415
416fn build_extractions_list_html(spans: &[LayeredSpan]) -> String {
417 let mut grouped: std::collections::BTreeMap<usize, Vec<&LayeredSpan>> = std::collections::BTreeMap::new();
418 for s in spans {
419 grouped.entry(s.step_index).or_default().push(s);
420 }
421 let mut html = String::new();
422 for (step_idx, list) in grouped {
423 html.push_str(&format!(r#"<h3>Step {}</h3>"#, step_idx + 1));
424 html.push_str("<ul>");
425 for s in list {
426 let parent_info = match (&s.parent_class, &s.parent_text) {
427 (Some(pc), Some(pt)) if !pc.is_empty() && !pt.is_empty() => format!(" (parent: [{}] {})", html_escape(pc), html_escape(pt)),
428 _ => String::new(),
429 };
430 html.push_str(&format!(r#"<li><span class=\"step-{} extraction-highlight\">[{}] {}{}</span></li>"#, step_idx, html_escape(&s.class_name), html_escape(&s.text), parent_info));
431 }
432 html.push_str("</ul>");
433 }
434 html
435}
436
437fn highlight_text_html_with_layers(
439 text: &str,
440 spans: &[LayeredSpan],
441 allow_overlaps: bool,
442) -> LangExtractResult<String> {
443 if !allow_overlaps {
444 let mut intervals: Vec<(usize, usize, usize)> = spans
445 .iter()
446 .enumerate()
447 .filter_map(|(i, s)| if s.start < s.end && s.end <= text.len() { Some((s.start, s.end, i)) } else { None })
448 .collect();
449 intervals.sort_by_key(|(start, end, _)| (*start, *end));
450
451 let mut result = String::new();
452 let mut last_pos = 0usize;
453 let mut last_end = 0usize;
454 for (start, end, idx) in intervals {
455 if start < last_end { continue; }
456 let safe_start = find_char_boundary(text, start);
457 let safe_end = find_char_boundary(text, end);
458 if safe_start > last_pos {
459 result.push_str(&html_escape(&text[last_pos..safe_start]));
460 }
461 if safe_start < safe_end {
462 let s = &spans[idx];
463 let seg = &text[safe_start..safe_end];
464 result.push_str(&format!(
465 r#"<span class="extraction-highlight step-{}" data-class="{}" data-text="{}" data-parent-class="{}">{}</span>"#,
466 s.step_index,
467 html_escape(&s.class_name),
468 html_escape(&s.text),
469 html_escape(s.parent_class.as_deref().unwrap_or("")),
470 html_escape(seg)
471 ));
472 last_pos = safe_end;
473 last_end = safe_end;
474 }
475 }
476 if last_pos < text.len() {
477 result.push_str(&html_escape(&text[last_pos..]));
478 }
479 return Ok(result);
480 }
481
482 let mut events: Vec<(usize, bool, usize)> = Vec::new();
483 for (i, s) in spans.iter().enumerate() {
484 if s.start < s.end && s.end <= text.len() {
485 events.push((s.start, true, i));
486 events.push((s.end, false, i));
487 }
488 }
489 events.sort_by_key(|(pos, is_start, idx)| (*pos, !*is_start, spans[*idx].step_index));
490
491 let mut result = String::new();
492 let mut cursor = 0usize;
493 let mut open: Vec<usize> = Vec::new();
494
495 let mut push_plain = |from: usize, to: usize, out: &mut String| {
496 if to > from {
497 out.push_str(&html_escape(&text[from..to]));
498 }
499 };
500
501 for (pos, is_start, idx) in events {
502 let safe_pos = find_char_boundary(text, pos);
503 if is_start {
504 push_plain(cursor, safe_pos, &mut result);
505 let s = &spans[idx];
506 result.push_str(&format!(
507 r#"<span class="extraction-highlight step-{}" data-class="{}" data-text="{}" data-parent-class="{}">{}</span>"#,
508 s.step_index,
509 html_escape(&s.class_name),
510 html_escape(&s.text),
511 html_escape(s.parent_class.as_deref().unwrap_or("")),
512 html_escape(&text[cursor..safe_pos])
513 ));
514 open.push(idx);
515 cursor = safe_pos;
516 } else {
517 push_plain(cursor, safe_pos, &mut result);
518 cursor = safe_pos;
519 if let Some(pos_in_open) = open.iter().rposition(|&j| j == idx) {
520 for _ in pos_in_open..open.len() {
521 result.push_str("</span>");
522 }
523 open.remove(pos_in_open);
524 for j in open.iter().copied() {
525 let s = &spans[j];
526 result.push_str(&format!(
527 r#"<span class="extraction-highlight step-{}" data-class="{}" data-text="{}" data-parent-class="{}">{}</span>"#,
528 s.step_index,
529 html_escape(&s.class_name),
530 html_escape(&s.text),
531 html_escape(s.parent_class.as_deref().unwrap_or("")),
532 html_escape(&text[cursor..safe_pos])
533 ));
534 }
535 }
536 }
537 }
538 push_plain(cursor, text.len(), &mut result);
539 for _ in 0..open.len() { result.push_str("</span>"); }
540 Ok(result)
541}
542
543pub fn visualize(
545 annotated_document: &AnnotatedDocument,
546 show_char_intervals: bool,
547) -> LangExtractResult<String> {
548 visualize_text(annotated_document, show_char_intervals)
549}
550
551fn visualize_text(
553 annotated_document: &AnnotatedDocument,
554 show_char_intervals: bool,
555) -> LangExtractResult<String> {
556 let mut result = String::new();
557
558 result.push_str("📄 EXTRACTION VISUALIZATION\n");
559 result.push_str("=" .repeat(50).as_str());
560 result.push('\n');
561
562 let text = annotated_document.text.as_deref().unwrap_or("No text");
564 result.push_str(&format!("📝 Document Text ({} chars):\n", text.len()));
565 result.push_str(&format!(" {}\n\n", text));
566
567 if let Some(extractions) = &annotated_document.extractions {
569 result.push_str(&format!("🎯 Found {} Extractions:\n", extractions.len()));
570 result.push_str("-".repeat(30).as_str());
571 result.push('\n');
572
573 for (i, extraction) in extractions.iter().enumerate() {
574 result.push_str(&format!("{}. [{}] {}\n",
575 i + 1,
576 extraction.extraction_class,
577 extraction.extraction_text
578 ));
579
580 if show_char_intervals {
581 if let Some(interval) = &extraction.char_interval {
582 result.push_str(&format!(" Position: {:?}\n", interval));
583 }
584 }
585
586 if let Some(description) = &extraction.description {
587 result.push_str(&format!(" Description: {}\n", description));
588 }
589
590 result.push('\n');
591 }
592 } else {
593 result.push_str("ℹ️ No extractions found\n");
594 }
595
596 result.push_str("📊 Statistics:\n");
598 result.push_str("-".repeat(15).as_str());
599 result.push('\n');
600 result.push_str(&format!(" Document ID: {}\n",
601 annotated_document.document_id.as_deref().unwrap_or("None")));
602 result.push_str(&format!(" Text Length: {} characters\n", text.len()));
603 result.push_str(&format!(" Total Extractions: {}\n", annotated_document.extraction_count()));
604
605 if let Some(extractions) = &annotated_document.extractions {
606 let mut class_counts = std::collections::HashMap::new();
608 for extraction in extractions {
609 *class_counts.entry(&extraction.extraction_class).or_insert(0) += 1;
610 }
611
612 result.push_str(" Extraction Classes:\n");
613 for (class, count) in class_counts {
614 result.push_str(&format!(" • {}: {} instance(s)\n", class, count));
615 }
616 }
617
618 Ok(result)
619}
620
621fn export_html(
623 annotated_document: &AnnotatedDocument,
624 config: &ExportConfig,
625) -> LangExtractResult<String> {
626 let title = config.title.as_deref().unwrap_or("LangExtract Results");
627 let text = annotated_document.text.as_deref().unwrap_or("No text");
628
629 let mut html = String::new();
630
631 html.push_str(&format!(r#"<!DOCTYPE html>
633<html lang="en">
634<head>
635 <meta charset="UTF-8">
636 <meta name="viewport" content="width=device-width, initial-scale=1.0">
637 <title>{}</title>
638 <style>
639 body {{
640 font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
641 max-width: 1200px;
642 margin: 0 auto;
643 padding: 20px;
644 background: #f8fafc;
645 color: #334155;
646 }}
647 .container {{
648 background: white;
649 border-radius: 12px;
650 box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
651 overflow: hidden;
652 }}
653 .header {{
654 background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
655 color: white;
656 padding: 30px;
657 text-align: center;
658 }}
659 .header h1 {{
660 margin: 0;
661 font-size: 2.5em;
662 font-weight: 300;
663 }}
664 .content {{
665 padding: 30px;
666 }}
667 .section {{
668 margin-bottom: 40px;
669 }}
670 .section h2 {{
671 color: #1e293b;
672 border-bottom: 2px solid #e2e8f0;
673 padding-bottom: 10px;
674 margin-bottom: 20px;
675 }}
676 .document-text {{
677 background: #f1f5f9;
678 border-radius: 8px;
679 padding: 20px;
680 font-family: 'Monaco', 'Menlo', monospace;
681 line-height: 1.6;
682 white-space: pre-wrap;
683 position: relative;
684 margin-bottom: 20px;
685 }}
686 .extraction-highlight {{
687 background: rgba(59, 130, 246, 0.2);
688 border: 1px solid rgba(59, 130, 246, 0.4);
689 border-radius: 3px;
690 padding: 1px 2px;
691 cursor: pointer;
692 transition: all 0.2s ease;
693 }}
694 .extraction-highlight:hover {{
695 background: rgba(59, 130, 246, 0.3);
696 transform: translateY(-1px);
697 }}
698 .extractions-grid {{
699 display: grid;
700 grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
701 gap: 20px;
702 margin-bottom: 30px;
703 }}
704 .extraction-card {{
705 background: #f8fafc;
706 border: 1px solid #e2e8f0;
707 border-radius: 8px;
708 padding: 15px;
709 transition: all 0.2s ease;
710 }}
711 .extraction-card:hover {{
712 border-color: #3b82f6;
713 box-shadow: 0 4px 12px rgba(59, 130, 246, 0.15);
714 }}
715 .extraction-class {{
716 background: #3b82f6;
717 color: white;
718 padding: 4px 8px;
719 border-radius: 4px;
720 font-size: 0.8em;
721 font-weight: 600;
722 display: inline-block;
723 margin-bottom: 8px;
724 }}
725 .extraction-text {{
726 font-weight: 600;
727 color: #1e293b;
728 margin-bottom: 8px;
729 }}
730 .extraction-meta {{
731 font-size: 0.9em;
732 color: #64748b;
733 }}
734 .stats-grid {{
735 display: grid;
736 grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
737 gap: 20px;
738 }}
739 .stat-card {{
740 background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
741 color: white;
742 padding: 20px;
743 border-radius: 8px;
744 text-align: center;
745 }}
746 .stat-number {{
747 font-size: 2em;
748 font-weight: bold;
749 margin-bottom: 5px;
750 }}
751 .stat-label {{
752 opacity: 0.9;
753 font-size: 0.9em;
754 }}
755 .class-counts {{
756 background: #f1f5f9;
757 border-radius: 8px;
758 padding: 20px;
759 }}
760 .class-count-item {{
761 display: flex;
762 justify-content: space-between;
763 align-items: center;
764 padding: 8px 0;
765 border-bottom: 1px solid #e2e8f0;
766 }}
767 .class-count-item:last-child {{
768 border-bottom: none;
769 }}
770 .class-badge {{
771 background: #10b981;
772 color: white;
773 padding: 2px 6px;
774 border-radius: 12px;
775 font-size: 0.8em;
776 font-weight: 600;
777 }}
778 {}
779 </style>
780</head>
781<body>
782"#, title, config.custom_css.as_deref().unwrap_or("")));
783
784 html.push_str(&format!(r#" <div class="container">
786 <div class="header">
787 <h1>{}</h1>
788 </div>
789 <div class="content">
790"#, title));
791
792 if config.include_text {
794 html.push_str(r#" <div class="section">
795 <h2>📄 Document Text</h2>
796 <div class="document-text">"#);
797
798 if config.highlight_extractions {
799 html.push_str(&highlight_text_html(text, annotated_document)?);
800 } else {
801 html.push_str(&html_escape(text));
802 }
803
804 html.push_str("</div>\n </div>\n");
805 }
806
807 if let Some(extractions) = &annotated_document.extractions {
809 html.push_str(&format!(r#" <div class="section">
810 <h2>🎯 Extractions ({} found)</h2>
811 <div class="extractions-grid">
812"#, extractions.len()));
813
814 for extraction in extractions {
815 html.push_str(&format!(r#" <div class="extraction-card">
816 <div class="extraction-class">{}</div>
817 <div class="extraction-text">{}</div>
818"#, html_escape(&extraction.extraction_class), html_escape(&extraction.extraction_text)));
819
820 if config.show_char_intervals {
821 if let Some(interval) = &extraction.char_interval {
822 html.push_str(&format!(r#" <div class="extraction-meta">Position: {}-{}</div>
823"#, interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
824 }
825 }
826
827 if let Some(description) = &extraction.description {
828 html.push_str(&format!(r#" <div class="extraction-meta">Description: {}</div>
829"#, html_escape(description)));
830 }
831
832 html.push_str(" </div>\n");
833 }
834
835 html.push_str(" </div>\n </div>\n");
836 }
837
838 if config.include_statistics {
840 html.push_str(r#" <div class="section">
841 <h2>📊 Statistics</h2>
842 <div class="stats-grid">
843"#);
844
845 let extraction_count = annotated_document.extraction_count();
846 html.push_str(&format!(r#" <div class="stat-card">
847 <div class="stat-number">{}</div>
848 <div class="stat-label">Total Extractions</div>
849 </div>
850 <div class="stat-card">
851 <div class="stat-number">{}</div>
852 <div class="stat-label">Characters</div>
853 </div>
854"#, extraction_count, text.len()));
855
856 if let Some(extractions) = &annotated_document.extractions {
857 let class_counts = count_extraction_classes(extractions);
858 html.push_str(&format!(r#" <div class="stat-card">
859 <div class="stat-number">{}</div>
860 <div class="stat-label">Unique Classes</div>
861 </div>
862"#, class_counts.len()));
863
864 html.push_str(" </div>\n");
865
866 html.push_str(r#" <h3>Extraction Classes</h3>
868 <div class="class-counts">
869"#);
870
871 for (class, count) in class_counts {
872 html.push_str(&format!(r#" <div class="class-count-item">
873 <span>{}</span>
874 <span class="class-badge">{}</span>
875 </div>
876"#, html_escape(class), count));
877 }
878
879 html.push_str(" </div>\n");
880 } else {
881 html.push_str(" </div>\n");
882 }
883
884 html.push_str(" </div>\n");
885 }
886
887 html.push_str(r#" </div>
889 </div>
890
891 <script>
892 // Add interactivity for extraction highlights
893 document.querySelectorAll('.extraction-highlight').forEach(element => {
894 element.addEventListener('click', function() {
895 const className = this.getAttribute('data-class');
896 const text = this.getAttribute('data-text');
897 alert(`Extraction: ${className}\nText: ${text}`);
898 });
899 });
900 </script>
901</body>
902</html>"#);
903
904 Ok(html)
905}
906
/// Escapes `text` for safe inclusion in HTML element content and in quoted
/// attribute values.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&#x27;`; every other character is copied unchanged. The
/// input is processed in a single pass, so ampersands that are part of a
/// produced entity are never escaped again.
fn html_escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }
    escaped
}
915
/// Returns the closest UTF-8 character boundary of `text` at or before `index`.
///
/// Offsets at or beyond the end of `text` are clamped to `text.len()`.
fn find_char_boundary(text: &str, index: usize) -> usize {
    if index >= text.len() {
        return text.len();
    }

    // Scan downwards from `index`; offset 0 is always a boundary, so the
    // search cannot come up empty.
    (0..=index)
        .rev()
        .find(|&candidate| text.is_char_boundary(candidate))
        .unwrap_or(0)
}
935
936fn highlight_text_html(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
938 if let Some(extractions) = &annotated_document.extractions {
939 let mut intervals: Vec<(usize, usize, &Extraction)> = Vec::new();
941
942 for extraction in extractions {
943 if let Some(interval) = &extraction.char_interval {
944 if let (Some(start), Some(end)) = (interval.start_pos, interval.end_pos) {
945 if start < end && end <= text.len() {
946 intervals.push((start, end, extraction));
947 }
948 }
949 }
950 }
951
952 intervals.sort_by_key(|(start, _, _)| *start);
954
955 let mut filtered_intervals = Vec::new();
957 let mut last_end = 0;
958
959 for (start, end, extraction) in intervals {
960 if start >= last_end {
961 filtered_intervals.push((start, end, extraction));
962 last_end = end;
963 } else {
964 log::debug!("Skipping overlapping extraction: '{}' at {}-{} (overlaps with previous ending at {})",
966 extraction.extraction_text, start, end, last_end);
967 }
968 }
969
970 let mut result = String::new();
972 let mut last_pos = 0;
973
974 for (start, end, extraction) in filtered_intervals {
975 let safe_start = find_char_boundary(text, start);
977 let safe_end = find_char_boundary(text, end);
978
979 if safe_start > last_pos {
981 let safe_last_pos = find_char_boundary(text, last_pos);
982 if safe_last_pos < safe_start {
983 result.push_str(&html_escape(&text[safe_last_pos..safe_start]));
984 }
985 }
986
987 if safe_start < safe_end && safe_end <= text.len() {
989 let actual_text = &text[safe_start..safe_end];
990 result.push_str(&format!(
991 r#"<span class="extraction-highlight" data-class="{}" data-text="{}">{}</span>"#,
992 html_escape(&extraction.extraction_class),
993 html_escape(&extraction.extraction_text),
994 html_escape(actual_text)
995 ));
996 last_pos = safe_end;
997 } else {
998 log::debug!("Skipping extraction with invalid UTF-8 boundaries: '{}' at {}-{}",
1000 extraction.extraction_text, start, end);
1001 }
1002 }
1003
1004 if last_pos < text.len() {
1006 let safe_last_pos = find_char_boundary(text, last_pos);
1007 if safe_last_pos < text.len() {
1008 result.push_str(&html_escape(&text[safe_last_pos..]));
1009 }
1010 }
1011
1012 Ok(result)
1013 } else {
1014 Ok(html_escape(text))
1015 }
1016}
1017
1018fn count_extraction_classes(extractions: &[crate::data::Extraction]) -> HashMap<&str, usize> {
1020 let mut class_counts = HashMap::new();
1021 for extraction in extractions {
1022 *class_counts.entry(extraction.extraction_class.as_str()).or_insert(0) += 1;
1023 }
1024 class_counts
1025}
1026
1027fn export_markdown(
1029 annotated_document: &AnnotatedDocument,
1030 config: &ExportConfig,
1031) -> LangExtractResult<String> {
1032 let title = config.title.as_deref().unwrap_or("LangExtract Results");
1033 let text = annotated_document.text.as_deref().unwrap_or("No text");
1034
1035 let mut md = String::new();
1036
1037 md.push_str(&format!("# {}\n\n", title));
1039
1040 if config.include_text {
1042 md.push_str("## 📄 Document Text\n\n");
1043
1044 if config.highlight_extractions {
1045 md.push_str(&highlight_text_markdown(text, annotated_document)?);
1046 } else {
1047 md.push_str(&format!("```\n{}\n```\n", text));
1048 }
1049
1050 md.push_str("\n");
1051 }
1052
1053 if let Some(extractions) = &annotated_document.extractions {
1055 md.push_str(&format!("## 🎯 Extractions ({} found)\n\n", extractions.len()));
1056
1057 for (i, extraction) in extractions.iter().enumerate() {
1058 md.push_str(&format!("### {}. {}\n\n", i + 1, extraction.extraction_class));
1059 md.push_str(&format!("**Text:** {}\n\n", extraction.extraction_text));
1060
1061 if config.show_char_intervals {
1062 if let Some(interval) = &extraction.char_interval {
1063 md.push_str(&format!("**Position:** {}-{}\n\n", interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0)));
1064 }
1065 }
1066
1067 if let Some(description) = &extraction.description {
1068 md.push_str(&format!("**Description:** {}\n\n", description));
1069 }
1070 }
1071 }
1072
1073 if config.include_statistics {
1075 md.push_str("## 📊 Statistics\n\n");
1076
1077 let extraction_count = annotated_document.extraction_count();
1078 md.push_str(&format!("- **Total Extractions:** {}\n", extraction_count));
1079 md.push_str(&format!("- **Text Length:** {} characters\n", text.len()));
1080
1081 if let Some(extractions) = &annotated_document.extractions {
1082 let class_counts = count_extraction_classes(extractions);
1083 md.push_str(&format!("- **Unique Classes:** {}\n\n", class_counts.len()));
1084
1085 md.push_str("### Extraction Classes\n\n");
1086 md.push_str("| Class | Count |\n");
1087 md.push_str("|-------|-------|\n");
1088
1089 for (class, count) in class_counts {
1090 md.push_str(&format!("| {} | {} |\n", class, count));
1091 }
1092 }
1093
1094 md.push_str("\n");
1095 }
1096
1097 Ok(md)
1098}
1099
1100fn highlight_text_markdown(text: &str, annotated_document: &AnnotatedDocument) -> LangExtractResult<String> {
1102 if let Some(extractions) = &annotated_document.extractions {
1103 let mut result = String::new();
1104 let mut last_pos = 0;
1105
1106 let mut sorted_extractions: Vec<_> = extractions.iter().collect();
1108 sorted_extractions.sort_by_key(|e| {
1109 e.char_interval.as_ref().and_then(|i| i.start_pos).unwrap_or(usize::MAX)
1110 });
1111
1112 result.push_str("```\n");
1113
1114 for extraction in sorted_extractions {
1115 if let Some(interval) = &extraction.char_interval {
1116 if interval.start_pos.unwrap_or(0) > last_pos && interval.start_pos.unwrap_or(0) <= text.len() {
1118 result.push_str(&text[last_pos..interval.start_pos.unwrap_or(0)]);
1119 }
1120
1121 if interval.end_pos.unwrap_or(0) <= text.len() && interval.start_pos.unwrap_or(0) < interval.end_pos.unwrap_or(0) {
1123 let extraction_text = &text[interval.start_pos.unwrap_or(0)..interval.end_pos.unwrap_or(0)];
1124 result.push_str(&format!("**{}**", extraction_text));
1125 last_pos = interval.end_pos.unwrap_or(0);
1126 }
1127 }
1128 }
1129
1130 if last_pos < text.len() {
1132 result.push_str(&text[last_pos..]);
1133 }
1134
1135 result.push_str("\n```\n");
1136 Ok(result)
1137 } else {
1138 Ok(format!("```\n{}\n```\n", text))
1139 }
1140}
1141
1142fn export_json(
1144 annotated_document: &AnnotatedDocument,
1145 config: &ExportConfig,
1146) -> LangExtractResult<String> {
1147 let mut json_data = json!({
1148 "document_id": annotated_document.document_id,
1149 "export_config": {
1150 "format": "json",
1151 "show_char_intervals": config.show_char_intervals,
1152 "include_text": config.include_text,
1153 "include_statistics": config.include_statistics,
1154 "title": config.title
1155 }
1156 });
1157
1158 if config.include_text {
1160 json_data["text"] = json!(annotated_document.text);
1161 }
1162
1163 if let Some(extractions) = &annotated_document.extractions {
1165 let extractions_json: Vec<Value> = extractions.iter().map(|extraction| {
1166 let mut ext_json = json!({
1167 "extraction_class": extraction.extraction_class,
1168 "extraction_text": extraction.extraction_text,
1169 "description": extraction.description
1170 });
1171
1172 if config.show_char_intervals {
1173 if let Some(interval) = &extraction.char_interval {
1174 ext_json["char_interval"] = json!({
1175 "start_char": interval.start_pos.unwrap_or(0),
1176 "end_char": interval.end_pos.unwrap_or(0),
1177 "alignment_status": extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string())
1178 });
1179 }
1180 }
1181
1182 if let Some(group_index) = extraction.group_index {
1183 ext_json["group_index"] = json!(group_index);
1184 }
1185
1186 ext_json
1187 }).collect();
1188
1189 json_data["extractions"] = json!(extractions_json);
1190 }
1191
1192 if config.include_statistics {
1194 let text = annotated_document.text.as_deref().unwrap_or("");
1195 let mut stats = json!({
1196 "total_extractions": annotated_document.extraction_count(),
1197 "text_length": text.len()
1198 });
1199
1200 if let Some(extractions) = &annotated_document.extractions {
1201 let class_counts = count_extraction_classes(extractions);
1202 stats["unique_classes"] = json!(class_counts.len());
1203 stats["extraction_classes"] = json!(class_counts);
1204 }
1205
1206 json_data["statistics"] = stats;
1207 }
1208
1209 Ok(serde_json::to_string_pretty(&json_data)?)
1210}
1211
1212fn export_csv(
1214 annotated_document: &AnnotatedDocument,
1215 config: &ExportConfig,
1216) -> LangExtractResult<String> {
1217 let mut csv = String::new();
1218
1219 if config.show_char_intervals {
1221 csv.push_str("extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index\n");
1222 } else {
1223 csv.push_str("extraction_class,extraction_text,description,group_index\n");
1224 }
1225
1226 if let Some(extractions) = &annotated_document.extractions {
1228 for extraction in extractions {
1229 let class = csv_escape(&extraction.extraction_class);
1230 let text = csv_escape(&extraction.extraction_text);
1231 let description = extraction.description.as_ref().map(|d| csv_escape(d)).unwrap_or_else(|| "".to_string());
1232 let group_index = extraction.group_index.map(|i| i.to_string()).unwrap_or_else(|| "".to_string());
1233
1234 if config.show_char_intervals {
1235 if let Some(interval) = &extraction.char_interval {
1236 csv.push_str(&format!("{},{},{},{},{},{:?},{}\n",
1237 class, text, description,
1238 interval.start_pos.unwrap_or(0), interval.end_pos.unwrap_or(0),
1239 extraction.alignment_status.as_ref().map(|s| format!("{:?}", s)).unwrap_or_else(|| "None".to_string()), group_index));
1240 } else {
1241 csv.push_str(&format!("{},{},{},,,None,{}\n",
1242 class, text, description, group_index));
1243 }
1244 } else {
1245 csv.push_str(&format!("{},{},{},{}\n",
1246 class, text, description, group_index));
1247 }
1248 }
1249 }
1250
1251 Ok(csv)
1252}
1253
/// Escapes a single CSV field.
///
/// A field containing a comma, a double quote, a line feed or a carriage
/// return is wrapped in double quotes, with embedded double quotes doubled.
/// Any other field (including the empty string) is returned unchanged.
fn csv_escape(text: &str) -> String {
    // Carriage returns are line breaks too; leaving them unquoted lets a bare
    // `\r` or `\r\n` inside a value split the record in two.
    let needs_quoting = text.chars().any(|c| matches!(c, ',' | '"' | '\n' | '\r'));

    if needs_quoting {
        format!("\"{}\"", text.replace('"', "\"\""))
    } else {
        text.to_string()
    }
}
1262
#[cfg(test)]
mod tests {
    use super::*;
    use crate::data::{AlignmentStatus, CharInterval, Extraction};
    use crate::pipeline::{PipelineConfig, PipelineResult, PipelineStep, StepResult};
    use crate::ExtractConfig as LibExtractConfig;
    use std::collections::HashMap;

    // ---------------------------------------------------------------------
    // Fixture helpers
    // ---------------------------------------------------------------------

    /// Extraction with only class and text set; every optional field is `None`
    /// except `attributes`, which is an empty map. Tests override individual
    /// fields with struct-update syntax.
    fn bare_extraction(class: &str, text: &str) -> Extraction {
        Extraction {
            extraction_class: class.to_string(),
            extraction_text: text.to_string(),
            char_interval: None,
            alignment_status: None,
            extraction_index: None,
            group_index: None,
            description: None,
            attributes: Some(HashMap::new()),
            token_interval: None,
        }
    }

    /// Shorthand for a fully specified character interval.
    fn span(start: usize, end: usize) -> Option<CharInterval> {
        Some(CharInterval::new(Some(start), Some(end)))
    }

    /// Pipeline step with empty description, examples and prompt and no filter.
    fn pipeline_step(id: &str, name: &str, output_field: &str, depends_on: &[&str]) -> PipelineStep {
        PipelineStep {
            id: id.to_string(),
            name: name.to_string(),
            description: "".to_string(),
            examples: vec![],
            prompt: "".to_string(),
            output_field: output_field.to_string(),
            filter: None,
            depends_on: depends_on.iter().map(|dep| (*dep).to_string()).collect(),
        }
    }

    /// Sequential pipeline configuration with an empty description and the
    /// default extraction config.
    fn pipeline_config(name: &str, version: &str, steps: Vec<PipelineStep>) -> PipelineConfig {
        PipelineConfig {
            name: name.to_string(),
            description: "".to_string(),
            version: version.to_string(),
            steps,
            global_config: LibExtractConfig::default(),
            enable_parallel_execution: false,
        }
    }

    /// The two-step "Req" -> "Vals" configuration shared by several tests.
    fn req_vals_config() -> PipelineConfig {
        pipeline_config(
            "T",
            "0",
            vec![
                pipeline_step("s1", "Req", "req", &[]),
                pipeline_step("s2", "Vals", "vals", &["s1"]),
            ],
        )
    }

    /// Successful step result with fixed timing/input counters.
    fn step_result(step_id: &str, step_name: &str, extractions: Vec<Extraction>) -> StepResult {
        StepResult {
            step_id: step_id.to_string(),
            step_name: step_name.to_string(),
            extractions,
            processing_time_ms: 1,
            input_count: 1,
            success: true,
            error_message: None,
        }
    }

    /// Successful pipeline result with an empty nested output.
    fn pipeline_result(config: PipelineConfig, step_results: Vec<StepResult>) -> PipelineResult {
        PipelineResult {
            config,
            step_results,
            nested_output: serde_json::json!({}),
            total_time_ms: 2,
            success: true,
            error_message: None,
        }
    }

    /// Three-extraction document (person, company, salary) used by most tests.
    fn create_sample_document() -> AnnotatedDocument {
        let text = "John Smith works at TechCorp and earns $50,000.";
        let extractions = vec![
            Extraction {
                char_interval: span(0, 10),
                alignment_status: Some(AlignmentStatus::MatchExact),
                extraction_index: Some(0),
                group_index: Some(0),
                description: Some("Person name".to_string()),
                ..bare_extraction("person", "John Smith")
            },
            Extraction {
                char_interval: span(20, 28),
                alignment_status: Some(AlignmentStatus::MatchExact),
                extraction_index: Some(1),
                group_index: Some(0),
                ..bare_extraction("company", "TechCorp")
            },
            Extraction {
                char_interval: span(39, 46),
                alignment_status: Some(AlignmentStatus::MatchFuzzy),
                extraction_index: Some(2),
                group_index: Some(0),
                description: Some("Annual salary".to_string()),
                ..bare_extraction("salary", "$50,000")
            },
        ];

        AnnotatedDocument {
            document_id: Some("test_doc".to_string()),
            text: Some(text.to_string()),
            extractions: Some(extractions),
        }
    }

    /// Exports the sample document with `config`, panicking on failure.
    fn export_sample(config: ExportConfig) -> String {
        export_document(&create_sample_document(), &config).unwrap()
    }

    // ---------------------------------------------------------------------
    // Per-format exports
    // ---------------------------------------------------------------------

    #[test]
    fn test_text_export() {
        let result = export_sample(ExportConfig {
            format: ExportFormat::Text,
            show_char_intervals: true,
            ..Default::default()
        });

        assert!(result.contains("EXTRACTION VISUALIZATION"));
        assert!(result.contains("John Smith"));
        assert!(result.contains("TechCorp"));
        assert!(result.contains("$50,000"));
        assert!(result.contains("Position:"));
        assert!(result.contains("Statistics:"));
    }

    #[test]
    fn test_html_export() {
        let result = export_sample(ExportConfig {
            format: ExportFormat::Html,
            title: Some("Test HTML Export".to_string()),
            highlight_extractions: true,
            show_char_intervals: true,
            ..Default::default()
        });

        assert!(result.contains("<!DOCTYPE html>"));
        assert!(result.contains("<title>Test HTML Export</title>"));
        assert!(result.contains("extraction-highlight"));
        assert!(result.contains("John Smith"));
        assert!(result.contains("TechCorp"));
        assert!(result.contains("extraction-card"));
        assert!(result.contains("stats-grid"));
        assert!(result.contains("</html>"));
    }

    #[test]
    fn test_html_export_with_custom_css() {
        let custom_css = "body { background: red; }";
        let result = export_sample(ExportConfig {
            format: ExportFormat::Html,
            custom_css: Some(custom_css.to_string()),
            ..Default::default()
        });

        assert!(result.contains(custom_css));
    }

    #[test]
    fn test_markdown_export() {
        let result = export_sample(ExportConfig {
            format: ExportFormat::Markdown,
            title: Some("Test Markdown".to_string()),
            show_char_intervals: true,
            highlight_extractions: true,
            ..Default::default()
        });

        assert!(result.starts_with("# Test Markdown"));
        assert!(result.contains("## 📄 Document Text"));
        assert!(result.contains("## 🎯 Extractions"));
        assert!(result.contains("### 1. person"));
        assert!(result.contains("**Text:** John Smith"));
        assert!(result.contains("**Position:** 0-10"));
        assert!(result.contains("| Class | Count |"));
        assert!(result.contains("| person | 1 |"));
    }

    #[test]
    fn test_json_export() {
        let result = export_sample(ExportConfig {
            format: ExportFormat::Json,
            show_char_intervals: true,
            include_text: true,
            include_statistics: true,
            ..Default::default()
        });
        let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();

        assert_eq!(parsed["document_id"], "test_doc");
        assert!(parsed["text"].is_string());
        assert!(parsed["extractions"].is_array());
        assert!(parsed["statistics"].is_object());

        let extractions = parsed["extractions"].as_array().unwrap();
        assert_eq!(extractions.len(), 3);

        let first_extraction = &extractions[0];
        assert_eq!(first_extraction["extraction_class"], "person");
        assert_eq!(first_extraction["extraction_text"], "John Smith");
        assert!(first_extraction["char_interval"].is_object());

        let stats = &parsed["statistics"];
        assert_eq!(stats["total_extractions"], 3);
        assert_eq!(stats["unique_classes"], 3);
    }

    #[test]
    fn test_csv_export() {
        let result = export_sample(ExportConfig {
            format: ExportFormat::Csv,
            show_char_intervals: true,
            ..Default::default()
        });
        let lines: Vec<&str> = result.lines().collect();

        assert_eq!(
            lines[0],
            "extraction_class,extraction_text,description,start_char,end_char,alignment_status,group_index"
        );

        // Header plus one row per extraction.
        assert_eq!(lines.len(), 4);
        assert!(lines[1].contains("person,John Smith"));
        assert!(lines[2].contains("company,TechCorp"));
        // The salary contains a comma, so the cell must be quoted.
        assert!(lines[3].contains("salary,\"$50,000\""));
        assert!(lines[1].contains("MatchExact"));
        assert!(lines[3].contains("MatchFuzzy"));
    }

    #[test]
    fn test_csv_export_without_intervals() {
        let result = export_sample(ExportConfig {
            format: ExportFormat::Csv,
            show_char_intervals: false,
            ..Default::default()
        });
        let lines: Vec<&str> = result.lines().collect();

        assert_eq!(lines[0], "extraction_class,extraction_text,description,group_index");

        assert!(!result.contains("start_char"));
        assert!(!result.contains("end_char"));
    }

    // ---------------------------------------------------------------------
    // Escaping helpers
    // ---------------------------------------------------------------------

    #[test]
    fn test_csv_escape() {
        assert_eq!(csv_escape("simple"), "simple");
        assert_eq!(csv_escape("has,comma"), "\"has,comma\"");
        assert_eq!(csv_escape("has\"quote"), "\"has\"\"quote\"");
        assert_eq!(csv_escape("has\nnewline"), "\"has\nnewline\"");
        assert_eq!(csv_escape("has,comma\"and quote"), "\"has,comma\"\"and quote\"");
    }

    #[test]
    fn test_html_escape() {
        assert_eq!(html_escape("simple"), "simple");
        assert_eq!(html_escape("has<tag>"), "has&lt;tag&gt;");
        assert_eq!(html_escape("has\"quote"), "has&quot;quote");
        assert_eq!(html_escape("has&ampersand"), "has&amp;ampersand");

        // NOTE(review): the previous expectations in this test had lost their
        // entity encoding, so the exact apostrophe entity `html_escape` emits
        // could not be read off the old assertion. Accept the common
        // encodings; tighten to the exact one once confirmed against
        // `html_escape`.
        let apostrophe = html_escape("has'apostrophe");
        assert!(
            matches!(
                apostrophe.as_str(),
                "has&#x27;apostrophe" | "has&#39;apostrophe" | "has&apos;apostrophe"
            ),
            "unexpected apostrophe escaping: {}",
            apostrophe
        );
    }

    // ---------------------------------------------------------------------
    // Configuration and edge cases
    // ---------------------------------------------------------------------

    #[test]
    fn test_export_config_defaults() {
        let config = ExportConfig::default();
        assert_eq!(config.format, ExportFormat::Text);
        assert!(!config.show_char_intervals);
        assert!(config.include_text);
        assert!(config.highlight_extractions);
        assert!(config.include_statistics);
        assert!(config.custom_css.is_none());
        assert!(config.title.is_none());
    }

    #[test]
    fn test_empty_document() {
        let document = AnnotatedDocument {
            document_id: Some("empty".to_string()),
            text: Some("".to_string()),
            extractions: None,
        };

        let config = ExportConfig::default();
        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("No extractions found"));
    }

    #[test]
    fn test_document_without_text() {
        let document = AnnotatedDocument {
            document_id: Some("no_text".to_string()),
            text: None,
            extractions: None,
        };

        let config = ExportConfig::default();
        let result = export_document(&document, &config).unwrap();

        assert!(result.contains("No text"));
    }

    #[test]
    fn test_export_format_variants() {
        let document = create_sample_document();

        for format in [
            ExportFormat::Text,
            ExportFormat::Html,
            ExportFormat::Markdown,
            ExportFormat::Json,
            ExportFormat::Csv,
        ] {
            let config = ExportConfig {
                format,
                ..Default::default()
            };
            let result = export_document(&document, &config);
            assert!(result.is_ok(), "Format {:?} failed", format);
        }
    }

    #[test]
    fn test_highlight_text_html() {
        let document = create_sample_document();
        let text = document.text.as_ref().unwrap();

        let result = highlight_text_html(text, &document).unwrap();

        assert!(result.contains("extraction-highlight"));
        assert!(result.contains("data-class=\"person\""));
        assert!(result.contains("data-text=\"John Smith\""));
        assert!(result.contains("John Smith"));
    }

    #[test]
    fn test_count_extraction_classes() {
        let extractions = vec![
            bare_extraction("person", "John"),
            bare_extraction("person", "Jane"),
            bare_extraction("company", "TechCorp"),
        ];

        let counts = count_extraction_classes(&extractions);

        assert_eq!(counts.get("person"), Some(&2));
        assert_eq!(counts.get("company"), Some(&1));
        assert_eq!(counts.len(), 2);
    }

    // ---------------------------------------------------------------------
    // Pipeline HTML export
    // ---------------------------------------------------------------------

    #[test]
    fn test_export_pipeline_html_renders_layers() {
        let text = "The system shall process 100 transactions per second.";

        let cfg = pipeline_config(
            "Test",
            "0.0.0",
            vec![
                pipeline_step("s1", "Extract Requirements", "requirements", &[]),
                pipeline_step("s2", "Extract Values", "values", &["s1"]),
            ],
        );

        // The parent requirement covers the whole text; the children sit inside it.
        let unit = "transactions per second";
        let hundred_idx = text.find("100").unwrap();
        let unit_idx = text.find(unit).unwrap();

        let step1_res = step_result(
            "s1",
            "Extract Requirements",
            vec![Extraction {
                char_interval: span(0, text.len()),
                alignment_status: Some(AlignmentStatus::MatchExact),
                extraction_index: Some(0),
                ..bare_extraction("requirement", text)
            }],
        );

        let step2_res = step_result(
            "s2",
            "Extract Values",
            vec![
                Extraction {
                    char_interval: span(hundred_idx, hundred_idx + 3),
                    alignment_status: Some(AlignmentStatus::MatchExact),
                    extraction_index: Some(0),
                    ..bare_extraction("value", "100")
                },
                Extraction {
                    char_interval: span(unit_idx, unit_idx + unit.len()),
                    alignment_status: Some(AlignmentStatus::MatchExact),
                    extraction_index: Some(1),
                    ..bare_extraction("unit", unit)
                },
            ],
        );

        let pr = pipeline_result(cfg, vec![step1_res, step2_res]);

        let config = ExportConfig { format: ExportFormat::Html, ..Default::default() };
        let html = export_pipeline_html(&pr, text, &config).unwrap();
        assert!(html.contains("step-0"), "Should render step-0 (parent)");
        assert!(html.contains("step-1"), "Should render step-1 (child)");
        assert!(html.contains("100"));
        assert!(html.contains("transactions per second"));
    }

    #[test]
    fn test_export_pipeline_html_exact_match_fallback() {
        let text = "System uptime must be 99.9% for availability.";

        // Neither extraction carries an interval or alignment status.
        let step1_res = step_result("s1", "Req", vec![bare_extraction("requirement", text)]);
        let step2_res = step_result("s2", "Vals", vec![bare_extraction("uptime", "99.9%")]);

        let pr = pipeline_result(req_vals_config(), vec![step1_res, step2_res]);

        let config = ExportConfig { format: ExportFormat::Html, ..Default::default() };
        let html = export_pipeline_html(&pr, text, &config).unwrap();
        assert!(html.contains("99.9%"), "Fallback should highlight exact match in original text");
    }

    #[test]
    fn test_export_pipeline_html_overlap_rendering() {
        let text = "The system shall support 10 users concurrently.";

        // "10" and "10 users" start at the same offset, so the child spans
        // overlap each other as well as the parent requirement.
        let phrase = "10 users";
        let ten_idx = text.find("10").unwrap();
        let users_idx = text.find(phrase).unwrap();

        let step1_res = step_result(
            "s1",
            "Req",
            vec![Extraction {
                char_interval: span(0, text.len()),
                ..bare_extraction("requirement", text)
            }],
        );

        let step2_res = step_result(
            "s2",
            "Vals",
            vec![
                Extraction {
                    char_interval: span(ten_idx, ten_idx + 2),
                    ..bare_extraction("value", "10")
                },
                Extraction {
                    char_interval: span(users_idx, users_idx + phrase.len()),
                    ..bare_extraction("phrase", phrase)
                },
            ],
        );

        let pr = pipeline_result(req_vals_config(), vec![step1_res, step2_res]);

        let config = ExportConfig {
            format: ExportFormat::Html,
            allow_overlapping_highlights: true,
            ..Default::default()
        };
        let html = export_pipeline_html(&pr, text, &config).unwrap();
        assert!(html.contains("10"));
        assert!(html.contains("10 users"));
    }
}