sakurs_core/api/
output.rs

1//! Output types for unified API
2
use std::collections::HashSet;
use std::time::Duration;
4
/// Processing output with rich metadata
///
/// Pairs the detected sentence [`Boundary`] list with
/// [`ProcessingMetadata`] describing the run that produced it.
#[derive(Debug, Clone)]
pub struct Output {
    /// Sentence boundaries found, in text order
    pub boundaries: Vec<Boundary>,
    /// Processing metadata (timing, strategy, chunking, statistics)
    pub metadata: ProcessingMetadata,
}
13
/// A sentence boundary with detailed information
#[derive(Debug, Clone)]
pub struct Boundary {
    /// Byte offset in the original text
    pub offset: usize,
    /// Character offset in the original text (differs from `offset`
    /// when the text contains multi-byte UTF-8 characters)
    pub char_offset: usize,
    /// Confidence score (0.0 to 1.0); currently always 1.0 for the
    /// DeltaStack algorithm
    pub confidence: f32,
    /// Optional context for debugging; not populated by
    /// `from_delta_stack_result`, which always sets `None`
    pub context: Option<BoundaryContext>,
}
26
/// Context information for a boundary (for debugging)
///
/// NOTE(review): no constructor in this file fills this in — presumably
/// populated by a debug path elsewhere; confirm against callers.
#[derive(Debug, Clone)]
pub struct BoundaryContext {
    /// Text immediately before the boundary
    pub before: String,
    /// Text immediately after the boundary
    pub after: String,
    /// Human-readable reason the boundary was placed here
    pub reason: String,
}
37
/// Metadata about the processing
#[derive(Debug, Clone)]
pub struct ProcessingMetadata {
    /// Total processing duration
    pub duration: Duration,
    /// Strategy used for processing: `"sequential"` or
    /// `"parallel (N threads)"`
    pub strategy_used: String,
    /// Number of chunks processed
    pub chunks_processed: usize,
    /// Peak memory usage in bytes (currently always 0 — memory
    /// tracking is not yet integrated)
    pub memory_peak: usize,
    /// Additional statistics
    pub stats: ProcessingStats,
}
52
/// Additional processing statistics
#[derive(Debug, Clone)]
pub struct ProcessingStats {
    /// Total bytes processed (input length in bytes)
    pub bytes_processed: usize,
    /// Total characters processed (input length in Unicode scalar values)
    pub chars_processed: usize,
    /// Number of sentences found (one per boundary)
    pub sentence_count: usize,
    /// Average sentence length in characters; 0.0 when no sentences found
    pub avg_sentence_length: f32,
}
65
66impl Output {
67    /// Create output from delta stack processing result
68    pub(crate) fn from_delta_stack_result(
69        result: crate::application::DeltaStackResult,
70        text: &str,
71        duration: Duration,
72    ) -> Self {
73        // Calculate character offsets for each byte boundary
74        let char_boundaries = Self::calculate_char_offsets(text, &result.boundaries);
75
76        let boundaries = result
77            .boundaries
78            .into_iter()
79            .zip(char_boundaries)
80            .map(|(offset, char_offset)| Boundary {
81                offset,
82                char_offset,
83                confidence: 1.0, // DeltaStack algorithm has high confidence
84                context: None,
85            })
86            .collect::<Vec<_>>();
87
88        let sentence_count = boundaries.len();
89        let avg_sentence_length = if sentence_count > 0 {
90            text.chars().count() as f32 / sentence_count as f32
91        } else {
92            0.0
93        };
94
95        // Determine strategy used based on thread count
96        let strategy_used = if result.thread_count > 1 {
97            format!("parallel ({} threads)", result.thread_count)
98        } else {
99            "sequential".to_string()
100        };
101
102        Self {
103            boundaries,
104            metadata: ProcessingMetadata {
105                duration,
106                strategy_used,
107                chunks_processed: result.chunk_count,
108                memory_peak: 0, // Future: memory tracking integration
109                stats: ProcessingStats {
110                    bytes_processed: text.len(),
111                    chars_processed: text.chars().count(),
112                    sentence_count,
113                    avg_sentence_length,
114                },
115            },
116        }
117    }
118
119    /// Calculate character offsets from byte offsets
120    fn calculate_char_offsets(text: &str, byte_offsets: &[usize]) -> Vec<usize> {
121        let mut char_offsets = Vec::with_capacity(byte_offsets.len());
122        let mut char_count = 0;
123        let mut byte_count = 0;
124
125        for (i, ch) in text.chars().enumerate() {
126            if byte_offsets.contains(&byte_count) {
127                char_offsets.push(i);
128            }
129            byte_count += ch.len_utf8();
130            char_count += 1;
131        }
132
133        // Handle any remaining offsets at the end
134        if byte_offsets.contains(&byte_count) {
135            char_offsets.push(char_count);
136        }
137
138        char_offsets
139    }
140}