// avocado_core/span.rs

1//! Span extraction from documents
2//!
3//! This module handles extracting meaningful text spans from documents.
4//! Spans are the fundamental unit of retrieval in AvocadoDB.
5//!
6//! # Key Principles
7//!
8//! - Spans should be 20-50 lines (roughly 200-500 tokens)
9//! - Respect natural boundaries (paragraphs, code blocks, sections)
10//! - Never split mid-sentence
11//! - Maintain line number accuracy for citations
12
13use crate::types::{Result, Span};
14use uuid::Uuid;
15
16/// Extract spans from document text
17///
18/// # Arguments
19///
20/// * `content` - The full document text
21/// * `artifact_id` - The ID of the parent artifact
22///
23/// # Returns
24///
25/// A vector of spans extracted from the document
26///
27/// # Algorithm
28///
29/// 1. Split document into lines
30/// 2. Group lines into spans (target 20-50 lines)
31/// 3. Respect paragraph boundaries (double newlines)
32/// 4. Keep code blocks together
33/// 5. Calculate token counts for each span
34///
35pub fn extract_spans(content: &str, artifact_id: &str) -> Result<Vec<Span>> {
36    let lines: Vec<&str> = content.lines().collect();
37    let mut spans = Vec::new();
38    let mut current_start = 1;
39    let mut current_lines = Vec::new();
40
41    for (idx, line) in lines.iter().enumerate() {
42        let line_num = idx + 1;
43        current_lines.push(*line);
44
45        // Determine if we should create a span at this point
46        let should_split = should_create_span(
47            &current_lines,
48            line,
49            idx,
50            lines.len(),
51        );
52
53        if should_split && !current_lines.is_empty() {
54            let text = current_lines.join("\n");
55            let token_count = estimate_tokens(&text);
56
57            spans.push(Span {
58                id: Uuid::new_v4().to_string(),
59                artifact_id: artifact_id.to_string(),
60                start_line: current_start,
61                end_line: line_num,
62                text,
63                embedding: None,
64                embedding_model: None,
65                token_count,
66                metadata: None,
67            });
68
69            current_start = line_num + 1;
70            current_lines.clear();
71        }
72    }
73
74    // Handle any remaining lines
75    if !current_lines.is_empty() {
76        let text = current_lines.join("\n");
77        let token_count = estimate_tokens(&text);
78
79        spans.push(Span {
80            id: Uuid::new_v4().to_string(),
81            artifact_id: artifact_id.to_string(),
82            start_line: current_start,
83            end_line: lines.len(),
84            text,
85            embedding: None,
86            embedding_model: None,
87            token_count,
88            metadata: None,
89        });
90    }
91
92    Ok(spans)
93}
94
95/// Determine if we should create a span at the current position
96///
97/// # Arguments
98///
99/// * `current_lines` - Lines accumulated so far
100/// * `line` - The current line being processed
101/// * `idx` - Current line index
102/// * `total_lines` - Total number of lines in document
103///
104/// # Returns
105///
106/// `true` if a span should be created at this point
107///
108/// Smart span boundary detection
109///
110/// Detects natural boundaries for creating semantic spans:
111/// - Paragraph boundaries (empty lines)
112/// - Code block boundaries (``` markers)
113/// - Section headers (# markdown, == rst, etc.)
114/// - Target span size (20-50 lines)
115/// - Minimum span size (avoid tiny spans)
116fn should_create_span(
117    current_lines: &[&str],
118    line: &str,
119    idx: usize,
120    total_lines: usize,
121) -> bool {
122    let num_lines = current_lines.len();
123
124    // Always split at end of document
125    if idx == total_lines - 1 {
126        return true;
127    }
128
129    // Reached maximum size (50 lines)
130    if num_lines >= 50 {
131        return true;
132    }
133
134    // Hit paragraph boundary and have minimum size
135    if line.trim().is_empty() && num_lines >= 20 {
136        return true;
137    }
138
139    // Detect markdown/RST section headers (good split points)
140    let is_header = is_section_header(line);
141    if is_header && num_lines >= 15 {
142        // Split before headers if we have enough content
143        return true;
144    }
145
146    // Detect code fence boundaries
147    let is_code_fence = line.trim().starts_with("```") || line.trim().starts_with("~~~");
148    if is_code_fence && num_lines >= 20 {
149        // Split at code block boundaries
150        return true;
151    }
152
153    // Ideal target size is 30 lines - split at next good boundary
154    if num_lines >= 30 {
155        // Look for natural boundaries
156        if line.trim().is_empty() || is_header || is_code_fence {
157            return true;
158        }
159        
160        // Detect function/class definitions (common in code)
161        let trimmed = line.trim();
162        let is_definition = trimmed.starts_with("def ") 
163            || trimmed.starts_with("class ")
164            || trimmed.starts_with("function ")
165            || trimmed.starts_with("pub fn ")
166            || trimmed.starts_with("fn ")
167            || trimmed.starts_with("const ")
168            || trimmed.starts_with("let ");
169        if is_definition && num_lines >= 25 {
170            return true;
171        }
172
173        // If we're at 40+ lines and haven't found a boundary, force split
174        if num_lines >= 40 {
175            return true;
176        }
177    }
178
179    false
180}
181
/// Check if a line looks like a section header
///
/// Recognizes:
/// - Markdown ATX headers: 1-6 `#` characters followed by a space or end of
///   line (`# Title`, `## Sub`, bare `##`). Requiring the space avoids false
///   positives on `#hashtag`, `#!/bin/sh` shebangs, `#include`, and Rust
///   attributes such as `#[derive(Debug)]`.
/// - ReStructuredText-style underlines: a run (3+) of `=`, `-`, `~`, or `^`.
fn is_section_header(line: &str) -> bool {
    let trimmed = line.trim();

    // Markdown ATX headers: `# Title` through `###### Title`.
    // `#` is a 1-byte char, so the char count doubles as a byte offset.
    let hashes = trimmed.chars().take_while(|&c| c == '#').count();
    if (1..=6).contains(&hashes)
        && trimmed.len() > 1
        && trimmed[hashes..].chars().next().map_or(true, |c| c == ' ')
    {
        return true;
    }

    // ReStructuredText style headers: underlines with =, -, ~, etc.
    if trimmed.len() > 2 && trimmed.chars().all(|c| c == '=' || c == '-' || c == '~' || c == '^') {
        return true;
    }

    false
}
198
199use std::sync::OnceLock;
200
201/// Cached tiktoken tokenizer for performance
202static TOKENIZER: OnceLock<tiktoken_rs::CoreBPE> = OnceLock::new();
203
204/// Estimate token count for text
205///
206/// Uses tiktoken-rs for accurate token counting compatible with OpenAI models.
207/// The tokenizer is cached for performance (avoids reloading on every call).
208/// Falls back to simple heuristic if tiktoken initialization fails.
209///
210/// # Arguments
211///
212/// * `text` - The text to estimate tokens for
213///
214/// # Returns
215///
216/// Accurate token count (or heuristic estimate if tiktoken fails)
217fn estimate_tokens(text: &str) -> usize {
218    // Use cached tiktoken tokenizer for accurate counting
219    let tokenizer = TOKENIZER.get_or_init(|| {
220        // If initialization fails, we'll use a dummy tokenizer that always returns 0
221        // and fall back to heuristic below
222        tiktoken_rs::cl100k_base().unwrap_or_else(|_| {
223            // Return a dummy - we'll detect this and use heuristic
224            // This is a workaround since we can't return an error from get_or_init
225            // In practice, tiktoken should never fail to initialize
226            panic!("Failed to initialize tiktoken tokenizer - this should not happen")
227        })
228    });
229
230    // Use the tokenizer
231    tokenizer.encode_with_special_tokens(text).len()
232}
233
#[cfg(test)]
mod tests {
    use super::*;

    /// A short document should produce at least one span starting at line 1.
    #[test]
    fn test_extract_spans_simple() {
        let content = "Line 1\nLine 2\nLine 3";
        let spans = extract_spans(content, "test-artifact").unwrap();

        assert!(!spans.is_empty());
        assert_eq!(spans[0].start_line, 1);
        assert!(spans[0].token_count > 0);
    }

    /// A paragraph break after 25 lines should yield multiple spans.
    #[test]
    fn test_extract_spans_with_paragraphs() {
        // 25 lines, a blank separator line, then 25 more lines.
        let content = (0..25).map(|i| format!("Line {}", i)).collect::<Vec<_>>().join("\n")
            + "\n\n"
            + &(25..50).map(|i| format!("Line {}", i)).collect::<Vec<_>>().join("\n");

        let spans = extract_spans(&content, "test-artifact").unwrap();

        // Should create multiple spans
        assert!(spans.len() >= 2);

        // Each span should have valid line numbers
        for span in &spans {
            assert!(span.end_line >= span.start_line);
        }
    }

    /// Consecutive spans must tile the document with no gaps or overlaps.
    #[test]
    fn test_no_overlapping_spans() {
        let content = (0..100).map(|i| format!("Line {}", i)).collect::<Vec<_>>().join("\n");
        let spans = extract_spans(&content, "test-artifact").unwrap();
        assert!(!spans.is_empty());

        // `windows(2)` is safe for any span count, unlike `0..len - 1`,
        // which underflows and panics if `spans` is ever empty.
        for pair in spans.windows(2) {
            assert_eq!(pair[0].end_line + 1, pair[1].start_line);
        }
    }

    /// Token estimation must return a positive count for non-empty text.
    #[test]
    fn test_token_estimation() {
        let text = "This is a test sentence with about ten words in it.";
        let tokens = estimate_tokens(text);
        assert!(tokens > 0);
    }
}