avocado_core/span.rs
//! Span extraction from documents
//!
//! This module handles extracting meaningful text spans from documents.
//! Spans are the fundamental unit of retrieval in AvocadoDB.
//!
//! # Key Principles
//!
//! - Spans should be 20-50 lines (roughly 200-500 tokens)
//! - Respect natural boundaries (paragraphs, code blocks, sections)
//! - Never split mid-sentence
//! - Maintain line number accuracy for citations

use crate::types::{Result, Span};
use std::sync::OnceLock;
use uuid::Uuid;

/// Extract spans from document text
///
/// # Arguments
///
/// * `content` - The full document text
/// * `artifact_id` - The ID of the parent artifact
///
/// # Returns
///
/// A vector of spans extracted from the document
///
/// # Algorithm
///
/// 1. Split document into lines
/// 2. Group lines into spans (target 20-50 lines)
/// 3. Respect paragraph boundaries (double newlines)
/// 4. Prefer code fence boundaries as split points
/// 5. Calculate token counts for each span
///
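/// # Examples
///
/// A minimal sketch; marked `ignore` because the exact public path of this
/// module (`avocado_core::span` here) is an assumption about the crate layout:
///
/// ```ignore
/// use avocado_core::span::extract_spans;
///
/// let spans = extract_spans("First line\nSecond line", "artifact-1").unwrap();
/// assert_eq!(spans[0].start_line, 1);
/// assert_eq!(spans[0].end_line, 2);
/// ```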
pub fn extract_spans(content: &str, artifact_id: &str) -> Result<Vec<Span>> {
    let lines: Vec<&str> = content.lines().collect();
    let mut spans = Vec::new();
    let mut current_start = 1;
    let mut current_lines = Vec::new();

    for (idx, line) in lines.iter().enumerate() {
        let line_num = idx + 1;
        current_lines.push(*line);

        // Determine if we should create a span at this point
        let should_split = should_create_span(
            &current_lines,
            line,
            idx,
            lines.len(),
        );

        if should_split && !current_lines.is_empty() {
            let text = current_lines.join("\n");
            let token_count = estimate_tokens(&text);

            spans.push(Span {
                id: Uuid::new_v4().to_string(),
                artifact_id: artifact_id.to_string(),
                start_line: current_start,
                end_line: line_num,
                text,
                embedding: None,
                embedding_model: None,
                token_count,
                metadata: None,
            });

            current_start = line_num + 1;
            current_lines.clear();
        }
    }

    // Handle any remaining lines (defensive: the loop already splits on the
    // final line, so this only fires if that invariant changes)
    if !current_lines.is_empty() {
        let text = current_lines.join("\n");
        let token_count = estimate_tokens(&text);

        spans.push(Span {
            id: Uuid::new_v4().to_string(),
            artifact_id: artifact_id.to_string(),
            start_line: current_start,
            end_line: lines.len(),
            text,
            embedding: None,
            embedding_model: None,
            token_count,
            metadata: None,
        });
    }

    Ok(spans)
}

/// Determine if we should create a span at the current position
///
/// Detects natural boundaries for creating semantic spans:
/// - Paragraph boundaries (empty lines)
/// - Code block boundaries (``` markers)
/// - Section headers (# markdown, == rst, etc.)
/// - Target span size (20-50 lines)
/// - Minimum span size (avoid tiny spans)
///
/// # Arguments
///
/// * `current_lines` - Lines accumulated so far (including the current line)
/// * `line` - The current line being processed
/// * `idx` - Current line index
/// * `total_lines` - Total number of lines in the document
///
/// # Returns
///
/// `true` if a span should be created at this point
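///
/// A rough sketch of the decision, given that `current_lines` already
/// includes the line under consideration:
///
/// ```text
/// 23 lines ending in ""        -> split (paragraph boundary, >= 20 lines)
/// 16 lines ending in "# Usage" -> split (section header, >= 15 lines)
/// 10 lines ending in ""        -> keep accumulating (below minimum size)
/// ```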
fn should_create_span(
    current_lines: &[&str],
    line: &str,
    idx: usize,
    total_lines: usize,
) -> bool {
    let num_lines = current_lines.len();

    // Always split at end of document
    if idx == total_lines - 1 {
        return true;
    }

    // Reached maximum size (50 lines)
    if num_lines >= 50 {
        return true;
    }

    // Hit a paragraph boundary and have the minimum size
    if line.trim().is_empty() && num_lines >= 20 {
        return true;
    }

    // Detect markdown/RST section headers (good split points)
    let is_header = is_section_header(line);
    if is_header && num_lines >= 15 {
        // End the current span when a header arrives and we have enough content
        return true;
    }

    // Detect code fence boundaries
    let is_code_fence = line.trim().starts_with("```") || line.trim().starts_with("~~~");
    if is_code_fence && num_lines >= 20 {
        // Split at code block boundaries
        return true;
    }

    // Ideal target size is 30 lines - split at the next good boundary
    if num_lines >= 30 {
        // Look for natural boundaries
        if line.trim().is_empty() || is_header || is_code_fence {
            return true;
        }

        // Detect function/class definitions (common in code)
        let trimmed = line.trim();
        let is_definition = trimmed.starts_with("def ")
            || trimmed.starts_with("class ")
            || trimmed.starts_with("function ")
            || trimmed.starts_with("pub fn ")
            || trimmed.starts_with("fn ")
            || trimmed.starts_with("const ")
            || trimmed.starts_with("let ");
        if is_definition && num_lines >= 25 {
            return true;
        }

        // At 40+ lines without a natural boundary, force a split
        if num_lines >= 40 {
            return true;
        }
    }

    false
}

/// Check if a line looks like a section header
fn is_section_header(line: &str) -> bool {
    let trimmed = line.trim();

    // Markdown headers: # Title, ## Title, etc.
    if trimmed.starts_with('#') && trimmed.len() > 1 {
        return true;
    }

    // ReStructuredText-style headers: underlines of =, -, ~, or ^
    if trimmed.len() > 2 && trimmed.chars().all(|c| c == '=' || c == '-' || c == '~' || c == '^') {
        return true;
    }

    false
}

/// Cached tiktoken tokenizer for performance
static TOKENIZER: OnceLock<Option<tiktoken_rs::CoreBPE>> = OnceLock::new();

/// Estimate token count for text
///
/// Uses tiktoken-rs for accurate token counting compatible with OpenAI models.
/// The tokenizer is cached for performance (avoids reloading on every call),
/// and falls back to a simple heuristic (roughly four characters per token)
/// if tiktoken initialization fails.
///
/// # Arguments
///
/// * `text` - The text to estimate tokens for
///
/// # Returns
///
/// Accurate token count (or a heuristic estimate if tiktoken fails)
fn estimate_tokens(text: &str) -> usize {
    // Initialize the tokenizer once; store None on failure so later calls
    // fall back to the heuristic instead of panicking.
    let tokenizer = TOKENIZER.get_or_init(|| tiktoken_rs::cl100k_base().ok());

    match tokenizer {
        Some(bpe) => bpe.encode_with_special_tokens(text).len(),
        // Heuristic fallback: English text averages about four characters per token.
        None => text.len().div_ceil(4),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_spans_simple() {
        let content = "Line 1\nLine 2\nLine 3";
        let spans = extract_spans(content, "test-artifact").unwrap();

        assert!(!spans.is_empty());
        assert_eq!(spans[0].start_line, 1);
        assert!(spans[0].token_count > 0);
    }

    #[test]
    fn test_extract_spans_with_paragraphs() {
        let content = (0..25).map(|i| format!("Line {}", i)).collect::<Vec<_>>().join("\n")
            + "\n\n"
            + &(25..50).map(|i| format!("Line {}", i)).collect::<Vec<_>>().join("\n");

        let spans = extract_spans(&content, "test-artifact").unwrap();

        // Should create multiple spans
        assert!(spans.len() >= 2);

        // Each span should have valid line numbers
        for span in &spans {
            assert!(span.end_line >= span.start_line);
        }
    }
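
    #[test]
    fn test_split_at_section_header() {
        // Hypothetical document: a markdown header arriving after enough
        // accumulated lines should trigger a split (the threshold is 15 lines).
        let body = (0..20).map(|i| format!("Line {}", i)).collect::<Vec<_>>().join("\n");
        let content = format!("{}\n# Section Two\nMore text", body);

        let spans = extract_spans(&content, "test-artifact").unwrap();
        assert!(spans.len() >= 2);
    }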

    #[test]
    fn test_no_overlapping_spans() {
        let content = (0..100).map(|i| format!("Line {}", i)).collect::<Vec<_>>().join("\n");
        let spans = extract_spans(&content, "test-artifact").unwrap();

        // Verify no gaps or overlaps between consecutive spans
        for pair in spans.windows(2) {
            assert_eq!(pair[0].end_line + 1, pair[1].start_line);
        }
    }
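
    #[test]
    fn test_is_section_header() {
        // Direct check of the header heuristic on sample lines.
        assert!(is_section_header("# Title"));
        assert!(is_section_header("## Subsection"));
        assert!(is_section_header("====="));
        assert!(is_section_header("-----"));
        // Prose and too-short underlines should not match.
        assert!(!is_section_header("Just a sentence."));
        assert!(!is_section_header("--"));
    }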

    #[test]
    fn test_token_estimation() {
        let text = "This is a test sentence with about ten words in it.";
        let tokens = estimate_tokens(text);
        assert!(tokens > 0);
    }
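
    #[test]
    fn test_token_count_bounded_by_chars() {
        // Each BPE token covers at least one byte, and the heuristic fallback
        // divides by four, so the count never exceeds the text length.
        let text = "fn main() { println!(\"hello\"); }";
        assert!(estimate_tokens(text) <= text.len());
    }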
}