use crate::types::{Result, Span};
use uuid::Uuid;
/// Splits `content` into consecutive, non-overlapping line-based [`Span`]s.
///
/// Lines are considered one at a time; after each line the pending range is
/// handed to `should_create_span`, and when it reports a split point the range
/// is emitted as a span and a new one starts on the following line.
///
/// Every span carries:
/// - a freshly generated v4 UUID (as a string) in `id`,
/// - a copy of `artifact_id`,
/// - 1-based, inclusive `start_line` / `end_line`,
/// - the covered lines joined with `\n` in `text`,
/// - `token_count` computed by `estimate_tokens`,
/// - `embedding`, `embedding_model` and `metadata` set to `None`.
///
/// Empty `content` produces an empty vector.
///
/// # Errors
///
/// This function has no failing path of its own and always returns `Ok`.
///
/// # Panics
///
/// Panics if `estimate_tokens` panics (tokenizer initialization failure).
pub fn extract_spans(content: &str, artifact_id: &str) -> Result<Vec<Span>> {
    let lines: Vec<&str> = content.lines().collect();
    let total_lines = lines.len();

    // Builds the span for the 1-based inclusive line range `first..=last`,
    // whose text is `chunk` joined with newlines.
    let make_span = |first: usize, last: usize, chunk: &[&str]| {
        let text = chunk.join("\n");
        let token_count = estimate_tokens(&text);
        Span {
            id: Uuid::new_v4().to_string(),
            artifact_id: artifact_id.to_string(),
            start_line: first,
            end_line: last,
            text,
            embedding: None,
            embedding_model: None,
            token_count,
            metadata: None,
        }
    };

    let mut spans = Vec::new();
    // 0-based index of the first line that has not been emitted yet.
    let mut pending_start = 0usize;

    for (idx, line) in lines.iter().enumerate() {
        // The pending span is always a contiguous slice of `lines`, so it is
        // borrowed directly instead of being copied into a second vector.
        let pending = &lines[pending_start..=idx];
        if should_create_span(pending, line, idx, total_lines) {
            spans.push(make_span(pending_start + 1, idx + 1, pending));
            pending_start = idx + 1;
        }
    }

    // Defensive flush: emit whatever is left if the final line did not
    // trigger a split.
    if pending_start < total_lines {
        spans.push(make_span(
            pending_start + 1,
            total_lines,
            &lines[pending_start..],
        ));
    }

    Ok(spans)
}
/// Decides whether the span under construction should be closed after `line`.
///
/// `current_lines` is the pending span *including* `line`, `idx` is the
/// 0-based index of `line` within the document, and `total_lines` is the
/// number of lines in the document.
///
/// Returns `true` when any of the following holds:
/// - `line` is the last line of the document;
/// - the pending span has reached 40 lines (hard cap);
/// - `line` is blank and the span has at least 20 lines;
/// - `line` is a section header (per `is_section_header`) and the span has at
///   least 15 lines;
/// - `line` opens or closes a code fence (three backticks or `~~~`) and the span
///   has at least 20 lines;
/// - `line` looks like a definition (`def `, `class `, `function `, `pub fn `,
///   `fn `, `const `, `let ` prefix) and the span has at least 30 lines.
fn should_create_span(
    current_lines: &[&str],
    line: &str,
    idx: usize,
    total_lines: usize,
) -> bool {
    // Minimum pending-span length required for each kind of split point.
    const HARD_CAP: usize = 40;
    const DEFINITION_MIN: usize = 30;
    const BLANK_MIN: usize = 20;
    const FENCE_MIN: usize = 20;
    const HEADER_MIN: usize = 15;
    const DEFINITION_PREFIXES: [&str; 7] = [
        "def ",
        "class ",
        "function ",
        "pub fn ",
        "fn ",
        "const ",
        "let ",
    ];

    // Written as `idx + 1 == total_lines` instead of `idx == total_lines - 1`
    // so that `total_lines == 0` cannot underflow.
    if idx + 1 == total_lines {
        return true;
    }

    let num_lines = current_lines.len();
    if num_lines >= HARD_CAP {
        return true;
    }

    let trimmed = line.trim();
    let is_blank = trimmed.is_empty();
    let is_header = is_section_header(line);
    let is_code_fence = trimmed.starts_with("```") || trimmed.starts_with("~~~");
    let is_definition = DEFINITION_PREFIXES
        .iter()
        .any(|prefix| trimmed.starts_with(*prefix));

    (is_header && num_lines >= HEADER_MIN)
        || (is_blank && num_lines >= BLANK_MIN)
        || (is_code_fence && num_lines >= FENCE_MIN)
        || (is_definition && num_lines >= DEFINITION_MIN)
}
/// Reports whether `line` looks like a section header.
///
/// After trimming surrounding whitespace, a line qualifies when either:
/// - it starts with `#` and is longer than one byte, or
/// - it is longer than two bytes and consists solely of the characters
///   `=`, `-`, `~` and `^`.
fn is_section_header(line: &str) -> bool {
    let candidate = line.trim();

    let hash_heading = candidate.len() > 1 && candidate.starts_with('#');
    let rule_line = candidate.len() > 2
        && candidate
            .chars()
            .all(|ch| matches!(ch, '=' | '-' | '~' | '^'));

    hash_heading || rule_line
}
use std::sync::OnceLock;
/// Shared tokenizer instance, created on first use by `estimate_tokens`
/// through `OnceLock::get_or_init` and reused by every later call.
static TOKENIZER: OnceLock<tiktoken_rs::CoreBPE> = OnceLock::new();
/// Returns the number of tokens `text` encodes to under the `cl100k_base`
/// encoding from `tiktoken_rs`.
///
/// The tokenizer is built on the first call and stored in `TOKENIZER`; later
/// calls reuse it.
///
/// # Panics
///
/// Panics if the tokenizer cannot be constructed.
fn estimate_tokens(text: &str) -> usize {
    let bpe = TOKENIZER.get_or_init(|| match tiktoken_rs::cl100k_base() {
        Ok(encoder) => encoder,
        Err(_) => panic!("Failed to initialize tiktoken tokenizer - this should not happen"),
    });
    let encoded = bpe.encode_with_special_tokens(text);
    encoded.len()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Three short lines fit in a single span that starts at line 1.
    #[test]
    fn test_extract_spans_simple() {
        let content = "Line 1\nLine 2\nLine 3";
        let spans = extract_spans(content, "test-artifact").unwrap();
        assert!(!spans.is_empty());
        assert_eq!(spans[0].start_line, 1);
        assert_eq!(spans[0].end_line, 3);
        assert!(spans[0].token_count > 0);
    }

    /// Empty input has no lines and therefore yields no spans.
    #[test]
    fn test_extract_spans_empty_content() {
        let spans = extract_spans("", "test-artifact").unwrap();
        assert!(spans.is_empty());
    }

    /// A blank line after 25 lines of text is a split point, so two
    /// paragraphs end up in at least two spans.
    #[test]
    fn test_extract_spans_with_paragraphs() {
        let content = (0..25).map(|i| format!("Line {}", i)).collect::<Vec<_>>().join("\n")
            + "\n\n"
            + &(25..50).map(|i| format!("Line {}", i)).collect::<Vec<_>>().join("\n");
        let spans = extract_spans(&content, "test-artifact").unwrap();
        assert!(spans.len() >= 2);
        for span in &spans {
            assert!(span.end_line >= span.start_line);
        }
    }

    /// Spans must tile the document: start at line 1, end at the last line,
    /// and each span must begin right after the previous one ends.
    #[test]
    fn test_no_overlapping_spans() {
        let content = (0..100).map(|i| format!("Line {}", i)).collect::<Vec<_>>().join("\n");
        let spans = extract_spans(&content, "test-artifact").unwrap();
        assert!(!spans.is_empty());
        assert_eq!(spans[0].start_line, 1);
        assert_eq!(spans[spans.len() - 1].end_line, 100);
        // `windows(2)` visits each adjacent pair and cannot underflow the way
        // `0..spans.len() - 1` does on an empty result.
        for pair in spans.windows(2) {
            assert_eq!(pair[0].end_line + 1, pair[1].start_line);
        }
    }

    /// Any non-empty sentence encodes to at least one token.
    #[test]
    fn test_token_estimation() {
        let text = "This is a test sentence with about ten words in it.";
        let tokens = estimate_tokens(text);
        assert!(tokens > 0);
    }
}