use once_cell::sync::Lazy;
use regex::Regex;
use crate::{error::Result, parser::Candidate};
/// Matches one fenced Markdown code block.
///
/// Capture group 1 is the (possibly empty) language tag made of `\w`
/// characters; capture group 2 is the block body. `(?s)` lets `.` cross line
/// breaks and the lazy `.*?` stops at the first closing fence.
///
/// The opening fence line may end with trailing spaces/tabs and with either
/// `\n` or `\r\n`, so CRLF input is recognised as well as LF input.
static CODE_BLOCK_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?s)```(\w*)[ \t]*\r?\n(.*?)```").expect("CODE_BLOCK_REGEX is a valid pattern")
});
/// Common interface for strategies that pull [`Candidate`]s out of raw input text.
///
/// Implementors must be `Send + Sync + Debug`.
pub trait Extractor: Send + Sync + std::fmt::Debug {
/// Short static identifier of this extraction strategy.
fn name(&self) -> &'static str;
/// Scans `input` and returns the candidates found, possibly none.
fn extract(&self, input: &str) -> Result<Vec<Candidate>>;
/// Numeric priority of this extractor.
// NOTE(review): whether lower or higher values run first is decided by the
// caller, which is not visible in this file; confirm before relying on it.
fn priority(&self) -> u8;
}
/// Extractor that offers the whole input, unchanged, as a single candidate.
#[derive(Debug, Clone, Default)]
pub struct DirectExtractor;
impl Extractor for DirectExtractor {
    /// Identifier of this strategy: `"direct"`.
    fn name(&self) -> &'static str {
        "direct"
    }

    /// Wraps the untouched `input` in one direct candidate.
    ///
    /// Input that is empty or whitespace-only produces no candidates.
    fn extract(&self, input: &str) -> Result<Vec<Candidate>> {
        let candidates = if input.trim().is_empty() {
            Vec::new()
        } else {
            vec![Candidate::direct(input)]
        };
        Ok(candidates)
    }

    /// Priority value of the direct strategy.
    fn priority(&self) -> u8 {
        1
    }
}
/// Extractor that scans free-form text for balanced `{...}` / `[...]` spans.
#[derive(Debug, Clone)]
pub struct HeuristicExtractor {
// Upper bound on how many detected spans `extract` considers per input.
max_candidates: usize,
}
impl Default for HeuristicExtractor {
fn default() -> Self {
Self::new()
}
}
impl HeuristicExtractor {
    /// Inputs longer than this many bytes are not scanned by `extract`.
    const MAX_INPUT_SIZE: usize = 1024 * 1024;

    /// Creates an extractor that considers at most 20 spans per input.
    pub fn new() -> Self {
        Self { max_candidates: 20 }
    }

    /// Creates an extractor that considers at most `max_candidates` spans per input.
    pub const fn with_max_candidates(max_candidates: usize) -> Self {
        Self { max_candidates }
    }

    /// Locates balanced `{...}` and `[...]` spans in `input`.
    ///
    /// Returns `(byte_start, byte_end, kind)` triples, where `kind` is
    /// `"object"` or `"array"` and `byte_end` is exclusive. The result is
    /// sorted by start offset and contains no overlapping spans: when two
    /// spans overlap, the one that starts first is kept.
    fn find_json_boundaries(&self, input: &str) -> Vec<(usize, usize, &'static str)> {
        let mut boundaries = Vec::new();
        self.find_balanced_boundaries(input, '{', '}', "object", &mut boundaries);
        self.find_balanced_boundaries(input, '[', ']', "array", &mut boundaries);

        // Earliest start first; on an identical start the longer span (larger
        // end) comes first so it is the one that survives de-duplication.
        boundaries.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| b.1.cmp(&a.1)));

        // Kept spans are disjoint and ordered by start, so their ends are
        // increasing as well. A new span (whose start is >= every kept start)
        // can therefore only overlap the most recently kept span.
        let mut kept: Vec<(usize, usize, &'static str)> = Vec::with_capacity(boundaries.len());
        for boundary in boundaries {
            let overlaps_last = match kept.last() {
                Some(&(_, last_end, _)) => boundary.0 < last_end,
                None => false,
            };
            if !overlaps_last {
                kept.push(boundary);
            }
        }
        kept
    }

    /// Appends every top-level balanced `open ... close` span found in `input`
    /// to `boundaries`, tagged with `pattern`.
    ///
    /// Offsets pushed are byte offsets into `input` (end exclusive). After a
    /// span is found scanning resumes right after it, so spans produced by a
    /// single call never overlap. An opener without a matching closer is
    /// skipped and scanning resumes at the next character.
    fn find_balanced_boundaries(
        &self,
        input: &str,
        open: char,
        close: char,
        pattern: &'static str,
        boundaries: &mut Vec<(usize, usize, &'static str)>,
    ) {
        // Work on (byte offset, char) pairs so multi-byte text maps back to
        // valid slice boundaries.
        let chars: Vec<(usize, char)> = input.char_indices().collect();
        let mut i = 0;
        while i < chars.len() {
            if chars[i].1 == open {
                if let Some(end_idx) = self.find_matching_close(&chars, i, open, close) {
                    let byte_start = chars[i].0;
                    // The exclusive end is the offset of the char after the
                    // closer, or the input length when the closer is last.
                    let byte_end = chars
                        .get(end_idx + 1)
                        .map_or(input.len(), |&(offset, _)| offset);
                    boundaries.push((byte_start, byte_end, pattern));
                    i = end_idx + 1;
                    continue;
                }
            }
            i += 1;
        }
    }

    /// Returns the index (into `chars`) of the `close` that balances the
    /// `open` located at `chars[start]`, or `None` if there is none.
    ///
    /// Delimiters inside quoted strings are ignored. A string is opened by `"`
    /// or `'` and is closed only by the *same* quote character, so an
    /// apostrophe inside a double-quoted value (`"it's"`) does not end the
    /// string. Inside a string a backslash escapes the following character.
    fn find_matching_close(
        &self,
        chars: &[(usize, char)],
        start: usize,
        open: char,
        close: char,
    ) -> Option<usize> {
        let mut depth: usize = 0;
        // `Some(q)` while inside a string that was opened with quote `q`.
        let mut string_delim: Option<char> = None;
        let mut escape_next = false;

        for (idx, &(_, ch)) in chars.iter().enumerate().skip(start) {
            if escape_next {
                escape_next = false;
                continue;
            }

            if let Some(delim) = string_delim {
                if ch == '\\' {
                    escape_next = true;
                } else if ch == delim {
                    string_delim = None;
                }
                continue;
            }

            if ch == '"' || ch == '\'' {
                string_delim = Some(ch);
            } else if ch == open {
                depth += 1;
            } else if ch == close {
                // A closer with nothing open means the text is unbalanced.
                depth = depth.checked_sub(1)?;
                if depth == 0 {
                    return Some(idx);
                }
            }
        }
        None
    }
}
impl Extractor for HeuristicExtractor {
    /// Identifier of this strategy: `"heuristic"`.
    fn name(&self) -> &'static str {
        "heuristic"
    }

    /// Turns the balanced spans detected in `input` into heuristic candidates.
    ///
    /// Inputs longer than `MAX_INPUT_SIZE` bytes yield no candidates. Only the
    /// first `max_candidates` detected spans are considered; among those, any
    /// span that is empty or reaches past the end of `input` is dropped.
    fn extract(&self, input: &str) -> Result<Vec<Candidate>> {
        if input.len() > Self::MAX_INPUT_SIZE {
            return Ok(Vec::new());
        }

        // The cap is applied before the validity filter, so a dropped span
        // still counts towards `max_candidates`.
        let candidates: Vec<Candidate> = self
            .find_json_boundaries(input)
            .into_iter()
            .take(self.max_candidates)
            .filter(|&(start, end, _)| start < end && end <= input.len())
            .map(|(start, end, pattern)| {
                Candidate::heuristic(&input[start..end], pattern.to_string())
            })
            .collect();

        Ok(candidates)
    }

    /// Priority value of the heuristic strategy.
    fn priority(&self) -> u8 {
        2
    }
}
/// Extractor that returns the bodies of fenced Markdown code blocks.
#[derive(Debug, Clone, Default)]
pub struct MarkdownExtractor;
impl MarkdownExtractor {
pub fn new() -> Self {
Self
}
}
impl Extractor for MarkdownExtractor {
    /// Identifier of this strategy: `"markdown"`.
    fn name(&self) -> &'static str {
        "markdown"
    }

    /// Produces one candidate per fenced code block matched by `CODE_BLOCK_REGEX`.
    ///
    /// The block body is trimmed; blocks whose trimmed body is empty are
    /// skipped. The language tag is passed along as `Some(tag)` when it is
    /// non-empty and as `None` otherwise.
    fn extract(&self, input: &str) -> Result<Vec<Candidate>> {
        let candidates: Vec<Candidate> = CODE_BLOCK_REGEX
            .captures_iter(input)
            .filter_map(|cap| {
                let body = cap.get(2).map_or("", |m| m.as_str()).trim();
                if body.is_empty() {
                    return None;
                }
                let lang = cap
                    .get(1)
                    .map(|m| m.as_str())
                    .filter(|tag| !tag.is_empty())
                    .map(str::to_string);
                Some(Candidate::markdown(body, lang))
            })
            .collect();

        Ok(candidates)
    }

    /// Priority value of the Markdown strategy.
    fn priority(&self) -> u8 {
        2
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Non-empty input comes back verbatim as the only candidate.
    #[test]
    fn test_direct_extractor() {
        let found = DirectExtractor.extract(r#"{"name": "Alice"}"#).unwrap();
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].content, r#"{"name": "Alice"}"#);
    }

    /// Whitespace-only input produces nothing.
    #[test]
    fn test_direct_extractor_empty() {
        let found = DirectExtractor.extract(" ").unwrap();
        assert!(found.is_empty());
    }

    /// An object embedded in prose is isolated from the surrounding text.
    #[test]
    fn test_heuristic_extractor() {
        let text = r#"Sure! Here's the data: {"name": "Alice", "age": 30} hope that helps!"#;
        let found = HeuristicExtractor::new().extract(text).unwrap();
        assert!(!found.is_empty());
        assert_eq!(found[0].content, r#"{"name": "Alice", "age": 30}"#);
    }

    /// Two separate objects give two candidates.
    #[test]
    fn test_heuristic_extractor_multiple() {
        let text = r#"First: {"a": 1} and second: {"b": 2}"#;
        let found = HeuristicExtractor::new().extract(text).unwrap();
        assert_eq!(found.len(), 2);
    }

    /// Arrays are detected as well as objects.
    #[test]
    fn test_heuristic_extractor_array() {
        let text = r#"The numbers are [1, 2, 3, 4, 5] as you can see."#;
        let found = HeuristicExtractor::new().extract(text).unwrap();
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].content, "[1, 2, 3, 4, 5]");
    }

    /// Multi-byte characters inside a value do not break slicing.
    #[test]
    fn test_heuristic_extractor_non_ascii_value() {
        let text = r#"{"message":"สวัสดี"}"#;
        let found = HeuristicExtractor::new().extract(text).unwrap();
        assert!(!found.is_empty());
        assert_eq!(found[0].content, text);
    }

    /// A four-byte emoji inside a value is kept intact.
    #[test]
    fn test_heuristic_extractor_emoji_in_value() {
        let text = r#"Result: {"msg": "Hello 🎉"} done"#;
        let found = HeuristicExtractor::new().extract(text).unwrap();
        assert!(!found.is_empty());
        assert_eq!(found[0].content, r#"{"msg": "Hello 🎉"}"#);
    }

    /// Multi-byte prose before the object does not shift the byte offsets.
    #[test]
    fn test_heuristic_extractor_multibyte_prose_prefix() {
        let text = r#"สวัสดี: {"name": "Alice"}"#;
        let found = HeuristicExtractor::new().extract(text).unwrap();
        assert!(!found.is_empty());
        assert_eq!(found[0].content, r#"{"name": "Alice"}"#);
    }
}