use super::ParsingStrategy;
use crate::{
error::Result,
value::{FlexValue, Source},
};
/// Largest input, in bytes, that `parse` will scan (1 MiB).
/// Inputs longer than this are not scanned and yield an empty candidate list.
const MAX_INPUT_SIZE: usize = 1024 * 1024;
/// Parsing strategy that looks for balanced `{...}` / `[...]` spans inside
/// free-form text and tries to deserialize each span as JSON.
#[derive(Debug, Clone)]
pub struct HeuristicStrategy {
/// Upper bound on how many candidate spans `parse` attempts per input.
max_candidates: usize,
}
impl Default for HeuristicStrategy {
fn default() -> Self {
Self::new()
}
}
impl HeuristicStrategy {
    /// Creates a strategy that considers at most 20 candidate spans per input.
    #[inline]
    pub fn new() -> Self {
        Self { max_candidates: 20 }
    }

    /// Creates a strategy that considers at most `max_candidates` candidate
    /// spans per input.
    #[inline]
    pub const fn with_max_candidates(max_candidates: usize) -> Self {
        Self { max_candidates }
    }

    /// Locates non-overlapping balanced `{...}` and `[...]` spans in `input`.
    ///
    /// Returns `(byte_start, byte_end, pattern)` triples ordered by start
    /// offset. `byte_end` is exclusive and `pattern` is `"object"` or
    /// `"array"`. When two spans overlap, the one that starts first is kept,
    /// so a span nested inside another is dropped in favour of the outer one.
    fn find_json_boundaries(&self, input: &str) -> Vec<(usize, usize, &'static str)> {
        let mut boundaries = Vec::new();
        self.find_balanced_boundaries(input, '{', '}', "object", &mut boundaries);
        self.find_balanced_boundaries(input, '[', ']', "array", &mut boundaries);

        // Earliest start first; for equal starts the longer span wins.
        boundaries.sort_by(|a, b| {
            a.0.cmp(&b.0)
                .then_with(|| (b.1 - b.0).cmp(&(a.1 - a.0)))
        });

        // Spans are visited in start order and every kept span ends after the
        // previous kept one, so a candidate overlaps the kept set exactly when
        // it starts before the end of the most recently kept span. Tracking
        // that single offset replaces a scan over all kept spans.
        let mut deduped = Vec::with_capacity(boundaries.len());
        let mut covered_until = 0;
        for boundary in boundaries {
            if boundary.0 >= covered_until {
                covered_until = boundary.1;
                deduped.push(boundary);
            }
        }
        deduped
    }

    /// Appends every balanced `open`..`close` span found in `input` to
    /// `boundaries` as `(byte_start, byte_end, pattern)`.
    ///
    /// Scanning resumes right after a matched span, so spans nested inside a
    /// matched span are not reported separately. An opener without a matching
    /// closer is skipped and scanning continues with the next character.
    ///
    /// NOTE: every unmatched opener triggers a scan to the end of the input,
    /// so the worst case (many unmatched openers) is quadratic in input length.
    fn find_balanced_boundaries(
        &self,
        input: &str,
        open: char,
        close: char,
        pattern: &'static str,
        boundaries: &mut Vec<(usize, usize, &'static str)>,
    ) {
        // Pair each char with its byte offset so matches can be reported as
        // byte ranges that are safe to slice `input` with.
        let chars: Vec<(usize, char)> = input.char_indices().collect();
        let mut i = 0;
        while let Some(&(byte_start, ch)) = chars.get(i) {
            let matched = if ch == open {
                self.find_matching_close(&chars, i, open, close)
            } else {
                None
            };
            match matched {
                Some(end_idx) => {
                    // Exclusive end: the byte offset of the char after the
                    // closer, or the end of input when the closer is last.
                    let byte_end = chars
                        .get(end_idx + 1)
                        .map_or(input.len(), |&(offset, _)| offset);
                    boundaries.push((byte_start, byte_end, pattern));
                    i = end_idx + 1;
                }
                None => i += 1,
            }
        }
    }

    /// Returns the index into `chars` of the `close` that balances the `open`
    /// at `start`, or `None` if the input ends first.
    ///
    /// Delimiters inside quoted strings are ignored. A string runs from a `"`
    /// or `'` to the next unescaped occurrence of the *same* quote character,
    /// so an apostrophe inside a double-quoted string (`"it's"`) does not end
    /// the string. Within a string, a backslash escapes the following char.
    ///
    /// The caller in this file always passes a `start` that indexes an `open`
    /// character.
    fn find_matching_close(
        &self,
        chars: &[(usize, char)],
        start: usize,
        open: char,
        close: char,
    ) -> Option<usize> {
        let mut depth: i32 = 0;
        // `Some(q)` while inside a string that was opened by quote char `q`.
        let mut quote: Option<char> = None;
        let mut escape_next = false;

        for (idx, &(_, ch)) in chars.iter().enumerate().skip(start) {
            if escape_next {
                escape_next = false;
                continue;
            }
            match quote {
                Some(q) => {
                    if ch == '\\' {
                        escape_next = true;
                    } else if ch == q {
                        quote = None;
                    }
                    // Anything else, including the other quote kind and the
                    // delimiters, is plain string content.
                }
                None => {
                    if ch == '"' || ch == '\'' {
                        quote = Some(ch);
                    } else if ch == open {
                        depth += 1;
                    } else if ch == close {
                        depth -= 1;
                        if depth == 0 {
                            return Some(idx);
                        }
                    }
                }
            }
        }
        None
    }
}
impl ParsingStrategy for HeuristicStrategy {
    /// Identifier of this strategy: `"heuristic"`.
    #[inline]
    fn name(&self) -> &'static str {
        "heuristic"
    }

    /// Extracts JSON values embedded in `input`.
    ///
    /// Balanced `{...}` / `[...]` spans are located, the first
    /// `max_candidates` of them are handed to `serde_json`, and every span
    /// that deserializes successfully becomes a `FlexValue` tagged with
    /// `Source::Heuristic` and the span's pattern name. Spans that fail to
    /// deserialize are skipped silently.
    ///
    /// Inputs larger than `MAX_INPUT_SIZE` bytes are not scanned and produce
    /// an empty list. This method never returns `Err`.
    fn parse(&self, input: &str) -> Result<Vec<FlexValue>> {
        if input.len() > MAX_INPUT_SIZE {
            return Ok(Vec::new());
        }

        let spans = self.find_json_boundaries(input);
        let candidates: Vec<FlexValue> = spans
            .iter()
            // The cap applies to spans examined, not to successful parses.
            .take(self.max_candidates)
            .filter_map(|&(start, end, pattern)| {
                // Defensive guard against empty or out-of-range spans.
                if start >= end || end > input.len() {
                    return None;
                }
                let value = serde_json::from_str(&input[start..end]).ok()?;
                Some(FlexValue::new(
                    value,
                    Source::Heuristic {
                        pattern: pattern.to_string(),
                    },
                ))
            })
            .collect();

        Ok(candidates)
    }

    /// Priority value of this strategy (`4`). How priorities are ordered is
    /// defined by the `ParsingStrategy` trait, which is not visible here.
    #[inline]
    fn priority(&self) -> u8 {
        4
    }
}
// Unit tests for `HeuristicStrategy`: extraction of JSON embedded in prose,
// the delimiter-matching helper, the input-size guard, and non-ASCII input.
#[cfg(test)]
mod tests {
use serde_json::json;
use super::*;
// A single object surrounded by prose is extracted and parsed.
#[test]
fn test_extract_json_from_prose() {
let strategy = HeuristicStrategy::new();
let input = r#"Sure! Here's the data: {"name": "Alice", "age": 30} hope that helps!"#;
let result = strategy.parse(input).unwrap();
assert!(!result.is_empty());
assert_eq!(result[0].value, json!({"name": "Alice", "age": 30}));
}
// A single array surrounded by prose is extracted and parsed.
#[test]
fn test_extract_array_from_prose() {
let strategy = HeuristicStrategy::new();
let input = r#"The numbers are [1, 2, 3, 4, 5] as you can see."#;
let result = strategy.parse(input).unwrap();
assert!(!result.is_empty());
assert_eq!(result[0].value, json!([1, 2, 3, 4, 5]));
}
// Two separate objects in the same text yield two candidates.
#[test]
fn test_multiple_json_in_prose() {
let strategy = HeuristicStrategy::new();
let input = r#"First: {"a": 1} and second: {"b": 2}"#;
let result = strategy.parse(input).unwrap();
assert_eq!(result.len(), 2);
}
// A nested object is returned as part of its enclosing object.
#[test]
fn test_nested_json_in_prose() {
let strategy = HeuristicStrategy::new();
let input = r#"The user is {"name": "Alice", "address": {"city": "NYC"}} thanks!"#;
let result = strategy.parse(input).unwrap();
assert!(!result.is_empty());
assert_eq!(
result[0].value,
json!({"name": "Alice", "address": {"city": "NYC"}})
);
}
// Braces inside a string value must not affect brace matching.
#[test]
fn test_json_with_strings_containing_braces() {
let strategy = HeuristicStrategy::new();
let input = r#"Data: {"text": "Hello {world}"} done"#;
let result = strategy.parse(input).unwrap();
assert!(!result.is_empty());
assert_eq!(result[0].value, json!({"text": "Hello {world}"}));
}
// Text without any braces or brackets yields no candidates.
#[test]
fn test_no_json_in_text() {
let strategy = HeuristicStrategy::new();
let input = "This is just plain text with no JSON.";
let result = strategy.parse(input).unwrap();
assert!(result.is_empty());
}
// An opening brace that is never closed yields no candidates.
#[test]
fn test_unbalanced_braces() {
let strategy = HeuristicStrategy::new();
let input = r#"Invalid: {"name": "Alice" missing brace"#;
let result = strategy.parse(input).unwrap();
assert!(result.is_empty()); }
// An object buried in several lines of prose is still found.
#[test]
fn test_long_rambling_response() {
let strategy = HeuristicStrategy::new();
let input = r#"
Well, let me think about this. The user you're asking about is quite interesting.
They have been with us for a while. Actually, I should give you their data.
The information is {"name": "Alice", "age": 30} as you can see.
Let me know if you need anything else about this user or other users.
"#;
let result = strategy.parse(input).unwrap();
assert!(!result.is_empty());
assert_eq!(result[0].value, json!({"name": "Alice", "age": 30}));
}
// With a limit of 2, four objects in the input produce at most two results.
#[test]
fn test_max_candidates() {
let strategy = HeuristicStrategy::with_max_candidates(2);
let input = r#"{"a": 1} {"b": 2} {"c": 3} {"d": 4}"#;
let result = strategy.parse(input).unwrap();
assert!(result.len() <= 2); }
// The helper returns the char index (16) of the brace closing a flat object.
#[test]
fn test_find_matching_close() {
let strategy = HeuristicStrategy::new();
let chars: Vec<(usize, char)> = r#"{"name": "Alice"}"#.char_indices().collect();
let close_idx = strategy.find_matching_close(&chars, 0, '{', '}');
assert_eq!(close_idx, Some(16)); }
// With nesting, the helper returns the outermost closing brace (index 14).
#[test]
fn test_find_matching_close_with_nested() {
let strategy = HeuristicStrategy::new();
let chars: Vec<(usize, char)> = r#"{"a": {"b": 1}}"#.char_indices().collect();
let close_idx = strategy.find_matching_close(&chars, 0, '{', '}');
assert_eq!(close_idx, Some(14));
}
// A 2 MiB input exceeds MAX_INPUT_SIZE (1 MiB) and yields no candidates.
#[test]
fn test_dos_protection() {
let strategy = HeuristicStrategy::new();
let huge_input = "x".repeat(2 * 1024 * 1024);
let result = strategy.parse(&huge_input).unwrap();
assert!(result.is_empty()); }
// Multi-byte Thai text inside a string value is preserved.
#[test]
fn test_non_ascii_thai_in_value() {
let strategy = HeuristicStrategy::new();
let input = r#"{"status":"complete","message":"สวัสดี"}"#;
let result = strategy.parse(input).unwrap();
assert!(!result.is_empty());
assert_eq!(result[0].value["message"], "สวัสดี");
}
// An emoji as the last character of a string value is preserved.
#[test]
fn test_non_ascii_emoji_in_value() {
let strategy = HeuristicStrategy::new();
let input = r#"{"status":"complete","message":"Hello 🎉"}"#;
let result = strategy.parse(input).unwrap();
assert!(!result.is_empty());
assert_eq!(result[0].value["message"], "Hello 🎉");
}
// Chinese text in a value, with ASCII prose on both sides of the object.
#[test]
fn test_non_ascii_chinese_in_value() {
let strategy = HeuristicStrategy::new();
let input = r#"Some text {"greeting": "你好世界"} more text"#;
let result = strategy.parse(input).unwrap();
assert!(!result.is_empty());
assert_eq!(result[0].value["greeting"], "你好世界");
}
// Multi-byte prose before the object must not shift the extracted byte range.
#[test]
fn test_non_ascii_prose_before_json() {
let strategy = HeuristicStrategy::new();
let input = r#"สวัสดี: {"name": "Alice", "age": 30}"#;
let result = strategy.parse(input).unwrap();
assert!(!result.is_empty());
assert_eq!(result[0].value, json!({"name": "Alice", "age": 30}));
}
// Accented Latin characters both in the leading prose and in the values.
#[test]
fn test_non_ascii_accented_characters() {
let strategy = HeuristicStrategy::new();
let input = r#"Résumé: {"name": "Ångström", "city": "München"}"#;
let result = strategy.parse(input).unwrap();
assert!(!result.is_empty());
assert_eq!(result[0].value["name"], "Ångström");
assert_eq!(result[0].value["city"], "München");
}
}