mod candidate;
mod cleaner;
pub mod state_machine;
pub mod strategies;
pub use candidate::{Candidate, CandidateSource};
pub use cleaner::{Cleaner, GarbageCleaner};
use strategies::{
DirectExtractor, DirectJsonStrategy, Extractor, HeuristicExtractor, HeuristicStrategy,
JsonFixerStrategy, MarkdownExtractor, MarkdownStrategy, MultipleObjectsStrategy,
ParsingStrategy, RawPrimitiveStrategy, StateMachineStrategy,
};
use crate::{error::Result, value::FlexValue};
/// Depth limit handed to `GarbageCleaner::extract_from_deep_nesting` when
/// `FlexibleParser::parse` preprocesses its input.
// NOTE(review): the exact meaning of the limit is defined by the cleaner — confirm there.
pub const MAX_NESTING_DEPTH: usize = 50;
#[cfg(feature = "yaml")]
use strategies::YamlStrategy;
/// Parser that runs a list of [`ParsingStrategy`] implementations over a piece
/// of text and collects the [`FlexValue`]s they produce.
///
/// Every constructor in this module sorts the strategy list by
/// `ParsingStrategy::priority()` before storing it.
#[derive(Debug)]
pub struct FlexibleParser {
    /// Strategies in ascending `priority()` order; `parse` walks them in this order.
    strategies: Vec<Box<dyn ParsingStrategy>>,
}
impl Clone for FlexibleParser {
fn clone(&self) -> Self {
Self::new()
}
}
impl Default for FlexibleParser {
fn default() -> Self {
Self::new()
}
}
impl FlexibleParser {
pub fn new() -> Self {
let mut strategies: Vec<Box<dyn ParsingStrategy>> = vec![
Box::new(DirectJsonStrategy),
Box::new(JsonFixerStrategy::default()),
Box::new(RawPrimitiveStrategy::new()),
Box::new(StateMachineStrategy::new()),
Box::new(HeuristicStrategy::default()),
Box::new(MarkdownStrategy),
Box::new(MultipleObjectsStrategy::new()),
];
#[cfg(feature = "yaml")]
{
strategies.push(Box::new(YamlStrategy));
}
strategies.sort_by_key(|s| s.priority());
Self { strategies }
}
pub fn with_strategies(mut strategies: Vec<Box<dyn ParsingStrategy>>) -> Self {
strategies.sort_by_key(|s| s.priority());
Self { strategies }
}
pub fn builder() -> FlexibleParserBuilder {
FlexibleParserBuilder::new()
}
}
/// Step-by-step configuration for a [`FlexibleParser`].
///
/// Obtained from [`FlexibleParser::builder`]. Each `without_*` / `with_*`
/// method consumes and returns the builder; `build` produces the parser.
///
/// `Debug` is derived so the builder can be inspected like the parser it
/// produces (which already derives `Debug` over the same strategy list type).
#[derive(Debug)]
pub struct FlexibleParserBuilder {
    /// When `true`, `build` starts from the built-in strategy set.
    use_defaults: bool,
    /// When `true`, `build` leaves `YamlStrategy` out of the built-in set.
    #[cfg(feature = "yaml")]
    without_yaml: bool,
    /// When `true`, `build` leaves `MarkdownStrategy` out of the built-in set.
    without_markdown: bool,
    /// When `true`, `build` leaves `HeuristicStrategy` out of the built-in set.
    without_heuristic: bool,
    /// Extra strategies registered through `with_strategy`.
    additional_strategies: Vec<Box<dyn ParsingStrategy>>,
}
impl FlexibleParserBuilder {
fn new() -> Self {
Self {
use_defaults: true,
#[cfg(feature = "yaml")]
without_yaml: false,
without_markdown: false,
without_heuristic: false,
additional_strategies: Vec::new(),
}
}
pub fn without_defaults(mut self) -> Self {
self.use_defaults = false;
self
}
#[cfg(feature = "yaml")]
pub fn without_yaml(mut self) -> Self {
self.without_yaml = true;
self
}
pub fn without_markdown(mut self) -> Self {
self.without_markdown = true;
self
}
pub fn without_heuristic(mut self) -> Self {
self.without_heuristic = true;
self
}
pub fn with_strategy(mut self, strategy: Box<dyn ParsingStrategy>) -> Self {
self.additional_strategies.push(strategy);
self
}
pub fn build(self) -> FlexibleParser {
let mut strategies: Vec<Box<dyn ParsingStrategy>> = if self.use_defaults {
let mut default_strategies: Vec<Box<dyn ParsingStrategy>> = vec![
Box::new(DirectJsonStrategy),
Box::new(JsonFixerStrategy::default()),
Box::new(RawPrimitiveStrategy::new()),
Box::new(StateMachineStrategy::new()),
Box::new(MultipleObjectsStrategy::new()),
];
if !self.without_heuristic {
default_strategies.push(Box::new(HeuristicStrategy::default()));
}
if !self.without_markdown {
default_strategies.push(Box::new(MarkdownStrategy));
}
#[cfg(feature = "yaml")]
if !self.without_yaml {
default_strategies.push(Box::new(YamlStrategy));
}
default_strategies
} else {
Vec::new()
};
strategies.extend(self.additional_strategies);
strategies.sort_by_key(|s| s.priority());
FlexibleParser { strategies }
}
}
impl FlexibleParser {
    /// Parses `input` and returns every value the configured strategies could
    /// recover from it.
    ///
    /// An empty vector means nothing could be recovered.
    ///
    /// # Errors
    ///
    /// Propagates errors raised while cleaning extracted candidates
    /// (`GarbageCleaner::clean`). Failures of individual strategies or
    /// extractors are treated as "no match" and are not reported.
    pub fn parse(&self, input: &str) -> Result<Vec<FlexValue>> {
        self.parse_multi_stage(input)
    }

    /// Runs the full pipeline: preprocess, try each strategy, then fall back
    /// to extract / clean / fix.
    fn parse_multi_stage(&self, input: &str) -> Result<Vec<FlexValue>> {
        // Preprocessing: optionally replace the input with what the cleaner
        // pulls out of deep nesting, then strip invisible characters and
        // unnecessary backslashes.
        let cleaner = GarbageCleaner::new();
        let deep_nesting_extracted = cleaner.extract_from_deep_nesting(input, MAX_NESTING_DEPTH);
        let input_after_nesting = deep_nesting_extracted.as_deref().unwrap_or(input);
        let step1 = cleaner.remove_invisible_chars(input_after_nesting);
        let preprocessed = cleaner.fix_unnecessary_backslashes(&step1);
        let input = preprocessed.as_str();

        // Stage 1: run the strategies in priority order, accumulating results.
        let mut all_candidates = Vec::new();
        for strategy in &self.strategies {
            // A strategy error only means "this strategy does not apply".
            let mut candidates = match strategy.parse(input) {
                Ok(candidates) => candidates,
                Err(_) => continue,
            };

            // Both flags must be computed BEFORE `append`, which drains
            // `candidates`; checking afterwards would always see an empty vec.
            let is_direct = candidates
                .iter()
                .any(|c| matches!(c.source, crate::value::Source::Direct));
            let is_yaml = candidates
                .iter()
                .any(|c| matches!(c.source, crate::value::Source::Yaml));

            all_candidates.append(&mut candidates);

            // A direct or YAML hit is authoritative: stop trying strategies.
            if is_direct || is_yaml {
                // Normalization is applied only when a direct parse was seen.
                if is_direct {
                    all_candidates = self.normalize_candidates(all_candidates)?;
                }
                return Ok(all_candidates);
            }
        }
        if !all_candidates.is_empty() {
            return Ok(all_candidates);
        }

        // Stage 2: no strategy produced anything; extract raw candidates,
        // clean them, and parse each one (directly, else via the fixer).
        let extracted = self.extract_candidates(input)?;
        if extracted.is_empty() {
            return Ok(Vec::new());
        }
        let cleaned = self.clean_candidates(extracted)?;
        let fixer = JsonFixerStrategy::default();
        for candidate in cleaned {
            if let Ok(value) = serde_json::from_str(&candidate.content) {
                all_candidates.push(FlexValue::new(value, candidate.to_source()));
                continue;
            }
            // Candidates the fixer cannot repair are skipped.
            if let Ok(mut fixed_candidates) = fixer.parse(&candidate.content) {
                all_candidates.append(&mut fixed_candidates);
            }
        }
        Ok(all_candidates)
    }

    /// Passes every candidate through `GarbageCleaner::clean`.
    ///
    /// When the cleaner returns `None` the original candidate is kept, so the
    /// output has the same length and order as the input.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by `GarbageCleaner::clean`.
    fn clean_candidates(&self, candidates: Vec<Candidate>) -> Result<Vec<Candidate>> {
        let cleaner = GarbageCleaner::new();
        let mut cleaned = Vec::with_capacity(candidates.len());
        for candidate in candidates {
            let replacement = cleaner.clean(&candidate)?;
            cleaned.push(replacement.unwrap_or(candidate));
        }
        Ok(cleaned)
    }

    /// Normalizes already-parsed values.
    ///
    /// * A JSON string whose content is itself valid JSON is replaced by the
    ///   parsed inner value.
    /// * Otherwise the value is serialized, run through
    ///   `remove_invisible_chars` and `normalize_field_names`, and re-parsed
    ///   if that changed the text.
    ///
    /// Values that cannot be serialized or re-parsed are passed through
    /// unchanged; the `source` of each value is always preserved.
    fn normalize_candidates(&self, candidates: Vec<FlexValue>) -> Result<Vec<FlexValue>> {
        let cleaner = GarbageCleaner::new();
        let mut normalized = Vec::with_capacity(candidates.len());
        for flex_value in candidates {
            // Unwrap JSON that was embedded as a string.
            if let serde_json::Value::String(s) = &flex_value.value {
                if let Ok(inner_value) = serde_json::from_str::<serde_json::Value>(s) {
                    normalized.push(FlexValue::new(inner_value, flex_value.source));
                    continue;
                }
            }
            match serde_json::to_string(&flex_value.value) {
                Ok(json_str) => {
                    let invisible_removed = cleaner.remove_invisible_chars(&json_str);
                    let normalized_str = cleaner.normalize_field_names(&invisible_removed);
                    // Only re-parse when the cleaner actually changed the text.
                    if normalized_str != json_str {
                        if let Ok(new_value) = serde_json::from_str(&normalized_str) {
                            normalized.push(FlexValue::new(new_value, flex_value.source));
                            continue;
                        }
                    }
                    normalized.push(flex_value);
                }
                Err(_) => {
                    normalized.push(flex_value);
                }
            }
        }
        Ok(normalized)
    }

    /// Collects raw candidates from the direct, heuristic and markdown
    /// extractors, in that order.
    ///
    /// An extractor that fails contributes nothing; the function itself always
    /// returns `Ok`.
    fn extract_candidates(&self, input: &str) -> Result<Vec<Candidate>> {
        let mut candidates = Vec::new();
        let extractors: Vec<Box<dyn Extractor>> = vec![
            Box::new(DirectExtractor),
            Box::new(HeuristicExtractor::default()),
            Box::new(MarkdownExtractor),
        ];
        for extractor in extractors {
            // Best effort: a failing extractor is simply skipped.
            if let Ok(mut extracted) = extractor.extract(input) {
                candidates.append(&mut extracted);
            }
        }
        Ok(candidates)
    }

    /// Number of strategies registered in this parser.
    #[inline]
    pub fn strategy_count(&self) -> usize {
        self.strategies.len()
    }

    /// Names of the registered strategies, in the order they are tried.
    pub fn strategy_names(&self) -> Vec<&'static str> {
        self.strategies.iter().map(|s| s.name()).collect()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// The default parser registers 7 strategies, or 8 with the `yaml` feature.
    #[test]
    fn test_new_parser() {
        let expected_count = if cfg!(feature = "yaml") { 8 } else { 7 };
        assert_eq!(FlexibleParser::new().strategy_count(), expected_count);
    }

    /// After sorting, `multiple_objects` comes first and `direct_json` second.
    #[test]
    fn test_strategy_priority_order() {
        let ordered_names = FlexibleParser::new().strategy_names();
        assert_eq!(ordered_names[0], "multiple_objects");
        assert_eq!(ordered_names[1], "direct_json");
    }

    /// Well-formed JSON is returned as the first value.
    #[test]
    fn test_parse_direct_json() {
        let values = FlexibleParser::new()
            .parse(r#"{"name": "Alice"}"#)
            .unwrap();
        assert!(!values.is_empty());
        assert_eq!(values[0].value, json!({"name": "Alice"}));
    }

    /// A trailing comma still yields at least one value.
    #[test]
    fn test_parse_with_trailing_comma() {
        let values = FlexibleParser::new()
            .parse(r#"{"name": "Alice",}"#)
            .unwrap();
        assert!(!values.is_empty());
    }

    /// JSON inside a fenced markdown block is returned as the first value.
    #[test]
    fn test_parse_markdown() {
        let fenced = r#"
```json
{"name": "Bob"}
```
"#;
        let values = FlexibleParser::new().parse(fenced).unwrap();
        assert!(!values.is_empty());
        assert_eq!(values[0].value, json!({"name": "Bob"}));
    }

    /// Empty input yields a single empty-string value.
    #[test]
    fn test_parse_empty_input() {
        let values = FlexibleParser::new().parse("").unwrap();
        assert_eq!(values.len(), 1);
        assert_eq!(values[0].value, json!(""));
    }

    /// Plain text yields a single string value holding that text.
    #[test]
    fn test_parse_invalid_text() {
        let values = FlexibleParser::new()
            .parse("This is just plain text")
            .unwrap();
        assert_eq!(values.len(), 1);
        assert_eq!(values[0].value, json!("This is just plain text"));
    }

    /// Single-quoted pseudo-JSON still yields at least one value.
    #[test]
    fn test_multiple_candidates() {
        let values = FlexibleParser::new()
            .parse(r#"{'name': 'Alice'}"#)
            .unwrap();
        assert!(!values.is_empty());
    }

    /// `with_strategies` keeps exactly the strategies it was given.
    #[test]
    fn test_with_custom_strategies() {
        let only_direct: Vec<Box<dyn ParsingStrategy>> = vec![Box::new(DirectJsonStrategy)];
        let custom = FlexibleParser::with_strategies(only_direct);
        assert_eq!(custom.strategy_count(), 1);
        assert_eq!(custom.strategy_names()[0], "direct_json");
    }
}