use async_trait::async_trait;
use serde_json::Value;
use super::{TextSplitter, TextSplitterError};
/// Selects which kinds of JSON container the splitter breaks into separate elements.
///
/// Only the root value passed to the splitter is inspected; a container that is not
/// selected by the mode is serialized as one single element.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JsonSplitMode {
    /// Split a JSON object into one element per key; arrays are kept whole.
    Object,
    /// Split a JSON array into one element per item; objects are kept whole.
    Array,
    /// Split both objects (per key) and arrays (per item).
    Both,
}
/// Configuration for the JSON splitter.
#[derive(Debug, Clone)]
pub struct JsonSplitterOptions {
    /// Maximum length, in bytes, of a chunk before overlap is added.
    pub chunk_size: usize,
    /// Number of trailing bytes of the previous chunk that are repeated at the start of
    /// the next one.
    pub chunk_overlap: usize,
    /// Which JSON containers (objects, arrays, or both) are broken into elements.
    pub split_mode: JsonSplitMode,
    /// When `true`, each extracted element is rendered as `path: json` instead of just
    /// the serialized JSON.
    pub include_path: bool,
    /// When `true`, surrounding whitespace is trimmed from element text and from the
    /// pieces of oversized elements.
    pub trim_chunks: bool,
}
impl Default for JsonSplitterOptions {
fn default() -> Self {
Self::new()
}
}
impl JsonSplitterOptions {
pub fn new() -> Self {
Self {
chunk_size: 1000,
chunk_overlap: 200,
split_mode: JsonSplitMode::Both,
include_path: true,
trim_chunks: true,
}
}
pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
self.chunk_size = chunk_size;
self
}
pub fn with_chunk_overlap(mut self, chunk_overlap: usize) -> Self {
self.chunk_overlap = chunk_overlap;
self
}
pub fn with_split_mode(mut self, split_mode: JsonSplitMode) -> Self {
self.split_mode = split_mode;
self
}
pub fn with_include_path(mut self, include_path: bool) -> Self {
self.include_path = include_path;
self
}
pub fn with_trim_chunks(mut self, trim_chunks: bool) -> Self {
self.trim_chunks = trim_chunks;
self
}
}
/// Text splitter that parses its input as JSON and groups the serialized elements of the
/// root value into chunks according to its [`JsonSplitterOptions`].
#[derive(Debug, Clone)]
pub struct JsonSplitter {
    /// Settings that control element extraction, chunk size and overlap.
    options: JsonSplitterOptions,
}
impl Default for JsonSplitter {
fn default() -> Self {
Self::new()
}
}
impl JsonSplitter {
    /// Creates a splitter configured with [`JsonSplitterOptions::default`].
    pub fn new() -> Self {
        Self::with_options(JsonSplitterOptions::default())
    }

    /// Creates a splitter from an explicit set of options.
    pub fn with_options(options: JsonSplitterOptions) -> Self {
        Self { options }
    }

    /// Creates a default splitter with only the chunk size overridden.
    ///
    /// Unlike the other `with_*` methods this is a constructor rather than a builder step;
    /// use [`JsonSplitter::with_chunk_size_option`] to change an existing splitter.
    pub fn with_chunk_size(chunk_size: usize) -> Self {
        Self::new().with_chunk_size_option(chunk_size)
    }

    /// Builder step: replaces the chunk size (in bytes).
    pub fn with_chunk_size_option(mut self, chunk_size: usize) -> Self {
        self.options.chunk_size = chunk_size;
        self
    }

    /// Builder step: replaces the chunk overlap (in bytes).
    pub fn with_chunk_overlap(mut self, chunk_overlap: usize) -> Self {
        self.options.chunk_overlap = chunk_overlap;
        self
    }

    /// Builder step: replaces the split mode.
    pub fn with_split_mode(mut self, split_mode: JsonSplitMode) -> Self {
        self.options.split_mode = split_mode;
        self
    }

    /// Builder step: toggles the `path: ` prefix on extracted elements.
    pub fn with_include_path(mut self, include_path: bool) -> Self {
        self.options.include_path = include_path;
        self
    }

    /// Builder step: toggles whitespace trimming of elements and sub-chunks.
    pub fn with_trim_chunks(mut self, trim_chunks: bool) -> Self {
        self.options.trim_chunks = trim_chunks;
        self
    }

    /// Largest char boundary of `text` that is `<= index` (clamped to `text.len()`).
    fn floor_boundary(text: &str, index: usize) -> usize {
        let mut index = index.min(text.len());
        // Index 0 is always a boundary, so this cannot underflow.
        while !text.is_char_boundary(index) {
            index -= 1;
        }
        index
    }

    /// Smallest char boundary of `text` that is `>= index` (clamped to `text.len()`).
    fn ceil_boundary(text: &str, index: usize) -> usize {
        let mut index = index.min(text.len());
        // `text.len()` is always a boundary, so this cannot run past the end.
        while !text.is_char_boundary(index) {
            index += 1;
        }
        index
    }

    /// Serializes `value` to compact JSON, falling back to its `Display` form if
    /// serialization reports an error.
    fn to_json_string(value: &Value) -> String {
        serde_json::to_string(value).unwrap_or_else(|_| value.to_string())
    }

    /// Renders one child element as `(text, path)`, prefixing the text with `path: `
    /// when `include_path` is enabled.
    fn render_element(&self, value: &Value, path: String) -> (String, String) {
        let json_str = Self::to_json_string(value);
        let text = if self.options.include_path {
            format!("{}: {}", path, json_str)
        } else {
            json_str
        };
        (text, path)
    }

    /// Breaks `json` into `(text, path)` elements according to the split mode.
    ///
    /// * An object whose mode allows object splitting yields one element per key, with
    ///   path `parent.key` (or just `key` when `path` is empty).
    /// * An array whose mode allows array splitting yields one element per item, with
    ///   path `parent[i]`.
    /// * Anything else (scalars, or a container the mode does not split) yields a single
    ///   element holding the whole serialized value, without a path prefix.
    ///
    /// The function does not recurse: children are serialized as-is.
    fn extract_json_elements(&self, json: &Value, path: &str) -> Vec<(String, String)> {
        // Exhaustive on purpose so that a new mode forces a decision here.
        let (split_objects, split_arrays) = match self.options.split_mode {
            JsonSplitMode::Object => (true, false),
            JsonSplitMode::Array => (false, true),
            JsonSplitMode::Both => (true, true),
        };

        match json {
            Value::Object(map) if split_objects => map
                .iter()
                .map(|(key, value)| {
                    let child_path = if path.is_empty() {
                        key.clone()
                    } else {
                        format!("{}.{}", path, key)
                    };
                    self.render_element(value, child_path)
                })
                .collect(),
            Value::Array(items) if split_arrays => items
                .iter()
                .enumerate()
                // With an empty parent path this produces "[i]".
                .map(|(index, value)| self.render_element(value, format!("{}[{}]", path, index)))
                .collect(),
            _ => vec![(Self::to_json_string(json), path.to_string())],
        }
    }

    /// Packs element texts into chunks of at most `chunk_size` bytes, joining elements
    /// with `'\n'`, then applies the configured overlap.
    ///
    /// Elements that are empty (after optional trimming) are dropped. An element that is
    /// larger than `chunk_size` on its own is cut up by [`Self::split_large_text`].
    /// Because overlap is prepended afterwards, a final chunk may exceed `chunk_size`
    /// by up to `chunk_overlap + 1` bytes.
    fn split_elements_into_chunks(&self, elements: Vec<(String, String)>) -> Vec<String> {
        let chunk_size = self.options.chunk_size;
        let mut chunks = Vec::new();
        let mut current = String::new();

        for (text, _path) in elements {
            let text = if self.options.trim_chunks {
                text.trim().to_string()
            } else {
                text
            };
            if text.is_empty() {
                continue;
            }

            // Length the buffer would have with this element appended (separator included).
            let joined_len = if current.is_empty() {
                text.len()
            } else {
                current.len() + 1 + text.len()
            };
            if joined_len <= chunk_size {
                if !current.is_empty() {
                    current.push('\n');
                }
                current.push_str(&text);
                continue;
            }

            // The element does not fit: flush what has been gathered so far.
            if !current.is_empty() {
                chunks.push(std::mem::take(&mut current));
            }
            if text.len() > chunk_size {
                chunks.extend(self.split_large_text(&text));
            } else {
                current = text;
            }
        }

        if !current.is_empty() {
            chunks.push(current);
        }
        self.apply_overlap(chunks)
    }

    /// Cuts an oversized text into windows of at most `chunk_size` bytes, each window
    /// starting `chunk_overlap` bytes before the end of the previous one.
    ///
    /// Window edges are snapped to UTF-8 char boundaries so slicing never panics, and the
    /// loop always terminates: it stops once a window reaches the end of the text and
    /// never lets the start position move backwards or stand still (which would otherwise
    /// happen whenever `chunk_overlap >= chunk_size`).
    fn split_large_text(&self, text: &str) -> Vec<String> {
        let chunk_size = self.options.chunk_size;
        let overlap = self.options.chunk_overlap;
        let mut chunks = Vec::new();
        let mut start = 0;

        while start < text.len() {
            let mut end = Self::floor_boundary(text, start.saturating_add(chunk_size));
            if end <= start {
                // The window is narrower than the next character; take that character whole.
                end = Self::ceil_boundary(text, start + 1);
            }

            let piece = &text[start..end];
            let piece = if self.options.trim_chunks {
                piece.trim()
            } else {
                piece
            };
            if !piece.is_empty() {
                chunks.push(piece.to_string());
            }

            if end >= text.len() {
                // Last window emitted; stepping back by the overlap here would loop forever.
                break;
            }
            let next = Self::floor_boundary(text, end.saturating_sub(overlap));
            // Guarantee forward progress even when the overlap swallows the whole window.
            start = if next > start { next } else { end };
        }
        chunks
    }

    /// Prepends to every chunk but the first the last `chunk_overlap` bytes of the chunk
    /// before it (as it was prior to overlapping), separated by `'\n'`.
    ///
    /// The overlap start is moved forward to the next UTF-8 char boundary when needed, so
    /// the repeated tail is never longer than `chunk_overlap` bytes and slicing cannot
    /// panic. Returns the input unchanged when overlap is 0 or there is at most one chunk.
    fn apply_overlap(&self, chunks: Vec<String>) -> Vec<String> {
        let overlap = self.options.chunk_overlap;
        if overlap == 0 || chunks.len() <= 1 {
            return chunks;
        }

        let mut overlapped = Vec::with_capacity(chunks.len());
        overlapped.push(chunks[0].clone());
        for pair in chunks.windows(2) {
            let (prev, chunk) = (&pair[0], &pair[1]);
            let tail_start = Self::ceil_boundary(prev, prev.len().saturating_sub(overlap));
            let tail = &prev[tail_start..];

            let mut merged = String::with_capacity(tail.len() + 1 + chunk.len());
            if !tail.is_empty() {
                merged.push_str(tail);
                merged.push('\n');
            }
            merged.push_str(chunk);
            overlapped.push(merged);
        }
        overlapped
    }
}
#[async_trait]
impl TextSplitter for JsonSplitter {
    /// Parses `text` as JSON and returns it split into chunks.
    ///
    /// Returns an empty vector for empty input, `InvalidSplitterOptions` when the
    /// configured chunk size is 0, and `OtherError` carrying the parser message when
    /// `text` is not valid JSON.
    async fn split_text(&self, text: &str) -> Result<Vec<String>, TextSplitterError> {
        // Empty input short-circuits before the options are validated.
        if text.is_empty() {
            return Ok(Vec::new());
        }
        if self.options.chunk_size == 0 {
            return Err(TextSplitterError::InvalidSplitterOptions);
        }

        let parsed = match serde_json::from_str::<Value>(text) {
            Ok(value) => value,
            Err(err) => {
                return Err(TextSplitterError::OtherError(format!(
                    "JSON parse error: {}",
                    err
                )));
            }
        };

        // The root path is empty, so element paths start at the top-level key or index.
        let root_elements = self.extract_json_elements(&parsed, "");
        Ok(self.split_elements_into_chunks(root_elements))
    }
}