use super::header::detect_primary_subject;
use super::parse_lines::process_content_lines;
use super::parse_units::process_units;
use super::regex::sanitize_entity_name;
use super::text::chunk_document_content;
use super::types::{
ExtractionAccumulator, MemoryIngestionConfig, MemoryIngestionResult, ParsedIngestion,
DEFAULT_CHUNK_TOKENS,
};
/// Runs the parsing pipeline over one document and returns the aggregated result.
///
/// The steps, in order:
/// 1. Split `content` into chunks with `chunk_document_content`, using
///    `DEFAULT_CHUNK_TOKENS` as the chunk size argument.
/// 2. Seed an `ExtractionAccumulator` with the sanitized `title` and the
///    primary subject detected from `title`; every other field starts at its
///    `Default` value.
/// 3. Feed the accumulator through `process_content_lines` and then
///    `process_units`.
/// 4. Hand the accumulator, `config`, and the number of chunks to
///    `aggregate::finalize`, whose return value is returned unchanged.
pub(super) fn parse_document(
    content: &str,
    title: &str,
    config: &MemoryIngestionConfig,
) -> ParsedIngestion {
    let chunks = chunk_document_content(content, DEFAULT_CHUNK_TOKENS);

    // Both seed values are derived from the title alone, before any content
    // is inspected.
    let document_title = Some(sanitize_entity_name(title));
    let primary_subject = detect_primary_subject(title);
    let mut state = ExtractionAccumulator {
        document_title,
        primary_subject,
        ..ExtractionAccumulator::default()
    };

    // The line pass runs before the unit pass; both mutate the same state.
    process_content_lines(content, &chunks, &mut state, config);
    process_units(&chunks, &mut state, config);

    let chunk_total = chunks.len();
    super::aggregate::finalize(state, config, chunk_total)
}
/// Parses `content` (titled `title`) and packages the outcome as a
/// `MemoryIngestionResult`.
///
/// The parse itself is delegated to `parse_document`. On top of the parsed
/// data, the result carries the model name and extraction mode copied from
/// `config`, plus entity and relation counts taken from the lengths of the
/// parsed collections.
///
/// `document_id` and `namespace` are returned as empty strings.
/// NOTE(review): presumably the caller fills these in afterwards — confirm.
pub fn extract_document(
    content: &str,
    title: &str,
    config: &MemoryIngestionConfig,
) -> MemoryIngestionResult {
    let parsed = parse_document(content, title, config);

    // Take the counts first: the collections are moved into the result below.
    let entity_count = parsed.entities.len();
    let relation_count = parsed.relations.len();

    let model_name = config.model_name.clone();
    let extraction_mode = config.extraction_mode.as_str().to_string();

    MemoryIngestionResult {
        document_id: String::new(),
        namespace: String::new(),
        model_name,
        extraction_mode,
        chunk_count: parsed.chunk_count,
        entity_count,
        relation_count,
        preference_count: parsed.preference_count,
        decision_count: parsed.decision_count,
        tags: parsed.tags,
        entities: parsed.entities,
        relations: parsed.relations,
    }
}
// Test-only module: compiled only under `cfg(test)`, with its source loaded
// from `parse_tests.rs` via the `#[path]` attribute.
#[cfg(test)]
#[path = "parse_tests.rs"]
mod parse_tests;