use std::ops::Range;
use mnem_core::id::Cid;
use mnem_ner_providers::NerConfig;
use serde::{Deserialize, Serialize};
/// A section record: an optional heading, a depth, the section's text, and a
/// byte range.
///
/// Serializable via serde; fields are emitted in declaration order.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Section {
/// Heading of the section; `None` when the section has no heading.
pub heading: Option<String>,
/// Depth of the section (presumably the heading nesting level -- TODO confirm
/// against the code that builds sections).
pub depth: u8,
/// Text content of the section.
pub text: String,
/// Byte range for the section (presumably offsets into the original input --
/// TODO confirm what the range is relative to).
pub byte_range: Range<usize>,
}
/// A chunk of text together with its section path and a token estimate.
///
/// Serializable via serde; fields are emitted in declaration order.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Chunk {
/// Section path of the chunk (presumably the chain of enclosing headings,
/// outermost first -- TODO confirm against the chunkers).
pub section_path: Vec<String>,
/// Text content of the chunk.
pub text: String,
/// Estimated token count (presumably of `text`; the estimator is not defined
/// in this file -- verify against the producer).
pub tokens_estimate: u32,
}
/// Kind of source handled by ingest.
///
/// Because of `rename_all = "lowercase"`, the variants serialize as
/// `"markdown"`, `"text"`, `"pdf"` and `"conversation"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SourceKind {
/// Serialized as `"markdown"`.
Markdown,
/// Serialized as `"text"`.
Text,
/// Serialized as `"pdf"`.
Pdf,
/// Serialized as `"conversation"`.
Conversation,
}
/// Alias for [`crate::chunk::ChunkerKind`], so the chunker kind can be named
/// from this module alongside the other ingest types.
pub type ChunkerKind = crate::chunk::ChunkerKind;
/// Configuration for an ingest run.
///
/// When deserializing, only `ner` may be omitted (it carries
/// `#[serde(default)]`); every other field has no serde default and must be
/// present in the input.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct IngestConfig {
/// Which chunker to use.
pub chunker: ChunkerKind,
/// Node type label (presumably applied to the nodes ingest creates -- TODO
/// confirm against the ingest pipeline).
pub ntype: String,
/// Maximum token count (presumably per chunk -- TODO confirm).
pub max_tokens: u32,
/// Overlap amount (presumably tokens shared between adjacent chunks -- TODO
/// confirm the unit against the chunkers).
pub overlap: u32,
/// NER configuration; falls back to `NerConfig::default()` when the key is
/// missing from the deserialized input.
#[serde(default)]
pub ner: NerConfig,
}
impl Default for IngestConfig {
    /// Builds the baseline configuration: the `Paragraph` chunker, node type
    /// `"Doc"`, `max_tokens` of 512, `overlap` of 32, and the default NER
    /// configuration.
    fn default() -> Self {
        let ntype = String::from("Doc");
        Self {
            chunker: ChunkerKind::Paragraph,
            ntype,
            max_tokens: 512,
            overlap: 32,
            ner: NerConfig::default(),
        }
    }
}
/// Summary of an ingest run: an optional commit CID, several counters, and an
/// elapsed time.
///
/// Serializable via serde; fields are emitted in declaration order.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct IngestResult {
/// CID of the commit, if any (presumably `None` when nothing was committed --
/// TODO confirm against the ingest pipeline).
pub commit_cid: Option<Cid>,
/// Number of nodes counted for this run.
pub node_count: u64,
/// Number of chunks counted for this run.
pub chunk_count: u64,
/// Number of entities counted for this run.
pub entity_count: u64,
/// Number of relations counted for this run.
pub relation_count: u64,
/// Elapsed time (milliseconds, going by the field name -- the measurement
/// itself is not in this file).
pub elapsed_ms: u64,
}
/// Format of a conversation source.
///
/// Because of `rename_all = "snake_case"`, the variants serialize as
/// `"chat_gpt"`, `"claude"` and `"generic"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ConversationFormat {
/// Serialized as `"chat_gpt"` (note the underscore).
ChatGpt,
/// Serialized as `"claude"`.
Claude,
/// Serialized as `"generic"`.
Generic,
}
/// A single message: a role, its content, and an optional timestamp.
///
/// Serializable via serde; fields are emitted in declaration order.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Message {
/// Role of the message author, as a free-form string (the set of accepted
/// values is not defined in this file).
pub role: String,
/// Content of the message.
pub content: String,
/// Timestamp of the message, if known. NOTE(review): the unit and epoch are
/// not established here -- confirm against the conversation parsers.
pub timestamp: Option<u64>,
}
/// Configuration for the extractor.
///
/// NOTE(review): when deserializing, `extract_ner` may be omitted (it then
/// defaults to `true`), but `relation_window_tokens` has no serde default and
/// must be present -- confirm this asymmetry is intended.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExtractorConfig {
/// Whether to extract NER; defaults to `true` (via `default_true`) when the
/// key is missing from the deserialized input.
#[serde(default = "default_true")]
pub extract_ner: bool,
/// Relation window size (in tokens, going by the field name -- TODO confirm
/// how the extractor applies it).
pub relation_window_tokens: usize,
}
/// Serde default helper: always returns `true`.
///
/// Referenced by path from `#[serde(default = "default_true")]`, so the name
/// and signature must stay as they are.
fn default_true() -> bool {
    true
}
impl Default for ExtractorConfig {
    /// Builds the baseline extractor configuration: NER extraction enabled and
    /// a relation window of 6 tokens.
    fn default() -> Self {
        let extract_ner = true;
        let relation_window_tokens = 6;
        Self {
            extract_ner,
            relation_window_tokens,
        }
    }
}
/// Optional chunker parameters; every field may be left unset.
///
/// The derived `Default` yields `None` for all fields. Presumably these act as
/// overrides for automatically chosen chunker settings -- TODO confirm against
/// the code that consumes this type.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkerAuto {
/// Maximum token count, if set.
pub max_tokens: Option<u32>,
/// Overlap amount, if set.
pub overlap: Option<u32>,
/// Maximum number of messages, if set (presumably used for conversation
/// sources -- verify against the chunkers).
pub max_messages: Option<usize>,
}