Skip to main content

mnem_ingest/
types.rs

//! Shared data types used throughout the ingest pipeline.
//!
//! Kept in a single file for B5a; if the surface grows past ~150 lines in
//! later sub-waves we will split (`section.rs`, `chunk.rs`, `config.rs`).
5
6use std::ops::Range;
7
8use mnem_core::id::Cid;
9use mnem_ner_providers::NerConfig;
10use serde::{Deserialize, Serialize};
11
12/// A hierarchical text region extracted from a source.
13///
14/// Produced by parsers in [`crate::md`] / [`crate::text`] and consumed by
15/// chunkers in [`mod@crate::chunk`]. The `byte_range` always refers to offsets
16/// in the *original* source input (not the post-parse normalized text), so
17/// downstream stages can slice back into the raw document for diffing or
18/// provenance tracking.
19///
20/// Heading depth uses `CommonMark`'s 1-indexed convention (`# H1 → 1`). A
21/// depth of `0` indicates "no heading" (e.g. top-of-file prose before any
22/// heading, or the synthetic root produced by [`crate::text::parse_text`]).
23#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
24pub struct Section {
25    /// Heading text, without the leading `#` markers and trimmed.
26    pub heading: Option<String>,
27    /// Heading depth (1–6 for actual headings, 0 for headless prose).
28    pub depth: u8,
29    /// Body text contained under this heading (code blocks are kept intact).
30    pub text: String,
31    /// Byte range in the original source input.
32    pub byte_range: Range<usize>,
33}
34
35/// A single chunk emitted by a [`crate::chunk::ChunkerKind`].
36///
37/// `section_path` records the hierarchy of headings that enclose this
38/// chunk, from the root of the document down. It is used by downstream
39/// stages for breadcrumb display and for attaching graph edges back to
40/// the enclosing `Doc` node.
41#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
42pub struct Chunk {
43    /// Heading hierarchy from outermost to innermost.
44    pub section_path: Vec<String>,
45    /// Chunk body text.
46    pub text: String,
47    /// Whitespace-split token count (deterministic estimate).
48    pub tokens_estimate: u32,
49}
50
51/// The kind of source being ingested.
52///
53/// Only `Markdown` and `Text` are handled in Phase-B5a; the other variants
54/// are declared here so public signatures remain stable across sub-waves.
55#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
56#[serde(rename_all = "lowercase")]
57pub enum SourceKind {
58    /// `CommonMark` + GFM (tables, fenced code).
59    Markdown,
60    /// UTF-8 plain text, no structure inferred.
61    Text,
62    /// PDF (text-layer extraction). Handled in Phase-B5b.
63    Pdf,
64    /// Chat transcript (JSON/JSONL). Handled in Phase-B5b.
65    Conversation,
66}
67
68/// Which chunker strategy to use, and its parameters.
69///
70/// Re-exported from [`mod@crate::chunk`] for convenience.
71pub type ChunkerKind = crate::chunk::ChunkerKind;
72
73/// Configuration for an ingest run.
74///
75/// `ntype` is the `Node::ntype` string applied to the root document node
76/// once Phase-B5c wires commit. Typical values: `"Doc"`, `"Note"`,
77/// `"Transcript"`.
78#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
79pub struct IngestConfig {
80    /// Which chunker to use.
81    pub chunker: ChunkerKind,
82    /// `Node::ntype` of the root Doc node.
83    pub ntype: String,
84    /// Target maximum tokens per chunk (advisory; used by recursive chunker).
85    pub max_tokens: u32,
86    /// Overlap tokens between adjacent chunks (recursive chunker only).
87    pub overlap: u32,
88    /// NER provider selection. Defaults to [`NerConfig::Rule`] (the
89    /// capitalized-phrase heuristic). Set to [`NerConfig::None`] to
90    /// suppress all entity extraction.
91    #[serde(default)]
92    pub ner: NerConfig,
93}
94
95impl Default for IngestConfig {
96    fn default() -> Self {
97        Self {
98            chunker: ChunkerKind::Paragraph,
99            ntype: "Doc".into(),
100            max_tokens: 512,
101            overlap: 32,
102            ner: NerConfig::default(),
103        }
104    }
105}
106
107/// Outcome of a completed ingest run.
108///
109/// Phase-B5c wires the real pipeline: `commit_cid` is `Some(_)` whenever
110/// the caller committed the transaction after [`crate::Ingester::ingest`]
111/// returned; `None` when they ran a dry-run (ingest without commit) or
112/// when the underlying backend reports no change. `node_count` counts
113/// every `Node` added (the Doc root, one per chunk, one per unique
114/// entity). `entity_count` and `relation_count` report extraction
115/// output before dedup. `chunk_count` reports the number of chunks
116/// produced by the chunker stage.
117#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
118pub struct IngestResult {
119    /// Commit produced by the run, if any.
120    pub commit_cid: Option<Cid>,
121    /// Number of graph nodes created.
122    pub node_count: u64,
123    /// Number of chunks produced.
124    pub chunk_count: u64,
125    /// Number of entity nodes created (deduplicated across the run).
126    pub entity_count: u64,
127    /// Number of relation edges created.
128    pub relation_count: u64,
129    /// Wall-clock elapsed time in milliseconds.
130    pub elapsed_ms: u64,
131}
132
133/// Recognised conversation-export formats.
134///
135/// Used by [`crate::conversation::parse_conversation`] to route JSON into
136/// the right schema decoder. [`Self::Generic`] is the fallback for
137/// `[{"role", "content", "timestamp"?}]` shaped payloads.
138#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
139#[serde(rename_all = "snake_case")]
140pub enum ConversationFormat {
141    /// `ChatGPT` export (`conversations.json`) with a `mapping` tree of
142    /// message nodes keyed by UUID.
143    ChatGpt,
144    /// Claude export with a flat `{"conversation": [{role, content}]}`
145    /// top-level object.
146    Claude,
147    /// Generic `[{role, content, timestamp?}]` array.
148    Generic,
149}
150
151/// A single turn in a conversation.
152///
153/// `timestamp` is an optional Unix epoch in seconds - some exports
154/// (Claude, generic) omit it and we preserve that absence rather than
155/// fabricating zeroes.
156#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
157pub struct Message {
158    /// Speaker role, e.g. `"user"`, `"assistant"`, `"system"`, `"tool"`.
159    pub role: String,
160    /// Turn text content. Multi-part `ChatGPT` messages are concatenated
161    /// with `"\n\n"` separators by the parser.
162    pub content: String,
163    /// Unix epoch seconds, if the source provided one.
164    pub timestamp: Option<u64>,
165}
166
167/// Configuration for the entity + relation extractor.
168///
169/// Entity extraction is handled entirely by the NER provider wired via
170/// [`IngestConfig::ner`]. The provider may return any label strings it
171/// chooses; there is no fixed vocabulary.
172#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
173pub struct ExtractorConfig {
174    /// Call the NER provider for named-entity extraction. All labels
175    /// returned by the provider pass through unconditionally.
176    #[serde(default = "default_true")]
177    pub extract_ner: bool,
178    /// Maximum number of whitespace-separated tokens between two entity
179    /// spans that may still be linked by a proximity relation.
180    pub relation_window_tokens: usize,
181}
182
/// Serde default helper: always returns `true`.
///
/// Referenced by name from `#[serde(default = "default_true")]`, which
/// needs a function path rather than a literal.
const fn default_true() -> bool {
    true
}
186
187impl Default for ExtractorConfig {
188    fn default() -> Self {
189        Self {
190            extract_ner: true,
191            relation_window_tokens: 6,
192        }
193    }
194}
195
196/// Advisory inputs for [`crate::chunk::auto_chunker`].
197///
198/// Defaults match the production heuristics documented on each
199/// [`crate::SourceKind`] → [`ChunkerKind`] mapping. Callers only need to
200/// override when they want tighter or looser chunking than the out-of-
201/// the-box behaviour.
202#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
203pub struct ChunkerAuto {
204    /// Override `max_tokens` for recursive chunking. `None` picks the
205    /// per-source-kind default.
206    pub max_tokens: Option<u32>,
207    /// Override `overlap` for recursive chunking. `None` picks the
208    /// per-source-kind default.
209    pub overlap: Option<u32>,
210    /// Override the session-chunker boundary for conversations. `None`
211    /// picks the default of 10 messages per chunk.
212    pub max_messages: Option<usize>,
213}