mnem_ingest/types.rs
1//! Shared data types used throughout the ingest pipeline.
2//!
3//! Kept in a single file for B5a; if the surface grows past ~150 lines in
4//! later sub-waves we will split (`section.rs`, `chunk.rs`, `config.rs`).
5
6use std::ops::Range;
7
8use mnem_core::id::Cid;
9use mnem_ner_providers::NerConfig;
10use serde::{Deserialize, Serialize};
11
/// A hierarchical text region extracted from a source.
///
/// Produced by parsers in [`crate::md`] / [`crate::text`] and consumed by
/// chunkers in [`mod@crate::chunk`]. The `byte_range` always refers to offsets
/// in the *original* source input (not the post-parse normalized text), so
/// downstream stages can slice back into the raw document for diffing or
/// provenance tracking.
///
/// Heading depth uses `CommonMark`'s 1-indexed convention (`# H1 → 1`). A
/// depth of `0` indicates "no heading" (e.g. top-of-file prose before any
/// heading, or the synthetic root produced by [`crate::text::parse_text`]).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Section {
    /// Heading text, without the leading `#` markers and trimmed.
    // NOTE(review): presumably `None` exactly when `depth == 0` — confirm
    // against the parsers before relying on that pairing.
    pub heading: Option<String>,
    /// Heading depth (1–6 for actual headings, 0 for headless prose).
    pub depth: u8,
    /// Body text contained under this heading (code blocks are kept intact).
    pub text: String,
    /// Byte range in the original source input. Half-open (`start..end`,
    /// end-exclusive) per [`std::ops::Range`] semantics.
    pub byte_range: Range<usize>,
}
34
/// A single chunk emitted by a [`crate::chunk::ChunkerKind`].
///
/// `section_path` records the hierarchy of headings that enclose this
/// chunk, from the root of the document down. It is used by downstream
/// stages for breadcrumb display and for attaching graph edges back to
/// the enclosing `Doc` node.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Chunk {
    /// Heading hierarchy from outermost to innermost.
    pub section_path: Vec<String>,
    /// Chunk body text.
    pub text: String,
    /// Whitespace-split token count (deterministic estimate — not a
    /// model-tokenizer count).
    pub tokens_estimate: u32,
}
50
/// The kind of source being ingested.
///
/// Only `Markdown` and `Text` are handled in Phase-B5a; the other variants
/// are declared here so public signatures remain stable across sub-waves.
///
/// Serialized as lowercase strings (`"markdown"`, `"text"`, `"pdf"`,
/// `"conversation"`) via `#[serde(rename_all = "lowercase")]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SourceKind {
    /// `CommonMark` + GFM (tables, fenced code).
    Markdown,
    /// UTF-8 plain text, no structure inferred.
    Text,
    /// PDF (text-layer extraction). Handled in Phase-B5b.
    Pdf,
    /// Chat transcript (JSON/JSONL). Handled in Phase-B5b.
    Conversation,
}
67
/// Which chunker strategy to use, and its parameters.
///
/// Re-exported from [`mod@crate::chunk`] for convenience, so callers can
/// name it from this module without importing `crate::chunk` directly.
pub type ChunkerKind = crate::chunk::ChunkerKind;
72
/// Configuration for an ingest run.
///
/// `ntype` is the `Node::ntype` string applied to the root document node
/// once Phase-B5c wires commit. Typical values: `"Doc"`, `"Note"`,
/// `"Transcript"`.
///
/// Note: only `ner` carries a serde default — the remaining fields are
/// required when deserializing a serialized config.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct IngestConfig {
    /// Which chunker to use.
    pub chunker: ChunkerKind,
    /// `Node::ntype` of the root Doc node.
    pub ntype: String,
    /// Target maximum tokens per chunk (advisory; used by recursive chunker).
    pub max_tokens: u32,
    /// Overlap tokens between adjacent chunks (recursive chunker only).
    pub overlap: u32,
    /// NER provider selection. Defaults to [`NerConfig::Rule`] (the
    /// capitalized-phrase heuristic). Set to [`NerConfig::None`] to
    /// suppress all entity extraction.
    #[serde(default)]
    pub ner: NerConfig,
}
94
95impl Default for IngestConfig {
96 fn default() -> Self {
97 Self {
98 chunker: ChunkerKind::Paragraph,
99 ntype: "Doc".into(),
100 max_tokens: 512,
101 overlap: 32,
102 ner: NerConfig::default(),
103 }
104 }
105}
106
/// Outcome of a completed ingest run.
///
/// Phase-B5c wires the real pipeline: `commit_cid` is `Some(_)` whenever
/// the caller committed the transaction after [`crate::Ingester::ingest`]
/// returned; `None` when they ran a dry-run (ingest without commit) or
/// when the underlying backend reports no change. `node_count` counts
/// every `Node` added (the Doc root, one per chunk, one per unique
/// entity). `entity_count` and `relation_count` report extraction
/// output before dedup. `chunk_count` reports the number of chunks
/// produced by the chunker stage.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct IngestResult {
    /// Commit produced by the run, if any (`None` on dry-run or no-op).
    pub commit_cid: Option<Cid>,
    /// Number of graph nodes created.
    pub node_count: u64,
    /// Number of chunks produced.
    pub chunk_count: u64,
    /// Number of entity nodes created (deduplicated across the run).
    pub entity_count: u64,
    /// Number of relation edges created.
    pub relation_count: u64,
    /// Wall-clock elapsed time in milliseconds.
    pub elapsed_ms: u64,
}
132
/// Recognised conversation-export formats.
///
/// Used by [`crate::conversation::parse_conversation`] to route JSON into
/// the right schema decoder. [`Self::Generic`] is the fallback for
/// `[{"role", "content", "timestamp"?}]` shaped payloads.
///
/// Serialized as snake_case strings (`"chat_gpt"`, `"claude"`,
/// `"generic"`) via `#[serde(rename_all = "snake_case")]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ConversationFormat {
    /// `ChatGPT` export (`conversations.json`) with a `mapping` tree of
    /// message nodes keyed by UUID.
    ChatGpt,
    /// Claude export with a flat `{"conversation": [{role, content}]}`
    /// top-level object.
    Claude,
    /// Generic `[{role, content, timestamp?}]` array.
    Generic,
}
150
/// A single turn in a conversation.
///
/// `timestamp` is an optional Unix epoch in seconds — some exports
/// (Claude, generic) omit it and we preserve that absence rather than
/// fabricating zeroes.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Message {
    /// Speaker role, e.g. `"user"`, `"assistant"`, `"system"`, `"tool"`.
    pub role: String,
    /// Turn text content. Multi-part `ChatGPT` messages are concatenated
    /// with `"\n\n"` separators by the parser.
    pub content: String,
    /// Unix epoch seconds, if the source provided one.
    pub timestamp: Option<u64>,
}
166
167/// Configuration for the entity + relation extractor.
168///
169/// Entity extraction is handled entirely by the NER provider wired via
170/// [`IngestConfig::ner`]. The provider may return any label strings it
171/// chooses; there is no fixed vocabulary.
172#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
173pub struct ExtractorConfig {
174 /// Call the NER provider for named-entity extraction. All labels
175 /// returned by the provider pass through unconditionally.
176 #[serde(default = "default_true")]
177 pub extract_ner: bool,
178 /// Maximum number of whitespace-separated tokens between two entity
179 /// spans that may still be linked by a proximity relation.
180 pub relation_window_tokens: usize,
181}
182
183fn default_true() -> bool {
184 true
185}
186
187impl Default for ExtractorConfig {
188 fn default() -> Self {
189 Self {
190 extract_ner: true,
191 relation_window_tokens: 6,
192 }
193 }
194}
195
/// Advisory inputs for [`crate::chunk::auto_chunker`].
///
/// Defaults match the production heuristics documented on each
/// [`crate::SourceKind`] → [`ChunkerKind`] mapping. Callers only need to
/// override when they want tighter or looser chunking than the out-of-
/// the-box behaviour. The derived [`Default`] leaves every field `None`,
/// i.e. "use the per-source-kind heuristic for everything".
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkerAuto {
    /// Override `max_tokens` for recursive chunking. `None` picks the
    /// per-source-kind default.
    pub max_tokens: Option<u32>,
    /// Override `overlap` for recursive chunking. `None` picks the
    /// per-source-kind default.
    pub overlap: Option<u32>,
    /// Override the session-chunker boundary for conversations. `None`
    /// picks the default of 10 messages per chunk.
    pub max_messages: Option<usize>,
}