mnem_ingest/lib.rs
1//! # mnem-ingest
2//!
3//! Ingest pipeline for [mnem].
4//!
5//! Converts external source artifacts (Markdown, plain text, PDFs, and
6//! chat-conversation exports) into the chunk-and-section intermediate
7//! representation that downstream stages (extraction, embedding, graph
8//! commit) consume.
9//!
10//! ## Scope (through Phase-B5c)
11//!
12//! - [`md::parse_markdown`] - `CommonMark` + GFM tables/code fences with
13//! heading hierarchy preserved.
14//! - [`text::parse_text`] - single-section pass-through for plain text.
15//! - [`pdf::parse_pdf`] - pure-Rust text-layer extraction via
16//! `pdf-extract`, page-boundary detection on form-feed.
17//! - [`conversation::parse_conversation`] - `ChatGPT` / Claude / generic
18//! JSON exports flattened into one [`Section`] per turn.
19//! - [`chunk::chunk`] - three chunker strategies:
20//!   - [`ChunkerKind::Paragraph`] - double-newline split.
21//!   - [`ChunkerKind::Recursive`] - token-budgeted sliding window.
22//!   - [`ChunkerKind::Session`] - contiguous conversation messages
23//!     grouped until role returns to `user` or a cap is hit.
24//! - [`chunk::auto_chunker`] - picks a sensible [`ChunkerKind`] per
25//! [`SourceKind`].
26//! - [`extract::RuleExtractor`] - entity extractor that delegates to the
27//! configured [`mnem_ner_providers::NerProvider`] (default: capitalized-phrase
28//! heuristic). Provider labels pass through unconditionally.
29//! - [`pipeline::Ingester`] - end-to-end driver that writes Doc +
30//! Chunk + Entity nodes and the relation edges between them into a
31//! borrowed [`mnem_core::repo::Transaction`].
32//!
33//! ## Optional extensions (Phase-B5e)
34//!
35//! - [`extract_llm::OllamaExtractor`] - schema-constrained NER via a
36//! local Ollama server. Gated behind the `ollama` Cargo feature.
37//! Hallucinated spans are re-verified against section text and
38//! rejected; failures (timeout, schema-invalid) degrade to an empty
39//! `Vec` rather than an error, so the rule-based baseline remains
40//! the load-bearing path.
41//! - [`sidecar::Sidecar`] - escalation hook to an external
42//! `docling` / `unstructured-ingest` CLI for PDFs whose text-layer
43//! extraction is too thin. Gated behind `sidecar-docling` /
44//! `sidecar-unstructured`.
45//!
46//! ## Non-goals still outstanding
47//!
48//! - No CLI / MCP / HTTP wiring (Phase-B5d).
49//!
50//! ## Example
51//!
52//! ```
53//! use mnem_ingest::{md::parse_markdown, chunk::{chunk, ChunkerKind}};
54//!
55//! let sections = parse_markdown("# Title\n\nFirst para.\n\nSecond para.").unwrap();
56//! let chunks = chunk(&sections, &ChunkerKind::Paragraph);
57//! assert!(!chunks.is_empty());
58//! ```
59//!
60//! [mnem]: https://github.com/Uranid/mnem
61
62#![deny(missing_docs)]
63#![forbid(unsafe_code)]
64
65pub mod chunk;
66pub mod conversation;
67pub mod error;
68pub mod extract;
69#[cfg(feature = "keybert")]
70pub mod extract_keybert;
71#[cfg(feature = "ollama")]
72pub mod extract_llm;
73pub mod md;
74pub mod pdf;
75pub mod pipeline;
76#[cfg(any(feature = "sidecar-docling", feature = "sidecar-unstructured"))]
77pub mod sidecar;
78pub mod text;
79pub mod types;
80
81pub use chunk::{ChunkerKind, auto_chunker, chunk};
82pub use error::Error;
83pub use extract::{EntitySpan, Extractor, RelationSpan, RuleExtractor};
84#[cfg(feature = "keybert")]
85pub use extract_keybert::{KEYBERT_RELATION_LABEL, KeyBertAdapter};
86#[cfg(feature = "ollama")]
87pub use extract_llm::{
88 DEFAULT_OLLAMA_MODEL, DEFAULT_OLLAMA_URL, LLM_ENTITY_CONFIDENCE, LLM_RELATION_CONFIDENCE,
89 OllamaExtractor,
90};
91pub use pipeline::{EmbedText, EmbedderArc, Ingester};
92pub use types::{
93 Chunk, ChunkerAuto, ConversationFormat, ExtractorConfig, IngestConfig, IngestResult, Message,
94 Section, SourceKind,
95};
96// Re-export NerConfig so downstream crates (mnem-cli, mnem-mcp, mnem-http)
97// can refer to `mnem_ingest::NerConfig` without a direct dep on
98// mnem-ner-providers.
99pub use mnem_ner_providers::NerConfig;
100
101// Re-export Cid under the alias `IngestCid` so downstream crates can refer
102// to `mnem_ingest::IngestCid` without having to pull mnem-core directly.
103pub use mnem_core::id::Cid as IngestCid;