Skip to main content

heartbit_core/knowledge/
mod.rs

1//! Knowledge base — document ingestion, chunking, and vector or BM25 retrieval.
2
3pub mod chunker;
4pub mod in_memory;
5pub mod loader;
6pub mod tools;
7
8use std::future::Future;
9use std::pin::Pin;
10
11use serde::{Deserialize, Serialize};
12
13use crate::auth::TenantScope;
14use crate::error::Error;
15
16/// Provenance of a document chunk.
17#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
18pub struct DocumentSource {
19    /// File path or URL where the document was loaded from.
20    pub uri: String,
21    /// Human-readable title (filename, page title, etc.).
22    pub title: String,
23}
24
25/// Atomic search unit: a slice of a document.
26#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct Chunk {
28    /// Deterministic ID derived from source URI + chunk index.
29    pub id: String,
30    /// The text content of this chunk.
31    pub content: String,
32    /// Where this chunk came from.
33    pub source: DocumentSource,
34    /// Position of this chunk within its source document (0-based).
35    pub chunk_index: usize,
36    /// Tenant that owns this chunk. `None` means single-tenant (legacy).
37    ///
38    /// SECURITY (F-KB-1): in a multi-tenant deployment, this MUST be set to
39    /// `Some(tenant_id)` so cross-tenant searches do not return another
40    /// tenant's documents. The `KnowledgeBase` implementations filter
41    /// by this field on `search`.
42    #[serde(default, skip_serializing_if = "Option::is_none")]
43    pub tenant_id: Option<String>,
44}
45
/// Parameters describing a single knowledge-base search.
#[derive(Clone, Debug)]
pub struct KnowledgeQuery {
    /// Free-form text to search for.
    pub text: String,
    /// When set, restricts results to sources whose URI starts with this prefix.
    pub source_filter: Option<String>,
    /// Upper bound on the number of results returned.
    pub limit: usize,
}
56
57/// A single search result with relevance info.
58#[derive(Debug, Clone)]
59pub struct SearchResult {
60    /// The matching chunk.
61    pub chunk: Chunk,
62    /// Number of query-term matches found in the chunk.
63    pub match_count: usize,
64}
65
66/// Trait for knowledge base implementations.
67///
68/// Uses `Pin<Box<dyn Future>>` for dyn-compatibility, matching `Tool`, `Memory`,
69/// and `Blackboard` traits.
70///
71/// SECURITY (F-KB-1): every method takes a `&TenantScope`. Implementations
72/// MUST stamp `chunk.tenant_id = scope.tenant_id` on `index` and filter
73/// `search`/`chunk_count` by it. A shared `Arc<dyn KnowledgeBase>` across
74/// tenants would otherwise leak documents cross-tenant via `knowledge_search`.
75pub trait KnowledgeBase: Send + Sync {
76    /// Index a chunk into the knowledge base under the given tenant scope.
77    fn index(
78        &self,
79        scope: &TenantScope,
80        chunk: Chunk,
81    ) -> Pin<Box<dyn Future<Output = Result<(), Error>> + Send + '_>>;
82
83    /// Search the knowledge base, filtered by tenant scope.
84    fn search(
85        &self,
86        scope: &TenantScope,
87        query: KnowledgeQuery,
88    ) -> Pin<Box<dyn Future<Output = Result<Vec<SearchResult>, Error>> + Send + '_>>;
89
90    /// Return the number of indexed chunks for the given tenant scope.
91    fn chunk_count(
92        &self,
93        scope: &TenantScope,
94    ) -> Pin<Box<dyn Future<Output = Result<usize, Error>> + Send + '_>>;
95}
96
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a small chunk fixture, optionally owned by `tenant_id`.
    fn sample_chunk(tenant_id: Option<&str>) -> Chunk {
        Chunk {
            id: "abc-0".into(),
            content: "Hello world".into(),
            source: DocumentSource {
                uri: "test.md".into(),
                title: "Test".into(),
            },
            chunk_index: 0,
            tenant_id: tenant_id.map(String::from),
        }
    }

    #[test]
    fn document_source_equality() {
        let a = DocumentSource {
            uri: "docs/readme.md".into(),
            title: "README".into(),
        };
        let b = DocumentSource {
            uri: "docs/readme.md".into(),
            title: "README".into(),
        };
        assert_eq!(a, b);
    }

    #[test]
    fn chunk_serializes() {
        let chunk = sample_chunk(None);
        let json = serde_json::to_string(&chunk).unwrap();
        // `skip_serializing_if = "Option::is_none"`: an unset tenant must not
        // appear in the serialized form at all.
        assert!(!json.contains("tenant_id"));

        let parsed: Chunk = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.id, "abc-0");
        assert_eq!(parsed.content, "Hello world");
        assert_eq!(parsed.source.uri, "test.md");
        assert_eq!(parsed.source.title, "Test");
        assert_eq!(parsed.chunk_index, 0);
        assert!(parsed.tenant_id.is_none());
    }

    #[test]
    fn chunk_tenant_id_round_trips() {
        // The tenant stamp is what tenant isolation filters on, so it must
        // survive a serialize/deserialize cycle unchanged.
        let chunk = sample_chunk(Some("tenant-a"));
        let json = serde_json::to_string(&chunk).unwrap();
        let parsed: Chunk = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.tenant_id.as_deref(), Some("tenant-a"));
    }

    #[test]
    fn chunk_deserializes_without_tenant_id() {
        // Input written without a `tenant_id` key must still load, with the
        // field defaulting to `None`.
        let legacy = r#"{
            "id": "abc-0",
            "content": "Hello world",
            "source": { "uri": "test.md", "title": "Test" },
            "chunk_index": 0
        }"#;
        let parsed: Chunk = serde_json::from_str(legacy).unwrap();
        assert_eq!(parsed.id, "abc-0");
        assert!(parsed.tenant_id.is_none());
    }

    #[test]
    fn knowledge_query_with_filter() {
        let q = KnowledgeQuery {
            text: "rust async".into(),
            source_filter: Some("docs/".into()),
            limit: 5,
        };
        assert_eq!(q.text, "rust async");
        assert_eq!(q.source_filter.as_deref(), Some("docs/"));
        assert_eq!(q.limit, 5);
    }

    #[test]
    fn knowledge_query_without_filter() {
        let q = KnowledgeQuery {
            text: "search".into(),
            source_filter: None,
            limit: 10,
        };
        assert!(q.source_filter.is_none());
    }

    #[test]
    fn search_result_holds_chunk_and_count() {
        let result = SearchResult {
            chunk: Chunk {
                id: "x-0".into(),
                content: "test".into(),
                source: DocumentSource {
                    uri: "f.md".into(),
                    title: "F".into(),
                },
                chunk_index: 0,
                tenant_id: None,
            },
            match_count: 3,
        };
        assert_eq!(result.match_count, 3);
        assert_eq!(result.chunk.id, "x-0");
    }

    #[test]
    fn knowledge_base_is_object_safe() {
        // Compile-time check only: this function type-checks only if
        // `dyn KnowledgeBase` is a valid trait object.
        fn _accepts_dyn(_kb: &dyn KnowledgeBase) {}
    }
}