Skip to main content

fabryk_vector/
extractor.rs

1//! VectorExtractor trait for domain-specific document extraction.
2//!
3//! This module defines the core abstraction that enables Fabryk to support
4//! multiple knowledge domains for vector search. Each domain implements
5//! `VectorExtractor` to control how content files are transformed into
6//! `VectorDocument` instances for embedding.
7//!
8//! # Design Philosophy
9//!
10//! The extractor separates text composition from embedding. Domains control
11//! what text gets embedded (title, description, body, etc.) by composing
12//! the `VectorDocument.text` field. The embedding provider then handles
13//! the actual vector generation.
14
15use crate::types::VectorDocument;
16use fabryk_core::Result;
17use std::path::Path;
18
19/// Trait for extracting vector documents from domain-specific content.
20///
21/// Each knowledge domain (music theory, math, etc.) implements this trait
22/// to define how its markdown files with frontmatter are transformed into
23/// `VectorDocument` instances. The key responsibility is **text composition**:
24/// deciding what content should be embedded.
25///
26/// # Lifecycle
27///
28/// For each content file, `VectorIndexBuilder` calls:
29///
30/// 1. `extract_document()` — Parse file and compose text for embedding
31///
32/// The returned `VectorDocument.text` is what gets embedded by the
33/// `EmbeddingProvider`.
34pub trait VectorExtractor: Send + Sync {
35    /// Extract a vector document from a content file.
36    ///
37    /// # Arguments
38    ///
39    /// * `base_path` - Root directory for content
40    /// * `file_path` - Full path to the file being processed
41    /// * `frontmatter` - Parsed YAML frontmatter as generic Value
42    /// * `content` - Markdown body (after frontmatter)
43    ///
44    /// # Text Composition
45    ///
46    /// The implementation should compose the `text` field with all content
47    /// that should influence semantic similarity. A common pattern is:
48    ///
49    /// ```text
50    /// title | description | key terms | body content
51    /// ```
52    fn extract_document(
53        &self,
54        base_path: &Path,
55        file_path: &Path,
56        frontmatter: &yaml_serde::Value,
57        content: &str,
58    ) -> Result<VectorDocument>;
59
60    /// Returns the content glob pattern for this domain.
61    ///
62    /// Used by `VectorIndexBuilder` to discover content files.
63    /// Default: `"**/*.md"` (all markdown files recursively).
64    fn content_glob(&self) -> &str {
65        "**/*.md"
66    }
67
68    /// Returns the name of this extractor for logging/debugging.
69    fn name(&self) -> &str {
70        "unnamed"
71    }
72}
73
74// ============================================================================
75// Mock extractor for testing
76// ============================================================================
77
/// Minimal extractor intended for tests.
///
/// The embedded text is the frontmatter title followed by the body content,
/// joined with ` | `.
#[derive(Debug, Default, Clone)]
pub struct MockVectorExtractor;
83
84impl VectorExtractor for MockVectorExtractor {
85    fn extract_document(
86        &self,
87        _base_path: &Path,
88        file_path: &Path,
89        frontmatter: &yaml_serde::Value,
90        content: &str,
91    ) -> Result<VectorDocument> {
92        let id = fabryk_core::util::ids::id_from_path(file_path)
93            .ok_or_else(|| fabryk_core::Error::parse("no file stem"))?;
94
95        let title = frontmatter
96            .get("title")
97            .and_then(|v| v.as_str())
98            .unwrap_or(&id);
99
100        let category = frontmatter
101            .get("category")
102            .and_then(|v| v.as_str())
103            .map(String::from);
104
105        // Compose text: title | content (trimmed)
106        let text = format!("{} | {}", title, content.trim());
107
108        let mut doc = VectorDocument::new(id, text);
109        if let Some(cat) = category {
110            doc = doc.with_category(cat);
111        }
112
113        // Extract any additional metadata from frontmatter
114        if let Some(tier) = frontmatter.get("tier").and_then(|v| v.as_str()) {
115            doc = doc.with_metadata("tier", tier);
116        }
117
118        Ok(doc)
119    }
120
121    fn name(&self) -> &str {
122        "mock"
123    }
124}
125
126// ============================================================================
127// Tests
128// ============================================================================
129
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Frontmatter containing every field the mock extractor reads
    /// (`title`, `category`, `tier`), all as strings.
    fn sample_frontmatter() -> yaml_serde::Value {
        yaml_serde::from_str(
            r#"
title: "Test Concept"
category: "test-category"
tier: "beginner"
"#,
        )
        .unwrap()
    }

    /// Implements only the required method so the trait's provided
    /// defaults (`content_glob`, `name`) can be observed unmodified.
    struct DefaultsOnlyExtractor;

    impl VectorExtractor for DefaultsOnlyExtractor {
        fn extract_document(
            &self,
            _base_path: &Path,
            _file_path: &Path,
            _frontmatter: &yaml_serde::Value,
            _content: &str,
        ) -> Result<VectorDocument> {
            unimplemented!("extraction is not exercised by the defaults test")
        }
    }

    #[test]
    fn test_mock_extractor_extract_document() {
        let extractor = MockVectorExtractor;
        let base_path = PathBuf::from("/data/concepts");
        let file_path = PathBuf::from("/data/concepts/test-concept.md");
        let frontmatter = sample_frontmatter();

        let doc = extractor
            .extract_document(&base_path, &file_path, &frontmatter, "Body content here.")
            .unwrap();

        assert_eq!(doc.id, "test-concept");
        assert!(doc.text.contains("Test Concept"));
        assert!(doc.text.contains("Body content here."));
        assert_eq!(doc.category, Some("test-category".to_string()));
        assert_eq!(doc.metadata.get("tier").unwrap(), "beginner");
    }

    #[test]
    fn test_mock_extractor_minimal_frontmatter() {
        let extractor = MockVectorExtractor;
        let base_path = PathBuf::from("/data");
        let file_path = PathBuf::from("/data/simple.md");
        let frontmatter: yaml_serde::Value = yaml_serde::from_str("title: Simple").unwrap();

        let doc = extractor
            .extract_document(&base_path, &file_path, &frontmatter, "Content")
            .unwrap();

        assert_eq!(doc.id, "simple");
        assert_eq!(doc.text, "Simple | Content");
        assert!(doc.category.is_none());
        assert!(doc.metadata.is_empty());
    }

    #[test]
    fn test_mock_extractor_no_title() {
        let extractor = MockVectorExtractor;
        let base_path = PathBuf::from("/data");
        let file_path = PathBuf::from("/data/no-title.md");
        let frontmatter: yaml_serde::Value = yaml_serde::from_str("category: test").unwrap();

        let doc = extractor
            .extract_document(&base_path, &file_path, &frontmatter, "Content")
            .unwrap();

        // Falls back to file stem as title in text
        assert!(doc.text.contains("no-title"));
    }

    #[test]
    fn test_mock_extractor_trims_content() {
        let extractor = MockVectorExtractor;
        let base_path = PathBuf::from("/data");
        let file_path = PathBuf::from("/data/simple.md");
        let frontmatter: yaml_serde::Value = yaml_serde::from_str("title: Simple").unwrap();

        let doc = extractor
            .extract_document(&base_path, &file_path, &frontmatter, "\n  Content \n")
            .unwrap();

        // Whitespace around the body must not leak into the embedded text.
        assert_eq!(doc.text, "Simple | Content");
    }

    #[test]
    fn test_mock_extractor_ignores_non_string_tier() {
        let extractor = MockVectorExtractor;
        let base_path = PathBuf::from("/data");
        let file_path = PathBuf::from("/data/simple.md");
        let frontmatter: yaml_serde::Value =
            yaml_serde::from_str("title: Simple\ntier: 3").unwrap();

        let doc = extractor
            .extract_document(&base_path, &file_path, &frontmatter, "Content")
            .unwrap();

        // Only string-valued tiers are copied into metadata.
        assert!(doc.metadata.is_empty());
    }

    #[test]
    fn test_mock_extractor_defaults() {
        let extractor = MockVectorExtractor;
        assert_eq!(extractor.content_glob(), "**/*.md");
        assert_eq!(extractor.name(), "mock");
    }

    #[test]
    fn test_trait_provided_defaults() {
        // The mock overrides `name()`, so the provided defaults are checked
        // through an implementor that overrides nothing.
        let extractor = DefaultsOnlyExtractor;
        assert_eq!(extractor.content_glob(), "**/*.md");
        assert_eq!(extractor.name(), "unnamed");
    }

    #[test]
    fn test_trait_object_safety() {
        // Coercing a concrete extractor to a trait object only compiles if
        // the trait can be used as `dyn`; the calls confirm dynamic dispatch.
        let extractor: &dyn VectorExtractor = &MockVectorExtractor;
        assert_eq!(extractor.name(), "mock");
        assert_eq!(extractor.content_glob(), "**/*.md");
    }
}