vectorless/core/traits.rs
1// Copyright (c) 2026 vectorless developers
2// SPDX-License-Identifier: Apache-2.0
3
//! Core traits for the vectorless library.
//!
//! This module defines the main extension points of the library:
//! - [`DocumentParser`] - Parse documents into raw nodes
//! - [`Summarizer`] - Generate summaries for tree nodes
//! - [`Retriever`] - Find relevant content in a document tree
9
10use async_trait::async_trait;
11use std::path::Path;
12
13use super::{DocumentTree, NodeId, Result};
14
15// ============================================================
16// Document Parser Trait
17// ============================================================
18
19/// A parser for extracting content from documents.
20///
21/// Implementations parse different document formats and produce
22/// a sequence of raw nodes that can be organized into a tree.
23///
24/// # Example
25///
26/// ```rust
27/// use vectorless::core::DocumentParser;
28/// use vectorless::document::MarkdownParser;
29/// use async_trait::async_trait;
30///
31/// # #[tokio::main]
32/// # async fn main() -> vectorless::core::Result<()> {
33/// let parser = MarkdownParser::new();
34/// let content = "# Title\n\nContent here.";
35/// let result = parser.parse(content).await?;
36/// println!("Found {} nodes", result.node_count());
37/// # Ok(())
38/// # }
39/// ```
40#[async_trait]
41pub trait DocumentParser: Send + Sync {
42 /// Get the document format this parser handles.
43 fn format(&self) -> crate::document::DocumentFormat;
44
45 /// Parse content from a string.
46 ///
47 /// # Arguments
48 ///
49 /// * `content` - The document content as a string
50 ///
51 /// # Returns
52 ///
53 /// A [`ParseResult`] containing extracted nodes and metadata.
54 async fn parse(&self, content: &str) -> Result<crate::document::ParseResult>;
55
56 /// Parse content from a file.
57 ///
58 /// Default implementation reads the file and calls [`parse`](Self::parse).
59 ///
60 /// # Arguments
61 ///
62 /// * `path` - Path to the file
63 async fn parse_file(&self, path: &Path) -> Result<crate::document::ParseResult> {
64 let content = tokio::fs::read_to_string(path)
65 .await
66 .map_err(|e| crate::core::Error::Parse(format!("Failed to read file: {}", e)))?;
67
68 self.parse(&content).await
69 }
70}
71
72// ============================================================
73// Summarizer Trait
74// ============================================================
75
76/// A summarizer generates concise summaries for tree nodes.
77///
78/// Implementations can use different strategies:
79/// - LLM-based summarization
80/// - Extractive summarization
81/// - Hybrid approaches
82///
83/// # Example
84///
85/// ```rust
86/// use vectorless::core::{Summarizer, DocumentTree, NodeId, Result};
87/// use async_trait::async_trait;
88///
89/// struct MySummarizer;
90///
91/// #[async_trait]
92/// impl Summarizer for MySummarizer {
93/// async fn summarize(&self, tree: &DocumentTree, node: NodeId) -> Result<String> {
94/// let content = tree.get(node)
95/// .map(|n| n.content.as_str())
96/// .unwrap_or("");
97/// Ok(format!("Summary: {}", &content[..50.min(content.len())]))
98/// }
99/// }
100/// ```
101#[async_trait]
102pub trait Summarizer: Send + Sync {
103 /// Generate a summary for the given node.
104 ///
105 /// # Arguments
106 ///
107 /// * `tree` - The document tree containing the node
108 /// * `node` - The node to summarize
109 ///
110 /// # Returns
111 ///
112 /// A summary string, or an error if summarization fails.
113 async fn summarize(&self, tree: &DocumentTree, node: NodeId) -> Result<String>;
114}
115
116// ============================================================
117// Retriever Trait
118// ============================================================
119
120/// A retriever finds relevant content in a document tree.
121///
122/// Implementations can use different strategies:
123/// - LLM-based navigation (tree traversal)
124/// - MCTS (Monte Carlo Tree Search)
125/// - Beam search
126/// - Vector similarity
127///
128/// # Example
129///
130/// ```rust
131/// use vectorless::core::{Retriever, DocumentTree, Result};
132/// use vectorless::retriever::RetrieveOptions;
133/// use async_trait::async_trait;
134///
135/// struct MyRetriever;
136///
137/// #[async_trait]
138/// impl Retriever for MyRetriever {
139/// async fn retrieve(&self, tree: &DocumentTree, query: &str, options: &RetrieveOptions) -> Result<Vec<RetrievalResult>> {
140/// // Return relevant content
141/// Ok(vec![RetrievalResult::new("Relevant content")])
142/// }
143/// }
144/// ```
145#[async_trait]
146pub trait Retriever: Send + Sync {
147 /// Retrieve relevant content for a query.
148 ///
149 /// # Arguments
150 ///
151 /// * `tree` - The document tree to search
152 /// * `query` - The user's question
153 /// * `options` - Retrieval options
154 ///
155 /// # Returns
156 ///
157 /// A list of retrieval results with content, scores, and metadata.
158 async fn retrieve(
159 &self,
160 tree: &DocumentTree,
161 query: &str,
162 options: &crate::retriever::RetrieveOptions,
163 ) -> Result<Vec<crate::retriever::RetrievalResult>>;
164}
165
166// ============================================================
167// Configuration Types
168// ============================================================
169
/// Settings that control summarization behavior.
///
/// Use [`Default::default`] for the baseline values and override individual
/// fields as needed.
#[derive(Debug, Clone)]
pub struct SummarizerConfig {
    /// Maximum number of tokens for the summary.
    pub max_tokens: usize,

    /// Whether child content is included in summaries.
    pub include_children: bool,

    /// Minimum content length required to trigger summarization.
    pub min_content_length: usize,
}

impl Default for SummarizerConfig {
    /// Returns the baseline configuration: a 200-token summary limit, child
    /// content excluded, and a minimum content length of 100.
    fn default() -> Self {
        // Named locally so the baseline numbers are labelled at the point of use.
        const DEFAULT_MAX_TOKENS: usize = 200;
        const DEFAULT_MIN_CONTENT_LENGTH: usize = 100;

        SummarizerConfig {
            max_tokens: DEFAULT_MAX_TOKENS,
            include_children: false,
            min_content_length: DEFAULT_MIN_CONTENT_LENGTH,
        }
    }
}