swiftide_core/
node.rs

1//! This module defines the `Node` struct and its associated methods.
2//!
3//! `Node` represents a unit of data in the indexing process, containing metadata,
4//! the data chunk itself, and an optional vector representation.
5//!
6//! # Overview
7//!
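//! The `Node` struct is designed to encapsulate all necessary information for a single
//! unit of data being processed in the indexing pipeline. It includes fields for the file
//! path, data chunk, optional dense and sparse vector representations, metadata, embed mode,
//! and the size and offset of the chunk within its original input. The identifier is not
//! stored; it is derived on demand from the path and the chunk.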
11//!
12//! The struct provides methods to convert the node into an embeddable string format and to
13//! calculate a hash value for the node based on its path and chunk.
14//!
15//! # Usage
16//!
17//! The `Node` struct is used throughout the indexing pipeline to represent and process
18//! individual units of data. It is particularly useful in scenarios where metadata and data chunks
19//! need to be processed together.
20use std::{
21    collections::HashMap,
22    fmt::Debug,
23    hash::{Hash, Hasher},
24    os::unix::ffi::OsStrExt,
25    path::PathBuf,
26};
27
28use derive_builder::Builder;
29use itertools::Itertools;
30use serde::{Deserialize, Serialize};
31
32use crate::{Embedding, SparseEmbedding, metadata::Metadata};
33
/// Marker trait for values that can serve as the data chunk of a `Node`.
///
/// A chunk is always an owned, `'static` value. It has to expose its content as bytes,
/// be cloneable and debuggable, and be safe to move and share between threads.
pub trait Chunk: AsRef<[u8]> + Clone + Debug + Send + Sync + 'static {}

/// Every type that satisfies the bounds is a `Chunk`; there is nothing to implement by hand.
impl<T: AsRef<[u8]> + Clone + Debug + Send + Sync + 'static> Chunk for T {}
41
/// Represents a unit of data in the indexing process.
///
/// `Node` bundles a data chunk with the path it is associated with, its metadata, optional
/// dense and sparse embeddings, the embed mode, and the position of the chunk inside the input
/// it was derived from. The identifier is not stored in the struct; it is computed on demand by
/// `id()` from the path and the chunk.
///
/// A `NodeBuilder` is generated for this struct. Its setters accept `impl Into<_>` values and
/// strip the `Option` wrapper, and its `build` function reports failures as `anyhow::Error`.
/// `chunk` is the only field without a builder default.
#[derive(Default, Clone, Serialize, Deserialize, PartialEq, Builder)]
#[builder(setter(into, strip_option), build_fn(error = "anyhow::Error"))]
pub struct Node<T: Chunk> {
    /// File path associated with the node. Part of the input to `id()`.
    #[builder(default)]
    pub path: PathBuf,
    /// Data chunk contained in the node. Part of the input to `id()`.
    pub chunk: T,
    /// Optional dense vector representations of the embedded data, keyed by the field they
    /// were created from.
    #[builder(default)]
    pub vectors: Option<HashMap<EmbeddedField, Embedding>>,
    /// Optional sparse vector representations of the embedded data, keyed by the field they
    /// were created from.
    #[builder(default)]
    pub sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
    /// Metadata associated with the node.
    #[builder(default)]
    pub metadata: Metadata,
    /// Mode that decides how the chunk and the metadata are turned into embeddables.
    #[builder(default)]
    pub embed_mode: EmbedMode,
    /// Size of the input this node was originally derived from in bytes
    #[builder(default)]
    pub original_size: usize,
    /// Offset of the chunk relative to the start of the input this node was originally derived
    /// from in bytes
    #[builder(default)]
    pub offset: usize,
}

/// A `Node` whose chunk is an owned `String`.
pub type TextNode = Node<String>;
77
78impl<T: Chunk> NodeBuilder<T> {
79    pub fn maybe_sparse_vectors(
80        &mut self,
81        sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
82    ) -> &mut Self {
83        self.sparse_vectors = Some(sparse_vectors);
84        self
85    }
86
87    pub fn maybe_vectors(
88        &mut self,
89        vectors: Option<HashMap<EmbeddedField, Embedding>>,
90    ) -> &mut Self {
91        self.vectors = Some(vectors);
92        self
93    }
94}
95
96impl<T: Chunk> Debug for Node<T> {
97    /// Formats the node for debugging purposes.
98    ///
99    /// This method is used to provide a human-readable representation of the node when debugging.
100    /// The vector field is displayed as the number of elements in the vector if present.
101    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
102        f.debug_struct("Node")
103            .field("id", &self.id())
104            .field("path", &self.path)
105            .field("chunk", &self.chunk)
106            .field("metadata", &self.metadata)
107            .field(
108                "vectors",
109                &self
110                    .vectors
111                    .iter()
112                    .flat_map(HashMap::iter)
113                    .map(|(embed_type, vec)| format!("'{embed_type}': {}", vec.len()))
114                    .join(","),
115            )
116            .field(
117                "sparse_vectors",
118                &self
119                    .sparse_vectors
120                    .iter()
121                    .flat_map(HashMap::iter)
122                    .map(|(embed_type, vec)| {
123                        format!(
124                            "'{embed_type}': indices({}), values({})",
125                            vec.indices.len(),
126                            vec.values.len()
127                        )
128                    })
129                    .join(","),
130            )
131            .field("embed_mode", &self.embed_mode)
132            .finish()
133    }
134}
135
136impl<T: Chunk> Node<T> {
137    /// Builds a new instance of `Node`, returning a `NodeBuilder`. Copies
138    /// over the fields from the provided `Node`.
139    pub fn build_from_other(node: &Node<T>) -> NodeBuilder<T> {
140        NodeBuilder::default()
141            .path(node.path.clone())
142            .chunk(node.chunk.clone())
143            .metadata(node.metadata.clone())
144            .maybe_vectors(node.vectors.clone())
145            .maybe_sparse_vectors(node.sparse_vectors.clone())
146            .embed_mode(node.embed_mode)
147            .original_size(node.original_size)
148            .offset(node.offset)
149            .to_owned()
150    }
151
152    /// Creates a new instance of `NodeBuilder.`
153    pub fn builder<VALUE: Chunk + Clone>() -> NodeBuilder<VALUE> {
154        NodeBuilder::default()
155    }
156
157    /// Creates a new instance of `Node` with the specified data chunk.
158    ///
159    /// The other fields are set to their default values.
160    pub fn new(chunk: impl Into<String>) -> Node<String> {
161        let chunk = chunk.into();
162        let original_size = chunk.len();
163        Node {
164            chunk,
165            original_size,
166            ..Default::default()
167        }
168    }
169
170    pub fn with_metadata(&mut self, metadata: impl Into<Metadata>) -> &mut Self {
171        self.metadata = metadata.into();
172        self
173    }
174
175    pub fn with_vectors(
176        &mut self,
177        vectors: impl Into<HashMap<EmbeddedField, Embedding>>,
178    ) -> &mut Self {
179        self.vectors = Some(vectors.into());
180        self
181    }
182
183    pub fn with_sparse_vectors(
184        &mut self,
185        sparse_vectors: impl Into<HashMap<EmbeddedField, SparseEmbedding>>,
186    ) -> &mut Self {
187        self.sparse_vectors = Some(sparse_vectors.into());
188        self
189    }
190
191    /// Retrieve the identifier of the node.
192    ///
193    /// Calculates the identifier of the node based on its path and chunk as bytes, returning a
194    /// UUID (v3).
195    ///
196    /// WARN: Does not memoize the id. Use sparingly.
197    pub fn id(&self) -> uuid::Uuid {
198        // Calculate the identifier based on the path and chunk as bytes
199        let bytes = [self.path.as_os_str().as_bytes(), self.chunk.as_ref()].concat();
200
201        uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes)
202    }
203}
204
205impl Node<String> {
206    /// Creates embeddable data depending on chosen `EmbedMode`.
207    ///
208    /// # Returns
209    ///
210    /// Embeddable data mapped to their `EmbeddedField`.
211    pub fn as_embeddables(&self) -> Vec<(EmbeddedField, String)> {
212        // TODO: Cow and borrow the inner data + generic
213        let mut embeddables = Vec::new();
214
215        if self.embed_mode == EmbedMode::SingleWithMetadata || self.embed_mode == EmbedMode::Both {
216            embeddables.push((EmbeddedField::Combined, self.combine_chunk_with_metadata()));
217        }
218
219        if self.embed_mode == EmbedMode::PerField || self.embed_mode == EmbedMode::Both {
220            embeddables.push((EmbeddedField::Chunk, self.chunk.clone()));
221            for (name, value) in &self.metadata {
222                let value = value
223                    .as_str()
224                    .map_or_else(|| value.to_string(), ToString::to_string);
225                embeddables.push((EmbeddedField::Metadata(name.clone()), value));
226            }
227        }
228
229        embeddables
230    }
231
232    /// Converts the node into an [`self::EmbeddedField::Combined`] type of embeddable.
233    ///
234    /// This embeddable format consists of the metadata formatted as key-value pairs, each on a new
235    /// line, followed by the data chunk.
236    ///
237    /// # Returns
238    ///
239    /// A string representing the embeddable format of the node.
240    fn combine_chunk_with_metadata(&self) -> String {
241        // Metadata formatted by newlines joined with the chunk
242        let metadata = self
243            .metadata
244            .iter()
245            .map(|(k, v)| {
246                let v = v
247                    .as_str()
248                    .map_or_else(|| v.to_string(), ToString::to_string);
249
250                format!("{k}: {v}")
251            })
252            .collect::<Vec<String>>()
253            .join("\n");
254
255        format!("{}\n{}", metadata, self.chunk)
256    }
257}
258
259impl Hash for Node<String> {
260    /// Hashes the node based on its path and chunk.
261    ///
262    /// This method is used by the `calculate_hash` method to generate a hash value for the node.
263    fn hash<H: Hasher>(&self, state: &mut H) {
264        self.path.hash(state);
265        self.chunk.hash(state);
266    }
267}
268
269impl<T: Into<String>> From<T> for Node<String> {
270    fn from(value: T) -> Self {
271        let value: String = value.into();
272        Node::<String>::new(value)
273    }
274}
275
/// Embed mode of the pipeline.
///
/// Decides which embeddables `Node::as_embeddables` produces for a node.
#[derive(Copy, Debug, Default, Clone, Serialize, Deserialize, PartialEq)]
pub enum EmbedMode {
    #[default]
    /// Embedding Chunk of data combined with Metadata. This is the default mode.
    SingleWithMetadata,
    /// Embedding Chunk of data and every Metadata separately.
    PerField,
    /// Embedding Chunk of data and every Metadata separately and Chunk of data combined with
    /// Metadata.
    Both,
}
288
/// Type of Embeddable stored in model.
///
/// Used as the key of the dense and sparse vector maps on `Node`. The derived `Display`
/// output is what `field_name` and `sparse_field_name` build on.
#[derive(
    Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash, strum_macros::Display, Debug,
)]
pub enum EmbeddedField {
    #[default]
    /// Embeddable created from Chunk of data combined with Metadata. This is the default.
    Combined,
    /// Embeddable created from Chunk of data only.
    Chunk,
    /// Embeddable created from Metadata.
    /// String stores Metadata name.
    /// Displayed as `Metadata: ` followed by that name.
    #[strum(to_string = "Metadata: {0}")]
    Metadata(String),
}
304
305impl EmbeddedField {
306    /// Returns the name of the field when it would be a sparse vector
307    pub fn sparse_field_name(&self) -> String {
308        format!("{self}_sparse")
309    }
310
311    /// Returns the name of the field when it would be a dense vector
312    pub fn field_name(&self) -> String {
313        format!("{self}")
314    }
315}
316
317#[allow(clippy::from_over_into)]
318impl Into<String> for EmbeddedField {
319    fn into(self) -> String {
320        self.to_string()
321    }
322}
323
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Dense and sparse field names for every `EmbeddedField` variant.
    #[test_case(&EmbeddedField::Combined, ["Combined", "Combined_sparse"])]
    #[test_case(&EmbeddedField::Chunk, ["Chunk", "Chunk_sparse"])]
    #[test_case(&EmbeddedField::Metadata("test".into()), ["Metadata: test", "Metadata: test_sparse"])]
    fn field_name_tests(embedded_field: &EmbeddedField, expected: [&str; 2]) {
        assert_eq!(embedded_field.field_name(), expected[0]);
        assert_eq!(embedded_field.sparse_field_name(), expected[1]);
    }

    /// Formatting a node with multi-byte characters in its chunk must not panic.
    #[test]
    fn test_debugging_node_with_utf8_char_boundary() {
        // A chunk made of a single repeated multi-byte char
        let node = Node::from("🦀".repeat(101));
        let _ = format!("{node:?}");

        // A chunk mixing single-byte and multi-byte chars
        let node = Node::from("Jürgen".repeat(100));
        let _ = format!("{node:?}");
    }

    /// Round trip through the builder when both vector maps are present but empty.
    #[test]
    fn test_build_from_other_without_vectors() {
        let original_node = Node::from("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(HashMap::new())
            .with_sparse_vectors(HashMap::new())
            .to_owned();

        let builder = Node::build_from_other(&original_node);
        let new_node = builder.build().unwrap();

        assert_eq!(original_node, new_node);
    }

    /// Round trip through the builder when the vector maps were never set.
    ///
    /// This is the only test that sends `None` through `maybe_vectors` and
    /// `maybe_sparse_vectors`.
    #[test]
    fn test_build_from_other_with_unset_vectors() {
        let original_node: TextNode = Node::from("test_chunk");
        assert!(original_node.vectors.is_none());
        assert!(original_node.sparse_vectors.is_none());

        let new_node = Node::build_from_other(&original_node).build().unwrap();

        assert!(new_node.vectors.is_none());
        assert!(new_node.sparse_vectors.is_none());
        assert_eq!(original_node, new_node);
    }

    /// Round trip through the builder when both vector maps hold an entry.
    #[test]
    fn test_build_from_other_with_vectors() {
        let mut vectors = HashMap::new();
        vectors.insert(EmbeddedField::Chunk, Embedding::default());

        let mut sparse_vectors = HashMap::new();
        sparse_vectors.insert(
            EmbeddedField::Chunk,
            SparseEmbedding {
                indices: vec![],
                values: vec![],
            },
        );

        let original_node = Node::from("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(vectors.clone())
            .with_sparse_vectors(sparse_vectors.clone())
            .to_owned();

        let builder = Node::build_from_other(&original_node);
        let new_node = builder.build().unwrap();

        assert_eq!(original_node, new_node);
    }
}
387}