swiftide_core/
node.rs

1//! This module defines the `Node` struct and its associated methods.
2//!
3//! `Node` represents a unit of data in the indexing process, containing metadata,
4//! the data chunk itself, and an optional vector representation.
5//!
6//! # Overview
7//!
//! The `Node` struct is designed to encapsulate all necessary information for a single
//! unit of data being processed in the indexing pipeline. It includes fields for the file
//! path, data chunk, optional dense and sparse vector representations, and metadata; the
//! identifier is not stored but derived from the path and chunk.
11//!
12//! The struct provides methods to convert the node into an embeddable string format and to
13//! calculate a hash value for the node based on its path and chunk.
14//!
15//! # Usage
16//!
17//! The `Node` struct is used throughout the indexing pipeline to represent and process
18//! individual units of data. It is particularly useful in scenarios where metadata and data chunks
19//! need to be processed together.
20use std::{
21    collections::HashMap,
22    fmt::Debug,
23    hash::{Hash, Hasher},
24    os::unix::ffi::OsStrExt,
25    path::PathBuf,
26};
27
28use derive_builder::Builder;
29use itertools::Itertools;
30use serde::{Deserialize, Serialize};
31
32use crate::{metadata::Metadata, util::debug_long_utf8, Embedding, SparseEmbedding};
33
34/// Represents a unit of data in the indexing process.
35///
36/// `Node` encapsulates all necessary information for a single unit of data being processed
37/// in the indexing pipeline. It includes fields for an identifier, file path, data chunk, optional
38/// vector representation, and metadata.
39#[derive(Default, Clone, Serialize, Deserialize, PartialEq, Builder)]
40#[builder(setter(into, strip_option), build_fn(error = "anyhow::Error"))]
41pub struct Node {
42    /// File path associated with the node.
43    #[builder(default)]
44    pub path: PathBuf,
45    /// Data chunk contained in the node.
46    pub chunk: String,
47    /// Optional vector representation of embedded data.
48    #[builder(default)]
49    pub vectors: Option<HashMap<EmbeddedField, Embedding>>,
50    /// Optional sparse vector representation of embedded data.
51    #[builder(default)]
52    pub sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
53    /// Metadata associated with the node.
54    #[builder(default)]
55    pub metadata: Metadata,
56    /// Mode of embedding data Chunk and Metadata
57    #[builder(default)]
58    pub embed_mode: EmbedMode,
59    /// Size of the input this node was originally derived from in bytes
60    #[builder(default)]
61    pub original_size: usize,
62    /// Offset of the chunk relative to the start of the input this node was originally derived from in bytes
63    #[builder(default)]
64    pub offset: usize,
65}
66
67impl NodeBuilder {
68    pub fn maybe_sparse_vectors(
69        &mut self,
70        sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
71    ) -> &mut Self {
72        self.sparse_vectors = Some(sparse_vectors);
73        self
74    }
75
76    pub fn maybe_vectors(
77        &mut self,
78        vectors: Option<HashMap<EmbeddedField, Embedding>>,
79    ) -> &mut Self {
80        self.vectors = Some(vectors);
81        self
82    }
83}
84
85impl Debug for Node {
86    /// Formats the node for debugging purposes.
87    ///
88    /// This method is used to provide a human-readable representation of the node when debugging.
89    /// The vector field is displayed as the number of elements in the vector if present.
90    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
91        f.debug_struct("Node")
92            .field("id", &self.id())
93            .field("path", &self.path)
94            .field("chunk", &debug_long_utf8(&self.chunk, 100))
95            .field("metadata", &self.metadata)
96            .field(
97                "vectors",
98                &self
99                    .vectors
100                    .iter()
101                    .flat_map(HashMap::iter)
102                    .map(|(embed_type, vec)| format!("'{embed_type}': {}", vec.len()))
103                    .join(","),
104            )
105            .field(
106                "sparse_vectors",
107                &self
108                    .sparse_vectors
109                    .iter()
110                    .flat_map(HashMap::iter)
111                    .map(|(embed_type, vec)| {
112                        format!(
113                            "'{embed_type}': indices({}), values({})",
114                            vec.indices.len(),
115                            vec.values.len()
116                        )
117                    })
118                    .join(","),
119            )
120            .field("embed_mode", &self.embed_mode)
121            .finish()
122    }
123}
124
125impl Node {
126    /// Builds a new instance of `Node`, returning a `NodeBuilder`. Copies
127    /// over the fields from the provided `Node`.
128    pub fn build_from_other(node: &Node) -> NodeBuilder {
129        NodeBuilder::default()
130            .path(node.path.clone())
131            .chunk(node.chunk.clone())
132            .metadata(node.metadata.clone())
133            .maybe_vectors(node.vectors.clone())
134            .maybe_sparse_vectors(node.sparse_vectors.clone())
135            .embed_mode(node.embed_mode)
136            .original_size(node.original_size)
137            .offset(node.offset)
138            .to_owned()
139    }
140
141    /// Creates a new instance of `NodeBuilder.`
142    pub fn builder() -> NodeBuilder {
143        NodeBuilder::default()
144    }
145
146    /// Creates a new instance of `Node` with the specified data chunk.
147    ///
148    /// The other fields are set to their default values.
149    pub fn new(chunk: impl Into<String>) -> Node {
150        let chunk = chunk.into();
151        let original_size = chunk.len();
152        Node {
153            chunk,
154            original_size,
155            ..Default::default()
156        }
157    }
158
159    pub fn with_metadata(&mut self, metadata: impl Into<Metadata>) -> &mut Self {
160        self.metadata = metadata.into();
161        self
162    }
163
164    pub fn with_vectors(
165        &mut self,
166        vectors: impl Into<HashMap<EmbeddedField, Embedding>>,
167    ) -> &mut Self {
168        self.vectors = Some(vectors.into());
169        self
170    }
171
172    pub fn with_sparse_vectors(
173        &mut self,
174        sparse_vectors: impl Into<HashMap<EmbeddedField, SparseEmbedding>>,
175    ) -> &mut Self {
176        self.sparse_vectors = Some(sparse_vectors.into());
177        self
178    }
179
180    /// Creates embeddable data depending on chosen `EmbedMode`.
181    ///
182    /// # Returns
183    ///
184    /// Embeddable data mapped to their `EmbeddedField`.
185    pub fn as_embeddables(&self) -> Vec<(EmbeddedField, String)> {
186        // TODO: Figure out a clever way to do zero copy
187        let mut embeddables = Vec::new();
188
189        if self.embed_mode == EmbedMode::SingleWithMetadata || self.embed_mode == EmbedMode::Both {
190            embeddables.push((EmbeddedField::Combined, self.combine_chunk_with_metadata()));
191        }
192
193        if self.embed_mode == EmbedMode::PerField || self.embed_mode == EmbedMode::Both {
194            embeddables.push((EmbeddedField::Chunk, self.chunk.clone()));
195            for (name, value) in &self.metadata {
196                let value = value
197                    .as_str()
198                    .map_or_else(|| value.to_string(), ToString::to_string);
199                embeddables.push((EmbeddedField::Metadata(name.clone()), value));
200            }
201        }
202
203        embeddables
204    }
205
206    /// Converts the node into an [`self::EmbeddedField::Combined`] type of embeddable.
207    ///
208    /// This embeddable format consists of the metadata formatted as key-value pairs, each on a new line,
209    /// followed by the data chunk.
210    ///
211    /// # Returns
212    ///
213    /// A string representing the embeddable format of the node.
214    fn combine_chunk_with_metadata(&self) -> String {
215        // Metadata formatted by newlines joined with the chunk
216        let metadata = self
217            .metadata
218            .iter()
219            .map(|(k, v)| {
220                let v = v
221                    .as_str()
222                    .map_or_else(|| v.to_string(), ToString::to_string);
223
224                format!("{k}: {v}")
225            })
226            .collect::<Vec<String>>()
227            .join("\n");
228
229        format!("{}\n{}", metadata, self.chunk)
230    }
231
232    /// Retrieve the identifier of the node.
233    ///
234    /// Calculates the identifier of the node based on its path and chunk as bytes, returning a
235    /// UUID (v3).
236    ///
237    /// WARN: Does not memoize the id. Use sparingly.
238    pub fn id(&self) -> uuid::Uuid {
239        let bytes = [self.path.as_os_str().as_bytes(), self.chunk.as_bytes()].concat();
240
241        uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes)
242    }
243}
244
245impl Hash for Node {
246    /// Hashes the node based on its path and chunk.
247    ///
248    /// This method is used by the `calculate_hash` method to generate a hash value for the node.
249    fn hash<H: Hasher>(&self, state: &mut H) {
250        self.path.hash(state);
251        self.chunk.hash(state);
252    }
253}
254
255impl<T: Into<String>> From<T> for Node {
256    fn from(value: T) -> Self {
257        Node::new(value)
258    }
259}
260
261/// Embed mode of the pipeline.
262#[derive(Copy, Debug, Default, Clone, Serialize, Deserialize, PartialEq)]
263pub enum EmbedMode {
264    #[default]
265    /// Embedding Chunk of data combined with Metadata.
266    SingleWithMetadata,
267    /// Embedding Chunk of data and every Metadata separately.
268    PerField,
269    /// Embedding Chunk of data and every Metadata separately and Chunk of data combined with Metadata.
270    Both,
271}
272
273/// Type of Embeddable stored in model.
274#[derive(
275    Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash, strum_macros::Display, Debug,
276)]
277pub enum EmbeddedField {
278    #[default]
279    /// Embeddable created from Chunk of data combined with Metadata.
280    Combined,
281    /// Embeddable created from Chunk of data only.
282    Chunk,
283    /// Embeddable created from Metadata.
284    /// String stores Metadata name.
285    #[strum(to_string = "Metadata: {0}")]
286    Metadata(String),
287}
288
289impl EmbeddedField {
290    /// Returns the name of the field when it would be a sparse vector
291    pub fn sparse_field_name(&self) -> String {
292        format!("{self}_sparse")
293    }
294
295    /// Returns the name of the field when it would be a dense vector
296    pub fn field_name(&self) -> String {
297        format!("{self}")
298    }
299}
300
301#[allow(clippy::from_over_into)]
302impl Into<String> for EmbeddedField {
303    fn into(self) -> String {
304        self.to_string()
305    }
306}
307
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Dense and sparse field names are derived from the `Display` form of the field.
    #[test_case(&EmbeddedField::Combined, ["Combined", "Combined_sparse"])]
    #[test_case(&EmbeddedField::Chunk, ["Chunk", "Chunk_sparse"])]
    #[test_case(&EmbeddedField::Metadata("test".into()), ["Metadata: test", "Metadata: test_sparse"])]
    fn field_name_tests(embedded_field: &EmbeddedField, expected: [&str; 2]) {
        assert_eq!(embedded_field.field_name(), expected[0]);
        assert_eq!(embedded_field.sparse_field_name(), expected[1]);
    }

    /// Debug-formatting a node with a long multi-byte chunk must not panic.
    #[test]
    fn test_debugging_node_with_utf8_char_boundary() {
        // Chunk made of a single repeated four-byte character.
        let node = Node::new("🦀".repeat(101));
        let _ = format!("{node:?}");

        // Byte offset 100 lands in the middle of the two-byte `ü`, so cutting this chunk at
        // that byte index would split a character. The node must be bound here; otherwise the
        // previous node is formatted a second time and this case is never exercised.
        let node = Node::new("Jürgen".repeat(100));
        let _ = format!("{node:?}");
    }

    /// A node with empty vector maps survives a round trip through `build_from_other`.
    #[test]
    fn test_build_from_other_without_vectors() {
        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(HashMap::new())
            .with_sparse_vectors(HashMap::new())
            .to_owned();

        let builder = Node::build_from_other(&original_node);
        let new_node = builder.build().unwrap();

        assert_eq!(original_node, new_node);
    }

    /// A node with populated vector maps survives a round trip through `build_from_other`.
    #[test]
    fn test_build_from_other_with_vectors() {
        let mut vectors = HashMap::new();
        vectors.insert(EmbeddedField::Chunk, Embedding::default());

        let mut sparse_vectors = HashMap::new();
        sparse_vectors.insert(
            EmbeddedField::Chunk,
            SparseEmbedding {
                indices: vec![],
                values: vec![],
            },
        );

        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(vectors.clone())
            .with_sparse_vectors(sparse_vectors.clone())
            .to_owned();

        let builder = Node::build_from_other(&original_node);
        let new_node = builder.build().unwrap();

        assert_eq!(original_node, new_node);
    }
}