swiftide_core/
node.rs

1//! This module defines the `Node` struct and its associated methods.
2//!
3//! `Node` represents a unit of data in the indexing process, containing metadata,
4//! the data chunk itself, and an optional vector representation.
5//!
6//! # Overview
7//!
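//! The `Node` struct is designed to encapsulate all necessary information for a single
//! unit of data being processed in the indexing pipeline. It includes fields for the file path,
//! data chunk, optional dense and sparse vector representations, metadata, embed mode, and the
//! original size and offset of the chunk. The identifier is not stored; it is computed on demand
//! from the path and the chunk.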
11//!
12//! The struct provides methods to convert the node into an embeddable string format and to
13//! calculate a hash value for the node based on its path and chunk.
14//!
15//! # Usage
16//!
17//! The `Node` struct is used throughout the indexing pipeline to represent and process
18//! individual units of data. It is particularly useful in scenarios where metadata and data chunks
19//! need to be processed together.
20use std::{
21    collections::HashMap,
22    fmt::Debug,
23    hash::{Hash, Hasher},
24    os::unix::ffi::OsStrExt,
25    path::PathBuf,
26};
27
28use derive_builder::Builder;
29use itertools::Itertools;
30use serde::{Deserialize, Serialize};
31
32use crate::{Embedding, SparseEmbedding, metadata::Metadata, util::debug_long_utf8};
33
/// Represents a unit of data in the indexing process.
///
/// `Node` bundles the data chunk being processed with the path it is associated with, its
/// metadata, optional dense and sparse embeddings, the embed mode, and the size and offset of
/// the chunk within the input it was derived from.
///
/// No identifier is stored on the struct; `Node::id` derives one from `path` and `chunk` on
/// demand.
///
/// A `NodeBuilder` is generated through `derive_builder`. `chunk` is the only field that does not
/// declare a builder default.
#[derive(Default, Clone, Serialize, Deserialize, PartialEq, Builder)]
#[builder(setter(into, strip_option), build_fn(error = "anyhow::Error"))]
pub struct Node {
    /// File path associated with the node.
    #[builder(default)]
    pub path: PathBuf,
    /// Data chunk contained in the node.
    pub chunk: String,
    /// Optional dense vector representation of the embedded data, keyed by the field that was
    /// embedded.
    #[builder(default)]
    pub vectors: Option<HashMap<EmbeddedField, Embedding>>,
    /// Optional sparse vector representation of the embedded data, keyed by the field that was
    /// embedded.
    #[builder(default)]
    pub sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
    /// Metadata associated with the node.
    #[builder(default)]
    pub metadata: Metadata,
    /// Mode that decides how the chunk and the metadata are turned into embeddables.
    #[builder(default)]
    pub embed_mode: EmbedMode,
    /// Size of the input this node was originally derived from in bytes
    #[builder(default)]
    pub original_size: usize,
    /// Offset of the chunk relative to the start of the input this node was originally derived
    /// from in bytes
    #[builder(default)]
    pub offset: usize,
}
67
68impl NodeBuilder {
69    pub fn maybe_sparse_vectors(
70        &mut self,
71        sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
72    ) -> &mut Self {
73        self.sparse_vectors = Some(sparse_vectors);
74        self
75    }
76
77    pub fn maybe_vectors(
78        &mut self,
79        vectors: Option<HashMap<EmbeddedField, Embedding>>,
80    ) -> &mut Self {
81        self.vectors = Some(vectors);
82        self
83    }
84}
85
86impl Debug for Node {
87    /// Formats the node for debugging purposes.
88    ///
89    /// This method is used to provide a human-readable representation of the node when debugging.
90    /// The vector field is displayed as the number of elements in the vector if present.
91    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
92        f.debug_struct("Node")
93            .field("id", &self.id())
94            .field("path", &self.path)
95            .field("chunk", &debug_long_utf8(&self.chunk, 100))
96            .field("metadata", &self.metadata)
97            .field(
98                "vectors",
99                &self
100                    .vectors
101                    .iter()
102                    .flat_map(HashMap::iter)
103                    .map(|(embed_type, vec)| format!("'{embed_type}': {}", vec.len()))
104                    .join(","),
105            )
106            .field(
107                "sparse_vectors",
108                &self
109                    .sparse_vectors
110                    .iter()
111                    .flat_map(HashMap::iter)
112                    .map(|(embed_type, vec)| {
113                        format!(
114                            "'{embed_type}': indices({}), values({})",
115                            vec.indices.len(),
116                            vec.values.len()
117                        )
118                    })
119                    .join(","),
120            )
121            .field("embed_mode", &self.embed_mode)
122            .finish()
123    }
124}
125
126impl Node {
127    /// Builds a new instance of `Node`, returning a `NodeBuilder`. Copies
128    /// over the fields from the provided `Node`.
129    pub fn build_from_other(node: &Node) -> NodeBuilder {
130        NodeBuilder::default()
131            .path(node.path.clone())
132            .chunk(node.chunk.clone())
133            .metadata(node.metadata.clone())
134            .maybe_vectors(node.vectors.clone())
135            .maybe_sparse_vectors(node.sparse_vectors.clone())
136            .embed_mode(node.embed_mode)
137            .original_size(node.original_size)
138            .offset(node.offset)
139            .to_owned()
140    }
141
142    /// Creates a new instance of `NodeBuilder.`
143    pub fn builder() -> NodeBuilder {
144        NodeBuilder::default()
145    }
146
147    /// Creates a new instance of `Node` with the specified data chunk.
148    ///
149    /// The other fields are set to their default values.
150    pub fn new(chunk: impl Into<String>) -> Node {
151        let chunk = chunk.into();
152        let original_size = chunk.len();
153        Node {
154            chunk,
155            original_size,
156            ..Default::default()
157        }
158    }
159
160    pub fn with_metadata(&mut self, metadata: impl Into<Metadata>) -> &mut Self {
161        self.metadata = metadata.into();
162        self
163    }
164
165    pub fn with_vectors(
166        &mut self,
167        vectors: impl Into<HashMap<EmbeddedField, Embedding>>,
168    ) -> &mut Self {
169        self.vectors = Some(vectors.into());
170        self
171    }
172
173    pub fn with_sparse_vectors(
174        &mut self,
175        sparse_vectors: impl Into<HashMap<EmbeddedField, SparseEmbedding>>,
176    ) -> &mut Self {
177        self.sparse_vectors = Some(sparse_vectors.into());
178        self
179    }
180
181    /// Creates embeddable data depending on chosen `EmbedMode`.
182    ///
183    /// # Returns
184    ///
185    /// Embeddable data mapped to their `EmbeddedField`.
186    pub fn as_embeddables(&self) -> Vec<(EmbeddedField, String)> {
187        // TODO: Figure out a clever way to do zero copy
188        let mut embeddables = Vec::new();
189
190        if self.embed_mode == EmbedMode::SingleWithMetadata || self.embed_mode == EmbedMode::Both {
191            embeddables.push((EmbeddedField::Combined, self.combine_chunk_with_metadata()));
192        }
193
194        if self.embed_mode == EmbedMode::PerField || self.embed_mode == EmbedMode::Both {
195            embeddables.push((EmbeddedField::Chunk, self.chunk.clone()));
196            for (name, value) in &self.metadata {
197                let value = value
198                    .as_str()
199                    .map_or_else(|| value.to_string(), ToString::to_string);
200                embeddables.push((EmbeddedField::Metadata(name.clone()), value));
201            }
202        }
203
204        embeddables
205    }
206
207    /// Converts the node into an [`self::EmbeddedField::Combined`] type of embeddable.
208    ///
209    /// This embeddable format consists of the metadata formatted as key-value pairs, each on a new
210    /// line, followed by the data chunk.
211    ///
212    /// # Returns
213    ///
214    /// A string representing the embeddable format of the node.
215    fn combine_chunk_with_metadata(&self) -> String {
216        // Metadata formatted by newlines joined with the chunk
217        let metadata = self
218            .metadata
219            .iter()
220            .map(|(k, v)| {
221                let v = v
222                    .as_str()
223                    .map_or_else(|| v.to_string(), ToString::to_string);
224
225                format!("{k}: {v}")
226            })
227            .collect::<Vec<String>>()
228            .join("\n");
229
230        format!("{}\n{}", metadata, self.chunk)
231    }
232
233    /// Retrieve the identifier of the node.
234    ///
235    /// Calculates the identifier of the node based on its path and chunk as bytes, returning a
236    /// UUID (v3).
237    ///
238    /// WARN: Does not memoize the id. Use sparingly.
239    pub fn id(&self) -> uuid::Uuid {
240        let bytes = [self.path.as_os_str().as_bytes(), self.chunk.as_bytes()].concat();
241
242        uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes)
243    }
244}
245
246impl Hash for Node {
247    /// Hashes the node based on its path and chunk.
248    ///
249    /// This method is used by the `calculate_hash` method to generate a hash value for the node.
250    fn hash<H: Hasher>(&self, state: &mut H) {
251        self.path.hash(state);
252        self.chunk.hash(state);
253    }
254}
255
256impl<T: Into<String>> From<T> for Node {
257    fn from(value: T) -> Self {
258        Node::new(value)
259    }
260}
261
/// Embed mode of the pipeline.
///
/// Selects which embeddables `Node::as_embeddables` produces for a node. The default is
/// `SingleWithMetadata`.
#[derive(Copy, Debug, Default, Clone, Serialize, Deserialize, PartialEq)]
pub enum EmbedMode {
    #[default]
    /// Embedding Chunk of data combined with Metadata.
    ///
    /// Produces a single `EmbeddedField::Combined` embeddable.
    SingleWithMetadata,
    /// Embedding Chunk of data and every Metadata separately.
    ///
    /// Produces one `EmbeddedField::Chunk` embeddable plus one `EmbeddedField::Metadata`
    /// embeddable per metadata entry.
    PerField,
    /// Embedding Chunk of data and every Metadata separately and Chunk of data combined with
    /// Metadata.
    ///
    /// Produces everything `SingleWithMetadata` and `PerField` produce.
    Both,
}
274
/// Type of Embeddable stored in model.
///
/// Used as the key of the dense and sparse vector maps on `Node`. Its `Display` output (derived
/// through `strum_macros::Display`) is what `field_name` and `sparse_field_name` are built from.
#[derive(
    Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash, strum_macros::Display, Debug,
)]
pub enum EmbeddedField {
    #[default]
    /// Embeddable created from Chunk of data combined with Metadata.
    Combined,
    /// Embeddable created from Chunk of data only.
    Chunk,
    /// Embeddable created from Metadata.
    /// String stores Metadata name.
    ///
    /// Displayed as `Metadata: <name>`.
    #[strum(to_string = "Metadata: {0}")]
    Metadata(String),
}
290
291impl EmbeddedField {
292    /// Returns the name of the field when it would be a sparse vector
293    pub fn sparse_field_name(&self) -> String {
294        format!("{self}_sparse")
295    }
296
297    /// Returns the name of the field when it would be a dense vector
298    pub fn field_name(&self) -> String {
299        format!("{self}")
300    }
301}
302
303#[allow(clippy::from_over_into)]
304impl Into<String> for EmbeddedField {
305    fn into(self) -> String {
306        self.to_string()
307    }
308}
309
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Dense and sparse field names are derived from the `Display` form of the field.
    #[test_case(&EmbeddedField::Combined, ["Combined", "Combined_sparse"])]
    #[test_case(&EmbeddedField::Chunk, ["Chunk", "Chunk_sparse"])]
    #[test_case(&EmbeddedField::Metadata("test".into()), ["Metadata: test", "Metadata: test_sparse"])]
    fn field_name_tests(embedded_field: &EmbeddedField, expected: [&str; 2]) {
        assert_eq!(embedded_field.field_name(), expected[0]);
        assert_eq!(embedded_field.sparse_field_name(), expected[1]);
    }

    /// Debug-formatting a node with a long multi-byte chunk must not panic.
    #[test]
    fn test_debugging_node_with_utf8_char_boundary() {
        // Single char
        let node = Node::new("🦀".repeat(101));
        let _ = format!("{node:?}");

        // With invalid char boundary: byte offset 100 lands in the middle of the two-byte `ü`.
        // The node has to be bound here, otherwise the previous node is formatted a second time
        // and this case is never exercised.
        let node = Node::new("Jürgen".repeat(100));
        let _ = format!("{node:?}");
    }

    /// Round-trips a node whose vector maps are present but empty.
    #[test]
    fn test_build_from_other_without_vectors() {
        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(HashMap::new())
            .with_sparse_vectors(HashMap::new())
            .to_owned();

        let builder = Node::build_from_other(&original_node);
        let new_node = builder.build().unwrap();

        assert_eq!(original_node, new_node);
    }

    /// Round-trips a node that carries one dense and one sparse embedding.
    #[test]
    fn test_build_from_other_with_vectors() {
        let mut vectors = HashMap::new();
        vectors.insert(EmbeddedField::Chunk, Embedding::default());

        let mut sparse_vectors = HashMap::new();
        sparse_vectors.insert(
            EmbeddedField::Chunk,
            SparseEmbedding {
                indices: vec![],
                values: vec![],
            },
        );

        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(vectors.clone())
            .with_sparse_vectors(sparse_vectors.clone())
            .to_owned();

        let builder = Node::build_from_other(&original_node);
        let new_node = builder.build().unwrap();

        assert_eq!(original_node, new_node);
    }
}