swiftide_core/
node.rs

1//! This module defines the `Node` struct and its associated methods.
2//!
3//! `Node` represents a unit of data in the indexing process, containing metadata,
4//! the data chunk itself, and an optional vector representation.
5//!
6//! # Overview
7//!
8//! The `Node` struct is designed to encapsulate all necessary information for a single
9//! unit of data being processed in the indexing pipeline. It includes fields for an identifier,
10//! file path, data chunk, optional vector representation, and metadata.
11//!
12//! The struct provides methods to convert the node into an embeddable string format and to
13//! calculate a hash value for the node based on its path and chunk.
14//!
15//! # Usage
16//!
17//! The `Node` struct is used throughout the indexing pipeline to represent and process
18//! individual units of data. It is particularly useful in scenarios where metadata and data chunks
19//! need to be processed together.
20use std::{
21    collections::HashMap,
22    fmt::Debug,
23    hash::{Hash, Hasher},
24    os::unix::ffi::OsStrExt,
25    path::PathBuf,
26};
27
28use derive_builder::Builder;
29use itertools::Itertools;
30use serde::{Deserialize, Serialize};
31
32use crate::{Embedding, SparseEmbedding, metadata::Metadata, util::debug_long_utf8};
33
34/// Represents a unit of data in the indexing process.
35///
36/// `Node` encapsulates all necessary information for a single unit of data being processed
37/// in the indexing pipeline. It includes fields for an identifier, file path, data chunk, optional
38/// vector representation, and metadata.
39#[derive(Default, Clone, Serialize, Deserialize, PartialEq, Builder)]
40#[builder(setter(into, strip_option), build_fn(error = "anyhow::Error"))]
41pub struct Node {
42    /// File path associated with the node.
43    #[builder(default)]
44    pub path: PathBuf,
45    /// Data chunk contained in the node.
46    pub chunk: String,
47    /// Optional vector representation of embedded data.
48    #[builder(default)]
49    pub vectors: Option<HashMap<EmbeddedField, Embedding>>,
50    /// Optional sparse vector representation of embedded data.
51    #[builder(default)]
52    pub sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
53    /// Metadata associated with the node.
54    #[builder(default)]
55    pub metadata: Metadata,
56    /// Mode of embedding data Chunk and Metadata
57    #[builder(default)]
58    pub embed_mode: EmbedMode,
59    /// Size of the input this node was originally derived from in bytes
60    #[builder(default)]
61    pub original_size: usize,
62    /// Offset of the chunk relative to the start of the input this node was originally derived
63    /// from in bytes
64    #[builder(default)]
65    pub offset: usize,
66    /// Optional parent id
67    #[builder(default)]
68    pub parent_id: Option<uuid::Uuid>,
69}
70
71impl NodeBuilder {
72    pub fn maybe_sparse_vectors(
73        &mut self,
74        sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
75    ) -> &mut Self {
76        self.sparse_vectors = Some(sparse_vectors);
77        self
78    }
79
80    pub fn maybe_vectors(
81        &mut self,
82        vectors: Option<HashMap<EmbeddedField, Embedding>>,
83    ) -> &mut Self {
84        self.vectors = Some(vectors);
85        self
86    }
87}
88
89impl Debug for Node {
90    /// Formats the node for debugging purposes.
91    ///
92    /// This method is used to provide a human-readable representation of the node when debugging.
93    /// The vector field is displayed as the number of elements in the vector if present.
94    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
95        f.debug_struct("Node")
96            .field("id", &self.id())
97            .field("path", &self.path)
98            .field("chunk", &debug_long_utf8(&self.chunk, 100))
99            .field("metadata", &self.metadata)
100            .field(
101                "vectors",
102                &self
103                    .vectors
104                    .iter()
105                    .flat_map(HashMap::iter)
106                    .map(|(embed_type, vec)| format!("'{embed_type}': {}", vec.len()))
107                    .join(","),
108            )
109            .field(
110                "sparse_vectors",
111                &self
112                    .sparse_vectors
113                    .iter()
114                    .flat_map(HashMap::iter)
115                    .map(|(embed_type, vec)| {
116                        format!(
117                            "'{embed_type}': indices({}), values({})",
118                            vec.indices.len(),
119                            vec.values.len()
120                        )
121                    })
122                    .join(","),
123            )
124            .field("embed_mode", &self.embed_mode)
125            .finish()
126    }
127}
128
129impl Node {
130    /// Builds a new instance of `Node`, returning a `NodeBuilder`. Copies
131    /// over the fields from the provided `Node`.
132    pub fn chunking_from(node: &Node) -> NodeBuilder {
133        NodeBuilder::default()
134            .path(node.path.clone())
135            .chunk(node.chunk.clone())
136            .metadata(node.metadata.clone())
137            .maybe_vectors(node.vectors.clone())
138            .maybe_sparse_vectors(node.sparse_vectors.clone())
139            .embed_mode(node.embed_mode)
140            .original_size(node.original_size)
141            .offset(node.offset)
142            .parent_id(node.id())
143            .to_owned()
144    }
145
146    /// Creates a new instance of `NodeBuilder.`
147    pub fn builder() -> NodeBuilder {
148        NodeBuilder::default()
149    }
150
151    /// Creates a new instance of `Node` with the specified data chunk.
152    ///
153    /// The other fields are set to their default values.
154    pub fn new(chunk: impl Into<String>) -> Node {
155        let chunk = chunk.into();
156        let original_size = chunk.len();
157        Node {
158            chunk,
159            original_size,
160            ..Default::default()
161        }
162    }
163
164    pub fn with_metadata(&mut self, metadata: impl Into<Metadata>) -> &mut Self {
165        self.metadata = metadata.into();
166        self
167    }
168
169    pub fn with_vectors(
170        &mut self,
171        vectors: impl Into<HashMap<EmbeddedField, Embedding>>,
172    ) -> &mut Self {
173        self.vectors = Some(vectors.into());
174        self
175    }
176
177    pub fn with_sparse_vectors(
178        &mut self,
179        sparse_vectors: impl Into<HashMap<EmbeddedField, SparseEmbedding>>,
180    ) -> &mut Self {
181        self.sparse_vectors = Some(sparse_vectors.into());
182        self
183    }
184
185    /// Creates embeddable data depending on chosen `EmbedMode`.
186    ///
187    /// # Returns
188    ///
189    /// Embeddable data mapped to their `EmbeddedField`.
190    pub fn as_embeddables(&self) -> Vec<(EmbeddedField, String)> {
191        // TODO: Figure out a clever way to do zero copy
192        let mut embeddables = Vec::new();
193
194        if self.embed_mode == EmbedMode::SingleWithMetadata || self.embed_mode == EmbedMode::Both {
195            embeddables.push((EmbeddedField::Combined, self.combine_chunk_with_metadata()));
196        }
197
198        if self.embed_mode == EmbedMode::PerField || self.embed_mode == EmbedMode::Both {
199            embeddables.push((EmbeddedField::Chunk, self.chunk.clone()));
200            for (name, value) in &self.metadata {
201                let value = value
202                    .as_str()
203                    .map_or_else(|| value.to_string(), ToString::to_string);
204                embeddables.push((EmbeddedField::Metadata(name.clone()), value));
205            }
206        }
207
208        embeddables
209    }
210
211    /// Converts the node into an [`self::EmbeddedField::Combined`] type of embeddable.
212    ///
213    /// This embeddable format consists of the metadata formatted as key-value pairs, each on a new
214    /// line, followed by the data chunk.
215    ///
216    /// # Returns
217    ///
218    /// A string representing the embeddable format of the node.
219    fn combine_chunk_with_metadata(&self) -> String {
220        // Metadata formatted by newlines joined with the chunk
221        let metadata = self
222            .metadata
223            .iter()
224            .map(|(k, v)| {
225                let v = v
226                    .as_str()
227                    .map_or_else(|| v.to_string(), ToString::to_string);
228
229                format!("{k}: {v}")
230            })
231            .collect::<Vec<String>>()
232            .join("\n");
233
234        format!("{}\n{}", metadata, self.chunk)
235    }
236
237    /// Retrieve the identifier of the node.
238    ///
239    /// Calculates the identifier of the node based on its path and chunk as bytes, returning a
240    /// UUID (v3).
241    ///
242    /// WARN: Does not memoize the id. Use sparingly.
243    pub fn id(&self) -> uuid::Uuid {
244        let bytes = [self.path.as_os_str().as_bytes(), self.chunk.as_bytes()].concat();
245
246        uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes)
247    }
248
249    pub fn parent_id(&self) -> Option<uuid::Uuid> {
250        self.parent_id
251    }
252}
253
254impl Hash for Node {
255    /// Hashes the node based on its path and chunk.
256    ///
257    /// This method is used by the `calculate_hash` method to generate a hash value for the node.
258    fn hash<H: Hasher>(&self, state: &mut H) {
259        self.path.hash(state);
260        self.chunk.hash(state);
261    }
262}
263
264impl<T: Into<String>> From<T> for Node {
265    fn from(value: T) -> Self {
266        Node::new(value)
267    }
268}
269
270/// Embed mode of the pipeline.
271#[derive(Copy, Debug, Default, Clone, Serialize, Deserialize, PartialEq)]
272pub enum EmbedMode {
273    #[default]
274    /// Embedding Chunk of data combined with Metadata.
275    SingleWithMetadata,
276    /// Embedding Chunk of data and every Metadata separately.
277    PerField,
278    /// Embedding Chunk of data and every Metadata separately and Chunk of data combined with
279    /// Metadata.
280    Both,
281}
282
283/// Type of Embeddable stored in model.
284#[derive(
285    Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash, strum_macros::Display, Debug,
286)]
287pub enum EmbeddedField {
288    #[default]
289    /// Embeddable created from Chunk of data combined with Metadata.
290    Combined,
291    /// Embeddable created from Chunk of data only.
292    Chunk,
293    /// Embeddable created from Metadata.
294    /// String stores Metadata name.
295    #[strum(to_string = "Metadata: {0}")]
296    Metadata(String),
297}
298
299impl EmbeddedField {
300    /// Returns the name of the field when it would be a sparse vector
301    pub fn sparse_field_name(&self) -> String {
302        format!("{self}_sparse")
303    }
304
305    /// Returns the name of the field when it would be a dense vector
306    pub fn field_name(&self) -> String {
307        format!("{self}")
308    }
309}
310
311#[allow(clippy::from_over_into)]
312impl Into<String> for EmbeddedField {
313    fn into(self) -> String {
314        self.to_string()
315    }
316}
317
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Asserts that `chunked` is a faithful copy of `original` produced by
    /// `Node::chunking_from`: every copied field matches and the parent id points back at
    /// `original`, which itself has no parent.
    fn assert_chunked_from(original: &Node, chunked: &Node) {
        assert_eq!(chunked.parent_id(), Some(original.id()));

        assert_eq!(original.parent_id(), None);

        assert_eq!(original.chunk, chunked.chunk);
        assert_eq!(original.path, chunked.path);
        assert_eq!(original.metadata, chunked.metadata);
        assert_eq!(original.vectors, chunked.vectors);
        assert_eq!(original.sparse_vectors, chunked.sparse_vectors);
        assert_eq!(original.embed_mode, chunked.embed_mode);
        assert_eq!(original.original_size, chunked.original_size);
        assert_eq!(original.offset, chunked.offset);
    }

    #[test_case(&EmbeddedField::Combined, ["Combined", "Combined_sparse"])]
    #[test_case(&EmbeddedField::Chunk, ["Chunk", "Chunk_sparse"])]
    #[test_case(&EmbeddedField::Metadata("test".into()), ["Metadata: test", "Metadata: test_sparse"])]
    fn field_name_tests(embedded_field: &EmbeddedField, expected: [&str; 2]) {
        assert_eq!(embedded_field.field_name(), expected[0]);
        assert_eq!(embedded_field.sparse_field_name(), expected[1]);
    }

    /// Formatting a node for debugging must not panic, whatever the chunk's UTF-8 layout.
    #[test]
    fn test_debugging_node_with_utf8_char_boundary() {
        // Single char
        let node = Node::new("🦀".repeat(101));
        let _ = format!("{node:?}");

        // With invalid char boundary: byte 100 falls inside the two-byte `ü`.
        // The node must be bound here; otherwise the previous node is formatted a second time
        // and this case is never exercised.
        let node = Node::new("Jürgen".repeat(100));
        let _ = format!("{node:?}");
    }

    #[test]
    fn test_build_from_other_without_vectors() {
        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(HashMap::new())
            .with_sparse_vectors(HashMap::new())
            .to_owned();

        let builder = Node::chunking_from(&original_node);
        let chunked_node = builder.build().unwrap();

        assert_chunked_from(&original_node, &chunked_node);
    }

    #[test]
    fn test_build_from_other_with_vectors() {
        let mut vectors = HashMap::new();
        vectors.insert(EmbeddedField::Chunk, Embedding::default());

        let mut sparse_vectors = HashMap::new();
        sparse_vectors.insert(
            EmbeddedField::Chunk,
            SparseEmbedding {
                indices: vec![],
                values: vec![],
            },
        );

        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(vectors.clone())
            .with_sparse_vectors(sparse_vectors.clone())
            .to_owned();

        let builder = Node::chunking_from(&original_node);
        let chunked_node = builder.build().unwrap();

        assert_chunked_from(&original_node, &chunked_node);
    }
}