1use std::{
21 collections::HashMap,
22 fmt::Debug,
23 hash::{Hash, Hasher},
24 os::unix::ffi::OsStrExt,
25 path::PathBuf,
26};
27
28use derive_builder::Builder;
29use itertools::Itertools;
30use serde::{Deserialize, Serialize};
31
32use crate::{metadata::Metadata, util::debug_long_utf8, Embedding, SparseEmbedding};
33
34#[derive(Default, Clone, Serialize, Deserialize, PartialEq, Builder)]
40#[builder(setter(into, strip_option), build_fn(error = "anyhow::Error"))]
41pub struct Node {
42 #[builder(default)]
44 pub path: PathBuf,
45 pub chunk: String,
47 #[builder(default)]
49 pub vectors: Option<HashMap<EmbeddedField, Embedding>>,
50 #[builder(default)]
52 pub sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
53 #[builder(default)]
55 pub metadata: Metadata,
56 #[builder(default)]
58 pub embed_mode: EmbedMode,
59 #[builder(default)]
61 pub original_size: usize,
62 #[builder(default)]
64 pub offset: usize,
65}
66
67impl NodeBuilder {
68 pub fn maybe_sparse_vectors(
69 &mut self,
70 sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
71 ) -> &mut Self {
72 self.sparse_vectors = Some(sparse_vectors);
73 self
74 }
75
76 pub fn maybe_vectors(
77 &mut self,
78 vectors: Option<HashMap<EmbeddedField, Embedding>>,
79 ) -> &mut Self {
80 self.vectors = Some(vectors);
81 self
82 }
83}
84
85impl Debug for Node {
86 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
91 f.debug_struct("Node")
92 .field("id", &self.id())
93 .field("path", &self.path)
94 .field("chunk", &debug_long_utf8(&self.chunk, 100))
95 .field("metadata", &self.metadata)
96 .field(
97 "vectors",
98 &self
99 .vectors
100 .iter()
101 .flat_map(HashMap::iter)
102 .map(|(embed_type, vec)| format!("'{embed_type}': {}", vec.len()))
103 .join(","),
104 )
105 .field(
106 "sparse_vectors",
107 &self
108 .sparse_vectors
109 .iter()
110 .flat_map(HashMap::iter)
111 .map(|(embed_type, vec)| {
112 format!(
113 "'{embed_type}': indices({}), values({})",
114 vec.indices.len(),
115 vec.values.len()
116 )
117 })
118 .join(","),
119 )
120 .field("embed_mode", &self.embed_mode)
121 .finish()
122 }
123}
124
125impl Node {
126 pub fn build_from_other(node: &Node) -> NodeBuilder {
129 NodeBuilder::default()
130 .path(node.path.clone())
131 .chunk(node.chunk.clone())
132 .metadata(node.metadata.clone())
133 .maybe_vectors(node.vectors.clone())
134 .maybe_sparse_vectors(node.sparse_vectors.clone())
135 .embed_mode(node.embed_mode)
136 .original_size(node.original_size)
137 .offset(node.offset)
138 .to_owned()
139 }
140
141 pub fn builder() -> NodeBuilder {
143 NodeBuilder::default()
144 }
145
146 pub fn new(chunk: impl Into<String>) -> Node {
150 let chunk = chunk.into();
151 let original_size = chunk.len();
152 Node {
153 chunk,
154 original_size,
155 ..Default::default()
156 }
157 }
158
159 pub fn with_metadata(&mut self, metadata: impl Into<Metadata>) -> &mut Self {
160 self.metadata = metadata.into();
161 self
162 }
163
164 pub fn with_vectors(
165 &mut self,
166 vectors: impl Into<HashMap<EmbeddedField, Embedding>>,
167 ) -> &mut Self {
168 self.vectors = Some(vectors.into());
169 self
170 }
171
172 pub fn with_sparse_vectors(
173 &mut self,
174 sparse_vectors: impl Into<HashMap<EmbeddedField, SparseEmbedding>>,
175 ) -> &mut Self {
176 self.sparse_vectors = Some(sparse_vectors.into());
177 self
178 }
179
180 pub fn as_embeddables(&self) -> Vec<(EmbeddedField, String)> {
186 let mut embeddables = Vec::new();
188
189 if self.embed_mode == EmbedMode::SingleWithMetadata || self.embed_mode == EmbedMode::Both {
190 embeddables.push((EmbeddedField::Combined, self.combine_chunk_with_metadata()));
191 }
192
193 if self.embed_mode == EmbedMode::PerField || self.embed_mode == EmbedMode::Both {
194 embeddables.push((EmbeddedField::Chunk, self.chunk.clone()));
195 for (name, value) in &self.metadata {
196 let value = value
197 .as_str()
198 .map_or_else(|| value.to_string(), ToString::to_string);
199 embeddables.push((EmbeddedField::Metadata(name.clone()), value));
200 }
201 }
202
203 embeddables
204 }
205
206 fn combine_chunk_with_metadata(&self) -> String {
215 let metadata = self
217 .metadata
218 .iter()
219 .map(|(k, v)| {
220 let v = v
221 .as_str()
222 .map_or_else(|| v.to_string(), ToString::to_string);
223
224 format!("{k}: {v}")
225 })
226 .collect::<Vec<String>>()
227 .join("\n");
228
229 format!("{}\n{}", metadata, self.chunk)
230 }
231
232 pub fn id(&self) -> uuid::Uuid {
239 let bytes = [self.path.as_os_str().as_bytes(), self.chunk.as_bytes()].concat();
240
241 uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes)
242 }
243}
244
245impl Hash for Node {
246 fn hash<H: Hasher>(&self, state: &mut H) {
250 self.path.hash(state);
251 self.chunk.hash(state);
252 }
253}
254
255impl<T: Into<String>> From<T> for Node {
256 fn from(value: T) -> Self {
257 Node::new(value)
258 }
259}
260
261#[derive(Copy, Debug, Default, Clone, Serialize, Deserialize, PartialEq)]
263pub enum EmbedMode {
264 #[default]
265 SingleWithMetadata,
267 PerField,
269 Both,
271}
272
273#[derive(
275 Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash, strum_macros::Display, Debug,
276)]
277pub enum EmbeddedField {
278 #[default]
279 Combined,
281 Chunk,
283 #[strum(to_string = "Metadata: {0}")]
286 Metadata(String),
287}
288
289impl EmbeddedField {
290 pub fn sparse_field_name(&self) -> String {
292 format!("{self}_sparse")
293 }
294
295 pub fn field_name(&self) -> String {
297 format!("{self}")
298 }
299}
300
301#[allow(clippy::from_over_into)]
302impl Into<String> for EmbeddedField {
303 fn into(self) -> String {
304 self.to_string()
305 }
306}
307
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Checks the dense and sparse field names derived from each variant.
    #[test_case(&EmbeddedField::Combined, ["Combined", "Combined_sparse"])]
    #[test_case(&EmbeddedField::Chunk, ["Chunk", "Chunk_sparse"])]
    #[test_case(&EmbeddedField::Metadata("test".into()), ["Metadata: test", "Metadata: test_sparse"])]
    fn field_name_tests(embedded_field: &EmbeddedField, expected: [&str; 2]) {
        assert_eq!(embedded_field.field_name(), expected[0]);
        assert_eq!(embedded_field.sparse_field_name(), expected[1]);
    }

    /// Formatting a node with a long multi-byte chunk must not panic.
    #[test]
    fn test_debugging_node_with_utf8_char_boundary() {
        // More characters than the limit of 100 handed to `debug_long_utf8`,
        // each of them four bytes wide.
        let node = Node::new("🦀".repeat(101));
        let _ = format!("{node:?}");

        // Byte offset 100 of this chunk lands inside the two-byte `ü`, so a
        // byte-based cut at the limit would not be on a char boundary.
        // The node must be rebound here; otherwise the first node is simply
        // formatted a second time and this case is never exercised.
        let node = Node::new("Jürgen".repeat(100));
        let _ = format!("{node:?}");
    }

    /// Round-trips a node whose vector maps are present but empty.
    #[test]
    fn test_build_from_other_without_vectors() {
        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(HashMap::new())
            .with_sparse_vectors(HashMap::new())
            .to_owned();

        let builder = Node::build_from_other(&original_node);
        let new_node = builder.build().unwrap();

        assert_eq!(original_node, new_node);
    }

    /// Round-trips a node carrying one dense and one sparse embedding.
    #[test]
    fn test_build_from_other_with_vectors() {
        let mut vectors = HashMap::new();
        vectors.insert(EmbeddedField::Chunk, Embedding::default());

        let mut sparse_vectors = HashMap::new();
        sparse_vectors.insert(
            EmbeddedField::Chunk,
            SparseEmbedding {
                indices: vec![],
                values: vec![],
            },
        );

        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(vectors.clone())
            .with_sparse_vectors(sparse_vectors.clone())
            .to_owned();

        let builder = Node::build_from_other(&original_node);
        let new_node = builder.build().unwrap();

        assert_eq!(original_node, new_node);
    }
}