1use std::{
21 collections::HashMap,
22 fmt::Debug,
23 hash::{Hash, Hasher},
24 os::unix::ffi::OsStrExt,
25 path::PathBuf,
26};
27
28use derive_builder::Builder;
29use itertools::Itertools;
30use serde::{Deserialize, Serialize};
31
32use crate::{Embedding, SparseEmbedding, metadata::Metadata, util::debug_long_utf8};
33
34#[derive(Default, Clone, Serialize, Deserialize, PartialEq, Builder)]
40#[builder(setter(into, strip_option), build_fn(error = "anyhow::Error"))]
41pub struct Node {
42 #[builder(default)]
44 pub path: PathBuf,
45 pub chunk: String,
47 #[builder(default)]
49 pub vectors: Option<HashMap<EmbeddedField, Embedding>>,
50 #[builder(default)]
52 pub sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
53 #[builder(default)]
55 pub metadata: Metadata,
56 #[builder(default)]
58 pub embed_mode: EmbedMode,
59 #[builder(default)]
61 pub original_size: usize,
62 #[builder(default)]
65 pub offset: usize,
66}
67
68impl NodeBuilder {
69 pub fn maybe_sparse_vectors(
70 &mut self,
71 sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
72 ) -> &mut Self {
73 self.sparse_vectors = Some(sparse_vectors);
74 self
75 }
76
77 pub fn maybe_vectors(
78 &mut self,
79 vectors: Option<HashMap<EmbeddedField, Embedding>>,
80 ) -> &mut Self {
81 self.vectors = Some(vectors);
82 self
83 }
84}
85
86impl Debug for Node {
87 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
92 f.debug_struct("Node")
93 .field("id", &self.id())
94 .field("path", &self.path)
95 .field("chunk", &debug_long_utf8(&self.chunk, 100))
96 .field("metadata", &self.metadata)
97 .field(
98 "vectors",
99 &self
100 .vectors
101 .iter()
102 .flat_map(HashMap::iter)
103 .map(|(embed_type, vec)| format!("'{embed_type}': {}", vec.len()))
104 .join(","),
105 )
106 .field(
107 "sparse_vectors",
108 &self
109 .sparse_vectors
110 .iter()
111 .flat_map(HashMap::iter)
112 .map(|(embed_type, vec)| {
113 format!(
114 "'{embed_type}': indices({}), values({})",
115 vec.indices.len(),
116 vec.values.len()
117 )
118 })
119 .join(","),
120 )
121 .field("embed_mode", &self.embed_mode)
122 .finish()
123 }
124}
125
126impl Node {
127 pub fn build_from_other(node: &Node) -> NodeBuilder {
130 NodeBuilder::default()
131 .path(node.path.clone())
132 .chunk(node.chunk.clone())
133 .metadata(node.metadata.clone())
134 .maybe_vectors(node.vectors.clone())
135 .maybe_sparse_vectors(node.sparse_vectors.clone())
136 .embed_mode(node.embed_mode)
137 .original_size(node.original_size)
138 .offset(node.offset)
139 .to_owned()
140 }
141
142 pub fn builder() -> NodeBuilder {
144 NodeBuilder::default()
145 }
146
147 pub fn new(chunk: impl Into<String>) -> Node {
151 let chunk = chunk.into();
152 let original_size = chunk.len();
153 Node {
154 chunk,
155 original_size,
156 ..Default::default()
157 }
158 }
159
160 pub fn with_metadata(&mut self, metadata: impl Into<Metadata>) -> &mut Self {
161 self.metadata = metadata.into();
162 self
163 }
164
165 pub fn with_vectors(
166 &mut self,
167 vectors: impl Into<HashMap<EmbeddedField, Embedding>>,
168 ) -> &mut Self {
169 self.vectors = Some(vectors.into());
170 self
171 }
172
173 pub fn with_sparse_vectors(
174 &mut self,
175 sparse_vectors: impl Into<HashMap<EmbeddedField, SparseEmbedding>>,
176 ) -> &mut Self {
177 self.sparse_vectors = Some(sparse_vectors.into());
178 self
179 }
180
181 pub fn as_embeddables(&self) -> Vec<(EmbeddedField, String)> {
187 let mut embeddables = Vec::new();
189
190 if self.embed_mode == EmbedMode::SingleWithMetadata || self.embed_mode == EmbedMode::Both {
191 embeddables.push((EmbeddedField::Combined, self.combine_chunk_with_metadata()));
192 }
193
194 if self.embed_mode == EmbedMode::PerField || self.embed_mode == EmbedMode::Both {
195 embeddables.push((EmbeddedField::Chunk, self.chunk.clone()));
196 for (name, value) in &self.metadata {
197 let value = value
198 .as_str()
199 .map_or_else(|| value.to_string(), ToString::to_string);
200 embeddables.push((EmbeddedField::Metadata(name.clone()), value));
201 }
202 }
203
204 embeddables
205 }
206
207 fn combine_chunk_with_metadata(&self) -> String {
216 let metadata = self
218 .metadata
219 .iter()
220 .map(|(k, v)| {
221 let v = v
222 .as_str()
223 .map_or_else(|| v.to_string(), ToString::to_string);
224
225 format!("{k}: {v}")
226 })
227 .collect::<Vec<String>>()
228 .join("\n");
229
230 format!("{}\n{}", metadata, self.chunk)
231 }
232
233 pub fn id(&self) -> uuid::Uuid {
240 let bytes = [self.path.as_os_str().as_bytes(), self.chunk.as_bytes()].concat();
241
242 uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes)
243 }
244}
245
246impl Hash for Node {
247 fn hash<H: Hasher>(&self, state: &mut H) {
251 self.path.hash(state);
252 self.chunk.hash(state);
253 }
254}
255
256impl<T: Into<String>> From<T> for Node {
257 fn from(value: T) -> Self {
258 Node::new(value)
259 }
260}
261
262#[derive(Copy, Debug, Default, Clone, Serialize, Deserialize, PartialEq)]
264pub enum EmbedMode {
265 #[default]
266 SingleWithMetadata,
268 PerField,
270 Both,
273}
274
275#[derive(
277 Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash, strum_macros::Display, Debug,
278)]
279pub enum EmbeddedField {
280 #[default]
281 Combined,
283 Chunk,
285 #[strum(to_string = "Metadata: {0}")]
288 Metadata(String),
289}
290
291impl EmbeddedField {
292 pub fn sparse_field_name(&self) -> String {
294 format!("{self}_sparse")
295 }
296
297 pub fn field_name(&self) -> String {
299 format!("{self}")
300 }
301}
302
303#[allow(clippy::from_over_into)]
304impl Into<String> for EmbeddedField {
305 fn into(self) -> String {
306 self.to_string()
307 }
308}
309
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Checks the dense and sparse field names derived from each `EmbeddedField` variant.
    #[test_case(&EmbeddedField::Combined, ["Combined", "Combined_sparse"])]
    #[test_case(&EmbeddedField::Chunk, ["Chunk", "Chunk_sparse"])]
    #[test_case(&EmbeddedField::Metadata("test".into()), ["Metadata: test", "Metadata: test_sparse"])]
    fn field_name_tests(embedded_field: &EmbeddedField, expected: [&str; 2]) {
        assert_eq!(embedded_field.field_name(), expected[0]);
        assert_eq!(embedded_field.sparse_field_name(), expected[1]);
    }

    /// Formatting a node with a long multi-byte chunk must not panic.
    #[test]
    fn test_debugging_node_with_utf8_char_boundary() {
        // Four-byte characters only: byte offset 100 lands exactly on a character boundary.
        let node = Node::new("🦀".repeat(101));
        let _ = format!("{node:?}");

        // "Jürgen" is 7 bytes long, so byte offset 100 falls inside the two-byte `ü`.
        // The node has to be bound here; previously the value was discarded and the first
        // node was formatted a second time, so this case was never exercised.
        let node = Node::new("Jürgen".repeat(100));
        let _ = format!("{node:?}");
    }

    /// A builder created from a node with empty vector maps rebuilds an equal node.
    #[test]
    fn test_build_from_other_without_vectors() {
        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(HashMap::new())
            .with_sparse_vectors(HashMap::new())
            .to_owned();

        let builder = Node::build_from_other(&original_node);
        let new_node = builder.build().unwrap();

        assert_eq!(original_node, new_node);
    }

    /// A builder created from a node with populated vector maps rebuilds an equal node.
    #[test]
    fn test_build_from_other_with_vectors() {
        let mut vectors = HashMap::new();
        vectors.insert(EmbeddedField::Chunk, Embedding::default());

        let mut sparse_vectors = HashMap::new();
        sparse_vectors.insert(
            EmbeddedField::Chunk,
            SparseEmbedding {
                indices: vec![],
                values: vec![],
            },
        );

        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(vectors.clone())
            .with_sparse_vectors(sparse_vectors.clone())
            .to_owned();

        let builder = Node::build_from_other(&original_node);
        let new_node = builder.build().unwrap();

        assert_eq!(original_node, new_node);
    }
}