1use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
/// Lightweight handle identifying one field of a [`Schema`].
///
/// The wrapped `u32` is the position of the field in the schema's field list:
/// `SchemaBuilder` hands out `Field(n)` for the n-th field added, and
/// `Schema::get_field_entry` uses the value as an index.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct Field(pub u32);
9
/// Kind of value a schema field holds.
///
/// Each variant is serialized under the explicit lowercase name given in its
/// `#[serde(rename = ...)]` attribute; the variants mirror those of `FieldValue`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum FieldType {
    /// String values (`FieldValue::Text`).
    #[serde(rename = "text")]
    Text,
    /// Unsigned 64-bit integers (`FieldValue::U64`).
    #[serde(rename = "u64")]
    U64,
    /// Signed 64-bit integers (`FieldValue::I64`).
    #[serde(rename = "i64")]
    I64,
    /// 64-bit floats (`FieldValue::F64`).
    #[serde(rename = "f64")]
    F64,
    /// Raw byte strings (`FieldValue::Bytes`).
    #[serde(rename = "bytes")]
    Bytes,
    /// Sparse vectors given as `(index, weight)` pairs (`FieldValue::SparseVector`).
    #[serde(rename = "sparse_vector")]
    SparseVector,
    /// Dense `f32` vectors (`FieldValue::DenseVector`).
    #[serde(rename = "dense_vector")]
    DenseVector,
    /// Arbitrary JSON values (`FieldValue::Json`).
    #[serde(rename = "json")]
    Json,
    /// Dense vectors supplied as raw bytes (`FieldValue::BinaryDenseVector`).
    #[serde(rename = "binary_dense_vector")]
    BinaryDenseVector,
}
41
/// Full description of a single schema field.
///
/// Fields marked `#[serde(default)]` may be absent from serialized input and
/// then take their type's default (`false` / `None`); the optional configs
/// are additionally omitted from output when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldEntry {
    /// Field name; `Schema::get_field` looks fields up by this string.
    pub name: String,
    /// Kind of value the field holds.
    pub field_type: FieldType,
    /// Whether the field is indexed.
    pub indexed: bool,
    /// Whether the field is kept by `Document::filter_stored`.
    pub stored: bool,
    /// Tokenizer name. The builder sets `"simple"` for `add_text_field` and
    /// `None` for every non-text field.
    pub tokenizer: Option<String>,
    /// When `true`, `Document::to_json` always renders the field as an array,
    /// even if the document holds a single value for it.
    #[serde(default)]
    pub multi: bool,
    /// Position tracking mode, if any (set through `SchemaBuilder::set_positions`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub positions: Option<PositionMode>,
    /// Configuration attached by the sparse-vector builder methods.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
    /// Configuration attached by the dense-vector builder methods.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub dense_vector_config: Option<DenseVectorConfig>,
    /// Configuration attached by the binary-dense-vector builder methods.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub binary_dense_vector_config: Option<BinaryDenseVectorConfig>,
    /// Flag set through `SchemaBuilder::set_fast`.
    // NOTE(review): presumably enables a fast/columnar access path — confirm with the index code.
    #[serde(default)]
    pub fast: bool,
    /// Marks the field as primary key; `Schema::primary_field` returns the
    /// first field with this flag set.
    #[serde(default)]
    pub primary_key: bool,
    /// Flag set through `SchemaBuilder::set_reorder`; queried collectively by
    /// `Schema::has_reorder_fields`.
    #[serde(default)]
    pub reorder: bool,
}
79
/// What kind of position information is tracked for a field.
///
/// Serialized with snake_case variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum PositionMode {
    /// Track ordinals only (`tracks_ordinal()` is true).
    Ordinal,
    /// Track token positions only (`tracks_token_position()` is true).
    TokenPosition,
    /// Track both ordinals and token positions.
    Full,
}
94
95impl PositionMode {
96 pub fn tracks_ordinal(&self) -> bool {
98 matches!(self, PositionMode::Ordinal | PositionMode::Full)
99 }
100
101 pub fn tracks_token_position(&self) -> bool {
103 matches!(self, PositionMode::TokenPosition | PositionMode::Full)
104 }
105}
106
/// Index structure selected for a dense vector field.
///
/// The default variant is [`VectorIndexType::RaBitQ`].
// NOTE(review): `rename_all = "snake_case"` inserts an underscore before every
// uppercase letter, so the wire names are presumably "ra_bit_q",
// "ivf_ra_bit_q" and "sca_n_n" — confirm before relying on the format.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VectorIndexType {
    /// Flat index; `DenseVectorConfig::default_build_threshold` reports
    /// `usize::MAX` for it unless a threshold is set explicitly.
    Flat,
    /// RaBitQ index (default build threshold 1000).
    #[default]
    RaBitQ,
    /// IVF combined with RaBitQ; counted as IVF by `DenseVectorConfig::uses_ivf`.
    IvfRaBitQ,
    /// ScaNN index; also counted as IVF by `DenseVectorConfig::uses_ivf`.
    ScaNN,
}
121
/// Element encoding used for dense vectors.
///
/// The default is [`DenseVectorQuantization::F32`]. Each variant has a stable
/// numeric tag (see `tag` / `from_tag`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DenseVectorQuantization {
    /// 4 bytes per element (tag 0).
    #[default]
    F32,
    /// 2 bytes per element (tag 1).
    F16,
    /// 1 byte per element (tag 2).
    UInt8,
    /// Bit-packed (tag 3); has no per-element byte size, so `element_size()`
    /// panics for it.
    Binary,
}
141
142impl DenseVectorQuantization {
143 pub fn element_size(self) -> usize {
146 match self {
147 Self::F32 => 4,
148 Self::F16 => 2,
149 Self::UInt8 => 1,
150 Self::Binary => panic!("element_size() not valid for Binary; use dim.div_ceil(8)"),
151 }
152 }
153
154 pub fn tag(self) -> u8 {
156 match self {
157 Self::F32 => 0,
158 Self::F16 => 1,
159 Self::UInt8 => 2,
160 Self::Binary => 3,
161 }
162 }
163
164 pub fn from_tag(tag: u8) -> Option<Self> {
166 match tag {
167 0 => Some(Self::F32),
168 1 => Some(Self::F16),
169 2 => Some(Self::UInt8),
170 3 => Some(Self::Binary),
171 _ => None,
172 }
173 }
174}
175
/// Configuration of a dense vector field.
///
/// Only `dim` is required in serialized form; every other field falls back
/// to a default when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DenseVectorConfig {
    /// Vector dimensionality.
    pub dim: usize,
    /// Index structure; defaults to `VectorIndexType::RaBitQ`.
    #[serde(default)]
    pub index_type: VectorIndexType,
    /// Element encoding; defaults to `DenseVectorQuantization::F32`.
    #[serde(default)]
    pub quantization: DenseVectorQuantization,
    /// Explicit cluster count. When `None`, `optimal_num_clusters` derives one
    /// from the number of vectors.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub num_clusters: Option<usize>,
    /// Defaults to 32 when absent (see `default_nprobe`).
    // NOTE(review): presumably the number of clusters probed per query — confirm.
    #[serde(default = "default_nprobe")]
    pub nprobe: usize,
    /// Explicit build threshold. When `None`, `default_build_threshold` picks
    /// a value based on `index_type`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub build_threshold: Option<usize>,
    /// Defaults to `true` when absent (see `default_unit_norm`).
    // NOTE(review): presumably means vectors are already L2-normalized — confirm.
    #[serde(default = "default_unit_norm")]
    pub unit_norm: bool,
}
213
/// Serde fallback for `DenseVectorConfig::nprobe` when the key is absent.
fn default_nprobe() -> usize {
    const DEFAULT_NPROBE: usize = 32;
    DEFAULT_NPROBE
}
217
/// Serde fallback for `DenseVectorConfig::unit_norm` when the key is absent.
fn default_unit_norm() -> bool {
    const UNIT_NORM_BY_DEFAULT: bool = true;
    UNIT_NORM_BY_DEFAULT
}
221
222impl DenseVectorConfig {
223 pub fn new(dim: usize) -> Self {
224 Self {
225 dim,
226 index_type: VectorIndexType::RaBitQ,
227 quantization: DenseVectorQuantization::F32,
228 num_clusters: None,
229 nprobe: 32,
230 build_threshold: None,
231 unit_norm: true,
232 }
233 }
234
235 pub fn with_ivf(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
237 Self {
238 dim,
239 index_type: VectorIndexType::IvfRaBitQ,
240 quantization: DenseVectorQuantization::F32,
241 num_clusters,
242 nprobe,
243 build_threshold: None,
244 unit_norm: true,
245 }
246 }
247
248 pub fn with_scann(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
250 Self {
251 dim,
252 index_type: VectorIndexType::ScaNN,
253 quantization: DenseVectorQuantization::F32,
254 num_clusters,
255 nprobe,
256 build_threshold: None,
257 unit_norm: true,
258 }
259 }
260
261 pub fn flat(dim: usize) -> Self {
263 Self {
264 dim,
265 index_type: VectorIndexType::Flat,
266 quantization: DenseVectorQuantization::F32,
267 num_clusters: None,
268 nprobe: 0,
269 build_threshold: None,
270 unit_norm: true,
271 }
272 }
273
274 pub fn with_quantization(mut self, quantization: DenseVectorQuantization) -> Self {
276 self.quantization = quantization;
277 self
278 }
279
280 pub fn with_build_threshold(mut self, threshold: usize) -> Self {
282 self.build_threshold = Some(threshold);
283 self
284 }
285
286 pub fn with_unit_norm(mut self) -> Self {
288 self.unit_norm = true;
289 self
290 }
291
292 pub fn with_num_clusters(mut self, num_clusters: usize) -> Self {
294 self.num_clusters = Some(num_clusters);
295 self
296 }
297
298 pub fn uses_ivf(&self) -> bool {
300 matches!(
301 self.index_type,
302 VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN
303 )
304 }
305
306 pub fn uses_scann(&self) -> bool {
308 self.index_type == VectorIndexType::ScaNN
309 }
310
311 pub fn is_flat(&self) -> bool {
313 self.index_type == VectorIndexType::Flat
314 }
315
316 pub fn default_build_threshold(&self) -> usize {
318 self.build_threshold.unwrap_or(match self.index_type {
319 VectorIndexType::Flat => usize::MAX, VectorIndexType::RaBitQ => 1000,
321 VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN => 10000,
322 })
323 }
324
325 pub fn optimal_num_clusters(&self, num_vectors: usize) -> usize {
327 self.num_clusters.unwrap_or_else(|| {
328 let optimal = (num_vectors as f64).sqrt() as usize;
330 optimal.clamp(16, 4096)
331 })
332 }
333}
334
/// Configuration of a binary dense vector field.
///
/// `BinaryDenseVectorConfig::new` enforces that `dim` is a multiple of 8;
/// values built by deserialization or a struct literal bypass that check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BinaryDenseVectorConfig {
    /// Vector dimensionality; `byte_len()` reports `dim.div_ceil(8)`.
    pub dim: usize,
}
345
346impl BinaryDenseVectorConfig {
347 pub fn new(dim: usize) -> Self {
348 assert!(
349 dim.is_multiple_of(8),
350 "BinaryDenseVector dimension must be a multiple of 8, got {dim}"
351 );
352 Self { dim }
353 }
354
355 pub fn byte_len(&self) -> usize {
357 self.dim.div_ceil(8)
358 }
359}
360
361use super::query_field_router::QueryRouterRule;
362
/// Immutable description of all fields in an index, produced by
/// [`SchemaBuilder::build`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Schema {
    /// Field descriptions; a `Field(n)` handle refers to `fields[n]`.
    fields: Vec<FieldEntry>,
    /// Name → handle lookup table built from `fields`. It is serialized
    /// alongside `fields` rather than rebuilt on deserialization.
    name_to_field: HashMap<String, Field>,
    /// Default field handles; empty when absent from serialized input.
    // NOTE(review): presumably the fields searched when a query names none — confirm.
    #[serde(default)]
    default_fields: Vec<Field>,
    /// Query routing rules; empty when absent from serialized input.
    #[serde(default)]
    query_routers: Vec<QueryRouterRule>,
}
375
376impl Schema {
377 pub fn builder() -> SchemaBuilder {
378 SchemaBuilder::default()
379 }
380
381 pub fn get_field(&self, name: &str) -> Option<Field> {
382 self.name_to_field.get(name).copied()
383 }
384
385 pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
386 self.fields.get(field.0 as usize)
387 }
388
389 pub fn get_field_name(&self, field: Field) -> Option<&str> {
390 self.fields.get(field.0 as usize).map(|e| e.name.as_str())
391 }
392
393 pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
394 self.fields
395 .iter()
396 .enumerate()
397 .map(|(i, e)| (Field(i as u32), e))
398 }
399
400 pub fn num_fields(&self) -> usize {
401 self.fields.len()
402 }
403
404 pub fn has_reorder_fields(&self) -> bool {
407 self.fields.iter().any(|e| e.reorder)
408 }
409
410 pub fn default_fields(&self) -> &[Field] {
412 &self.default_fields
413 }
414
415 pub fn set_default_fields(&mut self, fields: Vec<Field>) {
417 self.default_fields = fields;
418 }
419
420 pub fn query_routers(&self) -> &[QueryRouterRule] {
422 &self.query_routers
423 }
424
425 pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
427 self.query_routers = rules;
428 }
429
430 pub fn primary_field(&self) -> Option<Field> {
432 self.fields
433 .iter()
434 .enumerate()
435 .find(|(_, e)| e.primary_key)
436 .map(|(i, _)| Field(i as u32))
437 }
438}
439
/// Incrementally assembles a [`Schema`].
///
/// Every `add_*` method appends a field and returns its [`Field`] handle;
/// `build` then produces the final schema.
#[derive(Debug, Default)]
pub struct SchemaBuilder {
    /// Fields in the order they were added.
    fields: Vec<FieldEntry>,
    /// Default field *names*; resolved to handles in `build`.
    default_fields: Vec<String>,
    /// Query routing rules passed through to the schema unchanged.
    query_routers: Vec<QueryRouterRule>,
}
447
448impl SchemaBuilder {
449 pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
450 self.add_field_with_tokenizer(
451 name,
452 FieldType::Text,
453 indexed,
454 stored,
455 Some("simple".to_string()),
456 )
457 }
458
459 pub fn add_text_field_with_tokenizer(
460 &mut self,
461 name: &str,
462 indexed: bool,
463 stored: bool,
464 tokenizer: &str,
465 ) -> Field {
466 self.add_field_with_tokenizer(
467 name,
468 FieldType::Text,
469 indexed,
470 stored,
471 Some(tokenizer.to_string()),
472 )
473 }
474
475 pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
476 self.add_field(name, FieldType::U64, indexed, stored)
477 }
478
479 pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
480 self.add_field(name, FieldType::I64, indexed, stored)
481 }
482
483 pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
484 self.add_field(name, FieldType::F64, indexed, stored)
485 }
486
487 pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
488 self.add_field(name, FieldType::Bytes, false, stored)
489 }
490
491 pub fn add_json_field(&mut self, name: &str, stored: bool) -> Field {
496 self.add_field(name, FieldType::Json, false, stored)
497 }
498
499 pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
504 self.add_sparse_vector_field_with_config(
505 name,
506 indexed,
507 stored,
508 crate::structures::SparseVectorConfig::default(),
509 )
510 }
511
512 pub fn add_sparse_vector_field_with_config(
517 &mut self,
518 name: &str,
519 indexed: bool,
520 stored: bool,
521 config: crate::structures::SparseVectorConfig,
522 ) -> Field {
523 let field = Field(self.fields.len() as u32);
524 self.fields.push(FieldEntry {
525 name: name.to_string(),
526 field_type: FieldType::SparseVector,
527 indexed,
528 stored,
529 tokenizer: None,
530 multi: false,
531 positions: None,
532 sparse_vector_config: Some(config),
533 dense_vector_config: None,
534 binary_dense_vector_config: None,
535 fast: false,
536 primary_key: false,
537 reorder: false,
538 });
539 field
540 }
541
542 pub fn set_sparse_vector_config(
544 &mut self,
545 field: Field,
546 config: crate::structures::SparseVectorConfig,
547 ) {
548 if let Some(entry) = self.fields.get_mut(field.0 as usize) {
549 entry.sparse_vector_config = Some(config);
550 }
551 }
552
553 pub fn add_dense_vector_field(
558 &mut self,
559 name: &str,
560 dim: usize,
561 indexed: bool,
562 stored: bool,
563 ) -> Field {
564 self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
565 }
566
567 pub fn add_dense_vector_field_with_config(
569 &mut self,
570 name: &str,
571 indexed: bool,
572 stored: bool,
573 config: DenseVectorConfig,
574 ) -> Field {
575 let field = Field(self.fields.len() as u32);
576 self.fields.push(FieldEntry {
577 name: name.to_string(),
578 field_type: FieldType::DenseVector,
579 indexed,
580 stored,
581 tokenizer: None,
582 multi: false,
583 positions: None,
584 sparse_vector_config: None,
585 dense_vector_config: Some(config),
586 binary_dense_vector_config: None,
587 fast: false,
588 primary_key: false,
589 reorder: false,
590 });
591 field
592 }
593
594 pub fn add_binary_dense_vector_field(
599 &mut self,
600 name: &str,
601 dim: usize,
602 indexed: bool,
603 stored: bool,
604 ) -> Field {
605 self.add_binary_dense_vector_field_with_config(
606 name,
607 indexed,
608 stored,
609 BinaryDenseVectorConfig::new(dim),
610 )
611 }
612
613 pub fn add_binary_dense_vector_field_with_config(
615 &mut self,
616 name: &str,
617 indexed: bool,
618 stored: bool,
619 config: BinaryDenseVectorConfig,
620 ) -> Field {
621 let field = Field(self.fields.len() as u32);
622 self.fields.push(FieldEntry {
623 name: name.to_string(),
624 field_type: FieldType::BinaryDenseVector,
625 indexed,
626 stored,
627 tokenizer: None,
628 multi: false,
629 positions: None,
630 sparse_vector_config: None,
631 dense_vector_config: None,
632 binary_dense_vector_config: Some(config),
633 fast: false,
634 primary_key: false,
635 reorder: false,
636 });
637 field
638 }
639
640 fn add_field(
641 &mut self,
642 name: &str,
643 field_type: FieldType,
644 indexed: bool,
645 stored: bool,
646 ) -> Field {
647 self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
648 }
649
650 fn add_field_with_tokenizer(
651 &mut self,
652 name: &str,
653 field_type: FieldType,
654 indexed: bool,
655 stored: bool,
656 tokenizer: Option<String>,
657 ) -> Field {
658 self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
659 }
660
661 fn add_field_full(
662 &mut self,
663 name: &str,
664 field_type: FieldType,
665 indexed: bool,
666 stored: bool,
667 tokenizer: Option<String>,
668 multi: bool,
669 ) -> Field {
670 let field = Field(self.fields.len() as u32);
671 self.fields.push(FieldEntry {
672 name: name.to_string(),
673 field_type,
674 indexed,
675 stored,
676 tokenizer,
677 multi,
678 positions: None,
679 sparse_vector_config: None,
680 dense_vector_config: None,
681 binary_dense_vector_config: None,
682 fast: false,
683 primary_key: false,
684 reorder: false,
685 });
686 field
687 }
688
689 pub fn set_multi(&mut self, field: Field, multi: bool) {
691 if let Some(entry) = self.fields.get_mut(field.0 as usize) {
692 entry.multi = multi;
693 }
694 }
695
696 pub fn set_fast(&mut self, field: Field, fast: bool) {
699 if let Some(entry) = self.fields.get_mut(field.0 as usize) {
700 entry.fast = fast;
701 }
702 }
703
704 pub fn set_primary_key(&mut self, field: Field) {
706 if let Some(entry) = self.fields.get_mut(field.0 as usize) {
707 entry.primary_key = true;
708 }
709 }
710
711 pub fn set_reorder(&mut self, field: Field, reorder: bool) {
713 if let Some(entry) = self.fields.get_mut(field.0 as usize) {
714 entry.reorder = reorder;
715 }
716 }
717
718 pub fn set_positions(&mut self, field: Field, mode: PositionMode) {
720 if let Some(entry) = self.fields.get_mut(field.0 as usize) {
721 entry.positions = Some(mode);
722 }
723 }
724
725 pub fn set_default_fields(&mut self, field_names: Vec<String>) {
727 self.default_fields = field_names;
728 }
729
730 pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
732 self.query_routers = rules;
733 }
734
735 pub fn build(self) -> Schema {
736 let mut name_to_field = HashMap::new();
737 for (i, entry) in self.fields.iter().enumerate() {
738 name_to_field.insert(entry.name.clone(), Field(i as u32));
739 }
740
741 let default_fields: Vec<Field> = self
743 .default_fields
744 .iter()
745 .filter_map(|name| name_to_field.get(name).copied())
746 .collect();
747
748 Schema {
749 fields: self.fields,
750 name_to_field,
751 default_fields,
752 query_routers: self.query_routers,
753 }
754 }
755}
756
/// A single value stored in a [`Document`] for some field.
///
/// Variants mirror [`FieldType`] and use the same serialized names.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum FieldValue {
    /// String value.
    #[serde(rename = "text")]
    Text(String),
    /// Unsigned 64-bit integer.
    #[serde(rename = "u64")]
    U64(u64),
    /// Signed 64-bit integer.
    #[serde(rename = "i64")]
    I64(i64),
    /// 64-bit float.
    #[serde(rename = "f64")]
    F64(f64),
    /// Raw bytes; rendered as standard base64 by `Document::to_json`.
    #[serde(rename = "bytes")]
    Bytes(Vec<u8>),
    /// Sparse vector as `(index, weight)` pairs.
    #[serde(rename = "sparse_vector")]
    SparseVector(Vec<(u32, f32)>),
    /// Dense vector of `f32` components.
    #[serde(rename = "dense_vector")]
    DenseVector(Vec<f32>),
    /// Arbitrary JSON value.
    #[serde(rename = "json")]
    Json(serde_json::Value),
    /// Binary dense vector as raw bytes; rendered as standard base64 by
    /// `Document::to_json`.
    #[serde(rename = "binary_dense_vector")]
    BinaryDenseVector(Vec<u8>),
}
783
784impl FieldValue {
785 pub fn as_text(&self) -> Option<&str> {
786 match self {
787 FieldValue::Text(s) => Some(s),
788 _ => None,
789 }
790 }
791
792 pub fn as_u64(&self) -> Option<u64> {
793 match self {
794 FieldValue::U64(v) => Some(*v),
795 _ => None,
796 }
797 }
798
799 pub fn as_i64(&self) -> Option<i64> {
800 match self {
801 FieldValue::I64(v) => Some(*v),
802 _ => None,
803 }
804 }
805
806 pub fn as_f64(&self) -> Option<f64> {
807 match self {
808 FieldValue::F64(v) => Some(*v),
809 _ => None,
810 }
811 }
812
813 pub fn as_bytes(&self) -> Option<&[u8]> {
814 match self {
815 FieldValue::Bytes(b) => Some(b),
816 _ => None,
817 }
818 }
819
820 pub fn as_sparse_vector(&self) -> Option<&[(u32, f32)]> {
821 match self {
822 FieldValue::SparseVector(entries) => Some(entries),
823 _ => None,
824 }
825 }
826
827 pub fn as_dense_vector(&self) -> Option<&[f32]> {
828 match self {
829 FieldValue::DenseVector(v) => Some(v),
830 _ => None,
831 }
832 }
833
834 pub fn as_json(&self) -> Option<&serde_json::Value> {
835 match self {
836 FieldValue::Json(v) => Some(v),
837 _ => None,
838 }
839 }
840
841 pub fn as_binary_dense_vector(&self) -> Option<&[u8]> {
842 match self {
843 FieldValue::BinaryDenseVector(v) => Some(v),
844 _ => None,
845 }
846 }
847}
848
/// A document: an ordered list of `(field, value)` pairs.
///
/// The same field may appear several times (multi-valued fields); insertion
/// order is preserved.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Document {
    /// Values in insertion order, each tagged with the field it belongs to.
    field_values: Vec<(Field, FieldValue)>,
}
854
855impl Document {
856 pub fn new() -> Self {
857 Self::default()
858 }
859
860 pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
861 self.field_values
862 .push((field, FieldValue::Text(value.into())));
863 }
864
865 pub fn add_u64(&mut self, field: Field, value: u64) {
866 self.field_values.push((field, FieldValue::U64(value)));
867 }
868
869 pub fn add_i64(&mut self, field: Field, value: i64) {
870 self.field_values.push((field, FieldValue::I64(value)));
871 }
872
873 pub fn add_f64(&mut self, field: Field, value: f64) {
874 self.field_values.push((field, FieldValue::F64(value)));
875 }
876
877 pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
878 self.field_values.push((field, FieldValue::Bytes(value)));
879 }
880
881 pub fn add_sparse_vector(&mut self, field: Field, entries: Vec<(u32, f32)>) {
882 self.field_values
883 .push((field, FieldValue::SparseVector(entries)));
884 }
885
886 pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
887 self.field_values
888 .push((field, FieldValue::DenseVector(values)));
889 }
890
891 pub fn add_json(&mut self, field: Field, value: serde_json::Value) {
892 self.field_values.push((field, FieldValue::Json(value)));
893 }
894
895 pub fn add_binary_dense_vector(&mut self, field: Field, values: Vec<u8>) {
896 self.field_values
897 .push((field, FieldValue::BinaryDenseVector(values)));
898 }
899
900 pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
901 self.field_values
902 .iter()
903 .find(|(f, _)| *f == field)
904 .map(|(_, v)| v)
905 }
906
907 pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
908 self.field_values
909 .iter()
910 .filter(move |(f, _)| *f == field)
911 .map(|(_, v)| v)
912 }
913
914 pub fn field_values(&self) -> &[(Field, FieldValue)] {
915 &self.field_values
916 }
917
918 pub fn filter_stored(&self, schema: &Schema) -> Document {
920 Document {
921 field_values: self
922 .field_values
923 .iter()
924 .filter(|(field, _)| {
925 schema
926 .get_field_entry(*field)
927 .is_some_and(|entry| entry.stored)
928 })
929 .cloned()
930 .collect(),
931 }
932 }
933
934 pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
940 use std::collections::HashMap;
941
942 let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
944 HashMap::new();
945
946 for (field, value) in &self.field_values {
947 if let Some(entry) = schema.get_field_entry(*field) {
948 let json_value = match value {
949 FieldValue::Text(s) => serde_json::Value::String(s.clone()),
950 FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
951 FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
952 FieldValue::F64(n) => serde_json::json!(n),
953 FieldValue::Bytes(b) => {
954 use base64::Engine;
955 serde_json::Value::String(
956 base64::engine::general_purpose::STANDARD.encode(b),
957 )
958 }
959 FieldValue::SparseVector(entries) => {
960 let indices: Vec<u32> = entries.iter().map(|(i, _)| *i).collect();
961 let values: Vec<f32> = entries.iter().map(|(_, v)| *v).collect();
962 serde_json::json!({
963 "indices": indices,
964 "values": values
965 })
966 }
967 FieldValue::DenseVector(values) => {
968 serde_json::json!(values)
969 }
970 FieldValue::Json(v) => v.clone(),
971 FieldValue::BinaryDenseVector(b) => {
972 use base64::Engine;
973 serde_json::Value::String(
974 base64::engine::general_purpose::STANDARD.encode(b),
975 )
976 }
977 };
978 field_values_map
979 .entry(*field)
980 .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
981 .2
982 .push(json_value);
983 }
984 }
985
986 let mut map = serde_json::Map::new();
988 for (_field, (name, is_multi, values)) in field_values_map {
989 let json_value = if is_multi || values.len() > 1 {
990 serde_json::Value::Array(values)
991 } else {
992 values.into_iter().next().unwrap()
993 };
994 map.insert(name, json_value);
995 }
996
997 serde_json::Value::Object(map)
998 }
999
1000 pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
1009 let obj = json.as_object()?;
1010 let mut doc = Document::new();
1011
1012 for (key, value) in obj {
1013 if let Some(field) = schema.get_field(key) {
1014 let field_entry = schema.get_field_entry(field)?;
1015 Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
1016 }
1017 }
1018
1019 Some(doc)
1020 }
1021
1022 fn add_json_value(
1024 doc: &mut Document,
1025 field: Field,
1026 field_type: &FieldType,
1027 value: &serde_json::Value,
1028 ) {
1029 match value {
1030 serde_json::Value::String(s) => {
1031 if matches!(field_type, FieldType::Text) {
1032 doc.add_text(field, s.clone());
1033 }
1034 }
1035 serde_json::Value::Number(n) => {
1036 match field_type {
1037 FieldType::I64 => {
1038 if let Some(i) = n.as_i64() {
1039 doc.add_i64(field, i);
1040 }
1041 }
1042 FieldType::U64 => {
1043 if let Some(u) = n.as_u64() {
1044 doc.add_u64(field, u);
1045 } else if let Some(i) = n.as_i64() {
1046 if i >= 0 {
1048 doc.add_u64(field, i as u64);
1049 }
1050 }
1051 }
1052 FieldType::F64 => {
1053 if let Some(f) = n.as_f64() {
1054 doc.add_f64(field, f);
1055 }
1056 }
1057 _ => {}
1058 }
1059 }
1060 serde_json::Value::Array(arr) => {
1062 for item in arr {
1063 Self::add_json_value(doc, field, field_type, item);
1064 }
1065 }
1066 serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
1068 if let (Some(indices_val), Some(values_val)) =
1069 (obj.get("indices"), obj.get("values"))
1070 {
1071 let indices: Vec<u32> = indices_val
1072 .as_array()
1073 .map(|arr| {
1074 arr.iter()
1075 .filter_map(|v| v.as_u64().map(|n| n as u32))
1076 .collect()
1077 })
1078 .unwrap_or_default();
1079 let values: Vec<f32> = values_val
1080 .as_array()
1081 .map(|arr| {
1082 arr.iter()
1083 .filter_map(|v| v.as_f64().map(|n| n as f32))
1084 .collect()
1085 })
1086 .unwrap_or_default();
1087 if indices.len() == values.len() {
1088 let entries: Vec<(u32, f32)> = indices.into_iter().zip(values).collect();
1089 doc.add_sparse_vector(field, entries);
1090 }
1091 }
1092 }
1093 _ if matches!(field_type, FieldType::Json) => {
1095 doc.add_json(field, value.clone());
1096 }
1097 serde_json::Value::Object(_) => {}
1098 _ => {}
1099 }
1100 }
1101}
1102
#[cfg(test)]
mod tests {
    use super::*;

    /// Handles returned by the builder resolve by name after `build()`.
    #[test]
    fn test_schema_builder() {
        let mut sb = Schema::builder();
        let title = sb.add_text_field("title", true, true);
        let body = sb.add_text_field("body", true, false);
        let count = sb.add_u64_field("count", true, true);
        let schema = sb.build();

        let expectations = [
            ("title", Some(title)),
            ("body", Some(body)),
            ("count", Some(count)),
            ("nonexistent", None),
        ];
        for (name, expected) in expectations {
            assert_eq!(schema.get_field(name), expected);
        }
    }

    /// Values added to a document can be read back through `get_first`.
    #[test]
    fn test_document() {
        let mut sb = Schema::builder();
        let title = sb.add_text_field("title", true, true);
        let count = sb.add_u64_field("count", true, true);
        let _schema = sb.build();

        let mut doc = Document::new();
        doc.add_text(title, "Hello World");
        doc.add_u64(count, 42);

        let first_title = doc.get_first(title).unwrap();
        assert_eq!(first_title.as_text(), Some("Hello World"));
        let first_count = doc.get_first(count).unwrap();
        assert_eq!(first_count.as_u64(), Some(42));
    }

    /// A document survives a serde JSON round trip.
    #[test]
    fn test_document_serialization() {
        let mut sb = Schema::builder();
        let title = sb.add_text_field("title", true, true);
        let count = sb.add_u64_field("count", true, true);
        let _schema = sb.build();

        let mut original = Document::new();
        original.add_text(title, "Hello World");
        original.add_u64(count, 42);

        let json = serde_json::to_string(&original).unwrap();
        println!("Serialized doc: {}", json);

        let restored: Document = serde_json::from_str(&json).unwrap();
        assert_eq!(
            restored.field_values().len(),
            2,
            "Should have 2 field values after deserialization"
        );
        assert_eq!(
            restored.get_first(title).unwrap().as_text(),
            Some("Hello World")
        );
        assert_eq!(restored.get_first(count).unwrap().as_u64(), Some(42));
    }

    /// Repeated values for one field are kept in order and rendered as an array.
    #[test]
    fn test_multivalue_field() {
        let mut sb = Schema::builder();
        let uris = sb.add_text_field("uris", true, true);
        let title = sb.add_text_field("title", true, true);
        let schema = sb.build();

        let mut doc = Document::new();
        doc.add_text(uris, "one");
        doc.add_text(uris, "two");
        doc.add_text(title, "Test Document");

        assert_eq!(doc.get_first(uris).unwrap().as_text(), Some("one"));

        let stored_uris: Vec<_> = doc.get_all(uris).collect();
        assert_eq!(stored_uris.len(), 2);
        assert_eq!(stored_uris[0].as_text(), Some("one"));
        assert_eq!(stored_uris[1].as_text(), Some("two"));

        let rendered = doc.to_json(&schema);
        let uris_json = rendered.get("uris").unwrap();
        assert!(uris_json.is_array(), "Multi-value field should be an array");
        let uris_arr = uris_json.as_array().unwrap();
        assert_eq!(uris_arr.len(), 2);
        assert_eq!(uris_arr[0].as_str(), Some("one"));
        assert_eq!(uris_arr[1].as_str(), Some("two"));

        let title_json = rendered.get("title").unwrap();
        assert!(
            title_json.is_string(),
            "Single-value field should be a string"
        );
        assert_eq!(title_json.as_str(), Some("Test Document"));
    }

    /// A JSON array is read as multiple values and written back as an array.
    #[test]
    fn test_multivalue_from_json() {
        let mut sb = Schema::builder();
        let uris = sb.add_text_field("uris", true, true);
        let title = sb.add_text_field("title", true, true);
        let schema = sb.build();

        let input = serde_json::json!({
            "uris": ["one", "two"],
            "title": "Test Document"
        });

        let doc = Document::from_json(&input, &schema).unwrap();

        let stored_uris: Vec<_> = doc.get_all(uris).collect();
        assert_eq!(stored_uris.len(), 2);
        assert_eq!(stored_uris[0].as_text(), Some("one"));
        assert_eq!(stored_uris[1].as_text(), Some("two"));

        assert_eq!(
            doc.get_first(title).unwrap().as_text(),
            Some("Test Document")
        );

        let output = doc.to_json(&schema);
        let uris_out = output.get("uris").unwrap().as_array().unwrap();
        assert_eq!(uris_out.len(), 2);
        assert_eq!(uris_out[0].as_str(), Some("one"));
        assert_eq!(uris_out[1].as_str(), Some("two"));
    }

    /// A field flagged `multi` is an array in JSON even with a single value.
    #[test]
    fn test_multi_attribute_forces_array() {
        let mut sb = Schema::builder();
        let uris = sb.add_text_field("uris", true, true);
        sb.set_multi(uris, true);
        let title = sb.add_text_field("title", true, true);
        let schema = sb.build();

        assert!(schema.get_field_entry(uris).unwrap().multi);
        assert!(!schema.get_field_entry(title).unwrap().multi);

        let mut doc = Document::new();
        doc.add_text(uris, "only_one");
        doc.add_text(title, "Test Document");

        let rendered = doc.to_json(&schema);

        let uris_json = rendered.get("uris").unwrap();
        assert!(
            uris_json.is_array(),
            "Multi field should be array even with single value"
        );
        let uris_arr = uris_json.as_array().unwrap();
        assert_eq!(uris_arr.len(), 1);
        assert_eq!(uris_arr[0].as_str(), Some("only_one"));

        let title_json = rendered.get("title").unwrap();
        assert!(
            title_json.is_string(),
            "Non-multi single-value field should be a string"
        );
        assert_eq!(title_json.as_str(), Some("Test Document"));
    }

    /// Sparse vectors are stored, rendered as an indices/values object and
    /// parsed back with the same entries.
    #[test]
    fn test_sparse_vector_field() {
        let mut sb = Schema::builder();
        let embedding = sb.add_sparse_vector_field("embedding", true, true);
        let title = sb.add_text_field("title", true, true);
        let schema = sb.build();

        assert_eq!(schema.get_field("embedding"), Some(embedding));
        assert_eq!(
            schema.get_field_entry(embedding).unwrap().field_type,
            FieldType::SparseVector
        );

        let expected: [(u32, f32); 3] = [(0, 1.0), (5, 2.5), (10, 0.5)];

        let mut doc = Document::new();
        doc.add_sparse_vector(embedding, expected.to_vec());
        doc.add_text(title, "Test Document");

        let stored = doc
            .get_first(embedding)
            .unwrap()
            .as_sparse_vector()
            .unwrap();
        assert_eq!(stored, &expected);

        let rendered = doc.to_json(&schema);
        let embedding_json = rendered.get("embedding").unwrap();
        assert!(embedding_json.is_object());
        let indices_json = embedding_json.get("indices").unwrap().as_array().unwrap();
        assert_eq!(indices_json.len(), 3);

        let reparsed = Document::from_json(&rendered, &schema).unwrap();
        let round_tripped = reparsed
            .get_first(embedding)
            .unwrap()
            .as_sparse_vector()
            .unwrap();
        for (position, &(want_index, want_weight)) in expected.iter().enumerate() {
            let (index, weight) = round_tripped[position];
            assert_eq!(index, want_index);
            assert!((weight - want_weight).abs() < 1e-6);
        }
    }

    /// JSON object values are stored verbatim and survive a to_json/from_json
    /// round trip.
    #[test]
    fn test_json_field() {
        let mut sb = Schema::builder();
        let metadata = sb.add_json_field("metadata", true);
        let title = sb.add_text_field("title", true, true);
        let schema = sb.build();

        assert_eq!(schema.get_field("metadata"), Some(metadata));
        let metadata_entry = schema.get_field_entry(metadata).unwrap();
        assert_eq!(metadata_entry.field_type, FieldType::Json);
        assert!(!metadata_entry.indexed);
        assert!(metadata_entry.stored);

        let payload = serde_json::json!({
            "author": "John Doe",
            "tags": ["rust", "search"],
            "nested": {"key": "value"}
        });
        let mut doc = Document::new();
        doc.add_json(metadata, payload.clone());
        doc.add_text(title, "Test Document");

        let stored = doc.get_first(metadata).unwrap().as_json().unwrap();
        assert_eq!(stored, &payload);
        assert_eq!(stored.get("author").unwrap().as_str(), Some("John Doe"));

        let rendered = doc.to_json(&schema);
        let metadata_out = rendered.get("metadata").unwrap();
        assert_eq!(metadata_out, &payload);

        let reparsed = Document::from_json(&rendered, &schema).unwrap();
        let stored_again = reparsed.get_first(metadata).unwrap().as_json().unwrap();
        assert_eq!(stored_again, &payload);
    }

    /// `add_json` accepts every JSON value kind and returns it unchanged.
    #[test]
    fn test_json_field_various_types() {
        let mut sb = Schema::builder();
        let data = sb.add_json_field("data", true);
        let _schema = sb.build();

        let samples = [
            serde_json::json!([1, 2, 3, "four", null]),
            serde_json::json!("just a string"),
            serde_json::json!(42.5),
            serde_json::Value::Null,
            serde_json::json!(true),
        ];

        for sample in samples {
            let mut doc = Document::new();
            doc.add_json(data, sample.clone());
            assert_eq!(doc.get_first(data).unwrap().as_json().unwrap(), &sample);
        }
    }
}