milli_core/documents/
primary_key.rs

1use std::iter;
2use std::ops::ControlFlow;
3use std::result::Result as StdResult;
4
5use bumpalo::Bump;
6use serde_json::value::RawValue;
7use serde_json::Value;
8
9use crate::fields_ids_map::MutFieldIdMapper;
10use crate::update::new::indexer::de::{match_component, DeOrBumpStr};
11use crate::update::new::KvReaderFieldId;
12use crate::{FieldId, InternalError, Object, Result, UserError};
13
/// Separator between the levels of a nested primary key (as in `a.b.c`).
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';

/// Name of the primary key that is used when none is specified.
pub const DEFAULT_PRIMARY_KEY: &str = "id";
19
20/// Trait for objects that can map the name of a field to its [`FieldId`].
21pub trait FieldIdMapper {
22    /// Attempts to map the passed name to its [`FieldId`].
23    ///
24    /// `None` if the field with this name was not found.
25    fn id(&self, name: &str) -> Option<FieldId>;
26
27    fn name(&self, id: FieldId) -> Option<&str>;
28}
29
30impl<T> FieldIdMapper for &T
31where
32    T: FieldIdMapper,
33{
34    fn id(&self, name: &str) -> Option<FieldId> {
35        T::id(self, name)
36    }
37
38    fn name(&self, id: FieldId) -> Option<&str> {
39        T::name(self, id)
40    }
41}
42
43/// A type that represent the type of primary key that has been set
44/// for this index, a classic flat one or a nested one.
45#[derive(Debug, Clone, Copy)]
46pub enum PrimaryKey<'a> {
47    Flat { name: &'a str, field_id: FieldId },
48    Nested { name: &'a str },
49}
50
51pub enum DocumentIdExtractionError {
52    InvalidDocumentId(UserError),
53    MissingDocumentId,
54    TooManyDocumentIds(usize),
55}
56
57impl<'a> PrimaryKey<'a> {
58    pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> {
59        Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
60            Self::Nested { name: path }
61        } else {
62            let field_id = fields.id(path)?;
63            Self::Flat { name: path, field_id }
64        })
65    }
66
67    pub fn new_or_insert(
68        path: &'a str,
69        fields: &mut impl MutFieldIdMapper,
70    ) -> StdResult<Self, UserError> {
71        Ok(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
72            Self::Nested { name: path }
73        } else {
74            let field_id = fields.insert(path).ok_or(UserError::AttributeLimitReached)?;
75            Self::Flat { name: path, field_id }
76        })
77    }
78
79    pub fn name(&self) -> &'a str {
80        match self {
81            PrimaryKey::Flat { name, .. } => name,
82            PrimaryKey::Nested { name } => name,
83        }
84    }
85
86    pub fn document_id(
87        &self,
88        document: &obkv::KvReader<FieldId>,
89        fields: &impl FieldIdMapper,
90    ) -> Result<StdResult<String, DocumentIdExtractionError>> {
91        match self {
92            PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) {
93                Some(document_id_bytes) => {
94                    let document_id = serde_json::from_slice(document_id_bytes)
95                        .map_err(InternalError::SerdeJson)?;
96                    match validate_document_id_value(document_id) {
97                        Ok(document_id) => Ok(Ok(document_id)),
98                        Err(user_error) => {
99                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
100                        }
101                    }
102                }
103                None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
104            },
105            nested @ PrimaryKey::Nested { .. } => {
106                let mut matching_documents_ids = Vec::new();
107                for (first_level_name, right) in nested.possible_level_names() {
108                    if let Some(field_id) = fields.id(first_level_name) {
109                        if let Some(value_bytes) = document.get(field_id) {
110                            let object = serde_json::from_slice(value_bytes)
111                                .map_err(InternalError::SerdeJson)?;
112                            fetch_matching_values(object, right, &mut matching_documents_ids);
113
114                            if matching_documents_ids.len() >= 2 {
115                                return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(
116                                    matching_documents_ids.len(),
117                                )));
118                            }
119                        }
120                    }
121                }
122
123                match matching_documents_ids.pop() {
124                    Some(document_id) => match validate_document_id_value(document_id) {
125                        Ok(document_id) => Ok(Ok(document_id)),
126                        Err(user_error) => {
127                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
128                        }
129                    },
130                    None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
131                }
132            }
133        }
134    }
135
136    pub fn extract_docid_from_db<'pl, 'bump: 'pl, Mapper: FieldIdMapper>(
137        &self,
138        document: &'pl KvReaderFieldId,
139        db_fields_ids_map: &Mapper,
140        indexer: &'bump Bump,
141    ) -> Result<DeOrBumpStr<'pl, 'bump>> {
142        use serde::Deserializer as _;
143
144        match self {
145            PrimaryKey::Flat { name: _, field_id } => {
146                let Some(document_id) = document.get(*field_id) else {
147                    return Err(InternalError::DocumentsError(
148                        crate::documents::Error::InvalidDocumentFormat,
149                    )
150                    .into());
151                };
152
153                let document_id: &RawValue =
154                    serde_json::from_slice(document_id).map_err(InternalError::SerdeJson)?;
155
156                let document_id = document_id
157                    .deserialize_any(crate::update::new::indexer::de::DocumentIdVisitor(indexer))
158                    .map_err(InternalError::SerdeJson)?;
159
160                let external_document_id = match document_id {
161                    Ok(document_id) => Ok(document_id),
162                    Err(_) => Err(InternalError::DocumentsError(
163                        crate::documents::Error::InvalidDocumentFormat,
164                    )),
165                }?;
166
167                Ok(external_document_id)
168            }
169            nested @ PrimaryKey::Nested { name: _ } => {
170                let mut docid = None;
171                for (first_level, right) in nested.possible_level_names() {
172                    let Some(fid) = db_fields_ids_map.id(first_level) else { continue };
173
174                    let Some(value) = document.get(fid) else { continue };
175                    let value: &RawValue =
176                        serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
177                    match match_component(first_level, right, value, indexer, &mut docid) {
178                        ControlFlow::Continue(()) => continue,
179                        ControlFlow::Break(Ok(_)) => {
180                            return Err(InternalError::DocumentsError(
181                                crate::documents::Error::InvalidDocumentFormat,
182                            )
183                            .into())
184                        }
185                        ControlFlow::Break(Err(err)) => {
186                            return Err(InternalError::SerdeJson(err).into())
187                        }
188                    }
189                }
190                Ok(docid.ok_or(InternalError::DocumentsError(
191                    crate::documents::Error::InvalidDocumentFormat,
192                ))?)
193            }
194        }
195    }
196
197    pub fn extract_fields_and_docid<'pl, 'bump: 'pl, Mapper: MutFieldIdMapper>(
198        &self,
199        document: &'pl RawValue,
200        new_fields_ids_map: &mut Mapper,
201        indexer: &'bump Bump,
202    ) -> Result<DeOrBumpStr<'pl, 'bump>> {
203        use serde::Deserializer as _;
204        let res = document
205            .deserialize_map(crate::update::new::indexer::de::FieldAndDocidExtractor::new(
206                new_fields_ids_map,
207                self,
208                indexer,
209            ))
210            .map_err(UserError::SerdeJson)??;
211
212        let external_document_id = match res {
213            Ok(document_id) => Ok(document_id),
214            Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e),
215            Err(DocumentIdExtractionError::MissingDocumentId) => {
216                Err(UserError::MissingDocumentId {
217                    primary_key: self.name().to_string(),
218                    document: serde_json::from_str(document.get()).unwrap(),
219                })
220            }
221            Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
222                Err(UserError::TooManyDocumentIds {
223                    primary_key: self.name().to_string(),
224                    document: serde_json::from_str(document.get()).unwrap(),
225                })
226            }
227        }?;
228
229        Ok(external_document_id)
230    }
231
232    /// Returns an `Iterator` that gives all the possible fields names the primary key
233    /// can have depending of the first level name and depth of the objects.
234    pub fn possible_level_names(&self) -> impl Iterator<Item = (&'a str, &'a str)> + '_ {
235        let name = self.name();
236        name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
237            .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
238            .chain(iter::once((name, "")))
239    }
240}
241
242fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
243    match value {
244        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
245        otherwise => output.push(otherwise),
246    }
247}
248
249fn fetch_matching_values_in_object(
250    object: Object,
251    selector: &str,
252    base_key: &str,
253    output: &mut Vec<Value>,
254) {
255    for (key, value) in object {
256        let base_key = if base_key.is_empty() {
257            key.to_string()
258        } else {
259            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
260        };
261
262        if starts_with(selector, &base_key) {
263            match value {
264                Value::Object(object) => {
265                    fetch_matching_values_in_object(object, selector, &base_key, output)
266                }
267                value => output.push(value),
268            }
269        }
270    }
271}
272
273fn starts_with(selector: &str, key: &str) -> bool {
274    selector.strip_prefix(key).is_some_and(|tail| {
275        tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
276    })
277}
278
279// FIXME: move to a DocumentId struct
280
/// Returns `document_id` unchanged when it is a valid document id, `None`
/// otherwise.
///
/// A valid document id is between 1 and 511 bytes long and only contains ASCII
/// letters, ASCII digits, `-` and `_`.
pub fn validate_document_id_str(document_id: &str) -> Option<&str> {
    let has_valid_length = (1..512).contains(&document_id.len());
    // Checking bytes is enough: every byte of a non-ASCII character is rejected.
    let is_valid = has_valid_length
        && document_id
            .bytes()
            .all(|byte| byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_'));

    is_valid.then_some(document_id)
}
291
292pub fn validate_document_id_value(document_id: Value) -> StdResult<String, UserError> {
293    match document_id {
294        Value::String(string) => match validate_document_id_str(&string) {
295            Some(s) if s.len() == string.len() => Ok(string),
296            Some(s) => Ok(s.to_string()),
297            None => Err(UserError::InvalidDocumentId { document_id: Value::String(string) }),
298        },
299        // a `u64` or `i64` cannot be more than 512 bytes once converted to a string
300        Value::Number(number) if !number.is_f64() => Ok(number.to_string()),
301        content => Err(UserError::InvalidDocumentId { document_id: content }),
302    }
303}