grc_20/codec/
edit.rs

1//! Edit encoding/decoding for GRC-20 binary format.
2//!
3//! Implements the wire format for edits (spec Section 6.3).
4
5use std::borrow::Cow;
6use std::io::Read;
7
8use rustc_hash::{FxHashMap, FxHashSet};
9
10use crate::codec::op::{decode_op, encode_op};
11use crate::codec::primitives::{Reader, Writer};
12use crate::error::{DecodeError, EncodeError};
13use crate::limits::{
14    FORMAT_VERSION, MAGIC_COMPRESSED, MAGIC_UNCOMPRESSED, MAX_AUTHORS, MAX_DICT_SIZE,
15    MAX_EDIT_SIZE, MAX_OPS_PER_EDIT, MAX_STRING_LEN, MIN_FORMAT_VERSION,
16};
17use crate::model::{DataType, DictionaryBuilder, Edit, Id, Op, WireDictionaries};
18
19// =============================================================================
20// DECODING
21// =============================================================================
22
23/// Decompresses a GRC2Z compressed edit, returning the uncompressed bytes.
24///
25/// Use this with [`decode_edit`] for zero-copy decoding of compressed data:
26///
27/// ```ignore
28/// let uncompressed = decompress(&compressed_bytes)?;
29/// let edit = decode_edit(&uncompressed)?;  // zero-copy, borrows from uncompressed
30/// // edit is valid while uncompressed is alive
31/// ```
32pub fn decompress(input: &[u8]) -> Result<Vec<u8>, DecodeError> {
33    if input.len() < 5 {
34        return Err(DecodeError::UnexpectedEof { context: "magic" });
35    }
36    if &input[0..5] != MAGIC_COMPRESSED {
37        let mut found = [0u8; 4];
38        found.copy_from_slice(&input[0..4]);
39        return Err(DecodeError::InvalidMagic { found });
40    }
41    decompress_zstd(&input[5..])
42}
43
44/// Decodes an Edit from binary data with zero-copy borrowing.
45///
46/// Handles both compressed (GRC2Z) and uncompressed (GRC2) formats.
47/// For true zero-copy with compressed data, use [`decompress`] first:
48///
49/// ```ignore
50/// // Zero-copy for compressed data:
51/// let uncompressed = decompress(&compressed)?;
52/// let edit = decode_edit(&uncompressed)?;
53///
54/// // Zero-copy for uncompressed data:
55/// let edit = decode_edit(&uncompressed_bytes)?;
56/// ```
57///
58/// If you pass compressed data directly, it will decompress internally
59/// and allocate owned strings (no zero-copy benefit).
60pub fn decode_edit(input: &[u8]) -> Result<Edit<'_>, DecodeError> {
61    if input.len() < 4 {
62        return Err(DecodeError::UnexpectedEof { context: "magic" });
63    }
64
65    // Detect compression
66    if input.len() >= 5 && &input[0..5] == MAGIC_COMPRESSED {
67        // Compressed: decompress and decode with allocations
68        // (for zero-copy, caller should use decompress() first)
69        let decompressed = decompress_zstd(&input[5..])?;
70        if decompressed.len() > MAX_EDIT_SIZE {
71            return Err(DecodeError::LengthExceedsLimit {
72                field: "edit",
73                len: decompressed.len(),
74                max: MAX_EDIT_SIZE,
75            });
76        }
77        decode_edit_owned(&decompressed)
78    } else if &input[0..4] == MAGIC_UNCOMPRESSED {
79        // Uncompressed: decode with zero-copy borrowing
80        if input.len() > MAX_EDIT_SIZE {
81            return Err(DecodeError::LengthExceedsLimit {
82                field: "edit",
83                len: input.len(),
84                max: MAX_EDIT_SIZE,
85            });
86        }
87        decode_edit_borrowed(input)
88    } else {
89        let mut found = [0u8; 4];
90        found.copy_from_slice(&input[0..4]);
91        Err(DecodeError::InvalidMagic { found })
92    }
93}
94
95/// Decodes an Edit with zero-copy borrowing from the input.
96fn decode_edit_borrowed(input: &[u8]) -> Result<Edit<'_>, DecodeError> {
97    let mut reader = Reader::new(input);
98
99    // Skip magic (already validated)
100    reader.read_bytes(4, "magic")?;
101
102    // Version
103    let version = reader.read_byte("version")?;
104    if version < MIN_FORMAT_VERSION || version > FORMAT_VERSION {
105        return Err(DecodeError::UnsupportedVersion { version });
106    }
107
108    // Header
109    let edit_id = reader.read_id("edit_id")?;
110    let name = Cow::Borrowed(reader.read_str(MAX_STRING_LEN, "name")?);
111    let authors = reader.read_id_vec(MAX_AUTHORS, "authors")?;
112    let created_at = reader.read_signed_varint("created_at")?;
113
114    // Schema dictionaries (with duplicate detection)
115    let property_count = reader.read_varint("property_count")? as usize;
116    if property_count > MAX_DICT_SIZE {
117        return Err(DecodeError::LengthExceedsLimit {
118            field: "properties",
119            len: property_count,
120            max: MAX_DICT_SIZE,
121        });
122    }
123    let mut properties = Vec::with_capacity(property_count);
124    let mut seen_props = FxHashSet::with_capacity_and_hasher(property_count, Default::default());
125    for _ in 0..property_count {
126        let id = reader.read_id("property_id")?;
127        if !seen_props.insert(id) {
128            return Err(DecodeError::DuplicateDictionaryEntry { dict: "properties", id });
129        }
130        let dt_byte = reader.read_byte("data_type")?;
131        let data_type = DataType::from_u8(dt_byte)
132            .ok_or(DecodeError::InvalidDataType { data_type: dt_byte })?;
133        properties.push((id, data_type));
134    }
135
136    let relation_types = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "relation_types")?;
137    let languages = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "languages")?;
138    let units = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "units")?;
139    let objects = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "objects")?;
140
141    let dicts = WireDictionaries {
142        properties,
143        relation_types,
144        languages,
145        units,
146        objects,
147    };
148
149    // Operations
150    let op_count = reader.read_varint("op_count")? as usize;
151    if op_count > MAX_OPS_PER_EDIT {
152        return Err(DecodeError::LengthExceedsLimit {
153            field: "ops",
154            len: op_count,
155            max: MAX_OPS_PER_EDIT,
156        });
157    }
158
159    let mut ops = Vec::with_capacity(op_count);
160    for _ in 0..op_count {
161        ops.push(decode_op(&mut reader, &dicts)?);
162    }
163
164    Ok(Edit {
165        id: edit_id,
166        name,
167        authors,
168        created_at,
169        ops,
170    })
171}
172
173/// Decodes an Edit with allocations (for decompressed data).
174fn decode_edit_owned(data: &[u8]) -> Result<Edit<'static>, DecodeError> {
175    let mut reader = Reader::new(data);
176
177    // Skip magic (already validated in decompress)
178    reader.read_bytes(4, "magic")?;
179
180    // Version
181    let version = reader.read_byte("version")?;
182    if version < MIN_FORMAT_VERSION || version > FORMAT_VERSION {
183        return Err(DecodeError::UnsupportedVersion { version });
184    }
185
186    // Header - use allocating reads
187    let edit_id = reader.read_id("edit_id")?;
188    let name = Cow::Owned(reader.read_string(MAX_STRING_LEN, "name")?);
189    let authors = reader.read_id_vec(MAX_AUTHORS, "authors")?;
190    let created_at = reader.read_signed_varint("created_at")?;
191
192    // Schema dictionaries (with duplicate detection)
193    let property_count = reader.read_varint("property_count")? as usize;
194    if property_count > MAX_DICT_SIZE {
195        return Err(DecodeError::LengthExceedsLimit {
196            field: "properties",
197            len: property_count,
198            max: MAX_DICT_SIZE,
199        });
200    }
201    let mut properties = Vec::with_capacity(property_count);
202    let mut seen_props = FxHashSet::with_capacity_and_hasher(property_count, Default::default());
203    for _ in 0..property_count {
204        let id = reader.read_id("property_id")?;
205        if !seen_props.insert(id) {
206            return Err(DecodeError::DuplicateDictionaryEntry { dict: "properties", id });
207        }
208        let dt_byte = reader.read_byte("data_type")?;
209        let data_type = DataType::from_u8(dt_byte)
210            .ok_or(DecodeError::InvalidDataType { data_type: dt_byte })?;
211        properties.push((id, data_type));
212    }
213
214    let relation_types = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "relation_types")?;
215    let languages = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "languages")?;
216    let units = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "units")?;
217    let objects = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "objects")?;
218
219    let dicts = WireDictionaries {
220        properties,
221        relation_types,
222        languages,
223        units,
224        objects,
225    };
226
227    // Operations - use allocating decode
228    let op_count = reader.read_varint("op_count")? as usize;
229    if op_count > MAX_OPS_PER_EDIT {
230        return Err(DecodeError::LengthExceedsLimit {
231            field: "ops",
232            len: op_count,
233            max: MAX_OPS_PER_EDIT,
234        });
235    }
236
237    let mut ops = Vec::with_capacity(op_count);
238    for _ in 0..op_count {
239        ops.push(decode_op_owned(&mut reader, &dicts)?);
240    }
241
242    Ok(Edit {
243        id: edit_id,
244        name,
245        authors,
246        created_at,
247        ops,
248    })
249}
250
251/// Decodes an Op with allocations (for decompressed data).
252fn decode_op_owned(reader: &mut Reader<'_>, dicts: &WireDictionaries) -> Result<Op<'static>, DecodeError> {
253    // Decode normally, then convert to owned
254    let op = decode_op(reader, dicts)?;
255    Ok(op_to_owned(op))
256}
257
258/// Converts an Op with borrowed data to owned data.
259fn op_to_owned(op: Op<'_>) -> Op<'static> {
260    match op {
261        Op::CreateEntity(ce) => Op::CreateEntity(crate::model::CreateEntity {
262            id: ce.id,
263            values: ce.values.into_iter().map(pv_to_owned).collect(),
264        }),
265        Op::UpdateEntity(ue) => Op::UpdateEntity(crate::model::UpdateEntity {
266            id: ue.id,
267            set_properties: ue.set_properties.into_iter().map(pv_to_owned).collect(),
268            unset_values: ue.unset_values,
269        }),
270        Op::DeleteEntity(de) => Op::DeleteEntity(de),
271        Op::RestoreEntity(re) => Op::RestoreEntity(re),
272        Op::CreateRelation(cr) => Op::CreateRelation(crate::model::CreateRelation {
273            id: cr.id,
274            relation_type: cr.relation_type,
275            from: cr.from,
276            from_is_value_ref: cr.from_is_value_ref,
277            to: cr.to,
278            to_is_value_ref: cr.to_is_value_ref,
279            entity: cr.entity,
280            position: cr.position.map(|p| Cow::Owned(p.into_owned())),
281            from_space: cr.from_space,
282            from_version: cr.from_version,
283            to_space: cr.to_space,
284            to_version: cr.to_version,
285        }),
286        Op::UpdateRelation(ur) => Op::UpdateRelation(crate::model::UpdateRelation {
287            id: ur.id,
288            from_space: ur.from_space,
289            from_version: ur.from_version,
290            to_space: ur.to_space,
291            to_version: ur.to_version,
292            position: ur.position.map(|p| Cow::Owned(p.into_owned())),
293            unset: ur.unset,
294        }),
295        Op::DeleteRelation(dr) => Op::DeleteRelation(dr),
296        Op::RestoreRelation(rr) => Op::RestoreRelation(rr),
297        Op::CreateValueRef(cvr) => Op::CreateValueRef(cvr),
298    }
299}
300
301/// Converts a PropertyValue with borrowed data to owned data.
302fn pv_to_owned(pv: crate::model::PropertyValue<'_>) -> crate::model::PropertyValue<'static> {
303    crate::model::PropertyValue {
304        property: pv.property,
305        value: value_to_owned(pv.value),
306    }
307}
308
309/// Converts a Value with borrowed data to owned data.
310fn value_to_owned(v: crate::model::Value<'_>) -> crate::model::Value<'static> {
311    use crate::model::{DecimalMantissa, Value};
312    match v {
313        Value::Bool(b) => Value::Bool(b),
314        Value::Int64 { value, unit } => Value::Int64 { value, unit },
315        Value::Float64 { value, unit } => Value::Float64 { value, unit },
316        Value::Decimal { exponent, mantissa, unit } => Value::Decimal {
317            exponent,
318            mantissa: match mantissa {
319                DecimalMantissa::I64(i) => DecimalMantissa::I64(i),
320                DecimalMantissa::Big(b) => DecimalMantissa::Big(Cow::Owned(b.into_owned())),
321            },
322            unit,
323        },
324        Value::Text { value, language } => Value::Text {
325            value: Cow::Owned(value.into_owned()),
326            language,
327        },
328        Value::Bytes(b) => Value::Bytes(Cow::Owned(b.into_owned())),
329        Value::Date(s) => Value::Date(Cow::Owned(s.into_owned())),
330        Value::Time(s) => Value::Time(Cow::Owned(s.into_owned())),
331        Value::Datetime(s) => Value::Datetime(Cow::Owned(s.into_owned())),
332        Value::Schedule(s) => Value::Schedule(Cow::Owned(s.into_owned())),
333        Value::Point { lon, lat, alt } => Value::Point { lon, lat, alt },
334        Value::Embedding { sub_type, dims, data } => Value::Embedding {
335            sub_type,
336            dims,
337            data: Cow::Owned(data.into_owned()),
338        },
339    }
340}
341
342/// Reads an ID vector and checks for duplicates.
343fn read_id_vec_no_duplicates(
344    reader: &mut Reader<'_>,
345    max_len: usize,
346    field: &'static str,
347) -> Result<Vec<Id>, DecodeError> {
348    let count = reader.read_varint(field)? as usize;
349    if count > max_len {
350        return Err(DecodeError::LengthExceedsLimit {
351            field,
352            len: count,
353            max: max_len,
354        });
355    }
356
357    let mut ids = Vec::with_capacity(count);
358    let mut seen = FxHashSet::with_capacity_and_hasher(count, Default::default());
359
360    for _ in 0..count {
361        let id = reader.read_id(field)?;
362        if !seen.insert(id) {
363            return Err(DecodeError::DuplicateDictionaryEntry { dict: field, id });
364        }
365        ids.push(id);
366    }
367
368    Ok(ids)
369}
370
371fn decompress_zstd(compressed: &[u8]) -> Result<Vec<u8>, DecodeError> {
372    // Read uncompressed size
373    let mut reader = Reader::new(compressed);
374    let declared_size = reader.read_varint("uncompressed_size")? as usize;
375
376    if declared_size > MAX_EDIT_SIZE {
377        return Err(DecodeError::LengthExceedsLimit {
378            field: "uncompressed_size",
379            len: declared_size,
380            max: MAX_EDIT_SIZE,
381        });
382    }
383
384    let compressed_data = reader.remaining();
385
386    let mut decoder = zstd::Decoder::new(compressed_data)
387        .map_err(|e| DecodeError::DecompressionFailed(e.to_string()))?;
388
389    let mut decompressed = Vec::with_capacity(declared_size);
390    decoder
391        .read_to_end(&mut decompressed)
392        .map_err(|e| DecodeError::DecompressionFailed(e.to_string()))?;
393
394    if decompressed.len() != declared_size {
395        return Err(DecodeError::UncompressedSizeMismatch {
396            declared: declared_size,
397            actual: decompressed.len(),
398        });
399    }
400
401    Ok(decompressed)
402}
403
404// =============================================================================
405// ENCODING
406// =============================================================================
407
/// Settings that control how an edit is encoded.
#[derive(Debug, Clone, Copy, Default)]
pub struct EncodeOptions {
    /// Selects canonical encoding.
    ///
    /// With this flag set:
    /// - Dictionary entries are written sorted by ID bytes (lexicographic)
    /// - The same logical edit always produces the same bytes
    ///
    /// Turn it on when:
    /// - Hashing edit content for deduplication
    /// - Signing edit content
    /// - Byte-for-byte reproducibility across implementations is required
    ///
    /// Note: canonical encoding walks the ops twice and is slower than the
    /// default non-canonical encoding.
    pub canonical: bool,
}

impl EncodeOptions {
    /// Returns the default options (canonical mode off).
    pub fn new() -> Self {
        Self { canonical: false }
    }

    /// Returns options with canonical mode switched on.
    pub fn canonical() -> Self {
        EncodeOptions { canonical: true }
    }
}
438
439/// Encodes an Edit to binary format (uncompressed).
440///
441/// Uses single-pass encoding: ops are encoded to a buffer while building
442/// dictionaries, then the final output is assembled.
443pub fn encode_edit(edit: &Edit) -> Result<Vec<u8>, EncodeError> {
444    encode_edit_with_options(edit, EncodeOptions::default())
445}
446
447/// Encodes an Edit to binary format with the given options.
448pub fn encode_edit_with_options(edit: &Edit, options: EncodeOptions) -> Result<Vec<u8>, EncodeError> {
449    if options.canonical {
450        encode_edit_canonical(edit)
451    } else {
452        encode_edit_fast(edit)
453    }
454}
455
456/// Fast single-pass encoding (non-canonical).
457fn encode_edit_fast(edit: &Edit) -> Result<Vec<u8>, EncodeError> {
458    // Property types are determined from values themselves (per-edit typing)
459    let property_types = rustc_hash::FxHashMap::default();
460
461    // Single pass: encode ops while building dictionaries
462    let mut dict_builder = DictionaryBuilder::with_capacity(edit.ops.len());
463    let mut ops_writer = Writer::with_capacity(edit.ops.len() * 50);
464
465    for op in &edit.ops {
466        encode_op(&mut ops_writer, op, &mut dict_builder, &property_types)?;
467    }
468
469    // Now assemble final output: header + dictionaries + ops
470    let ops_bytes = ops_writer.into_bytes();
471    let mut writer = Writer::with_capacity(256 + ops_bytes.len());
472
473    // Magic and version
474    writer.write_bytes(MAGIC_UNCOMPRESSED);
475    writer.write_byte(FORMAT_VERSION);
476
477    // Header
478    writer.write_id(&edit.id);
479    writer.write_string(&edit.name);
480    writer.write_id_vec(&edit.authors);
481    writer.write_signed_varint(edit.created_at);
482
483    // Dictionaries
484    dict_builder.write_dictionaries(&mut writer);
485
486    // Operations (already encoded)
487    writer.write_varint(edit.ops.len() as u64);
488    writer.write_bytes(&ops_bytes);
489
490    Ok(writer.into_bytes())
491}
492
493/// Canonical two-pass encoding with sorted dictionaries, authors, values, and unsets.
494///
495/// Pass 1: Collect all dictionary entries
496/// Pass 2: Sort dictionaries, encode with stable indices and sorted values
497///
498/// Canonical mode requirements (spec Section 4.4):
499/// - Dictionaries sorted by ID bytes
500/// - Authors sorted by ID bytes, no duplicates
501/// - Values sorted by (propertyRef, languageRef), no duplicate (property, language)
502/// - Unset values sorted by (propertyRef, language), no duplicates
503fn encode_edit_canonical(edit: &Edit) -> Result<Vec<u8>, EncodeError> {
504    // Property types are determined from values themselves (per-edit typing)
505    let property_types = rustc_hash::FxHashMap::default();
506
507    // Pass 1: Collect all dictionary entries by doing a dry run
508    let mut dict_builder = DictionaryBuilder::with_capacity(edit.ops.len());
509    let mut temp_writer = Writer::with_capacity(edit.ops.len() * 50);
510    for op in &edit.ops {
511        encode_op(&mut temp_writer, op, &mut dict_builder, &property_types)?;
512    }
513
514    // Sort dictionaries and get sorted builder
515    let sorted_builder = dict_builder.into_sorted();
516
517    // Sort authors by ID bytes and check for duplicates
518    let mut sorted_authors = edit.authors.clone();
519    sorted_authors.sort();
520    // Check for duplicate authors
521    for i in 1..sorted_authors.len() {
522        if sorted_authors[i] == sorted_authors[i - 1] {
523            return Err(EncodeError::DuplicateAuthor { id: sorted_authors[i] });
524        }
525    }
526
527    // Pass 2: Encode ops with sorted dictionary indices and sorted values
528    let mut ops_writer = Writer::with_capacity(edit.ops.len() * 50);
529    let mut canonical_builder = sorted_builder.clone();
530    for op in &edit.ops {
531        encode_op_canonical(&mut ops_writer, op, &mut canonical_builder, &property_types)?;
532    }
533
534    // Assemble final output: header + dictionaries + ops
535    let ops_bytes = ops_writer.into_bytes();
536    let mut writer = Writer::with_capacity(256 + ops_bytes.len());
537
538    // Magic and version
539    writer.write_bytes(MAGIC_UNCOMPRESSED);
540    writer.write_byte(FORMAT_VERSION);
541
542    // Header
543    writer.write_id(&edit.id);
544    writer.write_string(&edit.name);
545    writer.write_id_vec(&sorted_authors);
546    writer.write_signed_varint(edit.created_at);
547
548    // Dictionaries (sorted)
549    sorted_builder.write_dictionaries(&mut writer);
550
551    // Operations
552    writer.write_varint(edit.ops.len() as u64);
553    writer.write_bytes(&ops_bytes);
554
555    Ok(writer.into_bytes())
556}
557
558/// Encodes an op in canonical mode with sorted values.
559fn encode_op_canonical(
560    writer: &mut Writer,
561    op: &Op<'_>,
562    dict_builder: &mut DictionaryBuilder,
563    property_types: &FxHashMap<Id, DataType>,
564) -> Result<(), EncodeError> {
565    match op {
566        Op::CreateEntity(ce) => {
567            // Sort values by (property_index, language_index) and check for duplicates
568            let sorted_values = sort_and_check_values(&ce.values, dict_builder)?;
569
570            writer.write_byte(1); // OP_CREATE_ENTITY
571            writer.write_id(&ce.id);
572            writer.write_varint(sorted_values.len() as u64);
573
574            for pv in &sorted_values {
575                let data_type = property_types.get(&pv.property)
576                    .copied()
577                    .unwrap_or_else(|| pv.value.data_type());
578                encode_property_value_canonical(writer, pv, dict_builder, data_type)?;
579            }
580            Ok(())
581        }
582        Op::UpdateEntity(ue) => {
583            // Sort set_properties and unset_values, check for duplicates
584            let sorted_set = sort_and_check_values(&ue.set_properties, dict_builder)?;
585            let sorted_unset = sort_and_check_unsets(&ue.unset_values, dict_builder)?;
586
587            writer.write_byte(2); // OP_UPDATE_ENTITY
588            let id_index = dict_builder.add_object(ue.id);
589            writer.write_varint(id_index as u64);
590
591            let mut flags = 0u8;
592            if !sorted_set.is_empty() {
593                flags |= 0x01; // FLAG_HAS_SET_PROPERTIES
594            }
595            if !sorted_unset.is_empty() {
596                flags |= 0x02; // FLAG_HAS_UNSET_VALUES
597            }
598            writer.write_byte(flags);
599
600            if !sorted_set.is_empty() {
601                writer.write_varint(sorted_set.len() as u64);
602                for pv in &sorted_set {
603                    let data_type = property_types.get(&pv.property)
604                        .copied()
605                        .unwrap_or_else(|| pv.value.data_type());
606                    encode_property_value_canonical(writer, pv, dict_builder, data_type)?;
607                }
608            }
609
610            if !sorted_unset.is_empty() {
611                use crate::model::UnsetLanguage;
612                writer.write_varint(sorted_unset.len() as u64);
613                for unset in &sorted_unset {
614                    let prop_idx = dict_builder.add_property(unset.property, DataType::Bool);
615                    writer.write_varint(prop_idx as u64);
616                    let lang_value: u32 = match &unset.language {
617                        UnsetLanguage::All => 0xFFFFFFFF,
618                        UnsetLanguage::English => 0,
619                        UnsetLanguage::Specific(lang_id) => {
620                            dict_builder.add_language(Some(*lang_id)) as u32
621                        }
622                    };
623                    writer.write_varint(lang_value as u64);
624                }
625            }
626            Ok(())
627        }
628        // Other ops don't have values to sort, delegate to regular encode
629        _ => encode_op(writer, op, dict_builder, property_types),
630    }
631}
632
633/// Sorts values by (property_index, language_index) and checks for duplicates.
634fn sort_and_check_values<'a>(
635    values: &[crate::model::PropertyValue<'a>],
636    dict_builder: &DictionaryBuilder,
637) -> Result<Vec<crate::model::PropertyValue<'a>>, EncodeError> {
638    use crate::model::{PropertyValue, Value};
639
640    if values.is_empty() {
641        return Ok(Vec::new());
642    }
643
644    // Create (property_index, language_index, original_index) tuples for sorting
645    let mut indexed: Vec<(usize, usize, usize, &PropertyValue<'a>)> = values
646        .iter()
647        .enumerate()
648        .map(|(i, pv)| {
649            let prop_idx = dict_builder.get_property_index(&pv.property).unwrap_or(0);
650            let lang_idx = match &pv.value {
651                Value::Text { language, .. } => dict_builder.get_language_index(language.as_ref()).unwrap_or(0),
652                _ => 0,
653            };
654            (prop_idx, lang_idx, i, pv)
655        })
656        .collect();
657
658    // Sort by (property_index, language_index)
659    indexed.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
660
661    // Check for duplicates (adjacent entries with same property_index and language_index)
662    for i in 1..indexed.len() {
663        if indexed[i].0 == indexed[i - 1].0 && indexed[i].1 == indexed[i - 1].1 {
664            let pv = indexed[i].3;
665            let language = match &pv.value {
666                Value::Text { language, .. } => *language,
667                _ => None,
668            };
669            return Err(EncodeError::DuplicateValue {
670                property: pv.property,
671                language,
672            });
673        }
674    }
675
676    // Return cloned values in sorted order
677    Ok(indexed.into_iter().map(|(_, _, _, pv)| pv.clone()).collect())
678}
679
680/// Sorts unset values by (property_index, language) and checks for duplicates.
681fn sort_and_check_unsets(
682    unsets: &[crate::model::UnsetValue],
683    dict_builder: &DictionaryBuilder,
684) -> Result<Vec<crate::model::UnsetValue>, EncodeError> {
685    use crate::model::UnsetLanguage;
686
687    if unsets.is_empty() {
688        return Ok(Vec::new());
689    }
690
691    // Create (property_index, language_sort_key, original_index) tuples for sorting
692    let mut indexed: Vec<(usize, u32, usize, &crate::model::UnsetValue)> = unsets
693        .iter()
694        .enumerate()
695        .map(|(i, up)| {
696            let prop_idx = dict_builder.get_property_index(&up.property).unwrap_or(0);
697            let lang_key: u32 = match &up.language {
698                UnsetLanguage::All => 0xFFFFFFFF,
699                UnsetLanguage::English => 0,
700                UnsetLanguage::Specific(lang_id) => {
701                    dict_builder.get_language_index(Some(lang_id)).unwrap_or(0) as u32
702                }
703            };
704            (prop_idx, lang_key, i, up)
705        })
706        .collect();
707
708    // Sort by (property_index, language_key)
709    indexed.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
710
711    // Check for duplicates
712    for i in 1..indexed.len() {
713        if indexed[i].0 == indexed[i - 1].0 && indexed[i].1 == indexed[i - 1].1 {
714            let up = indexed[i].3;
715            let language = match &up.language {
716                UnsetLanguage::All => None,
717                UnsetLanguage::English => None,
718                UnsetLanguage::Specific(id) => Some(*id),
719            };
720            return Err(EncodeError::DuplicateUnset {
721                property: up.property,
722                language,
723            });
724        }
725    }
726
727    Ok(indexed.into_iter().map(|(_, _, _, up)| up.clone()).collect())
728}
729
730/// Encodes a property value in canonical mode (same as regular but separated for clarity).
731fn encode_property_value_canonical(
732    writer: &mut Writer,
733    pv: &crate::model::PropertyValue<'_>,
734    dict_builder: &mut DictionaryBuilder,
735    data_type: DataType,
736) -> Result<(), EncodeError> {
737    let prop_index = dict_builder.add_property(pv.property, data_type);
738    writer.write_varint(prop_index as u64);
739    crate::codec::value::encode_value(writer, &pv.value, dict_builder)?;
740    Ok(())
741}
742
743/// Encodes an Edit with profiling output (two-pass for comparison).
744pub fn encode_edit_profiled(edit: &Edit, profile: bool) -> Result<Vec<u8>, EncodeError> {
745    if !profile {
746        return encode_edit(edit);
747    }
748
749    use std::time::Instant;
750
751    let t0 = Instant::now();
752
753    // Property types are determined from values themselves (per-edit typing)
754    let property_types = rustc_hash::FxHashMap::default();
755    let t1 = Instant::now();
756
757    // Single pass: encode ops while building dictionaries
758    let mut dict_builder = DictionaryBuilder::with_capacity(edit.ops.len());
759    let mut ops_writer = Writer::with_capacity(edit.ops.len() * 50);
760
761    for op in &edit.ops {
762        encode_op(&mut ops_writer, op, &mut dict_builder, &property_types)?;
763    }
764    let t2 = Instant::now();
765
766    // Assemble final output
767    let ops_bytes = ops_writer.into_bytes();
768    let mut writer = Writer::with_capacity(256 + ops_bytes.len());
769
770    writer.write_bytes(MAGIC_UNCOMPRESSED);
771    writer.write_byte(FORMAT_VERSION);
772    writer.write_id(&edit.id);
773    writer.write_string(&edit.name);
774    writer.write_id_vec(&edit.authors);
775    writer.write_signed_varint(edit.created_at);
776    dict_builder.write_dictionaries(&mut writer);
777    writer.write_varint(edit.ops.len() as u64);
778    writer.write_bytes(&ops_bytes);
779    let t3 = Instant::now();
780
781    let result = writer.into_bytes();
782
783    let total = t3.duration_since(t0);
784    eprintln!("=== Encode Profile (single-pass) ===");
785    eprintln!("  setup: {:?} ({:.1}%)", t1.duration_since(t0), 100.0 * t1.duration_since(t0).as_secs_f64() / total.as_secs_f64());
786    eprintln!("  encode_ops + build_dicts: {:?} ({:.1}%)", t2.duration_since(t1), 100.0 * t2.duration_since(t1).as_secs_f64() / total.as_secs_f64());
787    eprintln!("  assemble output: {:?} ({:.1}%)", t3.duration_since(t2), 100.0 * t3.duration_since(t2).as_secs_f64() / total.as_secs_f64());
788    eprintln!("  TOTAL: {:?}", total);
789
790    Ok(result)
791}
792
793/// Encodes an Edit to binary format with zstd compression.
794pub fn encode_edit_compressed(edit: &Edit, level: i32) -> Result<Vec<u8>, EncodeError> {
795    encode_edit_compressed_with_options(edit, level, EncodeOptions::default())
796}
797
798/// Encodes an Edit to binary format with zstd compression and options.
799pub fn encode_edit_compressed_with_options(
800    edit: &Edit,
801    level: i32,
802    options: EncodeOptions,
803) -> Result<Vec<u8>, EncodeError> {
804    let uncompressed = encode_edit_with_options(edit, options)?;
805
806    let compressed = zstd::encode_all(uncompressed.as_slice(), level)
807        .map_err(|e| EncodeError::CompressionFailed(e.to_string()))?;
808
809    let mut writer = Writer::with_capacity(5 + 10 + compressed.len());
810    writer.write_bytes(MAGIC_COMPRESSED);
811    writer.write_varint(uncompressed.len() as u64);
812    writer.write_bytes(&compressed);
813
814    Ok(writer.into_bytes())
815}
816
#[cfg(test)]
mod tests {
    //! Roundtrip, framing and canonical-mode tests for the edit codec.

    use super::*;
    use crate::model::{CreateEntity, PropertyValue, Value};

    // -------------------------------------------------------------------------
    // Fixture helpers
    // -------------------------------------------------------------------------

    /// Builds a text value for `property`, optionally tagged with a language ID.
    fn text_value(property: Id, text: &str, language: Option<Id>) -> PropertyValue<'static> {
        PropertyValue {
            property,
            value: Value::Text {
                value: Cow::Owned(text.to_string()),
                language,
            },
        }
    }

    /// Builds a unit-less `Int64` value for `property`.
    fn int_value(property: Id, value: i64) -> PropertyValue<'static> {
        PropertyValue {
            property,
            value: Value::Int64 { value, unit: None },
        }
    }

    /// Wraps `values` in a `CreateEntity` op for the entity `id`.
    fn create_entity(id: Id, values: Vec<PropertyValue<'static>>) -> Op<'static> {
        Op::CreateEntity(CreateEntity { id, values })
    }

    /// Builds an edit named "Test" with `created_at == 0`.
    fn named_test_edit(id: Id, authors: Vec<Id>, ops: Vec<Op<'static>>) -> Edit<'static> {
        Edit {
            id,
            name: Cow::Owned("Test".to_string()),
            authors,
            created_at: 0,
            ops,
        }
    }

    /// Builds the small single-op edit shared by the roundtrip tests.
    fn make_test_edit() -> Edit<'static> {
        Edit {
            id: [1u8; 16],
            name: Cow::Owned("Test Edit".to_string()),
            authors: vec![[2u8; 16]],
            created_at: 1234567890,
            ops: vec![create_entity(
                [3u8; 16],
                vec![text_value([10u8; 16], "Hello", None)],
            )],
        }
    }

    /// Asserts that `decoded` carries the same id, name, authors, timestamp and
    /// op count as `original`.
    ///
    /// `#[track_caller]` makes a failure point at the calling test rather than
    /// at this helper.
    #[track_caller]
    fn assert_edit_matches(original: &Edit<'_>, decoded: &Edit<'_>) {
        assert_eq!(original.id, decoded.id);
        assert_eq!(original.name, decoded.name);
        assert_eq!(original.authors, decoded.authors);
        assert_eq!(original.created_at, decoded.created_at);
        assert_eq!(original.ops.len(), decoded.ops.len());
    }

    // -------------------------------------------------------------------------
    // Roundtrip and framing
    // -------------------------------------------------------------------------

    #[test]
    fn test_edit_roundtrip() {
        let edit = make_test_edit();

        let encoded = encode_edit(&edit).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_edit_matches(&edit, &decoded);
    }

    #[test]
    fn test_edit_compressed_roundtrip() {
        let edit = make_test_edit();

        let encoded = encode_edit_compressed(&edit, 3).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_edit_matches(&edit, &decoded);
    }

    #[test]
    fn test_compression_magic() {
        let edit = make_test_edit();

        let uncompressed = encode_edit(&edit).unwrap();
        let compressed = encode_edit_compressed(&edit, 3).unwrap();

        assert_eq!(&uncompressed[0..4], b"GRC2");
        assert_eq!(&compressed[0..5], b"GRC2Z");
    }

    #[test]
    fn test_invalid_magic() {
        let data = b"XXXX";
        let result = decode_edit(data);
        assert!(matches!(result, Err(DecodeError::InvalidMagic { .. })));
    }

    #[test]
    fn test_unsupported_version() {
        let mut data = Vec::new();
        data.extend_from_slice(MAGIC_UNCOMPRESSED);
        data.push(99); // Invalid version
        // Pad so the decoder does not hit EOF before it checks the version.
        data.extend_from_slice(&[0u8; 100]);

        let result = decode_edit(&data);
        assert!(matches!(
            result,
            Err(DecodeError::UnsupportedVersion { version: 99 })
        ));
    }

    #[test]
    fn test_empty_edit() {
        let edit: Edit<'static> = Edit {
            id: [0u8; 16],
            name: Cow::Borrowed(""),
            authors: vec![],
            created_at: 0,
            ops: vec![],
        };

        let encoded = encode_edit(&edit).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_eq!(edit.id, decoded.id);
        assert!(decoded.name.is_empty());
        assert!(decoded.authors.is_empty());
        assert!(decoded.ops.is_empty());
    }

    // -------------------------------------------------------------------------
    // Canonical mode
    // -------------------------------------------------------------------------

    #[test]
    fn test_canonical_encoding_deterministic() {
        // Number of leading bytes compared between the two canonical encodings.
        // It covers the header and the start of the dictionaries.
        const PREFIX_LEN: usize = 50;

        let prop_a = [0x0A; 16]; // Comes first lexicographically
        let prop_b = [0x0B; 16]; // Comes second
        let entity = [3u8; 16];

        // Edit 1: values in order A, B.
        let edit1 = named_test_edit(
            [1u8; 16],
            vec![],
            vec![create_entity(
                entity,
                vec![text_value(prop_a, "Hello", None), int_value(prop_b, 42)],
            )],
        );

        // Edit 2: same content, but prop_b is inserted first.
        let edit2 = named_test_edit(
            [1u8; 16],
            vec![],
            vec![create_entity(
                entity,
                vec![int_value(prop_b, 42), text_value(prop_a, "Hello", None)],
            )],
        );

        // Non-canonical encodings may differ byte-for-byte because dictionary
        // order can follow insertion order, so they are only required to decode.
        let fast1 = encode_edit_with_options(&edit1, EncodeOptions::new()).unwrap();
        let fast2 = encode_edit_with_options(&edit2, EncodeOptions::new()).unwrap();
        assert_eq!(decode_edit(&fast1).unwrap().id, edit1.id);
        assert_eq!(decode_edit(&fast2).unwrap().id, edit2.id);

        let canonical1 = encode_edit_with_options(&edit1, EncodeOptions::canonical()).unwrap();
        let canonical2 = encode_edit_with_options(&edit2, EncodeOptions::canonical()).unwrap();

        // Both canonical encodings should decode correctly.
        let decoded1 = decode_edit(&canonical1).unwrap();
        let decoded2 = decode_edit(&canonical2).unwrap();
        assert_eq!(decoded1.id, edit1.id);
        assert_eq!(decoded2.id, edit2.id);

        // Fail with a clear message instead of a slice-index panic if the
        // encodings are ever shorter than the compared prefix.
        assert!(
            canonical1.len() >= PREFIX_LEN && canonical2.len() >= PREFIX_LEN,
            "canonical encodings are shorter than the {}-byte prefix under test ({} and {} bytes)",
            PREFIX_LEN,
            canonical1.len(),
            canonical2.len()
        );

        // The ops themselves may carry different value orders, but the header and
        // the start of the dictionaries must match because the dictionary is
        // sorted by ID.
        assert_eq!(
            &canonical1[..PREFIX_LEN],
            &canonical2[..PREFIX_LEN],
            "Canonical encoding should produce identical dictionary bytes"
        );
    }

    #[test]
    fn test_canonical_encoding_roundtrip() {
        let edit = make_test_edit();

        let encoded = encode_edit_with_options(&edit, EncodeOptions::canonical()).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_edit_matches(&edit, &decoded);
    }

    #[test]
    fn test_canonical_encoding_compressed() {
        let edit = make_test_edit();

        let encoded =
            encode_edit_compressed_with_options(&edit, 3, EncodeOptions::canonical()).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_eq!(edit.id, decoded.id);
        assert_eq!(edit.name, decoded.name);
    }

    #[test]
    fn test_canonical_rejects_duplicate_authors() {
        let author1 = [1u8; 16];

        // The same author is listed twice.
        let edit = named_test_edit([0u8; 16], vec![author1, author1], vec![]);

        // Fast mode doesn't check duplicates.
        let result = encode_edit_with_options(&edit, EncodeOptions::new());
        assert!(result.is_ok());

        // Canonical mode rejects duplicates.
        let result = encode_edit_with_options(&edit, EncodeOptions::canonical());
        assert!(matches!(result, Err(EncodeError::DuplicateAuthor { .. })));
    }

    #[test]
    fn test_canonical_rejects_duplicate_values() {
        let prop = [10u8; 16];

        // Two text values for the same property, both without a language.
        let edit = named_test_edit(
            [0u8; 16],
            vec![],
            vec![create_entity(
                [1u8; 16],
                vec![
                    text_value(prop, "First", None),
                    text_value(prop, "Second", None),
                ],
            )],
        );

        // Canonical mode rejects duplicate (property, language) pairs.
        let result = encode_edit_with_options(&edit, EncodeOptions::canonical());
        assert!(matches!(result, Err(EncodeError::DuplicateValue { .. })));
    }

    #[test]
    fn test_canonical_allows_different_languages() {
        let prop = [10u8; 16];
        let lang_en = [20u8; 16];
        let lang_es = [21u8; 16];

        let edit = named_test_edit(
            [0u8; 16],
            vec![],
            vec![create_entity(
                [1u8; 16],
                vec![
                    text_value(prop, "Hello", Some(lang_en)),
                    text_value(prop, "Hola", Some(lang_es)),
                ],
            )],
        );

        // Different languages for the same property are allowed.
        let result = encode_edit_with_options(&edit, EncodeOptions::canonical());
        assert!(result.is_ok());
    }

    #[test]
    fn test_canonical_sorts_values_deterministically() {
        let prop_a = [0x0A; 16];
        let prop_b = [0x0B; 16];

        // Values supplied in reverse order (B before A).
        let edit = named_test_edit(
            [1u8; 16],
            vec![],
            vec![create_entity(
                [3u8; 16],
                vec![int_value(prop_b, 42), text_value(prop_a, "Hello", None)],
            )],
        );

        // Encoding the same edit twice should produce identical bytes.
        let encoded1 = encode_edit_with_options(&edit, EncodeOptions::canonical()).unwrap();
        let encoded2 = encode_edit_with_options(&edit, EncodeOptions::canonical()).unwrap();
        assert_eq!(encoded1, encoded2, "Canonical encoding should be deterministic");

        // Should roundtrip.
        let decoded = decode_edit(&encoded1).unwrap();
        assert_eq!(decoded.ops.len(), 1);
    }
}