grc_20/codec/
edit.rs

1//! Edit encoding/decoding for GRC-20 binary format.
2//!
3//! Implements the wire format for edits (spec Section 6.3).
4
5use std::borrow::Cow;
6use std::io::Read;
7
8use rustc_hash::{FxHashMap, FxHashSet};
9
10use crate::codec::op::{decode_op, encode_op};
11use crate::codec::primitives::{Reader, Writer};
12use crate::error::{DecodeError, EncodeError};
13use crate::limits::{
14    FORMAT_VERSION, MAGIC_COMPRESSED, MAGIC_UNCOMPRESSED, MAX_AUTHORS, MAX_DICT_SIZE,
15    MAX_EDIT_SIZE, MAX_OPS_PER_EDIT, MAX_STRING_LEN, MIN_FORMAT_VERSION,
16};
17use crate::model::{Context, ContextEdge, DataType, DictionaryBuilder, Edit, Id, Op, WireDictionaries};
18
19// =============================================================================
20// DECODING
21// =============================================================================
22
23/// Decompresses a GRC2Z compressed edit, returning the uncompressed bytes.
24///
25/// Use this with [`decode_edit`] for zero-copy decoding of compressed data:
26///
27/// ```ignore
28/// let uncompressed = decompress(&compressed_bytes)?;
29/// let edit = decode_edit(&uncompressed)?;  // zero-copy, borrows from uncompressed
30/// // edit is valid while uncompressed is alive
31/// ```
32pub fn decompress(input: &[u8]) -> Result<Vec<u8>, DecodeError> {
33    if input.len() < 5 {
34        return Err(DecodeError::UnexpectedEof { context: "magic" });
35    }
36    if &input[0..5] != MAGIC_COMPRESSED {
37        let mut found = [0u8; 4];
38        found.copy_from_slice(&input[0..4]);
39        return Err(DecodeError::InvalidMagic { found });
40    }
41    decompress_zstd(&input[5..])
42}
43
44/// Decodes an Edit from binary data with zero-copy borrowing.
45///
46/// Handles both compressed (GRC2Z) and uncompressed (GRC2) formats.
47/// For true zero-copy with compressed data, use [`decompress`] first:
48///
49/// ```ignore
50/// // Zero-copy for compressed data:
51/// let uncompressed = decompress(&compressed)?;
52/// let edit = decode_edit(&uncompressed)?;
53///
54/// // Zero-copy for uncompressed data:
55/// let edit = decode_edit(&uncompressed_bytes)?;
56/// ```
57///
58/// If you pass compressed data directly, it will decompress internally
59/// and allocate owned strings (no zero-copy benefit).
60pub fn decode_edit(input: &[u8]) -> Result<Edit<'_>, DecodeError> {
61    if input.len() < 4 {
62        return Err(DecodeError::UnexpectedEof { context: "magic" });
63    }
64
65    // Detect compression
66    if input.len() >= 5 && &input[0..5] == MAGIC_COMPRESSED {
67        // Compressed: decompress and decode with allocations
68        // (for zero-copy, caller should use decompress() first)
69        let decompressed = decompress_zstd(&input[5..])?;
70        if decompressed.len() > MAX_EDIT_SIZE {
71            return Err(DecodeError::LengthExceedsLimit {
72                field: "edit",
73                len: decompressed.len(),
74                max: MAX_EDIT_SIZE,
75            });
76        }
77        decode_edit_owned(&decompressed)
78    } else if &input[0..4] == MAGIC_UNCOMPRESSED {
79        // Uncompressed: decode with zero-copy borrowing
80        if input.len() > MAX_EDIT_SIZE {
81            return Err(DecodeError::LengthExceedsLimit {
82                field: "edit",
83                len: input.len(),
84                max: MAX_EDIT_SIZE,
85            });
86        }
87        decode_edit_borrowed(input)
88    } else {
89        let mut found = [0u8; 4];
90        found.copy_from_slice(&input[0..4]);
91        Err(DecodeError::InvalidMagic { found })
92    }
93}
94
95/// Decodes an Edit with zero-copy borrowing from the input.
96fn decode_edit_borrowed(input: &[u8]) -> Result<Edit<'_>, DecodeError> {
97    let mut reader = Reader::new(input);
98
99    // Skip magic (already validated)
100    reader.read_bytes(4, "magic")?;
101
102    // Version
103    let version = reader.read_byte("version")?;
104    if version < MIN_FORMAT_VERSION || version > FORMAT_VERSION {
105        return Err(DecodeError::UnsupportedVersion { version });
106    }
107
108    // Header
109    let edit_id = reader.read_id("edit_id")?;
110    let name = Cow::Borrowed(reader.read_str(MAX_STRING_LEN, "name")?);
111    let authors = reader.read_id_vec(MAX_AUTHORS, "authors")?;
112    let created_at = reader.read_signed_varint("created_at")?;
113
114    // Schema dictionaries (with duplicate detection)
115    let property_count = reader.read_varint("property_count")? as usize;
116    if property_count > MAX_DICT_SIZE {
117        return Err(DecodeError::LengthExceedsLimit {
118            field: "properties",
119            len: property_count,
120            max: MAX_DICT_SIZE,
121        });
122    }
123    let mut properties = Vec::with_capacity(property_count);
124    let mut seen_props = FxHashSet::with_capacity_and_hasher(property_count, Default::default());
125    for _ in 0..property_count {
126        let id = reader.read_id("property_id")?;
127        if !seen_props.insert(id) {
128            return Err(DecodeError::DuplicateDictionaryEntry { dict: "properties", id });
129        }
130        let dt_byte = reader.read_byte("data_type")?;
131        let data_type = DataType::from_u8(dt_byte)
132            .ok_or(DecodeError::InvalidDataType { data_type: dt_byte })?;
133        properties.push((id, data_type));
134    }
135
136    let relation_types = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "relation_types")?;
137    let languages = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "languages")?;
138    let units = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "units")?;
139    let objects = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "objects")?;
140    let context_ids = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "context_ids")?;
141
142    let mut dicts = WireDictionaries {
143        properties,
144        relation_types,
145        languages,
146        units,
147        objects,
148        context_ids,
149        contexts: Vec::new(),
150    };
151
152    // Contexts - decode and store in dicts for op decoding to resolve
153    let context_count = reader.read_varint("context_count")? as usize;
154    if context_count > MAX_DICT_SIZE {
155        return Err(DecodeError::LengthExceedsLimit {
156            field: "contexts",
157            len: context_count,
158            max: MAX_DICT_SIZE,
159        });
160    }
161    for _ in 0..context_count {
162        dicts.contexts.push(decode_context(&mut reader, &dicts)?);
163    }
164
165    // Operations
166    let op_count = reader.read_varint("op_count")? as usize;
167    if op_count > MAX_OPS_PER_EDIT {
168        return Err(DecodeError::LengthExceedsLimit {
169            field: "ops",
170            len: op_count,
171            max: MAX_OPS_PER_EDIT,
172        });
173    }
174
175    let mut ops = Vec::with_capacity(op_count);
176    for _ in 0..op_count {
177        ops.push(decode_op(&mut reader, &dicts)?);
178    }
179
180    Ok(Edit {
181        id: edit_id,
182        name,
183        authors,
184        created_at,
185        ops,
186    })
187}
188
189/// Decodes an Edit with allocations (for decompressed data).
190fn decode_edit_owned(data: &[u8]) -> Result<Edit<'static>, DecodeError> {
191    let mut reader = Reader::new(data);
192
193    // Skip magic (already validated in decompress)
194    reader.read_bytes(4, "magic")?;
195
196    // Version
197    let version = reader.read_byte("version")?;
198    if version < MIN_FORMAT_VERSION || version > FORMAT_VERSION {
199        return Err(DecodeError::UnsupportedVersion { version });
200    }
201
202    // Header - use allocating reads
203    let edit_id = reader.read_id("edit_id")?;
204    let name = Cow::Owned(reader.read_string(MAX_STRING_LEN, "name")?);
205    let authors = reader.read_id_vec(MAX_AUTHORS, "authors")?;
206    let created_at = reader.read_signed_varint("created_at")?;
207
208    // Schema dictionaries (with duplicate detection)
209    let property_count = reader.read_varint("property_count")? as usize;
210    if property_count > MAX_DICT_SIZE {
211        return Err(DecodeError::LengthExceedsLimit {
212            field: "properties",
213            len: property_count,
214            max: MAX_DICT_SIZE,
215        });
216    }
217    let mut properties = Vec::with_capacity(property_count);
218    let mut seen_props = FxHashSet::with_capacity_and_hasher(property_count, Default::default());
219    for _ in 0..property_count {
220        let id = reader.read_id("property_id")?;
221        if !seen_props.insert(id) {
222            return Err(DecodeError::DuplicateDictionaryEntry { dict: "properties", id });
223        }
224        let dt_byte = reader.read_byte("data_type")?;
225        let data_type = DataType::from_u8(dt_byte)
226            .ok_or(DecodeError::InvalidDataType { data_type: dt_byte })?;
227        properties.push((id, data_type));
228    }
229
230    let relation_types = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "relation_types")?;
231    let languages = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "languages")?;
232    let units = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "units")?;
233    let objects = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "objects")?;
234    let context_ids = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "context_ids")?;
235
236    let mut dicts = WireDictionaries {
237        properties,
238        relation_types,
239        languages,
240        units,
241        objects,
242        context_ids,
243        contexts: Vec::new(),
244    };
245
246    // Contexts - decode and store in dicts for op decoding to resolve
247    let context_count = reader.read_varint("context_count")? as usize;
248    if context_count > MAX_DICT_SIZE {
249        return Err(DecodeError::LengthExceedsLimit {
250            field: "contexts",
251            len: context_count,
252            max: MAX_DICT_SIZE,
253        });
254    }
255    for _ in 0..context_count {
256        dicts.contexts.push(decode_context(&mut reader, &dicts)?);
257    }
258
259    // Operations - use allocating decode
260    let op_count = reader.read_varint("op_count")? as usize;
261    if op_count > MAX_OPS_PER_EDIT {
262        return Err(DecodeError::LengthExceedsLimit {
263            field: "ops",
264            len: op_count,
265            max: MAX_OPS_PER_EDIT,
266        });
267    }
268
269    let mut ops = Vec::with_capacity(op_count);
270    for _ in 0..op_count {
271        ops.push(decode_op_owned(&mut reader, &dicts)?);
272    }
273
274    Ok(Edit {
275        id: edit_id,
276        name,
277        authors,
278        created_at,
279        ops,
280    })
281}
282
283/// Decodes an Op with allocations (for decompressed data).
284fn decode_op_owned(reader: &mut Reader<'_>, dicts: &WireDictionaries) -> Result<Op<'static>, DecodeError> {
285    // Decode normally, then convert to owned
286    let op = decode_op(reader, dicts)?;
287    Ok(op_to_owned(op))
288}
289
290/// Decodes a Context from the reader.
291fn decode_context(reader: &mut Reader<'_>, dicts: &WireDictionaries) -> Result<Context, DecodeError> {
292    let root_id_index = reader.read_varint("root_id")? as usize;
293    if root_id_index >= dicts.context_ids.len() {
294        return Err(DecodeError::IndexOutOfBounds {
295            dict: "context_ids",
296            index: root_id_index,
297            size: dicts.context_ids.len(),
298        });
299    }
300    let root_id = dicts.context_ids[root_id_index];
301
302    let edge_count = reader.read_varint("edge_count")? as usize;
303    if edge_count > MAX_DICT_SIZE {
304        return Err(DecodeError::LengthExceedsLimit {
305            field: "context_edges",
306            len: edge_count,
307            max: MAX_DICT_SIZE,
308        });
309    }
310
311    let mut edges = Vec::with_capacity(edge_count);
312    for _ in 0..edge_count {
313        let type_id_index = reader.read_varint("edge_type_id")? as usize;
314        if type_id_index >= dicts.relation_types.len() {
315            return Err(DecodeError::IndexOutOfBounds {
316                dict: "relation_types",
317                index: type_id_index,
318                size: dicts.relation_types.len(),
319            });
320        }
321        let type_id = dicts.relation_types[type_id_index];
322
323        let to_entity_id_index = reader.read_varint("edge_to_entity_id")? as usize;
324        if to_entity_id_index >= dicts.context_ids.len() {
325            return Err(DecodeError::IndexOutOfBounds {
326                dict: "context_ids",
327                index: to_entity_id_index,
328                size: dicts.context_ids.len(),
329            });
330        }
331        let to_entity_id = dicts.context_ids[to_entity_id_index];
332
333        edges.push(ContextEdge { type_id, to_entity_id });
334    }
335
336    Ok(Context { root_id, edges })
337}
338
339/// Converts an Op with borrowed data to owned data.
340fn op_to_owned(op: Op<'_>) -> Op<'static> {
341    match op {
342        Op::CreateEntity(ce) => Op::CreateEntity(crate::model::CreateEntity {
343            id: ce.id,
344            values: ce.values.into_iter().map(pv_to_owned).collect(),
345            context: ce.context,
346        }),
347        Op::UpdateEntity(ue) => Op::UpdateEntity(crate::model::UpdateEntity {
348            id: ue.id,
349            set_properties: ue.set_properties.into_iter().map(pv_to_owned).collect(),
350            unset_values: ue.unset_values,
351            context: ue.context,
352        }),
353        Op::DeleteEntity(de) => Op::DeleteEntity(de),
354        Op::RestoreEntity(re) => Op::RestoreEntity(re),
355        Op::CreateRelation(cr) => Op::CreateRelation(crate::model::CreateRelation {
356            id: cr.id,
357            relation_type: cr.relation_type,
358            from: cr.from,
359            from_is_value_ref: cr.from_is_value_ref,
360            to: cr.to,
361            to_is_value_ref: cr.to_is_value_ref,
362            entity: cr.entity,
363            position: cr.position.map(|p| Cow::Owned(p.into_owned())),
364            from_space: cr.from_space,
365            from_version: cr.from_version,
366            to_space: cr.to_space,
367            to_version: cr.to_version,
368            context: cr.context,
369        }),
370        Op::UpdateRelation(ur) => Op::UpdateRelation(crate::model::UpdateRelation {
371            id: ur.id,
372            from_space: ur.from_space,
373            from_version: ur.from_version,
374            to_space: ur.to_space,
375            to_version: ur.to_version,
376            position: ur.position.map(|p| Cow::Owned(p.into_owned())),
377            unset: ur.unset,
378            context: ur.context,
379        }),
380        Op::DeleteRelation(dr) => Op::DeleteRelation(dr),
381        Op::RestoreRelation(rr) => Op::RestoreRelation(rr),
382        Op::CreateValueRef(cvr) => Op::CreateValueRef(cvr),
383    }
384}
385
386/// Converts a PropertyValue with borrowed data to owned data.
387fn pv_to_owned(pv: crate::model::PropertyValue<'_>) -> crate::model::PropertyValue<'static> {
388    crate::model::PropertyValue {
389        property: pv.property,
390        value: value_to_owned(pv.value),
391    }
392}
393
394/// Converts a Value with borrowed data to owned data.
395fn value_to_owned(v: crate::model::Value<'_>) -> crate::model::Value<'static> {
396    use crate::model::{DecimalMantissa, Value};
397    match v {
398        Value::Bool(b) => Value::Bool(b),
399        Value::Int64 { value, unit } => Value::Int64 { value, unit },
400        Value::Float64 { value, unit } => Value::Float64 { value, unit },
401        Value::Decimal { exponent, mantissa, unit } => Value::Decimal {
402            exponent,
403            mantissa: match mantissa {
404                DecimalMantissa::I64(i) => DecimalMantissa::I64(i),
405                DecimalMantissa::Big(b) => DecimalMantissa::Big(Cow::Owned(b.into_owned())),
406            },
407            unit,
408        },
409        Value::Text { value, language } => Value::Text {
410            value: Cow::Owned(value.into_owned()),
411            language,
412        },
413        Value::Bytes(b) => Value::Bytes(Cow::Owned(b.into_owned())),
414        Value::Date { days, offset_min } => Value::Date { days, offset_min },
415        Value::Time { time_us, offset_min } => Value::Time { time_us, offset_min },
416        Value::Datetime { epoch_us, offset_min } => Value::Datetime { epoch_us, offset_min },
417        Value::Schedule(s) => Value::Schedule(Cow::Owned(s.into_owned())),
418        Value::Point { lon, lat, alt } => Value::Point { lon, lat, alt },
419        Value::Embedding { sub_type, dims, data } => Value::Embedding {
420            sub_type,
421            dims,
422            data: Cow::Owned(data.into_owned()),
423        },
424    }
425}
426
427/// Reads an ID vector and checks for duplicates.
428fn read_id_vec_no_duplicates(
429    reader: &mut Reader<'_>,
430    max_len: usize,
431    field: &'static str,
432) -> Result<Vec<Id>, DecodeError> {
433    let count = reader.read_varint(field)? as usize;
434    if count > max_len {
435        return Err(DecodeError::LengthExceedsLimit {
436            field,
437            len: count,
438            max: max_len,
439        });
440    }
441
442    let mut ids = Vec::with_capacity(count);
443    let mut seen = FxHashSet::with_capacity_and_hasher(count, Default::default());
444
445    for _ in 0..count {
446        let id = reader.read_id(field)?;
447        if !seen.insert(id) {
448            return Err(DecodeError::DuplicateDictionaryEntry { dict: field, id });
449        }
450        ids.push(id);
451    }
452
453    Ok(ids)
454}
455
456fn decompress_zstd(compressed: &[u8]) -> Result<Vec<u8>, DecodeError> {
457    // Read uncompressed size
458    let mut reader = Reader::new(compressed);
459    let declared_size = reader.read_varint("uncompressed_size")? as usize;
460
461    if declared_size > MAX_EDIT_SIZE {
462        return Err(DecodeError::LengthExceedsLimit {
463            field: "uncompressed_size",
464            len: declared_size,
465            max: MAX_EDIT_SIZE,
466        });
467    }
468
469    let compressed_data = reader.remaining();
470
471    let mut decoder = zstd::Decoder::new(compressed_data)
472        .map_err(|e| DecodeError::DecompressionFailed(e.to_string()))?;
473
474    let mut decompressed = Vec::with_capacity(declared_size);
475    decoder
476        .read_to_end(&mut decompressed)
477        .map_err(|e| DecodeError::DecompressionFailed(e.to_string()))?;
478
479    if decompressed.len() != declared_size {
480        return Err(DecodeError::UncompressedSizeMismatch {
481            declared: declared_size,
482            actual: decompressed.len(),
483        });
484    }
485
486    Ok(decompressed)
487}
488
489// =============================================================================
490// ENCODING
491// =============================================================================
492
/// Settings that control how an edit is encoded.
#[derive(Debug, Clone, Copy, Default)]
pub struct EncodeOptions {
    /// Selects canonical encoding.
    ///
    /// With this flag set, dictionary entries are written sorted by ID bytes
    /// (lexicographic), so the same logical edit always produces the same
    /// bytes.
    ///
    /// Turn it on when the output feeds:
    /// - content hashes used for deduplication,
    /// - signatures computed over edit content,
    /// - comparisons that must be reproducible across implementations.
    ///
    /// Canonical encoding walks the ops twice and is therefore slower than
    /// the default mode.
    pub canonical: bool,
}

impl EncodeOptions {
    /// Returns the default options, i.e. non-canonical encoding.
    pub fn new() -> Self {
        Self { canonical: false }
    }

    /// Returns options with canonical encoding switched on.
    pub fn canonical() -> Self {
        let mut options = Self::new();
        options.canonical = true;
        options
    }
}
523
524/// Encodes an Edit to binary format (uncompressed).
525///
526/// Uses single-pass encoding: ops are encoded to a buffer while building
527/// dictionaries, then the final output is assembled.
528pub fn encode_edit(edit: &Edit) -> Result<Vec<u8>, EncodeError> {
529    encode_edit_with_options(edit, EncodeOptions::default())
530}
531
532/// Encodes an Edit to binary format with the given options.
533pub fn encode_edit_with_options(edit: &Edit, options: EncodeOptions) -> Result<Vec<u8>, EncodeError> {
534    if options.canonical {
535        encode_edit_canonical(edit)
536    } else {
537        encode_edit_fast(edit)
538    }
539}
540
541/// Fast single-pass encoding (non-canonical).
542fn encode_edit_fast(edit: &Edit) -> Result<Vec<u8>, EncodeError> {
543    // Property types are determined from values themselves (per-edit typing)
544    let property_types = rustc_hash::FxHashMap::default();
545
546    // Create dictionary builder - contexts will be collected from ops
547    let mut dict_builder = DictionaryBuilder::with_capacity(edit.ops.len());
548
549    // Single pass: encode ops while building dictionaries (including contexts)
550    let mut ops_writer = Writer::with_capacity(edit.ops.len() * 50);
551
552    for op in &edit.ops {
553        encode_op(&mut ops_writer, op, &mut dict_builder, &property_types)?;
554    }
555
556    // Now assemble final output: header + dictionaries + contexts + ops
557    let ops_bytes = ops_writer.into_bytes();
558    let mut writer = Writer::with_capacity(256 + ops_bytes.len());
559
560    // Magic and version
561    writer.write_bytes(MAGIC_UNCOMPRESSED);
562    writer.write_byte(FORMAT_VERSION);
563
564    // Header
565    writer.write_id(&edit.id);
566    writer.write_string(&edit.name);
567    writer.write_id_vec(&edit.authors);
568    writer.write_signed_varint(edit.created_at);
569
570    // Dictionaries
571    dict_builder.write_dictionaries(&mut writer);
572
573    // Contexts (collected from ops during encoding)
574    dict_builder.write_contexts(&mut writer);
575
576    // Operations (already encoded)
577    writer.write_varint(edit.ops.len() as u64);
578    writer.write_bytes(&ops_bytes);
579
580    Ok(writer.into_bytes())
581}
582
583/// Canonical two-pass encoding with sorted dictionaries, authors, values, and unsets.
584///
585/// Pass 1: Collect all dictionary entries
586/// Pass 2: Sort dictionaries, encode with stable indices and sorted values
587///
588/// Canonical mode requirements (spec Section 4.4):
589/// - Dictionaries sorted by ID bytes
590/// - Authors sorted by ID bytes, no duplicates
591/// - Values sorted by (propertyRef, languageRef), no duplicate (property, language)
592/// - Unset values sorted by (propertyRef, language), no duplicates
593fn encode_edit_canonical(edit: &Edit) -> Result<Vec<u8>, EncodeError> {
594    // Property types are determined from values themselves (per-edit typing)
595    let property_types = rustc_hash::FxHashMap::default();
596
597    // Create dictionary builder - contexts will be collected from ops
598    let mut dict_builder = DictionaryBuilder::with_capacity(edit.ops.len());
599
600    // Pass 1: Collect all dictionary entries (including contexts) by doing a dry run
601    let mut temp_writer = Writer::with_capacity(edit.ops.len() * 50);
602    for op in &edit.ops {
603        encode_op(&mut temp_writer, op, &mut dict_builder, &property_types)?;
604    }
605
606    // Sort dictionaries and get sorted builder
607    let sorted_builder = dict_builder.into_sorted();
608
609    // Sort authors by ID bytes and check for duplicates
610    let mut sorted_authors = edit.authors.clone();
611    sorted_authors.sort();
612    // Check for duplicate authors
613    for i in 1..sorted_authors.len() {
614        if sorted_authors[i] == sorted_authors[i - 1] {
615            return Err(EncodeError::DuplicateAuthor { id: sorted_authors[i] });
616        }
617    }
618
619    // Pass 2: Encode ops with sorted dictionary indices and sorted values
620    let mut ops_writer = Writer::with_capacity(edit.ops.len() * 50);
621    let mut canonical_builder = sorted_builder.clone();
622    for op in &edit.ops {
623        encode_op_canonical(&mut ops_writer, op, &mut canonical_builder, &property_types)?;
624    }
625
626    // Assemble final output: header + dictionaries + contexts + ops
627    let ops_bytes = ops_writer.into_bytes();
628    let mut writer = Writer::with_capacity(256 + ops_bytes.len());
629
630    // Magic and version
631    writer.write_bytes(MAGIC_UNCOMPRESSED);
632    writer.write_byte(FORMAT_VERSION);
633
634    // Header
635    writer.write_id(&edit.id);
636    writer.write_string(&edit.name);
637    writer.write_id_vec(&sorted_authors);
638    writer.write_signed_varint(edit.created_at);
639
640    // Dictionaries (sorted)
641    sorted_builder.write_dictionaries(&mut writer);
642
643    // Contexts (collected from ops during pass 1, sorted)
644    sorted_builder.write_contexts(&mut writer);
645
646    // Operations
647    writer.write_varint(edit.ops.len() as u64);
648    writer.write_bytes(&ops_bytes);
649
650    Ok(writer.into_bytes())
651}
652
653/// Encodes an op in canonical mode with sorted values.
654fn encode_op_canonical(
655    writer: &mut Writer,
656    op: &Op<'_>,
657    dict_builder: &mut DictionaryBuilder,
658    property_types: &FxHashMap<Id, DataType>,
659) -> Result<(), EncodeError> {
660    match op {
661        Op::CreateEntity(ce) => {
662            // Sort values by (property_index, language_index) and check for duplicates
663            let sorted_values = sort_and_check_values(&ce.values, dict_builder)?;
664
665            writer.write_byte(1); // OP_CREATE_ENTITY
666            writer.write_id(&ce.id);
667            writer.write_varint(sorted_values.len() as u64);
668
669            for pv in &sorted_values {
670                let data_type = property_types.get(&pv.property)
671                    .copied()
672                    .unwrap_or_else(|| pv.value.data_type());
673                encode_property_value_canonical(writer, pv, dict_builder, data_type)?;
674            }
675            // Write context_ref: 0xFFFFFFFF = no context, else index into contexts[]
676            let context_ref = match &ce.context {
677                Some(ctx) => dict_builder.add_context(ctx) as u32,
678                None => 0xFFFFFFFF,
679            };
680            writer.write_varint(context_ref as u64);
681            Ok(())
682        }
683        Op::UpdateEntity(ue) => {
684            // Sort set_properties and unset_values, check for duplicates
685            let sorted_set = sort_and_check_values(&ue.set_properties, dict_builder)?;
686            let sorted_unset = sort_and_check_unsets(&ue.unset_values, dict_builder)?;
687
688            writer.write_byte(2); // OP_UPDATE_ENTITY
689            let id_index = dict_builder.add_object(ue.id);
690            writer.write_varint(id_index as u64);
691
692            let mut flags = 0u8;
693            if !sorted_set.is_empty() {
694                flags |= 0x01; // FLAG_HAS_SET_PROPERTIES
695            }
696            if !sorted_unset.is_empty() {
697                flags |= 0x02; // FLAG_HAS_UNSET_VALUES
698            }
699            writer.write_byte(flags);
700
701            if !sorted_set.is_empty() {
702                writer.write_varint(sorted_set.len() as u64);
703                for pv in &sorted_set {
704                    let data_type = property_types.get(&pv.property)
705                        .copied()
706                        .unwrap_or_else(|| pv.value.data_type());
707                    encode_property_value_canonical(writer, pv, dict_builder, data_type)?;
708                }
709            }
710
711            if !sorted_unset.is_empty() {
712                use crate::model::UnsetLanguage;
713                writer.write_varint(sorted_unset.len() as u64);
714                for unset in &sorted_unset {
715                    let prop_idx = dict_builder.add_property(unset.property, DataType::Bool);
716                    writer.write_varint(prop_idx as u64);
717                    let lang_value: u32 = match &unset.language {
718                        UnsetLanguage::All => 0xFFFFFFFF,
719                        UnsetLanguage::English => 0,
720                        UnsetLanguage::Specific(lang_id) => {
721                            dict_builder.add_language(Some(*lang_id)) as u32
722                        }
723                    };
724                    writer.write_varint(lang_value as u64);
725                }
726            }
727            // Write context_ref: 0xFFFFFFFF = no context, else index into contexts[]
728            let context_ref = match &ue.context {
729                Some(ctx) => dict_builder.add_context(ctx) as u32,
730                None => 0xFFFFFFFF,
731            };
732            writer.write_varint(context_ref as u64);
733            Ok(())
734        }
735        // Other ops don't have values to sort, delegate to regular encode
736        _ => encode_op(writer, op, dict_builder, property_types),
737    }
738}
739
740/// Sorts values by (property_index, language_index) and checks for duplicates.
741fn sort_and_check_values<'a>(
742    values: &[crate::model::PropertyValue<'a>],
743    dict_builder: &DictionaryBuilder,
744) -> Result<Vec<crate::model::PropertyValue<'a>>, EncodeError> {
745    use crate::model::{PropertyValue, Value};
746
747    if values.is_empty() {
748        return Ok(Vec::new());
749    }
750
751    // Create (property_index, language_index, original_index) tuples for sorting
752    let mut indexed: Vec<(usize, usize, usize, &PropertyValue<'a>)> = values
753        .iter()
754        .enumerate()
755        .map(|(i, pv)| {
756            let prop_idx = dict_builder.get_property_index(&pv.property).unwrap_or(0);
757            let lang_idx = match &pv.value {
758                Value::Text { language, .. } => dict_builder.get_language_index(language.as_ref()).unwrap_or(0),
759                _ => 0,
760            };
761            (prop_idx, lang_idx, i, pv)
762        })
763        .collect();
764
765    // Sort by (property_index, language_index)
766    indexed.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
767
768    // Check for duplicates (adjacent entries with same property_index and language_index)
769    for i in 1..indexed.len() {
770        if indexed[i].0 == indexed[i - 1].0 && indexed[i].1 == indexed[i - 1].1 {
771            let pv = indexed[i].3;
772            let language = match &pv.value {
773                Value::Text { language, .. } => *language,
774                _ => None,
775            };
776            return Err(EncodeError::DuplicateValue {
777                property: pv.property,
778                language,
779            });
780        }
781    }
782
783    // Return cloned values in sorted order
784    Ok(indexed.into_iter().map(|(_, _, _, pv)| pv.clone()).collect())
785}
786
787/// Sorts unset values by (property_index, language) and checks for duplicates.
788fn sort_and_check_unsets(
789    unsets: &[crate::model::UnsetValue],
790    dict_builder: &DictionaryBuilder,
791) -> Result<Vec<crate::model::UnsetValue>, EncodeError> {
792    use crate::model::UnsetLanguage;
793
794    if unsets.is_empty() {
795        return Ok(Vec::new());
796    }
797
798    // Create (property_index, language_sort_key, original_index) tuples for sorting
799    let mut indexed: Vec<(usize, u32, usize, &crate::model::UnsetValue)> = unsets
800        .iter()
801        .enumerate()
802        .map(|(i, up)| {
803            let prop_idx = dict_builder.get_property_index(&up.property).unwrap_or(0);
804            let lang_key: u32 = match &up.language {
805                UnsetLanguage::All => 0xFFFFFFFF,
806                UnsetLanguage::English => 0,
807                UnsetLanguage::Specific(lang_id) => {
808                    dict_builder.get_language_index(Some(lang_id)).unwrap_or(0) as u32
809                }
810            };
811            (prop_idx, lang_key, i, up)
812        })
813        .collect();
814
815    // Sort by (property_index, language_key)
816    indexed.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
817
818    // Check for duplicates
819    for i in 1..indexed.len() {
820        if indexed[i].0 == indexed[i - 1].0 && indexed[i].1 == indexed[i - 1].1 {
821            let up = indexed[i].3;
822            let language = match &up.language {
823                UnsetLanguage::All => None,
824                UnsetLanguage::English => None,
825                UnsetLanguage::Specific(id) => Some(*id),
826            };
827            return Err(EncodeError::DuplicateUnset {
828                property: up.property,
829                language,
830            });
831        }
832    }
833
834    Ok(indexed.into_iter().map(|(_, _, _, up)| up.clone()).collect())
835}
836
837/// Encodes a property value in canonical mode (same as regular but separated for clarity).
838fn encode_property_value_canonical(
839    writer: &mut Writer,
840    pv: &crate::model::PropertyValue<'_>,
841    dict_builder: &mut DictionaryBuilder,
842    data_type: DataType,
843) -> Result<(), EncodeError> {
844    let prop_index = dict_builder.add_property(pv.property, data_type);
845    writer.write_varint(prop_index as u64);
846    crate::codec::value::encode_value(writer, &pv.value, dict_builder)?;
847    Ok(())
848}
849
850/// Encodes an Edit with profiling output (two-pass for comparison).
851pub fn encode_edit_profiled(edit: &Edit, profile: bool) -> Result<Vec<u8>, EncodeError> {
852    if !profile {
853        return encode_edit(edit);
854    }
855
856    use std::time::Instant;
857
858    let t0 = Instant::now();
859
860    // Property types are determined from values themselves (per-edit typing)
861    let property_types = rustc_hash::FxHashMap::default();
862    let t1 = Instant::now();
863
864    // Create dictionary builder - contexts will be collected from ops
865    let mut dict_builder = DictionaryBuilder::with_capacity(edit.ops.len());
866
867    // Single pass: encode ops while building dictionaries (including contexts)
868    let mut ops_writer = Writer::with_capacity(edit.ops.len() * 50);
869
870    for op in &edit.ops {
871        encode_op(&mut ops_writer, op, &mut dict_builder, &property_types)?;
872    }
873    let t2 = Instant::now();
874
875    // Assemble final output
876    let ops_bytes = ops_writer.into_bytes();
877    let mut writer = Writer::with_capacity(256 + ops_bytes.len());
878
879    writer.write_bytes(MAGIC_UNCOMPRESSED);
880    writer.write_byte(FORMAT_VERSION);
881    writer.write_id(&edit.id);
882    writer.write_string(&edit.name);
883    writer.write_id_vec(&edit.authors);
884    writer.write_signed_varint(edit.created_at);
885    dict_builder.write_dictionaries(&mut writer);
886    dict_builder.write_contexts(&mut writer);
887    writer.write_varint(edit.ops.len() as u64);
888    writer.write_bytes(&ops_bytes);
889    let t3 = Instant::now();
890
891    let result = writer.into_bytes();
892
893    let total = t3.duration_since(t0);
894    eprintln!("=== Encode Profile (single-pass) ===");
895    eprintln!("  setup: {:?} ({:.1}%)", t1.duration_since(t0), 100.0 * t1.duration_since(t0).as_secs_f64() / total.as_secs_f64());
896    eprintln!("  encode_ops + build_dicts: {:?} ({:.1}%)", t2.duration_since(t1), 100.0 * t2.duration_since(t1).as_secs_f64() / total.as_secs_f64());
897    eprintln!("  assemble output: {:?} ({:.1}%)", t3.duration_since(t2), 100.0 * t3.duration_since(t2).as_secs_f64() / total.as_secs_f64());
898    eprintln!("  TOTAL: {:?}", total);
899
900    Ok(result)
901}
902
903/// Encodes an Edit to binary format with zstd compression.
904pub fn encode_edit_compressed(edit: &Edit, level: i32) -> Result<Vec<u8>, EncodeError> {
905    encode_edit_compressed_with_options(edit, level, EncodeOptions::default())
906}
907
908/// Encodes an Edit to binary format with zstd compression and options.
909pub fn encode_edit_compressed_with_options(
910    edit: &Edit,
911    level: i32,
912    options: EncodeOptions,
913) -> Result<Vec<u8>, EncodeError> {
914    let uncompressed = encode_edit_with_options(edit, options)?;
915
916    let compressed = zstd::encode_all(uncompressed.as_slice(), level)
917        .map_err(|e| EncodeError::CompressionFailed(e.to_string()))?;
918
919    let mut writer = Writer::with_capacity(5 + 10 + compressed.len());
920    writer.write_bytes(MAGIC_COMPRESSED);
921    writer.write_varint(uncompressed.len() as u64);
922    writer.write_bytes(&compressed);
923
924    Ok(writer.into_bytes())
925}
926
#[cfg(test)]
mod tests {
    //! Round-trip, framing, and canonical-mode tests for edit encoding/decoding.

    use super::*;
    use crate::model::{CreateEntity, PropertyValue, Value};

    /// Builds a text value for `property` with an optional language.
    fn text_value(property: Id, text: &str, language: Option<Id>) -> PropertyValue<'static> {
        PropertyValue {
            property,
            value: Value::Text {
                value: Cow::Owned(text.to_string()),
                language,
            },
        }
    }

    /// Builds a unit-less Int64 value for `property`.
    fn int_value(property: Id, value: i64) -> PropertyValue<'static> {
        PropertyValue {
            property,
            value: Value::Int64 { value, unit: None },
        }
    }

    /// Builds an edit named "Test" (no authors, `created_at` 0) with one CreateEntity op.
    fn entity_edit(
        edit_id: Id,
        entity_id: Id,
        values: Vec<PropertyValue<'static>>,
    ) -> Edit<'static> {
        Edit {
            id: edit_id,
            name: Cow::Owned("Test".to_string()),
            authors: vec![],
            created_at: 0,
            ops: vec![Op::CreateEntity(CreateEntity {
                id: entity_id,
                values,
                context: None,
            })],
        }
    }

    /// Baseline fixture: one author, a non-zero timestamp, and one entity with a text value.
    fn make_test_edit() -> Edit<'static> {
        Edit {
            id: [1u8; 16],
            name: Cow::Owned("Test Edit".to_string()),
            authors: vec![[2u8; 16]],
            created_at: 1234567890,
            ops: vec![Op::CreateEntity(CreateEntity {
                id: [3u8; 16],
                values: vec![text_value([10u8; 16], "Hello", None)],
                context: None,
            })],
        }
    }

    /// Asserts that the header fields and the op count of two edits agree.
    #[track_caller]
    fn assert_edit_header_eq(expected: &Edit<'_>, actual: &Edit<'_>) {
        assert_eq!(expected.id, actual.id);
        assert_eq!(expected.name, actual.name);
        assert_eq!(expected.authors, actual.authors);
        assert_eq!(expected.created_at, actual.created_at);
        assert_eq!(expected.ops.len(), actual.ops.len());
    }

    #[test]
    fn test_edit_roundtrip() {
        let edit = make_test_edit();

        let encoded = encode_edit(&edit).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_edit_header_eq(&edit, &decoded);
    }

    #[test]
    fn test_edit_compressed_roundtrip() {
        let edit = make_test_edit();

        let encoded = encode_edit_compressed(&edit, 3).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_edit_header_eq(&edit, &decoded);
    }

    #[test]
    fn test_compression_magic() {
        let edit = make_test_edit();

        let uncompressed = encode_edit(&edit).unwrap();
        let compressed = encode_edit_compressed(&edit, 3).unwrap();

        assert_eq!(&uncompressed[0..4], b"GRC2");
        assert_eq!(&compressed[0..5], b"GRC2Z");
    }

    #[test]
    fn test_invalid_magic() {
        let data = b"XXXX";
        let result = decode_edit(data);
        assert!(matches!(result, Err(DecodeError::InvalidMagic { .. })));
    }

    #[test]
    fn test_unsupported_version() {
        let mut data = Vec::new();
        data.extend_from_slice(MAGIC_UNCOMPRESSED);
        data.push(99); // Invalid version
        // Add enough bytes to not trigger EOF
        data.extend_from_slice(&[0u8; 100]);

        let result = decode_edit(&data);
        assert!(matches!(result, Err(DecodeError::UnsupportedVersion { version: 99 })));
    }

    #[test]
    fn test_empty_edit() {
        let edit: Edit<'static> = Edit {
            id: [0u8; 16],
            name: Cow::Borrowed(""),
            authors: vec![],
            created_at: 0,
            ops: vec![],
        };

        let encoded = encode_edit(&edit).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_eq!(edit.id, decoded.id);
        assert!(decoded.name.is_empty());
        assert!(decoded.authors.is_empty());
        assert!(decoded.ops.is_empty());
    }

    #[test]
    fn test_canonical_encoding_deterministic() {
        // Two edits whose values differ only in insertion order should agree
        // on the leading bytes of their canonical encoding.
        const PREFIX_LEN: usize = 50; // Header + dictionary start

        let prop_a = [0x0A; 16]; // Comes first lexicographically
        let prop_b = [0x0B; 16]; // Comes second

        // Edit 1: values in order A, B
        let edit1 = entity_edit(
            [1u8; 16],
            [3u8; 16],
            vec![text_value(prop_a, "Hello", None), int_value(prop_b, 42)],
        );

        // Edit 2: same content, but prop_b is inserted first
        let edit2 = entity_edit(
            [1u8; 16],
            [3u8; 16],
            vec![int_value(prop_b, 42), text_value(prop_a, "Hello", None)],
        );

        // Fast (non-canonical) mode must still accept both orderings. Its bytes
        // are not compared because dictionary order depends on insertion order.
        encode_edit_with_options(&edit1, EncodeOptions::new())
            .expect("fast encoding of edit1 should succeed");
        encode_edit_with_options(&edit2, EncodeOptions::new())
            .expect("fast encoding of edit2 should succeed");

        let canonical1 = encode_edit_with_options(&edit1, EncodeOptions::canonical()).unwrap();
        let canonical2 = encode_edit_with_options(&edit2, EncodeOptions::canonical()).unwrap();

        // Both should decode correctly
        let decoded1 = decode_edit(&canonical1).unwrap();
        let decoded2 = decode_edit(&canonical2).unwrap();
        assert_eq!(decoded1.id, edit1.id);
        assert_eq!(decoded2.id, edit2.id);

        // Fail with a clear message instead of an out-of-range slice panic.
        assert!(
            canonical1.len() >= PREFIX_LEN && canonical2.len() >= PREFIX_LEN,
            "canonical encodings are shorter than the compared prefix"
        );

        // NOTE(review): only the prefix is compared; if canonical mode also
        // orders the values inside ops, this could assert full equality.
        assert_eq!(
            &canonical1[..PREFIX_LEN],
            &canonical2[..PREFIX_LEN],
            "Canonical encoding should produce identical dictionary bytes"
        );
    }

    #[test]
    fn test_canonical_encoding_roundtrip() {
        let edit = make_test_edit();

        let encoded = encode_edit_with_options(&edit, EncodeOptions::canonical()).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_edit_header_eq(&edit, &decoded);
    }

    #[test]
    fn test_canonical_encoding_compressed() {
        let edit = make_test_edit();

        let encoded =
            encode_edit_compressed_with_options(&edit, 3, EncodeOptions::canonical()).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_eq!(edit.id, decoded.id);
        assert_eq!(edit.name, decoded.name);
    }

    #[test]
    fn test_canonical_rejects_duplicate_authors() {
        let author1 = [1u8; 16];

        let edit: Edit<'static> = Edit {
            id: [0u8; 16],
            name: Cow::Owned("Test".to_string()),
            authors: vec![author1, author1], // Duplicate!
            created_at: 0,
            ops: vec![],
        };

        // Fast mode doesn't check duplicates
        let result = encode_edit_with_options(&edit, EncodeOptions::new());
        assert!(result.is_ok());

        // Canonical mode rejects duplicates
        let result = encode_edit_with_options(&edit, EncodeOptions::canonical());
        assert!(matches!(result, Err(EncodeError::DuplicateAuthor { .. })));
    }

    #[test]
    fn test_canonical_rejects_duplicate_values() {
        let prop = [10u8; 16];

        // Same property, same (absent) language, two different texts.
        let edit = entity_edit(
            [0u8; 16],
            [1u8; 16],
            vec![
                text_value(prop, "First", None),
                text_value(prop, "Second", None),
            ],
        );

        // Canonical mode rejects duplicate (property, language) pairs
        let result = encode_edit_with_options(&edit, EncodeOptions::canonical());
        assert!(matches!(result, Err(EncodeError::DuplicateValue { .. })));
    }

    #[test]
    fn test_canonical_allows_different_languages() {
        let prop = [10u8; 16];
        let lang_en = [20u8; 16];
        let lang_es = [21u8; 16];

        let edit = entity_edit(
            [0u8; 16],
            [1u8; 16],
            vec![
                text_value(prop, "Hello", Some(lang_en)),
                text_value(prop, "Hola", Some(lang_es)),
            ],
        );

        // Different languages for same property is allowed
        let result = encode_edit_with_options(&edit, EncodeOptions::canonical());
        assert!(result.is_ok());
    }

    #[test]
    fn test_canonical_sorts_values_deterministically() {
        let prop_a = [0x0A; 16];
        let prop_b = [0x0B; 16];

        // Values in reverse order (B before A)
        let edit = entity_edit(
            [1u8; 16],
            [3u8; 16],
            vec![int_value(prop_b, 42), text_value(prop_a, "Hello", None)],
        );

        // Encode twice - should produce identical bytes
        let encoded1 = encode_edit_with_options(&edit, EncodeOptions::canonical()).unwrap();
        let encoded2 = encode_edit_with_options(&edit, EncodeOptions::canonical()).unwrap();
        assert_eq!(encoded1, encoded2, "Canonical encoding should be deterministic");

        // Should roundtrip
        let decoded = decode_edit(&encoded1).unwrap();
        assert_eq!(decoded.ops.len(), 1);
    }
}