grc_20/codec/
edit.rs

1//! Edit encoding/decoding for GRC-20 binary format.
2//!
3//! Implements the wire format for edits (spec Section 6.3).
4
5use std::borrow::Cow;
6use std::io::Read;
7
8use rustc_hash::{FxHashMap, FxHashSet};
9
10use crate::codec::op::{decode_op, encode_op};
11use crate::codec::primitives::{Reader, Writer};
12use crate::error::{DecodeError, EncodeError};
13use crate::limits::{
14    FORMAT_VERSION, MAGIC_COMPRESSED, MAGIC_UNCOMPRESSED, MAX_AUTHORS, MAX_DICT_SIZE,
15    MAX_EDIT_SIZE, MAX_OPS_PER_EDIT, MAX_STRING_LEN, MIN_FORMAT_VERSION,
16};
17use crate::model::{Context, ContextEdge, DataType, DictionaryBuilder, Edit, Id, Op, WireDictionaries};
18
19// =============================================================================
20// DECODING
21// =============================================================================
22
23/// Decompresses a GRC2Z compressed edit, returning the uncompressed bytes.
24///
25/// Use this with [`decode_edit`] for zero-copy decoding of compressed data:
26///
27/// ```ignore
28/// let uncompressed = decompress(&compressed_bytes)?;
29/// let edit = decode_edit(&uncompressed)?;  // zero-copy, borrows from uncompressed
30/// // edit is valid while uncompressed is alive
31/// ```
32pub fn decompress(input: &[u8]) -> Result<Vec<u8>, DecodeError> {
33    if input.len() < 5 {
34        return Err(DecodeError::UnexpectedEof { context: "magic" });
35    }
36    if &input[0..5] != MAGIC_COMPRESSED {
37        let mut found = [0u8; 4];
38        found.copy_from_slice(&input[0..4]);
39        return Err(DecodeError::InvalidMagic { found });
40    }
41    decompress_zstd(&input[5..])
42}
43
44/// Decodes an Edit from binary data with zero-copy borrowing.
45///
46/// Handles both compressed (GRC2Z) and uncompressed (GRC2) formats.
47/// For true zero-copy with compressed data, use [`decompress`] first:
48///
49/// ```ignore
50/// // Zero-copy for compressed data:
51/// let uncompressed = decompress(&compressed)?;
52/// let edit = decode_edit(&uncompressed)?;
53///
54/// // Zero-copy for uncompressed data:
55/// let edit = decode_edit(&uncompressed_bytes)?;
56/// ```
57///
58/// If you pass compressed data directly, it will decompress internally
59/// and allocate owned strings (no zero-copy benefit).
60pub fn decode_edit(input: &[u8]) -> Result<Edit<'_>, DecodeError> {
61    if input.len() < 4 {
62        return Err(DecodeError::UnexpectedEof { context: "magic" });
63    }
64
65    // Detect compression
66    if input.len() >= 5 && &input[0..5] == MAGIC_COMPRESSED {
67        // Compressed: decompress and decode with allocations
68        // (for zero-copy, caller should use decompress() first)
69        let decompressed = decompress_zstd(&input[5..])?;
70        if decompressed.len() > MAX_EDIT_SIZE {
71            return Err(DecodeError::LengthExceedsLimit {
72                field: "edit",
73                len: decompressed.len(),
74                max: MAX_EDIT_SIZE,
75            });
76        }
77        decode_edit_owned(&decompressed)
78    } else if &input[0..4] == MAGIC_UNCOMPRESSED {
79        // Uncompressed: decode with zero-copy borrowing
80        if input.len() > MAX_EDIT_SIZE {
81            return Err(DecodeError::LengthExceedsLimit {
82                field: "edit",
83                len: input.len(),
84                max: MAX_EDIT_SIZE,
85            });
86        }
87        decode_edit_borrowed(input)
88    } else {
89        let mut found = [0u8; 4];
90        found.copy_from_slice(&input[0..4]);
91        Err(DecodeError::InvalidMagic { found })
92    }
93}
94
95/// Decodes an Edit with zero-copy borrowing from the input.
96fn decode_edit_borrowed(input: &[u8]) -> Result<Edit<'_>, DecodeError> {
97    let mut reader = Reader::new(input);
98
99    // Skip magic (already validated)
100    reader.read_bytes(4, "magic")?;
101
102    // Version
103    let version = reader.read_byte("version")?;
104    if version < MIN_FORMAT_VERSION || version > FORMAT_VERSION {
105        return Err(DecodeError::UnsupportedVersion { version });
106    }
107
108    // Header
109    let edit_id = reader.read_id("edit_id")?;
110    let name = Cow::Borrowed(reader.read_str(MAX_STRING_LEN, "name")?);
111    let authors = reader.read_id_vec(MAX_AUTHORS, "authors")?;
112    let created_at = reader.read_signed_varint("created_at")?;
113
114    // Schema dictionaries (with duplicate detection)
115    let property_count = reader.read_varint("property_count")? as usize;
116    if property_count > MAX_DICT_SIZE {
117        return Err(DecodeError::LengthExceedsLimit {
118            field: "properties",
119            len: property_count,
120            max: MAX_DICT_SIZE,
121        });
122    }
123    let mut properties = Vec::with_capacity(property_count);
124    let mut seen_props = FxHashSet::with_capacity_and_hasher(property_count, Default::default());
125    for _ in 0..property_count {
126        let id = reader.read_id("property_id")?;
127        if !seen_props.insert(id) {
128            return Err(DecodeError::DuplicateDictionaryEntry { dict: "properties", id });
129        }
130        let dt_byte = reader.read_byte("data_type")?;
131        let data_type = DataType::from_u8(dt_byte)
132            .ok_or(DecodeError::InvalidDataType { data_type: dt_byte })?;
133        properties.push((id, data_type));
134    }
135
136    let relation_types = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "relation_types")?;
137    let languages = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "languages")?;
138    let units = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "units")?;
139    let objects = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "objects")?;
140    let context_ids = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "context_ids")?;
141
142    let mut dicts = WireDictionaries {
143        properties,
144        relation_types,
145        languages,
146        units,
147        objects,
148        context_ids,
149        contexts: Vec::new(),
150    };
151
152    // Contexts - decode and store in dicts for op decoding to resolve
153    let context_count = reader.read_varint("context_count")? as usize;
154    if context_count > MAX_DICT_SIZE {
155        return Err(DecodeError::LengthExceedsLimit {
156            field: "contexts",
157            len: context_count,
158            max: MAX_DICT_SIZE,
159        });
160    }
161    for _ in 0..context_count {
162        dicts.contexts.push(decode_context(&mut reader, &dicts)?);
163    }
164
165    // Operations
166    let op_count = reader.read_varint("op_count")? as usize;
167    if op_count > MAX_OPS_PER_EDIT {
168        return Err(DecodeError::LengthExceedsLimit {
169            field: "ops",
170            len: op_count,
171            max: MAX_OPS_PER_EDIT,
172        });
173    }
174
175    let mut ops = Vec::with_capacity(op_count);
176    for _ in 0..op_count {
177        ops.push(decode_op(&mut reader, &dicts)?);
178    }
179
180    Ok(Edit {
181        id: edit_id,
182        name,
183        authors,
184        created_at,
185        ops,
186    })
187}
188
189/// Decodes an Edit with allocations (for decompressed data).
190fn decode_edit_owned(data: &[u8]) -> Result<Edit<'static>, DecodeError> {
191    let mut reader = Reader::new(data);
192
193    // Skip magic (already validated in decompress)
194    reader.read_bytes(4, "magic")?;
195
196    // Version
197    let version = reader.read_byte("version")?;
198    if version < MIN_FORMAT_VERSION || version > FORMAT_VERSION {
199        return Err(DecodeError::UnsupportedVersion { version });
200    }
201
202    // Header - use allocating reads
203    let edit_id = reader.read_id("edit_id")?;
204    let name = Cow::Owned(reader.read_string(MAX_STRING_LEN, "name")?);
205    let authors = reader.read_id_vec(MAX_AUTHORS, "authors")?;
206    let created_at = reader.read_signed_varint("created_at")?;
207
208    // Schema dictionaries (with duplicate detection)
209    let property_count = reader.read_varint("property_count")? as usize;
210    if property_count > MAX_DICT_SIZE {
211        return Err(DecodeError::LengthExceedsLimit {
212            field: "properties",
213            len: property_count,
214            max: MAX_DICT_SIZE,
215        });
216    }
217    let mut properties = Vec::with_capacity(property_count);
218    let mut seen_props = FxHashSet::with_capacity_and_hasher(property_count, Default::default());
219    for _ in 0..property_count {
220        let id = reader.read_id("property_id")?;
221        if !seen_props.insert(id) {
222            return Err(DecodeError::DuplicateDictionaryEntry { dict: "properties", id });
223        }
224        let dt_byte = reader.read_byte("data_type")?;
225        let data_type = DataType::from_u8(dt_byte)
226            .ok_or(DecodeError::InvalidDataType { data_type: dt_byte })?;
227        properties.push((id, data_type));
228    }
229
230    let relation_types = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "relation_types")?;
231    let languages = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "languages")?;
232    let units = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "units")?;
233    let objects = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "objects")?;
234    let context_ids = read_id_vec_no_duplicates(&mut reader, MAX_DICT_SIZE, "context_ids")?;
235
236    let mut dicts = WireDictionaries {
237        properties,
238        relation_types,
239        languages,
240        units,
241        objects,
242        context_ids,
243        contexts: Vec::new(),
244    };
245
246    // Contexts - decode and store in dicts for op decoding to resolve
247    let context_count = reader.read_varint("context_count")? as usize;
248    if context_count > MAX_DICT_SIZE {
249        return Err(DecodeError::LengthExceedsLimit {
250            field: "contexts",
251            len: context_count,
252            max: MAX_DICT_SIZE,
253        });
254    }
255    for _ in 0..context_count {
256        dicts.contexts.push(decode_context(&mut reader, &dicts)?);
257    }
258
259    // Operations - use allocating decode
260    let op_count = reader.read_varint("op_count")? as usize;
261    if op_count > MAX_OPS_PER_EDIT {
262        return Err(DecodeError::LengthExceedsLimit {
263            field: "ops",
264            len: op_count,
265            max: MAX_OPS_PER_EDIT,
266        });
267    }
268
269    let mut ops = Vec::with_capacity(op_count);
270    for _ in 0..op_count {
271        ops.push(decode_op_owned(&mut reader, &dicts)?);
272    }
273
274    Ok(Edit {
275        id: edit_id,
276        name,
277        authors,
278        created_at,
279        ops,
280    })
281}
282
283/// Decodes an Op with allocations (for decompressed data).
284fn decode_op_owned(reader: &mut Reader<'_>, dicts: &WireDictionaries) -> Result<Op<'static>, DecodeError> {
285    // Decode normally, then convert to owned
286    let op = decode_op(reader, dicts)?;
287    Ok(op_to_owned(op))
288}
289
290/// Decodes a Context from the reader.
291fn decode_context(reader: &mut Reader<'_>, dicts: &WireDictionaries) -> Result<Context, DecodeError> {
292    let root_id_index = reader.read_varint("root_id")? as usize;
293    if root_id_index >= dicts.context_ids.len() {
294        return Err(DecodeError::IndexOutOfBounds {
295            dict: "context_ids",
296            index: root_id_index,
297            size: dicts.context_ids.len(),
298        });
299    }
300    let root_id = dicts.context_ids[root_id_index];
301
302    let edge_count = reader.read_varint("edge_count")? as usize;
303    if edge_count > MAX_DICT_SIZE {
304        return Err(DecodeError::LengthExceedsLimit {
305            field: "context_edges",
306            len: edge_count,
307            max: MAX_DICT_SIZE,
308        });
309    }
310
311    let mut edges = Vec::with_capacity(edge_count);
312    for _ in 0..edge_count {
313        let type_id_index = reader.read_varint("edge_type_id")? as usize;
314        if type_id_index >= dicts.relation_types.len() {
315            return Err(DecodeError::IndexOutOfBounds {
316                dict: "relation_types",
317                index: type_id_index,
318                size: dicts.relation_types.len(),
319            });
320        }
321        let type_id = dicts.relation_types[type_id_index];
322
323        let to_entity_id_index = reader.read_varint("edge_to_entity_id")? as usize;
324        if to_entity_id_index >= dicts.context_ids.len() {
325            return Err(DecodeError::IndexOutOfBounds {
326                dict: "context_ids",
327                index: to_entity_id_index,
328                size: dicts.context_ids.len(),
329            });
330        }
331        let to_entity_id = dicts.context_ids[to_entity_id_index];
332
333        edges.push(ContextEdge { type_id, to_entity_id });
334    }
335
336    Ok(Context { root_id, edges })
337}
338
339/// Converts an Op with borrowed data to owned data.
340fn op_to_owned(op: Op<'_>) -> Op<'static> {
341    match op {
342        Op::CreateEntity(ce) => Op::CreateEntity(crate::model::CreateEntity {
343            id: ce.id,
344            values: ce.values.into_iter().map(pv_to_owned).collect(),
345            context: ce.context,
346        }),
347        Op::UpdateEntity(ue) => Op::UpdateEntity(crate::model::UpdateEntity {
348            id: ue.id,
349            set_properties: ue.set_properties.into_iter().map(pv_to_owned).collect(),
350            unset_values: ue.unset_values,
351            context: ue.context,
352        }),
353        Op::DeleteEntity(de) => Op::DeleteEntity(de),
354        Op::RestoreEntity(re) => Op::RestoreEntity(re),
355        Op::CreateRelation(cr) => Op::CreateRelation(crate::model::CreateRelation {
356            id: cr.id,
357            relation_type: cr.relation_type,
358            from: cr.from,
359            from_is_value_ref: cr.from_is_value_ref,
360            to: cr.to,
361            to_is_value_ref: cr.to_is_value_ref,
362            entity: cr.entity,
363            position: cr.position.map(|p| Cow::Owned(p.into_owned())),
364            from_space: cr.from_space,
365            from_version: cr.from_version,
366            to_space: cr.to_space,
367            to_version: cr.to_version,
368            context: cr.context,
369        }),
370        Op::UpdateRelation(ur) => Op::UpdateRelation(crate::model::UpdateRelation {
371            id: ur.id,
372            from_space: ur.from_space,
373            from_version: ur.from_version,
374            to_space: ur.to_space,
375            to_version: ur.to_version,
376            position: ur.position.map(|p| Cow::Owned(p.into_owned())),
377            unset: ur.unset,
378            context: ur.context,
379        }),
380        Op::DeleteRelation(dr) => Op::DeleteRelation(dr),
381        Op::RestoreRelation(rr) => Op::RestoreRelation(rr),
382        Op::CreateValueRef(cvr) => Op::CreateValueRef(cvr),
383    }
384}
385
386/// Converts a PropertyValue with borrowed data to owned data.
387fn pv_to_owned(pv: crate::model::PropertyValue<'_>) -> crate::model::PropertyValue<'static> {
388    crate::model::PropertyValue {
389        property: pv.property,
390        value: value_to_owned(pv.value),
391    }
392}
393
394/// Converts a Value with borrowed data to owned data.
395fn value_to_owned(v: crate::model::Value<'_>) -> crate::model::Value<'static> {
396    use crate::model::{DecimalMantissa, Value};
397    match v {
398        Value::Bool(b) => Value::Bool(b),
399        Value::Int64 { value, unit } => Value::Int64 { value, unit },
400        Value::Float64 { value, unit } => Value::Float64 { value, unit },
401        Value::Decimal { exponent, mantissa, unit } => Value::Decimal {
402            exponent,
403            mantissa: match mantissa {
404                DecimalMantissa::I64(i) => DecimalMantissa::I64(i),
405                DecimalMantissa::Big(b) => DecimalMantissa::Big(Cow::Owned(b.into_owned())),
406            },
407            unit,
408        },
409        Value::Text { value, language } => Value::Text {
410            value: Cow::Owned(value.into_owned()),
411            language,
412        },
413        Value::Bytes(b) => Value::Bytes(Cow::Owned(b.into_owned())),
414        Value::Date(s) => Value::Date(Cow::Owned(s.into_owned())),
415        Value::Time(s) => Value::Time(Cow::Owned(s.into_owned())),
416        Value::Datetime(s) => Value::Datetime(Cow::Owned(s.into_owned())),
417        Value::Schedule(s) => Value::Schedule(Cow::Owned(s.into_owned())),
418        Value::Point { lat, lon, alt } => Value::Point { lat, lon, alt },
419        Value::Rect { min_lat, min_lon, max_lat, max_lon } => Value::Rect { min_lat, min_lon, max_lat, max_lon },
420        Value::Embedding { sub_type, dims, data } => Value::Embedding {
421            sub_type,
422            dims,
423            data: Cow::Owned(data.into_owned()),
424        },
425    }
426}
427
428/// Reads an ID vector and checks for duplicates.
429fn read_id_vec_no_duplicates(
430    reader: &mut Reader<'_>,
431    max_len: usize,
432    field: &'static str,
433) -> Result<Vec<Id>, DecodeError> {
434    let count = reader.read_varint(field)? as usize;
435    if count > max_len {
436        return Err(DecodeError::LengthExceedsLimit {
437            field,
438            len: count,
439            max: max_len,
440        });
441    }
442
443    let mut ids = Vec::with_capacity(count);
444    let mut seen = FxHashSet::with_capacity_and_hasher(count, Default::default());
445
446    for _ in 0..count {
447        let id = reader.read_id(field)?;
448        if !seen.insert(id) {
449            return Err(DecodeError::DuplicateDictionaryEntry { dict: field, id });
450        }
451        ids.push(id);
452    }
453
454    Ok(ids)
455}
456
457fn decompress_zstd(compressed: &[u8]) -> Result<Vec<u8>, DecodeError> {
458    // Read uncompressed size
459    let mut reader = Reader::new(compressed);
460    let declared_size = reader.read_varint("uncompressed_size")? as usize;
461
462    if declared_size > MAX_EDIT_SIZE {
463        return Err(DecodeError::LengthExceedsLimit {
464            field: "uncompressed_size",
465            len: declared_size,
466            max: MAX_EDIT_SIZE,
467        });
468    }
469
470    let compressed_data = reader.remaining();
471
472    let mut decoder = zstd::Decoder::new(compressed_data)
473        .map_err(|e| DecodeError::DecompressionFailed(e.to_string()))?;
474
475    let mut decompressed = Vec::with_capacity(declared_size);
476    decoder
477        .read_to_end(&mut decompressed)
478        .map_err(|e| DecodeError::DecompressionFailed(e.to_string()))?;
479
480    if decompressed.len() != declared_size {
481        return Err(DecodeError::UncompressedSizeMismatch {
482            declared: declared_size,
483            actual: decompressed.len(),
484        });
485    }
486
487    Ok(decompressed)
488}
489
490// =============================================================================
491// ENCODING
492// =============================================================================
493
/// Settings that control how an edit is serialized.
#[derive(Debug, Clone, Copy, Default)]
pub struct EncodeOptions {
    /// Turns on canonical encoding.
    ///
    /// In canonical mode dictionary entries are sorted by ID bytes
    /// (lexicographically), so the same logical edit always produces the
    /// same bytes.
    ///
    /// Reach for it when:
    /// - computing content hashes for deduplication,
    /// - signing edit content,
    /// - output must be reproducible across implementations.
    ///
    /// Note: canonical mode makes two passes over the ops and is slower than
    /// the default encoding.
    pub canonical: bool,
}

impl EncodeOptions {
    /// Returns the default options, with canonical mode switched off.
    pub fn new() -> Self {
        Self { canonical: false }
    }

    /// Returns options with canonical mode switched on.
    pub fn canonical() -> Self {
        Self { canonical: true }
    }
}
524
525/// Encodes an Edit to binary format (uncompressed).
526///
527/// Uses single-pass encoding: ops are encoded to a buffer while building
528/// dictionaries, then the final output is assembled.
529pub fn encode_edit(edit: &Edit) -> Result<Vec<u8>, EncodeError> {
530    encode_edit_with_options(edit, EncodeOptions::default())
531}
532
533/// Encodes an Edit to binary format with the given options.
534pub fn encode_edit_with_options(edit: &Edit, options: EncodeOptions) -> Result<Vec<u8>, EncodeError> {
535    if options.canonical {
536        encode_edit_canonical(edit)
537    } else {
538        encode_edit_fast(edit)
539    }
540}
541
542/// Fast single-pass encoding (non-canonical).
543fn encode_edit_fast(edit: &Edit) -> Result<Vec<u8>, EncodeError> {
544    // Property types are determined from values themselves (per-edit typing)
545    let property_types = rustc_hash::FxHashMap::default();
546
547    // Create dictionary builder - contexts will be collected from ops
548    let mut dict_builder = DictionaryBuilder::with_capacity(edit.ops.len());
549
550    // Single pass: encode ops while building dictionaries (including contexts)
551    let mut ops_writer = Writer::with_capacity(edit.ops.len() * 50);
552
553    for op in &edit.ops {
554        encode_op(&mut ops_writer, op, &mut dict_builder, &property_types)?;
555    }
556
557    // Now assemble final output: header + dictionaries + contexts + ops
558    let ops_bytes = ops_writer.into_bytes();
559    let mut writer = Writer::with_capacity(256 + ops_bytes.len());
560
561    // Magic and version
562    writer.write_bytes(MAGIC_UNCOMPRESSED);
563    writer.write_byte(FORMAT_VERSION);
564
565    // Header
566    writer.write_id(&edit.id);
567    writer.write_string(&edit.name);
568    writer.write_id_vec(&edit.authors);
569    writer.write_signed_varint(edit.created_at);
570
571    // Dictionaries
572    dict_builder.write_dictionaries(&mut writer);
573
574    // Contexts (collected from ops during encoding)
575    dict_builder.write_contexts(&mut writer);
576
577    // Operations (already encoded)
578    writer.write_varint(edit.ops.len() as u64);
579    writer.write_bytes(&ops_bytes);
580
581    Ok(writer.into_bytes())
582}
583
584/// Canonical two-pass encoding with sorted dictionaries, authors, values, and unsets.
585///
586/// Pass 1: Collect all dictionary entries
587/// Pass 2: Sort dictionaries, encode with stable indices and sorted values
588///
589/// Canonical mode requirements (spec Section 4.4):
590/// - Dictionaries sorted by ID bytes
591/// - Authors sorted by ID bytes, no duplicates
592/// - Values sorted by (propertyRef, languageRef), no duplicate (property, language)
593/// - Unset values sorted by (propertyRef, language), no duplicates
594fn encode_edit_canonical(edit: &Edit) -> Result<Vec<u8>, EncodeError> {
595    // Property types are determined from values themselves (per-edit typing)
596    let property_types = rustc_hash::FxHashMap::default();
597
598    // Create dictionary builder - contexts will be collected from ops
599    let mut dict_builder = DictionaryBuilder::with_capacity(edit.ops.len());
600
601    // Pass 1: Collect all dictionary entries (including contexts) by doing a dry run
602    let mut temp_writer = Writer::with_capacity(edit.ops.len() * 50);
603    for op in &edit.ops {
604        encode_op(&mut temp_writer, op, &mut dict_builder, &property_types)?;
605    }
606
607    // Sort dictionaries and get sorted builder
608    let sorted_builder = dict_builder.into_sorted();
609
610    // Sort authors by ID bytes and check for duplicates
611    let mut sorted_authors = edit.authors.clone();
612    sorted_authors.sort();
613    // Check for duplicate authors
614    for i in 1..sorted_authors.len() {
615        if sorted_authors[i] == sorted_authors[i - 1] {
616            return Err(EncodeError::DuplicateAuthor { id: sorted_authors[i] });
617        }
618    }
619
620    // Pass 2: Encode ops with sorted dictionary indices and sorted values
621    let mut ops_writer = Writer::with_capacity(edit.ops.len() * 50);
622    let mut canonical_builder = sorted_builder.clone();
623    for op in &edit.ops {
624        encode_op_canonical(&mut ops_writer, op, &mut canonical_builder, &property_types)?;
625    }
626
627    // Assemble final output: header + dictionaries + contexts + ops
628    let ops_bytes = ops_writer.into_bytes();
629    let mut writer = Writer::with_capacity(256 + ops_bytes.len());
630
631    // Magic and version
632    writer.write_bytes(MAGIC_UNCOMPRESSED);
633    writer.write_byte(FORMAT_VERSION);
634
635    // Header
636    writer.write_id(&edit.id);
637    writer.write_string(&edit.name);
638    writer.write_id_vec(&sorted_authors);
639    writer.write_signed_varint(edit.created_at);
640
641    // Dictionaries (sorted)
642    sorted_builder.write_dictionaries(&mut writer);
643
644    // Contexts (collected from ops during pass 1, sorted)
645    sorted_builder.write_contexts(&mut writer);
646
647    // Operations
648    writer.write_varint(edit.ops.len() as u64);
649    writer.write_bytes(&ops_bytes);
650
651    Ok(writer.into_bytes())
652}
653
654/// Encodes an op in canonical mode with sorted values.
655fn encode_op_canonical(
656    writer: &mut Writer,
657    op: &Op<'_>,
658    dict_builder: &mut DictionaryBuilder,
659    property_types: &FxHashMap<Id, DataType>,
660) -> Result<(), EncodeError> {
661    match op {
662        Op::CreateEntity(ce) => {
663            // Sort values by (property_index, language_index) and check for duplicates
664            let sorted_values = sort_and_check_values(&ce.values, dict_builder)?;
665
666            writer.write_byte(1); // OP_CREATE_ENTITY
667            writer.write_id(&ce.id);
668            writer.write_varint(sorted_values.len() as u64);
669
670            for pv in &sorted_values {
671                let data_type = property_types.get(&pv.property)
672                    .copied()
673                    .unwrap_or_else(|| pv.value.data_type());
674                encode_property_value_canonical(writer, pv, dict_builder, data_type)?;
675            }
676            // Write context_ref: 0xFFFFFFFF = no context, else index into contexts[]
677            let context_ref = match &ce.context {
678                Some(ctx) => dict_builder.add_context(ctx) as u32,
679                None => 0xFFFFFFFF,
680            };
681            writer.write_varint(context_ref as u64);
682            Ok(())
683        }
684        Op::UpdateEntity(ue) => {
685            // Sort set_properties and unset_values, check for duplicates
686            let sorted_set = sort_and_check_values(&ue.set_properties, dict_builder)?;
687            let sorted_unset = sort_and_check_unsets(&ue.unset_values, dict_builder)?;
688
689            writer.write_byte(2); // OP_UPDATE_ENTITY
690            let id_index = dict_builder.add_object(ue.id);
691            writer.write_varint(id_index as u64);
692
693            let mut flags = 0u8;
694            if !sorted_set.is_empty() {
695                flags |= 0x01; // FLAG_HAS_SET_PROPERTIES
696            }
697            if !sorted_unset.is_empty() {
698                flags |= 0x02; // FLAG_HAS_UNSET_VALUES
699            }
700            writer.write_byte(flags);
701
702            if !sorted_set.is_empty() {
703                writer.write_varint(sorted_set.len() as u64);
704                for pv in &sorted_set {
705                    let data_type = property_types.get(&pv.property)
706                        .copied()
707                        .unwrap_or_else(|| pv.value.data_type());
708                    encode_property_value_canonical(writer, pv, dict_builder, data_type)?;
709                }
710            }
711
712            if !sorted_unset.is_empty() {
713                use crate::model::UnsetLanguage;
714                writer.write_varint(sorted_unset.len() as u64);
715                for unset in &sorted_unset {
716                    let prop_idx = dict_builder.add_property(unset.property, DataType::Bool);
717                    writer.write_varint(prop_idx as u64);
718                    let lang_value: u32 = match &unset.language {
719                        UnsetLanguage::All => 0xFFFFFFFF,
720                        UnsetLanguage::English => 0,
721                        UnsetLanguage::Specific(lang_id) => {
722                            dict_builder.add_language(Some(*lang_id)) as u32
723                        }
724                    };
725                    writer.write_varint(lang_value as u64);
726                }
727            }
728            // Write context_ref: 0xFFFFFFFF = no context, else index into contexts[]
729            let context_ref = match &ue.context {
730                Some(ctx) => dict_builder.add_context(ctx) as u32,
731                None => 0xFFFFFFFF,
732            };
733            writer.write_varint(context_ref as u64);
734            Ok(())
735        }
736        // Other ops don't have values to sort, delegate to regular encode
737        _ => encode_op(writer, op, dict_builder, property_types),
738    }
739}
740
741/// Sorts values by (property_index, language_index) and checks for duplicates.
742fn sort_and_check_values<'a>(
743    values: &[crate::model::PropertyValue<'a>],
744    dict_builder: &DictionaryBuilder,
745) -> Result<Vec<crate::model::PropertyValue<'a>>, EncodeError> {
746    use crate::model::{PropertyValue, Value};
747
748    if values.is_empty() {
749        return Ok(Vec::new());
750    }
751
752    // Create (property_index, language_index, original_index) tuples for sorting
753    let mut indexed: Vec<(usize, usize, usize, &PropertyValue<'a>)> = values
754        .iter()
755        .enumerate()
756        .map(|(i, pv)| {
757            let prop_idx = dict_builder.get_property_index(&pv.property).unwrap_or(0);
758            let lang_idx = match &pv.value {
759                Value::Text { language, .. } => dict_builder.get_language_index(language.as_ref()).unwrap_or(0),
760                _ => 0,
761            };
762            (prop_idx, lang_idx, i, pv)
763        })
764        .collect();
765
766    // Sort by (property_index, language_index)
767    indexed.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
768
769    // Check for duplicates (adjacent entries with same property_index and language_index)
770    for i in 1..indexed.len() {
771        if indexed[i].0 == indexed[i - 1].0 && indexed[i].1 == indexed[i - 1].1 {
772            let pv = indexed[i].3;
773            let language = match &pv.value {
774                Value::Text { language, .. } => *language,
775                _ => None,
776            };
777            return Err(EncodeError::DuplicateValue {
778                property: pv.property,
779                language,
780            });
781        }
782    }
783
784    // Return cloned values in sorted order
785    Ok(indexed.into_iter().map(|(_, _, _, pv)| pv.clone()).collect())
786}
787
788/// Sorts unset values by (property_index, language) and checks for duplicates.
789fn sort_and_check_unsets(
790    unsets: &[crate::model::UnsetValue],
791    dict_builder: &DictionaryBuilder,
792) -> Result<Vec<crate::model::UnsetValue>, EncodeError> {
793    use crate::model::UnsetLanguage;
794
795    if unsets.is_empty() {
796        return Ok(Vec::new());
797    }
798
799    // Create (property_index, language_sort_key, original_index) tuples for sorting
800    let mut indexed: Vec<(usize, u32, usize, &crate::model::UnsetValue)> = unsets
801        .iter()
802        .enumerate()
803        .map(|(i, up)| {
804            let prop_idx = dict_builder.get_property_index(&up.property).unwrap_or(0);
805            let lang_key: u32 = match &up.language {
806                UnsetLanguage::All => 0xFFFFFFFF,
807                UnsetLanguage::English => 0,
808                UnsetLanguage::Specific(lang_id) => {
809                    dict_builder.get_language_index(Some(lang_id)).unwrap_or(0) as u32
810                }
811            };
812            (prop_idx, lang_key, i, up)
813        })
814        .collect();
815
816    // Sort by (property_index, language_key)
817    indexed.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
818
819    // Check for duplicates
820    for i in 1..indexed.len() {
821        if indexed[i].0 == indexed[i - 1].0 && indexed[i].1 == indexed[i - 1].1 {
822            let up = indexed[i].3;
823            let language = match &up.language {
824                UnsetLanguage::All => None,
825                UnsetLanguage::English => None,
826                UnsetLanguage::Specific(id) => Some(*id),
827            };
828            return Err(EncodeError::DuplicateUnset {
829                property: up.property,
830                language,
831            });
832        }
833    }
834
835    Ok(indexed.into_iter().map(|(_, _, _, up)| up.clone()).collect())
836}
837
838/// Encodes a property value in canonical mode (same as regular but separated for clarity).
839fn encode_property_value_canonical(
840    writer: &mut Writer,
841    pv: &crate::model::PropertyValue<'_>,
842    dict_builder: &mut DictionaryBuilder,
843    data_type: DataType,
844) -> Result<(), EncodeError> {
845    let prop_index = dict_builder.add_property(pv.property, data_type);
846    writer.write_varint(prop_index as u64);
847    crate::codec::value::encode_value(writer, &pv.value, dict_builder)?;
848    Ok(())
849}
850
851/// Encodes an Edit with profiling output (two-pass for comparison).
852pub fn encode_edit_profiled(edit: &Edit, profile: bool) -> Result<Vec<u8>, EncodeError> {
853    if !profile {
854        return encode_edit(edit);
855    }
856
857    use std::time::Instant;
858
859    let t0 = Instant::now();
860
861    // Property types are determined from values themselves (per-edit typing)
862    let property_types = rustc_hash::FxHashMap::default();
863    let t1 = Instant::now();
864
865    // Create dictionary builder - contexts will be collected from ops
866    let mut dict_builder = DictionaryBuilder::with_capacity(edit.ops.len());
867
868    // Single pass: encode ops while building dictionaries (including contexts)
869    let mut ops_writer = Writer::with_capacity(edit.ops.len() * 50);
870
871    for op in &edit.ops {
872        encode_op(&mut ops_writer, op, &mut dict_builder, &property_types)?;
873    }
874    let t2 = Instant::now();
875
876    // Assemble final output
877    let ops_bytes = ops_writer.into_bytes();
878    let mut writer = Writer::with_capacity(256 + ops_bytes.len());
879
880    writer.write_bytes(MAGIC_UNCOMPRESSED);
881    writer.write_byte(FORMAT_VERSION);
882    writer.write_id(&edit.id);
883    writer.write_string(&edit.name);
884    writer.write_id_vec(&edit.authors);
885    writer.write_signed_varint(edit.created_at);
886    dict_builder.write_dictionaries(&mut writer);
887    dict_builder.write_contexts(&mut writer);
888    writer.write_varint(edit.ops.len() as u64);
889    writer.write_bytes(&ops_bytes);
890    let t3 = Instant::now();
891
892    let result = writer.into_bytes();
893
894    let total = t3.duration_since(t0);
895    eprintln!("=== Encode Profile (single-pass) ===");
896    eprintln!("  setup: {:?} ({:.1}%)", t1.duration_since(t0), 100.0 * t1.duration_since(t0).as_secs_f64() / total.as_secs_f64());
897    eprintln!("  encode_ops + build_dicts: {:?} ({:.1}%)", t2.duration_since(t1), 100.0 * t2.duration_since(t1).as_secs_f64() / total.as_secs_f64());
898    eprintln!("  assemble output: {:?} ({:.1}%)", t3.duration_since(t2), 100.0 * t3.duration_since(t2).as_secs_f64() / total.as_secs_f64());
899    eprintln!("  TOTAL: {:?}", total);
900
901    Ok(result)
902}
903
904/// Encodes an Edit to binary format with zstd compression.
905pub fn encode_edit_compressed(edit: &Edit, level: i32) -> Result<Vec<u8>, EncodeError> {
906    encode_edit_compressed_with_options(edit, level, EncodeOptions::default())
907}
908
909/// Encodes an Edit to binary format with zstd compression and options.
910pub fn encode_edit_compressed_with_options(
911    edit: &Edit,
912    level: i32,
913    options: EncodeOptions,
914) -> Result<Vec<u8>, EncodeError> {
915    let uncompressed = encode_edit_with_options(edit, options)?;
916
917    let compressed = zstd::encode_all(uncompressed.as_slice(), level)
918        .map_err(|e| EncodeError::CompressionFailed(e.to_string()))?;
919
920    let mut writer = Writer::with_capacity(5 + 10 + compressed.len());
921    writer.write_bytes(MAGIC_COMPRESSED);
922    writer.write_varint(uncompressed.len() as u64);
923    writer.write_bytes(&compressed);
924
925    Ok(writer.into_bytes())
926}
927
#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::{CreateEntity, PropertyValue, Value};

    // ---------------------------------------------------------------------
    // Fixtures
    // ---------------------------------------------------------------------

    /// Builds a text `PropertyValue` for `property` with an optional language.
    fn text_value(
        property: [u8; 16],
        text: &str,
        language: Option<[u8; 16]>,
    ) -> PropertyValue<'static> {
        PropertyValue {
            property,
            value: Value::Text {
                value: Cow::Owned(text.to_string()),
                language,
            },
        }
    }

    /// Builds a unit-less `Int64` `PropertyValue` for `property`.
    fn int_value(property: [u8; 16], value: i64) -> PropertyValue<'static> {
        PropertyValue {
            property,
            value: Value::Int64 { value, unit: None },
        }
    }

    /// Builds an edit named "Test" (no authors, `created_at == 0`) holding a
    /// single `CreateEntity` op with the given values.
    fn entity_edit(
        edit_id: [u8; 16],
        entity_id: [u8; 16],
        values: Vec<PropertyValue<'static>>,
    ) -> Edit<'static> {
        Edit {
            id: edit_id,
            name: Cow::Owned("Test".to_string()),
            authors: vec![],
            created_at: 0,
            ops: vec![Op::CreateEntity(CreateEntity {
                id: entity_id,
                values,
                context: None,
            })],
        }
    }

    /// The standard fixture: one author, one entity, one text value.
    fn make_test_edit() -> Edit<'static> {
        Edit {
            id: [1u8; 16],
            name: Cow::Owned("Test Edit".to_string()),
            authors: vec![[2u8; 16]],
            created_at: 1234567890,
            ops: vec![Op::CreateEntity(CreateEntity {
                id: [3u8; 16],
                values: vec![text_value([10u8; 16], "Hello", None)],
                context: None,
            })],
        }
    }

    /// Asserts that the header fields and the op count survived a roundtrip.
    #[track_caller]
    fn assert_header_roundtrip(original: &Edit<'_>, decoded: &Edit<'_>) {
        assert_eq!(original.id, decoded.id);
        assert_eq!(original.name, decoded.name);
        assert_eq!(original.authors, decoded.authors);
        assert_eq!(original.created_at, decoded.created_at);
        assert_eq!(original.ops.len(), decoded.ops.len());
    }

    // ---------------------------------------------------------------------
    // Basic encode/decode
    // ---------------------------------------------------------------------

    #[test]
    fn test_edit_roundtrip() {
        let edit = make_test_edit();

        let encoded = encode_edit(&edit).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_header_roundtrip(&edit, &decoded);
    }

    #[test]
    fn test_edit_compressed_roundtrip() {
        let edit = make_test_edit();

        let encoded = encode_edit_compressed(&edit, 3).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_header_roundtrip(&edit, &decoded);
    }

    #[test]
    fn test_compression_magic() {
        let edit = make_test_edit();

        let uncompressed = encode_edit(&edit).unwrap();
        let compressed = encode_edit_compressed(&edit, 3).unwrap();

        assert_eq!(&uncompressed[..4], b"GRC2");
        assert_eq!(&compressed[..5], b"GRC2Z");
    }

    #[test]
    fn test_invalid_magic() {
        let outcome = decode_edit(b"XXXX");
        assert!(matches!(outcome, Err(DecodeError::InvalidMagic { .. })));
    }

    #[test]
    fn test_unsupported_version() {
        // Valid magic, a version byte of 99, then padding so the decoder does
        // not hit EOF first.
        let mut data = MAGIC_UNCOMPRESSED.to_vec();
        data.push(99);
        data.extend_from_slice(&[0u8; 100]);

        let outcome = decode_edit(&data);
        assert!(matches!(
            outcome,
            Err(DecodeError::UnsupportedVersion { version: 99 })
        ));
    }

    #[test]
    fn test_empty_edit() {
        let edit: Edit<'static> = Edit {
            id: [0u8; 16],
            name: Cow::Borrowed(""),
            authors: vec![],
            created_at: 0,
            ops: vec![],
        };

        let encoded = encode_edit(&edit).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_eq!(edit.id, decoded.id);
        assert!(decoded.name.is_empty());
        assert!(decoded.authors.is_empty());
        assert!(decoded.ops.is_empty());
    }

    // ---------------------------------------------------------------------
    // Canonical encoding
    // ---------------------------------------------------------------------

    #[test]
    fn test_canonical_encoding_deterministic() {
        // The same two values, supplied in opposite orders, must yield the
        // same leading bytes under canonical encoding.
        let prop_a = [0x0A; 16]; // Sorts first lexicographically
        let prop_b = [0x0B; 16]; // Sorts second

        let hello = || text_value(prop_a, "Hello", None);
        let answer = || int_value(prop_b, 42);

        let edit1 = entity_edit([1u8; 16], [3u8; 16], vec![hello(), answer()]);
        let edit2 = entity_edit([1u8; 16], [3u8; 16], vec![answer(), hello()]);

        // Fast mode only has to succeed here. Its bytes are not compared
        // because dictionary order follows insertion order and may differ.
        let _fast1 = encode_edit_with_options(&edit1, EncodeOptions::new()).unwrap();
        let _fast2 = encode_edit_with_options(&edit2, EncodeOptions::new()).unwrap();

        let canonical1 = encode_edit_with_options(&edit1, EncodeOptions::canonical()).unwrap();
        let canonical2 = encode_edit_with_options(&edit2, EncodeOptions::canonical()).unwrap();

        // Both canonical encodings must decode.
        let decoded1 = decode_edit(&canonical1).unwrap();
        let decoded2 = decode_edit(&canonical2).unwrap();
        assert_eq!(decoded1.id, edit1.id);
        assert_eq!(decoded2.id, edit2.id);

        // Only the first 50 bytes (header plus the start of the dictionary)
        // are compared, not the whole encoding.
        assert_eq!(
            &canonical1[..50],
            &canonical2[..50],
            "Canonical encoding should produce identical dictionary bytes"
        );
    }

    #[test]
    fn test_canonical_encoding_roundtrip() {
        let edit = make_test_edit();

        let encoded = encode_edit_with_options(&edit, EncodeOptions::canonical()).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_header_roundtrip(&edit, &decoded);
    }

    #[test]
    fn test_canonical_encoding_compressed() {
        let edit = make_test_edit();

        let encoded =
            encode_edit_compressed_with_options(&edit, 3, EncodeOptions::canonical()).unwrap();
        let decoded = decode_edit(&encoded).unwrap();

        assert_eq!(edit.id, decoded.id);
        assert_eq!(edit.name, decoded.name);
    }

    #[test]
    fn test_canonical_rejects_duplicate_authors() {
        let author = [1u8; 16];

        let edit: Edit<'static> = Edit {
            id: [0u8; 16],
            name: Cow::Owned("Test".to_string()),
            authors: vec![author, author], // Same author twice
            created_at: 0,
            ops: vec![],
        };

        // Fast mode performs no duplicate check.
        let fast = encode_edit_with_options(&edit, EncodeOptions::new());
        assert!(fast.is_ok());

        // Canonical mode refuses the repeated author.
        let canonical = encode_edit_with_options(&edit, EncodeOptions::canonical());
        assert!(matches!(
            canonical,
            Err(EncodeError::DuplicateAuthor { .. })
        ));
    }

    #[test]
    fn test_canonical_rejects_duplicate_values() {
        let prop = [10u8; 16];

        // Two values for the same (property, language) pair.
        let edit = entity_edit(
            [0u8; 16],
            [1u8; 16],
            vec![
                text_value(prop, "First", None),
                text_value(prop, "Second", None),
            ],
        );

        let outcome = encode_edit_with_options(&edit, EncodeOptions::canonical());
        assert!(matches!(outcome, Err(EncodeError::DuplicateValue { .. })));
    }

    #[test]
    fn test_canonical_allows_different_languages() {
        let prop = [10u8; 16];
        let lang_en = [20u8; 16];
        let lang_es = [21u8; 16];

        // Same property, but each value carries a different language.
        let edit = entity_edit(
            [0u8; 16],
            [1u8; 16],
            vec![
                text_value(prop, "Hello", Some(lang_en)),
                text_value(prop, "Hola", Some(lang_es)),
            ],
        );

        let outcome = encode_edit_with_options(&edit, EncodeOptions::canonical());
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_canonical_sorts_values_deterministically() {
        let prop_a = [0x0A; 16];
        let prop_b = [0x0B; 16];

        // Values are supplied in reverse order (B before A).
        let edit = entity_edit(
            [1u8; 16],
            [3u8; 16],
            vec![int_value(prop_b, 42), text_value(prop_a, "Hello", None)],
        );

        // Encoding the same edit twice must give the same bytes.
        let first = encode_edit_with_options(&edit, EncodeOptions::canonical()).unwrap();
        let second = encode_edit_with_options(&edit, EncodeOptions::canonical()).unwrap();
        assert_eq!(first, second, "Canonical encoding should be deterministic");

        // And the result must still decode.
        let decoded = decode_edit(&first).unwrap();
        assert_eq!(decoded.ops.len(), 1);
    }
}