Skip to main content

prototext_core/serialize/encode_text/
mod.rs

1// SPDX-FileCopyrightText: 2025 - 2026 Frederic Ruget <fred@atlant.is> <fred@s3ns.io> (GitHub: @douzebis)
2// SPDX-FileCopyrightText: 2025 - 2026 Thales Cloud Sécurisé
3//
4// SPDX-License-Identifier: MIT
5
6use crate::helpers::{write_varint_ohb, WT_END_GROUP, WT_LEN, WT_START_GROUP};
7use memchr::memrchr;
8
9mod encode_annotation;
10mod fields;
11mod frame;
12mod placeholder;
13
14#[cfg(test)]
15use encode_annotation::parse_field_decl_into;
16use encode_annotation::{parse_annotation, Ann};
17use fields::{encode_packed_elem, encode_scalar_line, write_tag_ohb_local};
18use frame::Frame;
19use placeholder::{compact, fill_placeholder, write_placeholder};
20
21// ── Helpers: field number and line classification ─────────────────────────────
22
23/// Extract the field number from the LHS of a line and/or annotation.
24///
25/// Precedence: annotation's field_decl (`= N`) > numeric LHS.
26#[inline]
27fn extract_field_number(lhs: &str, ann: &Ann<'_>) -> u64 {
28    if let Some(fn_) = ann.field_number {
29        return fn_;
30    }
31    lhs.trim().parse::<u64>().unwrap_or(0)
32}
33
34/// Split a line into `(value_part, annotation_str)`.
35///
36/// The separator is `  #@ ` (2 spaces + `#` + `@` + space).  We scan right-to-left
37/// so that quoted string values containing `  #@ ` don't confuse the split.
38#[inline]
39fn split_at_annotation(line: &str) -> (&str, &str) {
40    // Find the rightmost "  #@ " separator using SIMD-accelerated memrchr for '#',
41    // then verify the surrounding bytes.  Falls back leftward on false positives
42    // (a bare '#' inside a string value).
43    let b = line.as_bytes();
44    let mut end = b.len();
45    while let Some(p) = memrchr(b'#', &b[..end]) {
46        if p >= 2
47            && b[p - 1] == b' '
48            && b[p - 2] == b' '
49            && p + 2 < b.len()
50            && b[p + 1] == b'@'
51            && b[p + 2] == b' '
52        {
53            // "  #@ " confirmed: field part ends at p-2, annotation starts at p+3
54            return (&line[..p - 2], &line[p + 3..]);
55        }
56        // Also recognize a line whose non-whitespace content starts with "#@ "
57        // (comment-only annotation line, no value token before it).
58        if b[..p].iter().all(|c| *c == b' ' || *c == b'\t')
59            && p + 2 < b.len()
60            && b[p + 1] == b'@'
61            && b[p + 2] == b' '
62        {
63            return ("", &line[p + 3..]);
64        }
65        end = p; // keep searching leftward
66    }
67    (line, "")
68}
69
70// ── Public entry point ────────────────────────────────────────────────────────
71
72/// Decode a textual prototext byte string directly to binary wire bytes.
73///
74/// Input must start with `b"#@ prototext:"`.
75/// The line-by-line format must have been produced with `include_annotations=true`
76/// (the annotation comment on each line is required to reconstruct field numbers
77/// and types when field names are used on the LHS).
78///
79/// Implements Proposal F — Strategy C2 for MESSAGE frames.
80pub fn encode_text_to_binary(text: &[u8]) -> Vec<u8> {
81    let capacity = (text.len() / 6).max(64);
82    let mut out = Vec::with_capacity(capacity);
83
84    let mut stack: Vec<Frame> = Vec::new();
85    let mut first_placeholder: Option<usize> = None;
86    let mut last_placeholder: Option<usize> = None;
87
88    // ── Per-line packed state ─────────────────────────────────────────────────
89    // When non-None, we are buffering elements for a per-line packed record.
90    // `packed_field_number`: the field number of the active record.
91    // `packed_tag_ohb`: tag overhang for the record's wire tag.
92    // `packed_len_ohb`: length overhang for the record's LEN prefix.
93    // `packed_remaining`: how many more element lines to consume.
94    // `packed_payload`: accumulated payload bytes.
95    let mut packed_field_number: u64 = 0;
96    let mut packed_tag_ohb: Option<u64> = None;
97    let mut packed_len_ohb: Option<u64> = None;
98    let mut packed_remaining: usize = 0;
99    let mut packed_payload: Vec<u8> = Vec::new();
100
101    // The text is always valid ASCII (a subset of UTF-8).
102    let text_str = match std::str::from_utf8(text) {
103        Ok(s) => s,
104        Err(_) => return out,
105    };
106
107    let mut lines = text_str.lines();
108
109    // Skip the first line: "#@ prototext: protoc"
110    lines.next();
111
112    for line in lines {
113        let line = line.trim_end(); // strip trailing CR/spaces
114
115        if line.is_empty() {
116            continue;
117        }
118
119        // ── Close brace ───────────────────────────────────────────────────────
120        //
121        // Brace-folding may place multiple `}` on one line, separated by spaces
122        // (e.g. `}}` for indent_size=1, `} } }` for indent_size=2).  A close-
123        // brace line consists solely of `}` and space characters after the
124        // leading indentation.  Walk the trimmed line byte-by-byte and pop the
125        // stack once per `}` found.
126
127        let trimmed = line.trim_start();
128        if !trimmed.is_empty() && trimmed.bytes().all(|b| b == b'}' || b == b' ') {
129            for b in trimmed.bytes() {
130                if b == b'}' {
131                    match stack.pop() {
132                        Some(Frame::Message {
133                            placeholder_start,
134                            ohb,
135                            content_start,
136                            acw,
137                        }) => {
138                            let total_waste = fill_placeholder(
139                                &mut out,
140                                placeholder_start,
141                                ohb,
142                                content_start,
143                                acw,
144                            );
145                            // Propagate waste to parent frame.
146                            if let Some(parent) = stack.last_mut() {
147                                *parent.acw_mut() += total_waste;
148                            }
149                        }
150                        Some(Frame::Group {
151                            field_number,
152                            open_ended,
153                            mismatched_end,
154                            end_tag_ohb,
155                            acw,
156                        }) => {
157                            if !open_ended {
158                                let end_fn = mismatched_end.unwrap_or(field_number);
159                                write_tag_ohb_local(end_fn, WT_END_GROUP, end_tag_ohb, &mut out);
160                            }
161                            // Propagate accumulated waste from inner MESSAGE placeholders.
162                            if acw > 0 {
163                                if let Some(parent) = stack.last_mut() {
164                                    *parent.acw_mut() += acw;
165                                }
166                            }
167                        }
168                        None => { /* unmatched `}` — ignore */ }
169                    }
170                }
171            }
172            continue;
173        }
174
175        // Split value part from annotation.
176        let (value_part, ann_str) = split_at_annotation(line);
177
178        // ── Open brace ────────────────────────────────────────────────────────
179
180        // Detect `name {` (possibly indented, before the annotation).
181        let vp_trimmed = value_part.trim_end();
182        let is_open_brace = vp_trimmed.ends_with(" {") || vp_trimmed == "{";
183
184        if is_open_brace {
185            let ann = parse_annotation(ann_str);
186
187            // Extract the field name (LHS of `name {`).
188            let lhs = vp_trimmed.trim_start().trim_end_matches('{').trim_end();
189
190            let field_number = extract_field_number(lhs, &ann);
191            let tag_ohb = ann.tag_overhang_count;
192
193            if ann.wire_type == "group" {
194                write_tag_ohb_local(field_number, WT_START_GROUP, tag_ohb, &mut out);
195                stack.push(Frame::Group {
196                    field_number,
197                    open_ended: ann.open_ended_group,
198                    mismatched_end: ann.mismatched_group_end,
199                    end_tag_ohb: ann.end_tag_overhang_count,
200                    acw: 0,
201                });
202            } else {
203                // MESSAGE (wire type BYTES or unspecified).
204                write_tag_ohb_local(field_number, WT_LEN, tag_ohb, &mut out);
205                let ohb = ann.length_overhang_count.unwrap_or(0) as usize;
206                let (ph_start, content_start) =
207                    write_placeholder(&mut out, ohb, &mut first_placeholder, &mut last_placeholder);
208                stack.push(Frame::Message {
209                    placeholder_start: ph_start,
210                    ohb,
211                    content_start,
212                    acw: 0,
213                });
214            }
215            continue;
216        }
217
218        // ── Scalar field line ─────────────────────────────────────────────────
219
220        // Detect a comment-only annotation line (no LHS colon, starts with `#@ `).
221        // This is used for empty packed records: `pack_size: 0`.
222        let trimmed_vp = value_part.trim();
223        if trimmed_vp.is_empty() && !ann_str.is_empty() {
224            // Comment-only line — parse annotation to handle pack_size: 0.
225            let ann = parse_annotation(ann_str);
226            if let Some(0) = ann.pack_size {
227                // Empty packed record: emit tag + len=0.
228                write_tag_ohb_local(
229                    ann.field_number.unwrap_or(0),
230                    WT_LEN,
231                    ann.tag_overhang_count,
232                    &mut out,
233                );
234                write_varint_ohb(0, ann.length_overhang_count, &mut out);
235            }
236            continue;
237        }
238
239        // Find the colon separating LHS from value.
240        let Some(colon_pos) = value_part.find(':') else {
241            continue;
242        };
243        let lhs = value_part[..colon_pos].trim_start(); // may be indented
244        let value_str = value_part[colon_pos + 1..].trim();
245
246        let ann = parse_annotation(ann_str);
247        let field_number = extract_field_number(lhs, &ann);
248
249        // ── Per-line packed: continuation element ─────────────────────────────
250        if packed_remaining > 0 {
251            encode_packed_elem(value_str, &ann, &mut packed_payload);
252            packed_remaining -= 1;
253            if packed_remaining == 0 {
254                // Flush the completed wire record.
255                write_tag_ohb_local(packed_field_number, WT_LEN, packed_tag_ohb, &mut out);
256                write_varint_ohb(packed_payload.len() as u64, packed_len_ohb, &mut out);
257                out.extend_from_slice(&packed_payload);
258                packed_payload.clear();
259            }
260            continue;
261        }
262
263        // ── Per-line packed: first element (pack_size: N) ─────────────────────
264        if ann.is_packed {
265            if let Some(n) = ann.pack_size {
266                if n == 0 {
267                    // Empty record — emit immediately.
268                    write_tag_ohb_local(field_number, WT_LEN, ann.tag_overhang_count, &mut out);
269                    write_varint_ohb(0, ann.length_overhang_count, &mut out);
270                } else {
271                    // Start buffering.
272                    packed_field_number = field_number;
273                    packed_tag_ohb = ann.tag_overhang_count;
274                    packed_len_ohb = ann.length_overhang_count;
275                    packed_remaining = n - 1; // this line is element 0
276                    packed_payload.clear();
277                    encode_packed_elem(value_str, &ann, &mut packed_payload);
278                    if packed_remaining == 0 {
279                        // Single-element record — flush immediately.
280                        write_tag_ohb_local(packed_field_number, WT_LEN, packed_tag_ohb, &mut out);
281                        write_varint_ohb(packed_payload.len() as u64, packed_len_ohb, &mut out);
282                        out.extend_from_slice(&packed_payload);
283                        packed_payload.clear();
284                    }
285                }
286                continue;
287            }
288        }
289
290        encode_scalar_line(field_number, value_str, &ann, &mut out);
291    }
292
293    // ── Forward compaction pass ───────────────────────────────────────────────
294
295    if let Some(first_ph) = first_placeholder {
296        compact(&mut out, first_ph);
297    }
298
299    // Development instrumentation — size ratio
300    #[cfg(debug_assertions)]
301    {
302        let ratio = out.len() as f64 / text.len().max(1) as f64;
303        eprintln!(
304            "[encode_text] input_len={} output_len={} ratio={:.2}",
305            text.len(),
306            out.len(),
307            ratio
308        );
309    }
310
311    out
312}
313
314// ── Unit tests ────────────────────────────────────────────────────────────────
315
#[cfg(test)]
mod tests {
    use super::*;

    // ── split_at_annotation ───────────────────────────────────────────────────

    #[test]
    fn split_bare() {
        let (field, ann) = split_at_annotation("name: 42");
        assert_eq!(field, "name: 42");
        assert_eq!(ann, "");
    }

    #[test]
    fn split_hash_at_space() {
        let (field, ann) = split_at_annotation("name: 42  #@ varint = 1");
        assert_eq!(field, "name: 42");
        assert_eq!(ann, "varint = 1");
    }

    #[test]
    fn split_hash_only() {
        // Bare '#' without '@': not a separator.
        let (field, ann) = split_at_annotation("name: 42  #");
        assert_eq!(field, "name: 42  #");
        assert_eq!(ann, "");
    }

    #[test]
    fn split_hash_at_end() {
        // "#@" at end with no space after '@': not a separator.
        let (field, ann) = split_at_annotation("name: 42  #@");
        assert_eq!(field, "name: 42  #@");
        assert_eq!(ann, "");
    }

    #[test]
    fn split_hash_at_no_space() {
        // "#@x" — '@' not followed by space: not a separator.
        let (field, ann) = split_at_annotation("name: 42  #@x");
        assert_eq!(field, "name: 42  #@x");
        assert_eq!(ann, "");
    }

    #[test]
    fn split_comment_only_line() {
        // "#@ " preceded by nothing or by indentation only: empty value part.
        assert_eq!(split_at_annotation("#@ pack_size: 0"), ("", "pack_size: 0"));
        assert_eq!(split_at_annotation("\t#@ pack_size: 0"), ("", "pack_size: 0"));
        assert_eq!(split_at_annotation("  #@ pack_size: 0"), ("", "pack_size: 0"));
    }

    #[test]
    fn split_falls_back_past_bare_hash() {
        // The rightmost '#' is not a separator; the scan must continue leftward.
        let (field, ann) = split_at_annotation("name: 1  #@ varint = 1; note #2");
        assert_eq!(field, "name: 1");
        assert_eq!(ann, "varint = 1; note #2");
    }

    #[test]
    fn split_bare_hash_in_value_without_annotation() {
        let (field, ann) = split_at_annotation("name: \"a#b\"");
        assert_eq!(field, "name: \"a#b\"");
        assert_eq!(ann, "");
    }

    #[test]
    fn split_picks_rightmost_separator() {
        // A "  #@ " inside the quoted value must not win over the real one.
        let (field, ann) = split_at_annotation("s: \"x  #@ y\"  #@ string = 1");
        assert_eq!(field, "s: \"x  #@ y\"");
        assert_eq!(ann, "string = 1");
    }

    // ── parse_field_decl_into — enum suffix forms ─────────────────────────────

    /// Build an `Ann` with every field at its "nothing parsed yet" value.
    fn make_ann() -> Ann<'static> {
        Ann {
            wire_type: "",
            field_type: "",
            field_number: None,
            is_packed: false,
            tag_overhang_count: None,
            value_overhang_count: None,
            length_overhang_count: None,
            missing_bytes_count: None,
            mismatched_group_end: None,
            open_ended_group: false,
            end_tag_overhang_count: None,
            records_overhung_count: vec![],
            neg_int32_truncated: false,
            records_neg_int32_truncated: vec![],
            enum_scalar_value: None,
            enum_packed_values: vec![],
            nan_bits: None,
            pack_size: None,
            elem_ohb: None,
            elem_neg_trunc: false,
        }
    }

    #[test]
    fn parse_scalar_enum() {
        let mut ann = make_ann();
        parse_field_decl_into("Type(9) = 5", &mut ann);
        assert_eq!(ann.field_type, "enum");
        assert_eq!(ann.enum_scalar_value, Some(9));
        assert_eq!(ann.field_number, Some(5));
    }

    #[test]
    fn parse_scalar_enum_neg() {
        let mut ann = make_ann();
        parse_field_decl_into("Color(-1) = 3", &mut ann);
        assert_eq!(ann.field_type, "enum");
        assert_eq!(ann.enum_scalar_value, Some(-1));
        assert_eq!(ann.field_number, Some(3));
    }

    #[test]
    fn parse_packed_enum() {
        let mut ann = make_ann();
        parse_field_decl_into("Label([1, 2, 3]) [packed=true] = 4", &mut ann);
        assert_eq!(ann.field_type, "enum");
        assert!(ann.is_packed);
        assert_eq!(ann.enum_packed_values, vec![1, 2, 3]);
        assert_eq!(ann.field_number, Some(4));
    }

    #[test]
    fn parse_primitive_int32() {
        let mut ann = make_ann();
        parse_field_decl_into("int32 = 25", &mut ann);
        assert_eq!(ann.field_type, "int32");
        assert_eq!(ann.field_number, Some(25));
        assert_eq!(ann.enum_scalar_value, None);
    }

    #[test]
    fn parse_enum_named_float() {
        // Latent-bug regression (spec 0004 §5.1): an enum whose type name
        // collides with the 'float' primitive must route to varint, not fixed32.
        let mut ann = make_ann();
        parse_field_decl_into("float(1) = 1", &mut ann);
        assert_eq!(
            ann.field_type, "enum",
            "enum named 'float' must set field_type='enum', not 'float'"
        );
        assert_eq!(ann.enum_scalar_value, Some(1));
    }

    // ── ENUM_UNKNOWN silencing ────────────────────────────────────────────────

    #[test]
    fn enum_unknown_encodes_correctly() {
        // A field annotated with ENUM_UNKNOWN must encode the varint from the
        // annotation's EnumType(N) suffix, not fail or produce wrong bytes.
        // Field 1, value 99 → tag 0x08 (field=1, wire=varint), varint 0x63.
        let input = b"#@ prototext: protoc\nkind: 99  #@ Type(99) = 1; ENUM_UNKNOWN\n";
        let wire = encode_text_to_binary(input);
        assert_eq!(
            wire,
            vec![0x08, 0x63],
            "ENUM_UNKNOWN field 1 value 99: expected [0x08, 0x63]"
        );
    }

    // ── encode_text_to_binary — early-exit paths ──────────────────────────────

    #[test]
    fn invalid_utf8_yields_empty_output() {
        // Non-UTF-8 input is rejected by returning an empty vector.
        let wire = encode_text_to_binary(&[0xff, 0xfe, 0xfd]);
        assert!(wire.is_empty());
    }

    #[test]
    fn header_only_yields_empty_output() {
        // Nothing after the header line: no records are emitted.
        assert!(encode_text_to_binary(b"#@ prototext: protoc\n").is_empty());
        assert!(encode_text_to_binary(b"").is_empty());
    }
}