Skip to main content

shape_runtime/stdlib/
csv_module.rs

1//! Native `csv` module for CSV parsing and serialization.
2//!
3//! Phase 2d Array cluster migration: `parse`, `stringify`, `read_file`,
4//! and `is_valid` ported to the typed marshal layer using
5//! `TypedArrayData::String` (rows of strings) inside
6//! `the-deleted-heterogeneous-element-carrier` (array of rows).
7//!
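//! Stage C HashMap-marshal P1(b) activation (2026-05-07): `parse_records`
//! and `stringify_records` were activated using the
//! `HeapValue::HashMap(HashMapData)` variant. Each record was an
//! `Arc<HeapValue::HashMap>` carrying string keys (header row) → string
//! values (record fields), with insertion order preserved via the
//! eager-bucket-only HashMapData buffer pair.
//!
//! Since the W17-out-of-bundle-A-followups rewire (2026-05-12),
//! `parse_records` emits `HeapValue::TypedObject` records (schema derived
//! from the header row) instead, and `stringify_records` accepts both
//! TypedObject records and legacy HashMap records.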
13//!
14//! Tests deferred — ValueWord-based test fixtures can't compile and
15//! aren't reconstructed until the shape-vm cascade provides a typed
16//! test harness, mirroring the file_ops migration in commit d716482.
17
18use crate::marshal::{register_typed_fn_1, register_typed_fn_2_full};
19use crate::module_exports::{ModuleExports, ModuleParam};
20use crate::type_schema::register_predeclared_any_schema;
21use crate::typed_module_exports::{ConcreteReturn, ConcreteType, TypedReturn};
22use shape_value::heap_value::{HeapValue, TypedObjectStorage};
23use shape_value::{NativeKind, ValueSlot};
24use std::sync::Arc;
25
26// W17-out-of-bundle-A-followups (2026-05-12): `row_to_heap` was the
27// per-row `Arc<HeapValue::TypedArray(TypedArrayData::String)>` builder
28// for the pre-rewire `csv.parse` / `csv.read_file` `Array<Array<string>>`
29// shape. Both now surface-and-stop pending the
30// W17-typed-carrier-array-typedarray follow-up; the helper is removed
31// alongside its construction call sites.
32
33/// Read a `Vec<Vec<String>>` from a `Vec<Arc<HeapValue>>` whose elements
34/// are each the deleted outer typed-array arm.
35///
36/// V3-S5 ckpt-5-prime²c (2026-05-15) SURFACE-AND-STOP: this consumer
37/// pattern-matched `HeapValue::TypedArray(Arc<TypedArrayData>)` to extract
38/// the per-row `Vec<String>`. Both the outer arm and the inner
39/// `TypedArrayData::String` shape are deleted (V3-S5 ckpt-1/ckpt-4/ckpt-5)
40/// — the per-row carrier is now a `*mut TypedArray<*const StringObj>` raw
41/// pointer with no `HeapValue::*` wrapper, so `Vec<Arc<HeapValue>>` cannot
42/// express it. Pairs with the Round 2 `Vec<Arc<HeapValue>>` rewire
43/// follow-up at `marshal.rs:FromSlot<Vec<Arc<HeapValue>>>` and the
44/// `from_typed_array_<T>` constructor wave at `slot.rs:142`.
45fn rows_from_heap_array(
46    rows: &[Arc<HeapValue>],
47    fn_name: &str,
48) -> Result<Vec<Vec<String>>, String> {
49    let _ = rows;
50    Err(format!(
51        "{}: V3-S5 ckpt-5-prime²c SURFACE — per-row outer-array-arm \
52         consumer needs Vec<Arc<HeapValue>> rewire for the deleted \
53         outer-array-arm. Round 2 follow-up. ADR-006 §2.7.24 Q25.A \
54         SUPERSEDED.",
55        fn_name
56    ))
57}
58
59/// Create the `csv` module with CSV parsing and serialization functions.
60pub fn create_csv_module() -> ModuleExports {
61    let mut module = ModuleExports::new("std::core::csv");
62    module.description = "CSV parsing and serialization".to_string();
63
64    // csv.parse(text: string) -> Array<Array<string>>
65    //
66    // W17-out-of-bundle-A-followups (2026-05-12): surface-and-stop.
67    // `Array<Array<string>>` is homogeneous in
68    // `HeapKind::TypedArray (TypedArrayData::String)` — the natural
69    // Q25.A specialized variant is
70    // `TypedArrayData::TypedArray(Arc<TypedBuffer<Arc<TypedArrayData>>>)`,
71    // but adding a nested-TypedArray variant is out of bundle-A-followups
72    // scope (the prompt forbids new HeapKind variants and an added
73    // TypedArrayData variant cascades through ~40 exhaustive matches).
74    // Users wanting per-record dispatch should use `csv.parse_records`
75    // which lowers to `Array<TypedObject>` via the C+ precedent.
76    register_typed_fn_1::<_, Arc<String>>(
77        &mut module,
78        "parse",
79        "Parse CSV text into an array of rows (each row is an array of strings)",
80        "text",
81        "string",
82        ConcreteType::ArrayHeapValue("Array<Array<string>>".to_string()),
83        |text, _ctx| {
84            let _ = text;
85            // phase-2d-hardening:(f) — csv.parse surface-and-stop:
86            // Array<Array<string>> needs TypedArrayData::TypedArray
87            // (nested-TypedArray) variant. Use csv.parse_records for
88            // per-record TypedObject dispatch in the meantime.
89            Err(format!(
90                "csv.method parse() -> SURFACE — `Array<Array<string>>` needs a \
91                 nested-array variant in ADR-006 \
92                 §2.7.24 Q25.A's spec list. Tracked as \
93                 W17-typed-carrier-array-typedarray follow-up (out of \
94                 bundle-A-followups scope). Use `csv.parse_records` for \
95                 per-record TypedObject access. ADR-006 §2.7.24 Q25.A."
96            ))
97        },
98    );
99
100    // csv.stringify(data: Array<Array<string>>, delimiter?: string) -> string
101    register_typed_fn_2_full::<_, Vec<Arc<HeapValue>>, Arc<String>>(
102        &mut module,
103        "stringify",
104        "Convert an array of rows to a CSV string",
105        [
106            ModuleParam {
107                name: "data".to_string(),
108                type_name: "Array<Array<string>>".to_string(),
109                required: true,
110                description: "Array of rows, each row is an array of field strings".to_string(),
111                ..Default::default()
112            },
113            ModuleParam {
114                name: "delimiter".to_string(),
115                type_name: "string".to_string(),
116                required: false,
117                description: "Field delimiter character (default: comma)".to_string(),
118                default_snippet: Some("\",\"".to_string()),
119                ..Default::default()
120            },
121        ],
122        ConcreteType::String,
123        |data, delimiter, _ctx| {
124            let rows = rows_from_heap_array(&data, "csv.stringify()")?;
125
126            let delim_byte = delimiter
127                .as_bytes()
128                .first()
129                .copied()
130                .unwrap_or(b',');
131
132            let mut writer = csv::WriterBuilder::new()
133                .delimiter(delim_byte)
134                .from_writer(Vec::new());
135
136            for row in &rows {
137                writer
138                    .write_record(row)
139                    .map_err(|e| format!("csv.stringify() failed: {}", e))?;
140            }
141
142            let bytes = writer
143                .into_inner()
144                .map_err(|e| format!("csv.stringify() failed to flush: {}", e))?;
145            let output = String::from_utf8(bytes)
146                .map_err(|e| format!("csv.stringify() UTF-8 error: {}", e))?;
147
148            Ok(TypedReturn::Concrete(ConcreteReturn::String(output)))
149        },
150    );
151
152    // csv.read_file(path: string) -> Result<Array<Array<string>>>
153    //
154    // W17-out-of-bundle-A-followups (2026-05-12): surface-and-stop, same
155    // shape as csv.parse above — `Array<Array<string>>` needs the
156    // nested-TypedArray variant in Q25.A's spec list. Tracked as
157    // W17-typed-carrier-array-typedarray follow-up.
158    register_typed_fn_1::<_, Arc<String>>(
159        &mut module,
160        "read_file",
161        "Read and parse a CSV file into an array of rows",
162        "path",
163        "string",
164        ConcreteType::Result(Box::new(ConcreteType::ArrayHeapValue(
165            "Array<Array<string>>".to_string(),
166        ))),
167        |path, _ctx| {
168            let _ = path;
169            // phase-2d-hardening:(f) — csv.read_file surface-and-stop:
170            // same nested-TypedArray gap as csv.parse.
171            Err(format!(
172                "csv.method read_file() -> SURFACE — `Array<Array<string>>` needs a \
173                 nested-array variant in ADR-006 \
174                 §2.7.24 Q25.A's spec list. Tracked as \
175                 W17-typed-carrier-array-typedarray follow-up. ADR-006 §2.7.24 Q25.A."
176            ))
177        },
178    );
179
180    // csv.is_valid(text: string) -> bool
181    register_typed_fn_1::<_, Arc<String>>(
182        &mut module,
183        "is_valid",
184        "Check if a string is valid CSV",
185        "text",
186        "string",
187        ConcreteType::Bool,
188        |text, _ctx| {
189            let mut reader = csv::ReaderBuilder::new()
190                .has_headers(false)
191                .from_reader(text.as_bytes());
192
193            let valid = reader.records().all(|r| r.is_ok());
194            Ok(TypedReturn::Concrete(ConcreteReturn::Bool(valid)))
195        },
196    );
197
198    // csv.parse_records(text: string) -> Array<{header→string}>
199    //
200    // Parses CSV text using the first row as header keys; each subsequent
201    // row becomes a TypedObject keyed by header column names. Header
202    // order = field order = column order.
203    //
204    // W17-out-of-bundle-A-followups (2026-05-12): per the C+ precedent
205    // recorded in `phase-2d-playbook.md` §3, each record is constructed
206    // as `Arc<HeapValue::TypedObject>` with a schema derived from the
207    // CSV header row. The outer array lowers to
208    // `TypedArrayData::TypedObject` via the marshal-boundary
209    // `build_specialized_from_heap_arcs` dispatch. The pre-rewire
210    // `HashMap<string, string>` shape — which routed through the
211    // deleted `TypedArrayData::HeapValue` carrier — is replaced by the
212    // per-header field schema, which is what user code naturally
213    // addresses (`record.column_name` rather than `record["column_name"]`).
214    //
215    // Schema is auto-registered per unique header set on first
216    // invocation via `register_predeclared_any_schema`. Field types are
217    // all string (csv records carry string-shaped cells); the schema's
218    // `FieldType::Any` annotation is fine because the marshal-boundary
219    // reader does its own kind validation when consumers downstream
220    // read the slots.
221    register_typed_fn_1::<_, Arc<String>>(
222        &mut module,
223        "parse_records",
224        "Parse CSV text using the header row as keys, returning an array of typed records",
225        "text",
226        "string",
227        ConcreteType::ArrayHeapValue("Array<object>".to_string()),
228        |text, _ctx| {
229            let mut reader = csv::ReaderBuilder::new()
230                .has_headers(true)
231                .from_reader(text.as_bytes());
232
233            let headers: Vec<String> = reader
234                .headers()
235                .map_err(|e| format!("csv.parse_records() failed to read headers: {}", e))?
236                .iter()
237                .map(|h| h.to_string())
238                .collect();
239
240            // Auto-register the schema for this header set. The registry
241            // dedupes by field-name list; subsequent CSV files with the
242            // same header columns reuse the same SchemaId.
243            let schema_id = register_predeclared_any_schema(&headers);
244            let field_kinds: Arc<[NativeKind]> = Arc::from(
245                vec![NativeKind::String; headers.len()].into_boxed_slice(),
246            );
247            // Heap mask: every field is a string (heap-resident).
248            let heap_mask: u64 = if headers.len() >= 64 {
249                u64::MAX
250            } else {
251                (1u64 << headers.len()) - 1
252            };
253
254            let mut records: Vec<Arc<HeapValue>> = Vec::new();
255            for result in reader.records() {
256                let record =
257                    result.map_err(|e| format!("csv.parse_records() failed: {}", e))?;
258                let n = headers.len().min(record.len());
259                let mut slots: Vec<ValueSlot> = Vec::with_capacity(headers.len());
260                // Use min(headers, record) length plus pad with empty
261                // string for short rows so the slot count matches the
262                // schema (TypedObjectStorage::new enforces this).
263                for i in 0..headers.len() {
264                    let cell = if i < n {
265                        record.get(i).unwrap_or("").to_string()
266                    } else {
267                        String::new()
268                    };
269                    slots.push(ValueSlot::from_string_arc(Arc::new(cell)));
270                }
271                // Wave 2 Round 4 D4 ckpt-final-prime² (2026-05-14): variant
272                // signature flipped to `HeapValue::TypedObject(TypedObjectPtr)`.
273                // `_new` returns `*mut TypedObjectStorage` with refcount=1; we
274                // wrap it in `TypedObjectPtr` (transferring the share to the
275                // wrapper).
276                let storage = TypedObjectStorage::_new(
277                    schema_id as u64,
278                    slots.into_boxed_slice(),
279                    heap_mask,
280                    Arc::clone(&field_kinds),
281                );
282                records.push(Arc::new(HeapValue::TypedObject(
283                    shape_value::heap_value::TypedObjectPtr::new(storage),
284                )));
285            }
286
287            Ok(TypedReturn::Concrete(ConcreteReturn::ArrayHeapValue(
288                records,
289            )))
290        },
291    );
292
293    // csv.stringify_records(data: Array<HashMap<string, string>>, headers?: Array<string>) -> string
294    //
295    // Serializes an array of HashMap records to CSV. Header order is
296    // either the explicit `headers` argument OR the keys from the first
297    // record (using its HashMapData insertion order — same semantics as
298    // the legacy `from_hashmap_pairs(keys, values)` shape).
299    register_typed_fn_2_full::<_, Vec<Arc<HeapValue>>, Vec<Arc<String>>>(
300        &mut module,
301        "stringify_records",
302        "Convert an array of hashmaps to a CSV string with headers",
303        [
304            ModuleParam {
305                name: "data".to_string(),
306                type_name: "Array<HashMap<string, string>>".to_string(),
307                required: true,
308                description: "Array of records (hashmaps with string keys and values)"
309                    .to_string(),
310                ..Default::default()
311            },
312            ModuleParam {
313                name: "headers".to_string(),
314                type_name: "Array<string>".to_string(),
315                required: false,
316                description: "Explicit header order (default: keys from first record)"
317                    .to_string(),
318                default_snippet: Some("[]".to_string()),
319                ..Default::default()
320            },
321        ],
322        ConcreteType::String,
323        |data, explicit_headers, _ctx| {
324            // W17-out-of-bundle-A-followups (2026-05-12): accept TypedObject
325            // records in addition to legacy HashMap records. parse_records
326            // now emits TypedObjects; round-trip via stringify_records must
327            // therefore read the TypedObject shape. HashMap input remains
328            // supported for users still passing legacy HashMap records.
329            //
330            // Determine header order: explicit argument (if non-empty) or
331            // the first record's keys (TypedObject schema field-order, or
332            // HashMap insertion order).
333            let headers: Vec<String> = if !explicit_headers.is_empty() {
334                explicit_headers.iter().map(|s| (**s).clone()).collect()
335            } else if let Some(first) = data.first() {
336                match &**first {
337                    HeapValue::HashMap(kref) => {
338                        // Wave 2 Round 3b C2-joint ckpt-4 (2026-05-14):
339                        // per-V walk of `*mut TypedArray<*const StringObj>`
340                        // keys. V-agnostic (keys are always string-typed).
341                        let keys_ptr = match kref {
342                            shape_value::heap_value::HashMapKindedRef::I64(arc) => arc.keys,
343                            shape_value::heap_value::HashMapKindedRef::F64(arc) => arc.keys,
344                            shape_value::heap_value::HashMapKindedRef::Bool(arc) => arc.keys,
345                            shape_value::heap_value::HashMapKindedRef::Char(arc) => arc.keys,
346                            shape_value::heap_value::HashMapKindedRef::String(arc) => arc.keys,
347                            shape_value::heap_value::HashMapKindedRef::Decimal(arc) => arc.keys,
348                            shape_value::heap_value::HashMapKindedRef::TypedObject(arc) => arc.keys,
349                            shape_value::heap_value::HashMapKindedRef::TraitObject(arc) => arc.keys,
350                            shape_value::heap_value::HashMapKindedRef::HashMap(arc) => arc.keys,
351                        };
352                        let n = unsafe { shape_value::v2::typed_array::TypedArray::len(keys_ptr) as usize };
353                        (0..n)
354                            .map(|i| unsafe {
355                                let ptr = shape_value::v2::typed_array::TypedArray::get_unchecked(keys_ptr, i as u32);
356                                shape_value::v2::string_obj::StringObj::as_str(ptr).to_owned()
357                            })
358                            .collect()
359                    }
360                    HeapValue::TypedObject(s) => {
361                        let schema = crate::type_schema::lookup_schema_by_id_public(
362                            s.schema_id as u32,
363                        )
364                        .ok_or_else(|| {
365                            format!(
366                                "csv.method stringify_records() -> TypedObject schema id {} \
367                                 not registered",
368                                s.schema_id
369                            )
370                        })?;
371                        schema.fields.iter().map(|f| f.name.clone()).collect()
372                    }
373                    other => {
374                        return Err(format!(
375                            "csv.stringify_records(): each element must be a record \
376                             (HashMap or TypedObject), got {}",
377                            other.type_name()
378                        ));
379                    }
380                }
381            } else {
382                return Ok(TypedReturn::Concrete(ConcreteReturn::String(
383                    String::new(),
384                )));
385            };
386
387            let mut writer = csv::WriterBuilder::new().from_writer(Vec::new());
388            writer
389                .write_record(&headers)
390                .map_err(|e| format!("csv.stringify_records() header write failed: {}", e))?;
391
392            for record_arc in data.iter() {
393                let row: Vec<String> = match &**record_arc {
394                    HeapValue::HashMap(kref) => {
395                        // Wave 2 Round 3b C2-joint ckpt-4 (2026-05-14):
396                        // per-V get(header) → cell extraction. CSV records
397                        // are conventionally HashMap<string, string>
398                        // (V=String); other V variants surface as a
399                        // structured error.
400                        use shape_value::heap_value::HashMapKindedRef;
401                        match kref {
402                            HashMapKindedRef::String(arc) => headers
403                                .iter()
404                                .map(|h| {
405                                    arc.get_index(h.as_str())
406                                        .map(|idx| {
407                                            let ptr: *const shape_value::v2::string_obj::StringObj =
408                                                unsafe { *(*arc.values).data.add(idx) };
409                                            unsafe {
410                                                shape_value::v2::string_obj::StringObj::as_str(ptr).to_owned()
411                                            }
412                                        })
413                                        .unwrap_or_default()
414                                })
415                                .collect(),
416                            other => {
417                                return Err(format!(
418                                    "csv.method stringify_records() -> HashMap records must be \
419                                     HashMap<string, string>, got V={:?}",
420                                    other.values_kind()
421                                ));
422                            }
423                        }
424                    }
425                    HeapValue::TypedObject(storage) => {
426                        let schema = crate::type_schema::lookup_schema_by_id_public(
427                            storage.schema_id as u32,
428                        )
429                        .ok_or_else(|| {
430                            format!(
431                                "csv.method stringify_records() -> TypedObject schema id {} \
432                                 not registered",
433                                storage.schema_id
434                            )
435                        })?;
436                        let mut r = Vec::with_capacity(headers.len());
437                        for header in &headers {
438                            // Resolve header → slot index via the schema's
439                            // field list. Empty cell when the record's
440                            // schema doesn't have the requested header.
441                            let cell = match schema
442                                .fields
443                                .iter()
444                                .position(|f| f.name == *header)
445                            {
446                                Some(idx) if idx < storage.slots.len() => {
447                                    // Slot is a string per parse_records'
448                                    // construction; read via the kind table.
449                                    let bits = storage.slots[idx].raw();
450                                    if bits == 0 {
451                                        String::new()
452                                    } else {
453                                        // SAFETY: parse_records writes each
454                                        // slot via `ValueSlot::from_string_arc`
455                                        // — slot bits = `Arc::into_raw::<String>`.
456                                        // Borrow without releasing the storage's
457                                        // share (which owns the Arc).
458                                        unsafe {
459                                            let arc_ptr = bits as *const String;
460                                            Arc::increment_strong_count(arc_ptr);
461                                            let arc = Arc::from_raw(arc_ptr);
462                                            let owned = (*arc).clone();
463                                            // arc Drop here releases our
464                                            // bumped share; the storage's
465                                            // share is untouched.
466                                            owned
467                                        }
468                                    }
469                                }
470                                _ => String::new(),
471                            };
472                            r.push(cell);
473                        }
474                        r
475                    }
476                    other => {
477                        return Err(format!(
478                            "csv.stringify_records(): each element must be a record \
479                             (HashMap or TypedObject), got {}",
480                            other.type_name()
481                        ));
482                    }
483                };
484                writer
485                    .write_record(&row)
486                    .map_err(|e| format!("csv.stringify_records() row write failed: {}", e))?;
487            }
488
489            let bytes = writer
490                .into_inner()
491                .map_err(|e| format!("csv.stringify_records() flush failed: {}", e))?;
492            let output = String::from_utf8(bytes)
493                .map_err(|e| format!("csv.stringify_records() UTF-8 error: {}", e))?;
494
495            Ok(TypedReturn::Concrete(ConcreteReturn::String(output)))
496        },
497    );
498
499    module
500}