shape_runtime/stdlib/csv_module.rs
1//! Native `csv` module for CSV parsing and serialization.
2//!
3//! Phase 2d Array cluster migration: `parse`, `stringify`, `read_file`,
4//! and `is_valid` ported to the typed marshal layer using
5//! `TypedArrayData::String` (rows of strings) inside
6//! `the-deleted-heterogeneous-element-carrier` (array of rows).
7//!
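//! Stage C HashMap-marshal P1(b) activation (2026-05-07): `parse_records`
//! and `stringify_records` were activated using the
//! `HeapValue::HashMap(HashMapData)` variant. Each record was an
//! `Arc<HeapValue::HashMap>` carrying string keys (header row) → string
//! values (record fields), with insertion order preserved via the
//! eager-bucket-only HashMapData buffer pair.
//!
//! W17-out-of-bundle-A-followups (2026-05-12): `parse_records` now emits
//! `Arc<HeapValue::TypedObject>` records whose schema is derived from the
//! header row, and `stringify_records` accepts both TypedObject records and
//! legacy HashMap records. `parse` and `read_file` currently
//! surface-and-stop with an error.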
13//!
14//! Tests deferred — ValueWord-based test fixtures can't compile and
15//! aren't reconstructed until the shape-vm cascade provides a typed
16//! test harness, mirroring the file_ops migration in commit d716482.
17
18use crate::marshal::{register_typed_fn_1, register_typed_fn_2_full};
19use crate::module_exports::{ModuleExports, ModuleParam};
20use crate::type_schema::register_predeclared_any_schema;
21use crate::typed_module_exports::{ConcreteReturn, ConcreteType, TypedReturn};
22use shape_value::heap_value::{HeapValue, TypedObjectStorage};
23use shape_value::{NativeKind, ValueSlot};
24use std::sync::Arc;
25
26// W17-out-of-bundle-A-followups (2026-05-12): `row_to_heap` was the
27// per-row `Arc<HeapValue::TypedArray(TypedArrayData::String)>` builder
28// for the pre-rewire `csv.parse` / `csv.read_file` `Array<Array<string>>`
29// shape. Both now surface-and-stop pending the
30// W17-typed-carrier-array-typedarray follow-up; the helper is removed
31// alongside its construction call sites.
32
33/// Read a `Vec<Vec<String>>` from a `Vec<Arc<HeapValue>>` whose elements
34/// are each the deleted outer typed-array arm.
35///
36/// V3-S5 ckpt-5-prime²c (2026-05-15) SURFACE-AND-STOP: this consumer
37/// pattern-matched `HeapValue::TypedArray(Arc<TypedArrayData>)` to extract
38/// the per-row `Vec<String>`. Both the outer arm and the inner
39/// `TypedArrayData::String` shape are deleted (V3-S5 ckpt-1/ckpt-4/ckpt-5)
40/// — the per-row carrier is now a `*mut TypedArray<*const StringObj>` raw
41/// pointer with no `HeapValue::*` wrapper, so `Vec<Arc<HeapValue>>` cannot
42/// express it. Pairs with the Round 2 `Vec<Arc<HeapValue>>` rewire
43/// follow-up at `marshal.rs:FromSlot<Vec<Arc<HeapValue>>>` and the
44/// `from_typed_array_<T>` constructor wave at `slot.rs:142`.
45fn rows_from_heap_array(
46 rows: &[Arc<HeapValue>],
47 fn_name: &str,
48) -> Result<Vec<Vec<String>>, String> {
49 let _ = rows;
50 Err(format!(
51 "{}: V3-S5 ckpt-5-prime²c SURFACE — per-row outer-array-arm \
52 consumer needs Vec<Arc<HeapValue>> rewire for the deleted \
53 outer-array-arm. Round 2 follow-up. ADR-006 §2.7.24 Q25.A \
54 SUPERSEDED.",
55 fn_name
56 ))
57}
58
59/// Create the `csv` module with CSV parsing and serialization functions.
60pub fn create_csv_module() -> ModuleExports {
61 let mut module = ModuleExports::new("std::core::csv");
62 module.description = "CSV parsing and serialization".to_string();
63
64 // csv.parse(text: string) -> Array<Array<string>>
65 //
66 // W17-out-of-bundle-A-followups (2026-05-12): surface-and-stop.
67 // `Array<Array<string>>` is homogeneous in
68 // `HeapKind::TypedArray (TypedArrayData::String)` — the natural
69 // Q25.A specialized variant is
70 // `TypedArrayData::TypedArray(Arc<TypedBuffer<Arc<TypedArrayData>>>)`,
71 // but adding a nested-TypedArray variant is out of bundle-A-followups
72 // scope (the prompt forbids new HeapKind variants and an added
73 // TypedArrayData variant cascades through ~40 exhaustive matches).
74 // Users wanting per-record dispatch should use `csv.parse_records`
75 // which lowers to `Array<TypedObject>` via the C+ precedent.
76 register_typed_fn_1::<_, Arc<String>>(
77 &mut module,
78 "parse",
79 "Parse CSV text into an array of rows (each row is an array of strings)",
80 "text",
81 "string",
82 ConcreteType::ArrayHeapValue("Array<Array<string>>".to_string()),
83 |text, _ctx| {
84 let _ = text;
85 // phase-2d-hardening:(f) — csv.parse surface-and-stop:
86 // Array<Array<string>> needs TypedArrayData::TypedArray
87 // (nested-TypedArray) variant. Use csv.parse_records for
88 // per-record TypedObject dispatch in the meantime.
89 Err(format!(
90 "csv.method parse() -> SURFACE — `Array<Array<string>>` needs a \
91 nested-array variant in ADR-006 \
92 §2.7.24 Q25.A's spec list. Tracked as \
93 W17-typed-carrier-array-typedarray follow-up (out of \
94 bundle-A-followups scope). Use `csv.parse_records` for \
95 per-record TypedObject access. ADR-006 §2.7.24 Q25.A."
96 ))
97 },
98 );
99
100 // csv.stringify(data: Array<Array<string>>, delimiter?: string) -> string
101 register_typed_fn_2_full::<_, Vec<Arc<HeapValue>>, Arc<String>>(
102 &mut module,
103 "stringify",
104 "Convert an array of rows to a CSV string",
105 [
106 ModuleParam {
107 name: "data".to_string(),
108 type_name: "Array<Array<string>>".to_string(),
109 required: true,
110 description: "Array of rows, each row is an array of field strings".to_string(),
111 ..Default::default()
112 },
113 ModuleParam {
114 name: "delimiter".to_string(),
115 type_name: "string".to_string(),
116 required: false,
117 description: "Field delimiter character (default: comma)".to_string(),
118 default_snippet: Some("\",\"".to_string()),
119 ..Default::default()
120 },
121 ],
122 ConcreteType::String,
123 |data, delimiter, _ctx| {
124 let rows = rows_from_heap_array(&data, "csv.stringify()")?;
125
126 let delim_byte = delimiter
127 .as_bytes()
128 .first()
129 .copied()
130 .unwrap_or(b',');
131
132 let mut writer = csv::WriterBuilder::new()
133 .delimiter(delim_byte)
134 .from_writer(Vec::new());
135
136 for row in &rows {
137 writer
138 .write_record(row)
139 .map_err(|e| format!("csv.stringify() failed: {}", e))?;
140 }
141
142 let bytes = writer
143 .into_inner()
144 .map_err(|e| format!("csv.stringify() failed to flush: {}", e))?;
145 let output = String::from_utf8(bytes)
146 .map_err(|e| format!("csv.stringify() UTF-8 error: {}", e))?;
147
148 Ok(TypedReturn::Concrete(ConcreteReturn::String(output)))
149 },
150 );
151
152 // csv.read_file(path: string) -> Result<Array<Array<string>>>
153 //
154 // W17-out-of-bundle-A-followups (2026-05-12): surface-and-stop, same
155 // shape as csv.parse above — `Array<Array<string>>` needs the
156 // nested-TypedArray variant in Q25.A's spec list. Tracked as
157 // W17-typed-carrier-array-typedarray follow-up.
158 register_typed_fn_1::<_, Arc<String>>(
159 &mut module,
160 "read_file",
161 "Read and parse a CSV file into an array of rows",
162 "path",
163 "string",
164 ConcreteType::Result(Box::new(ConcreteType::ArrayHeapValue(
165 "Array<Array<string>>".to_string(),
166 ))),
167 |path, _ctx| {
168 let _ = path;
169 // phase-2d-hardening:(f) — csv.read_file surface-and-stop:
170 // same nested-TypedArray gap as csv.parse.
171 Err(format!(
172 "csv.method read_file() -> SURFACE — `Array<Array<string>>` needs a \
173 nested-array variant in ADR-006 \
174 §2.7.24 Q25.A's spec list. Tracked as \
175 W17-typed-carrier-array-typedarray follow-up. ADR-006 §2.7.24 Q25.A."
176 ))
177 },
178 );
179
180 // csv.is_valid(text: string) -> bool
181 register_typed_fn_1::<_, Arc<String>>(
182 &mut module,
183 "is_valid",
184 "Check if a string is valid CSV",
185 "text",
186 "string",
187 ConcreteType::Bool,
188 |text, _ctx| {
189 let mut reader = csv::ReaderBuilder::new()
190 .has_headers(false)
191 .from_reader(text.as_bytes());
192
193 let valid = reader.records().all(|r| r.is_ok());
194 Ok(TypedReturn::Concrete(ConcreteReturn::Bool(valid)))
195 },
196 );
197
198 // csv.parse_records(text: string) -> Array<{header→string}>
199 //
200 // Parses CSV text using the first row as header keys; each subsequent
201 // row becomes a TypedObject keyed by header column names. Header
202 // order = field order = column order.
203 //
204 // W17-out-of-bundle-A-followups (2026-05-12): per the C+ precedent
205 // recorded in `phase-2d-playbook.md` §3, each record is constructed
206 // as `Arc<HeapValue::TypedObject>` with a schema derived from the
207 // CSV header row. The outer array lowers to
208 // `TypedArrayData::TypedObject` via the marshal-boundary
209 // `build_specialized_from_heap_arcs` dispatch. The pre-rewire
210 // `HashMap<string, string>` shape — which routed through the
211 // deleted `TypedArrayData::HeapValue` carrier — is replaced by the
212 // per-header field schema, which is what user code naturally
213 // addresses (`record.column_name` rather than `record["column_name"]`).
214 //
215 // Schema is auto-registered per unique header set on first
216 // invocation via `register_predeclared_any_schema`. Field types are
217 // all string (csv records carry string-shaped cells); the schema's
218 // `FieldType::Any` annotation is fine because the marshal-boundary
219 // reader does its own kind validation when consumers downstream
220 // read the slots.
221 register_typed_fn_1::<_, Arc<String>>(
222 &mut module,
223 "parse_records",
224 "Parse CSV text using the header row as keys, returning an array of typed records",
225 "text",
226 "string",
227 ConcreteType::ArrayHeapValue("Array<object>".to_string()),
228 |text, _ctx| {
229 let mut reader = csv::ReaderBuilder::new()
230 .has_headers(true)
231 .from_reader(text.as_bytes());
232
233 let headers: Vec<String> = reader
234 .headers()
235 .map_err(|e| format!("csv.parse_records() failed to read headers: {}", e))?
236 .iter()
237 .map(|h| h.to_string())
238 .collect();
239
240 // Auto-register the schema for this header set. The registry
241 // dedupes by field-name list; subsequent CSV files with the
242 // same header columns reuse the same SchemaId.
243 let schema_id = register_predeclared_any_schema(&headers);
244 let field_kinds: Arc<[NativeKind]> = Arc::from(
245 vec![NativeKind::String; headers.len()].into_boxed_slice(),
246 );
247 // Heap mask: every field is a string (heap-resident).
248 let heap_mask: u64 = if headers.len() >= 64 {
249 u64::MAX
250 } else {
251 (1u64 << headers.len()) - 1
252 };
253
254 let mut records: Vec<Arc<HeapValue>> = Vec::new();
255 for result in reader.records() {
256 let record =
257 result.map_err(|e| format!("csv.parse_records() failed: {}", e))?;
258 let n = headers.len().min(record.len());
259 let mut slots: Vec<ValueSlot> = Vec::with_capacity(headers.len());
260 // Use min(headers, record) length plus pad with empty
261 // string for short rows so the slot count matches the
262 // schema (TypedObjectStorage::new enforces this).
263 for i in 0..headers.len() {
264 let cell = if i < n {
265 record.get(i).unwrap_or("").to_string()
266 } else {
267 String::new()
268 };
269 slots.push(ValueSlot::from_string_arc(Arc::new(cell)));
270 }
271 // Wave 2 Round 4 D4 ckpt-final-prime² (2026-05-14): variant
272 // signature flipped to `HeapValue::TypedObject(TypedObjectPtr)`.
273 // `_new` returns `*mut TypedObjectStorage` with refcount=1; we
274 // wrap it in `TypedObjectPtr` (transferring the share to the
275 // wrapper).
276 let storage = TypedObjectStorage::_new(
277 schema_id as u64,
278 slots.into_boxed_slice(),
279 heap_mask,
280 Arc::clone(&field_kinds),
281 );
282 records.push(Arc::new(HeapValue::TypedObject(
283 shape_value::heap_value::TypedObjectPtr::new(storage),
284 )));
285 }
286
287 Ok(TypedReturn::Concrete(ConcreteReturn::ArrayHeapValue(
288 records,
289 )))
290 },
291 );
292
293 // csv.stringify_records(data: Array<HashMap<string, string>>, headers?: Array<string>) -> string
294 //
295 // Serializes an array of HashMap records to CSV. Header order is
296 // either the explicit `headers` argument OR the keys from the first
297 // record (using its HashMapData insertion order — same semantics as
298 // the legacy `from_hashmap_pairs(keys, values)` shape).
299 register_typed_fn_2_full::<_, Vec<Arc<HeapValue>>, Vec<Arc<String>>>(
300 &mut module,
301 "stringify_records",
302 "Convert an array of hashmaps to a CSV string with headers",
303 [
304 ModuleParam {
305 name: "data".to_string(),
306 type_name: "Array<HashMap<string, string>>".to_string(),
307 required: true,
308 description: "Array of records (hashmaps with string keys and values)"
309 .to_string(),
310 ..Default::default()
311 },
312 ModuleParam {
313 name: "headers".to_string(),
314 type_name: "Array<string>".to_string(),
315 required: false,
316 description: "Explicit header order (default: keys from first record)"
317 .to_string(),
318 default_snippet: Some("[]".to_string()),
319 ..Default::default()
320 },
321 ],
322 ConcreteType::String,
323 |data, explicit_headers, _ctx| {
324 // W17-out-of-bundle-A-followups (2026-05-12): accept TypedObject
325 // records in addition to legacy HashMap records. parse_records
326 // now emits TypedObjects; round-trip via stringify_records must
327 // therefore read the TypedObject shape. HashMap input remains
328 // supported for users still passing legacy HashMap records.
329 //
330 // Determine header order: explicit argument (if non-empty) or
331 // the first record's keys (TypedObject schema field-order, or
332 // HashMap insertion order).
333 let headers: Vec<String> = if !explicit_headers.is_empty() {
334 explicit_headers.iter().map(|s| (**s).clone()).collect()
335 } else if let Some(first) = data.first() {
336 match &**first {
337 HeapValue::HashMap(kref) => {
338 // Wave 2 Round 3b C2-joint ckpt-4 (2026-05-14):
339 // per-V walk of `*mut TypedArray<*const StringObj>`
340 // keys. V-agnostic (keys are always string-typed).
341 let keys_ptr = match kref {
342 shape_value::heap_value::HashMapKindedRef::I64(arc) => arc.keys,
343 shape_value::heap_value::HashMapKindedRef::F64(arc) => arc.keys,
344 shape_value::heap_value::HashMapKindedRef::Bool(arc) => arc.keys,
345 shape_value::heap_value::HashMapKindedRef::Char(arc) => arc.keys,
346 shape_value::heap_value::HashMapKindedRef::String(arc) => arc.keys,
347 shape_value::heap_value::HashMapKindedRef::Decimal(arc) => arc.keys,
348 shape_value::heap_value::HashMapKindedRef::TypedObject(arc) => arc.keys,
349 shape_value::heap_value::HashMapKindedRef::TraitObject(arc) => arc.keys,
350 shape_value::heap_value::HashMapKindedRef::HashMap(arc) => arc.keys,
351 };
352 let n = unsafe { shape_value::v2::typed_array::TypedArray::len(keys_ptr) as usize };
353 (0..n)
354 .map(|i| unsafe {
355 let ptr = shape_value::v2::typed_array::TypedArray::get_unchecked(keys_ptr, i as u32);
356 shape_value::v2::string_obj::StringObj::as_str(ptr).to_owned()
357 })
358 .collect()
359 }
360 HeapValue::TypedObject(s) => {
361 let schema = crate::type_schema::lookup_schema_by_id_public(
362 s.schema_id as u32,
363 )
364 .ok_or_else(|| {
365 format!(
366 "csv.method stringify_records() -> TypedObject schema id {} \
367 not registered",
368 s.schema_id
369 )
370 })?;
371 schema.fields.iter().map(|f| f.name.clone()).collect()
372 }
373 other => {
374 return Err(format!(
375 "csv.stringify_records(): each element must be a record \
376 (HashMap or TypedObject), got {}",
377 other.type_name()
378 ));
379 }
380 }
381 } else {
382 return Ok(TypedReturn::Concrete(ConcreteReturn::String(
383 String::new(),
384 )));
385 };
386
387 let mut writer = csv::WriterBuilder::new().from_writer(Vec::new());
388 writer
389 .write_record(&headers)
390 .map_err(|e| format!("csv.stringify_records() header write failed: {}", e))?;
391
392 for record_arc in data.iter() {
393 let row: Vec<String> = match &**record_arc {
394 HeapValue::HashMap(kref) => {
395 // Wave 2 Round 3b C2-joint ckpt-4 (2026-05-14):
396 // per-V get(header) → cell extraction. CSV records
397 // are conventionally HashMap<string, string>
398 // (V=String); other V variants surface as a
399 // structured error.
400 use shape_value::heap_value::HashMapKindedRef;
401 match kref {
402 HashMapKindedRef::String(arc) => headers
403 .iter()
404 .map(|h| {
405 arc.get_index(h.as_str())
406 .map(|idx| {
407 let ptr: *const shape_value::v2::string_obj::StringObj =
408 unsafe { *(*arc.values).data.add(idx) };
409 unsafe {
410 shape_value::v2::string_obj::StringObj::as_str(ptr).to_owned()
411 }
412 })
413 .unwrap_or_default()
414 })
415 .collect(),
416 other => {
417 return Err(format!(
418 "csv.method stringify_records() -> HashMap records must be \
419 HashMap<string, string>, got V={:?}",
420 other.values_kind()
421 ));
422 }
423 }
424 }
425 HeapValue::TypedObject(storage) => {
426 let schema = crate::type_schema::lookup_schema_by_id_public(
427 storage.schema_id as u32,
428 )
429 .ok_or_else(|| {
430 format!(
431 "csv.method stringify_records() -> TypedObject schema id {} \
432 not registered",
433 storage.schema_id
434 )
435 })?;
436 let mut r = Vec::with_capacity(headers.len());
437 for header in &headers {
438 // Resolve header → slot index via the schema's
439 // field list. Empty cell when the record's
440 // schema doesn't have the requested header.
441 let cell = match schema
442 .fields
443 .iter()
444 .position(|f| f.name == *header)
445 {
446 Some(idx) if idx < storage.slots.len() => {
447 // Slot is a string per parse_records'
448 // construction; read via the kind table.
449 let bits = storage.slots[idx].raw();
450 if bits == 0 {
451 String::new()
452 } else {
453 // SAFETY: parse_records writes each
454 // slot via `ValueSlot::from_string_arc`
455 // — slot bits = `Arc::into_raw::<String>`.
456 // Borrow without releasing the storage's
457 // share (which owns the Arc).
458 unsafe {
459 let arc_ptr = bits as *const String;
460 Arc::increment_strong_count(arc_ptr);
461 let arc = Arc::from_raw(arc_ptr);
462 let owned = (*arc).clone();
463 // arc Drop here releases our
464 // bumped share; the storage's
465 // share is untouched.
466 owned
467 }
468 }
469 }
470 _ => String::new(),
471 };
472 r.push(cell);
473 }
474 r
475 }
476 other => {
477 return Err(format!(
478 "csv.stringify_records(): each element must be a record \
479 (HashMap or TypedObject), got {}",
480 other.type_name()
481 ));
482 }
483 };
484 writer
485 .write_record(&row)
486 .map_err(|e| format!("csv.stringify_records() row write failed: {}", e))?;
487 }
488
489 let bytes = writer
490 .into_inner()
491 .map_err(|e| format!("csv.stringify_records() flush failed: {}", e))?;
492 let output = String::from_utf8(bytes)
493 .map_err(|e| format!("csv.stringify_records() UTF-8 error: {}", e))?;
494
495 Ok(TypedReturn::Concrete(ConcreteReturn::String(output)))
496 },
497 );
498
499 module
500}