Skip to main content

dag_ml_data_wasm/
lib.rs

//! Browser-friendly bindings for `dag-ml-data`.
//!
//! The DEFAULT build exposes only the stable schema, planning, fingerprint and
//! coordinator-envelope JSON contracts — no host buffers, no file I/O — so
//! `nirs4all-lite` can run them in a browser before delegating heavy data
//! access to a host provider.
//!
//! The opt-in `provider` feature adds the `WasmInMemoryProvider` type, an EAGER
//! in-WASM provider over `dag-ml-data-provider`'s `InMemoryProvider` (JSON in,
//! JSON out, decimal-string handles). An async JS buffer-fetcher provider
//! (fetching host buffers on demand via a Promise) is intentionally deferred to
//! a later slice.
13
14use serde::de::DeserializeOwned;
15use wasm_bindgen::prelude::*;
16
17use dag_ml_data_core::{
18    data_plan_fingerprint, fold_set_fingerprint, plan_model_input, sample_relation_fingerprint,
19    schema_fingerprint, AdapterRegistry, AdapterRegistrySpec, CoordinatorDataPlanEnvelope,
20    DataError as CoreDataError, DataPlan, DataPlanRequest, DatasetSchema, FoldSet, ModelInputSpec,
21    SampleRelationTable,
22};
23
/// Fixed 64-hex-digit fingerprint string published verbatim in the contract
/// manifest under `shared.fold_set_fixture_fingerprint`.
///
/// NOTE(review): presumably the expected `fold_set_fingerprint` of a fold-set
/// fixture shared with the other bindings — confirm against that fixture.
const SHARED_FOLD_SET_FINGERPRINT: &str =
    "54d3185d6c628ef0df848828a8d8ae650222a283a78bbd3ab3bc2256f222c05c";
26
27#[wasm_bindgen]
28pub fn dag_ml_data_version() -> String {
29    env!("CARGO_PKG_VERSION").to_string()
30}
31
/// Eager in-WASM provider over `dag-ml-data-provider`'s `InMemoryProvider`.
///
/// JSON in, JSON out; handles cross as decimal strings (JS cannot represent the
/// full `u64` range as a number). Available only with the `provider` feature.
#[cfg(feature = "provider")]
#[wasm_bindgen]
pub struct WasmInMemoryProvider {
    // Wrapped provider; every exported method forwards to it.
    core: dag_ml_data_provider::JsonInMemoryProvider,
}
41
/// A typed feature projection: a compact JSON `layout` (ids + shape) plus the
/// flat row-major `values`. The consuming `into_values` method hands the f64
/// buffer over as one `Float64Array` — no O(rows×cols) JSON string for the
/// browser to parse.
#[cfg(feature = "provider")]
#[wasm_bindgen]
pub struct WasmFeatureBlockF64 {
    // Compact JSON description of the block; exposed through the `layout` getter.
    layout: String,
    // Flat row-major cell values; moved out by `into_values`.
    values: Vec<f64>,
}
51
#[cfg(feature = "provider")]
#[wasm_bindgen]
impl WasmFeatureBlockF64 {
    /// Returns a copy of the compact layout JSON: `{ feature_set_id,
    /// representation_id, feature_names, sample_ids, observation_ids, n_rows,
    /// n_cols }`. It carries no per-cell values.
    #[wasm_bindgen(getter)]
    pub fn layout(&self) -> String {
        String::from(self.layout.as_str())
    }

    /// Hands the flat row-major f64 values to the caller as a `Float64Array`.
    ///
    /// Takes the block by value, so the buffer is moved out rather than
    /// duplicated inside WASM memory.
    pub fn into_values(self) -> Vec<f64> {
        let WasmFeatureBlockF64 { values, .. } = self;
        values
    }
}
68
#[cfg(feature = "provider")]
#[wasm_bindgen]
impl WasmInMemoryProvider {
    /// JSON constructor: builds the wrapped provider from the envelope JSON
    /// plus the optional target-table, feature-table and f64-feature-matrix
    /// JSON documents.
    ///
    /// Any failure reported by the wrapped provider is converted with
    /// `js_core_error`.
    #[wasm_bindgen(constructor)]
    pub fn new(
        envelope_json: &str,
        target_tables_json: Option<String>,
        feature_tables_json: Option<String>,
        f64_feature_matrices_json: Option<String>,
    ) -> Result<WasmInMemoryProvider, JsValue> {
        let built = dag_ml_data_provider::JsonInMemoryProvider::from_json(
            envelope_json,
            target_tables_json.as_deref(),
            feature_tables_json.as_deref(),
            f64_feature_matrices_json.as_deref(),
        );
        built.map(|core| Self { core }).map_err(js_core_error)
    }

    /// Typed-input constructor. The matrix's flat row-major `values` arrive as
    /// a `Float64Array` copied straight into WASM memory rather than as a JSON
    /// array, so a large matrix is never pushed through `JSON.stringify` or a
    /// boxed-array encoding on the JS side.
    ///
    /// `feature_matrix_meta_json` holds the matrix metadata (`feature_set_id`,
    /// `representation_id`, `feature_names`, `observation_ids`) and must NOT
    /// contain a `values` field.
    ///
    /// DENSE-ONLY by contract: no validity mask, every value finite. Masked or
    /// missing data has to go through the JSON constructor and `feature_block`.
    #[wasm_bindgen(js_name = withF64Features)]
    pub fn with_f64_features(
        envelope_json: &str,
        target_tables_json: Option<String>,
        feature_matrix_meta_json: &str,
        values: Vec<f64>,
    ) -> Result<WasmInMemoryProvider, JsValue> {
        let built = dag_ml_data_provider::JsonInMemoryProvider::from_json_with_f64_values(
            envelope_json,
            target_tables_json.as_deref(),
            feature_matrix_meta_json,
            values,
        );
        built.map(|core| Self { core }).map_err(js_core_error)
    }

    /// Forwards `request_json` to the wrapped provider's `materialize` and
    /// returns its string reply.
    pub fn materialize(&self, request_json: &str) -> Result<String, JsValue> {
        let reply = self.core.materialize(request_json);
        reply.map_err(js_core_error)
    }

    /// Forwards a data handle and a view description to the wrapped provider's
    /// `make_view` and returns its string reply.
    pub fn make_view(&self, data_handle: &str, view_json: &str) -> Result<String, JsValue> {
        let reply = self.core.make_view(data_handle, view_json);
        reply.map_err(js_core_error)
    }

    /// Forwards `view_handle` to the wrapped provider's `view_identity` and
    /// returns its string reply.
    pub fn view_identity(&self, view_handle: &str) -> Result<String, JsValue> {
        let reply = self.core.view_identity(view_handle);
        reply.map_err(js_core_error)
    }

    /// Forwards to the wrapped provider's `target_block` for the given view
    /// handle and target id, returning its string reply.
    pub fn target_block(&self, view_handle: &str, target_id: &str) -> Result<String, JsValue> {
        let reply = self.core.target_block(view_handle, target_id);
        reply.map_err(js_core_error)
    }

    /// Forwards to the wrapped provider's `feature_block` for the given view
    /// handle and feature set id, returning its string reply.
    pub fn feature_block(
        &self,
        view_handle: &str,
        feature_set_id: &str,
    ) -> Result<String, JsValue> {
        let reply = self.core.feature_block(view_handle, feature_set_id);
        reply.map_err(js_core_error)
    }

    /// Typed-output projection. Produces a [`WasmFeatureBlockF64`] whose
    /// `values` cross as a flat `Float64Array`, sidestepping the O(rows×cols)
    /// JSON emitted by [`Self::feature_block`] (the prime memory/latency cost
    /// on large datasets). The values are flattened straight from the columnar
    /// store, so no boxed per-cell values exist in WASM either. Masked cells
    /// are an error; use `feature_block` for those.
    #[wasm_bindgen(js_name = featureBlockF64)]
    pub fn feature_block_f64(
        &self,
        view_handle: &str,
        feature_set_id: &str,
    ) -> Result<WasmFeatureBlockF64, JsValue> {
        self.core
            .feature_block_f64(view_handle, feature_set_id)
            .map(|(layout, values)| WasmFeatureBlockF64 { layout, values })
            .map_err(js_core_error)
    }

    /// Forwards a view handle and a selector document to the wrapped
    /// provider's `feature_collation` and returns its string reply.
    pub fn feature_collation(
        &self,
        view_handle: &str,
        selector_json: &str,
    ) -> Result<String, JsValue> {
        let reply = self.core.feature_collation(view_handle, selector_json);
        reply.map_err(js_core_error)
    }

    /// Returns the string reply of the wrapped provider's
    /// `feature_buffer_manifests`.
    pub fn feature_buffer_manifests(&self) -> Result<String, JsValue> {
        let reply = self.core.feature_buffer_manifests();
        reply.map_err(js_core_error)
    }

    /// Forwards `data_handle` to the wrapped provider's
    /// `data_feature_buffer_bindings` and returns its string reply.
    pub fn data_feature_buffer_bindings(&self, data_handle: &str) -> Result<String, JsValue> {
        let reply = self.core.data_feature_buffer_bindings(data_handle);
        reply.map_err(js_core_error)
    }

    /// Forwards `handle` to the wrapped provider's `release`. The meaning of
    /// the returned boolean is defined by that provider.
    pub fn release(&self, handle: &str) -> Result<bool, JsValue> {
        let released = self.core.release(handle);
        released.map_err(js_core_error)
    }
}
187
188#[wasm_bindgen]
189pub fn contract_manifest_json() -> Result<String, JsValue> {
190    serde_json::to_string(&contract_manifest()).map_err(js_serde_error)
191}
192
193#[wasm_bindgen]
194pub fn validate_dataset_schema_json(json: &str) -> Result<(), JsValue> {
195    validate_json::<DatasetSchema>(json, DatasetSchema::validate)
196}
197
198#[wasm_bindgen]
199pub fn dataset_schema_fingerprint_json(json: &str) -> Result<String, JsValue> {
200    let schema = parse_and_validate::<DatasetSchema>(json, DatasetSchema::validate)?;
201    schema_fingerprint(&schema).map_err(js_core_error)
202}
203
204#[wasm_bindgen]
205pub fn validate_model_input_spec_json(json: &str) -> Result<(), JsValue> {
206    validate_json::<ModelInputSpec>(json, ModelInputSpec::validate)
207}
208
209#[wasm_bindgen]
210pub fn validate_adapter_registry_json(json: &str) -> Result<(), JsValue> {
211    adapter_registry_from_json(json).map(|_| ())
212}
213
214#[wasm_bindgen]
215pub fn validate_data_plan_json(json: &str) -> Result<(), JsValue> {
216    validate_json::<DataPlan>(json, DataPlan::validate)
217}
218
219#[wasm_bindgen]
220pub fn data_plan_fingerprint_json(json: &str) -> Result<String, JsValue> {
221    let plan = parse_and_validate::<DataPlan>(json, DataPlan::validate)?;
222    data_plan_fingerprint(&plan).map_err(js_core_error)
223}
224
225#[wasm_bindgen]
226pub fn validate_sample_relation_table_json(json: &str) -> Result<(), JsValue> {
227    validate_json::<SampleRelationTable>(json, SampleRelationTable::validate)
228}
229
230#[wasm_bindgen]
231pub fn sample_relation_table_fingerprint_json(json: &str) -> Result<String, JsValue> {
232    let relations = parse_and_validate::<SampleRelationTable>(json, SampleRelationTable::validate)?;
233    sample_relation_fingerprint(&relations).map_err(js_core_error)
234}
235
236#[wasm_bindgen]
237pub fn validate_fold_set_json(json: &str) -> Result<(), JsValue> {
238    validate_json::<FoldSet>(json, FoldSet::validate)
239}
240
241#[wasm_bindgen]
242pub fn fold_set_fingerprint_json(json: &str) -> Result<String, JsValue> {
243    let fold_set = parse_and_validate::<FoldSet>(json, FoldSet::validate)?;
244    fold_set_fingerprint(&fold_set).map_err(js_core_error)
245}
246
247#[wasm_bindgen]
248pub fn validate_fold_set_against_sample_relations_json(
249    fold_set_json: &str,
250    sample_relations_json: &str,
251) -> Result<(), JsValue> {
252    let fold_set = parse_and_validate::<FoldSet>(fold_set_json, FoldSet::validate)?;
253    let relations = parse_and_validate::<SampleRelationTable>(
254        sample_relations_json,
255        SampleRelationTable::validate,
256    )?;
257    relations
258        .validate_fold_set(&fold_set)
259        .map_err(js_core_error)
260}
261
262#[wasm_bindgen]
263pub fn validate_coordinator_data_plan_envelope_json(json: &str) -> Result<(), JsValue> {
264    validate_json::<CoordinatorDataPlanEnvelope>(json, CoordinatorDataPlanEnvelope::validate)
265}
266
267#[wasm_bindgen]
268pub fn build_coordinator_data_plan_envelope_json(
269    schema_json: &str,
270    data_plan_json: &str,
271    sample_relations_json: Option<String>,
272) -> Result<String, JsValue> {
273    let schema = parse_and_validate::<DatasetSchema>(schema_json, DatasetSchema::validate)?;
274    let plan = parse_and_validate::<DataPlan>(data_plan_json, DataPlan::validate)?;
275    let relations = match sample_relations_json {
276        Some(json) => Some(parse_and_validate::<SampleRelationTable>(
277            &json,
278            SampleRelationTable::validate,
279        )?),
280        None => None,
281    };
282    let envelope = CoordinatorDataPlanEnvelope::from_parts(&schema, plan, relations.as_ref())
283        .map_err(js_core_error)?;
284    serde_json::to_string(&envelope).map_err(js_serde_error)
285}
286
287#[wasm_bindgen]
288pub fn plan_model_input_json(
289    schema_json: &str,
290    model_input_json: &str,
291    adapter_registry_json: &str,
292    request_json: &str,
293) -> Result<String, JsValue> {
294    let schema = parse_and_validate::<DatasetSchema>(schema_json, DatasetSchema::validate)?;
295    let model_input =
296        parse_and_validate::<ModelInputSpec>(model_input_json, ModelInputSpec::validate)?;
297    let adapters = adapter_registry_from_json(adapter_registry_json)?;
298    let request = serde_json::from_str::<DataPlanRequest>(request_json).map_err(js_serde_error)?;
299    let plan =
300        plan_model_input(&schema, &model_input, &adapters, &request).map_err(js_core_error)?;
301    serde_json::to_string(&plan).map_err(js_serde_error)
302}
303
304fn adapter_registry_from_json(json: &str) -> Result<AdapterRegistry, JsValue> {
305    let spec = serde_json::from_str::<AdapterRegistrySpec>(json).map_err(js_serde_error)?;
306    AdapterRegistry::from_spec(spec).map_err(js_core_error)
307}
308
309fn contract_manifest() -> serde_json::Value {
310    #[allow(unused_mut)]
311    let mut manifest = serde_json::json!({
312        "schema_version": 1,
313        "crate": "dag-ml-data",
314        "package": "dag-ml-data",
315        "version": env!("CARGO_PKG_VERSION"),
316        "surface": "json-contract-bindings",
317        "contracts": [
318            {"id": "dataset_schema", "version": 1},
319            {"id": "model_input_spec", "version": 1},
320            {"id": "adapter_registry", "version": 1},
321            {"id": "data_plan", "version": 1},
322            {"id": "sample_relation_table", "version": 1},
323            {"id": "fold_set", "version": 1},
324            {"id": "coordinator_data_plan_envelope", "version": 1},
325            {"id": "feature_fusion_selector", "version": 1},
326            {"id": "coordinator_branch_view", "version": 1},
327            {"id": "fitted_adapter_ref", "version": 1}
328        ],
329        "capabilities": [
330            "validate_json_contracts",
331            "fingerprint_json_contracts",
332            "plan_model_input",
333            "build_coordinator_data_plan_envelope",
334            "validate_fold_set_against_sample_relations",
335            "nirs4all_lite_schema_fields",
336            "structured_error_descriptors"
337        ],
338        "shared": {
339            "fold_set_fixture_fingerprint": SHARED_FOLD_SET_FINGERPRINT
340        },
341        "python_exports": [
342            "version",
343            "contract_manifest_json",
344            "validate_dataset_schema_json",
345            "dataset_schema_fingerprint_json",
346            "validate_model_input_spec_json",
347            "validate_adapter_registry_json",
348            "plan_model_input_json",
349            "validate_data_plan_json",
350            "data_plan_fingerprint_json",
351            "validate_sample_relation_table_json",
352            "sample_relation_table_fingerprint_json",
353            "validate_fold_set_json",
354            "fold_set_fingerprint_json",
355            "validate_fold_set_against_sample_relations_json",
356            "build_coordinator_data_plan_envelope_json",
357            "validate_coordinator_data_plan_envelope_json"
358        ],
359        "wasm_exports": [
360            "dag_ml_data_version",
361            "contract_manifest_json",
362            "validate_dataset_schema_json",
363            "dataset_schema_fingerprint_json",
364            "validate_model_input_spec_json",
365            "validate_adapter_registry_json",
366            "plan_model_input_json",
367            "validate_data_plan_json",
368            "data_plan_fingerprint_json",
369            "validate_sample_relation_table_json",
370            "sample_relation_table_fingerprint_json",
371            "validate_fold_set_json",
372            "fold_set_fingerprint_json",
373            "validate_fold_set_against_sample_relations_json",
374            "build_coordinator_data_plan_envelope_json",
375            "validate_coordinator_data_plan_envelope_json"
376        ],
377        "c_abi_symbols": [
378            "dagmldata_schema_fingerprint_json",
379            "dagmldata_fold_set_validate_json",
380            "dagmldata_fold_set_fingerprint_json",
381            "dagmldata_fold_set_validate_against_relations_json",
382            "dagmldata_aggregation_policy_validate_json",
383            "dagmldata_coordinator_multi_target_arrow_json"
384        ]
385    });
386    // The provider surface is a distinct, opt-in (`provider` feature) section so
387    // it never blurs into the JSON-contract-only capability list.
388    #[cfg(feature = "provider")]
389    {
390        manifest["provider_surface"] = serde_json::json!("eager-inwasm-provider");
391        manifest["provider_exports"] = serde_json::json!([
392            "WasmInMemoryProvider.new",
393            "WasmInMemoryProvider.withF64Features",
394            "WasmInMemoryProvider.materialize",
395            "WasmInMemoryProvider.make_view",
396            "WasmInMemoryProvider.view_identity",
397            "WasmInMemoryProvider.target_block",
398            "WasmInMemoryProvider.feature_block",
399            "WasmInMemoryProvider.featureBlockF64",
400            "WasmInMemoryProvider.feature_collation",
401            "WasmInMemoryProvider.feature_buffer_manifests",
402            "WasmInMemoryProvider.data_feature_buffer_bindings",
403            "WasmInMemoryProvider.release"
404        ]);
405        manifest["provider_capabilities"] = serde_json::json!([
406            "materialize",
407            "make_view",
408            "view_identity",
409            "target_block",
410            "feature_block",
411            "feature_block_f64",
412            "feature_collation",
413            "feature_buffer_manifests",
414            "data_feature_buffer_bindings",
415            "release",
416            "f64_typed_feature_io"
417        ]);
418    }
419    manifest
420}
421
422fn validate_json<T>(
423    json: &str,
424    validate: impl FnOnce(&T) -> dag_ml_data_core::Result<()>,
425) -> Result<(), JsValue>
426where
427    T: DeserializeOwned,
428{
429    parse_and_validate::<T>(json, validate).map(|_| ())
430}
431
432fn parse_and_validate<T>(
433    json: &str,
434    validate: impl FnOnce(&T) -> dag_ml_data_core::Result<()>,
435) -> Result<T, JsValue>
436where
437    T: DeserializeOwned,
438{
439    let value = serde_json::from_str::<T>(json).map_err(js_serde_error)?;
440    validate(&value).map_err(js_core_error)?;
441    Ok(value)
442}
443
444fn js_serde_error(error: serde_json::Error) -> JsValue {
445    js_core_error(CoreDataError::Serialization(error))
446}
447
448fn js_core_error(error: CoreDataError) -> JsValue {
449    let payload = error
450        .descriptor_json()
451        .unwrap_or_else(|_| error.to_string());
452    JsValue::from_str(&payload)
453}