Skip to main content

alizarin_core/
loader.rs

1//! File system loader for prebuild directories
2//!
3//! This module handles loading graphs and other data from the prebuild
4//! directory structure used by starches-builder.
5
6use crate::graph::{
7    IndexedGraph, StaticGraph, StaticResource, StaticResourceDescriptors, StaticResourceMetadata,
8    StaticResourceSummary, StaticTile,
9};
10use crate::ontology::{OntologyConfig, OntologyValidator};
11use crate::skos::{parse_skos_to_collections, SkosCollection};
12#[cfg(feature = "parallel")]
13use rayon::prelude::*;
14use serde::Deserialize;
15use std::collections::HashMap;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::mpsc::Sender;
19
20// ============================================================================
21// Business Data File Deserialization Types
22// ============================================================================
23
24/// Top-level wrapper for business_data JSON files
25#[derive(Debug, Deserialize)]
26struct BusinessDataFile {
27    business_data: BusinessDataContent,
28}
29
30/// Content of the business_data section
31#[derive(Debug, Deserialize)]
32struct BusinessDataContent {
33    #[serde(default)]
34    resources: Vec<BusinessDataResource>,
35}
36
37/// A single resource in the business_data file
38#[derive(Debug, Deserialize)]
39struct BusinessDataResource {
40    resourceinstance: BusinessDataResourceInstance,
41    #[serde(default)]
42    metadata: Option<HashMap<String, String>>,
43}
44
45/// The resourceinstance object within a resource
46#[derive(Debug, Deserialize)]
47struct BusinessDataResourceInstance {
48    resourceinstanceid: String,
49    graph_id: String,
50    name: String,
51    #[serde(default)]
52    descriptors: Option<FlexibleDescriptors>,
53    #[serde(default)]
54    createdtime: Option<String>,
55    #[serde(default)]
56    lastmodified: Option<String>,
57    #[serde(default)]
58    publication_id: Option<String>,
59    #[serde(default)]
60    principaluser_id: Option<i32>,
61    #[serde(default)]
62    legacyid: Option<String>,
63    #[serde(default)]
64    graph_publication_id: Option<String>,
65}
66
67/// Descriptors that may appear in either format:
68/// - Language-nested (Arches export): `{"en": {"name": "...", "slug": "..."}}`
69/// - Flat (alizarin's own output): `{"name": "...", "slug": "..."}`
70#[derive(Debug)]
71struct FlexibleDescriptors {
72    resolved: StaticResourceDescriptors,
73}
74
75impl FlexibleDescriptors {
76    fn get_for_lang(&self, _lang: &str) -> Option<StaticResourceDescriptors> {
77        if self.resolved.is_empty() {
78            None
79        } else {
80            Some(self.resolved.clone())
81        }
82    }
83}
84
85impl<'de> Deserialize<'de> for FlexibleDescriptors {
86    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
87    where
88        D: serde::Deserializer<'de>,
89    {
90        let value = serde_json::Value::deserialize(deserializer)?;
91
92        // Try flat format first: {"name": "...", "slug": "..."}
93        if let Ok(flat) = serde_json::from_value::<StaticResourceDescriptors>(value.clone()) {
94            if !flat.is_empty() {
95                return Ok(FlexibleDescriptors { resolved: flat });
96            }
97        }
98
99        // Try language-nested: {"en": {"name": "...", ...}}
100        if let Ok(nested) =
101            serde_json::from_value::<HashMap<String, StaticResourceDescriptors>>(value)
102        {
103            let resolved = nested
104                .get("en")
105                .or_else(|| nested.values().next())
106                .cloned()
107                .unwrap_or_default();
108            return Ok(FlexibleDescriptors { resolved });
109        }
110
111        Ok(FlexibleDescriptors {
112            resolved: StaticResourceDescriptors::default(),
113        })
114    }
115}
116
117impl BusinessDataResource {
118    /// Convert to StaticResourceSummary
119    fn to_summary(&self) -> StaticResourceSummary {
120        let ri = &self.resourceinstance;
121        StaticResourceSummary {
122            resourceinstanceid: ri.resourceinstanceid.clone(),
123            graph_id: ri.graph_id.clone(),
124            name: ri.name.clone(),
125            descriptors: ri.descriptors.as_ref().and_then(|d| d.get_for_lang("en")),
126            metadata: self.metadata.clone().unwrap_or_default(),
127            createdtime: ri.createdtime.clone(),
128            lastmodified: ri.lastmodified.clone(),
129            publication_id: ri.publication_id.clone(),
130            principaluser_id: ri.principaluser_id,
131            legacyid: ri.legacyid.clone(),
132            graph_publication_id: ri.graph_publication_id.clone(),
133        }
134    }
135}
136
137// ============================================================================
138// Fast Count Types (minimal deserialization for counting)
139// ============================================================================
140
141/// Minimal struct for fast counting - only deserializes what we need
142#[derive(Debug, Deserialize)]
143struct BusinessDataFileCount {
144    business_data: BusinessDataContentCount,
145}
146
147/// Count content - resources as raw values we just count
148#[derive(Debug, Deserialize)]
149struct BusinessDataContentCount {
150    #[serde(default)]
151    resources: Vec<BusinessDataResourceCount>,
152}
153
154/// Minimal resource - only deserialize graph_id for filtering
155#[derive(Debug, Deserialize)]
156struct BusinessDataResourceCount {
157    resourceinstance: BusinessDataResourceInstanceCount,
158}
159
160#[derive(Debug, Deserialize)]
161struct BusinessDataResourceInstanceCount {
162    graph_id: String,
163}
164
165// ============================================================================
166// Full Resource Loading Types
167// ============================================================================
168
169/// Full business data file with complete resource data including tiles
170#[derive(Debug, Deserialize)]
171struct BusinessDataFileFull {
172    business_data: BusinessDataContentFull,
173}
174
175#[derive(Debug, Deserialize)]
176struct BusinessDataContentFull {
177    #[serde(default)]
178    resources: Vec<BusinessDataResourceFull>,
179}
180
181/// Full resource with tiles
182#[derive(Debug, Deserialize)]
183struct BusinessDataResourceFull {
184    resourceinstance: BusinessDataResourceInstanceFull,
185    #[serde(default)]
186    tiles: Option<Vec<StaticTile>>,
187    #[serde(default)]
188    metadata: Option<HashMap<String, String>>,
189    #[serde(default, rename = "__cache")]
190    cache: Option<serde_json::Value>,
191    #[serde(default, rename = "__scopes")]
192    scopes: Option<serde_json::Value>,
193}
194
195/// Full resource instance for loading
196#[derive(Debug, Deserialize)]
197struct BusinessDataResourceInstanceFull {
198    resourceinstanceid: String,
199    graph_id: String,
200    name: String,
201    #[serde(default)]
202    descriptors: Option<FlexibleDescriptors>,
203    #[serde(default)]
204    createdtime: Option<String>,
205    #[serde(default)]
206    lastmodified: Option<String>,
207    #[serde(default)]
208    publication_id: Option<String>,
209    #[serde(default)]
210    principaluser_id: Option<i32>,
211    #[serde(default)]
212    legacyid: Option<String>,
213    #[serde(default)]
214    graph_publication_id: Option<String>,
215}
216
217impl BusinessDataResourceFull {
218    /// Convert to StaticResource
219    fn to_static_resource(&self) -> StaticResource {
220        let ri = &self.resourceinstance;
221        let descriptors = ri
222            .descriptors
223            .as_ref()
224            .and_then(|d| d.get_for_lang("en"))
225            .unwrap_or_default();
226
227        StaticResource {
228            resourceinstance: StaticResourceMetadata {
229                resourceinstanceid: ri.resourceinstanceid.clone(),
230                graph_id: ri.graph_id.clone(),
231                name: ri.name.clone(),
232                descriptors,
233                createdtime: ri.createdtime.clone(),
234                lastmodified: ri.lastmodified.clone(),
235                publication_id: ri.publication_id.clone(),
236                principaluser_id: ri.principaluser_id,
237                legacyid: ri.legacyid.clone(),
238                graph_publication_id: ri.graph_publication_id.clone(),
239            },
240            tiles: self.tiles.clone(),
241            metadata: self.metadata.clone().unwrap_or_default(),
242            cache: self.cache.clone(),
243            scopes: self.scopes.clone(),
244            tiles_loaded: Some(true),
245        }
246    }
247}
248
249/// Error type for loader operations
250#[derive(Debug)]
251pub enum LoaderError {
252    IoError(std::io::Error),
253    JsonError(serde_json::Error),
254    GraphError(String),
255    NotFound(String),
256    Other(String),
257}
258
259impl std::fmt::Display for LoaderError {
260    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
261        match self {
262            LoaderError::IoError(e) => write!(f, "IO error: {}", e),
263            LoaderError::JsonError(e) => write!(f, "JSON error: {}", e),
264            LoaderError::GraphError(s) => write!(f, "Graph error: {}", s),
265            LoaderError::NotFound(s) => write!(f, "Not found: {}", s),
266            LoaderError::Other(s) => write!(f, "{}", s),
267        }
268    }
269}
270
271impl std::error::Error for LoaderError {}
272
273impl From<std::io::Error> for LoaderError {
274    fn from(e: std::io::Error) -> Self {
275        LoaderError::IoError(e)
276    }
277}
278
279impl From<serde_json::Error> for LoaderError {
280    fn from(e: serde_json::Error) -> Self {
281        LoaderError::JsonError(e)
282    }
283}
284
/// Snapshot of what a prebuild directory contains.
///
/// The field notes describe the values as filled in by
/// `PrebuildLoader::get_info`.
#[derive(Debug, Clone)]
pub struct PrebuildInfo {
    /// Root of the prebuild directory.
    pub path: PathBuf,
    /// `true` when at least one graph JSON file was found.
    pub has_graphs: bool,
    /// `true` when the `business_data` entry exists.
    pub has_business_data: bool,
    /// `true` when the `reference_data` entry exists.
    pub has_reference_data: bool,
    /// `true` when the `indexTemplates` entry exists.
    pub has_index_templates: bool,
    /// `true` when the `ontologies` entry exists.
    pub has_ontologies: bool,
    /// Paths of the graph JSON files that were found.
    pub graph_files: Vec<PathBuf>,
}
296
/// Reads graphs, reference data, ontologies and business data from a
/// prebuild directory tree.
pub struct PrebuildLoader {
    /// Root of the prebuild directory; every lookup is resolved against it.
    root_path: PathBuf,
}
301
302impl PrebuildLoader {
303    /// Create a new loader for the given prebuild directory
304    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, LoaderError> {
305        let root_path = path.as_ref().to_path_buf();
306        if !root_path.exists() {
307            return Err(LoaderError::NotFound(format!(
308                "Prebuild directory not found: {}",
309                root_path.display()
310            )));
311        }
312        Ok(PrebuildLoader { root_path })
313    }
314
315    /// Get information about what's in the prebuild directory
316    pub fn get_info(&self) -> Result<PrebuildInfo, LoaderError> {
317        let graphs_dir = self.root_path.join("graphs");
318        let business_data_dir = self.root_path.join("business_data");
319        let reference_data_dir = self.root_path.join("reference_data");
320        let index_templates_dir = self.root_path.join("indexTemplates");
321        let ontologies_dir = self.root_path.join("ontologies");
322
323        let graph_files = if graphs_dir.exists() {
324            self.find_graph_files(&graphs_dir)?
325        } else {
326            Vec::new()
327        };
328
329        Ok(PrebuildInfo {
330            path: self.root_path.clone(),
331            has_graphs: !graph_files.is_empty(),
332            has_business_data: business_data_dir.exists(),
333            has_reference_data: reference_data_dir.exists(),
334            has_index_templates: index_templates_dir.exists(),
335            has_ontologies: ontologies_dir.exists(),
336            graph_files,
337        })
338    }
339
340    /// Find all graph JSON files in the graphs directory
341    fn find_graph_files(&self, graphs_dir: &Path) -> Result<Vec<PathBuf>, LoaderError> {
342        let mut files = Vec::new();
343
344        // Check resource_models subdirectory
345        let resource_models = graphs_dir.join("resource_models");
346        if resource_models.exists() {
347            for entry in fs::read_dir(&resource_models)? {
348                let entry = entry?;
349                let path = entry.path();
350                if path.extension().map(|e| e == "json").unwrap_or(false) {
351                    files.push(path);
352                }
353            }
354        }
355
356        // Check branches subdirectory
357        let branches = graphs_dir.join("branches");
358        if branches.exists() {
359            for entry in fs::read_dir(&branches)? {
360                let entry = entry?;
361                let path = entry.path();
362                if path.extension().map(|e| e == "json").unwrap_or(false) {
363                    files.push(path);
364                }
365            }
366        }
367
368        Ok(files)
369    }
370
371    /// Load a single graph from a JSON file
372    pub fn load_graph<P: AsRef<Path>>(&self, path: P) -> Result<StaticGraph, LoaderError> {
373        let content = fs::read_to_string(path.as_ref())?;
374        StaticGraph::from_json_string(&content).map_err(LoaderError::GraphError)
375    }
376
377    /// Load a single graph and create an indexed version
378    pub fn load_indexed_graph<P: AsRef<Path>>(&self, path: P) -> Result<IndexedGraph, LoaderError> {
379        let graph = self.load_graph(path)?;
380        Ok(IndexedGraph::new(graph))
381    }
382
383    /// Load all graphs from the graphs directory
384    pub fn load_all_graphs(&self) -> Result<Vec<StaticGraph>, LoaderError> {
385        let info = self.get_info()?;
386        let mut graphs = Vec::new();
387
388        for path in &info.graph_files {
389            match self.load_graph(path) {
390                Ok(graph) => graphs.push(graph),
391                Err(e) => {
392                    eprintln!("Warning: Failed to load graph {}: {}", path.display(), e);
393                }
394            }
395        }
396
397        Ok(graphs)
398    }
399
400    /// Load all graphs and create indexed versions
401    pub fn load_all_indexed_graphs(&self) -> Result<Vec<IndexedGraph>, LoaderError> {
402        let graphs = self.load_all_graphs()?;
403        Ok(graphs.into_iter().map(IndexedGraph::new).collect())
404    }
405
406    /// Load all graphs into a map keyed by graph ID
407    pub fn load_graphs_by_id(&self) -> Result<HashMap<String, IndexedGraph>, LoaderError> {
408        let graphs = self.load_all_indexed_graphs()?;
409        Ok(graphs
410            .into_iter()
411            .map(|g| (g.graph.graphid.clone(), g))
412            .collect())
413    }
414
415    /// Get the path to a specific subdirectory
416    pub fn get_subdir(&self, name: &str) -> PathBuf {
417        self.root_path.join(name)
418    }
419
420    /// Get the root path
421    pub fn root_path(&self) -> &Path {
422        &self.root_path
423    }
424
425    // =========================================================================
426    // Collection (SKOS XML) Loading
427    // =========================================================================
428
429    /// Find all SKOS XML files across the reference_data subdirectories:
430    /// concepts/, collections/, controlled_lists/, and staging/.
431    pub fn find_collection_files(&self) -> Result<Vec<PathBuf>, LoaderError> {
432        let reference_data = self.root_path.join("reference_data");
433        if !reference_data.exists() {
434            return Ok(Vec::new());
435        }
436
437        let mut files = Vec::new();
438        for subdir in &["concepts", "collections", "controlled_lists", "staging"] {
439            let dir = reference_data.join(subdir);
440            if !dir.is_dir() {
441                continue;
442            }
443            for entry in fs::read_dir(&dir)? {
444                let entry = entry?;
445                let path = entry.path();
446                let ext = path.extension().and_then(|e| e.to_str());
447                if ext == Some("xml") || ext == Some("json") {
448                    files.push(path);
449                }
450            }
451        }
452        // Sort with XML before JSON so XML takes priority during dedup
453        files.sort_by(|a, b| {
454            let ext_order = |p: &PathBuf| -> u8 {
455                match p.extension().and_then(|e| e.to_str()) {
456                    Some("xml") => 0,
457                    _ => 1,
458                }
459            };
460            ext_order(a).cmp(&ext_order(b)).then_with(|| a.cmp(b))
461        });
462        Ok(files)
463    }
464
465    /// Load all SKOS collections from reference_data/ (XML and JSON).
466    ///
467    /// XML files are parsed via `parse_skos_to_collections`; JSON files
468    /// are deserialized directly as `SkosCollection`. When a collection
469    /// ID appears in both formats, the XML version takes priority.
470    pub fn load_collections(&self, base_uri: &str) -> Result<Vec<SkosCollection>, LoaderError> {
471        let files = self.find_collection_files()?;
472        let mut collections = Vec::new();
473        let mut seen_ids: std::collections::HashSet<String> = std::collections::HashSet::new();
474
475        for file in &files {
476            let content = fs::read_to_string(file)?;
477            let ext = file.extension().and_then(|e| e.to_str());
478
479            let parsed: Vec<SkosCollection> = match ext {
480                Some("xml") => match parse_skos_to_collections(&content, base_uri) {
481                    Ok(p) => p,
482                    Err(e) => {
483                        eprintln!(
484                            "Warning: Failed to parse XML collection {}: {}",
485                            file.display(),
486                            e
487                        );
488                        continue;
489                    }
490                },
491                Some("json") => {
492                    // Try as a single collection first, then as an array
493                    if let Ok(coll) = serde_json::from_str::<SkosCollection>(&content) {
494                        vec![coll]
495                    } else if let Ok(colls) = serde_json::from_str::<Vec<SkosCollection>>(&content)
496                    {
497                        colls
498                    } else {
499                        eprintln!(
500                            "Warning: Failed to parse JSON collection {}: not a valid SkosCollection",
501                            file.display(),
502                        );
503                        continue;
504                    }
505                }
506                _ => continue,
507            };
508
509            for coll in parsed {
510                if seen_ids.insert(coll.id.clone()) {
511                    collections.push(coll);
512                }
513            }
514        }
515
516        Ok(collections)
517    }
518
519    // =========================================================================
520    // Ontology Loading
521    // =========================================================================
522
523    /// Find ontology subdirectories (those containing ontology_config.json)
524    pub fn find_ontology_dirs(&self) -> Result<Vec<PathBuf>, LoaderError> {
525        let ontologies_dir = self.root_path.join("ontologies");
526        if !ontologies_dir.exists() {
527            return Ok(Vec::new());
528        }
529
530        let mut dirs = Vec::new();
531        for entry in fs::read_dir(&ontologies_dir)? {
532            let entry = entry?;
533            let path = entry.path();
534            if path.is_dir() && path.join("ontology_config.json").exists() {
535                dirs.push(path);
536            }
537        }
538        Ok(dirs)
539    }
540
541    /// Load an ontology config from a directory containing ontology_config.json
542    pub fn load_ontology_config(&self, ontology_dir: &Path) -> Result<OntologyConfig, LoaderError> {
543        let config_path = ontology_dir.join("ontology_config.json");
544        let content = fs::read_to_string(&config_path)?;
545        serde_json::from_str(&content).map_err(LoaderError::from)
546    }
547
548    /// Collect ontology RDFS XML contents from a directory containing ontology_config.json.
549    /// Returns the raw XML strings (base file + extensions in order) without building
550    /// the validator, so they can be combined with extra ontology files.
551    pub fn collect_ontology_xml_contents(
552        &self,
553        ontology_dir: &Path,
554    ) -> Result<Vec<String>, LoaderError> {
555        let config = self.load_ontology_config(ontology_dir)?;
556
557        let mut xml_contents = Vec::new();
558        let base_path = ontology_dir.join(&config.base);
559        xml_contents.push(fs::read_to_string(&base_path).map_err(|e| {
560            LoaderError::IoError(std::io::Error::new(
561                e.kind(),
562                format!(
563                    "Failed to read ontology base file {}: {}",
564                    base_path.display(),
565                    e
566                ),
567            ))
568        })?);
569
570        for ext in &config.extensions {
571            let ext_path = ontology_dir.join(ext);
572            xml_contents.push(fs::read_to_string(&ext_path).map_err(|e| {
573                LoaderError::IoError(std::io::Error::new(
574                    e.kind(),
575                    format!(
576                        "Failed to read ontology extension {}: {}",
577                        ext_path.display(),
578                        e
579                    ),
580                ))
581            })?);
582        }
583
584        Ok(xml_contents)
585    }
586
587    /// Load an OntologyValidator from a directory containing ontology_config.json
588    /// and RDFS XML files. Reads the base file and all extensions listed in config.
589    pub fn load_ontology_validator(
590        &self,
591        ontology_dir: &Path,
592    ) -> Result<OntologyValidator, LoaderError> {
593        let xml_contents = self.collect_ontology_xml_contents(ontology_dir)?;
594        let refs: Vec<&str> = xml_contents.iter().map(|s| s.as_str()).collect();
595        OntologyValidator::from_rdfs_xml(&refs).map_err(|e| LoaderError::GraphError(e.to_string()))
596    }
597
598    /// Find all business data JSON files (searches recursively)
599    pub fn find_business_data_files(&self) -> Result<Vec<PathBuf>, LoaderError> {
600        let business_data_dir = self.root_path.join("business_data");
601        if !business_data_dir.exists() {
602            return Ok(Vec::new());
603        }
604
605        let mut files = Vec::new();
606        self.collect_json_files(&business_data_dir, &mut files)?;
607        Ok(files)
608    }
609
610    /// Recursively collect all JSON files from a directory
611    #[allow(clippy::only_used_in_recursion)]
612    fn collect_json_files(&self, dir: &Path, files: &mut Vec<PathBuf>) -> Result<(), LoaderError> {
613        for entry in fs::read_dir(dir)? {
614            let entry = entry?;
615            let path = entry.path();
616            if path.is_dir() {
617                self.collect_json_files(&path, files)?;
618            } else if path.extension().map(|e| e == "json").unwrap_or(false) {
619                files.push(path);
620            }
621        }
622        Ok(())
623    }
624
625    /// Load resource summaries from a single business data file
626    /// Uses typed deserialization for fast parsing
627    pub fn load_resource_summaries_from_file(
628        &self,
629        path: &Path,
630        graph_id: &str,
631    ) -> Result<Vec<StaticResourceSummary>, LoaderError> {
632        let content = fs::read_to_string(path)?;
633        let file: BusinessDataFile = serde_json::from_str(&content)?;
634
635        let summaries: Vec<StaticResourceSummary> = file
636            .business_data
637            .resources
638            .into_iter()
639            .filter(|r| r.resourceinstance.graph_id == graph_id)
640            .map(|r| r.to_summary())
641            .collect();
642
643        Ok(summaries)
644    }
645
646    /// Load resource summaries for a graph, with optional limit
647    /// Returns (summaries, has_more)
648    pub fn load_resource_summaries(
649        &self,
650        graph_id: &str,
651        offset: usize,
652        limit: usize,
653    ) -> Result<(Vec<StaticResourceSummary>, bool), LoaderError> {
654        let files = self.find_business_data_files()?;
655        let mut all_summaries = Vec::new();
656
657        for file in &files {
658            match self.load_resource_summaries_from_file(file, graph_id) {
659                Ok(summaries) => all_summaries.extend(summaries),
660                Err(e) => {
661                    eprintln!(
662                        "Warning: Failed to load resources from {}: {}",
663                        file.display(),
664                        e
665                    );
666                }
667            }
668        }
669
670        // Apply offset and limit
671        let total = all_summaries.len();
672        let has_more = offset + limit < total;
673        let summaries: Vec<_> = all_summaries.into_iter().skip(offset).take(limit).collect();
674
675        Ok((summaries, has_more))
676    }
677
678    /// Get total count of resources for a graph (without loading all data)
679    pub fn count_resources_for_graph(&self, graph_id: &str) -> Result<usize, LoaderError> {
680        let files = self.find_business_data_files()?;
681        let mut count = 0;
682
683        for file in &files {
684            count += self.fast_count_resources_in_file(file, graph_id)?;
685        }
686
687        Ok(count)
688    }
689
690    /// Fast count of resources in a single file (minimal deserialization)
691    pub fn fast_count_resources_in_file(
692        &self,
693        path: &Path,
694        graph_id: &str,
695    ) -> Result<usize, LoaderError> {
696        let content = fs::read_to_string(path)?;
697        let file_data: BusinessDataFileCount = serde_json::from_str(&content)?;
698
699        let count = file_data
700            .business_data
701            .resources
702            .iter()
703            .filter(|r| r.resourceinstance.graph_id == graph_id)
704            .count();
705
706        Ok(count)
707    }
708
709    /// Get file counts for per-file progress tracking
710    /// Returns Vec of (file_path, resource_count) for each file
711    pub fn get_business_data_file_counts(
712        &self,
713        graph_id: &str,
714    ) -> Result<Vec<(PathBuf, usize)>, LoaderError> {
715        let files = self.find_business_data_files()?;
716        let mut result = Vec::with_capacity(files.len());
717
718        for file in files {
719            let count = self.fast_count_resources_in_file(&file, graph_id)?;
720            if count > 0 {
721                result.push((file, count));
722            }
723        }
724
725        Ok(result)
726    }
727
728    /// Load all full resources (with tiles) from a single business_data file.
729    ///
730    /// Like `load_resource_summaries_from_file` but returns `StaticResource`
731    /// with tiles and resolved descriptors. Useful for bulk index building.
732    pub fn load_full_resources_from_file(
733        &self,
734        path: &Path,
735        graph_id: &str,
736    ) -> Result<Vec<StaticResource>, LoaderError> {
737        let content = fs::read_to_string(path)?;
738        let file_data: BusinessDataFileFull = serde_json::from_str(&content)?;
739
740        let resources: Vec<StaticResource> = file_data
741            .business_data
742            .resources
743            .into_iter()
744            .filter(|r| r.resourceinstance.graph_id == graph_id)
745            .map(|r| r.to_static_resource())
746            .collect();
747
748        Ok(resources)
749    }
750
751    /// Load all full resources (with tiles) from a single business_data file,
752    /// across all graphs. Reads and parses the file only once.
753    ///
754    /// Supports two formats:
755    /// - Prebuild wrapper: `{ business_data: { resources: [...] } }`
756    /// - Bare resource: `{ resourceinstance: {...}, tiles: [...], ... }`
757    pub fn load_all_full_resources_from_file(
758        &self,
759        path: &Path,
760    ) -> Result<Vec<StaticResource>, LoaderError> {
761        let content = fs::read_to_string(path)?;
762
763        // Try the wrapper format first; fall back to a bare resource.
764        if let Ok(file_data) = serde_json::from_str::<BusinessDataFileFull>(&content) {
765            let resources: Vec<StaticResource> = file_data
766                .business_data
767                .resources
768                .into_iter()
769                .map(|r| r.to_static_resource())
770                .collect();
771            Ok(resources)
772        } else {
773            let resource: BusinessDataResourceFull = serde_json::from_str(&content)?;
774            Ok(vec![resource.to_static_resource()])
775        }
776    }
777
778    /// Load a full StaticResource by its resourceinstanceid
779    /// Searches through all business_data files to find the resource
780    pub fn load_full_resource(
781        &self,
782        resource_id: &str,
783        graph_id: &str,
784    ) -> Result<StaticResource, LoaderError> {
785        let files = self.find_business_data_files()?;
786
787        for file in &files {
788            let content = fs::read_to_string(file)?;
789            let file_data: BusinessDataFileFull = serde_json::from_str(&content)?;
790
791            for resource in file_data.business_data.resources {
792                if resource.resourceinstance.resourceinstanceid == resource_id {
793                    return Ok(resource.to_static_resource());
794                }
795            }
796        }
797
798        Err(LoaderError::NotFound(format!(
799            "Resource {} not found in graph {}",
800            resource_id, graph_id
801        )))
802    }
803
804    // =========================================================================
805    // Parallel Loading Methods (requires "parallel" feature)
806    // =========================================================================
807
/// Loads resource summaries from several files in parallel (rayon) and
/// sends each file's non-empty batch through `tx` as soon as it is ready.
///
/// The second element of each `files` entry (a resource count) is not
/// used here. Batches may arrive in any order. A file that fails to load
/// is reported on stderr and skipped, matching `load_resource_summaries`.
///
/// Returns the total number of summaries loaded. Batches are counted even
/// if the receiving end of `tx` has been dropped.
///
/// # Errors
/// The current implementation always returns `Ok`.
#[cfg(feature = "parallel")]
pub fn load_resources_parallel(
    &self,
    files: &[(PathBuf, usize)],
    graph_id: &str,
    tx: &Sender<Vec<StaticResourceSummary>>,
) -> Result<usize, LoaderError> {
    use std::sync::atomic::{AtomicUsize, Ordering};

    let total_loaded = AtomicUsize::new(0);

    // Process files in parallel using rayon
    files.par_iter().for_each(|(file_path, _count)| {
        match self.load_resource_summaries_from_file(file_path, graph_id) {
            Ok(summaries) if summaries.is_empty() => {}
            Ok(summaries) => {
                total_loaded.fetch_add(summaries.len(), Ordering::Relaxed);
                // A send error only means the receiver is gone; there is
                // nobody left to notify, so it is deliberately ignored.
                let _ = tx.send(summaries);
            }
            Err(e) => {
                eprintln!(
                    "Warning: Failed to load resources from {}: {}",
                    file_path.display(),
                    e
                );
            }
        }
    });

    Ok(total_loaded.load(Ordering::Relaxed))
}
837
838    /// Sequential fallback when parallel feature is not enabled
839    #[cfg(not(feature = "parallel"))]
840    pub fn load_resources_parallel(
841        &self,
842        files: &[(PathBuf, usize)],
843        graph_id: &str,
844        tx: &Sender<Vec<StaticResourceSummary>>,
845    ) -> Result<usize, LoaderError> {
846        let mut total_loaded = 0;
847
848        for (file_path, _count) in files {
849            if let Ok(summaries) = self.load_resource_summaries_from_file(file_path, graph_id) {
850                if !summaries.is_empty() {
851                    total_loaded += summaries.len();
852                    let _ = tx.send(summaries);
853                }
854            }
855        }
856
857        Ok(total_loaded)
858    }
859
860    /// Count resources in files in parallel (for initial count phase)
861    #[cfg(feature = "parallel")]
862    pub fn count_resources_parallel(
863        &self,
864        files: &[PathBuf],
865        graph_id: &str,
866    ) -> Vec<(PathBuf, usize)> {
867        files
868            .par_iter()
869            .filter_map(
870                |file| match self.fast_count_resources_in_file(file, graph_id) {
871                    Ok(count) if count > 0 => Some((file.clone(), count)),
872                    _ => None,
873                },
874            )
875            .collect()
876    }
877
878    /// Sequential fallback for counting
879    #[cfg(not(feature = "parallel"))]
880    pub fn count_resources_parallel(
881        &self,
882        files: &[PathBuf],
883        graph_id: &str,
884    ) -> Vec<(PathBuf, usize)> {
885        files
886            .iter()
887            .filter_map(
888                |file| match self.fast_count_resources_in_file(file, graph_id) {
889                    Ok(count) if count > 0 => Some((file.clone(), count)),
890                    _ => None,
891                },
892            )
893            .collect()
894    }
895
896    // =========================================================================
897    // Preindex Loading Methods
898    // =========================================================================
899
900    /// Find all preindex .pi files (searches recursively)
901    pub fn find_preindex_files(&self, _graph_id: &str) -> Result<Vec<PathBuf>, LoaderError> {
902        let preindex_dir = self.root_path.join("preindex");
903        if !preindex_dir.exists() {
904            return Ok(Vec::new());
905        }
906
907        let mut files = Vec::new();
908        self.collect_pi_files(&preindex_dir, &mut files)?;
909        Ok(files)
910    }
911
912    /// Recursively collect all .pi files from a directory
913    #[allow(clippy::only_used_in_recursion)]
914    fn collect_pi_files(&self, dir: &Path, files: &mut Vec<PathBuf>) -> Result<(), LoaderError> {
915        for entry in fs::read_dir(dir)? {
916            let entry = entry?;
917            let path = entry.path();
918            if path.is_dir() {
919                self.collect_pi_files(&path, files)?;
920            } else if path.extension().map(|e| e == "pi").unwrap_or(false) {
921                files.push(path);
922            }
923        }
924        Ok(())
925    }
926
927    /// Load resource summaries from preindex .pi files
928    /// .pi files contain StaticResourceSummary objects directly (one per line or as JSON array)
929    pub fn load_preindex_summaries(
930        &self,
931        graph_id: &str,
932        offset: usize,
933        limit: usize,
934    ) -> Result<(Vec<StaticResourceSummary>, bool), LoaderError> {
935        let files = self.find_preindex_files(graph_id)?;
936        let mut all_summaries = Vec::new();
937
938        for file in &files {
939            match self.load_preindex_file(file, graph_id) {
940                Ok(summaries) => all_summaries.extend(summaries),
941                Err(e) => {
942                    eprintln!(
943                        "Warning: Failed to load preindex from {}: {}",
944                        file.display(),
945                        e
946                    );
947                }
948            }
949        }
950
951        // Apply offset and limit
952        let total = all_summaries.len();
953        let has_more = offset + limit < total;
954        let summaries: Vec<_> = all_summaries.into_iter().skip(offset).take(limit).collect();
955
956        Ok((summaries, has_more))
957    }
958
959    /// Load a single preindex file
960    fn load_preindex_file(
961        &self,
962        path: &Path,
963        graph_id: &str,
964    ) -> Result<Vec<StaticResourceSummary>, LoaderError> {
965        let content = fs::read_to_string(path)?;
966        let mut summaries = Vec::new();
967
968        // Try parsing as JSON array first
969        if let Ok(array) = serde_json::from_str::<Vec<StaticResourceSummary>>(&content) {
970            for summary in array {
971                if summary.graph_id == graph_id {
972                    summaries.push(summary);
973                }
974            }
975            return Ok(summaries);
976        }
977
978        // Try parsing as newline-delimited JSON (NDJSON)
979        for line in content.lines() {
980            let line = line.trim();
981            if line.is_empty() {
982                continue;
983            }
984            if let Ok(summary) = serde_json::from_str::<StaticResourceSummary>(line) {
985                if summary.graph_id == graph_id {
986                    summaries.push(summary);
987                }
988            }
989        }
990
991        Ok(summaries)
992    }
993
994    /// Count resources in preindex files for a graph
995    pub fn count_preindex_resources_for_graph(&self, graph_id: &str) -> Result<usize, LoaderError> {
996        let files = self.find_preindex_files(graph_id)?;
997        let mut count = 0;
998
999        for file in &files {
1000            if let Ok(summaries) = self.load_preindex_file(file, graph_id) {
1001                count += summaries.len();
1002            }
1003        }
1004
1005        Ok(count)
1006    }
1007}
1008
#[cfg(test)]
mod tests {
    use super::*;
    use crate::graph::StaticGraph;
    use std::path::PathBuf;

    /// Building a loader on a missing directory must report `NotFound`.
    #[test]
    fn test_loader_not_found() {
        assert!(matches!(
            PrebuildLoader::new("/nonexistent/path"),
            Err(LoaderError::NotFound(_))
        ));
    }

    /// A graph exported without the Arches-HER 2.0+ fields still deserializes,
    /// with those fields left unset.
    #[test]
    fn test_parse_coral_format_json() {
        // The fixture lives two directories above this crate's manifest.
        let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .unwrap()
            .join("tests/data/models/Person.json");

        let raw = std::fs::read_to_string(&fixture).expect("Failed to read test JSON file");
        let document: serde_json::Value =
            serde_json::from_str(&raw).expect("Failed to parse JSON");
        let graph_json = &document["graph"][0];

        // The old format either omits the new field or carries it as null.
        assert!(graph_json
            .get("source_identifier_id")
            .map_or(true, |value| value.is_null()));

        // Deserializing must succeed, filling the missing fields with defaults.
        let parsed: StaticGraph = serde_json::from_value(graph_json.clone())
            .expect("Failed to parse StaticGraph from Coral format");

        assert!(!parsed.graphid.is_empty());
        assert!(parsed.source_identifier_id.is_none()); // Defaults to None
        assert!(parsed.is_active.is_none()); // Defaults to None
        assert!(!parsed.nodes.is_empty());
    }

    /// A graph carrying the Arches-HER 2.0+ fields exposes them after parsing.
    #[test]
    fn test_parse_arches_her_format_json() {
        let payload = r#"{
            "graphid": "test-graph-id",
            "name": {"en": "Test Graph"},
            "nodes": [],
            "edges": [],
            "nodegroups": [],
            "cards": [],
            "cards_x_nodes_x_widgets": [],
            "functions_x_graphs": [],
            "root": {
                "nodeid": "root-node-id",
                "name": "Root Node",
                "datatype": "semantic",
                "graph_id": "test-graph-id"
            },
            "source_identifier_id": "some-source-id",
            "is_active": true,
            "has_unpublished_changes": false,
            "is_copy_immutable": false
        }"#;

        let parsed: StaticGraph = serde_json::from_str(payload)
            .expect("Failed to parse StaticGraph with Arches-HER fields");

        assert_eq!(parsed.graphid, "test-graph-id");
        assert_eq!(
            parsed.source_identifier_id.as_deref(),
            Some("some-source-id")
        );
        assert_eq!(parsed.is_active, Some(true));
        assert_eq!(parsed.has_unpublished_changes, Some(false));
    }
}
1090
1091// =============================================================================
1092// Standalone helpers for WASM / napi callers
1093// =============================================================================
1094
1095/// Parse a business data JSON blob (as raw bytes) into `StaticResource`s.
1096/// Uses the same internal parsing types as `PrebuildLoader` so there is no
1097/// duplication.
1098///
1099/// Supports two formats:
1100/// - Prebuild wrapper: `{ business_data: { resources: [...] } }`
1101/// - Bare resource: `{ resourceinstance: {...}, tiles: [...], ... }`
1102///
1103/// This is the function WASM and napi crates call so that the heavy JSON
1104/// parsing stays entirely in Rust — callers only pass in a byte buffer.
1105pub fn parse_business_data_bytes(bytes: &[u8]) -> Result<Vec<StaticResource>, LoaderError> {
1106    if let Ok(file_data) = serde_json::from_slice::<BusinessDataFileFull>(bytes) {
1107        Ok(file_data
1108            .business_data
1109            .resources
1110            .into_iter()
1111            .map(|r| r.to_static_resource())
1112            .collect())
1113    } else {
1114        let resource: BusinessDataResourceFull = serde_json::from_slice(bytes)?;
1115        Ok(vec![resource.to_static_resource()])
1116    }
1117}
1118
/// Result of importing a prebuild/pkg directory.
///
/// Produced by `import_prebuild`.
pub struct ImportPrebuildResult {
    /// `graphid` of every graph that was loaded and registered.
    pub graph_ids: Vec<String>,
    /// IDs returned when collections were added to the global RDM cache,
    /// covering the package's own collections and any extra reference-data
    /// directories.
    pub collection_ids: Vec<String>,
    /// Collections loaded from the package via `PrebuildLoader::load_collections`.
    /// NOTE(review): collections from extra reference-data directories
    /// contribute to `collection_ids` but are not stored here — confirm this
    /// is intended.
    pub collections: Vec<SkosCollection>,
    /// Validators built from the ontology XML; `import_prebuild` pushes at most
    /// one, built from all collected XML contents combined.
    pub ontology_validators: Vec<OntologyValidator>,
    /// One config per ontology directory found in the package.
    pub ontology_configs: Vec<OntologyConfig>,
}
1127
1128/// Load SKOS XML/JSON collections from an arbitrary directory.
1129///
1130/// Scans `dir` for `*.xml` and `*.json` files (non-recursive). XML files are
1131/// parsed as SKOS; JSON files as serialized `SkosCollection`. Useful for loading
1132/// extra reference data from directories outside the main pkg structure.
1133pub fn load_collections_from_dir(
1134    dir: &str,
1135    base_uri: &str,
1136) -> Result<Vec<SkosCollection>, LoaderError> {
1137    let dir_path = Path::new(dir);
1138    if !dir_path.is_dir() {
1139        return Ok(Vec::new());
1140    }
1141
1142    let mut files: Vec<PathBuf> = Vec::new();
1143    for entry in fs::read_dir(dir_path)? {
1144        let entry = entry?;
1145        let path = entry.path();
1146        let ext = path.extension().and_then(|e| e.to_str());
1147        if ext == Some("xml") || ext == Some("json") {
1148            files.push(path);
1149        }
1150    }
1151    files.sort();
1152
1153    let mut collections = Vec::new();
1154    let mut seen_ids: std::collections::HashSet<String> = std::collections::HashSet::new();
1155
1156    for file in &files {
1157        let content = fs::read_to_string(file)?;
1158        let ext = file.extension().and_then(|e| e.to_str());
1159
1160        let parsed: Vec<SkosCollection> = match ext {
1161            Some("xml") => match parse_skos_to_collections(&content, base_uri) {
1162                Ok(p) => p,
1163                Err(e) => {
1164                    eprintln!(
1165                        "Warning: Failed to parse XML collection {}: {}",
1166                        file.display(),
1167                        e
1168                    );
1169                    continue;
1170                }
1171            },
1172            Some("json") => {
1173                if let Ok(coll) = serde_json::from_str::<SkosCollection>(&content) {
1174                    vec![coll]
1175                } else if let Ok(colls) = serde_json::from_str::<Vec<SkosCollection>>(&content) {
1176                    colls
1177                } else {
1178                    eprintln!(
1179                        "Warning: Failed to parse JSON collection {}: not a valid SkosCollection",
1180                        file.display(),
1181                    );
1182                    continue;
1183                }
1184            }
1185            _ => continue,
1186        };
1187
1188        for coll in parsed {
1189            if seen_ids.insert(coll.id.clone()) {
1190                collections.push(coll);
1191            }
1192        }
1193    }
1194
1195    Ok(collections)
1196}
1197
1198/// Load RDFS XML files from a directory (non-recursive, `*.xml` only).
1199///
1200/// Returns the file contents as strings, suitable for passing to
1201/// `OntologyValidator::from_rdfs_xml`.
1202pub fn load_ontology_xml_from_dir(dir: &str) -> Result<Vec<String>, LoaderError> {
1203    let dir_path = Path::new(dir);
1204    if !dir_path.is_dir() {
1205        return Ok(Vec::new());
1206    }
1207
1208    let mut files: Vec<PathBuf> = Vec::new();
1209    for entry in fs::read_dir(dir_path)? {
1210        let entry = entry?;
1211        let path = entry.path();
1212        if path.extension().and_then(|e| e.to_str()) == Some("xml") {
1213            files.push(path);
1214        }
1215    }
1216    files.sort();
1217
1218    let mut contents = Vec::new();
1219    for file in &files {
1220        contents.push(fs::read_to_string(file).map_err(|e| {
1221            LoaderError::IoError(std::io::Error::new(
1222                e.kind(),
1223                format!("Failed to read ontology file {}: {}", file.display(), e),
1224            ))
1225        })?);
1226    }
1227    Ok(contents)
1228}
1229
1230/// Import a prebuild/pkg directory: register graphs in the global graph registry,
1231/// load SKOS collections into the global RDM cache, and load ontology validators.
1232///
1233/// This is the inverse of `export_prebuild`. It reads the directory structure and:
1234/// 1. Loads and registers all graphs from `graphs/resource_models/` and `graphs/branches/`
1235/// 2. Parses SKOS XML from `reference_data/collections/` and adds to the global RDM cache
1236/// 3. Optionally loads extra reference data from additional directories
1237/// 4. Loads ontology RDFS files from `ontologies/` (if present), optionally merged with extras
1238pub fn import_prebuild(
1239    path: &str,
1240    base_uri: &str,
1241    extra_reference_data_dirs: Option<&[&str]>,
1242    extra_ontology_dirs: Option<&[&str]>,
1243) -> Result<ImportPrebuildResult, LoaderError> {
1244    // Set the RDM namespace from base_uri for deterministic UUID generation
1245    crate::set_rdm_namespace(base_uri)
1246        .map_err(|e| LoaderError::Other(format!("Failed to set RDM namespace: {}", e)))?;
1247
1248    let loader = PrebuildLoader::new(path)?;
1249
1250    // 1. Load and register graphs
1251    let graphs = loader.load_all_graphs()?;
1252    let graph_ids: Vec<String> = graphs
1253        .into_iter()
1254        .map(|g| {
1255            let id = g.graphid.clone();
1256            crate::register_graph_owned(g);
1257            id
1258        })
1259        .collect();
1260
1261    // 2. Load SKOS collections into global RDM cache
1262    let collections = loader.load_collections(base_uri)?;
1263    let mut collection_ids = crate::add_to_global_rdm_cache_from_skos(&collections);
1264
1265    // 2b. Load extra reference data directories
1266    if let Some(dirs) = extra_reference_data_dirs {
1267        for dir in dirs {
1268            let extra_collections = load_collections_from_dir(dir, base_uri)?;
1269            let extra_ids = crate::add_to_global_rdm_cache_from_skos(&extra_collections);
1270            collection_ids.extend(extra_ids);
1271        }
1272    }
1273
1274    // 3. Collect ontology XML contents from base pkg
1275    let ontology_dirs = loader.find_ontology_dirs()?;
1276    let mut all_xml_contents = Vec::new();
1277    let mut ontology_configs = Vec::new();
1278    for dir in &ontology_dirs {
1279        ontology_configs.push(loader.load_ontology_config(dir)?);
1280        all_xml_contents.extend(loader.collect_ontology_xml_contents(dir)?);
1281    }
1282
1283    // 3b. Load extra ontology XML files
1284    if let Some(extra_dirs) = extra_ontology_dirs {
1285        for dir in extra_dirs {
1286            all_xml_contents.extend(load_ontology_xml_from_dir(dir)?);
1287        }
1288    }
1289
1290    // 3c. Build single validator from combined ontology files
1291    let mut ontology_validators = Vec::new();
1292    if !all_xml_contents.is_empty() {
1293        let refs: Vec<&str> = all_xml_contents.iter().map(|s| s.as_str()).collect();
1294        let validator = OntologyValidator::from_rdfs_xml(&refs)
1295            .map_err(|e| LoaderError::GraphError(e.to_string()))?;
1296        ontology_validators.push(validator);
1297    }
1298
1299    Ok(ImportPrebuildResult {
1300        graph_ids,
1301        collection_ids,
1302        collections,
1303        ontology_validators,
1304        ontology_configs,
1305    })
1306}