Skip to main content

clayers_spec/
rnc.rs

1//! LLM description fusing and RNC export API.
2//!
3//! Calls `clayers_xml::rnc::xsd_to_rnc()` to auto-discover schemas,
4//! then enriches the result by extracting `llm:describe` annotations from
5//! XSD appinfo and injecting them as `description` fields on the RNC structs.
6
7use std::collections::HashMap;
8use std::path::Path;
9
10use xot::Xot;
11
12use clayers_xml::rnc::RncSchema;
13
14/// Export all schemas as RNC with `llm:describe` annotations as comments.
15///
16/// # Errors
17///
18/// Returns an error if schema files cannot be read or parsed.
19pub fn export_rnc(schema_dir: &Path) -> Result<RncSchema, crate::Error> {
20    let mut schema = clayers_xml::rnc::xsd_to_rnc(schema_dir, &[])?;
21    fuse_descriptions(schema_dir, &mut schema)?;
22    Ok(schema)
23}
24
25/// Export specific layers (by prefix) as RNC with `llm:describe` comments.
26///
27/// # Errors
28///
29/// Returns an error if schema files cannot be read or parsed.
30pub fn export_rnc_filtered(
31    schema_dir: &Path,
32    prefixes: &[&str],
33) -> Result<RncSchema, crate::Error> {
34    let mut schema = export_rnc(schema_dir)?;
35    schema
36        .layers
37        .retain(|layer| prefixes.contains(&layer.prefix.as_str()));
38    Ok(schema)
39}
40
/// Return the local name of a type reference, i.e. everything after the
/// last `:`. A reference without any `:` is returned unchanged.
fn split_type_local(type_ref: &str) -> &str {
    match type_ref.rfind(':') {
        // ':' is a single byte, so `colon + 1` is always a char boundary.
        Some(colon) => &type_ref[colon + 1..],
        None => type_ref,
    }
}
45
46/// Extract `llm:describe` text from a schema root or type element.
47fn extract_llm_describe(xot: &mut Xot, node: xot::Node, llm_uri: &str) -> Option<String> {
48    let xs_ns = xot.add_namespace("http://www.w3.org/2001/XMLSchema");
49    let annotation = xot.add_name_ns("annotation", xs_ns);
50    let appinfo = xot.add_name_ns("appinfo", xs_ns);
51    let llm_ns = xot.add_namespace(llm_uri);
52    let describe = xot.add_name_ns("describe", llm_ns);
53
54    for ann_child in xot.children(node) {
55        if !xot.is_element(ann_child)
56            || xot.element(ann_child).is_none_or(|e| e.name() != annotation)
57        {
58            continue;
59        }
60        for app_child in xot.children(ann_child) {
61            if !xot.is_element(app_child)
62                || xot
63                    .element(app_child)
64                    .is_none_or(|e| e.name() != appinfo)
65            {
66                continue;
67            }
68            for desc_child in xot.children(app_child) {
69                if xot.is_element(desc_child)
70                    && xot
71                        .element(desc_child)
72                        .is_some_and(|e| e.name() == describe)
73                {
74                    let text = xot.text_content_str(desc_child).unwrap_or("").trim().to_string();
75                    if !text.is_empty() {
76                        // Normalize whitespace.
77                        let normalized: String = text.split_whitespace().collect::<Vec<_>>().join(" ");
78                        return Some(normalized);
79                    }
80                }
81            }
82        }
83    }
84    None
85}
86
87/// Walk XSD files and inject `llm:describe` text into matching RNC structs.
88fn fuse_descriptions(schema_dir: &Path, schema: &mut RncSchema) -> Result<(), crate::Error> {
89    let mut xsd_paths: Vec<_> = std::fs::read_dir(schema_dir)?
90        .filter_map(|e| e.ok().map(|e| e.path()))
91        .filter(|p| p.extension().is_some_and(|ext| ext == "xsd"))
92        .collect();
93    xsd_paths.sort();
94
95    let uri_to_prefix: HashMap<String, String> = schema
96        .namespaces
97        .iter()
98        .map(|ns| (ns.uri.clone(), ns.prefix.clone()))
99        .collect();
100
101    // Find the LLM namespace URI from auto-discovered namespaces.
102    let llm_uri = schema
103        .namespaces
104        .iter()
105        .find(|ns| ns.uri == "urn:clayers:llm")
106        .map(|ns| ns.uri.clone());
107
108    // We need a persistent Xot so we can use add_namespace (takes &mut).
109    // But extract_llm_describe also calls add_namespace. To work around this,
110    // parse all files first, collect the descriptions, then apply them.
111    let mut layer_descs: HashMap<String, String> = HashMap::new();
112    let mut type_descs: HashMap<(String, String), String> = HashMap::new();
113    let mut elem_descs: HashMap<(String, String), String> = HashMap::new();
114
115    // If no LLM namespace was discovered, skip description extraction entirely.
116    let Some(llm_uri) = llm_uri else {
117        return Ok(());
118    };
119
120    for xsd_path in &xsd_paths {
121        let content = std::fs::read_to_string(xsd_path)?;
122        let mut xot = Xot::new();
123        let doc = xot.parse(&content).map_err(xot::Error::from)?;
124        let root = xot.document_element(doc)?;
125
126        let tns_attr = xot.add_name("targetNamespace");
127        let tns = xot.get_attribute(root, tns_attr)
128            .unwrap_or("")
129            .to_string();
130        // Skip XSD files whose namespace was not discovered (not in the schema).
131        let Some(pfx) = uri_to_prefix.get(&tns).cloned() else {
132            continue;
133        };
134
135        // Schema-level llm:describe -> layer description.
136        if let Some(desc) = extract_llm_describe(&mut xot, root, &llm_uri) {
137            layer_descs.insert(pfx.clone(), desc);
138        }
139
140        // complexType-level llm:describe -> pattern/element type descriptions.
141        let xs_ns = xot.add_namespace("http://www.w3.org/2001/XMLSchema");
142        let complex_type = xot.add_name_ns("complexType", xs_ns);
143        let element_tag = xot.add_name_ns("element", xs_ns);
144        let name_attr = xot.add_name("name");
145        let type_attr = xot.add_name("type");
146
147        // Collect child info first to avoid borrow conflicts with extract_llm_describe.
148        let child_info: Vec<(xot::Node, xot::NameId, Option<String>, Option<String>)> = xot
149            .children(root)
150            .filter(|c| xot.is_element(*c))
151            .filter_map(|c| {
152                let el = xot.element(c)?;
153                let cn = el.name();
154                let nm = xot.get_attribute(c, name_attr).map(String::from);
155                let tr = xot.get_attribute(c, type_attr).map(String::from);
156                Some((c, cn, nm, tr))
157            })
158            .collect();
159
160        for (child, child_name, name_val, type_ref_val) in child_info {
161            let Some(n) = name_val else { continue };
162            if child_name == complex_type {
163                if let Some(desc) = extract_llm_describe(&mut xot, child, &llm_uri) {
164                    type_descs.insert((pfx.clone(), n), desc);
165                }
166            } else if child_name == element_tag {
167                if let Some(desc) = extract_llm_describe(&mut xot, child, &llm_uri) {
168                    elem_descs.insert((pfx.clone(), n.clone()), desc);
169                }
170                let key = (pfx.clone(), n.clone());
171                if !elem_descs.contains_key(&key)
172                    && let Some(type_ref) = &type_ref_val
173                {
174                    let local = split_type_local(type_ref);
175                    if let Some(desc) = type_descs.get(&(pfx.clone(), local.to_string())) {
176                        elem_descs.insert(key, desc.clone());
177                    }
178                }
179            }
180        }
181    }
182
183    // Apply descriptions to the schema.
184    for layer in &mut schema.layers {
185        if let Some(desc) = layer_descs.get(&layer.prefix) {
186            layer.description = Some(desc.clone());
187        }
188        for pat in &mut layer.patterns {
189            let key = (layer.prefix.clone(), pat.name.clone());
190            if let Some(desc) = type_descs.get(&key) {
191                pat.description = Some(desc.clone());
192            }
193        }
194        for elem in &mut layer.elements {
195            let key = (layer.prefix.clone(), elem.name.clone());
196            if let Some(desc) = elem_descs.get(&key) {
197                elem.description = Some(desc.clone());
198            }
199        }
200    }
201
202    Ok(())
203}
204
205/// Format an `RncSchema` as a string (convenience wrapper around `Display`).
206///
207/// This produces the same output as `schema.to_string()` but makes the
208/// intent explicit.
209#[must_use]
210pub fn render(schema: &RncSchema) -> String {
211    schema.to_string()
212}
213
214
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Absolute path of the `schemas/` directory, two levels above this crate.
    fn schemas_dir() -> PathBuf {
        let relative = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../schemas");
        relative
            .canonicalize()
            .expect("schemas/ directory not found")
    }

    /// Export every schema, panicking with a uniform message on failure.
    fn export_all() -> RncSchema {
        export_rnc(&schemas_dir()).expect("export_rnc failed")
    }

    #[test]
    fn export_rnc_produces_output_with_namespaces() {
        let schema = export_all();
        let output = schema.to_string();
        let output_len = output.len();
        assert!(output_len > 100, "Output too short: {}", output_len);
        assert!(
            output.contains("namespace"),
            "Missing namespace declarations"
        );
        // Should have multiple layers.
        let layer_count = schema.layers.len();
        assert!(layer_count >= 10, "Expected 10+ layers, got {}", layer_count);
    }

    #[test]
    fn export_rnc_has_llm_describe_comments() {
        let output = export_all().to_string();
        // prose.xsd has llm:describe on the schema root and on SectionType.
        assert!(
            output.contains("# The prose schema provides"),
            "Missing prose layer llm:describe comment in output:\n{output}"
        );
    }

    #[test]
    fn export_rnc_filtered_returns_single_layer() {
        let only_prose =
            export_rnc_filtered(&schemas_dir(), &["pr"]).expect("export_rnc_filtered failed");
        assert_eq!(only_prose.layers.len(), 1);
        assert_eq!(only_prose.layers[0].prefix, "pr");
    }

    #[test]
    fn export_rnc_recursive_types_are_named_patterns() {
        let output = export_all().to_string();
        // SectionType in prose is recursive (section contains section).
        assert!(
            output.contains("SectionType ="),
            "SectionType should be a named pattern: {output}"
        );
    }

    #[test]
    fn export_rnc_topicref_recursive() {
        let output = export_all().to_string();
        // TopicRefType in organization is recursive.
        assert!(
            output.contains("TopicRefType ="),
            "TopicRefType should be a named pattern: {output}"
        );
    }

    #[test]
    fn render_produces_same_as_display() {
        let schema = export_all();
        let via_display = schema.to_string();
        assert_eq!(render(&schema), via_display);
    }
}