// config_disassembler/disassemble.rs
1//! Disassemble a JSON, JSON5, YAML, or TOML document into a directory of
2//! smaller files, optionally written in a different format than the input.
3//!
4//! The `input` may be either a single file or a directory. When it points
5//! at a directory, every file under the directory whose extension matches
6//! the input format (or, when `input_format` is `None`, any of the four
7//! supported formats) is disassembled in place. An optional `ignore_path`
8//! can point at a `.gitignore`-style ignore file used to skip paths.
9
10use std::collections::{BTreeMap, BTreeSet};
11use std::fs;
12use std::path::{Path, PathBuf};
13
14use ignore::gitignore::{Gitignore, GitignoreBuilder};
15use serde_json::{Map, Value};
16use sha2::{Digest, Sha256};
17
18use crate::error::{Error, Result};
19use crate::format::Format;
20use crate::ignore_file::DEFAULT_IGNORE_FILENAME;
21use crate::meta::{Meta, Root};
22
/// Basename (without extension) of the file written for object roots
/// that collects all scalar top-level keys into a single document.
const MAIN_BASENAME: &str = "_main";
25
/// Options controlling disassembly.
///
/// Construct directly, or use [`DisassembleOptions::for_file`] for
/// single-file defaults.
#[derive(Debug, Clone)]
pub struct DisassembleOptions {
    /// Path to the input. May be either a single config file or a
    /// directory; when it is a directory, every matching file under it
    /// is disassembled in place (see also `ignore_path`).
    pub input: PathBuf,
    /// Format to read the input as. If `None`, the format is inferred
    /// from each file's extension.
    pub input_format: Option<Format>,
    /// Directory to write split files into. Only meaningful when
    /// `input` is a single file; for directory inputs each file's
    /// output goes into a sibling directory named after that file's
    /// stem (mirroring the XML disassembler's behavior).
    pub output_dir: Option<PathBuf>,
    /// Format to write split files in. Defaults to `input_format`.
    pub output_format: Option<Format>,
    /// For array roots, name array-element files using the value of this
    /// field if present on each element (must be a scalar).
    pub unique_id: Option<String>,
    /// If true, remove the contents of the output directory before writing.
    pub pre_purge: bool,
    /// If true, delete the input file (or input directory) after
    /// disassembling. For directory inputs the entire directory is
    /// removed only if every file in it was successfully disassembled.
    pub post_purge: bool,
    /// Optional path to a `.gitignore`-style ignore file that filters
    /// which files are processed when `input` is a directory. Pass
    /// `None` to use [`DEFAULT_IGNORE_FILENAME`] in the input directory
    /// (silently absent if the file does not exist). Ignored entirely
    /// for single-file inputs.
    pub ignore_path: Option<PathBuf>,
}
59
60impl DisassembleOptions {
61    /// Build options for a single-file disassembly with sensible
62    /// defaults. Directory walks should construct `DisassembleOptions`
63    /// directly so they can opt into `ignore_path`.
64    pub fn for_file(input: PathBuf) -> Self {
65        Self {
66            input,
67            input_format: None,
68            output_dir: None,
69            output_format: None,
70            unique_id: None,
71            pre_purge: false,
72            post_purge: false,
73            ignore_path: None,
74        }
75    }
76}
77
78/// Disassemble a configuration file (or directory of files) into split
79/// files.
80///
81/// * When `opts.input` is a regular file, returns the directory the files
82///   were written to (i.e. the single output directory for that file).
83/// * When `opts.input` is a directory, every matching file under it is
84///   disassembled in place and the input directory itself is returned.
85pub fn disassemble(opts: DisassembleOptions) -> Result<PathBuf> {
86    let metadata = fs::metadata(&opts.input)?;
87    if metadata.is_dir() {
88        return disassemble_directory(opts);
89    }
90    disassemble_file(opts)
91}
92
93/// Disassemble a single file. Equivalent to the previous behavior of
94/// [`disassemble`].
95fn disassemble_file(opts: DisassembleOptions) -> Result<PathBuf> {
96    let input_format = match opts.input_format {
97        Some(f) => f,
98        None => Format::from_path(&opts.input)?,
99    };
100    let output_format = opts.output_format.unwrap_or(input_format);
101    enforce_toml_isolation(input_format, output_format)?;
102
103    let output_dir = match opts.output_dir.clone() {
104        Some(d) => d,
105        None => default_output_dir(&opts.input)?,
106    };
107
108    if opts.pre_purge && output_dir.exists() {
109        fs::remove_dir_all(&output_dir)?;
110    }
111    fs::create_dir_all(&output_dir)?;
112
113    let value = input_format.load(&opts.input)?;
114    let source_filename = opts
115        .input
116        .file_name()
117        .and_then(|n| n.to_str())
118        .map(|s| s.to_string());
119
120    let root = match &value {
121        Value::Object(map) => write_object_root(&output_dir, map, output_format)?,
122        Value::Array(items) => {
123            write_array_root(&output_dir, items, output_format, opts.unique_id.as_deref())?
124        }
125        _ => {
126            return Err(Error::Invalid(
127                "top-level value must be an object or array to disassemble".into(),
128            ));
129        }
130    };
131
132    let meta = Meta {
133        source_format: input_format.into(),
134        file_format: output_format.into(),
135        source_filename,
136        root,
137    };
138    meta.write(&output_dir)?;
139
140    if opts.post_purge {
141        fs::remove_file(&opts.input)?;
142    }
143
144    Ok(output_dir)
145}
146
147/// Disassemble every matching file under a directory. Each file's split
148/// output is placed in a sibling directory named after the file's stem,
149/// matching how the XML disassembler treats directory inputs.
150fn disassemble_directory(opts: DisassembleOptions) -> Result<PathBuf> {
151    if opts.output_dir.is_some() {
152        return Err(Error::Usage(
153            "--output-dir is not supported with a directory input; each file's split output is written next to it".into(),
154        ));
155    }
156
157    let root = opts.input.clone();
158    let ignore = load_ignore_rules(opts.ignore_path.as_deref(), &root)?;
159
160    let mut targets = collect_disassemble_targets(&root, &ignore, opts.input_format)?;
161    targets.sort();
162
163    for file in &targets {
164        let mut child_opts = opts.clone();
165        child_opts.input = file.clone();
166        // Each file's output goes into <stem>/ next to the file itself,
167        // never into a shared --output-dir (we rejected that above).
168        child_opts.output_dir = None;
169        // Per-file post_purge would only delete the file; we honor the
170        // user's intent by keeping post_purge here so each input file is
171        // removed if requested, then we remove the (now empty) input
172        // directory at the very end below.
173        disassemble_file(child_opts)?;
174    }
175
176    if opts.post_purge {
177        // Only remove the input directory if it is now empty (every
178        // file we looked at was post-purged and no other content
179        // remains). Otherwise leave it alone so we don't clobber files
180        // the user kept around (output dirs, the ignore file, etc.).
181        if directory_is_empty(&root)? {
182            fs::remove_dir_all(&root)?;
183        }
184    }
185
186    Ok(root)
187}
188
189/// Walk `root` and collect every file whose extension matches one of the
190/// supported formats (or, if `expected_format` is `Some`, only that
191/// format), excluding paths matched by `ignore`.
192fn collect_disassemble_targets(
193    root: &Path,
194    ignore: &Option<Gitignore>,
195    expected_format: Option<Format>,
196) -> Result<Vec<PathBuf>> {
197    let mut out = Vec::new();
198    let mut stack = vec![root.to_path_buf()];
199    while let Some(dir) = stack.pop() {
200        for entry in fs::read_dir(&dir)? {
201            let entry = entry?;
202            let path = entry.path();
203            let ft = entry.file_type()?;
204            if is_ignored(ignore, root, &path, ft.is_dir()) {
205                continue;
206            }
207            if ft.is_dir() {
208                stack.push(path);
209                continue;
210            }
211            if !ft.is_file() {
212                continue;
213            }
214            // Only look at files whose extension parses as a known
215            // format, and (when input_format was set) only the matching
216            // format. Anything else is silently skipped — a directory of
217            // mixed config files commonly contains README/.git/etc.
218            let detected = match Format::from_path(&path) {
219                Ok(f) => f,
220                Err(_) => continue,
221            };
222            if let Some(expected) = expected_format {
223                if expected != detected {
224                    continue;
225                }
226            }
227            out.push(path);
228        }
229    }
230    Ok(out)
231}
232
233fn load_ignore_rules(explicit: Option<&Path>, fallback_dir: &Path) -> Result<Option<Gitignore>> {
234    let path = match explicit {
235        Some(p) => p.to_path_buf(),
236        None => fallback_dir.join(DEFAULT_IGNORE_FILENAME),
237    };
238    if !path.exists() {
239        return Ok(None);
240    }
241    let content = fs::read_to_string(&path)?;
242    let anchor = path.parent().unwrap_or(Path::new("."));
243    let mut builder = GitignoreBuilder::new(anchor);
244    for line in content.lines() {
245        // `add_line` returns a pattern-error on malformed globs; mirror
246        // the XML disassembler's tolerant parsing and skip bad lines
247        // rather than failing the whole run.
248        let _ = builder.add_line(None, line);
249    }
250    Ok(builder.build().ok())
251}
252
253fn is_ignored(ignore: &Option<Gitignore>, root: &Path, path: &Path, is_dir: bool) -> bool {
254    let Some(ign) = ignore.as_ref() else {
255        return false;
256    };
257    let candidate = path.strip_prefix(root).unwrap_or(path);
258    ign.matched(candidate, is_dir).is_ignore()
259}
260
261fn directory_is_empty(dir: &Path) -> Result<bool> {
262    let mut entries = fs::read_dir(dir)?;
263    Ok(entries.next().is_none())
264}
265
266/// Enforce TOML's isolation rule: TOML can only be converted to and
267/// from TOML. Mixing TOML with another format would lose information
268/// (TOML cannot represent `null` or array roots) or reorder values
269/// (TOML's bare-keys-before-tables rule), so refuse the operation up
270/// front with a clear error.
271fn enforce_toml_isolation(input: Format, output: Format) -> Result<()> {
272    if (input == Format::Toml) != (output == Format::Toml) {
273        return Err(Error::Invalid(format!(
274            "TOML can only be converted to and from TOML; got input={input}, output={output}"
275        )));
276    }
277    Ok(())
278}
279
280fn default_output_dir(input: &Path) -> Result<PathBuf> {
281    let stem = input.file_stem().and_then(|s| s.to_str()).ok_or_else(|| {
282        Error::Invalid(format!(
283            "could not derive a directory name from {}",
284            input.display()
285        ))
286    })?;
287    let parent = input.parent().unwrap_or(Path::new("."));
288    Ok(parent.join(stem))
289}
290
/// Split an object root: each non-scalar top-level key gets its own
/// file, while scalar keys are gathered into a single `_main` file.
/// Returns the layout metadata (original key order, key -> filename
/// mapping, optional main file) that reassembly needs to rebuild the
/// exact document.
fn write_object_root(dir: &Path, map: &Map<String, Value>, fmt: Format) -> Result<Root> {
    let mut key_order: Vec<String> = Vec::with_capacity(map.len());
    let mut key_files: BTreeMap<String, String> = BTreeMap::new();
    let mut main_object: Map<String, Value> = Map::new();
    let mut used_names: BTreeSet<String> = BTreeSet::new();
    // Reserve `_main.<ext>` before handing out any names, so a key that
    // sanitizes to "_main" is forced onto the hash-suffixed fallback.
    used_names.insert(format!("{MAIN_BASENAME}.{}", fmt.extension()));

    for (key, value) in map {
        key_order.push(key.clone());
        if is_scalar(value) {
            // Scalars are grouped into the shared `_main` file instead
            // of getting one tiny file each.
            main_object.insert(key.clone(), value.clone());
            continue;
        }

        let filename = unique_filename_for_key(key, fmt, &used_names);
        used_names.insert(filename.clone());
        let path = dir.join(&filename);
        // TOML output re-wraps the value under its key (TOML requires a
        // table root); other formats write the value as-is.
        let payload = wrap_per_key_payload(fmt, key, value);
        fs::write(&path, fmt.serialize(&payload)?)?;
        key_files.insert(key.clone(), filename);
    }

    // Write `_main` only when at least one scalar key exists.
    let main_file = if main_object.is_empty() {
        None
    } else {
        let filename = format!("{MAIN_BASENAME}.{}", fmt.extension());
        let path = dir.join(&filename);
        fs::write(&path, fmt.serialize(&Value::Object(main_object))?)?;
        Some(filename)
    };

    Ok(Root::Object {
        key_order,
        key_files,
        main_file,
    })
}
328
/// Split an array root into one file per element. Naming falls through
/// three stages, in order: the sanitized value of the element's
/// `unique_id` field (when requested and present), else a 1-based
/// zero-padded index, and on any remaining filename collision a short
/// content-hash suffix. Returns the ordered file list for reassembly.
fn write_array_root(
    dir: &Path,
    items: &[Value],
    fmt: Format,
    unique_id: Option<&str>,
) -> Result<Root> {
    let mut files = Vec::with_capacity(items.len());
    let mut used_names: BTreeSet<String> = BTreeSet::new();
    let width = digit_width(items.len());

    for (idx, item) in items.iter().enumerate() {
        // Stage 1: preferred name from the unique-id field, if usable.
        let mut basename = if let Some(field) = unique_id {
            unique_id_basename(item, field)
        } else {
            None
        };
        // A duplicate unique-id falls back to the index-based name so
        // two elements never silently share one file.
        if basename
            .as_ref()
            .map(|n| used_names.contains(&format!("{n}.{}", fmt.extension())))
            .unwrap_or(false)
        {
            basename = None;
        }
        // Stage 2: zero-padded 1-based index.
        let basename = basename.unwrap_or_else(|| format!("{:0width$}", idx + 1, width = width));

        // Stage 3: if even that name is taken, disambiguate with a
        // short hash of the element's canonical JSON.
        let mut filename = format!("{basename}.{}", fmt.extension());
        if used_names.contains(&filename) {
            filename = format!("{basename}-{}.{}", hash_value(item, 8), fmt.extension());
        }
        used_names.insert(filename.clone());

        let path = dir.join(&filename);
        fs::write(&path, fmt.serialize(item)?)?;
        files.push(filename);
    }

    Ok(Root::Array { files })
}
367
368/// For TOML output, wrap each per-key payload under its parent key
369/// before serialization. TOML documents must have a table (object)
370/// root, so writing a bare array (e.g. an array-of-tables under a
371/// key like `servers`) would fail. Wrapping produces an idiomatic
372/// TOML file (e.g. `[[servers]]` headers in `servers.toml`) that
373/// reassembly can unwrap deterministically using the metadata.
374///
375/// For the other formats the payload is the value itself; cross-format
376/// round-tripping continues to work unchanged.
377fn wrap_per_key_payload(fmt: Format, key: &str, value: &Value) -> Value {
378    if fmt == Format::Toml {
379        let mut wrapper = Map::new();
380        wrapper.insert(key.to_string(), value.clone());
381        Value::Object(wrapper)
382    } else {
383        value.clone()
384    }
385}
386
387fn is_scalar(value: &Value) -> bool {
388    !matches!(value, Value::Object(_) | Value::Array(_))
389}
390
/// Width, in decimal digits, used to zero-pad index-based filenames:
/// the digit count of `count`, but never fewer than 4 (so small arrays
/// still sort lexicographically as 0001, 0002, ...).
fn digit_width(count: usize) -> usize {
    let mut digits = 1;
    let mut remaining = count / 10;
    while remaining > 0 {
        digits += 1;
        remaining /= 10;
    }
    digits.max(4)
}
400
401fn unique_filename_for_key(key: &str, fmt: Format, used: &BTreeSet<String>) -> String {
402    let sanitized = sanitize(key);
403    let base = if sanitized.is_empty() {
404        hash_string(key, 12)
405    } else {
406        sanitized
407    };
408    let mut filename = format!("{base}.{}", fmt.extension());
409    if used.contains(&filename) {
410        filename = format!("{base}-{}.{}", hash_string(key, 8), fmt.extension());
411    }
412    filename
413}
414
415fn unique_id_basename(item: &Value, field: &str) -> Option<String> {
416    let map = item.as_object()?;
417    let raw = match map.get(field)? {
418        Value::String(s) => s.clone(),
419        Value::Number(n) => n.to_string(),
420        Value::Bool(b) => b.to_string(),
421        _ => return None,
422    };
423    let s = sanitize(&raw);
424    if s.is_empty() {
425        None
426    } else {
427        Some(s)
428    }
429}
430
/// Map a string onto the filename-safe alphabet `[A-Za-z0-9._-]`
/// (everything else becomes `_`), then strip leading/trailing dots so
/// the result is never hidden-file- or bare-extension-shaped.
fn sanitize(input: &str) -> String {
    let mapped: String = input
        .chars()
        .map(|c| match c {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '.' => c,
            _ => '_',
        })
        .collect();
    mapped.trim_matches('.').to_string()
}
445
446fn hash_string(input: &str, len: usize) -> String {
447    let digest = Sha256::digest(input.as_bytes());
448    let hex: String = digest.iter().map(|b| format!("{b:02x}")).collect();
449    hex.chars().take(len).collect()
450}
451
452fn hash_value(value: &Value, len: usize) -> String {
453    let canonical = serde_json::to_string(value).unwrap_or_default();
454    hash_string(&canonical, len)
455}