// dump-json-refs 0.1.0

use anyhow::{anyhow, bail, Context, Result};
use clap::parser::ValueSource;
use clap::{CommandFactory, FromArgMatches, Parser, ValueEnum};
use rusqlite::{params, Connection, Transaction};
use serde::Serialize;
use serde_json::{json, Map, Value};
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::fs;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
use std::time::Instant;

/// Placeholder path meaning "no explicit graph file was given".
///
/// This is the same string as the `default_missing_value` of the `--graph`
/// argument in `Args`, i.e. the value clap stores when `--graph` is passed
/// without a FILE. NOTE(review): presumably interpreted by
/// `resolve_graph_output_path`, which is not visible in this part of the file.
const DEFAULT_GRAPH_OUTPUT: &str = "__default_graph_path__";

// Output formats selectable with `--graph-format`.
//
// Plain `//` comments are used here on purpose: with clap's `ValueEnum` derive,
// `///` doc comments on variants become help text of the generated possible
// values, which would change the program's output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
enum GraphFormat {
    // Default file name: schema.mmd (also the default value of `--graph-format`).
    Mermaid,
    // Default file name: schema.md.
    MermaidMd,
    // Default file name: schema.dot.
    Dot,
}

impl GraphFormat {
    fn default_filename(self) -> &'static str {
        match self {
            GraphFormat::Mermaid => "schema.mmd",
            GraphFormat::MermaidMd => "schema.md",
            GraphFormat::Dot => "schema.dot",
        }
    }
}

// Command-line arguments, parsed through clap's derive API.
//
// NOTE: the `///` comments inside this struct are clap help text (runtime
// output). Maintainer notes are therefore written as plain `//` comments.
#[derive(Parser, Debug)]
#[command(name = "dump-json-refs")]
#[command(about = "Generate JSON schema refs and a SQLite path index from JSON/JSONL input")]
struct Args {
    /// Force JSONL mode. Without this flag, *.jsonl input files are also treated as JSONL.
    #[arg(long)]
    jsonl: bool,

    /// Output compact one-line JSON files under the refs directory.
    #[arg(short = 'c', long = "compact-output")]
    compact_output: bool,

    /// Write the full report to this file. Stdout prints only the summary.
    #[arg(short = 'o', long = "output", value_name = "FILE")]
    report_output: Option<PathBuf>,

    /// Read report data from an existing SQLite index instead of generating refs.
    /// When the flag is used without a value, refs/schemas.sqlite is used.
    // `run` rejects the combination of this option with an input file.
    #[arg(
        long = "from-sqlite",
        value_name = "FILE",
        num_args = 0..=1,
        default_missing_value = "refs/schemas.sqlite"
    )]
    from_sqlite: Option<PathBuf>,

    /// Generate a graph projection from SQLite schema relations.
    ///
    /// When FILE is omitted, the default path depends on --graph-format:
    /// mermaid -> `<outdir>/schema.mmd`, mermaid-md -> `<outdir>/schema.md`,
    /// dot -> `<outdir>/schema.dot`. In --from-sqlite mode, `<outdir>` is
    /// replaced by the SQLite file's parent directory. Existing refs are not
    /// removed in --from-sqlite mode.
    // The `default_missing_value` literal below is the same string as the
    // `DEFAULT_GRAPH_OUTPUT` constant; keep the two in sync.
    #[arg(
        long = "graph",
        value_name = "FILE",
        num_args = 0..=1,
        default_missing_value = "__default_graph_path__"
    )]
    graph_output: Option<PathBuf>,

    /// Graph output format. Also generates the graph when --graph is omitted.
    ///
    /// Mermaid is the default for quick GitHub/Markdown feedback.
    #[arg(
        long = "graph-format",
        default_value_t = GraphFormat::Mermaid,
        value_enum,
        value_name = "mermaid|mermaid-md|dot",
        hide_possible_values = true
    )]
    graph_format: GraphFormat,

    /// Include structural marker relations such as nested array items.
    ///
    /// Also generates the graph when --graph is omitted.
    #[arg(long = "graph-include-marked")]
    graph_include_marked: bool,

    /// Graph layout direction used by Mermaid and DOT output.
    ///
    /// Also generates the graph when --graph is omitted.
    #[arg(
        long = "graph-rankdir",
        default_value = "LR",
        value_name = "LR|TB|RL|BT",
        value_parser = ["LR", "TB", "RL", "BT"],
        hide_possible_values = true
    )]
    graph_rankdir: String,

    // Not a command-line option (`skip`): `main` sets it from
    // `graph_arg_was_provided`, i.e. true when any graph-related option was
    // given explicitly on the command line.
    #[arg(skip)]
    graph_requested: bool,

    /// Output directory. It is removed and recreated.
    #[arg(long, default_value = "refs")]
    outdir: PathBuf,

    /// Input JSON/JSONL file. Reads stdin when omitted.
    input_file: Option<PathBuf>,
}

/// One top-level JSON object after input normalization.
#[derive(Debug, Clone)]
struct Entry {
    /// Path segments of the entry's root; the normalizers always produce a
    /// single segment (e.g. `root`, `root_item`, `<base>` or `<base>_ref`).
    path: Vec<String>,
    /// Position within the top-level collection, or `None` for a single root object.
    index: Option<usize>,
    /// `true` when the entry came from a top-level array, a JSONL stream or a
    /// multi-value JSON stream; `false` for a single root object.
    collection: bool,
    /// The object's fields.
    value: Map<String, Value>,
}

/// How an object occurrence was reached while walking the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Role {
    /// A single root object, or an object stored directly under an object key.
    Object,
    /// An object found inside an array (possibly through nested arrays).
    ArrayItem,
    /// A member of the top-level collection (entries with `collection == true`).
    RootCollection,
}

/// A single object found in the input, together with where it was found.
#[derive(Debug, Clone)]
struct Occurrence {
    /// Key path from the root down to this object (array positions are not part of it).
    segments: Vec<String>,
    /// The object's own fields, including nested values.
    value: Map<String, Value>,
    /// How the object was reached; selects which record builder handles it.
    role: Role,
    /// Indexes of every enclosing array position on the way to this object,
    /// starting with the root-collection index when there is one.
    array_indexes: Vec<usize>,
}

/// A distinct schema produced by the record builders.
#[derive(Debug, Clone, Serialize)]
struct Record {
    /// Path of the schema file that represents this record.
    schema_path: String,
    /// Every object path sharing this schema; for plain object records this is
    /// sorted and its first element equals `schema_path`.
    object_paths: Vec<String>,
    /// The schema itself: an object mapping field names to type labels, or a
    /// `{"$refs_mut": ...}` placeholder for arrays with several item schemas.
    schema: Value,
    /// `<array base>.json` for records built from array or root-collection
    /// items; `None` otherwise. Omitted from the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    array_parent: Option<String>,
    /// Sorted, de-duplicated index paths of the items that have this schema.
    /// Omitted from the serialized form when empty.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    array_index_paths: Vec<Vec<usize>>,
}

/// Result of `build_schema_output`: the derived schema records plus every
/// object occurrence they were derived from.
#[derive(Debug)]
struct SchemaBuild {
    /// Object records, then array records, then root-collection records.
    records: Vec<Record>,
    /// All occurrences, in the order they were collected.
    occurrences: Vec<Occurrence>,
}

fn main() -> Result<()> {
    let matches = Args::command().get_matches();
    let graph_requested = graph_arg_was_provided(&matches);
    let mut args = Args::from_arg_matches(&matches)?;
    args.graph_requested = graph_requested;
    run(args)
}

/// Returns `true` when at least one graph-related argument got its value from
/// the command line (as opposed to a default).
fn graph_arg_was_provided(matches: &clap::ArgMatches) -> bool {
    const GRAPH_ARG_IDS: [&str; 4] = [
        "graph_output",
        "graph_format",
        "graph_include_marked",
        "graph_rankdir",
    ];

    for id in GRAPH_ARG_IDS {
        if matches.value_source(id) == Some(ValueSource::CommandLine) {
            return true;
        }
    }
    false
}

/// Executes the tool in one of two modes.
///
/// * `--from-sqlite`: loads the report from an existing SQLite index, emits
///   it, and optionally writes a graph. Nothing is regenerated or removed.
/// * Otherwise: reads and normalizes the input, builds the schema records,
///   recreates the output directory, writes the files and index, then emits
///   the report (read back from `<outdir>/schemas.sqlite`) and optionally a graph.
///
/// # Errors
///
/// Fails when an input file is combined with `--from-sqlite`, when the output
/// directory is rejected as unsafe, or when any read/parse/write step fails.
fn run(args: Args) -> Result<()> {
    if let Some(sqlite_path) = args.from_sqlite.as_deref() {
        if args.input_file.is_some() {
            bail!("input file cannot be used with --from-sqlite");
        }

        let report = load_report_from_sqlite(sqlite_path)?;
        emit_report(
            &report,
            &ReportSummary::FromSqlite {
                sqlite_path: sqlite_path.display().to_string(),
            },
            args.report_output.as_deref(),
        )?;

        // The SQLite path is passed along so the default graph location can be
        // derived from it (see the `--graph` help text).
        if let Some(graph_output) = resolve_graph_output_path(
            args.graph_requested,
            args.graph_output.as_deref(),
            &args.outdir,
            Some(sqlite_path),
            args.graph_format,
        ) {
            write_graph_from_sqlite(
                sqlite_path,
                &graph_output,
                args.graph_include_marked,
                &args.graph_rankdir,
                args.graph_format,
            )?;
        }
        return Ok(());
    }

    // The timer covers everything up to the report being emitted.
    let started = Instant::now();
    reject_unsafe_outdir(&args.outdir)?;

    let input_source = args
        .input_file
        .as_ref()
        .map(|path| path.display().to_string())
        .unwrap_or_else(|| "stdin".to_string());

    // Input is fully read, parsed and turned into records BEFORE the output
    // directory is touched, so a bad input leaves existing refs in place.
    let (input, file_mode, base, source_is_jsonl) = read_input(&args)?;
    let jsonl_mode = args.jsonl || source_is_jsonl;
    let entries = normalize_input(&input, jsonl_mode, file_mode, &base)?;
    let build = build_schema_output(&args.outdir, entries)?;

    // Destructive step: the output directory is removed and recreated.
    if args.outdir.exists() {
        fs::remove_dir_all(&args.outdir)
            .with_context(|| format!("failed to remove {}", args.outdir.display()))?;
    }
    fs::create_dir_all(&args.outdir)
        .with_context(|| format!("failed to create {}", args.outdir.display()))?;

    write_files_and_index(
        &args.outdir,
        &build.records,
        &build.occurrences,
        args.compact_output,
    )?;

    // The report is read back from the index that was just written.
    let database = args.outdir.join("schemas.sqlite");
    let report = load_report_from_sqlite(&database)?;
    emit_report(
        &report,
        &ReportSummary::Generated {
            execution_time_ms: started.elapsed().as_millis(),
            input_source,
        },
        args.report_output.as_deref(),
    )?;

    if let Some(graph_output) = resolve_graph_output_path(
        args.graph_requested,
        args.graph_output.as_deref(),
        &args.outdir,
        None,
        args.graph_format,
    ) {
        write_graph_from_sqlite(
            &database,
            &graph_output,
            args.graph_include_marked,
            &args.graph_rankdir,
            args.graph_format,
        )?;
    }

    Ok(())
}

/// Refuses output directories that lexically resolve to the filesystem root,
/// the current directory, or an ancestor of the current directory.
///
/// The caller removes the output directory recursively, so spellings such as
/// `./`, `refs/..`, `../..` or `//` must be rejected just like the literal
/// `/`, `.`, `..` and the empty path. The check is purely lexical: symlinks
/// are not resolved and the filesystem is not touched.
///
/// # Errors
///
/// Returns an error when, after cancelling `..` against preceding named
/// components, no named component remains.
fn reject_unsafe_outdir(outdir: &Path) -> Result<()> {
    use std::path::Component;

    // Number of named components left once `..` has cancelled its predecessor.
    let mut depth = 0usize;
    for component in outdir.components() {
        match component {
            Component::Normal(_) => depth += 1,
            // A leading `..` (depth already 0) just stays at 0.
            Component::ParentDir => depth = depth.saturating_sub(1),
            Component::Prefix(_) | Component::RootDir | Component::CurDir => {}
        }
    }

    if depth == 0 {
        bail!("refusing unsafe output directory: {}", outdir.display());
    }
    Ok(())
}

/// Reads the whole input, from the input file when one was given, else from stdin.
///
/// Returns `(contents, file_mode, base, source_is_jsonl)`:
/// * `file_mode` is `true` when an input file was read;
/// * `base` is the file name without a trailing `.jsonl` or `.json`
///   (`"root"` for stdin);
/// * `source_is_jsonl` is `true` when the file name ends with `.jsonl`.
///
/// # Errors
///
/// Fails when the file or stdin cannot be read, or the path has no file name.
fn read_input(args: &Args) -> Result<(String, bool, String, bool)> {
    let Some(path) = args.input_file.as_deref() else {
        let mut buffer = String::new();
        io::stdin()
            .read_to_string(&mut buffer)
            .context("failed to read stdin")?;
        return Ok((buffer, false, "root".to_string(), false));
    };

    let contents = fs::read_to_string(path)
        .with_context(|| format!("cannot read input file: {}", path.display()))?;
    let Some(os_name) = path.file_name() else {
        return Err(anyhow!("invalid input path: {}", path.display()));
    };
    let name = os_name.to_string_lossy();

    // `.jsonl` is tried first; only a non-JSONL name may lose a `.json` suffix.
    let (base, source_is_jsonl) = match name.strip_suffix(".jsonl") {
        Some(stem) => (stem, true),
        None => (name.strip_suffix(".json").unwrap_or(&name), false),
    };
    Ok((contents, true, base.to_string(), source_is_jsonl))
}

/// Parses `input` and converts it into top-level entries.
///
/// In JSONL mode the input is parsed line by line. Otherwise it is parsed as
/// a stream of JSON values; if that fails, a JSONL parse is attempted as a
/// fallback before giving up.
///
/// # Errors
///
/// Fails on blank input, on unparsable input (the JSON error is reported with
/// the JSONL fallback error attached as context), or when the parsed values
/// are rejected by the normalizers.
fn normalize_input(
    input: &str,
    jsonl_mode: bool,
    file_mode: bool,
    base: &str,
) -> Result<Vec<Entry>> {
    if input.trim().is_empty() {
        bail!("input is empty");
    }

    if jsonl_mode {
        return normalize_jsonl_values(parse_jsonl_stream(input)?, file_mode, base);
    }

    match parse_json_stream(input) {
        Ok(documents) => normalize_json_values(documents, file_mode, base),
        Err(json_error) => match parse_jsonl_stream(input) {
            Ok(records) => normalize_jsonl_values(records, file_mode, base),
            Err(jsonl_error) => Err(json_error.context(format!(
                "invalid JSON input; JSONL fallback also failed: {jsonl_error}"
            ))),
        },
    }
}

fn normalize_jsonl_values(values: Vec<Value>, file_mode: bool, base: &str) -> Result<Vec<Entry>> {
    let root = if file_mode {
        format!("{base}_ref")
    } else {
        "root_item".to_string()
    };

    if values.is_empty() {
        bail!("input is empty");
    }

    values
        .into_iter()
        .enumerate()
        .map(|(i, value)| match value {
            Value::Object(map) => Ok(Entry {
                path: vec![root.clone()],
                index: Some(i),
                collection: true,
                value: map,
            }),
            _ => bail!("JSONL values must be objects"),
        })
        .collect()
}

fn normalize_json_values(values: Vec<Value>, file_mode: bool, base: &str) -> Result<Vec<Entry>> {
    if values.is_empty() {
        bail!("input is empty");
    }

    let root_object = if file_mode {
        base.to_string()
    } else {
        "root".to_string()
    };
    let root_collection = if file_mode {
        format!("{base}_ref")
    } else {
        "root_item".to_string()
    };

    if values.len() == 1 {
        match values.into_iter().next().unwrap() {
            Value::Object(map) => Ok(vec![Entry {
                path: vec![root_object],
                index: None,
                collection: false,
                value: map,
            }]),
            Value::Array(items) => items
                .into_iter()
                .enumerate()
                .map(|(i, value)| match value {
                    Value::Object(map) => Ok(Entry {
                        path: vec![root_collection.clone()],
                        index: Some(i),
                        collection: true,
                        value: map,
                    }),
                    _ => bail!("top-level array values must be objects"),
                })
                .collect(),
            _ => bail!("top-level JSON value must be an object or array"),
        }
    } else {
        values
            .into_iter()
            .enumerate()
            .map(|(i, value)| match value {
                Value::Object(map) => Ok(Entry {
                    path: vec![root_collection.clone()],
                    index: Some(i),
                    collection: true,
                    value: map,
                }),
                _ => bail!("top-level JSON values must be objects"),
            })
            .collect()
    }
}

/// Parses `input` as a sequence of concatenated JSON values.
///
/// # Errors
///
/// Stops at the first value that fails to parse and reports it as
/// "invalid JSON input".
fn parse_json_stream(input: &str) -> Result<Vec<Value>> {
    let mut documents = Vec::new();
    for document in serde_json::Deserializer::from_str(input).into_iter::<Value>() {
        documents.push(document.context("invalid JSON input")?);
    }
    Ok(documents)
}

/// Parses `input` line by line; every line may hold one or more JSON values.
///
/// Whitespace-only lines are skipped. Leading NUL characters are stripped
/// from a line before it is parsed.
///
/// # Errors
///
/// Fails with the 1-based line number when a line consists only of NUL
/// padding (plus optional whitespace) or does not parse as JSON.
fn parse_jsonl_stream(input: &str) -> Result<Vec<Value>> {
    let mut values = Vec::new();
    for (offset, raw_line) in input.lines().enumerate() {
        if raw_line.trim().is_empty() {
            continue;
        }

        let line_number = offset + 1;
        let record = raw_line.trim_start_matches('\0');
        if record.trim().is_empty() {
            // Something was on the line, but nothing is left once the NULs are gone.
            bail!("invalid JSONL input at line {line_number}: record contains only NUL padding");
        }

        for document in serde_json::Deserializer::from_str(record).into_iter::<Value>() {
            values.push(
                document.with_context(|| format!("invalid JSONL input at line {line_number}"))?,
            );
        }
    }
    Ok(values)
}

fn build_schema_output(outdir: &Path, entries: Vec<Entry>) -> Result<SchemaBuild> {
    let mut occurrences = Vec::new();
    for entry in entries {
        let role = if entry.collection {
            Role::RootCollection
        } else {
            Role::Object
        };
        collect_object_occurrences(
            entry.path,
            entry.value,
            role,
            entry.index.into_iter().collect(),
            &mut occurrences,
        );
    }

    let mut records = Vec::new();
    records.extend(object_records(
        outdir,
        occurrences
            .iter()
            .filter(|o| o.role == Role::Object)
            .cloned()
            .collect(),
    )?);
    records.extend(array_records(
        outdir,
        occurrences
            .iter()
            .filter(|o| o.role == Role::ArrayItem)
            .cloned()
            .collect(),
    )?);
    records.extend(root_collection_records(
        outdir,
        occurrences
            .iter()
            .filter(|o| o.role == Role::RootCollection)
            .cloned()
            .collect(),
    )?);
    Ok(SchemaBuild {
        records,
        occurrences,
    })
}

/// Test-only shortcut: builds the schema output and keeps just the records.
#[cfg(test)]
fn distinct_schemas(outdir: &Path, entries: Vec<Entry>) -> Result<Vec<Record>> {
    let SchemaBuild { records, .. } = build_schema_output(outdir, entries)?;
    Ok(records)
}

/// Records `value` as an occurrence, then descends into its members.
///
/// Object members are recorded with `Role::Object` under the path extended by
/// their key; array members are handed to `collect_array_object_occurrences`
/// with the same extended path. Other members are ignored. The parent is
/// always pushed to `out` before any of its descendants.
fn collect_object_occurrences(
    segments: Vec<String>,
    value: Map<String, Value>,
    role: Role,
    array_indexes: Vec<usize>,
    out: &mut Vec<Occurrence>,
) {
    out.push(Occurrence {
        segments: segments.clone(),
        value: value.clone(),
        role,
        array_indexes: array_indexes.clone(),
    });

    // Path of a member: this object's path plus the member's key.
    let member_path = |key: String| -> Vec<String> {
        let mut path = segments.clone();
        path.push(key);
        path
    };

    for (key, member) in value {
        match member {
            Value::Array(elements) => collect_array_object_occurrences(
                member_path(key),
                elements,
                array_indexes.clone(),
                out,
            ),
            Value::Object(fields) => collect_object_occurrences(
                member_path(key),
                fields,
                Role::Object,
                array_indexes.clone(),
                out,
            ),
            _ => {}
        }
    }
}

/// Descends into an array, recording every object element as an array item.
///
/// Each element's position is appended to the index path. Nested arrays are
/// walked recursively under the same key path; non-container elements are
/// ignored.
fn collect_array_object_occurrences(
    segments: Vec<String>,
    items: Vec<Value>,
    array_indexes: Vec<usize>,
    out: &mut Vec<Occurrence>,
) {
    for (position, element) in items.into_iter().enumerate() {
        // Index path of this element: the enclosing path plus its own position.
        let element_indexes = || -> Vec<usize> {
            let mut indexes = array_indexes.clone();
            indexes.push(position);
            indexes
        };

        match element {
            Value::Array(nested) => {
                collect_array_object_occurrences(segments.clone(), nested, element_indexes(), out);
            }
            Value::Object(fields) => collect_object_occurrences(
                segments.clone(),
                fields,
                Role::ArrayItem,
                element_indexes(),
                out,
            ),
            _ => {}
        }
    }
}

fn object_records(outdir: &Path, occurrences: Vec<Occurrence>) -> Result<Vec<Record>> {
    let mut by_segments: BTreeMap<Vec<String>, Vec<Map<String, Value>>> = BTreeMap::new();
    for occ in occurrences {
        by_segments.entry(occ.segments).or_default().push(occ.value);
    }

    let mut by_canonical: BTreeMap<String, Vec<(String, Value)>> = BTreeMap::new();
    for (segments, objects) in by_segments {
        let schema_path = format!("{}.json", reference_path(outdir, &segments));
        let schema = schema_for_values(outdir, &segments, &objects)?;
        let canonical = canonical_json(&schema)?;
        by_canonical
            .entry(canonical)
            .or_default()
            .push((schema_path, schema));
    }

    let mut records = Vec::new();
    for (_canonical, mut same) in by_canonical {
        same.sort_by(|a, b| a.0.cmp(&b.0));
        let object_paths = same.iter().map(|(p, _)| p.clone()).collect::<Vec<_>>();
        records.push(Record {
            schema_path: object_paths[0].clone(),
            object_paths,
            schema: same[0].1.clone(),
            array_parent: None,
            array_index_paths: Vec::new(),
        });
    }
    Ok(records)
}

fn array_records(outdir: &Path, occurrences: Vec<Occurrence>) -> Result<Vec<Record>> {
    let mut by_segments: BTreeMap<Vec<String>, Vec<Occurrence>> = BTreeMap::new();
    for occ in occurrences {
        by_segments
            .entry(occ.segments.clone())
            .or_default()
            .push(occ);
    }

    let mut records = Vec::new();
    for (segments, items) in by_segments {
        let array_base = reference_path(outdir, &segments);
        let mut groups: BTreeMap<String, Vec<(Vec<usize>, Value)>> = BTreeMap::new();
        for item in items {
            if item.array_indexes.is_empty() {
                bail!("array item missing index path");
            }
            let schema = schema_for_values(outdir, &segments, &[item.value])?;
            groups
                .entry(canonical_json(&schema)?)
                .or_default()
                .push((item.array_indexes, schema));
        }

        if groups.len() == 1 {
            let (_, same) = groups.into_iter().next().unwrap();
            let mut array_index_paths = same
                .iter()
                .map(|(path, _)| path.clone())
                .collect::<Vec<_>>();
            array_index_paths.sort_unstable();
            array_index_paths.dedup();
            records.push(Record {
                schema_path: format!("{array_base}.json"),
                object_paths: vec![format!("{array_base}.json")],
                schema: same[0].1.clone(),
                array_parent: Some(format!("{array_base}.json")),
                array_index_paths,
            });
        } else {
            records.push(Record {
                schema_path: format!("{array_base}.json"),
                object_paths: vec![format!("{array_base}.json")],
                schema: json!({"$refs_mut": format!("{array_base}/")}),
                array_parent: None,
                array_index_paths: Vec::new(),
            });

            let mut distinct = groups
                .into_values()
                .map(|same| {
                    let first_index_path = same.iter().map(|(path, _)| path.clone()).min().unwrap();
                    (first_index_path, same)
                })
                .collect::<Vec<_>>();
            distinct.sort_by_key(|(first_index_path, _)| first_index_path.clone());

            for (first_index_path, same) in distinct {
                let file_stem = index_path_file_stem(&first_index_path);
                let mut array_index_paths = same
                    .iter()
                    .map(|(path, _)| path.clone())
                    .collect::<Vec<_>>();
                array_index_paths.sort_unstable();
                array_index_paths.dedup();
                records.push(Record {
                    schema_path: format!("{array_base}/{file_stem}.json"),
                    object_paths: vec![format!("{array_base}/{file_stem}.json")],
                    schema: same[0].1.clone(),
                    array_parent: Some(format!("{array_base}.json")),
                    array_index_paths,
                });
            }
        }
    }
    Ok(records)
}

/// Joins the indexes of an index path with `_`, e.g. `[0, 3]` becomes `0_3`.
/// An empty path yields an empty string.
fn index_path_file_stem(index_path: &[usize]) -> String {
    let mut stem = String::new();
    for (position, index) in index_path.iter().enumerate() {
        if position > 0 {
            stem.push('_');
        }
        stem.push_str(&index.to_string());
    }
    stem
}

fn root_collection_records(outdir: &Path, occurrences: Vec<Occurrence>) -> Result<Vec<Record>> {
    let mut by_segments: BTreeMap<Vec<String>, Vec<Occurrence>> = BTreeMap::new();
    for occ in occurrences {
        by_segments
            .entry(occ.segments.clone())
            .or_default()
            .push(occ);
    }

    let mut records = Vec::new();
    for (segments, items) in by_segments {
        let collection_base = reference_path(outdir, &segments);
        let mut groups: BTreeMap<String, Vec<(Vec<usize>, Value)>> = BTreeMap::new();
        for item in items {
            if item.array_indexes.is_empty() {
                bail!("root collection item missing index path");
            }
            let schema = schema_for_values(outdir, &segments, &[item.value])?;
            groups
                .entry(canonical_json(&schema)?)
                .or_default()
                .push((item.array_indexes, schema));
        }

        if groups.len() == 1 {
            let (_, same) = groups.into_iter().next().unwrap();
            let mut array_index_paths = same
                .iter()
                .map(|(path, _)| path.clone())
                .collect::<Vec<_>>();
            array_index_paths.sort_unstable();
            array_index_paths.dedup();
            records.push(Record {
                schema_path: format!("{collection_base}.json"),
                object_paths: vec![format!("{collection_base}.json")],
                schema: same[0].1.clone(),
                array_parent: Some(format!("{collection_base}.json")),
                array_index_paths,
            });
        } else {
            let mut distinct = groups
                .into_values()
                .map(|same| {
                    let first_index_path = same.iter().map(|(path, _)| path.clone()).min().unwrap();
                    (first_index_path, same)
                })
                .collect::<Vec<_>>();
            distinct.sort_by_key(|(first_index_path, _)| first_index_path.clone());

            for (first_index_path, same) in distinct {
                let file_stem = index_path_file_stem(&first_index_path);
                let mut array_index_paths = same
                    .iter()
                    .map(|(path, _)| path.clone())
                    .collect::<Vec<_>>();
                array_index_paths.sort_unstable();
                array_index_paths.dedup();
                records.push(Record {
                    schema_path: format!("{collection_base}/{file_stem}.json"),
                    object_paths: vec![format!("{collection_base}/{file_stem}.json")],
                    schema: same[0].1.clone(),
                    array_parent: Some(format!("{collection_base}.json")),
                    array_index_paths,
                });
            }
        }
    }
    Ok(records)
}

/// Derives one schema object from a set of objects found at the same key path.
///
/// The schema maps every key seen in any of the objects to a type label:
/// * if any value is an array: the array label, unioned with the names of the
///   other non-null types present;
/// * else if any value is an object: a reference `<path>/<key>.json` when all
///   values are objects or null, otherwise the non-null type names joined by `|`;
/// * otherwise: the primitive label, which also marks keys missing from some
///   of the objects.
///
/// # Errors
///
/// Propagates errors from building the array label.
fn schema_for_values(
    outdir: &Path,
    segments: &[String],
    objects: &[Map<String, Value>],
) -> Result<Value> {
    let keys: BTreeSet<&String> = objects.iter().flat_map(|object| object.keys()).collect();

    let mut schema = Map::new();
    for key in keys {
        let values: Vec<&Value> = objects
            .iter()
            .filter_map(|object| object.get(key))
            .collect();
        let types = unique_types(values.iter().copied());
        let child_segments: Vec<String> = segments
            .iter()
            .chain(std::iter::once(key))
            .cloned()
            .collect();
        let has_type = |name: &str| types.iter().any(|found| *found == name);

        let label = if has_type("array") {
            let mut labels: BTreeSet<String> = types
                .iter()
                .filter(|found| !matches!(**found, "array" | "null"))
                .map(|found| found.to_string())
                .collect();
            labels.insert(array_label(outdir, &child_segments, &values)?);
            union_member_labels(labels)
        } else if has_type("object") {
            let non_null: Vec<&str> = types
                .iter()
                .copied()
                .filter(|found| *found != "null")
                .collect();
            if non_null == ["object"] {
                format!("{}.json", reference_path(outdir, &child_segments))
            } else {
                non_null.join("|")
            }
        } else {
            primitive_label(&values, objects.len())
        };

        schema.insert(key.clone(), Value::String(label));
    }
    Ok(Value::Object(schema))
}

fn unique_types<'a>(values: impl IntoIterator<Item = &'a Value>) -> Vec<&'static str> {
    let mut out = BTreeSet::new();
    for value in values {
        out.insert(value_type(value));
    }
    out.into_iter().collect()
}

/// Maps a JSON value to its type name: `object`, `array`, `string`, `number`,
/// `boolean` or `null`.
fn value_type(value: &Value) -> &'static str {
    match value {
        // Containers first, then scalars.
        Value::Object(_) => "object",
        Value::Array(_) => "array",
        Value::String(_) => "string",
        Value::Number(_) => "number",
        Value::Bool(_) => "boolean",
        Value::Null => "null",
    }
}

/// Builds the type label for a field whose values are all primitives.
///
/// * one type: its name (`null` for an always-null field);
/// * exactly one type plus `null`: `<type>?`;
/// * anything else: the type names joined by `|`.
///
/// `total` is the number of objects inspected; when the field is present in
/// fewer of them (`values.len() < total`), a `?` is appended unless the label
/// is `null` or already ends with `?`.
///
/// Fix: the nullable case used to test `types[0] == "null"`, assuming `null`
/// sorts first. The type names are sorted alphabetically and
/// `"boolean" < "null"`, so a nullable boolean was rendered as `boolean|null`
/// instead of `boolean?`. `null` is now recognized in either position, which
/// matches what `union_member_labels` does for array members.
fn primitive_label(values: &[&Value], total: usize) -> String {
    let types = unique_types(values.iter().copied());

    let mut label = match types.as_slice() {
        [only] => (*only).to_string(),
        ["null", other] | [other, "null"] => format!("{other}?"),
        _ => types.join("|"),
    };

    // The field is missing from some objects: mark it optional once.
    if values.len() < total && label != "null" && !label.ends_with('?') {
        label.push('?');
    }
    label
}

/// Builds the `array(...)` label for a field, pooling the elements of every
/// array value seen for it. Non-array values in `arrays` are ignored.
///
/// # Errors
///
/// Propagates errors from building the member label.
fn array_label(outdir: &Path, segments: &[String], arrays: &[&Value]) -> Result<String> {
    let mut members: Vec<&Value> = Vec::new();
    for value in arrays {
        if let Some(items) = value.as_array() {
            members.extend(items.iter());
        }
    }

    let inner = array_member_label(outdir, segments, &members)?;
    Ok(format!("array({inner})"))
}

/// Builds the label describing the members of an array.
///
/// Returns `empty` when there are no members. Otherwise each distinct member
/// type contributes one label: primitives and `null` their own name, objects
/// a reference `<path>.json`, nested arrays a recursive `array(...)` label
/// built from all nested elements. The labels are then combined as a union.
///
/// # Errors
///
/// Propagates errors from the recursive call for nested arrays.
fn array_member_label(outdir: &Path, segments: &[String], members: &[&Value]) -> Result<String> {
    if members.is_empty() {
        return Ok("empty".to_string());
    }

    let mut labels = BTreeSet::new();
    for kind in unique_types(members.iter().copied()) {
        let label = match kind {
            "object" => format!("{}.json", reference_path(outdir, segments)),
            "array" => {
                let mut nested: Vec<&Value> = Vec::new();
                for member in members {
                    if let Some(items) = member.as_array() {
                        nested.extend(items.iter());
                    }
                }
                format!(
                    "array({})",
                    array_member_label(outdir, segments, &nested)?
                )
            }
            // "boolean", "number", "string" and "null" label themselves.
            primitive => primitive.to_string(),
        };
        labels.insert(label);
    }
    Ok(union_member_labels(labels))
}

/// Combines member labels into one union label.
///
/// Exactly two labels, one of them `null`, give `<other>?`; every other
/// set is joined with `|` in sorted order (an empty set gives an empty string).
fn union_member_labels(mut labels: BTreeSet<String>) -> String {
    // `remove` only runs for two-element sets and reports whether `null` was there.
    let nullable_pair = labels.len() == 2 && labels.remove("null");

    let mut joined = String::new();
    for (position, label) in labels.iter().enumerate() {
        if position > 0 {
            joined.push('|');
        }
        joined.push_str(label);
    }
    if nullable_pair {
        joined.push('?');
    }
    joined
}

/// Escapes a key so it can be used as one path segment: `%` becomes `%25`
/// and `/` becomes `%2F`. All other characters are kept as they are.
fn path_segment(segment: &str) -> String {
    let mut escaped = String::with_capacity(segment.len());
    for character in segment.chars() {
        match character {
            '%' => escaped.push_str("%25"),
            '/' => escaped.push_str("%2F"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Builds the reference path `<outdir>/<seg1>/<seg2>/...` with every segment
/// escaped by `path_segment`. With no segments the result is `<outdir>/`.
fn reference_path(outdir: &Path, segments: &[String]) -> String {
    let mut reference = outdir.to_string_lossy().into_owned();
    reference.push('/');
    for (position, segment) in segments.iter().enumerate() {
        if position > 0 {
            reference.push('/');
        }
        reference.push_str(&path_segment(segment));
    }
    reference
}

/// Serializes `value` to a compact JSON string used as a grouping key.
///
/// For an object, the top-level keys are emitted in sorted order; any other
/// value is serialized as is.
///
/// # Errors
///
/// Fails when serialization fails.
fn canonical_json(value: &Value) -> Result<String> {
    let serialized = if let Value::Object(map) = value {
        // Borrowing into a BTreeMap orders the keys without copying the values.
        let sorted: BTreeMap<&String, &Value> = map.iter().collect();
        serde_json::to_string(&sorted)
    } else {
        serde_json::to_string(value)
    };
    serialized.context("failed to serialize canonical schema")
}

/// One schema row of the report, read from the SQLite index.
#[derive(Debug, Clone, PartialEq, Eq)]
struct SchemaCount {
    /// Path identifying the schema.
    schema_path: String,
    /// Value of `schema_definitions.schema_kind` for this schema.
    schema_kind: String,
    /// Value of `schema_object_counts.object_count`, or 0 when there is no such row.
    object_count: i64,
}

/// One row of the `schema_paths` table: an object path and the schema it uses.
#[derive(Debug, Clone, PartialEq, Eq)]
struct SchemaObjectPath {
    /// Path identifying the schema.
    schema_path: String,
    /// Object path associated with that schema.
    object_path: String,
}

/// One row of the `array_index_refs` table.
#[derive(Debug, Clone, PartialEq, Eq)]
struct SchemaArrayIndexRef {
    /// Path identifying the schema.
    schema_path: String,
    /// Value of the `array_path` column.
    array_path: String,
    /// Value of the `array_index_path` column, stored as text.
    array_index_path: String,
}

/// One row of the `schema_field_counts` table.
#[derive(Debug, Clone, PartialEq, Eq)]
struct FieldCount {
    /// Path identifying the schema the field belongs to.
    schema_path: String,
    /// Value of the `field_name` column.
    field_name: String,
    /// Value of the `field_type` column.
    field_type: String,
    /// Value of the `field_count` column.
    field_count: i64,
}

/// Everything the report needs, as loaded by `read_report_data`.
///
/// All four lists are ordered by schema object count (descending) and then by
/// schema path, with list-specific tie-breakers after that.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ReportData {
    /// One row per distinct schema path.
    schemas: Vec<SchemaCount>,
    /// Object paths per schema.
    object_paths: Vec<SchemaObjectPath>,
    /// Array index references per schema.
    array_index_refs: Vec<SchemaArrayIndexRef>,
    /// Field statistics per schema.
    fields: Vec<FieldCount>,
}

/// Describes where a report came from; passed to the report renderers.
#[derive(Debug, Clone, PartialEq, Eq)]
enum ReportSummary {
    /// The refs were generated in this run.
    Generated {
        /// Elapsed time in milliseconds, measured by `run`.
        execution_time_ms: u128,
        /// Display form of the input file path, or `stdin`.
        input_source: String,
    },
    /// The report was read from an existing SQLite index (`--from-sqlite`).
    FromSqlite {
        /// Display form of the SQLite file path.
        sqlite_path: String,
    },
}

/// Level of detail requested from `render_report`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReportDetail {
    /// Used by `emit_report` when the report is printed to stdout.
    CompactStdout,
    /// Used by `emit_report` when the report is written to a file.
    FullFile,
}

/// Opens the SQLite index at `database` and loads the report data from it.
///
/// # Errors
///
/// Fails when the file does not exist, cannot be opened, or when reading the
/// report data fails.
fn load_report_from_sqlite(database: &Path) -> Result<ReportData> {
    if database.exists() {
        let conn = Connection::open(database).with_context(|| {
            format!(
                "failed to open SQLite report source: {}",
                database.display()
            )
        })?;
        return read_report_data(&conn);
    }

    // Checked explicitly so that a missing file is reported as such.
    bail!(
        "SQLite report source does not exist: {}",
        database.display()
    )
}

/// Reads the four report lists (schemas, object paths, array index refs and
/// field counts) from an open SQLite index.
///
/// # Errors
///
/// Fails when a required column is missing (see
/// `ensure_field_type_column_exists`) or when preparing or running any of the
/// queries fails.
fn read_report_data(conn: &Connection) -> Result<ReportData> {
    ensure_field_type_column_exists(conn)?;

    // Shared sub-query: one row per distinct schema path with its kind and its
    // object count (0 when schema_object_counts has no row for it). It is used
    // directly for the schema list and as the ordering source for the others.
    let schema_order_sql = "SELECT paths.schema_path, defs.schema_kind, COALESCE(counts.object_count, 0) AS object_count \
         FROM (SELECT DISTINCT schema_path FROM schema_paths) AS paths \
         JOIN schema_definitions AS defs ON defs.schema_path = paths.schema_path \
         LEFT JOIN schema_object_counts AS counts ON counts.schema_path = paths.schema_path";

    // Schemas: most frequent first, ties broken by path.
    let mut schema_stmt = conn.prepare(&format!(
        "{schema_order_sql} \
         ORDER BY object_count DESC, paths.schema_path"
    ))?;

    let schemas = schema_stmt
        .query_map([], |row| {
            Ok(SchemaCount {
                schema_path: row.get(0)?,
                schema_kind: row.get(1)?,
                object_count: row.get(2)?,
            })
        })?
        .collect::<rusqlite::Result<Vec<_>>>()?;

    // Object paths, in schema order and then by object path.
    let mut object_path_stmt = conn.prepare(&format!(
        "SELECT p.schema_path, p.object_path \
         FROM schema_paths AS p \
         JOIN ({schema_order_sql}) AS schema_order ON schema_order.schema_path = p.schema_path \
         ORDER BY schema_order.object_count DESC, p.schema_path, p.object_path"
    ))?;

    let object_paths = object_path_stmt
        .query_map([], |row| {
            Ok(SchemaObjectPath {
                schema_path: row.get(0)?,
                object_path: row.get(1)?,
            })
        })?
        .collect::<rusqlite::Result<Vec<_>>>()?;

    // Array index references, in schema order and then by array path and index path.
    let mut array_index_stmt = conn.prepare(&format!(
        "SELECT r.schema_path, r.array_path, r.array_index_path \
         FROM array_index_refs AS r \
         JOIN ({schema_order_sql}) AS schema_order ON schema_order.schema_path = r.schema_path \
         ORDER BY schema_order.object_count DESC, r.schema_path, r.array_path, r.array_index_path"
    ))?;

    let array_index_refs = array_index_stmt
        .query_map([], |row| {
            Ok(SchemaArrayIndexRef {
                schema_path: row.get(0)?,
                array_path: row.get(1)?,
                array_index_path: row.get(2)?,
            })
        })?
        .collect::<rusqlite::Result<Vec<_>>>()?;

    // Field counts, in schema order; within a schema most frequent field first.
    let mut field_stmt = conn.prepare(&format!(
        "SELECT f.schema_path, f.field_name, f.field_type, f.field_count \
         FROM schema_field_counts AS f \
         JOIN ({schema_order_sql}) AS schema_order ON schema_order.schema_path = f.schema_path \
         ORDER BY schema_order.object_count DESC, f.schema_path, f.field_count DESC, f.field_name"
    ))?;

    let fields = field_stmt
        .query_map([], |row| {
            Ok(FieldCount {
                schema_path: row.get(0)?,
                field_name: row.get(1)?,
                field_type: row.get(2)?,
                field_count: row.get(3)?,
            })
        })?
        .collect::<rusqlite::Result<Vec<_>>>()?;

    Ok(ReportData {
        schemas,
        object_paths,
        array_index_refs,
        fields,
    })
}

/// Checks that the SQLite report source has the columns required for reporting.
///
/// `schema_field_counts.field_type` is checked first, then
/// `schema_definitions.schema_kind`. The first missing column aborts with an error
/// asking the user to regenerate the refs.
fn ensure_field_type_column_exists(conn: &Connection) -> Result<()> {
    let required_columns = [
        (
            "schema_field_counts",
            "field_type",
            "SQLite report source does not contain schema_field_counts.field_type; regenerate refs with the current dump-json-refs version",
        ),
        (
            "schema_definitions",
            "schema_kind",
            "SQLite report source does not contain schema_definitions.schema_kind; regenerate refs with the current dump-json-refs version",
        ),
    ];

    for (table_name, column_name, error_message) in required_columns {
        ensure_table_has_column(conn, table_name, column_name, error_message)?;
    }

    Ok(())
}

/// Fails with `error_message` unless `table_name` has a column called `column_name`.
///
/// Column names are read from `PRAGMA table_info`. A table that does not exist
/// yields no rows, so it is reported with the same error as a missing column.
fn ensure_table_has_column(
    conn: &Connection,
    table_name: &str,
    column_name: &str,
    error_message: &str,
) -> Result<()> {
    let mut stmt = conn.prepare(&format!("PRAGMA table_info({table_name})"))?;

    // Walk every row so that a row-level read error is surfaced regardless of
    // where the wanted column sits in the listing.
    let mut found = false;
    for name in stmt.query_map([], |row| row.get::<_, String>(1))? {
        if name? == column_name {
            found = true;
        }
    }

    if !found {
        bail!("{error_message}")
    }
    Ok(())
}

/// Emits the report either to a file plus a stdout summary, or entirely to stdout.
///
/// With `output_path`, the full report is written to that file and only the summary
/// (including the report path) is printed. Without it, the compact report is printed.
fn emit_report(
    report: &ReportData,
    summary: &ReportSummary,
    output_path: Option<&Path>,
) -> Result<()> {
    match output_path {
        Some(path) => {
            let full_report = render_report(report, summary, ReportDetail::FullFile);
            write_text_file(path, &full_report)?;
            print!("{}", render_summary(report, summary, Some(path)));
        }
        None => {
            let compact_report = render_report(report, summary, ReportDetail::CompactStdout);
            print!("{}", compact_report);
        }
    }

    Ok(())
}

/// Writes `text` to `path`, creating the parent directory chain first when there is one.
///
/// # Errors
/// Returns an error naming the directory or file that could not be created or written.
fn write_text_file(path: &Path, text: &str) -> Result<()> {
    match path.parent() {
        // A bare file name has an empty parent; there is nothing to create then.
        Some(parent) if !parent.as_os_str().is_empty() => {
            fs::create_dir_all(parent)
                .with_context(|| format!("failed to create {}", parent.display()))?;
        }
        _ => {}
    }

    fs::write(path, text).with_context(|| format!("failed to write {}", path.display()))
}

/// Renders the textual report for the requested level of detail.
///
/// Every report starts with the title and the schemas section and ends with the summary.
/// The compact variant adds fields and (when present) aliases; the full variant adds
/// object paths, array index refs and fields.
fn render_report(report: &ReportData, summary: &ReportSummary, detail: ReportDetail) -> String {
    let full = match detail {
        ReportDetail::FullFile => true,
        ReportDetail::CompactStdout => false,
    };

    let mut out = String::from("# dump-json-refs report\n\n");
    push_schemas_section(&mut out, report);

    if full {
        push_schema_object_paths_section(&mut out, report);
        push_schema_array_index_refs_section(&mut out, report);
    }
    // The fields section is common to both variants and sits between the
    // full-only sections and the compact-only alias section.
    push_fields_section(&mut out, report);
    if !full {
        push_schema_aliases_section_if_non_empty(&mut out, report);
    }

    let summary_text = render_summary(report, summary, None);
    out.push_str(&summary_text);
    out
}

/// Appends the tab-separated `[schemas]` section, one row per schema, followed by a blank line.
fn push_schemas_section(out: &mut String, report: &ReportData) {
    out.push_str("[schemas]\n");
    out.push_str("schema_path\tschema_kind\tobject_count\n");
    out.extend(report.schemas.iter().map(|entry| {
        format!(
            "{}\t{}\t{}\n",
            entry.schema_path, entry.schema_kind, entry.object_count
        )
    }));
    out.push('\n');
}

/// Appends the `[schema_object_paths]` section listing every schema/object path mapping.
fn push_schema_object_paths_section(out: &mut String, report: &ReportData) {
    out.push_str("[schema_object_paths]\n");
    out.push_str("schema_path\tobject_path\n");
    for entry in report.object_paths.iter() {
        let row = format!("{}\t{}\n", entry.schema_path, entry.object_path);
        out.push_str(&row);
    }
    out.push('\n');
}

/// Appends the `[schema_aliases]` section, or nothing at all when there are no aliases.
///
/// An alias is a mapping whose object path differs from its schema path.
fn push_schema_aliases_section_if_non_empty(out: &mut String, report: &ReportData) {
    let mut aliases = report
        .object_paths
        .iter()
        .filter(|mapping| mapping.schema_path != mapping.object_path)
        .peekable();

    // Peek rather than collect: the header must only be written when a row follows.
    if aliases.peek().is_none() {
        return;
    }

    out.push_str("[schema_aliases]\n");
    out.push_str("schema_path\tobject_path\n");
    for alias in aliases {
        let row = format!("{}\t{}\n", alias.schema_path, alias.object_path);
        out.push_str(&row);
    }
    out.push('\n');
}

/// Appends the `[schema_array_index_refs]` section, one row per array index reference.
fn push_schema_array_index_refs_section(out: &mut String, report: &ReportData) {
    out.push_str("[schema_array_index_refs]\n");
    out.push_str("schema_path\tarray_path\tarray_index_path\n");
    report.array_index_refs.iter().for_each(|entry| {
        out.push_str(&format!(
            "{}\t{}\t{}\n",
            entry.schema_path, entry.array_path, entry.array_index_path
        ));
    });
    out.push('\n');
}

/// Appends the `[fields]` section with the name, type and count of every field.
fn push_fields_section(out: &mut String, report: &ReportData) {
    out.push_str("[fields]\n");
    out.push_str("schema_path\tfield_name\tfield_type\tfield_count\n");

    let rows: String = report
        .fields
        .iter()
        .map(|entry| {
            format!(
                "{}\t{}\t{}\t{}\n",
                entry.schema_path, entry.field_name, entry.field_type, entry.field_count
            )
        })
        .collect();
    out.push_str(&rows);

    out.push('\n');
}

/// Renders the `[summary]` section as tab-separated `key<TAB>value` lines.
///
/// The counts come from `report`. The source-specific lines come from `summary`.
/// A `report` line is added when `output_path` is given.
fn render_summary(
    report: &ReportData,
    summary: &ReportSummary,
    output_path: Option<&Path>,
) -> String {
    let schema_alias_count = report
        .object_paths
        .iter()
        .filter(|mapping| mapping.schema_path != mapping.object_path)
        .count();

    let mut lines = vec![
        String::from("[summary]\n"),
        format!("schema_count\t{}\n", report.schemas.len()),
        format!("field_count\t{}\n", report.fields.len()),
        format!(
            "object_path_mapping_count\t{}\n",
            report.object_paths.len()
        ),
        format!("schema_alias_count\t{}\n", schema_alias_count),
        format!(
            "array_index_ref_count\t{}\n",
            report.array_index_refs.len()
        ),
    ];

    match summary {
        ReportSummary::Generated {
            execution_time_ms,
            input_source,
        } => {
            lines.push(format!("execution_time_ms\t{execution_time_ms}\n"));
            lines.push(format!("input\t{input_source}\n"));
        }
        ReportSummary::FromSqlite { sqlite_path } => {
            lines.push(format!("sqlite_path\t{sqlite_path}\n"));
        }
    }

    if let Some(path) = output_path {
        lines.push(format!("report\t{}\n", path.display()));
    }

    lines.concat()
}

/// A schema shown as one node of the relation graph.
///
/// Instances are built in `load_graph_data` from `schema_definitions` left-joined
/// with `schema_object_counts`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct DotNode {
    /// Schema path; the DOT renderer also uses it as the node identifier.
    schema_path: String,
    /// Value of `schema_definitions.schema_kind`.
    schema_kind: String,
    /// Value of `schema_object_counts.object_count`, or 0 when the schema has no count row.
    object_count: i64,
}

/// One row of the `schema_relations` table, shown as an edge of the relation graph.
///
/// Instances are built in `load_graph_data`; each field mirrors the column of the same name.
#[derive(Debug, Clone, PartialEq, Eq)]
struct DotRelation {
    /// Schema path the edge starts at.
    from_schema_path: String,
    /// Schema path the edge points to.
    to_schema_path: String,
    /// Relation kind; `relation_edge_label` derives the `[*]` label suffix from it.
    relation_kind: String,
    /// Value of the `fk_owner` column; emitted as a DOT edge attribute.
    fk_owner: String,
    /// True when the `fk_candidate` column equals 1; other edges are drawn dashed/dotted.
    fk_candidate: bool,
    /// Name of the field that carries the relation.
    field_name: String,
    /// Value of the `cardinality` column.
    cardinality: String,
    /// True when the `required` column equals 1; Mermaid labels show `required`/`optional`.
    required: bool,
    /// True when the `mixed` column equals 1.
    mixed: bool,
    /// Array nesting depth; Mermaid labels show it only when it is at least 2.
    nested_array_depth: usize,
    /// Nullable `via_schema_path` column; emitted as a DOT attribute only when present.
    via_schema_path: Option<String>,
    /// Nullable `via_array_path` column; emitted as a DOT attribute only when present.
    via_array_path: Option<String>,
    /// Value of the `parent_object_count` column.
    parent_object_count: i64,
    /// Value of the `child_object_count` column.
    child_object_count: i64,
    /// Value of the `field_count` column.
    field_count: i64,
}

/// Decides where the graph file goes, or returns `None` when no graph was requested.
///
/// Precedence:
/// 1. an explicit `graph_output` other than the `DEFAULT_GRAPH_OUTPUT` placeholder;
/// 2. the default file name next to the `from_sqlite` database;
/// 3. the default file name inside `outdir`.
fn resolve_graph_output_path(
    graph_requested: bool,
    graph_output: Option<&Path>,
    outdir: &Path,
    from_sqlite: Option<&Path>,
    graph_format: GraphFormat,
) -> Option<PathBuf> {
    if !graph_requested {
        return None;
    }

    // The placeholder only signals "flag given without a value"; treat it as absent.
    let explicit = graph_output.filter(|path| *path != Path::new(DEFAULT_GRAPH_OUTPUT));

    let resolved = match (explicit, from_sqlite) {
        (Some(path), _) => path.to_path_buf(),
        (None, Some(sqlite_path)) => {
            let base_dir = sqlite_path.parent().unwrap_or_else(|| Path::new("."));
            base_dir.join(graph_format.default_filename())
        }
        (None, None) => outdir.join(graph_format.default_filename()),
    };

    Some(resolved)
}

/// Renders the schema relation graph stored in `database` and writes it to `output_path`.
///
/// # Errors
/// Fails when the database file is missing, cannot be opened, lacks the required
/// `schema_relations` columns, cannot be queried, or when the output cannot be written.
fn write_graph_from_sqlite(
    database: &Path,
    output_path: &Path,
    include_marked: bool,
    rankdir: &str,
    graph_format: GraphFormat,
) -> Result<()> {
    // Checked up front so a missing source is reported as such.
    if !database.exists() {
        bail!("SQLite graph source does not exist: {}", database.display());
    }

    let conn = Connection::open(database)
        .with_context(|| format!("failed to open SQLite graph source: {}", database.display()))?;
    ensure_graph_tables_exist(&conn)?;

    let (nodes, relations) = load_graph_data(&conn, include_marked)?;
    let rendered = match graph_format {
        GraphFormat::Dot => render_dot_graph(&nodes, &relations, rankdir),
        GraphFormat::Mermaid | GraphFormat::MermaidMd => {
            let markdown_fence = graph_format == GraphFormat::MermaidMd;
            render_mermaid_graph(&nodes, &relations, rankdir, markdown_fence)
        }
    };

    write_text_file(output_path, &rendered)
}

/// Checks that `schema_relations` exists and has the columns the graph loader reads.
///
/// Columns are checked in the order `relation_kind`, `fk_candidate`,
/// `nested_array_depth`; the first one missing produces the error.
fn ensure_graph_tables_exist(conn: &Connection) -> Result<()> {
    let required_columns = [
        (
            "relation_kind",
            "SQLite graph source does not contain schema_relations; regenerate refs with the current dump-json-refs version",
        ),
        (
            "fk_candidate",
            "SQLite graph source does not contain schema_relations.fk_candidate; regenerate refs with the current dump-json-refs version",
        ),
        (
            "nested_array_depth",
            "SQLite graph source does not contain schema_relations.nested_array_depth; regenerate refs with the current dump-json-refs version",
        ),
    ];

    required_columns
        .iter()
        .try_for_each(|(column_name, error_message)| {
            ensure_table_has_column(conn, "schema_relations", column_name, error_message)
        })
}

/// Loads the graph edges and the schemas they touch from a refs SQLite index.
///
/// With `include_marked`, every row of `schema_relations` is returned; otherwise only
/// rows with `fk_candidate = 1`. The returned nodes are the `schema_definitions` rows
/// whose path is an endpoint of a returned relation, ordered by object count
/// (descending) and then by schema path.
fn load_graph_data(
    conn: &Connection,
    include_marked: bool,
) -> Result<(Vec<DotNode>, Vec<DotRelation>)> {
    let relation_filter = match include_marked {
        true => "",
        false => "WHERE fk_candidate = 1",
    };
    let relation_sql = format!(
        "SELECT from_schema_path, to_schema_path, relation_kind, fk_owner, fk_candidate, \
                field_name, cardinality, required, mixed, nested_array_depth, \
                via_schema_path, via_array_path, parent_object_count, child_object_count, field_count \
         FROM schema_relations \
         {relation_filter} \
         ORDER BY fk_candidate DESC, relation_kind, from_schema_path, to_schema_path, field_name"
    );
    let mut relation_stmt = conn.prepare(&relation_sql)?;

    // Boolean columns are stored as integers; only the value 1 counts as true.
    let is_set = |row: &rusqlite::Row<'_>, index: usize| -> rusqlite::Result<bool> {
        Ok(row.get::<_, i64>(index)? == 1)
    };

    let relation_rows = relation_stmt.query_map([], |row| {
        Ok(DotRelation {
            from_schema_path: row.get(0)?,
            to_schema_path: row.get(1)?,
            relation_kind: row.get(2)?,
            fk_owner: row.get(3)?,
            fk_candidate: is_set(row, 4)?,
            field_name: row.get(5)?,
            cardinality: row.get(6)?,
            required: is_set(row, 7)?,
            mixed: is_set(row, 8)?,
            nested_array_depth: row.get::<_, i64>(9)? as usize,
            via_schema_path: row.get(10)?,
            via_array_path: row.get(11)?,
            parent_object_count: row.get(12)?,
            child_object_count: row.get(13)?,
            field_count: row.get(14)?,
        })
    })?;

    // Track both endpoints of every relation so unrelated schemas are left out of the graph.
    let mut relations = Vec::new();
    let mut used_paths = BTreeSet::<String>::new();
    for relation_row in relation_rows {
        let relation = relation_row?;
        used_paths.insert(relation.from_schema_path.clone());
        used_paths.insert(relation.to_schema_path.clone());
        relations.push(relation);
    }

    let mut node_stmt = conn.prepare(
        "SELECT d.schema_path, d.schema_kind, COALESCE(c.object_count, 0) AS object_count \
         FROM schema_definitions AS d \
         LEFT JOIN schema_object_counts AS c ON c.schema_path = d.schema_path \
         ORDER BY object_count DESC, d.schema_path",
    )?;
    let node_rows = node_stmt.query_map([], |row| {
        Ok(DotNode {
            schema_path: row.get(0)?,
            schema_kind: row.get(1)?,
            object_count: row.get(2)?,
        })
    })?;

    let mut nodes = Vec::new();
    for node_row in node_rows {
        let node = node_row?;
        if used_paths.contains(&node.schema_path) {
            nodes.push(node);
        }
    }

    Ok((nodes, relations))
}

/// Renders nodes and relations as a Mermaid `flowchart`.
///
/// Nodes get the ids `n0`, `n1`, ... in input order. Relations whose endpoints are not
/// among `nodes` are skipped. FK candidates are drawn as solid arrows, all other
/// relations as dotted arrows. With `markdown_fence`, the chart is wrapped in a
/// fenced `mermaid` code block.
fn render_mermaid_graph(
    nodes: &[DotNode],
    relations: &[DotRelation],
    rankdir: &str,
    markdown_fence: bool,
) -> String {
    let mut out = String::new();
    if markdown_fence {
        out.push_str("```mermaid\n");
    }
    out.push_str(&format!("flowchart {}\n", mermaid_direction(rankdir)));

    let mut node_ids: HashMap<&str, String> = HashMap::with_capacity(nodes.len());
    for (index, node) in nodes.iter().enumerate() {
        let id = format!("n{index}");
        let label = format!(
            "{}<br/>kind={}<br/>object_count={}",
            node.schema_path, node.schema_kind, node.object_count
        );
        out.push_str(&format!("  {id}[\"{}\"]\n", mermaid_label(&label)));
        node_ids.insert(node.schema_path.as_str(), id);
    }

    if !nodes.is_empty() {
        out.push('\n');
    }

    for relation in relations {
        let endpoints = (
            node_ids.get(relation.from_schema_path.as_str()),
            node_ids.get(relation.to_schema_path.as_str()),
        );
        let (Some(from_id), Some(to_id)) = endpoints else {
            continue;
        };

        let requirement = if relation.required {
            "required"
        } else {
            "optional"
        };
        let mut parts = vec![
            relation_edge_label(relation),
            relation.relation_kind.clone(),
            relation.cardinality.clone(),
            requirement.to_string(),
        ];
        if relation.mixed {
            parts.push("mixed".to_string());
        }
        if relation.nested_array_depth >= 2 {
            parts.push(format!("depth={}", relation.nested_array_depth));
        }
        let label = mermaid_label(&parts.join("<br/>"));

        let edge = if relation.fk_candidate {
            format!("  {from_id} -->|\"{label}\"| {to_id}\n")
        } else {
            format!("  {from_id} -. \"{label}\" .-> {to_id}\n")
        };
        out.push_str(&edge);
    }

    if markdown_fence {
        out.push_str("```\n");
    }
    out
}

/// Maps a rank direction to a Mermaid flowchart direction, defaulting to `LR`.
///
/// Only the exact strings `LR`, `TB`, `RL` and `BT` are accepted.
fn mermaid_direction(rankdir: &str) -> &'static str {
    const DIRECTIONS: [&str; 4] = ["LR", "TB", "RL", "BT"];

    DIRECTIONS
        .iter()
        .copied()
        .find(|direction| *direction == rankdir)
        .unwrap_or("LR")
}

/// Escapes `&`, `"` and `|` as HTML entities so the text can sit inside a quoted
/// Mermaid label. Other characters, including `<br/>` markup, pass through unchanged.
fn mermaid_label(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '"' => escaped.push_str("&quot;"),
            '|' => escaped.push_str("&#124;"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Renders nodes and relations as a Graphviz `digraph`.
///
/// Each node is identified by its schema path. Each edge carries the relation's
/// columns as attributes. The optional `via_*` attributes appear only when set, and
/// relations that are not FK candidates get `style="dashed"`.
fn render_dot_graph(nodes: &[DotNode], relations: &[DotRelation], rankdir: &str) -> String {
    let mut out = String::new();
    out.push_str("digraph schema_fk {\n");
    out.push_str(&format!("  rankdir={};\n", dot_id(rankdir)));
    out.push_str("  node [shape=box];\n\n");

    for node in nodes {
        let object_count = node.object_count.to_string();
        let label = format!(
            "{}\nkind={}\nobject_count={}",
            node.schema_path, node.schema_kind, object_count
        );
        let line = format!(
            "  {} [label={}, schema_kind={}, object_count={}];\n",
            dot_string(&node.schema_path),
            dot_string(&label),
            dot_string(&node.schema_kind),
            dot_string(&object_count),
        );
        out.push_str(&line);
    }

    if !nodes.is_empty() {
        out.push('\n');
    }

    for relation in relations {
        let mut attributes: Vec<(&str, String)> = vec![
            ("label", relation_edge_label(relation)),
            ("relation_kind", relation.relation_kind.clone()),
            ("fk_owner", relation.fk_owner.clone()),
            ("fk_candidate", relation.fk_candidate.to_string()),
            ("field_name", relation.field_name.clone()),
            ("cardinality", relation.cardinality.clone()),
            ("required", relation.required.to_string()),
            ("mixed", relation.mixed.to_string()),
            (
                "nested_array_depth",
                relation.nested_array_depth.to_string(),
            ),
            (
                "parent_object_count",
                relation.parent_object_count.to_string(),
            ),
            (
                "child_object_count",
                relation.child_object_count.to_string(),
            ),
            ("field_count", relation.field_count.to_string()),
        ];

        // Optional attributes keep a fixed order after the mandatory ones.
        attributes.extend(
            relation
                .via_schema_path
                .iter()
                .map(|path| ("via_schema_path", path.clone())),
        );
        attributes.extend(
            relation
                .via_array_path
                .iter()
                .map(|path| ("via_array_path", path.clone())),
        );
        if !relation.fk_candidate {
            attributes.push(("style", "dashed".to_string()));
        }

        let rendered_attributes: Vec<String> = attributes
            .iter()
            .map(|(key, value)| format!("{}={}", key, dot_string(value)))
            .collect();

        out.push_str(&format!(
            "  {} -> {} [{}];\n",
            dot_string(&relation.from_schema_path),
            dot_string(&relation.to_schema_path),
            rendered_attributes.join(", "),
        ));
    }

    out.push_str("}\n");
    out
}

/// Builds the edge label: the field name followed by one `[*]` per array level.
///
/// `array_item` and `heterogeneous_array_item` get a single `[*]`. `nested_array_item`
/// gets `nested_array_depth` of them. Every other kind is labelled with the bare
/// field name.
fn relation_edge_label(relation: &DotRelation) -> String {
    let wildcard_levels = match relation.relation_kind.as_str() {
        "array_item" | "heterogeneous_array_item" => 1,
        "nested_array_item" => relation.nested_array_depth,
        _ => 0,
    };

    format!("{}{}", relation.field_name, "[*]".repeat(wildcard_levels))
}

/// Returns `value` when it is one of the rank directions `LR`, `TB`, `RL`, `BT`,
/// and `LR` for anything else, so only a known token reaches the DOT output unquoted.
fn dot_id(value: &str) -> String {
    const ALLOWED: [&str; 4] = ["LR", "TB", "RL", "BT"];

    if ALLOWED.contains(&value) {
        value.to_string()
    } else {
        "LR".to_string()
    }
}

/// Wraps `value` in double quotes for DOT output, escaping backslashes and double
/// quotes and turning line feeds into the two-character sequence `\n`.
fn dot_string(value: &str) -> String {
    let mut quoted = String::with_capacity(value.len() + 2);
    quoted.push('"');
    for ch in value.chars() {
        match ch {
            '\\' => quoted.push_str("\\\\"),
            '"' => quoted.push_str("\\\""),
            '\n' => quoted.push_str("\\n"),
            other => quoted.push(other),
        }
    }
    quoted.push('"');
    quoted
}

/// Writes the schema files for `records` and builds the `schemas.sqlite` index in `outdir`.
///
/// Two phases:
/// 1. For each record, the schema JSON is written to `record.schema_path` (compact or
///    pretty depending on `compact_output`, always with a trailing newline). Every other
///    object path of the record is passed to `replace_with_symlink` together with the
///    canonical path.
/// 2. `outdir/schemas.sqlite` is opened, and inside a single transaction the tables are
///    created and filled: definitions, path mappings, array index refs, then occurrence
///    statistics and schema relations.
///
/// # Errors
/// Returns an error when a file cannot be written, a helper fails, JSON serialization
/// fails, or any SQLite statement fails. Nothing is committed unless every step succeeds.
fn write_files_and_index(
    outdir: &Path,
    records: &[Record],
    occurrences: &[Occurrence],
    compact_output: bool,
) -> Result<()> {
    // Handed to `ensure_parent` on every call; presumably a cache of directories that
    // were already created — confirm against `ensure_parent`.
    let mut made_dirs = HashMap::<PathBuf, ()>::new();

    for record in records {
        // NOTE(review): the schema path is used as a filesystem path as-is (not joined
        // with `outdir`), so it presumably already includes the output directory.
        let canonical = PathBuf::from(&record.schema_path);
        ensure_parent(&canonical, &mut made_dirs)?;
        let schema_json = if compact_output {
            serde_json::to_string(&record.schema)?
        } else {
            serde_json::to_string_pretty(&record.schema)?
        };

        fs::write(&canonical, format!("{schema_json}\n"))
            .with_context(|| format!("failed to write {}", canonical.display()))?;

        for object_path in &record.object_paths {
            // The canonical file itself was just written; only the other paths are linked.
            if object_path == &record.schema_path {
                continue;
            }
            let object_path = PathBuf::from(object_path);
            ensure_parent(&object_path, &mut made_dirs)?;
            replace_with_symlink(&canonical, &object_path)?;
        }
    }

    let database = outdir.join("schemas.sqlite");
    let mut conn = Connection::open(&database)
        .with_context(|| format!("failed to open {}", database.display()))?;
    let tx = conn.transaction()?;
    // NOTE(review): plain CREATE TABLE / CREATE INDEX (no IF NOT EXISTS) fails when the
    // objects already exist, so this presumably expects a fresh database file — confirm
    // that the caller removes a previous schemas.sqlite.
    tx.execute_batch(
        "CREATE TABLE schema_paths (schema_path TEXT NOT NULL, object_path TEXT PRIMARY KEY);
         CREATE TABLE array_index_refs (array_path TEXT NOT NULL, array_index_path TEXT NOT NULL, schema_path TEXT NOT NULL, PRIMARY KEY (array_path, array_index_path));
         CREATE TABLE schema_definitions (schema_path TEXT PRIMARY KEY, schema_kind TEXT NOT NULL, schema_json TEXT NOT NULL);
         CREATE TABLE schema_object_counts (schema_path TEXT PRIMARY KEY, object_count INTEGER NOT NULL CHECK (object_count > 0));
         CREATE TABLE schema_field_counts (schema_path TEXT NOT NULL, field_name TEXT NOT NULL, field_type TEXT NOT NULL, field_count INTEGER NOT NULL CHECK (field_count > 0), PRIMARY KEY (schema_path, field_name));
         CREATE TABLE schema_relations (relation_id INTEGER PRIMARY KEY AUTOINCREMENT, from_schema_path TEXT NOT NULL, to_schema_path TEXT NOT NULL, relation_kind TEXT NOT NULL, fk_owner TEXT NOT NULL, fk_candidate INTEGER NOT NULL CHECK (fk_candidate IN (0, 1)), field_name TEXT NOT NULL, field_type TEXT NOT NULL, cardinality TEXT NOT NULL, required INTEGER NOT NULL CHECK (required IN (0, 1)), mixed INTEGER NOT NULL CHECK (mixed IN (0, 1)), nested_array_depth INTEGER NOT NULL DEFAULT 0 CHECK (nested_array_depth >= 0), via_schema_path TEXT, via_array_path TEXT, parent_schema_path TEXT NOT NULL, child_schema_path TEXT NOT NULL, parent_object_count INTEGER NOT NULL DEFAULT 0 CHECK (parent_object_count >= 0), child_object_count INTEGER NOT NULL DEFAULT 0 CHECK (child_object_count >= 0), field_count INTEGER NOT NULL DEFAULT 0 CHECK (field_count >= 0), UNIQUE (from_schema_path, to_schema_path, relation_kind, field_name, field_type, via_schema_path, via_array_path));
         CREATE INDEX idx_schema_relations_from ON schema_relations(from_schema_path);
         CREATE INDEX idx_schema_relations_to ON schema_relations(to_schema_path);
         CREATE INDEX idx_schema_relations_kind ON schema_relations(relation_kind, fk_candidate);
         CREATE INDEX idx_schema_relations_parent_child ON schema_relations(parent_schema_path, child_schema_path);",
    )?;

    for record in records {
        let schema_kind = schema_kind(&record.schema);
        // The index always stores the compact JSON form, independent of `compact_output`.
        let schema_json = serde_json::to_string(&record.schema)?;
        // Upsert: a later record with the same schema path replaces the stored definition.
        tx.execute(
            "INSERT INTO schema_definitions(schema_path, schema_kind, schema_json) VALUES (?1, ?2, ?3) \
             ON CONFLICT(schema_path) DO UPDATE SET schema_kind=excluded.schema_kind, schema_json=excluded.schema_json",
            params![&record.schema_path, schema_kind, schema_json],
        )?;

        for object_path in &record.object_paths {
            // Upsert: an object path claimed by several records ends up mapped to the last one.
            tx.execute(
                "INSERT INTO schema_paths(schema_path, object_path) VALUES (?1, ?2) \
                 ON CONFLICT(object_path) DO UPDATE SET schema_path=excluded.schema_path",
                params![record.schema_path, object_path],
            )?;
        }

        if let Some(parent) = &record.array_parent {
            for index_path in &record.array_index_paths {
                // The index path is stored as its JSON serialization.
                let index_path = serde_json::to_string(index_path)?;
                tx.execute(
                    "INSERT INTO array_index_refs(array_path, array_index_path, schema_path) VALUES (?1, ?2, ?3) \
                     ON CONFLICT(array_path, array_index_path) DO UPDATE SET schema_path=excluded.schema_path",
                    params![parent, index_path, &record.schema_path],
                )?;
            }
        }
    }
    // Statistics go first: `write_schema_relations` reads the object and field counts
    // back through the same transaction.
    write_occurrence_statistics(&tx, outdir, records, occurrences)?;
    write_schema_relations(&tx, records)?;
    tx.commit()?;
    Ok(())
}

fn schema_kind(schema: &Value) -> &'static str {
    match schema.as_object() {
        Some(map) if is_refs_mut_schema(map) => "heterogeneous",
        _ => "object",
    }
}

/// Returns true when the schema consists of exactly one entry, keyed `$refs_mut`,
/// whose value is a JSON string.
fn is_refs_mut_schema(schema: &Map<String, Value>) -> bool {
    if schema.len() != 1 {
        return false;
    }
    matches!(schema.get("$refs_mut"), Some(Value::String(_)))
}

/// Accumulates per-schema object counts and per-field counts from `occurrences`.
///
/// Each occurrence is resolved to its canonical schema path. Occurrences whose
/// canonical schema is a `$refs_mut` schema are skipped. Every other occurrence adds
/// 1 to the schema's `object_count` and 1 to the `field_count` of each of its keys.
///
/// # Errors
/// Fails when the canonical path cannot be resolved, when no in-memory object schema
/// exists for it, when a field has no string type in that schema, or on SQLite errors.
fn write_occurrence_statistics(
    tx: &Transaction<'_>,
    outdir: &Path,
    records: &[Record],
    occurrences: &[Occurrence],
) -> Result<()> {
    const BUMP_OBJECT_COUNT: &str =
        "INSERT INTO schema_object_counts(schema_path, object_count) VALUES (?1, 1) \
         ON CONFLICT(schema_path) DO UPDATE SET object_count = schema_object_counts.object_count + 1";
    const BUMP_FIELD_COUNT: &str =
        "INSERT INTO schema_field_counts(schema_path, field_name, field_type, field_count) VALUES (?1, ?2, ?3, 1) \
         ON CONFLICT(schema_path, field_name) DO UPDATE SET \
         field_type = excluded.field_type, \
         field_count = schema_field_counts.field_count + 1";

    // Only records whose schema is a JSON object take part in the lookup.
    let mut schemas_by_path = HashMap::new();
    for record in records {
        if let Some(schema) = record.schema.as_object() {
            schemas_by_path.insert(record.schema_path.as_str(), schema);
        }
    }

    for occurrence in occurrences {
        let schema_path = canonical_schema_path_for_occurrence(tx, outdir, occurrence)?;
        let schema = schemas_by_path
            .get(schema_path.as_str())
            .copied()
            .with_context(|| format!("missing in-memory schema for {schema_path}"))?;

        if is_refs_mut_schema(schema) {
            continue;
        }

        tx.execute(BUMP_OBJECT_COUNT, params![&schema_path])?;

        for field_name in occurrence.value.keys() {
            let declared_type = schema.get(field_name).and_then(Value::as_str);
            let field_type = declared_type.with_context(|| {
                format!("missing field type for {schema_path}.{field_name} in canonical schema")
            })?;

            tx.execute(
                BUMP_FIELD_COUNT,
                params![&schema_path, field_name, field_type],
            )?;
        }
    }

    Ok(())
}

/// A schema referenced from a field type string.
///
/// `write_schema_relations` obtains these from `relation_targets_from_field_type`
/// (presumably the only producer — confirm) and turns each one into relation rows.
#[derive(Debug, Clone, PartialEq, Eq)]
struct RelationTarget {
    /// Path of the referenced schema; for array targets it is also recorded as `via_array_path`.
    target_schema_path: String,
    /// 0 for a direct object reference, 1 for an array of items, 2 or more for nested arrays.
    array_depth: usize,
    /// Copied unchanged into the `mixed` column of the resulting relations.
    mixed: bool,
    /// Not consulted by `write_schema_relations`; meaning is defined by the producer.
    optional: bool,
}

/// One row to be inserted into the `schema_relations` table.
///
/// Built by `write_schema_relations` and persisted by `insert_schema_relation`; each
/// field maps to the column of the same name.
#[derive(Debug, Clone, PartialEq, Eq)]
struct SchemaRelation {
    /// Start of the relation: the parent for `object_ref`, the child for array relations.
    from_schema_path: String,
    /// End of the relation: the child for `object_ref`, the parent for array relations.
    to_schema_path: String,
    /// One of `object_ref`, `array_item`, `heterogeneous_array_item`, `nested_array_item`.
    relation_kind: String,
    /// `parent` for `object_ref`, `child` for the array relation kinds.
    fk_owner: String,
    /// False only for `nested_array_item` relations; stored as 0/1.
    fk_candidate: bool,
    /// Name of the parent field that carries the reference.
    field_name: String,
    /// The field's type string as found in the parent schema.
    field_type: String,
    /// Cardinality label such as `many_to_one` or `one_to_one_candidate`.
    cardinality: String,
    /// True when the parent has objects and the field occurs in every one of them.
    required: bool,
    /// Taken from the relation target's `mixed` flag.
    mixed: bool,
    /// 0 for `object_ref`, 1 for single-level arrays, the target's array depth for nested arrays.
    nested_array_depth: usize,
    /// Unset for `object_ref`; otherwise the heterogeneous target schema or the child schema.
    via_schema_path: Option<String>,
    /// Unset for `object_ref`; otherwise the target schema path of the array field.
    via_array_path: Option<String>,
    /// Schema that owns the field.
    parent_schema_path: String,
    /// Schema that the field refers to (a variant schema for heterogeneous arrays).
    child_schema_path: String,
    /// Object count of the parent schema, 0 when no count is recorded.
    parent_object_count: i64,
    /// Object count of the child schema, 0 when no count is recorded.
    child_object_count: i64,
    /// Number of parent objects in which the field occurred.
    field_count: i64,
}

/// Derives parent/child relations from the field types of every object schema and
/// inserts them into `schema_relations`.
///
/// For each string-typed field that was observed at least once, every target returned
/// by `relation_targets_from_field_type` produces relations depending on its array depth:
/// - depth 0: one `object_ref` relation from parent to child (heterogeneous targets are skipped);
/// - depth 1: one `array_item` relation from child to parent, or one
///   `heterogeneous_array_item` relation per variant of a heterogeneous target;
/// - depth 2 or more: `nested_array_item` marker relations that are not FK candidates.
///
/// Object and field counts are read back from SQLite through `tx`.
///
/// # Errors
/// Returns an error when a count lookup or an insert fails.
fn write_schema_relations(tx: &Transaction<'_>, records: &[Record]) -> Result<()> {
    // schema path -> schema kind ("object" or "heterogeneous").
    let schema_kinds = records
        .iter()
        .map(|record| {
            (
                record.schema_path.clone(),
                schema_kind(&record.schema).to_string(),
            )
        })
        .collect::<HashMap<_, _>>();

    // array parent path -> schema paths of the records that name it as their array
    // parent, excluding a record whose own path equals the parent path.
    let mut variants_by_array_parent = HashMap::<String, Vec<String>>::new();
    for record in records {
        if let Some(array_parent) = &record.array_parent {
            if record.schema_path != *array_parent {
                variants_by_array_parent
                    .entry(array_parent.clone())
                    .or_default()
                    .push(record.schema_path.clone());
            }
        }
    }
    // Sorted and deduplicated so each variant is visited once, in a stable order.
    for variants in variants_by_array_parent.values_mut() {
        variants.sort();
        variants.dedup();
    }

    for record in records {
        // Only plain object schemas can own fields; `$refs_mut` schemas are skipped.
        let Some(schema) = record.schema.as_object() else {
            continue;
        };
        if is_refs_mut_schema(schema) {
            continue;
        }

        let parent_schema_path = &record.schema_path;
        let parent_object_count = object_count_for_schema(tx, parent_schema_path)?;

        for (field_name, field_type_value) in schema {
            // Fields whose declared type is not a string are ignored.
            let Some(field_type) = field_type_value.as_str() else {
                continue;
            };

            // A count of 0 means no statistics row exists for this field; skip it.
            let field_count = field_count_for_schema(tx, parent_schema_path, field_name)?;
            if field_count == 0 {
                continue;
            }
            // Required: the parent has objects and the field occurs in every one of them.
            let required = parent_object_count > 0 && field_count == parent_object_count;

            for target in relation_targets_from_field_type(field_type) {
                // Targets without a matching record are treated as plain objects.
                let target_kind = schema_kinds
                    .get(&target.target_schema_path)
                    .map(String::as_str)
                    .unwrap_or("object");

                // Depth 0: direct object reference, recorded from parent to child.
                if target.array_depth == 0 {
                    if target_kind == "heterogeneous" {
                        continue;
                    }
                    let child_schema_path = target.target_schema_path.clone();
                    let relation = SchemaRelation {
                        from_schema_path: parent_schema_path.clone(),
                        to_schema_path: child_schema_path.clone(),
                        relation_kind: "object_ref".to_string(),
                        fk_owner: "parent".to_string(),
                        fk_candidate: true,
                        field_name: field_name.clone(),
                        field_type: field_type.to_string(),
                        cardinality: if required {
                            "one_to_one_candidate".to_string()
                        } else {
                            "zero_or_one_to_one_candidate".to_string()
                        },
                        required,
                        mixed: target.mixed,
                        nested_array_depth: 0,
                        via_schema_path: None,
                        via_array_path: None,
                        parent_schema_path: parent_schema_path.clone(),
                        child_schema_path: child_schema_path.clone(),
                        parent_object_count,
                        child_object_count: object_count_for_schema(tx, &child_schema_path)?,
                        field_count,
                    };
                    insert_schema_relation(tx, &relation)?;
                    continue;
                }

                // Every array relation records the target path as the array it goes through.
                let via_array_path = Some(target.target_schema_path.clone());
                // Depth >= 2: nested arrays only produce marker relations (not FK candidates),
                // recorded from child to parent.
                if target.array_depth >= 2 {
                    // A heterogeneous target expands to its variants (possibly none).
                    let children = if target_kind == "heterogeneous" {
                        variants_by_array_parent
                            .get(&target.target_schema_path)
                            .cloned()
                            .unwrap_or_default()
                    } else {
                        vec![target.target_schema_path.clone()]
                    };
                    for child_schema_path in children {
                        let relation = SchemaRelation {
                            from_schema_path: child_schema_path.clone(),
                            to_schema_path: parent_schema_path.clone(),
                            relation_kind: "nested_array_item".to_string(),
                            fk_owner: "child".to_string(),
                            fk_candidate: false,
                            field_name: field_name.clone(),
                            field_type: field_type.to_string(),
                            cardinality: "nested_many_to_one_marker".to_string(),
                            required,
                            mixed: target.mixed,
                            nested_array_depth: target.array_depth,
                            via_schema_path: if target_kind == "heterogeneous" {
                                Some(target.target_schema_path.clone())
                            } else {
                                Some(child_schema_path.clone())
                            },
                            via_array_path: via_array_path.clone(),
                            parent_schema_path: parent_schema_path.clone(),
                            child_schema_path: child_schema_path.clone(),
                            parent_object_count,
                            child_object_count: object_count_for_schema(tx, &child_schema_path)?,
                            field_count,
                        };
                        insert_schema_relation(tx, &relation)?;
                    }
                    continue;
                }

                // Depth 1: single-level array, recorded from child to parent.
                if target_kind == "heterogeneous" {
                    // One relation per variant schema of the heterogeneous array.
                    for child_schema_path in variants_by_array_parent
                        .get(&target.target_schema_path)
                        .cloned()
                        .unwrap_or_default()
                    {
                        let relation = SchemaRelation {
                            from_schema_path: child_schema_path.clone(),
                            to_schema_path: parent_schema_path.clone(),
                            relation_kind: "heterogeneous_array_item".to_string(),
                            fk_owner: "child".to_string(),
                            fk_candidate: true,
                            field_name: field_name.clone(),
                            field_type: field_type.to_string(),
                            cardinality: "many_to_one".to_string(),
                            required,
                            mixed: target.mixed,
                            nested_array_depth: 1,
                            via_schema_path: Some(target.target_schema_path.clone()),
                            via_array_path: via_array_path.clone(),
                            parent_schema_path: parent_schema_path.clone(),
                            child_schema_path: child_schema_path.clone(),
                            parent_object_count,
                            child_object_count: object_count_for_schema(tx, &child_schema_path)?,
                            field_count,
                        };
                        insert_schema_relation(tx, &relation)?;
                    }
                } else {
                    // Homogeneous array: the target schema itself is the child.
                    let child_schema_path = target.target_schema_path.clone();
                    let relation = SchemaRelation {
                        from_schema_path: child_schema_path.clone(),
                        to_schema_path: parent_schema_path.clone(),
                        relation_kind: "array_item".to_string(),
                        fk_owner: "child".to_string(),
                        fk_candidate: true,
                        field_name: field_name.clone(),
                        field_type: field_type.to_string(),
                        cardinality: "many_to_one".to_string(),
                        required,
                        mixed: target.mixed,
                        nested_array_depth: 1,
                        via_schema_path: Some(child_schema_path.clone()),
                        via_array_path,
                        parent_schema_path: parent_schema_path.clone(),
                        child_schema_path: child_schema_path.clone(),
                        parent_object_count,
                        child_object_count: object_count_for_schema(tx, &child_schema_path)?,
                        field_count,
                    };
                    insert_schema_relation(tx, &relation)?;
                }
            }
        }
    }

    Ok(())
}

/// Writes one `SchemaRelation` as a row of the `schema_relations` table.
///
/// The statement is `INSERT OR IGNORE`, so a row that conflicts with an existing
/// constraint is skipped by SQLite instead of surfacing as an error.
///
/// # Errors
///
/// Returns the underlying SQLite error if the statement cannot be executed.
fn insert_schema_relation(tx: &Transaction<'_>, relation: &SchemaRelation) -> Result<()> {
    const INSERT_RELATION_SQL: &str = "INSERT OR IGNORE INTO schema_relations(\
         from_schema_path, to_schema_path, relation_kind, fk_owner, fk_candidate, \
         field_name, field_type, cardinality, required, mixed, nested_array_depth, \
         via_schema_path, via_array_path, parent_schema_path, child_schema_path, \
         parent_object_count, child_object_count, field_count) \
         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17, ?18)";

    // Flags and the depth are bound as integers.
    let fk_candidate = relation.fk_candidate as i64;
    let required = relation.required as i64;
    let mixed = relation.mixed as i64;
    let nested_array_depth = relation.nested_array_depth as i64;

    // Parameter order must follow the column list in the statement above.
    tx.execute(
        INSERT_RELATION_SQL,
        params![
            &relation.from_schema_path,
            &relation.to_schema_path,
            &relation.relation_kind,
            &relation.fk_owner,
            fk_candidate,
            &relation.field_name,
            &relation.field_type,
            &relation.cardinality,
            required,
            mixed,
            nested_array_depth,
            relation.via_schema_path.as_deref(),
            relation.via_array_path.as_deref(),
            &relation.parent_schema_path,
            &relation.child_schema_path,
            relation.parent_object_count,
            relation.child_object_count,
            relation.field_count,
        ],
    )?;

    Ok(())
}

/// Looks up the recorded object count for `schema_path` in `schema_object_counts`.
///
/// The query wraps the lookup in `COALESCE(..., 0)`, so a schema without a row
/// yields `0` rather than an error.
///
/// # Errors
///
/// Fails with context naming the schema path if the query itself fails.
fn object_count_for_schema(tx: &Transaction<'_>, schema_path: &str) -> Result<i64> {
    const OBJECT_COUNT_SQL: &str =
        "SELECT COALESCE((SELECT object_count FROM schema_object_counts WHERE schema_path = ?1), 0)";

    let count = tx
        .query_row(OBJECT_COUNT_SQL, params![schema_path], |row| {
            row.get::<_, i64>(0)
        })
        .with_context(|| format!("failed to read object count for {schema_path}"))?;
    Ok(count)
}

/// Looks up how often `field_name` was recorded for `schema_path` in
/// `schema_field_counts`.
///
/// The query wraps the lookup in `COALESCE(..., 0)`, so a missing row yields `0`.
///
/// # Errors
///
/// Fails with context naming the schema path and field if the query itself fails.
fn field_count_for_schema(
    tx: &Transaction<'_>,
    schema_path: &str,
    field_name: &str,
) -> Result<i64> {
    const FIELD_COUNT_SQL: &str =
        "SELECT COALESCE((SELECT field_count FROM schema_field_counts WHERE schema_path = ?1 AND field_name = ?2), 0)";

    let count = tx
        .query_row(FIELD_COUNT_SQL, params![schema_path, field_name], |row| {
            row.get::<_, i64>(0)
        })
        .with_context(|| format!("failed to read field count for {schema_path}.{field_name}"))?;
    Ok(count)
}

/// Extracts the schema references named by a field type label.
///
/// Trailing `?` characters are stripped from the label; if any were present,
/// every returned target is flagged optional. The result is ordered by target
/// schema path and then array depth, with adjacent duplicates removed.
fn relation_targets_from_field_type(field_type: &str) -> Vec<RelationTarget> {
    let label = field_type.trim_end_matches('?');
    // The label only gets shorter when at least one trailing `?` was removed.
    let is_optional = label.len() != field_type.len();

    let mut found = relation_targets_from_label(label, 0, is_optional);
    found.sort_by(|lhs, rhs| {
        (&lhs.target_schema_path, lhs.array_depth).cmp(&(&rhs.target_schema_path, rhs.array_depth))
    });
    found.dedup();
    found
}

/// Recursively collects schema references from a type label.
///
/// The label is split into its top-level `|` alternatives. An `array(...)`
/// alternative is descended into with `array_depth + 1`; an alternative ending
/// in `.json` becomes a target at the current depth; anything else is ignored.
/// A target is marked `mixed` when it sits in a union of more than one
/// alternative at this level or at any level beneath it.
fn relation_targets_from_label(
    label: &str,
    array_depth: usize,
    inherited_optional: bool,
) -> Vec<RelationTarget> {
    let alternatives = split_top_level_union(label);
    let is_union = alternatives.len() > 1;
    let mut targets = Vec::new();

    for alternative in alternatives.into_iter().map(str::trim) {
        match array_inner(alternative) {
            Some(element_label) => {
                let nested =
                    relation_targets_from_label(element_label, array_depth + 1, inherited_optional);
                // Targets found inside the array also inherit this level's union flag.
                targets.extend(nested.into_iter().map(|mut target| {
                    target.mixed |= is_union;
                    target
                }));
            }
            None if alternative.ends_with(".json") => targets.push(RelationTarget {
                target_schema_path: alternative.to_string(),
                array_depth,
                mixed: is_union,
                optional: inherited_optional,
            }),
            None => {}
        }
    }

    targets
}

/// Splits `label` on every `|` that is not enclosed in parentheses.
///
/// Always returns at least one slice; an empty label yields a single empty
/// slice. An unmatched `)` never drives the nesting level below zero.
fn split_top_level_union(label: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    let mut nesting = 0usize;
    let mut piece_start = 0usize;

    // All three delimiters are single-byte ASCII, so a byte scan finds the same
    // positions as a char scan and every cut lands on a char boundary.
    for (pos, byte) in label.bytes().enumerate() {
        if byte == b'(' {
            nesting += 1;
        } else if byte == b')' {
            nesting = nesting.saturating_sub(1);
        } else if byte == b'|' && nesting == 0 {
            pieces.push(&label[piece_start..pos]);
            piece_start = pos + 1;
        }
    }

    // Whatever follows the last separator (possibly nothing) is the final piece.
    pieces.push(&label[piece_start..]);
    pieces
}

/// Returns the text between a leading `array(` and a trailing `)`.
///
/// Yields `None` unless the label has both the prefix and the suffix. Only the
/// outermost wrapper is removed; the contents are not inspected.
fn array_inner(label: &str) -> Option<&str> {
    label.strip_prefix("array(")?.strip_suffix(')')
}

fn canonical_schema_path_for_occurrence(
    tx: &Transaction<'_>,
    outdir: &Path,
    occurrence: &Occurrence,
) -> Result<String> {
    match occurrence.role {
        Role::Object => {
            let object_path = format!("{}.json", reference_path(outdir, &occurrence.segments));
            tx.query_row(
                "SELECT schema_path FROM schema_paths WHERE object_path = ?1",
                params![&object_path],
                |row| row.get(0),
            )
            .with_context(|| format!("missing schema_paths entry for {object_path}"))
        }
        Role::ArrayItem | Role::RootCollection => {
            let array_path = format!("{}.json", reference_path(outdir, &occurrence.segments));
            let array_index_path = serde_json::to_string(&occurrence.array_indexes)?;
            tx.query_row(
                "SELECT schema_path FROM array_index_refs \
                 WHERE array_path = ?1 AND array_index_path = ?2",
                params![&array_path, &array_index_path],
                |row| row.get(0),
            )
            .with_context(|| {
                format!("missing array_index_refs entry for {array_path} at {array_index_path}")
            })
        }
    }
}

/// Creates the parent directory of `path` unless it was already created.
///
/// `made_dirs` remembers directories handled by earlier calls, so each one is
/// created at most once. A path without a parent falls back to `.`.
///
/// # Errors
///
/// Fails with context naming the directory if it cannot be created.
fn ensure_parent(path: &Path, made_dirs: &mut HashMap<PathBuf, ()>) -> Result<()> {
    let parent = match path.parent() {
        Some(parent) => parent,
        None => Path::new("."),
    };

    if made_dirs.contains_key(parent) {
        return Ok(());
    }

    fs::create_dir_all(parent).with_context(|| format!("failed to create {}", parent.display()))?;
    made_dirs.insert(parent.to_path_buf(), ());
    Ok(())
}

fn replace_with_symlink(target: &Path, link: &Path) -> Result<()> {
    let _ = fs::remove_file(link);
    let link_dir = link.parent().unwrap_or_else(|| Path::new("."));
    let relative = pathdiff::diff_paths(target, link_dir).unwrap_or_else(|| target.to_path_buf());
    #[cfg(unix)]
    {
        std::os::unix::fs::symlink(&relative, link).with_context(|| {
            format!(
                "failed to symlink {} -> {}",
                link.display(),
                relative.display()
            )
        })?;
    }
    #[cfg(not(unix))]
    {
        fs::copy(target, link).with_context(|| {
            format!("failed to copy {} to {}", target.display(), link.display())
        })?;
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use clap::{CommandFactory, Parser};
    use serde_json::json;

    /// Test helper: returns a copy of the map inside a JSON object value.
    ///
    /// Panics if `value` is not a JSON object.
    fn object(value: Value) -> Map<String, Value> {
        value.as_object().cloned().unwrap()
    }

    /// Test helper: returns an empty scratch directory under the system temp dir.
    ///
    /// The directory is named `dump-json-refs-<name>-<pid>`. Any existing
    /// directory of that name is removed first, then it is created afresh.
    /// Panics if the directory cannot be created.
    fn temp_outdir(name: &str) -> PathBuf {
        let dir_name = format!("dump-json-refs-{name}-{}", std::process::id());
        let mut path = std::env::temp_dir();
        path.push(dir_name);

        // Removal is best-effort: the directory usually does not exist yet.
        let _ = fs::remove_dir_all(&path);
        fs::create_dir_all(&path).unwrap();
        path
    }

    /// `build_schema_output` must report one occurrence per JSON object it walks,
    /// including array items and objects nested inside them.
    #[test]
    fn build_schema_output_retains_every_walked_object_occurrence() {
        let build = build_schema_output(
            Path::new("refs"),
            vec![Entry {
                path: vec!["root".to_string()],
                index: None,
                collection: false,
                value: object(json!({
                    "items": [
                        {"meta": {"id": 1, "note": null}},
                        {"meta": {"id": 2}}
                    ]
                })),
            }],
        )
        .unwrap();

        // The fixture holds five JSON objects: the root, two `items` elements,
        // and one `meta` object inside each element.
        assert_eq!(build.occurrences.len(), 5);
        // The `meta` occurrence that carries the `note` key is kept as an object.
        assert!(build.occurrences.iter().any(|occurrence| {
            occurrence.role == Role::Object
                && occurrence.segments == vec!["root", "items", "meta"]
                && occurrence.value.contains_key("note")
        }));
        // The second array element is recorded with its index.
        assert!(build.occurrences.iter().any(|occurrence| {
            occurrence.role == Role::ArrayItem
                && occurrence.segments == vec!["root", "items"]
                && occurrence.array_indexes == vec![1]
        }));
    }

    /// A field that is `null` in one sample and an object/array in another must
    /// still be labelled with the nested schema reference, and the nested
    /// schemas must appear among the distinct schema records.
    #[test]
    fn nullable_containers_use_schema_references_and_dump_nested_schemas() {
        let outdir = Path::new("refs");
        let root = vec!["root".to_string()];
        // First sample has only nulls; second sample has the real containers.
        let objects = vec![
            object(json!({"child": null, "items": null})),
            object(json!({
                "child": {"name": "example"},
                "items": [{"id": 1}]
            })),
        ];

        let schema = schema_for_values(outdir, &root, &objects).unwrap();
        assert_eq!(schema["child"], "refs/root/child.json");
        assert_eq!(schema["items"], "array(refs/root/items.json)");

        // Only the populated sample is fed to `distinct_schemas`.
        let records = distinct_schemas(
            outdir,
            vec![Entry {
                path: root,
                index: None,
                collection: false,
                value: objects[1].clone(),
            }],
        )
        .unwrap();
        let paths = records
            .iter()
            .map(|record| record.schema_path.as_str())
            .collect::<Vec<_>>();
        assert!(paths.contains(&"refs/root/child.json"));
        assert!(paths.contains(&"refs/root/items.json"));
    }

    /// An array-of-strings field keeps the plain `array(string)` label even when
    /// other samples omit the field or set it to `null`.
    #[test]
    fn primitive_array_fields_do_not_gain_optional_suffixes() {
        // Present, absent, and null variants of the same field.
        let objects = vec![
            object(json!({"tags": ["stable"]})),
            object(json!({})),
            object(json!({"tags": null})),
        ];

        let schema = schema_for_values(Path::new("refs"), &["root".to_string()], &objects).unwrap();
        assert_eq!(schema["tags"], "array(string)");
    }

    /// Pins the label produced for each container shape: a plain object, an
    /// array of objects, an array mixing objects and strings, an array of arrays
    /// of objects, and an array of arrays of strings.
    #[test]
    fn array_labels_distinguish_objects_nested_arrays_and_mixed_members() {
        let schema = schema_for_values(
            Path::new("refs"),
            &["root".to_string()],
            &[object(json!({
                "child": {"id": 1},
                "objects": [{"id": 1}],
                "mixed": [{"id": 1}, "fallback"],
                "matrix": [[{"id": 1}]],
                "capabilities": [["gpu"]],
            }))],
        )
        .unwrap();

        assert_eq!(schema["child"], "refs/root/child.json");
        assert_eq!(schema["objects"], "array(refs/root/objects.json)");
        // Mixed members are joined with `|` inside the array wrapper.
        assert_eq!(schema["mixed"], "array(refs/root/mixed.json|string)");
        // Each level of array nesting adds one `array(...)` wrapper.
        assert_eq!(schema["matrix"], "array(array(refs/root/matrix.json))");
        assert_eq!(schema["capabilities"], "array(array(string))");
    }

    /// When a field is an array of objects in one sample and a string in
    /// another, the label is the union of the array label and `string`.
    #[test]
    fn mixed_array_and_scalar_fields_preserve_the_array_member_label() {
        let schema = schema_for_values(
            Path::new("refs"),
            &["root".to_string()],
            &[
                object(json!({"content": [{"type": "text"}]})),
                object(json!({"content": "<command-message>example</command-message>"})),
            ],
        )
        .unwrap();

        assert_eq!(schema["content"], "array(refs/root/content.json)|string");
    }

    /// For an array nested inside root-collection entries, the schema record
    /// must keep the full index path: the entry index followed by the position
    /// inside the nested array.
    #[test]
    fn nested_object_arrays_preserve_full_index_paths() {
        // Two collection entries (indexes 0 and 1), each with a one-element `nested` array.
        let records = distinct_schemas(
            Path::new("refs"),
            vec![
                Entry {
                    path: vec!["root_item".to_string()],
                    index: Some(0),
                    collection: true,
                    value: object(json!({"nested": [{"id": 1}]})),
                },
                Entry {
                    path: vec!["root_item".to_string()],
                    index: Some(1),
                    collection: true,
                    value: object(json!({"nested": [{"id": 2}]})),
                },
            ],
        )
        .unwrap();

        let nested = records
            .iter()
            .find(|record| record.schema_path == "refs/root_item/nested.json")
            .unwrap();
        assert_eq!(
            nested.array_parent.as_deref(),
            Some("refs/root_item/nested.json")
        );
        assert_eq!(nested.array_index_paths, vec![vec![0, 0], vec![1, 0]]);
    }

    /// The SQLite index must store each nested array index path as a JSON array
    /// string in `array_index_refs.array_index_path`.
    #[test]
    fn sqlite_stores_nested_array_index_paths_as_json() {
        // Fresh scratch directory from the shared helper used by the other SQLite tests.
        let outdir = temp_outdir("index-paths");

        // Two collection entries (indexes 0 and 1), each with a one-element `nested` array.
        let build = build_schema_output(
            &outdir,
            vec![
                Entry {
                    path: vec!["root_item".to_string()],
                    index: Some(0),
                    collection: true,
                    value: object(json!({"nested": [{"id": 1}]})),
                },
                Entry {
                    path: vec!["root_item".to_string()],
                    index: Some(1),
                    collection: true,
                    value: object(json!({"nested": [{"id": 2}]})),
                },
            ],
        )
        .unwrap();
        write_files_and_index(&outdir, &build.records, &build.occurrences, false).unwrap();

        let conn = Connection::open(outdir.join("schemas.sqlite")).unwrap();
        let mut stmt = conn
            .prepare(
                "SELECT array_index_path FROM array_index_refs \
                 WHERE array_path = ?1 ORDER BY array_index_path",
            )
            .unwrap();
        let stored = stmt
            .query_map(
                [format!("{}/root_item/nested.json", outdir.display())],
                |row| row.get::<_, String>(0),
            )
            .unwrap()
            .collect::<rusqlite::Result<Vec<_>>>()
            .unwrap();
        assert_eq!(stored, vec!["[0,0]", "[1,0]"]);

        // Release the statement and connection before deleting the directory.
        drop(stmt);
        drop(conn);
        fs::remove_dir_all(&outdir).unwrap();
    }

    /// Object and field counts in SQLite must be keyed by the schema path each
    /// occurrence resolves to, for both nested objects and array items.
    #[test]
    fn sqlite_counts_object_fields_by_the_resolved_canonical_schema() {
        let outdir = temp_outdir("field-counts");
        let build = build_schema_output(
            &outdir,
            vec![Entry {
                path: vec!["root".to_string()],
                index: None,
                collection: false,
                value: object(json!({
                    "items": [
                        {"meta": {"id": 1, "note": null}},
                        {"meta": {"id": 2}}
                    ]
                })),
            }],
        )
        .unwrap();
        write_files_and_index(&outdir, &build.records, &build.occurrences, false).unwrap();

        let conn = Connection::open(outdir.join("schemas.sqlite")).unwrap();
        let meta_path = format!("{}/root/items/meta.json", outdir.display());
        let item_path = format!("{}/root/items.json", outdir.display());
        // Two `meta` objects exist in the fixture.
        assert_eq!(
            conn.query_row(
                "SELECT object_count FROM schema_object_counts WHERE schema_path = ?1",
                [&meta_path],
                |row| row.get::<_, i64>(0),
            )
            .unwrap(),
            2
        );
        // `id` appears in both `meta` objects.
        assert_eq!(
            conn.query_row(
                "SELECT field_count FROM schema_field_counts \
             WHERE schema_path = ?1 AND field_name = 'id'",
                [&meta_path],
                |row| row.get::<_, i64>(0),
            )
            .unwrap(),
            2
        );
        // `note` appears in only one `meta` object (with a null value).
        assert_eq!(
            conn.query_row(
                "SELECT field_count FROM schema_field_counts \
             WHERE schema_path = ?1 AND field_name = 'note'",
                [&meta_path],
                |row| row.get::<_, i64>(0),
            )
            .unwrap(),
            1
        );
        // Both array elements count towards the item schema.
        assert_eq!(
            conn.query_row(
                "SELECT object_count FROM schema_object_counts WHERE schema_path = ?1",
                [&item_path],
                |row| row.get::<_, i64>(0),
            )
            .unwrap(),
            2
        );

        // Release the connection before deleting the directory.
        drop(conn);
        fs::remove_dir_all(outdir).unwrap();
    }

    /// When an array's elements have different shapes, the array's own path must
    /// not receive a row in `schema_object_counts`.
    // NOTE(review): "refs_mut" in the name presumably refers to how such
    // containers are stored elsewhere in the file — confirm against the writer.
    #[test]
    fn sqlite_does_not_count_refs_mut_array_containers() {
        let outdir = temp_outdir("refs-mut-counts");
        let build = build_schema_output(
            &outdir,
            vec![Entry {
                path: vec!["root".to_string()],
                index: None,
                collection: false,
                // The two elements share no keys.
                value: object(json!({
                    "items": [{"id": 1}, {"label": "second"}]
                })),
            }],
        )
        .unwrap();
        write_files_and_index(&outdir, &build.records, &build.occurrences, false).unwrap();

        let conn = Connection::open(outdir.join("schemas.sqlite")).unwrap();
        let container_path = format!("{}/root/items.json", outdir.display());
        let count = conn
            .query_row(
                "SELECT COUNT(*) FROM schema_object_counts WHERE schema_path = ?1",
                [&container_path],
                |row| row.get::<_, i64>(0),
            )
            .unwrap();
        assert_eq!(count, 0);

        // Release the connection before deleting the directory.
        drop(conn);
        fs::remove_dir_all(outdir).unwrap();
    }

    /// Root-collection entries that share one shape must be counted under the
    /// root schema path, for both the object count and the per-field count.
    #[test]
    fn sqlite_counts_homogeneous_root_collection_items_via_array_index_refs() {
        let outdir = temp_outdir("root-collection-counts");
        // Two collection entries with the same single key.
        let build = build_schema_output(
            &outdir,
            vec![
                Entry {
                    path: vec!["root_item".to_string()],
                    index: Some(0),
                    collection: true,
                    value: object(json!({"kind": "snapshot"})),
                },
                Entry {
                    path: vec!["root_item".to_string()],
                    index: Some(1),
                    collection: true,
                    value: object(json!({"kind": "event"})),
                },
            ],
        )
        .unwrap();
        write_files_and_index(&outdir, &build.records, &build.occurrences, false).unwrap();

        let conn = Connection::open(outdir.join("schemas.sqlite")).unwrap();
        let root_path = format!("{}/root_item.json", outdir.display());
        // Fetch the object count and the `kind` field count in one row.
        let counts = conn
            .prepare(
                "SELECT object_count, (SELECT field_count FROM schema_field_counts \
             WHERE schema_path = ?1 AND field_name = 'kind') \
             FROM schema_object_counts WHERE schema_path = ?1",
            )
            .unwrap()
            .query_row([&root_path], |row| {
                Ok((row.get::<_, i64>(0)?, row.get::<_, i64>(1)?))
            })
            .unwrap();
        assert_eq!(counts, (2, 2));

        // Release the connection before deleting the directory.
        drop(conn);
        fs::remove_dir_all(outdir).unwrap();
    }

    /// NUL bytes in front of a JSONL record are discarded and the record still
    /// parses; following records are unaffected.
    #[test]
    fn jsonl_strips_leading_nul_padding_per_record() {
        let values =
            parse_jsonl_stream("\0\0{\"kind\":\"snapshot\"}\n{\"kind\":\"event\"}\n").unwrap();
        assert_eq!(
            values,
            vec![json!({"kind": "snapshot"}), json!({"kind": "event"})]
        );
    }

    /// A NUL byte inside a record (here, inside a string value) is an error, and
    /// the message names the offending line.
    #[test]
    fn jsonl_rejects_a_control_byte_inside_a_record() {
        let error = parse_jsonl_stream("{\"kind\":\"bad\0value\"}\n").unwrap_err();
        assert!(error.to_string().contains("invalid JSONL input at line 1"));
    }

    /// `--jqfile` must be rejected by the argument parser and must not be
    /// mentioned in the rendered help text.
    #[test]
    fn jqfile_is_not_a_supported_cli_option_or_help_entry() {
        assert!(Args::try_parse_from(["dump-json-refs", "--jqfile", "schema.jq"]).is_err());

        let mut command = Args::command();
        let help = command.render_help().to_string();
        assert!(!help.contains("--jqfile"));
    }
    /// With the final argument of `write_files_and_index` set to `false`, schema
    /// files are written as indented, multi-line JSON.
    #[test]
    fn writes_pretty_json_files_by_default() {
        let outdir = temp_outdir("pretty-json-output");
        let build = build_schema_output(
            &outdir,
            vec![Entry {
                path: vec!["root".to_string()],
                index: None,
                collection: false,
                value: object(json!({
                    "id": 1,
                    "name": "example"
                })),
            }],
        )
        .unwrap();

        write_files_and_index(&outdir, &build.records, &build.occurrences, false).unwrap();

        let schema_path = outdir.join("root.json");
        let written = fs::read_to_string(&schema_path).unwrap();

        // Pretty output spans several lines and indents keys by two spaces.
        assert!(written.contains('\n'));
        assert!(written.contains("  \"id\""));
        // The content itself is the type-label schema of the input object.
        assert_eq!(
            serde_json::from_str::<Value>(&written).unwrap(),
            json!({
                "id": "number",
                "name": "string"
            })
        );

        fs::remove_dir_all(outdir).unwrap();
    }

    /// With the final argument of `write_files_and_index` set to `true`, schema
    /// files are written as a single JSON line followed by a newline.
    #[test]
    fn writes_compact_json_files_with_compact_output_flag() {
        let outdir = temp_outdir("compact-json-output");
        let build = build_schema_output(
            &outdir,
            vec![Entry {
                path: vec!["root".to_string()],
                index: None,
                collection: false,
                value: object(json!({
                    "id": 1,
                    "name": "example"
                })),
            }],
        )
        .unwrap();

        write_files_and_index(&outdir, &build.records, &build.occurrences, true).unwrap();

        let schema_path = outdir.join("root.json");
        let written = fs::read_to_string(&schema_path).unwrap();

        // Exact bytes: no whitespace between tokens, one trailing newline.
        assert_eq!(written, "{\"id\":\"number\",\"name\":\"string\"}\n");

        fs::remove_dir_all(outdir).unwrap();
    }

    /// Input with one JSON object per line, where a later line starts with NUL
    /// padding, is normalized into one indexed entry per line.
    // NOTE(review): the two `false` arguments are presumably the JSONL-mode
    // flags — confirm against the `normalize_input` signature.
    #[test]
    fn stdin_style_input_falls_back_to_jsonl_when_later_record_has_nul_padding() {
        let entries = normalize_input(
            "{\"kind\":\"snapshot\"}\n\0\0{\"kind\":\"event\"}\n",
            false,
            false,
            "root",
        )
        .unwrap();

        assert_eq!(entries.len(), 2);
        // Entries are placed under `<root>_item` and numbered in input order.
        assert_eq!(entries[0].path, vec!["root_item"]);
        assert_eq!(entries[0].index, Some(0));
        assert_eq!(entries[1].index, Some(1));
    }

    /// `schema_field_counts.field_type` must hold the type label of each field:
    /// an `array(<item schema path>)` label for the array field on the root, and
    /// the primitive label for a field of the item schema.
    #[test]
    fn sqlite_stores_field_types_from_canonical_schema() {
        let outdir = temp_outdir("field-types");
        let build = build_schema_output(
            &outdir,
            vec![Entry {
                path: vec!["root".to_string()],
                index: None,
                collection: false,
                value: object(json!({
                    "id": 1,
                    "optional": null,
                    "items": [{"kind": "a"}, {"kind": "b"}]
                })),
            }],
        )
        .unwrap();
        write_files_and_index(&outdir, &build.records, &build.occurrences, false).unwrap();

        let conn = Connection::open(outdir.join("schemas.sqlite")).unwrap();
        let root_path = format!("{}/root.json", outdir.display());
        let item_path = format!("{}/root/items.json", outdir.display());

        // The root's `items` field references the item schema inside `array(...)`.
        assert_eq!(
            conn.query_row(
                "SELECT field_type FROM schema_field_counts WHERE schema_path = ?1 AND field_name = 'items'",
                [&root_path],
                |row| row.get::<_, String>(0),
            )
            .unwrap(),
            format!("array({item_path})")
        );
        // The item schema's `kind` field is a plain string.
        assert_eq!(
            conn.query_row(
                "SELECT field_type FROM schema_field_counts WHERE schema_path = ?1 AND field_name = 'kind'",
                [&item_path],
                |row| row.get::<_, String>(0),
            )
            .unwrap(),
            "string".to_string()
        );

        // Release the connection before deleting the directory.
        drop(conn);
        fs::remove_dir_all(outdir).unwrap();
    }

    /// The compact stdout report must list fields with a `field_type` column and
    /// must omit the object-path and array-index sections.
    // NOTE(review): despite the name, the assertions only check that both field
    // rows are present; their relative order in the output is not verified.
    #[test]
    fn report_fields_are_sorted_by_count_desc_and_include_field_type() {
        let report = ReportData {
            schemas: vec![SchemaCount {
                schema_path: "refs/root.json".to_string(),
                schema_kind: "object".to_string(),
                object_count: 2,
            }],
            object_paths: vec![SchemaObjectPath {
                schema_path: "refs/root.json".to_string(),
                object_path: "refs/root.json".to_string(),
            }],
            array_index_refs: Vec::new(),
            fields: vec![
                FieldCount {
                    schema_path: "refs/root.json".to_string(),
                    field_name: "frequent".to_string(),
                    field_type: "string".to_string(),
                    field_count: 10,
                },
                FieldCount {
                    schema_path: "refs/root.json".to_string(),
                    field_name: "rare".to_string(),
                    field_type: "boolean?".to_string(),
                    field_count: 1,
                },
            ],
        };

        let rendered = render_report(
            &report,
            &ReportSummary::Generated {
                execution_time_ms: 12,
                input_source: "stdin".to_string(),
            },
            ReportDetail::CompactStdout,
        );

        // Header and rows are tab-separated, one per line.
        assert!(rendered.contains("schema_path\tfield_name\tfield_type\tfield_count\n"));
        assert!(rendered.contains("refs/root.json\tfrequent\tstring\t10\n"));
        assert!(rendered.contains("refs/root.json\trare\tboolean?\t1\n"));
        // Sections reserved for the full report must not appear in compact mode.
        assert!(!rendered.contains("[schema_array_index_refs]"));
        assert!(!rendered.contains("[schema_object_paths]"));
    }

    /// Compares the two report detail levels: compact output shows only the
    /// alias section, while the full-file output also includes the object-path
    /// and array-index sections.
    #[test]
    fn full_file_report_includes_object_paths_and_array_index_refs() {
        let report = ReportData {
            schemas: vec![SchemaCount {
                schema_path: "refs/root_item.json".to_string(),
                schema_kind: "object".to_string(),
                object_count: 2,
            }],
            // One self-mapping and one alias that points at the same schema.
            object_paths: vec![
                SchemaObjectPath {
                    schema_path: "refs/root_item.json".to_string(),
                    object_path: "refs/root_item.json".to_string(),
                },
                SchemaObjectPath {
                    schema_path: "refs/root_item.json".to_string(),
                    object_path: "refs/alias.json".to_string(),
                },
            ],
            array_index_refs: vec![SchemaArrayIndexRef {
                schema_path: "refs/root_item.json".to_string(),
                array_path: "refs/root_item.json".to_string(),
                array_index_path: "[0]".to_string(),
            }],
            fields: Vec::new(),
        };

        let compact = render_report(
            &report,
            &ReportSummary::FromSqlite {
                sqlite_path: "refs/schemas.sqlite".to_string(),
            },
            ReportDetail::CompactStdout,
        );
        assert!(compact.contains("[schema_aliases]\n"));
        assert!(compact.contains("refs/root_item.json\trefs/alias.json\n"));
        assert!(!compact.contains("[schema_object_paths]"));
        assert!(!compact.contains("[schema_array_index_refs]"));

        let full = render_report(
            &report,
            &ReportSummary::FromSqlite {
                sqlite_path: "refs/schemas.sqlite".to_string(),
            },
            ReportDetail::FullFile,
        );
        assert!(full.contains("[schema_object_paths]\n"));
        assert!(full.contains("[schema_array_index_refs]\n"));
        assert!(full.contains("refs/root_item.json\trefs/root_item.json\t[0]\n"));
    }

    /// `read_report_data` must return schemas ordered by object count
    /// (descending, with a schema lacking a count row last) and fields grouped
    /// in that same schema order, by descending field count within each schema.
    #[test]
    fn report_data_orders_schemas_by_object_count_and_groups_fields_by_schema_order() {
        // Build the index tables by hand in an in-memory database.
        let conn = Connection::open_in_memory().unwrap();
        conn.execute_batch(
            "CREATE TABLE schema_paths (schema_path TEXT NOT NULL, object_path TEXT PRIMARY KEY);
             CREATE TABLE array_index_refs (array_path TEXT NOT NULL, array_index_path TEXT NOT NULL, schema_path TEXT NOT NULL, PRIMARY KEY (array_path, array_index_path));
             CREATE TABLE schema_definitions (schema_path TEXT PRIMARY KEY, schema_kind TEXT NOT NULL, schema_json TEXT NOT NULL);
             CREATE TABLE schema_object_counts (schema_path TEXT PRIMARY KEY, object_count INTEGER NOT NULL CHECK (object_count > 0));
             CREATE TABLE schema_field_counts (schema_path TEXT NOT NULL, field_name TEXT NOT NULL, field_type TEXT NOT NULL, field_count INTEGER NOT NULL CHECK (field_count > 0), PRIMARY KEY (schema_path, field_name));"
        ).unwrap();

        conn.execute(
            "INSERT INTO schema_paths(schema_path, object_path) VALUES (?1, ?2)",
            ["refs/high.json", "refs/high.json"],
        )
        .unwrap();
        conn.execute(
            "INSERT INTO schema_paths(schema_path, object_path) VALUES (?1, ?2)",
            ["refs/low.json", "refs/low.json"],
        )
        .unwrap();
        conn.execute(
            "INSERT INTO schema_paths(schema_path, object_path) VALUES (?1, ?2)",
            ["refs/container.json", "refs/container.json"],
        )
        .unwrap();

        for (schema_path, schema_kind) in [
            ("refs/high.json", "object"),
            ("refs/low.json", "object"),
            ("refs/container.json", "heterogeneous"),
        ] {
            conn.execute(
                "INSERT INTO schema_definitions(schema_path, schema_kind, schema_json) VALUES (?1, ?2, '{}')",
                params![schema_path, schema_kind],
            )
            .unwrap();
        }

        // Only the two object schemas get a count row; the container has none.
        conn.execute(
            "INSERT INTO schema_object_counts(schema_path, object_count) VALUES (?1, ?2)",
            params!["refs/high.json", 100],
        )
        .unwrap();
        conn.execute(
            "INSERT INTO schema_object_counts(schema_path, object_count) VALUES (?1, ?2)",
            params!["refs/low.json", 50],
        )
        .unwrap();

        // `rare` is inserted before `common` so insertion order cannot satisfy the assertion.
        conn.execute(
            "INSERT INTO schema_field_counts(schema_path, field_name, field_type, field_count) VALUES (?1, ?2, ?3, ?4)",
            params!["refs/high.json", "rare", "string", 1],
        )
        .unwrap();
        conn.execute(
            "INSERT INTO schema_field_counts(schema_path, field_name, field_type, field_count) VALUES (?1, ?2, ?3, ?4)",
            params!["refs/high.json", "common", "number", 90],
        )
        .unwrap();
        // The largest field count belongs to the lower-ranked schema.
        conn.execute(
            "INSERT INTO schema_field_counts(schema_path, field_name, field_type, field_count) VALUES (?1, ?2, ?3, ?4)",
            params!["refs/low.json", "very_common", "boolean", 999],
        )
        .unwrap();

        let report = read_report_data(&conn).unwrap();
        assert_eq!(
            report
                .schemas
                .iter()
                .map(|schema| schema.schema_path.as_str())
                .collect::<Vec<_>>(),
            vec!["refs/high.json", "refs/low.json", "refs/container.json"]
        );
        // Fields follow schema order first, so `very_common` (999) still comes last.
        assert_eq!(
            report
                .fields
                .iter()
                .map(|field| (
                    field.schema_path.as_str(),
                    field.field_name.as_str(),
                    field.field_count
                ))
                .collect::<Vec<_>>(),
            vec![
                ("refs/high.json", "common", 90),
                ("refs/high.json", "rare", 1),
                ("refs/low.json", "very_common", 999),
            ]
        );
    }

    /// An array whose elements have different shapes must be stored with schema
    /// kind `heterogeneous`, must have no object-count or field-count rows, and
    /// must appear in the compact report with a count of `0`.
    #[test]
    fn refs_mut_schemas_are_marked_heterogeneous_and_skipped_for_statistics() {
        let outdir = temp_outdir("refs-mut-kind");
        let build = build_schema_output(
            &outdir,
            vec![Entry {
                path: vec!["root".to_string()],
                index: None,
                collection: false,
                // The two elements share no keys.
                value: object(json!({
                    "items": [{"kind": "a"}, {"other": "b"}]
                })),
            }],
        )
        .unwrap();
        write_files_and_index(&outdir, &build.records, &build.occurrences, false).unwrap();

        let conn = Connection::open(outdir.join("schemas.sqlite")).unwrap();
        let container_path = format!("{}/root/items.json", outdir.display());

        assert_eq!(
            conn.query_row(
                "SELECT schema_kind FROM schema_definitions WHERE schema_path = ?1",
                [&container_path],
                |row| row.get::<_, String>(0),
            )
            .unwrap(),
            "heterogeneous"
        );

        let object_count_rows = conn
            .query_row(
                "SELECT COUNT(*) FROM schema_object_counts WHERE schema_path = ?1",
                [&container_path],
                |row| row.get::<_, i64>(0),
            )
            .unwrap();
        assert_eq!(object_count_rows, 0);

        let field_count_rows = conn
            .query_row(
                "SELECT COUNT(*) FROM schema_field_counts WHERE schema_path = ?1",
                [&container_path],
                |row| row.get::<_, i64>(0),
            )
            .unwrap();
        assert_eq!(field_count_rows, 0);

        // The report is loaded from the same database file via the public loader.
        let report = load_report_from_sqlite(&outdir.join("schemas.sqlite")).unwrap();
        let rendered = render_report(
            &report,
            &ReportSummary::FromSqlite {
                sqlite_path: outdir.join("schemas.sqlite").display().to_string(),
            },
            ReportDetail::CompactStdout,
        );
        assert!(rendered.contains(&format!("{container_path}\theterogeneous\t0\n")));

        // Release the connection before deleting the directory.
        drop(conn);
        fs::remove_dir_all(outdir).unwrap();
    }
}