quelch 0.9.2

Ingest data from Jira, Confluence, and more directly into Azure AI Search
Documentation
//! Config validation rules for Quelch v2.
//!
//! Three invariants are checked:
//! 1. Every source referenced in a deployment exists in `sources`.
//! 2. Every `(source, subsource)` pair appears in at most one ingest deployment.
//! 3. Every name in any `expose:` list is defined in `mcp.data_sources` (explicit
//!    or auto-derived via [`super::data_sources::resolve`]).
use super::{Config, ConfigError, DeploymentRole, SourceConfig, data_sources};
use std::collections::{HashMap, HashSet};

/// Validate `config` against every rule in this module.
///
/// The rules run in a fixed order — source references, then subsource
/// disjointness, then `expose` resolution — and evaluation stops at the first
/// rule that fails.
///
/// # Errors
///
/// Returns the `ConfigError` produced by the first failing rule.
pub fn run(config: &Config) -> Result<(), ConfigError> {
    validate_sources_referenced(config)
        .and_then(|()| validate_disjoint_subsources(config))
        .and_then(|()| validate_expose_resolves(config))
}

/// Ensures no deployment points at a source that is missing from `sources`.
///
/// Deployments without a `sources` list are skipped. Deployments are scanned
/// in declaration order and the first dangling reference is reported.
///
/// # Errors
///
/// Returns [`ConfigError::Validation`] naming the deployment and the unknown
/// source.
fn validate_sources_referenced(config: &Config) -> Result<(), ConfigError> {
    let known: HashSet<&str> = config.sources.iter().map(|s| s.name()).collect();

    // First (deployment, reference) pair whose source name is not defined.
    let dangling = config.deployments.iter().find_map(|dep| {
        dep.sources
            .iter()
            .flatten()
            .find(|ds| !known.contains(ds.source.as_str()))
            .map(|ds| (dep, ds))
    });

    match dangling {
        None => Ok(()),
        Some((dep, ds)) => Err(ConfigError::Validation(format!(
            "deployment '{}' references source '{}' which is not defined in sources",
            dep.name, ds.source
        ))),
    }
}

/// Each `(source, subsource)` pair must appear in at most one ingest deployment.
///
/// Only deployments whose role is `DeploymentRole::Ingest` and that carry a
/// `sources` list are considered. For every source a deployment references,
/// the claimed subsources are chosen in this order:
///
/// 1. the deployment's own `projects` list, if present and non-empty;
/// 2. otherwise its `spaces` list, if present and non-empty;
/// 3. otherwise the `projects` / `spaces` configured on the source definition;
/// 4. otherwise *all* subsources of that source.
///
/// A claim on all subsources overlaps with every other claim on the same
/// source. It is tracked as a dedicated enum variant rather than a magic
/// `"ALL"` string, so a real project or space whose key happens to be `ALL`
/// is compared like any other key instead of being mistaken for the wildcard.
///
/// Every new claim is compared against all claims recorded so far for the
/// source, including earlier claims made by the same deployment.
///
/// # Errors
///
/// Returns [`ConfigError::Validation`] for the first overlap found, naming the
/// subsource (rendered as `ALL` for the wildcard), the source, and the two
/// deployments involved.
fn validate_disjoint_subsources(config: &Config) -> Result<(), ConfigError> {
    /// What a deployment claims from a single source.
    #[derive(Clone, Copy)]
    enum Claim<'a> {
        /// Every subsource of the source.
        All,
        /// One specific project or space key.
        Only(&'a str),
    }

    impl<'a> Claim<'a> {
        /// Two claims collide when either is the wildcard or both name the same key.
        fn overlaps(self, other: Self) -> bool {
            match (self, other) {
                (Claim::All, _) | (_, Claim::All) => true,
                (Claim::Only(a), Claim::Only(b)) => a == b,
            }
        }

        /// Text used for this claim in error messages.
        fn label(self) -> &'a str {
            match self {
                Claim::All => "ALL",
                Claim::Only(key) => key,
            }
        }
    }

    /// Turn a list of explicit keys into individual claims without cloning.
    fn only(keys: &[String]) -> Vec<Claim<'_>> {
        keys.iter().map(|key| Claim::Only(key.as_str())).collect()
    }

    // Map from source name → list of (deployment_name, claim) recorded so far.
    let mut claimed: HashMap<&str, Vec<(&str, Claim<'_>)>> = HashMap::new();

    for deployment in &config.deployments {
        if !matches!(deployment.role, DeploymentRole::Ingest) {
            continue;
        }
        let Some(ref sources) = deployment.sources else {
            continue;
        };
        for ds in sources {
            let source_name = ds.source.as_str();
            let source_def = config.sources.iter().find(|s| s.name() == source_name);

            // Explicit subset on the deployment wins; otherwise fall back to the
            // source definition, and finally to the wildcard.
            let claims = match (ds.projects.as_ref(), ds.spaces.as_ref()) {
                (Some(projects), _) if !projects.is_empty() => only(projects),
                (_, Some(spaces)) if !spaces.is_empty() => only(spaces),
                _ => match source_def {
                    Some(SourceConfig::Jira(j)) if !j.projects.is_empty() => only(&j.projects),
                    Some(SourceConfig::Confluence(c)) if !c.spaces.is_empty() => only(&c.spaces),
                    _ => vec![Claim::All],
                },
            };

            let entry = claimed.entry(source_name).or_default();
            for claim in claims {
                // Reject the claim if anything recorded earlier collides with it.
                if let Some(&(prev_dep, _)) = entry.iter().find(|(_, prev)| prev.overlaps(claim)) {
                    return Err(ConfigError::Validation(format!(
                        "subsource '{}' of source '{}' appears in both deployment '{}' \
                         and deployment '{}' — each (source, subsource) pair must appear \
                         in at most one ingest deployment",
                        claim.label(),
                        source_name,
                        prev_dep,
                        deployment.name
                    )));
                }
                entry.push((deployment.name.as_str(), claim));
            }
        }
    }
    Ok(())
}

/// Checks that each entry of every deployment's `expose:` list names a data
/// source present in the set returned by `data_sources::resolve` (explicit
/// `mcp.data_sources` entries or ones auto-derived from the configured sources).
///
/// Deployments without an `expose` list are skipped. The first unresolved
/// name, in declaration order, is reported.
///
/// # Errors
///
/// Returns [`ConfigError::Validation`] naming the deployment and the
/// unresolved data-source name.
fn validate_expose_resolves(config: &Config) -> Result<(), ConfigError> {
    // Resolve once up front; every lookup below reuses this set.
    let known = data_sources::resolve(config);

    let unresolved = config.deployments.iter().find_map(|dep| {
        dep.expose
            .iter()
            .flatten()
            .find(|exposed| !known.contains_key(*exposed))
            .map(|exposed| (dep, exposed))
    });

    match unresolved {
        None => Ok(()),
        Some((dep, exposed)) => Err(ConfigError::Validation(format!(
            "deployment '{}' exposes '{}' which is not defined in mcp.data_sources \
                     and cannot be auto-derived from the configured sources",
            dep.name, exposed
        ))),
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::Config;

    /// Deserialize a YAML document into a [`Config`], panicking on malformed input.
    fn parse(yaml: &str) -> Config {
        serde_yaml::from_str(yaml).unwrap()
    }

    /// Validate `yaml`, which is expected to be rejected, and return the
    /// rendered error message.
    fn rejection(yaml: &str) -> String {
        run(&parse(yaml)).unwrap_err().to_string()
    }

    /// Overlapping subsource claims are reported with the offending key.
    #[test]
    fn rejects_overlapping_subsources() {
        let msg = rejection(include_str!("../../tests/fixtures/config_overlapping.yaml"));
        assert!(msg.contains("DO"), "expected 'DO' in error: {msg}");
        assert!(
            msg.contains("appears in"),
            "expected 'appears in' in error: {msg}"
        );
    }

    /// An `expose` entry that cannot be resolved is named in the error.
    #[test]
    fn rejects_undefined_expose() {
        let msg = rejection(include_str!(
            "../../tests/fixtures/config_undefined_expose.yaml"
        ));
        assert!(
            msg.contains("no_such_source"),
            "expected 'no_such_source' in error: {msg}"
        );
    }

    /// A deployment referencing an unknown source is named in the error.
    #[test]
    fn rejects_undefined_source_in_deployment() {
        let msg = rejection(include_str!(
            "../../tests/fixtures/config_undefined_source.yaml"
        ));
        assert!(
            msg.contains("ghost-source"),
            "expected 'ghost-source' in error: {msg}"
        );
    }

    /// The minimal fixture passes every rule.
    #[test]
    fn accepts_valid_config() {
        let cfg = parse(include_str!("../../tests/fixtures/quelch.minimal.yaml"));
        run(&cfg).expect("valid config should pass validation");
    }

    /// Regression test: a config with `expose: [jira_issues]` and no explicit
    /// `mcp.data_sources` must pass validation (relies on auto-derivation).
    #[test]
    fn accepts_expose_with_auto_derived_data_sources() {
        const YAML: &str = r#"
azure:
  subscription_id: "sub-test"
  resource_group: "rg-test"
  region: "swedencentral"
cosmos:
  database: "quelch"
openai:
  endpoint: "https://test.openai.azure.com"
  embedding_deployment: "text-embedding-3-large"
  embedding_dimensions: 3072
sources:
  - type: jira
    name: jira-cloud
    url: "https://cloud.atlassian.net"
    auth:
      email: "u@example.com"
      api_token: "tok"
    projects: ["DO"]
deployments:
  - name: ingest
    role: ingest
    target: azure
    sources:
      - source: jira-cloud
  - name: mcp
    role: mcp
    target: azure
    expose:
      - jira_issues
    auth:
      mode: "api_key"
# No mcp.data_sources — relies on auto-derivation.
"#;
        let cfg = parse(YAML);
        run(&cfg).expect("expose with auto-derived data sources should pass validation");
    }
}