hamelin_translation 0.9.6

//! Pass: MATCH command lowering to FROM + SET + WHERE + WINDOW + SET + WHERE + DROP.
//!
//! Transforms MATCH commands into a pipeline using native Hamelin commands:
//!
//! ```hamelin
//! MATCH a=events+ b=logs BY host WITHIN 5m AGG total = sum(value)
//! ```
//! becomes:
//! ```hamelin
//! FROM a=events, b=logs
//! | SET __pattern_label = case(a IS NOT NULL: 'a', b IS NOT NULL: 'b')
//! | WHERE __pattern_label IS NOT NULL
//! | WINDOW __labels_array = array_agg(__pattern_label),
//!         __agg_values_value = array_agg(value)
//!         WITHIN 5m BY host
//! | SET __state = array_join(__labels_array, ',')
//! | SET __match_length = len(split(regexp_extract(__state, '^pattern'), ','))
//! | SET total = sum(slice(__agg_values_value, 0, __match_length))
//! | WHERE __pattern_label = 'a' AND regexp_like(__state, '^(a,)+b')
//! | DROP __pattern_label, __state, __labels_array, __agg_values_value, __match_length
//! ```
//!
//! NOTE: When AGG is used, we avoid generating duplicate window expressions like
//! `__state = array_join(array_agg(x), ','), __arr = array_agg(x)` because
//! this triggers a DataFusion optimizer bug. Instead, we generate only
//! `__labels_array` in the WINDOW and derive `__state` with a SET afterward.
//!
//! This pass must run FIRST, before `nest_from_aliases`,
//! since it generates a FROM with aliases that later passes need to process.

use std::collections::HashMap;
use std::sync::Arc;

use hamelin_lib::{
    err::TranslationError,
    func::def::{ParameterBinding, SpecialPosition},
    tree::{
        ast::{
            clause::SortOrder,
            expression::{Expression, ExpressionKind},
            identifier::Identifier,
            pattern::QuantifierKind,
            query::Query,
        },
        builder::{pipeline as pipeline_builder, query, window_command, ExpressionBuilder},
        typed_ast::{
            command::{TypedCommandKind, TypedMatchCommand},
            context::StatementTranslationContext,
            expression::{MapExpressionAlgebra, TypedApply, TypedExpression, TypedExpressionKind},
            pattern::TypedPattern,
            pipeline::TypedPipeline,
            query::TypedStatement,
        },
    },
};

/// Entry point of the MATCH lowering pass.
///
/// If no pipeline of `statement` contains a MATCH command, the statement is
/// handed back untouched. Otherwise the whole statement is rebuilt as an AST
/// query by `transform_statement` and re-typed with
/// `TypedStatement::from_ast_with_context`.
///
/// # Errors
/// Propagates any error raised while inspecting or transforming the
/// statement's pipelines.
pub fn lower_match(
    statement: Arc<TypedStatement>,
    ctx: &mut StatementTranslationContext,
) -> Result<Arc<TypedStatement>, Arc<TranslationError>> {
    let needs_lowering = statement_has_match(&statement)?;

    if needs_lowering {
        let lowered_ast = Arc::new(transform_statement(&statement, ctx)?);
        let retyped = TypedStatement::from_ast_with_context(lowered_ast, ctx);
        Ok(Arc::new(retyped))
    } else {
        // Nothing to lower: reuse the incoming statement as-is.
        Ok(statement)
    }
}

/// Report whether any pipeline yielded by `statement.iter()` contains a MATCH
/// command.
///
/// Every pipeline is inspected even after a MATCH has been found, so an
/// invalid pipeline anywhere in the statement still surfaces as an error.
fn statement_has_match(statement: &TypedStatement) -> Result<bool, Arc<TranslationError>> {
    let mut found = false;
    for pipeline in statement.iter() {
        found |= pipeline_has_match(pipeline)?;
    }
    Ok(found)
}

/// Report whether `pipeline` contains a MATCH command at any position.
///
/// # Errors
/// Fails when the pipeline itself is not valid (`valid_ref` errors).
fn pipeline_has_match(pipeline: &TypedPipeline) -> Result<bool, Arc<TranslationError>> {
    let commands = &pipeline.valid_ref()?.commands;
    Ok(commands
        .iter()
        .any(|command| matches!(&command.kind, TypedCommandKind::Match(_))))
}

/// Rebuild a whole statement as an AST [`Query`], lowering MATCH in every
/// pipeline it contains.
///
/// The result is assembled in three stages:
/// 1. scalar DEFs are copied over with their original AST expressions,
/// 2. each tabular DEF pipeline is transformed and merged in as a CTE,
/// 3. the main pipeline is transformed and merged in as the query body.
///
/// # Errors
/// Fails on the first invalid DEF name or on any pipeline transformation error.
fn transform_statement(
    statement: &TypedStatement,
    ctx: &mut StatementTranslationContext,
) -> Result<Query, Arc<TranslationError>> {
    let mut builder = query();

    // Scalar DEFs: carried over with their original AST expression.
    for scalar_def in &statement.scalar_defs {
        let def_name = scalar_def.name.valid_ref()?.clone();
        let def_expr = scalar_def.expression.ast.clone();
        builder = builder.def_expression(def_name, def_expr);
    }

    // Tabular DEFs: each one is lowered independently and attached as a CTE.
    for pipeline_def in &statement.pipeline_defs {
        let lowered = transform_pipeline(&pipeline_def.pipeline, ctx)?;
        let cte_name = pipeline_def.name.clone().valid()?;
        builder = builder.merge_as_cte(lowered, cte_name);
    }

    // Main pipeline goes last.
    let lowered_main = transform_pipeline(&statement.pipeline, ctx)?;
    Ok(builder.merge_as_main(lowered_main))
}

/// Transform one pipeline into a [`Query`] whose main pipeline has MATCH lowered.
///
/// Only a MATCH in the *first* command position is lowered: its replacement
/// commands are emitted first, followed by the AST of every remaining command
/// of the original pipeline. Any other pipeline is wrapped unchanged.
///
/// No CTEs are produced by this function.
fn transform_pipeline(
    pipeline: &TypedPipeline,
    ctx: &mut StatementTranslationContext,
) -> Result<Query, Arc<TranslationError>> {
    let commands = &pipeline.valid_ref()?.commands;

    match commands.first().map(|head| &head.kind) {
        Some(TypedCommandKind::Match(match_cmd)) => {
            let lowered = lower_match_command(match_cmd, ctx)?;

            // Keep the span of the original pipeline on the rebuilt one.
            let builder = pipeline_builder().at(pipeline.ast.span.clone());

            // Commands generated for MATCH come first ...
            let builder = lowered
                .commands
                .into_iter()
                .fold(builder, |acc, command| acc.command(command));

            // ... then whatever followed MATCH in the source pipeline.
            let builder = commands
                .iter()
                .skip(1)
                .fold(builder, |acc, command| acc.command(command.ast.clone()));

            Ok(query().main(builder.build()).build())
        }
        // No leading MATCH: pass the pipeline through untouched.
        _ => Ok(query().main(pipeline.ast.clone()).build()),
    }
}

/// Lower one MATCH command into a replacement pipeline.
///
/// The work is split into four steps, run in this order:
/// 1. extract the pattern variables and give each a regex label,
/// 2. turn the pattern sequence into a regex over those labels,
/// 3. determine which labels may begin a match,
/// 4. assemble the replacement pipeline from all of the above.
///
/// `_ctx` is currently unused.
fn lower_match_command(
    match_cmd: &TypedMatchCommand,
    _ctx: &mut StatementTranslationContext,
) -> Result<hamelin_lib::tree::ast::pipeline::Pipeline, Arc<TranslationError>> {
    let patterns = &match_cmd.patterns;

    let pattern_vars = extract_pattern_variables(patterns)?;
    let regex_pattern = pattern_to_regex(patterns, &pattern_vars)?;
    let starting_labels = find_starting_pattern_labels(patterns, &pattern_vars)?;

    build_lowered_pipeline(match_cmd, &pattern_vars, &regex_pattern, &starting_labels)
}

// ============================================================================
// Pattern Variable Extraction
// ============================================================================

/// One source of a MATCH pattern, together with the regex label assigned to it
/// and the quantifier that was written on it.
///
/// Instances are created by `extract_from_pattern`, one per quantified pattern
/// in source order.
#[derive(Debug, Clone)]
struct PatternVariable {
    /// The alias name (e.g., "a" from "a=events"). When the pattern has no
    /// explicit alias, the table name is used here as well.
    alias: String,
    /// The table name (e.g., "events")
    table: String,
    /// The label produced by `LabelGenerator` for this variable
    /// (e.g., 'a', 'b', 'A', 'aa'); it is what appears in the generated regex.
    label: String,
    /// The quantifier, in the simplified form produced by `convert_quantifier`.
    quantifier: PatternQuantifier,
}

/// Quantifier of a pattern element, reduced to the cases the regex builder
/// distinguishes.
#[derive(Debug, Clone, Copy, PartialEq)]
enum PatternQuantifier {
    /// Exactly one occurrence (no quantifier written).
    One,
    /// `+` — one or more occurrences.
    OneOrMore,
    /// `*` — any number of occurrences, including none.
    ZeroOrMore,
    /// `?` — at most one occurrence.
    ZeroOrOne,
    /// `{n}` — exactly `n` occurrences.
    Exactly(u32),
}

impl PatternQuantifier {
    /// Whether the element may be absent from a match.
    ///
    /// Only `*` and `?` count as optional; `Exactly(_)` is never reported as
    /// optional, whatever its count.
    fn is_optional(&self) -> bool {
        match self {
            PatternQuantifier::ZeroOrMore | PatternQuantifier::ZeroOrOne => true,
            PatternQuantifier::One
            | PatternQuantifier::OneOrMore
            | PatternQuantifier::Exactly(_) => false,
        }
    }
}

/// Collect the pattern variables of a MATCH command, in source order, and
/// assign each one a fresh label.
///
/// A single `LabelGenerator` is shared across all top-level patterns so labels
/// are unique within the command.
///
/// # Errors
/// Stops at the first pattern that cannot be extracted.
fn extract_pattern_variables(
    patterns: &[TypedPattern],
) -> Result<Vec<PatternVariable>, Arc<TranslationError>> {
    let mut collected = Vec::new();
    let mut labels = LabelGenerator::new();

    patterns
        .iter()
        .try_for_each(|pattern| extract_from_pattern(pattern, &mut collected, &mut labels))?;

    Ok(collected)
}

/// Recursively extract variables from a pattern.
fn extract_from_pattern(
    pattern: &TypedPattern,
    variables: &mut Vec<PatternVariable>,
    label_gen: &mut LabelGenerator,
) -> Result<(), Arc<TranslationError>> {
    match pattern {
        TypedPattern::Quantified(quant) => {
            let (alias, table) = extract_alias_and_table(quant)?;
            let quantifier = convert_quantifier(&quant.quantifier);

            variables.push(PatternVariable {
                alias,
                table,
                label: label_gen.next(),
                quantifier,
            });
        }
        TypedPattern::Nested(nested) => {
            for sub_pattern in &nested.patterns {
                extract_from_pattern(sub_pattern, variables, label_gen)?;
            }
        }
        TypedPattern::Error(err) => {
            return Err(err.clone());
        }
    }

    Ok(())
}

/// Return `(alias, table)` for the source of a quantified pattern.
///
/// For an aliased source both names come from the clause; for a bare table
/// reference the table name doubles as the alias. An error clause is returned
/// as the function's error.
fn extract_alias_and_table(
    quant: &hamelin_lib::tree::typed_ast::pattern::TypedQuantifiedPattern,
) -> Result<(String, String), Arc<TranslationError>> {
    use hamelin_lib::tree::typed_ast::clause::TypedFromClause;

    let names = match &quant.typed_from {
        TypedFromClause::Alias(aliased) => {
            let alias = aliased.alias.valid_ref()?.to_string();
            let table = aliased.ast.table.identifier.valid_ref()?.to_string();
            (alias, table)
        }
        TypedFromClause::Reference(reference) => {
            // No explicit alias: the table name stands in for it.
            let table = reference.ast.identifier.valid_ref()?.to_string();
            (table.clone(), table)
        }
        TypedFromClause::Error(err) => return Err(err.clone()),
    };

    Ok(names)
}

/// Map an AST quantifier onto [`PatternQuantifier`].
///
/// `{1}` is normalised to `One`. A `{n}` whose count does not parse as `u32`,
/// and an error quantifier, both fall back to `One` — this function never
/// fails.
fn convert_quantifier(
    quantifier: &Arc<hamelin_lib::tree::ast::pattern::Quantifier>,
) -> PatternQuantifier {
    match &quantifier.kind {
        QuantifierKind::AtLeastOne => PatternQuantifier::OneOrMore,
        QuantifierKind::AnyNumber => PatternQuantifier::ZeroOrMore,
        QuantifierKind::ZeroOrOne => PatternQuantifier::ZeroOrOne,
        QuantifierKind::Exactly(n) => match n.parse::<u32>() {
            // `{1}` and unparsable counts are both treated as "no quantifier".
            Ok(1) | Err(_) => PatternQuantifier::One,
            Ok(count) => PatternQuantifier::Exactly(count),
        },
        QuantifierKind::Error(_) => PatternQuantifier::One,
    }
}

/// Label generator: a, b, ..., z, A, B, ..., Z, aa, ab, ..., zz, aaa, aab, ...
///
/// Every label consists solely of ASCII letters, so it can be embedded in the
/// generated regex without escaping, and no label is ever produced twice.
struct LabelGenerator {
    /// Zero-based position of the next label in the sequence.
    index: usize,
}

impl LabelGenerator {
    /// Create a generator positioned at the first label (`a`).
    fn new() -> Self {
        Self { index: 0 }
    }

    /// Return the next label and advance the generator.
    ///
    /// Positions 0–25 yield `a`–`z`, 26–51 yield `A`–`Z`. Later positions
    /// yield lowercase words of growing width (`aa`..`zz`, then `aaa`..`zzz`,
    /// and so on), enumerated in lexicographic order within each width. The
    /// width grows instead of letting the leading character run past `z`,
    /// which would emit regex metacharacters such as `{` and eventually
    /// overflow `u8`.
    fn next(&mut self) -> String {
        const LETTERS: usize = 26;

        let position = self.index;
        self.index += 1;

        // `as u8` below is lossless: the operand is always < 26.
        if position < LETTERS {
            return char::from(b'a' + position as u8).to_string();
        }
        if position < 2 * LETTERS {
            return char::from(b'A' + (position - LETTERS) as u8).to_string();
        }

        // Find the word width this position falls into and the offset inside it.
        let mut offset = position - 2 * LETTERS;
        let mut width = 2;
        let mut words_of_width = LETTERS * LETTERS;
        while offset >= words_of_width {
            offset -= words_of_width;
            width += 1;
            words_of_width = words_of_width.saturating_mul(LETTERS);
        }

        // Write `offset` as a fixed-width base-26 number, least significant
        // letter last.
        let mut letters = vec![b'a'; width];
        for slot in letters.iter_mut().rev() {
            *slot = b'a' + (offset % LETTERS) as u8;
            offset /= LETTERS;
        }
        letters.into_iter().map(char::from).collect()
    }
}

// ============================================================================
// Pattern to Regex Conversion
// ============================================================================

/// Build the regex string that recognises `patterns`.
///
/// The regex is written against a state string of comma-separated labels, so
/// it contains the separating commas explicitly.
fn pattern_to_regex(
    patterns: &[TypedPattern],
    variables: &[PatternVariable],
) -> Result<String, Arc<TranslationError>> {
    collect_pattern_elements(patterns, variables)
        .map(|elements| build_regex_from_elements(&elements))
}

/// A flattened pattern element for regex generation.
///
/// Built by `collect_from_pattern` from the `PatternVariable` at the matching
/// position; it carries only what the regex builder needs.
struct PatternElement {
    /// Regex label of the element (copied from the pattern variable).
    label: String,
    /// Quantifier of the element (copied from the pattern variable).
    quantifier: PatternQuantifier,
}

/// Flatten `patterns` into the ordered list of elements used for regex
/// generation.
///
/// `variables` must be the list previously extracted from the same patterns:
/// elements are paired with variables by position, using a running index
/// shared across all top-level patterns.
fn collect_pattern_elements(
    patterns: &[TypedPattern],
    variables: &[PatternVariable],
) -> Result<Vec<PatternElement>, Arc<TranslationError>> {
    let mut collected = Vec::new();
    let mut next_variable = 0;

    patterns.iter().try_for_each(|pattern| {
        collect_from_pattern(pattern, variables, &mut next_variable, &mut collected)
    })?;

    Ok(collected)
}

fn collect_from_pattern(
    pattern: &TypedPattern,
    variables: &[PatternVariable],
    var_index: &mut usize,
    elements: &mut Vec<PatternElement>,
) -> Result<(), Arc<TranslationError>> {
    match pattern {
        TypedPattern::Quantified(_) => {
            if let Some(var) = variables.get(*var_index) {
                elements.push(PatternElement {
                    label: var.label.clone(),
                    quantifier: var.quantifier,
                });
                *var_index += 1;
            }
        }
        TypedPattern::Nested(_) => {
            return Err(TranslationError::fatal(
                "lower_match",
                "nested pattern groups with quantifiers (e.g., (a b)+) are not yet supported \
                 — use flat patterns instead (e.g., a+ b+)"
                    .into(),
            )
            .into());
        }
        TypedPattern::Error(_) => {}
    }
    Ok(())
}

/// Build regex string from pattern elements.
///
/// The regex is anchored at the start (`^`) and matched against a state string
/// of comma-separated labels (e.g. `a,a,b`). Commas are therefore part of the
/// regex, and where they go depends on an element's position:
///
/// - elements *before* the last required element carry a trailing comma
///   (`a,`, `(a,)+`, `(a,)*`, `(a,)?`);
/// - the last required element is written bare (`a`, `a(,a)*`, `a,a,a`);
/// - optional elements *after* it carry a leading comma inside an optional
///   group (`(,a)?`, `(,a(,a)*)?`).
///
/// Two special cases:
///
/// - `{0}` elements can never consume a row, so they are ignored entirely and
///   do not influence comma placement of their neighbours.
/// - When *every* element is optional there is no required element to hang the
///   leading commas on (a state string never starts with a comma). The regex
///   is then an optional alternation over which element comes first, e.g.
///   `a? b*` becomes `^(a(,b(,b)*)?|b(,b)*)?`.
///
/// Returns an empty string for an empty `elements` slice, and `^` when all
/// elements are `{0}`.
fn build_regex_from_elements(elements: &[PatternElement]) -> String {
    /// At least one occurrence of the element, with no comma before or after.
    fn bare(elem: &PatternElement) -> String {
        let label = elem.label.as_str();
        match elem.quantifier {
            PatternQuantifier::One | PatternQuantifier::ZeroOrOne => label.to_string(),
            PatternQuantifier::OneOrMore | PatternQuantifier::ZeroOrMore => {
                format!("{}(,{})*", label, label)
            }
            PatternQuantifier::Exactly(n) => vec![label; n as usize].join(","),
        }
    }

    /// The element followed by a comma, for positions where more labels must follow.
    fn with_trailing_comma(elem: &PatternElement) -> String {
        let label = elem.label.as_str();
        match elem.quantifier {
            PatternQuantifier::One => format!("{},", label),
            PatternQuantifier::OneOrMore => format!("({},)+", label),
            PatternQuantifier::ZeroOrMore => format!("({},)*", label),
            PatternQuantifier::ZeroOrOne => format!("({},)?", label),
            PatternQuantifier::Exactly(n) => format!("{},", label).repeat(n as usize),
        }
    }

    /// An optional element appended after something that already matched.
    fn optional_suffix(elem: &PatternElement) -> String {
        format!("(,{})?", bare(elem))
    }

    if elements.is_empty() {
        return String::new();
    }

    // `{0}` contributes nothing to the match; drop it up front so it cannot
    // force a dangling comma onto its neighbours.
    let active: Vec<&PatternElement> = elements
        .iter()
        .filter(|elem| !matches!(elem.quantifier, PatternQuantifier::Exactly(0)))
        .collect();

    // Index of the first element of the trailing all-optional run
    // (0 when every element is optional).
    let tail_start = active
        .iter()
        .rposition(|elem| !elem.quantifier.is_optional())
        .map_or(0, |last_required| last_required + 1);

    let mut regex = String::from("^");

    if tail_start == 0 {
        if !active.is_empty() {
            // Everything is optional: enumerate which element appears first.
            let alternatives: Vec<String> = (0..active.len())
                .map(|first| {
                    let mut alternative = bare(active[first]);
                    for later in &active[first + 1..] {
                        alternative.push_str(&optional_suffix(later));
                    }
                    alternative
                })
                .collect();
            regex.push('(');
            regex.push_str(&alternatives.join("|"));
            regex.push_str(")?");
        }
        return regex;
    }

    let last_required = tail_start - 1;
    for (position, elem) in active.iter().enumerate() {
        let part = if position < last_required {
            with_trailing_comma(elem)
        } else if position == last_required {
            bare(elem)
        } else {
            optional_suffix(elem)
        };
        regex.push_str(&part);
    }

    regex
}

/// Determine which labels may appear on the first row of a match.
///
/// Those are the labels of every leading optional element plus the label of
/// the first non-optional element. For `A? B` that is both `a` and `b`; for
/// `A B` it is only `a`. When all elements are optional, every label is
/// returned.
///
/// The result feeds the WHERE filter that restricts candidate start rows.
fn find_starting_pattern_labels(
    patterns: &[TypedPattern],
    variables: &[PatternVariable],
) -> Result<Vec<String>, Arc<TranslationError>> {
    let elements = collect_pattern_elements(patterns, variables)?;

    // Number of elements up to and including the first required one.
    let reachable = elements
        .iter()
        .position(|elem| !elem.quantifier.is_optional())
        .map_or(elements.len(), |first_required| first_required + 1);

    Ok(elements
        .iter()
        .take(reachable)
        .map(|elem| elem.label.clone())
        .collect())
}

// ============================================================================
// Pipeline Generation
// ============================================================================

/// A synthetic window column materialised to support one or more MATCH AGG
/// aggregate calls.
///
/// For each aggregate call `agg_fn(<arg>)` found inside a MATCH AGG
/// assignment we emit `__agg_values_<suffix> = array_agg(<arg>)` in the
/// generated WINDOW command and later slice it by `__match_length`. The
/// original AST expression is preserved verbatim so qualified field
/// references (e.g. `sf.event_id`) and arbitrary row-level computations
/// (e.g. `if(is_admin, 1, 0)`) re-typecheck correctly.
///
/// Columns are allocated by `AggLoweringState::column_for`, which reuses an
/// existing column when the same argument expression is seen again.
#[derive(Debug, Clone)]
struct SyntheticAggColumn {
    /// Full field name, e.g. `__agg_values_event_id`, `__agg_values_sf__event_id`,
    /// or `__agg_values_0` for complex expressions.
    name: String,
    /// The row-level AST expression to wrap in `array_agg(...)` inside the WINDOW.
    source_expr: Arc<Expression>,
}

/// State shared across the per-AGG-assignment expression rewrites.
///
/// Tracks the deduplicated set of synthetic `__agg_values_<suffix>` window
/// columns we need to materialize, plus any errors emitted while walking the
/// expression trees. Dedupe is keyed on the pretty-printed (`Display`) form of
/// the aggregate's argument, so `sum(x)` and `count(x)` share a single window
/// array.
#[derive(Default)]
struct AggLoweringState {
    /// Ordered list of synthetic window columns to materialize.
    columns: Vec<SyntheticAggColumn>,
    /// Dedup: pretty-printed (Display) form of the argument expression → index into `columns`.
    seen: HashMap<String, usize>,
    /// Counter for fallback suffixes when the argument isn't a simple field chain
    /// (or when its readable suffix is already taken).
    counter: usize,
    /// Errors collected while walking the expression (reported after `cata` completes).
    /// `lower_agg_assignments` surfaces only the first one.
    errors: Vec<Arc<TranslationError>>,
}

impl AggLoweringState {
    /// Return the `__agg_values_<suffix>` column name for the given argument
    /// expression, allocating a new column if this argument hasn't been seen
    /// yet. Identical arguments always return the same column name.
    fn column_for(&mut self, arg_ast: &Arc<Expression>) -> String {
        // Use Display rather than Debug for the dedup key: Debug serializes
        // every `Span` field, so two occurrences of `user` at different
        // source positions would hash differently and fail to share a
        // `__agg_values_user` column. Display is the pretty-printed form
        // which ignores spans.
        let key = format!("{}", arg_ast);
        if let Some(&idx) = self.seen.get(&key) {
            return self.columns[idx].name.clone();
        }

        // Prefer a human-readable suffix for simple field chains. If that
        // suffix would collide with a previously allocated column for a
        // *different* source expression (possible with unusual field names
        // containing underscores), fall back to a counter.
        let candidate_suffix = infer_readable_suffix(arg_ast.as_ref());
        let suffix = match candidate_suffix {
            Some(s) if !self.suffix_taken(&s) => s,
            _ => loop {
                let candidate = self.counter.to_string();
                self.counter += 1;
                if !self.suffix_taken(&candidate) {
                    break candidate;
                }
            },
        };
        let name = format!("__agg_values_{}", suffix);
        let idx = self.columns.len();
        self.columns.push(SyntheticAggColumn {
            name: name.clone(),
            source_expr: arg_ast.clone(),
        });
        self.seen.insert(key, idx);
        name
    }

    fn suffix_taken(&self, suffix: &str) -> bool {
        let target = format!("__agg_values_{}", suffix);
        self.columns.iter().any(|c| c.name == target)
    }
}

/// Derive a human-readable column suffix from an aggregate argument.
///
/// Only plain field references (`event_id`) and field-lookup chains
/// (`sf.event_id`, `a.b.c`) qualify; compound names are flattened with `__`
/// (`sf__event_id`). Every other expression shape yields `None`, and the
/// caller falls back to a numbered suffix.
///
/// The AST kind is checked *before* calling
/// [`Identifier::infer_from_expression`] on purpose: that helper also sees
/// through `Cast`, `TsTrunc`, and zero-arg function wrappers, which would let
/// distinct expressions such as `x` and `cast(x, int)` end up with the same
/// suffix and collide in the WINDOW.
fn infer_readable_suffix(arg_ast: &Expression) -> Option<String> {
    let is_field_chain = matches!(
        &arg_ast.kind,
        ExpressionKind::FieldReference(_) | ExpressionKind::FieldLookup(_)
    );
    if !is_field_chain {
        return None;
    }

    let identifier = Identifier::infer_from_expression(arg_ast)?.valid().ok()?;
    Some(match identifier {
        Identifier::Compound(compound) => compound
            .parts()
            .iter()
            .map(|part| part.as_str().to_string())
            .collect::<Vec<_>>()
            .join("__"),
        Identifier::Simple(simple) => simple.as_str().to_string(),
    })
}

/// Algebra that rewrites a typed AGG expression into a lowered AST
/// expression, inlining per-aggregate-function rewrites for any aggregate
/// call it encounters (at any depth) and leaving the scalar structure
/// around them untouched.
///
/// - Each aggregate call `agg_fn(<arg>)` is replaced with the appropriate
///   slice-based form (e.g. `max` → `max(slice(__agg_values_N, 0, __match_length))`,
///   `first` → `get(slice(...), 0)`), and its argument is registered as a
///   synthetic window column via [`AggLoweringState::column_for`].
/// - Non-aggregate nodes use the default `MapExpressionAlgebra` behaviour,
///   which rebuilds the AST from its transformed children. This means scalar
///   wrappers like `array_distinct(array_agg(x))` or `max(...) > 0` flow
///   through unchanged.
///
/// This makes MATCH AGG consistent with regular AGG: any expression that
/// would parse in a plain `AGG ... BY ...` clause can also be used here.
///
/// Rewrite failures do not abort the traversal: they are appended to
/// `state.errors` and the offending node's original AST is kept in place.
struct AggRewriteAlgebra<'a> {
    /// Lowering state shared by all AGG assignments of one MATCH command.
    state: &'a mut AggLoweringState,
}

impl MapExpressionAlgebra for AggRewriteAlgebra<'_> {
    /// Rewrite one function application.
    ///
    /// Non-aggregate calls are rebuilt from their already-transformed
    /// children. Aggregate calls (see `is_match_aggregate`) are replaced by
    /// their sliced-array form via `rewrite_aggregate`. On any failure the
    /// error is recorded in `state.errors` and the node's original AST is
    /// returned so traversal can continue.
    fn apply(
        &mut self,
        node: &TypedApply,
        expr: &TypedExpression,
        children: ParameterBinding<Arc<Expression>>,
    ) -> Arc<Expression> {
        // Aggregates inside MATCH AGG are resolved to the
        // `SpecialPosition::Match` variants during type-checking;
        // `SpecialPosition::Agg` is accepted as well, defensively. Everything
        // else is an ordinary scalar call and just gets its children swapped in.
        if !is_match_aggregate(node) {
            return node.replace_children_ast(expr, children);
        }

        // `max(count(x))` and friends cannot be lowered: the inner aggregate
        // is a per-match scalar, so there is no per-row value to feed into
        // `array_agg` for the outer one.
        let has_nested_aggregate = match node.parameter_binding.get_by_index(0) {
            Ok(first_arg) => first_arg
                .find(&mut |inner: &TypedExpression| match &inner.kind {
                    TypedExpressionKind::Apply(inner_apply) => is_match_aggregate(inner_apply),
                    _ => false,
                })
                .is_some(),
            Err(_) => false,
        };
        if has_nested_aggregate {
            self.state.errors.push(Arc::new(TranslationError::msg(
                expr,
                "aggregates nested inside another aggregate are not supported in MATCH AGG",
            )));
            return expr.ast.clone();
        }

        let func_name = node.function_def.name().to_lowercase();

        // The *original* argument AST is what gets pushed into
        // `array_agg(...)`. `children` holds the transformed form, which is
        // the same thing for a purely row-level argument (nested aggregates
        // were rejected above).
        let arg_ast = node
            .parameter_binding
            .get_by_index(0)
            .ok()
            .map(|typed_arg| typed_arg.ast.clone());

        rewrite_aggregate(&func_name, arg_ast, self.state).unwrap_or_else(|msg| {
            self.state
                .errors
                .push(Arc::new(TranslationError::msg(expr, &msg)));
            expr.ast.clone()
        })
    }
}

/// Tell whether `node` calls an aggregate that the MATCH AGG rewrite must
/// lower, i.e. a function whose special position is `SpecialPosition::Match`
/// or `SpecialPosition::Agg`.
fn is_match_aggregate(node: &TypedApply) -> bool {
    let position = node.function_def.special_position();
    matches!(position, Some(SpecialPosition::Match))
        || matches!(position, Some(SpecialPosition::Agg))
}

/// Lower one aggregate call to an expression over the sliced window arrays.
///
/// `func_name` is the lower-cased aggregate name. `arg_ast` is the original
/// row-level AST of the first positional argument, or `None` for a
/// zero-argument call such as `count()`.
///
/// When an argument is present, a synthetic window column is allocated (or
/// reused) for it through `state.column_for` before the function name is
/// examined. The rewritten expression then reads
/// `slice(<column>, 0, __match_length)`, i.e. only the rows that belong to
/// the match.
///
/// # Errors
/// Returns a message when the function is not one of the supported
/// aggregates, or when a function other than `count` is called without an
/// argument.
fn rewrite_aggregate(
    func_name: &str,
    arg_ast: Option<Arc<Expression>>,
    state: &mut AggLoweringState,
) -> Result<Arc<Expression>, String> {
    use hamelin_lib::tree::builder::{call, cast, field_ref, int, subtract};
    use hamelin_lib::types::{array::Array, Type};

    // slice(<column>, 0, __match_length): the part of a window array that
    // belongs to the matched rows.
    let matched = |column: &str| {
        call("slice")
            .arg(field_ref(column))
            .arg(int(0))
            .arg(field_ref("__match_length"))
    };
    // Same slice, cast to array(double) for the numeric aggregates.
    let matched_as_doubles =
        |column: &str| cast(matched(column), Type::Array(Array::new(Type::Double)));

    // Only the zero-argument form gets no column; it counts rows through the
    // always-present `__labels_array` instead.
    let column = arg_ast.as_ref().map(|ast| state.column_for(ast));

    let lowered = match (func_name, column.as_deref()) {
        // count(): number of matched rows.
        ("count", None) => call("len").arg(matched("__labels_array")).build(),
        // count(x): number of non-null x among the matched rows.
        ("count", Some(col)) => call("len")
            .arg(call("filter_null").arg(matched(col)))
            .build(),
        // sum(x) / avg(x): aggregate over the slice cast to array(double).
        ("sum", Some(col)) => call("sum").arg(matched_as_doubles(col)).build(),
        ("avg", Some(col)) => call("avg").arg(matched_as_doubles(col)).build(),
        // max(x) / min(x): same function applied to the slice.
        ("max", Some(col)) | ("min", Some(col)) => call(func_name).arg(matched(col)).build(),
        // first(x): element 0 of the slice.
        ("first", Some(col)) => call("get").arg(matched(col)).arg(int(0)).build(),
        // last(x): element __match_length - 1 of the slice.
        ("last", Some(col)) => call("get")
            .arg(matched(col))
            .arg(subtract(field_ref("__match_length"), int(1)))
            .build(),
        // array_agg(x): the slice itself.
        ("array_agg", Some(col)) => matched(col).build(),
        // count_distinct(x): len(array_distinct(filter_null(slice))).
        ("count_distinct", Some(col)) => call("len")
            .arg(call("array_distinct").arg(call("filter_null").arg(matched(col))))
            .build(),
        // Any other function called without an argument.
        (_, None) => return Err(format!("{} requires an argument", func_name)),
        // A function this lowering does not know how to express.
        (other, _) => return Err(format!("unsupported MATCH AGG function: {}", other)),
    };

    Ok(Arc::new(lowered))
}

/// Result of lowering the AGG clause: synthetic window columns plus rewritten
/// assignments, one per original AGG entry.
struct LoweredAgg {
    /// Synthetic `__agg_values_*` columns to materialize, in allocation order.
    columns: Vec<SyntheticAggColumn>,
    /// `(target, rewritten expression)` pairs, in the order the AGG entries
    /// were written.
    assignments: Vec<(Identifier, Arc<Expression>)>,
}

/// Rewrite every AGG assignment of `match_cmd` with [`AggRewriteAlgebra`].
///
/// All assignments share one [`AggLoweringState`], so identical aggregate
/// arguments in different assignments reuse the same synthetic window column.
///
/// # Errors
/// Fails immediately on an invalid assignment target. Rewrite errors are
/// gathered across all assignments first; if any occurred, the first one is
/// returned.
fn lower_agg_assignments(
    match_cmd: &TypedMatchCommand,
) -> Result<LoweredAgg, Arc<TranslationError>> {
    let mut state = AggLoweringState::default();
    let mut assignments = Vec::new();

    for entry in &match_cmd.agg.assignments {
        let target = entry.identifier.clone().valid()?;
        let lowered = entry
            .expression
            .cata(&mut AggRewriteAlgebra { state: &mut state });
        assignments.push((target, lowered));
    }

    match state.errors.into_iter().next() {
        Some(first_error) => Err(first_error),
        None => Ok(LoweredAgg {
            columns: state.columns,
            assignments,
        }),
    }
}

/// Build the pipeline of native commands that replaces a MATCH command.
///
/// The generated pipeline is, in order:
///
/// 1. `FROM` with one aliased source per entry in `pattern_vars`.
/// 2. `SET __pattern_label = case(...)`, tagging each row with the label of
///    the pattern variable whose alias is non-null.
/// 3. `WHERE __pattern_label IS NOT NULL`.
/// 4. `WINDOW` collecting the labels (plus one `array_agg` per synthetic AGG
///    column), carrying over WITHIN, BY and SORT from the MATCH command.
/// 5. With AGG only: `SET __state`, `SET __match_length`, then one `SET` with
///    every rewritten AGG assignment.
/// 6. `WHERE` keeping rows whose `__state` matches `regex_pattern`, further
///    restricted to `starting_labels` when that slice is non-empty.
/// 7. `DROP` of every synthetic column introduced above.
///
/// # Errors
///
/// Propagates any error from [`lower_agg_assignments`].
fn build_lowered_pipeline(
    match_cmd: &TypedMatchCommand,
    pattern_vars: &[PatternVariable],
    regex_pattern: &str,
    starting_labels: &[String],
) -> Result<hamelin_lib::tree::ast::pipeline::Pipeline, Arc<TranslationError>> {
    use hamelin_lib::tree::builder::{
        and, call, eq, field_ref, in_, is_not_null, pair, pipeline, sort_command, string, tuple,
    };

    // AGG expressions become synthetic window columns plus rewritten assignments.
    let lowered_agg = lower_agg_assignments(match_cmd)?;
    let has_agg = !lowered_agg.assignments.is_empty();

    // Step 2 expression: case(<alias> IS NOT NULL: '<label>', ...).
    // case() is a function that takes pairs as arguments.
    let label_case = pattern_vars.iter().fold(call("case"), |case_expr, var| {
        case_expr.arg(pair(
            is_not_null(field_ref(var.alias.as_str())),
            string(&var.label),
        ))
    });

    // Steps 1-3: FROM all pattern sources, label each row, discard unlabeled rows.
    let mut pipe = pipeline()
        .from(|sources| {
            pattern_vars.iter().fold(sources, |acc, var| {
                acc.table_alias(var.alias.as_str(), var.table.as_str())
            })
        })
        .set_cmd(|set| set.named_field("__pattern_label", label_case))
        .where_cmd(is_not_null(field_ref("__pattern_label")));

    // Step 4: WINDOW with state tracking and array collection for AGG.
    // WINDOW uses prepend_overwrite, which puts new columns (window expressions
    // + BY) before the base columns, so the output order is AGG → BY → base.
    //
    // IMPORTANT: never emit duplicate window expressions such as
    //   WINDOW __state = array_join(array_agg(x), ','), __arr = array_agg(x)
    // because that triggers a DataFusion optimizer bug (push_down_filter panics
    // on duplicate window exprs). Hence:
    //   - with AGG: the window yields only __labels_array; __state is derived
    //     by a SET afterwards;
    //   - without AGG: array_join is pushed into the window and __state is
    //     produced directly.
    let collected_labels = call("array_agg").arg(field_ref("__pattern_label"));
    let mut window = if has_agg {
        self::window_command().named_field("__labels_array", collected_labels)
    } else {
        self::window_command().named_field(
            "__state",
            call("array_join").arg(collected_labels).arg(string(",")),
        )
    };

    // One value array per synthetic AGG column. The original AST expression is
    // passed through (not a rebuilt field_ref) so that qualified references
    // like `sf.event_id` and arbitrary row-level expressions like
    // `if(is_admin, 1, 0)` re-typecheck correctly.
    if has_agg {
        window = lowered_agg.columns.iter().fold(window, |acc, col| {
            acc.named_field(
                col.name.as_str(),
                call("array_agg").arg(col.source_expr.clone()),
            )
        });
    }

    // WITHIN is forwarded unchanged.
    if let Some(within) = &match_cmd.within {
        window = window.within(within.ast.as_ref().clone());
    }

    // BY columns partition the window. Assignments whose identifier is not
    // valid are skipped here.
    for by in &match_cmd.group_by.assignments {
        let Ok(id) = by.identifier.valid_ref() else {
            continue;
        };
        window = window.group_by(id.clone(), by.expression.ast.as_ref().clone());
    }

    // SORT is forwarded key by key, keeping each key's direction.
    if !match_cmd.sort.is_empty() {
        let mut ordering = sort_command();
        for key in &match_cmd.sort {
            let key_expr = key.ast.expression.as_ref().clone();
            ordering = match key.order {
                SortOrder::Asc => ordering.asc(key_expr),
                SortOrder::Desc => ordering.desc(key_expr),
            };
        }
        window = window.sort(ordering);
    }

    pipe = pipe.window(|_| window);

    // Step 5 (AGG only): derive __state, measure the match, evaluate the AGGs.
    if has_agg {
        // SET __state = array_join(__labels_array, ',')
        let joined_labels = call("array_join")
            .arg(field_ref("__labels_array"))
            .arg(string(","));

        // SET __match_length = len(split(regexp_extract(__state, '<pattern>'), ','))
        let matched_prefix = call("regexp_extract")
            .arg(field_ref("__state"))
            .arg(string(regex_pattern));
        let match_length = call("len").arg(call("split").arg(matched_prefix).arg(string(",")));

        pipe = pipe
            .set_cmd(|set| set.named_field("__state", joined_labels))
            .set_cmd(|set| set.named_field("__match_length", match_length));

        // A single SET holds every AGG expression. named_field() appends in
        // insertion order, so forward iteration keeps the user's AGG ordering.
        let mut remaining = lowered_agg.assignments.iter();
        if let Some((first_target, first_expr)) = remaining.next() {
            let seed: hamelin_lib::tree::builder::SetCommandBuilder =
                hamelin_lib::tree::builder::set_command()
                    .named_field(first_target.clone(), first_expr.clone());
            let agg_set = remaining.fold(seed, |acc, (target, expr)| {
                acc.named_field(target.clone(), expr.clone())
            });
            pipe = pipe.set_cmd(|_| agg_set);
        }
    }

    // Step 6: keep only rows whose label sequence matches the pattern.
    let state_matches = call("regexp_like")
        .arg(field_ref("__state"))
        .arg(string(regex_pattern));

    pipe = match starting_labels {
        // No starting label known: the regex alone decides.
        [] => pipe.where_cmd(state_matches),
        // Single starting label: __pattern_label = 'x' AND regexp_like(...)
        [only] => pipe.where_cmd(and(
            eq(field_ref("__pattern_label"), string(only)),
            state_matches,
        )),
        // Multiple starting labels: __pattern_label IN ('a', 'b') AND regexp_like(...)
        several => {
            let candidates = several
                .iter()
                .fold(tuple(), |members, label| members.element(string(label)));
            pipe.where_cmd(and(
                in_(field_ref("__pattern_label"), candidates),
                state_matches,
            ))
        }
    };

    // Step 7: DROP the synthetic columns. The AGG-only helpers go into a
    // second DROP because they exist only when an AGG clause was present.
    let mut lowered = pipe.drop(|d| d.field("__pattern_label").field("__state"));

    if has_agg {
        lowered = lowered.drop(|d| {
            lowered_agg.columns.iter().fold(
                d.field("__labels_array").field("__match_length"),
                |acc, col| acc.field(col.name.as_str()),
            )
        });
    }

    Ok(lowered.build())
}

#[cfg(test)]
mod tests {
    use hamelin_lib::type_check_with_provider;

    use super::*;

    /// Shorthand for a pattern element with the given label and quantifier.
    fn element(label: &str, quantifier: PatternQuantifier) -> PatternElement {
        PatternElement {
            label: label.to_string(),
            quantifier,
        }
    }

    #[test]
    fn test_label_generator() {
        let mut labels = LabelGenerator::new();
        assert_eq!(labels.next(), "a");
        assert_eq!(labels.next(), "b");

        // Discard the 24 labels that sit between "b" and "A".
        for _ in 0..24 {
            labels.next();
        }
        assert_eq!(labels.next(), "A");

        // Discard the 25 labels that sit between "A" and the first
        // two-character label.
        for _ in 0..25 {
            labels.next();
        }
        assert_eq!(labels.next(), "aa");
        assert_eq!(labels.next(), "ab");
    }

    #[test]
    fn test_pattern_to_regex_single() {
        let pattern = vec![element("a", PatternQuantifier::OneOrMore)];
        assert_eq!(build_regex_from_elements(&pattern), "^a(,a)*");
    }

    #[test]
    fn test_pattern_to_regex_sequence() {
        let pattern = vec![
            element("a", PatternQuantifier::OneOrMore),
            element("b", PatternQuantifier::OneOrMore),
        ];
        assert_eq!(build_regex_from_elements(&pattern), "^(a,)+b(,b)*");
    }

    #[test]
    fn test_pattern_to_regex_optional_end() {
        let pattern = vec![
            element("a", PatternQuantifier::One),
            element("b", PatternQuantifier::ZeroOrOne),
        ];
        assert_eq!(build_regex_from_elements(&pattern), "^a(,b)?");
    }

    #[test]
    fn agg_column_counter_suffix_avoids_collision_with_readable_numeric_field() {
        use hamelin_lib::tree::builder::{field_ref, int};

        let mut lowering = AggLoweringState::default();

        // A field literally named `1` claims the readable name `__agg_values_1`.
        let numeric_field = Arc::new(field_ref("1").build());
        assert_eq!(lowering.column_for(&numeric_field), "__agg_values_1");

        // The first literal takes the first counter-based name.
        let zero = Arc::new(int(0).build());
        assert_eq!(lowering.column_for(&zero), "__agg_values_0");

        // The counter would naturally emit `1` next; since `__agg_values_1` is
        // already taken, the next free suffix must be used instead.
        let one = Arc::new(int(1).build());
        assert_eq!(lowering.column_for(&one), "__agg_values_2");
    }

    #[test]
    fn test_no_match_passthrough() -> Result<(), Arc<TranslationError>> {
        use hamelin_lib::{
            provider::EnvironmentProvider,
            tree::{
                ast::identifier::{Identifier, SimpleIdentifier as AstSimpleIdentifier},
                builder::{eq, field_ref, pipeline, query},
            },
            types::{struct_type::Struct, INT},
        };

        /// Provider that knows a single table, `events`.
        #[derive(Debug)]
        struct MockProvider;

        impl EnvironmentProvider for MockProvider {
            fn reflect_columns(&self, name: &Identifier) -> anyhow::Result<Struct> {
                let events: Identifier = AstSimpleIdentifier::new("events").into();
                if name != &events {
                    anyhow::bail!("Table not found: {}", name)
                }

                Ok(Struct::default()
                    .with_str("timestamp", INT)
                    .with_str("value", INT))
            }

            fn reflect_datasets(&self) -> anyhow::Result<Vec<Identifier>> {
                Ok(vec![])
            }
        }

        // A plain FROM | WHERE pipeline contains no MATCH command.
        let plain_query = query().main(
            pipeline()
                .from(|f| f.table_reference("events"))
                .where_cmd(eq(field_ref("value"), 10)),
        );
        let statement =
            type_check_with_provider(plain_query.build(), Arc::new(MockProvider)).output;

        assert!(!statement_has_match(&statement)?);
        Ok(())
    }

    #[test]
    fn match_agg_compound_targets_grp_struct_in_output_schema() {
        use hamelin_lib::{
            func::registry::FunctionRegistry,
            parse,
            provider::EnvironmentProvider,
            tree::ast::identifier::{Identifier, SimpleIdentifier as AstSimpleIdentifier},
            types::{struct_type::Struct, Type, STRING, TIMESTAMP},
        };

        /// Provider that knows a single table, `test_t`.
        #[derive(Debug)]
        struct MatchAggProvider;

        impl EnvironmentProvider for MatchAggProvider {
            fn reflect_columns(&self, name: &Identifier) -> anyhow::Result<Struct> {
                let test_t: Identifier = AstSimpleIdentifier::new("test_t").into();
                if name != &test_t {
                    anyhow::bail!("Table not found: {}", name)
                }

                Ok(Struct::default()
                    .with_str("timestamp", TIMESTAMP)
                    .with_str("host", STRING))
            }

            fn reflect_datasets(&self) -> anyhow::Result<Vec<Identifier>> {
                Ok(vec![])
            }
        }

        // MATCH must be the first command in its pipeline (no leading pipe before it).
        let src = r#"MATCH a=test_t+ b=test_t+
  AGG grp.lo = min(timestamp), grp.hi = max(timestamp)
  BY host
  WITHIN 5s"#;

        let provider = Arc::new(MatchAggProvider);
        let parsed = parse(src)
            .into_result()
            .expect("MATCH + compound AGG fixture should parse");
        let typed = type_check_with_provider(parsed, provider.clone())
            .into_result()
            .expect("MATCH + compound AGG fixture should type-check");

        let mut ctx =
            StatementTranslationContext::new(Arc::new(FunctionRegistry::default()), provider);
        let lowered = lower_match(Arc::new(typed), &mut ctx)
            .expect("lower_match should succeed for compound AGG targets");

        // The compound targets `grp.lo` / `grp.hi` must surface as one struct
        // column named `grp`.
        let schema = lowered.pipeline.schema();
        let Some(grp_ty) = schema.lookup(&AstSimpleIdentifier::new("grp")) else {
            panic!(
                "output schema should include struct `grp`; have columns {:?}",
                schema
                    .iter()
                    .map(|(k, _)| k.as_str().to_string())
                    .collect::<Vec<_>>()
            )
        };
        let grp_struct = match grp_ty {
            Type::Struct(fields) => fields,
            other => panic!(
                "`grp` should be a struct in the output schema, got {:?}",
                other
            ),
        };

        // Both AGG targets must be present as fields of that struct.
        for field in ["lo", "hi"] {
            assert!(
                grp_struct.lookup(&AstSimpleIdentifier::new(field)).is_some(),
                "`grp` should contain field `{}`, got {:?}",
                field,
                grp_struct
                    .iter()
                    .map(|(k, _)| k.as_str().to_string())
                    .collect::<Vec<_>>()
            );
        }
    }
}