hamelin_translation 0.4.3

Lowering and IR for Hamelin query language
Documentation
//! Pipeline pass: EXPLODE normalization.
//!
//! Normalizes EXPLODE commands to the canonical `EXPLODE col = col` form required by IR.
//!
//! **Case 1: Compound identifier**
//! ```text
//! EXPLODE items.expanded = array_field
//! ```
//! becomes:
//! ```text
//! LET __explode_0 = array_field | EXPLODE __explode_0 = __explode_0 | LET items.expanded = __explode_0 | DROP __explode_0
//! ```
//!
//! **Case 2: Simple identifier with different expression**
//! ```text
//! EXPLODE x = some_array_expr
//! ```
//! becomes:
//! ```text
//! LET x = some_array_expr | EXPLODE x = x
//! ```
//!
//! **Case 3: Already canonical** (`EXPLODE col = col`)
//! ```text
//! EXPLODE x = x
//! ```
//! Passes through unchanged.

use std::sync::Arc;

use hamelin_lib::err::TranslationError;
use hamelin_lib::tree::{
    ast::{command::Command, identifier::Identifier, identifier::SimpleIdentifier},
    builder::{self, column_ref, drop_command, explode_command, let_command},
    typed_ast::{
        command::{TypedCommand, TypedCommandKind, TypedExplodeCommand},
        context::StatementTranslationContext,
        expression::TypedExpressionKind,
        pipeline::TypedPipeline,
    },
};

use super::super::unique::UniqueNameGenerator;

/// Rewrite every non-canonical EXPLODE in `pipeline` into the `EXPLODE col = col` shape.
///
/// Contract: `Arc<TypedPipeline> -> Result<Arc<TypedPipeline>, ...>`
///
/// When no command needs rewriting the input `Arc` is handed back untouched.
/// Otherwise a fresh AST is assembled from the rewritten commands, positioned at
/// the original pipeline's span, and type-checked again with `ctx`.
///
/// # Errors
///
/// Propagates the error from `pipeline.valid_ref()` and any error produced while
/// rewriting an individual command.
pub fn normalize_explode(
    pipeline: Arc<TypedPipeline>,
    ctx: &mut StatementTranslationContext,
) -> Result<Arc<TypedPipeline>, Arc<TranslationError>> {
    // Fast path: nothing to rewrite, so return the pipeline as-is.
    let needs_rewrite = pipeline
        .valid_ref()?
        .commands
        .iter()
        .any(explode_needs_normalization);
    if !needs_rewrite {
        return Ok(pipeline);
    }

    let valid = pipeline.valid_ref()?;

    // One generator for the whole pipeline so every EXPLODE shares the same
    // sequence of temporary names.
    let mut name_gen = UniqueNameGenerator::new("__explode");

    // Each original command expands to one or more replacement commands.
    let mut rewritten = builder::pipeline();
    for original in &valid.commands {
        let replacements = normalize_command(original, &mut name_gen)?;
        rewritten = replacements
            .into_iter()
            .fold(rewritten, |acc, replacement| acc.command(replacement));
    }

    let new_ast = rewritten.build().at(pipeline.ast.span);

    // The rewritten AST has to be typed again.
    let retyped = TypedPipeline::from_ast_with_context(Arc::new(new_ast), ctx);
    Ok(Arc::new(retyped))
}

/// Predicate: does this command have to be rewritten?
///
/// Only EXPLODE commands can qualify, and only when they are NOT already in the
/// canonical `col = col` form. Every other command kind yields `false`.
fn explode_needs_normalization(cmd: &Arc<TypedCommand>) -> bool {
    match &cmd.kind {
        TypedCommandKind::Explode(explode_cmd) => !is_canonical_explode(explode_cmd),
        _ => false,
    }
}

/// Report whether an EXPLODE already has the canonical `col = col` form.
///
/// That requires a valid simple identifier on the left, a column reference with
/// a valid column name on the right, and both names comparing equal as strings.
/// Anything else (compound identifier, other expression kind, invalid pieces)
/// is reported as non-canonical.
fn is_canonical_explode(explode_cmd: &TypedExplodeCommand) -> bool {
    // Left-hand side: must be a valid simple identifier.
    let target = match explode_cmd.identifier.valid_ref() {
        Ok(Identifier::Simple(name)) => name,
        _ => return false,
    };

    // Right-hand side: must be a plain column reference.
    let reference = match &explode_cmd.expression.kind {
        TypedExpressionKind::ColumnReference(reference) => reference,
        _ => return false,
    };

    // Canonical only when the referenced column carries the same name as the target.
    match reference.column_name.valid_ref() {
        Ok(source) => target.as_str() == source.as_str(),
        Err(_) => false,
    }
}

/// Produce the replacement command list for a single typed command.
///
/// A non-canonical EXPLODE is expanded via `transform_explode`; every other
/// command (including an EXPLODE that is already canonical) is returned as a
/// one-element list holding a clone of its original AST node.
fn normalize_command(
    cmd: &Arc<TypedCommand>,
    name_gen: &mut UniqueNameGenerator,
) -> Result<Vec<Arc<Command>>, Arc<TranslationError>> {
    match &cmd.kind {
        TypedCommandKind::Explode(explode_cmd) if !is_canonical_explode(explode_cmd) => {
            transform_explode(explode_cmd, cmd, name_gen)
        }
        // Not an EXPLODE, or already canonical: pass through unchanged.
        _ => Ok(vec![cmd.ast.clone()]),
    }
}

/// Expand a non-canonical EXPLODE into an equivalent command sequence whose
/// EXPLODE step is in `col = col` form.
///
/// * Simple identifier: `EXPLODE x = expr` becomes
///   `LET x = expr | EXPLODE x = x`.
/// * Compound identifier: `EXPLODE x.y = expr` becomes
///   `LET tmp = expr | EXPLODE tmp = tmp | LET x.y = tmp | DROP tmp`,
///   where `tmp` is obtained from `name_gen` using the command's input schema.
///
/// Every generated command is positioned at the span of the original command.
///
/// # Errors
///
/// Propagates the error from `explode_cmd.identifier.valid_ref()`.
fn transform_explode(
    explode_cmd: &TypedExplodeCommand,
    cmd: &TypedCommand,
    name_gen: &mut UniqueNameGenerator,
) -> Result<Vec<Arc<Command>>, Arc<TranslationError>> {
    let span = cmd.ast.span;

    let replacements = match explode_cmd.identifier.valid_ref()? {
        Identifier::Simple(target) => {
            // Step 1: materialize the expression under the target name.
            let bind = let_command()
                .named_field(
                    target.clone(),
                    explode_cmd.expression.ast.as_ref().clone(),
                )
                .at(span)
                .build();

            // Step 2: explode the freshly bound column in place.
            let canonical = explode_command()
                .named_field(target.clone(), column_ref(target.as_str()))
                .at(span)
                .build();

            vec![Arc::new(bind), Arc::new(canonical)]
        }
        Identifier::Compound(path) => {
            // A compound target cannot be exploded in place, so route the value
            // through a generated temporary column.
            let scratch: SimpleIdentifier = name_gen.next(&cmd.input_schema);

            // Step 1: materialize the expression under the temporary name.
            let bind = let_command()
                .named_field(
                    scratch.clone(),
                    explode_cmd.expression.ast.as_ref().clone(),
                )
                .at(span)
                .build();

            // Step 2: explode the temporary column in place.
            let canonical = explode_command()
                .named_field(scratch.clone(), column_ref(scratch.as_str()))
                .at(span)
                .build();

            // Step 3: copy the exploded value to the requested compound name.
            let destination: Identifier = path.clone().into();
            let restore = let_command()
                .named_field(destination, column_ref(scratch.as_str()))
                .at(span)
                .build();

            // Step 4: remove the temporary column again.
            let cleanup = drop_command().field(scratch).at(span).build();

            vec![
                Arc::new(bind),
                Arc::new(canonical),
                Arc::new(restore),
                Arc::new(cleanup),
            ]
        }
    };

    Ok(replacements)
}

#[cfg(test)]
mod tests {
    use super::*;
    use hamelin_lib::{
        tree::{
            ast::{
                identifier::CompoundIdentifier, pipeline::Pipeline, IntoTyped, TypeCheckExecutor,
            },
            builder::{
                array, column_ref, drop_command, explode_command, let_command, pipeline,
                select_command,
            },
        },
        types::{array::Array, struct_type::Struct, INT},
    };
    use pretty_assertions::assert_eq;
    use rstest::rstest;
    use std::sync::Arc;

    /// Table-driven check of `normalize_explode`: each case supplies an input
    /// pipeline, the pipeline expected after normalization, and the expected
    /// flattened output schema.
    #[rstest]
    // EXPLODE arr = arr is already canonical, so the pipeline must come back unchanged.
    #[case::canonical_unchanged(
        pipeline()
            .command(select_command().named_field("arr", array().element(1).element(2)).build())
            .command(explode_command().named_field("arr", column_ref("arr")).build())
            .build(),
        pipeline()
            .command(select_command().named_field("arr", array().element(1).element(2)).build())
            .command(explode_command().named_field("arr", column_ref("arr")).build())
            .build(),
        Struct::default().with_str("arr", INT)
    )]
    // Simple target with a different source: EXPLODE x = arr -> LET x = arr | EXPLODE x = x.
    #[case::simple_id_different_expr(
        pipeline()
            .command(select_command().named_field("arr", array().element(1).element(2)).build())
            .command(explode_command().named_field("x", column_ref("arr")).build())
            .build(),
        pipeline()
            .command(select_command().named_field("arr", array().element(1).element(2)).build())
            .command(let_command().named_field("x", column_ref("arr")).build())
            .command(explode_command().named_field("x", column_ref("x")).build())
            .build(),
        Struct::default().with_str("x", INT).with_str("arr", Array::new(INT).into())
    )]
    // A pipeline without any EXPLODE must come back unchanged.
    #[case::no_explode_passthrough(
        pipeline()
            .command(select_command().named_field("a", 1).named_field("b", 2).build())
            .build(),
        pipeline()
            .command(select_command().named_field("a", 1).named_field("b", 2).build())
            .build(),
        Struct::default().with_str("a", INT).with_str("b", INT)
    )]
    // Two non-canonical EXPLODEs in a row are each expanded independently.
    #[case::multiple_explodes(
        pipeline()
            .command(select_command()
                .named_field("arr1", array().element(1))
                .named_field("arr2", array().element(2))
                .build())
            .command(explode_command().named_field("x", column_ref("arr1")).build())
            .command(explode_command().named_field("y", column_ref("arr2")).build())
            .build(),
        pipeline()
            .command(select_command()
                .named_field("arr1", array().element(1))
                .named_field("arr2", array().element(2))
                .build())
            .command(let_command().named_field("x", column_ref("arr1")).build())
            .command(explode_command().named_field("x", column_ref("x")).build())
            .command(let_command().named_field("y", column_ref("arr2")).build())
            .command(explode_command().named_field("y", column_ref("y")).build())
            .build(),
        Struct::default()
            .with_str("y", INT)
            .with_str("x", INT)
            .with_str("arr1", Array::new(INT).into())
            .with_str("arr2", Array::new(INT).into())
    )]
    // Compound target: EXPLODE result.item = arr -> LET tmp | EXPLODE tmp | LET result.item | DROP tmp.
    #[case::compound_id(
        pipeline()
            .command(select_command().named_field("arr", array().element(1)).build())
            .command(explode_command()
                .named_field(
                    CompoundIdentifier::new("result".into(), "item".into(), vec![]),
                    column_ref("arr")
                )
                .build())
            .build(),
        pipeline()
            .command(select_command().named_field("arr", array().element(1)).build())
            .command(let_command().named_field("__explode_0", column_ref("arr")).build())
            .command(explode_command().named_field("__explode_0", column_ref("__explode_0")).build())
            .command(let_command()
                .named_field(
                    CompoundIdentifier::new("result".into(), "item".into(), vec![]),
                    column_ref("__explode_0")
                )
                .build())
            .command(drop_command().field("__explode_0").build())
            .build(),
        Struct::default()
            .with_str("result", Struct::default().with_str("item", INT).into())
            .with_str("arr", Array::new(INT).into())
    )]
    fn test_normalize_explode(
        #[case] input: Pipeline,
        #[case] expected: Pipeline,
        #[case] expected_output_schema: Struct,
    ) {
        // Type both pipelines up front, then run the pass on the input.
        let typed_input = input.typed_with().typed();
        let typed_expected = expected.typed_with().typed();

        let mut ctx = StatementTranslationContext::default();
        let normalized = normalize_explode(Arc::new(typed_input), &mut ctx).unwrap();

        // The rewritten AST must match the hand-built expectation.
        assert_eq!(normalized.ast, typed_expected.ast);

        // The flattened output environment must match the expected schema.
        let actual_schema = normalized.environment().flatten();
        assert_eq!(actual_schema, expected_output_schema);
    }
}