adze-tablegen 0.8.0

Table generation and compression for pure-Rust Tree-sitter
#![cfg_attr(feature = "strict_docs", allow(missing_docs))]
//! Enhanced external scanner generator with state-based validity.

// Enhanced external scanner generator with state-based validity computation
use adze_glr_core::ParseTable;
use adze_ir::{ExternalToken, Grammar, SymbolId};
use quote::quote;
use std::collections::HashMap;

// `debug_trace!` takes the same arguments as `eprintln!`.
//
// Builds with debug assertions enabled print to stderr, but only when the
// `RUST_LOG` environment variable is set and its value contains the substring
// "debug". The variable is looked up on every invocation.
#[cfg(debug_assertions)]
macro_rules! debug_trace {
    ($($fmt_args:tt)*) => {
        if std::env::var("RUST_LOG")
            .map(|filter| filter.contains("debug"))
            .unwrap_or(false)
        {
            eprintln!($($fmt_args)*);
        }
    };
}

// Builds without debug assertions expand `debug_trace!` to nothing, so the
// arguments are never evaluated.
#[cfg(not(debug_assertions))]
macro_rules! debug_trace {
    ($($fmt_args:tt)*) => {};
}

/// Enhanced external scanner generator that computes state-based validity.
///
/// Bundles a grammar's external tokens with a parse table and emits the
/// static tables and helper code for the external scanner (see
/// `generate_scanner_interface`).
pub struct ExternalScannerGenerator {
    /// The grammar passed to `new`. It is stored but not read by any method
    /// visible in this file, hence the `dead_code` allowance.
    #[allow(dead_code)]
    grammar: Grammar,
    /// Copy of `grammar.externals`, in declaration order. A token's position
    /// in this list is its external scanner index.
    external_tokens: Vec<ExternalToken>,
    /// Maps symbol IDs to their indices in the external scanner
    /// (i.e. positions in `external_tokens`). Built in `new`; not read by any
    /// method visible in this file, hence the `dead_code` allowance.
    #[allow(dead_code)]
    symbol_map: HashMap<SymbolId, usize>,
    /// Parse table for computing valid external tokens; its
    /// `external_scanner_states` field supplies the per-state validity rows.
    parse_table: ParseTable,
}

impl ExternalScannerGenerator {
    pub fn new(grammar: Grammar, parse_table: ParseTable) -> Self {
        let external_tokens = grammar.externals.clone();
        let mut symbol_map = HashMap::new();

        for (index, token) in external_tokens.iter().enumerate() {
            symbol_map.insert(token.symbol_id, index);
        }

        Self {
            grammar,
            external_tokens,
            symbol_map,
            parse_table,
        }
    }

    /// Computes which external tokens are valid in each state
    pub fn compute_state_validity(&self) -> Vec<Vec<bool>> {
        // Use the pre-computed external scanner states from the parse table
        // These were already calculated during LR(1) automaton construction
        self.parse_table.external_scanner_states.clone()
    }

    /// Generates the external scanner state bitmap with computed validity
    pub fn generate_state_bitmap(&self) -> Vec<Vec<bool>> {
        // Use the pre-computed external scanner states from the parse table
        self.parse_table.external_scanner_states.clone()
    }

    /// Generates the symbol map array that maps external scanner indices to symbol IDs
    pub fn generate_symbol_map(&self) -> Vec<u16> {
        let mut map = vec![0u16; self.external_tokens.len()];

        for (token_index, token) in self.external_tokens.iter().enumerate() {
            map[token_index] = token.symbol_id.0;
        }

        map
    }

    /// Generates the external scanner FFI interface code
    pub fn generate_scanner_interface(&self) -> proc_macro2::TokenStream {
        if self.external_tokens.is_empty() {
            return quote! {};
        }

        // Generate external scanner state data with computed validity
        let state_bitmap = self.generate_state_bitmap();
        let mut state_data = Vec::new();

        for state in &state_bitmap {
            for &valid in state {
                state_data.push(valid);
            }
        }

        // Generate symbol map
        let symbol_map = self.generate_symbol_map();

        // Generate external token count and state count constants
        let external_count = self.external_tokens.len();
        let state_count = state_bitmap.len();

        quote! {
            // External scanner constants
            const EXTERNAL_TOKEN_COUNT: usize = #external_count;
            const STATE_COUNT: usize = #state_count;

            // External scanner state bitmap (computed from parse table)
            static EXTERNAL_SCANNER_STATES: &[bool] = &[#(#state_data),*];

            // External scanner symbol map
            static EXTERNAL_SCANNER_SYMBOL_MAP: &[u16] = &[#(#symbol_map),*];

            // External scanner data
            #[allow(dead_code)]
            static EXTERNAL_SCANNER_DATA: adze::ffi::TSExternalScannerData = adze::ffi::TSExternalScannerData {
                states: EXTERNAL_SCANNER_STATES.as_ptr(),
                symbol_map: EXTERNAL_SCANNER_SYMBOL_MAP.as_ptr(),
                create: None, // TODO: Link to user scanner
                destroy: None,
                scan: None,
                serialize: None,
                deserialize: None,
            };

            // Helper function to get valid external tokens for a state
            #[allow(dead_code)]
            fn get_valid_external_tokens(state: usize) -> Vec<bool> {
                if state >= STATE_COUNT {
                    return vec![false; EXTERNAL_TOKEN_COUNT];
                }

                let start = state * EXTERNAL_TOKEN_COUNT;
                let end = start + EXTERNAL_TOKEN_COUNT;
                EXTERNAL_SCANNER_STATES[start..end].to_vec()
            }
        }
    }

    /// Returns whether the grammar has external tokens
    pub fn has_external_tokens(&self) -> bool {
        !self.external_tokens.is_empty()
    }

    /// Returns the number of external tokens
    pub fn external_token_count(&self) -> usize {
        self.external_tokens.len()
    }

    /// Debug helper: print validity matrix
    pub fn debug_print_validity(&self) {
        let state_bitmap = self.compute_state_validity();

        debug_trace!("External Token Validity Matrix:");
        debug_trace!("States x External Tokens");

        // Print header with external token names
        let mut header = String::from("State |");
        for token in &self.external_tokens {
            header.push_str(&format!(" {} |", token.name));
        }
        debug_trace!("{}", header);

        // Print validity for each state
        for (state_idx, state_validity) in state_bitmap.iter().enumerate() {
            let mut row = format!("{:5} |", state_idx);
            for &valid in state_validity {
                row.push_str(&format!(" {:5} |", if valid { "" } else { " " }));
            }
            debug_trace!("{}", row);
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use adze_glr_core::{Action, FirstFollowSets, build_lr1_automaton};
    use adze_ir::{ProductionId, Rule, Symbol, Token, TokenPattern};

    /// Builds an external token with the given name and raw symbol ID.
    fn external(name: &str, raw_id: u16) -> ExternalToken {
        ExternalToken {
            name: name.to_string(),
            symbol_id: SymbolId(raw_id),
        }
    }

    /// `compute_state_validity` must hand back the per-state rows stored in
    /// the parse table's `external_scanner_states`.
    #[test]
    fn test_state_validity_computation() {
        let indent = SymbolId(100);
        let dedent = SymbolId(101);

        let mut grammar = Grammar::new("test".to_string());
        grammar.externals.push(external("INDENT", 100));
        grammar.externals.push(external("DEDENT", 101));

        // Hand-built table: 2 states x 2 symbols, every action an error.
        // NOTE(review): the helper is given an external token count of 0 even
        // though the grammar declares two externals - confirm this is intended.
        let mut table = crate::test_helpers::test::make_minimal_table(
            vec![vec![vec![Action::Error]; 2]; 2],
            vec![vec![crate::test_helpers::test::INVALID; 2]; 2],
            vec![],
            adze_ir::SymbolId(1), // start_symbol
            adze_ir::SymbolId(1), // eof_symbol
            0,                    // external_token_count
        );

        // Row 0: only INDENT valid. Row 1: only DEDENT valid.
        let indent_only = vec![true, false];
        let dedent_only = vec![false, true];
        table.external_scanner_states = vec![indent_only.clone(), dedent_only.clone()];

        // External symbols occupy columns 0 and 1 of the action table.
        table.symbol_to_index.insert(indent, 0);
        table.symbol_to_index.insert(dedent, 1);

        // Shift actions mirroring the validity rows above.
        // NOTE(review): the second shift targets state 2, which does not exist
        // in this 2-state table; it has no effect on this test because
        // `compute_state_validity` only reads `external_scanner_states`.
        table.action_table[0][0] = vec![Action::Shift(adze_ir::StateId(1))];
        table.action_table[1][1] = vec![Action::Shift(adze_ir::StateId(2))];

        let validity = ExternalScannerGenerator::new(grammar, table).compute_state_validity();

        assert_eq!(validity[0], indent_only);
        assert_eq!(validity[1], dedent_only);
    }

    /// `generate_symbol_map` must list external symbol IDs in declaration
    /// order.
    #[test]
    fn test_symbol_map_generation() {
        let start = SymbolId(1);
        let dummy = SymbolId(2);

        let mut grammar = Grammar::new("test".to_string());

        // One token plus one rule `start -> dummy`, so the automaton builder
        // has a grammar it can process.
        grammar.tokens.insert(
            dummy,
            Token {
                name: "dummy".to_string(),
                pattern: TokenPattern::String("dummy".to_string()),
                fragile: false,
            },
        );
        grammar.rules.insert(
            start,
            vec![Rule {
                lhs: start,
                rhs: vec![Symbol::Terminal(dummy)],
                precedence: None,
                associativity: None,
                fields: vec![],
                production_id: ProductionId(0),
            }],
        );

        grammar.externals.push(external("TOKEN1", 200));
        grammar.externals.push(external("TOKEN2", 201));

        let first_follow = FirstFollowSets::compute(&grammar).unwrap();
        let table = build_lr1_automaton(&grammar, &first_follow).unwrap();

        let symbol_map = ExternalScannerGenerator::new(grammar, table).generate_symbol_map();
        assert_eq!(symbol_map, vec![200, 201]);
    }
}