// lexigram 0.9.3
//
// Lexer and LL(1) parser generator
// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.

use std::iter::Peekable;
use std::str::FromStr;
use lexi_gram::{gencode, genspec};
use lexi_gram::lexigram_lib::CollectJoin;
use lexi_gram::lexigram_lib::lexer::CaretCol;
use lexi_gram::lexigram_lib::lexergen::LexigramCrate;
use lexi_gram::lexigram_lib::parsergen::NTValue;
use lexi_gram::options::{Action, CodeLocation, Options, OptionsBuilder, Specification};
use crate::{ExeAnsi, ExeError};

/// Help text describing the command-line options accepted by `parse_args`.
///
/// This is a raw string (`r##"…"##`), so the quotes and backslashes it contains are
/// emitted verbatim.
pub static HELP_MESSAGE: &str = r##"lexigram is a lexer / parser generator.

Usage:    lexigram [options]

Main options. Please note:
- The lexer options must be given before the parser options, and the parser options must
  be given before the template options (types/listener).
- Generating/verifying the parser is optional, but the lexer is mandatory.

  -l|--lexer <location>     Location of the generated lexer code, where <location> is
                                <location> = <filename> | <filename> tag <tag> | -
                            "-" instead of a filename outputs the code to stdout.
                            This option is mandatory.

                            When a <tag> is used, the file must already exist and the code
                            will be located between surrounding "[<tag>]", with one empty
                            line of separation between the tag and the code. When the code
                            hasn't been generated yet, lexigram expects to find two tags
                            separated by a couple of empty lines.

                            Examples:
                            --lexer lexer.rs             -> file "lexer.rs" will be created
                            --lexer lexer.rs tag LEXER   -> between "[LEXER]" tags in
                                                            existing file "lexer.rs"
                            --lexer -                    -> code will be output to stdout

  -x|--lexicon <location>   Location of the lexicon, where <location> is the same as for
                            --lexer, except "-" isn't valid.
                            This option is mandatory.

  -p|--parser <location>    Location of the generated parser code, where <location> is the same
                            as for --lexer. If the --parser and --grammar options aren't
                            present, the parser code isn't generated.

  -g|--grammar <location>   Location of the grammar, where <location> is the same as for
                            --lexer, except "-" isn't valid. This option is mandatory if
                            --parser is set.

  -c|--combined <location>  Location of the combined lexicon and grammar. Both must be
                            included in the same location, starting with the lexicon:

                                lexicon <lexer name>;
                                ...
                                grammar <parser name>;
                                ...

                            The location of the code generated for the lexer (-l) and the
                            parser (-p) must be specified after, and in that order.

  --types <location>        Location of the template for the user types.
  --listener <location>     Location of the template for the listener implementation.

Secondary lexer / parser / template options. Those options can be set before -l/--lexer and -p/--parser
if they apply to both, or after either of them if they only apply to the lexer or the parser:

  --header <string>         Adds a header in front of the generated code. This option can
                            be used multiple times if several headers are required.
                            There are no headers in the templates, so this option will be
                            ignored after types and listener.

                            Example: --header "#[cfg(feature = \"parser\")]"

  --indent <number>         Defines the code indentation in number of spaces (default: 0).
                            This can also be used for the templates.

Other options related to the generated code:

  --tab-width <number>      Sets the tab width for input files like the lexicon and the grammar.
                            The default is 4 (space positions per tab).

  --lib <string>            Adds a custom lib (crates/modules) to the "use" bindings in the
                            parser / wrapper / listener generated code. This option can be
                            used multiple times if several custom bindings are required.

                            Example: --lib "super::listener_types::*"

  --start-nt <name>         Defines the start nonterminal. By default, the first nonterminal
                            defined in the grammar is the start one, but this option can be
                            used if you want to implement only a subset of the grammar.
                            Lexigram will give you warnings because it will detect there are
                            unused terminals in your grammar, but this is fine.

  --spans                   Adds parameters to the listener method that give the locations of
                            the terminals and nonterminals of each rule alternative in the
                            parsed text.

  --token-enums             Generates enums for the terminal and nonterminal values. They may
                            be helpful in the optional listener trait methods like 'hook()'
                            and 'intercept_token()' when they are used.
                            These enums are generated by default only when hooks are declared
                            in the lexicon.

  --no-wrapper              Generates only a parser; doesn't generate the code for the
                            wrapper and the listener.

  --debug-info              Adds extra info in the parser to generate clearer debug messages.

  --use-full-lib            Uses the full crate lexigram_lib in the generated code instead
                            of the smaller lexigram_core with the minimal features required
                            by the lexer and parser. The full crate includes all the code
                            generation features, so it's normally not necessary.

                            See also the --lib-crate option to set a custom path to the core
                            library.

  --lib-crate <path>        Sets a custom `use` path to the lexigram core library in the
                            generated code.

                            See also the --use-full-lib option for the most common situations.

                            Example: --lib-crate "core"

  --nt-value <type>         Defines which nonterminals have a value in the listener. Those
                            which have a value are included in the contexts and must be
                            given a value in the corresponding exit callback.

                            The list of nonterminals is defined by the argument <type>

                            - none: no nonterminal
                            - parents: only top parents
                            - default: top parents and children of (<L> ...)+*
                            - set <list>: explicit list of nonterminal names; "<default>"
                                and "<parents>" can be included to set an entire
                                predefined class.
                                A "-" can be put in front of individual nonterminals in the
                                list to indicate they don't hold a value.

                            By default, "default" is used, which is usually a good strategy.

                            Examples:
                                --nt-value set "<parents>,id_i,decl_i"
                                --nt-value set "<default>,-inst,-decl"

General options:

  --ansi <off/on/passive>   ANSI colour option for the output log and messages.
                            - off: no ANSI colours
                            - on: use ANSI colours and activate ANSI support in Windows
                                console. This is the default behaviour.
                            - passive: use ANSI colours but don't activate ANSI support in
                                the console. Try this option if "on" creates problems.

  -v|--verify               Verifies that the generated code matches what is already in the
                            lexer and parser locations. The files must already exist and
                            aren't modified.

  --log                     Shows the log.

  -h|--help                 Shows this message.

  -V|--version              Shows the program version.

Example:

  lexigram -x lexicon.l -l lexer.rs -g grammar.g -p parser.rs --lib "super::listener_types::*" --spans --log
"##;

/// Options extracted from the command line by `parse_args`, next to the requested action.
#[derive(Clone, Debug)]
pub(crate) struct ArgOptions {
    /// Code generation options, as built by the `OptionsBuilder`
    pub gen_options: Options,
    /// `true` when the `--log` option was given
    pub show_log: bool,
    /// ANSI colour mode selected with `--ansi` (`ExeAnsi::On` when the option is absent)
    pub ansi: ExeAnsi,
}

/// Pulls the next argument out of `args`.
///
/// Returns that argument when there is one. When `args` is exhausted, returns an
/// `ExeError::Option` carrying `error_message` instead.
fn take_argument<'a, I: Iterator<Item=&'a str>, S: Into<String>>(args: &mut I, error_message: S) -> Result<&'a str, ExeError> {
    args.next()
        .ok_or_else(|| ExeError::Option(error_message.into()))
}

/// Reads the location of a piece of generated code from `args`.
///
/// The accepted forms are `<filename>`, `<filename> tag <tag>` and `-` (stdout); the
/// matching location is built with the `gencode!` macro.
///
/// * `label` - name of the option being parsed, only used in the error messages
/// * `args` - remaining arguments, positioned right after the option
///
/// # Errors
///
/// Returns an `ExeError::Option` when the filename is missing, when `-` is followed by
/// `tag`, or when `tag` isn't followed by a tag name.
fn get_code<'a, I: Iterator<Item=&'a str>>(label: &str, args: &mut Peekable<I>) -> Result<CodeLocation, ExeError> {
    let filename = args.next()
        .ok_or_else(|| ExeError::Option(format!("missing filename after --{label}")))?;
    let is_stdout = filename == "-";
    // only peeked for now: the "tag" keyword is consumed once the location is known to be valid
    let has_tag = args.peek().copied() == Some("tag");
    if has_tag && is_stdout {
        return Err(ExeError::Option(format!("- (stdout) cannot be used with a tag as {label} code location")))
    }
    let location = if has_tag {
        args.next(); // skips the "tag" keyword itself
        let tag = args.next()
            .ok_or_else(|| ExeError::Option(format!("missing tag name in {label} code location")))?;
        gencode!(filename: filename, tag: tag)
    } else if is_stdout {
        gencode!(stdout)
    } else {
        gencode!(filename: filename)
    };
    Ok(location)
}

/// Reads the location of a specification (lexicon, grammar, or both combined) from `args`.
///
/// The accepted forms are `<filename>` and `<filename> tag <tag>`; the matching location is
/// built with the `genspec!` macro. `-` is rejected as a filename.
///
/// * `label` - name used in the error messages to identify the option
/// * `spec_name` - name of the kind of specification, only used in the error messages
/// * `args` - remaining arguments, positioned right after the option
///
/// # Errors
///
/// Returns an `ExeError::Option` when the filename is missing, when the filename is `-`,
/// or when `tag` isn't followed by a tag name.
fn get_spec<'a, I: Iterator<Item=&'a str>>(label: &str, spec_name: &str, args: &mut Peekable<I>) -> Result<Specification, ExeError> {
    // NOTE(review): `parse_args` passes "lexer" / "parser" as label for --lexicon / --grammar,
    // so this message names the code option rather than the one that was typed — confirm
    // whether that's intended.
    let filename = take_argument(args, format!("missing filename after --{label}"))?;
    if filename == "-" {
        return Err(ExeError::Option(format!("- (stdout) is not a valid {spec_name} source for {label}")));
    }
    let spec = if matches!(args.peek(), Some(&"tag")) {
        // "-" has already been rejected above, so the filename is always a real file here
        args.next(); // skips the "tag" keyword itself
        let tag = take_argument(args, format!("missing tag name in {label} {spec_name} location"))?;
        genspec!(filename: filename, tag: tag)
    } else {
        genspec!(filename: filename)
    };
    Ok(spec)
}

/// Parses the command-line arguments into the requested action and its options.
///
/// The arguments are consumed from left to right; each recognized option updates either the
/// `OptionsBuilder` or one of the local settings (action, log, ANSI mode). Once all the
/// arguments are consumed, the builder produces the final generation options.
///
/// # Errors
///
/// * `ExeError::Help` when `all_args` is empty or `-h`/`--help` is met
/// * `ExeError::Version` when `-V`/`--version` is met
/// * `ExeError::Option` when an option is unknown, lacks its argument, has an invalid
///   argument, or when the builder rejects the resulting set of options
pub(crate) fn parse_args(all_args: Vec<String>) -> Result<(Action, ArgOptions), ExeError> {
    if all_args.is_empty() {
        return Err(ExeError::Help);
    }
    let mut builder = OptionsBuilder::new();
    let mut action = Action::Generate;
    let mut show_log = false;
    let mut ansi = ExeAnsi::On;
    let mut args = all_args.iter().map(String::as_str).peekable();
    // `args` is also advanced inside the arms, which rules out a plain `for` loop
    while let Some(arg) = args.next() {
        match arg {
            // ---- requests that stop the parsing right away (not real errors)
            "-h" | "--help" => return Err(ExeError::Help),
            "-V" | "--version" => return Err(ExeError::Version),

            // ---- general options
            "-v" | "--verify" => action = Action::Verify,
            "--log" => show_log = true,
            "--ansi" => {
                let mode = take_argument(&mut args, "missing argument after --ansi")?.to_ascii_lowercase();
                ansi = match mode.as_str() {
                    "off" => ExeAnsi::Off,
                    "on" => ExeAnsi::On,
                    "passive" => ExeAnsi::OnPassive,
                    s => return Err(ExeError::Option(format!("ERROR: unexpected argument '{s}'"))),
                };
                builder.ansi(ansi != ExeAnsi::Off);
            }

            // ---- locations of the specifications and of the generated code
            "-l" | "--lexer" => {
                let location = get_code("lexer", &mut args)?;
                builder.lexer_code(location);
            }
            "-x" | "--lexicon" => {
                let location = get_spec("lexer", "lexicon", &mut args)?;
                builder.lexer_spec(location);
            }
            "-p" | "--parser" => {
                let location = get_code("parser", &mut args)?;
                builder.parser_code(location);
            }
            "-g" | "--grammar" => {
                let location = get_spec("parser", "grammar", &mut args)?;
                builder.parser_spec(location);
            }
            "-c" | "--combined" => {
                let location = get_spec("combined", "lexicon and grammar", &mut args)?;
                builder.combined_spec(location);
            }
            "--types" => {
                let location = get_code("types", &mut args)?;
                builder.types_code(location);
            }
            "--listener" => {
                let location = get_code("listener", &mut args)?;
                builder.listener_code(location);
            }

            // ---- options taking one argument
            "--header" => {
                let text = take_argument(&mut args, "missing argument after --header")?;
                builder.headers([text]);
            }
            "--indent" => {
                let indent = take_argument(&mut args, "missing argument after --indent")?;
                let width = indent.parse::<usize>()
                    .map_err(|e| ExeError::Option(format!("error while parsing --indent {indent}: {e}")))?;
                builder.indent(width);
            }
            "--tab-width" => {
                let tab = take_argument(&mut args, "missing argument after --tab-width")?;
                let width = CaretCol::from_str(tab)
                    .map_err(|e| ExeError::Option(format!("error while parsing --tab-width {tab}: {e}")))?;
                builder.tab_width(width);
            }
            "--lib" => {
                let binding = take_argument(&mut args, "missing argument after --lib")?;
                builder.libs([binding]);
            }
            "--start-nt" => {
                let start = take_argument(&mut args, "missing nonterminal name after --start-nt")?;
                builder.start_nt(Some(start));
            }
            "--lib-crate" => {
                let crate_path = take_argument(&mut args, "missing argument after --lib-crate")?;
                builder.set_crate(LexigramCrate::Custom(crate_path.to_string()));
            }
            "--nt-value" => {
                let nt_type = take_argument(&mut args, "missing argument after --nt-value")?;
                let selection = match nt_type {
                    "none" => NTValue::None,
                    "parents" => NTValue::Parents,
                    "default" => NTValue::Default,
                    "set" => {
                        // "set" takes one more argument: the comma-separated list of names
                        let list = take_argument(&mut args, "missing list after --nt-value set")?;
                        let names = list.split(',').map(|item| item.trim().to_string()).to_vec();
                        NTValue::SetNames(names)
                    }
                    _ => return Err(ExeError::Option(format!("ERROR: incorrect type after --nt-value: {nt_type}"))),
                };
                builder.set_nt_value(selection);
            }

            // ---- simple switches
            "--spans" => {
                builder.span_params(true);
            }
            "--token-enums" => {
                builder.token_enums(true);
            }
            "--no-wrapper" => {
                builder.wrapper(false);
            }
            "--debug-info" => {
                builder.parser_alts(true);
            }
            "--use-full-lib" => {
                builder.use_full_lib(true);
            }

            // ---- anything else is rejected
            s => return Err(ExeError::Option(format!("ERROR: unexpected argument '{s}'"))),
        }
    }
    let gen_options = builder.build().map_err(ExeError::Option)?;
    Ok((action, ArgOptions { gen_options, show_log, ansi }))
}