mle 0.28.0

The markup link extractor (mle) extracts links from markup files (Markdown and HTML).
Documentation
// SPDX-FileCopyrightText: 2022 - 2025 Robin Vobruba <hoijui.quaero@gmail.com>
//
// SPDX-License-Identifier: AGPL-3.0-or-later

//! Command line interface for mle.
//!
//! This module implements the command line interface for mle.
//! Most of it is declared `pub`,
//! because we want to be able to re-use it.
//! We will want to do that in CLIs that have link-extraction as a pre-step
//! to their functionality,
//! which is the case for example for [`mlc`](https://github.com/hoijui/mlc)
//! (Markup Link Checker).

use crate::BoxResult;
use crate::config::{Extractor as ExtractorConfig, Tool as ToolConfig};
use crate::ignore_link;
use crate::result;
use async_std::io::BufReadExt;
use clap::builder::ValueParser;
use clap::command;
use clap::value_parser;
use clap::{Arg, ArgAction, ArgMatches, Command, ValueHint};
use cli_utils::StreamIdent;
use cli_utils::path_buf::PathBuf;
use const_format::formatcp;
use futures::StreamExt;
use futures::pin_mut;
use std::collections::HashSet;
use std::sync::LazyLock;
use std::{env, io};
use wildmatch::WildMatch;

// Naming scheme used below:
// - `A_N_*`: argument ID of a positional argument
// - `A_L_*`: long option name (also used as the argument ID)
// - `A_S_*`: short option character
// - `HH_*`:  help heading under which arguments are grouped

/// Argument ID of the positional markup-files argument.
pub const A_N_MARKUP_FILES: &str = "markup_files";
/// Long name of the option taking a file that lists the markup files.
pub const A_L_MARKUP_FILES_LIST: &str = "markup-files-list";
/// Short name of the option taking a file that lists the markup files.
pub const A_S_MARKUP_FILES_LIST: char = 'I';
/// Long name of the version flag.
pub const A_L_VERSION: &str = "version";
/// Short name of the version flag.
pub const A_S_VERSION: char = 'V';
/// Short name of the quiet flag.
pub const A_S_QUIET: char = 'q';
/// Long name of the quiet flag.
pub const A_L_QUIET: &str = "quiet";
/// Long name of the flag that disables link extraction.
pub const A_L_NO_LINKS: &str = "no-links";
/// Short name of the flag that disables link extraction.
pub const A_S_NO_LINKS: char = 'n';
/// Long name of the option that enables anchor extraction.
pub const A_L_ANCHORS: &str = "anchors";
/// Short name of the option that enables anchor extraction.
pub const A_S_ANCHORS: char = 'a';
/// Long name of the option listing link globs to be ignored.
pub const A_L_IGNORE_LINKS: &str = "ignore-links";
/// Short name of the option listing link globs to be ignored.
pub const A_S_IGNORE_LINKS: char = 'i';
/// Long name of the option selecting the file the links are stored to.
pub const A_L_LINKS_FILE: &str = "links-file";
/// Short name of the option selecting the file the links are stored to.
pub const A_S_LINKS_FILE: char = 'P';
/// Long name of the option selecting the output data format.
pub const A_L_RESULT_FORMAT: &str = "result-format";
/// Short name of the option selecting the output data format.
pub const A_S_RESULT_FORMAT: char = 'F';
/// Long name of the flag requesting additional properties in the output.
pub const A_L_RESULT_EXTENDED: &str = "result-extended";
/// Short name of the flag requesting additional properties in the output.
pub const A_S_RESULT_EXTENDED: char = 'E';
/// Long name of the flag requesting a flush after each output item.
pub const A_L_RESULT_FLUSH: &str = "result-flush";
/// Short name of the flag requesting a flush after each output item.
pub const A_S_RESULT_FLUSH: char = 'f';
/// Help heading for arguments that control how much is printed.
pub const HH_VERBOSITY: &str = "Verbosity";
/// Help heading for arguments aimed at advanced usage.
pub const HH_ADVANCED: &str = "Advanced";

/// Builds the `-V`/`--version` flag.
///
/// It is a boolean flag listed under the verbosity help heading;
/// its long help points at the quiet flag,
/// with which it may be combined.
#[must_use]
pub fn arg_version() -> Arg {
    const SHORT_HELP: &str = "Print version information and exit";
    const LONG_HELP: &str = formatcp!(
        "Print version information and exit. \
May be combined with -{A_S_QUIET},--{A_L_QUIET}, \
to really only output the version string."
    );

    Arg::new(A_L_VERSION)
        .short(A_S_VERSION)
        .long(A_L_VERSION)
        .action(ArgAction::SetTrue)
        .help_heading(HH_VERBOSITY)
        .help(SHORT_HELP)
        .long_help(LONG_HELP)
}

/// Builds the `-q`/`--quiet` flag.
///
/// It is a boolean flag listed under the verbosity help heading.
#[must_use]
pub fn arg_quiet() -> Arg {
    let short_help = "Minimize or suppress output to stdout";
    let long_help = "Minimize or suppress output to stdout, \
and only shows log output on stderr.";

    Arg::new(A_L_QUIET)
        .short(A_S_QUIET)
        .long(A_L_QUIET)
        .action(ArgAction::SetTrue)
        .help_heading(HH_VERBOSITY)
        .help(short_help)
        .long_help(long_help)
}

/// Builds the positional argument taking one or more markup files.
///
/// The argument is required unless the version flag
/// or the markup-files-list option is present,
/// and it conflicts with the markup-files-list option.
#[must_use]
pub fn arg_markup_files() -> Arg {
    Arg::new(A_N_MARKUP_FILES)
        .help("The markup files to extract links and/or anchors from")
        .num_args(1..)
        .value_parser(value_parser!(PathBuf))
        .value_name("MARKUP_FILE")
        // The values are files, so shell completion has to offer file paths;
        // a directory hint would hide the very files the user wants to pass.
        .value_hint(ValueHint::FilePath)
        .action(ArgAction::Append)
        .required_unless_present_any([A_L_VERSION, A_L_MARKUP_FILES_LIST])
        .conflicts_with(A_L_MARKUP_FILES_LIST)
}

/// Builds the `-I`/`--markup-files-list` option.
///
/// It takes exactly one value: a file listing the markup files, one per line.
/// The option is required unless the version flag
/// or the positional markup files are present,
/// and it conflicts with the positional markup files.
#[must_use]
pub fn arg_markup_files_list() -> Arg {
    let help = "A file containing a list of markup files \
to extract links and/or anchors from; one per line.";

    Arg::new(A_L_MARKUP_FILES_LIST)
        .short(A_S_MARKUP_FILES_LIST)
        .long(A_L_MARKUP_FILES_LIST)
        .help(help)
        .value_name("LIST_FILE")
        .value_parser(value_parser!(PathBuf))
        .num_args(1)
        .action(ArgAction::Set)
        .required_unless_present_any([A_L_VERSION, A_N_MARKUP_FILES])
        .conflicts_with(A_N_MARKUP_FILES)
}

#[must_use]
pub fn arg_no_links() -> Arg {
    Arg::new(A_L_NO_LINKS)
        .help_heading(HH_ADVANCED)
        .help("Do not extract links")
        .long_help(
            "Do not extract links. \
See -{A_S_ANCHORS},--{A_L_ANCHORS}.",
        )
        .short(A_S_NO_LINKS)
        .long(A_L_NO_LINKS)
        .requires(A_L_ANCHORS)
        .action(ArgAction::SetTrue)
}

/// Builds the `-a`/`--anchors` option.
///
/// It accepts zero or one value (a file path),
/// and is listed under the advanced help heading.
///
/// NOTE(review): a `default_value` is registered here,
/// so the argument may carry a value even when it is absent
/// from the command line; code that checks for its presence
/// (for example via `get_raw`) might then always see it as present.
/// Confirm whether `default_missing_value` was intended.
#[must_use]
pub fn arg_anchors() -> Arg {
    let help = "Enable extract of anchors, \
and optionally the file to store them to";

    Arg::new(A_L_ANCHORS)
        .short(A_S_ANCHORS)
        .long(A_L_ANCHORS)
        .help_heading(HH_ADVANCED)
        .help(help)
        .value_name("FILE")
        .value_parser(value_parser!(PathBuf))
        .num_args(0..=1)
        .action(ArgAction::Set)
        .default_value(cli_utils::STREAM_PATH_STR)
}

/// Builds the `-i`/`--ignore-links` option.
///
/// It takes one or more values, each parsed with `ignore_link::parse`,
/// and may be given multiple times (values are appended).
#[must_use]
pub fn arg_ignore_links() -> Arg {
    let short_help = "List of links which will not be extracted; space separated";
    let long_help = "One or more wildcard-patterns/globs, matching links \
which will not be extracted; separated by white-space.";

    Arg::new(A_L_IGNORE_LINKS)
        .short(A_S_IGNORE_LINKS)
        .long(A_L_IGNORE_LINKS)
        .help_heading(HH_ADVANCED)
        .help(short_help)
        .long_help(long_help)
        .value_name("GLOB")
        .value_parser(ValueParser::new(ignore_link::parse))
        .num_args(1..)
        .action(ArgAction::Append)
}

/// Builds the `-P`/`--links-file` option.
///
/// It takes exactly one value: the file to store the extracted links to.
#[must_use]
pub fn arg_links_file() -> Arg {
    let help = "Which file to store the extracted links to";

    Arg::new(A_L_LINKS_FILE)
        .short(A_S_LINKS_FILE)
        .long(A_L_LINKS_FILE)
        .help_heading(HH_ADVANCED)
        .help(help)
        .value_name("FILE")
        .value_hint(ValueHint::FilePath)
        .value_parser(value_parser!(PathBuf))
        .num_args(1)
        .action(ArgAction::Set)
}

/// Builds the `-F`/`--result-format` option.
///
/// It takes exactly one value, parsed as a `result::Type`.
#[must_use]
pub fn arg_result_format() -> Arg {
    let help = "Data format of the output";

    Arg::new(A_L_RESULT_FORMAT)
        .short(A_S_RESULT_FORMAT)
        .long(A_L_RESULT_FORMAT)
        .help(help)
        .value_name("FORMAT")
        .value_parser(value_parser!(result::Type))
        .num_args(1)
        .action(ArgAction::Set)
}

/// Builds the `-E`/`--result-extended` flag.
///
/// It is a boolean flag listed under the advanced help heading.
#[must_use]
pub fn arg_result_extended() -> Arg {
    let help = "Output additional properties per link/anchor";

    Arg::new(A_L_RESULT_EXTENDED)
        .short(A_S_RESULT_EXTENDED)
        .long(A_L_RESULT_EXTENDED)
        .action(ArgAction::SetTrue)
        .help_heading(HH_ADVANCED)
        .help(help)
}

/// Builds the `-f`/`--result-flush` flag.
///
/// It is a boolean flag listed under the advanced help heading.
#[must_use]
pub fn arg_result_flush() -> Arg {
    let short_help = "Flush output after each link/anchor.";
    let long_help = "Flush output after each link/anchor. \
Not all output formats support this.";

    Arg::new(A_L_RESULT_FLUSH)
        .short(A_S_RESULT_FLUSH)
        .long(A_L_RESULT_FLUSH)
        .action(ArgAction::SetTrue)
        .help_heading(HH_ADVANCED)
        .help(short_help)
        .long_help(long_help)
}

/// All arguments of this CLI, built once on first access.
///
/// The order of the entries is the order
/// in which they are handed to the command in `parse_args`.
static ARGS: LazyLock<Vec<Arg>> = LazyLock::new(|| {
    Vec::from([
        arg_version(),
        arg_quiet(),
        arg_markup_files(),
        arg_markup_files_list(),
        arg_no_links(),
        arg_anchors(),
        arg_ignore_links(),
        arg_links_file(),
        arg_result_format(),
        arg_result_extended(),
        arg_result_flush(),
    ])
});

/// Finds short options that are used by more than one argument.
///
/// The short option `h` of the standard `--help` flag
/// is always taken into account, in addition to the short options of `args`.
///
/// Returns each clashing short option exactly once, in ascending order;
/// the result is empty if there are no clashes.
pub fn find_duplicate_short_options(args: &[Arg]) -> Vec<char> {
    let mut short_options: Vec<char> = args
        .iter()
        .filter_map(clap::Arg::get_short)
        // standard option --help
        .chain(std::iter::once('h'))
        .collect();
    // NOTE The standard option --version ('V') is not added here,
    //      because it is now implemented manually,
    //      and thus already part of `args` where applicable.

    // After sorting, equal short options are adjacent,
    // so comparing neighbours finds all clashes without any sentinel value.
    short_options.sort_unstable();
    let mut duplicate_short_options: Vec<char> = short_options
        .windows(2)
        .filter(|pair| pair[0] == pair[1])
        .map(|pair| pair[0])
        .collect();
    // An option used three or more times yields several equal, adjacent hits.
    duplicate_short_options.dedup();
    duplicate_short_options
}

/// Creates the clap command for the CLI from the given arguments.
///
/// The command uses `bin_name` as its binary name,
/// expects help text on every argument,
/// and has clap's built-in version flag disabled.
///
/// # Panics
///
/// - if two or more arguments share the same short option -
///   which is a programmer error
#[must_use]
pub fn arg_matcher(bin_name: &str, args: &[Arg]) -> Command {
    let clashing_shorts = find_duplicate_short_options(args);
    assert!(
        clashing_shorts.is_empty(),
        "Duplicate argument short options: {clashing_shorts:?}",
    );

    let base_cmd = command!();
    base_cmd
        .bin_name(bin_name)
        .help_expected(true)
        .disable_version_flag(true)
        .args(args.iter())
}

/// Opens `filename` and returns a stream over its lines.
///
/// # Errors
///
/// - if the file could not be opened
async fn read_lines<P>(
    filename: P,
) -> io::Result<async_std::io::Lines<async_std::io::BufReader<async_std::fs::File>>>
where
    P: AsRef<async_std::path::Path>,
{
    async_std::fs::File::open(filename)
        .await
        .map(|opened| async_std::io::BufReader::new(opened).lines())
}

/// Returns a list of markup files provided through the CLI.
///
/// The files given directly as arguments come first,
/// followed by the ones read from the list file (one per line), if any.
/// Blank lines in the list file are skipped.
/// The consumed values are removed from `args`.
///
/// # Errors
///
/// - if a list file was provided, and there is an error while reading it
/// - if no markup file was provided at all
pub async fn markup_files(args: &mut ArgMatches) -> io::Result<Vec<PathBuf>> {
    let mut files: Vec<PathBuf> = vec![];
    if let Some(arg_files) = args.remove_many::<PathBuf>(A_N_MARKUP_FILES) {
        files.extend(arg_files);
    }
    if let Some(list_file) = args.remove_one::<PathBuf>(A_L_MARKUP_FILES_LIST) {
        let lines = read_lines(list_file).await?;
        pin_mut!(lines);
        while let Some(line) = lines.next().await {
            let line = line?;
            // A blank line does not name a file;
            // turning it into an empty path would only fail later on.
            if line.trim().is_empty() {
                continue;
            }
            files.push(line.as_str().into());
        }
    }
    if files.is_empty() {
        return Err(io::Error::other("No markup files provided on the CLI"));
    }

    Ok(files)
}

/// Collects the link-ignoring globs provided through the CLI.
///
/// The values are removed from `args`;
/// if the option was not given, the returned list is empty.
pub fn ignore_links(args: &mut ArgMatches) -> Vec<WildMatch> {
    match args.remove_many::<WildMatch>(A_L_IGNORE_LINKS) {
        Some(globs) => globs.collect(),
        None => Vec::new(),
    }
}

/// Prints the version to stdout and terminates the process with exit code 0.
///
/// If `quiet` is `false`, the crate name and a space precede `version`
/// on the same line; otherwise only `version` is printed.
/// This function does not return.
pub fn print_version_and_exit(version: &str, quiet: bool) {
    #![allow(clippy::print_stdout)]

    if quiet {
        println!("{version}");
    } else {
        println!("{} {version}", clap::crate_name!());
    }
    std::process::exit(0);
}

/// Parses CLI arguments into our own config structure.
///
/// # Errors
///
/// If fetching the CWD failed.
pub async fn parse_args() -> BoxResult<ToolConfig> {
    let mut args = arg_matcher(clap::crate_name!(), &ARGS).get_matches();

    let quiet = args.get_flag(A_L_QUIET);
    let version = args.get_flag(A_L_VERSION);
    if version {
        print_version_and_exit(crate::VERSION, quiet);
    }

    let markup_files = markup_files(&mut args).await?;
    let links = if args.get_flag(A_L_NO_LINKS) {
        None
    } else {
        Some(StreamIdent::from((
            args.remove_one::<PathBuf>(A_L_LINKS_FILE)
                .map(Into::<async_std::path::PathBuf>::into)
                .map(Into::<cli_utils::path_buf::PathBuf>::into),
            false,
        )))
    };
    let anchors = if args.get_raw(A_L_ANCHORS).is_none() {
        None
    } else {
        Some(StreamIdent::from((
            args.remove_one::<PathBuf>(A_L_ANCHORS)
                .map(Into::<async_std::path::PathBuf>::into)
                .map(Into::<cli_utils::path_buf::PathBuf>::into),
            false,
        )))
    };

    let ignore_links: Vec<WildMatch> = ignore_links(&mut args);
    let result_format = args
        .remove_one::<result::Type>(A_L_RESULT_FORMAT)
        .unwrap_or_default();
    let result_extended = args.get_flag(A_L_RESULT_EXTENDED);
    let result_flush = args.get_flag(A_L_RESULT_FLUSH);

    Ok(ToolConfig {
        extractor: ExtractorConfig {
            markup_files,
            links: links.is_some(),
            anchors: anchors.is_some(),
            ignore_links,
        },
        links,
        anchors,
        result_format,
        result_extended,
        result_flush,
    })
}